An essential SEO site-analysis tool: source code for querying and exporting Baidu search results by keyword

Two simple scripts for scraping Baidu search results. They let you collect competitors' sites for analysis and research: enter a keyword and the number of result pages, and the script fetches the matching results. Two versions are given below; hopefully they serve as a useful reference.
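Both versions build the same request: Baidu paginates organic results with the pn parameter in steps of 10 (page N uses pn = N * 10). A minimal sketch of that request follows; the keyword and page values are illustrative, and a real run also needs valid Cookie and User-Agent headers, as the full scripts below show.

#Minimal request sketch; keyword and page are illustrative values
import requests

keyword = "工業設計"  # example keyword, reused in Version 1 below
page = 0              # zero-based page index
url = f"https://www.baidu.com/s?wd={keyword}&ie=UTF-8&pn={page * 10}"
resp = requests.get(url, timeout=8)
print(resp.status_code)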

Version 1

Features

  • Cookies are read from a file, with one chosen at random per request (see the cookie.txt sketch after this list)
  • Exported results exclude Baidu's own properties
  • Data is exported to Excel
  • A simple multithreading example for reference
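Version 1 expects a cookie.txt file next to the script, with one complete Cookie header string per line; get_cookies() below reads all lines and picks one at random per request. A sketch of the expected file, where the cookie names and values are placeholders (any valid Baidu cookie string works):

#cookie.txt (one full Cookie string per line; values are placeholders)
BAIDUID=XXXXXXXX:FG=1; BIDUPSID=XXXXXXXX; H_PS_PSSID=...
BAIDUID=YYYYYYYY:FG=1; BIDUPSID=YYYYYYYY; H_PS_PSSID=...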
#Baidu search result scraper
#author / WeChat: huguo00289
# -*- coding: utf-8 -*-

import requests,time,random
from fake_useragent import UserAgent
from lxml import etree
import threading
import xlsxwriter



class Baidu_search():
    def __init__(self):
        self.url="https://www.baidu.com/s?wd="
        self.ua=UserAgent()
        self.search_datas=[]



    #Read cookies from cookie.txt and pick one at random
    def get_cookies(self):
        with open("cookie.txt", "r", encoding="utf-8") as f:
            cookies = f.readlines()
            cookie=random.choice(cookies)
            cookie=cookie.strip()
        return cookie


    #Fetch and parse one page of search results
    def get_search_objects(self,search_url):
        headers={
            "User-Agent":self.ua.random,
            'Cookie':self.get_cookies(),
        }
        html=requests.get(search_url,headers=headers,timeout=8).content.decode("utf-8")
        time.sleep(2)
        req=etree.HTML(html)
        h3s=req.xpath('//div[@class="result c-container new-pmd"]/h3[@class="t"]/a')
        hrefs=req.xpath('//div[@class="result c-container new-pmd"]/h3[@class="t"]/a/@href')
        for h3,href in zip(h3s,hrefs):
            h3=h3.xpath('.//text()')
            h3=''.join(h3)
            href=self.get_website_url(href)
            data=h3,href
            self.search_datas.append(data)
            print(data)




    # Resolve Baidu's redirect link to the real destination URL
    def get_website_url(self,baidu_url):
        try:
            # requests.head does not follow redirects by default,
            # so the Location header holds the real URL
            r = requests.head(baidu_url, stream=True)
            website_url = r.headers['Location']
        except (requests.RequestException, KeyError):
            website_url = baidu_url  # fall back to the redirect link itself
        return website_url


    #Write the collected results to an Excel file
    def write_to_xlsx(self, file_name):
        workbook = xlsxwriter.Workbook(f'{file_name}_{time.strftime("%Y-%m-%d ", time.localtime())}.xlsx')  # create the workbook
        worksheet = workbook.add_worksheet(file_name)
        title = ['Title', 'URL']  # header row
        worksheet.write_row('A1', title)
        for index, data in enumerate(self.search_datas):
            row = 'A' + str(index + 2)  # data rows start at A2, below the header
            worksheet.write_row(row, data)
        workbook.close()

        print("Search result data written to the Excel file!")





    def main(self,keyword,num):
        for i in range(0, num):
            print(f'Fetching page {i+1} of Baidu search results..')
            ym = i * 10  # pn offset: 10 organic results per page
            search_url = f"{self.url}{keyword}&ie=UTF-8&pn={ym}"
            self.get_search_objects(search_url)

        self.write_to_xlsx(keyword)


    #Multithreaded variant: one thread per results page
    def Thread_main(self,keyword,num):
        threadings=[]
        for i in range(0, num):
            print(f'Fetching page {i+1} of Baidu search results..')
            ym = i * 10
            search_url = f"{self.url}{keyword}&ie=UTF-8&pn={ym}"
            t=threading.Thread(target=self.get_search_objects,args=(search_url,))
            threadings.append(t)
            t.start()

        for x in threadings:
            x.join()

        print("Multithreaded Baidu search query finished")

        print(self.search_datas)


if __name__=='__main__':
    keyword="工業設計"  # example keyword ("industrial design")
    num=10  # number of result pages to fetch
    spider=Baidu_search()
    spider.main(keyword,num)
    #spider.Thread_main(keyword, num)

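One caveat with Thread_main: pages are fetched concurrently, so results land in self.search_datas in whatever order the threads happen to finish, and the original SERP order is lost. A minimal sketch of one fix (an extension, not part of the script above): tag each result with its page index so the order can be restored before export. The collect helper and page_index argument are hypothetical names for illustration.

#Hedged sketch: preserve SERP order across threads
results = []  # shared list; list.append is atomic under CPython's GIL

def collect(page_index, page_results):
    # page_results: list of (title, url) tuples parsed from one SERP page
    for title, url in page_results:
        results.append((page_index, title, url))

# after all threads have joined:
results.sort(key=lambda r: r[0])  # sort by page index to recover SERP order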

Version 2

  • The Cookie is hard-coded and fixed, not rotated
  • Nearly all result fields are captured, including each result's ranking position (a CSV-export sketch follows the code)
#Keyword Baidu search result query
#20191121 by WeChat: huguo00289
# -*- coding: UTF-8 -*-

import requests,time
import urllib.parse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def ua():
    ua = UserAgent()
    return ua.random

Cookie = ""  # paste your own Baidu Cookie header value here; requests fail without it

headers={
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': Cookie,
    'Host': 'www.baidu.com',
    'Referer': 'https://www.baidu.com/?tn=48021271_6_hao_pg',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent':ua()
    #'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}


#Resolve Baidu's redirect link to the real URL
def get_trueurl(url):
    try:
        # HEAD request; redirects are not followed, so Location holds the target
        r = requests.head(url, stream=True)
        zsurl = r.headers['Location']
    except (requests.RequestException, KeyError):
        zsurl=url  # fall back to the redirect link itself
    return zsurl

#Fetch a page of HTML
def get_response(url):
    """
    # optional proxy
    proxy = '120.83.105.195:9999'
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    }
    response=requests.get(url,headers=headers,proxies=proxies,timeout=10)"""
    response = requests.get(url, headers=headers, timeout=10)
    print(f'Status code: {response.status_code}')
    time.sleep(2)
    response.encoding='utf-8'
    req=response.text
    return req

#Query and print the search results for a keyword
def get_bdpm(keyword,num):
    """
    # URL-encode the keyword (optional; requests does this automatically)
    key_word = urllib.parse.quote(keyword)
    print(key_word)
    """
    for i in range(0,int(num)):
        print(f'Fetching page {i + 1} of search results...')
        ym=i * 10
        url=f"https://www.baidu.com/s?wd={keyword}&ie=UTF-8&pn={ym}"
        #print(url)
        req=get_response(url)
        #print(req)
        soup=BeautifulSoup(req,'lxml')
        content=soup.find('div',id="content_left")
        if content is None:
            print('No results container found; the Cookie may be invalid or the request blocked.')
            continue
        divs=content.find_all('div')
        for div in divs:
            if 'class="result' in str(div):
                try:
                    pm=div['id']  # the div id carries the result's ranking position
                except KeyError:
                    pm=''
                title=div.find('a').get_text()
                title=title.strip()
                href=div.find('a')['href']
                zsurl=get_trueurl(href)
                print(pm,title,zsurl)
        time.sleep(5)







if __name__ == '__main__':
    while True:
        keyword = input('Enter the keyword to query: ')
        num = input('Enter the number of pages to query: ')
        try:
            get_bdpm(keyword,num)
        except IndexError as e:
            print(e)
            print("Query failed!")