
Scraping JD.com product data with Selenium

Import the required packages

I can never remember these imports, so I end up looking them up every time.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from lxml import etree
import time
import pymongo
           

Launch Chrome

If phantomjs is on your PATH, it can be used the same way.

wait is a short alias to make later calls more convenient.

At this point Chrome opens.

browser = webdriver.Chrome()
browser.maximize_window()
wait = WebDriverWait(browser, 10)
           
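PhantomJS has since been deprecated and dropped from recent Selenium releases, so if you want the same windowless behavior today, a minimal sketch using headless Chrome instead (assuming Selenium 4+):

# a minimal sketch: headless Chrome as a PhantomJS replacement (assumes Selenium 4+)
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # older Chrome builds use plain "--headless"
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, 10)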

Search the food page

By.CSS_SELECTOR locates an element with a CSS selector; the string in quotes is the selector copied from the page for that element.

First locate the search input box, then the clickable confirm button.

total is the total number of result pages.

def search():
    browser.get("https://www.jd.com")
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
        )
        input.send_keys("美食")  # the search keyword ("food")
        submit.click()
        # wait for the total page count to appear
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > em:nth-child(1) > b"))
        )
        return total.text
    except TimeoutException:
        return search()  # retry; without the return, the page count would be lost

Parse a single page

Nothing special here, except that I'm never sure when extract_first() is needed (that method belongs to Scrapy's selectors; with lxml, xpath() just returns a list that you index yourself).

One remaining question: when an element contains child tags, how do you extract all of its text? For example the keyword 美食 gets wrapped in a red tag, which splits the sentence. A sketch of one workaround follows, and it is applied in parse() below.
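One way around the split-text problem, as a minimal sketch: collect every descendant text node and join them, so child tags no longer break the string.

# a minimal sketch: join all descendant text nodes of the <em> title element,
# so a red tag around the keyword no longer splits the string
title = "".join(good.xpath("./div/div/a/em//text()")).strip()
# xpath's string() does the same flattening in one call:
# title = good.xpath("string(./div/div/a/em)").strip()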

def parse(html):
    n = 0
    html = etree.HTML(html)
    goods = html.xpath("//div[@id='J_goodsList']/ul/li")
    for good in goods:
        price = good.xpath("./div/div/strong/i/text()")
        # join all text nodes so the highlighted keyword does not split the title
        title = "".join(good.xpath("./div/div/a/em//text()"))
        evaluate = good.xpath("./div/div/strong/a/text()")
        store = good.xpath("./div/div/span/a/text()")
        store = store[0] if store else "no store"
        evaluate = evaluate[0] if evaluate else "no reviews"
        n += 1
        food = {
            "title": title,
            "price": price[0] if price else None,
            "store": store,
            "evaluate": evaluate,
        }
        save_to_mongo(food)
    print("%d items on this page" % n)

Pagination

Pages are turned by typing the page number into the jump box and clicking the confirm button.

def next_page(page_num):
    # scroll to the bottom so the lazy-loaded items render (60 items per full page)
    while len(browser.find_elements(By.CLASS_NAME, 'gl-item')) < 60:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(0.5)  # give the lazy loader time to append items
    time.sleep(1)

    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input"))
        )
        input.clear()
        input.send_keys(page_num)
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a"))
        )
        submit.click()
        # grab the page source and hand it to the parser
        html = browser.page_source
        parse(html)
    except TimeoutException:
        next_page(page_num)
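The click alone does not prove the new page has loaded before page_source is read. One extra check that could go right after submit.click(), as a hedged sketch (the "a.curr" selector is my assumption about how JD's pager highlights the current page):

# a minimal sketch: wait until the highlighted page number equals the target
# ("#J_bottomPage > span.p-num > a.curr" is an assumption about JD's pager markup)
wait.until(
    EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"), str(page_num)
    )
)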
           

Save to MongoDB

MONGO_HOST = '127.0.0.1'
MONGO_DB = 'JD'
client = pymongo.MongoClient(host=MONGO_HOST)
mdb = client[MONGO_DB]
def save_to_mongo(result):
    try:
        # insert_one replaces insert(), which was removed from recent pymongo;
        # here the collection simply shares the database's name
        if mdb[MONGO_DB].insert_one(result):
            print("saved", result)
    except Exception:
        print("save failed")
           
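Note that insert_one happily stores duplicates across re-runs. A hedged alternative, keyed on the title (a hypothetical choice of unique key), is to upsert instead:

# a minimal sketch: upsert keyed on title (hypothetical uniqueness assumption)
def save_to_mongo(result):
    mdb[MONGO_DB].update_one(
        {"title": result["title"]}, {"$set": result}, upsert=True
    )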

Main program

def main():
    total = int(search())
    # search() lands on page 1, which the loop below would otherwise skip
    parse(browser.page_source)
    print("page 1")
    for page in range(2, total + 1):
        next_page(page)
        print("page %s" % page)

if __name__ == "__main__":
    try:
        main()
    finally:
        browser.quit()  # always close Chrome, even if the crawl fails midway
