導入必要的包
這些包我是記不住了,看來每次用都要來看一下
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from lxml import etree
import time
import pymongo
啟動Chrome
将phantomjs放到環境變量裡後,同樣可以用
wait是簡寫,方便之後調用
此時可以打開chrome
# Start a Chrome session and maximize its window. `broswer` (sic) is the
# module-level driver that the functions in this file operate on.
broswer = webdriver.Chrome()
broswer.maximize_window()
# Shared explicit wait bound to that driver, created with a timeout of 10
# (seconds, per Selenium's WebDriverWait signature).
wait = WebDriverWait(broswer,10)
搜尋美食頁面
By.CSS_SELECTOR是指通過選擇器selector,標明位置,引号中内容為網頁中複制該位置的selector
先找到輸入框的位置,再找到可以點選確定的位置
total為總頁數
def search(max_retries=3):
    """Open the JD home page, search for "美食" and return the page count.

    Each attempt reloads the home page, types the keyword into the search
    box, clicks the search button and then waits for the pagination footer
    of the result page to appear.

    Args:
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Returns:
        The text of the total-page-count element on the result page.

    Raises:
        TimeoutException: If every attempt timed out while waiting for one
            of the page elements.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        broswer.get("https://www.jd.com")
        try:
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_box.send_keys("美食")
            submit.click()
            # Wait until the page-count element in the footer is present.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#J_bottomPage > span.p-skip > em:nth-child(1) > b")
                )
            )
            # Return from inside the loop so a successful retry hands its
            # result back to the caller (the recursive version dropped it).
            return total.text
        except TimeoutException:
            # Retry from a fresh page load; surface the error once the
            # attempt budget is exhausted instead of recursing forever.
            if attempt == attempts:
                raise
解析單頁面
沒啥好說的,就是一直不清楚什麼時候要用extract_first()
還有一個問題是當有子标簽的時候,如何提取标簽内的所有内容,比如“美食”字段被設為紅色,進而打斷一句話
def parse(html):
    """Extract the product entries from one result page and store them.

    For every ``li`` under ``div#J_goodsList/ul`` the title, price, store
    name and evaluation text are read and passed to ``save_to_mongo`` as a
    dict with the keys ``title``, ``price``, ``store`` and ``evaluate``.
    Finally the number of stored entries is printed.

    Args:
        html: Page source of a search result page.
    """
    tree = etree.HTML(html)
    saved = 0
    # etree.HTML may yield None for an empty document; treat it as no items.
    goods = tree.xpath("//div[@id='J_goodsList']/ul/li") if tree is not None else []
    for good in goods:
        title_nodes = good.xpath("./div/div/a/em")
        price = good.xpath("./div/div/strong/i/text()")
        # List items without a title or price node cannot be indexed with
        # [0]; skip them instead of raising IndexError.
        if not title_nodes or not price:
            continue
        evaluate = good.xpath("./div/div/strong/a/text()")
        store = good.xpath("./div/div/span/a/text()")
        food = {
            # string(.) concatenates the text of <em> and all of its child
            # tags, so a highlighted keyword inside the title no longer
            # cuts the title short.
            "title": str(title_nodes[0].xpath("string(.)")),
            "price": price[0],
            "store": store[0] if store else "沒有店鋪",
            "evaluate": evaluate[0] if evaluate else "沒有評價",
        }
        save_to_mongo(food)
        saved += 1
    print("共%d項" % saved)
翻頁
通過每次輸入頁碼,點選确定來翻頁
def next_page(page_num, max_retries=3, expected_items=60, max_scrolls=10):
    """Jump to result page ``page_num`` via the page-number box and parse it.

    The current page is first scrolled to the bottom until ``expected_items``
    elements with class ``gl-item`` exist. Then the page number is typed into
    the skip box of the pagination footer, the confirm link is clicked and
    the browser's page source is handed to ``parse``.

    Args:
        page_num: Page number to enter into the skip box.
        max_retries: Maximum number of navigation attempts. Values below 1
            are treated as 1.
        expected_items: Number of ``gl-item`` elements to wait for while
            scrolling.
        max_scrolls: Upper bound on scroll attempts, so a page that holds
            fewer than ``expected_items`` entries cannot loop forever.

    Raises:
        WebDriverException: If navigation still fails on the last attempt
            (includes ``TimeoutException``).
    """
    # Local import: the file's import section only brings in TimeoutException.
    from selenium.common.exceptions import WebDriverException

    # Scroll to the bottom until the expected number of items is present.
    # find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
    # which newer Selenium releases no longer provide.
    for _ in range(max_scrolls):
        if len(broswer.find_elements(By.CLASS_NAME, 'gl-item')) >= expected_items:
            break
        broswer.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(1)

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
                )
            )
            page_box.clear()
            page_box.send_keys(str(page_num))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
                )
            )
            submit.click()
            break
        except WebDriverException:
            # Only browser-side failures are retried, and only a bounded
            # number of times; the last failure is re-raised.
            if attempt == attempts:
                raise

    # Parsing happens outside the retry loop so that a parse/storage error
    # does not re-run the navigation and store the same items again.
    # NOTE(review): the page source is read immediately after the click with
    # no wait for the new results to render, so the HTML parsed here may
    # still be the previous page - confirm and add an explicit wait.
    html = broswer.page_source
    parse(html)
儲存到mongodb
# MongoDB connection settings: server host and database name.
MONGO_HOST = '127.0.0.1'
MONGO_DB = 'JD'
# Module-level client for MONGO_HOST and a handle to the MONGO_DB database;
# save_to_mongo() writes through `mdb`.
client = pymongo.MongoClient(host=MONGO_HOST)
mdb = client[MONGO_DB]
def save_to_mongo(result):
    """Insert one scraped record into the ``MONGO_DB`` collection of ``mdb``.

    Storage is best-effort: a failure is reported on stdout together with
    its cause and does not propagate to the caller.

    Args:
        result: Dict describing one product; it is inserted as a document.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated and is
        # no longer available in current PyMongo releases.
        mdb[MONGO_DB].insert_one(result)
    except Exception as exc:
        # Keep the best-effort contract but show why the write failed
        # instead of hiding every error behind the same message.
        print("儲存錯誤", exc)
    else:
        print("儲存成功", result)
主程式
def main():
    """Run the scraper: search, then visit result pages 2..total in order.

    The browser session is always shut down when the run ends, whether it
    finished normally or was aborted by an exception.
    """
    try:
        total = int(search())
        # NOTE(review): the loop starts at page 2 and nothing in this
        # function parses the first result page - confirm that is intended.
        for page in range(2, total + 1):
            next_page(page)
            print("第%s頁" % page)
    finally:
        # Without this the Chrome process started at import time is leaked.
        broswer.quit()


if __name__ == "__main__":
    main()