踩坑
在使用pyquery時,對於id或class選擇器都可以成功選取,但是直接使用標籤名選取時一直傳回空值,擷取不到節點。原來是因為pyquery預設將文檔解析為xmlns(XML命名空間)格式,這造成了無法按標籤名稱去選取
解決方法
html = browser.page_source
doc = pq(html, parser='html')
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import time
import pymongo
# Optional headless mode, currently disabled.
# NOTE(review): the keyword below looks wrong — Selenium expects `options=`
# (older versions: `chrome_options=`), not `chrome_option=`; confirm before
# re-enabling these three lines.
# chrome_option = webdriver.ChromeOptions()
# chrome_option.add_argument('--headless')
# browser = webdriver.Chrome(chrome_option=chrome_option)
browser = webdriver.Chrome()  # shared Chrome driver used by every function below
wait = WebDriverWait(browser, 10)  # explicit wait with a 10-second timeout
def search():
    """Open jd.com, search for '年貨', scrape result page 1, and return the page count.

    Returns:
        int: total number of result pages, read from the pager widget at the
        bottom of the first results page.
    """
    # Retry in a loop rather than recursing on timeout — the original
    # `return search()` pattern can exhaust the stack if the site keeps
    # timing out. Each attempt reloads the home page, matching the
    # original recursive behavior.
    while True:
        browser.get('https://www.jd.com')
        try:
            key_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#key')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#search > div > div.form > button > i')))
            key_input.clear()
            key_input.send_keys('年貨')
            submit.click()
            # Scroll to the bottom so the page lazy-loads the remaining items.
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(3)
            total = wait.until(EC.presence_of_element_located((
                By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))
            get_product()
            return int(total.text)
        except TimeoutException:
            continue  # page never settled — reload and try again
def next_page(page_number):
    """Jump to result page `page_number` via the pager input box and scrape it.

    Args:
        page_number: 1-based page index to navigate to.
    """
    # Loop instead of the original `return next_page(page_number)` recursion,
    # which could exhaust the stack under repeated timeouts.
    while True:
        try:
            num_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
            num_input.clear()
            num_input.send_keys(page_number)
            submit.click()
            # Scroll down so lazy-loaded items render before parsing.
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(3)
            # Confirm the highlighted pager entry shows the requested page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'),
                str(page_number)))
            get_product()
            return
        except TimeoutException:
            continue  # navigation did not complete — retry the same page
def get_product():
    """Parse the current result page with pyquery and persist every product."""
    # Wait until at least one product cell is present before grabbing the DOM.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#J_goodsList .clearfix .gl-item')))
    page_source = browser.page_source
    # parser='html': pyquery's default XML parser attaches a namespace that
    # makes plain tag-name selectors return nothing.
    document = pq(page_source, parser='html')
    for cell in document('#J_goodsList .gl-warp.clearfix .gl-item .gl-i-wrap').items():
        record = {
            'image': cell.find('.p-img a img').attr('data-lazy-img'),
            'price': cell.find('.p-price strong i').text(),
            'comment': cell.find('.p-commit strong a').text(),
            'name': cell.find('.p-name.p-name-type-2 a em').text().replace('\n', ' '),
            'shop': cell.find('.p-shop').text(),
        }
        print(record)
        save_to_mongo(record)
def save_to_mongo(result):
    """Insert one product dict into the configured MongoDB collection.

    Failures are printed but never propagated, so a single bad record does
    not abort the whole crawl.

    Args:
        result: product dict produced by `get_product`.
    """
    try:
        # insert_one() replaces the long-deprecated Collection.insert();
        # it raises on failure, so no truthiness check is needed.
        db[MONGO_TABLE].insert_one(result)
        print('存儲到MongoDB成功', result)  # fixed "MongDB" typo in the message
    except Exception:
        print('存儲到MongoDB失敗', result)
def main():
    """Crawl every result page, always closing the browser window afterwards."""
    try:
        total = search()
        # Page 1 is already scraped inside search(); continue from page 2.
        for page in range(2, total + 1):
            next_page(page)
    finally:
        # The original only closed the driver on the success path; ensure the
        # Chrome window is released even when scraping raises.
        browser.close()
MONGO_URL = 'localhost'  # MongoDB host (default port 27017)
MONGO_DB = 'jd'  # target database name
MONGO_TABLE = '年貨'  # collection name — mirrors the search keyword
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]  # module-level handle used by save_to_mongo()
if __name__ == '__main__':
    main()
參考 崔慶才老師使用Selenium爬取淘寶商品
python中pyquery無法擷取标簽名的dom節點