爬蟲小練習：Chrome無彈窗爬取淘寶美食資訊

2023-05-21 12:02:10

# coding:utf-8

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from lxml import etree

import re


# Start one shared Chrome session in headless mode (no browser window is
# shown). `driver` is the module-level handle that search(), next_page() and
# get_product() all operate on.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

def search(max_retries=3):
	"""Open Taobao, search for '美食' and scrape the first result page.

	The whole sequence (load home page, submit the query, wait for the pager,
	scrape the items via get_product()) is retried from scratch when any step
	fails.

	Args:
		max_retries: Maximum number of attempts (values below 1 are treated
			as 1). Defaults to 3.

	Returns:
		The text of the pager element that shows the total page count.

	Raises:
		Exception: Whatever the last attempt raised (typically a Selenium
			timeout) once every attempt has failed.
	"""
	attempts = max(1, max_retries)
	for attempt in range(1, attempts + 1):
		try:
			print('打開網頁中...')
			driver.get('https://www.taobao.com/')
			driver.maximize_window()

			# NOTE: the local must not be called `search`; that would shadow
			# this function inside its own body.
			search_box = WebDriverWait(driver, 10).until(
				EC.presence_of_element_located((By.XPATH, '//*[@id="q"]')))
			submit = WebDriverWait(driver, 10).until(
				EC.element_to_be_clickable((By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button')))
			search_box.send_keys('美食')
			submit.click()
			print('搜尋中...')
			# Wait until a key element has loaded; the total-page indicator at
			# the bottom of the page is used as the reference.
			total_page = WebDriverWait(driver, 10).until(
				EC.presence_of_element_located((By.XPATH, '//*[@id="mainsrp-pager"]/div/div/div/div[1]')))
			print('搜尋完成！')
			get_product()
			return total_page.text
		except Exception:
			# `except Exception` (not a bare except) so Ctrl-C still stops the
			# crawl. Give up after the last attempt instead of looping forever.
			if attempt >= attempts:
				raise
			print('搜尋失敗！再來一次...')

def next_page(number, max_retries=3):
	"""Jump to result page `number` via the pager input and scrape it.

	Types the page number into the pager's input box, clicks the confirm
	button, waits for the item list and then calls get_product().

	Args:
		number: Page number to jump to.
		max_retries: Maximum number of attempts (values below 1 are treated
			as 1). Defaults to 3.

	Raises:
		Exception: Whatever the last attempt raised once every attempt has
			failed.
	"""
	attempts = max(1, max_retries)
	for attempt in range(1, attempts + 1):
		try:
			number_input = WebDriverWait(driver, 10).until(
				EC.presence_of_element_located((By.XPATH, '//*[@id="mainsrp-pager"]/div/div/div/div[2]/input')))
			submit = WebDriverWait(driver, 10).until(
				EC.element_to_be_clickable((By.XPATH, '//*[@id="mainsrp-pager"]/div/div/div/div[2]/span[3]')))
			number_input.clear()
			number_input.send_keys(number)
			submit.click()
			print('搜尋第%s頁中...' % (number))
			WebDriverWait(driver, 20).until(
				EC.presence_of_element_located((By.XPATH, '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')))
			print('搜尋第%s頁完成...' % (number))
			get_product()
			return
		except Exception:
			# Bounded, visible retry: the previous bare `except:` plus
			# recursion retried silently without limit and also swallowed
			# KeyboardInterrupt.
			if attempt >= attempts:
				raise
			print('搜尋第%s頁失敗！再來一次...' % (number))

def get_product():
	"""Scrape the products on the currently loaded result page and print them.

	Waits (up to 20 s) for an element inside the `mainsrp-itemlist` container,
	parses the page source with lxml, and prints one dict per product with the
	keys 'shop_names', 'product_names', 'product_prices' and 'image_urls'.
	Nothing is returned.
	"""
	print('加載商品頁面中...')
	item_locator = (By.XPATH, '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
	WebDriverWait(driver, 20).until(EC.presence_of_element_located(item_locator))
	print('目前頁商品加載完成...')

	# Each query returns a flat list of lxml string results for the whole page.
	query = etree.HTML(driver.page_source).xpath
	shops = query('//*[@id="mainsrp-itemlist"]//div[1]/a/span[2]/text()')
	titles = query('//*[@id="mainsrp-itemlist"]//div/div[1]/a/img/@alt')
	prices = query('//*[@id="mainsrp-itemlist"]//div[2]/div[1]/div[1]/strong/text()')
	images = query('//*[@id="mainsrp-itemlist"]//div/div[1]/a/img/@src')

	# The four lists are paired by position and zip() stops at the shortest.
	# NOTE(review): presumably every listing yields exactly one value per
	# query; if one is missing the columns would shift - verify against the page.
	for shop, title, price, image in zip(shops, titles, prices, images):
		print({
			'shop_names': shop,
			'product_names': title,
			'product_prices': '¥' + price,
			'image_urls': image,
		})


def main():
	"""Run the crawl: search, read the total page count, then visit each page.

	search() scrapes page 1 and returns the pager text; the first integer in
	that text is taken as the total number of pages, and pages 2..count are
	fetched with next_page(). The shared browser session is always closed on
	the way out, so the headless Chrome process does not outlive the script.

	Raises:
		ValueError: If no page count can be found in the pager text.
	"""
	try:
		count_text = search()
		# Raw string: '\d' in a normal literal is an invalid escape sequence.
		# `or ''` guards against search() handing back None.
		match = re.search(r'(\d+)', count_text or '')
		if match is None:
			raise ValueError('cannot parse total page count from %r' % (count_text,))
		count = int(match.group(1))
		for i in range(2, count + 1):
			next_page(i)
	finally:
		# Release the browser even when the crawl fails part-way.
		driver.quit()


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()

爬蟲小練習：Chrome無彈窗爬取淘寶美食資訊

繼續閱讀

Python爬蟲之網站超清圖檔爬取(2021.3.29)

Python入門級爬取百度百科詞條

16Python爬蟲---Scrapy常用指令

Python爬蟲基本庫的使用第二章基本庫的使用

Python爬蟲（四）lxml、xpath安裝子產品導入查找節點屬性查找 @ 符号使用謂語選取未知節點擷取文本和屬性

爬蟲學習之04-request子產品擷取糗事百科一張熱圖

python3下用selenium庫和chrome的headless模式實作網頁抓取（注釋中有用phantomJS的小段代碼）

【Python爬蟲案例學習19】多程序爬取某圖檔網站

python爬蟲實戰之爬取成語大全

【爬取百度首頁】-将整個html源碼儲存-headers使用一、網頁分析二、代碼實作與步驟三、結果分析

爬取百度貼吧

爬取貓眼電影--靜态網頁反爬與多線程/多程序爬取網頁解析爬取代碼多線程與多程序

requests子產品進行人人網模拟登陸

2023爬蟲學習筆記 -- 多線程操作

Python爬蟲學習（1）

Boss直聘Python爬蟲實戰