# Import the required packages.
# (Kept as a checklist — selenium's import paths are easy to forget.)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from lxml import etree
import time
import pymongo
# Launch Chrome.
# PhantomJS works the same way once it is on the PATH.
# `wait` is a short alias for the explicit wait, for convenience later.
# At this point Chrome can be opened.
broswer = webdriver.Chrome()
broswer.maximize_window()
wait = WebDriverWait(broswer,10)
# Search for the "美食" (food) results page.
# By.CSS_SELECTOR locates an element via a CSS selector copied from the page.
# First locate the input box, then the clickable submit button.
# `total` is the total number of result pages.
def search():
broswer.get("https://www.jd.com")
try:
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
)
submit = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR,"#search > div > div.form > button"))
)
input.send_keys("美食")
submit.click()
#等待页数加载出来
total = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR,"#J_bottomPage > span.p-skip > em:nth-child(1) > b"))
)
return total.text
except TimeoutException:
search()
# Parse a single result page.
# NOTE: extract_first() is a Scrapy idiom; with plain lxml, index the list
# returned by xpath() instead. When a child tag splits the text (e.g. the
# search keyword highlighted in red inside the title), select `.//text()`
# and join the fragments to recover the whole string.
def parse(html):
n=0
html = etree.HTML(html)
goods = html.xpath("//div[@id='J_goodsList']/ul/li")
for good in goods:
price = good.xpath("./div/div/strong/i/text()")
title = good.xpath("./div/div/a/em/text()")
evaluate = good.xpath("./div/div/strong/a/text()")
store = good.xpath("./div/div/span/a/text()")
if len(store)==0:
store="没有店铺"
else:
store = store[0]
if len(evaluate)==0:
evaluate="没有评价"
else:
evaluate = evaluate[0]
n+=1
food = {
"title":title[0],
"price":price[0],
"store":store,
"evaluate":evaluate,
}
save_to_mongo(food)
print("共%d项"%n)
# Pagination: type the page number into the jump box and click confirm.
def next_page(page_num):
# 滚动滑轮到最下方
while len(broswer.find_elements_by_class_name('gl-item')) < 60:
broswer.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)
try:
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input"))
)
input.clear()
input.send_keys(page_num)
submit = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR,"#J_bottomPage > span.p-skip > a"))
)
submit.click()
# 获取网页源代码
html = broswer.page_source
parse(html)
except:
next_page(page_num)
# Save results to MongoDB.
MONGO_HOST = '127.0.0.1'
MONGO_DB = 'JD'
client = pymongo.MongoClient(host=MONGO_HOST)
mdb = client[MONGO_DB]
def save_to_mongo(result):
try:
if mdb[MONGO_DB].insert(result):
print("保存成功",result)
except:
print("保存错误")
# Main program.
def main():
total = int(search())
for page in range(2,total+1):
next_page(page)
print("第%s页"%page)
if __name__=="__main__":
main()