# 利用 selenium 模擬登陸丁香園論壇,爬取文章資訊
# 代碼如下:
import json
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
# --- Login phase -----------------------------------------------------------
# Start a Chrome session, open the DXY forum index page and submit the
# account/password login form. The credentials below are masked placeholders
# and must be replaced with real ones before running.
browser = webdriver.Chrome()
try:
    url = 'http://www.dxy.cn/bbs/index.html'
    browser.get(url)
    time.sleep(3)
    browser.maximize_window()  # maximize the browser window
    time.sleep(5)
    # browser.switch_to.frame(0)  # (disabled) switch to the iframe of the e-mail login box

    # find_element(By.XPATH, ...) is used instead of find_element_by_xpath:
    # the latter was removed in Selenium 4.3, while this form also works on
    # Selenium 3. click()/send_keys() return None, so their results are not
    # stored.
    browser.find_element(By.XPATH, '//*[@id="headerwarp"]/div/div[1]/div/a[1]').click()  # click "log in"
    browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/a[2]/i').click()  # switch to the computer login form
    browser.find_element(By.XPATH, '//*[@id="username"]').send_keys('***********')  # type the account name
    password = browser.find_element(By.XPATH, '//*[@id="user"]/div[1]/div[1]/div[1]/div[2]/input')  # password input box
    password.send_keys('**********')  # type your own password
    login_em = browser.find_element(By.XPATH, '//*[@id="user"]/div[1]/div[3]/button')  # login button
    login_em.click()  # submit the login form

    # A captcha appears here and is not solved automatically; this pause gives
    # the operator time to complete it by hand before scraping continues.
    time.sleep(30)
except Exception:
    # Do not leave the Chrome session behind when the login steps fail.
    browser.quit()
    raise
# --- Scrape phase ----------------------------------------------------------
# Load one forum topic in the logged-in browser, pair each user name with the
# text of the corresponding post, append the pairs to a file as one JSON
# object per line, and print them for a visual check.
try:
    browser.get('http://www.dxy.cn/bbs/topic/509959?keywords=%E6%99%95%E5%8E%A5%E5%BE%85%E6%9F%A5%E2%80%94%E2%80%94%E8%AF%B7%E6%95%99%E5%90%84%E4%BD%8D%E5%90%8C%E4%BB%81+-+%E5%BF%83%E8%A1%80%E7%AE%A1%E4%B8%93%E4%B8%9A%E8%AE%A8%E8%AE%BA%E7%89%88+-%E4%B8%81%E9%A6%99%E5%9B%AD%E8%AE%BA%E5%9D%9B%E2%80%8B+')
    html = browser.page_source
finally:
    # The page source is all that is needed from here on; close the browser
    # even if loading the topic failed.
    browser.quit()

selector = etree.HTML(html)
use = selector.xpath("""//*/table/tbody/tr/td[1]/div[2]/a/text()""")

# Evaluate the container XPath once and guard the [0] lookup: when no
# "postcontainer" element is present, treat the page as having no posts
# instead of raising IndexError.
containers = selector.xpath('//*[@id="postcontainer"]')
print(len(containers))
s = containers[0].xpath('div//td[@class="postbody"]') if containers else []

# zip() stops at the shorter sequence, so unmatched names or posts are dropped.
L = []
for user_name, post_cell in zip(use, s):
    a = "使用者:" + user_name
    b = ":" + ''.join(post_cell.xpath('text()')).strip()
    L.append({a: b})

# Open the output file once rather than once per record. Nothing is opened
# when there are no records, so an empty run does not create the file.
if L:
    with open("丁香園資訊.csv", 'a', encoding="utf-8") as f:
        f.writelines(json.dumps(dic, ensure_ascii=False) + '\n' for dic in L)

print(L)  # print the records to check them for mistakes
# 遺留問題:驗證碼問題尚未解決,目前需人工完成驗證後才能繼續爬取。