任務1:利用cookie可以免去登入的煩惱(驗證碼)
'''
只需要有登入後的cookie,就可以繞過驗證碼
登入後的cookie可以通過Selenium用第三方(微博)帳號登入來取得,不需要通過淘寶的滑動驗證碼
'''
import requests
from urllib.parse import urlencode
# Request headers: a desktop Chrome user agent plus the cookie of a session
# that is already logged in (per the author's note, this is what lets the
# request skip the login captcha).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    # Cookie copied from a logged-in session
    'cookie': 'xxx',
}
# Query-string parameters of the Taobao search page.
params = {
    'q': 'iphone',
    'imgfile': '',
    'commend': 'all',
    'ssid': 's5-e',
    'search_type': 'item',
    'sourceId': 'tb.index',
    'spm': 'a21bo.2017.201856-taobao-item.2',
    'ie': 'utf8',
    'initiative_id': 'tbindexz_20170306',
}
url = 'https://s.taobao.com/search?' + urlencode(params)
# Use the session as a context manager so its pooled connections are released.
with requests.Session() as s:
    # Certificate verification is left at its default (enabled): the request
    # carries a login cookie, so it must not be sent to an unverified peer.
    # The timeout keeps a stalled connection from blocking the script forever.
    response = s.get(url, headers=headers, timeout=10).text
print(response)
任務2:爬取淘寶商品資訊
from selenium import webdriver
# 通用選擇
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# 逾時
from selenium.common.exceptions import TimeoutException
from lxml import etree
import json
import random
import time
# Single Chrome instance shared by the functions below.
browser = webdriver.Chrome()
browser.set_window_size(width=1400, height=900)
# Author's note: at around page 19 Taobao pops up its slider captcha, so
# crawling further is postponed until that slider can be solved.
# The overall approach itself is fine.
def taobao_login():
    """Placeholder for logging in directly on Taobao; currently does nothing.

    Taobao's slider captcha could not be passed, so a third-party login is
    used instead. The earlier direct-login attempt is kept below, disabled,
    purely for reference.
    """
    # Disabled direct-login attempt:
    #
    # # Log in first
    # login = browser.find_element(By.ID,'J_Quick2Static').click()
    # username = browser.find_element(By.CSS_SELECTOR,'#TPL_username_1')
    # username.send_keys('XXX')
    # password = browser.find_element(By.CSS_SELECTOR,'#TPL_password_1')
    # password.send_keys('XXX')
    # button = browser.find_element(By.ID, 'J_SubmitStatic').click()
    # ActionChains(browser).move_by_offset(random.randint(10, 60), random.randint(10, 60)).perform()
    # # Check the size
    # input2 = browser.find_element(By.ID,'nc_1__scale_text')
    # print(input2.size)
    # # Slider captcha
    # action = ActionChains(browser)
    # source = browser.find_element(By.ID,'nc_1_n1z')
    # # Press and hold
    # action.click_and_hold(source).perform()
    # # Offset to slide by
    # action.move_by_offset(298,0)
    # # Release the mouse
    # action.release().perform()
    return None
def weibo_login(username='賬号', password='密碼'):
    """Log in to Taobao through the third-party Weibo login form.

    Author's rationale: the native Taobao login shows a slider captcha after
    the account and password are entered, and even a successful slide does
    not log in, so the Weibo login is used to get around it.

    Prerequisite (author's note): a Weibo account that is already bound to a
    Taobao account. After several logins in a row Weibo may answer with
    "network timeout, please try again later (600002)"; wait a while before
    trying again.

    Args:
        username: Weibo account name typed into the form. The default is the
            placeholder the script originally hard-coded.
        password: Weibo password typed into the form. The default is the
            placeholder the script originally hard-coded.

    Raises:
        TimeoutException: if the Weibo username field does not become visible
            within the shared ``wait`` timeout.
    """
    weibo_button = browser.find_element(By.CSS_SELECTOR, '.weibo-login')
    weibo_button.click()
    # Wait for the Weibo form to actually appear instead of sleeping a fixed
    # two seconds, which is too short whenever the network is slow.
    username_box = wait.until(
        EC.visibility_of_element_located((By.NAME, 'username')))
    username_box.send_keys(username)
    # Pause kept from the original between the two fields
    # (NOTE(review): presumably human-like pacing; confirm it is still needed).
    time.sleep(1)
    password_box = browser.find_element(By.NAME, 'password')
    password_box.send_keys(password)
    browser.find_element(By.CSS_SELECTOR, '.W_btn_g').click()
def index_page(page, max_retries=3):
    """Navigate to one search-result page and scrape its products.

    For ``page > 1`` the page number is typed into the pager's input box and
    submitted, then the function waits until the highlighted pager item shows
    that number. Once at least one product item is present, ``get_products``
    is called to extract and store the listing.

    Args:
        page: 1-based result page number.
        max_retries: how many additional attempts are made after a timeout.
            The original retried by unbounded recursion, which ends in
            ``RecursionError`` once every attempt times out (for example when
            the slider captcha blocks the page).

    Returns:
        None. When every attempt times out the page is skipped after a
        message is printed.
    """
    for _ in range(max_retries + 1):
        print('正在抓取第', page, '頁')
        try:
            if page > 1:
                # Wait until the page-number input box exists.
                input_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                # Wait until the confirm button can be clicked.
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input_box.clear()
                input_box.send_keys(str(page))
                submit.click()
                # The jump succeeded once the highlighted pager item shows
                # the requested page number.
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            # Wait until product items have been loaded.
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            # Timed out: fall through to the next attempt, if any remain.
            continue
    print('第', page, '頁多次逾時,放棄抓取')
def get_products():
    """Extract every product on the current result page and append it to disk.

    The browser's current page source is parsed with lxml. For each child
    ``div`` of the item list a dict is built whose values are the raw XPath
    result lists for image URL, price, title, shop name and location. Each
    dict is printed and appended to ``product.json`` as one UTF-8 JSON object
    per line followed by ``,`` (so the file as a whole is not a single valid
    JSON document).
    """
    html = etree.HTML(browser.page_source)
    items = html.xpath('//div[@class="m-itemlist"]//div[@class="items"]/div')
    if not items:
        # Nothing to store; like the original, do not create the file.
        return
    # Open the output once per page instead of once per product.
    # newline='\n' disables newline translation so the bytes written match
    # the original binary-mode output on every platform.
    with open('product.json', 'a', encoding='utf-8', newline='\n') as f:
        for item in items:
            product = {
                'image': item.xpath('.//img/@data-src'),
                'price': item.xpath('.//strong/text()'),
                'title': item.xpath('.//img[@class="J_ItemPic img"]/@alt'),
                'shop': item.xpath('.//a[@class="shopname J_MouseEneterLeave J_ShopInfo"]/span[2]/text()'),
                'location': item.xpath('.//div[@class="location"]/text()'),
            }
            print(product)
            f.write(json.dumps(product, ensure_ascii=False) + ',\n')
# print(browser.get_cookies())
# print(browser.page_source)
# Shared explicit wait bound to the browser, with a 10 second timeout.
wait = WebDriverWait(browser, timeout=10)
# Number of the last result page that main() crawls (pages start at 1).
MAX_PACE = 100
def main(max_attempts=3):
    """Search Taobao for 'iphone', log in via Weibo, then crawl the result pages.

    The search-and-login sequence is retried up to ``max_attempts`` times.
    The original retried by calling ``main()`` recursively from a bare
    ``except``: that recursion had no bound, it also swallowed
    ``KeyboardInterrupt``, and every outer frame crawled all pages again once
    the inner call returned. Here the retry is a bounded loop and the crawl
    runs exactly once.

    Args:
        max_attempts: maximum number of search-and-login attempts.

    Raises:
        Exception: whatever the last search-and-login attempt raised, once
            all attempts have failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            browser.get('http://www.taobao.com')
            print(browser.window_handles)
            search_box = browser.find_element(By.ID, 'q')
            # Search keyword
            search_box.send_keys('iphone')
            search_box.send_keys(Keys.ENTER)
            browser.find_element(By.ID, 'J_Quick2Static').click()
            time.sleep(1)
            # Handle the login
            weibo_login()
            break
        except Exception:
            # Out of attempts: surface the real error instead of looping on.
            if attempt == max_attempts:
                raise
    for i in range(1, MAX_PACE + 1):
        index_page(i)


if __name__ == '__main__':
    main()
總結:
1.學會利用cookie繞過驗證碼
2.學會從第三方進入需要爬取的網站
3.淘寶的滑動解鎖(真的麻煩)
4.通過selenium模擬點選,爬取網站,雖然只爬了20頁,但是思路沒問題
淘寶必殺:滑動三連

小知識:
import json
# Tip: serialise a dict without escaping non-ASCII characters, append ",\n",
# and write the result to a file as UTF-8 bytes.
a = dict(name="123", age=123)
text = "{},\n".format(json.dumps(a, ensure_ascii=False))
with open('1234.json', mode='wb') as f:
    f.write(text.encode('utf-8'))