目錄
- “淘寶商品資訊定向爬蟲”執行個體介紹
-
- 功能描述
- 程式的結構設計
- 代碼實作
“淘寶商品資訊定向爬蟲”執行個體介紹
功能描述
目标:擷取淘寶搜尋頁面的資訊,提取其中的商品名稱和價格。
了解:淘寶的搜尋接口、翻頁的處理
技術路線:requests、re
程式的結構設計
步驟1:送出商品搜尋請求,循環擷取頁面
步驟2:對于每個頁面,提取商品名稱和價格資訊
步驟3:将資訊輸出到螢幕上
代碼實作
用爬蟲爬取淘寶時,由于淘寶網設有robots協定,不能直接爬取,需要先登入,再擷取請求頭部(headers)資訊。
步驟1:登入淘寶,進入搜尋頁,F12
步驟2:選擇Network,Ctrl+R重新整理,找到上方以search?為開頭的檔案,右鍵
步驟3:選擇copy,copy as cURL(bash)
步驟4:打開線上的cURL轉Python轉換工具(例如curlconverter.com),将上一步複制的内容粘貼到curl command視窗
步驟5:複制右側的headers内容,在程式中用變量headers儲存,作為參數傳給requests.get(url,headers=headers)
代碼:
#淘寶商品資訊定向爬蟲
import re
import requests
def getHTMLText(url):
    """Fetch a web page and return its decoded text.

    The request is sent with a fixed set of browser headers (including a
    logged-in session cookie) copied from a real browser request.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text, or an empty string when the request
        fails (network error, timeout, or a non-2xx HTTP status).
    """
    # NOTE(review): the cookie below is a hard-coded login session. It is
    # sensitive and will expire; replace it with your own, ideally loaded
    # from configuration rather than committed to source.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'sec-fetch-user': '?1',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'referer': 'https://s.taobao.com/search?q=lianyiq&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'thw=cn; UM_distinctid=1720211afb513a-0b07283cc9ace7-376b4502-100200-1720211afb617b; enc=rYjgSgMyYkg%2FWHRnkhBKczSuTdnmKNicHZCxkPESCfbhLRolDnVeHRnbdLgMUyHYA5%2Fvp9b6FITVmMBkYTpCQw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; cna=Aj2tEmM7FXsCAd7SFZ7S6J2H; miid=44504151680266511; __guid=154677242.3002398488893500400.1589181913539.9639; t=7e6e4712321c707a754fc6568421c9b2; _m_h5_tk=8377efe03ede70801f1356259b646123_1589211624015; _m_h5_tk_enc=2d19bca4701c28e42fa77085806ce2e2; cookie2=1678c735d9d63323283551e33bf578d5; v=0; _tb_token_=e6306b649d53b; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _samesite_flag_=true; sgcookie=E%2FWMJmPt9MNtf3%2Bry9c2P; unb=4108961318; uc3=id2=Vy0T7fP3FE8z2A%3D%3D&lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=F5RHpCj6joaWIOg%3D&vt3=F8dBxGXFrgwrDIwhRj0%3D; csg=189c426b; lgc=tb253414682; cookie17=Vy0T7fP3FE8z2A%3D%3D; dnk=tb253414682; skt=8bcd7b231758f963; existShop=MTU4OTM3ODA3OA%3D%3D; uc4=id4=0%40VXqdHlRhUtsIpwQSgkmFlck8ep4m&nk4=0%40FY4MthZ8rXYbFhGt1m4DD7eA6Nemhg%3D%3D; tracknick=tb253414682; _cc_=W5iHLLyFfA%3D%3D; _l_g_=Ug%3D%3D; sg=28e; _nk_=tb253414682; cookie1=UIZs8e27JotrvGDNmcz3ohsrN8Jj6xEX6DshhvBtiN8%3D; tfstk=cxhhBI4vgvyBDnHIlwNIr5ESuRGhaNbUA-eZ_XwjuTQZk9Ga8s4dQt5kcMZZHpB5.; mt=ci=89_1; uc1=cookie14=UoTUM2M264i6GA%3D%3D&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&pas=0&cookie21=VFC%2FuZ9ainBZ&existShop=false&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D; JSESSIONID=3F6CDD6929AF9A0B3703ABC5E8E83DE2; monitor_count=9; l=eBxygvKRQ3TIO3fLBOfwourza77OsIRAXuPzaNbMiT5P_S1p5BAPWZbdRJ89CnhVh64WR3rEQAfvBeYBqIv4n5U62j-la1Dmn; isg=BHd3G57G5fIUk2F6YAvC2ajYBmvBPEueNmQ3fckknsateJe60Q0z7utaX9gm0yMW',
    }
    try:
        # A timeout keeps a stalled connection from hanging the crawler.
        r = requests.get(url, headers=headers, timeout=30)
        # Must be *called*; a bare attribute reference never checks the status.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("爬取失敗")
        # Return an empty page so callers can parse it and simply find nothing.
        return ""
def parsePage(ilt, html):
    """Extract (title, price) pairs from a search page and append them to ilt.

    The page text is scanned for ``"view_price":"<number>"`` and
    ``"raw_title":"<text>"`` fragments; the n-th price is paired with the
    n-th title.

    Args:
        ilt: List that receives one ``[title, price]`` entry per product,
            where ``title`` is a str and ``price`` is a float.
        html: Page text to scan. A non-string value (e.g. ``None``) is
            reported as a parse error and leaves ``ilt`` untouched.

    Returns:
        None. Results are delivered by mutating ``ilt``.
    """
    try:
        prices = re.findall(r'\"view_price\":\"(\d+\.\d*)\"', html)
        titles = re.findall(r'\"raw_title\":\"(.*?)\"', html)
    except TypeError:
        # html was not a string, so there is nothing to parse.
        print("解析出錯")
        return

    # float() instead of eval(): scraped text must never be executed, and the
    # pattern only admits decimal digits and a dot anyway.
    for title, price in zip(titles, prices):
        ilt.append([title, float(price)])

    if len(prices) > len(titles):
        # Some prices have no matching title; the pairs that did match are kept.
        print("解析出錯")
def printGoodsList(ilt, num):
    """Print a header row followed by at most num products as a table.

    Args:
        ilt: Sequence of ``[title, price]`` entries.
        num: Maximum number of product rows to print.

    Returns:
        None. The table is written to standard output.
    """
    # The middle column is padded with chr(12288), the full-width (CJK) space,
    # passed as the fourth format argument, so Chinese titles line up.
    tplt = "{0:^10}\t{1:{3}^20}\t{2:^14}"
    fill = chr(12288)
    # Header order matches the rows: index, title, price.
    print(tplt.format("序号", "商品名稱", "價格", fill))
    for count, g in enumerate(ilt, start=1):
        if count > num:
            break
        print(tplt.format(count, g[0], g[1], fill))
def main(goods='連衣裙', depth=1, num=200):
    """Crawl Taobao search results for a keyword and print the products found.

    Args:
        goods: Search keyword.
        depth: Number of result pages to fetch.
        num: Maximum number of products to print.

    Returns:
        None. The collected products are printed via printGoodsList.
    """
    start_url = "https://s.taobao.com/search?q=" + goods
    infolist = []
    for page in range(depth):
        try:
            # Extra query parameters are joined with '&'; 's' is the item
            # offset of the page (presumably 44 items per page — confirm
            # against the live site).
            url = start_url + '&s=' + str(44 * page)
            html = getHTMLText(url)
            parsePage(infolist, html)
        except Exception as exc:
            # Best effort: report the failed page and move on to the next one.
            print("第{}頁處理失敗:{}".format(page + 1, exc))
            continue
    printGoodsList(infolist, num)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
輸出結果展示(部分資料):