User-Agent Pool
A user-agent pool simply collects a number of different user-agent strings into a pool and then draws one at random for each request.
Effect: every visit appears to come from a different browser.
import urllib.request
import re
import random

uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]

def ua(uapools):
    # Pick a random user agent and install it on the global opener
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(10):
    ua(uapools)  # switch user agent before every page
    thisurl = "https://www.qiushibaike.com/text/page/" + str(i+1) + "/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    res = re.compile(pat, re.S).findall(data)
    for j in range(len(res)):
        print(res[j])
        print('---------------------')
IP Proxies and Two Ways to Build an IP Proxy Pool
Search Xici (西刺) or Daxiang (大象代理) for proxy IPs.
Prefer foreign IPs where possible.
import urllib.request

ip = "219.131.240.35"  # a single proxy IP; replace with a live one
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

url = "https://www.baidu.com/"
data = urllib.request.urlopen(url).read()
fp = open("ip_baidu.html", "wb")
fp.write(data)
fp.close()
Building an IP proxy pool, method 1 (suited to the case where the proxy IPs are stable)
import random
import urllib.request

ippools = [
    "163.125.70.22",
    "111.231.90.122",
    "121.69.37.6",
]

def ip(ippools):
    # Pick a random proxy IP and install it on the global opener
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(5):
    try:
        ip(ippools)
        url = "https://www.baidu.com/"
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        print(len(data))
        fp = open("ip_res/ip_baidu_" + str(i+1) + ".html", "w", encoding="utf-8")  # the ip_res/ directory must already exist
        fp.write(data)
        fp.close()
    except Exception as err:
        print(err)
Building an IP proxy pool, method 2 (API-call approach, better suited to unstable proxy IPs)
I've shelved this method for now for budget reasons, but the idea is sketched below.
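The gist: instead of keeping a static list, ask the vendor's HTTP API for a fresh proxy before each request, so a dead IP costs at most one failed attempt. A minimal sketch, assuming a hypothetical endpoint http://api.example-proxy.com/getip that returns one "ip:port" per call (substitute your provider's real API):

import urllib.request

# Hypothetical vendor endpoint -- not a real service, replace with your provider's API
API_URL = "http://api.example-proxy.com/getip"

# Dedicated opener with proxies disabled, so API calls never go through a stale proxy
api_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))

def api_ip():
    # Fetch a fresh "ip:port" from the vendor and install it on the global opener
    thisip = api_opener.open(API_URL).read().decode("utf-8").strip()
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(5):
    try:
        api_ip()  # a new proxy every iteration
        data = urllib.request.urlopen("https://www.baidu.com/").read().decode("utf-8", "ignore")
        print(len(data))
    except Exception as err:
        print(err)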
Taobao Product Image Crawler
Taobao's current anti-crawling measures mean the code below no longer works, but it is still worth studying as an exercise.
import urllib.request
import re
import random

keyname = "python"
key = urllib.request.quote(keyname)  # URLs cannot contain non-ASCII text, so percent-encode the keyword

uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]

def ua(uapools):
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(1, 11):  # pages 1 through 10; the s parameter steps by 44 items per page
    ua(uapools)
    url = "https://s.taobao.com/search?q=" + key + "&s=" + str((i-1)*44)
    data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
    pat = 'pic_url":"//(.*?)"'
    imglist = re.compile(pat).findall(data)
    print(len(imglist))
    for j in range(len(imglist)):
        thisimg = imglist[j]
        thisimgurl = "https://" + thisimg
        localfile = "淘寶圖檔/" + str(i) + str(j) + ".jpg"  # the output directory must already exist
        urllib.request.urlretrieve(thisimgurl, localfile)
Using a User-Agent Pool and an IP Proxy Pool Together
Wrapped into a function:
import urllib.request
import re
import random

uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]
ippools = [
    "163.125.70.22",
    "111.231.90.122",
    "121.69.37.6",
]

def ua_ip(myurl):
    # Fetch myurl through a random proxy IP with a random user agent,
    # retrying up to 5 times before giving up
    def ip(ippools, uapools):
        thisip = random.choice(ippools)
        print(thisip)
        thisua = random.choice(uapools)
        print(thisua)
        headers = ("User-Agent", thisua)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
    for i in range(5):
        try:
            ip(ippools, uapools)
            url = myurl
            data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            print(len(data))
            break
        except Exception as err:
            print(err)
    return data  # raises NameError if all 5 attempts failed

data = ua_ip("https://www.baidu.com/")
fp = open("uaip.html", "w", encoding="utf-8")
fp.write(data)
fp.close()
Packaged as a module:
Copy the module file somewhere on your Python path (e.g. the Python installation directory); a sketch of its contents follows.
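The module itself isn't shown in the original, but uaip.py would plausibly just be the script above with the demo lines at the bottom removed. A sketch, with the pools truncated for brevity (fill in your own entries):

# uaip.py -- the pools plus ua_ip() from the script above, minus the demo code
import urllib.request
import random

uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    # ... the remaining user-agent strings from above ...
]
ippools = [
    "163.125.70.22",
    # ... the remaining proxy IPs from above ...
]

def ua_ip(myurl):
    def ip(ippools, uapools):
        thisip = random.choice(ippools)
        thisua = random.choice(uapools)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        opener.addheaders = [("User-Agent", thisua)]
        urllib.request.install_opener(opener)
    for i in range(5):
        try:
            ip(ippools, uapools)
            data = urllib.request.urlopen(myurl).read().decode("utf-8", "ignore")
            break
        except Exception as err:
            print(err)
    return data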
Usage:
from uaip import *
data=ua_ip("https://www.baidu.com/")
fp=open("baidu.html","w",encoding="utf-8")
fp.write(data)
fp.close()
Packet Capture Analysis
The Fiddler tool acts as a proxy server: every request and response passes through Fiddler.
Use Firefox and point its network settings at Fiddler's proxy.
To handle HTTPS: open Fiddler's Tools > Options and tick the HTTPS boxes,
then click Actions and export the certificate to the desktop.
Go back to Firefox's settings
and import the certificate from the desktop.
A commonly used command: clear (clears the screen).
Data Requested Automatically via Ajax (Asynchronous Requests)
Weibo, for example, only loads its data once you scroll toward the bottom of the page; it does not arrive synchronously with the page itself. "Click to load more" buttons work the same way. These are all asynchronous requests, and they call for packet-capture analysis.
See the example below.
Tencent Video Comment ("In-Depth Review") Crawler in Practice
Open a Tencent Video page in Firefox, e.g. https://v.qq.com/x/cover/j6cgzhtkuonf6te.html
Click "view more reviews"; Fiddler then captures a js file:
its body contains the comments.
Pick one comment and decode it,
then press Ctrl+F in Firefox to check whether that comment appears on the page.
Copy the js file's URL.
Click "view more comments" to trigger another json response, and copy its URL too.
Comparing the two URLs,
try a simplified request: https://video.coral.qq.com/filmreviewr/c/upcomment/j6cgzhtkuonf6te?reqnum=3&commentid=6227734628246412645
The analysis shows that j6cg... is the video id, reqnum is the number of comments returned per request, and commentid is the comment id:
https://video.coral.qq.com/filmreviewr/c/upcomment/[vid]?reqnum=[num]&commentid=[cid]
-
Single-Page Comment Crawler
Some special content, such as embedded images, I don't yet know how to handle... one for later.
import urllib.request
import re
from uaip import *  # the ua_ip() helper packaged above

vid = "j6cgzhtkuonf6te"
cid = "6227734628246412645"
num = "3"  # fetch 3 comments per request
url = "https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid + "?reqnum=" + num + "&commentid=" + cid
data = ua_ip(url)

titlepat = '"title":"(.*?)","abstract":"'
commentpat = '"content":"(.*?)",'
titleall = re.compile(titlepat, re.S).findall(data)
commentall = re.compile(commentpat, re.S).findall(data)
# print(len(commentall))
for i in range(len(titleall)):
    try:
        # eval("u'...'") converts the \uXXXX escapes in the JSON into readable text
        print("Comment title: " + eval("u'" + titleall[i] + "'"))
        print("Comment body: " + eval("u'" + commentall[i] + "'"))
        print('---------------')
    except Exception as err:
        print(err)
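A side note on that eval("u'...'") trick: it executes scraped text as Python source, which is fragile and unsafe. Decoding the \uXXXX escapes directly does the same job; a small sketch, assuming the input is ASCII-plus-escapes as this API returns:

import codecs

raw = '\\u8a55\\u8ad6'  # a literal \uXXXX sequence as it appears in the JSON body
print(codecs.decode(raw, 'unicode_escape'))  # -> 評論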
-
Paged Comment Crawler
The page source shows that the value after "last": is the id of the next page.
import urllib.request
import re
from uaip import *

vid = "j6cgzhtkuonf6te"
cid = "6227734628246412645"
num = "3"
for j in range(10):  # crawl pages 1 through 10
    print("Page " + str(j+1))
    url = "https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid + "?reqnum=" + num + "&commentid=" + cid
    data = ua_ip(url)
    titlepat = '"title":"(.*?)","abstract":"'
    commentpat = '"content":"(.*?)",'
    titleall = re.compile(titlepat, re.S).findall(data)
    commentall = re.compile(commentpat, re.S).findall(data)
    lastpat = '"last":"(.*?)"'
    cid = re.compile(lastpat, re.S).findall(data)[0]  # "last" is the cursor for the next page
    for i in range(len(titleall)):
        try:
            print("Comment title: " + eval("u'" + titleall[i] + "'"))
            print("Comment body: " + eval("u'" + commentall[i] + "'"))
            print('---------------')
        except Exception as err:
            print(err)
The approach for short comments (ordinary comments) is similar, so I won't repeat the analysis; here is the short-comment crawler.
Take https://video.coral.qq.com/varticle/1743283224/comment/v2?callback=_varticle1743283224commentv2&orinum=10&oriorder=o&pageflag=1&cursor=6442954225602101929&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1566363507957
and simplify it to: https://video.coral.qq.com/varticle/1743283224/comment/v2?orinum=10&oriorder=o&pageflag=1&cursor=6442954225602101929
import urllib.request
import re
from uaip import *

vid = "1743283224"
cid = "6442954225602101929"
num = "5"
for j in range(10):  # crawl pages 1 through 10
    print("Page " + str(j+1))
    url = "https://video.coral.qq.com/varticle/" + vid + "/comment/v2?orinum=" + num + "&oriorder=o&pageflag=1&cursor=" + cid
    data = ua_ip(url)
    commentpat = '"content":"(.*?)"'
    commentall = re.compile(commentpat, re.S).findall(data)
    lastpat = '"last":"(.*?)"'
    cid = re.compile(lastpat, re.S).findall(data)[0]  # cursor for the next page
    # print(len(commentall))
    for i in range(len(commentall)):
        try:
            print("Comment: " + eval("u'" + commentall[i] + "'"))
            print('---------------')
        except Exception as err:
            print(err)