前言
在這裡我就不再一一介紹每個步驟的具體操作了,因為在爬取老版今日頭條資料的時候都已經講得非常清楚了,所以在這裡我只會針對重點說明這是怎麼實作的。如果想要看具體步驟,請先去看我今日頭條的文章內容,裡面非常詳細地介紹了怎麼找到加密js代碼和api接口。
Python3爬取今日頭條文章視訊資料,完美解決as、cp、_signature的加密方法
WAP端跟APP端的資料完全沒啥差別,而APP端涉及逆向APP,比較複雜,所以能用WAP端就用WAP端,推薦爬取WAP端的資料。
QQ群聊
855262907
對比送出資料Form Data
因為有取得後續資料的操作,請求中肯定會有值被修改,所以我們需要知道哪些值被
修改
了,哪些值是
固定
的,所以對比
Form Data
尤為重要,看下面的内容就知道問題所在。
下面Name為app的請求,就是傳回廣告資料的連結位址
第一次請求的Form Data:
資料取出來後進行JSON格式化,可以百度搜尋
JSON格式化
,這樣友善我們進行對比。
{
"adReqData": {
"chid": 6,
"ipv4": "你自己的ip位址",
"adtype": 0,
"pf": "other",
"uin": "",
"qq_openid": "",
"ams_openid": "",
"netstatus": "unknown",
"slot": [{
"cur": 0, // 第一次為0,第二次為11,後續每次加10
"channel": "24h",
"loid": "1",
"orders_info": [], //後面的文章有講解
"current_rot": "", // 第二次為 1,2;第三次為 1,2,3,4;第四次為 1,2,3,4,5,6,後續以此類推
"article_id": "",
"refresh_type": 1, // 第一次為 1,後續皆為 2
"seq": "", // 第二次為 5,10;第三次為 5,10,15,20;第四次為 5,10,15,20,25,30,後續以此類推
"seq_loid": "" // 全部為 1;因為每次傳回的資料有兩條,所以每次多兩個 1:第二次為 1,1,第三次為 1,1,1,1
}],
"appversion": "190125",
"plugin_news_cnt": 10,
"plugin_page_type": "",
"plugin_tbs_version": 0,
"plugin_text_ad": false,
"plugin_bucket_id": "",
"plugin_osv": "",
"wap_source": "default"
}
}
第二次請求的Form Data:
只需要往下滑動頁面,就可以重新整理出新的資料
{
"adReqData": {
"chid": 6,
"ipv4": "你自己的ip位址",
"adtype": 0,
"pf": "other",
"uin": "",
"qq_openid": "",
"ams_openid": "",
"netstatus": "unknown",
"slot": [{
"cur": 11,
"channel": "24h",
"loid": "1",
"orders_info": ["272163938,15437425,3123255098,1000,505,110,2", "273302998,14922017,1731713627,1000,4109,110,2"],
"current_rot": "1,2",
"article_id": "",
"refresh_type": 2,
"seq": "5,10",
"seq_loid": "1,1"
}],
"appversion": "190125",
"plugin_news_cnt": 10,
"plugin_page_type": "",
"plugin_tbs_version": 0,
"plugin_text_ad": false,
"plugin_bucket_id": "",
"plugin_osv": "",
"wap_source": "default"
}
}
第三次請求的Form Data:
{
"adReqData": {
"chid": 6,
"ipv4": "你自己的ip位址",
"adtype": 0,
"pf": "other",
"uin": "",
"qq_openid": "",
"ams_openid": "",
"netstatus": "unknown",
"slot": [{
"cur": 21,
"channel": "24h",
"loid": "1",
"orders_info": ["272163938,15437425,3123255098,1000,505,110,2", "273302998,14922017,1731713627,1000,4109,110,2", "273124923,16877408,2641249431,1000,4109,110,2", "273311058,17099839,3340342053,1000,808,110,2"],
"current_rot": "1,2,3,4",
"article_id": "",
"refresh_type": 2,
"seq": "5,10,15,20",
"seq_loid": "1,1,1,1"
}],
"appversion": "190125",
"plugin_news_cnt": 10,
"plugin_page_type": "",
"plugin_tbs_version": 0,
"plugin_text_ad": false,
"plugin_bucket_id": "",
"plugin_osv": "",
"wap_source": "default"
}
}
我們請求了三次,發現廣告資料連結的
Form Data
有變化的只有
slot
裡面的
cur、orders_info、current_rot、refresh_type、seq、seq_loid
這幾個字段。
破解請求Form Data
搜尋
orders_info
得到構造請求
Form Data
的
JS函數
,直接開始讀源碼打斷點。
發現我們的構造請求Form Data已經真相大白了,簡單吧。
這下我們只需要知道
window.SSPAd
是怎麼生成的即可,直接搜尋
window.SSPAd
。
發現
window.SSPAd
是
new s
生成的,搜尋
var s =
,在這裡面又發現了我們之前搜尋orders_info的時候,
orders_info
是在
requestOrder
裡面的。
然後通過打斷點發現
window.orders_info
是由
getOrderInfo
傳回的,這個是每次請求傳回的結果裡面的資料,並且通過觀察其他的參數發現,它們都是有規律的,只有
orders_info
沒有規律,所以這一切都聯繫到一起了,簡單吧。
// Quoted from the page's script: serializes one ad order object into a single
// comma-separated string. Field order: oid, advertiser_id, product_id,
// product_type, industry_id, order_source, act_type.
// Per the surrounding article, these strings become the entries of the
// request's `orders_info` array.
// NOTE: the parameter `e` (the order object) shadows the outer `e` whose
// prototype is being extended.
e.prototype.getOrderInfo = function(e) {
return e.oid + "," + e.advertiser_id + "," + e.product_id + "," + e.product_type + "," + e.industry_id + "," + e.order_source + "," + e.act_type
}
直接上代碼
import requests
import json
# Silence urllib3 warnings for the whole process; the crawler below sends its
# POST request with verify=False.
requests.packages.urllib3.disable_warnings()
# The bare string below is a no-op banner, not a docstring. It reads:
# "Tencent News advertisement data crawler".
'''
騰訊新聞廣告資料爬取
'''
class news_qq():
    """Crawl the advertisement feed of Tencent News (xw.qq.com, "24h" channel).

    Constructing an instance runs the crawl immediately: it sends ``number``
    consecutive POST requests to the ad endpoint and prints every decoded
    response. After each response, the slot fields that differ between
    requests (``cur``, ``orders_info``, ``current_rot``, ``refresh_type``,
    ``seq`` and ``seq_loid``) are advanced so the next request asks for the
    following batch.
    """

    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    @staticmethod
    def _build_headers(host, extra=None):
        """Return the browser-like request headers for ``host``.

        ``extra`` is an optional dict of additional headers; they are placed
        right after ``Connection`` so the key order matches the captured
        browser requests.
        """
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
        }
        headers.update(extra or {})
        headers.update({
            'Host': host,
            'Origin': 'https://xw.qq.com',
            'Referer': 'https://xw.qq.com/m/24h',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'
        })
        return headers

    @staticmethod
    def _append_csv(current, addition):
        """Append ``addition`` to the comma-separated string ``current``."""
        return current + ',' + addition if current else addition

    def __init__(self, number):
        """Run ``number`` request rounds, printing each JSON response.

        Args:
            number: how many times the ad endpoint is requested.
        """
        self.session = requests.Session()
        self.cur = 0
        self.orders_info = []
        self.current_rot_tmp = 0
        self.current_rot_list = []
        self.current_rot = ''
        self.refresh_type = 1
        self.seq = ''
        self.seq_loid = ''
        # The client IP does not depend on the loop state, so it is looked up
        # once instead of once per round (and not at all when there is
        # nothing to request).
        client_ip = self.get_client_ip() if number > 0 else ''
        for _ in range(number):  # controls how many rounds are requested
            self.payload = self._build_payload(client_ip)
            js = self.app()  # decoded ad JSON for this round
            print(js)

    def _build_payload(self, client_ip):
        """Build the request body from the current slot state."""
        return {
            "adReqData": {
                "chid": 6,
                "ipv4": client_ip,
                "adtype": 0,
                "pf": "aphone",
                "uin": "",
                "qq_openid": "",
                "ams_openid": "",
                "netstatus": "unknown",
                "slot": [
                    {
                        "cur": self.cur,
                        "channel": "24h",
                        "loid": "1",
                        "orders_info": self.orders_info,
                        "current_rot": self.current_rot,
                        "article_id": "",
                        "refresh_type": self.refresh_type,
                        "seq": self.seq,
                        "seq_loid": self.seq_loid
                    }
                ],
                "appversion": "190125",
                "plugin_news_cnt": 10,
                "plugin_page_type": "",
                "plugin_tbs_version": 0,
                "plugin_text_ad": False,
                "plugin_bucket_id": "",
                "plugin_osv": "5.0.0",
                "wap_source": "default"
            }
        }

    def get_client_ip(self):
        """Return the response text of the ``get_client_ip`` endpoint."""
        url = 'https://ipv4.gdt.qq.com/get_client_ip'
        headers = self._build_headers('ipv4.gdt.qq.com')
        response = self.session.get(
            url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        return response.text

    def set_params(self, js):
        """Advance the slot state using the response of the last request.

        Each ``orders_info`` entry mirrors the site's ``getOrderInfo``:
        ``oid,advertiser_id,product_id,product_type,industry_id,order_source,
        act_type``.

        Args:
            js: decoded response; ``js['adList']`` must be a JSON string
                holding the ``index`` and ``order`` entries read below.

        Raises:
            KeyError, IndexError: when the response lacks an expected field
                or lists fewer ``order_source`` values than orders.
        """
        # First follow-up request uses cur=11, every later one adds 10.
        self.cur += 11 if self.cur == 0 else 10
        adlist = json.loads(js['adList'])
        stream = adlist['index'][0]['stream']
        order_source = stream['order_source'].split(',')
        for position, order in enumerate(adlist['order']):
            fields = (
                order['oid'],
                order['advertiser_id'],
                order['product_id'],
                order['product_type'],
                order['industry_id'],
                order_source[position],
                order['act_type'],
            )
            # str() on every field, including oid, so numeric ids cannot
            # break the join.
            self.orders_info.append(','.join(str(field) for field in fields))
            self.current_rot_tmp += 1
            self.current_rot_list.append(str(self.current_rot_tmp))
        self.current_rot = ','.join(self.current_rot_list)
        self.refresh_type = 2
        self.seq = self._append_csv(self.seq, stream['seq'])
        # NOTE(review): assumes every response carries exactly two ads, as in
        # the captured requests — confirm if the feed can return another count.
        self.seq_loid = self._append_csv(self.seq_loid, '1,1')

    def app(self):
        """POST the current payload, update the slot state, return the JSON."""
        url = 'https://news.ssp.qq.com/app'
        headers = self._build_headers('news.ssp.qq.com', {
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded',
        })
        response = self.session.post(
            url,
            headers=headers,
            data=json.dumps(self.payload),
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        js = response.json()
        self.set_params(js)
        return js
if __name__ == '__main__':
    # Instantiating the crawler runs it: the constructor performs the given
    # number of request rounds (1000 here) and prints each response.
    news_qq(1000)