想成為優秀的鬥魚主播,首先得掌握優秀的自拍技能;這次寫個有意思的, 爬取鬥魚小姐姐的自拍頭像...
效果圖:
001
002
003
004
005
分析頻道
頻道API
擷取關鍵參數
分析參數
檢視Json
請求API時, 由爬蟲負責翻頁, 例如: https://www.douyu.com/gapi/rkc/directory/2_201/1 (URL末尾的數字即為頁碼)
腳本執行介面
腳本執行
源碼:
4月13日10時更新: 可按照主播人氣對圖片進行排序, 並實作了圖片去重
import requests
from lxml import etree
import json
import os
import time
def getResponse(url, timeout=10):
    """Send an HTTP GET to *url* with a browser-like User-Agent.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    headers = {
        # Present a desktop browser User-Agent instead of the library default.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    return requests.get(url, headers=headers, timeout=timeout)
def getAllChannelMark(response):
    """Extract every channel's title and mark from the directory page.

    Args:
        response: Object whose ``content`` attribute holds the HTML of the
            directory page.

    Returns:
        A list of ``{"title_name": ..., "title_mark": ...}`` dicts, one per
        ``<a>`` found under ``div.leftnav-cate`` that has both a ``title`` and
        a ``data-rk`` attribute. Each value is the list returned by the XPath
        attribute query, not a bare string.
    """
    page = etree.HTML(response.content)
    links = page.xpath('//div[@class="leftnav-cate"]//li/a')
    # Query both attributes per link, then keep only links that carry both.
    attribute_pairs = (
        (link.xpath('@title'), link.xpath('@data-rk')) for link in links
    )
    return [
        {"title_name": name, "title_mark": mark}
        for name, mark in attribute_pairs
        if name and mark
    ]
def getChanneTitleMark(title_mark_list):
    """Print the numbered channel menu and return the mark the user picks.

    The user is re-prompted until the input is an integer that is a valid
    index into *title_mark_list*.

    Args:
        title_mark_list: List of dicts with ``"title_name"`` and
            ``"title_mark"`` keys.

    Returns:
        The ``"title_mark"`` value of the chosen entry.

    Raises:
        IndexError: If *title_mark_list* is empty, so no number can be valid.
    """
    if not title_mark_list:
        # With nothing to choose from the prompt loop could never terminate.
        raise IndexError("title_mark_list is empty")

    for index, title_mark in enumerate(title_mark_list):
        print("編号:", index, "=>", title_mark["title_name"], end="")
        # Break the menu line after every entry whose index is a multiple of 4.
        if index % 4 == 0:
            print()

    while True:
        try:
            channelNum = int(input("請輸入主題對應的編号(例如: 33):"))
        except ValueError:
            # Only a malformed number is retried; Ctrl-C and EOF propagate.
            print("輸入的編号格式有誤")
            continue
        if 0 <= channelNum < len(title_mark_list):
            return title_mark_list[channelNum]["title_mark"]
        print("輸入的編号超出範圍")
def checkNumFormat(message):
    """Prompt with *message* until the user enters a valid integer.

    Args:
        message: Prompt text shown by ``input``.

    Returns:
        The integer the user entered.

    Raises:
        EOFError: If standard input is exhausted. This is deliberately not
            retried, because retrying would loop forever.
    """
    while True:
        try:
            return int(input(message))
        except ValueError:
            # Only a malformed number is retried; Ctrl-C and EOF propagate.
            print("輸入的格式有誤請重新輸入!")
def _buildAnchorInfo(anchor):
    """Pick the fields needed to save one anchor's image from its API record."""
    return {
        # Anchor photo URL
        "anchor_img": anchor["rs1"],
        # Anchor name
        "anchor_name": anchor["nn"],
        # Live room id
        "anchor_rid": anchor["rid"],
        # Topic
        "anchor_rn": anchor["rn"],
        # Current popularity, left-padded with zeros to 7 characters so the
        # file names that start with it sort by popularity.
        "anchor_ol": str(anchor["ol"]).rjust(7, "0"),
        # Channel name
        "channelName": anchor["c2name"],
    }


def getSourceJson(ChanneTitleMark):
    """Ask how many images to fetch, then crawl the channel API page by page.

    Each page's new anchors (deduplicated by anchor name) are handed to
    ``saveImage``, which returns how many images are still wanted. Crawling
    stops when that count reaches zero, when a page contains no anchors, or
    when several consecutive pages contain only anchors already seen. The
    last two conditions prevent an endless loop when more images are
    requested than the channel has.

    Args:
        ChanneTitleMark: Sequence whose first element is the channel mark
            inserted into the API URL.

    Returns:
        None.
    """
    num = checkNumFormat("請輸入需要爬取的主播圖檔數量(例如: 200):")
    # Consecutive pages without any new anchor tolerated before giving up.
    max_stale_pages = 3
    stale_pages = 0
    # Page index used to build the URL
    url_index = 0
    # Names already queued, for deduplication
    seen_names = set()
    while num > 0:
        JsonUrl = (
            "https://www.douyu.com/gapi/rkc/directory/"
            + str(ChanneTitleMark[0]) + "/" + str(url_index)
        )
        SourceJson = getResponse(JsonUrl).content
        # Records of the anchors listed on this page
        anchors = json.loads(SourceJson)["data"]["rl"]
        if not anchors:
            # An empty page means there is nothing further to fetch.
            break

        groupAnchorInfoList = []
        for anchor in anchors:
            anchor_info = _buildAnchorInfo(anchor)
            # Skip anchors whose image has already been queued.
            if anchor_info["anchor_name"] not in seen_names:
                seen_names.add(anchor_info["anchor_name"])
                groupAnchorInfoList.append(anchor_info)

        if groupAnchorInfoList:
            stale_pages = 0
        else:
            stale_pages += 1
            if stale_pages >= max_stale_pages:
                break

        # Fetch one page, save once
        url_index += 1
        num = saveImage(groupAnchorInfoList, num)
def _safeFileName(text):
    """Return *text* with characters that are invalid in file names replaced by "_"."""
    unsafe = '\\/:*?"<>|\r\n\t'
    return text.translate(str.maketrans(unsafe, "_" * len(unsafe)))


def saveImage(groupAnchorInfoList, num):
    """Download and store the images of one page of anchors.

    Files are written to ``./images/<channel>/<popularity>_<name>_<topic>.jpg``.
    Every attempted anchor consumes one unit of *num*, whether or not saving
    succeeded. Failures are reported and skipped.

    Args:
        groupAnchorInfoList: List of dicts with ``"channelName"``,
            ``"anchor_ol"``, ``"anchor_name"``, ``"anchor_rn"`` and
            ``"anchor_img"`` keys.
        num: Number of images still wanted.

    Returns:
        The number of images still wanted after this page.
    """
    # Pause 0.2 seconds before handling each page
    time.sleep(0.2)
    for AnchorInfo in groupAnchorInfoList:
        if num <= 0:
            break
        # Names and topics are free text and may contain path separators or
        # other characters the file system rejects.
        channel_dir = "./images/%s" % _safeFileName(AnchorInfo["channelName"])
        file_name = _safeFileName(
            AnchorInfo["anchor_ol"] + "_" + AnchorInfo["anchor_name"]
            + "_" + AnchorInfo["anchor_rn"] + ".jpg"
        )
        file_path = "%s/%s" % (channel_dir, file_name)
        try:
            # Create the channel folder if it does not exist yet
            os.makedirs(channel_dir, exist_ok=True)
            response = getResponse(AnchorInfo["anchor_img"])
            # Do not store an HTTP error page as if it were an image.
            response.raise_for_status()
            # Write the image
            with open(file_path, "wb") as f:
                f.write(response.content)
        except OSError as e:
            # Covers file-system errors; the exceptions raised by requests
            # also derive from IOError/OSError.
            print(">", file_path, "下載失敗:", e)
        else:
            print(">", file_path, "下載下傳成功", "剩餘", num, "張")
        num -= 1
    return num
def main():
    """Run the crawler: load the directory, let the user pick a channel, download."""
    directory_page = getResponse("https://www.douyu.com/directory/all")
    channels = getAllChannelMark(directory_page)
    # Show the channel menu, then crawl the channel the user picked.
    getSourceJson(getChanneTitleMark(channels))
# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
由於透過分析取得了API, 所以爬蟲效率很高; 鬥魚的「顏值」(編號33)頻道大概有940個主播, 耗時約1分鐘即可全部爬完...