頭榜,一個集合主播資訊與資料的網站,內容比較齊全,現今直播火熱,想要找尋各種播主資訊,這類網站可以搜集到相關熱門主播資訊。
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsICMyYTMvw1dvwlMvwlM3VWaWV2Zh1Wa-cmbw5iayNHM0JXYnBXOvwFMygTOzATMtUGall3LcVmdhNXLwRHdo9CXt92YucWbpRWdvx2Yx5yazF2Lc9CX6MHc0RHaiojIsJye.png)
目標網址:
http://www.toubang.tv/baike/list/20.html
清單頁,而且暫時沒有發現清單頁規律,加密了?
http://www.toubang.tv/baike/list/20.html?p=hJvm3qMpTkj7J/RNmtAVNw==
http://www.toubang.tv/baike/list/20.html?p=rjaUfcMsOOYXKBBBp5YUUA==
很明顯,p後面所帶的參數就是頁碼,但是搞不明白是如何實作的一串字元串,目測沒有明顯的頁碼規律。
沒有過多的研究,霸王硬上弓,硬搞吧!
直接把所有清單頁遍歷一遍,擷取到頁碼連結,這裡我簡單地使用了遞歸函數
獲得所有清單頁的集合,至于去重,這裡直接使用了set(),直接将集合轉化為set
遞歸代碼
def get_apgeurls(apgeurls):
    """Recursively collect pagination links starting from the seed URLs.

    :param apgeurls: iterable of listing-page URLs to crawl for page links
    :return: set of deduplicated pagination URLs once at least 66 are found
    """
    page_urls = []
    for apgeurl in apgeurls:
        page_urls.extend(get_pageurl(apgeurl))
    page_urls = set(page_urls)  # dedupe links gathered across pages
    # print(len(page_urls))
    if len(page_urls) < 66:
        return get_apgeurls(page_urls)  # recurse until the full set is found
    # BUG FIX: the original returned `page_url` — only the links from the
    # LAST page of the loop (and a NameError when `apgeurls` is empty).
    # Return the full deduplicated set instead.
    return page_urls
複制
好在分頁數不多,算是一個比較笨的實作方法,注意return的使用,遞歸函數調用函數本身,return會傳回None,這裡通過百度查詢相關資料擷取到解決方案。
其他的一些擷取内容,都是正常操作,這裡就不再闡述了!
提一下多線程吧!
def get_urllists(urls):
    """Scrape every listing URL concurrently, one worker thread per URL.

    All workers are created first, then started, then joined, so the
    function only returns once every page has been processed.
    """
    workers = [
        threading.Thread(target=get_urllist, args=(url,))
        for url in urls
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print('>>> 擷取采集連結完畢!')
複制
這裡需要注意一個參數的調用的時候,args=(url,),同時多線程的使用,采集報錯是一個很頭疼的問題,基本都是伺服器反應不過來,難道還是得采用Scrapy架構,大範圍抓取。
運作效果:
采集效果:
附源碼參考:
#www.toubang.tv/
#20200606 by WX:huguo00289
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import requests,time,os
from lxml import etree
import threading #多線程
def ua():
    """Return request headers carrying a freshly randomized User-Agent."""
    return {'User-Agent': UserAgent().random}
def get_pageurl(url):
    """Fetch one listing page and return its absolute pagination links.

    Only anchors whose href contains "baike" are kept; each is prefixed
    with the site's base URL.
    """
    html = requests.get(url, headers=ua()).content.decode('utf-8')
    time.sleep(1)  # throttle so the server isn't hammered
    doc = etree.HTML(html)
    hrefs = doc.xpath('//div[@class="row-page tc"]/a/@href')
    # print(len(pageurl))
    return [
        f"http://www.toubang.tv{href}"
        for href in hrefs
        if "baike" in href
    ]
def get_apgeurls(apgeurls, min_pages=5):
    """Recursively expand seed listing URLs into the full set of page links.

    :param apgeurls: iterable of listing-page URLs to crawl for page links
    :param min_pages: stop recursing once this many distinct links exist
                      (generalized from the hard-coded 5; default preserves
                      the original behavior)
    :return: set of deduplicated pagination URLs
    """
    page_urls = []
    for apgeurl in apgeurls:
        page_urls.extend(get_pageurl(apgeurl))
    page_urls = set(page_urls)  # dedupe across pages
    # print(len(page_urls))
    if len(page_urls) < min_pages:
        # Not enough links yet: crawl the pages found so far for more.
        # NOTE(review): this never terminates if the site exposes fewer
        # than `min_pages` links in total — TODO confirm acceptable.
        return get_apgeurls(page_urls, min_pages)
    return page_urls
def get_urllist(url):
    """Fetch one listing page and scrape every profile it links to."""
    html = requests.get(url, headers=ua()).content.decode('utf-8')
    time.sleep(1)  # throttle so the server isn't hammered
    doc = etree.HTML(html)
    hrefs = doc.xpath('//div[@class="h5 ellipsis"]/a/@href')
    print(hrefs)
    for href in hrefs:
        get_info(f'http://www.toubang.tv{href}')
def get_urllists(urls):
    """Fan out the listing scrape: one thread per URL, join all before returning."""
    pool = [threading.Thread(target=get_urllist, args=(u,)) for u in urls]
    for t in pool:
        t.start()
    for t in pool:
        t.join()
    print('>>> 擷取采集連結完畢!')
def get_info(url):
    """Scrape one streamer profile page: print its metadata and download its images.

    Creates a directory named after the streamer, downloads the profile
    image synchronously, then downloads the gallery images in parallel.
    """
    html = requests.get(url, headers=ua()).content.decode('utf-8')
    time.sleep(1)  # throttle so the server isn't hammered
    req = etree.HTML(html)
    name = req.xpath('//div[@class="h3 ellipsis"]/span[@class="title"]/text()')[0]
    os.makedirs(f'{name}/', exist_ok=True)  # per-streamer download directory
    briefs = req.xpath('//dl[@class="game-tag clearfix"]/dd/span//text()')
    # profile image URL is embedded after an '=' in the src attribute
    brief_img = req.xpath('//div[@class="i-img fl mr20"]/img/@src')[0].split('=')[1]
    print(name)
    print(briefs)
    print(brief_img)
    down_img(brief_img, name)
    informations = req.xpath('//table[@class="table-fixed table-hover hot-search-play"]/tbody/tr[@class="baike-bar"]/td//text()')
    for information in informations:
        # BUG FIX: the original `if '\r' and '\n' and '\t' not in information:`
        # only tested for '\t' — the literals '\r' and '\n' are always truthy.
        # Skip any cell containing ANY of the three whitespace characters.
        if not any(ch in information for ch in ('\r', '\n', '\t')):
            print(information)
    text = req.xpath('//div[@class="text-d"]/p//text()')
    print(text)
    text_imgs = req.xpath('//div[@id="wrapBox1"]/ul[@id="count1"]/li/a[@class="img_wrap"]/@href')
    print(text_imgs)
    threads = []
    for text_img in text_imgs:
        t = threading.Thread(target=down_img, args=(text_img, name))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print("圖檔下載下傳完成!")
def down_img(img_url, name):
    """Download one image into the `name/` directory, keeping its original filename."""
    img_name = img_url.split('/')[-1]
    response = requests.get(img_url, headers=ua(), timeout=8)
    time.sleep(2)  # throttle between downloads
    with open(f'{name}/{img_name}', 'wb') as fh:
        fh.write(response.content)
    print(f'>>>儲存{img_name}圖檔成功!')
def main():
    """Entry point: expand the seed listing URL, then scrape every listing page."""
    seed = "http://www.toubang.tv/baike/list/20.html?p=hJvm3qMpTkjm8Rev+NDBTw=="
    page_urls = get_apgeurls([seed])
    print(page_urls)
    get_urllists(page_urls)


if __name__ == '__main__':
    main()
複制
微信公衆号:二爺記