堆糖網,圖檔桌面網站,存在反爬,發現傳回的 JSON 資料錯亂嚴重,只能爬取部分資料,圖檔資料缺失很厲害。本文應用 Python 進行圖檔抓取、採集與下載,是一個多程序及多線程的使用例子。
網址入口
get方式,參數
json資料
運作效果
單線程
#www.duitang.com
#20200603 by WX:huguo00289
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import urllib.parse
import requests,time,os,json
def ua():
    """Build the HTTP headers sent with every request.

    Returns:
        dict: a ``User-Agent`` taken from the ``random`` attribute of a new
        ``UserAgent`` object, plus a fixed, hard-coded ``Cookie`` string.
    """
    agent_source = UserAgent()
    session_cookie = 'sessionid=ef6912ba-38d9-4b6e-a3d9-8d6526805f07; js=1; Hm_lvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1590492733,1591182385; Hm_lpvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1591182414'
    return {'User-Agent': agent_source.random, 'Cookie': session_cookie}
def get_imgs(i,keyword):
    """Fetch one page of search results and download each entry's first cover.

    Args:
        i: page index; the request uses ``start=24*i`` as its offset and
            appends ``i`` to the ``_`` query parameter.
        keyword: search term. It is URL-quoted for the query and passed on to
            ``down_img``, which uses it as the output directory name.

    Entries that lack ``album``, ``album['covers'][0]`` or ``album['id']`` are
    skipped instead of aborting the whole page, because the API is known to
    return incomplete records.
    """
    kd = urllib.parse.quote(keyword)
    url = f"https://www.duitang.com/napi/blog/list/by_search/?kw={kd}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={24*i}&_=159118241418{i}"
    html = requests.get(url, headers=ua(), timeout=8).content.decode('utf-8')
    time.sleep(1)  # pause between page requests
    datas = json.loads(html)
    # Tolerate a response whose "data" / "object_list" is missing or null.
    object_lists = (datas.get('data') or {}).get('object_list') or []
    print(len(object_lists))
    for object_list in object_lists:
        print(object_list)
        try:
            album = object_list['album']
            img_url = album['covers'][0]
            album_id = album['id']
        except (KeyError, IndexError, TypeError):
            # Incomplete record: no album, empty cover list, or no id.
            print('>>>此筆資料缺少封面圖檔,已略過')
            continue
        # File name is the album id plus the extension taken from the URL.
        img_name = '%s%s' % (album_id, os.path.splitext(img_url)[1])
        print(img_url, img_name)
        down_img(img_url, img_name, keyword)
def down_img(img_url,img_name,keyword):
    """Download ``img_url`` and save it as ``<keyword>/<img_name>``.

    Args:
        img_url: URL of the image to fetch.
        img_name: file name to write inside the output directory.
        keyword: name of the output directory; created if it does not exist.

    A failed request (connection error, timeout, or non-success HTTP status)
    is reported and skipped, so an error page is never written to disk as an
    image and one bad download does not stop the remaining ones.
    """
    os.makedirs(f'{keyword}/', exist_ok=True)  # create the output directory
    try:
        r = requests.get(img_url, headers=ua(), timeout=5)
        r.raise_for_status()  # reject 4xx/5xx bodies instead of saving them
    except requests.RequestException as e:
        print(f'>>>下載{img_name}圖檔失敗: {e}')
        return
    with open(f'{keyword}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f'>>>儲存{img_name}圖檔成功!')
def main(keyword):
    """Crawl pages 1 through 9 for ``keyword`` sequentially.

    Args:
        keyword: search term handed to ``get_imgs`` for every page.
    """
    first_page, stop_page = 1, 10
    for page_no in range(first_page, stop_page):
        print(f'>>>正在爬取第{page_no}頁圖檔内容')
        get_imgs(page_no, keyword)
    print('采集圖檔完畢!')
# Script entry point: run the sequential crawl only when executed directly.
if __name__ == '__main__':
    main("按鈕")
複制
多線程及多程序
#www.duitang.com
#20200603 by WX:huguo00289
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import urllib.parse
import requests,time,os,json
import threading #多線程
import multiprocessing #多程序
def ua():
    """Build the HTTP headers sent with every request.

    Returns:
        dict: a ``User-Agent`` taken from the ``random`` attribute of a new
        ``UserAgent`` object, plus a fixed, hard-coded ``Cookie`` string.
    """
    agent_source = UserAgent()
    session_cookie = 'sessionid=ef6912ba-38d9-4b6e-a3d9-8d6526805f07; js=1; Hm_lvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1590492733,1591182385; Hm_lpvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1591182414'
    return {'User-Agent': agent_source.random, 'Cookie': session_cookie}
def get_imgs(num,keyword):
    """Fetch one page of search results and download its covers in threads.

    Args:
        num: page index; the request uses ``start=24*num`` as its offset and
            appends ``num`` to the ``_`` query parameter.
        keyword: search term. It is URL-quoted for the query and passed on to
            ``down_img``, which uses it as the output directory name.

    One thread is created per usable entry; all threads are started and then
    joined before the function returns. Entries that lack ``album``,
    ``album['covers'][0]`` or ``album['id']`` are skipped instead of aborting
    the whole page, because the API is known to return incomplete records.
    """
    kd = urllib.parse.quote(keyword)
    print(f'>>>正在爬取第{num}頁圖檔内容')
    url = f"https://www.duitang.com/napi/blog/list/by_search/?kw={kd}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={24*num}&_=159118241418{num}"
    html = requests.get(url, headers=ua(), timeout=8).content.decode('utf-8')
    time.sleep(1)  # pause between page requests
    datas = json.loads(html)
    # Tolerate a response whose "data" / "object_list" is missing or null.
    object_lists = (datas.get('data') or {}).get('object_list') or []
    print(len(object_lists))
    threads = []
    for object_list in object_lists:
        print(object_list)
        try:
            album = object_list['album']
            img_url = album['covers'][0]
            album_id = album['id']
        except (KeyError, IndexError, TypeError):
            # Incomplete record: no album, empty cover list, or no id.
            print('>>>此筆資料缺少封面圖檔,已略過')
            continue
        # File name is the album id plus the extension taken from the URL.
        img_name = '%s%s' % (album_id, os.path.splitext(img_url)[1])
        print(img_url, img_name)
        threads.append(
            threading.Thread(target=down_img, args=(img_url, img_name, keyword))
        )
    # Start every download first, then wait for all of them to finish.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    print(num, 'is ok')
def down_img(img_url,img_name,keyword):
    """Download ``img_url`` and save it as ``<keyword>/<img_name>``.

    Args:
        img_url: URL of the image to fetch.
        img_name: file name to write inside the output directory.
        keyword: name of the output directory; created if it does not exist.

    A failed request (connection error, timeout, or non-success HTTP status)
    is reported and skipped, so an error page is never written to disk as an
    image and the calling thread ends with a short message rather than a
    traceback.
    """
    os.makedirs(f'{keyword}/', exist_ok=True)  # create the output directory
    try:
        r = requests.get(img_url, headers=ua(), timeout=5)
        r.raise_for_status()  # reject 4xx/5xx bodies instead of saving them
    except requests.RequestException as e:
        print(f'>>>下載{img_name}圖檔失敗: {e}')
        return
    with open(f'{keyword}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f'>>>儲存{img_name}圖檔成功!')
#單程序
def main(keyword):
    """Crawl pages 1 through 9 for ``keyword`` in the current process.

    Args:
        keyword: search term handed to ``get_imgs`` for every page.
    """
    first_page, stop_page = 1, 10
    for page_no in range(first_page, stop_page):
        get_imgs(page_no, keyword)
    print('采集圖檔完畢!')
#多程序
def maindjc(keyword):
    """Crawl pages 1 through 9 for ``keyword`` using a pool of 4 processes.

    Args:
        keyword: search term handed to ``get_imgs`` for every page.

    Each page is submitted with ``apply_async``. The returned result handles
    are kept so that an exception raised inside a worker is reported after the
    pool has finished, instead of being silently discarded.
    """
    pool = multiprocessing.Pool(processes=4)  # four worker processes
    # apply_async stores a worker's exception on its AsyncResult and never
    # surfaces it unless get() is called, so keep every handle.
    pending = [
        (page, pool.apply_async(func=get_imgs, args=(page, keyword)))
        for page in range(1, 10)
    ]
    pool.close()
    pool.join()
    for page, result in pending:
        try:
            result.get()
        except Exception as exc:  # top-level boundary: report and keep going
            print(f'>>>第{page}頁爬取失敗: {exc!r}')
    print('采集圖檔完畢!')
# Script entry point: run the multi-process crawl only when executed directly.
if __name__ == '__main__':
    maindjc("美女")
複制
參考來源:
[Python 爬蟲]煎蛋網 OOXX 妹子圖爬蟲(2)——多線程+多程序下載圖檔
https://tendcode.com/article/jiandan-meizi-spider-2/