程式簡介
這個百度圖檔爬蟲的封裝接口是2018年實作的,現在還能用...不錯,謝謝百度的不封之恩,現將其貢獻給所有熱愛技術的開發者
輸入:關鍵詞、下載數量、重定尺寸(可省略)
輸出:自動建立以關鍵詞命名的檔案夾,下載對應數量的百度圖檔,圖檔以其md5值命名
程式/資料集下載
代碼分析
導入模組
import numpy as np
import hashlib
import requests
import json
import cv2
import os
evalMd5函數用來計算圖檔md5,好進行命名和過濾相同圖檔
def evalMd5(sentence, charset='utf8'):
    """
    Compute the MD5 hex digest of a string or of binary data.

    :param sentence: text (``str``) or bytes-like data (``bytes``,
        ``bytearray``, ``memoryview``) to hash
    :param charset: encoding used to convert ``str`` input to bytes
    :return: the MD5 digest as a 32-character hexadecimal string
    """
    # hashlib only accepts bytes-like objects, so text has to be encoded
    # first.  Only ``str`` is encoded: bytes-like inputs such as bytearray
    # have no ``.encode`` method and are hashed as they are.
    if isinstance(sentence, str):
        sentence = sentence.encode(charset)
    return hashlib.md5(sentence).hexdigest()
resizeImg函數用來重定圖檔尺寸
def resizeImg(oldPath, size, newPath):
    """
    Resize an image file and save the result.

    The file is read with ``np.fromfile`` and written with
    ``ndarray.tofile``; OpenCV only decodes/encodes in-memory buffers
    (presumably so that non-ASCII paths, e.g. Chinese keyword folders,
    work on every platform -- confirm if this matters to you).

    If the image cannot be decoded, resized or re-encoded, the source file
    ``oldPath`` is deleted.

    :param oldPath: path of the image to read
    :param size: target size handed to ``cv2.resize`` (``(width, height)``)
    :param newPath: path the resized image is written to; its extension
        selects the output format
    :return: None
    """
    oldPath = oldPath.replace('\\', '/')
    newPath = newPath.replace('\\', '/')
    # Reading stays outside the try block so that genuine I/O problems
    # (missing file, permissions) are reported to the caller.
    rawBytes = np.fromfile(oldPath, dtype=np.uint8)
    try:
        # -1 == cv2.IMREAD_UNCHANGED: keep channels/bit depth as stored.
        # Decoding is inside the try so an undecodable buffer (e.g. a
        # zero-byte download) goes through the same cleanup path as a
        # failed resize.
        oldImg = cv2.imdecode(rawBytes, -1)
        newImg = cv2.resize(oldImg, size)  # fails if oldImg is None
        ok, encoded = cv2.imencode('.' + newPath.split('.')[-1], newImg)
        if not ok:
            raise ValueError('could not encode image for %s' % newPath)
        # Single write; the former extra cv2.imwrite call was redundant
        # because this line overwrote its output anyway.
        encoded.tofile(newPath)
    except Exception:
        # Not a usable image (wrong format, corrupt data...): discard it.
        os.remove(oldPath)
核心函數download會調用上面的函數進行批量圖檔下載
def _imgSuffix(imgUrl, default='jpg'):
    """Return a filesystem-safe file extension guessed from an image URL."""
    # Drop the query string / fragment and keep only the last path segment,
    # so the result can never contain '/', '?' or '&'.
    tail = imgUrl.split('?', 1)[0].split('#', 1)[0].rsplit('/', 1)[-1]
    suffix = tail.rsplit('.', 1)[-1] if '.' in tail else ''
    return suffix if suffix.isalnum() else default


def download(keyWord, imgNumber, imgSize=None, timeout=10):
    """
    Download Baidu image-search results into a folder named after the keyword.

    Each image is stored as ``<md5 of content>.<suffix>``, so identical
    images map to the same file name.  The function stops when the folder
    contains at least ``imgNumber`` files, or after 20 request/JSON errors,
    20 result entries without an image URL, or 20 duplicate images.

    :param keyWord: search keyword; also used as the output folder name
    :param imgNumber: number of files the folder should contain
    :param imgSize: optional size passed to ``resizeImg``; falsy = no resize
    :param timeout: timeout in seconds for every HTTP request
    :return: None
    """
    maxStrikes = 20  # shared limit for errors / missing links / duplicates

    # Create the keyword folder.
    dirname = keyWord
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    # Nothing to do if an earlier run already reached the goal.
    if len(os.listdir(dirname)) >= imgNumber:
        return None

    url = 'https://image.baidu.com/search/acjson'  # search endpoint
    same = 0     # duplicate images seen
    error = 0    # failed requests / unparsable responses
    passNum = 0  # result entries without an image link
    # 'pn' is sent as the paging offset in steps of 30 ('rn').
    # NOTE(review): paging starts at 30, not 0 -- kept as in the original.
    for i in range(30, 30 * 10000 + 30, 30):
        param = {
            'tn': 'resultjson_com', 'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyWord,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyWord,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': ''
        }
        # Fetch one page of results.
        try:
            response = requests.get(url, params=param, timeout=timeout)
            response.raise_for_status()
            # Backslashes are doubled before parsing, presumably because the
            # endpoint emits escape sequences that are not valid JSON.
            data = json.loads(response.text.replace('\\', '\\\\'))['data']
            if not isinstance(data, list):
                raise TypeError('unexpected "data" payload')
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network failure or malformed JSON: skip this page.
            error += 1
            if error >= maxStrikes:
                return None
            continue

        for item in data:
            imgUrl = item.get("middleURL") if isinstance(item, dict) else None
            if passNum >= maxStrikes:
                return None
            if imgUrl is None:
                passNum += 1
                continue

            try:
                imgResponse = requests.get(imgUrl, timeout=timeout)
                imgResponse.raise_for_status()
            except requests.RequestException:
                # One broken image link must not abort the whole crawl.
                error += 1
                if error >= maxStrikes:
                    return None
                continue
            imgContent = imgResponse.content

            imgMd5 = evalMd5(imgContent)
            imgPath = os.path.join(dirname, '%s.%s' % (imgMd5, _imgSuffix(imgUrl)))
            # Same content hash => same file name => already downloaded.
            isDuplicate = os.path.exists(imgPath)
            if not isDuplicate:
                with open(imgPath, 'wb') as imgFile:
                    imgFile.write(imgContent)
                # Optional resize (resizeImg deletes the file on failure).
                if imgSize:
                    resizeImg(imgPath, imgSize, imgPath)

            newFinish = len(os.listdir(dirname))
            print('key:%s goal:%d finish:%d'%(keyWord,imgNumber,newFinish))
            # Goal reached.
            if newFinish >= imgNumber:
                return None
            # 20 duplicates: assume every available image has been fetched.
            if isDuplicate:
                same += 1
                if same >= maxStrikes:
                    return None
    return None
來測試一下看看效果吧~
# Demo run: fetch images for each keyword into a folder of the same name.
keys = ['電子琴','蘋果']
imgNumber = 10  # number of images wanted per keyword
imgSize = None  # None keeps the downloaded images at their original size
for keyWord in keys:
    download(keyWord=keyWord, imgNumber=imgNumber, imgSize=imgSize)
key:電子琴 goal:10 finish:1
key:電子琴 goal:10 finish:2
key:電子琴 goal:10 finish:3
key:電子琴 goal:10 finish:4
key:電子琴 goal:10 finish:5
key:電子琴 goal:10 finish:6
key:電子琴 goal:10 finish:7
key:電子琴 goal:10 finish:8
key:電子琴 goal:10 finish:9
key:電子琴 goal:10 finish:10
key:蘋果 goal:10 finish:1
key:蘋果 goal:10 finish:2
key:蘋果 goal:10 finish:3
key:蘋果 goal:10 finish:4
key:蘋果 goal:10 finish:5
key:蘋果 goal:10 finish:6
key:蘋果 goal:10 finish:7
key:蘋果 goal:10 finish:8
key:蘋果 goal:10 finish:9
key:蘋果 goal:10 finish:10