天天看點

python下載百度圖片_python爬蟲——批量下載百度圖片

程式簡介

這個百度圖片爬蟲的封裝接口是2018年實作的,現在還能用……不錯,謝謝百度的不封之恩。現將其貢獻給所有熱愛技術的開發者。

輸入:關鍵詞、下載數量、重定尺寸(可省略)

輸出:自動建立以關鍵詞命名的檔案夾,下載對應數量的百度圖片,圖片以其內容的md5值命名

python下載下傳百度圖檔_python爬蟲——批量下載下傳百度圖檔

程式/資料集下載

python下載下傳百度圖檔_python爬蟲——批量下載下傳百度圖檔

代碼分析

導入子產品

import numpy as np

import hashlib

import requests

import json

import cv2

import os

evalMd5函數用來計算圖檔md5,好進行命名和過濾相同圖檔

def evalMd5(sentence, charset='utf8'):
    """Return the hexadecimal MD5 digest of a string or bytes-like value.

    :param sentence: text (``str``) or binary data (``bytes``,
        ``bytearray``, ``memoryview``) to hash
    :param charset: encoding used to turn a ``str`` into bytes before
        hashing; ignored for binary input
    :return: the MD5 digest as a hex string
    """
    # Binary input is hashed as-is; only text needs encoding. isinstance()
    # also admits bytes subclasses and other bytes-like values, which have
    # no .encode() method and would fail an exact ``type(...) != bytes`` test.
    if not isinstance(sentence, (bytes, bytearray, memoryview)):
        sentence = sentence.encode(charset)
    return hashlib.md5(sentence).hexdigest()

resizeImg函數用來重定圖檔尺寸

def resizeImg(oldPath, size, newPath):
    """Resize the image stored at ``oldPath`` and save it to ``newPath``.

    If the image cannot be decoded, resized or encoded, the source file
    ``oldPath`` is deleted instead of being resized.

    :param oldPath: path of the image to read
    :param size: target size, passed unchanged to ``cv2.resize`` as its
        ``dsize`` argument
    :param newPath: path the resized image is written to; the text after its
        last ``.`` selects the output format
    :return: None
    """
    oldPath = oldPath.replace('\\', '/')
    newPath = newPath.replace('\\', '/')

    # The file is read and written through numpy (fromfile / tofile) and only
    # decoded / encoded in memory by OpenCV.
    # NOTE(review): presumably done so that non-ASCII paths work; confirm.
    rawBytes = np.fromfile(oldPath, dtype=np.uint8)
    try:
        # -1 keeps the image as stored (flag value passed through to OpenCV).
        oldImg = cv2.imdecode(rawBytes, -1)
        newImg = cv2.resize(oldImg, size)
        # Encode once and write once; an extra cv2.imwrite() to the same path
        # beforehand would only duplicate this write.
        encodedOk, encodedImg = cv2.imencode('.' + newPath.split('.')[-1], newImg)
        if not encodedOk:
            raise ValueError('could not encode image for %s' % newPath)
        encodedImg.tofile(newPath)
    except Exception:
        # Unreadable or unsupported image: drop the source file. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        os.remove(oldPath)

核心函數download會調用上面的兩個函數進行批量圖片下載

def _imgSuffix(imgUrl, default='jpg'):
    """Return a file extension for ``imgUrl`` that is safe to use in a file name."""
    from urllib.parse import urlsplit

    # Only the URL path is considered, so query strings, '/' and '&' can never
    # leak into the file name; URLs without a usable extension get ``default``.
    ext = os.path.splitext(urlsplit(imgUrl).path)[1].lstrip('.')
    if ext.isalnum() and len(ext) <= 5:
        return ext
    return default


def _fetchResultPage(url, param, timeout):
    """Fetch one search-result page and return its ``data`` list, or None on failure."""
    try:
        text = requests.get(url, params=param, timeout=timeout).text
        # Backslashes are doubled before parsing, as the response may contain
        # escape sequences that json.loads rejects.
        data = json.loads(text.replace('\\', '\\\\'))['data']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return None
    return data if isinstance(data, list) else None


def download(keyWord, imgNumber, imgSize=None):
    """Download Baidu image-search results for ``keyWord`` into a folder.

    Images are saved in a directory named after the keyword (created if
    missing), each file named ``<md5 of content>.<suffix>``. After every
    image a progress line is printed. The function stops when any of the
    following holds:

    * the directory contains at least ``imgNumber`` files;
    * 20 result pages / image requests have failed;
    * 20 result items had no ``middleURL``;
    * 20 downloads did not increase the number of files in the directory
      (duplicates, or images removed by a failed resize).

    :param keyWord: search keyword, also used as the directory name
    :param imgNumber: number of files wanted in the directory
    :param imgSize: optional size passed to ``resizeImg``; falsy skips resizing
    :return: None
    """
    requestTimeout = 10  # seconds; keeps a stalled connection from hanging the crawl
    maxStrikes = 20  # shared threshold for every early-exit counter below

    # Create the keyword directory.
    dirname = keyWord
    os.makedirs(dirname, exist_ok=True)

    url = 'https://image.baidu.com/search/acjson'  # search endpoint
    same = 0  # downloads that added no new file
    error = 0  # failed page / image requests
    passNum = 0  # result items without an image link

    # Query parameters shared by every page; only 'pn' changes per request.
    baseParam = {
        'tn': 'resultjson_com', 'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyWord,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyWord,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': 0,
        'rn': 30,
        'gsm': '1e',
        '1488942260214': '',
    }

    # NOTE(review): 'pn' starts at 30, not 0. If it is a result offset, the
    # first 30 results are never requested; confirm before changing.
    for i in range(30, 30 * 10000 + 30, 30):
        param = dict(baseParam, pn=i)

        data = _fetchResultPage(url, param, requestTimeout)
        if data is None:
            # Network failure or unusable JSON: skip this page.
            error += 1
            if error >= maxStrikes:
                return None
            continue

        for item in data:
            imgUrl = item.get("middleURL") if isinstance(item, dict) else None
            if passNum >= maxStrikes:
                return None
            if imgUrl is None:
                passNum += 1
                continue

            suffix = _imgSuffix(imgUrl)
            try:
                imgContent = requests.get(imgUrl, timeout=requestTimeout).content
            except requests.RequestException:
                # One unreachable image must not abort the whole crawl.
                error += 1
                if error >= maxStrikes:
                    return None
                continue

            imgMd5 = evalMd5(imgContent)
            imgPath = os.path.join(dirname, '%s.%s' % (imgMd5, suffix))

            oldFinish = len(os.listdir(dirname))
            with open(imgPath, 'wb') as imgFile:
                imgFile.write(imgContent)

            # Optional in-place resize.
            if imgSize:
                resizeImg(imgPath, imgSize, imgPath)

            newFinish = len(os.listdir(dirname))
            print('key:%s goal:%d finish:%d' % (keyWord, imgNumber, newFinish))

            # Enough files collected: done.
            if newFinish >= imgNumber:
                return None

            # The file count did not grow, so this image was a duplicate or
            # was removed again. After 20 such downloads, assume there is
            # nothing new left and stop.
            if newFinish == oldFinish:
                same += 1
                if same >= maxStrikes:
                    return None

來測試一下看看效果吧~

# Demo run: fetch a few images for each sample keyword.
keys = ['電子琴', '蘋果']
imgNumber = 10  # files wanted per keyword
imgSize = None  # None: keep the downloaded size

for keyWord in keys:
    download(keyWord, imgNumber, imgSize=imgSize)

key:電子琴 goal:10 finish:1

key:電子琴 goal:10 finish:2

key:電子琴 goal:10 finish:3

key:電子琴 goal:10 finish:4

key:電子琴 goal:10 finish:5

key:電子琴 goal:10 finish:6

key:電子琴 goal:10 finish:7

key:電子琴 goal:10 finish:8

key:電子琴 goal:10 finish:9

key:電子琴 goal:10 finish:10

key:蘋果 goal:10 finish:1

key:蘋果 goal:10 finish:2

key:蘋果 goal:10 finish:3

key:蘋果 goal:10 finish:4

key:蘋果 goal:10 finish:5

key:蘋果 goal:10 finish:6

key:蘋果 goal:10 finish:7

key:蘋果 goal:10 finish:8

key:蘋果 goal:10 finish:9

key:蘋果 goal:10 finish:10

python下載下傳百度圖檔_python爬蟲——批量下載下傳百度圖檔