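# Baidu Images loads its results dynamically; this example only fetches the JavaScript source embedded in the page and extracts the image URLs with a regular expression.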
#encoding=utf-8
import urllib, urllib2
import os
import re
# Baidu image-search results page to scrape.
# NOTE(review): in this file the literal below had no closing quote and the
# query string stops at "height=&", so the URL looks truncated (no search
# keyword parameter is visible). The quote is closed here only so the script
# parses; restore the full query string before relying on the results.
url = r'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1492068395730_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&'
# Directory that receives the downloaded images.
imgPath = r'/home/lhy/PycharmProjects/images/imgs/fish'

# Fetch the raw page source; the image metadata is embedded in it as
# "objURL":"..." entries, which the regex below extracts.
page = urllib2.urlopen(url)
try:
    imgHtml = page.read().decode('utf-8')
finally:
    # urllib2 responses are not context managers in Python 2.
    page.close()

urls = re.findall(r'"objURL":"(.*?)"', imgHtml)

# makedirs (not mkdir) so that missing parent directories are created too.
if not os.path.isdir(imgPath):
    os.makedirs(imgPath)

# 1-based counter; doubles as the file name of the next saved image.
index = 1
for img_url in urls:
    # img_url is unicode because the page was decoded above. Encode it before
    # formatting: mixing a non-ASCII byte-string template with unicode would
    # trigger an implicit ASCII decode and raise UnicodeDecodeError.
    shown = img_url.encode('utf-8')
    # Format a single string; print("a", b) in Python 2 prints a tuple repr.
    print("下载: %s" % shown)

    res = None
    try:
        res = urllib2.urlopen(img_url)
        # urllib2 responses expose the HTTP status through getcode(); they
        # have no .status attribute.
        if res.getcode() != 200:
            print('未下载成功: %s' % shown)
            continue
        data = res.read()
    except Exception as e:
        # Best-effort scraper: report the failure and move on to the next
        # image instead of falling through with a missing or stale response.
        print('未下载成功: %s (%r)' % (shown, e))
        continue
    finally:
        if res is not None:
            res.close()

    filename = os.path.join(imgPath, str(index) + '.jpg')
    with open(filename, 'wb') as f:
        f.write(data)
    print('下载完成\n')
    index += 1

# index was advanced once per saved image, so index - 1 is the total saved.
print("下载结束,一共下载了 %s 张图片" % (index - 1))