
Scraping Baidu Images with Python

I. Regexes for extracting links

# Example: . (matches any single character)
import re

a = 'xy123'
b = re.findall('x...', a)
print b  # ['xy12']

*****************************

# Example: * (zero or more of the preceding character)
a = 'xyxy123'
b = re.findall('x*', a)
print b  # ['x', '', 'x', '', '', '', '', '']

*****************************

# Example: ? (zero or one of the preceding character)
a = 'xy123'
b = re.findall('x?', a)
print b  # ['x', '', '', '', '', '']

==========================================

'''Everything above only needs to be understood; the one combination you must master is the (.*?) pattern below.'''

==========================================

secret_code = 'hadkfalifexxIxxfasdjifja134xxlovexx23345sdfxxyouxx8dfse'

*****************************

# Example: .* (greedy match)
b = re.findall('xx.*xx', secret_code)
print b  # ['xxIxxfasdjifja134xxlovexx23345sdfxxyouxx']

*****************************

# Example: .*? (non-greedy match)
c = re.findall('xx.*?xx', secret_code)
print c  # ['xxIxx', 'xxlovexx', 'xxyouxx']

*****************************

# (.*?): with vs. without parentheses; the group captures only what is inside
d = re.findall('xx(.*?)xx', secret_code)
print d  # ['I', 'love', 'you']

for each in d:
    print each  # prints I, love, you, one per line

*****************************

# Matching when the text contains line breaks
s = '''sdfxxhello
xxfsdfxxworldxxasdf'''

d = re.findall('xx(.*?)xx', s)
print d  # ['fsdf'] -- without re.S, . does not match newlines

d = re.findall('xx(.*?)xx', s, re.S)
print d  # ['hello\n', 'world']
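This is exactly the pattern the crawlers below depend on: Baidu's page source stores each image address in a field of the form "ObjURL":"...", so a non-greedy capture group pulls the addresses out. A minimal sketch (the fragment is invented for illustration):

# coding=utf-8
import re

# an invented fragment shaped like Baidu's image JSON, just for illustration
fragment = '"ObjURL":"http://img0.example.com/a.jpg","ObjURL":"http://img1.example.com/b.jpg"'

# non-greedy group: capture everything between "ObjURL":" and the next quote
print re.findall('"ObjURL":"(.*?)"', fragment)
# ['http://img0.example.com/a.jpg', 'http://img1.example.com/b.jpg']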

II. Crawler code

1. Scraping a static site with no anti-crawling measures

# coding=utf-8
import urllib
import re

# Return the page source of a site
def getHtml(url):
    # open the page
    page = urllib.urlopen(url)
    # read the page source
    html = page.read()
    return html

def getImg(html):
    # regex that locates the image resource paths in the page source
    reg = 'data-objurl="(.*?)"'
    # compile the regex into a pattern object
    imgre = re.compile(reg)
    # find every match of imgre in the page source; returns a list
    imglist = re.findall(imgre, html)
    x = 0
    # download the images
    for imgurl in imglist:
        # resource path to download and the file name to save it under
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    return imglist


html = getHtml("http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=披薩")
print(html)
print(getImg(html))

The url here is simply what the address bar shows, but this code cannot actually scrape it, most likely because Baidu blocks such bare requests.
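A common workaround, and only an assumption about what Baidu actually checks, is to send a browser-like User-Agent; the scripts in the next two sections do a variant of this. A minimal sketch with urllib2:

# coding=utf-8
import urllib2

url = "http://image.baidu.com/search/index?tn=baiduimage&word=%E6%8A%AB%E8%90%A8"
# pretend to be a browser; whether this alone satisfies Baidu is an assumption
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(req, timeout=10).read()
print len(html)  # length of the returned page source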

2. Scraping Baidu Images despite its anti-crawling measures

# coding=utf-8
import urllib
import urllib2
import re
import os
# Scrape Baidu Images despite its anti-crawling measures

def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html


def getImg(html):
    reg = 'ObjURL":"(.*?)"'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    newimglist = []
    for img in imglist:
        # the JSON escapes every slash, so strip the backslashes
        newimglist.append(img.replace("\\", ""))
    return newimglist


def downLoad(urls, path):
    index = 0
    for url in urls:
        print "Downloading:", url
        try:
            res = urllib2.urlopen(url)
            # treat any 4xx status code as a failed download
            if str(res.getcode())[0] == "4":
                print "download failed!", url
                continue
        except Exception as e:
            print "download failed!", url
            continue
        filename = os.path.join(path, str(index) + ".jpg")
        urllib.urlretrieve(url, filename)
        index += 1


html = getHtml("https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%8A%AB%E8%90%A8&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=%E6%8A%AB%E8%90%A8&s=&se=&tab=&width=&height=&")

# print html
print(getImg(html))
savepath = "/spider_data"
downLoad(getImg(html),savepath)

The main difference from the static-site code is the format of the url being scraped: 1) for a static site you can use the address-bar url directly; 2) for an anti-crawling site, obtain the url as follows (see the sketch after the steps).

Open the Baidu Images page -> search for "披薩" -> Inspect Element -> open the Network panel -> filter the resource list for https://image.baidu.com/search -> right-click the acjson request and copy its link -> done!
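The copied acjson link is just a long templated query string. Here is a sketch of assembling one yourself; the parameter meanings are inferred (queryWord/word carry the keyword, pn the result offset, rn the page size), not documented by Baidu:

# coding=utf-8
import urllib

def build_acjson_url(keyword, pn=0, rn=30):
    # inferred parameters: queryWord/word = keyword, pn = offset, rn = page size
    params = urllib.urlencode({
        'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592',
        'fp': 'result', 'queryWord': keyword, 'cl': '2', 'lm': '-1',
        'ie': 'utf-8', 'oe': 'utf-8', 'st': '-1', 'ic': '0',
        'word': keyword, 'pn': str(pn), 'rn': str(rn),
    })
    return "https://image.baidu.com/search/acjson?" + params

print build_acjson_url("披薩", pn=0, rn=30)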


Some helper code was also added; for example, getImg(html) now fixes the format of the url list (removing the escaped backslashes).

This code does scrape, but many of the downloaded images will not open, and after running for a while it raises IOError: [Errno socket error] [Errno 10060]. Commenters suggest that adding a header and setting a timeout prevents this; see the next approach.
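Before the full rewrite, here is a minimal retrofit of the download step with both fixes; the Mozilla/5.0 header value and the 10-second timeout are choices, not requirements:

# coding=utf-8
import urllib2

def fetch_image(url):
    # browser-like header plus a timeout, so one dead host cannot hang the loop
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urllib2.urlopen(req, timeout=10).read()
    except Exception as e:
        print "download failed!", url, e
        return None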

3. Polished crawler code

#coding=utf-8
from urllib import quote
import urllib2 as urllib
import re
import os


class BaiduImage():

    def __init__(self, keyword, count=60, save_path="img", rn=60):
        # the original listing lost these default values; count=60 and rn=60
        # (results per acjson page) are assumed here
        self.keyword = keyword
        self.count = count
        self.save_path = save_path
        self.rn = rn

        self.__imageList = []
        self.__totleCount = 0

        self.__encodeKeyword = quote(self.keyword)
        self.__acJsonCount = self.__get_ac_json_count()

        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"
        self.headers = {'User-Agent': self.user_agent, "Upgrade-Insecure-Requests": "1",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                        "Accept-Encoding": "gzip, deflate, sdch",
                        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                        "Cache-Control": "no-cache"}
        # "Host" is filled in per image url in __save()

    def search(self):
        for i in range(0, self.__acJsonCount):
            url = self.__get_search_url(i * self.rn)
            response = self.__get_response(url).replace("\\", "")
            image_url_list = self.__pick_image_urls(response)
            self.__save(image_url_list)

    def __save(self, image_url_list, save_path=None):
        if save_path:
            self.save_path = save_path

        print "已經存儲 " + str(self.__totleCount) + "張"
        print "正在存儲 " + str(len(image_url_list)) + "張,存儲路徑:" + self.save_path

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        for image in image_url_list:
            host = self.get_url_host(image)
            self.headers["Host"] = host

            with open(self.save_path + "/%s.jpg" % self.__totleCount, "wb") as p:
                try:
                    req = urllib.Request(image, headers=self.headers)
                    # give urlopen a timeout: if a url is unreachable for 10 seconds,
                    # skip to the next address instead of hanging the whole run
                    img = urllib.urlopen(req, timeout=10)
                    p.write(img.read())
                    p.close()
                    self.__totleCount += 1
                except Exception as e:
                    print "Exception: " + str(e)
                    p.close()
                    # clean up the partially written file under save_path
                    if os.path.exists(self.save_path + "/%s.jpg" % self.__totleCount):
                        os.remove(self.save_path + "/%s.jpg" % self.__totleCount)

        print "已存儲 " + str(self.__totleCount) + " 張圖檔"

    def __pick_image_urls(self, response):
        reg = r'"ObjURL":"(http://img[0-9]\.imgtn.*?)"'
        imgre = re.compile(reg)
        imglist = re.findall(imgre, response)
        return imglist

    def __get_response(self, url):
        page = urllib.urlopen(url)
        return page.read()

    def __get_search_url(self, pn):
        return "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=" + self.__encodeKeyword + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=" + self.__encodeKeyword + "&s=&se=&tab=&width=&height=& + str(pn) + "&rn=" + str(self.rn) + "&gsm=1000000001e&1486375820481="

    def get_url_host(self, url):
        reg = r'http://(.*?)/'
        hostre = re.compile(reg)
        host = re.findall(hostre, url)
        if len(host) > 0:
            return host[0]
        return ""

    def __get_ac_json_count(self):
        # ceiling division: how many acjson pages cover self.count images
        a = self.count % self.rn
        c = self.count / self.rn
        if a:
            c += 1
        return c

The class is saved as BaiduImage.py and driven from a separate script:
# coding=utf-8
from BaiduImage import BaiduImage


keyword = "披薩"
save_path = "/spider_data_"

search = BaiduImage(keyword, save_path=save_path)
search.search()

This script avoids the errors described above, and all you have to do is type in a keyword to pull down the matching images, which is convenient! As for exactly why it works, I honestly can't say~

Still, the reason a keyword alone is enough is the regular structure of the url inside the JSON response: just substitute your keyword into the keyword fields of the url.
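So to fetch a different subject you only swap the percent-encoded keyword into the two keyword fields of the template, which is all the class's __get_search_url does. A sketch (the shortened template here is illustrative):

# coding=utf-8
from urllib import quote

# shortened, illustrative version of the acjson url template
template = ("http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
            "&queryWord=%s&word=%s&pn=0&rn=30")

keyword = quote("漢堡")  # percent-encode any keyword, then splice it in twice
print template % (keyword, keyword)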

References:

http://blog.csdn.net/u014015972/article/details/50541839

http://bbs.csdn.net/topics/390933778

http://www.jb51.net/article/105891.htm

http://blog.csdn.net/dodouaj/article/details/54908665