
Crawling all the images from a website

Website: http://pic.netbian.com

The complete script (Python 2):
# -*- coding: utf-8 -*-
# Crawl wallpaper images from http://pic.netbian.com (Python 2)
import os
import re
import sys
import urllib2

import requests

reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 default-encoding workaround

num = 1  # global counter used to name the downloaded files
headers = {
    'Referer': 'http://pic.netbian.com/e/search/result/?searchid=1224',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
def down_img(url, root):
    """Download every image linked from one listing page into Pic/<root>/."""
    global num
    # Fetch the listing page and collect the links to the per-image detail pages.
    response = requests.get(url, headers=headers)
    pic = response.text

    reg = re.compile(r'a href="(/tupian/\w+\.html)" target="_blank"')
    file_pic = re.findall(reg, pic)

    # Create Pic/ and the per-category subdirectory on the first run.
    if not os.path.exists("Pic"):
        os.makedirs("Pic")
    if not os.path.exists("Pic/" + root):
        os.makedirs("Pic/" + root)

    for i in file_pic:
        # Open the detail page and pull out the full-size image path (/uploads/...jpg).
        img_url = "http://pic.netbian.com" + i
        response = requests.get(img_url, headers=headers)
        pic_text = response.text
        reg = re.compile(r'/uploads.*?\.jpg')
        file_address = re.findall(reg, pic_text)
        if not file_address:
            continue
        img_address = "http://pic.netbian.com" + file_address[0]
        #print img_address
        # Download the image and save it as Pic/<root>/<num>.jpg.
        request = urllib2.Request(url=img_address, headers=headers)
        response = urllib2.urlopen(request)
        new_name = "Pic/" + root + "/" + "%d" % num + ".jpg"
        print new_name
        if not os.path.exists(new_name):
            with open(new_name, "wb") as f:
                f.write(response.read())
        num += 1
 
type_pic = ["4kfengjing", "4kyouxi", "4kmeinv", "4kdongman", "4kyingshi", "4kmingxing",
            "4kqiche", "4kdongwu", "4krenwu", "4kmeishi", "4kzongjiao", "4kbeijing"]
type_img = int(raw_input("Please choose a category to download:\n"
                         "1. Landscape\n2. Games\n3. Beauties\n4. Anime\n"
                         "5. Movies & TV\n6. Celebrities\n7. Cars\n8. Animals\n"
                         "9. People\n10. Food\n11. Religion\n12. Backgrounds\n"))
page = int(raw_input("Enter the number of pages to download (>= 1):\n"))
for i in range(1, page + 1):  # number of listing pages to crawl; adjust as needed
    url = ("http://pic.netbian.com/%s/index_" % type_pic[type_img - 1]) + str(i) + ".html"
    if "_1.html" in url:
        # The first listing page is index.html rather than index_1.html.
        url = url.replace("_1.html", ".html")
    print url
    down_img(url, type_pic[type_img - 1])

print "finish!\n"
           

Note: first choose a download category, then enter the number of pages to download (page >= 1), and the download starts.
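
For instance, a run might look like this (the script name and the chosen inputs are only illustrative; the printed file names continue for all images on each page):

$ python netbian_spider.py
Please choose a category to download:
1. Landscape
2. Games
...
12. Backgrounds
1
Enter the number of pages to download (>= 1):
2
http://pic.netbian.com/4kfengjing/index.html
Pic/4kfengjing/1.jpg
Pic/4kfengjing/2.jpg
...
http://pic.netbian.com/4kfengjing/index_2.html
Pic/4kfengjing/22.jpg
...
finish!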

On the first run, a Pic directory is created next to the script, and a second-level subdirectory is then created for the chosen download category.
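
With the landscape category selected, for example, the layout after a run looks roughly like this (the folder name comes from the type_pic list and the file names from the global num counter):

Pic/
    4kfengjing/
        1.jpg
        2.jpg
        ...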

Images of a given category that have already been downloaded are not downloaded a second time. Each listing page holds 21 images.
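
The script above targets Python 2 (raw_input, urllib2, print statements). For readers on Python 3, a minimal sketch of the same listing-page -> detail-page -> image pipeline using only requests might look like the following; it keeps the same regular expressions and directory layout, while the function name download_page and the explicit counter argument are my own choices, not part of the original script.

import os
import re

import requests

BASE = "http://pic.netbian.com"
HEADERS = {
    'Referer': 'http://pic.netbian.com/e/search/result/?searchid=1224',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}

def download_page(url, root, num):
    """Fetch one listing page, save every linked image, return the updated counter."""
    listing = requests.get(url, headers=HEADERS).text
    detail_links = re.findall(r'a href="(/tupian/\w+\.html)" target="_blank"', listing)

    folder = os.path.join("Pic", root)
    os.makedirs(folder, exist_ok=True)          # replaces the two exists()/makedirs() checks

    for link in detail_links:
        detail = requests.get(BASE + link, headers=HEADERS).text
        uploads = re.findall(r'/uploads.*?\.jpg', detail)
        if not uploads:                         # skip detail pages without a .jpg path
            continue
        name = os.path.join(folder, "%d.jpg" % num)
        if not os.path.exists(name):            # do not re-download an existing file
            img = requests.get(BASE + uploads[0], headers=HEADERS).content
            with open(name, "wb") as f:
                f.write(img)
        print(name)
        num += 1
    return num

Driving it over several pages mirrors the loop in the original script: request index.html for page 1 and index_2.html onward, passing the returned counter back in so file names keep increasing across pages.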