# 天天看點
#
# 爬取妹子圖 — scrape photo albums from mzitu.com (originally bare text; must be comments to keep the file importable)

import os
import requests
from lxml import etree
import urllib.parse





def scarch(url):
    """Prompt for a search keyword, report the result page count, then
    download the user-selected page range of albums.

    Args:
        url: base search endpoint (e.g. "https://www.mzitu.com/search/");
            the URL-quoted keyword plus a trailing slash is appended.

    Relies on the module-level ``headers`` dict for every request and on
    ``get_taotu`` to download each album.  (Name kept as-is — "scarch" is
    a historical typo of "search" that callers already use.)
    """
    word = input('請輸入你想要的妹子類型……')
    keyword = urllib.parse.quote(word)
    url = url + keyword + "/"
    # First request: the search result page, e.g. https://www.mzitu.com/search/<kw>/
    response = requests.get(url=url, headers=headers)
    etrees = etree.HTML(response.text)

    # The 4th pagination link holds the total page count; the xpath yields
    # an empty list (hence empty string) when there is only one page.
    all_pageNum = "".join(etrees.xpath('//div[@class="nav-links"]/a[4]/text()'))
    # BUG FIX: original tested ``len(all_pageNum) < 0`` which is never true,
    # making the single-page branch unreachable.
    if not all_pageNum:
        print(f"你找的{word}類型的妹子圖檔共有1頁")
    else:
        print(f"你找的{word}類型的妹子圖檔共有{all_pageNum}頁")
    start_page = int(input("請輸入開始頁碼:"))
    end_page = int(input("請輸入結束頁碼:"))
    f = f"./{word}/"

    # Create the per-keyword download directory on first use.
    if not os.path.exists(f):
        os.mkdir(f)
        print(f"已為您預設建立目錄,目錄名稱為{word}")
    for i in range(start_page, end_page + 1):
        # Second request: one numbered result page,
        # e.g. https://www.mzitu.com/search/<keyword>/page/5/
        img_url = f"https://www.mzitu.com/search/{keyword}/page/{i}/"
        response = requests.get(url=img_url, headers=headers)
        etrees_page = etree.HTML(response.text)

        # Each <li> under ul#pins is one album preview.
        img_detail_li = etrees_page.xpath('//ul[@id="pins"]/li')
        get_taotu(img_detail_li, f)

# Download every image of every album found on a search-result page.
def get_taotu(img_detail_li, f):
    """Walk the album list and save each album's images to disk.

    Args:
        img_detail_li: list of <li> elements from the search result page;
            each carries an <a href> to the album and an <img alt> title.
        f: destination directory prefix (e.g. "./keyword/").

    Uses the module-level ``headers`` dict for every request.  Errors are
    best-effort: a missing page element or a network failure aborts the
    current batch with a message instead of crashing the whole run.
    """
    try:
        for li in img_detail_li:
            all_href = li.xpath('./a/@href')[0]            # album detail URL
            all_title = str(li.xpath('./a/img/@alt')[0])   # album title
            print(f"開始下載下傳套圖{all_title}")
            filepath = f + all_title
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            # Album detail page: its last pagination element carries the
            # number of images in the album.
            response = requests.get(url=all_href, headers=headers)
            etrees_page = etree.HTML(response.text)
            img_allNum = int(etrees_page.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
            print(f"目前頁共有{img_allNum}張圖檔")

            for v in range(1, img_allNum + 1):
                # Per-image page: <album url>/<index>
                img_bigurl = all_href + "/" + str(v)
                response = requests.get(url=img_bigurl, headers=headers)
                etrees_page = etree.HTML(response.text)
                # Image caption and the actual <img src> to download.
                img_name = str(etrees_page.xpath('/html/body/div[2]/div[1]/h2/text()')[0])
                img_url = str(etrees_page.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')[0])
                response = requests.get(url=img_url, headers=headers)
                print(f"開始下載下傳高清大圖------{img_name}")
                filename = filepath + "/" + str(v) + ".jpg"
                # BUG FIX: the original did ``with open(...) as f``, rebinding
                # the directory-prefix parameter ``f`` to a file object and
                # breaking ``f + all_title`` on the next album.
                with open(filename, "wb") as fh:
                    fh.write(response.content)
    # BUG FIX: was a bare ``except:`` that silently swallowed every error
    # (including genuine bugs) behind a "download finished" message; now
    # only the expected scrape/network failures are caught.
    except (IndexError, ValueError, requests.RequestException):
        print("這一套套圖下載下傳完畢。。。")


if __name__ == '__main__':
    # Base search endpoint; scarch() appends the user's quoted keyword.
    search_url = "https://www.mzitu.com/search/"

    # NOTE: ``headers`` must keep this exact name — scarch()/get_taotu()
    # read it as a module-level global.  Desktop UA plus a Referer pointing
    # at the site itself (the site rejects hotlink-style requests).
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
        'Referer': search_url,
    }

    scarch(search_url)