python3 crawler: analyzing Ajax to scrape Toutiao street snap (街拍) photos
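
The script below runs in two stages: it first pages through Toutiao's search_content Ajax endpoint, parses each JSON response for an image group's title and image URLs, and upserts those records into a local MongoDB collection; it then reads the records back and downloads every image into a per-title folder, using a multiprocessing pool for both stages.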

# coding=utf-8
from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException, Timeout
import json
from pymongo import MongoClient
from multiprocessing import Pool
import os
import string
from hashlib import md5


def get_response(url):
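    """Fetch a URL with a desktop-browser User-Agent; return the Response, or None on timeout."""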
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
        }
        # proxies = {'http':'118.11.2.3:8080'}
        response = requests.get(url, headers=headers, timeout=5)
        print(url + ' request success')
        return response
    except Timeout:
        print(url + ' request timeout')
        return None


def get_page_index(offset, keyword):
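    """Request one page of search results for keyword via the search_content Ajax endpoint."""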

    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from":"search_tab"
    }


    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    print(url)
    try:
        response = get_response(url)
        # get_response returns None on timeout, so guard before reading status_code
        if response and response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('request error')
        return None

def conn_mongodb():
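    """Return the 'jiepai' collection from the local MongoDB instance (localhost:27017)."""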
    client = MongoClient('localhost', 27017)
    db = client['jiepai']
    jiepai = db['jiepai']
    return jiepai

def save_image_url(data):
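    """Upsert one {'title': ..., 'images_list': [...]} record into MongoDB, keyed on title."""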
    jiepai = conn_mongodb()
    # Collection.update() is deprecated in modern PyMongo; update_one keeps the upsert-by-title behavior
    jiepai.update_one({'title': data.get('title')}, {'$set': data}, upsert=True)

def get_image_url():
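    """Return a cursor over all saved records, projecting only title and images_list."""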
    jiepai = conn_mongodb()
    data = jiepai.find({}, {'title': 1, 'images_list': 1, '_id': 0})
    return data


def download_image(item):
    """Download every image of one record into <script dir>/jiepai/<title>/."""
    base_dir = os.path.abspath(os.path.dirname(__file__))
    jiepai_dir = os.path.join(base_dir, 'jiepai')
    # exist_ok avoids a race when several worker processes create the folder at once
    os.makedirs(jiepai_dir, exist_ok=True)
    title = item.get('title')
    images_list = item.get('images_list')
    print(title)
    print('images_list', images_list)
    # strip punctuation so the title becomes a valid directory name
    file_name = str(title).strip(string.punctuation).replace('?', '')
    file_path = os.path.join(jiepai_dir, file_name)
    os.makedirs(file_path, exist_ok=True)
    for image_url in images_list:
        print(image_url)
        response = get_response(image_url)
        if response is None:
            continue  # skip images whose request timed out
        content = response.content
        # name the file by the MD5 of its bytes, which also deduplicates repeated images
        image_name = md5(content).hexdigest() + '.jpg'
        with open(os.path.join(file_path, image_name), 'wb') as f:
            f.write(content)
            print('download success')

def parse_page_index(html):
    """Parse one page of search-result JSON; save each group's title and image URLs."""
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            a_group_image_detail = {}
            images_list = []
            title = item.get('title')
            if title is None:
                continue  # a record without a title cannot be upserted by title
            a_group_image_detail['title'] = title
            # each result lists its images under 'image_detail', one dict per image
            images = item.get('image_detail')
            if images:
                for image in images:
                    images_list.append(image.get('url'))
            # deduplicate the URLs before saving
            a_group_image_detail['images_list'] = list(set(images_list))
            print(a_group_image_detail)
            save_image_url(a_group_image_detail)


def main(offset):
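    """Crawl one results page for the keyword 街拍 and persist the parsed image groups."""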

    html = get_page_index(offset, '街拍')
    if html:  # skip pages whose request failed or timed out
        parse_page_index(html)


if __name__ == "__main__":
    # 多程序爬取圖檔連結,并儲存到 Mongodb
    # groups = [x*20 for x in range(0,5)]
    # pool = Pool()
    # pool.map(main, groups)

    # 從 mongodb 中擷取連結,多程序下載下傳圖檔,并儲存
    data = get_image_url()
    datas = [item for item in data]

    pool = Pool()
    pool.map(download_image, data)
    # download_image()
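
To run the script end to end (with MongoDB listening on localhost:27017), first uncomment the Stage 1 block so the jiepai collection gets populated, then switch back to Stage 2 to download the images into a jiepai/ folder next to the script. Because pool.map calls download_image once per record, each image group is downloaded in its own worker process.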