First, analyze the request headers of the street-photography gallery pages:
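As the search results page scrolls, the gallery index is loaded through an Ajax interface. In the browser's Network panel the request looks roughly like this (the parameter values are illustrative and mirror the ones built in step 2):

https://www.toutiao.com/search_content/?offset=0&format=json&keyword=街拍&autoload=true&count=20&cur_tab=3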

Having located the image URLs, we can now write the code:
1. Import the required libraries:
import requests
import json
import re
import pymongo
import os
from hashlib import md5
from multiprocessing import Pool
from json.decoder import JSONDecodeError
from requests.exceptions import RequestException
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from config import *  # MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORD, GROUP_START, GROUP_END (defined in step 4)
2. Fetch and parse the index page:
def get_page_index(offset, keyword):
    # Build the Ajax query for one page of the gallery index
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the index page')
        return None

def parse_page_index(text):
    # The index response is JSON; each item's article_url points to a gallery detail page
    if not text:
        return
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass
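Put together, the index step works roughly like this (the keyword is the search term, here assumed to be '街拍' as configured in step 4):

text = get_page_index(0, '街拍')       # first page of results
for article_url in parse_page_index(text):
    print(article_url)                 # detail-page URLs to be crawled in step 3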
3. Fetch and parse the detail page:
def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the detail page')
        return None

def parse_page_detail(html, url):
    # The title comes from the <title> tag; the image list is embedded in a
    # JavaScript "gallery" variable inside the page source
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    images_pattern = re.compile('gallery: (.*?),\n', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_images(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }
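The regex assumes the detail page embeds the image list as a JavaScript literal named gallery. A self-contained illustration of what the pattern extracts (the HTML snippet is made up for the example; re and json are imported in step 1):

sample_html = 'var data = { gallery: {"sub_images": [{"url": "http://example.com/a.jpg"}]},\n };'
match = re.search(re.compile('gallery: (.*?),\n', re.S), sample_html)
print(json.loads(match.group(1))['sub_images'])   # [{'url': 'http://example.com/a.jpg'}]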
4. Store the data in MongoDB:
First, define a config.py file to hold the default settings:
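The contents of config.py are not shown here; a minimal sketch, assuming the constant names used by the rest of the code, could look like this:

# config.py -- default settings (names inferred from how they are used below)
MONGO_URL = 'localhost'      # MongoDB connection address (assumed)
MONGO_DB = 'toutiao'         # database name (assumed)
MONGO_TABLE = 'jiepai'       # collection name (assumed)
KEYWORD = '街拍'             # search keyword
GROUP_START = 1              # first page group to crawl
GROUP_END = 20               # last page group to crawl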
Write the records to MongoDB:
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def save_to_mongo(result):
    # insert_one replaces the deprecated insert() in recent pymongo versions
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB', result)
        return True
    return False
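To confirm that records are being written, a quick check from a separate Python shell (using the same config.py settings) might look like this:

import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
print(client[MONGO_DB][MONGO_TABLE].find_one())  # prints one stored gallery document, or None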
5. Save the images locally:
def download_images(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # response.content holds the raw image bytes
            save_images(response.content)
        return None
    except RequestException:
        print('Error requesting the image')
        return None

def save_images(content):
    # Name the file by the MD5 hash of its bytes, so duplicate images are written only once
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
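A quick standalone sanity check for these two functions (the URL below is a placeholder, not a real gallery image):

download_images('https://example.com/sample.jpg')  # hypothetical URL; replace with a real image address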
6. Finally, define the main() function and crawl 20 pages of galleries with a process pool:
def main(offset):
    # Fetch one index page, then crawl and store every gallery it links to
    text = get_page_index(offset, KEYWORD)
    for url in parse_page_index(text):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)

if __name__ == '__main__':
    # Each offset (20, 40, ...) selects one page of search results
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
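With the assumed config values GROUP_START = 1 and GROUP_END = 20, the offsets handed to main() cover 20 pages of 20 items each:

groups = [x * 20 for x in range(1, 21)]   # [20, 40, 60, ..., 400]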
Full code on GitHub:
https://github.com/weixuqin/PythonProjects/tree/master/jiepai