一、環境以及微網誌接口
環境:python3
微網誌接口:由於電腦端的反爬措施過於嚴密,所以我就選擇繞過電腦端,直接選擇手機端的接口。接口查找步驟如下:
1、使用谷歌瀏覽器進入微網誌,搜尋想要爬取的微網誌部落客:
然後點選進入微網誌首頁。
2、按F12,點選響應式設計模式(就是模拟手機模式)
選擇xhr,然後按F5重新整理,就會出現很多的接口。選擇如下圖這個接口:
選擇帶小齒輪和具有since_id特征的那個。因為since_id後面翻頁需要用到。
這樣微網誌接口就拿到了。
二、代碼實作:
import requests
import json
import jsonpath
from pyquery import PyQuery as pq
import xlwt
import time
# Desktop Firefox UA; the mobile endpoint (m.weibo.cn) accepts it and serves JSON.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
}
# Cutoff timestamp (2020-06-01 00:00:00 +0800): the scrape loop stops once it
# sees a post older than this.  NOTE(review): the original comment labelled
# this "start time", but it is used as the lower-bound stop condition below.
time_end = 1590940800.0
# 2020-01-01 00:00:00 +0800.  NOTE(review): defined but never read anywhere
# in this script -- confirm whether an upper-bound check was intended.
time_start = 1577808000.0
# Loop guard: stays positive while more pages remain to be fetched.
bool_s = 1
work_book = xlwt.Workbook(encoding='utf-8')
sheet = work_book.add_sheet('sheet1')
# Current spreadsheet row; row 0 is the header row.
y = 0
# Column headers: blogger, bio, following, followers, total posts, publish
# time, media type, likes, reposts, comments, is-repost flag, post text.
arr_list = ["部落客","簡介","關注數","粉絲數","總發帖數","釋出時間","标注","點贊數","轉發數","評論數","是否源于轉發","博文"]
# Write the header row in one pass instead of twelve copy-pasted calls.
for col, title in enumerate(arr_list):
    sheet.write(y, col, title)
# Pagination cursor; empty on the first request, then taken from the API
# response (e.g. 4456718664899043).
since_id = ''
s_id = '&since_id='
url = input("請輸入網址:")
y = y + 1
# Main scrape loop: fetch one index page per iteration, then each post's
# detail page, writing one spreadsheet row per post.
while bool_s > 0:
    # Example endpoint kept from the original author for reference:
    #url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=3598339130&containerid=1076033598339130&since_id='
    # Page URL = user-supplied base URL + the since_id pagination cursor.
    url_id = url +s_id+ str(since_id)
    print("網址一:",url_id)
    # NOTE(review): since_id is only ever '' or a value taken from the API,
    # so this `is False` guard looks unreachable -- confirm intent.
    if since_id is False:
        bool_s = -1
    response = requests.get(url_id,headers=headers)
    response.encoding = 'UTF-8'
    json_data = json.loads(response.text)
    # jsonpath expressions for the fields extracted below.
    # Blogger screen name
    json_url_name = '$.data.user.screen_name'
    # Bio / verified reason
    json_url_verified_reason = '$.data.user.verified_reason'
    # Following count
    json_url_follow = '$..follow_count'
    # Followers count
    json_url_followers = '$..followers_count'
    # Total number of posts
    json_url_total = '$..statuses_count'
    # Publish time
    json_url_time = '$.data.created_at'
    # Like count
    json_url_attitudes = '$.data.attitudes_count'
    # Repost count
    json_url_reposts = '$.data.reposts_count'
    # Comment count
    json_url_comments = '$.data.comments_count'
    # Pagination cursor (since_id) for the next page
    json_url_sign = '$..since_id'
    # Post key (bid), used to build each post's detail URL
    json_url_key = '$..mblog.bid'
    # Picture count -- used to classify the post as image/video/text
    json_url_type = '$.data.pic_num'
    # Secondary type probe: page_info is present for video posts
    json_url_type2 = '$.data.page_info'
    # Publish time (recursive search)
    json_url_time_two = '$..created_at'
    # retweeted_status is present when the post is a repost
    json_url_longText = '$.data.retweeted_status'
    # jsonpath.jsonpath returns False (not an empty list) when nothing matches.
    data_url_key = jsonpath.jsonpath(json_data,json_url_key)
    since = jsonpath.jsonpath(json_data, json_url_sign)
    try:
        if since is False:
            # No further pages: stop after processing this one.
            bool_s = -1
        else:
            since_id = since[0]
    except Exception as e:
        print("sinceid出錯",e)
    if data_url_key is False:
        # No posts on this page -- nothing left to scrape.
        bool_s = -1
        break
    for i in range(0,len(data_url_key)):
        # Detail endpoint for a single post.
        url_two = 'https://m.weibo.cn/statuses/show?id='+str(data_url_key[i])+'&display=0&retcode=6102'
        print(url_two)
        passage_response = requests.get(url_two,headers=headers)
        passage_response.encoding = 'UTF-8'
        try:
            json_passage = json.loads(passage_response.text)
        except Exception as e:
            # NOTE(review): on failure json_passage keeps its previous value,
            # so the row written below may duplicate the last good post.
            print("二層讀取博文報錯",e)
        # Post text (an HTML fragment; stripped to plain text with pyquery below)
        json_url_text = '$.data.text'
        data_date = jsonpath.jsonpath(json_passage,json_url_time_two)
        # Column 0: blogger name.
        data_name = jsonpath.jsonpath(json_passage,json_url_name)
        print(data_name)
        # NOTE(review): jsonpath returns a list here; confirm xlwt accepts a
        # list value -- columns 2-5 and 7-9 index [0] but 0 and 1 do not.
        sheet.write(y,0,data_name)
        # Column 1: bio.
        data_reason = jsonpath.jsonpath(json_passage,json_url_verified_reason)
        sheet.write(y,1,data_reason)
        # Column 2: following count.
        data_follow = jsonpath.jsonpath(json_passage,json_url_follow)
        sheet.write(y,2,data_follow[0])
        # Column 3: followers count.
        data_followers = jsonpath.jsonpath(json_passage,json_url_followers)
        sheet.write(y,3,data_followers[0])
        # Column 4: total post count.
        data_total = jsonpath.jsonpath(json_passage,json_url_total)
        sheet.write(y,4,data_total[0])
        # Column 5: publish time.  created_at presumably looks like
        # "Mon Jun 01 12:00:00 +0800 2020"; the slicing drops the "+0800 "
        # timezone field so strptime can parse it -- TODO confirm format.
        data_create_time = jsonpath.jsonpath(json_passage,json_url_time)
        times = time.mktime(time.strptime(data_date[0][0:20] + data_date[0][26:30], "%a %b %d %H:%M:%S %Y"))
        timeArray = time.localtime(times)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        print(otherStyleTime)
        sheet.write(y,5,otherStyleTime)
        # Column 6: media type -- "image" if pic_num > 0, else "video" when
        # page_info exists, otherwise plain "text".
        data_type = jsonpath.jsonpath(json_passage,json_url_type)
        if int(data_type[0]) > 0:
            sheet.write(y,6,"圖檔")
        else:
            data_info = jsonpath.jsonpath(json_passage,json_url_type2)
            if data_info is False:
                sheet.write(y,6,"文字")
            else:
                sheet.write(y,6,"視訊")
        # sheet.write(y,6,data_type[0])
        # Column 7: like count.
        data_a = jsonpath.jsonpath(json_passage,json_url_attitudes)
        sheet.write(y,7,data_a[0])
        # Column 8: repost count.
        data_re = jsonpath.jsonpath(json_passage,json_url_reposts)
        sheet.write(y,8,data_re[0])
        # Column 9: comment count.
        data_com = jsonpath.jsonpath(json_passage,json_url_comments)
        sheet.write(y,9,data_com[0])
        # Column 10: repost flag -- retweeted_status present means it is a repost.
        data_status = jsonpath.jsonpath(json_passage,json_url_longText)
        if data_status is False:
            sheet.write(y,10,"否")
        else:
            sheet.write(y,10,"是")
        # Column 11: plain-text body extracted from the HTML fragment.
        data_text = jsonpath.jsonpath(json_passage,json_url_text)
        try:
            data_text_ex = pq(data_text[0]).text()
            sheet.write(y,11,data_text_ex)
        except Exception as e:
            print("寫入錯誤",e)
        y = y + 1
        # Throttle: one post per second, to stay under anti-scraping limits.
        time.sleep(1)
        # NOTE(review): this debug print slices [25:30] while the cutoff check
        # below slices [26:30]; both parse (strptime tolerates extra
        # whitespace), but presumably one of them is a typo -- confirm.
        print(time.mktime(time.strptime(data_date[0][0:20] + data_date[0][25:30], "%a %b %d %H:%M:%S %Y")),data_date[0][0:20] + data_date[0][26:30])
        # Stop once a post older than the cutoff (time_end) is seen; the feed
        # presumably arrives newest-first, so later posts are older still.
        if time.mktime(time.strptime(data_date[0][0:20] + data_date[0][26:30], "%a %b %d %H:%M:%S %Y")) < time_end:
            bool_s = -1
            break
    print("第{}行爬取完成".format(y))
    # Pause between index pages.
    time.sleep(3)
# Save the workbook named after the last blogger scraped.
# NOTE(review): data_name is unbound if the inner loop never ran -- this
# would raise NameError on an empty first page.
work_book.save('{}.xls'.format(data_name[0]))
print("爬取完成!!!")
三、成果展示:
爬取的字段可以自己增加或者進行删減。
轉載請注明出處