
Crawling a Tieba User Relationship Network with Python

import requests
import re
import json
import time
from lxml import etree
import dill

# Global state
tieba_prefix = "http://tieba.baidu.com"
userdict = {}

# Parameter holder, used as a simple namespace of class attributes
class para:
    headers = None
    cookies = None
    max_loop = None
    max_page = None
    max_num = None

# User information class
class userinfo(object):
    def __init__(self, url):
        self.url = url
        self.id = None
        self.username = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_num = None
        self.concern_url = None
        self.concern_list = []
        self.fans_num = None
        self.fans_url = None
        self.fans_list = []

    # Save this user's scalar fields to a file
    def saveToFile(self):
        dictObj = {
            "url": self.url,
            "id": self.id,
            "username": self.username,
            "age": self.age,
            "tie": self.tie,
            "sex": self.sex,
            "concern_num": self.concern_num,
            "concern_url": self.concern_url,
            "fans_num": self.fans_num,
            "fans_url": self.fans_url
        }
        # Append one JSON object per line; 'users.json' is an assumed output path
        with open('users.json', 'a', encoding='utf-8') as f:
            json.dump(dictObj, f, ensure_ascii=False)
            f.write('\n')
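The two classes above carry the whole relationship network: each userinfo instance is a node, its concern_list and fans_list hold references to neighbouring userinfo instances, and the global userdict indexes nodes by their portrait id. A minimal illustration of how two users end up linked; the ids, names and URLs here are made-up placeholders, the real ones are filled in by the crawl below:

# Hypothetical illustration only; real values come from the crawler
a = userinfo('http://tieba.baidu.com/home/main?id=tb.1.aaa')
b = userinfo('http://tieba.baidu.com/home/main?id=tb.1.bbb')
a.id, a.username = 'tb.1.aaa', 'user_a'
b.id, b.username = 'tb.1.bbb', 'user_b'
userdict[a.id] = a
userdict[b.id] = b
a.concern_list.append(b)  # a follows b ...
b.fans_list.append(a)     # ... so a appears in b's fans list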

# Fetch a URL and return the parsed HTML tree, or None on failure
def getHtmlFromUrl(url, loop_info):
    response = requests.get(url, headers=para.headers, cookies=para.cookies)
    print('当前页面:' + url)
    print(loop_info)
    if response.status_code == 200:
        # Error page: "很抱歉,您要访问的页面不存在。" (the page does not exist)
        if response.url == 'http://static.tieba.baidu.com/tb/error.html?ErrType=1':
            data = response.content.decode('gbk')  # this error page is gbk-encoded
            html = etree.HTML(data)
            result = html.xpath('//div[@id="errorText"]/p/text()')
            if len(result) > 0:
                print(result[0])
            else:
                print('获取错误消息失败')
            return
        data = response.content.decode('utf-8')
        html = etree.HTML(data)
        # Error page: "抱歉,您访问的用户已被屏蔽。" (the user has been blocked)
        if response.url == 'http://tieba.baidu.com/tb/static-ucenter/html/error.html':
            result = html.xpath('//div[@id="errorText"]/p/text()')
            if len(result) > 0:
                print(result[0])
            else:
                print('获取错误消息失败')
            return
        # Normal result
        return html
    else:
        print('页面获取失败')
        print(response.status_code)
        print(response.history)

# Scrape one user's profile page and recurse into their concern/fans pages
def get_concern_info(html, user, id, loop_info):
    # Resolve the portrait id if it was not passed in
    if id == '':
        result = html.xpath('//a[@class="nav_icon nav_main"]/@href')[0]
        matchObj = re.search(r'.*?id=(tb.*)', result)
        if matchObj:
            id = matchObj.group(1)
        else:
            print("id No match!!")
            return
    # Username
    username = html.xpath(
        '//span[starts-with(@class,"userinfo_username ")]/text()')[0]
    # Tieba age in years
    result = html.xpath(
        '//div[@class="userinfo_userdata"]/span[2]/text()')[0][3:-1]
    age = float(result)
    # Post count
    result = html.xpath(
        '//div[@class="userinfo_userdata"]/span[4]/text()')[0][3:]
    # Counts above ten thousand are shown as a decimal followed by '万'
    if result[-1] == '万':
        tie = int(float(result[0:-1]) * 10000)
    else:
        tie = int(result)
    # Gender
    sex = html.xpath(
        '//div[@class="userinfo_userdata"]/span[1]/@class')[0][26:]
    # These blocks may be missing from the page, so default everything to None
    concern_num = None
    concern_url = None
    fans_num = None
    fans_url = None
    # Number of users this user follows
    result = html.xpath(
        '//ul[@id="concern_wrap_concern"]/..//span[@class="concern_num"]/a/text()'
    )
    if len(result) > 0:
        concern_num = result[0]
        # Concern (following) page
        result = html.xpath(
            '//ul[@id="concern_wrap_concern"]/..//span[@class="concern_num"]/a/@href'
        )
        concern_url = tieba_prefix + result[0]
    # Number of fans
    result = html.xpath(
        '//ul[@id="concern_wrap_fans"]/..//span[@class="concern_num"]/a/text()'
    )
    if len(result) > 0:
        fans_num = result[0]
        # Fans page
        result = html.xpath(
            '//ul[@id="concern_wrap_fans"]/..//span[@class="concern_num"]/a/@href'
        )
        fans_url = tieba_prefix + result[0]
    # Fill in the user record
    user.id = id
    user.username = username
    user.age = age
    user.tie = tie
    user.sex = sex
    user.concern_num = concern_num
    user.concern_url = concern_url
    user.fans_num = fans_num
    user.fans_url = fans_url
    # Optional real-time export of the user record
    #user.saveToFile()
    # Register the user so it is not crawled again
    userdict[id] = user
    print('加入用户:' + username)
    # Depth limit check
    if loop_info['Node'] <= para.max_loop:
        # Go one level deeper
        loop_info['Node'] = loop_info['Node'] + 1
        # Iterate over the concern (following) pages
        if concern_url is not None:
            loop_info['Origin'] = username + " 关注页"
            loop_concern(concern_url, loop_info, user)
        # Iterate over the fans pages
        if fans_url is not None:
            loop_info['Origin'] = username + " 粉丝页"
            loop_concern(fans_url, loop_info, user)
        # Restore the depth counter when unwinding so sibling branches see the right level
        loop_info['Node'] = loop_info['Node'] - 1
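The profile parsing above relies on fixed slice offsets into the label text and the gender span's class attribute. The snippet below shows what those slices assume the raw values look like; the sample strings are inferred from the offsets rather than captured from a live page:

# Illustrative values only, inferred from the slice offsets used above
age_text = '吧龄:5.2年'
print(age_text[3:-1])   # '5.2'  -> drops the 3-character prefix and the trailing '年'
tie_text = '发贴:1.6万'
print(tie_text[3:])     # '1.6万' -> drops the 3-character prefix
sex_class = 'userinfo_sex userinfo_sex_male'
print(sex_class[26:])   # 'male' -> keeps only the trailing male/female token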

# Walk the pages of a concern/fans list (Baidu only exposes the first 500 entries)
def loop_concern(url, loop_info, user):
    # Page counter; kept locally because loop_info is shared with recursive calls
    page = 1
    loop_info['Page'] = page
    while True:
        # Fetch and process the current page
        html = getHtmlFromUrl(url, loop_info)
        # Stop if the current page could not be fetched
        if html is None:
            break
        get_concern(html, loop_info, user)
        # Stop once the page limit is reached
        if page >= para.max_page:
            break
        page = page + 1
        loop_info['Page'] = page
        # URL of the next page
        result = html.xpath(
            '//div[@class="pager pager-center"]/a[@class="next"]/@href')
        if len(result) > 0:
            url = tieba_prefix + result[0]
        else:
            # Last page reached
            break

# Extract the users listed on one concern/fans page
def get_concern(html, loop_info, user):
    # Per-page counter; kept locally because loop_info is shared with recursive calls
    num = 0
    loop_info['Num'] = num
    # Remember which list this page belongs to, since the recursion below
    # overwrites loop_info['Origin']
    origin = loop_info['Origin']
    pageIdList = html.xpath('//div[@class="user"]/@portrait')
    pageUrlList = html.xpath('//span[@class="name"]/a/@href')
    # pageUrlList is assumed to have the same length as pageIdList
    for i in range(len(pageIdList)):
        u_id = pageIdList[i]
        u_url = tieba_prefix + pageUrlList[i]
        # Stop once the per-page limit is reached
        if num >= para.max_num:
            break
        num = num + 1
        loop_info['Num'] = num
        # Users already fetched are not crawled again
        if u_id not in userdict:
            u_html = getHtmlFromUrl(u_url, loop_info)
            # Skip this user if the page could not be fetched
            if u_html is None:
                continue
            # Build the sub-user record
            sub_user = userinfo(u_url)
            get_concern_info(u_html, sub_user, u_id, loop_info)
        # Add the user to this user's concern/fans list
        if origin[-3:] == '关注页':
            user.concern_list.append(userdict[u_id])
        elif origin[-3:] == '粉丝页':
            user.fans_list.append(userdict[u_id])

def main(max_loop, max_page, max_num, origin_url):
    # Maximum recursion depth
    para.max_loop = max_loop
    # Maximum number of pages per concern/fans list
    para.max_page = max_page
    # Maximum users taken from each page (a page shows at most 20)
    para.max_num = max_num
    # Request headers
    para.headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }
    # Load the saved cookies file and build the cookies dict
    para.cookies = {}
    with open(r'cookies.txt', 'r') as f:
        for line in f.read().split(';'):
            # maxsplit=1 splits each name=value pair only at the first '='
            name, value = line.strip().split('=', 1)
            para.cookies[name] = value
    # Initialize the loop bookkeeping
    loop_info = {'Node': 0, 'Page': 0, 'Num': 0, 'Origin': ''}
    # Root user record
    user = userinfo(origin_url)
    # Fetch the first user's page
    html = getHtmlFromUrl(origin_url, loop_info)
    if html is None:
        print("原始输入错误")
        return
    # Scrape the root user and recurse through the network
    get_concern_info(html, user, '', loop_info)
    return userdict
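main() expects a cookies.txt file next to the script holding the Cookie header of a logged-in Tieba session: a single line of semicolon-separated name=value pairs. A hypothetical example (BDUSS/STOKEN are typical Baidu login cookie names; the values here are placeholders):

BDUSS=0123456789abcdef; STOKEN=fedcba9876543210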

if __name__ == '__main__':
    origin_url = '贴吧用户url'
    main(2, 10, 2, origin_url)
    # Save the whole session (including userdict) with dill
    filename = r"crawler_data.pkl"
    dill.dump_session(filename)
    print("完成!!")