#encoding=utf8
import requests
from lxml import etree
class QiuShi(object):
    """Scrape the text-joke listing pages of qiushibaike.com.

    For every joke found on a listing page one row is printed, holding the
    values in the same order as the header printed by ``__init__``:
    author, gender, age, joke text, "funny" count, comment count.
    """

    # Request headers sent with every page download.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    }
    # Base URL of the listing; page URLs are built by appending to it.
    url = 'http://www.qiushibaike.com/text/'

    # Default inclusive page range crawled by totalUrl().
    # NOTE(review): the numeric bounds of the original range() call were not
    # recoverable from the source; 1..13 is an assumption -- TODO confirm.
    first_page = 1
    last_page = 13

    # Seconds to wait on the server before a request is abandoned, so that a
    # stalled connection cannot hang the crawl forever.
    timeout = 10

    def __init__(self):
        """Print the column header describing the rows produced later."""
        header = ['作者', '性别', '年齡', '段子内容', '好笑', '評論']
        print(header)

    def _page_urls(self, first_page, last_page):
        """Return the listing URLs for pages first_page..last_page inclusive.

        An empty list is returned when last_page is smaller than first_page.
        """
        return [
            self.url + 'page/{}?s=4985075'.format(page)
            for page in range(first_page, last_page + 1)
        ]

    def totalUrl(self, first_page=None, last_page=None):
        """Crawl every listing page in the requested range.

        Args:
            first_page: first page number to fetch; defaults to
                ``self.first_page``.
            last_page: last page number to fetch (inclusive); defaults to
                ``self.last_page``.
        """
        if first_page is None:
            first_page = self.first_page
        if last_page is None:
            last_page = self.last_page
        for url in self._page_urls(first_page, last_page):
            # The last path segment ("<page>?s=...") identifies the page.
            print(u'正在擷取:' + url.split('/')[-1] + u'頁')
            self.getInfo(url)

    @staticmethod
    def _first(nodes, default=u''):
        """Return the first item of an XPath result list, or *default*."""
        return nodes[0] if nodes else default

    def _parse_article(self, info):
        """Build one output row from a single article element.

        Args:
            info: element exposing ``xpath()`` for one joke block.

        Returns:
            ``[author, gender, age, content, funny, comments]``.
        """
        authors = info.xpath('div[1]/a[2]/h2/text()')
        women = info.xpath(
            'div[1]/div[@class="articleGender womenIcon"]/text()')
        men = info.xpath(
            'div[1]/div[@class="articleGender manIcon"]/text()')
        if authors and women:
            author, gender, age = authors[0], u'女', women[0]
        elif authors and men:
            author, gender, age = authors[0], u'男', men[0]
        else:
            # No author heading or no gender badge: report the post as
            # anonymous with unknown gender and age.
            author, gender, age = u'匿名使用者', u'不詳', u'不詳'

        # A missing node yields an empty string instead of aborting the
        # whole crawl with an IndexError.
        content = self._first(info.xpath('a/div/span/text()')).strip()
        funny = self._first(info.xpath('div[2]/span[1]/i/text()'))
        # Search relative to this article; an absolute '//' path on the whole
        # document would return the first comment count of the page for
        # every row.
        comments = self._first(
            info.xpath('.//*[@class="qiushi_comments"]/i/text()'))
        return [author, gender, age, content, funny, comments]

    def getInfo(self, url):
        """Download one listing page and print a row for each joke on it.

        Args:
            url: address of the listing page to fetch.

        Returns:
            The list of rows that were printed (one list per joke).

        Raises:
            requests.exceptions.RequestException: if the download fails or
                exceeds ``self.timeout`` seconds.
        """
        html = requests.get(
            url, headers=self.headers, timeout=self.timeout).text
        data = etree.HTML(html)
        rows = []
        for info in data.xpath('//*[@class="article block untagged mb15"]'):
            row = self._parse_article(info)
            print(row)
            rows.append(row)
        return rows
if __name__ == '__main__':
    # Script entry point: create the scraper, then start the crawl.
    scraper = QiuShi()
    scraper.totalUrl()