天天看點

python3 [入門基礎實戰] 爬蟲入門之爬取糗事百科

#encoding=utf8
import requests
from lxml import etree


class QiuShi(object):
    """Scrape the text-joke section of qiushibaike.com and print one row per joke.

    Each row is a list in the column order printed by ``__init__``:
    author, gender, age, joke text, "funny" votes, comment count.

    NOTE(review): the numeric literals of this class (page range, list
    indexes, row keys) were missing from the copy this code was restored
    from. Indexes are restored as "first match" / "last path segment"; the
    default page range 1..35 is an assumption -- confirm against the site.
    """

    # Browser-like User-Agent sent with every request.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    }

    # Base URL of the text-joke section; page URLs are built on top of it.
    url = 'http://www.qiushibaike.com/text/'

    # Seconds to wait for the server before requests gives up; without a
    # timeout a stalled connection would block the scraper forever.
    timeout = 10

    # Placeholders used when a field is absent from an article.
    _ANONYMOUS = u'匿名使用者'
    _UNKNOWN = u'不詳'

    # (CSS class of the gender badge, label stored in the row). The badge's
    # text node is used as the author's age.
    _GENDER_ICONS = (
        ('articleGender womenIcon', u'女'),
        ('articleGender manIcon', u'男'),
    )

    def __init__(self):
        """Print the column headings that the emitted rows follow."""
        # CSV output is not wired up; rows are only printed for now.
        filed = ['作者','性别','年齡','段子内容','好笑','評論']
        print(filed)

    def totalUrl(self, first_page=1, last_page=35):
        """Visit every listing page from ``first_page`` to ``last_page`` inclusive.

        For each page the URL is built, a progress line is printed and the
        page is handed to ``getInfo``.

        Args:
            first_page: number of the first listing page to fetch.
            last_page: number of the last listing page to fetch (inclusive).
        """
        for page in range(first_page, last_page + 1):
            page_url = self.url + 'page/{}?s=4985075'.format(page)
            # The last path segment (page number plus query string) is
            # shown as the progress label.
            print(u'正在擷取:' + page_url.split('/')[-1] + u'頁')
            self.getInfo(page_url)

    def getInfo(self, url):
        """Download one listing page and print a row for every article on it.

        Args:
            url: absolute URL of the listing page.

        Raises:
            requests.RequestException: propagated from ``requests.get`` when
                the download fails or exceeds ``timeout``.
        """
        html = requests.get(url, headers=self.headers, timeout=self.timeout).text
        data = etree.HTML(html)
        if data is None:
            # lxml yields no tree for an empty/unparseable body.
            return

        infos = data.xpath('//*[@class="article block untagged mb15"]')
        print(infos)

        for info in infos:
            print(self._parse_article(info))

    @staticmethod
    def _first_text(node, expr, default):
        """Return the first result of ``node.xpath(expr)``, or ``default`` if none."""
        matches = node.xpath(expr)
        return matches[0] if matches else default

    @classmethod
    def _parse_article(cls, info):
        """Extract one row from an article node.

        Args:
            info: any object exposing ``xpath(expr) -> list`` (an lxml element).

        Returns:
            ``[author, gender, age, content, likes, comments]``. Missing
            author becomes the anonymous placeholder, missing content the
            empty string, every other missing field the "unknown" marker.
        """
        author = cls._first_text(info, 'div[1]/a[2]/h2/text()', cls._ANONYMOUS)

        # Gender is inferred from which badge is present; a missing badge no
        # longer discards an author name that was found.
        gender = age = cls._UNKNOWN
        for icon_class, label in cls._GENDER_ICONS:
            badge = info.xpath('div[1]/div[@class="{}"]/text()'.format(icon_class))
            if badge:
                gender, age = label, badge[0]
                break

        content = cls._first_text(info, 'a/div/span/text()', '').strip()
        likes = cls._first_text(info, 'div[2]/span[1]/i/text()', cls._UNKNOWN)
        # Search relative to this article: a document-wide query would give
        # every row the first article's comment count.
        comments = cls._first_text(
            info, './/*[@class="qiushi_comments"]/i/text()', cls._UNKNOWN)

        return [author, gender, age, content, likes, comments]

if __name__ == '__main__':
    # Script entry point: build the scraper and crawl every listing page.
    # Guarded so that importing this module does not start a crawl.
    QiuShi().totalUrl()