天天看點

簡易爬蟲代碼實作——基於 Python 2.7

簡易爬蟲代碼實作——基於 Python 2.7

# -*- coding:utf-8 -*-

import os
import time
import urllib
import urllib2

class Tiebaspider(object):
    """Download a range of listing pages of one Baidu Tieba forum to disk.

    Each page is fetched over HTTP and saved under the ``tieba/`` directory
    (relative to the current working directory) as ``<page>頁.html``.
    Targets Python 2.7 (``urllib2`` / ``urllib.urlencode``).
    """

    def __init__(self, tieba_name, start_page, end_page):
        """Store the crawl settings.

        Args:
            tieba_name: Forum name, sent as the ``kw`` query parameter.
            start_page: First page number to fetch (inclusive).
            end_page: Last page number to fetch (inclusive).
        """
        self.base_url = 'https://tieba.baidu.com/f?'
        self.name = tieba_name
        self.start = start_page
        self.end = end_page
        # NOTE(review): the run of spaces inside the User-Agent value looks
        # like a leftover from a backslash line continuation; it is kept
        # unchanged because the header is sent verbatim.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)             AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
            'Connection': 'keep-alive',
        }

    # Send the request
    def send_request(self, url):
        """Fetch ``url`` and return the response body.

        Sleeps two seconds before every request to throttle the crawl.

        Args:
            url: Fully built URL to request.

        Returns:
            The raw response body when the status code is 200; ``None`` when
            the status is anything else or when any exception occurred (the
            exception is printed, not raised).
        """
        time.sleep(2)
        response = None
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            if response.code == 200:
                return response.read()
        except Exception as e:
            # Best-effort crawl: report the failure and let the caller
            # decide what to do with the missing page.
            print(e)
        finally:
            # Always release the connection, including on the success path.
            if response is not None:
                response.close()
        return None

    # Save the downloaded page
    def write_data(self, data, page):
        """Write one downloaded page to ``tieba/<page>頁.html``.

        The target directory is created when it does not exist yet.

        Args:
            data: Raw page content (bytes) as returned by ``send_request``.
            page: Page number, used to build the file name.
        """
        filename = 'tieba/' + str(page) + '頁.html'
        print('%s正在下載下傳...' % filename)
        directory = os.path.dirname(filename)
        if not os.path.isdir(directory):
            os.makedirs(directory)
        # Binary mode: the body is raw bytes and must be stored unmodified
        # (text mode would rewrite newline bytes on Windows).
        with open(filename, 'wb') as f:
            f.write(data)

    # Scheduling method
    def start_work(self):
        """Fetch and save every page from ``self.start`` to ``self.end``.

        Pages whose download failed (``send_request`` returned ``None``) are
        skipped so that one bad page does not abort the whole run.
        """
        for page in range(self.start, self.end + 1):
            # Presumably the thread offset at 50 threads per listing page.
            pn = (page - 1) * 50
            params = {'kw': self.name, 'pn': pn}
            # URL-encode the dict and append it to base_url
            params_str = urllib.urlencode(params)
            url = self.base_url + params_str
            data = self.send_request(url)
            if data is None:
                # Nothing to write for this page.
                continue
            self.write_data(data, page)

if __name__ == '__main__':
    # Ask for the forum name and the inclusive page range, then run the
    # crawl with those settings.
    forum = raw_input('請輸入貼吧名字:')
    first = int(raw_input('開始頁:'))
    last = int(raw_input('結束頁:'))
    Tiebaspider(forum, first, last).start_work()

哈哈哈