天天看點

一個糗事百科的簡單爬蟲

抓取糗事百科的資料並存入資料庫;

額, 看下代碼應該就知道使用方法了;

#coding=utf-8

import requests;
import sqlite3;
from bs4 import BeautifulSoup as bs;

class QBSpider:
    """A minimal qiushibaike.com spider: scrapes posts from a listing page
    and stores them in an SQLite table.

    Usage: pass an already-open sqlite3 connection and a (trusted) table
    name, then call fetch(url) on a listing page such as /hot.
    """

    def __init__(self, connectedDb, tableName):
        """Bind the connection, remember the table name, and ensure the table exists."""
        self.setDb(connectedDb)
        self.setTableName(tableName)
        self._createTable(tableName)

    def setDb(self, db):
        # Kept as an explicit setter to preserve the original public interface.
        self._db = db

    def getDb(self):
        return self._db

    def setTableName(self, tn):
        self._tn = tn

    def getTableName(self):
        return self._tn

    def _processElement(self, elements):
        """Extract (id, content, image, date, url) from one post's child <div>s.

        `elements` is the list of <div> children of a single post block;
        each field is filled in when the matching class is seen, otherwise
        it keeps its default below.
        """
        post_id = 0  # distinguishes posts; stays 0 if no stats-buttons div is present
        content = ""
        date = ""
        image = None
        url = ""
        for element in elements:
            eClass = " ".join(element["class"])
            if eClass == "content":
                content = element.text.strip("\n")
                date = element["title"]
            elif eClass == "stats clearfix":
                url = element.findChildren(name="span", attrs={"class": "stats-comments"})[0].a["href"]
            elif eClass == "stats-buttons bar clearfix":
                # The div id looks like "<prefix>NNNN"; the numeric post id starts at offset 14.
                post_id = element["id"][14:]
            elif eClass == "thumb":
                # Most posts carry at most one picture, so only the first image url is kept.
                image = element.a.img["src"]
        return post_id, content, image, date, url

    def fetch(self, url):
        """Download one listing page and insert every post found on it."""
        response = requests.get(url, headers={
            "Host": "www.qiushibaike.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36"
        })
        # Name the parser explicitly so bs4 does not guess (and warn) at runtime.
        bsoup = bs(response.text, "html.parser")
        root = bsoup.find(name="div", attrs={"id": "content-left"})
        childElements = root.findChildren(name="div", attrs={"class": "article block untagged mb15"})
        for childElement in childElements:
            self.insert(self._processElement(childElement.findChildren(name="div")))

    def insert(self, args):
        """Insert one (id, content, image, date, url) tuple; roll back on failure.

        Values are bound with ? placeholders; only the table name is
        interpolated (see the note in _createTable).
        """
        try:
            self.getDb().cursor().execute(
                "insert into {0}(id, content, image, date, url) values(?, ?, ?, ?, ?)".format(self.getTableName()),
                args,
            )
            self.getDb().commit()
        except sqlite3.Error:  # narrow catch: database errors only, not programming bugs
            print("insert error")
            self.getDb().rollback()

    def _createTable(self, tableName):
        """Create the target table; a failure (e.g. table already exists) is ignored."""
        try:
            # NOTE(review): tableName is formatted directly into the SQL — identifiers
            # cannot be parameterized, so callers must pass a trusted name only.
            self.getDb().cursor().execute(
                "create table {0} (id INTEGER NOT NULL PRIMARY KEY, "
                "content varchar(6144) NULL, image varchar(2048) NULL, "
                "date datetime NULL, url varchar(1024) NULL)".format(tableName)
            )
            self.getDb().commit()
        except sqlite3.Error:  # best-effort: typically means the table already exists
            self.getDb().rollback()
            
if __name__ == "__main__":  # smoke test: scrape one hot-list page into ./test.db
    db = sqlite3.connect("./test.db")
    try:
        spider = QBSpider(db, "qbData")
        spider.fetch("http://www.qiushibaike.com/hot")
    finally:
        db.close()  # the original leaked the connection on exit
           

抓取的資料:

一個糗事百科的簡單爬蟲