天天看點

4-騰訊微博資料收集

為了練習 Python,於是寫了一段程式碼,實作即時擷取騰訊微博廣播大廳的最新微博資料;每次查詢的結果以 JSON 格式儲存在檔案中,一行一筆。

承接前一篇微網誌

http://loma1990.blog.51cto.com/6082839/1308205

使用前,請在 createApiCaller() 函數裡呼叫 addCaller(),加入你自己的 appKey、open_id 和 access_token,然後直接執行即可。

已經實測,擷取了 280 多萬條微博資料。

結果如下:

#! /usr/bin/env python
#coding=utf-8
#Author:loma
#持續擷取騰訊微網誌廣播大廳的微網誌資料
'''
 * @author loma
 * qq:124172231 mail:[email protected]
 * Copyright (c) 2013, loma All Rights Reserved.
'''
import io
import json
import os
import sys
import time
import urllib
import urllib2
import urlparse
import webbrowser
# Python 2 only: site.py removes sys.setdefaultencoding at interpreter
# start-up, so the module is reloaded to get it back. Setting the default to
# UTF-8 makes implicit str <-> unicode conversions in this script use UTF-8
# instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
class ApiManager:
    """Helper for the Tencent Weibo open platform: OAuth 2 page and API calls."""

    # Address of the OAuth 2 authorization page.
    AUTHORIZE_URL = 'https://open.t.qq.com/cgi-bin/oauth2/authorize'
    # Prefix shared by every API endpoint.
    API_HEAD = 'http://open.t.qq.com/api/'

    def getPublicParams(self, appKey, access_token, open_id):
        """Return the parameters that are sent with every API request.

        Args:
            appKey: application key, sent as ``oauth_consumer_key``.
            access_token: OAuth 2 access token.
            open_id: id of the authorizing user, sent as ``openid``.

        Returns:
            A new dict holding the three values above plus the fixed
            ``oauth_version`` ('2.a') and ``scope`` ('all') entries.
        """
        return {
            'oauth_consumer_key': appKey,
            'access_token': access_token,
            'openid': open_id,
            'oauth_version': '2.a',
            'scope': 'all',
        }

    def OAuth2(self, appKey, redirect_url, response_type='token'):
        """Open the authorization page in a new tab of the default browser.

        Args:
            appKey: application key, sent as ``client_id``.
            redirect_url: address the platform redirects to after the user
                has authorized the application.
            response_type: OAuth 2 response type, 'token' by default.
        """
        # urlencode() percent-escapes every value; the redirect address
        # contains ':' and '/' (and possibly '&' or '#') that would otherwise
        # corrupt the query string.
        query = urllib.urlencode([
            ('client_id', appKey),
            ('response_type', response_type),
            ('redirect_uri', redirect_url),
        ])
        webbrowser.open_new_tab(self.AUTHORIZE_URL + '?' + query)

    def decodeUrl(self, urlStr):
        """Extract the parameters carried by a redirect URL.

        Parameters are read from both the query string and the fragment (the
        part after '#'), so a URL that has a regular query string in addition
        to the token fragment is decoded correctly.

        Args:
            urlStr: the URL the browser was redirected to.

        Returns:
            A dict mapping each parameter name to the list of its values, as
            produced by ``urlparse.parse_qs`` (blank values are kept).
        """
        parts = urlparse.urlparse(urlStr)
        params = urlparse.parse_qs(parts.query, True)
        fragmentParams = urlparse.parse_qs(parts.fragment, True)
        for name, values in fragmentParams.items():
            params.setdefault(name, []).extend(values)

        # Echo every decoded value, as the original helper did.
        # NOTE(review): this prints the access token to stdout.
        for name in params:
            print(params[name])

        return params

    def doRequest(self, appKey, open_id, access_token, apiStr, params):
        """Call one API endpoint with a GET request.

        Args:
            appKey: application key.
            open_id: id of the authorizing user.
            access_token: OAuth 2 access token.
            apiStr: endpoint path relative to ``API_HEAD``, for example
                'statuses/public_timeline'.
            params: endpoint-specific parameters; they override the public
                parameters on a name clash. ``None`` means no extra ones.

        Returns:
            The file-like response object returned by ``urllib2.urlopen``.

        Raises:
            urllib2.URLError: when the request cannot be completed.
        """
        requestParams = self.getPublicParams(appKey, access_token, open_id)
        requestParams.update(params or {})
        url = self.API_HEAD + apiStr + '?' + urllib.urlencode(requestParams)
        return urllib2.urlopen(url)
                                                                                                                 
# ApiCaller is responsible for issuing the API calls
class ApiCaller:
    """Issues API calls, rotating through the registered credentials.

    Credentials are stored in ``Callers`` under the 1-based keys 1..N.
    ``count`` is the key that the next call will use; it is 0 while no
    credentials have been registered.
    """

    def __init__(self):
        self.count = 0
        self.apiManager = ApiManager()
        self.Callers = {}

    def addCaller(self, appKey, open_id, access_token):
        """Register one more set of credentials.

        Args:
            appKey: application key.
            open_id: id of the authorizing user.
            access_token: OAuth 2 access token.
        """
        self.Callers[len(self.Callers) + 1] = {
            "appKey": appKey,
            "open_id": open_id,
            "access_token": access_token,
        }
        self.count = self.count + 1

    def callAPI(self, apiStr, params):
        """Call an API endpoint with the next set of credentials.

        A failed request is retried every 10 seconds, with the same
        credentials, until it succeeds.

        Args:
            apiStr: endpoint path, passed through to ``doRequest``.
            params: endpoint-specific parameters, passed through as well.

        Returns:
            ``(True, response)`` on success, or ``(False, None)`` when no
            usable credentials are registered.
        """
        total = len(self.Callers)
        if self.count <= 0 or self.count > total:
            return False, None

        caller = self.Callers[self.count]
        # Advance to the next key, wrapping from N back to 1. The previous
        # modulus of N - 1 counted down to key 1 and then stayed there, so
        # the other credentials were used at most once.
        self.count = self.count % total + 1

        while True:
            try:
                data = self.apiManager.doRequest(
                    caller["appKey"], caller["open_id"],
                    caller["access_token"], apiStr, params)
            except Exception:
                # Only ordinary errors are retried, so that Ctrl-C
                # (KeyboardInterrupt) can still end the endless retry loop.
                print((u"10s後重試連接配接").encode('gbk'))
                time.sleep(10)
            else:
                return True, data
                                                                                                                 
                                                                                                                 
class Spider:
    """Polls a timeline endpoint and appends the unseen posts to a file.

    Every processed response is stored as one JSON document per line; its
    ``data.info`` list is reduced to the posts whose id has not been stored
    before.
    """

    def __init__(self, callers):
        self.totalNum = 0      # posts counted so far (file content + this run)
        self.caller = callers  # object providing callAPI(apiStr, params)
        self.idSet = set()     # ids of all posts already stored
        self.idList = []       # ids found in the most recent response
        self.isStop = False    # set by stopSpiding() to end spiding()
        self.tag = 0           # number of polls started

    def spiding(self, dataFile, sleepTime, apiStr, apiParams):
        """Poll the API until stopSpiding() is called.

        This call blocks, so stopSpiding() only takes effect when it is
        invoked while the loop is running (for instance from another
        thread). The batch that is being processed at that moment is still
        written before the loop ends.

        Args:
            dataFile: path of the output file; its existing content is read
                first so that already stored posts are not stored again.
            sleepTime: seconds to wait after every request.
            apiStr: endpoint path handed to ``callAPI``.
            apiParams: request parameters. The dict is updated in place:
                'pos' is set to the position reported by the last response,
                or reset to 0 when a response carries no data.

        Raises:
            RuntimeError: if the caller reports that no credentials are
                available.
        """
        self.initIdSet(dataFile)
        self.isStop = False
        weiboSum = 0
        while not self.isStop:
            print((u"努力地爬取資料中....%d"%(self.tag)).encode('gbk'))
            self.tag = self.tag + 1

            # Fetch the next batch of posts
            ok, response = self.caller.callAPI(apiStr, apiParams)
            if not ok:
                # Without this check the code failed later with an obscure
                # AttributeError on None.
                raise RuntimeError(
                    'callAPI() found no usable credentials; '
                    'register them with addCaller() first')
            time.sleep(sleepTime)

            try:
                js = json.loads(response.read())
            except Exception:
                # A broken connection or an invalid body only skips this poll.
                print('loads failed')
                continue

            data = js.get('data') if isinstance(js, dict) else None
            if not isinstance(data, dict):
                apiParams['pos'] = 0
                continue
            info = data.get('info') or []
            apiParams['pos'] = data.get('pos', 0)

            # Ids of this response, most recently listed first
            self.idList = [weibo['id'] for weibo in reversed(info)]

            # Drop the posts that are already stored
            info = self._filterNew(info)
            data['info'] = info

            print((u"此次擷取的微網誌個數:%d"%(len(info))).encode('gbk'))
            weiboSum = weiboSum + len(info)
            self.totalNum = self.totalNum + len(info)
            print((u"累計擷取微網誌:%d 全部微網誌數量:%d"%(weiboSum,self.totalNum)).encode('gbk'))

            # Written before the stop flag is looked at again: the ids are
            # already in idSet, so skipping the write would lose these posts.
            self._appendJson(dataFile, js)

    def _filterNew(self, info):
        """Return the posts of *info* not stored yet and record their ids.

        A post listed several times in the same response is kept once.
        """
        fresh = []
        for weibo in info:
            weiboId = weibo['id']
            if weiboId not in self.idSet:
                self.idSet.add(weiboId)
                fresh.append(weibo)
        return fresh

    def _appendJson(self, dataFile, js):
        """Append *js* to *dataFile* as one UTF-8 encoded JSON line."""
        line = json.dumps(js, ensure_ascii=False) + '\n'
        if isinstance(line, bytes):
            # Python 2 returns a byte string when the document is pure ASCII.
            line = line.decode('utf-8')
        with io.open(dataFile, 'a', encoding='utf-8') as out:
            out.write(line)

    def stopSpiding(self):
        """Ask a running spiding() loop to end after the current batch."""
        self.isStop = True

    def initIdSet(self, dataFile):
        """Load the ids of the posts already stored in *dataFile*.

        Every post found is added to ``idSet`` and counted in ``totalNum``.
        A file that cannot be opened is treated as empty. Blank lines and
        lines that are not complete records (for example a line cut short by
        an interrupted run) are skipped.
        """
        try:
            stored = io.open(dataFile, 'r', encoding='utf-8', errors='replace')
        except IOError:
            return
        with stored:
            for line in stored:
                if not line.strip():
                    continue
                try:
                    info = json.loads(line)['data']['info'] or []
                except (ValueError, KeyError, TypeError):
                    continue
                for weibo in info:
                    self.idSet.add(weibo['id'])
                    self.totalNum = self.totalNum + 1
                                                                                                                     
                                                                                                                 
# Interactive authorization followed by a test request
def doOAuth(appkey='801348303',
            redirect_url='http://loma1990.blog.51cto.com',
            outputPath="d:\\weibo.txt"):
    """Run the OAuth 2 flow by hand and fetch the public timeline once.

    The authorization page is opened in the browser; after authorizing, the
    user pastes the URL of the page they were redirected to. The openid and
    access token taken from that URL are printed in a form that can be
    copied into an addCaller(...) call, and are then used for one
    'statuses/public_timeline' request whose raw response is saved and
    printed.

    Args:
        appkey: application key of the registered application.
        redirect_url: redirect address registered for the application.
        outputPath: file that receives the raw response.

    Raises:
        KeyError: if the pasted URL carries no openid or access_token.
    """
    manager = ApiManager()

    # Open the authorization page
    manager.OAuth2(appkey, redirect_url)

    # Wait for the URL of the page the user was redirected to
    url = raw_input('Input the url')

    # Extract access_token and openid
    params = manager.decodeUrl(url)

    def firstValue(name):
        # decodeUrl() yields a list of values per name. Passing the list on
        # would send its repr ("['...']") as the parameter value.
        value = params[name]
        if isinstance(value, (list, tuple)):
            if not value:
                raise KeyError(name)
            value = value[0]
        return value

    openid = firstValue('openid')
    access_token = firstValue('access_token')
    print(openid)
    print("\"%s\",\"%s\",\"%s\""%(appkey,openid,access_token))

    # API parameters
    apiParams = {'format': 'json', 'pos': '0', 'reqnum': '100'}

    # Fetch the data
    data = manager.doRequest(appkey, openid, access_token,
                             'statuses/public_timeline', apiParams)

    # The response can be read only once, so keep the content for both uses.
    content = data.read()

    # Save the data to the given file
    with open(outputPath, "wb") as out:
        out.write(content)

    # Print the data
    print(content)
                                                                                                                 
# Creates the ApiCaller used by the crawler
def createApiCaller():
    """Return a new ApiCaller.

    No credentials are registered by default. Add your own
    addCaller(appKey, open_id, access_token) calls at the marked place; the
    script can then be run directly.
    """
    apiCaller = ApiCaller()
    # Register your appKey, open_id and access_token here with addCaller().
    return apiCaller
# Uncomment the next line to run the interactive authorization instead.
#doOAuth()
# API parameters (pos and reqnum are presumably the paging position and the
# number of posts per request -- confirm against the API documentation)
apiParams = dict(format='json', pos=0, reqnum=100)
callers = createApiCaller()
spider = Spider(callers)
# Poll the public timeline, waiting 5 seconds after each request
spider.spiding('d:\\weiboBase.txt', 5, 'statuses/public_timeline', apiParams)