最近抽了點時間,將之前開發中使用到的一些開源庫進行了下總結,主要是為了回顧一下自己所使用的一些庫基礎知識,並且加深了解,在這些庫中,首先一個庫就是libcurl,這個庫很強大,當時在做openstack swift API時使用到了,這個庫是一個輕量級的HTTP程式設計庫,裡面封裝了一套基於HTTP的上層應用協定的資料包的基本操作,其支援FTP,FTPS,TFTP,HTTP,HTTPS,GOPHER,TELNET,DICT,FILE和LDAP,跨平台,支援Windows,Unix,Linux等,線程安全,支援IPv6,並且易於使用。下面就從安裝開始,之前的開發是在Windows下開發的,現在的工作環境換為了Linux,但是不要緊,安裝過程差不多,步驟如下:
1)下載libcurl檔案,下載指令:sudo wget http://curl.haxx.se/download/curl-7.35.0.tar.gz
2)使用指令解壓檔案:tar -zxvf curl-7.35.0.tar.gz
3)./configure --prefix=/usr/local/libcurl
4)make
5)make install
經過上面幾個步驟,libcurl庫基本上就安裝好了,下面我們就稍微寫個小測試程式來測試一下,代碼如下:
#ifndef __HTTP_CURL__H
#define __HTTP_CURL__H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <iostream>
#include <set>
#include <string>

#include <boost/smart_ptr.hpp>
#include <boost/typeof/typeof.hpp>
#include <boost/xpressive/xpressive_dynamic.hpp>

#include <curl/curl.h>
using namespace std;
using namespace boost;
using namespace boost::xpressive;
#define MAX_BUFFERSIZE 1024*10
class HttpCurl
{
public:
HttpCurl()
{
conn = NULL;
memset(errBuffer,0,sizeof(errBuffer));
}
~HttpCurl()
{
curl_easy_cleanup(conn);
}
bool HttpCurlInit(string& context)
{
CURLcode code;
string error;
code = curl_global_init(CURL_GLOBAL_DEFAULT);
if(CURLE_OK != code)
{
printf("Failed to global init default\n");
return false;
}
conn = curl_easy_init();
if(NULL == conn)
{
printf("Failed to create CURL\n");
return false;
}
code = curl_easy_setopt(conn,CURLOPT_ERRORBUFFER,error.c_str());
if(CURLE_OK != code)
{
printf("Failed to set error buffer\n");
return false;
}
code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
if(CURLE_OK != code)
{
printf("Failed to set write\n");
return false;
}
code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
if(CURLE_OK != code)
{
printf("Failed to set write data\n");
return false;
}
return true;
}
bool setUrl(string& url)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
if(CURLE_OK != code)
{
printf("Failed to set URL\n");
return false;
}
return true;
}
bool getHttpResponse()
{
CURLcode code;
std::string error;
code = curl_easy_perform(conn);
if(CURLE_OK != code)
{
printf("Failed to get [%s]",error.c_str());
return false;
}
return true;
}
static long write(void* data,int size,int nmemb,std::string& context)
{
long sizes = size*nmemb;
std::string temp((char*)data,sizes);
context += temp;
return sizes;
}
bool save(const string& context,std::string filename)
{
CURLcode code;
int retcode = 0;
code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
if((CURLE_OK == code)&& retcode ==200)
{
int length = strlen(context.c_str());
FILE* file = fopen(filename.c_str(),"w+");
fseek(file,0,SEEK_SET);
fwrite(context.c_str(),1,length,file);
fclose(file);
return true;
}
return false;
}
private:
CURL* conn;
char errBuffer[MAX_BUFFERSIZE];
};
測試程式代碼如下:
int main()
{
string context;
HttpCurl curl;
curl.HttpCurlInit(context);
curl.setUrl("www.renren.com");
curl.getHttpResponse();
curl.save(context,"text.txt");
return 0;
}
通過簡單的測試用例,可以看出libcurl使用的一些基本流程,接下來我們再看看一個使用libcurl寫成的單線程爬蟲程式,這個程式實作思路很簡單,有機會将其改成多線程,代碼如下:
class Spider
{
public:
Spider(shared_ptr<HttpCurl>& cul):httpCurl(cul)
{
urlSet.clear();
finishUrlSet.clear();
}
~Spider(){}
bool init(std::string& context)
{
return httpCurl->HttpCurlInit(context);
}
void parseUrl(const string& context)
{
const string tag = "href";
const string tag2 = "\"";
string::size_type tempBegin,tempEnd,iter;
tempBegin = tempEnd = 0;
iter= context.find(tag);
while(iter != string::npos)
{
tempBegin = context.find(tag2,iter);
if(tempBegin != string::npos)
{
++tempBegin;
tempEnd = context.find(tag2,tempBegin);
}
if(tempEnd != string::npos && tempEnd > tempBegin)
{
string url;
url.assign(context,tempBegin,(tempEnd-tempBegin));
urlSet.insert(url);
}
iter = context.find(tag,tempEnd);
}
}
void filterUrl()
{
string tag = "http";
urlSet_Iter iter = urlSet.begin();
for(;iter != urlSet.end();)
{
string::size_type index = (*iter).find(tag);
if(index == string::npos)
urlSet.erase(iter);
iter++;
}
}
bool write(const string& context,const string& filename)
{
return httpCurl->save(context,filename);
}
void start(std::string url,std::string& context)
{
httpCurl->setUrl(url);
httpCurl->getHttpResponse();
parseUrl(context);
filterUrl();
}
void displayUrl()
{
urlSet_Iter iter = urlSet.begin();
for(; iter != urlSet.end();++iter)
{
cout<<*iter<<endl;
}
}
void loop(const std::string& url,std::string& context)
{
start(url,context);
for(urlSet_Iter iter = urlSet.begin();iter != urlSet.end();++iter)
{
if(finishUrlSet.find(*iter) != finishUrlSet.end())
continue;
printf("%s\n",(*iter).c_str());
char filename[64];
memset(filename,0,sizeof(filename));
sprintf(filename,"%d.html",fileIndex++);
context.clear();
start(*iter,context);
write(context,filename);
finishUrlSet.insert(*iter);
}
}
private:
shared_ptr<HttpCurl> httpCurl;
std::set<string> urlSet;
std::set<string> finishUrlSet;
typedef std::set<string>::iterator urlSet_Iter;
};
測試程式:
#include "curlTest.h"
// Crawler driver: seed with one URL and crawl its outgoing links.
int main()
{
    string context;
    shared_ptr<HttpCurl> curl(new HttpCurl());
    Spider spider(curl);
    // BUG FIX: the original ignored init() failure and crawled with an
    // uninitialized libcurl handle.
    if (!spider.init(context))
        return 1;
    spider.loop("www.renren.com", context);
    return 0;
}
測試結果:
http://a.xnimg.cn/favicon-rr.ico?ver=3
http://a.xnimg.cn/n/core/res/certificate.jpg
http://a.xnimg.cn/wap/apple_icon_.png
http://app.renren.com
http://app.renren.com/?origin=40206
http://app.renren.com/activity/specialpage?activity=girlgame&origin=40240
http://app.renren.com/activity/specialpage?activity=newgame&origin=40238
http://app.renren.com/activity/specialpage?activity=sanguo&origin=40239
http://app.renren.com/list?category=10&type=1&origin=40060&menu=1
http://app.renren.com/list?category=10&type=1&origin=40065&menu=1
http://app.renren.com/list?category=10&type=1&origin=40207&menu=1
http://app.renren.com/list?category=11&type=1
http://app.renren.com/list?category=11&type=1&origin=3142&added=0
http://app.renren.com/list?category=11&type=1&origin=40131
http://app.renren.com/list?category=11&type=1&origin=40132
http://app.renren.com/list?category=11&type=1&origin=40189
http://app.renren.com/list?category=11&type=1&origin=50298
http://app.renren.com/list?category=12&type=1&origin=3142&added=0
http://app.renren.com/list?category=12&type=1&origin=40131
http://app.renren.com/list?category=12&type=1&origin=40132
http://app.renren.com/list?category=13&type=1
http://app.renren.com/list?category=13&type=1&origin=3142&added=0
http://app.renren.com/list?category=13&type=1&origin=40131
http://app.renren.com/list?category=13&type=1&origin=40132
http://app.renren.com/list?category=13&type=1&origin=40188
http://app.renren.com/list?category=13&type=1&origin=40189
http://app.renren.com/list?category=13&type=1&origin=50298
http://app.renren.com/list?category=14&type=1
http://app.renren.com/list?category=14&type=1&origin=3113
http://app.renren.com/list?category=14&type=1&origin=3142&added=0
http://app.renren.com/list?category=14&type=1&origin=40188
http://app.renren.com/list?category=14&type=1&origin=50298
http://app.renren.com/list?category=15&type=1&origin=3142&added=0
http://app.renren.com/list?category=15&type=1&origin=40131
http://app.renren.com/list?category=17&type=1&origin=3142&added=0
http://app.renren.com/list?category=19&type=1
http://app.renren.com/list?category=19&type=1&origin=3113
http://app.renren.com/list?category=19&type=1&origin=3142&added=0
http://app.renren.com/list?category=19&type=1&origin=40132
http://app.renren.com/list?category=19&type=1&origin=40188
http://app.renren.com/list?category=19&type=1&origin=50298
http://app.renren.com/list?category=20&type=1&origin=40066&menu=2
http://app.renren.com/list?category=20&type=1&origin=40210&menu=2
^C
總結
本篇博文主要是安裝了下libcurl,並且在此基礎上實現了一個簡單的爬蟲程式,代碼結構很簡單,至於libcurl的一些接口介紹什麼的,大家可以去其官網上看,在這裡就略過了,在寫這個爬蟲程式時,一開始一心想用boost裡的regex做正規表達式匹配url,但是中間遇到了問題,所以臨時自己寫了個提取url的parseURL,另外這個爬蟲程式使用的是單線程的,有機會的話,我會將其改成支援多線程,其實很多東西在於自己去實踐,才能體會到其中的奧秘,好了,這篇博文到此就結束了,多謝。
如果需要,請注明轉載,多謝