// Source: 天天看点 (blog aggregator)
// Java NIO asynchronous crawler — uses the WebMagic framework to crawl post data from cnblogs (博客园)

import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.mysql.jdbc.Connection;
import com.nio.webmagic.jdbc.MySqlJdbcUtils;
import com.nio.webmagic.model.JavaBokeModel;

public class JavaBoKePageProcessor implementsPageProcessor {private static Connection conn=null;private static PreparedStatement ps =null;//标題和連結擷取

private static String TITLEQUERY="div.post_item_body h3 a.titlelnk";//作者

private static String AUTHORQUERY="div.post_item_foot a.lightblue ";//簡介

private static String SUMMARYQUERY="div.post_item_body p.post_item_summary";//插入sql語句

private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";//初始連結

private staticConnection getConnection(){if (conn==null) {

conn=MySqlJdbcUtils.getOpenConnection();

}returnconn;

}

private synchronized void insertDb(ListjavaBokes){try{

ps=conn.prepareStatement(insertSql);for(JavaBokeModel javaBoke:javaBokes) {

ps.setString(1, javaBoke.getTitle().toString());

ps.setString(2, javaBoke.getLinke().toString());

ps.setString(3, javaBoke.getAuthor().toString());

ps.setString(4, javaBoke.getAuthorUrl().toString());

ps.setString(5, javaBoke.getSummary().toString());

ps.executeUpdate();

}

}catch(SQLException e) {//TODO Auto-generated catch block

e.printStackTrace();

}

}//初始化帶爬取網頁位址

private static Listurls(){

List listUrl =new ArrayList();for (int i = 2; i <=200; i++) {//listUrl.add("http://www.cnblogs.com/cate/java/"+i);

listUrl.add("http://www.cnblogs.com/cate/java/"+i);

}

listUrl.toArray(newString[listUrl.size()]);returnlistUrl;

}

private staticString seletDocumentText(String htmlText,String Query){

Document doc=Jsoup.parse(htmlText);

String select=doc.select(Query).text();returnselect;

}

private staticString seletDocumentLink(String htmlText,String Query){

Document doc=Jsoup.parse(htmlText);

String select= doc.select(Query).attr("href");returnselect;

}@Overridepublic voidprocess(Page page) {//page.addTargetRequests(urls());//div[@class='post_item']//div[@class='post_item_body']//h3//a[@class='titlelnk']/text()'//定義如何抽取頁面資訊,并儲存下來

List htmls =page.getHtml().xpath("//div[@class='post_item']/html()").all();

List javaBokes=new ArrayList();for(String html:htmls) {

JavaBokeModel javaBoke=newJavaBokeModel();//标題和連結

String title =seletDocumentText(html,TITLEQUERY);

String linke=seletDocumentLink(html,TITLEQUERY);//作者和作者首頁

String author=seletDocumentText(html, AUTHORQUERY);

String authorUrl=seletDocumentLink(html, AUTHORQUERY);//簡介

String summary=seletDocumentText(html, SUMMARYQUERY);

javaBoke.setTitle(title);

javaBoke.setAuthor(author);

javaBoke.setAuthorUrl(authorUrl);

javaBoke.setLinke(linke);

javaBoke.setSummary(summary);

javaBokes.add(javaBoke);

}

insertDb(javaBokes);

}

@OverridepublicSite getSite() {//抓去網站的相關配置包括:編碼、重試次數、抓取間隔

return Site.me().setSleepTime(1000).setRetryTimes(10);

}public static voidmain(String[] args) {longstartTime ,endTime;

System.out.println("========小爬蟲【啟動】喽!=========");

getConnection();

startTime= newDate().getTime();//入口

Spider create = Spider.create(newJavaBoKePageProcessor());//定義入口位址

create.addUrl("http://www.cnblogs.com/cate/java/").thread(5).run();try{

ps.close();

conn.close();

}catch(Exception e) {//TODO: handle exception

}

endTime= newDate().getTime();

System.out.println("========小爬蟲【結束】喽!=========");

System.out.println("用時為:"+(endTime-startTime)/1000+"s");

}

}