// 天天看點
//
// WebMagic 原始範例 (original WebMagic example)

import java.io.IOException;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.Iterator;

import java.util.List;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Request;

import us.codecraft.webmagic.ResultItems;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Html;

public class ganjitest implements PageProcessor{

private Site site=Site.me().setSleepTime(3000).setRetryTimes(2);

List<Request> list=null; //取得目前記憶體中存放的相關的網頁連結

int num=0; //取得所有的連結

public void Show(Page page){

String addressUrl=null;

String url=page.getUrl().toString();

if(url.trim()!="http://www.xinhuanet.com/" || !url.equals("http://www.xinhuanet.com/")){

try{

page.putField("address",addressUrl=page.getUrl().regex("(http://news\\.xinhuanet\\.com/politics/.*)").toString());

}catch(Exception e){

System.out.println("url not contains");

}

}else{

page.putField("address", addressUrl=page.getUrl().toString());

}

page.putField("name", page.getHtml().xpath("//title/text()").toString());

//提取meta中的content的值

String str=page.getHtml().xpath("//meta[@name='keywords']").toString();

int a=str.lastIndexOf("content=\"");

int b=str.lastIndexOf("\"");

page.putField("meta", str.substring(a+9, b));

//提取最後重新整理時間

URL hp = null;

URLConnection hpCon;

try {

   SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 

hp = new URL(addressUrl);

hpCon = hp.openConnection();

//轉化為本地時間

page.putField("lasttimes",sdf.format(new Date(hpCon.getLastModified())));

} catch (Exception e) {

e.printStackTrace();

if(page.getResultItems().get("name")==null){

page.setSkip(true);

}

}

public void process(Page page) {

//存放到記憶體中的網頁

page.addTargetRequests(page.getHtml().links().regex("(http://news.xinhuanet.com/politics/.*)").all());

list= page.getTargetRequests();

num+=list.size();

System.out.println(num);

if(num>100){

return ;

}else{

Show(page);

}

}

public Site getSite() {

return site;

}

public static void main(String[] args) {

//初始化進入的首頁

System.out.println("開始");

//從哪裡開始抓

Spider.create(new ganjitest()).addUrl("http://www.xinhuanet.com/").thread(5).run();

System.out.println("結束");

}

}