天天看點

java爬蟲之webMagic學習

webMagic爬蟲

    • webMagic介紹
    • 項目引入webMagic
    • webMagic配置
    • PageProcessor類
    • pipeline類
    • 儲存爬取的資料
    • 參考文檔

webMagic介紹

WebMagic是一個簡單靈活的Java爬蟲框架。它提供簡單靈活的API,只需少量代碼即可實現一個爬蟲。基於WebMagic,你可以快速開發出一個高效、易維護的爬蟲。

項目引入webMagic

  1. pom.xml加入相關依賴
    java爬蟲之webMagic學習
  2. 直接将源碼放入項目裡面(我是這樣做的)
    java爬蟲之webMagic學習

webMagic配置

  1. 建立config.json檔案放在項目src/main/resources下(以下以 # 開頭的行僅為說明用途;JSON不支援註解,實際檔案中需将這些行删除)
{
  "site": {
  	# 網站域名
    "domain": "139.159.3.18",
    # 請求頭,主要是模拟浏覽器請求
    "headers": {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
      "authorization": "Your own authorization here."
    },
    # 如果爬取的網站需要登入,在這裡設定cookie資訊
    "cookie": {
      "JSESSIONID":"FBCC0D50EC568B1A7A6EF7FD94C50079"
    },
    "retryTimes": 3,
    "sleepTime": 500
  },
  "base_dir": "/Users/zz/"
}
           

建立Configuration配置類用來擷取config.json配置

2. 不用config.json,完全可以用代碼來配置,使用Site類。例如:

// Programmatic alternative to config.json: build the Site settings in code.
// Retries are set to 3, sleep time to 2000 and timeout to 60000
// (presumably milliseconds -- confirm against the Site API), charset to utf-8.
// The addCookie arguments below are placeholders for (domain, name, value);
// replace them with the real cookie of the target site.
Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(2000)
            .setTimeOut(60000)
            .setCharset("utf-8")
            .addCookie("域","名稱", "内容")
            .addCookie("域","名稱", "内容");
           

PageProcessor類

import java.util.List;

import cn.dofuntech.spider.collector.site99.Configuration;
import cn.dofuntech.spider.webmagic.Page;
import cn.dofuntech.spider.webmagic.Site;
import cn.dofuntech.spider.webmagic.Spider;
import cn.dofuntech.spider.webmagic.pipeline.Pipeline;
import cn.dofuntech.spider.webmagic.processor.PageProcessor;
import cn.dofuntech.spider.webmagic.scheduler.BloomFilterDuplicateRemover;
import cn.dofuntech.spider.webmagic.scheduler.FileCacheQueueScheduler;
import cn.dofuntech.spider.webmagic.selector.Selectable;

import com.hs2e.common.collect.ListUtils;
import com.hs2e.common.lang.StringUtils;

/**
 *
 * 爬取症狀詳細資訊
 */
public class ZzPageProcessor2 implements PageProcessor {
	// 擷取配置
    private Site               site            = new Configuration().getSite();
	// 爬取符合正則的網頁
    public static final String URL_LIST_PINYIN = "https://jbk.99.com.cn/zz/py/[A-Z]-[0-9]\\.html";
    
    public void process(Page page) {

        //拼音症狀清單頁面
        if (page.getUrl().regex(URL_LIST_PINYIN).match()) {
        	// 擷取所有url
            page.addTargetRequests(page.getHtml().xpath("//div[@class=\"part-cont3\"]/dl/dt").links().all());
        }
        //症狀詳情頁
        else {
            Selectable selectable = page.getHtml().xpath("//div[@id='d-top2']//li/font");

            List<Selectable> nodes = selectable.nodes();
//            System.out.println("症狀:" + nodes.get(0).$("font", "text"));
//            System.out.println("部位:" + nodes.get(1).$("font", "text"));

            List<String> a2 = ListUtils.newArrayList();
            List<String> a3 = ListUtils.newArrayList();
            nodes.get(2).$("font > a").nodes().forEach(a -> {
                a2.add(a.$("a", "text").get());
            });
            nodes.get(3).$("font > a").nodes().forEach(a -> {
                a3.add(a.$("a", "text").get());
            });

//            System.out.println("科室:" + StringUtils.join(a2, " "));
//            System.out.println("疾病:" + StringUtils.join(a3, " "));
            
            page.putField("name", nodes.get(0).$("font", "text").toString());
            page.putField("bw", nodes.get(1).$("font", "text").toString());
            page.putField("dept", StringUtils.join(a2, ",").trim());
            page.putField("disease", StringUtils.join(a3, ",").trim());
        }

    }

    public Site getSite() {
        return site;
    }
    
    @SuppressWarnings("resource")
    public void start(Pipeline pipeline,String url)
    {
        String pipelinePath = new Configuration().getZzPath();
        int crawlSize = 100_0000;
        Spider.create(new ZzPageProcessor2())
            .setScheduler(new FileCacheQueueScheduler(pipelinePath)
                .setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize)))
            .addUrl(url)
            .addPipeline(pipeline)
            .thread(300)
            .run();
    }

    /**
     * 下載下傳關注清單的使用者資料,用于提取 url_tokens
     * @param args 無須其他參數
     */
    public static void main(String[] args) {
        String pipelinePath = new Configuration().getZzPath();
        int crawlSize = 100_0000;
        System.out.println(crawlSize);
        Spider.create(new ZzPageProcessor2()).setScheduler(//new QueueScheduler()
        new FileCacheQueueScheduler(pipelinePath).setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize))).addPipeline(new ZzPipeline()).addUrl("https://jbk.99.com.cn/zz/py/Z-2.html").thread(200).run();
    }
}
           

pipeline類

webmagic內置有很多pipeline,基本能夠滿足開發者的需求。例如ResultItemsCollectorPipeline將爬取的所有資料儲存到ResultItems集合裡面;FilePipeline將爬取的資料或者url寫入到檔案裡面,等等。還可以自定義pipeline。

  1. ResultItemsCollectorPipeline使用執行個體
package cn.dofuntech.spider.collector.site99.download;

import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.dofuntech.spider.webmagic.ResultItems;
import cn.dofuntech.spider.webmagic.Task;
import cn.dofuntech.spider.webmagic.pipeline.ResultItemsCollectorPipeline;

/**
 * <p>
 * Pipeline that collects the symptom detail results in memory.
 * </p>
 * <font size=0.25>Copyright (C) 2019 dofuntech. All Rights Reserved.</font>
 * @author
 * @version 1.0
 * filename:ZzPipeline.java 
 */
public class ZzPipeline extends ResultItemsCollectorPipeline {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Results gathered so far; guarded by this pipeline's monitor. Package visibility kept for compatibility. */
    final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Stores one result.
     *
     * <p>Synchronized because the spider is started with many worker threads
     * (see {@code ZzPageProcessor2.start}) and {@link ArrayList} is not safe for
     * concurrent appends.
     *
     * @param resultItems fields extracted from one page
     * @param task        the crawl task the result belongs to
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the collected results.
     *
     * <p>The returned list is the live backing list, not a copy, so it should be
     * read after the crawl has finished.
     *
     * @return every result passed to {@link #process(ResultItems, Task)} so far
     */
    @Override
    public synchronized List<ResultItems> getCollected() {
        return collector;
    }
}

           
  2. FilePipeline使用執行個體
/**
 * File-based pipeline that writes the {@code url} and {@code response} result
 * fields of each page to its own {@code .html} file.
 *
 * <p>NOTE(review): {@code ZzPageProcessor2} shown earlier stores the fields
 * name/bw/dept/disease, not url/response -- confirm which processor this
 * pipeline is meant to be paired with.
 */
public class ZzPipeline extends FilePipeline {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Result key holding the page URL. */
    static final String URL      = "url";
    /** Result key holding the page body. */
    static final String RESPONSE = "response";

    /**
     * Creates a ZzPipeline with the default path "/data/webporter/".
     */
    public ZzPipeline() {
        setPath("/data/webporter/");
    }

    /**
     * Creates a ZzPipeline that writes below the given directory.
     *
     * @param path base output directory
     */
    public ZzPipeline(String path) {
        setPath(path);
    }

    /**
     * Writes one result to {@code <path>/<task uuid>/<md5 of request url>.html}.
     *
     * <p>The first line of the file is the {@code url} field and the second the
     * {@code response} field. I/O failures are logged and do not abort the crawl.
     *
     * @param resultItems fields extracted from one page
     * @param task        the crawl task; its UUID names the sub-directory
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String fileName = path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html";
        // try-with-resources closes the writer even if the body throws.
        try (PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(fileName)), "UTF-8"))) {
            Map<String, Object> results = resultItems.getAll();

            printWriter.println(results.get(URL));
            printWriter.println(results.get(RESPONSE));
            // PrintWriter never throws on write failures; ask it explicitly.
            if (printWriter.checkError()) {
                logger.warn("write file error: {}", fileName);
            }
        }
        catch (IOException e) {
            logger.warn("write file error", e);
        }
    }

}
           

儲存爬取的資料

// Run the crawl with an in-memory collector pipeline, then convert the
// collected results to BasicZz entities and persist them in one batch.
ZzPipeline zzPipeline = new ZzPipeline();
new ZzPageProcessor2().start(zzPipeline, url);
List<ResultItems> resultItems = zzPipeline.getCollected();
List<BasicZz> list = new ArrayList<>();

if (ListUtils.isNotEmpty(resultItems)) {
    for (ResultItems r : resultItems) {
        Object name = r.get("name");
        Object dept = r.get("dept");
        Object disease = r.get("disease");
        Object bw = r.get("bw");
        // Pages without all four fields (e.g. list pages) carry no symptom data;
        // skip them explicitly instead of relying on a swallowed exception.
        if (name == null || dept == null || disease == null || bw == null) {
            continue;
        }

        BasicZz z = new BasicZz();
        z.setName(name.toString());
        z.setDept(dept.toString());
        z.setDisease(disease.toString());
        z.setBw(bw.toString());
        list.add(z);
    }

    // Nothing to persist when every collected item was skipped.
    if (!list.isEmpty()) {
        baseZzService.saveBatch(list);
    }
}
           

到這裡就簡單的實作了一個爬蟲

參考文檔

  1. WebMagic中文文檔:http://webmagic.io/docs/zh/
  2. pipeline使用:https://blog.csdn.net/qq_36783371/article/details/79943211
  3. 自定義pipeline:https://www.jianshu.com/p/52785e3cf41e