天天看点

java中使用HtmlUnit爬虫

1.pom文件中添加依赖

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.27</version>
</dependency>
           

2.写一个获取页面的util方法

/**
 * Fetches {@code url} with the supplied {@link WebClient}, tuned to behave like a
 * forgiving browser (no exceptions on script/HTTP errors) and to wait for
 * background JavaScript (AJAX) to settle before returning.
 *
 * <p>NOTE(review): {@code timeout} and {@code waitForBackgroundJavaScript} are
 * fields of the enclosing class — presumably milliseconds; confirm their units.
 *
 * @param webClient the client to configure and drive; it is closed only when the
 *                  page request itself fails
 * @param url       the address of the page to load
 * @return the loaded page, after background JavaScript has been given time to run
 * @throws Exception if retrieving the page fails (the client is closed first, then
 *                   the original exception is rethrown)
 */
public HtmlPage getHtmlPageResponse(WebClient webClient,String url) throws Exception {

    // Never abort on script errors or non-200 HTTP status codes.
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setActiveXNative(false);
    webClient.getOptions().setRedirectEnabled(true);
    webClient.getOptions().setCssEnabled(true);        // process CSS
    webClient.getOptions().setJavaScriptEnabled(true); // essential: execute page JS
    // Essential for dynamic pages: resynchronize AJAX calls so their results land in the DOM.
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());

    webClient.getCookieManager().setCookiesEnabled(true);

    webClient.getOptions().setTimeout(timeout); // request ("browser") timeout
    webClient.setJavaScriptTimeout(timeout);    // JS execution timeout

    final HtmlPage loadedPage;
    try {
        loadedPage = webClient.getPage(url);
    } catch (Exception e) {
        // Request failed: release the client's resources, then surface the failure.
        webClient.close();
        throw e;
    }
    // Blocks the current thread while background JavaScript finishes.
    webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);

    return loadedPage;
}
           

3.调用util方法

WebClient webClient = new WebClient();
HtmlPage htmlPage = httpUtils.getHtmlPageResponse(webClient,infoSource.getSourceUrl());
Document document = Jsoup.parse(htmlPage.asXml());//获取html文档
//处理document获取需要的内容