Using HtmlUnit for web crawling in Java

1. Add the dependency to the pom file

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.27</version>
</dependency>
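
The code in step 3 below also uses Jsoup to parse the rendered page, so if it is not already on the classpath, add its dependency as well (the version here is only an example):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>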

2. Write a util method that fetches the page

public HtmlPage getHtmlPageResponse(WebClient webClient, String url) throws Exception {

    webClient.getOptions().setThrowExceptionOnScriptError(false);       // don't throw when JS execution fails
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // don't throw on non-200 HTTP status codes
    webClient.getOptions().setActiveXNative(false);
    webClient.getOptions().setCssEnabled(true);                         // enable CSS
    webClient.getOptions().setJavaScriptEnabled(true);                  // important: enable JavaScript
    webClient.getOptions().setRedirectEnabled(true);
    webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: support AJAX

    webClient.getCookieManager().setCookiesEnabled(true);

    webClient.getOptions().setTimeout(timeout); // request timeout of the simulated "browser", in ms
    webClient.setJavaScriptTimeout(timeout);    // timeout for JavaScript execution, in ms

    HtmlPage page;
    try {
        page = webClient.getPage(url);
    } catch (Exception e) {
        webClient.close();
        throw e;
    }
    webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript); // blocks the current thread until background JS has finished (or the wait expires)

    return page;
}
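
Note that timeout and waitForBackgroundJavaScript are used without being declared in the method, so they are presumably fields of the surrounding util class (the call in step 3 suggests something like an HttpUtils). A minimal sketch of that wrapper, with example values and the HtmlUnit 2.x imports the method needs:

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HttpUtils {

    // Assumed fields: the example values below are timeouts in milliseconds.
    private int timeout = 30 * 1000;
    private int waitForBackgroundJavaScript = 10 * 1000;

    // getHtmlPageResponse(WebClient, String) from step 2 goes here.
}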

3. Call the util method

WebClient webClient = new WebClient();
HtmlPage htmlPage = httpUtils.getHtmlPageResponse(webClient, infoSource.getSourceUrl());
Document document = Jsoup.parse(htmlPage.asXml()); // parse the rendered HTML into a Jsoup Document
webClient.close();                                 // the page is parsed, so the simulated browser can be released
// process the document to extract the content you need
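
What "process the document" looks like depends entirely on the target page. As a purely hypothetical sketch continuing from the snippet above (and additionally importing org.jsoup.nodes.Element), assuming the rendered page lists its entries as div.article blocks that each contain an h2 title and a link:

// Hypothetical selectors: adjust "div.article", "h2" and "a" to the real page structure.
for (Element item : document.select("div.article")) {
    String title = item.select("h2").text();      // combined text of the h2 elements in this block
    String link = item.select("a").attr("href");  // href of the first matching <a> that has one
    System.out.println(title + " -> " + link);
}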