1.pom檔案中添加依賴
<!-- HtmlUnit dependency; the WebClient and HtmlPage types used by the page-fetching utility come from this library. -->
<!-- NOTE(review): version 2.27 is pinned here; confirm whether a newer release should be used. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
2.寫一個擷取頁面的util方法
/**
 * Configures the supplied client, loads {@code url} with it and returns the resulting page
 * after giving background JavaScript time to run.
 *
 * <p>The client is reconfigured on every call: script errors and failing HTTP status codes do
 * not raise exceptions, CSS, JavaScript, redirects and cookies are enabled, and AJAX calls are
 * handled by a {@code NicelyResynchronizingAjaxController}. The request and JavaScript timeouts
 * both come from the {@code timeout} field of the enclosing class.
 *
 * <p>If loading the page fails, the supplied client is closed before the exception is rethrown.
 * On success the client is left open; closing it is up to the caller.
 *
 * @param webClient the client to configure and use for the request
 * @param url the address of the page to load
 * @return the loaded page
 * @throws Exception whatever {@code webClient.getPage(url)} throws, rethrown unchanged
 */
public HtmlPage getHtmlPageResponse(WebClient webClient, String url) throws Exception {
    // Error tolerance: keep going when a script fails or the HTTP status is not 200.
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);

    // Rendering features. JavaScript must be on for script-generated content.
    webClient.getOptions().setJavaScriptEnabled(true);
    webClient.getOptions().setCssEnabled(true);
    webClient.getOptions().setActiveXNative(false);

    // Navigation and session handling.
    webClient.getOptions().setRedirectEnabled(true);
    webClient.getCookieManager().setCookiesEnabled(true);

    // Important: this controller is what enables AJAX support.
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());

    // Timeout for the "browser" request and for JavaScript execution.
    webClient.getOptions().setTimeout(timeout);
    webClient.setJavaScriptTimeout(timeout);

    final HtmlPage loadedPage;
    try {
        loadedPage = webClient.getPage(url);
    } catch (Exception loadFailure) {
        // The client is not usable for this request any more; release it before propagating.
        webClient.close();
        throw loadFailure;
    }

    // Blocks the calling thread while background JavaScript is given time to finish.
    webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
    return loadedPage;
}
3.調用util方法
WebClient webClient = new WebClient();
HtmlPage htmlPage = httpUtils.getHtmlPageResponse(webClient,infoSource.getSourceUrl());
Document document = Jsoup.parse(htmlPage.asXml());//擷取html文檔
//處理document擷取需要的内容