部落客個人首頁
前言
今天來分享一個非常好用的Java爬蟲庫,它的名字是Jsoup。拿到一串HTML文本字元串之後,我們可以用它随意擷取自己需要的資料。它内置了類似CSS選擇器的方法來擷取頁面上的元素,并且一些API和JavaScript操作DOM的API是一緻的。
使用
pom依賴
我們需要用到httpclient發送http請求擷取資料
我們需要用到junit做單元測試 不用也可以
jsoup就是主要的啦
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
httpclient 工具類
package top.liwenxiang.jsoup;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.conn.ssl.X509HostnameVerifier;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
/**
* 封裝了一些采用HttpClient發送HTTP請求的方法
* @see 原注釋寫的是HttpComponents-Client-4.2.1,但上面pom依賴中引入的httpclient版本是4.5.3
*/
public class HttpClientUtil {
private HttpClientUtil(){}
private static Log logger = LogFactory.getLog(HttpClientUtil.class);
/**
 * Sends an XML document to {@code url} with an HTTP POST and returns the raw response.
 *
 * <p>The request body is encoded as UTF-8 so that it agrees with the
 * {@code charset=UTF-8} declared in the {@code Content-Type} header.
 *
 * <p>NOTE: the client created here is not closed by this method, because the caller
 * still has to read the response (for example via {@code getEntity().getContent()}).
 *
 * @param url target address
 * @param xmlData XML text used as the request body
 * @return the response object produced by the client; the caller reads its entity
 * @throws ClientProtocolException if the client reports an HTTP protocol error
 * @throws IOException if the request cannot be executed
 */
public static HttpResponse sendXmlDataByPost(String url,String xmlData) throws ClientProtocolException, IOException {
    HttpClient httpClient = HttpClients.createDefault();
    HttpPost httpPost = new HttpPost(url);
    // StringEntity(String) falls back to ISO-8859-1, which would corrupt any
    // non-Latin text while the header below still announces UTF-8.
    httpPost.setEntity(new StringEntity(xmlData, Consts.UTF_8));
    httpPost.setHeader("Content-Type","text/xml;charset=UTF-8");
    return httpClient.execute(httpPost);
}
/**
 * Sends an HTTP GET request and returns the response body as text.
 *
 * <p>The client's connection manager is shut down before the method returns.
 * Protocol and I/O failures are logged at debug level and are not rethrown.
 * The request URI, status line, content length and body are printed to standard output.
 *
 * @param reqURL request address, including any query parameters
 * @param decodeCharset charset used to decode the response body; {@code null} selects UTF-8
 * @return the response body, or {@code null} when there was no entity or the request failed
 * @throws ParseException declared in the signature; nothing in this body throws it
 */
public static String sendGetRequest(String reqURL, String decodeCharset) throws ParseException{
    final String charsetName = (decodeCharset != null) ? decodeCharset : "UTF-8";
    HttpClient client = new DefaultHttpClient();
    HttpGet getRequest = new HttpGet(reqURL);
    String body = null;
    long bodyLength = 0;
    try{
        HttpResponse reply = client.execute(getRequest);
        HttpEntity payload = reply.getEntity();
        if(payload != null){
            bodyLength = payload.getContentLength();
            body = EntityUtils.toString(payload, charsetName);
            EntityUtils.consume(payload);
        }
        // Diagnostic dump of the exchange.
        System.out.println("請求位址: " + getRequest.getURI());
        System.out.println("響應狀态: " + reply.getStatusLine());
        System.out.println("響應長度: " + bodyLength);
        System.out.println("響應内容: " + body);
    }catch(ClientProtocolException protocolProblem){
        logger.debug("該異常通常是協定錯誤導緻,比如構造HttpGet對象時傳入的協定不對(将'http'寫成'htp')或者伺服器端傳回的内容不符合HTTP協定要求等,堆棧資訊如下", protocolProblem);
    }catch(IOException ioProblem){
        logger.debug("該異常通常是網絡原因引起的,如HTTP伺服器未啟動等,堆棧資訊如下", ioProblem);
    }finally{
        // Always release the connection resources held by the client.
        client.getConnectionManager().shutdown();
    }
    return body;
}
/**
 * Sends an HTTP POST request; shorthand for
 * {@code sendPostRequest(String, String, boolean, String, String)} with both charsets
 * left as {@code null}, which makes the full overload use UTF-8 for encoding the
 * request and decoding the response.
 *
 * <p>When {@code isEncoder} is {@code true} the name/value pairs in {@code sendData}
 * are URL-encoded by the full overload.
 *
 * @param reqURL request address
 * @param sendData request parameters in {@code name1=value1&name2=value2} form
 * @param isEncoder {@code true} if the request data still needs to be URL-encoded
 * @return whatever the full overload returns for these arguments
 */
public static String sendPostRequest(String reqURL, String sendData, boolean isEncoder){
    // null for a charset means "use the default" in the five-argument overload.
    final String useDefaultCharset = null;
    return sendPostRequest(reqURL, sendData, isEncoder, useDefaultCharset, useDefaultCharset);
}
/**
 * Sends an HTTP POST request with an {@code application/x-www-form-urlencoded} body.
 *
 * <p>The client's connection manager is shut down before the method returns. Any
 * exception raised while building or executing the request is logged at debug level
 * and is not rethrown.
 *
 * <p>When {@code isEncoder} is {@code true}, {@code sendData} is split on {@code &},
 * each segment is split on its first {@code =}, and the pairs are URL-encoded with
 * {@code encodeCharset}. Empty segments are skipped and a segment without {@code =}
 * is sent as a parameter name with no value. When {@code isEncoder} is {@code false},
 * {@code sendData} is sent as given, converted to bytes with {@code encodeCharset}.
 *
 * @param reqURL request address
 * @param sendData request parameters in {@code name1=value1&name2=value2} form
 * @param isEncoder {@code true} if the request data still needs to be URL-encoded
 * @param encodeCharset charset for the request data; {@code null} selects UTF-8
 * @param decodeCharset charset for decoding the response body; {@code null} selects UTF-8
 * @return the response body, or {@code null} when there was no entity or the request failed
 */
public static String sendPostRequest(String reqURL, String sendData, boolean isEncoder, String encodeCharset, String decodeCharset){
    String responseContent = null;
    String requestCharset = encodeCharset==null ? "UTF-8" : encodeCharset;
    String responseCharset = decodeCharset==null ? "UTF-8" : decodeCharset;
    HttpClient httpClient = new DefaultHttpClient();
    HttpPost httpPost = new HttpPost(reqURL);
    httpPost.setHeader(HTTP.CONTENT_TYPE, "application/x-www-form-urlencoded");
    try{
        if(isEncoder){
            List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
            for(String pair : sendData.split("&")){
                if(pair.length() == 0){
                    continue; // e.g. "a=1&&b=2": nothing to send for an empty segment
                }
                int separator = pair.indexOf('=');
                if(separator < 0){
                    // No '=' at all: previously substring(0,-1) threw and the whole
                    // request was silently dropped. Send it as a valueless parameter.
                    formParams.add(new BasicNameValuePair(pair, null));
                }else{
                    formParams.add(new BasicNameValuePair(pair.substring(0, separator), pair.substring(separator + 1)));
                }
            }
            // The formatted string is percent-encoded, so the entity charset does not matter here.
            httpPost.setEntity(new StringEntity(URLEncodedUtils.format(formParams, requestCharset)));
        }else{
            // Honour encodeCharset; the single-argument constructor would fall back to
            // ISO-8859-1 and replace non-Latin characters.
            httpPost.setEntity(new StringEntity(sendData, requestCharset));
        }
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            responseContent = EntityUtils.toString(entity, responseCharset);
            EntityUtils.consume(entity);
        }
    }catch(Exception e){
        logger.debug("與[" + reqURL + "]通信過程中發生異常,堆棧資訊如下", e);
    }finally{
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTP POST request whose form parameters are taken from a map.
 *
 * <p>The parameters are URL-encoded with {@code encodeCharset}. Non-String values are
 * converted with {@code String.valueOf}; a {@code null} value is sent as a parameter
 * name with no value; a {@code null} map is treated as "no parameters". The client's
 * connection manager is shut down before the method returns. Exceptions raised while
 * building or executing the request are logged at debug level and are not rethrown.
 *
 * @param reqURL request address
 * @param params request parameters; may be {@code null}
 * @param encodeCharset charset for encoding the request data; {@code null} selects UTF-8
 * @param decodeCharset charset for decoding the response body; {@code null} selects UTF-8
 * @return the response body, or {@code null} when there was no entity or the request failed
 */
public static String sendPostRequest(String reqURL, Map<String,Object> params, String encodeCharset, String decodeCharset){
    String responseContent = null;
    // Build the request before the client so an invalid URL cannot leave a client open.
    HttpPost httpPost = new HttpPost(reqURL);
    HttpClient httpClient = new DefaultHttpClient();
    try{
        List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
        if(params != null){
            for(Map.Entry<String,Object> entry : params.entrySet()){
                Object value = entry.getValue();
                // A plain (String) cast threw ClassCastException for Integer, Long, etc.
                formParams.add(new BasicNameValuePair(entry.getKey(), value == null ? null : String.valueOf(value)));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(formParams, encodeCharset==null ? "UTF-8" : encodeCharset));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            responseContent = EntityUtils.toString(entity, decodeCharset==null ? "UTF-8" : decodeCharset);
            EntityUtils.consume(entity);
        }
    }catch(Exception e){
        logger.debug("與[" + reqURL + "]通信過程中發生異常,堆棧資訊如下", e);
    }finally{
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTPS POST request; shorthand for
 * {@code sendPostSSLRequest(String, Map, String, String)} with both charsets left as
 * {@code null}, which makes the full overload use UTF-8 for encoding the request and
 * decoding the response.
 *
 * @param reqURL request address
 * @param params request parameters, URL-encoded by the full overload
 * @return whatever the full overload returns for these arguments
 */
public static String sendPostSSLRequest(String reqURL, Map params){
    // null for a charset means "use the default" in the four-argument overload.
    final String useDefaultCharset = null;
    return sendPostSSLRequest(reqURL, params, useDefaultCharset, useDefaultCharset);
}
/**
 * Sends an HTTPS POST request whose form parameters are taken from a map.
 *
 * <p>SECURITY: the trust manager installed here accepts every client and server
 * certificate without inspection, so the peer is not authenticated. This is the
 * method's existing behaviour and is kept; flagged for review before production use.
 *
 * <p>The parameters are URL-encoded with {@code encodeCharset}. Non-String values are
 * converted with {@code String.valueOf}; a {@code null} map is treated as "no
 * parameters". The client's connection manager is shut down before the method returns.
 * Exceptions are logged at debug level and are not rethrown.
 *
 * @param reqURL request address
 * @param params request parameters; may be {@code null}
 * @param encodeCharset charset for encoding the request data; {@code null} selects UTF-8
 * @param decodeCharset charset for decoding the response body; {@code null} selects UTF-8
 * @return the response body, or an empty string when there was no entity or the request failed
 */
public static String sendPostSSLRequest(String reqURL, Map<String,Object> params, String encodeCharset, String decodeCharset){
    String responseContent = "";
    HttpClient httpClient = new DefaultHttpClient();
    // Trust-all manager: both checks are deliberately empty (see SECURITY note above).
    X509TrustManager xtm = new X509TrustManager(){
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        // The interface contract asks for a non-null (possibly empty) array.
        public X509Certificate[] getAcceptedIssuers() {return new X509Certificate[0];}
    };
    try {
        SSLContext ctx = SSLContext.getInstance("TLS");
        ctx.init(null, new TrustManager[]{xtm}, null);
        SSLSocketFactory socketFactory = new SSLSocketFactory(ctx);
        // Route https traffic of this client through the socket factory built above.
        httpClient.getConnectionManager().getSchemeRegistry().register(new Scheme("https", 443, socketFactory));
        HttpPost httpPost = new HttpPost(reqURL);
        List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
        if(params != null){
            for(Map.Entry<String,Object> entry : params.entrySet()){
                Object value = entry.getValue();
                // A plain (String) cast failed the whole request for non-String values.
                formParams.add(new BasicNameValuePair(entry.getKey(), value == null ? null : String.valueOf(value)));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(formParams, encodeCharset==null ? "UTF-8" : encodeCharset));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            responseContent = EntityUtils.toString(entity, decodeCharset==null ? "UTF-8" : decodeCharset);
            EntityUtils.consume(entity);
        }
    } catch (Exception e) {
        logger.debug("與[" + reqURL + "]通信過程中發生異常,堆棧資訊為", e);
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTP POST request whose body is built from a map.
 *
 * <p>The entries are joined as {@code key=value} pairs separated by {@code &}, in the
 * map's iteration order, and passed to {@code sendPostRequestByJava(String, String)}.
 * Keys and values are appended as they are; nothing is URL-encoded here, so callers
 * must pre-encode non-ASCII text with the charset agreed with the remote side.
 *
 * @param reqURL request address
 * @param params data to send as the request body
 * @return whatever {@code sendPostRequestByJava(String, String)} returns for the joined body
 */
public static String sendPostRequestByJava(String reqURL, Map<String,Object> params){
    StringBuilder body = new StringBuilder();
    boolean firstPair = true;
    for(Map.Entry<String,Object> pair : params.entrySet()){
        // Put the separator in front of every pair except the first,
        // so no trailing '&' has to be trimmed afterwards.
        if(!firstPair){
            body.append('&');
        }
        firstPair = false;
        body.append(pair.getKey()).append('=').append(pair.getValue());
    }
    return sendPostRequestByJava(reqURL, body.toString());
}
/**
 * Sends an HTTP POST request using {@code HttpURLConnection}.
 *
 * <p>Connect and read timeouts are both 30 seconds. The request body is written as
 * UTF-8 bytes and the response is read to the end of the stream and decoded as UTF-8.
 * Callers must pre-encode non-ASCII text in {@code sendData} with the charset agreed
 * with the remote side.
 *
 * @param reqURL request address
 * @param sendData data to send as the request body
 * @return the response body, a back-tick and the HTTP status code, e.g. {@code "SUCCESS`200"};
 *         if anything fails, {@code "Failed`"} followed by the status code seen so far
 *         (0 when none was received), e.g. {@code "Failed`500"}
 */
public static String sendPostRequestByJava(String reqURL, String sendData){
    HttpURLConnection httpURLConnection = null;
    OutputStream out = null; // request body
    InputStream in = null;   // response body
    int httpStatusCode = 0;  // status code returned by the remote host
    try{
        URL sendUrl = new URL(reqURL);
        httpURLConnection = (HttpURLConnection)sendUrl.openConnection();
        httpURLConnection.setRequestMethod("POST");
        httpURLConnection.setDoOutput(true); // we are going to write a body; the default is false
        httpURLConnection.setUseCaches(false);
        httpURLConnection.setConnectTimeout(30000);
        httpURLConnection.setReadTimeout(30000);
        out = httpURLConnection.getOutputStream();
        // Explicit charset: getBytes() without one depends on the platform default.
        out.write(sendData.getBytes("UTF-8"));
        out.flush();
        httpStatusCode = httpURLConnection.getResponseCode();
        in = httpURLConnection.getInputStream();
        // Read until end of stream. available() only reports what can be read without
        // blocking, so sizing a buffer from it truncated or emptied larger responses.
        java.io.ByteArrayOutputStream received = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int count;
        while((count = in.read(chunk)) != -1){
            received.write(chunk, 0, count);
        }
        return received.toString("UTF-8") + "`" + httpStatusCode;
    }catch(Exception e){
        // Keep the stack trace; the message alone is often null or uninformative.
        logger.debug(e.getMessage(), e);
        return "Failed`" + httpStatusCode;
    }finally{
        if(out != null){
            try{
                out.close();
            }catch (Exception e){
                logger.debug("關閉輸出流時發生異常,堆棧資訊如下", e);
            }
        }
        if(in != null){
            try{
                in.close();
            }catch(Exception e){
                logger.debug("關閉輸入流時發生異常,堆棧資訊如下", e);
            }
        }
        if(httpURLConnection != null){
            httpURLConnection.disconnect();
        }
    }
}
/**
 * Sends an HTTPS POST request with certificate and host name checks disabled.
 *
 * <p>SECURITY: the trust manager accepts every certificate and the host name verifier
 * accepts every host, so the peer is not authenticated. This is the method's stated
 * purpose and is kept; flagged for review before production use.
 *
 * <p>The parameters are URL-encoded as UTF-8 and the response is decoded as UTF-8.
 * Non-String values are converted with {@code String.valueOf}; a {@code null} map is
 * treated as "no parameters". The client's connection manager is shut down before the
 * method returns. Security-setup and I/O failures are logged and are not rethrown.
 *
 * @param url request address
 * @param params request parameters; may be {@code null}
 * @return the response body, or {@code null} when there was no entity or the request failed
 * @throws ParseException declared in the signature; nothing in this body throws it
 */
public static final String sendHttpsRequestByPost(String url, Map<String,Object> params) throws ParseException {
    String responseContent = null;
    HttpClient httpClient = new DefaultHttpClient();
    // Trust-all manager: both checks are deliberately empty (see SECURITY note above).
    X509TrustManager xtm = new X509TrustManager() {
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        // The interface contract asks for a non-null (possibly empty) array.
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    };
    // Host name verifier that accepts everything.
    X509HostnameVerifier hostnameVerifier = new X509HostnameVerifier() {
        public boolean verify(String arg0, SSLSession arg1) {
            return true;
        }
        public void verify(String arg0, SSLSocket arg1) throws IOException {}
        public void verify(String arg0, String[] arg1, String[] arg2) throws SSLException {}
        public void verify(String arg0, X509Certificate arg1) throws SSLException {}
    };
    try {
        SSLContext ctx = SSLContext.getInstance("TLS");
        // The trust manager is only consulted by sockets created from this context.
        ctx.init(null, new TrustManager[] { xtm }, null);
        SSLSocketFactory socketFactory = new SSLSocketFactory(ctx);
        socketFactory.setHostnameVerifier(hostnameVerifier);
        // Route https traffic of this client through the socket factory built above.
        httpClient.getConnectionManager().getSchemeRegistry().register(new Scheme("https", socketFactory, 443));
        HttpPost httpPost = new HttpPost(url);
        List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
        if (params != null) {
            for (Map.Entry<String,Object> entry : params.entrySet()) {
                Object value = entry.getValue();
                // A plain (String) cast threw an uncaught ClassCastException for non-String values.
                formParams.add(new BasicNameValuePair(entry.getKey(), value == null ? null : String.valueOf(value)));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(formParams, "UTF-8"));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            responseContent = EntityUtils.toString(entity, "UTF-8");
        }
    } catch (KeyManagementException e) {
        // Logged at error level so failures stay as visible as printStackTrace made them.
        logger.error("與[" + url + "]通信過程中發生異常,堆棧資訊如下", e);
    } catch (NoSuchAlgorithmException e) {
        logger.error("與[" + url + "]通信過程中發生異常,堆棧資訊如下", e);
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and ClientProtocolException.
        logger.error("與[" + url + "]通信過程中發生異常,堆棧資訊如下", e);
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends a JSON document with an HTTP POST and returns the response body.
 *
 * <p>Connect and socket timeouts are both 30 seconds. The request body is encoded as
 * UTF-8. The response body is decoded with the charset the response declares, falling
 * back to UTF-8 when it declares none. The response and the client are always closed.
 *
 * @param url request address
 * @param body JSON text used as the request body
 * @return the response body when the status is 200 and an entity is present,
 *         otherwise {@code null}
 * @throws Exception if the request cannot be built or executed, or closing fails
 */
public static String sendPostByJson(String url, String body) throws Exception {
    CloseableHttpClient httpclient = HttpClients.custom().build();
    try {
        HttpPost post = new HttpPost(url);
        post.setConfig(RequestConfig.custom().setConnectTimeout(30000).setSocketTimeout(30000).build());
        post.setHeader("Content-Type", "application/json");
        post.setEntity(new StringEntity(body, Consts.UTF_8));
        try {
            CloseableHttpResponse result = httpclient.execute(post);
            try {
                if (HttpStatus.SC_OK != result.getStatusLine().getStatusCode()) {
                    return null;
                }
                HttpEntity entity = result.getEntity();
                // Without a default charset a response that declares none would be
                // decoded as ISO-8859-1, garbling non-Latin JSON text.
                return entity == null ? null : EntityUtils.toString(entity, Consts.UTF_8);
            } finally {
                result.close();
            }
        } finally {
            post.releaseConnection();
        }
    } finally {
        // Nested finally blocks: the client is closed even if closing the response throws.
        httpclient.close();
    }
}
}
API
在Jsoup中主要有兩個對象 一個是Document另一個是Elements或者Element
通過Document我們能夠得到一個完整的文檔對象
然後可以通過Document提供的API進行查詢到對應的元素(Element)或者是傳回多個元素(Elements)
然後我們還可以通過對應的API擷取到具體屬性的值
更直接的可以直接使用Document裡面提供好的API進行類似CSS選擇器的查詢方式擷取元素
迫不及待了吧 我們來看看如何使用
下面代碼注釋很詳細哦 看注釋吧 自己要多練習哦
package top.liwenxiang.jsoup;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small demo of the Jsoup API: downloads a page with {@code HttpClientUtil}, parses it
 * into a {@code Document}, and prints text found by tag name, by attribute and by
 * CSS-style selectors. Lookups that match nothing are skipped instead of crashing.
 */
public class 爬蟲庫使用 {
    /**
     * Runs the demo against {@code http://liwenxiang.top}.
     *
     * @param args unused
     * @throws ParseException declared by {@code HttpClientUtil.sendGetRequest}
     */
    public static void main(String[] args) throws ParseException {
        // Fetch the page over HTTP.
        String html = HttpClientUtil.sendGetRequest("http://liwenxiang.top","utf-8");
        System.out.println(html);
        System.out.println("------------------------");
        if (html == null) {
            // sendGetRequest returns null when the request failed; there is nothing to parse.
            System.err.println("No HTML was downloaded; nothing to parse.");
            return;
        }
        // Build a Document from the HTML string.
        Document doc = Jsoup.parse(html);
        // getElementsByTag returns every element with the given tag, hence Elements.
        // Related lookups work the same way, for example:
        //   Element byId = doc.getElementById("title");          // single element
        //   Elements byClass = doc.getElementsByClass("title");  // several elements
        Elements titles = doc.getElementsByTag("title");
        printText(titles.first());
        System.out.println("------------------------");
        // All <a> elements of the page.
        Elements anchors = doc.getElementsByTag("a");
        for (Element anchor : anchors) {
            // Elements at or below this anchor that carry an href attribute;
            // print the text of the first one, if any.
            Elements withHref = anchor.getElementsByAttribute("href");
            printText(withHref.first());
        }
        // CSS-style selectors are the most convenient way to query the document.
        printText(doc.select(".home1").first());
        printText(doc.select("a[href=http://t.cn/AiC20SOG]").first());
    }

    /** Prints the text of {@code element}; does nothing when the lookup found no element. */
    private static void printText(Element element) {
        if (element != null) {
            System.out.println(element.text());
        }
    }
}
結語
今天的分享就這麼多啦~~~