天天看點

Java爬蟲庫 - Jsoup 使用

部落客個人首頁

前言

今天來分享一個非常好用的Java爬蟲庫——Jsoup。拿到一串HTML文本字元串之後,它可以讓我們随意擷取自己需要的資料。Jsoup内置了類似CSS選擇器的方法來擷取頁面上的元素,并且一些API和JavaScript操作DOM的API是一致的。

使用

pom依賴

我們需要用到httpclient發送http請求擷取資料

我們需要用到junit做單元測試 不用也可以

jsoup就是主要的啦

<!-- Apache HttpClient: sends the HTTP requests that fetch the pages to parse -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.3</version>
</dependency>

<!-- JUnit: optional, only needed when writing unit tests -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>3.8.1</version>
    <scope>test</scope>
</dependency>

<!-- jsoup: the HTML parser this article is about -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
           

httpclient 工具類

package top.liwenxiang.jsoup;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.conn.ssl.X509HostnameVerifier;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
 
/**
 * Static helpers for sending HTTP and HTTPS requests, built on Apache HttpClient 4.x
 * and, for the {@code ...ByJava} methods, on the JDK's {@link HttpURLConnection}.
 *
 * <p>Unless a method says otherwise, each call creates its own client, sends one
 * request, releases the connection and returns the response body as a string.
 * Communication failures are logged and reported through the return value instead of
 * being thrown.
 *
 * <p><b>Security:</b> {@link #sendPostSSLRequest(String, Map, String, String)} and
 * {@link #sendHttpsRequestByPost(String, Map)} accept every server certificate, and the
 * latter also disables host name verification. They offer no protection against a
 * man-in-the-middle and should only be pointed at hosts you control.
 */
public class HttpClientUtil {
    /** Charset used whenever a caller passes {@code null} for an encode/decode charset. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    private static final Log logger = LogFactory.getLog(HttpClientUtil.class);

    /** Not instantiable: the class only exposes static helpers. */
    private HttpClientUtil(){}

    /**
     * Posts an XML document and hands the raw response back to the caller.
     *
     * <p>The caller reads the body with {@code getEntity().getContent()} and is
     * responsible for consuming it; this method does not close the client it creates.
     *
     * @param url     target address
     * @param xmlData XML payload, sent UTF-8 encoded
     * @return the unconsumed HTTP response
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection problem
     */
    public static HttpResponse sendXmlDataByPost(String url,String xmlData) throws ClientProtocolException, IOException {
        HttpClient httpClient = HttpClients.createDefault();
        HttpPost httpPost = new HttpPost(url);
        // Encode explicitly as UTF-8 so the bytes match the charset announced in the
        // header below; the one-argument StringEntity constructor uses ISO-8859-1.
        httpPost.setEntity(new StringEntity(xmlData, Consts.UTF_8));
        httpPost.setHeader("Content-Type","text/xml;charset=UTF-8");
        return httpClient.execute(httpPost);
    }

    /**
     * Sends an HTTP GET request and returns the response body.
     *
     * <p>The connection is released before the method returns. Request URI, status
     * line, length and body are logged at debug level.
     *
     * @param reqURL        request address, including any query string
     * @param decodeCharset charset used to decode the body when the response does not
     *                      declare one; {@code null} means UTF-8
     * @return the response body, or {@code null} if the request failed or had no body
     * @throws ParseException never thrown by this implementation; kept in the
     *                        signature so existing callers keep compiling
     */
    public static String sendGetRequest(String reqURL, String decodeCharset) throws ParseException{
        String responseContent = null;
        HttpClient httpClient = new DefaultHttpClient();
        HttpGet httpGet = new HttpGet(reqURL);
        try{
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            long responseLength = 0;
            if(null != entity){
                responseLength = entity.getContentLength();
                // toString() reads the stream to the end and closes it.
                responseContent = EntityUtils.toString(entity, charsetOrDefault(decodeCharset));
            }
            if(logger.isDebugEnabled()){
                logger.debug("請求位址: " + httpGet.getURI());
                logger.debug("響應狀态: " + response.getStatusLine());
                logger.debug("響應長度: " + responseLength);
                logger.debug("響應内容: " + responseContent);
            }
        }catch(ClientProtocolException e){
            logger.error("該異常通常是協定錯誤導緻,比如構造HttpGet對象時傳入的協定不對(将'http'寫成'htp')或者伺服器端傳回的内容不符合HTTP協定要求等,堆棧資訊如下", e);
        }catch(IOException e){
            logger.error("該異常通常是網絡原因引起的,如HTTP伺服器未啟動等,堆棧資訊如下", e);
        }finally{
            httpClient.getConnectionManager().shutdown();
        }
        return responseContent;
    }

    /**
     * Sends an HTTP POST request using UTF-8 for both the request and the response.
     *
     * <p>Shorthand for
     * {@link #sendPostRequest(String, String, boolean, String, String)} with both
     * charsets left at their default.
     *
     * @param reqURL    request address
     * @param sendData  request data in {@code name1=value1&name2=value2} form
     * @param isEncoder {@code true} to URL-encode the names and values in
     *                  {@code sendData} before sending
     * @return the response body, or {@code null} if the request failed or had no body
     */
    public static String sendPostRequest(String reqURL, String sendData, boolean isEncoder){
        return sendPostRequest(reqURL, sendData, isEncoder, null, null);
    }

    /**
     * Sends an HTTP POST request whose body is given as a ready-made form string.
     *
     * <p>The connection is released before the method returns.
     *
     * @param reqURL        request address
     * @param sendData      request data in {@code name1=value1&name2=value2} form
     * @param isEncoder     {@code true} to split {@code sendData} into pairs and
     *                      URL-encode them with {@code encodeCharset}; {@code false}
     *                      to send {@code sendData} as is
     * @param encodeCharset charset for the request body; {@code null} means UTF-8
     * @param decodeCharset charset used to decode the body when the response does not
     *                      declare one; {@code null} means UTF-8
     * @return the response body, or {@code null} if the request failed or had no body
     */
    public static String sendPostRequest(String reqURL, String sendData, boolean isEncoder, String encodeCharset, String decodeCharset){
        String responseContent = null;
        HttpClient httpClient = new DefaultHttpClient();
        HttpPost httpPost = new HttpPost(reqURL);
        httpPost.setHeader(HTTP.CONTENT_TYPE, "application/x-www-form-urlencoded");
        try{
            String requestCharset = charsetOrDefault(encodeCharset);
            String body = isEncoder
                    ? URLEncodedUtils.format(parseFormData(sendData), requestCharset)
                    : sendData;
            // Pass the charset explicitly: without it non-Latin-1 characters in an
            // unencoded body would be replaced during the ISO-8859-1 conversion.
            httpPost.setEntity(new StringEntity(body, requestCharset));
            responseContent = readBody(httpClient.execute(httpPost), decodeCharset);
        }catch(Exception e){
            logger.error("與[" + reqURL + "]通信過程中發生異常,堆棧資訊如下", e);
        }finally{
            httpClient.getConnectionManager().shutdown();
        }
        return responseContent;
    }

    /**
     * Sends an HTTP POST request whose form fields are taken from a map.
     *
     * <p>Names and values are URL-encoded with {@code encodeCharset}; non-String
     * values are converted with {@link String#valueOf(Object)}. The connection is
     * released before the method returns.
     *
     * @param reqURL        request address
     * @param params        form fields; {@code null} is treated as no fields
     * @param encodeCharset charset for the request body; {@code null} means UTF-8
     * @param decodeCharset charset used to decode the body when the response does not
     *                      declare one; {@code null} means UTF-8
     * @return the response body, or {@code null} if the request failed or had no body
     */
    public static String sendPostRequest(String reqURL, Map<String,Object> params, String encodeCharset, String decodeCharset){
        String responseContent = null;
        HttpClient httpClient = new DefaultHttpClient();
        HttpPost httpPost = new HttpPost(reqURL);
        try{
            httpPost.setEntity(new UrlEncodedFormEntity(toFormParams(params), charsetOrDefault(encodeCharset)));
            responseContent = readBody(httpClient.execute(httpPost), decodeCharset);
        }catch(Exception e){
            logger.error("與[" + reqURL + "]通信過程中發生異常,堆棧資訊如下", e);
        }finally{
            httpClient.getConnectionManager().shutdown();
        }
        return responseContent;
    }

    /**
     * Sends an HTTPS POST request using UTF-8 for both the request and the response.
     *
     * <p>Shorthand for {@link #sendPostSSLRequest(String, Map, String, String)} with
     * both charsets left at their default. See that method for the certificate
     * handling caveat.
     *
     * @param reqURL request address
     * @param params form fields
     * @return the response body, or an empty string if the request failed or had no body
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw Map kept so existing callers keep compiling
    public static String sendPostSSLRequest(String reqURL, Map params){
        return sendPostSSLRequest(reqURL, params, null, null);
    }

    /**
     * Sends an HTTPS POST request whose form fields are taken from a map.
     *
     * <p>NOTE(security): the server certificate is NOT validated; every certificate
     * chain is accepted. The connection is released before the method returns.
     *
     * @param reqURL        request address
     * @param params        form fields; {@code null} is treated as no fields
     * @param encodeCharset charset for the request body; {@code null} means UTF-8
     * @param decodeCharset charset used to decode the body when the response does not
     *                      declare one; {@code null} means UTF-8
     * @return the response body, or an empty string if the request failed or had no body
     */
    public static String sendPostSSLRequest(String reqURL, Map<String,Object> params, String encodeCharset, String decodeCharset){
        String responseContent = "";
        HttpClient httpClient = new DefaultHttpClient();
        try {
            SSLSocketFactory socketFactory = newTrustAllSocketFactory();
            httpClient.getConnectionManager().getSchemeRegistry().register(new Scheme("https", 443, socketFactory));

            HttpPost httpPost = new HttpPost(reqURL);
            httpPost.setEntity(new UrlEncodedFormEntity(toFormParams(params), charsetOrDefault(encodeCharset)));

            String body = readBody(httpClient.execute(httpPost), decodeCharset);
            if (body != null) {
                responseContent = body;
            }
        } catch (Exception e) {
            logger.error("與[" + reqURL + "]通信過程中發生異常,堆棧資訊為", e);
        } finally {
            httpClient.getConnectionManager().shutdown();
        }
        return responseContent;
    }

    /**
     * Sends an HTTP POST request through {@link HttpURLConnection}, building the body
     * from a map.
     *
     * <p>Entries are joined as {@code key=value} pairs separated by {@code &}
     * without any URL encoding, so callers must encode names and values themselves
     * when they contain reserved or non-ASCII characters. Connect and read timeouts
     * are 30 seconds each.
     *
     * @param reqURL request address
     * @param params data to send; {@code null} is treated as no data
     * @return the response body followed by a backtick and the HTTP status code, for
     *         example {@code "SUCCESS`200"}; on failure {@code "Failed`"} followed by
     *         the status code seen so far, for example {@code "Failed`500"}
     */
    public static String sendPostRequestByJava(String reqURL, Map<String,Object> params){
        StringBuilder sendData = new StringBuilder();
        if(params != null){
            for(Map.Entry<String,Object> entry : params.entrySet()){
                if(sendData.length() > 0){
                    sendData.append("&");
                }
                sendData.append(entry.getKey()).append("=").append(entry.getValue());
            }
        }
        return sendPostRequestByJava(reqURL, sendData.toString());
    }

    /**
     * Sends an HTTP POST request through {@link HttpURLConnection}.
     *
     * <p>{@code sendData} is written as UTF-8 bytes without further encoding, so
     * callers must URL-encode it themselves where needed. The whole response body is
     * read and decoded as UTF-8. Connect and read timeouts are 30 seconds each.
     *
     * @param reqURL   request address
     * @param sendData body to send
     * @return the response body followed by a backtick and the HTTP status code, for
     *         example {@code "SUCCESS`200"}; on failure {@code "Failed`"} followed by
     *         the status code seen so far, for example {@code "Failed`500"}
     */
    public static String sendPostRequestByJava(String reqURL, String sendData){
        HttpURLConnection httpURLConnection = null;
        OutputStream out = null;
        InputStream in = null;
        int httpStatusCode = 0; // stays 0 until the server has answered
        try{
            URL sendUrl = new URL(reqURL);
            httpURLConnection = (HttpURLConnection)sendUrl.openConnection();
            httpURLConnection.setRequestMethod("POST");
            httpURLConnection.setDoOutput(true); // required to write a request body
            httpURLConnection.setUseCaches(false);
            httpURLConnection.setConnectTimeout(30000);
            httpURLConnection.setReadTimeout(30000);

            out = httpURLConnection.getOutputStream();
            out.write(sendData.getBytes(Consts.UTF_8));
            out.flush();

            httpStatusCode = httpURLConnection.getResponseCode();

            in = httpURLConnection.getInputStream();
            // Read until end of stream: available() only reports what can be read
            // without blocking and is not the length of the response.
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            int count;
            while((count = in.read(chunk)) != -1){
                body.write(chunk, 0, count);
            }
            return new String(body.toByteArray(), Consts.UTF_8) + "`" + httpStatusCode;
        }catch(Exception e){
            logger.error("與[" + reqURL + "]通信過程中發生異常,堆棧資訊如下", e);
            return "Failed`" + httpStatusCode;
        }finally{
            if(out != null){
                try{
                    out.close();
                }catch (Exception e){
                    logger.debug("關閉輸出流時發生異常,堆棧資訊如下", e);
                }
            }
            if(in != null){
                try{
                    in.close();
                }catch(Exception e){
                    logger.debug("關閉輸入流時發生異常,堆棧資訊如下", e);
                }
            }
            if(httpURLConnection != null){
                httpURLConnection.disconnect();
            }
        }
    }

    /**
     * Sends an HTTPS POST request that bypasses certificate validation.
     *
     * <p>NOTE(security): every certificate chain is accepted AND host name
     * verification is disabled. Form fields are URL-encoded as UTF-8 and the response
     * is decoded as UTF-8 when it does not declare a charset. The connection is
     * released before the method returns.
     *
     * @param url    request address
     * @param params form fields; {@code null} is treated as no fields
     * @return the response body, or {@code null} if the request failed or had no body
     * @throws ParseException never thrown by this implementation; kept in the
     *                        signature so existing callers keep compiling
     */
    public static final String sendHttpsRequestByPost(String url, Map<String,Object> params) throws ParseException {
        String responseContent = null;
        HttpClient httpClient = new DefaultHttpClient();
        // Verifier that accepts any host name for any certificate.
        X509HostnameVerifier hostnameVerifier = new X509HostnameVerifier() {
            @Override
            public boolean verify(String arg0, SSLSession arg1) {
                return true;
            }
            @Override
            public void verify(String arg0, SSLSocket arg1) throws IOException {}
            @Override
            public void verify(String arg0, String[] arg1, String[] arg2) throws SSLException {}
            @Override
            public void verify(String arg0, X509Certificate arg1) throws SSLException {}
        };
        try {
            SSLSocketFactory socketFactory = newTrustAllSocketFactory();
            socketFactory.setHostnameVerifier(hostnameVerifier);
            // Route https traffic of this client through the permissive factory.
            httpClient.getConnectionManager().getSchemeRegistry().register(new Scheme("https", 443, socketFactory));

            HttpPost httpPost = new HttpPost(url);
            httpPost.setEntity(new UrlEncodedFormEntity(toFormParams(params), DEFAULT_CHARSET));
            responseContent = readBody(httpClient.execute(httpPost), DEFAULT_CHARSET);
        } catch (KeyManagementException e) {
            logger.error("與[" + url + "]通信過程中發生異常,堆棧資訊如下", e);
        } catch (NoSuchAlgorithmException e) {
            logger.error("與[" + url + "]通信過程中發生異常,堆棧資訊如下", e);
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException and ClientProtocolException.
            logger.error("與[" + url + "]通信過程中發生異常,堆棧資訊如下", e);
        } finally {
            httpClient.getConnectionManager().shutdown();
        }
        return responseContent;
    }

    /**
     * Sends a JSON document with HTTP POST.
     *
     * <p>The body is sent UTF-8 encoded with {@code Content-Type: application/json};
     * connect and socket timeouts are 30 seconds each. The response is decoded as
     * UTF-8 when it does not declare a charset.
     *
     * @param url  request address
     * @param body JSON text to send
     * @return the response body when the status is 200, otherwise {@code null}
     * @throws Exception if the request cannot be built or sent, or the response
     *                   cannot be read or closed
     */
    public static String sendPostByJson(String url, String body) throws Exception {
        CloseableHttpClient httpclient = HttpClients.custom().build();
        HttpPost post = null;
        CloseableHttpResponse result = null;
        try {
            post = new HttpPost(url);
            post.setConfig(RequestConfig.custom().setConnectTimeout(30000).setSocketTimeout(30000).build());
            post.setHeader("Content-Type", "application/json");
            post.setEntity(new StringEntity(body, Consts.UTF_8));
            result = httpclient.execute(post);

            int status = result.getStatusLine().getStatusCode();
            if (HttpStatus.SC_OK != status) {
                logger.warn("POST " + url + " returned HTTP status " + status);
                return null;
            }
            HttpEntity entity = result.getEntity();
            return entity == null ? null : EntityUtils.toString(entity, Consts.UTF_8);
        } finally {
            // Nested so that a failure in one cleanup step cannot skip the next one.
            try {
                if (result != null) {
                    result.close();
                }
            } finally {
                try {
                    if (post != null) {
                        post.releaseConnection();
                    }
                } finally {
                    httpclient.close();
                }
            }
        }
    }

    /** Returns {@code charset}, or UTF-8 when {@code charset} is {@code null}. */
    private static String charsetOrDefault(String charset){
        return charset == null ? DEFAULT_CHARSET : charset;
    }

    /** Reads the response body as text; returns {@code null} when there is no entity. */
    private static String readBody(HttpResponse response, String decodeCharset) throws IOException {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        return EntityUtils.toString(entity, charsetOrDefault(decodeCharset));
    }

    /** Converts a map into form pairs, stringifying non-String values and keeping null values. */
    private static List<BasicNameValuePair> toFormParams(Map<?,?> params){
        List<BasicNameValuePair> pairs = new ArrayList<BasicNameValuePair>();
        if(params == null){
            return pairs;
        }
        for(Map.Entry<?,?> entry : params.entrySet()){
            Object value = entry.getValue();
            pairs.add(new BasicNameValuePair(String.valueOf(entry.getKey()), value == null ? null : String.valueOf(value)));
        }
        return pairs;
    }

    /** Splits {@code a=1&b=2} into pairs; a segment without {@code =} becomes a name with no value. */
    private static List<BasicNameValuePair> parseFormData(String sendData){
        List<BasicNameValuePair> pairs = new ArrayList<BasicNameValuePair>();
        for(String pair : sendData.split("&")){
            if(pair.length() == 0){
                continue; // tolerate empty input and stray separators
            }
            int separator = pair.indexOf('=');
            if(separator < 0){
                pairs.add(new BasicNameValuePair(pair, null));
            }else{
                pairs.add(new BasicNameValuePair(pair.substring(0, separator), pair.substring(separator + 1)));
            }
        }
        return pairs;
    }

    /** Builds a TLS socket factory whose trust manager accepts every certificate chain (insecure). */
    private static SSLSocketFactory newTrustAllSocketFactory() throws NoSuchAlgorithmException, KeyManagementException {
        X509TrustManager trustAll = new X509TrustManager(){
            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
            @Override
            public X509Certificate[] getAcceptedIssuers() {
                // The JSSE contract asks for a non-null, possibly empty, array.
                return new X509Certificate[0];
            }
        };
        SSLContext ctx = SSLContext.getInstance("TLS");
        ctx.init(null, new TrustManager[]{trustAll}, null);
        return new SSLSocketFactory(ctx);
    }
}
           

API

在Jsoup中主要有兩個對象 一個是Document另一個是Elements或者Element

通過Document我們能夠得到一個完整的文檔對象

然後可以通過Document提供的API進行查詢到對應的元素(Element)或者是傳回多個元素(Elements)

然後我們還可以通過對應的API擷取到具體屬性的值

更直接的可以直接使用Document裡面提供好的API進行類似CSS選擇器的查詢方式擷取元素

迫不及待了吧 我們來看看如何使用

下面代碼注釋很詳細哦 看注釋吧 自己要多練習哦

package top.liwenxiang.jsoup;

import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo of the basic jsoup workflow: download a page, parse it into a
 * {@link Document}, then look elements up by tag name and by CSS selector.
 */
public class 爬蟲庫使用 {

    /**
     * Fetches {@code http://liwenxiang.top} and prints the page source, its title,
     * the text of every link and the text of two selector matches.
     *
     * @param args unused
     * @throws ParseException declared by {@code HttpClientUtil.sendGetRequest}
     */
    public static void main(String[] args) throws ParseException {
        // Send the HTTP request and grab the raw HTML.
        String html = HttpClientUtil.sendGetRequest("http://liwenxiang.top","utf-8");
        System.out.println(html);
        System.out.println("------------------------");
        if (html == null) {
            // The request failed, and Jsoup.parse rejects null input.
            System.err.println("No HTML was returned; nothing to parse.");
            return;
        }

        // Build the Document from the HTML string.
        Document doc = Jsoup.parse(html);

        // getElementsByTag returns every element with that tag name, hence Elements.
        // Similar lookups exist, for example:
        //   Element byId = doc.getElementById("title");           // single element
        //   Elements byClass = doc.getElementsByClass("title");   // several elements
        printFirstText(doc.getElementsByTag("title"));
        System.out.println("------------------------");

        // For every <a>, collect the elements carrying an href attribute (the link
        // itself or anything nested in it) and print the text of the first one.
        for (Element link : doc.getElementsByTag("a")) {
            printFirstText(link.getElementsByAttribute("href"));
        }

        // CSS-style selectors are the most convenient way to query the document.
        printFirstText(doc.select(".home1"));

        // text() returns the combined text of the matched element.
        printFirstText(doc.select("a[href=http://t.cn/AiC20SOG]"));
    }

    /** Prints the text of the first matched element, or nothing when there is no match. */
    private static void printFirstText(Elements matches) {
        if (!matches.isEmpty()) {
            System.out.println(matches.get(0).text());
        }
    }
}

           

結語

今天的分享就這麼多啦~~~