博主个人主页
前言
今天来分享一个非常好用的Java爬虫库:拿到一串HTML文本字符串之后,可以随意获取自己需要的数据。这个库的名字是Jsoup,它内置了可以让我们像使用CSS选择器一样获取页面元素的方法,并且一些API和JavaScript操作DOM的API是一致的。
使用
pom依赖
我们需要用到httpclient发送http请求获取数据
我们需要用到junit做单元测试 不用也可以
jsoup就是主要的啦
<!-- Apache HttpClient: used by the HttpClientUtil helper class to send the HTTP requests -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<!-- JUnit: only needed for unit tests (test scope); optional for this tutorial -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- jsoup: the main library of this tutorial; parses HTML and offers the selector-style query API -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
httpclient 工具类
package top.liwenxiang.jsoup;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.conn.ssl.X509HostnameVerifier;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
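/**
 * Static helper methods for sending HTTP and HTTPS requests, mostly through Apache HttpClient.
 * <p>The class was originally written against HttpComponents-Client 4.2.1, and several
 * methods still use the {@code DefaultHttpClient} API from that release line.
 */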
public class HttpClientUtil {
private HttpClientUtil(){}
private static Log logger = LogFactory.getLog(HttpClientUtil.class);
/**
 * Posts an XML document to {@code url} and returns the raw, unconsumed response.
 *
 * <p>The body is encoded as UTF-8 so that it matches the {@code charset=UTF-8} declared
 * in the Content-Type header. The caller reads the response through
 * {@code getEntity().getContent()} and is responsible for releasing it.
 *
 * @param url     target address
 * @param xmlData XML text sent as the request body
 * @return the response exactly as returned by the HTTP client
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException             on a connection problem
 */
public static HttpResponse sendXmlDataByPost(String url,String xmlData) throws ClientProtocolException, IOException {
    HttpPost httpPost = new HttpPost(url);
    // StringEntity(String) encodes with the ISO-8859-1 default, which would corrupt
    // non-Latin text while the header below still announces UTF-8; encode explicitly.
    httpPost.setEntity(new StringEntity(xmlData, Consts.UTF_8));
    httpPost.setHeader("Content-Type","text/xml;charset=UTF-8");
    // The client is intentionally not closed here: the caller still has to read the
    // response stream, which is backed by this client's connection.
    HttpClient httpClient = HttpClients.createDefault();
    return httpClient.execute(httpPost);
}
/**
 * Sends an HTTP GET request and returns the response body as text.
 *
 * <p>The connection is shut down before the method returns. The request URI, the status
 * line, the reported content length and the body are printed to standard output.
 * Failures are logged at ERROR level and reported to the caller as a {@code null} result.
 *
 * @param reqURL        request address, including any query string
 * @param decodeCharset charset used to decode the response body; UTF-8 when {@code null}
 * @return the response body, or {@code null} when the response carried no entity or the
 *         request failed
 * @throws ParseException kept in the signature for backward compatibility with existing
 *                        callers; no call in this method declares it
 */
public static String sendGetRequest(String reqURL, String decodeCharset) throws ParseException{
    long responseLength = 0;
    String responseContent = null;
    // Build the request first so that an invalid URL fails before a client is allocated.
    HttpGet httpGet = new HttpGet(reqURL);
    HttpClient httpClient = new DefaultHttpClient();
    try{
        HttpResponse response = httpClient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        if(null != entity){
            responseLength = entity.getContentLength();
            // toString reads the stream to its end and closes it, so no extra consume step is needed.
            responseContent = EntityUtils.toString(entity, decodeCharset==null ? "UTF-8" : decodeCharset);
        }
        System.out.println("请求地址: " + httpGet.getURI());
        System.out.println("响应状态: " + response.getStatusLine());
        System.out.println("响应长度: " + responseLength);
        System.out.println("响应内容: " + responseContent);
    }catch(ClientProtocolException e){
        // Logged at ERROR: at DEBUG the failure would be invisible under a default configuration.
        logger.error("该异常通常是协议错误导致,比如构造HttpGet对象时传入的协议不对(将'http'写成'htp')或者服务器端返回的内容不符合HTTP协议要求等,堆栈信息如下", e);
    }catch(IOException e){
        logger.error("该异常通常是网络原因引起的,如HTTP服务器未启动等,堆栈信息如下", e);
    }finally{
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTP POST request, using UTF-8 for both the request and the response.
 *
 * <p>Convenience overload of
 * {@code sendPostRequest(String, String, boolean, String, String)}: it passes
 * {@code null} for both charsets, which that overload replaces with UTF-8.
 *
 * @param reqURL    request address
 * @param sendData  request body in {@code name=value&name=value} form
 * @param isEncoder {@code true} when the values in {@code sendData} still need to be
 *                  URL-encoded (as UTF-8) before sending
 * @return whatever the five-argument overload returns for these arguments
 */
public static String sendPostRequest(String reqURL, String sendData, boolean isEncoder){
    // null for either charset selects the UTF-8 default in the delegate.
    final String requestCharset = null;
    final String responseCharset = null;
    return sendPostRequest(reqURL, sendData, isEncoder, requestCharset, responseCharset);
}
/**
 * Sends an HTTP POST request with a form-style body and returns the response body.
 *
 * <p>The connection is shut down before the method returns. When {@code isEncoder} is
 * {@code true}, {@code sendData} is split on {@code &} and each name and value is
 * URL-encoded with {@code encodeCharset}. Empty segments are skipped, and a segment
 * without {@code =} is sent as a name without a value. Failures are logged at ERROR
 * level and reported as a {@code null} result.
 *
 * @param reqURL        request address
 * @param sendData      request parameters joined as {@code name1=value1&name2=value2}
 * @param isEncoder     {@code true} when {@code sendData} still needs to be URL-encoded
 * @param encodeCharset charset used to encode the request body; UTF-8 when {@code null}
 * @param decodeCharset charset used to decode the response body; UTF-8 when {@code null}
 * @return the response body, or {@code null} when the response carried no entity or the
 *         request failed
 */
public static String sendPostRequest(String reqURL, String sendData, boolean isEncoder, String encodeCharset, String decodeCharset){
    String requestCharset = encodeCharset==null ? "UTF-8" : encodeCharset;
    String responseCharset = decodeCharset==null ? "UTF-8" : decodeCharset;
    String responseContent = null;
    HttpPost httpPost = new HttpPost(reqURL);
    httpPost.setHeader(HTTP.CONTENT_TYPE, "application/x-www-form-urlencoded");
    HttpClient httpClient = new DefaultHttpClient();
    try{
        String body = sendData;
        if(isEncoder){
            List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
            for(String pair : sendData.split("&")){
                if(pair.length() == 0){
                    continue; // tolerate "a=1&&b=2" and an empty sendData
                }
                int separator = pair.indexOf('=');
                if(separator < 0){
                    // No '=' present: keep the token as a bare name instead of failing the whole request.
                    formParams.add(new BasicNameValuePair(pair, null));
                }else{
                    formParams.add(new BasicNameValuePair(pair.substring(0, separator), pair.substring(separator + 1)));
                }
            }
            body = URLEncodedUtils.format(formParams, requestCharset);
        }
        // Encode the body with the requested charset; StringEntity(String) would always use ISO-8859-1.
        httpPost.setEntity(new StringEntity(body, requestCharset));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            responseContent = EntityUtils.toString(entity, responseCharset);
        }
    }catch(Exception e){
        // Logged at ERROR: at DEBUG the failure would be invisible under a default configuration.
        logger.error("与[" + reqURL + "]通信过程中发生异常,堆栈信息如下", e);
    }finally{
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTP POST request whose form parameters come from a map.
 *
 * <p>The connection is shut down before the method returns. Names and values are
 * URL-encoded with {@code encodeCharset}. Non-String values are converted with
 * {@code toString()}; a {@code null} value is sent as a name without a value. A
 * {@code null} map is treated as an empty form. Failures are logged at ERROR level and
 * reported as a {@code null} result.
 *
 * @param reqURL        request address
 * @param params        request parameters; may be {@code null}
 * @param encodeCharset charset used to encode the request body; UTF-8 when {@code null}
 * @param decodeCharset charset used to decode the response body; UTF-8 when {@code null}
 * @return the response body, or {@code null} when the response carried no entity or the
 *         request failed
 */
public static String sendPostRequest(String reqURL, Map<String,Object> params, String encodeCharset, String decodeCharset){
    String responseContent = null;
    // Build the form before allocating the client so that a bad parameter cannot leak a connection manager.
    List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
    if(params != null){
        for(Map.Entry<String,Object> entry : params.entrySet()){
            Object value = entry.getValue();
            // The map is declared with Object values, so convert instead of casting to String.
            formParams.add(new BasicNameValuePair(entry.getKey(), value==null ? null : value.toString()));
        }
    }
    HttpPost httpPost = new HttpPost(reqURL);
    HttpClient httpClient = new DefaultHttpClient();
    try{
        httpPost.setEntity(new UrlEncodedFormEntity(formParams, encodeCharset==null ? "UTF-8" : encodeCharset));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            responseContent = EntityUtils.toString(entity, decodeCharset==null ? "UTF-8" : decodeCharset);
        }
    }catch(Exception e){
        // Logged at ERROR: at DEBUG the failure would be invisible under a default configuration.
        logger.error("与[" + reqURL + "]通信过程中发生异常,堆栈信息如下", e);
    }finally{
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTPS POST request, using UTF-8 for both the request and the response.
 *
 * <p>Convenience overload of {@code sendPostSSLRequest(String, Map, String, String)}:
 * it passes {@code null} for both charsets, which that overload replaces with UTF-8.
 *
 * @param reqURL request address
 * @param params request parameters
 * @return whatever the four-argument overload returns for these arguments
 */
public static String sendPostSSLRequest(String reqURL, Map params){
    // null for either charset selects the UTF-8 default in the delegate.
    final String requestCharset = null;
    final String responseCharset = null;
    return sendPostSSLRequest(reqURL, params, requestCharset, responseCharset);
}
/**
 * Sends an HTTPS POST request whose form parameters come from a map.
 *
 * <p>The connection is shut down before the method returns. Names and values are
 * URL-encoded with {@code encodeCharset}. Non-String values are converted with
 * {@code toString()}; a {@code null} map is treated as an empty form.
 *
 * <p><b>Security note:</b> the trust manager installed here accepts every certificate,
 * so the server's identity is not verified. Review before using this outside of testing.
 *
 * @param reqURL        request address
 * @param params        request parameters; may be {@code null}
 * @param encodeCharset charset used to encode the request body; UTF-8 when {@code null}
 * @param decodeCharset charset used to decode the response body; UTF-8 when {@code null}
 * @return the response body, or an empty string when the response carried no entity or
 *         the request failed
 */
public static String sendPostSSLRequest(String reqURL, Map<String,Object> params, String encodeCharset, String decodeCharset){
    String responseContent = "";
    HttpClient httpClient = new DefaultHttpClient();
    // NOTE(review): trust-all manager — both check methods are empty, so any certificate chain is accepted.
    X509TrustManager xtm = new X509TrustManager(){
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract asks for a non-null (possibly empty) array.
            return new X509Certificate[0];
        }
    };
    try {
        SSLContext ctx = SSLContext.getInstance("TLS");
        ctx.init(null, new TrustManager[]{xtm}, null);
        SSLSocketFactory socketFactory = new SSLSocketFactory(ctx);
        httpClient.getConnectionManager().getSchemeRegistry().register(new Scheme("https", 443, socketFactory));
        HttpPost httpPost = new HttpPost(reqURL);
        List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
        if(params != null){
            for(Map.Entry<String,Object> entry : params.entrySet()){
                Object value = entry.getValue();
                // The map is declared with Object values, so convert instead of casting to String.
                formParams.add(new BasicNameValuePair(entry.getKey(), value==null ? null : value.toString()));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(formParams, encodeCharset==null ? "UTF-8" : encodeCharset));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            responseContent = EntityUtils.toString(entity, decodeCharset==null ? "UTF-8" : decodeCharset);
        }
    } catch (Exception e) {
        // Logged at ERROR: at DEBUG the failure would be invisible under a default configuration.
        logger.error("与[" + reqURL + "]通信过程中发生异常,堆栈信息为", e);
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTP POST request built from a parameter map.
 *
 * <p>The entries are joined as {@code key=value} pairs separated by {@code &} and handed
 * to {@code sendPostRequestByJava(String, String)}. Keys and values are appended as-is;
 * nothing is URL-encoded here, so callers must encode non-ASCII text themselves with the
 * charset agreed with the remote side.
 *
 * @param reqURL request address
 * @param params data sent to the remote host as the request body
 * @return whatever {@code sendPostRequestByJava(String, String)} returns for the joined body
 */
public static String sendPostRequestByJava(String reqURL, Map<String,Object> params){
    StringBuilder body = new StringBuilder();
    for(Map.Entry<?, ?> pair : params.entrySet()){
        // Put the separator in front of every pair except the first one.
        if(body.length() > 0){
            body.append("&");
        }
        body.append(pair.getKey()).append("=").append(pair.getValue());
    }
    String joined = body.toString();
    return sendPostRequestByJava(reqURL, joined);
}
/**
 * Sends an HTTP POST request through {@code HttpURLConnection}.
 *
 * <p>Connect and read timeouts are both 30 seconds. The body is written as UTF-8 bytes
 * and the whole response is read and decoded as UTF-8. {@code sendData} is sent as-is,
 * so callers must URL-encode non-ASCII text themselves with the charset agreed with the
 * remote side.
 *
 * @param reqURL   request address
 * @param sendData request body sent to the remote host
 * @return the response body followed by a backtick and the HTTP status code (for example
 *         {@code "SUCCESS`200"}), or {@code "Failed`"} followed by the status code seen so
 *         far (0 if none was received) when an exception occurs
 */
public static String sendPostRequestByJava(String reqURL, String sendData){
    HttpURLConnection httpURLConnection = null;
    OutputStream out = null;
    InputStream in = null;
    int httpStatusCode = 0;
    try{
        URL sendUrl = new URL(reqURL);
        httpURLConnection = (HttpURLConnection)sendUrl.openConnection();
        httpURLConnection.setRequestMethod("POST");
        httpURLConnection.setDoOutput(true); // required before writing a request body
        httpURLConnection.setUseCaches(false);
        httpURLConnection.setConnectTimeout(30000);
        httpURLConnection.setReadTimeout(30000);
        out = httpURLConnection.getOutputStream();
        // Fixed charset instead of the platform default, so the bytes do not depend on the host.
        out.write(sendData.getBytes("UTF-8"));
        out.flush();
        httpStatusCode = httpURLConnection.getResponseCode();
        in = httpURLConnection.getInputStream();
        // Read until end of stream: available() only reports what can be read without
        // blocking, so sizing a single read from it can truncate the response.
        java.io.ByteArrayOutputStream responseBytes = new java.io.ByteArrayOutputStream();
        byte[] buffer = new byte[4096];
        int count;
        while((count = in.read(buffer)) != -1){
            responseBytes.write(buffer, 0, count);
        }
        return responseBytes.toString("UTF-8") + "`" + httpStatusCode;
    }catch(Exception e){
        // Keep the stack trace and log at ERROR so that the failure is visible.
        logger.error("与[" + reqURL + "]通信过程中发生异常,堆栈信息如下", e);
        return "Failed`" + httpStatusCode;
    }finally{
        if(out != null){
            try{
                out.close();
            }catch (Exception e){
                logger.debug("关闭输出流时发生异常,堆栈信息如下", e);
            }
        }
        if(in != null){
            try{
                in.close();
            }catch(Exception e){
                logger.debug("关闭输入流时发生异常,堆栈信息如下", e);
            }
        }
        if(httpURLConnection != null){
            httpURLConnection.disconnect();
        }
    }
}
/**
 * Sends an HTTPS POST request with UTF-8 form parameters, bypassing certificate and
 * host name verification.
 *
 * <p>The connection is shut down before the method returns. Non-String values are
 * converted with {@code toString()}; a {@code null} map is treated as an empty form.
 * Failures are logged at ERROR level and reported as a {@code null} result.
 *
 * <p><b>Security note:</b> the trust manager accepts every certificate and the host name
 * verifier accepts every host, so the server's identity is not verified. Review before
 * using this outside of testing.
 *
 * @param url    request address
 * @param params request parameters; may be {@code null}
 * @return the response body, or {@code null} when the response carried no entity or the
 *         request failed
 * @throws ParseException kept in the signature for backward compatibility with existing
 *                        callers; no call in this method declares it
 */
public static final String sendHttpsRequestByPost(String url, Map<String,Object> params) throws ParseException {
    String responseContent = null;
    HttpClient httpClient = new DefaultHttpClient();
    // NOTE(review): trust-all manager — both check methods are empty, so any certificate chain is accepted.
    X509TrustManager xtm = new X509TrustManager() {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {}
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract asks for a non-null (possibly empty) array.
            return new X509Certificate[0];
        }
    };
    // NOTE(review): accept-all host name verifier — every verify variant succeeds unconditionally.
    X509HostnameVerifier hostnameVerifier = new X509HostnameVerifier() {
        @Override
        public boolean verify(String arg0, SSLSession arg1) {
            return true;
        }
        @Override
        public void verify(String arg0, SSLSocket arg1) throws IOException {}
        @Override
        public void verify(String arg0, String[] arg1, String[] arg2) throws SSLException {}
        @Override
        public void verify(String arg0, X509Certificate arg1) throws SSLException {}
    };
    try {
        SSLContext ctx = SSLContext.getInstance("TLS");
        ctx.init(null, new TrustManager[] { xtm }, null);
        SSLSocketFactory socketFactory = new SSLSocketFactory(ctx);
        socketFactory.setHostnameVerifier(hostnameVerifier);
        // Register the permissive socket factory for https on this client only, using the
        // same (name, port, factory) constructor as sendPostSSLRequest.
        httpClient.getConnectionManager().getSchemeRegistry().register(new Scheme("https", 443, socketFactory));
        HttpPost httpPost = new HttpPost(url);
        List<BasicNameValuePair> formParams = new ArrayList<BasicNameValuePair>();
        if (params != null) {
            for (Map.Entry<String,Object> entry : params.entrySet()) {
                Object value = entry.getValue();
                // The map is declared with Object values, so convert instead of casting to String.
                formParams.add(new BasicNameValuePair(entry.getKey(), value == null ? null : value.toString()));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(formParams, "UTF-8"));
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            responseContent = EntityUtils.toString(entity, "UTF-8");
        }
    } catch (KeyManagementException e) {
        logger.error("与[" + url + "]通信过程中发生异常,堆栈信息如下", e);
    } catch (NoSuchAlgorithmException e) {
        logger.error("与[" + url + "]通信过程中发生异常,堆栈信息如下", e);
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and ClientProtocolException, both IOException subclasses.
        logger.error("与[" + url + "]通信过程中发生异常,堆栈信息如下", e);
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
    return responseContent;
}
/**
 * Sends an HTTP POST request with a JSON body and returns the response body.
 *
 * <p>Connect and socket timeouts are both 30 seconds. The request body is encoded as
 * UTF-8. The response body is returned only for status 200; when the response does not
 * declare a charset it is decoded as UTF-8.
 *
 * @param url  request address
 * @param body JSON text sent as the request body
 * @return the response body, or {@code null} when the status is not 200 or the response
 *         carried no entity
 * @throws Exception if the request cannot be executed or a resource cannot be closed
 */
public static String sendPostByJson(String url, String body) throws Exception {
    CloseableHttpClient httpclient = HttpClients.custom().build();
    HttpPost post = null;
    String resData = null;
    CloseableHttpResponse result = null;
    try {
        post = new HttpPost(url);
        post.setConfig(RequestConfig.custom().setConnectTimeout(30000).setSocketTimeout(30000).build());
        post.setHeader("Content-Type", "application/json");
        post.setEntity(new StringEntity(body, Consts.UTF_8));
        result = httpclient.execute(post);
        if (HttpStatus.SC_OK == result.getStatusLine().getStatusCode()) {
            HttpEntity responseEntity = result.getEntity();
            if (responseEntity != null) {
                // Pass UTF-8 as the default charset rather than relying on the library's fallback.
                resData = EntityUtils.toString(responseEntity, Consts.UTF_8);
            }
        }
    } finally {
        // Nested finally blocks: a failure while closing one resource must not skip the others.
        try {
            if (result != null) {
                result.close();
            }
        } finally {
            try {
                if (post != null) {
                    post.releaseConnection();
                }
            } finally {
                httpclient.close();
            }
        }
    }
    return resData;
}
}
API
在Jsoup中主要有两个对象 一个是Document另一个是Elements或者Element
通过Document我们能够得到一个完整的文档对象
然后可以通过Document提供的API进行查询到对应的元素(Element)或者是返回多个元素(Elements)
然后我们还可以通过对应的API获取到具体属性的值
更直接的可以直接使用Document里面提供好的API进行类似CSS选择器的查询方式获取元素
迫不及待了吧 我们来看看如何使用
下面代码注释很详细哦 看注释吧 自己要多练习哦
package top.liwenxiang.jsoup;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demo: fetches a page with {@code HttpClientUtil} and queries the HTML with Jsoup.
 *
 * <p>Every lookup is guarded, so a failed request or a page that lacks the expected
 * elements produces less output instead of an exception.
 */
public class 爬虫库使用 {
    public static void main(String[] args) throws ParseException {
        // Fetch the page over HTTP.
        String html = HttpClientUtil.sendGetRequest("http://liwenxiang.top","utf-8");
        System.out.println(html);
        System.out.println("------------------------");
        if (html == null) {
            // sendGetRequest returns null when the request fails; there is nothing to parse.
            System.err.println("No HTML received, nothing to parse.");
            return;
        }
        // Build a Document from the HTML string.
        Document doc = Jsoup.parse(html);
        // getElementsByTag returns every element with the given tag, hence Elements (plural).
        // Sibling lookups work the same way:
        //   doc.getElementById("title")      -> a single Element
        //   doc.getElementsByClass("title")  -> Elements
        Elements titles = doc.getElementsByTag("title");
        if (!titles.isEmpty()) {
            System.out.println(titles.get(0).text());
        }
        System.out.println("------------------------");
        // All <a> elements of the page.
        Elements links = doc.getElementsByTag("a");
        for (Element link : links) {
            // Only links that carry an href attribute are printed; text() yields the visible text.
            Elements withHref = link.getElementsByAttribute("href");
            if (withHref.size() != 0) {
                System.out.println(withHref.get(0).text());
            }
        }
        // CSS-style selectors are the most convenient way to query; most CSS selector syntax is supported.
        printFirstText(doc.select(".home1"));
        printFirstText(doc.select("a[href=http://t.cn/AiC20SOG]"));
    }

    /** Prints the text of the first matched element, or nothing when the selection is empty. */
    private static void printFirstText(Elements matches) {
        Element first = matches.first();
        if (first != null) {
            System.out.println(first.text());
        }
    }
}
结语
今天的分享就这么多啦~~~