天天看点

Java 远程抓取网页信息

//笔记

//

//

jar包:httpclient-4.3.5.jar,jsoup-1.7.2.jar

 // Create an HttpClient instance using the default configuration.

 CloseableHttpClient httpClient = HttpClients.createDefault();

 // Submit with POST (the author thinks of HttpPost as playing the role of a browser).

 HttpPost httppost = new HttpPost("要抓取网页验证码的url地址"); 

 // Set the request headers: load the captcha in a normal browser first, then open F12
 // and copy the values shown under Headers.
 // NOTE(review): the statement below is cut off mid-literal in this excerpt; the
 // complete version of the snippet appears later in this file.

 httppost.setHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/

jar包:httpclient-4.3.5.jar,jsoup-1.7.2.jar

// Fetch a remote page with Apache HttpClient and parse the returned HTML with jsoup.
//
// Flow: build a POST request, copy the browser's request headers onto it, attach
// timeouts and form parameters, execute it, read the body as UTF-8 text, and select
// the elements of interest. Both the response and the client are closed when done.

// Create an HttpClient instance using the default configuration.
CloseableHttpClient httpClient = HttpClients.createDefault();

// Submit with POST (think of HttpPost as playing the role of a browser).
HttpPost httppost = new HttpPost("要抓取网页验证码的url地址");

// Request headers. The empty values are placeholders: load the page in a normal
// browser first, open F12, and copy the values shown under Headers.
httppost.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
httppost.setHeader("Accept-Encoding", "");
httppost.setHeader("Accept-Language", "");
httppost.setHeader("Content-Type", "");
httppost.setHeader("Cookie", "");
httppost.setHeader("Host", "");
httppost.setHeader("Origin", "");
httppost.setHeader("User-Agent", "");

// Timeouts in milliseconds: waiting for a connection from the pool (3000),
// establishing the connection (8000), and waiting for data on the socket (8000).
RequestConfig requestConfig = RequestConfig.custom()
        .setConnectionRequestTimeout(3000)
        .setConnectTimeout(8000)
        .setSocketTimeout(8000)
        .build();

// Attach the timeout configuration to the request.
httppost.setConfig(requestConfig);

// Form parameters sent in the request body.
List<BasicNameValuePair> params = new ArrayList<BasicNameValuePair>();
params.add(new BasicNameValuePair("参数名", 参数值));
params.add(new BasicNameValuePair("参数名", 参数值));

try {
    // URL-encode the parameters as UTF-8 and attach them to the request.
    UrlEncodedFormEntity uefEntity = new UrlEncodedFormEntity(params, "UTF-8");
    httppost.setEntity(uefEntity);

    // Execute the request.
    CloseableHttpResponse resp = httpClient.execute(httppost);
    try {
        HttpEntity entity = resp.getEntity();
        String result = EntityUtils.toString(entity, "UTF-8");

        // Parse the HTML string into a jsoup Document.
        Document doc = Jsoup.parse(result);

        // With the Document in hand, the selectors depend on the target page.
        // Example: the td cells inside tr rows under the element whose class is
        // "mainmaintableright".
        Elements div = doc.select(".mainmaintableright").select("tr").select("td");

        // Iterate over div here and store the extracted values.
    } finally {
        // Always release the response, even if reading or parsing fails.
        resp.close();
    }
} catch (Exception e) {
    e.printStackTrace();
} finally {
    // Release the client's connection manager; the original never closed it.
    try {
        httpClient.close();
    } catch (Exception closeFailure) {
        closeFailure.printStackTrace();
    }
}

//个人笔记,新手。

继续阅读