天天看點

java遠端抓取網頁資訊

//筆記

//

//

jar包:httpclient-4.3.5.jar,jsoup-1.7.2.jar

 //建立httpclient執行個體,采用預設的參數配置

 CloseableHttpClient httpClient = HttpClients.createDefault();

 //使用post送出  (個人把HttpPost了解為是一個浏覽器)

 HttpPost httppost = new HttpPost("要抓取網頁驗證碼的url位址"); 

 //設定請求的頭  先用正常的浏覽器擷取驗證碼,F12檢視Headers的資訊對照着填寫

 httppost.setHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/

jar包:httpclient-4.3.5.jar,jsoup-1.7.2.jar

// Fetch a remote page with an HTTP POST and parse the returned HTML with jsoup.
// The string literals and the bare identifier 參數值 below are placeholders that the
// reader is expected to replace with real values.

// Submit with POST (the author thinks of HttpPost as standing in for a browser).
HttpPost httppost = new HttpPost("要抓取網頁驗證碼的url位址");

// Request headers: fetch the captcha once in a normal browser, open F12,
// and copy the values shown under Headers into the empty strings below.
httppost.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
httppost.setHeader("Accept-Encoding", "");
httppost.setHeader("Accept-Language", "");
httppost.setHeader("Content-Type", "");
httppost.setHeader("Cookie", "");
httppost.setHeader("Host", "");
httppost.setHeader("Origin", "");
httppost.setHeader("User-Agent", "");

// Request timeouts (milliseconds per the HttpClient API): time to obtain a connection
// from the pool, time to establish the connection, and time to wait for response data.
RequestConfig requestConfig = RequestConfig.custom()
        .setConnectionRequestTimeout(3000)
        .setConnectTimeout(8000)
        .setSocketTimeout(8000)
        .build();

// Attach the timeout configuration to the request.
httppost.setConfig(requestConfig);

// Form parameters sent in the POST body.
List<BasicNameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("參數名", 參數值));
params.add(new BasicNameValuePair("參數名", 參數值));

// Create the client with the default configuration. The client and the response are both
// opened in try-with-resources so they are closed even when a step throws.
try (CloseableHttpClient httpClient = HttpClients.createDefault()) {

    // Encode the parameters as a UTF-8 form body and attach them to the request.
    httppost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

    // Execute the request.
    try (CloseableHttpResponse resp = httpClient.execute(httppost)) {
        HttpEntity entity = resp.getEntity();
        String result = EntityUtils.toString(entity, "UTF-8");

        // Parse the HTML string into a jsoup Document.
        Document doc = Jsoup.parse(result);

        // With the Document in hand, extraction depends on the target page. Example:
        // take the td cells inside tr rows under the element with class "mainmaintableright".
        Elements cells = doc.select(".mainmaintableright").select("tr").select("td");

        // Iterate over cells here and store the values.
    }
} catch (Exception e) {
    e.printStackTrace();
}

//個人筆記,新手。

繼續閱讀