天天看點

利用HttpClient擷取網頁内容

HTTP協定是目前網際網路上最重要的協定,許多軟體與服務都需要依賴HTTP協定。

雖然java.net這個package中包含了對HTTP的基本支援,但還有很多進階和複雜的功能無法實作,這不能不說是一個遺憾。

HttpClient作為Apache的開源項目之一,為基于HTTP協定的操作提供了強大的用戶端執行支援,最新的版本為3.0RC3。

下面通過一個例子簡要展示HttpClient的使用方法:

--------------------------------------------------------------------------------

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.UnsupportedEncodingException;

import java.util.*;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HostConfiguration;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpConnection;

import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.methods.PostMethod;

public class HttpClientExample {

//獲得ConnectionManager,設定相關參數

private static MultiThreadedHttpConnectionManager manager =

new MultiThreadedHttpConnectionManager();

private static int connectionTimeOut = 20000;

private static int socketTimeOut = 10000;

private static int maxConnectionPerHost = 5;

private static int maxTotalConnections = 40;

//标志初始化是否完成的flag

private static boolean initialed = false;

//初始化ConnectionManger的方法

public static void SetPara() {

manager.getParams().setConnectionTimeout(connectionTimeOut);

manager.getParams().setSoTimeout(socketTimeOut);

manager.getParams()

.setDefaultMaxConnectionsPerHost(maxConnectionPerHost);

manager.getParams().setMaxTotalConnections(maxTotalConnections);

initialed = true;

JAVA手機網[www.cnjm.net] }

//通過get方法擷取網頁内容

public static String getGetResponseWithHttpClient(String url, String encode) {

JAVA手機網[www.cnjm.net] HttpClient client = new HttpClient(manager);

if (initialed) {

JAVA手機網[www.cnjm.net] HttpClientExample.SetPara();

}

GetMethod get = new GetMethod(url);

get.setFollowRedirects(true);

JAVA手機網[www.cnjm.net] String result = null;

StringBuffer resultBuffer = new StringBuffer();

try {

client.executeMethod(get);

JAVA手機網[www.cnjm.net]

JAVA手機網[www.cnjm.net] //在目标頁面情況未知的條件下,不推薦使用getResponseBodyAsString()方法

//String strGetResponseBody = post.getResponseBodyAsString();

BufferedReader in = new BufferedReader(

new InputStreamReader(

get.getResponseBodyAsStream(),

get.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

JAVA手機網[www.cnjm.net] resultBuffer.append("/n");

}

in.close();

result = resultBuffer.toString();

//iso-8859-1 is the default reading encode

result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),

get.getResponseCharSet(),

encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

JAVA手機網[www.cnjm.net] } finally {

get.releaseConnection();

return result;

}

}

public static String getPostResponseWithHttpClient(String url,

String encode) {

HttpClient client = new HttpClient(manager);

JAVA手機網[www.cnjm.net]

if (initialed) {

HttpClientExample.SetPara();

}

JAVA手機網[www.cnjm.net] PostMethod post = new PostMethod(url);

post.setFollowRedirects(false);

StringBuffer resultBuffer = new StringBuffer();

String result = null;

try {

client.executeMethod(post);

BufferedReader in = new BufferedReader(

JAVA手機網[www.cnjm.net] new InputStreamReader(

post.getResponseBodyAsStream(),

post.getResponseCharSet()));

String inputLine = null;

JAVA手機網[www.cnjm.net] while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

JAVA手機網[www.cnjm.net] resultBuffer.append("/n");

}

in.close();

//iso-8859-1 is the default reading encode

result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),

post.getResponseCharSet(),

encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

JAVA手機網[www.cnjm.net] } finally {

post.releaseConnection();

JAVA手機網[www.cnjm.net] return result;

}

}

public static String getPostResponseWithHttpClient(String url,

String encode,

NameValuePair[] nameValuePair) {

HttpClient client = new HttpClient(manager);

if (initialed) {

HttpClientExample.SetPara();

}

JAVA手機網[www.cnjm.net] PostMethod post = new PostMethod(url);

post.setRequestBody(nameValuePair);

post.setFollowRedirects(false);

JAVA手機網[www.cnjm.net]

String result = null;

StringBuffer resultBuffer = new StringBuffer();

try {

client.executeMethod(post);

BufferedReader in = new BufferedReader(

new InputStreamReader(

post.getResponseBodyAsStream(),

post.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

JAVA手機網[www.cnjm.net] resultBuffer.append(inputLine);

resultBuffer.append("/n");

}

in.close();

JAVA手機網[www.cnjm.net]

//iso-8859-1 is the default reading encode

result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),

post.getResponseCharSet(),

encode);

} catch (Exception e) {

e.printStackTrace();

JAVA手機網[www.cnjm.net] result = "";

} finally {

post.releaseConnection();

return result;

}

}

JAVA手機網[www.cnjm.net] private static String ConverterStringCode(String source, String srcEncode, String destEncode) {

if (src != null) {

try {

return new String(src.getBytes(srcEncode), destEncode);

} catch (UnsupportedEncodingException e) {

// TODO Auto-generated catch block

e.printStackTrace();

return "";

}

} else {

return "";

JAVA手機網[www.cnjm.net] }

}

}

--------------------------------------------------------------------------------

之後,就可以通過下面的代碼獲得目标網頁:

// The URL must be absolute (scheme included) so HttpClient knows which host to contact.
String source = HttpClientExample.getGetResponseWithHttpClient("http://www.sina.com.cn", "GBK");

注意,在預設情況下,HttpClient的Request的Head中

User-Agent的值是Jakarta Commons-HttpClient 3.0RC1,如果需要改變它(例如,變為Mozilla/4.0),必須在調用之前運作如下語句:

System.getProperties().setProperty("httpclient.useragent", "Mozilla/4.0");