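// Building the HttpMethod itself is not shown here (examples are easy to find online); this snippet only shows how to make HttpClient decode fetched pages correctly regardless of their encoding.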
/**
 * Reads the response body of an executed HTTP method and decodes it to a string.
 *
 * <p>The charset is chosen in this order: the charset declared inside the page
 * (as reported by {@code getCharSetByBody}), then the charset reported by the
 * response for {@code PostMethod}/{@code GetMethod}, then UTF-8. A candidate
 * that is null, blank, or not supported by this JVM is skipped.
 *
 * @param method the already executed HTTP method whose response body is read
 * @return the decoded page content, or an empty string when the method
 *         reports no response body
 * @throws Exception if the response body cannot be read or decoded
 */
private static String readInputStream(HttpMethod method) throws Exception {
    // Only PostMethod/GetMethod are asked for a response charset; any other
    // method type keeps the UTF-8 default instead of failing on a blind cast.
    String charset = "UTF-8";
    if (method instanceof PostMethod) {
        charset = ((PostMethod) method).getResponseCharSet();
    } else if (method instanceof GetMethod) {
        charset = ((GetMethod) method).getResponseCharSet();
    }

    byte[] bytes = method.getResponseBody();
    if (bytes == null) {
        // No body available: nothing to decode.
        return "";
    }

    // First pass: decode as UTF-8 only to locate the page's own charset
    // declaration; the markup that carries it is plain ASCII.
    String body = new String(bytes, "UTF-8");
    String declared = getCharSetByBody(body, charset);

    if (isUsableCharset(declared)) {
        charset = declared.trim();
    } else if (isUsableCharset(charset)) {
        charset = charset.trim();
    } else {
        charset = "UTF-8";
    }
    return new String(bytes, charset);
}

/** Returns true when {@code name} is a non-blank charset name this JVM can decode. */
private static boolean isUsableCharset(String name) {
    if (name == null || name.trim().isEmpty()) {
        return false;
    }
    try {
        return java.nio.charset.Charset.isSupported(name.trim());
    } catch (java.nio.charset.IllegalCharsetNameException e) {
        // Syntactically invalid name (e.g. stray quotes): treat as unusable.
        return false;
    }
}
/**
 * Determines the character encoding declared inside an HTML page.
 *
 * <p>The {@code meta} elements are scanned in document order. Both
 * {@code <meta http-equiv="Content-Type" content="...; charset=X">} and
 * {@code <meta charset="X">} are recognised; the first non-blank declaration
 * wins.
 *
 * @param html    the page markup to inspect
 * @param charset the fallback charset, returned when the page declares none
 * @return the charset declared by the page, or {@code charset} if no usable
 *         declaration is found
 */
private static String getCharSetByBody(String html, String charset) {
    Document document = parseJSoupDocumentFromHtml(html, Constants.parseBaseUri);
    for (Element metaElement : document.select("meta")) {
        String declared;
        if ("content-type".equalsIgnoreCase(metaElement.attr("http-equiv"))) {
            declared = getCharSet(metaElement.attr("content"));
        } else {
            // HTML5 short form; blank when the attribute is absent.
            declared = metaElement.attr("charset");
        }
        // Keep the fallback unless the page really names a charset, so a
        // content-type meta without "charset=" no longer yields null.
        if (StringUtils.isNotBlank(declared)) {
            return declared.trim();
        }
    }
    return charset;
}
/**
 * Matches a {@code charset=...} parameter: case-insensitive, optional spaces
 * around {@code =}, optional opening quote; the captured name stops at
 * whitespace, {@code ;} or a quote.
 */
private static final Pattern CHARSET_PATTERN =
        Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset name from a Content-Type style value such as
 * {@code text/html; charset=UTF-8}.
 *
 * @param content the Content-Type value to inspect; may be null
 * @return the charset name without quotes or surrounding whitespace, or
 *         {@code null} when {@code content} is null or names no charset
 */
private static String getCharSet(String content) {
    if (content == null) {
        return null;
    }
    Matcher matcher = CHARSET_PATTERN.matcher(content);
    return matcher.find() ? matcher.group(1) : null;
}