天天看點

JSoup 擷取正文,自動識別頁面編碼Charset

 
           
/**
 * Downloads the page at {@code url} with an HTTP GET and decodes its body
 * using the charset reported by {@code JsoupUtils.getCharset(url)}.
 *
 * @param url address of the page to download
 * @return the decoded page text; an empty string when the response status is
 *         not 200 (SC_OK); {@code null} when the response carries no body
 * @throws Exception if the request fails, the charset lookup fails, or the
 *                   reported charset name is not supported by the JVM
 */
public static String getContent(String url) throws Exception{
		HttpClient hc = new HttpClient();
		HttpMethod hm = new GetMethod(url);
		byte[] result;
		try {
			int statusCode = hc.executeMethod(hm);
			if (statusCode != HttpStatus.SC_OK) { // only a 200 response is treated as success
				return "";
			}
			// Read the body once; it is fully buffered, so the connection can be released afterwards.
			result = hm.getResponseBody();
		} finally {
			// Always hand the connection back, including on the early return and on exceptions.
			hm.releaseConnection();
		}
		if (result == null) {
			// No body: nothing to decode, so skip the (network-bound) charset lookup.
			return null;
		}
		String charset = JsoupUtils.getCharset(url); // charset declared by the page, detected via jsoup
		return new String(result, charset); // decode the raw bytes with the detected charset
	}
           

[代码] 获得字符集

/**
	 * Determines the charset declared by the page at {@code siteurl}.
	 *
	 * The page is fetched and parsed with jsoup (6 second timeout). The legacy
	 * {@code <meta http-equiv="Content-Type" content="...; charset=X">} declaration
	 * is checked first; if it is absent, the HTML5 {@code <meta charset="X">}
	 * declaration is used.
	 *
	 * @param siteurl address of the page to inspect
	 * @return the declared charset name, or "gb2312" when no declaration is found
	 * @throws Exception if the URL is malformed or the page cannot be fetched
	 */
	public static String getCharset (String siteurl) throws Exception{
		URL url = new URL(siteurl);
		Document doc = Jsoup.parse(url, 6*1000);

		// Legacy declaration: let the regex helper pull the charset out of the tag's markup.
		Elements eles = doc.select("meta[http-equiv=Content-Type]");
		Iterator<Element> itor = eles.iterator();
		if (itor.hasNext()) {
			return RegularUtils.matchCharset(itor.next().toString());
		}

		// HTML5 declaration: the charset is the attribute value itself.
		Iterator<Element> html5Metas = doc.select("meta[charset]").iterator();
		if (html5Metas.hasNext()) {
			String declared = html5Metas.next().attr("charset").trim();
			if (declared.length() > 0) {
				return declared;
			}
		}

		// Nothing declared: keep the historical default.
		return "gb2312";
	}
           

[代码] 使用正则表达式获得页面字符

/**
 * Matches a {@code charset=NAME} declaration, case-insensitively, allowing
 * optional whitespace around '=' and an optional opening quote before the name.
 * Only characters that can appear in a charset name are captured, so the match
 * stops at the closing quote, a semicolon, or whitespace.
 */
private static final Pattern CHARSET_PATTERN =
		Pattern.compile("charset\\s*=\\s*[\"']?([\\w.:+-]+)", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset name from a piece of markup such as a
 * {@code <meta http-equiv="Content-Type" content="text/html; charset=utf-8">} tag.
 *
 * @param content markup or header text to search; may be {@code null}
 * @return the first charset name found, or "gb2312" when {@code content} is
 *         {@code null} or contains no charset declaration
 */
public static String matchCharset(String content) {
	String chs = "gb2312";
	if (content == null) {
		return chs;
	}
	Matcher m = CHARSET_PATTERN.matcher(content);
	if (m.find()) {
		// group(1) is just the charset name, without quotes or trailing attributes.
		return m.group(1);
	}
	return chs;
}
           

 

繼續閱讀