天天看點

擷取網頁源碼的幾種方式

/**
 * Small helper that demonstrates several ways of downloading the source of a
 * web page: plain {@link URL} streaming, Jsoup, and Apache HttpClient (with
 * optional gzip decoding and a permissive SSL setup for https URLs).
 *
 * <p>Every fetch method is best-effort: failures are logged and reported to the
 * caller as {@code null} (or as partial content for {@link #getPageSource}).
 */
public class Request {

	private static final Logger logger = Logger.getLogger(Request.class);

	/** Connect/read timeout, in milliseconds, used by the URL and Jsoup based fetchers. */
	private static final int TIMEOUT_MILLIS = 60 * 1000;

	/** Characters that {@link #encodeURL} percent-encodes: whitespace, brackets, braces and CJK ideographs. */
	private static final Pattern UNSAFE_URL_CHARS = Pattern.compile("[ \\[\\]{}\\s\u4e00-\u9fa5]");

	/**
	 * Trust manager that accepts every certificate chain.
	 *
	 * <p>SECURITY NOTE: this disables server authentication entirely and makes
	 * https connections vulnerable to man-in-the-middle attacks. It is kept
	 * because the crawler relies on it, but it must not be used for anything
	 * that handles sensitive data.
	 */
	private static final TrustManager TRUST_ALL_MANAGER = new X509TrustManager() {
		public void checkClientTrusted(java.security.cert.X509Certificate[] chain, String authType) throws CertificateException {
			// intentionally accepts everything
		}

		public void checkServerTrusted(java.security.cert.X509Certificate[] chain, String authType) throws CertificateException {
			// intentionally accepts everything
		}

		public java.security.cert.X509Certificate[] getAcceptedIssuers() {
			// The X509TrustManager contract requires a non-null (possibly empty) array.
			return new java.security.cert.X509Certificate[0];
		}
	};

	private HttpClient client;

	/** Creates a request helper backed by a fresh {@code DefaultHttpClient}. */
	public Request() {
		client = new DefaultHttpClient();
	}

	/**
	 * Downloads a page through {@link URL} and decodes it with the given charset.
	 *
	 * <p>Lines are concatenated without their line terminators. On failure the
	 * error is logged and whatever was read so far (possibly the empty string)
	 * is returned.
	 *
	 * @param pageUrl  address of the page to download
	 * @param encoding name of the charset used to decode the response
	 * @return the page content, never {@code null}
	 */
	public String getPageSource(String pageUrl, String encoding) {
		StringBuilder sb = new StringBuilder();
		java.io.InputStream stream = null;
		BufferedReader in = null;
		try {
			java.net.URLConnection connection = new URL(pageUrl).openConnection();
			// Without timeouts an unresponsive server would block the caller forever.
			connection.setConnectTimeout(TIMEOUT_MILLIS);
			connection.setReadTimeout(TIMEOUT_MILLIS);
			stream = connection.getInputStream();
			in = new BufferedReader(new InputStreamReader(stream, encoding));
			String line;
			while ((line = in.readLine()) != null) {
				sb.append(line);
			}
		} catch (Exception ex) {
			logger.error("failed to read page source, url: " + pageUrl, ex);
		} finally {
			// Close both: the reader may never have been created if the charset was rejected.
			closeQuietly(in);
			closeQuietly(stream);
		}
		return sb.toString();
	}

	/**
	 * Downloads and parses a page with Jsoup, using a 60 second timeout.
	 *
	 * @param url address of the page to download
	 * @return the parsed document, or {@code null} if the request failed with an I/O error
	 */
	public static Document getRequestByJSOUP(String url) {
		try {
			return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
		} catch (IOException e) {
			logger.error("jsoup request failed, url: " + url, e);
			return null;
		}
	}

	/**
	 * Issues a GET request and decodes the response body with the given charset.
	 *
	 * @param url     address to request
	 * @param charset name of the charset used to decode the body
	 * @return the decoded body, or {@code null} if nothing was retrieved or the
	 *         charset is not supported
	 */
	public String getRequest(String url, String charset) {
		byte[] cbyte = getRequest(url);
		if (cbyte == null) {
			logger.warn("采集url:" + url + "\t 沒有擷取到資料");
			return null;
		}
		try {
			return new String(cbyte, charset);
		} catch (UnsupportedEncodingException e) {
			logger.error("unsupported charset: " + charset, e);
			return null;
		}
	}

	/**
	 * Issues a GET request and returns the raw (already gzip-decoded) body.
	 *
	 * <p>For https URLs the permissive SSL scheme is registered on the client
	 * first, see {@link #enableSSL(HttpClient)}.
	 *
	 * @param url address to request
	 * @return the response body, or {@code null} if {@code url} is {@code null},
	 *         the status is not 200, or the request failed
	 */
	public byte[] getRequest(String url) {
		if (url == null) {
			logger.warn("get request skipped: url is null");
			return null;
		}
		// Only the scheme decides whether SSL is needed; an https URL embedded in
		// the query string of a plain http request must not trigger it.
		if (url.toLowerCase(Locale.US).startsWith("https://")) {
			enableSSL(client);
		}
		HttpGet getMethod = null;
		try {
			getMethod = new HttpGet(url);
			getMethod.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
			getMethod.addHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
			// Advertise only what getResponseBody can actually decode.
			getMethod.addHeader("Accept-Encoding", "gzip");
			getMethod.addHeader("Content-Type", "text/html; charset=utf8");

			HttpResponse httpResponse = client.execute(getMethod);
			int requestStatus = httpResponse.getStatusLine().getStatusCode();
			if (requestStatus == HttpStatus.SC_OK) {
				return getResponseBody(httpResponse);
			}
			logger.warn("unexpected status " + requestStatus + " for url: " + url);
		} catch (Exception e) {
			logger.error("get request failed, url: " + url, e);
		} finally {
			IOUtil.CloseHttpGet(getMethod);
			// NOTE(review): IOUtil is not visible here. If CloseHttpClient shuts the
			// client down, this instance cannot serve a second request - confirm
			// before reusing a Request object.
			IOUtil.CloseHttpClient(client);
		}
		return null;
	}

	/**
	 * Reads the response body, transparently decoding gzip-compressed content.
	 *
	 * <p>Bodies with any other (or no) {@code Content-Encoding} are returned as
	 * received.
	 *
	 * @param response the response to read
	 * @return the body bytes, or {@code null} if the response has no entity or
	 *         reading failed
	 */
	public synchronized byte[] getResponseBody(HttpResponse response) {
		GZIPInputStream gzipInput = null;
		ByteArrayOutputStream out = null;
		HttpEntity entity = null;
		try {
			entity = response.getEntity();
			if (entity == null) {
				// Nothing to read (EntityUtils would reject a null entity).
				return null;
			}
			Header contentEncodingHeader = response.getFirstHeader("Content-Encoding");
			String contentEncoding = contentEncodingHeader == null ? null : contentEncodingHeader.getValue();
			boolean gzipped = contentEncoding != null
					&& contentEncoding.toLowerCase(Locale.US).indexOf("gzip") != -1;
			if (!gzipped) {
				return EntityUtils.toByteArray(entity);
			}
			try {
				gzipInput = new GZIPInputStream(entity.getContent());
			} catch (EOFException e) {
				// Thrown when the body is empty or shorter than a gzip header.
				logger.error("read gzip inputstream eof exception!");
				return null;
			}
			out = new ByteArrayOutputStream();
			byte[] buffer = new byte[4096];
			int n;
			while ((n = gzipInput.read(buffer)) >= 0) {
				out.write(buffer, 0, n);
			}
			return out.toByteArray();
		} catch (Exception e) {
			logger.error("read response body exception! ", e);
			return null;
		} finally {
			IOUtil.CloseInputStream(gzipInput);
			IOUtil.CloseOutputStream(out);
			IOUtil.CloseHttpEntity(entity);
		}
	}

	/**
	 * Returns the cookies currently held by the client, formatted as
	 * {@code name=value; } pairs.
	 *
	 * @return the cookie string; empty if there are no cookies or the client
	 *         does not expose a cookie store
	 */
	public String getCookie() {
		if (!(client instanceof AbstractHttpClient)) {
			// setClient may have installed an implementation without a cookie store.
			return "";
		}
		List<Cookie> cookies = ((AbstractHttpClient) client).getCookieStore().getCookies();
		StringBuilder sb = new StringBuilder();
		for (Cookie cookie : cookies) {
			sb.append(cookie.getName()).append('=').append(cookie.getValue()).append("; ");
		}
		return sb.toString();
	}

	/**
	 * Percent-encodes whitespace, square brackets, braces and Chinese
	 * characters in a URL, leaving everything else untouched.
	 *
	 * @param url     the URL to encode
	 * @param charset name of the charset used to produce the encoded bytes
	 * @return the encoded URL, or {@code url} unchanged if the charset is not supported
	 */
	@SuppressWarnings("unused")
	private synchronized String encodeURL(String url, String charset) {
		StringBuffer encoded = new StringBuffer();
		try {
			Matcher matcher = UNSAFE_URL_CHARS.matcher(url);
			while (matcher.find()) {
				// URLEncoder targets form data and turns a space into '+', which is a
				// literal plus sign inside a URL path; %20 is valid everywhere.
				String replacement = URLEncoder.encode(matcher.group(), charset).replace("+", "%20");
				matcher.appendReplacement(encoded, Matcher.quoteReplacement(replacement));
			}
			matcher.appendTail(encoded);
		} catch (UnsupportedEncodingException e) {
			logger.error("unsupported charset: " + charset, e);
			return url;
		}
		return encoded.toString();
	}

	/** @return the underlying HTTP client */
	public HttpClient getClient() {
		return client;
	}

	/** @param client the HTTP client to use for subsequent requests */
	public void setClient(HttpClient client) {
		this.client = client;
	}

	/**
	 * Registers an https scheme on the client that trusts every certificate
	 * and accepts every host name.
	 *
	 * <p>SECURITY NOTE: this turns off all TLS verification; see
	 * {@link #TRUST_ALL_MANAGER}.
	 *
	 * @param httpclient the client to reconfigure
	 */
	@SuppressWarnings("deprecation")
	private void enableSSL(HttpClient httpclient) {
		try {
			SSLContext sslcontext = SSLContext.getInstance("TLS");
			sslcontext.init(null, new TrustManager[] { TRUST_ALL_MANAGER }, null);
			SSLSocketFactory sf = new SSLSocketFactory(sslcontext);
			sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
			Scheme https = new Scheme("https", sf, 443);
			httpclient.getConnectionManager().getSchemeRegistry().register(https);
		} catch (Exception e) {
			logger.error("failed to enable SSL", e);
		}
	}

	/** Closes a resource, ignoring {@code null} and any failure to close. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if closing fails.
		}
	}

	/**
	 * Demo: fetches a Weibo topic page and prints every embedded
	 * {@code FM.view(...)} script block.
	 *
	 * <p>Other pages, e.g. http://s.weibo.com/top/summary?cate=realtimehot,
	 * embed their data in {@code STK.pageletM.view(...)} calls instead.
	 */
	public static void main(String[] args) {
		Request request = new Request();
		System.out.println(System.currentTimeMillis());
		String content = request.getRequest("http://d.weibo.com/100803?ssovie4c55=0", "utf8");
		System.out.println(content);
		if (content == null) {
			// The fetch failed; there is nothing to scan.
			return;
		}
		Pattern pa = Pattern.compile("<script>FM.view\\((.*)\\)</script>");
		Matcher matcher = pa.matcher(content);
		while (matcher.find()) {
			System.out.println(matcher.group());
		}
	}
}
           

轉載于:https://my.oschina.net/chenfu/blog/720109