/**
 * Small HTTP helper used for page scraping.
 *
 * <p>Offers three ways to fetch a page: a plain {@link URL} stream
 * ({@link #getPageSource}), Jsoup ({@link #getRequestByJSOUP}) and an Apache
 * {@link HttpClient} GET ({@link #getRequest(String)} and
 * {@link #getRequest(String, String)}).
 *
 * <p>SECURITY: for https URLs the HttpClient path installs a trust manager that
 * accepts every certificate and a hostname verifier that accepts every host
 * (see {@link #enableSSL}). That removes protection against man-in-the-middle
 * attacks and must not be used for sensitive traffic.
 */
public class Request {
    private static final Logger logger = Logger.getLogger(Request.class);

    /** Characters percent-encoded by {@link #encodeURL}: whitespace, brackets, braces and CJK ideographs. */
    private static final Pattern URL_UNSAFE_CHARS = Pattern.compile("[ \\[\\]{}\\s\u4e00-\u9fa5]");

    /** Scheme prefix that triggers the SSL setup in {@link #getRequest(String)}. */
    private static final String HTTPS_PREFIX = "https://";

    /**
     * Trust manager that performs no certificate validation at all.
     * Both check methods are intentionally empty, so every chain is accepted.
     */
    private final TrustManager trustAllManager = new X509TrustManager() {
        public void checkClientTrusted(java.security.cert.X509Certificate[] arg0, String arg1)
                throws CertificateException {
            // intentionally empty: accept every client certificate
        }

        public void checkServerTrusted(java.security.cert.X509Certificate[] arg0, String arg1)
                throws CertificateException {
            // intentionally empty: accept every server certificate
        }

        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract requires a non-null (possibly empty) array.
            return new java.security.cert.X509Certificate[0];
        }
    };

    private HttpClient client;

    /** Creates a helper backed by a fresh {@code DefaultHttpClient}. */
    public Request() {
        client = new DefaultHttpClient();
    }

    /**
     * Reads a page through {@link URL#openStream()} and returns its text.
     *
     * <p>Lines are concatenated without line separators. On any failure the
     * error is logged and whatever was read so far (possibly the empty string)
     * is returned.
     *
     * @param pageUrl  address of the page to read
     * @param encoding charset name used to decode the response bytes
     * @return the page text with line breaks removed; never {@code null}
     */
    public String getPageSource(String pageUrl, String encoding) {
        StringBuilder sb = new StringBuilder();
        java.io.InputStream raw = null;
        BufferedReader in = null;
        try {
            raw = new URL(pageUrl).openStream();
            in = new BufferedReader(new InputStreamReader(raw, encoding));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (Exception ex) {
            logger.error("failed to read page source from " + pageUrl, ex);
        } finally {
            // Close both: the reader may never have been created if the charset was rejected.
            closeQuietly(in);
            closeQuietly(raw);
        }
        return sb.toString();
    }

    /**
     * Fetches and parses a page with Jsoup, using a 60 second timeout.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} if the request failed with an {@link IOException}
     */
    public static Document getRequestByJSOUP(String url) {
        try {
            return Jsoup.connect(url).timeout(60 * 1000).get();
        } catch (IOException e) {
            logger.error("jsoup request failed for " + url, e);
            return null;
        }
    }

    /**
     * Issues a GET request and decodes the response body with the given charset.
     *
     * @param url     address to request
     * @param charset charset name used to decode the response bytes
     * @return the decoded body, or {@code null} if no data was received or the charset is unsupported
     */
    public String getRequest(String url, String charset) {
        byte[] body = getRequest(url);
        if (body == null) {
            logger.warn("采集url:" + url + "\t 没有获取到数据");
            return null;
        }
        try {
            return new String(body, charset);
        } catch (UnsupportedEncodingException e) {
            logger.error("unsupported charset '" + charset + "' for " + url, e);
            return null;
        }
    }

    /**
     * Issues a GET request and returns the (decompressed) response body.
     *
     * <p>URLs starting with {@code https://} first go through {@link #enableSSL}.
     * Only a 200 response yields a body; any other status or any exception is
     * logged and produces {@code null}.
     *
     * @param url address to request
     * @return the response body bytes, or {@code null} on failure or non-200 status
     */
    public byte[] getRequest(String url) {
        if (url == null) {
            logger.error("getRequest called with a null url");
            return null;
        }
        // Check the scheme prefix only: an http URL that merely contains
        // "https://" in its query string must not trigger the SSL setup.
        if (url.regionMatches(true, 0, HTTPS_PREFIX, 0, HTTPS_PREFIX.length())) {
            enableSSL(client);
        }
        HttpGet getMethod = null;
        try {
            getMethod = new HttpGet(url);
            getMethod.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            getMethod.addHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            getMethod.addHeader("Accept-Encoding", "gzip, deflate");
            getMethod.addHeader("Content-Type", "text/html; charset=utf8");
            HttpResponse httpResponse = client.execute(getMethod);
            int requestStatus = httpResponse.getStatusLine().getStatusCode();
            if (requestStatus == HttpStatus.SC_OK) {
                return getResponseBody(httpResponse);
            }
            logger.warn("GET " + url + " returned HTTP status " + requestStatus);
        } catch (Exception e) {
            logger.error("GET request failed for " + url, e);
        } finally {
            IOUtil.CloseHttpGet(getMethod);
            // NOTE(review): IOUtil is defined outside this file. If CloseHttpClient
            // shuts the client down, this instance cannot serve a second request
            // unless setClient() installs a new client - confirm this is intended.
            IOUtil.CloseHttpClient(client);
        }
        return null;
    }

    /**
     * Reads the response body, undoing gzip or deflate content encoding.
     *
     * <p>Any other (or absent) {@code Content-Encoding} is read as raw bytes.
     * Failures are logged and reported as {@code null}.
     *
     * @param response response whose entity should be read
     * @return the decoded body bytes, or {@code null} if there is no entity or reading failed
     */
    public synchronized byte[] getResponseBody(HttpResponse response) {
        HttpEntity entity = null;
        java.io.InputStream decoded = null;
        try {
            entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            String contentEncoding = "";
            Header contentEncodingHeader = response.getFirstHeader("Content-Encoding");
            if (contentEncodingHeader != null && contentEncodingHeader.getValue() != null) {
                contentEncoding = contentEncodingHeader.getValue().toLowerCase(Locale.US);
            }
            if (contentEncoding.indexOf("gzip") != -1) {
                decoded = new GZIPInputStream(entity.getContent());
            } else if (contentEncoding.indexOf("deflate") != -1) {
                // HTTP "deflate" is the zlib format, which InflaterInputStream reads by default.
                decoded = new java.util.zip.InflaterInputStream(entity.getContent());
            } else {
                // No (or an unrecognised) content encoding: hand back the bytes as received.
                return EntityUtils.toByteArray(entity);
            }
            return readFully(decoded);
        } catch (EOFException e) {
            logger.error("read gzip inputstream eof exception!", e);
        } catch (Exception e) {
            logger.error("read response body exception! ", e);
        } finally {
            closeQuietly(decoded);
            IOUtil.CloseHttpEntity(entity);
        }
        return null;
    }

    /**
     * Renders the cookies held by the client as a {@code name=value; } sequence.
     *
     * @return the cookie string (each entry followed by {@code "; "}), or the
     *         empty string if the client exposes no cookie store
     */
    public String getCookie() {
        if (!(client instanceof AbstractHttpClient)) {
            // setClient() accepts any HttpClient; only AbstractHttpClient exposes the cookie store.
            logger.warn("http client does not expose a cookie store; returning no cookies");
            return "";
        }
        List<Cookie> cookies = ((AbstractHttpClient) client).getCookieStore().getCookies();
        StringBuilder sb = new StringBuilder();
        for (Cookie cookie : cookies) {
            sb.append(cookie.getName()).append('=').append(cookie.getValue()).append("; ");
        }
        return sb.toString();
    }

    /**
     * Percent-encodes whitespace, brackets, braces and Chinese characters in a URL.
     *
     * @param url     the URL to encode
     * @param charset charset name used for the percent-encoding
     * @return the encoded URL, or the unchanged input if the charset is unsupported
     */
    @SuppressWarnings("unused")
    private String encodeURL(String url, String charset) {
        // StringBuffer (not StringBuilder) because Matcher.appendReplacement requires it on older JDKs.
        StringBuffer encoded = new StringBuffer();
        try {
            Matcher matcher = URL_UNSAFE_CHARS.matcher(url);
            while (matcher.find()) {
                // URLEncoder produces form encoding, where a space becomes "+";
                // in a URL path "+" is a literal plus, so use "%20" instead.
                String escaped = URLEncoder.encode(matcher.group(), charset).replace("+", "%20");
                matcher.appendReplacement(encoded, Matcher.quoteReplacement(escaped));
            }
            matcher.appendTail(encoded);
        } catch (UnsupportedEncodingException e) {
            logger.error("unsupported charset '" + charset + "' while encoding url", e);
            return url;
        }
        return encoded.toString();
    }

    /** @return the HTTP client currently used for requests */
    public HttpClient getClient() {
        return client;
    }

    /** @param client the HTTP client to use for subsequent requests */
    public void setClient(HttpClient client) {
        this.client = client;
    }

    /**
     * Registers an https scheme on port 443 that skips certificate and hostname checks.
     *
     * <p>SECURITY: this deliberately disables TLS peer verification. Failures
     * during setup are logged and otherwise ignored.
     *
     * @param httpclient client whose scheme registry receives the https scheme
     */
    @SuppressWarnings("deprecation")
    private void enableSSL(HttpClient httpclient) {
        try {
            SSLContext sslcontext = SSLContext.getInstance("TLS");
            sslcontext.init(null, new TrustManager[] { trustAllManager }, null);
            SSLSocketFactory sf = new SSLSocketFactory(sslcontext);
            sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            Scheme https = new Scheme("https", sf, 443);
            httpclient.getConnectionManager().getSchemeRegistry().register(https);
        } catch (Exception e) {
            logger.error("failed to enable SSL on the http client", e);
        }
    }

    /** Drains a stream into a byte array; the caller remains responsible for closing it. */
    private static byte[] readFully(java.io.InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[4096];
        int n;
        while ((n = in.read(buffer)) >= 0) {
            out.write(buffer, 0, n);
        }
        return out.toByteArray();
    }

    /** Closes a resource if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // best-effort cleanup: nothing useful can be done if close fails
        }
    }

    /**
     * Demo entry point: fetches a Weibo page and prints every
     * {@code <script>FM.view(...)</script>} block found in it.
     */
    public static void main(String[] args) {
        Request request = new Request();
        System.out.println(System.currentTimeMillis());
        String content = request.getRequest("http://d.weibo.com/100803?ssovie4c55=0", "utf8");
        System.out.println(content);
        if (content == null) {
            // Nothing was fetched; matching against null would throw.
            return;
        }
        Pattern pa = Pattern.compile("<script>FM.view\\((.*)\\)</script>");
        Matcher matcher = pa.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
    }
}
// Reposted from: https://my.oschina.net/chenfu/blog/720109