Scraping Dianping Shop Data with Java and Rotating Crawler Proxy IPs

Dianping shop URLs follow this format:

http://www.dianping.com/shop/6000000/

http://www.dianping.com/shop/6000001/

The ID after shop/ is sequential, ranging roughly from 1 to 15 million. Many IDs do not correspond to an existing shop (they return a 404 error); the actual number of shops is around 7 million. This article simply brute-forces the ID range, though you could also crawl the site by following links from page to page (depth-based indexing).
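For reference, here is a minimal sketch of the brute-force enumeration idea (my own illustration, not part of the original program): walk the IDs in order and skip the ones that answer 404. The tiny demo range and the use of HEAD requests are assumptions for illustration only.

import java.net.HttpURLConnection;
import java.net.URL;

public class ShopIdEnumerator {
    public static void main(String[] args) throws Exception {
        // Tiny demo range; the real range would be 1 .. 15,000,000
        for (long id = 6000000L; id <= 6000010L; id++) {
            URL url = new URL("http://www.dianping.com/shop/" + id + "/");
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("HEAD");            // only the status code is needed here
            conn.setConnectTimeout(5000);
            conn.setReadTimeout(5000);
            int status = conn.getResponseCode();
            conn.disconnect();
            if (status == 404) {
                continue;                             // this shop ID does not exist
            }
            System.out.println(id + " -> HTTP " + status); // existing shop, worth crawling
        }
    }
}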

While scraping, you will quickly notice that Dianping enforces strict anti-crawling measures. Crawling at about one request per second from a single IP, you will hit a 403 error after roughly 500-1000 pages: the IP is blocked and only released after some time. If you refuse to give up and keep scraping heavily from that IP, the block becomes permanent.

The fix is straightforward: route the requests through rotating crawler proxy IPs and the 403 errors disappear. This example uses IPs from the Data5U ("無憂代理IP") dynamic proxy service: http://www.data5u.com/buy/dynamic.html
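Before the full listing, here is a minimal sketch of the idea (my own illustration, not from the original code): when the target answers 403, drop the current proxy and move on to the next one. The hard-coded proxy list is a placeholder; in the real program the IP:port pairs come from the provider's API.

import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;

public class ProxyRotationSketch {
    public static void main(String[] args) throws Exception {
        String target = "http://www.dianping.com/shop/6000000/";
        String[] proxies = { "1.2.3.4:8080", "5.6.7.8:8080" };  // placeholder IP:port pairs

        for (String ipPort : proxies) {
            String[] parts = ipPort.split(":");
            Proxy proxy = new Proxy(Proxy.Type.HTTP,
                    new InetSocketAddress(parts[0], Integer.parseInt(parts[1])));
            HttpURLConnection conn = (HttpURLConnection) new URL(target).openConnection(proxy);
            conn.setConnectTimeout(5000);
            conn.setReadTimeout(5000);
            int status = conn.getResponseCode();
            conn.disconnect();
            if (status == 403) {
                System.out.println(ipPort + " was blocked (403), trying the next proxy");
                continue;                                       // rotate to the next IP
            }
            System.out.println(ipPort + " worked, HTTP " + status);
            break;                                              // keep this proxy for now
        }
    }
}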

The code uses HtmlUnit and Jsoup, as follows:

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.NameValuePair;

/**
 * This demo mainly tests the stability of dynamic (rotating) crawler proxy IPs.
 * The same approach works for targets such as Tianyancha (company data), eBay, Amazon,
 * Sina Weibo, court judgment documents, classified-ad sites, and so on.
 * It can also serve as a reference crawler project; to use it for real work,
 * modify the webParseHtml method yourself.
 */
public class TestDynamicIpContinue {
	
	public static List<String> ipList = new ArrayList<>();
	public static boolean gameOver = false;
	
	public static void main(String[] args) throws Exception {
		// How often to fetch a new batch of proxy IPs, in seconds
		long fetchIpSeconds = 5;
		// How many fetch rounds to run before the test ends
		int testTime = 3;
		
		// Fill in your Data5U proxy order number; no IPs can be fetched without it
		String order = "88888888888888888888888888888";
		
		// The target URL to crawl
		String targetUrl = "http://www.dianping.com/shop/6000000/";
		
		// Referer header; required when crawling Taobao or Tmall
		String referer = "";
		// Enable HTTPS support (skip SSL certificate validation)
		boolean https = true;
		// Whether to print response headers
		boolean outputHeaderInfo = false;
		// Whether to execute JavaScript (noticeably slower when enabled)
		boolean useJS = false;
		// Request timeout in milliseconds (10 seconds here)
		int timeOut = 10000;
		
		if (order == null || "".equals(order)) {
			System.err.println("請輸入爬蟲(動态)代理訂單号");
			return;
		}
		System.out.println(">>>>>>>>>>>>>>動态IP測試開始<<<<<<<<<<<<<<");
		System.out.println("***************");
		System.out.println("提取IP間隔 " + fetchIpSeconds + " 秒 ");
		System.out.println("爬蟲目标網址  " + targetUrl);
		System.out.println("***************\n");
		TestDynamicIpContinue tester = new TestDynamicIpContinue();
		new Thread(tester.new GetIP(fetchIpSeconds * 1000, testTime, order, targetUrl, useJS, timeOut, referer, https, outputHeaderInfo)).start();
	
		while(!gameOver){
			try {
				Thread.sleep(100);
			} catch (InterruptedException e) {
				e.printStackTrace();
			}
		}
		System.out.println(">>>>>>>>>>>>>>動态IP測試結束<<<<<<<<<<<<<<");
		System.exit(0);
	}
    
	// Crawler thread: fetches the target URL through a single proxy IP
	public class Crawler extends Thread{
		@Override
		public void run() {
			webParseHtml(targetUrl);
		}
		
		long sleepMs = 200;
		boolean useJs = false;
		String targetUrl = "";
		int timeOut = 5000;
		String ipport = "";
		
		String referer;
		boolean https;
		boolean outputHeaderInfo;
		
		public Crawler(long sleepMs, String targetUrl, boolean useJs, int timeOut, String ipport, String referer, boolean https, boolean outputHeader) {
			this.sleepMs = sleepMs;
			this.targetUrl = targetUrl;
			this.useJs = useJs;
			this.timeOut = timeOut;
			this.ipport = ipport;
			
			this.referer = referer;
			this.https = https;
			this.outputHeaderInfo = outputHeader;
		}
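		/**
		 * Fetch the given URL through the assigned proxy, using a randomly chosen
		 * browser fingerprint, and return the page content (HTML or JSON).
		 */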
		public String webParseHtml(String url) {
			String html = "";
			BrowserVersion[] versions = { BrowserVersion.CHROME, BrowserVersion.FIREFOX_38, BrowserVersion.INTERNET_EXPLORER_11, BrowserVersion.INTERNET_EXPLORER_8};
			WebClient client = new WebClient(versions[(int)(versions.length * Math.random())]);
			try {
				client.getOptions().setThrowExceptionOnFailingStatusCode(false);
				client.getOptions().setJavaScriptEnabled(useJs);
				client.getOptions().setCssEnabled(false);
				client.getOptions().setThrowExceptionOnScriptError(false);
				client.getOptions().setTimeout(timeOut);
				client.getOptions().setAppletEnabled(true);
				client.getOptions().setGeolocationEnabled(true);
				client.getOptions().setRedirectEnabled(true);
				
				// For HTTPS sites, this skips SSL certificate validation
				client.getOptions().setUseInsecureSSL(https);
				
				if (referer != null && !"".equals(referer)) {
					client.addRequestHeader("Referer", referer);
				}
				
				if (ipport != null) {
					String[] hostPort = ipport.split(",")[0].split(":");
					ProxyConfig proxyConfig = new ProxyConfig(hostPort[0], Integer.parseInt(hostPort[1]));
					client.getOptions().setProxyConfig(proxyConfig);
				}else {
					System.out.print(".");
					return "";
				}
			
				long startMs = System.currentTimeMillis();
				
				Page page = client.getPage(url);
				WebResponse response = page.getWebResponse();
				
				if (outputHeaderInfo) {
					// Print the response headers
					List<NameValuePair> headers = response.getResponseHeaders();
					for (NameValuePair nameValuePair : headers) {
						System.out.println(nameValuePair.getName() + "-->" + nameValuePair.getValue());
					}
				}
				
				boolean isJson = false ;
				if (response.getContentType().equals("application/json")) {
					html = response.getContentAsString();
					isJson = true ;
				}else if(page.isHtmlPage()){
					html = ((HtmlPage)page).asXml();
				}
				
				long endMs = System.currentTimeMillis();
				
				Document doc = Jsoup.parse(html);
				System.out.println(getName() + " " + ipport + " took " + (endMs - startMs) + " ms: " + doc.select("title").text());
			} catch (Exception e) {
				System.err.println(ipport + ":" + e.getMessage());
			} finally {
				client.close();
			}
			return html;
		}
		
	}
	
	// Periodically fetch fresh dynamic proxy IPs from the provider's API
	public class GetIP implements Runnable{
		long sleepMs = 1000;
		int maxTime = 3;
		String order = "";
		String targetUrl;
		boolean useJs;
		int timeOut;
		String referer;
		boolean https;
		boolean outputHeaderInfo;
		
		public GetIP(long sleepMs, int maxTime, String order, String targetUrl, boolean useJs, int timeOut, String referer, boolean https, boolean outputHeaderInfo) {
			this.sleepMs = sleepMs;
			this.maxTime = maxTime;
			this.order = order;
			this.targetUrl = targetUrl;
			this.useJs = useJs;
			this.timeOut = timeOut;
			this.referer=referer;
			this.https=https;
			this.outputHeaderInfo=outputHeaderInfo;
		}
		
		@Override
		public void run() {
			int time = 1;
			while(!gameOver){
				// Stop after maxTime rounds of IP fetching
				if(time > maxTime){
					gameOver = true;
					break;
				}
				time++;
				try {
					java.net.URL url = new java.net.URL("http://api.ip.data5u.com/dynamic/get.html?order=" + order + "&ttl&random=true");
					
			    	HttpURLConnection connection = (HttpURLConnection)url.openConnection();
			    	connection.setConnectTimeout(3000);
			    	connection.setReadTimeout(3000);
			    	
			        // Read the whole response body; the API returns one IP:port per line
			        InputStream in = new BufferedInputStream(connection.getInputStream());
			        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
			        byte[] chunk = new byte[4096];
			        int bytesRead;
			        while ((bytesRead = in.read(chunk)) != -1) {
			            buffer.write(chunk, 0, bytesRead);
			        }
			        in.close();
					String[] res = buffer.toString("UTF-8").split("\n");
					System.out.println(">>>>>>>>>>>>>>目前傳回IP量 " + res.length);
					for (String ip : res) {
						new Crawler(100, targetUrl, useJs, timeOut, ip, referer, https, outputHeaderInfo).start();
					}
				} catch (Exception e) {
					System.err.println(">>>>>>>>>>>>>>擷取IP出錯, " + e.getMessage());
				}
				try {
					Thread.sleep(sleepMs);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
			}
		}
	}
	
	
	public String joinList(List<String> list){
		StringBuilder re = new StringBuilder();
		for (String string : list) {
			re.append(string).append(",");
		}
		return re.toString();
	}


	public String trim(String html) {
		if (html != null) {
			return html.replaceAll(" ", "").replaceAll("\n", "");
		}
		return null;
	}
	
}
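The demo above only prints each page's title. To actually collect shop details, you would extend webParseHtml and pull the fields out of the parsed document with Jsoup. A minimal sketch follows; the CSS selectors are hypothetical placeholders, since Dianping's real markup differs and changes over time.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ShopFieldExtractor {
    // html is the page source obtained through webParseHtml (or any other fetch)
    public static void printShopFields(String html) {
        Document doc = Jsoup.parse(html);
        String shopName = doc.select("h1.shop-name").text();         // assumed selector
        String address  = doc.select(".expand-info.address").text(); // assumed selector
        String avgPrice = doc.select("#avgPriceTitle").text();       // assumed selector
        System.out.println(shopName + " | " + address + " | " + avgPrice);
    }
}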