Java爬蟲搜尋原理實作

新人國慶沒事做，又研究了一下爬蟲搜尋，兩三天時間總算是把原理鬧的差不多了，基本實作了爬蟲搜尋的原理，本次實作還是倆程式，分别是按廣度優先和深度優先完成的，廣度優先沒啥問題，深度優先請慎用，有極大的機率會造成死循環情況，下面深度優先的測試網站就造成了死循環。。。。好吧，我承認是我人品不太好。。。下面有請代碼君出場~~~~~~~~~~~~~~~

1.廣度優先

/**
 * 完成廣度優先搜尋
 */
package net.meteor.java;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal breadth-first link crawler.
 *
 * <p>A page is downloaded to a local file, every anchor {@code href} found in
 * that file is filtered and appended to a link file, and the link file is then
 * read line by line so that each recorded link is downloaded and parsed in
 * turn. Because parsing appends to the same link file that {@link #search()}
 * is reading, the link file acts as the crawl queue.
 *
 * <p>All file paths are relative to the working directory, and the {@code src}
 * directory must already exist. Text is read and written with the platform
 * default charset, as in the original implementation.
 *
 * @author 魏詩堯 (e-mail: [email protected])
 * @version 1.8
 */
public class SearchCrawlerBreadth {

	/** Local file that receives the source of the page being crawled. */
	private static final String HTML_FILE = "src/html.txt";

	/** Local file that accumulates accepted links and is read as the crawl queue. */
	private static final String HREF_FILE = "src/href.txt";

	/**
	 * Captures the {@code href} value of an anchor tag. Compiled once instead of
	 * once per line. The value ends at the next double quote or {@code >}.
	 */
	private static final Pattern HREF_PATTERN = Pattern.compile(
			"<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Decides whether an extracted {@code href} value should be recorded.
	 *
	 * <p>Rejected values: {@code null} or empty strings, in-page anchors
	 * (leading {@code #}), site-relative paths (leading {@code /}), anything
	 * containing {@code mailto:} or (case-insensitively) {@code javascript},
	 * and values that start with a single quote.
	 *
	 * @param href the raw value captured from the anchor tag; may be {@code null}
	 * @return {@code true} if the link passes every filter
	 */
	static boolean isCrawlableLink(String href) {
		if (href == null || href.length() < 1) {
			return false;
		}
		if (href.charAt(0) == '#') {
			return false;
		}
		if (href.startsWith("/")) {
			return false;
		}
		if (href.indexOf("mailto:") != -1) {
			return false;
		}
		// Locale.ROOT keeps the comparison stable under locales with special casing rules.
		if (href.toLowerCase(java.util.Locale.ROOT).indexOf("javascript") != -1) {
			return false;
		}
		if (href.startsWith("'")) {
			return false;
		}
		return true;
	}

	/**
	 * Closes one resource, reporting (not propagating) a failure so that the
	 * remaining resources of the caller are still closed.
	 *
	 * @param resource the resource to close; {@code null} is ignored
	 */
	private static void closeResource(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Downloads the page at {@code urlstr} into {@code htmltxt} and, when the
	 * download succeeds, parses the downloaded page for links.
	 *
	 * <p>Any failure (malformed URL, non-HTTP protocol, network or file error)
	 * is reported and the page is skipped; nothing is thrown to the caller.
	 *
	 * @param urlstr  address of the page to download
	 * @param htmltxt path of the local file that receives the page source
	 */
	private void downHTML(String urlstr, String htmltxt) {
		HttpURLConnection con = null;
		InputStream in = null;
		FileOutputStream out = null;
		boolean downloaded = false;

		try {
			URL url = new URL(urlstr);
			con = (HttpURLConnection) url.openConnection();
			con.connect();

			in = con.getInputStream();
			out = new FileOutputStream(htmltxt);

			byte[] buffer = new byte[1024];
			int len;
			while ((len = in.read(buffer, 0, buffer.length)) != -1) {
				out.write(buffer, 0, len);
			}
			downloaded = true;
		} catch (Exception e) {
			// Broad on purpose: a non-HTTP URL makes the cast above fail with a
			// ClassCastException, and one bad link must not stop the crawl.
			System.out.println("未知主機！！");
			System.err.println("Failed to download " + urlstr + ": " + e);
		} finally {
			closeResource(out);
			closeResource(in);
			if (con != null) {
				con.disconnect();
			}
		}

		// Parse only after the page file has been completely written and closed.
		if (downloaded) {
			readTxt(HREF_FILE);
		}
	}

	/**
	 * Extracts links from the downloaded page file and appends every accepted
	 * link to {@code hreftxt}, one per line, echoing each to standard output.
	 *
	 * @param hreftxt path of the link file, opened in append mode
	 */
	private void readTxt(String hreftxt) {
		InputStream in = null;
		FileWriter file = null;
		BufferedReader br = null;

		try {
			file = new FileWriter(hreftxt, true);
			in = new FileInputStream(HTML_FILE);
			br = new BufferedReader(new InputStreamReader(in));

			// Read each line exactly once; calling readLine() in the condition
			// and again in the body would drop every other line.
			String line;
			while ((line = br.readLine()) != null) {
				Matcher matcher = HREF_PATTERN.matcher(line);
				while (matcher.find()) {
					String href = matcher.group(1);
					if (!isCrawlableLink(href)) {
						continue;
					}
					System.out.println(href);
					file.write(href + "\r\n");
				}
			}
		} catch (Exception e) {
			System.out.println("無效連結！！");
			System.err.println("Failed to parse " + HTML_FILE + ": " + e);
		} finally {
			closeResource(file);
			closeResource(br);
			closeResource(in);
		}
	}

	/**
	 * Runs the breadth-first pass: reads the link file line by line and
	 * downloads and parses each recorded link. Links appended while this loop
	 * is running are picked up by later reads of the same file.
	 */
	private void search() {
		InputStream in = null;
		BufferedReader br = null;

		try {
			in = new FileInputStream(HREF_FILE);
			br = new BufferedReader(new InputStreamReader(in));

			String line;
			while ((line = br.readLine()) != null) {
				downHTML(line, HTML_FILE);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(br);
			closeResource(in);
		}
	}

	/**
	 * Crawls the start page, then crawls every link recorded from it onwards.
	 *
	 * @param args unused
	 * @throws Exception declared for compatibility; failures are reported internally
	 */
	public static void main(String[] args) throws Exception {
		// Seed the link file from the start page.
		new SearchCrawlerBreadth().downHTML("http://www.hao123.com/", "src/html.txt");
		// Follow the recorded links.
		new SearchCrawlerBreadth().search();
	}
}

上面廣度優先沒啥問題，本人昨天淩晨3點多做的測試，15分鐘左右的時間，這隻小爬蟲爬到了30W+的連結，能力還是蠻強大的麼，順便提一下，白天測試的時候會非常非常的慢，推薦各位測試君在晚上12點以後做測試。。。。。雖然不太人道。。。

下面是深度優先的代碼，測試的時候每次都能造成死循環。。。好吧，我承認我沒有人品。。。其實基本方法和廣度優先沒啥差別，我每個頁面爬出來的連結隻拿第一個去爬下一個頁面，總共爬多少層我懶的木有定義，就是想看看最多能爬到哪。。。然後每次都能悲劇的死循環了。。。我明明也設定了跳出的方法了啊，我有判斷有效連結的方式，但是我的判斷并不完善麼，跳出方法我寫到了catch中，隻要有一個無效連結，就可以跳出來了麼。。。今天淩晨全都是死循環。。。。無奈了。。。。下面請代碼君上場~~~~~~~~~~

/**
 * 完成深度優先搜尋
 * 爬蟲進行深度優先很有可能會出現死循環的情況
 */
package net.meteor.java;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal depth-first link crawler.
 *
 * <p>A page is downloaded to a local file, the first accepted link on it that
 * has not been followed before is recorded and followed, and the process
 * repeats on the newly downloaded page. The crawl ends when a page yields no
 * new link or a download or parse fails. {@link #searchDepth()} then writes
 * the followed links to a file.
 *
 * <p>Recursion depth grows with the number of followed links; there is no
 * explicit depth limit. All file paths are relative to the working directory,
 * and the {@code src} directory must already exist.
 *
 * @author 魏詩堯 (e-mail: [email protected])
 * @version 1.8
 */
public class SearchCrawlerDepth {

	/** Links followed so far; also used to avoid following a link twice. */
	private static final HashSet<String> set = new HashSet<String>();

	/** Local file that receives the source of the page being crawled. */
	private static final String HTML_FILE = "src/htmldepth1.txt";

	/**
	 * Captures the {@code href} value of an anchor tag. Compiled once instead of
	 * once per line. The value ends at the next double quote or {@code >}.
	 */
	private static final Pattern HREF_PATTERN = Pattern.compile(
			"<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Decides whether an extracted {@code href} value may be followed.
	 *
	 * <p>Rejected values: {@code null} or empty strings, in-page anchors
	 * (leading {@code #}), site-relative paths (leading {@code /}), anything
	 * containing {@code mailto:} or (case-insensitively) {@code javascript},
	 * and values that start with a single quote.
	 *
	 * @param href the raw value captured from the anchor tag; may be {@code null}
	 * @return {@code true} if the link passes every filter
	 */
	static boolean isCrawlableLink(String href) {
		if (href == null || href.length() < 1) {
			return false;
		}
		if (href.charAt(0) == '#') {
			return false;
		}
		if (href.startsWith("/")) {
			return false;
		}
		if (href.indexOf("mailto:") != -1) {
			return false;
		}
		// Locale.ROOT keeps the comparison stable under locales with special casing rules.
		if (href.toLowerCase(java.util.Locale.ROOT).indexOf("javascript") != -1) {
			return false;
		}
		if (href.startsWith("'")) {
			return false;
		}
		return true;
	}

	/**
	 * Closes one resource, reporting (not propagating) a failure so that the
	 * remaining resources of the caller are still closed.
	 *
	 * @param resource the resource to close; {@code null} is ignored
	 */
	private static void closeResource(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Downloads the page at {@code urlstr} into {@code htmltxt} and, when the
	 * download succeeds, continues the crawl from the downloaded page.
	 *
	 * <p>Any failure (malformed URL, non-HTTP protocol, network or file error)
	 * is reported and ends this branch of the crawl; nothing is thrown.
	 *
	 * @param urlstr  address of the page to download
	 * @param htmltxt path of the local file that receives the page source
	 */
	private void downHTMLDepth(String urlstr, String htmltxt) {
		HttpURLConnection con = null;
		InputStream in = null;
		FileOutputStream out = null;
		boolean downloaded = false;

		try {
			URL url = new URL(urlstr);
			con = (HttpURLConnection) url.openConnection();
			con.connect();

			in = con.getInputStream();
			out = new FileOutputStream(htmltxt);

			byte[] buffer = new byte[1024];
			int len;
			while ((len = in.read(buffer, 0, buffer.length)) != -1) {
				out.write(buffer, 0, len);
			}
			downloaded = true;
		} catch (Exception e) {
			// Broad on purpose: a non-HTTP URL makes the cast above fail with a
			// ClassCastException, which should end the crawl cleanly too.
			System.out.println("未知主機！！，爬行結束！！");
			System.err.println("Failed to download " + urlstr + ": " + e);
		} finally {
			closeResource(out);
			closeResource(in);
			if (con != null) {
				con.disconnect();
			}
		}

		// Recurse only after every stream is closed so that open handles do
		// not pile up with crawl depth.
		if (downloaded) {
			readTxtDepth("src/hrefdepth.txt");
		}
	}

	/**
	 * Finds the first accepted, not-yet-followed link in the downloaded page
	 * file, records it, and follows it.
	 *
	 * @param hreftxt unused; kept so the method signature stays unchanged
	 */
	private void readTxtDepth(String hreftxt) {
		InputStream in = null;
		BufferedReader br = null;
		String next = null;

		try {
			in = new FileInputStream(HTML_FILE);
			br = new BufferedReader(new InputStreamReader(in));

			// Read each line exactly once; calling readLine() in the condition
			// and again in the body would drop every other line.
			String line;
			while (next == null && (line = br.readLine()) != null) {
				Matcher matcher = HREF_PATTERN.matcher(line);
				while (matcher.find()) {
					String href = matcher.group(1);
					if (!isCrawlableLink(href)) {
						continue;
					}
					// add() returns false for a link that was already followed;
					// skipping those keeps link cycles from recursing forever.
					if (set.add(href)) {
						System.out.println(href);
						next = href;
						break;
					}
				}
			}
		} catch (Exception e) {
			// The collected links are written by main() once the crawl returns,
			// so they are not dumped here (that would write them twice).
			System.out.println("無效連結！！本次爬行結束！！");
			System.err.println("Failed to parse " + HTML_FILE + ": " + e);
		} finally {
			closeResource(br);
			closeResource(in);
		}

		// The page file is closed before the next download overwrites it.
		if (next != null) {
			downHTMLDepth(next, HTML_FILE);
		}
	}

	/**
	 * Prints every followed link and appends it to {@code src/hrefdepth1.txt},
	 * one link per line.
	 */
	public void searchDepth() {
		FileWriter file = null;

		try {
			file = new FileWriter("src/hrefdepth1.txt", true);

			Iterator<String> it = set.iterator();
			while (it.hasNext()) {
				// next() both yields the link and advances the iterator; without
				// it the loop never terminates.
				String link = it.next();
				System.out.println(link);
				file.write(link + "\r\n");
			}
		} catch (IOException e) {
			System.out.println("無效連結，本次爬行結束！！");
			e.printStackTrace();
		} finally {
			closeResource(file);
		}
	}

	/**
	 * Crawls depth-first from the start page, then writes the followed links.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		new SearchCrawlerDepth().downHTMLDepth("http://www.hao123.com", "src/htmldepth1.txt");
		new SearchCrawlerDepth().searchDepth();
	}
}

上面這兩篇代碼本身是十分不完善的，時間原因，我基本隻實作了最基本的原理，能改動增加的地方還有很多，主要是增加，很多地方都可增加代碼來增強程式的健壯性。。。比如有效連結判斷的地方，我們從href标簽中取出來的内容除了我寫的幾條判斷以外還有好多東西都沒有處理掉，這個地方還是能增加很多東西的。。。

掃描二維碼關注「極客挖掘機」公衆号！

作者：極客挖掘機

定期發表作者的思考：技術、産品、營運、自我提升等。

本文版權歸作者極客挖掘機和部落格園共有，歡迎轉載，但未經作者同意必須保留此段聲明，且在文章頁面明顯位置給出原文連結，否則保留追究法律責任的權利。

如果您覺得作者的文章對您有幫助，就來作者個人小站逛逛吧：

極客挖掘機