天天看點

初識Lucene-索引與搜尋

Lucene首先要建立索引,才能進行搜尋,使用了最新的lucene-4.2.1包

1.Indexer.java

/**
 * Command-line indexer: adds the plain-text files found directly inside a
 * data directory to a Lucene index.
 *
 * @author shishengjie
 */
public class Indexer {

	/** Location of the index files; kept so it can be released in {@link #close()}. */
	private final Directory dir;

	/** Writes documents into the index located at {@link #dir}. */
	private final IndexWriter writer;

	/**
	 * Opens an index writer on the given directory.
	 *
	 * @param indexDir path of the directory that holds the index files
	 * @throws IOException if the directory or the index writer cannot be opened
	 */
	public Indexer(String indexDir) throws IOException {
		// Directory describes where the index is stored.
		dir = FSDirectory.open(new File(indexDir));
		// Text is run through an Analyzer before it is indexed.
		Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_42);
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42,
				luceneAnalyzer);
		writer = new IndexWriter(dir, config);
	}

	/**
	 * Closes the index writer and then the underlying directory.
	 *
	 * @throws IOException if closing either resource fails
	 */
	public void close() throws IOException {
		try {
			writer.close();
		} finally {
			// Release the directory even when closing the writer fails.
			dir.close();
		}
	}

	/**
	 * File filter that accepts only files whose name ends in {@code .txt}
	 * (case-insensitive).
	 *
	 * @author shishengjie
	 */
	private static class TextFilesFilter implements FileFilter {

		@Override
		public boolean accept(File pathname) {
			return pathname.getName().toLowerCase().endsWith(".txt");
		}

	}

	/**
	 * Indexes every readable, non-hidden regular file directly inside
	 * {@code dataDir} that is accepted by {@code filter}. Sub-directories are
	 * not descended into.
	 *
	 * @param dataDir directory whose files are indexed
	 * @param filter  decides which files are indexed; {@code null} accepts all
	 * @return the number of files added to the index by this call
	 * @throws IOException if {@code dataDir} cannot be listed or a file
	 *                     cannot be indexed
	 */
	public int index(String dataDir, FileFilter filter) throws IOException {
		File[] files = new File(dataDir).listFiles();
		if (files == null) {
			// listFiles() returns null when the path is not a directory or an
			// I/O error occurs; fail with a clear message instead of an NPE.
			throw new IOException("Cannot list files in data directory: "
					+ dataDir);
		}
		// Count the files handled here rather than returning writer.numDocs():
		// numDocs() is the total for the whole index, so it over-reports when
		// the index already contains documents from an earlier run.
		int indexed = 0;
		for (File f : files) {
			if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
					&& (filter == null || filter.accept(f))) {
				indexFile(f);
				indexed++;
			}
		}
		return indexed;
	}

	/**
	 * Adds a single file to the index.
	 *
	 * @param f file to index
	 * @throws IOException if the file cannot be read or the index cannot be
	 *                     written
	 */
	private void indexFile(File f) throws IOException {
		System.out.println("Indexing " + f.getCanonicalPath());
		// A Document is a collection of fields describing one file.
		Document doc = getDocument(f);
		writer.addDocument(doc);
	}

	/**
	 * Builds the Lucene document for a file: a {@code contents} field fed from
	 * the file's text, plus stored, un-analyzed {@code filename} and
	 * {@code fullpath} fields.
	 *
	 * @param f file to describe
	 * @return the document representing {@code f}
	 * @throws IOException if the file cannot be opened or its canonical path
	 *                     cannot be resolved
	 */
	private Document getDocument(File f) throws IOException {
		Document doc = new Document();
		// NOTE(review): FileReader decodes with the platform default charset.
		doc.add(new Field("contents", new FileReader(f)));
		doc.add(new Field("filename", f.getName(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		return doc;
	}

	/**
	 * Entry point. Expects two arguments: the index directory and the data
	 * directory whose {@code .txt} files are to be indexed.
	 *
	 * @param args {@code <index dir> <data dir>}
	 */
	public static void main(String[] args) {
		try {
			if (args.length != 2) {
				// A wrong argument count is an argument error, not a
				// reflective access failure.
				throw new IllegalArgumentException("Usage:java "
						+ Indexer.class.getName() + " <index dir> <data dir>");
			}

			String indexDir = args[0];	// where the index files are stored
			String dataDir = args[1];	// directory whose files get indexed

			long start = System.currentTimeMillis();
			Indexer indexer = new Indexer(indexDir);
			int numIndexed;
			try {
				numIndexed = indexer.index(dataDir, new TextFilesFilter());
			} finally {
				// Always release the writer, even when indexing fails.
				indexer.close();
			}
			long end = System.currentTimeMillis();
			System.out.println("Indexing " + numIndexed + " files took "
					+ (end - start) + " milliseconds");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}
           

運作時需要輸入2個參數,第一個為要存放索引的檔案夾,第二個為要索引哪個檔案夾下的檔案

如:C:\Users\shishengjie\Desktop\lucene\indexDir  C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\

将會掃描C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\檔案夾下的txt檔案,為其建立索引,建立的索引存放在C:\Users\shishengjie\Desktop\lucene\indexDir下面

輸出為:

Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\CHANGES.txt
Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\JRE_VERSION_MIGRATION.txt
Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\LICENSE.txt
Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\MIGRATE.txt
Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\NOTICE.txt
Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\README.txt
Indexing C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\SYSTEM_REQUIREMENTS.txt
Indexing 21 files took 2479 milliseconds
           

2.Searcher.java

/**
 * Command-line searcher: runs a query against the {@code contents} field of an
 * existing Lucene index and prints the path of each matching file.
 *
 * @author shishengjie
 */
public class Searcher {

	/**
	 * Entry point. Expects two arguments: the index directory and the query
	 * string.
	 *
	 * @param args {@code <index dir> <query>}
	 * @throws Exception if the arguments are invalid, the index cannot be
	 *                   read, or the query cannot be parsed
	 */
	public static void main(String[] args) throws Exception {

		if (args.length != 2) {
			// The second argument is the query, not a data directory.
			throw new IllegalArgumentException("Usage:java "
					+ Searcher.class.getName() + " <index dir> <query>");
		}
		String indexDir = args[0];	// directory holding the index
		String q = args[1];			// query string to search for
		search(indexDir, q);
	}

	/**
	 * Searches the index for {@code q} and prints the {@code fullpath} field
	 * of up to ten top-ranked matching documents.
	 *
	 * @param indexDir directory holding the index
	 * @param q        query string, parsed against the {@code contents} field
	 * @throws Exception if the index cannot be read or the query cannot be
	 *                   parsed
	 */
	private static void search(String indexDir, String q) throws Exception {
		Directory dir = FSDirectory.open(new File(indexDir));
		try {
			// NOTE(review): IndexReader.open(Directory) is deprecated in
			// Lucene 4.x in favour of DirectoryReader.open(Directory).
			IndexReader indexReader = IndexReader.open(dir);
			try {
				IndexSearcher is = new IndexSearcher(indexReader);
				// Turn the human-readable query string into a Query.
				QueryParser parser = new QueryParser(Version.LUCENE_42,
						"contents", new StandardAnalyzer(Version.LUCENE_42));
				Query query = parser.parse(q);

				long start = System.currentTimeMillis();
				// TopDocs holds references to the top-ranked hits only; the
				// documents themselves are fetched below via IndexSearcher.doc.
				TopDocs hits = is.search(query, 10);
				long end = System.currentTimeMillis();

				System.err.println("Found " + hits.totalHits
						+ " document(s) (in " + (end - start)
						+ " milliseconds) that matched query '" + q + "'");

				for (ScoreDoc scoreDoc : hits.scoreDocs) {
					Document doc = is.doc(scoreDoc.doc);
					System.out.println(doc.get("fullpath"));
				}
			} finally {
				// Release the reader even when parsing or searching fails.
				indexReader.close();
			}
		} finally {
			dir.close();
		}
	}

}
           

運作時需要輸入2個參數,第一個為要存放索引的檔案夾,第二個為要查詢的單詞

如:C:\Users\shishengjie\Desktop\lucene\indexDir  java

将會讀取索引,查找內容中包含 java 的檔案

輸出為:

Found 6 document(s) (in 191 milliseconds) that matched query 'java'
C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\JRE_VERSION_MIGRATION.txt
C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\SYSTEM_REQUIREMENTS.txt
C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\README.txt
C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\NOTICE.txt
C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\LICENSE.txt
C:\Users\shishengjie\Desktop\lucene\lucene-4.2.1\CHANGES.txt
           

繼續閱讀