1.lucene簡介
lucene主要的功能是用來全文檢索。他可以搜尋出非結構性的資料,比如WORD文檔等。
2.lucene使用
1)pom檔案引入
<properties>
<lucene.version>4.7.2</lucene.version>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<!---->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
<scope>compile</scope>
</dependency>
<!--用于高亮顯示-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.7.2</version>
</dependency>
</dependencies>
2)建立lucene工具類
package com.show.lucence.lucence.utils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class AnalyzerUtil {
public static final Class INTEGER_TYPE = Integer.class;
public static final Class DOUBLE_TYPE = Double.class;
public static final Class FLOAT_TYPE = Float.class;
public static final Class LONG_TYPE = Long.class;
public static final Class BIGDECIMAL_TYPE = BigDecimal.class;
public static final Class STRING_TYPE = String.class;
public static final Map<Class,Integer> CLASS_MAP = new HashMap<Class, Integer>() {
{
put(INTEGER_TYPE,0);
put(DOUBLE_TYPE,1);
put(FLOAT_TYPE,2);
put(LONG_TYPE,3);
put(BIGDECIMAL_TYPE,4);
put(STRING_TYPE,5);
}
};
//寫入
public static void writerDoc(Directory directory, Analyzer analyzer, Object entity) throws Exception {
Document doc = newDocument(entity);
//建立寫入器
IndexWriter indexWriter = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_47,analyzer));
indexWriter.addDocument(doc);
indexWriter.close();
}
//更新
public static void updateDoc(Directory directory, Analyzer analyzer, Object entity,String index0,String value0) throws Exception{
IndexWriter indexWriter = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_47,analyzer));
Document doc = newDocument(entity);
Term term = new Term(index0,value0);
indexWriter.updateDocument(term,doc);
indexWriter.close();
}
//删除
public static void deleteDoc(Directory directory, Analyzer analyzer,String index0,String value0) throws Exception {
IndexWriter indexWriter = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_47,analyzer));
Term term = new Term(index0,value0);
if(index0 == null && value0 == null){
indexWriter.deleteAll();
}else {
indexWriter.deleteDocuments(term);
}
indexWriter.close();
}
//查詢
public static <T> List<T> searchDoc(Directory directory, Analyzer analyzer,Class<T> clazz,String searchInfo) throws Exception {
Map<String,Class> fieldType = new HashMap<String,Class>();
String[] fields = getFields(clazz, fieldType);
DirectoryReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_47,fields,analyzer);
List<T> objList = new ArrayList<T>();
searchDoc(searchInfo, clazz, fieldType, fields, searcher, parser, objList);
reader.close();
directory.close();
return objList;
}
private static <T> void searchDoc(String searchInfo, Class<T> clazz, Map<String, Class> fieldType, String[] fields, IndexSearcher searcher, MultiFieldQueryParser parser, List<T> objList) throws Exception {
Query query = parser.parse(searchInfo);
TopDocs search = searcher.search(query, null, 10);
System.out.println(search.totalHits);
ScoreDoc[] scoreDocs = search.scoreDocs;
//讀出
for (ScoreDoc scoreDoc: scoreDocs) {
Document document = searcher.doc(scoreDoc.doc);
T obj = (T)clazz.newInstance();
for (String field : fields) {
Class aClass = fieldType.get(field);
String fieldValue = document.get(field);
Method declaredMethod = clazz.getDeclaredMethod("set" + toUpperCaseFirst(field),aClass);
if(CLASS_MAP.get(aClass) == 5){
Highlighter highlighter = highlighter(query);
String bestFragment = highlighter.getBestFragment(parser.getAnalyzer(), field, fieldValue);
if (bestFragment != null){
fieldValue = bestFragment;
}
}
transToReal(declaredMethod,obj,aClass, fieldValue);
}
objList.add(obj);
}
}
//高亮顯示
private static Highlighter highlighter(Query query){
Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
Highlighter highlighter = new Highlighter(formatter,new QueryScorer(query));
Fragmenter fragmenter = new SimpleFragmenter(100);
highlighter.setTextFragmenter(fragmenter);
return highlighter;
}
private static Document newDocument(Object entity) throws IllegalAccessException, InvocationTargetException, NoSuchMethodException {
//建立文檔對象,并添加相關字段值
Document doc = new Document();
Class<?> clazz = entity.getClass();
java.lang.reflect.Field[] declaredFields = clazz.getDeclaredFields();
for (java.lang.reflect.Field declaredField : declaredFields) {
declaredField.setAccessible(true);
String fieldName = declaredField.getName();
Object invoke = clazz.getDeclaredMethod("get" + toUpperCaseFirst(fieldName)).invoke(entity);
doc.add(new Field(fieldName,invoke.toString(),Field.Store.YES,Field.Index.ANALYZED));
}
return doc;
}
private static <T> String[] getFields(Class<T> clazz, Map<String, Class> fieldType) {
java.lang.reflect.Field[] declaredFields = clazz.getDeclaredFields();
String[] fields = new String[declaredFields.length];
for (int i = 0; i < fields.length; i++) {
java.lang.reflect.Field declaredField = declaredFields[i];
declaredField.setAccessible(true);
String fieldName = declaredField.getName();
fields[i] = fieldName;
fieldType.put(fieldName,declaredField.getType());
}
return fields;
}
private static void transToReal(Method declaredMethod,Object obj,Class aClass, String fieldValue) throws Exception {
System.out.println(CLASS_MAP.get(aClass));
switch (CLASS_MAP.get(aClass)){
case 0:
declaredMethod.invoke(obj, Integer.valueOf(fieldValue));
break;
case 1:
declaredMethod.invoke(obj,Double.valueOf(fieldValue));
break;
case 2:
declaredMethod.invoke(obj,Float.valueOf(fieldValue));
break;
case 3:
declaredMethod.invoke(obj,Long.valueOf(fieldValue));
break;
case 4:
declaredMethod.invoke(obj,new BigDecimal(fieldValue));
break;
default:
declaredMethod.invoke(obj,fieldValue);
}
}
private static String toUpperCaseFirst(String fieldName) {
return fieldName.substring(0, 1).toUpperCase() + fieldName.substring(1);
}
}
3)自定義測試對象
package com.show.lucence.lucence.domain;
import java.io.Serializable;
public class Goods implements Serializable {
private Integer goodsId;
private String goodsName;
private Double goodsPrice;
private String goodsRemark;
public Goods() {
super();
}
public Goods(Integer goodsId, String goodsName,Double goodsPrice,String goodsRemark) {
super();
this.goodsId = goodsId;//商品ID
this.goodsName = goodsName;//商品名稱
this.goodsPrice = goodsPrice;//商品價格
this.goodsRemark = goodsRemark;//商品備注、描述
}
public Integer getGoodsId() {
return goodsId;
}
public void setGoodsId(Integer goodsId) {
this.goodsId = goodsId;
}
public String getGoodsName() {
return goodsName;
}
public void setGoodsName(String goodsName) {
this.goodsName = goodsName;
}
public Double getGoodsPrice() {
return goodsPrice;
}
public void setGoodsPrice(Double goodsPrice) {
this.goodsPrice = goodsPrice;
}
public String getGoodsRemark() {
return goodsRemark;
}
public void setGoodsRemark(String goodsRemark) {
this.goodsRemark = goodsRemark;
}
@Override
public String toString() {
return "Goods{" +
"goodsId=" + goodsId +
", goodsName='" + goodsName + '\'' +
", goodsPrice=" + goodsPrice +
", goodsRemark='" + goodsRemark + '\'' +
'}';
}
}
4)建測試類測試-這裡使用的IK分詞器
package com.show.lucence.lucence.analyzers;
import com.show.lucence.lucence.domain.Goods;
import com.show.lucence.lucence.utils.AnalyzerUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.BeforeClass;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.List;
public class AnalyzerTest<T> {
private static Directory directory = null;
private static Analyzer analyzer = null;
private static Goods goods = null;
@BeforeClass
public static void init() throws IOException {
//索引庫
directory = getDirectory();
//分詞器
analyzer = getAnalyzer();
//測試自定義的對象
goods = getGoods();
}
@Test
public void writerDoc() throws Exception {
AnalyzerUtil.writerDoc(directory,analyzer,goods);
}
@Test
public void updateDoc() throws Exception {
AnalyzerUtil.updateDoc(directory,analyzer,goods,"goodsId",String.valueOf(goods.getGoodsId()));
}
@Test
public void deleteDoc() throws Exception {
AnalyzerUtil.deleteDoc(directory,analyzer,"goodsId",String.valueOf(goods.getGoodsId()));
}
@Test
public void searchDoc() throws Exception {
List<Goods> goodsList = AnalyzerUtil.searchDoc(directory, analyzer,Goods.class,"力實作自");
System.out.println(goodsList);
}
private static Goods getGoods() {
String goodsRemark = "個性獨特 努力實作自我";
//goodsRemark = "我是超級王牌";
return new Goods(1,"Pin",11.0,goodsRemark);
}
private static Analyzer getAnalyzer() {
//指定分詞器,版本一般指定為最高
//StandardAnalyzer分詞器
Analyzer analyzer =
//new StandardAnalyzer(Version.LUCENE_47);
new IKAnalyzer(false);
return analyzer;
}
//設定對應的索引庫--目前不會變
private static Directory getDirectory() throws IOException {
//建構索引庫
return FSDirectory.open(new File("D:\\pragramFIle\\lucence\\src\\main\\resources\\com\\show\\lucence"));
}
@Test
public void testTemplate1() throws Exception {
Class<? extends Goods> aClass = goods.getClass();
Field goodsName = aClass.getDeclaredField("goodsName");
goodsName.setAccessible(true);
System.out.println(goodsName.getName());
}
}
5)使用lucene時可以使用luke(lucene索引庫檢視工具)來檢視分好的索引庫。lucene與luke的版本一定要相适應,這裡lucene用的4.7.2,我用的是lukeall-4.7.1,可以檢視。下完jar包之後,可以在jar包下建立一個.bat檔案,檔案内容start javaw -jar lukeall-4.7.1.jar。索引庫建立成功後,可以點選.bat檔案啟動可視化工具,然後顯示如下:
6)IK擴充字典項使用:
在項目resource路徑下建立IKAnalyzer.cfg.xml檔案,檔案内容如下:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 擴充配置</comment>
<!--使用者可以在這裡配置自己的擴充字典,示例: -->
<entry key="ext_dict">/dicdata/use.dic.dic;/dicdata/googlepy.dic</entry>
<!-- 使用者可以在這裡配置自己的擴充停止詞字典 -->
<entry key="ext_stopwords">/dicdata/ext_stopword.dic</entry>
</properties>
附加:
####lucene建構步驟:
##建立索引:
#####1.建構索引庫
目的:用來制定生成的索引檔案的存放位置
#####2.指定對應的分詞器
常見的分詞器:
######>>1)StandardAnalyzer分詞器: 單字分詞:就是按照中文一個字一個字地進行分詞.如:”我愛中國”
效果:我,愛,中,國
######>>2)CJKAnalyzer分詞器:二分法分詞:按兩個字進行切分,如”我是中國人”
效果:我是,是中,中國,國人
######>>3)IK分詞器:自定義詞庫
擴充詞典(新建立詞功能): 有些詞IK分詞器不識别,例如:”白富美,高富帥
停用詞典(停用某些詞功能): 有些詞不需要建立索引 例如:“哦”,“啊”,“的””
IK分詞器的詞庫有限,新增加的詞條可以通過配置檔案添加到IK的詞庫中,也可以把一些不用的詞條去除:
#####3.建立文檔對象
文檔對象:
######>主要是添加索引的内容
#####4.建立寫入器
######>主要通過指定的分詞器進行分詞,并将文檔對象寫到索引庫裡面。
##查詢索引:
#####1.設定搜尋目錄
#####2.選擇搜尋器,指定設定好的搜尋目錄
#####3.選擇對應的執行語句解析器
#####4.搜尋器執行查詢語句(可以查出索引所在的文檔/以及查到符合索引的文檔數)