最基本的内容
如果你存储到硬盘的话,那么目录下就有这么几个文件
_0.fnm存储域的名字信息
_0.fdt域store.YES的数据信息
_0.fdx域store.YES数据的索引信息,用于定位.fdt文件中每个文档的存储位置
_0.frq保存词条在文档中出现的次数(词频),用于评分和排序
_0.nrm存储评分信息
_0.prx偏移量
_0.tii和_0.tis 词典文件,存储索引中所有词条(term)的信息
_0_1.del所有被删除的文档信息,相当于回收站
CharTermAttribute:存储每一个语汇单元(token)的文本信息
PositionIncrementAttribute:位置增量属性,存储语汇单词之间的距离
OffsetAttribute:存储语汇单词之间的偏移量
TypeAttribute:使用的分词器的类型
1.我用的是lucene5.5版本的,建议读者用2.3版本的mmseg4j,其他2.3以下版本都不兼容
2.先是分词器类,记得将mmseg4j-core-1.10.0.jar解压,随意放在一个目录下,目录里有个data文件夹,里面有chars.dic,units.dic,words.dic这几个重要的文件,下面的那个目录就是date文件夹的绝对路径(下面是我的路径,你们随意,也可以放相对路径,我这只是测试)
package cn.com.demo.chnese;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
/**
 * Analyzer that segments Chinese text with mmseg4j (max-word mode) and then
 * pipes the tokens through {@code MsgTokenFilter}, which injects synonyms.
 */
public class MsgAnalyzi extends Analyzer {
    /**
     * Builds the tokenizer/filter chain for a field.
     *
     * @param fieldName name of the field being analyzed (unused by this analyzer)
     * @return the MMSeg tokenizer wrapped by the synonym filter
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Load the mmseg4j dictionaries (chars.dic, units.dic, words.dic) from
        // the unpacked data directory. NOTE(review): absolute path is machine-specific.
        Dictionary dictionary = Dictionary.getInstance("C:\\Users\\Administrator\\Desktop\\mmseg4j-core-1.10.0\\data");
        Tokenizer tokenizer = new MMSegTokenizer(new MaxWordSeg(dictionary));
        TokenStream filtered = new MsgTokenFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, filtered);
    }
}
3.继承TokenFilter
package cn.com.demo.chnese;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Token filter that injects synonyms into the stream. When the incoming token
 * has an entry in the synonym table, its stream state is captured and each
 * synonym is emitted afterwards at the same position (position increment 0),
 * so queries for a synonym match the original text.
 */
public class MsgTokenFilter extends TokenFilter {
    /**
     * Synonym table: token text -> synonyms to emit at the same position.
     * Built once; the original code rebuilt this HashMap on every token.
     */
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();
    static {
        SYNONYMS.put("中国", new String[]{"天朝", "中原"});
        SYNONYMS.put("我", new String[]{"朕", "俺"});
        SYNONYMS.put("主义", new String[]{"注意", "猪亿"});
    }

    // Text of the current token.
    private CharTermAttribute charTerm = null;
    // Position increment; 0 marks a token occupying the same position as the previous one.
    private PositionIncrementAttribute positionIncrement = null;
    // Synonyms still to be emitted for the token captured in {@code state}.
    private Stack<String> stack = null;
    // Captured attribute state of the original token, restored for each synonym.
    private State state = null;

    protected MsgTokenFilter(TokenStream input) {
        super(input);
        charTerm = this.addAttribute(CharTermAttribute.class);
        positionIncrement = this.addAttribute(PositionIncrementAttribute.class);
        stack = new Stack<String>();
    }

    /**
     * Emits any pending synonym first; otherwise advances the wrapped stream
     * and, if the new token has synonyms, queues them and captures the state.
     *
     * @return {@code true} while tokens remain
     * @throws IOException propagated from the wrapped stream
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (stack.size() > 0) {
            // Pop the next synonym and replay the original token's attributes.
            String str = stack.pop();
            restoreState(state);
            // Replace the term text with the synonym.
            charTerm.setEmpty();
            charTerm.append(str);
            // Position increment 0: same position as the original token.
            positionIncrement.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        // If this token has synonyms, remember the state so they can be emitted next.
        if (this.incream(charTerm.toString())) {
            state = captureState();
        }
        return true;
    }

    /**
     * Queues the synonyms of {@code name}, if any, onto the stack.
     *
     * @param name the current token's text
     * @return {@code true} if synonyms were queued
     */
    public boolean incream(String name) {
        String[] strs = SYNONYMS.get(name);
        if (strs == null) {
            return false;
        }
        for (String s : strs) {
            stack.push(s);
        }
        return true;
    }
}
4.现在可以测试了
package cn.com.demo.chnese;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
/**
 * Demo driver: prints the token stream produced by {@code MsgAnalyzi} for a
 * sample sentence, then indexes the sentence into a RAM index and searches it
 * by one of the injected synonyms ("注意").
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Analyzer ana = new MsgAnalyzi();
        String str = "我是中国主义接班人,想着生命勇敢前进";
        Anly(str, ana);
        uuil(str);
    }

    /**
     * Tokenizes {@code str} with {@code analyzer} and prints each token as [token].
     *
     * @param str      text to analyze
     * @param analyzer analyzer to run
     */
    public static void Anly(String str, Analyzer analyzer) {
        TokenStream tokenStream = null;
        try {
            // First argument is the FIELD NAME, not the content; the original
            // passed the text itself, which works but is misleading.
            tokenStream = analyzer.tokenStream("content", new StringReader(str));
            // Attribute must be obtained before consuming the stream.
            CharTermAttribute ter = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.print("[" + ter + "]");
            }
            // Lucene contract: end() after the last incrementToken().
            tokenStream.end();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close the stream, or the analyzer cannot be reused.
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException ignored) {
                    // best-effort cleanup
                }
            }
        }
    }

    /**
     * Indexes {@code content} into an in-memory index, then searches for the
     * synonym "注意" and prints the matching document's stored content.
     *
     * @param content text to index
     * @throws Exception on index or query-parse failure
     */
    public static void uuil(String content) throws Exception {
        Directory dire = new RAMDirectory();
        IndexWriter write = new IndexWriter(dire, new IndexWriterConfig(new MsgAnalyzi()));
        try {
            Document doc = new Document();
            doc.add(new TextField("content", content, Store.YES));
            write.addDocument(doc);
        } finally {
            // Close (and thereby commit) the writer before opening a reader.
            write.close();
        }
        DirectoryReader reader = DirectoryReader.open(dire);
        try {
            IndexSearcher seacher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser("content", new MsgAnalyzi());
            TopDocs top = seacher.search(parser.parse("注意"), 10);
            ScoreDoc[] score = top.scoreDocs;
            // Guard against an empty result set (original indexed score[0] blindly).
            if (score.length == 0) {
                System.out.println("no match");
                return;
            }
            Document doc = seacher.doc(score[0].doc);
            System.out.println(doc.get("content"));
        } finally {
            reader.close();
            dire.close();
        }
    }
}