最基本的内容
如果你存儲到硬碟的話,那麼目錄下就有這麼幾個檔案
_0.fnm存儲域的名字資訊
_0.fdt域store.YES的資料資訊
_0.fdx域store.YES的資料的索引(指標)資訊
_0.frq儲存詞條在内容中出現的次數,用於評分和排序
_0.nrm存儲評分資訊
_0.prx偏移量
_0.tii和_0.tis 索引内容裡所有内容資訊
_0_1.del所有删除的檔案資訊,相當于資源回收筒
CharTermAttribute:存儲每一個語彙單元(詞項)的文字資訊
PositionIncrementAttribute:位置增量屬性,存儲語彙單詞之間的距離
OffsetAttribute:存儲語彙單詞之間的偏移量
TypeAttribute:使用的分詞器的類型
1.我用的是lucene5.5版本的,建議讀者用2.3版本的mmseg4j,2.3以下的版本都與lucene5.5不相容
2.先是分詞器類,記得将mmseg4j-core-1.10.0.jar解壓,随意放在一個目錄下,目錄裡有個data檔案夾,裡面有chars.dic,units.dic,words.dic這幾個重要的檔案,下面的那個目錄就是data檔案夾的絕對路徑(下面是我的路徑,你們随意,也可以放相對路徑,我這隻是測試)
package cn.com.demo.chnese;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
/**
 * Analyzer that tokenizes Chinese text with mmseg4j's max-word segmentation
 * and pipes the tokens through {@link MsgTokenFilter} for synonym expansion.
 */
public class MsgAnalyzi extends Analyzer {
	// Absolute path to the mmseg4j "data" directory (chars.dic, units.dic, words.dic).
	// NOTE(review): hard-coded test path — adjust for your environment, or load from config.
	private static final String DIC_PATH =
			"C:\\Users\\Administrator\\Desktop\\mmseg4j-core-1.10.0\\data";

	/**
	 * Builds the tokenizer/filter chain for a field.
	 *
	 * @param str field name (unused by this analyzer)
	 * @return components wrapping an MMSeg tokenizer in the synonym filter
	 */
	@Override
	protected TokenStreamComponents createComponents(String str) {
		// presumably Dictionary.getInstance caches per path — verify against mmseg4j docs
		Dictionary dic = Dictionary.getInstance(DIC_PATH);
		Tokenizer tokenizer = new MMSegTokenizer(new MaxWordSeg(dic));
		TokenStream filtered = new MsgTokenFilter(tokenizer);
		return new TokenStreamComponents(tokenizer, filtered);
	}
}
3.繼承TokenFilter
package cn.com.demo.chnese;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * TokenFilter that injects synonyms into the token stream.
 *
 * When the current term has synonyms, the stream state is captured and the
 * synonyms are emitted as extra tokens at the same position (position
 * increment 0) before the next real token.
 */
public class MsgTokenFilter extends TokenFilter {
	// Synonym table built ONCE. Previously a new HashMap was constructed inside
	// incream() for every single token — pure overhead on the hot path.
	private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();
	static {
		SYNONYMS.put("中國", new String[]{"天朝", "中原"});
		SYNONYMS.put("我", new String[]{"朕", "俺"});
		SYNONYMS.put("主義", new String[]{"注意", "豬億"});
	}

	private final CharTermAttribute charTerm;
	private final PositionIncrementAttribute positionIncrement;
	// Synonyms still to be emitted for the most recently seen token.
	private final Stack<String> stack;
	// Captured attribute state of the token whose synonyms are pending.
	private State state = null;

	protected MsgTokenFilter(TokenStream input) {
		super(input);
		charTerm = this.addAttribute(CharTermAttribute.class);
		positionIncrement = this.addAttribute(PositionIncrementAttribute.class);
		stack = new Stack<String>();
	}

	@Override
	public boolean incrementToken() throws IOException {
		// Drain pending synonyms first, one per call.
		if (!stack.isEmpty()) {
			String synonym = stack.pop();
			// Restore the original token's attributes (offsets, type, ...) ...
			restoreState(state);
			// ... then overwrite only the term text with the synonym.
			charTerm.setEmpty();
			charTerm.append(synonym);
			// Increment 0 places the synonym at the same position as the original.
			positionIncrement.setPositionIncrement(0);
			return true;
		}
		if (!input.incrementToken()) {
			return false;
		}
		// If this term has synonyms, remember the stream state so the pending
		// synonyms can reuse it on subsequent calls.
		if (this.incream(charTerm.toString())) {
			state = captureState();
		}
		return true;
	}

	/**
	 * Pushes all synonyms of {@code name} onto the pending stack.
	 *
	 * @param name current term text
	 * @return {@code true} if the term has synonyms
	 */
	public boolean incream(String name) {
		String[] synonyms = SYNONYMS.get(name);
		if (synonyms == null) {
			return false;
		}
		for (String s : synonyms) {
			stack.push(s);
		}
		return true;
	}
}
4.現在可以測試了
package cn.com.demo.chnese;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
/**
 * Demo driver: prints the analyzed token stream (with injected synonyms),
 * then indexes the text in memory and searches it via a synonym term.
 */
public class Test {
	public static void main(String[] args) throws Exception {
		Analyzer ana = new MsgAnalyzi();
		String str = "我是中國主義接班人,想着生命勇敢前進";
		Anly(str, ana);
		uuil(str);
	}

	/** Prints every token produced for {@code str}, e.g. [我][朕][俺]... */
	public static void Anly(String str, Analyzer analyzer) {
		// NOTE(review): str is passed as the FIELD NAME here — harmless for a
		// demo, but a real field name would be clearer.
		// try-with-resources: the TokenStream was previously never closed.
		try (TokenStream tokenStream = analyzer.tokenStream(str, new StringReader(str))) {
			CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				System.out.print("[" + term + "]");
			}
			// end() finalizes the stream state; was missing before.
			tokenStream.end();
			System.out.println();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Indexes {@code content} into an in-memory directory and searches for the
	 * synonym "注意" (injected for "主義"), printing the matched document.
	 */
	public static void uuil(String content) throws Exception {
		Directory dire = new RAMDirectory();
		try {
			// Writer closed via try-with-resources (commits on close).
			try (IndexWriter writer = new IndexWriter(dire, new IndexWriterConfig(new MsgAnalyzi()))) {
				Document doc = new Document();
				doc.add(new TextField("content", content, Store.YES));
				writer.addDocument(doc);
			}
			// Reader was previously leaked; close it when done.
			try (DirectoryReader reader = DirectoryReader.open(dire)) {
				IndexSearcher searcher = new IndexSearcher(reader);
				QueryParser parser = new QueryParser("content", new MsgAnalyzi());
				TopDocs top = searcher.search(parser.parse("注意"), 10);
				ScoreDoc[] hits = top.scoreDocs;
				// Guard: hits[0] previously threw ArrayIndexOutOfBoundsException on no match.
				if (hits.length == 0) {
					System.out.println("no match");
				} else {
					Document doc = searcher.doc(hits[0].doc);
					System.out.println(doc.get("content"));
				}
			}
		} finally {
			// Directory itself was previously never closed.
			dire.close();
		}
	}
}