1. Add the references: jieba.NET 0.38.2, JiebaNet.Segment 1.6.0, and Lucene.Net 3.0.3;
2. Copy the Resources folder to the project root directory;
3. Save the data to be searched as .txt files, one record per file (a sample layout is sketched below);
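Based on how the indexing code in step 5 splits each file, a record holds four fields (id, title, content, imageurl) separated by the literal marker <換行>. A hypothetical one-record data file (the values are made up):

1<換行>Getting started with Lucene.Net<換行>Body text that will be indexed for full-text search...<換行>/listfolder/upload/images/1.jpg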
4. Add the overridden analyzer and tokenizer files:
using System.Collections.Generic;
using System.IO;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;
// Custom analyzer that tokenizes with jieba.NET. It lives in the Jieba.Common
// namespace (it is referenced below as Jieba.Common.JiebaAnalyzer).
public class JiebaAnalyzer : Analyzer
{
    protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    private static ISet<string> StopWords;

    static JiebaAnalyzer()
    {
        // Load the stop-word list shipped in the Resources folder; fall back to
        // Lucene's English stop words if the file is missing.
        var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
        if (File.Exists(stopWordsFile))
        {
            var lines = File.ReadAllLines(stopWordsFile);
            StopWords = new HashSet<string>();
            foreach (var line in lines)
            {
                StopWords.Add(line.Trim());
            }
        }
        else
        {
            StopWords = DefaultStopWords;
        }
    }

    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var seg = new JiebaSegmenter();
        TokenStream result = new JiebaTokenizer(seg, reader);
        // This filter is required because the query parser lower-cases query terms.
        result = new LowerCaseFilter(result);
        result = new StopFilter(true, result, StopWords);
        return result;
    }
}
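The analyzer delegates segmentation to JiebaSegmenter. A minimal console sketch of the raw tokens it produces, including the offsets that the tokenizer below will hand to Lucene (the sample text is arbitrary):

using System;
using JiebaNet.Segmenter;

public class TokenizeCheck
{
    public static void Main()
    {
        var seg = new JiebaSegmenter();
        // TokenizerMode.Search yields the finer-grained tokens used for indexing.
        foreach (var token in seg.Tokenize("全文檢索", TokenizerMode.Search))
        {
            Console.WriteLine("{0} [{1}, {2})", token.Word, token.StartIndex, token.EndIndex);
        }
    }
}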
using System.Collections.Generic;
using System.IO;
using System.Linq;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
// Custom tokenizer that wraps JiebaSegmenter and exposes its tokens to Lucene.Net.
public class JiebaTokenizer : Tokenizer
{
    private JiebaSegmenter segmenter;
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private ITypeAttribute typeAtt;
    private List<JiebaNet.Segmenter.Token> tokens;
    private int position = -1;

    public JiebaTokenizer(JiebaSegmenter seg, TextReader input) : this(seg, input.ReadToEnd()) { }

    public JiebaTokenizer(JiebaSegmenter seg, string input)
    {
        segmenter = seg;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        // Segment the whole input up front; IncrementToken then walks the token list.
        tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
    }

    public override bool IncrementToken()
    {
        ClearAttributes();
        position++;
        if (position < tokens.Count)
        {
            var token = tokens[position];
            termAtt.SetTermBuffer(token.Word);
            offsetAtt.SetOffset(token.StartIndex, token.EndIndex);
            typeAtt.Type = "Jieba";
            return true;
        }
        End();
        return false;
    }

    public IEnumerable<JiebaNet.Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    {
        return segmenter.Tokenize(text, mode);
    }
}
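With both classes in place, you can sanity-check the whole chain by printing the terms the analyzer emits. A minimal console sketch, assuming the JiebaAnalyzer and JiebaTokenizer above are in scope (the sample sentence is arbitrary):

using System;
using System.IO;
using Lucene.Net.Analysis.Tokenattributes;

public class AnalyzerCheck
{
    public static void Main()
    {
        var analyzer = new JiebaAnalyzer();
        // Runs the full chain: JiebaTokenizer -> LowerCaseFilter -> StopFilter.
        var stream = analyzer.TokenStream("content", new StringReader("Lucene.Net全文檢索入門"));
        var termAtt = stream.GetAttribute<ITermAttribute>();
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAtt.Term); // one segmented, lower-cased, stop-filtered term per line
        }
    }
}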
5. Create the index (segmenting each data file with JiebaAnalyzer):
using System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using SignalSmart.Models;
private void LuceneCreate()
{
    string indexPath = Context.Server.MapPath("ListFolder"); // where the index files are stored
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    bool isUpdate = IndexReader.IndexExists(directory); // does an index already exist?
    if (isUpdate)
    {
        // If the index directory is locked (e.g. the process exited abnormally while
        // indexing), unlock it first. Lucene.Net locks the index automatically before
        // writing and unlocks it when the writer is disposed. This does not make
        // indexing thread-safe; it only recovers from a stale lock.
        if (IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory); // forced unlock; could be made more robust
        }
    }
    // Create the writer: IndexWriter(index directory, analyzer used for tokenizing,
    // create flag, max field length). Note: opening the directory with an IndexWriter
    // automatically locks the index files.
    IndexWriter writer = new IndexWriter(directory, new Jieba.Common.JiebaAnalyzer(), !isUpdate,
        IndexWriter.MaxFieldLength.UNLIMITED);
    // Read all data files.
    string[] filelist = System.IO.Directory.GetFiles(Server.MapPath("listfolder/upload/"));
    // Avoid duplicate entries: clear the index before re-adding everything.
    // writer.DeleteDocuments(new Term("id")); // per-record variant => delete from t where id=i
    writer.DeleteAll();
    foreach (string item in filelist)
    {
        if (!File.Exists(item))
        {
            continue;
        }
        string contents = File.ReadAllText(item);
        // Each file holds one record; fields are separated by the literal marker "<換行>".
        string[] strlist = System.Text.RegularExpressions.Regex.Split(contents, "<換行>",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
        // One Document corresponds to one record.
        Document document = new Document();
        // Every Document has its own fields; field names are user-defined and values are strings.
        // Field.Store.YES stores the original text besides indexing it, so retrieving a hit
        // needs no extra database round-trip.
        document.Add(new Field("id", strlist[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Fields that need full-text search use Field.Index.ANALYZED, which stores the
        // tokenized content; without it, later keyword queries cannot match.
        // WITH_POSITIONS_OFFSETS stores not only the tokens but also their positions and offsets.
        document.Add(new Field("title", strlist[1], Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.Add(new Field("content", strlist[2], Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.Add(new Field("imageurl", strlist[3], Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        // Write the document into the index.
        writer.AddDocument(document);
    }
    writer.Dispose();    // disposing the writer commits the changes and releases the index lock
    directory.Dispose(); // don't forget this, or the indexed data won't be searchable
    Response.Write("Index files created successfully!");
}
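LuceneCreate uses Context.Server.MapPath and Response.Write, so it is meant to run inside an ASP.NET WebForms page. A hypothetical handler to trigger a rebuild (the button name is made up):

// Hypothetical WebForms handler: rebuild the index on demand.
// Assumes listfolder/upload/ contains the .txt data files from step 3.
protected void btnRebuildIndex_Click(object sender, EventArgs e)
{
    LuceneCreate();
}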
6. Call the search method to query quickly:
using System;
using System.Collections.Generic;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using System.IO;
using System.Linq;
public List<MySearchUnit> Search(string indexPath, string _flag, string keyword, int PageIndex, int PageSize, out int TotalCount)
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true); // true = open read-only
    // Build the query. A PhraseQuery on a single field would be the equivalent of
    // where contains("msg", kw), e.g.: query.Add(new Term("msg", keyword));
    Jieba.Common.JiebaAnalyzer analyzer = new Jieba.Common.JiebaAnalyzer();
    BooleanQuery bq = new BooleanQuery();
    Lucene.Net.Util.Version version = Lucene.Net.Util.Version.LUCENE_30;
    if (_flag != "")
    {
        QueryParser qpflag = new QueryParser(version, "flag", analyzer);
        Query qflag = qpflag.Parse(_flag);
        bq.Add(qflag, Occur.SHOULD); // SHOULD = OR; use Occur.MUST for AND semantics
    }
    Query queryKeyword = null;
    if (keyword != "")
    {
        // Segment the keyword with jieba and match every resulting term against "title".
        string[] arrResult = CutWords(keyword);
        string[] fields = new string[arrResult.Length]; // one field name per term
        for (int i = 0; i < arrResult.Length; i++)
        {
            fields[i] = "title";
        }
        queryKeyword = MultiFieldQueryParser.Parse(version, arrResult, fields, analyzer);
        bq.Add(queryKeyword, Occur.SHOULD); // SHOULD = OR
    }
    // Collect at most the top 100 hits; raise this limit to page deeper into the results.
    TopScoreDocCollector collector = TopScoreDocCollector.Create(100, false);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.Search(bq, collector);
    if (PageIndex < 1) PageIndex = 1;
    if (collector.TotalHits == 0)
    {
        TotalCount = 0;
        searcher.Dispose();
        reader.Dispose();
        return null;
    }
    else
    {
        int start = PageSize * (PageIndex - 1); // index of the first hit on the requested page
        int limit = PageSize;                   // hits per page
        ScoreDoc[] hits = collector.TopDocs(start, limit).ScoreDocs;
        List<MySearchUnit> list = new List<MySearchUnit>();
        TotalCount = collector.TotalHits;
        foreach (ScoreDoc sd in hits) // iterate over the hits on this page
        {
            try
            {
                Document doc = searcher.Doc(sd.Doc);
                string id = doc.Get("id");
                string title = doc.Get("title");
                string content = doc.Get("content");
                // string flag = doc.Get("flag");
                string imageurl = doc.Get("imageurl");
                // string updatetime = doc.Get("updatetime");
                // Highlight the keyword in the stored text (this part still uses the PanGu highlighter).
                PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                    new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
                PanGu.HighLight.Highlighter highlighter =
                    new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new PanGu.Segment());
                highlighter.FragmentSize = 50;
                content = highlighter.GetBestFragment(keyword, content);
                string titlehighlight = highlighter.GetBestFragment(keyword, title);
                if (titlehighlight != "") title = titlehighlight;
                list.Add(new MySearchUnit(id, title, content, "", imageurl, ""));
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
        searcher.Dispose();
        reader.Dispose();
        return list;
    }
}
// Segment a keyword with jieba in search mode and return the terms as an array.
protected string[] CutWords(string keyword)
{
    var segment = new JiebaNet.Segmenter.JiebaSegmenter();
    return segment.CutForSearch(keyword).ToArray();
}
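Putting it together, a minimal usage sketch: the keyword is an arbitrary example, and MySearchUnit's members are whatever SignalSmart.Models defines.

int totalCount;
string indexPath = Server.MapPath("ListFolder"); // same folder LuceneCreate wrote to
// First page, 10 hits per page, no flag filter.
List<MySearchUnit> results = Search(indexPath, "", "全文檢索", 1, 10, out totalCount);
Response.Write("Total hits: " + totalCount + "<br/>");
// results is null when nothing matched; each unit carries the highlighted title and
// content (with <font color="red"> markup) ready to render in the results page.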