天天看點

jieba與lucene.net使用

1.引用 jieba.net 0.38.2；JiebaNet.Segment 1.6.0；Lucene.Net 3.0.3

2.複制resources檔案夾到項目根目錄下面;

jieba與lucene.net使用

3.把要查詢的資料做成txt檔案;

4.重寫Analyzer與Tokenizer的檔案;

using System.Collections.Generic;
using System.IO;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;

 public class JiebaAnalyzer : Analyzer
    {
        // Fallback stop-word set used when no stop-words file exists on disk.
        protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

        // Loaded exactly once by the static constructor and never reassigned,
        // so it can (and should) be readonly.
        private static readonly ISet<string> StopWords;

        static JiebaAnalyzer()
        {
            var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
            if (File.Exists(stopWordsFile))
            {
                var words = new HashSet<string>();
                foreach (var line in File.ReadAllLines(stopWordsFile))
                {
                    var word = line.Trim();
                    // Skip blank lines: an empty string is not a meaningful stop word.
                    if (word.Length > 0)
                    {
                        words.Add(word);
                    }
                }
                StopWords = words;
            }
            else
            {
                StopWords = DefaultStopWords;
            }
        }

        /// <summary>
        /// Builds the analysis chain: jieba tokenization, then lower-casing,
        /// then stop-word removal.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (unused by the chain).</param>
        /// <param name="reader">Source text to tokenize.</param>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var seg = new JiebaSegmenter();
            TokenStream result = new JiebaTokenizer(seg, reader);
            // Lower-casing is required because the query parser lower-cases query terms.
            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, StopWords);
            return result;
        }
    }
using System.Collections.Generic;
using System.IO;
using System.Linq;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

public class JiebaTokenizer : Tokenizer
    {
        // Segmenter shared by the constructor tokenization and the Tokenize helper.
        private readonly JiebaSegmenter segmenter;
        private readonly ITermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly ITypeAttribute typeAtt;

        // All tokens are produced eagerly up front; IncrementToken just walks this list.
        private readonly List<JiebaNet.Segmenter.Token> tokens;
        private int position = -1;

        public JiebaTokenizer(JiebaSegmenter seg, TextReader input) : this(seg, input.ReadToEnd()) { }

        public JiebaTokenizer(JiebaSegmenter seg, string input)
        {
            segmenter = seg;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            // Search mode splits long words further, improving recall for queries.
            tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
        }

        /// <summary>
        /// Advances to the next pre-computed token; returns false once exhausted.
        /// </summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            position++;
            if (position >= tokens.Count)
            {
                End();
                return false;
            }

            var current = tokens[position];
            termAtt.SetTermBuffer(current.Word);
            offsetAtt.SetOffset(current.StartIndex, current.EndIndex);
            typeAtt.Type = "Jieba";
            return true;
        }

        /// <summary>
        /// Convenience pass-through to the underlying segmenter.
        /// </summary>
        public IEnumerable<JiebaNet.Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
        {
            return segmenter.Tokenize(text, mode);
        }
    }

5建立分詞;

using System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using SignalSmart.Models;

private void LuceneCreate()
{
    // Location on disk where the index files are stored.
    string indexPath = Context.Server.MapPath("ListFolder");
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        bool isUpdate = IndexReader.IndexExists(directory);
        // If a previous indexing run crashed, the write lock may still be held;
        // force-unlock before opening the writer (single-writer scenario only).
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // Opening the IndexWriter takes the write lock; Close() releases it.
        IndexWriter writer = new IndexWriter(directory, new Jieba.Common.JiebaAnalyzer(), !isUpdate,
            IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            string[] filelist = System.IO.Directory.GetFiles(Server.MapPath("listfolder/upload/"));

            // Rebuild from scratch so re-running never duplicates documents.
            writer.DeleteAll();
            foreach (string item in filelist)
            {
                if (!File.Exists(item))
                {
                    continue;
                }

                // Read the file once (the original opened a StreamReader it never
                // disposed AND read the same file a second time into an unused local).
                string contents = File.ReadAllText(item);
                string[] strlist = System.Text.RegularExpressions.Regex.Split(contents, "<換行>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // Guard against malformed files: we index fields [0..3] below.
                if (strlist.Length < 4)
                {
                    continue;
                }

                // One Document per file. Store.YES keeps the original text in the
                // index so no database lookup is needed at query time.
                Document document = new Document();
                document.Add(new Field("id", strlist[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
                // ANALYZED fields are tokenized so they can be full-text searched;
                // WITH_POSITIONS_OFFSETS also records term positions for highlighting.
                document.Add(new Field("title", strlist[1], Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
                document.Add(new Field("content", strlist[2], Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
                document.Add(new Field("imageurl", strlist[3], Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));

                writer.AddDocument(document);
            }
        }
        finally
        {
            writer.Close(); // releases the write lock even if indexing threw
        }
    }
    finally
    {
        directory.Close(); // flush/close the directory so the index is searchable
    }

    Response.Write("索引檔案建立成功!"); // original `Response.write` would not compile
}

6.調用可以快速搜尋;

using System;
using System.Collections.Generic;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using System.IO;
using System.Linq;


 public List<MySearchUnit> Search(string indexPath, string _flag, string keyword, int PageIndex, int PageSize, out int TotalCount)
        {
            // NoLockFactory: this is a read-only consumer of the index.
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
            IndexReader reader = IndexReader.Open(directory, true); // true = read-only
            IndexSearcher searcher = new IndexSearcher(reader);
            try
            {
                var analyzer = new Jieba.Common.JiebaAnalyzer();
                var bq = new BooleanQuery();
                var version = Lucene.Net.Util.Version.LUCENE_30;

                // Optional filter clause on the "flag" field (OR-combined).
                if (_flag != "")
                {
                    var qpflag = new QueryParser(version, "flag", analyzer);
                    bq.Add(qpflag.Parse(_flag), Occur.SHOULD);
                }

                // Segment the keyword with jieba and search every fragment
                // against the "title" field (OR-combined).
                if (keyword != "")
                {
                    string[] arrResult = CutWords(keyword);
                    string[] fields = new string[arrResult.Length];
                    for (int i = 0; i < arrResult.Length; i++)
                    {
                        fields[i] = "title";
                    }
                    bq.Add(MultiFieldQueryParser.Parse(version, arrResult, fields, analyzer), Occur.SHOULD);
                }

                // Collect at most the top 100 hits by score.
                var collector = TopScoreDocCollector.Create(100, false);
                searcher.Search(bq, collector);

                if (PageIndex < 1) PageIndex = 1;

                if (collector.TotalHits == 0)
                {
                    TotalCount = 0;
                    // Preserved original contract: callers may rely on null meaning "no hits".
                    return null;
                }

                int start = PageSize * (PageIndex - 1);
                ScoreDoc[] hits = collector.TopDocs(start, PageSize).ScoreDocs;
                TotalCount = collector.TotalHits;

                var list = new List<MySearchUnit>();
                foreach (ScoreDoc sd in hits)
                {
                    try
                    {
                        Document doc = searcher.Doc(sd.Doc);
                        string id = doc.Get("id");
                        string title = doc.Get("title");
                        string content = doc.Get("content");
                        string imageurl = doc.Get("imageurl");

                        // NOTE(review): highlighting uses the PanGu segmenter while the
                        // index was built with the jieba analyzer — fragments may not line
                        // up with the indexed terms; confirm this mix is intentional.
                        var formatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
                        var highlighter = new PanGu.HighLight.Highlighter(formatter, new PanGu.Segment());
                        highlighter.FragmentSize = 50;
                        content = highlighter.GetBestFragment(keyword, content);
                        string titlehighlight = highlighter.GetBestFragment(keyword, title);
                        if (titlehighlight != "") title = titlehighlight;
                        list.Add(new MySearchUnit(id, title, content, "", imageurl, ""));
                    }
                    catch (Exception ex)
                    {
                        // Best-effort: one bad document must not abort the whole result page.
                        Console.WriteLine(ex.Message);
                    }
                }
                return list;
            }
            finally
            {
                // The original leaked all three on every call.
                searcher.Dispose();
                reader.Dispose();
                directory.Dispose();
            }
        }


        /// <summary>
        /// Segments a keyword with jieba in search mode and returns the fragments.
        /// </summary>
        /// <param name="keyword">Raw query text entered by the user.</param>
        /// <returns>The segmented terms; empty array for empty input.</returns>
        protected string[] CutWords(string keyword)
        {
            var segment = new JiebaNet.Segmenter.JiebaSegmenter();
            // Materialize once: the original built an intermediate List<string>
            // with ToList() only to immediately copy it again with ToArray().
            return segment.CutForSearch(keyword).ToArray();
        }

繼續閱讀