中文分詞之結巴分詞~~~附使用場景+demo（net）

常用技能（更新ing）： http://www.cnblogs.com/dunitian/p/4822808.html#skill 技能總綱（更新ing）： http://www.cnblogs.com/dunitian/p/5493793.html 線上示範： http://cppjieba-webdemo.herokuapp.com 完整demo： https://github.com/dunitian/TempCode/tree/master/2016-09-05 逆天修改版： https://github.com/dunitian/TempCode/blob/master/2016-09-05/jieba.NET.0.38.2.zip

先說兩個注意點：一、結巴分詞本身不會對分詞結果去重，這件事得我們自己處理；二、字典檔案得自行配置，或者設定成輸出到bin目錄

應用場景舉例（搜尋那塊大家都知道，說點其他的）

——————————————————————————————————————————————————

言歸正傳：看一組民間統計資料：（非Net版，指的是官方版）

.NET版的IKAnalyzer和盤古分詞好多年沒更新了，所以這次選擇了結巴分詞（這個名字也很符合分詞的意境~~結巴說話，是不是也是一種分詞的方式呢？）

下面簡單示範一下：

1.先引入包：

2.字典設定：

3.簡單封裝的幫助類：

using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;

namespace LoTLib.Word.Split
{
    #region 分詞類型
    /// <summary>
    /// Segmentation modes that can be passed to the word-splitting helpers.
    /// </summary>
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Accurate mode: the most basic and natural mode; tries to cut the sentence as precisely as possible. Suited to text analysis.
        /// </summary>
        Default,
        /// <summary>
        /// Full mode: scans out every word that can be formed. Faster, but cannot resolve ambiguity.
        /// </summary>
        CutAll,
        /// <summary>
        /// Search-engine mode: on top of accurate mode, splits long words again to improve recall. Suited to search-engine tokenization.
        /// </summary>
        CutForSearch,
        /// <summary>
        /// Accurate mode without HMM.
        /// </summary>
        Other
    } 
    #endregion

    /// <summary>
    /// Thin helpers around jieba.NET word segmentation.
    /// </summary>
    public static partial class WordSplitHelper
    {
        /// <summary>
        /// Segments a string into words using the requested mode.
        /// </summary>
        /// <param name="objStr">Text to segment. <c>null</c> or an empty string yields an empty sequence.</param>
        /// <param name="type">
        /// Segmentation mode. Any value other than <see cref="JiebaTypeEnum.Default"/>,
        /// <see cref="JiebaTypeEnum.CutAll"/> or <see cref="JiebaTypeEnum.CutForSearch"/>
        /// falls through to accurate mode without HMM.
        /// </param>
        /// <returns>The words as returned by the segmenter; duplicates are NOT removed here.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            // Nothing to segment: avoid handing null to the segmenter.
            if (string.IsNullOrEmpty(objStr))
            {
                return Enumerable.Empty<string>();
            }

            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                 // accurate mode, with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);   // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);        // search-engine mode
                default:
                    return jieba.Cut(objStr, false, false);   // accurate mode, without HMM
            }
        }

        /// <summary>
        /// Segments a string and returns the distinct words joined with commas.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode, see <see cref="GetSplitWords"/>.</param>
        /// <returns>Comma-separated distinct words, or an empty string when there are no words.</returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var words = GetSplitWords(objStr, type);
            if (words == null)
            {
                return string.Empty;
            }

            // The segmenter may emit the same word more than once, so de-duplicate.
            // string.Join already returns "" for an empty sequence, so the result is
            // enumerated exactly once (no separate Count() pass).
            return string.Join(",", words.Distinct());
        }
    }
}

調用很簡單：

// Run one sample sentence through every segmentation mode and print each result.
string sample = "bootstrap-datetimepicker 進一步跟進~~~開始時間和結束時間的樣式顯示";

// Heading to print and the mode to use, in display order.
var demos = new[]
{
    new { Title = "\n精确模式-帶HMM：\n", Mode = JiebaTypeEnum.Default },
    new { Title = "\n全模式：\n", Mode = JiebaTypeEnum.CutAll },
    new { Title = "\n搜尋引擎模式：\n", Mode = JiebaTypeEnum.CutForSearch },
    new { Title = "\n精确模式-不帶HMM：\n", Mode = JiebaTypeEnum.Other },
};

foreach (var demo in demos)
{
    Console.WriteLine(demo.Title);
    Console.WriteLine(sample.GetSplitWordStr(demo.Mode));
}

// Keep the console window open until a key is pressed.
Console.ReadKey();

效果：

--------------------------

有人可能會說，那内容關鍵詞提取呢？==》别急，看下面：

這種方式所對應的字典是它=》idf.txt

簡單說下Constants==》

完整幫助類（最新看github）：

using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;
using JiebaNet.Analyser;

namespace LoTLib.Word.Split
{
    #region 分詞類型
    /// <summary>
    /// Segmentation modes that can be passed to the word-splitting helpers.
    /// </summary>
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Accurate mode: the most basic and natural mode; tries to cut the sentence as precisely as possible. Suited to text analysis.
        /// </summary>
        Default,
        /// <summary>
        /// Full mode: scans out every word that can be formed. Faster, but cannot resolve ambiguity.
        /// </summary>
        CutAll,
        /// <summary>
        /// Search-engine mode: on top of accurate mode, splits long words again to improve recall. Suited to search-engine tokenization.
        /// </summary>
        CutForSearch,
        /// <summary>
        /// Accurate mode without HMM.
        /// </summary>
        Other
    }
    #endregion

    /// <summary>
    /// Thin helpers around jieba.NET word segmentation and TF-IDF keyword extraction.
    /// </summary>
    public static partial class WordSplitHelper
    {
        #region 公用系列
        /// <summary>
        /// Segments a string into words using the requested mode.
        /// </summary>
        /// <param name="objStr">Text to segment. <c>null</c> or an empty string yields an empty sequence.</param>
        /// <param name="type">
        /// Segmentation mode. Any value other than <see cref="JiebaTypeEnum.Default"/>,
        /// <see cref="JiebaTypeEnum.CutAll"/> or <see cref="JiebaTypeEnum.CutForSearch"/>
        /// falls through to accurate mode without HMM.
        /// </param>
        /// <returns>The words as returned by the segmenter; duplicates are NOT removed here.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            // Nothing to segment: avoid handing null to the segmenter.
            if (string.IsNullOrEmpty(objStr))
            {
                return Enumerable.Empty<string>();
            }

            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                 // accurate mode, with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);   // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);        // search-engine mode
                default:
                    return jieba.Cut(objStr, false, false);   // accurate mode, without HMM
            }
        }

        /// <summary>
        /// Extracts keywords from an article, restricted to nouns and verbs.
        /// </summary>
        /// <param name="objStr">Article text. <c>null</c> or an empty string yields an empty sequence.</param>
        /// <param name="count">Maximum number of keywords requested from the extractor (defaults to 10).</param>
        /// <returns>The keywords returned by the TF-IDF extractor.</returns>
        public static IEnumerable<string> GetArticleKeywords(string objStr, int count = 10)
        {
            // Nothing to analyse: avoid handing null to the extractor.
            if (string.IsNullOrEmpty(objStr))
            {
                return Enumerable.Empty<string>();
            }

            var idf = new TfidfExtractor();
            return idf.ExtractTags(objStr, count, Constants.NounAndVerbPos); // nouns and verbs only
        }

        /// <summary>
        /// Joins the distinct words of a sequence with commas.
        /// </summary>
        /// <param name="words">Words to join; may be <c>null</c>.</param>
        /// <returns>Comma-separated distinct words, or an empty string when <paramref name="words"/> is null or empty.</returns>
        public static string JoinKeyWords(IEnumerable<string> words)
        {
            if (words == null)
            {
                return string.Empty;
            }

            // The input may contain the same word more than once, so de-duplicate.
            // string.Join already returns "" for an empty sequence, so the input is
            // enumerated exactly once (no separate Count() pass).
            return string.Join(",", words.Distinct());
        }
        #endregion

        #region 擴充相關
        /// <summary>
        /// Segments a string and returns the distinct words joined with commas.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode, see <see cref="GetSplitWords"/>.</param>
        /// <returns>Comma-separated distinct words, or an empty string when there are no words.</returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            return JoinKeyWords(GetSplitWords(objStr, type));
        }

        /// <summary>
        /// Extracts an article's keywords and returns them joined with commas.
        /// </summary>
        /// <param name="objStr">Article text.</param>
        /// <param name="count">Maximum number of keywords requested from the extractor (defaults to 10).</param>
        /// <returns>Comma-separated distinct keywords, or an empty string when there are none.</returns>
        public static string GetArticleKeywordStr(this string objStr, int count = 10)
        {
            return JoinKeyWords(GetArticleKeywords(objStr, count));
        }
        #endregion
    }
}

還有耐心或者隻看末尾的有福了~

web端的字典配置那是個煩啊，逆天把源碼微調了下

中文分詞之結巴分詞~~~附使用場景+demo（net）

中文分詞之結巴分詞~~~附使用場景+demo（net）
使用方法和上面一樣

中文分詞之結巴分詞~~~附使用場景+demo（net）

中文分詞之結巴分詞~~~附使用場景+demo（net）
web版示範： https://github.com/dunitian/LoTCode/blob/master/PawChina/PawChina/PawChina.UI/Areas/PawRoot/assets/js/note.js https://github.com/dunitian/LoTCode/blob/master/PawChina/PawChina/PawChina.UI/Areas/PawRoot/Controllers/PartialViewController.cs

作者：

毒逆天

出處：

https://www.cnblogs.com/dotnetcrazy

打賞：

18i4JpL6g54yAPAefdtgqwRrZ43YJwAV5z

本文版權歸作者和部落格園共有。歡迎轉載，但必須保留此段聲明，且在文章頁面明顯位置給出原文連結！

中文分詞之結巴分詞~~~附使用場景+demo（net）

繼續閱讀

torch.nn.Embedding的使用torch.nn.Embedding

nn.Embedding()參數的了解nn.Embedding()

pytorch中nn.RNN()總結

聯考志願填報：人工智能專業怎麼樣？人工智能行業發展前景如何？

【Python學習筆記】- Day6

Windows版本的Google word2vec和Stanford GloVe工具

seq2seq與attention實作聊天機器人

奮戰聊天機器人（四）自然語言進行中的文本分類nltk中的貝葉斯分類器

從詞向量衡量标準到全局向量的詞嵌入模型GloVe再到一詞多義的解決方式衡量标準Evaluation引子全局向量的詞嵌入應用對一詞多義的思考Reference

GloVe與word2vec的差別，及GloVe的缺陷

統計學習大作業-BERT模型1 文本處理-BERT模型2 參考資料：

目前音樂推薦系統研究中的挑戰和願景摘要1. 介紹2. 重大的挑戰3. 未來方向和願景

MovieTaster-使用Item2Vec做電影推薦 MovieTaster-使用Item2Vec做電影推薦

anaconda中科大鏡像

NLP從入門到放棄_IBM Model1IBM Model1

解碼器用于語義分割：資料依賴的解碼可以實作靈活的特征聚合