常用技能(更新ing): http://www.cnblogs.com/dunitian/p/4822808.html#skill 技能總綱(更新ing): http://www.cnblogs.com/dunitian/p/5493793.html 線上示範: http://cppjieba-webdemo.herokuapp.com 完整demo: https://github.com/dunitian/TempCode/tree/master/2016-09-05 逆天修改版: https://github.com/dunitian/TempCode/blob/master/2016-09-05/jieba.NET.0.38.2.zip
先說下注意點:結巴分詞它沒有對分詞結果進行去重,我們得自己做這件事;字典得自行配置,或者設定成輸出到bin目錄
應用場景舉例(搜尋那塊大家都知道,說點其他的)
——————————————————————————————————————————————————
言歸正傳:看一組民間統計資料:(非Net版,指的是官方版)
net版的IKanalyzer和盤古分詞好多年沒更新了,所以這次選擇了結巴分詞(這個名字也很符合分詞的意境~~結巴說話,是不是也是一種分詞的方式呢?)
下面簡單示範一下:
1.先引入包:
2.字典設定: 3.簡單封裝的幫助類:調用很簡單:
using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;

namespace LoTLib.Word.Split
{
    #region Segmentation modes
    /// <summary>
    /// Segmentation mode passed to the jieba.NET segmenter.
    /// </summary>
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Accurate mode (with HMM) — the default and most natural mode; tries to
        /// cut the sentence as precisely as possible. Suited for text analysis.
        /// </summary>
        Default,
        /// <summary>
        /// Full mode — scans out every fragment that can form a word. Faster,
        /// but cannot resolve ambiguity.
        /// </summary>
        CutAll,
        /// <summary>
        /// Search-engine mode — on top of accurate mode, re-splits long words to
        /// improve recall. Suited for search-engine indexing.
        /// </summary>
        CutForSearch,
        /// <summary>
        /// Accurate mode without HMM.
        /// </summary>
        Other
    }
    #endregion

    /// <summary>
    /// Jieba word-segmentation helper.
    /// </summary>
    public static partial class WordSplitHelper
    {
        /// <summary>
        /// Segments a string into words using the requested mode.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode; defaults to accurate mode with HMM.</param>
        /// <returns>The segmented words; may contain duplicates.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            // NOTE(review): a new segmenter is built on every call; if this turns out
            // to be hot, confirm whether JiebaSegmenter can safely be cached/reused.
            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                 // accurate mode, with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);   // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);        // search-engine mode
                default:
                    return jieba.Cut(objStr, false, false);   // accurate mode, no HMM
            }
        }

        /// <summary>
        /// Segments a string and joins the distinct words with commas.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode; defaults to accurate mode with HMM.</param>
        /// <returns>Comma-joined distinct words, or an empty string when there is no result.</returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var words = GetSplitWords(objStr, type);
            // Any() instead of Count() < 1: emptiness check should not force a full
            // enumeration of a potentially lazy sequence.
            if (words == null || !words.Any())
            {
                return string.Empty;
            }
            // jieba does not de-duplicate, so do it here before joining.
            return string.Join(",", words.Distinct());
        }
    }
}
效果:
// Demo: push the same sentence through every segmentation mode and print the results.
var sample = "bootstrap-datetimepicker 進一步跟進~~~開始時間和結束時間的樣式顯示";

Console.WriteLine("\n精确模式-帶HMM:\n");
Console.WriteLine(sample.GetSplitWordStr());

Console.WriteLine("\n全模式:\n");
Console.WriteLine(sample.GetSplitWordStr(JiebaTypeEnum.CutAll));

Console.WriteLine("\n搜尋引擎模式:\n");
Console.WriteLine(sample.GetSplitWordStr(JiebaTypeEnum.CutForSearch));

Console.WriteLine("\n精确模式-不帶HMM:\n");
Console.WriteLine(sample.GetSplitWordStr(JiebaTypeEnum.Other));

Console.ReadKey();
--------------------------
有人可能會說,那内容關鍵詞提取呢?==》别急,看下面:
這種方式所對應的字典是它=》idf.txt 簡單說下Constants==》 完整幫助類(最新看github):
using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;
using JiebaNet.Analyser;

namespace LoTLib.Word.Split
{
    #region Segmentation modes
    /// <summary>
    /// Segmentation mode passed to the jieba.NET segmenter.
    /// </summary>
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Accurate mode (with HMM) — the default and most natural mode; tries to
        /// cut the sentence as precisely as possible. Suited for text analysis.
        /// </summary>
        Default,
        /// <summary>
        /// Full mode — scans out every fragment that can form a word. Faster,
        /// but cannot resolve ambiguity.
        /// </summary>
        CutAll,
        /// <summary>
        /// Search-engine mode — on top of accurate mode, re-splits long words to
        /// improve recall. Suited for search-engine indexing.
        /// </summary>
        CutForSearch,
        /// <summary>
        /// Accurate mode without HMM.
        /// </summary>
        Other
    }
    #endregion

    /// <summary>
    /// Jieba word-segmentation helper: segmentation plus TF-IDF keyword extraction.
    /// </summary>
    public static partial class WordSplitHelper
    {
        #region Core helpers
        /// <summary>
        /// Segments a string into words using the requested mode.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">Segmentation mode; defaults to accurate mode with HMM.</param>
        /// <returns>The segmented words; may contain duplicates.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            // NOTE(review): a new segmenter is built on every call; if this turns out
            // to be hot, confirm whether JiebaSegmenter can safely be cached/reused.
            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                 // accurate mode, with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);   // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);        // search-engine mode
                default:
                    return jieba.Cut(objStr, false, false);   // accurate mode, no HMM
            }
        }

        /// <summary>
        /// Extracts article keywords via TF-IDF.
        /// </summary>
        /// <param name="objStr">Article text.</param>
        /// <returns>Up to 10 keywords restricted to noun/verb parts of speech.</returns>
        public static IEnumerable<string> GetArticleKeywords(string objStr)
        {
            var idf = new TfidfExtractor();
            // Top 10 tags, nouns and verbs only (Constants.NounAndVerbPos).
            return idf.ExtractTags(objStr, 10, Constants.NounAndVerbPos);
        }

        /// <summary>
        /// Joins the distinct words with commas.
        /// </summary>
        /// <param name="words">Words to join; may be null or contain duplicates.</param>
        /// <returns>Comma-joined distinct words, or an empty string when there is no result.</returns>
        public static string JoinKeyWords(IEnumerable<string> words)
        {
            // Any() instead of Count() < 1: emptiness check should not force a full
            // enumeration of a potentially lazy sequence.
            if (words == null || !words.Any())
            {
                return string.Empty;
            }
            // jieba does not de-duplicate, so do it here before joining.
            return string.Join(",", words.Distinct());
        }
        #endregion

        #region Extension methods
        /// <summary>
        /// Segments a string and returns the comma-joined distinct words.
        /// </summary>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var words = GetSplitWords(objStr, type);
            return JoinKeyWords(words);
        }

        /// <summary>
        /// Extracts article keywords and returns them comma-joined.
        /// </summary>
        public static string GetArticleKeywordStr(this string objStr)
        {
            var words = GetArticleKeywords(objStr);
            return JoinKeyWords(words);
        }
        #endregion
    }
}
還有耐心或者隻看末尾的有福了~
web端的字典配置那是個煩啊,逆天把源碼微調了下
使用方法和上面一樣 web版示範: https://github.com/dunitian/LoTCode/blob/master/PawChina/PawChina/PawChina.UI/Areas/PawRoot/assets/js/note.js https://github.com/dunitian/LoTCode/blob/master/PawChina/PawChina/PawChina.UI/Areas/PawRoot/Controllers/PartialViewController.cs
結巴中文分詞相關:
https://github.com/fxsjy/jieba
https://github.com/anderscui/jieba.NET

作者:毒逆天
出處:https://www.cnblogs.com/dotnetcrazy
打賞:18i4JpL6g54yAPAefdtgqwRrZ43YJwAV5z

本文版權歸作者和部落格園共有。歡迎轉載,但必須保留此段聲明,且在文章頁面明顯位置給出原文連結!