天天看點

特徵提取 --- 中文文本特徵抽取

jieba庫

ex_1

import jieba

def cut_word(text):
    """Segment *text* with jieba and return the pieces joined by spaces.

    The string is passed to ``jieba.cut`` and the resulting tokens are
    concatenated into a single string with one space between each token.
    """
    tokens = jieba.cut(text)
    return ' '.join(tokens)
    
def cut_chinese_demo2():
    """Demonstrate count-based feature extraction on Chinese sentences.

    Each sample sentence is segmented with ``cut_word`` so that its words are
    separated by spaces, the segmented sentences are fitted and transformed
    with ``CountVectorizer``, and the dense count matrix plus the learned
    feature names are printed. Nothing is returned.

    NOTE(review): ``CountVectorizer`` is not imported in the visible part of
    this file; it presumably comes from ``sklearn.feature_extraction.text``
    -- confirm that import exists at module level.
    """
    data = ["每一個公民的合法權利都值得守護",
            "每一個維權訴求都值得珍視。",
            "當且僅當舉報管道暢通無阻、",
            "解決問題馬上就辦,",
            "才能少一些驚詫眼球的“誇張舉報”"]

    # The raw sentences contain no spaces between words, so segment them
    # first to give the vectorizer space-delimited tokens to count.
    data_new = [cut_word(sen) for sen in data]

    transfer = CountVectorizer()
    data_final = transfer.fit_transform(data_new)
    print("data_new:\n", data_final.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement when present and fall back on old releases.
    # list(...) keeps the printed value a plain list, as before.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = list(transfer.get_feature_names_out())
    else:
        feature_names = transfer.get_feature_names()
    print("特征名字:\n", feature_names)
           

繼續閱讀