天天看点

特征提取 --- 中文文本特征抽取

jieba库

ex_1

import jieba

def cut_word(text):
    """Segment *text* with jieba and return the tokens joined by single spaces.

    The space-separated form lets whitespace/word-boundary based tokenizers
    process text that has no natural word delimiters.
    """
    tokens = jieba.cut(text)
    return ' '.join(tokens)
    
def cut_chinese_demo2():
    """Demonstrate bag-of-words feature extraction on Chinese sentences.

    Each sample sentence is segmented with ``cut_word`` and the resulting
    space-separated strings are vectorized with ``CountVectorizer``. The
    dense count matrix and the learned feature names are printed.

    NOTE(review): ``CountVectorizer`` is not imported in the visible part of
    this file; it is presumably
    ``sklearn.feature_extraction.text.CountVectorizer`` -- confirm the import
    exists.
    """
    data = ["每一个公民的合法权利都值得守护",
            "每一个维权诉求都值得珍视。",
            "当且仅当举报渠道畅通无阻、",
            "解决问题马上就办,",
            "才能少一些惊诧眼球的“夸张举报”"]
    # Pre-segment every sentence so the vectorizer sees word boundaries.
    data_new = [cut_word(sen) for sen in data]
    transfer = CountVectorizer()
    data_final = transfer.fit_transform(data_new)
    print("data_new:\n", data_final.toarray())
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API and fall back only on releases that predate it.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = transfer.get_feature_names_out()
    else:
        feature_names = transfer.get_feature_names()
    # Convert to a plain list of str so the printed form matches the
    # list output of the old API.
    print("特征名字:\n", [str(name) for name in feature_names])
           

继续阅读