
[Text Semantic Similarity] Cosine distance over ngram TF-IDF (sklearn implementation)

Text similarity has important applications in question-answering systems, such as knowledge-based QA, document-based QA, and FAQ-based QA (community QA). In these systems, the incoming question must be matched against candidates by similarity so that the closest, most plausible answer can be selected. This section introduces similarity computation using the cosine distance over ngram TF-IDF vectors.
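For reference, the cosine similarity of two vectors u and v is their dot product divided by the product of their norms:

cos(u, v) = (u · v) / (‖u‖ ‖v‖)

Once both sentence vectors are L2-normalized, this reduces to a plain dot product, which is exactly what the code below computes.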

The sklearn-based implementation is as follows:

import os
import re
import jieba
import pickle
import logging
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer 
 
 
class StopWords(object):
    '''Load a stop-word list from file and filter tokens against it.'''
    def __init__(self, stopwords_file='stopwords.txt'):  # placeholder path; point it at your own stop-word file
        with open(stopwords_file, 'r', encoding='utf-8') as f:
            self.stopwords = set(word.strip() for word in f)

    def del_stopwords(self, words):
        return [word for word in words if word not in self.stopwords]

stop_word = StopWords()

# Generate all 2-grams and 3-grams of a token list, dropping empty
# items and any ngram that contains a digit
def _list_3_ngram(words, n=3, m=2):
    pattern1 = re.compile(r'[0-9]')
    if len(words) < n:
        n = len(words)
    temp = [words[i - k:i] for k in range(m, n + 1) for i in range(k, len(words) + 1)]
    return [item for item in temp
            if len(''.join(item).strip()) > 0
            and len(pattern1.findall(''.join(item).strip())) == 0]
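
For intuition, here is what the generator yields on a short, hypothetical token list (any ngram containing a digit would be dropped by the filter):

# _list_3_ngram(['我', '愛', '北京', '天安門']) returns
# [['我', '愛'], ['愛', '北京'], ['北京', '天安門'],
#  ['我', '愛', '北京'], ['愛', '北京', '天安門']]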
    
# Tokenize each sentence (with jieba, or character by character),
# optionally remove stop words, then join each ngram into a single token
def _seg_word(words_list, jieba_flag=True, del_stopword=False):
    if jieba_flag:
        word_list = [list(jieba.cut(words)) for words in words_list]
    else:
        word_list = [list(words) for words in words_list]
    if del_stopword:
        word_list = [stop_word.del_stopwords(words) for words in word_list]
    # iterate over the tokenized word_list, not the raw words_list,
    # otherwise the segmentation above would be silently discarded
    word_list = [['_'.join(i) for i in _list_3_ngram(words, n=3, m=2)] for words in word_list]
    return [' '.join(word) for word in word_list]
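
As a quick sanity check (the exact segmentation depends on jieba's dictionary, so the tokens shown here are an assumption):

# _seg_word(['我愛北京天安門'])
# assuming jieba yields ['我', '愛', '北京', '天安門'], this returns
# ['我_愛 愛_北京 北京_天安門 我_愛_北京 愛_北京_天安門']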
 
 
# Do not use a built-in stop-word list; the token pattern keeps
# single-character tokens (the default '(?u)\b\w\w+\b' requires two)
vectorizer = CountVectorizer(stop_words=None, token_pattern='(?u)\\b\\w\\w*\\b')
transformer = TfidfTransformer()
 
word_list = ['我愛北京天安門', '你好,在幹嘛呢', '這個什麼價錢']
word_list = _seg_word(word_list)
dic = vectorizer.fit_transform(word_list)
tfidf = transformer.fit_transform(dic)
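
To check what features were actually learned, you can inspect the fitted vocabulary (get_feature_names_out requires scikit-learn >= 1.0; on older versions use get_feature_names):

print(vectorizer.get_feature_names_out())  # the ngram vocabulary
print(tfidf.shape)                         # (number of sentences, vocabulary size)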
 
# Save the fitted models
dic_path = './bow.model'
with open(dic_path, 'wb') as f:
    pickle.dump(vectorizer, f)
tfidf_model_path = 'tfidf.model'
with open(tfidf_model_path, 'wb') as f:
    pickle.dump(transformer, f)
 
# Load the models back
with open(dic_path, 'rb') as f:
    vectorizer = pickle.load(f)
with open(tfidf_model_path, 'rb') as f:
    transformer = pickle.load(f)
 
 
def _normalize(x):
    # L2-normalize each row; note that a sentence with no in-vocabulary
    # ngrams has zero norm and would produce NaNs here
    x /= (np.array(x) ** 2).sum(axis=1, keepdims=True) ** 0.5
    return x
 
# Build a sentence vector: tf-idf transform, collapse to a dense 1-D
# array, then L2-normalize (word1 is already a list, so it must not
# be wrapped in another list before _seg_word)
word1 = ['你好,在幹嘛呢']
ngram_tfidf_embedding1 = transformer.transform(vectorizer.transform(_seg_word(word1)))
ngram_tfidf_embedding1 = ngram_tfidf_embedding1.toarray().sum(axis=0)
ngram_tfidf_embedding1 = ngram_tfidf_embedding1[np.newaxis, :].astype(float)
ngram_tfidf_embedding1 = _normalize(ngram_tfidf_embedding1)

word2 = ['這個什麼價錢']
ngram_tfidf_embedding2 = transformer.transform(vectorizer.transform(_seg_word(word2)))
ngram_tfidf_embedding2 = ngram_tfidf_embedding2.toarray().sum(axis=0)
ngram_tfidf_embedding2 = ngram_tfidf_embedding2[np.newaxis, :].astype(float)
ngram_tfidf_embedding2 = _normalize(ngram_tfidf_embedding2)
 
 
# Similarity score between the two sentences: with unit-normalized
# vectors, cosine similarity is just the dot product
score = np.dot(ngram_tfidf_embedding1[0], ngram_tfidf_embedding2[0])
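
Equivalently, the manual normalization can be skipped by computing the cosine directly on the sparse tf-idf rows; a minimal sketch using scikit-learn's cosine_similarity:

from sklearn.metrics.pairwise import cosine_similarity

vec1 = transformer.transform(vectorizer.transform(_seg_word(['你好,在幹嘛呢'])))
vec2 = transformer.transform(vectorizer.transform(_seg_word(['這個什麼價錢'])))
score = cosine_similarity(vec1, vec2)[0][0]  # same cosine score as above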
 
           
