Text similarity has important applications in question-answering systems, such as knowledge-based QA, document-based QA, and FAQ-based QA (community QA). In these systems the question text must be compared for similarity against candidate content so that the closest and most reasonable answer can be selected. This section introduces similarity computation based on the cosine distance between n-gram tf-idf vectors.
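Before going into the code, it helps to recall the formula being implemented. The cosine similarity between two tf-idf vectors a and b is

    cos(a, b) = (a · b) / (||a|| × ||b||)

so once both vectors are scaled to unit length (L2-normalized), the similarity reduces to a plain dot product. The code below builds n-gram tf-idf vectors, normalizes them, and takes exactly this dot product.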
An implementation based on sklearn is as follows:
import re
import pickle

import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
class StopWords(object):
    '''Loads a stop word list from a file and filters stop words out of token lists.'''
    # The default path below is an assumption; point it at your own stop word file.
    def __init__(self, stopwords_file='stopwords.txt'):
        with open(stopwords_file, 'r', encoding='utf-8') as f:
            self.stopwords = set(word.strip() for word in f)

    def del_stopwords(self, words):
        return [word for word in words if word not in self.stopwords]
stop_word = StopWords()
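As a quick illustration (assuming the stop word file contains common function words such as '的'; the actual contents depend entirely on your file):

print(stop_word.del_stopwords(['北京', '的', '天安門']))
# -> ['北京', '天安門'], assuming '的' is listed in the stop word file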
# Generate all 2-grams and 3-grams from a token list, dropping any
# n-gram that is empty or contains a digit.
def _list_3_ngram(words, n=3, m=2):
    pattern1 = re.compile(r'[0-9]')
    if len(words) < n:
        n = len(words)
    temp = [words[i - k:i] for k in range(m, n + 1) for i in range(k, len(words) + 1)]
    return [item for item in temp
            if len(''.join(item).strip()) > 0
            and len(pattern1.findall(''.join(item).strip())) == 0]
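For intuition, here is what the function returns for a sample token list (all 2-grams first, then all 3-grams):

print(_list_3_ngram(['我', '愛', '北京', '天安門']))
# -> [['我', '愛'], ['愛', '北京'], ['北京', '天安門'],
#     ['我', '愛', '北京'], ['愛', '北京', '天安門']]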
# Segment each sentence (with jieba, or character by character), optionally
# remove stop words, then represent it by its '_'-joined n-grams.
def _seg_word(words_list, jieba_flag=True, del_stopword=False):
    if jieba_flag:
        word_list = [list(jieba.cut(words)) for words in words_list]
    else:
        word_list = [list(words) for words in words_list]
    if del_stopword:
        word_list = [stop_word.del_stopwords(words) for words in word_list]
    word_list = [['_'.join(i) for i in _list_3_ngram(words, n=3, m=2)] for words in word_list]
    return [' '.join(word) for word in word_list]
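For example, assuming jieba segments '我愛北京天安門' into ['我', '愛', '北京', '天安門'] (the exact split depends on jieba's dictionary), each sentence becomes a space-separated string of '_'-joined n-grams:

print(_seg_word(['我愛北京天安門']))
# -> ['我_愛 愛_北京 北京_天安門 我_愛_北京 愛_北京_天安門']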
# Do not use sklearn's built-in (English) stop word list; the tokens here
# are the space-separated n-grams produced by _seg_word.
vectorizer = CountVectorizer(stop_words=None, token_pattern=r'(?u)\b\w\w*\b')
transformer = TfidfTransformer()
word_list = ['我愛北京天安門', '你好,在幹嘛呢', '這個什麼價錢']
word_list = _seg_word(word_list)
counts = vectorizer.fit_transform(word_list)  # n-gram count matrix
tfidf = transformer.fit_transform(counts)     # tf-idf-weighted matrix
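The fitted vectorizer exposes the learned n-gram vocabulary through its vocabulary_ attribute, which is useful for a quick sanity check. (The CountVectorizer + TfidfTransformer pair used here could also be replaced by sklearn's single TfidfVectorizer; the two-step form is kept because the two objects are pickled separately below.)

print(len(vectorizer.vocabulary_))  # number of distinct n-grams seen during fitting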
# Save the models
dic_path = './bow.model'
with open(dic_path, 'wb') as f:
    pickle.dump(vectorizer, f)
tfidf_model_path = './tfidf.model'
with open(tfidf_model_path, 'wb') as f:
    pickle.dump(transformer, f)
# Load the models
with open(dic_path, 'rb') as f:
    vectorizer = pickle.load(f)
with open(tfidf_model_path, 'rb') as f:
    transformer = pickle.load(f)
def _normalize(x):
    # L2-normalize each row so that cosine similarity reduces to a plain dot
    # product. Note: a row of all zeros (a sentence with no in-vocabulary
    # n-grams) would trigger a division by zero here.
    x /= (np.array(x) ** 2).sum(axis=1, keepdims=True) ** 0.5
    return x
# Build the sentence vector for the first sentence
word1 = '你好,在幹嘛呢'
ngram_tfidf_embedding1 = transformer.transform(vectorizer.transform(_seg_word([word1])))
ngram_tfidf_embedding1 = ngram_tfidf_embedding1.toarray().sum(axis=0)
ngram_tfidf_embedding1 = ngram_tfidf_embedding1[np.newaxis, :].astype(float)
ngram_tfidf_embedding1 = _normalize(ngram_tfidf_embedding1)
# Build the sentence vector for the second sentence
word2 = '這個什麼價錢'
ngram_tfidf_embedding2 = transformer.transform(vectorizer.transform(_seg_word([word2])))
ngram_tfidf_embedding2 = ngram_tfidf_embedding2.toarray().sum(axis=0)
ngram_tfidf_embedding2 = ngram_tfidf_embedding2[np.newaxis, :].astype(float)
ngram_tfidf_embedding2 = _normalize(ngram_tfidf_embedding2)
# Get the similarity score between the two sentences
score = np.dot(ngram_tfidf_embedding1[0], ngram_tfidf_embedding2[0])
print(score)
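Because both embeddings are L2-normalized, this dot product is exactly their cosine similarity. With non-negative tf-idf weights the score lies in [0, 1]; the two example sentences here share no n-grams, so the score should come out as 0.0, while sentences with overlapping n-grams score closer to 1.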