
lda + word2vec: combining topic models with deep learning

Lately I've been trying to come up with a topic for my master's thesis paper on combining LDA with deep learning. I've read paper after paper until my head hurt, and there really isn't an obvious point of novelty. I thought up a pile of small, miscellaneous ideas, but either I rejected them myself or I later found, while reading more papers, that they had already been written up (even points that weak get published). I wanted a contribution that is at least somewhat valuable, but the more I thought about it the more my head hurt, so I just kept reading papers.

Back to the point. I've gone through several dozen foreign-language papers by now (they sound impressive, but in fact only a few are any good), and only one or two offer a worthwhile way of combining the two models. A few papers with genuine novelty that I'd recommend: "A Hybrid Document Feature Extraction Method Using Latent Dirichlet Allocation and Word2Vec", "Improving topic models with latent feature word representations", and "Topic2Vec: learning distributed representations of topics". Most of the others stay at the application level, and truly novel ones are rare; of course, if you know of good papers, please recommend them (a thousand thanks in advance).

Here I implement the method from "A Hybrid Document Feature Extraction Method Using Latent Dirichlet Allocation and Word2Vec". Briefly, its idea is as follows (a compact numpy sketch of steps 2-5 comes right after the list):

1. Take the top N words of topic T as its topic words.

2. Normalize the topic words, i.e. compute the weight of each word w within the topic.

3. Map the topic into word2vec space: for each topic word, multiply w's word2vec coordinates by w's weight within topic T, and sum.

4. Compute each document's coordinates in word2vec space: add up the word2vec coordinates of its words and divide by the total number of words.

5. Compute the distance doc_t between each document and each topic.

6. Use a KNN classifier on these distances as the classification model.

7. For the test documents, compute their distances to the topics as in steps 4 and 5, and feed the resulting matrix into the model for prediction.
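
Before the full script, here is a compact numpy sketch of steps 2-5. The names topic_terms, wv and topic_vecs are placeholders of mine, not variables from the code below; wv stands for any word-to-vector lookup, e.g. a trained gensim word2vec model.

import numpy as np

def topic_vector(topic_terms, wv):
    """Steps 2-3: map one topic (a list of (word, probability) pairs) into word2vec space."""
    probs = np.array([p for _, p in topic_terms])
    weights = probs / probs.sum()                     # step 2: normalize the top-word probabilities
    vecs = np.array([wv[w] for w, _ in topic_terms])  # step 3: look up each word's coordinates
    return (weights[:, None] * vecs).sum(axis=0)      # weighted sum = the topic's coordinate

def doc_vector(words, wv):
    """Step 4: a document's coordinate is the mean of its word vectors."""
    return np.mean([wv[w] for w in words], axis=0)

def doc_topic_distances(doc_vec, topic_vecs):
    """Step 5: Euclidean distance from one document to every topic."""
    return np.linalg.norm(np.asarray(topic_vecs) - doc_vec, axis=1)

The full script below does the same thing, just written out with explicit loops over topics and documents.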

Now for the code:

from gensim import models,corpora,similarities
from sklearn.cross_validation import train_test_split  # renamed to sklearn.model_selection in newer scikit-learn
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from gensim.models import LdaModel
import numpy as np
import os
import random
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import word2vec
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
doc=[]      # tokenized documents (one list of words per line)
label_c=[]  # class label of each document (one class per file)
ii=0
for x in os.listdir("D:\Documents\data\zhaiyao\\1\\"):
    print x
    for line in open("D:\Documents\data\zhaiyao\\1\\"+x,"r").readlines():
        doc.append(line.strip().split("  "))
        label_c.append(ii)
    ii+=1
size_word=100  # word2vec vector dimension
size_lda=200   # number of LDA topics
print "training word vectors"
model_wv=word2vec.Word2Vec(doc, size=size_word,workers=2,min_count=1,iter=10)#size: vector dimension, min_count: minimum word frequency
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(doc, label_c, test_size=0.2)
dictionary = corpora.Dictionary(doc)
corpus = [ dictionary.doc2bow(text) for text in x_train_1 ]
print "訓練LDA"
lda = LdaModel(corpus=corpus,id2word=dictionary, num_topics=size_lda)
#lda.get_document_topics
for doc_num in x_test_1:
    doc_bow=dictionary.doc2bow(doc_num)#将訓練集轉
    doc_lda = lda[doc_bow]
    print "輸出新文檔的主題分布",doc_lda#輸出新文檔的主題分布
    break
for topic in doc_lda:
    print "該主題站的比例","%s\t%f\n"%(lda.print_topic(topic[0]), topic[1])
print lda.get_topic_terms(0, topn=10)#擷取某個主題0下的前topn個詞語,越相關的在前面 
aa=[x[0] for x in lda.get_topic_terms(0, topn=10)]
for it in aa:
    print "第%d個主題的第%d個主題詞"%(0,it), dictionary[it]
#在整個樣本下,某個詞屬于某些主題的可能性;擷取某個詞最有可能的主題清單,word_id為詞id值,minimum_probability為門檻值;
print  lda.get_term_topics(0, minimum_probability=0.05)
#per_word_topics為True時還得到文檔中每個詞屬于的主題清單,minimum_probability确定主題的門檻值,minimum_phi_value判斷詞屬于主題的門檻值
print lda.get_document_topics(corpus[0], minimum_probability=None, minimum_phi_value=0.1, per_word_topics=False)


print "映射主題到word2vec空間"
the_id=[]#每個主題的前10個詞的ID
the_vl=[]#每個主題的前10個詞的value
the_w =[]#每個主題的前10個詞的占權重
print "計算主題内個詞的權重"
for x in range(size_lda):
    the_id.append([xx[0] for xx in lda.get_topic_terms(x,topn=5)])
    the_sum=sum([xx[1] for xx in lda.get_topic_terms(x,topn=5)])
    the_w.append([xx[1]/the_sum for xx in lda.get_topic_terms(x,topn=5)])
    #print x,"主題",the_sum,the_w
print "開始映射到坐标"
m=0
the_wv=np.zeros([size_lda,size_word])#每個主題映射到word2vec,主題數,word2vec
#主題下每個詞在word2vec下坐标權重求和
for it in the_id:
    n=0
    for it_id in it:
        word_t=dictionary[it_id]
        #print word_t+"**",np.shape(model_wv[word_t]),the_w[m][n]
        the_wv[m]+=[x_word*the_w[m][n] for x_word in model_wv[word_t.encode("utf-8")]]
        n+=1
    m+=1
doc_word=np.zeros([len(x_train_1),size_word])
print "映射訓練文檔到word2vec"
m=0
for each_doc in x_train_1:
    for each_word in each_doc:
        #print each_word
        doc_word[m]+=model_wv[each_word]
    doc_word[m]=doc_word[m]/len(each_doc)#step 4: average over the number of words in the document
    m+=1
print "computing distances between training documents and topics"
def destince(a,b):#Euclidean distance between a and b
    dt=0
    for each_dt in range(len(a)):
        dt+=(a[each_dt]-b[each_dt])*(a[each_dt]-b[each_dt])
    return np.sqrt(dt)
doc_t=np.zeros([len(doc_word),size_lda])
m=0
for each_d in doc_word:
    n=0
    for each_t in the_wv:
        doc_t[m][n]=destince(each_d,each_t)
        n+=1
    m+=1
doc_word_test=np.zeros([len(x_test_1),size_word])
print "映射測試文檔到word2vec"
m=0
for each_doc in x_test_1:
    for each_word in each_doc:
        #print each_word
        doc_word_test[m]+=model_wv[each_word]
    doc_word_test[m]=doc_word_test[m]/len(each_doc)#step 4 again: average over the number of words in the document
    m+=1
print "computing distances between test documents and topics"
doc_t_test=np.zeros([len(doc_word_test),size_lda])
m=0
for each_d in doc_word_test:
    n=0
    for each_t in the_wv:
        doc_t_test[m][n]=destince(each_d,each_t)
        n+=1
    m+=1

KN =KNeighborsClassifier(n_neighbors=10, algorithm='brute', metric='cosine')#step 6: KNN over the document-topic distance matrix
KN.fit(doc_t,y_train_1)
test_labels_KN=KN.predict(doc_t_test)
print "accuracy",metrics.accuracy_score(test_labels_KN,y_test_1)
print "confusion matrix",confusion_matrix(test_labels_KN,y_test_1)

The script prints quite a lot along the way, so I'll just paste the final part:

Prediction results
training set shape (5932L, 100L)
accuracy 0.433962264151
confusion matrix [[428  83  63 389]
 [ 11   0   2   9]
 [ 14   0   2  25]
 [193  25  26 214]]
2017-11-20  16:15:07,Mon doc2vec.py INFO precomputing L2-norms of doc weight vectors
similarity-based accuracy 0.927223719677

I haven't tuned any parameters yet; my laptop isn't great and I didn't feel like waiting. Given the chance, run it a few more times with a larger corpus and the results should definitely be better.

You can also use doc2vec here instead of word2vec, which works even better. The doc2vec model used below is just called model; the source code for training it is in my previous post, just run it as-is, so I'm not adding it here (a rough sketch follows).
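
For reference, here is a minimal sketch of how such a doc2vec model could be trained with gensim. This is an assumption about the setup (vector size and parameters), not the exact code from that post; doc is the list of tokenized documents built at the top of the first script, and size_doc2 is the doc2vec vector dimension used below.

from gensim.models.doc2vec import TaggedDocument, Doc2Vec

size_doc2 = 100  # assumed doc2vec vector dimension, pick whatever you used
# tag each training document with its index so model.docvecs.most_similar can return document indices
tagged = [TaggedDocument(words=words, tags=[i]) for i, words in enumerate(doc)]
model = Doc2Vec(tagged, size=size_doc2, min_count=1, iter=10, workers=2)

With such a model, the model[word], model.infer_vector(...) and model.docvecs.most_similar(...) calls in the snippet below behave as expected under the old gensim API this post targets (newer gensim versions rename size and iter to vector_size and epochs).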

The code is as follows:

#doc2vec + lda (model is the trained doc2vec model, size_doc2 its vector dimension)
print "mapping topics to coordinates"
m=0
the_wv=np.zeros([size_lda,size_doc2])#each topic mapped into doc2vec space: num_topics x vector dimension
#weighted sum of the doc2vec coordinates of each topic's top words
for it in the_id:
    n=0
    for it_id in it:
        word_t=dictionary[it_id]
        #print word_t+"**",np.shape(model[word_t.encode("utf-8")]),the_w[m][n]
        the_wv[m]+=[x_word*the_w[m][n] for x_word in model[word_t.encode("utf-8")]]
        n+=1
    m+=1
doc_word=np.zeros([len(x_train_1),size_doc2])#doc2vec coordinates of the training documents
print "mapping training documents into doc2vec space"
m=0
for each_doc in x_train_1:
    for each_word in each_doc:
        #print each_word
        doc_word[m]+=model[each_word]
    doc_word[m]=doc_word[m]/len(each_doc)#step 4: average over the number of words in the document
    m+=1
print "computing distances between training documents and topics"
def destince(a,b):#Euclidean distance between a and b (same helper as above)
    dt=0
    for each_dt in range(len(a)):
        dt+=(a[each_dt]-b[each_dt])*(a[each_dt]-b[each_dt])
    return np.sqrt(dt)
doc_t=np.zeros([len(doc_word),size_lda])
m=0
for each_d in doc_word:
    n=0
    for each_t in the_wv:
        doc_t[m][n]=destince(each_d,each_t)
        n+=1
    m+=1
doc_word_test=np.zeros([len(x_test_1),size_doc2])#doc2vec coordinates of the test documents
print "mapping test documents into doc2vec space"
m=0
for each_doc in x_test_1:
    for each_word in each_doc:
        #print each_word
        doc_word_test[m]+=model[each_word]
    doc_word_test[m]=doc_word_test[m]/len(each_doc)#average over the number of words in the document
    m+=1
print "computing distances between test documents and topics"
doc_t_test=np.zeros([len(doc_word_test),size_lda])
m=0
for each_d in doc_word_test:
    n=0
    for each_t in the_wv:
        doc_t_test[m][n]=destince(each_d,each_t)
        n+=1
    m+=1

KN =KNeighborsClassifier(n_neighbors=10, algorithm='brute', metric='cosine')
KN.fit(doc_t,y_train_1)
test_labels_KN=KN.predict(doc_t_test)
print "accuracy",metrics.accuracy_score(test_labels_KN,y_test_1)
print "confusion matrix",confusion_matrix(test_labels_KN,y_test_1)
pret=0.0
for doc_num in range(len(x_test_1)):
    inferred_vector = model.infer_vector(x_test_1[doc_num])
    sims = model.docvecs.most_similar([inferred_vector], topn=3)#list of (document index, similarity) tuples
    pre=[to[0] for to in sims ]
    sims_doc=[label_c[ind] for ind in pre]
    label_=dict([(sims_doc.count(i),i) for i in sims_doc])#majority vote over the 3 most similar training documents
    if y_test_1[doc_num]==label_[max(label_.keys())]:
        pret+=1
print "similarity-based accuracy",pret/len(x_test_1)
