
Text classification models built from word2vec and common CNN/RNN network structures

I am brushing up on deep learning for my graduation project; this post is a record of my attempt at combining word2vec with several deep learning models.

  • Dataset source
  • Dataset preprocessing
  • Building the word2vec model
  • Building and training the networks

Dataset source

The dataset comes from Kaggle's getting-started NLP competition on disaster tweets: given a tweet, predict whether it reports a real disaster (target = 1) or not (target = 0).

Dataset preprocessing

Loading the data:

import numpy as np
import pandas as pd
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
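
A quick sanity check of what was loaded (the column names below are those of the Kaggle disaster-tweets CSVs):

print(train_df.shape, test_df.shape)
# train.csv has id, keyword, location, text and target columns;
# test.csv has the same columns minus target
print(train_df[["text", "target"]].head())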
           

Preprocessing:

import re
import string
# NLTK stopwords (not actually used in this pipeline)
from nltk.corpus import stopwords

# lowercase
def text_to_lowercase(text):
    return text.lower()

# remove punctuation
def text_remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# remove URLs
def text_remove_url(text):
    return re.sub(r"http\S+", "", text)

# remove Twitter handles (@username)
def text_remove_twitter_handle(text):
    return re.sub(r'@[^\s]+', '', text)

# strip leading/trailing whitespace (str.strip removes spaces and newlines at both ends)
def text_remove_leadtrail_spaces(text):
    return text.strip()

def clean_text(text):
    # order matters: remove handles and URLs before stripping punctuation
    text1 = text_remove_twitter_handle(text)
    text2 = text_remove_url(text1)
    text3 = text_remove_punctuation(text2)
    text4 = text_to_lowercase(text3)
    text5 = text_remove_leadtrail_spaces(text4)
    return text5

# clean both the training and the test texts
train_df['text_processed'] = [clean_text(i) for i in train_df["text"]]
test_df['text_processed'] = [clean_text(i) for i in test_df["text"]]
feature=train_df['text_processed']
target=train_df['target']
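
For example, running clean_text on a raw tweet shows why the order matters: the handle and URL must be removed before punctuation is stripped, otherwise their leftover fragments would survive:

sample = "@user Forest fire near La Ronge Sask. Canada http://t.co/abc123"
print(clean_text(sample))
# -> forest fire near la ronge sask canada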
           

Building the word2vec model

from gensim.models import Word2Vec
# gensim expects tokenized sentences, not raw strings (a string would be iterated character by character)
sentences=[line.split() for line in feature]
# vector size 500, 8 iterations, skip-gram (sg=1) with negative sampling (hs=0),
# window 6, minimum word frequency 7 (size/iter are the gensim 3.x argument names)
w2v_model=Word2Vec(sentences, size=500, sg=1, hs=0, window=6, iter=8, min_count=7)
# saved in binary word2vec format; the .pkl extension is just a file name
w2v_model.wv.save_word2vec_format("./word2Vec" + ".pkl", binary=True)
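
Because the file is in the binary word2vec format, it can be loaded back with gensim's KeyedVectors; a minimal sketch, assuming the word "fire" survived the min_count=7 cutoff:

from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format("./word2Vec.pkl", binary=True)
# nearest neighbours in the learned vector space
print(wv.most_similar("fire", topn=5))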
           

Importing the Keras packages

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
# model containers
from keras.models import Sequential, Model
# layers
from keras.layers import Dense, Embedding, Activation, Input, Lambda, Reshape
from keras.layers import Conv1D, Convolution1D, MaxPool1D, MaxPooling1D, GlobalAveragePooling1D
from keras.layers import Flatten, Dropout, BatchNormalization
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.layers.merge import concatenate
           

Converting text to numbers and splitting the dataset

# number of target classes
NUM_CLASS=2
# padded sequence length fed to the models
INPUT_SIZE=64
# the Tokenizer vectorizes text, i.e. maps each text to a sequence of word indices
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
tokenizer.fit_on_texts(feature)
vocab = tokenizer.word_index
x_ids=tokenizer.texts_to_sequences(feature)
pad_s=pad_sequences(x_ids, maxlen=INPUT_SIZE)
# one-hot encode the labels
target_u=to_categorical(target,NUM_CLASS)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(pad_s,target_u,random_state=22,test_size=0.2)
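
A toy example makes the Tokenizer/pad_sequences pipeline concrete (illustrative values only; the actual indices depend on word frequencies):

demo = Tokenizer()
demo.fit_on_texts(["forest fire near canada", "fire alarm test"])
seq = demo.texts_to_sequences(["fire test"])
print(seq)                           # e.g. [[1, 6]]
print(pad_sequences(seq, maxlen=5))  # zero-padded on the left: [[0 0 0 1 6]]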
           

Building and training the networks

Loading word2vec into the Embedding layer

# build the embedding matrix: row i holds the 500-dim word2vec vector
# for the word with Tokenizer index i (index 0 is reserved for padding)
embeding_matrix=np.zeros((len(vocab)+1,500))
for word,i in vocab.items():
    try:
        embeding_matrix[i]=w2v_model.wv[word]
    except KeyError:
        # words dropped by min_count have no vector and stay all-zero
        continue
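
Words that appeared fewer than 7 times were dropped by word2vec, so their rows stay all-zero; a quick coverage check (a sketch using the variables above):

covered = sum(1 for w in vocab if w in w2v_model.wv)
print("%d of %d vocabulary words have word2vec vectors" % (covered, len(vocab)))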
           

TextCNN with word2vec

Three convolution branches with kernel sizes 3, 4 and 5 slide over the embedded sequence in parallel; with padding='same' each branch keeps length 64, and the large pool sizes (38/37/36) collapse each branch to length 1 before concatenation.

from keras.layers import Flatten,Dropout
main_input=Input(shape=(INPUT_SIZE,),dtype='float64')
# the Embedding layer is initialised with the word2vec matrix and left trainable
embedder=Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True)
embed=embedder(main_input)
# three parallel convolution + pooling branches
cnn1=Conv1D(256,3,padding='same',strides=1,activation='relu')(embed)
cnn1=MaxPooling1D(pool_size=38)(cnn1)
cnn2=Conv1D(256,4,padding='same',strides=1,activation='relu')(embed)
cnn2=MaxPooling1D(pool_size=37)(cnn2)
cnn3=Conv1D(256,5,padding='same',strides=1,activation='relu')(embed)
cnn3=MaxPooling1D(pool_size=36)(cnn3)
cnn=concatenate([cnn1,cnn2,cnn3],axis=-1)
flat=Flatten()(cnn)
drop=Dropout(0.2)(flat)
main_output=Dense(NUM_CLASS,activation='softmax')(drop)
model=Model(inputs=main_input,outputs=main_output)
model.summary()
           

Model structure: the model.summary() output (screenshot omitted) shows the three parallel branches feeding the concatenate layer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
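
The cleaned test set from the preprocessing step is never used above; a hedged sketch of producing a Kaggle submission with the trained model (assuming the standard id/target submission format):

x_sub = tokenizer.texts_to_sequences(test_df["text_processed"])
x_sub = pad_sequences(x_sub, maxlen=INPUT_SIZE)
pred = model.predict(x_sub)  # shape (n_samples, NUM_CLASS)
sub = pd.DataFrame({"id": test_df["id"], "target": pred.argmax(axis=1)})
sub.to_csv("submission.csv", index=False)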
           

Training results: the per-epoch loss/accuracy screenshots are omitted.

Other models

CNN with word2vec

model = Sequential()
# the Embedding layer maps each word index to its 500-dim word2vec vector
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Conv1D(256, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(128, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(64, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())  # batch normalization
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
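
model.fit returns a History object, which is handy for comparing these models; a minimal plotting sketch (assumes matplotlib is installed; the metric key may be 'accuracy' instead of 'acc' depending on the Keras version):

import matplotlib.pyplot as plt
history = model.fit(X_train, y_train, batch_size=32, epochs=10,
                    validation_data=(X_test, y_test))
plt.plot(history.history["acc"], label="train")
plt.plot(history.history["val_acc"], label="validation")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()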
           

RNN (LSTM) with word2vec

model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
           

Bi-GRU with word2vec

# architecture: embedding -> two bidirectional GRU layers -> dense softmax
model = Sequential()
# input_length is the padded sequence length, INPUT_SIZE (64)
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
           

CNN+RNN in series with word2vec

# architecture: embedding -> convolution + pooling -> two GRU layers -> dense softmax
model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Convolution1D(256, 3, padding='same', strides = 1))
model.add(Activation('relu'))
model.add(MaxPool1D(pool_size=2))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences = True))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
           

CNN+RNN in parallel with word2vec

# architecture: two branches over the shared embedding run in parallel,
# (conv + pooling -> dense) and (Bi-GRU -> dense), then concatenate -> dense softmax
main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embed = Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True)(main_input)
cnn = Convolution1D(256, 3, padding='same', strides = 1, activation='relu')(embed)
cnn = MaxPool1D(pool_size=4)(cnn)
cnn = Flatten()(cnn)
cnn = Dense(256)(cnn)
rnn = Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1))(embed)
rnn = Dense(256)(rnn)
con = concatenate([cnn,rnn], axis=-1)
main_output = Dense(NUM_CLASS, activation='softmax')(con)
model = Model(inputs = main_input, outputs = main_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
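
With only a few thousand training tweets, 10 fixed epochs can overfit; a hedged sketch of adding early stopping to any of the fits above (restore_best_weights needs a reasonably recent Keras):

from keras.callbacks import EarlyStopping
early = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test),
          callbacks=[early])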
           
