Preface
I took part in the Beike Zhaofang (貝殼找房) real-estate question–answer matching competition (link: https://www.datafountain.cn/competitions/474), and used the MatchZoo library to tackle the Q&A matching problem for the real-estate domain.
Competition Workflow
Import third-party packages
import matchzoo as mz
import pandas as pd
import numpy as np
import tensorflow.keras as K
from matchzoo.preprocessors import BasicPreprocessor
from sklearn.model_selection import train_test_split
import datetime
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import tensorflow as tf
from keras.layers import *
from keras.models import Model
import keras.backend as K  # note: this re-binds K, which was aliased to tensorflow.keras above
from keras.optimizers import Adam
from random import choice
# from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import re, os
import codecs
from keras.callbacks import Callback
Data Preprocessing
1. Load the datasets
# Read and prepare the raw data
train_left = pd.read_csv('./train/train.query.tsv',sep='\t',header=None)
train_left.columns=['id','q1']
train_right = pd.read_csv('./train/train.reply.tsv',sep='\t',header=None)
train_right.columns=['id','id_sub','q2','label']
df_train = train_left.merge(train_right, how='left')
df_train['q2'] = df_train['q2'].fillna('好的')  # fill missing replies with a placeholder text
test_left = pd.read_csv('./test/test.query.tsv',sep='\t',header=None, encoding='gbk')
test_left.columns = ['id','q1']
test_right = pd.read_csv('./test/test.reply.tsv',sep='\t',header=None, encoding='gbk')
test_right.columns=['id','id_sub','q2']
df_test = test_left.merge(test_right, how='left')
2. Inspect the training and test sets
Training set

Test set
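A quick way to check both frames and the label balance before converting them (everything below uses the df_train / df_test built above):
# Basic sanity checks on the merged frames
print(df_train.shape, df_test.shape)
print(df_train.head())
print(df_test.head())
print(df_train['label'].value_counts())   # distribution of the 0/1 labels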
3. Convert the data into MatchZoo's column layout
# Build the combined training/validation frame in MatchZoo's column layout
sent1_=df_train.q1.values
sent2_=df_train.q2.values
label_=df_train.label.values
all_data=pd.DataFrame()
all_data['id_left']=range(len(df_train))
all_data['text_left']=sent1_
all_data['id_right']=range(len(df_train))
all_data['text_right']=sent2_
all_data['label']=label_
# Build the test frame
_sent1=df_test.q1.values
_sent2=df_test.q2.values
# _label=label[2501:]
tmp_data=pd.DataFrame()
tmp_data['id_left']=range(len(df_test))
tmp_data['text_left']=_sent1
tmp_data['id_right']=range(len(df_test))
tmp_data['text_right']=_sent2
# test_data['label']=_label
4. Pack the data into the DataPack format MatchZoo expects and build the data pipeline
def load_data(df_data):
    # df_data = pd.read_csv(data_path, sep='\t', header=None)
    # df_data = pd.DataFrame(df_data.values, columns=['id_left', 'text_left', 'id_right', 'text_right', 'label'])
    df_data = mz.pack(df_data)
    return df_data
train_data = load_data(all_data)
test_data=load_data(tmp_data)
# Use the most basic preprocessor, BasicPreprocessor
# Truncate/pad every text on both sides to a fixed length of 15 tokens
preprocessor = BasicPreprocessor(15, 15)
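# (Optional, illustrative only: the 15-token cut-off can be eyeballed from the
#  raw character-length distribution of the queries and replies)
print(df_train['q1'].str.len().describe())
print(df_train['q2'].str.len().describe())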
train_split = int(len(train_data) * 0.8)  # the split ratio is not given in the original snippet; 80/20 is an assumption
train = train_data[:train_split]
dev = train_data[train_split:]
train_pack_processed = preprocessor.fit_transform(train)
# Under the hood this is just a character-to-id mapping, so Chinese text needs no extra word segmentation
dev_pack_processed = preprocessor.transform(dev)
test_pack_processed = preprocessor.transform(test_data)
train_data_generator = mz.DataGenerator(train_pack_processed,
                                        batch_size=32,
                                        shuffle=True)  # training data generator
test_x, test_y = test_pack_processed.unpack()
dev_x, dev_y = dev_pack_processed.unpack()
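Two quick notes on the pipeline above. The dictionary preprocessor.context holds the fitted vocabulary size and input shapes, and it is exactly what model.params.update(preprocessor.context) injects into every model below (overriding any manually set embedding_input_dim). Also, although train_data_generator is built here, the model sections below fit on the unpacked arrays; training from the generator is also possible with the Keras backend. A minimal sketch of both points, assuming the MatchZoo 2.x API:
# Inspect the fitted context: vocabulary size, embedding_input_dim, input_shapes, ...
print(preprocessor.context)
# Optional alternative to model.fit(x, y, ...): stream batches from the generator
# (works for any of the models built in the sections below)
# model.fit_generator(train_data_generator, epochs=10)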
5. Helper source for creating output directories
def mkdir(path):
    # Strip leading/trailing whitespace
    path = path.strip()
    # Strip a trailing backslash
    path = path.rstrip("\\")
    # Only create the directory if it does not exist yet
    if not os.path.exists(path):
        # Create the directory (and any missing parents)
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        # The directory already exists, nothing to do
        print(path + ' already exists')
        return False
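On Python 3 the same effect can be had with a single standard-library call; a minimal alternative to the helper above:
# Create the directory tree only if it does not already exist
os.makedirs(os.path.join('outputs', 'result', 'DenseBaseline'), exist_ok=True)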
Model Training
DenseBaseline
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
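# (Sketch only, not used in the runs below: the same 0/1 labels could instead be
#  modelled as a two-class task, assuming the MatchZoo 2.x API:
#      task = mz.tasks.Classification(num_classes=2)
#      task.metrics = ['acc']
#  the labels would then also need to be one-hot encoded before unpacking.)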
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model = mz.models.DenseBaseline()
model.params['task'] = task
model.params['mlp_num_units'] = 3
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result','DenseBaseline')
logdir1 = os.path.join('outputs', 'model', 'DenseBaseline')
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+'DenseBaseline'+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+'DenseBaseline'+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
Run results of DenseBaseline (the results for the later models are omitted for brevity)
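The pred column written above contains raw matching scores rather than 0/1 labels. To turn a run into a submission, one would merge the scores back onto df_test and apply a cut-off; a minimal sketch, where the 0.5 threshold and the tab-separated (id, id_sub, label) layout are assumptions to be checked against the competition's required format:
# Build a submission frame from the DenseBaseline predictions (threshold is an assumption)
sub = df_test[['id', 'id_sub']].copy()
sub['label'] = (np.array(pred).ravel() > 0.5).astype(int)
sub.to_csv('./outputs/result/DenseBaseline/' + stamp + '-submission.tsv',
           sep='\t', index=False, header=False)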
DRMMTKS
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model =mz.models.DRMMTKS()
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 100
model.params['top_k'] = 20
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 5
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
model.guess_and_fill_missing_params(verbose=0)
# model.build()
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result','DRMMTKS')
logdir1 = os.path.join('outputs', 'model', 'DRMMTKS')
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+'DRMMTKS'+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+'DRMMTKS'+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
KNRM
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model =mz.models.KNRM()
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 10
model.params['embedding_trainable'] = True
model.params['kernel_num'] = 11
model.params['sigma'] = 0.1
model.params['exact_sigma'] = 0.001
model.guess_and_fill_missing_params(verbose=0)
# model.build()
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result','KNRM')
logdir1 = os.path.join('outputs', 'model', 'KNRM')
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+'KNRM'+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+'KNRM'+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
MVLSTM
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model =mz.models.MVLSTM()
model.params['lstm_units'] = 32
model.params['top_k'] = 50
model.params['mlp_num_layers'] = 2
model.params['mlp_num_units'] = 20
model.params['mlp_num_fan_out'] = 10
model.params['mlp_activation_func'] = 'relu'
model.params['dropout_rate'] = 0.5
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='MVLSTM'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
HBMP
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model = mz.contrib.models.HBMP()
model.guess_and_fill_missing_params(verbose=0)
model.params['embedding_input_dim'] = 200
model.params['embedding_output_dim'] = 100
model.params['embedding_trainable'] = True
model.params['alpha'] = 0.1
model.params['mlp_num_layers'] = 3
model.params['mlp_num_units'] = [10, 10]
model.params['lstm_num_units'] = 5
model.params['dropout_rate'] = 0.1
# model.build()
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='HBMP'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
ArcI
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model =mz.models.ArcI()
model.params['num_blocks'] = 1
model.params['left_filters'] = [32]
model.params['right_filters'] = [32]
model.params['left_kernel_sizes'] = [3]
model.params['right_kernel_sizes'] = [3]
model.params['left_pool_sizes'] = [2]
model.params['right_pool_sizes'] = [4]
model.params['conv_activation_func'] = 'relu'
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 64
model.params['mlp_num_fan_out'] = 32
model.params['mlp_activation_func'] = 'relu'
model.params['dropout_rate'] = 0.5
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='ArcI'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
ConvKNRM
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model = mz.models.ConvKNRM()
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 300
model.params['embedding_trainable'] = True
model.params['filters'] = 128
model.params['conv_activation_func'] = 'tanh'
model.params['max_ngram'] = 3
model.params['use_crossmatch'] = True
model.params['kernel_num'] = 11
model.params['sigma'] = 0.1
model.params['exact_sigma'] = 0.001
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='ConvKNRM'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
DUET
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model = mz.models.DUET()
model.params['embedding_input_dim'] = 1000
model.params['embedding_output_dim'] = 300
model.params['lm_filters'] = 32
model.params['lm_hidden_sizes'] = [64, 32]
model.params['dropout_rate'] = 0.5
model.params['dm_filters'] = 32
model.params['dm_kernel_size'] = 3
model.params['dm_d_mpool'] = 4
model.params['dm_hidden_sizes'] = [64, 32]
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='DUET'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
ESIM
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model = mz.contrib.models.ESIM()
model.params['task'] = task
model.params['input_shapes'] = [(15, ), (15, )]
model.params['lstm_dim'] = 300
model.params['mlp_num_units'] = 300
model.params['embedding_input_dim'] = 5000
model.params['embedding_output_dim'] = 10
model.params['embedding_trainable'] = False
model.params['mlp_num_layers'] = 0
model.params['mlp_num_fan_out'] = 300
model.params['mlp_activation_func'] = 'tanh'
model.params['mask_value'] = 0
model.params['dropout_rate'] = 0.5
model.params['optimizer'] = Adam(lr=4e-4)  # use the keras.optimizers.Adam imported above (K was re-bound to keras.backend)
model.guess_and_fill_missing_params()
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='ESIM'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
MatchLSTM
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model =mz.contrib.models.MatchLSTM()
model.guess_and_fill_missing_params(verbose=0)
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 100
model.params['embedding_trainable'] = True
model.params['fc_num_units'] = 200
model.params['lstm_num_units'] = 256
model.params['dropout_rate'] = 0.5
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='MatchLSTM'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
MatchSRNN
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model =mz.contrib.models.MatchSRNN()
model.params['channels'] = 4
model.params['units'] = 10
model.params['dropout_rate'] = 0.0
model.params['direction'] = 'lt'
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='MatchSRNN'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
BiMPM
### Define the task: MatchZoo offers two kinds, Ranking and Classification
task = mz.tasks.Ranking()
print('='*20)
print(task)
### Build the model and adjust its parameters (use mz.models.list_available() to list the available models)
model = mz.contrib.models.BiMPM()
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()
### Train, evaluate, predict
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))
output_csv=pd.DataFrame()
pred=model.predict(test_x)
output_csv["pred"]=list(pred)
modeltype='BiMPM'
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result',modeltype)
logdir1 = os.path.join('outputs', 'model',modeltype)
mkdir(logdir)
output_csv.to_csv('./outputs/result/'+modeltype+'/'+stamp+'-pred.csv')
mkdir(logdir1)
### Save the model
model.save('./outputs/model/'+modeltype+'/'+stamp)
print('Model saved')
# Loading a saved model
# loaded_model = mz.load_model('./outputs/model/DenseBaseline-model.h5')
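To reuse any of the saved models later, MatchZoo can reload the timestamped directory written by model.save; a minimal sketch, assuming the same MatchZoo version and the BiMPM run above:
# Reload the saved model directory and re-run evaluation/prediction
loaded_model = mz.load_model('./outputs/model/' + modeltype + '/' + stamp)
print(loaded_model.evaluate(dev_x, dev_y))
pred = loaded_model.predict(test_x)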