TF之LSTM：基於TensorFlow框架，採用PTB資料集建立LSTM網路進行自然語言建模

關于PTB資料集

PTB（Penn Treebank Dataset）文本資料集是語言模型學習中目前最被廣泛使用的資料集。

ptb.test.txt #測試集資料檔案

ptb.train.txt #訓練集資料檔案

ptb.valid.txt #驗證集資料檔案

這三個資料檔案中的資料已經經過了預處理，包含了10000個不同的詞語和語句結束標記符（在文本中就是換行符）以及標記稀有詞語的特殊符號。

為了讓使用PTB資料集更加方便，TensorFlow提供了兩個函數來幫助實現資料的預處理。首先，TensorFlow提供了ptb_raw_data函數來讀取PTB的原始資料，並將原始資料中的單詞轉化為單詞ID。其次，ptb_producer函數將這些單詞ID切分為形狀為[batch_size, num_steps]的批次，並同時給出向右平移一個位置的目標序列。

訓練資料中總共包含了929589個單詞，而這些單詞被組成了一個非常長的序列。這個序列通過特殊的標識符給出了每句話結束的位置。在這個資料集中，句子結束的標識符ID為2。

資料集的下載位址：TF的PTB資料集（若使用其他不相符的資料集，程式會出現錯誤）

代碼實作

本代碼使用2層LSTM網路，且每層有200個隱藏單元。在訓練中截斷的輸入序列長度為32，且使用Dropout和梯度裁剪等方法控制模型的過擬合與梯度爆炸等問題。當簡單地訓練3個Epoch後，測試困惑度（Perplexity）降低到了210，如果多輪訓練會更低。

# -*- coding: utf-8 -*-

from __future__ import absolute_import

from __future__ import division

from __future__ import print_function

import collections

import os

import sys

import tensorflow as tf

# True when the interpreter is Python 3; Python 2 needs an explicit decode below.
Py3 = sys.version_info[0] == 3


def _read_words(filename):
    """Return the whitespace-separated tokens of a text file.

    Every newline in the file is replaced by the literal string "<eos>"
    before the text is split on whitespace.

    Args:
      filename: path of the text file, opened through tf.gfile.GFile.

    Returns:
      A list of string tokens in file order.
    """
    with tf.gfile.GFile(filename, "r") as handle:
        text = handle.read()
    if not Py3:
        # Under Python 2 the content is decoded from UTF-8 before processing.
        text = text.decode("utf-8")
    return text.replace("\n", "<eos>").split()

def _build_vocab(filename):
    """Build a token -> integer id mapping from the tokens of a file.

    Tokens are ranked by descending frequency, with ties broken by the
    token's own sort order, and ids are assigned 0, 1, 2, ... in that rank.

    Args:
      filename: path of the text file whose tokens define the vocabulary.

    Returns:
      A dict mapping each distinct token to its id.
    """
    frequencies = collections.Counter(_read_words(filename))
    # Negating the count sorts most-frequent first while keeping ties ordered
    # by the token itself.
    ranked = sorted(frequencies.items(), key=lambda pair: (-pair[1], pair[0]))
    tokens, _ = list(zip(*ranked))
    return {token: index for index, token in enumerate(tokens)}

def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of a file into a list of integer ids.

    Args:
      filename: path of the text file to convert.
      word_to_id: dict mapping tokens to integer ids.

    Returns:
      A list of ids in file order. Tokens that are not keys of `word_to_id`
      are skipped rather than raising KeyError.
    """
    # The token list must be read here; without this line `data` is undefined
    # and every call fails with NameError.
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

def ptb_raw_data(data_path=None):
    """Load the PTB text files found in `data_path` as lists of word ids.

    The vocabulary is built from "ptb.train.txt" only; the training,
    validation and test files are then each converted to ids with that
    same vocabulary.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
      data_path: string path to the directory containing ptb.train.txt,
        ptb.valid.txt and ptb.test.txt. It is passed directly to
        os.path.join, so the None default is not a usable value.

    Returns:
      tuple (train_data, valid_data, test_data, vocabulary) where the first
      three are lists of integer word ids and `vocabulary` is the number of
      distinct tokens in the training file.
    """
    train_path, valid_path, test_path = [
        os.path.join(data_path, file_name)
        for file_name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt")
    ]

    word_to_id = _build_vocab(train_path)

    train_data, valid_data, test_data = [
        _file_to_word_ids(split_path, word_to_id)
        for split_path in (train_path, valid_path, test_path)
    ]
    return train_data, valid_data, test_data, len(word_to_id)

def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Iterate on the raw PTB data.

    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.

    Args:
      raw_data: one of the raw data outputs from ptb_raw_data.
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.
      name: the name of this operation (optional).

    Returns:
      A pair of Tensors, each shaped [batch_size, num_steps]. The second element
      of the tuple is the same data time-shifted to the right by one.

    Raises:
      tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        # Drop the tail that does not fill a whole row, then lay the sequence
        # out as batch_size parallel rows of batch_len ids each.
        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        data = tf.reshape(raw_data[0 : batch_size * batch_len],
                          [batch_size, batch_len])

        # The "- 1" leaves room for the targets, which are shifted by one.
        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(
            epoch_size,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # Queue yielding the window index 0 .. epoch_size - 1 in order.
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()

        # Inputs: columns [i * num_steps, (i + 1) * num_steps).
        x = tf.strided_slice(data, [0, i * num_steps],
                             [batch_size, (i + 1) * num_steps])
        x.set_shape([batch_size, num_steps])

        # Targets: the same window moved one position to the right.
        y = tf.strided_slice(data, [0, i * num_steps + 1],
                             [batch_size, (i + 1) * num_steps + 1])
        y.set_shape([batch_size, num_steps])

        return x, y

from reader import *

import numpy as np

# Directory holding ptb.train.txt / ptb.valid.txt / ptb.test.txt; this is a
# machine-specific path and has to be adjusted before running.
data_path = 'F:/File_Python/Python_daydayup/data/simple-examples/data'

# Number of hidden units per LSTM layer, and number of stacked LSTM layers.
hidden_size = 200
num_layers = 2

# Vocabulary size.
vocab_size = 10000

learning_rate = 1.0
train_batch_size = 16

# Truncation length (number of unrolled time steps) for training data.
train_num_step = 32

# No truncation is needed at evaluation time; the evaluation data is treated
# as one very long sequence, fed one word at a time.
eval_batch_size = 1
eval_num_step = 1

num_epoch = 3

# Probability that a node is kept (i.e. NOT dropped) by Dropout.
keep_prob = 0.5

# Global-norm threshold used when clipping gradients to limit gradient explosion.
max_grad_norm = 5

# The language model is described by the PTBModel class.
class PTBModel(object):
    """Multi-layer LSTM language model built as a TensorFlow 1.x graph.

    The constructor reads the module-level settings hidden_size, num_layers,
    vocab_size, keep_prob, max_grad_norm and learning_rate.

    Attributes:
      batch_size: batch size the graph was built for.
      num_steps: number of unrolled time steps (truncation length).
      input_data: int32 placeholder of shape [batch_size, num_steps].
      targets: int32 placeholder of shape [batch_size, num_steps].
      initial_state: zero state of the stacked LSTM; may be fed to carry
        state across batches.
      final_state: LSTM state after the last unrolled step.
      cost: summed cross-entropy of the batch divided by batch_size.
      train_op: gradient-descent update op; only defined when is_training
        is true.
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the graph.

        Args:
          is_training: when true, Dropout is applied and train_op is created.
          batch_size: number of sequences per batch.
          num_steps: number of time steps the LSTM is unrolled for.
        """
        # Remember the batch size and truncation length in use.
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Input layer: batch size x truncation length.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        # Expected output, same shape as the input.
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def make_cell():
            # One LSTM cell, wrapped with output Dropout only during training.
            layer_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            if is_training:
                layer_cell = tf.nn.rnn_cell.DropoutWrapper(
                    layer_cell, output_keep_prob=keep_prob)
            return layer_cell

        # Each layer needs its own cell object: repeating one instance with
        # `[cell] * num_layers` makes all layers refer to the same cell.
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])

        # The initial state is all zeros.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Word-id -> word-vector table of shape vocab_size x hidden_size.
        embedding = tf.get_variable('embedding', [vocab_size, hidden_size])

        # Look up the batch of ids; the result has shape
        # batch size x truncation length x hidden size.
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # Dropout on the inputs is applied during training only.
        if is_training:
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Collect the LSTM output of every time step; they are passed through
        # a fully connected layer afterwards.
        outputs = []
        # `state` carries the LSTM state from step to step, starting at zero.
        state = self.initial_state
        with tf.variable_scope('RNN'):
            for time_step in range(num_steps):
                # All time steps share the variables created at step 0.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                # Feed this step's input and the previous state to the LSTM.
                cell_output, state = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Concatenate to [batch, hidden * num_steps], then reshape to
        # [batch * num_steps, hidden].
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Fully connected layer producing, for every position, a vector of
        # length vocab_size; after softmax it is the distribution of the next
        # word.
        weight = tf.get_variable('weight', [hidden_size, vocab_size])
        bias = tf.get_variable('bias', [vocab_size])
        logits = tf.matmul(output, weight) + bias

        # Cross-entropy loss for every position of every sequence.
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],  # predictions
            # expected results, [batch_size, num_steps] flattened to 1-D
            [tf.reshape(self.targets, [-1])],
            # loss weights; all ones, so every batch entry and time step
            # counts equally
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

        # Average loss per sequence in the batch.
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # Back-propagation ops are only defined for the training model.
        if not is_training:
            return

        trainable_variable = tf.trainable_variables()

        # Clip gradients by global norm to control gradient explosion.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable_variable), max_grad_norm)

        # To use Adam instead, switch to tf.train.AdamOptimizer(learning_rate)
        # and lower the learning rate to about 0.001.
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)

        # Training step.
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variable))

def run_epoch(session, model, data, train_op, output_log, epoch_size):
    """Run `epoch_size` batches through `model` and return the perplexity.

    Args:
      session: object whose run(fetches, feed_dict) evaluates the graph.
      model: provides input_data, targets, initial_state, final_state, cost
        and num_steps.
      data: fetch that yields an (inputs, targets) pair per session.run call.
      train_op: op evaluated together with the cost on every batch.
      output_log: when true, the running perplexity is printed every 100 steps.
      epoch_size: number of batches to process.

    Returns:
      exp(accumulated cost / accumulated number of time steps).
    """
    cost_sum = 0.0
    step_count = 0
    state = session.run(model.initial_state)
    fetches = [model.cost, model.final_state, train_op]

    for step in range(epoch_size):
        batch_inputs, batch_targets = session.run(data)

        # Feed the previous batch's final state back in as the initial state,
        # so the LSTM state is carried across consecutive batches.
        feed = {
            model.input_data: batch_inputs,
            model.targets: batch_targets,
            model.initial_state: state,
        }
        batch_cost, state, _ = session.run(fetches, feed)

        # The accumulated cost divided by the accumulated steps is the log of
        # the perplexity; exponentiating gives the perplexity itself.
        cost_sum += batch_cost
        step_count += model.num_steps

        if output_log and step % 100 == 0:
            print("After %d steps, perplexity is %.3f" % (step, np.exp(cost_sum / step_count)))

    return np.exp(cost_sum / step_count)

def main():
    """Train the PTB language model and report validation/test perplexity.

    Loads the data from the module-level `data_path`, builds a training model
    and an evaluation model that share variables, trains for `num_epoch`
    epochs (printing the validation perplexity after each one) and finally
    prints the test perplexity.
    """
    train_data, valid_data, test_data, _ = ptb_raw_data(data_path)

    def steps_per_epoch(word_ids, batch_size, num_steps):
        # Number of batches in one pass over the data; same arithmetic as the
        # epoch_size computed inside ptb_producer.
        batch_len = len(word_ids) // batch_size
        return (batch_len - 1) // num_steps

    train_epoch_size = steps_per_epoch(train_data, train_batch_size, train_num_step)
    valid_epoch_size = steps_per_epoch(valid_data, eval_batch_size, eval_num_step)
    test_epoch_size = steps_per_epoch(test_data, eval_batch_size, eval_num_step)

    initializer = tf.random_uniform_initializer(-0.05, 0.05)

    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, train_batch_size, train_num_step)

    # Same scope with reuse=True: the evaluation model shares the variables
    # created by the training model.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, eval_batch_size, eval_num_step)

    # Train the model.
    with tf.Session() as session:
        tf.global_variables_initializer().run()

        train_queue = ptb_producer(train_data, train_model.batch_size, train_model.num_steps)
        eval_queue = ptb_producer(valid_data, eval_model.batch_size, eval_model.num_steps)
        test_queue = ptb_producer(test_data, eval_model.batch_size, eval_model.num_steps)

        # Created once so evaluation does not add a new op to the graph on
        # every epoch.
        no_op = tf.no_op()

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=session, coord=coord)
        try:
            for i in range(num_epoch):
                print("In iteration: %d" % (i + 1))
                run_epoch(session, train_model, train_queue, train_model.train_op, True, train_epoch_size)

                valid_perplexity = run_epoch(session, eval_model, eval_queue, no_op, False, valid_epoch_size)
                print("Epoch: %d Validation Perplexity: %.3f" % (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, eval_model, test_queue, no_op, False, test_epoch_size)
            print("Test Perplexity: %.3f" % test_perplexity)
        finally:
            # Always stop and join the queue-runner threads, even when
            # training raises.
            coord.request_stop()
            coord.join(threads)


if __name__ == "__main__":
    main()

TF之LSTM：基于Tensorflow架構采用PTB資料集建立LSTM網絡的自然語言模組化

關于PTB資料集

代碼實作

繼續閱讀

無法解析的外部符号 wmain，該符号在函數 "void cdecl mainCRTStartupHelper(struct HINSTANCE *,unsigned short con......

TestLink導出用例轉換工具(XML2Excel)

解碼器用于語義分割：資料依賴的解碼可以實作靈活的特征聚合

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入