
Machine Learning on the Alibaba Cloud Tianchi Competition: News Classification (Part 2), Bidirectional LSTM

An ordinary LSTM processes a sequence only in the forward direction, while a bidirectional LSTM reads it in both directions and can therefore use both the left and right context of each token. The results from the news-classification competition show that the bidirectional variant gives a measurable improvement.

Implementing a bidirectional LSTM in PyTorch is simple: pass bidirectional=True to nn.LSTM, size the initial states ht and ct with num_directions=2 (the first dimension becomes num_layers * 2), and widen the Linear layer's in_features to hidden_size * 2. The PyTorch documentation covers the details; everything else stays the same as in the unidirectional version.
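
As a quick sanity check of those shape rules, here is a minimal sketch (all sizes are illustrative, not the ones used later in this post):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=1, bidirectional=True)
x = torch.randn(5, 1, 8)            # (seq_len, batch, input_size)
h0 = torch.zeros(1 * 2, 1, 16)      # (num_layers * num_directions, batch, hidden_size)
c0 = torch.zeros(1 * 2, 1, 16)
out, (hn, cn) = lstm(x, (h0, c0))
print(out.shape)                    # torch.Size([5, 1, 32]): the last dim is hidden_size * 2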

Import the packages

import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab  # legacy torchtext API; removed in newer releases
from collections import Counter
import pandas as pd
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
           

Read the data

# Competition data
dataset = pd.read_csv("data/train_set.csv", sep="\t")
# Training split (90% of the rows; pass random_state=... for reproducibility)
train_dataset = dataset.sample(frac=0.9)


# Hold-out split (the rows not sampled into the training split)
#test_dataset = dataset[~dataset.index.isin(train_dataset.index)]
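
For orientation: train_set.csv has two tab-separated columns, label (an integer class ID; the prediction function below assumes 14 classes) and text, the anonymized news body given as space-separated token IDs. A quick peek:

print(dataset.head())                 # columns: label, text
print(dataset["label"].nunique())     # 14 classes in this competition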
           

Prepare the training data

tokenizer = get_tokenizer("basic_english")
counter = Counter()
# Iterating df.iloc yields one row (as a Series) at a time,
# which unpacks into the (label, text) columns
for (label, line) in train_dataset.iloc:
    counter.update(tokenizer(line))

# Vocabulary
vocab = Vocab(counter, min_freq=1)

# Vocabulary size
vocab_num = len(vocab)
# Convert a sentence into a list of token IDs
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# Convert a label into an integer
label_pipeline = lambda x: int(x)
# Convert the training sentences into ID tensors
def get_label_line_tensors(data):
    lines = []
    labels = []
    for (label, line) in data:
        lines.append(torch.tensor(text_pipeline(line)).to(device))
        labels.append(torch.tensor([label_pipeline(label)]).to(device))
    return labels, lines
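
A quick usage check of the two pipelines. The input string below is made up, and the IDs you get back depend on the vocabulary built from your data; since the competition text is anonymized integers, basic_english tokenization here amounts to whitespace splitting:

sample = "3750 648 2465"          # hypothetical anonymized text
print(text_pipeline(sample))      # e.g. [12, 7, 431] -- vocab-dependent
print(label_pipeline("3"))        # 3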
           

Define the model

# Model definition
class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size,  n_layers, vocab_size, dropout_p=0.1):
        super(RNN_LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.dropout_p = dropout_p
        
        # Network layers
        self.embedding = nn.Embedding(vocab_size, input_size)
        self.rnn = nn.LSTM(input_size, hidden_size, n_layers, dropout=self.dropout_p, bidirectional=True)
        # Bidirectional: the LSTM output is hidden_size * 2 wide
        self.out = nn.Linear(hidden_size*2, output_size)
        # Normalize over the class dimension (dim=0, as in the original post,
        # would wrongly normalize across time steps instead of classes)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    # Forward pass
    def forward(self, input_words, hidden, cell):
        seq_len = input_words.size()[0]
        embedding = self.embedding(input_words).view(seq_len, 1, -1)
        out, (hn, cn) = self.rnn(embedding, (hidden, cell))
        out = self.softmax(self.out(out))
        return out, (hn, cn)
        
    # Initial hidden state: num_layers * num_directions(=2) for a bidirectional LSTM
    def init_hidden(self):
        hidden = torch.zeros(self.n_layers*2, 1, self.hidden_size)
        return hidden.to(device)
    
    # Initial cell state, same shape as the hidden state
    def init_cell_state(self):
        cell = torch.zeros(self.n_layers*2, 1, self.hidden_size)
        return cell.to(device)
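
The original post never shows the model or the loss criterion being created, although both are used below. A minimal sketch; input_size, hidden_size and n_layers here are my assumptions, while output_size=14 matches the competition's 14 classes:

# Assumed hyperparameters -- not given in the original post
model = RNN_LSTM(input_size=128, hidden_size=256, output_size=14,
                 n_layers=2, vocab_size=vocab_num).to(device)
# NLLLoss pairs with the LogSoftmax the model applies to its output
loss = nn.NLLLoss()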
           

Define the training step

# Train on a single sample
def train(line, label, loss):
    optimizer.zero_grad()
    hidden = model.init_hidden()
    cell = model.init_cell_state()
    o, (h, c)  = model(line, hidden, cell)
    #oh = torch.sum(o, dim=0)
    # Classify from the last time step: shape (1, output_size)
    oh = o[-1:].squeeze(0)
    l = loss(oh, label)
    
    l.backward()
    optimizer.step()
    return l
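
For reference, an illustrative shape check at the loss call, assuming the 14-class setup: NLLLoss expects (N, C) log-probability scores and (N,) class indices.

oh = torch.randn(1, 14)          # stand-in for one sample's class scores
label = torch.tensor([3])        # target class index
print(nn.NLLLoss()(oh, label))   # a scalar loss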
           

Run the training

labels, lines = get_label_line_tensors(train_dataset.iloc)
import random
import time
lossNum = []
begin = time.time()
print(begin)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
for i in range(300001):
    # 180,000 training rows (90% of the 200,000-row train set), sampled uniformly
    idx = random.randint(0, 179999)
    l = train(lines[idx], labels[idx], loss)
    lossNum.append(l.item())  # .item() detaches the value; storing tensors would leak graph memory
    if i % 10000 == 0:
        print(l)
print(time.time()-begin)
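
The recorded losses are never used in the post; if you want to eyeball convergence, a quick plot (matplotlib assumed):

import matplotlib.pyplot as plt

plt.plot(lossNum)
plt.xlabel("iteration")
plt.ylabel("NLL loss")
plt.show()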
           

Save the model

torch.save(model, "model60.pkl")
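
Note that this pickles the entire module rather than just a state_dict, so the RNN_LSTM class must be defined or importable wherever the file is loaded back:

# Reloading later -- requires the RNN_LSTM class definition to be available
model = torch.load("model60.pkl")
model.eval()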
           

Define a simple prediction function

# Prediction: a quick-and-dirty way to read off the predicted class
# (a linear scan for the index of the maximum over the 14 class scores)
def get_l(g):
    id = -1
    g1 = torch.max(g)
    for i in range(14):
        if g1==g[i]:
            id = i
            break
    return id
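
For reference, torch.argmax does the same scan in one call:

g = torch.tensor([0.1, 2.3, 0.7])   # toy scores
print(torch.argmax(g).item())       # 1 -- same result as get_l(g)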
           

Run prediction on the test data and save the submission

# Test data
submit_dataset = pd.read_csv("data/test_a.csv", sep="\t")
t_lines_ = []
for line in submit_dataset.iloc:
    t_lines_.append(torch.tensor(text_pipeline(line["text"])).to(device))

# Predict
writes = []
with torch.no_grad():  # no gradients needed at inference time
    for w in t_lines_:
        check_h = model.init_hidden()
        check_c = model.init_cell_state()
        check_o, (_, _) = model(w, check_h, check_c)
        writes.append(get_l(check_o[-1, -1]))
# Write the predictions to CSV
# (the original assigned writes[0], a single scalar, to an empty frame;
#  the whole prediction list is what belongs in the column)
tt = pd.DataFrame({"label": writes})
tt.to_csv("submit2.csv", sep="\t", index=False)
           