天天看点

基于基因序列的分类问题(lstm的输出到底怎么用)

给出一个基因序列,判断它是阳性还是阴性。

首先看下数据:

基于基因序列的分类问题(lstm的输出到底怎么用)

 然后看下模型图:

基于基因序列的分类问题(lstm的输出到底怎么用)

在开始代码之前,我先说明一个问题:lstm的输出到底怎么用。这里给出两种用法:

用法一:本实验用的方法:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CNN_LSTM(nn.Module):
    """Conv2d + LSTM classifier for one-hot encoded sequences.

    The input is a batch shaped ``(batch, seq_len, n_features)``. A single
    2x2 convolution shrinks it to ``(batch, seq_len - 1, n_features - 1)``,
    an LSTM runs along the sequence axis, and *all* LSTM time-step outputs
    are flattened and fed to one linear layer ("usage one" of the LSTM
    output: use every step, not only the last one).

    Args:
        seq_len: Number of positions per input sequence (default 41).
        n_features: Width of each position's encoding (default 4).
        hidden_size: LSTM hidden size (default 32).
        n_classes: Number of output classes (default 2).

    NOTE(review): ``forward`` returns softmax probabilities. The training
    script in this file passes them to ``CrossEntropyLoss``, which applies
    log-softmax again; returning raw logits would be the usual setup.
    """

    def __init__(self, seq_len=41, n_features=4, hidden_size=32, n_classes=2):
        super().__init__()
        kernel_size = 2
        # A stride-1, unpadded convolution shrinks each axis by kernel_size - 1.
        conv_len = seq_len - kernel_size + 1
        input_size = n_features - kernel_size + 1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1,
                               kernel_size=kernel_size, stride=1)
        # batch_first=True: the tensor handed to the LSTM is
        # (batch, steps, features). Without it the LSTM would treat the batch
        # axis as time and leak information between unrelated samples.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1,
                            batch_first=True)
        # Every time step's hidden vector is used: conv_len * hidden_size inputs
        # (40 * 32 = 1280 with the defaults).
        self.linear_2 = nn.Linear(conv_len * hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, batch_size=None):
        """Return class probabilities of shape ``(batch, n_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, n_features)``; any dtype.
            batch_size: Accepted for backward compatibility and ignored; the
                batch dimension is read from ``x`` itself.
        """
        # Cast and move in one step, following wherever the model's weights live.
        x = x.to(device=self.linear_2.weight.device, dtype=torch.float32)
        x = x.unsqueeze(1)   # add the channel axis expected by Conv2d
        x = self.conv1(x)
        x = x.squeeze(1)     # drop the channel axis again
        lstm_out, _ = self.lstm(x)
        # Flatten all time steps of each sample into one feature vector.
        lstm_out = lstm_out.reshape(lstm_out.size(0), -1)
        logits = self.linear_2(lstm_out)
        return self.softmax(logits)
           

用法二:我的另一篇博客(基于学生期末成绩预测)中用的方法:

基于学生做题记录的成绩预测_py机器学习深度学习的博客-CSDN博客

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class LSTM(nn.Module):
    """LSTM -> multi-head self-attention -> LSTM -> two linear layers.

    "Usage two" of the LSTM output: every time step's hidden vector is first
    projected to a scalar (``linear_1``), and the resulting per-step values
    are then mapped to two class scores (``linear_2``).

    Args:
        input_size: Feature width of each time step (default 12).
        hidden_size: Hidden size of both LSTMs and the attention embedding
            (default 128). Must be divisible by ``num_heads``.
        seq_len: Number of time steps per sample (default 700); it fixes the
            input width of ``linear_2``.
        num_heads: Number of attention heads (default 8).
    """

    def __init__(self, input_size=12, hidden_size=128, seq_len=700, num_heads=8):
        super().__init__()
        # Sequence-first LSTM: expects (seq_len, batch, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1)

        # Separate projections producing query / key / value for the attention.
        self.multihead_Linear_k = nn.Linear(hidden_size, hidden_size)
        self.multihead_Linear_q = nn.Linear(hidden_size, hidden_size)
        self.multihead_Linear_v = nn.Linear(hidden_size, hidden_size)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size,
                                                    num_heads=num_heads)

        self.lstm_2 = nn.LSTM(hidden_size, hidden_size, num_layers=1)

        self.linear_1 = nn.Linear(hidden_size, 1)
        self.ReLU = nn.ReLU()
        self.linear_2 = nn.Linear(seq_len, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, batch_size=None):
        """Return class probabilities of shape ``(batch, 2)``.

        Args:
            x: Batch shaped ``(batch, seq_len, input_size)``; a tensor or
                anything ``torch.as_tensor`` accepts.
            batch_size: Unused; kept so existing call sites keep working.
        """
        # as_tensor avoids the tensor -> numpy -> tensor round-trip, which
        # fails for tensors that require grad or live on the GPU.
        x = torch.as_tensor(x, dtype=torch.float32,
                            device=self.linear_2.weight.device)
        x = x.transpose(0, 1)  # -> (seq_len, batch, input_size)

        lstm_out, state = self.lstm(x)  # state is the (h_n, c_n) pair

        # Attention inputs are (seq_len, batch, hidden_size).
        query = self.multihead_Linear_q(lstm_out)
        key = self.multihead_Linear_k(lstm_out)
        value = self.multihead_Linear_v(lstm_out)
        attn_output, _ = self.multihead_attn(query, key, value)

        # The second LSTM starts from the final state of the first one.
        lstm_out_2, _ = self.lstm_2(attn_output, state)
        lstm_out_2 = lstm_out_2.transpose(0, 1)  # -> (batch, seq_len, hidden)

        # One scalar per time step, then seq_len scalars -> 2 class scores.
        prediction = self.linear_1(lstm_out_2).squeeze(2)
        prediction = self.ReLU(prediction)
        prediction = self.linear_2(prediction)
        return self.softmax(prediction)

————————————————
版权声明:本文为CSDN博主「py机器学习深度学习」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_38735017/article/details/120675907
           

然后直接上本项目的代码:-------------------------------------------------------------------------------------------

from collections import Counter
import torch
from torch import nn
from torch import optim
import math
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm
from torch.utils.data import random_split
from collections import Counter
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
from torch.utils.tensorboard.writer import SummaryWriter
from torch.utils.data import random_split
import os
SEED = 1210


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN behaviour.

    Also exports ``PYTHONHASHSEED`` into ``os.environ`` and turns cuDNN
    benchmarking off / deterministic mode on.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(SEED)  # seed (almost) every random source so runs are reproducible
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One-hot encoding of the four nucleotides (identifier spelling kept as-is).
word_enbeding = {"A": [1, 0, 0, 0], "C": [0, 1, 0, 0], "T": [0, 0, 1, 0], "G": [0, 0, 0, 1]}

_SEQ_LEN = 41          # every encoded sequence is padded to this many rows
_HEADER_MAX_LEN = 12   # lines shorter than this are treated as label headers


def _load_benchmark(path):
    """Parse the benchmark file into one-hot sequences and 0/1 labels.

    Lines shorter than ``_HEADER_MAX_LEN`` characters are header lines whose
    second character is ``+`` (label 1) or ``-`` (label 0); all other lines
    are sequences over ``A``/``C``/``T``/``G``.

    Returns:
        ``(sequences, labels)`` where each sequence is a list of ``_SEQ_LEN``
        four-element one-hot rows (zero rows are used as padding).

    Raises:
        KeyError: If a sequence line contains a character that is not a key
            of ``word_enbeding``.
    """
    sequences, labels = [], []
    with open(path) as f:
        for raw in f:
            # Strip only the line terminator, so a final line without a
            # trailing newline does not lose its last base.
            line = raw.rstrip("\r\n")
            if not line:
                continue
            if len(line) < _HEADER_MAX_LEN:
                # Header: the sign sits at index 1; anything else is ignored.
                if len(line) > 1:
                    if line[1] == "+":
                        labels.append(1)
                    elif line[1] == "-":
                        labels.append(0)
                continue
            encoded = [word_enbeding[base] for base in line]
            # Pad all the way up to _SEQ_LEN so every sample has the same shape.
            encoded.extend([0, 0, 0, 0] for _ in range(_SEQ_LEN - len(encoded)))
            sequences.append(encoded)
    return sequences, labels


data_x, data_y = _load_benchmark("benchmark.fasta")

print(data_x[1])
print(data_y[1:10])
print(len(data_x))
print(len(data_y))


class mydataset(Dataset):
    """Dataset view over the module-level ``data_x`` / ``data_y`` lists."""

    def __init__(self):
        # Convert the Python lists to float tensors once, up front.
        features = np.array(data_x).astype(float)
        targets = np.array(data_y).astype(float)
        self._x = torch.tensor(features)
        self._y = torch.tensor(targets)
        self._len = len(data_y)

    def __getitem__(self, item):
        """Return the ``(features, label)`` pair stored at index ``item``."""
        sample = self._x[item]
        target = self._y[item]
        return sample, target

    def __len__(self):
        """Return the number of labelled samples."""
        return self._len
        
data = mydataset()

# Random 80/20 train/test split; both sizes are rounded to whole samples.
# (Some torch versions lack the `generator=torch.Generator().manual_seed(0)` argument.)
n_total = len(data)
split_sizes = [round(0.8 * n_total), round(0.2 * n_total)]
train_data, test_data = random_split(data, split_sizes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CNN_LSTM(nn.Module):
    """Conv2d + LSTM classifier for one-hot encoded sequences.

    The input is a batch shaped ``(batch, seq_len, n_features)``. A single
    2x2 convolution shrinks it to ``(batch, seq_len - 1, n_features - 1)``,
    an LSTM runs along the sequence axis, and *all* LSTM time-step outputs
    are flattened and fed to one linear layer.

    Args:
        seq_len: Number of positions per input sequence (default 41).
        n_features: Width of each position's encoding (default 4).
        hidden_size: LSTM hidden size (default 32).
        n_classes: Number of output classes (default 2).

    NOTE(review): ``forward`` returns softmax probabilities. The training
    script in this file passes them to ``CrossEntropyLoss``, which applies
    log-softmax again; returning raw logits would be the usual setup.
    """

    def __init__(self, seq_len=41, n_features=4, hidden_size=32, n_classes=2):
        super().__init__()
        kernel_size = 2
        # A stride-1, unpadded convolution shrinks each axis by kernel_size - 1.
        conv_len = seq_len - kernel_size + 1
        input_size = n_features - kernel_size + 1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1,
                               kernel_size=kernel_size, stride=1)
        # batch_first=True: the tensor handed to the LSTM is
        # (batch, steps, features). Without it the LSTM would treat the batch
        # axis as time and leak information between unrelated samples.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1,
                            batch_first=True)
        # Every time step's hidden vector is used: conv_len * hidden_size inputs
        # (40 * 32 = 1280 with the defaults).
        self.linear_2 = nn.Linear(conv_len * hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, batch_size=None):
        """Return class probabilities of shape ``(batch, n_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, n_features)``; any dtype.
            batch_size: Accepted for backward compatibility and ignored; the
                batch dimension is read from ``x`` itself.
        """
        # Cast and move in one step, following wherever the model's weights live.
        x = x.to(device=self.linear_2.weight.device, dtype=torch.float32)
        x = x.unsqueeze(1)   # add the channel axis expected by Conv2d
        x = self.conv1(x)
        x = x.squeeze(1)     # drop the channel axis again
        lstm_out, _ = self.lstm(x)
        # Flatten all time steps of each sample into one feature vector.
        lstm_out = lstm_out.reshape(lstm_out.size(0), -1)
        logits = self.linear_2(lstm_out)
        return self.softmax(logits)

def eval_test(model):
    """Return the mean per-batch loss of ``model`` over ``test_loader``.

    Relies on the module-level ``test_loader``, ``batch_size``,
    ``loss_function``, ``optimizer`` and ``device``. The model is put into
    eval mode for the duration of the pass and its previous train/eval mode
    is restored afterwards.

    Args:
        model: Module called as ``model(batch, batch_size)``.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    was_training = model.training
    model.eval()
    batch_losses = []
    try:
        with torch.no_grad():
            # Kept from the original: callers may rely on gradients being
            # cleared once per evaluation pass.
            optimizer.zero_grad()
            for test_x, test_y in test_loader:
                y_pre = model(test_x, batch_size)
                targets = test_y.to(device).long()
                batch_losses.append(loss_function(y_pre, targets).item())
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    return np.mean(batch_losses)


epochs = 20
batch_size = 128

# Both loaders share one configuration. drop_last=True discards the final
# partial batch, so every batch holds exactly `batch_size` samples.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
test_loader = DataLoader(test_data, **loader_options)
train_loader = DataLoader(train_data, **loader_options)

# Build the model, the cross-entropy loss and the Adam optimizer.
model = CNN_LSTM()
model = model.to(device)
loss_function = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
print(model)


import copy

sum_train_epoch_loss = []  # mean training loss of each epoch
sum_test_epoch_loss = []   # mean test loss of each epoch
best_test_loss = 10000
# Start from a snapshot so `best_model` always exists, even if no epoch improves.
best_model = copy.deepcopy(model)
for epoch in range(epochs):
    epoch_loss = []
    for train_x, train_y in train_loader:
        # Clear gradients left over from the previous step; without this they
        # accumulate across the whole epoch.
        optimizer.zero_grad()
        y_pred = model(train_x, batch_size)
        # Compute the loss on the same device as the predictions.
        single_loss = loss_function(y_pred, train_y.to(device).long())
        single_loss.backward()  # back-propagate to populate the gradients
        optimizer.step()        # apply the parameter update

        epoch_loss.append(single_loss.item())

    train_epoch_loss = np.mean(epoch_loss)
    test_epoch_loss = eval_test(model)  # mean loss over the test loader

    if test_epoch_loss < best_test_loss:
        best_test_loss = test_epoch_loss
        print("best_test_loss", best_test_loss)
        # Deep copy: a plain reference would keep changing as training
        # continues and would end up equal to the final model.
        best_model = copy.deepcopy(model)
    sum_train_epoch_loss.append(train_epoch_loss)
    sum_test_epoch_loss.append(test_epoch_loss)
    print("epoch:" + str(epoch) + "  train_epoch_loss: " + str(train_epoch_loss) + "  test_epoch_loss: " + str(test_epoch_loss))

torch.save(best_model, 'best_model.pth')


# Plot the per-epoch training and test losses as a scatter chart.
fig = plt.figure(facecolor='white', figsize=(10, 7))
plt.xlabel('第几个epoch')
plt.ylabel('loss值')
plt.xlim(xmax=len(sum_train_epoch_loss), xmin=0)
# Scale the y axis to the larger of the two curves so that test-loss points
# above the highest training loss are not clipped.
plt.ylim(ymax=max(max(sum_train_epoch_loss), max(sum_test_epoch_loss)), ymin=0)

# x values are simply the epoch indices; y values are the recorded losses.
x1 = list(range(len(sum_train_epoch_loss)))
y1 = sum_train_epoch_loss

x2 = list(range(len(sum_test_epoch_loss)))
y2 = sum_test_epoch_loss

colors1 = '#00CED4'  # marker colour for the training loss
colors2 = '#DC143C'  # marker colour for the test loss
area = np.pi * 4**1  # marker area

plt.scatter(x1, y1, s=area, c=colors1, alpha=0.4, label='train_loss')
plt.scatter(x2, y2, s=area, c=colors2, alpha=0.4, label='val_loss')
plt.legend()
plt.show()



import sklearn
from sklearn.metrics import matthews_corrcoef, recall_score

# Reload the weights of the best checkpoint saved during training.
model.load_state_dict(torch.load('best_model.pth').cpu().state_dict())
model.eval()
test_pred = []
test_true = []
# Collect predicted and true class ids over the whole test loader.
with torch.no_grad():
    for test_x, test_y in test_loader:
        # Use the actual batch size instead of a hard-coded 128.
        y_pre = model(test_x, test_x.size(0)).cpu()
        # Store plain Python ints so the metric functions get clean inputs.
        test_pred.extend(torch.argmax(y_pre, dim=1).tolist())
        test_true.extend(test_y.long().tolist())

# scikit-learn metrics take (y_true, y_pred), in that order.
Acc = accuracy_score(test_true, test_pred)
Mcc = matthews_corrcoef(test_true, test_pred)         # Matthews correlation coefficient
Sn = recall_score(test_true, test_pred)               # sensitivity: recall of class 1
Sp = recall_score(test_true, test_pred, pos_label=0)  # specificity: recall of class 0
print(Acc)
print(Mcc)
print(Sn)
print(Sp)



           

 损失曲线:

基于基因序列的分类问题(lstm的输出到底怎么用)
基于基因序列的分类问题(lstm的输出到底怎么用)

 评价指标:

基于基因序列的分类问题(lstm的输出到底怎么用)

 感觉还可以。这是在网上接的一个活,价格350,用时3小时。