基于基因序列的分类问题（lstm的输出到底怎么用）

给出一个基因序列来判断是阳性还是阴性。

首先看下数据：

然后看下模型图：

基于基因序列的分类问题（lstm的输出到底怎么用）

在开始写代码之前，我先说明一个问题：LSTM 的输出到底怎么用？这里给出两种用法：

用法一：本实验用的方法:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CNN_LSTM(nn.Module):
    """Conv2d + LSTM classifier over one-hot encoded sequences.

    The input is a batch of ``(seq_len, n_features)`` matrices. A single
    2x2 convolution shrinks each matrix to ``(seq_len - 1, n_features - 1)``,
    the LSTM runs along the sequence axis, and the outputs of *all* time
    steps are flattened and fed to one linear layer.

    Args:
        seq_len: Length of every input sequence (default 41).
        n_features: Width of each sequence element (default 4).
        hidden_size: LSTM hidden size (default 32).
        n_classes: Number of output classes (default 2).

    With the defaults the layer shapes match the previously hard-coded
    ones (LSTM input 3, hidden 32, linear 1280 -> 2).
    """

    def __init__(self, seq_len=41, n_features=4, hidden_size=32, n_classes=2):
        super().__init__()
        kernel_size = 2
        # Kernel 2, stride 1, no padding: each spatial dimension shrinks by one.
        conv_len = seq_len - kernel_size + 1
        conv_features = n_features - kernel_size + 1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=kernel_size, stride=1)
        # batch_first=True: forward() feeds (batch, seq, feature). Without it
        # nn.LSTM treats dim 0 (the batch) as the time axis and carries hidden
        # state from one sample to the next.
        self.lstm = nn.LSTM(conv_features, hidden_size, num_layers=1, batch_first=True)
        self.linear_2 = nn.Linear(conv_len * hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, batch_size=None):
        """Return class probabilities of shape ``(batch, n_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, n_features)``; any dtype,
                it is cast to float32 and moved to ``device``.
            batch_size: Accepted for backward compatibility only. The batch
                dimension is read from ``x`` itself.
        """
        x = x.to(device=device, dtype=torch.float32)
        # (batch, seq, feat) -> (batch, 1, seq, feat) -> conv -> (batch, seq-1, feat-1)
        x = self.conv1(x.unsqueeze(1)).squeeze(1)
        lstm_out, _ = self.lstm(x)  # (batch, seq-1, hidden)
        # Use every time step's output: flatten to (batch, (seq-1) * hidden).
        flat = lstm_out.reshape(x.size(0), -1)
        # NOTE(review): nn.CrossEntropyLoss expects raw logits; feeding it this
        # softmax output normalises twice. Kept so the return value stays a
        # probability distribution for existing callers.
        return self.softmax(self.linear_2(flat))

用法二：我的另一篇博客《基于学生做题记录的成绩预测》中使用的方法：

基于学生做题记录的成绩预测_py机器学习深度学习的博客-CSDN博客

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class LSTM(nn.Module):
    """LSTM -> multi-head self-attention -> LSTM -> two linear layers.

    Each time step of the second LSTM is reduced to one scalar by
    ``linear_1``; the resulting ``seq_len`` scalars are mapped to two class
    scores by ``linear_2`` and normalised with softmax.

    Args:
        seq_len: Number of time steps in every input (default 700, the
            previously hard-coded input width of ``linear_2``).
    """

    def __init__(self, seq_len=700):
        super().__init__()
        input_size = 12
        hidden_size = 128
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1)  # sequence-first layout

        # Separate projections producing the attention query / key / value.
        self.multihead_Linear_k = nn.Linear(hidden_size, hidden_size)
        self.multihead_Linear_q = nn.Linear(hidden_size, hidden_size)
        self.multihead_Linear_v = nn.Linear(hidden_size, hidden_size)
        # embed_dim (hidden_size) must be divisible by num_heads.
        self.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=8)

        self.lstm_2 = nn.LSTM(hidden_size, hidden_size, num_layers=1)

        self.linear_1 = nn.Linear(hidden_size, 1)
        self.ReLU = nn.ReLU()
        self.linear_2 = nn.Linear(seq_len, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, batch_size=None):
        """Return class probabilities of shape ``(batch, 2)``.

        Args:
            x: Array-like or tensor of shape ``(batch, seq_len, 12)``.
            batch_size: Unused; accepted for backward compatibility.
        """
        # as_tensor avoids the x.numpy() round trip, which raises for tensors
        # that require grad or live on the GPU.
        x = torch.as_tensor(x, dtype=torch.float32, device=device)
        x = x.transpose(0, 1)  # -> (seq_len, batch, input_size)

        lstm_out, state = self.lstm(x)  # (seq_len, batch, hidden_size)

        # query / key / value all keep the (seq_len, batch, hidden_size) layout.
        query = self.multihead_Linear_q(lstm_out)
        key = self.multihead_Linear_k(lstm_out)
        value = self.multihead_Linear_v(lstm_out)
        attn_output, _ = self.multihead_attn(query, key, value)

        # The second LSTM starts from the first LSTM's final (h, c) state.
        lstm_out_2, _ = self.lstm_2(attn_output, state)
        lstm_out_2 = lstm_out_2.transpose(0, 1)  # -> (batch, seq_len, hidden_size)

        prediction = self.linear_1(lstm_out_2).squeeze(2)  # (batch, seq_len)
        prediction = self.ReLU(prediction)
        prediction = self.linear_2(prediction)  # (batch, 2)
        return self.softmax(prediction)

————————————————
版权声明：本文为CSDN博主「py机器学习深度学习」的原创文章，遵循CC 4.0 BY-SA版权协议，转载请附上原文出处链接及本声明。
原文链接：https://blog.csdn.net/qq_38735017/article/details/120675907

然后直接上本项目的代码：-------------------------------------------------------------------------------------------

from collections import Counter
import torch
from torch import nn
from torch import optim
import math
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm
from torch.utils.data import random_split
from collections import Counter
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
from torch.utils.tensorboard.writer import SummaryWriter
from torch.utils.data import random_split
import os
SEED = 1210


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic,
    non-benchmarking mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(SEED)  # seed (almost) every RNG so results are reproducible

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One-hot encoding of the four nucleotides.
word_enbeding = {"A": [1, 0, 0, 0], "C": [0, 1, 0, 0], "T": [0, 0, 1, 0], "G": [0, 0, 0, 1]}

_SEQ_LEN = 41         # every encoded sequence is padded up to this many rows
_HEADER_MAX_LEN = 12  # lines shorter than this are treated as label headers

# Parse the file: short lines are headers whose second character is the
# label sign ("+" -> 1, "-" -> 0); every other line is a nucleotide sequence.
data_y = []
data_x = []
with open("benchmark.fasta") as f:
    for raw_line in f:
        # Strip only the line terminator. Slicing off the last character
        # would drop a real base when the final line has no newline.
        line = raw_line.rstrip("\r\n")
        if not line:
            continue  # blank lines carry neither a label nor a sequence
        if len(line) < _HEADER_MAX_LEN:
            sign = line[1:2]  # empty string for one-character lines
            if sign == "+":
                data_y.append(1)
            elif sign == "-":
                data_y.append(0)
        else:
            # An unknown symbol raises KeyError, as before.
            data_x_one = [word_enbeding[base] for base in line]
            # Zero-pad short sequences so all samples share one shape.
            data_x_one.extend([0, 0, 0, 0] for _ in range(_SEQ_LEN - len(data_x_one)))
            data_x.append(data_x_one)

print(data_x[1])
print(data_y[1:10])
print(len(data_x))
print(len(data_y))

# Samples and labels are paired by position, so unequal counts would silently
# attach labels to the wrong sequences.
if len(data_x) != len(data_y):
    raise ValueError(
        "benchmark.fasta: %d sequences but %d labels" % (len(data_x), len(data_y))
    )


class mydataset(Dataset):
    """Dataset over the module-level ``data_x`` / ``data_y`` lists.

    Both lists are converted once, at construction time, to float tensors.
    """

    def __init__(self):
        # Load the already-parsed samples and labels into tensors.
        features = np.array(data_x).astype(float)
        labels = np.array(data_y).astype(float)
        self._x = torch.tensor(features)
        self._y = torch.tensor(labels)
        self._len = len(data_y)

    def __getitem__(self, item):
        """Return the ``(sample, label)`` pair stored at index ``item``."""
        sample = self._x[item]
        target = self._y[item]
        return sample, target

    def __len__(self):
        """Return the number of labels, i.e. the dataset size."""
        return self._len
        
data = mydataset()

# Random 80 % / 20 % train / test split; each size is rounded to an integer.
# (Some torch versions lack the generator=torch.Generator().manual_seed(0)
# argument, so it is not passed.)
_n_total = data._len
_split_sizes = [round(0.8 * _n_total), round(0.2 * _n_total)]
train_data, test_data = random_split(data, _split_sizes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CNN_LSTM(nn.Module):
    """Conv2d + LSTM classifier over one-hot encoded sequences.

    The input is a batch of ``(seq_len, n_features)`` matrices. A single
    2x2 convolution shrinks each matrix to ``(seq_len - 1, n_features - 1)``,
    the LSTM runs along the sequence axis, and the outputs of *all* time
    steps are flattened and fed to one linear layer.

    Args:
        seq_len: Length of every input sequence (default 41).
        n_features: Width of each sequence element (default 4).
        hidden_size: LSTM hidden size (default 32).
        n_classes: Number of output classes (default 2).

    With the defaults the layer shapes match the previously hard-coded
    ones (LSTM input 3, hidden 32, linear 1280 -> 2).
    """

    def __init__(self, seq_len=41, n_features=4, hidden_size=32, n_classes=2):
        super().__init__()
        kernel_size = 2
        # Kernel 2, stride 1, no padding: each spatial dimension shrinks by one.
        conv_len = seq_len - kernel_size + 1
        conv_features = n_features - kernel_size + 1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=kernel_size, stride=1)
        # batch_first=True: forward() feeds (batch, seq, feature). Without it
        # nn.LSTM treats dim 0 (the batch) as the time axis and carries hidden
        # state from one sample to the next.
        self.lstm = nn.LSTM(conv_features, hidden_size, num_layers=1, batch_first=True)
        self.linear_2 = nn.Linear(conv_len * hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, batch_size=None):
        """Return class probabilities of shape ``(batch, n_classes)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, n_features)``; any dtype,
                it is cast to float32 and moved to ``device``.
            batch_size: Accepted for backward compatibility only. The batch
                dimension is read from ``x`` itself.
        """
        x = x.to(device=device, dtype=torch.float32)
        # (batch, seq, feat) -> (batch, 1, seq, feat) -> conv -> (batch, seq-1, feat-1)
        x = self.conv1(x.unsqueeze(1)).squeeze(1)
        lstm_out, _ = self.lstm(x)  # (batch, seq-1, hidden)
        # Use every time step's output: flatten to (batch, (seq-1) * hidden).
        flat = lstm_out.reshape(x.size(0), -1)
        # NOTE(review): nn.CrossEntropyLoss expects raw logits; feeding it this
        # softmax output normalises twice. Kept so the return value stays a
        # probability distribution for existing callers.
        return self.softmax(self.linear_2(flat))

# 这个函数是测试用来测试x_test y_test 数据 函数
def eval_test(model):
    """Return the mean loss of ``model`` over all batches of ``test_loader``.

    Relies on the module-level ``test_loader``, ``loss_function``,
    ``optimizer``, ``batch_size`` and ``device``. Runs under
    ``torch.no_grad()``.
    """
    with torch.no_grad():
        optimizer.zero_grad()  # clears stored gradients; does not affect the losses below
        batch_losses = []
        for test_x, test_y in test_loader:
            outputs = model(test_x, batch_size)
            targets = test_y.to(device).long()
            batch_losses.append(loss_function(outputs, targets).item())
    return np.mean(batch_losses)


import copy  # used to snapshot the best-performing model

epochs = 20
batch_size = 128
# drop_last=True keeps every batch at exactly batch_size samples.
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)

# Build the model, the loss function and the optimizer.
model = CNN_LSTM().to(device)
loss_function = torch.nn.CrossEntropyLoss().to(device)  # cross-entropy classification loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)


sum_train_epoch_loss = []  # mean training loss of every epoch
sum_test_epoch_loss = []  # mean test loss of every epoch
best_test_loss = float("inf")
# Start from a snapshot so best_model exists even if no epoch ever improves.
best_model = copy.deepcopy(model)
for epoch in range(epochs):
    epoch_loss = []
    for step, (train_x, train_y) in enumerate(train_loader):
        # Reset gradients for every batch; otherwise backward() keeps adding
        # to the gradients left over from previous batches.
        optimizer.zero_grad()
        y_pred = model(train_x, batch_size)
        single_loss = loss_function(y_pred, train_y.to(device).long())
        single_loss.backward()  # compute gradients
        optimizer.step()  # apply the parameter update

        epoch_loss.append(single_loss.item())

    train_epoch_loss = np.mean(epoch_loss)
    test_epoch_loss = eval_test(model)  # mean loss over the test batches

    if test_epoch_loss < best_test_loss:
        best_test_loss = test_epoch_loss
        print("best_test_loss", best_test_loss)
        # Deep copy: a plain reference would keep changing as training
        # continues, so the saved "best" model would be the last one.
        best_model = copy.deepcopy(model)
    sum_train_epoch_loss.append(train_epoch_loss)
    sum_test_epoch_loss.append(test_epoch_loss)
    print("epoch:" + str(epoch) + "  train_epoch_loss： " + str(train_epoch_loss) + "  test_epoch_loss: " + str(test_epoch_loss))

torch.save(best_model, 'best_model.pth')


# 画图
# sum_train_epoch_loss=[]
# sum_test_epoch_loss=[]
# Scatter plot of the per-epoch training and test losses.
fig = plt.figure(facecolor='white', figsize=(10, 7))
plt.xlabel('第几个epoch')
plt.ylabel('loss值')
plt.xlim(xmax=len(sum_train_epoch_loss), xmin=0)
# Scale the y axis to the larger of the two curves so that test-loss points
# above the highest training loss are not cut off.
plt.ylim(ymax=max(sum_train_epoch_loss + sum_test_epoch_loss), ymin=0)

# x coordinates are the epoch indices, y coordinates the recorded losses.
x1 = list(range(len(sum_train_epoch_loss)))
y1 = sum_train_epoch_loss

x2 = list(range(len(sum_test_epoch_loss)))
y2 = sum_test_epoch_loss

colors1 = '#00CED4'  # training-loss marker colour
colors2 = '#DC143C'  # test-loss marker colour
area = np.pi * 4**1  # marker area

plt.scatter(x1, y1, s=area, c=colors1, alpha=0.4, label='train_loss')
plt.scatter(x2, y2, s=area, c=colors2, alpha=0.4, label='val_loss')
plt.legend()
plt.show()



import sklearn.metrics

# Reload the best checkpoint. The file holds a whole pickled module (see the
# torch.save call), so only load it from a trusted source.
# NOTE(review): recent PyTorch releases (2.6+) default torch.load to
# weights_only=True, which rejects pickled modules; pass weights_only=False
# there if this line fails.
model.load_state_dict(torch.load('best_model.pth').cpu().state_dict())
model.eval()
test_pred = []
test_true = []
with torch.no_grad():
    for test_x, test_y in test_loader:
        # Take the batch size from the batch itself instead of hard-coding 128.
        y_pre = model(test_x, test_x.size(0)).cpu()
        test_pred.extend(torch.argmax(y_pre, dim=1).tolist())
        test_true.extend(test_y.long().tolist())

# scikit-learn metrics take (y_true, y_pred) in that order.
Acc = sklearn.metrics.accuracy_score(test_true, test_pred)
# Matthews correlation coefficient.
Mcc = sklearn.metrics.matthews_corrcoef(test_true, test_pred)
# Sensitivity: recall of the positive class.
Sn = sklearn.metrics.recall_score(test_true, test_pred)
# Specificity: recall of the negative class.
Sp = sklearn.metrics.recall_score(test_true, test_pred, pos_label=0)
print(Acc)
print(Mcc)
print(Sn)
print(Sp)
# Rows are true labels, columns are predicted labels.
print(sklearn.metrics.confusion_matrix(test_true, test_pred))

损失曲线：

基于基因序列的分类问题（lstm的输出到底怎么用）

评价指标：

基于基因序列的分类问题（lstm的输出到底怎么用）

感觉效果还可以。这是在网上接的一个活，价格 350 元，用时 3 小时。

基于基因序列的分类问题（lstm的输出到底怎么用）

再开始代码之间我叙述一个问题就是lstm的输出到底怎么用这里给出两种用法：

继续阅读

libsvm for python 安装

学习软件测试基础测试第七天

Zeppelin 配置访问 REST APIApache Zeppelin Configuration REST API

【Torch】最简洁logging使用指南

笔试面试题目：滑动窗口(二)

27. Remove Element(列表)题目代码

数据结构与算法（27）——排序（二）

Dijkstra--简易版（最短路径）

GitHub连夜封杀！这份阿里 10W 字内部 Java 字面试手册到底有多强？

Cloud Studio初体验

使用 ctypes 进行 Python 和 C 的混合编程

【python】【数据处理】画多维数据分布图

【python】netconf协议对接管理设备

「Python 网络自动化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 网络设备

在python中创建excel并写入

hdu7108哈希

基于基因序列的分类问题（lstm的输出到底怎么用）

再开始代码之间我叙述一个问题就是lstm的输出到底怎么用 这里给出两种用法：

继续阅读

再开始代码之间我叙述一个问题就是lstm的输出到底怎么用这里给出两种用法：