利用深度学习对蛋白质二级结构三态预测
通过输入蛋白质的氨基酸序列,预测其二级结构(三态:C、H、E)。本文先用100多万条 pseudo label 数据进行预训练,得到一个 pre-trained model;接着使用3万多条 labeled 数据进行 fine-tuning,得到最终用于预测的模型。
原创不易,共同学习.请诸位大神在转载时标明出处.
数据集示例
>1UCSA
NKASVVANQLIPINTALTLIMMKAEVVTPMGIPAEEIPKLVGMQVNRAVPLGTTLMPDMVKNYE
CCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEECCCECCCCECCHHHECCCC
依次对应为:
seqs = {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'Q': 5, 'E': 6, 'G': 7, 'H': 8,
'I': 9, 'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14, 'S': 15, 'T': 16,
'W': 17, 'Y': 18, 'V': 19}
label = {'C': 0, 'H': 1, 'E': 2}
生成的对应矩阵为:
2 11 0 15 19 19 0 2 5 10 9 14 9 2 16 0 10 16 10 9 12 12 11 0 6 19 19 16 14 12 7 9 14 0 6 6 9 14 11 10 19 7 12 5 19 2 1 0 19 14 10 7 16 16 10 12 14 3 12 19 11 2 18 6
0 0 0 2 2 2 2 0 0 0 2 0 0 0 0 2 0 0 1 1 1 2 2 2 2 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 0 0 0 2 2 0 0 0 2 0 0 0 0 2 0 0 1 1 1 2 0 0 0 0
将矩阵作为输入,main代码如下:
import pdb
import sys
import os
from arg import getArgparse
# os.environ["CUDA_VISIBLE_DEVICES"] = '1'
import torch
from torch import nn
from network import S4PRED
from get_dataset import loadfasta
from torch.utils.data import DataLoader
import torch.optim as optim
import datetime
from sklearn.metrics import f1_score, precision_score, recall_score
# Make the parent of this file's directory importable.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Reference time for the elapsed-time values printed by train() and test().
start = datetime.datetime.now()

# Run configuration parsed from the command line.
args_dict = getArgparse()
device = torch.device(args_dict['device'])
learn_rate = args_dict['learn_rate']
epochs = args_dict['epochs']
batch_size = args_dict['batch_size']
save_dpath = args_dict['save_path']

# makedirs instead of mkdir: nested paths such as "model/first/" are created
# in one go, and an already existing directory is not an error.
os.makedirs(save_dpath, exist_ok=True)

# Positions whose label is 3 do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=3)
model = S4PRED().to(device)
optimizer = optim.Adam(model.parameters(), lr=learn_rate, betas=(0.9, 0.999))
lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 80], gamma=0.1)

# Validation accuracies recorded during pre-training; valid() compares each new
# value against max(accuracy) to decide whether to save a new best checkpoint.
accuracy = [0.0]
def main():
    """Run the pipeline: test only, or (pre-train ->) fine-tune -> test.

    * If ``final_model.pkl`` already exists in the save directory, only the
      test pass is run.
    * Otherwise, if ``pre_train_best.pkl`` is missing, the model is first
      pre-trained on the "pre_train" split.
    * The best pre-trained weights are then loaded, the model is fine-tuned
      on the "train" split, and finally evaluated on the "test" split.
    """
    test_loader = DataLoader(loadfasta("test"), batch_size=1, shuffle=False, num_workers=4)
    final_model_path = os.path.join(save_dpath, 'final_model.pkl')
    pre_model_path = os.path.join(save_dpath, 'pre_train_best.pkl')

    if os.path.exists(final_model_path):
        print('Starting test...')
        test(model, test_loader)
        return

    train_loader = DataLoader(loadfasta("train"), batch_size=batch_size, shuffle=True, num_workers=8)
    valid_loader = DataLoader(loadfasta("valid"), batch_size=1, shuffle=False, num_workers=4)

    # Pre-train only when no pre-trained checkpoint is available yet.
    if not os.path.exists(pre_model_path):
        pre_train_loader = DataLoader(loadfasta("pre_train"), batch_size=batch_size, shuffle=True, num_workers=8)
        for epoch in range(epochs):
            print('##Epoch-%s' % epoch)
            train(model, pre_train_loader)
            valid(model, valid_loader, epoch, True)

    # Fine-tuning always starts from the best pre-trained weights.
    print('Load the pre-trained model...')
    # map_location lets a checkpoint saved on one device be loaded on another
    # (e.g. trained on CUDA, fine-tuned on CPU).
    model.load_state_dict(torch.load(pre_model_path, map_location=device))
    print('Starting fine-tuning...')
    for epoch in range(epochs):
        print('##Epoch-%s' % epoch)
        train(model, train_loader)
        valid(model, valid_loader, epoch, False)

    print('Starting test...')
    test(model, test_loader)
def train(model, train_loader):
    """Train ``model`` for one epoch and print the mean accuracy and loss.

    Uses the module-level ``optimizer``, ``criterion``, ``lr_scheduler`` and
    ``device``.

    Args:
        model: network to optimise.
        train_loader: iterable yielding ``(sequence, label)`` batches.
    """
    model.train()
    train_loss = 0
    epoch_acc = 0
    for sequence, label in train_loader:
        label = label.to(torch.long)
        optimizer.zero_grad()
        output = model(sequence.to(torch.int).to(device))
        # A 2-D output means the batch dimension was dropped (single-sample
        # batch). Restore it BEFORE computing the loss, otherwise the class
        # dimension is misread and the shapes no longer match the labels.
        if output.dim() == 2:
            output = output.unsqueeze(0)
        loss = criterion(output, label.to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25)
        optimizer.step()
        train_loss += loss.item()
        # Class dimension is dim 1; detach so no autograd history is kept.
        predicted = output.detach().argmax(dim=1)
        epoch_acc += getAccuracy(predicted.to('cpu'), label.to('cpu'))
    # Stepped once per call, i.e. once per epoch.
    lr_scheduler.step()
    print('Train Accuracy: ', epoch_acc / len(train_loader), ' Train Loss', train_loss / len(train_loader),
          datetime.datetime.now() - start)
def valid(model, valid_loader, epoch, pretrain):
    """Evaluate ``model`` on the validation set and save checkpoints.

    Args:
        model: network to evaluate.
        valid_loader: iterable yielding ``(sequence, label)`` batches.
        epoch: current epoch index, used in the per-epoch checkpoint name.
        pretrain: when true, save ``pretrain_<epoch>.pkl`` and
            ``pretrain_last.pkl``, plus ``pre_train_best.pkl`` if the
            validation accuracy beats every value recorded so far. When
            false, overwrite ``final_model.pkl`` with the current weights.
    """
    model.eval()
    val_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for sequence, label in valid_loader:
            label = label.to(torch.long)
            output = model(sequence.to(torch.int).to(device))
            # Restore the batch dimension dropped for single-sample batches.
            if output.dim() == 2:
                output = output.unsqueeze(0)
            loss = criterion(output, label.to(device))
            val_loss += loss.item()
            predicted = output.argmax(dim=1)
            # Both tensors go to the CPU: the accuracy helper reads elements
            # one by one, which is slow on an accelerator tensor.
            epoch_acc += getAccuracy(predicted.to('cpu'), label.to('cpu'))
    mean_acc = epoch_acc / len(valid_loader)
    print('Val Accuracy: ', mean_acc, ' Val Loss', val_loss / len(valid_loader))

    # save model
    if pretrain:
        torch.save(model.state_dict(), os.path.join(save_dpath, 'pretrain_{}.pkl'.format(epoch)))
        torch.save(model.state_dict(), os.path.join(save_dpath, 'pretrain_last.pkl'))
        if mean_acc > max(accuracy):
            torch.save(model.state_dict(), os.path.join(save_dpath, 'pre_train_best.pkl'))
        accuracy.append(mean_acc)
    else:
        # NOTE: overwritten on every fine-tuning epoch, so the file holds the
        # weights of the most recent epoch, not necessarily the best one.
        torch.save(model.state_dict(), os.path.join(save_dpath, 'final_model.pkl'))
def test(model, test_loader):
    """Load ``final_model.pkl`` and print accuracy, F1, precision and recall.

    All metrics are averaged over the batches of ``test_loader``.

    Args:
        model: network whose weights are replaced by the saved final model.
        test_loader: iterable yielding ``(sequence, label)`` batches.
    """
    # Load the checkpoint once, not once per sample. map_location lets a
    # checkpoint saved on another device be used on the current one.
    model.load_state_dict(torch.load(os.path.join(save_dpath, 'final_model.pkl'), map_location=device))
    model.eval()
    test_acc = 0
    f1 = 0
    precision = 0
    recall = 0
    with torch.no_grad():
        for sequence, label in test_loader:
            label = label.to(torch.long).to('cpu')
            output = model(sequence.to(torch.int).to(device))
            # Restore the batch dimension dropped for single-sample batches.
            if output.dim() == 2:
                output = output.unsqueeze(0)
            predicted = output.argmax(dim=1).to('cpu')
            test_acc += getAccuracy(predicted, label)
            batch_f1, batch_precision, batch_recall = get_score(predicted, label)
            f1 += batch_f1
            precision += batch_precision
            recall += batch_recall
    print('Test Accuracy: ', test_acc / len(test_loader), '\nF1 score', f1 / len(test_loader),
          '\nPrecision score', precision / len(test_loader),
          '\nRecall score', recall / len(test_loader), datetime.datetime.now() - start)
def getAccuracy(output, label):
    """Return the mean per-sequence accuracy of a batch.

    For each row, only the positions before the first label equal to 3 are
    scored; everything from the first 3 onwards is ignored. A row's accuracy
    is the fraction of scored positions where ``output`` equals ``label``.

    Args:
        output: 2-D integer tensor of predicted classes.
        label: 2-D integer tensor of reference classes, same layout.

    Returns:
        float: average of the per-row accuracies. Rows with no scored
        position count as 0.0, and an empty batch yields 0.0 (both cases
        previously raised ZeroDivisionError).
    """
    if label.size(0) == 0:
        return 0.0
    # Tolerate callers that pass tensors living on different devices, and
    # predictions that are larger than the label tensor.
    output = output.to(label.device)[:label.size(0), :label.size(1)]
    # cumprod turns "is not padding" into "no padding seen so far", which
    # reproduces stopping at the first 3 in each row.
    scored = (label != 3).to(torch.long).cumprod(dim=1).bool()
    total = scored.sum(dim=1)
    correct = ((output == label) & scored).sum(dim=1)
    per_sequence = correct.to(torch.float64) / total.clamp(min=1).to(torch.float64)
    return per_sequence.mean().item()
def get_score(output, label):
    """Return the batch-averaged weighted F1, precision and recall.

    Each row is scored only up to (and excluding) its first label equal to 3.
    The previous slice ``[:j + 1]`` included that first 3 whenever one was
    present, so the metrics were computed with a spurious extra class.

    Args:
        output: 2-D integer tensor of predicted classes.
        label: 2-D integer tensor of reference classes, same layout.

    Returns:
        tuple: ``(f1, precision, recall)``, each the sum of the per-row
        scores divided by the number of rows. Rows with no scored position
        contribute 0; an empty batch yields ``(0.0, 0.0, 0.0)``.
    """
    batch = label.size(0)
    if batch == 0:
        return 0.0, 0.0, 0.0
    f1 = 0
    precision = 0
    recall = 0
    for i in range(batch):
        # Number of scored positions = index of the first 3, or the full row
        # length when the row contains no 3.
        pad_positions = (label[i] == 3).nonzero(as_tuple=True)[0]
        length = int(pad_positions[0]) if pad_positions.numel() else len(label[i])
        if length == 0:
            continue
        y_true = label[i][:length]
        y_pred = output[i][:length]
        f1 += f1_score(y_true, y_pred, average='weighted', zero_division=1)
        precision += precision_score(y_true, y_pred, average='weighted', zero_division=1)
        recall += recall_score(y_true, y_pred, average='weighted', zero_division=1)
    return f1 / batch, precision / batch, recall / batch
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
network代码:
import torch.nn as nn
import torch.nn.functional as F
class ResidueEmbedding(nn.Embedding):
    """Embedding table for residue indices.

    A thin ``nn.Embedding`` subclass that only supplies different parameter
    names and default sizes.
    """

    def __init__(self, vocab_size=21, embed_size=128, padding_idx=None):
        """Create a table with ``vocab_size`` rows of ``embed_size`` values each."""
        super().__init__(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=padding_idx,
        )
class GRUnet(nn.Module):
    """Embedding -> recurrent stack -> linear layer -> log-softmax.

    Args:
        lstm_hdim: hidden size of each recurrent layer.
        embed_size: size of the residue embedding vectors.
        num_layers: number of stacked recurrent layers.
        bidirectional: whether the recurrent stack runs in both directions.
        lstm: use ``nn.LSTM`` when true, ``nn.GRU`` otherwise.
        outsize: number of output classes.

    The recurrent stack was previously hard-coded to ``nn.GRU(128, 1024,
    num_layers=3, bidirectional=True)``, which ignored these arguments and
    broke for any non-default ``lstm_hdim`` or ``embed_size``. The default
    arguments build exactly that network, so existing checkpoints still load.
    """

    def __init__(self, lstm_hdim=1024, embed_size=128, num_layers=3, bidirectional=True, lstm=False, outsize=4):
        super().__init__()
        self.lstm_hdim = lstm_hdim
        self.embed = ResidueEmbedding(vocab_size=22, embed_size=embed_size, padding_idx=21)
        recurrent_cls = nn.LSTM if lstm else nn.GRU
        # The attribute keeps the name "lstm" for either cell type so that
        # state_dict keys stay the same.
        self.lstm = recurrent_cls(embed_size, lstm_hdim, num_layers=num_layers,
                                  bidirectional=bidirectional, batch_first=True, dropout=0)
        # A bidirectional stack concatenates both directions' hidden states.
        num_directions = 2 if bidirectional else 1
        self.outlayer = nn.Linear(lstm_hdim * num_directions, outsize)
        self.finalact = F.log_softmax

    def forward(self, x):
        """Return per-position class log-probabilities for index tensor ``x``.

        ``x`` is indexed as (batch, position) because the recurrent stack is
        built with ``batch_first=True``. The result is permuted to
        (batch, class, position) and then squeezed.
        """
        x = self.embed(x)
        x, _ = self.lstm(x)
        x = self.outlayer(x)
        x = self.finalact(x, dim=-1)
        x = x.permute(0, 2, 1)
        # NOTE: squeeze() drops every size-1 dimension, so a batch of one
        # sample comes back 2-D; callers have to restore the batch dimension.
        return x.squeeze()
class S4PRED(nn.Module):
    """Wrapper module that owns one ``GRUnet`` and delegates to it."""

    def __init__(self):
        super().__init__()
        # Built with GRUnet's default arguments.
        self.model_1 = GRUnet()

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        return self.model_1(x)
参数输入代码arg.py
import argparse
def getArgparse(argv=None):
    """Parse the command-line options and return them as a dict.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            the arguments are taken from ``sys.argv``.

    Returns:
        dict with the keys ``device``, ``batch_size``, ``learn_rate``,
        ``epochs`` and ``save_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', metavar='device', type=str, default='cpu',
                        help='Device to run on, Either: cpu or cuda (default; cpu)')
    parser.add_argument('--batch_size', metavar='batch_size', type=int, default=64,
                        help='This is batch_size (default; 64)',
                        )
    parser.add_argument('--learn_rate', metavar='learn_rate', type=float, default=0.0001,
                        help='this is learn_rate (default; 0.0001)')
    parser.add_argument('--epochs', metavar='epochs', type=int, default=10,
                        help='this is epochs (default; 10)')
    parser.add_argument('--save_path', metavar='save_path', type=str, default='model/',
                        help='this is for save model path (default; model/)')
    args = parser.parse_args(argv)
    return vars(args)
# CUDA_VISIBLE_DEVICES=1 python main.py --device cuda --batch_size 64 --learn_rate 0.0001 --epochs 10 --save_path model/first/
数据导入代码:
import numpy as np
import torch
import torch.utils.data as data_utils
# Sequence file and label file for each dataset split.
_DATASET_FILES = {
    'pre_train': ('dataset/pseudo/pseudo_seq.txt', 'dataset/pseudo/pseudo_lab.txt'),
    'train': ('dataset/train/train_seq.txt', 'dataset/train/train_lab.txt'),
    'valid': ('dataset/valid/valid_seq.txt', 'dataset/valid/valid_lab.txt'),
    'test': ('dataset/test/cb513_seq.txt', 'dataset/test/cb513_lab.txt'),
}


def loadfasta(type):
    """Load one dataset split as a ``TensorDataset`` of (sequence, label) rows.

    Args:
        type: split name, one of 'pre_train', 'train', 'valid' or 'test'.
            The name shadows the builtin; it is kept so existing keyword
            callers continue to work.

    Returns:
        ``TensorDataset`` pairing each row of the sequence file with the
        matching row of the label file.

    Raises:
        ValueError: if ``type`` is not a known split (this used to return
            ``None`` silently).
    """
    try:
        seq_path, lab_path = _DATASET_FILES[type]
    except KeyError:
        raise ValueError(
            'unknown dataset type %r; expected one of %s' % (type, sorted(_DATASET_FILES))
        ) from None
    # ndmin=2 keeps a file holding a single row 2-D, so it is still treated
    # as one sample instead of one sample per number.
    seq = np.loadtxt(seq_path, ndmin=2)
    lab = np.loadtxt(lab_path, ndmin=2)
    return data_utils.TensorDataset(torch.tensor(seq), torch.tensor(lab))