Welcome to my CSDN: https://spike.blog.csdn.net/
This article address: https://blog.csdn.net/caroline_wendy/article/details/131461900
Introduction to the MetaPredict algorithm:
Intrinsically disordered regions (IDRs) are ubiquitous across all domains of life and play a variety of functional roles. While folded domains are generally well described by a single three-dimensional structure, IDRs exist as a collection of interconverting states, known as an ensemble. This structural heterogeneity means that IDRs are largely absent from the PDB, which contributes to a lack of computational methods for predicting the conformational properties of ensembles from sequence. Here, we combine rational sequence design, large-scale molecular simulations, and deep learning to develop ALBATROSS, a deep learning model for predicting the ensemble dimensions of IDRs from sequence. ALBATROSS enables near-instantaneous, proteome-wide prediction of ensemble-average properties. ALBATROSS is lightweight and easy to use, and is available both as a locally installed software package and as a point-and-click cloud interface. We first demonstrate the applicability of our predictor by examining the generalizability of sequence-ensemble relationships in IDRs. We then exploit the high-throughput nature of ALBATROSS to characterize the emergent biophysical behavior of IDRs within and between proteomes.
Tool used: metapredict
-
GitHub:metapredict: A machine learning-based tool for predicting protein disorder.
-
Usage documentation: https://metapredict.readthedocs.io/en/latest/
Paper:Direct prediction of intrinsically disordered protein conformational properties from sequence
- Update time 2023-05-28
Online interface: https://metapredict.net/
The reference test document comes from @惠望: https://wiki.biomap-int.com/pages/viewpage.action?pageId=100631778
1. Project configuration
Test file: T1157s1_A1029.fasta, from CASP15:
>A
DRVRALRRETVEMFYYGFDNYMKVAFPEDELRPVSCTPLTRDLKNPRNFELNDVLGNYSLTLIDSLSTLAILASAPAEDSGTGPKALRDFQDGVAALVEQYGDGRPGPSGVGRRARGFDLDSKVQVFETVIRGVGGLLSAHLFAIGALPITGYQPLRQEDDLFNPPPIPWPNGFTYDGQLLRLALDLAQRLLPAFYTKTGLPYPRVNLRHGIPFYVNSPLHEDPPAKGTTEGPPEITETCSAGAGSLVLEFTVLSRLTGDPRFEQAAKRAFWAVWYRKSQIGLIGAGVDAEQGHWIGTYSVIGAGADSFFEYALKSHILLSGHALPNQTHPSPLHKDVNWMDPNTLFEPLSDAENSAESFLEAWHHAHAAIKRHLYSEREHPHYDNVNLWTGSLVSHWVDSLGAYYSGLLVLAGEVDEAIETNLLYAAIWTRYAALPERWSLREKTVEGGLGWWPLRPEFIESTYHLYRATKDPWYLYVGEMVLRDITRRCWTPCGWAGLQNVLSGEKSDRMESFFLGETTKYMYLLFDDDHPLNKLDASFVFTTEGHPLILPKPKSARRSRNSPRSSQKALTVYQGEGFTNSCPPRPSITPLSGSVIAARDDIYHPARMVDLHLLTTSKHALDGGQMSGQHMAKSNYTLYPWTLPPELLPSNGTCAKVYQPHEVTLEFASNTQQVLGGSAFNFMLSGQNLERLSTDRIRVLSLSGLKITLQLVEEGEREWRVTKLNGIPLGRDEYVVINRAILGDVSDPRFNLVRDPVIAKLQQLHQVNLLDDTTTEEHPDNLDTLDTASAIDLPQDQSSDSEVPDPANLSALLPDLSSFVKSLFARLSNLTSPSPDPSSNLPLNVVINQTAILPTGIGAAPLPPAASNSPSGAPIPVFGPVPESLFPWKTIYAAGEACAGPLPDSAPRENQVILIRRGGCSFSDKLANIPAFTPSEESLQLVVVVSDDEHEGQSGLVRPLLDEIQHTPGGMPRRHPIAMVMVGGGETVYQQLSVASAIGIQRRYYIESSGVKVKNIIVDDGDGGVDG
Install the Python package:
pip install metapredict==2.61
import metapredict as meta
2. Function call
2.1 Core function Predict Disorder Batch
Test: compute the per-residue disorder probability, a value from 0 to 1; the larger the value, the more likely the residue is disordered. With a threshold of 0.5, the scores are binarized to 0 and 1, where 1 indicates disorder and 0 indicates folded.
def find_disorder_domains(disorder_flags, min_span=2):
    """
    Collapse a per-residue 0/1 disorder mask into disordered intervals.

    :param disorder_flags: sequence of per-residue flags; 1 marks a disordered
        residue, any other value ends the current run.
    :param min_span: minimum value of ``end - start`` for a run to be kept
        (the default of 2 keeps runs of at least three residues).
    :return: list of ``[start, end]`` pairs, 0-based and inclusive on both ends.
    """
    runs = []
    run_start = None
    last_idx = -1
    for last_idx, flag in enumerate(disorder_flags):
        if flag == 1:
            if run_start is None:
                run_start = last_idx
        elif run_start is not None:
            runs.append((run_start, last_idx - 1))
            run_start = None
    # A run that reaches the end of the sequence has no trailing 0 to close it.
    # Flush it here, otherwise a disordered C-terminal tail is silently dropped.
    if run_start is not None:
        runs.append((run_start, last_idx))
    return [[start, end] for start, end in runs if end - start >= min_span]


def predict_batch(seq_list, threshold=0.5):
    """
    Predict per-residue disorder for one or more sequences and extract the
    disordered intervals.

    :param seq_list: a list of sequences; any non-list value is wrapped into a
        one-element list.
    :param threshold: residues whose disorder score is strictly greater than
        this value are flagged as disordered (1), all others as folded (0).
    :return: one ``[sequence, disorder_flags, domains]`` entry per input, where
        ``domains`` is a list of 0-based inclusive ``[start, end]`` intervals.
    """
    if not isinstance(seq_list, list):
        seq_list = [seq_list]
    output = meta.predict_disorder_batch(seq_list)
    assert len(seq_list) == len(output)
    res_list = []
    for sample in output:
        # Each entry is indexed as (sequence, per-residue disorder scores).
        sample_disorder = sample[1]
        print(f"disorder range: {np.min(sample_disorder)}~{np.max(sample_disorder)}")
        sample_disorder_idx = list(np.where(sample_disorder > threshold, 1, 0))
        print(f"sample_disorder_idx: {sample_disorder_idx}")
        # Collect the disordered intervals.
        domain = find_disorder_domains(sample_disorder_idx)
        # seq, disorder_idx, domain
        res_list.append([sample[0], sample_disorder_idx, domain])
    return res_list
Here, the threshold of 0.5 is the value suggested by the algorithm:
Altering the disorder threshold - To alter the disorder threshold, simply set
disorder_threshold=my_value
where my_value
is a float. The higher the threshold value, the more conservative metapredict will be for designating a region as disordered. Default = 0.5 (V2) and 0.42 (legacy).
output:
disorder range: 0.0~0.9868000149726868
seq:
DRVRALRRETVEMFYYGFDNYMKVAFPEDELRPVSCTPLTRDLKNPRNFELNDVLGNYSLTLIDSLSTLAILASAPAEDSGTGPKALRDFQDGVAALVEQYGDGRPGPSGVGRRARGFDLDSKVQVFETVIRGVGGLLSAHLFAIGALPITGYQPLRQEDDLFNPPPIPWPNGFTYDGQLLRLALDLAQRLLPAFYTKTGLPYPRVNLRHGIPFYVNSPLHEDPPAKGTTEGPPEITETCSAGAGSLVLEFTVLSRLTGDPRFEQAAKRAFWAVWYRKSQIGLIGAGVDAEQGHWIGTYSVIGAGADSFFEYALKSHILLSGHALPNQTHPSPLHKDVNWMDPNTLFEPLSDAENSAESFLEAWHHAHAAIKRHLYSEREHPHYDNVNLWTGSLVSHWVDSLGAYYSGLLVLAGEVDEAIETNLLYAAIWTRYAALPERWSLREKTVEGGLGWWPLRPEFIESTYHLYRATKDPWYLYVGEMVLRDITRRCWTPCGWAGLQNVLSGEKSDRMESFFLGETTKYMYLLFDDDHPLNKLDASFVFTTEGHPLILPKPKSARRSRNSPRSSQKALTVYQGEGFTNSCPPRPSITPLSGSVIAARDDIYHPARMVDLHLLTTSKHALDGGQMSGQHMAKSNYTLYPWTLPPELLPSNGTCAKVYQPHEVTLEFASNTQQVLGGSAFNFMLSGQNLERLSTDRIRVLSLSGLKITLQLVEEGEREWRVTKLNGIPLGRDEYVVINRAILGDVSDPRFNLVRDPVIAKLQQLHQVNLLDDTTTEEHPDNLDTLDTASAIDLPQDQSSDSEVPDPANLSALLPDLSSFVKSLFARLSNLTSPSPDPSSNLPLNVVINQTAILPTGIGAAPLPPAASNSPSGAPIPVFGPVPESLFPWKTIYAAGEACAGPLPDSAPRENQVILIRRGGCSFSDKLANIPAFTPSEESLQLVVVVSDDEHEGQSGLVRPLLDEIQHTPGGMPRRHPIAMVMVGGGETVYQQLSVASAIGIQRRYYIESSGVKVKNIIVDDGDGGVDG
disorder list:
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
disordered domains:
[[224, 230], [553, 570], [772, 810], [832, 880]]
2.2 Disorder Domains (Official)
The disorder values predicted by predict_disorder_domains are consistent with those of predict_disorder_batch, but the criteria for selecting domains are different and the resulting ranges are larger.
source code:
# Run the domain-level prediction and dump every field of the returned object.
output = meta.predict_disorder_domains(seq)
print(output)
print(f"disorder: {output.disorder}")
print(f"disordered_domain_boundaries: {output.disordered_domain_boundaries}")
# Report the score range inside each disordered domain; the boundaries are
# used directly as slice limits on the per-residue scores.
for s, e in output.disordered_domain_boundaries:
    idr_scores = output.disorder[s:e]
    print(f"disorder: {np.min(idr_scores)} ~ {np.max(idr_scores)}")
print(f"folded_domain_boundaries: {output.folded_domain_boundaries}")
print(f"disordered_domains: {output.disordered_domains}")
print(f"folded_domains: {output.folded_domains}")
output:
DisorderObject for sequence with 1029 residues, 2 IDRs, and 3 folded domains
Available dot variables are:
.sequence
.disorder
.disordered_domain_boundaries
.folded_domain_boundaries
.disordered_domains
.folded_domains
disorder: [0.5593 0.5207 0.4646 ... 0.7331 0.7067 0.6378]
disordered_domain_boundaries: [[553, 571], [772, 880]]
disorder: 0.5241 ~ 0.854
disorder: 0.2163 ~ 0.9868
folded_domain_boundaries: [[0, 553], [571, 772], [880, 1029]]
disordered_domains: ['KPKSARRSRNSPRSSQKA', 'DDTTTEEHPDNLDTLDTASAIDLPQDQSSDSEVPDPANLSALLPDLSSFVKSLFARLSNLTSPSPDPSSNLPLNVVINQTAILPTGIGAAPLPPAASNSPSGAPIPVF']
folded_domains: ['DRVRALRRETVEMFYYGFDNYMKVAFPEDELRPVSCTPLTRDLKNPRNFELNDVLGNYSLTLIDSLSTLAILASAPAEDSGTGPKALRDFQDGVAALVEQYGDGRPGPSGVGRRARGFDLDSKVQVFETVIRGVGGLLSAHLFAIGALPITGYQPLRQEDDLFNPPPIPWPNGFTYDGQLLRLALDLAQRLLPAFYTKTGLPYPRVNLRHGIPFYVNSPLHEDPPAKGTTEGPPEITETCSAGAGSLVLEFTVLSRLTGDPRFEQAAKRAFWAVWYRKSQIGLIGAGVDAEQGHWIGTYSVIGAGADSFFEYALKSHILLSGHALPNQTHPSPLHKDVNWMDPNTLFEPLSDAENSAESFLEAWHHAHAAIKRHLYSEREHPHYDNVNLWTGSLVSHWVDSLGAYYSGLLVLAGEVDEAIETNLLYAAIWTRYAALPERWSLREKTVEGGLGWWPLRPEFIESTYHLYRATKDPWYLYVGEMVLRDITRRCWTPCGWAGLQNVLSGEKSDRMESFFLGETTKYMYLLFDDDHPLNKLDASFVFTTEGHPLILP', 'LTVYQGEGFTNSCPPRPSITPLSGSVIAARDDIYHPARMVDLHLLTTSKHALDGGQMSGQHMAKSNYTLYPWTLPPELLPSNGTCAKVYQPHEVTLEFASNTQQVLGGSAFNFMLSGQNLERLSTDRIRVLSLSGLKITLQLVEEGEREWRVTKLNGIPLGRDEYVVINRAILGDVSDPRFNLVRDPVIAKLQQLHQVNLL', 'GPVPESLFPWKTIYAAGEACAGPLPDSAPRENQVILIRRGGCSFSDKLANIPAFTPSEESLQLVVVVSDDEHEGQSGLVRPLLDEIQHTPGGMPRRHPIAMVMVGGGETVYQQLSVASAIGIQRRYYIESSGVKVKNIIVDDGDGGVDG']
2.3 Predict pLDDT
metapredict also supports predicting per-residue pLDDT values, which can be used to evaluate the expected quality of the structure predicted for the sequence.
source code:
# Predict the pLDDT scores for the sequence and report their mean.
output = meta.predict_pLDDT(seq)
print(f"mean_plddt: {np.mean(output)}")
2.4 Drawing Graph Disorder
Call graph_disorder directly to draw the plot:
# Draw the disorder plot for the sequence; pLDDT_scores=True presumably adds
# the predicted pLDDT scores to the same figure (see the plot description below).
meta.graph_disorder(seq, pLDDT_scores=True)
Disorder Scores are negatively correlated with pLDDT, the plot is as follows:
3. Test structure
Using ChimeraX to test the structure, the command script is as follows:
@classmethod
def get_chimerax_select_cmd(cls, seq_list, mod_num="1"):
    """
    Build one ChimeraX ("Chimera" is pronounced [kaɪˈmɪrə]) ``select`` command
    per input sequence, covering every predicted disordered domain.

    Example of a generated command:
        select #1:553-571 #1:772-880

    :param seq_list: sequence or list of sequences, forwarded to ``predict_batch``.
    :param mod_num: model number placed after ``#`` in each residue range.
    :return: list of command strings, one per sequence.
    """
    # NOTE(review): the ranges are the 0-based indices produced by
    # predict_batch; confirm they match the residue numbering of the structure.
    commands = []
    for res in cls.predict_batch(seq_list):
        specs = [f"#{mod_num}:{start}-{end}" for start, end in res[2]]
        commands.append(" ".join(["select"] + specs))
    return commands
output:
ChimeraX: select #1:224-230 #1:553-570 #1:772-810 #1:832-880
Test the structure, the green highlight is the disordered area:
other
reference:
About ChatGPT's translation prompt:
您好,如果你准备好了,请回答“是”,以下是一段蛋白质相关的段落,你作为一个生物学家,帮忙翻译成中文。"[需要翻译的段落]"
The source code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (c) 2022. All rights reserved.
Created by C. L. Wang on 2023/6/29
"""
import copy
import os
import metapredict as meta
import numpy as np
from protein_utils.seq_utils import get_seq_from_fasta
from root_dir import DATA_DIR
class SeqIdrPredictor(object):
    """
    Predict the intrinsically disordered regions (IDRs) of protein sequences
    with metapredict.

    Requires: pip install metapredict==2.61
    """

    def __init__(self):
        pass

    @staticmethod
    def _find_disorder_domains(disorder_flags, min_span=2):
        """
        Collapse a per-residue 0/1 disorder mask into disordered intervals.

        :param disorder_flags: sequence of per-residue flags; 1 marks a
            disordered residue, any other value ends the current run.
        :param min_span: minimum value of ``end - start`` for a run to be kept
            (the default of 2 keeps runs of at least three residues).
        :return: list of ``[start, end]`` pairs, 0-based and inclusive.
        """
        runs = []
        run_start = None
        last_idx = -1
        for last_idx, flag in enumerate(disorder_flags):
            if flag == 1:
                if run_start is None:
                    run_start = last_idx
            elif run_start is not None:
                runs.append((run_start, last_idx - 1))
                run_start = None
        # A run that reaches the end of the sequence has no trailing 0 to
        # close it. Flush it here, otherwise a disordered C-terminal tail is
        # silently dropped.
        if run_start is not None:
            runs.append((run_start, last_idx))
        return [[start, end] for start, end in runs if end - start >= min_span]

    @staticmethod
    def predict_batch(seq_list, threshold=0.5):
        """
        Core function: predict per-residue disorder for one or more sequences
        and extract the disordered intervals.

        :param seq_list: a list of sequences; any non-list value is wrapped
            into a one-element list.
        :param threshold: residues whose disorder score is strictly greater
            than this value are flagged as disordered (1), all others as 0.
        :return: one ``[sequence, disorder_flags, domains]`` entry per input,
            where ``domains`` holds 0-based inclusive ``[start, end]`` pairs.
        """
        if not isinstance(seq_list, list):
            seq_list = [seq_list]
        output = meta.predict_disorder_batch(seq_list)
        assert len(seq_list) == len(output)
        res_list = []
        for sample in output:
            # Each entry is indexed as (sequence, per-residue disorder scores).
            sample_disorder = sample[1]
            print(f"disorder range: {np.min(sample_disorder)}~{np.max(sample_disorder)}")
            sample_disorder_idx = list(np.where(sample_disorder > threshold, 1, 0))
            # Collect the disordered intervals.
            domains = SeqIdrPredictor._find_disorder_domains(sample_disorder_idx)
            # seq, disorder_idx, domain
            res_list.append([sample[0], sample_disorder_idx, domains])
        return res_list

    @classmethod
    def get_chimerax_select_cmd(cls, seq_list, mod_num="1"):
        """
        Build one ChimeraX ("Chimera" is pronounced [kaɪˈmɪrə]) ``select``
        command per input sequence, covering every predicted disordered domain.

        Example of a generated command:
            select #1:553-571 #1:772-880

        :param seq_list: sequence or list of sequences, forwarded to
            ``predict_batch``.
        :param mod_num: model number placed after ``#`` in each residue range.
        :return: list of command strings, one per sequence.
        """
        # NOTE(review): the ranges are the 0-based indices produced by
        # predict_batch; confirm they match the residue numbering of the
        # structure. With no domains the command is a bare "select" -- confirm
        # that this is what the caller wants.
        res_list = cls.predict_batch(seq_list)
        r_str_list = []
        for res in res_list:
            domains = res[2]
            c_list = ["select"]
            for domain in domains:
                c_list.append(f"#{mod_num}:{domain[0]}-{domain[1]}")
            r_str_list.append(" ".join(c_list))
        return r_str_list

    @staticmethod
    def predict_disorder_domains(seq, is_print=False):
        """
        Run metapredict's own domain prediction on a single sequence.

        :param seq: the sequence to analyse.
        :param is_print: when true, print the returned object and each of its
            fields, plus the min/max disorder score inside every disordered
            domain.
        :return: the object returned by ``meta.predict_disorder_domains``.
        """
        # Predict once and reuse the result for printing; the original code
        # recomputed the same prediction a second time when is_print was set.
        output = meta.predict_disorder_domains(seq)
        if is_print:
            print(output)
            print(f"disorder: {output.disorder}")
            print(f"disordered_domain_boundaries: {output.disordered_domain_boundaries}")
            for boundary in output.disordered_domain_boundaries:
                s = boundary[0]
                e = boundary[1]
                print(f"disorder: {np.min(output.disorder[s:e])} ~ {np.max(output.disorder[s:e])}")
            print(f"folded_domain_boundaries: {output.folded_domain_boundaries}")
            print(f"disordered_domains: {output.disordered_domains}")
            print(f"folded_domains: {output.folded_domains}")
        return output
def main():
    """
    Demo run on the CASP15 target T1157s1: print the thresholded disorder
    prediction, the metapredict domain prediction, the mean pLDDT, draw the
    disorder graph, and print the ChimeraX selection command.
    """
    fasta_path = os.path.join(
        DATA_DIR, "CASP15-Monomer-Targets-56", "fasta", "T1157s1_A1029.fasta"
    )
    seq = get_seq_from_fasta(fasta_path)[0]
    predictor = SeqIdrPredictor()

    # Only one sequence is submitted, so only the first result is reported.
    first_result = predictor.predict_batch(seq)[0]
    print(f"seq:\n{first_result[0]}")
    print(f"disorder list:\n{first_result[1]}")
    print(f"disordered domains:\n{first_result[2]}")

    predictor.predict_disorder_domains(seq, is_print=True)

    plddt_scores = meta.predict_pLDDT(seq)
    print(f"mean_plddt: {np.mean(plddt_scores)}")

    meta.graph_disorder(seq, pLDDT_scores=True)

    chimerax_cmds = predictor.get_chimerax_select_cmd(seq)
    print(f"ChimeraX: {chimerax_cmds[0]}")


if __name__ == '__main__':
    main()