生物信息010-Biopython操作DNA,RNA和蛋白质序列

如何将一条DNA编码序列翻译成蛋白质序列,并写入fasta文件

读入DNA序列

from Bio import Seq
from Bio.Alphabet import IUPAC
# Read the raw sequence text inside a context manager so the file handle is
# closed deterministically, then strip surrounding whitespace/newlines.
with open("data/hemoglobin-gene.txt") as dna_file:
    dna = dna_file.read().strip()

# Wrap the plain string in a Seq object tagged with the unambiguous DNA alphabet.
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; on newer releases
# omit the alphabet argument -- confirm against the installed version.
dna = Seq.Seq(dna, IUPAC.unambiguous_dna)
print(dna)
运行结果:
ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCGCACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACCAAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTGGCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTCACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAATACCGTTAA

翻译成蛋白质序列

from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# read the input sequence; the context manager closes the file even if
# reading fails
with open("data/hemoglobin-gene.txt") as dna_file:
    raw_dna = dna_file.read().strip()

# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; on newer releases
# omit the alphabet argument -- confirm against the installed version.
dna = Seq.Seq(raw_dna, IUPAC.unambiguous_dna)

# transcribe the DNA to mRNA, then translate the mRNA to a protein sequence
mrna = dna.transcribe()
protein = mrna.translate()
print(protein)

运行结果:
MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR*

写入fasta文件


"""
从Bio导入 4 个模块
Seq 用来创建序列对象
IUPAC用来定义一个序列对象用的生物字符集
SeqRecord 创建一个包含ID,注释,描述等的序列记录对象
SeqIO 提供了方法来读写格式化的序列文件
"""

from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# read the input sequence; `with` guarantees the input handle is closed
with open("data/hemoglobin-gene.txt") as dna_file:
    raw_dna = dna_file.read().strip()

# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; on newer releases
# omit the alphabet argument -- confirm against the installed version.
dna = Seq.Seq(raw_dna, IUPAC.unambiguous_dna)

# transcribe and translate
mrna = dna.transcribe()
protein = mrna.translate()

# wrap the protein in a SeqRecord so the FASTA header gets an id and a
# description
protein_record = SeqRecord(
    protein,
    id='sp|P69905.2|HBA_HUMAN',
    description="Hemoglobin subunit alpha, Homo sapiens",
)

# write the protein sequence to a file; the context manager flushes and
# closes the output even if SeqIO.write raises
with open("data/HBA_HUMAN.fasta", "w") as outfile:
    SeqIO.write(protein_record, outfile, "fasta")

运行后查看结果文件 HBA_HUMAN.fasta:
>sp|P69905.2|HBA_HUMAN Hemoglobin subunit alpha, Homo sapiens
MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG
KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP
AVHASLDKFLASVSTVLTSKYR*

Seq对象

from Bio import Seq
# Build a Seq object directly from a plain string and display it.
sequence_text = 'AGCATCGTAGCATGCAC'
my_seq = Seq.Seq(sequence_text)
print(my_seq)

运行结果:
AGCATCGTAGCATGCAC

转录和翻译序列

# By default the sequence is treated as the coding strand,
# so it can be transcribed directly.
from Bio import Seq

coding_strand = Seq.Seq('AGCATCGTAGCATGCAC')
print(coding_strand.transcribe())

运行结果:
AGCAUCGUAGCAUGCAC

# If the sequence is the template strand instead,
# take the reverse complement first and then transcribe it.
from Bio import Seq
from Bio.Alphabet import IUPAC

# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; on newer releases
# omit the alphabet argument -- confirm against the installed version.
template_strand = Seq.Seq('AGCATCGTAGCATGCAC', IUPAC.unambiguous_dna)

# The reverse complement of the template strand is the coding strand.
coding_strand = template_strand.reverse_complement()
print(coding_strand)
print(coding_strand.transcribe())

运行结果:
GTGCATGCTACGATGCT
GUGCAUGCUACGAUGCU

把序列当成字符串工作

# A Seq supports string-like operations: indexing, slicing, splitting,
# counting characters and substring search.

from Bio import Seq

# The literal contains a space, which counts toward len(dna).
dna = Seq.Seq('AGCATCGTAGCATGCAC GCATGCAC')
adenine_count = dna.count("A")

print(dna[0])                                   # single position
print(dna[:3])                                  # first three letters
print(dna.split('T'))                           # list of Seq pieces between each 'T'
print(adenine_count)                            # number of 'A' characters
print(adenine_count / float(len(dna)))          # fraction of 'A' over the whole length
print(dna.find('CGTA'))                         # position of the first match

运行结果:
A
AGC
[Seq('AGCA', Alphabet()), Seq('CG', Alphabet()), Seq('AGCA', Alphabet()), Seq('GCAC GCA', Alphabet()), Seq('GCAC', Alphabet())]
7
0.2692307692307692
5

用Bio.SeqIO 模块来解析一个多序列FASTA文件

from Bio import SeqIO

# Open the multi-record FASTA file in a context manager so the handle is
# closed even if parsing raises part-way through the file.
with open("data/Uniprot.fasta", "r") as fasta_file:
    # SeqIO.parse yields the records one at a time.
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record))
运行结果:
sp|P03372|ESR1_HUMAN
Seq('MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNY...ATV', SingleLetterAlphabet())
595
sp|P62333|PRS10_HUMAN
Seq('MADPRDKALQDYRKKLLEHKEIDGRLKELREQLKELTKQYEKSENDLKALQSVG...KPV', SingleLetterAlphabet())
389
sp|P62509|ERR3_MOUSE
Seq('MDSVELCLPESFSLHYEEELLCRMSNKDRHIDSSCSSFIKTEPSSPASLTDSVN...AKV', SingleLetterAlphabet())
458

用SeqIO模块来解析一个记录文件,将其内容储存到一个列表或字典中


from Bio import SeqIO

fasta_path = "data/Uniprot.fasta"
target_id = 'sp|P03372|ESR1_HUMAN'

# load every FASTA entry into a list and show the first one
record_list = list(SeqIO.parse(fasta_path, "fasta"))
first_record = record_list[0]
print(first_record.id)
print(first_record.seq)

print('-' * 40)

# parse the file a second time and index the entries by record id
records = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
selected = records[target_id]
print(selected.id)
print(selected.seq)

运行结果:
sp|P03372|ESR1_HUMAN
MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPYGPGSEAAAFGSNGLGGFPPLNSVSPSPLMLLHPPPQLSPFLQPHGQQVPYYLENEPSGYTVREAGPPAFYRPNSDNRRQGGRERLASTNDKGSMAMESAKETRYCAVCNDYASGYHYGVWSCEGCKAFFKRSIQGHNDYMCPATNQCTIDKNRRKSCQACRLRKCYEVGMMKGGIRKDRRGGRMLKHKRQRDDGEGRGEVGSAGDMRAANLWPSPLMIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTSRGGASVEETDQSHLATAGSTSSHSLQKYYITGEAEGFPATV
----------------------------------------
sp|P03372|ESR1_HUMAN
MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPYGPGSEAAAFGSNGLGGFPPLNSVSPSPLMLLHPPPQLSPFLQPHGQQVPYYLENEPSGYTVREAGPPAFYRPNSDNRRQGGRERLASTNDKGSMAMESAKETRYCAVCNDYASGYHYGVWSCEGCKAFFKRSIQGHNDYMCPATNQCTIDKNRRKSCQACRLRKCYEVGMMKGGIRKDRRGGRMLKHKRQRDDGEGRGEVGSAGDMRAANLWPSPLMIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTSRGGASVEETDQSHLATAGSTSSHSLQKYYITGEAEGFPATV

序列文件格式的转换


from Bio import SeqIO

# Convert a GenBank file to FASTA. Both handles are managed by `with`, so the
# input and the output file are closed even if parsing or writing fails.
with open("data/AY810830.gbk", "r") as genbank_file, \
        open("data/AY810830.fasta", "w") as output_file:
    # SeqIO.parse returns an iterator that SeqIO.write consumes; it must be
    # consumed while the input handle is still open.
    records = SeqIO.parse(genbank_file, "genbank")
    SeqIO.write(records, output_file, "fasta")

猜你喜欢

转载自blog.csdn.net/sunchengquan/article/details/79805780
今日推荐