python 提取NCBI上的CDS

# 输入文件 sequence.gb:从 https://www.ncbi.nlm.nih.gov/nuccore/NC_000006.12?report=genbank 下载的 GenBank 格式记录

# 输入文件 position.txt:每行一个整数位点(脚本将其作为 CDS 序列的下标使用)

 

#生成的文件

HLA_CDS.fa(提取出的 CDS 序列)
postion_HLA_seq.fa(每个位点附近的窗口序列:上游 50 个碱基至下游 49 个碱基)

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# Extract a CDS from a GenBank record and cut fixed-size windows around a
# list of positions.
#
# Inputs : sequence.gb   - a GenBank file holding exactly one record
#          position.txt  - one integer position per line (0-based index into
#                          the extracted CDS sequence)
# Outputs: HLA_CDS.fa          - the extracted CDS sequence
#          postion_HLA_seq.fa  - one FASTA entry per unique position, covering
#                                [pos - 50, pos + 49) of the CDS

# SeqIO.read() requires the file to contain exactly one record.
records = SeqIO.read('sequence.gb', 'gb')

# Report the span of every CDS feature. The LAST CDS encountered is the one
# used below (unchanged behaviour); it is extracted once after the loop
# instead of extracting every CDS only to throw all but the last away.
last_cds = None
for feature in records.features:
    if feature.type == "CDS":
        # int() replaces the legacy ``.position`` attribute, which newer
        # Biopython releases no longer provide.
        print('{} {}'.format(int(feature.location.start),
                             int(feature.location.end)))
        last_cds = feature

if last_cds is None:
    # Previously this surfaced as a NameError on ``rec`` further down.
    raise ValueError('no CDS feature found in sequence.gb')

rec = SeqRecord(last_cds.location.extract(records).seq)
rec.id = 'HLA_CDS'

SeqIO.write(rec, 'HLA_CDS.fa', 'fasta')

# Spot check of three hard-coded positions.
# NOTE: raises IndexError if the CDS is shorter than 603 bases.
print('{} {} {}'.format(rec.seq[362], rec.seq[96], rec.seq[602]))

# Read the positions: skip blank lines, de-duplicate on the integer value and
# sort so that the output order is deterministic from run to run.
with open('position.txt') as fp1:
    pos_uniq = sorted({int(line) for line in fp1 if line.strip()})
print(len(pos_uniq))

seq_len = len(rec.seq)
with open('postion_HLA_seq.fa', 'w') as fp:
    for pos in pos_uniq:
        # Clamp both window edges into [0, seq_len]. Without this a position
        # below 50 produces a negative slice start, which Python interprets
        # as "count from the end" and silently drops the upstream flank.
        start = min(max(pos - 50, 0), seq_len)
        end = min(max(pos + 49, 0), seq_len)
        tmpRes = SeqRecord(rec.seq[start:end])
        # The header reports the coordinates actually written.
        tmpRes.id = 'HLA_POS_CDS_{}: {}-{}'.format(pos, start, end)
        tmpRes.description = ''
        SeqIO.write(tmpRes, fp, 'fasta')

猜你喜欢

转载自blog.csdn.net/Cassiel60/article/details/89397022
今日推荐