保存igv的信息,igvtools接口的使用:将igvtools查看bam文件的结果保存为txt文件,并提取信息

IGV(Integrative Genomics Viewer)算是非常权威的bam文件查看器,igvtools是与它配套的命令行工具;我暂时没发现功能和表现与之类似的软件。
(此处原有一张图片:在IGV中打开bam文件的可视化截图)
上面的图片通常是你打开bam文件后看到的可视化结果。当你聚焦到某一个位置时,IGV会告诉你这一列有多少个A、T、C、G、N。现在你想把这些信息保存到txt文件——想先找找软件有没有这个功能?对不起,没有,通常只能截个图了事。不过要把这些信息保存成文本其实是有办法的,需要借助igvtools的命令行模式来完成:

./igvtools count -w 1 --bases bam result.wig fasta

运行时只需把上面命令中的bam替换为你的bam文件名,把fasta替换为你的参考序列(如hg19,或者某个特定的参考文件)。运行后会生成一个result.wig文件,文件名可以改,但扩展名要保持为.wig。

打开result.wig文件,内容如下:
(此处原有一张图片:result.wig文件内容的截图)
第一列是位置,后面依次是A、C、G、T、N的数量。这样,你在IGV里看到的可视化信息就保存到这个.wig文件里了,不放心的话可以打开IGV再核对一下。

接下来你可能想统计每个位置的突变频率,并显示参考fasta在该位置的碱基是什么。这个稍微复杂一些,代码中被注释掉的部分还包含另外几项统计信息。脚本依次接收四个命令行参数:参考fasta、bam文件、频率阈值和输出文件名,并读取当前目录下的result.wig。直接给出代码:

import os
import sys
import re
# Command-line arguments 1 and 2: path of the reference FASTA and of the BAM.
# `bam` is only referenced by the disabled igvtools invocation noted below;
# it is still read so that the argument positions stay the same.
fasta=sys.argv[1]
bam=sys.argv[2]
# result.wig (read further down) is expected to exist already. The call that
# used to create it from here is disabled:
#   os.system('./igvtools count -w 1 --bases '+bam+' result.wig '+fasta)

chrName=[]    # FASTA header lines in file order, leading '>' included
refSeq=[]     # one single-element list [sequence] per header, same order as chrName
nameRange=[]  # index (within `lines`) of every header, plus a closing sentinel

with open(fasta,'r') as f:
    # Strip first and drop whatever ends up empty, so whitespace-only lines
    # cannot reach the header test and a short final line is not lost.
    lines=[text for text in (raw.strip() for raw in f) if text]

for lineNo,text in enumerate(lines):
    if text.startswith('>'):
        chrName.append(text)
        nameRange.append(lineNo)
# Sentinel: lets consecutive pairs of nameRange delimit every record,
# including the last one.
nameRange.append(len(lines))

for headerIdx,nextHeaderIdx in zip(nameRange,nameRange[1:]):
    # All lines strictly between two headers form one sequence.
    refSeq.append([''.join(lines[headerIdx+1:nextHeaderIdx])])
# Columns of result.wig, kept as the raw strings read from the file.
# Only lines whose first token is all digits contribute to these lists.
A=[]
C=[]
G=[]
T=[]
N=[]
posLis=[] # second line
complexList=[]  # first whitespace-delimited token of every line of the file
maxLis=0        # number of lines read from the file
with open('result.wig','r') as f:
    for maxLis,record in enumerate(f,1):
        fields=re.split(r'\s+',record)
        leading=fields[0]
        complexList.append(leading)
        if not leading.isdigit():
            continue
        # Data row: position followed by the A, C, G, T and N counts.
        posLis.append(leading)
        A.append(fields[1])
        C.append(fields[2])
        G.append(fields[3])
        T.append(fields[4])
        N.append(fields[5])
# Line indices of the 'variableStep' entries in complexList, closed off with
# the total line count so that consecutive pairs bound one section each.
rangeLis=[lineIdx for lineIdx,token in enumerate(complexList) if token=='variableStep']
rangeLis.append(maxLis)

# Number of lines between each 'variableStep' line and the next one (or EOF).
resultLis=[upper-lower-1 for lower,upper in zip(rangeLis,rangeLis[1:])]

# Sequence name for every row: the n-th FASTA header repeated once per line
# of the n-th section.
chrLis=[] #first line
for sectionIdx,rowCount in enumerate(resultLis):
    chrLis.extend([chrName[sectionIdx]]*rowCount)
# Disabled code, kept as a bare string literal so it is never executed:
# it collects the indices of the 'variableStep' entries of complexList into
# a list named variableStep.
'''
variableStep=[]
indexV=0
for i in complexList:
    if i=='variableStep':
        variableStep.append(indexV)
    indexV+=1
'''
# One list per section: the first token of every line lying strictly between
# two consecutive entries of rangeLis.
sliceList=[complexList[begin+1:stop] for begin,stop in zip(rangeLis,rangeLis[1:])]

# Reference base (upper-cased) for every row collected in sliceList, in order.
# Each token is treated as a 1-based offset into the section's sequence.
# NOTE(review): the t-th section is paired with the t-th FASTA record purely
# by index; this presumably relies on result.wig containing one section per
# FASTA record, in the same order -- TODO confirm.
refLine=[]
for t,positions in enumerate(sliceList):
    sequence=refSeq[t][0]
    refLine+=[sequence[int(pos)-1].upper() for pos in positions]
# Disabled CpG / non-CpG statistics, kept as a bare string literal so it is
# never executed. For every row whose reference base is 'C' it would store
# T/(T+C) in Cpg when the next reference base is 'G' and in NonCpg otherwise,
# using 'Nan' where T+C is zero.
'''
Cpg=[0]*len(A) # ninth line
NonCpg=[0]*len(A) #tenth line

CpgPos=[]
NonCpgPos=[]

i=0
while i<len(A)-1:
    if refLine[i]=='C' and refLine[i+1]=='G':
        CpgPos.append(i)
    elif refLine[i]=='C' and refLine[i+1]!='G':
        NonCpgPos.append(i)
    i+=1

#print(CpgPos)
#print(NonCpgPos)

for i in CpgPos:
    try:
        Cpg[i]=float(T[i])/(float(T[i])+float(C[i]))
    except ZeroDivisionError:
        Cpg[i]='Nan'
for i in NonCpgPos:
    try:
        NonCpg[i]=float(T[i])/(float(T[i])+float(C[i]))
    except ZeroDivisionError:
        NonCpg[i]='Nan'
'''
# Command-line argument 4: path of the tab-separated report to write.
file=sys.argv[4]
table=[]
filterList=[]  # row indices for which no frequency can be computed
# Count columns by base letter, and the three non-reference bases per reference base.
dic={'A':A,'C':C,'T':T,'G':G}
helpDic={'A':['T','C','G'],'T':['A','G','C'],'C':['A','T','G'],'G':['A','C','T']}
# Command-line argument 3: frequency threshold.
par=float(sys.argv[3])
print(par)


def _altFrequency(idx):
    """Return the read share of the most frequent non-reference base at row idx.

    The value is max(count of the three non-reference bases) divided by
    A+T+C+G+N. None is returned when idx is outside the table, when the
    reference base is not one of A/C/G/T (for example 'N'), or when the
    total count is zero, i.e. whenever no frequency can be computed.
    """
    if idx<0 or idx>=len(A):
        return None
    alternatives=helpDic.get(refLine[idx])
    if alternatives is None:
        return None
    depth=float(A[idx])+float(T[idx])+float(C[idx])+float(G[idx])+float(N[idx])
    if depth==0:
        return None
    return max(float(dic[base][idx]) for base in alternatives)/depth


def _aboveThreshold(idx):
    """Tell whether row idx exists and has a computable frequency greater than par."""
    if idx<0 or idx>=len(snpFrequency):
        return False
    freq=snpFrequency[idx]
    return freq is not None and freq>par


# One entry per row, so that snpFrequency[i] always belongs to row i
# (None marks rows without a computable frequency).
snpFrequency=[_altFrequency(idx) for idx in range(len(A))]

with open(file,'w') as f:
    f.write('\t'.join(['seqName','pos','ref','A','T','C','G','N','frequency'])+'\n')
    for i,freq in enumerate(snpFrequency):
        if freq is None:
            filterList.append(i)
            continue
        # No non-reference reads at all, or not above the threshold: skip.
        if freq==0 or freq<=par:
            continue
        # A row is reported only when at least one adjacent row of the table
        # is above the threshold as well. Adjacent means the previous/next
        # row as listed, and a missing neighbour counts as "not above".
        if not (_aboveThreshold(i-1) or _aboveThreshold(i+1)):
            continue
        f.write('\t'.join([chrLis[i],posLis[i],refLine[i],A[i],T[i],C[i],G[i],N[i],str(freq)])+'\n')


# Disabled code, kept as a bare string literal so it is never executed:
# builds one tab-separated row per position (an empty string for positions
# listed in filterList) and collects the rows in table.
'''
for i in range(len(A)):
    finallStr=''
    #finallStr=chrLis[i]+'\t'+posLis[i]+'\t'+refLine[i]+'\t'+A[i]+'\t'+T[i]+'\t'+C[i]+'\t'+G[i]+'\t'+N[i]+'\t'+str(Cpg[i])+'\t'+str(NonCpg[i])+'\n'
    
    if i not in filterList:
        finallStr=chrLis[i]+'\t'+posLis[i]+'\t'+refLine[i]+'\t'+A[i]+'\t'+T[i]+'\t'+C[i]+'\t'+G[i]+'\t'+N[i]+'\t'+str(snpFrequency[i])+'\n'
    
    #finallStr=chrLis[i]+'\t'+posLis[i]+'\t'+refLine[i]+'\t'+A[i]+'\t'+T[i]+'\t'+C[i]+'\t'+G[i]+'\t'+N[i]+'\t'+str(snpFrequency[i])+'\n'
    table.append(finallStr)
'''

# Disabled code, kept as a bare string literal so it is never executed:
# a writer that emits the header line followed by one row for every position
# that is not listed in filterList.
'''
with open(file,'w') as f:
    #f.write('seqName'+'\t'+'pos'+'\t'+'ref'+'\t'+'A'+'\t'+'T'+'\t'+'C'+'\t'+'G'+'\t'+'N'+'\t'+'Cpg'+'\t'+'NonCpg'+'\t'+'\n')
    f.write('seqName'+'\t'+'pos'+'\t'+'ref'+'\t'+'A'+'\t'+'T'+'\t'+'C'+'\t'+'G'+'\t'+'N'+'\t'+'frequency'+'\n')
    for i in range(len(A)):
        if i not in filterList:
            f.write(chrLis[i]+'\t'+posLis[i]+'\t'+refLine[i]+'\t'+A[i]+'\t'+T[i]+'\t'+C[i]+'\t'+G[i]+'\t'+N[i]+'\t'+str(snpFrequency[i])+'\n')
    #f.writelines(table)

'''

发布了8 篇原创文章 · 获赞 2 · 访问量 2032

猜你喜欢

转载自blog.csdn.net/a_giant_pig/article/details/103011731