读书笔记 Bioinformatics Algorithms Chapter5

Chapter5  HOW DO WE COMPARE DNA SEQUENCES 

Bioinformatics Algorithms-An_Active Learning Approach

http://bioinformaticsalgorithms.com/

 
一、
1983年,Russell Doolitte 将血小板源生长因子[platelet derived growth factor(PDGF),一种刺激细胞增殖的物质]和其它已知基因比对,发现它的序列和原癌基因(oncogene)的序列有很高的相似度。这让科学家猜测某些癌症是因为基因在不合适的时机作用所致(scientists hypothesized that some forms of cancer might be caused by a good gene doing the right thing at the wrong time.)。
二、提出问题 序列比对:动态规划法
 
1.全局比对:
状态转移方程
 
  1 '''
  2 Code Challenge: Solve the Global Alignment Problem.
  3      Input: Two protein strings written in the single-letter amino acid alphabet.
  4      Output: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the
  5     BLOSUM62 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
  6 ----------
  7 Sample Input:
  8 PLEASANTLY
  9 MEANLY
 10 ----------
 11 Sample Output:
 12 8
 13 PLEASANTLY
 14 -MEA--N-LY
 15 ----------
 16 @ Lo Kowngho  2018.9
 17 '''
 18 import numpy
 19 from os.path import dirname
 20 
 21 def Grade(Symb1,Symb2):
 22     Indx1 = symbolList[Symb1]
 23     Indx2 = symbolList[Symb2]
 24     return matrix[Indx1][Indx2]
 25     
 26 def Init_Graph_Global(l1,l2):
 27     Graph = numpy.zeros([l2,l1])
 28     for x in range(1,l2):
 29         Graph[x][0] = Graph[x-1][0]-5
 30     for y in range(1,l1):
 31         Graph[0][y] = Graph[0][y-1]-5
 32     return Graph
 33         
 34 def Init_Path(l1,l2):
 35     Path = numpy.zeros([l2,l1])
 36     for x in range(1,l2):
 37         Path[x][0] = 1
 38     for y in range(1,l1):
 39         Path[0][y] = 2
 40     return Path
 41 
 42 def buildGlobalAlignmentGraph(text1,text2):
 43     
 44     l1 = len(text1)
 45     l2 = len(text2)
 46     Graph = Init_Graph_Global(l1, l2)
 47     Path = Init_Path(l1, l2)
 48         
 49     for x in range(1,l2):
 50         for y in range(1,l1):
 51             Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[y],text2[x]))
 52             if Graph[x-1][y]-5==Graph[x][y]:
 53                 Path[x][y]=1
 54             elif Graph[x][y-1]-5==Graph[x][y]:
 55                 Path[x][y]=2
 56             else:
 57                 Path[x][y]=3
 58     return [Graph,Path]
 59     
 60     
 61 def OutputGlobalAligement(Path,Graph,text1,text2):
 62     outT1 = ''
 63     outT2 = ''
 64     l1 = len(text1)
 65     l2 = len(text2)
 66     x = l2-1
 67     y = l1-1
 68     while(x!=0 or y!=0):
 69         if Path[x][y]==1:
 70             outT1 += '-'
 71             outT2 += text2[x]
 72             x -= 1            
 73         elif Path[x][y]==2:
 74             outT1 += text1[y]
 75             outT2 += '-'
 76             y -= 1
 77         else:
 78             outT1 += text1[y]
 79             outT2 += text2[x]
 80             x -= 1
 81             y -= 1
 82     return [outT1[::-1],outT2[::-1]]    
 83     
 84 def ImportScoreMatrix():
 85     dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n')
 86     symbolList = dataset[0].split()
 87     for i in range(len(symbolList)):
 88         symbolList[i]=[symbolList[i],i]
 89     symbolList = dict(symbolList)
 90     print(symbolList)
 91     matrix = []
 92     for i in range(1,len(dataset)):
 93         matrix.append(dataset[i].split()[1:])
 94     for l in range(len(matrix)):
 95         for i in range(len(matrix[l])):
 96             matrix[l][i]=int(matrix[l][i])
 97     return [matrix,symbolList]
 98 
 99 if __name__ == '__main__':
100     
101     [matrix,symbolList] = ImportScoreMatrix()
102 
103     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
104     text1 = '0'+dataset[0]
105     text2 = '0'+dataset[1]
106     
107     [Graph,Path] = buildGlobalAlignmentGraph(text1, text2)
108     
109     [outT1,outT2] = OutputGlobalAligement(Path,Graph,text1,text2)
110     
111     print(int(Graph[-1][-1]))
112     print(outT1)
113     print(outT2)
全局比对 python
 
2. 局部比对
可以把局部比对想象成下面的Free Taxi场景,在开始和结尾都不受罚分约束,只在中间的某一过程受罚分约束。
              
在全局比对的基础上,状态转移方程在加上一个0,表示每一个点,既可以由→、↓、 ↘经过罚分得到,也可以直接由起点,不经罚分得到(Free Taxi)。
  1 '''
  2 Code Challenge: Solve the Local Alignment Problem.
  3      Input: Two protein strings written in the single-letter amino acid alphabet.
  4      Output: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum
  5      score. Use the PAM250 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
  6 ---------------
  7 Sample Input:
  8 MEANLY
  9 PENALTY
 10 ---------------
 11 Sample Output:
 12 15
 13 EANL-Y
 14 ENALTY
 15 ---------------
 16 Lo Kwongho 2018.9
 17 '''
 18 import numpy
 19 from os.path import dirname
 20 
 21 def Grade(Symb1,Symb2):
 22     Indx1 = symbolList[Symb1]
 23     Indx2 = symbolList[Symb2]
 24     return matrix[Indx1][Indx2]
 25 
 26 def Init_Graph_Local(l1,l2):
 27     Graph = numpy.zeros([l1,l2])
 28     return Graph
 29         
 30 def Init_Path(l1,l2):
 31     Path = numpy.ones([l1,l2])*-1
 32     for x in range(1,l1):
 33         Path[x][0] = 1
 34     for y in range(1,l2):
 35         Path[0][y] = 2
 36     return Path
 37 
 38 def buildLocalAlignmentGraph(text1,text2):
 39     l1 = len(text1)
 40     l2 = len(text2)
 41     Graph = Init_Graph_Local(l1, l2)
 42     Path = Init_Path(l1, l2)
 43     
 44     for x in range(1,l1):
 45         for y in range(1,l2):
 46             Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[x],text2[y]),0)
 47             if Graph[x-1][y]-5 == Graph[x][y]:
 48                 Path[x][y] = 1
 49             elif Graph[x][y-1]-5==Graph[x][y]:
 50                 Path[x][y] = 2
 51             elif Graph[x][y] == 0:
 52                 Path[x][y] = 0
 53             else:
 54                 Path[x][y] = 3
 55     maxVal = 0
 56     maxIndx = [-1,-1]
 57     for x in range(1,l1):
 58         for y in range(1,l2):
 59             if Graph[x][y]>maxVal:
 60                 maxVal=Graph[x][y]
 61                 maxIndx=[x,y]
 62                 
 63     return [Graph,Path,maxIndx]
 64 
 65 def OutputLocalAligement(Path,Graph,text1,text2,maxIndx):
 66     outT1 = ''
 67     outT2 = ''
 68     l1 = len(text1)
 69     l2 = len(text2)
 70     x = maxIndx[0]
 71     y = maxIndx[1]
 72     while(x!=0 or y!=0):
 73         if Path[x][y]==1:
 74             outT1 += text1[x]
 75             outT2 += '-'
 76             x -= 1            
 77         elif Path[x][y]==2:
 78             outT1 += '-'
 79             outT2 += text2[y]
 80             y -= 1
 81         elif Path[x][y]==3:
 82             outT1 += text1[x]
 83             outT2 += text2[y]
 84             x -= 1
 85             y -= 1
 86         else:
 87             x=0
 88             y=0
 89     return [outT1[::-1],outT2[::-1]]    
 90 
 91 
 92 def ImportScoreMatrix():
 93     dataset = open(dirname(__file__)+'PAM250.txt').read().strip().split('\n')
 94     symbolList = dataset[0].split()
 95     for i in range(len(symbolList)):
 96         symbolList[i]=[symbolList[i],i]
 97     symbolList = dict(symbolList)
 98     matrix = []
 99     for i in range(1,len(dataset)):
100         matrix.append(dataset[i].split()[1:])
101     for l in range(len(matrix)):
102         for i in range(len(matrix[l])):
103             matrix[l][i]=int(matrix[l][i])
104     return [matrix,symbolList]
105     
106 if __name__ == '__main__':
107     [matrix,symbolList] = ImportScoreMatrix()
108     
109     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
110     text1 = '0'+dataset[0]
111     text2 = '0'+dataset[1]
112     
113     [Graph,Path,maxIndx] = buildLocalAlignmentGraph(text1,text2)
114     
115     [outT1,outT2]=OutputLocalAligement(Path,Graph,text1,text2,maxIndx)
116     print(int(Graph[maxIndx[0]][maxIndx[1]]))
117     print(outT1)
118     print(outT2)
局部比对 Python
3. Overlarp Alignment
  1 '''
  2 Code Challenge: Solve the Overlap Alignment Problem.
  3    >>Input: Two strings v and w, each of length at most 1000.
  4    >>Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of v and a prefix w' of w.
  5     achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2.
  6 -------------------
  7 Sample Input:
  8 PAWHEAE
  9 HEAGAWGHEE
 10 -------------------
 11 Sample Output:
 12 1
 13 HEAE
 14 HEAG
 15 -------------------
 16 coder: Lo Kwongho
 17 '''
 18 
 19 import numpy
 20 from os.path import dirname
 21 
 22 def Init_Graph_Overlap(l1,l2):
 23     Graph = numpy.zeros([l1,l2])
 24     for y in range(1,l2):
 25         Graph[0][y] = Graph[0][y-1]-1
 26     return Graph
 27         
 28 def Init_Path(l1,l2):
 29     Path = numpy.ones([l1,l2])*-1
 30     for x in range(1,l1):
 31         Path[x][0] = 1
 32     for y in range(1,l2):
 33         Path[0][y] = 2
 34     return Path
 35 
 36 def buildOverlapAlignmentGraph(text1,text2):
 37     l1 = len(text1)
 38     l2 = len(text2)
 39     Graph = Init_Graph_Overlap(l1, l2)
 40     Path = Init_Path(l1,l2)
 41     for x in range(1,l1):
 42         for y in range(1,l2):
 43             if text1[x]==text2[y]:
 44                 Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-2,Graph[x][y-1]-2)
 45             else:
 46                 Graph[x][y]=max(Graph[x-1][y-1]-2,Graph[x-1][y]-2,Graph[x][y-1]-2)
 47             if Graph[x][y]==Graph[x-1][y]-2:
 48                 Path[x][y]=1
 49             elif Graph[x][y]==Graph[x][y-1]-2:
 50                 Path[x][y]=2
 51             else:
 52                 Path[x][y]=3
 53         
 54     maxVal = 0
 55     maxIndx = -1
 56     for i in range(l2):
 57         if Graph[l1-1][i]>maxVal:
 58             maxVal=Graph[l1-1][i]
 59             maxIndx=i
 60                     
 61     return [Graph,Path,maxIndx,maxVal]    
 62     
 63 def OutputOverlapAligement(Path,Graph,text1,text2,maxIndx):
 64     outT1 = ''
 65     outT2 = ''
 66     l1 = len(text1)
 67     l2 = len(text2)
 68     x = l1-1
 69     y = maxIndx
 70     while(y!=0):
 71         if Path[x][y]==1:
 72             outT1 += text1[x]
 73             outT2 += '-'
 74             x -= 1            
 75         elif Path[x][y]==2:
 76             outT1 += '-'
 77             outT2 += text2[y]
 78             y -= 1
 79         elif Path[x][y]==3:
 80             outT1 += text1[x]
 81             outT2 += text2[y]
 82             x -= 1
 83             y -= 1
 84         else:
 85             x=0
 86             y=0
 87     return [outT1[::-1],outT2[::-1]]    
 88         
 89 
 90 if __name__ == '__main__':
 91     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
 92     text1 = '0'+dataset[0]
 93     text2 = '0'+dataset[1]
 94     l1 = len(text1)
 95     l2 = len(text2)
 96     [Graph,Path,maxIndx,maxVal] = buildOverlapAlignmentGraph(text1,text2)
 97     #print(Graph)
 98         
 99     [outText1,outText2]=OutputOverlapAligement(Path, Graph, text1, text2, maxIndx)
100 
101     print(int(maxVal))
102     print(outText1)
103     print(outText2)
Overlarp in python
4.Fitting Alignment 
  1 '''
  2 Fitting Alignment Problem: Construct a highest-scoring fitting alignment between two strings.
  3    >>Input: Strings v and w as well as a matrix Score.
  4    >>Output: A highest-scoring fitting alignment of v and w as defined by the scoring matrix Score.
  5 -------------------
  6 Sample Input:
  7 GTAGGCTTAAGGTTA
  8 TAGATA
  9 -------------------
 10 Sample Output:
 11 2
 12 TAGGCTTA
 13 TAGA--TA
 14 -------------------
 15 coder: Lo Kwongho
 16 '''
 17 
 18 import numpy
 19 from os.path import dirname
 20 
 21 def Init_Graph_Fiting(l1,l2):
 22     Graph = numpy.zeros([l1,l2])
 23     for y in range(1,l2):
 24         Graph[0][y] = Graph[0][y-1]-1
 25     return Graph
 26         
 27 def Init_Path(l1,l2):
 28     Path = numpy.ones([l1,l2])*-1
 29     for x in range(1,l1):
 30         Path[x][0] = 1
 31     for y in range(1,l2):
 32         Path[0][y] = 2
 33     return Path
 34 
 35 def buildFittingAlignmentGraph(text1,text2):
 36     l1 = len(text1)
 37     l2 = len(text2)
 38     Graph = Init_Graph_Fiting(l1, l2)
 39     Path = Init_Path(l1,l2)
 40     for x in range(1,l1):
 41         for y in range(1,l2):
 42             if text1[x]==text2[y]:
 43                 Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-1,Graph[x][y-1]-1)
 44             else:
 45                 Graph[x][y]=max(Graph[x-1][y-1]-1,Graph[x-1][y]-1,Graph[x][y-1]-1)
 46             if Graph[x][y]==Graph[x-1][y]-1:
 47                 Path[x][y]=1
 48             elif Graph[x][y]==Graph[x][y-1]-1:
 49                 Path[x][y]=2
 50             else:
 51                 Path[x][y]=3
 52     
 53     maxVal = 0
 54     maxIndx = -1
 55     for i in range(l1):
 56         if Graph[i][l2-1]>maxVal:
 57             maxVal=Graph[i][l2-1]
 58             maxIndx=i
 59                         
 60     return [Graph,Path,maxIndx,maxVal]    
 61     
 62 def OutputFittingAligement(Path,Graph,text1,text2,maxIndx):
 63     outT1 = ''
 64     outT2 = ''
 65     l1 = len(text1)
 66     l2 = len(text2)
 67     x = maxIndx
 68     y = l2-1
 69     while(y!=0):
 70         if Path[x][y]==1:
 71             outT1 += text1[x]
 72             outT2 += '-'
 73             x -= 1            
 74         elif Path[x][y]==2:
 75             outT1 += '-'
 76             outT2 += text2[y]
 77             y -= 1
 78         elif Path[x][y]==3:
 79             outT1 += text1[x]
 80             outT2 += text2[y]
 81             x -= 1
 82             y -= 1
 83         else:
 84             x=0
 85             y=0
 86     return [outT1[::-1],outT2[::-1]]    
 87         
 88 
 89 if __name__ == '__main__':
 90     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
 91     text1 = '0'+dataset[0]
 92     text2 = '0'+dataset[1]
 93     l1 = len(text1)
 94     l2 = len(text2)
 95     [Graph,Path,maxIndx,maxVal] = buildFittingAlignmentGraph(text1,text2)
 96     
 97     [outText1,outText2]=OutputFittingAligement(Path, Graph, text1, text2, maxIndx)
 98     #print(Graph)
 99     print(int(maxVal))
100     print(outText1)
101     print(outText2)
Fitting Alignment in python
这四种比对的关系如图:
 
全局比对                    局部比对
Overlarp Alignment                 Fitting Alignment
5、基因的插入和删除,通常都是连续的一段,故在比对出现的连续空位,应该把它们当作一个整体看待。在比对的空位罚分中,生物学家认为,在每一条链上新开一个空位,应罚重分,而空位的延续,罚分应较少:
解决问题的方法是:开三个矩阵,每个矩阵代表一种方向。在→、↓方向上行走,代表产生空位。故每当从↘转移到→、↓,或者→、↓间转移,代表在某链上产生新空位,重罚,而在→、↓内转移,代表空位延续,轻罚。
 
                     
  1 '''
  2 Code Challenge: Solve the Alignment with Affine Gap Penalties Problem.
  3    >>Input: Two amino acid strings v and w (each of length at most 100).
  4    >>Output: The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the
  5      BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1.
  6 ---------------------
  7 Sample Input:
  8 PRTEINS
  9 PRTWPSEIN
 10 ---------------------
 11 Sample Output:
 12 8
 13 PRT---EINS
 14 PRTWPSEIN-
 15 ---------------------
 16 coder: Lo Kwongho
 17 '''
 18 import numpy
 19 from os.path import dirname
 20 negINFINITY = -999
 21 #Penalties
 22 gs = -10 #gap_Start
 23 ge = -1  #gap_Extend
 24 #
 25 def Grade(Symb1,Symb2):
 26     Indx1 = symbolList[Symb1]
 27     Indx2 = symbolList[Symb2]
 28     return matrix[Indx1][Indx2]
 29 
 30 def initGraph(l1,l2):
 31     Graph = [numpy.zeros([l1,l2]    ,dtype=int) for i in range(3)]
 32 
 33     Graph[1][0][0] = negINFINITY
 34     Graph[2][0][0] = negINFINITY
 35     for x in range(1,l1):
 36         Graph[0][x][0]=negINFINITY
 37         if x==1:
 38             Graph[1][x][0]=ge+gs
 39         else:
 40             Graph[1][x][0]=Graph[1][x-1][0]+ge
 41         Graph[2][x][0]=negINFINITY
 42     for y in range(1,l2):
 43         Graph[0][0][y]=negINFINITY
 44         if y ==1:
 45             Graph[2][0][y]=ge+gs
 46         else:
 47             Graph[2][0][y]=Graph[2][0][y-1]+ge
 48         Graph[1][0][y]=negINFINITY
 49     return Graph
 50     
 51 def Init_Path(l1,l2):
 52     Path = [numpy.ones([l1,l2])*-1 for i in range(3)]
 53     '''for x in range(1,l1):
 54         Path[x][0] = 1
 55     for y in range(1,l2):
 56         Path[0][y] = 2'''
 57     return Path
 58     
 59 def buildAlignmentGraph(text1,text2,l1,l2):
 60 
 61     Graph = initGraph(l1,l2)
 62     #Path = #Init_Path(l1,l2)
 63     for x in range(1,l1):
 64         for y in range(1,l2):                
 65             # X ######
 66             Graph[1][x][y]=max(gs+ge+Graph[0][x-1][y],gs+ge+Graph[2][x-1][y],ge+Graph[1][x-1][y])
 67 
 68                 
 69             # Y ###### 
 70             Graph[2][x][y]=max(gs+ge+Graph[0][x][y-1],gs+ge+Graph[1][x][y-1],ge+Graph[2][x][y-1])
 71 
 72             # M ######
 73             Graph[0][x][y]=Grade(text1[x], text2[y])+max(Graph[0][x-1][y-1],Graph[1][x-1][y-1],Graph[2][x-1][y-1])
 74 
 75     maxVal = 0
 76     maxIndx = -1
 77     for i in range(3):
 78         if Graph[i][l1-1][l2-1]>maxVal:
 79             maxVal=Graph[i][l1-1][l2-1]
 80             maxIndx=i
 81     return [Graph,maxIndx,maxVal]
 82 
 83 def trackBack(Graph,maxIndx,text1,text2):
 84     x = len(text1)-1
 85     y = len(text2)-1
 86     otext1 = ''
 87     otext2 = ''
 88     Indx = maxIndx
 89     while(1):
 90         if Indx==0:
 91             otext1 += text1[x]
 92             otext2 += text2[y]
 93             if x ==1:
 94                 break
 95             if Graph[0][x][y]==Graph[1][x-1][y-1]+Grade(text1[x], text2[y]):
 96                 Indx = 1
 97             elif Graph[0][x][y]==Graph[2][x-1][y-1]+Grade(text1[x], text2[y]):
 98                 Indx = 2
 99             else:
100                 Indx = 0
101             x -= 1
102             y -= 1
103         elif Indx==1:
104             otext1 += text1[x]
105             otext2 += '-'
106             if x == 1:
107                 break
108             if Graph[1][x][y]==Graph[0][x-1][y]+ge+gs:
109                 Indx = 0
110             elif Graph[1][x][y]==Graph[2][x-1][y]+ge+gs:
111                 Indx = 2
112             else:
113                 Indx = 1
114             x -= 1
115         else:
116             otext1 += '-'
117             otext2 += text2[y]
118             if y == 1:
119                 break
120             if Graph[2][x][y]==Graph[0][x][y-1]+ge+gs:
121                 Indx = 0
122             elif Graph[2][x][y]==Graph[1][x][y-1]+ge+gs:
123                 Indx = 1
124             else:
125                 Indx = 2
126             y -= 1
127                 
128     return [otext1[::-1],otext2[::-1]]
129         
130 def ImportScoreMatrix():
131     dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n')
132     symbolList = dataset[0].split()
133     for i in range(len(symbolList)):
134         symbolList[i]=[symbolList[i],i]
135     symbolList = dict(symbolList)
136     matrix = []
137     for i in range(1,len(dataset)):
138         matrix.append(dataset[i].split()[1:])
139     for l in range(len(matrix)):
140         for i in range(len(matrix[l])):
141             matrix[l][i]=int(matrix[l][i])
142     return [matrix,symbolList]
143 
144 
145 if __name__ == '__main__':
146     [matrix,symbolList] = ImportScoreMatrix() # 打分矩阵
147     
148     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
149     text1 = '0'+dataset[0]
150     text2 = '0'+dataset[1]
151     l1 = len(text1)
152     l2 = len(text2)
153     [Graph,maxIndx,maxVal] = buildAlignmentGraph(text1, text2, l1, l2)
154     #print(Graph)
155     
156     [output_text1,output_text2] = trackBack(Graph,maxIndx,text1,text2)
157     print(maxVal)
158     print(output_text1)
159     print(output_text2)
160     
Alignment with Affine Gap Penalties
6 * 一种线性空间的比对方法 Space-Efficient Sequence Alignment(分治+动态规划)
https://www.cs.rit.edu/~rlaz/algorithms20082/slides/SpaceEfficientAlignment.pdf
  1 '''
  2 Code Challenge: Implement LinearSpaceAlignment to solve the Global Alignment Problem for a large dataset.
  3 >>>Input: Two long (10000 amino acid) protein strings written in the single-letter amino acid alphabet.
  4 >>>Output: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the
  5 
  6 BLOSUM62 scoring matrix and indel penalty σ = 5.
  7 ------------
  8 Sample Input:
  9 PLEASANTLY
 10 MEANLY
 11 ------------
 12 Sample Output:
 13 8
 14 PLEASANTLY
 15 -MEA--N-LY
 16 ------------
 17 coder: Lo Kwongho in 2018.9
 18 '''
 19 from os.path import dirname
 20 import numpy
 21 #
 22 indel = -5
 23 negINF = -9999
 24 #
 25 #
 26 def Grade(Symb1,Symb2):
 27     Indx1 = symbolList[Symb1]
 28     Indx2 = symbolList[Symb2]
 29     return matrix[Indx1][Indx2]
 30 
 31 def ImportScoreMatrix():
 32     dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n')
 33     symbolList = dataset[0].split()
 34     for i in range(len(symbolList)):
 35         symbolList[i]=[symbolList[i],i]
 36     symbolList = dict(symbolList)
 37     matrix = []
 38     for i in range(1,len(dataset)):
 39         matrix.append(dataset[i].split()[1:])
 40     for l in range(len(matrix)):
 41         for i in range(len(matrix[l])):
 42             matrix[l][i]=int(matrix[l][i])
 43     return [matrix,symbolList]
 44 #
 45 def half_Alignment(v,w):
 46     nv = len(v)
 47     mw = len(w)
 48     s = numpy.zeros(shape=(nv+1,2),dtype=int)
 49     for i in range(nv+1):
 50         s[i,0] = indel*i
 51     if mw==0:
 52         return s[:,0] #
 53     for j in range(mw):
 54         s[0,1]=s[0,0]+indel
 55         for i in range(nv):
 56             s[i+1,1]=max(s[i,1]+indel,s[i+1,0]+indel,s[i,0]+Grade(w[j],v[i]))
 57         s[:,0]=s[:,1]
 58     return s[:,1]
 59 
 60 def midEdge(v,w):
 61     nv = len(v)
 62     mw = len(w)
 63     mid = int((mw-1)/2)
 64     wl = w[:mid]
 65     wr = w[mid+1:]
 66     pre = half_Alignment(v,wl)
 67     suf = half_Alignment(v[::-1],wr[::-1])[::-1]
 68     hs = [pre[i]+suf[i]+indel  for i in range(nv+1)]
 69     ds = [pre[i]+suf[i+1]+Grade(w[mid],v[i])  for i in range(nv)]
 70     maxhs = max(hs)
 71     maxds = max(ds)
 72     if maxhs>maxds:
 73         return ( (hs.index(maxhs),mid) ,(hs.index(maxhs),mid+1) )
 74     else:
 75         return ( (ds.index(maxds),mid) ,(ds.index(maxds)+1,mid+1) )
 76 
 77 def build_Alignment_track(v,w):
 78     vn = len(v)
 79     wm = len(w)
 80     if vn==0 and wm==0:
 81         return []
 82     elif vn==0:
 83         return ['-']*wm
 84     elif wm==0:
 85         return ['|']*vn
 86     ((x1,y1),(x2,y2)) = midEdge(v,w)
 87     if x1==x2:
 88         edge = ['-']
 89     else:
 90         edge = ['\\']
 91     wleft = w[:y1]
 92     wright = w[y2:]
 93     vupper = v[:x1]
 94     vbotm = v[x2:]
 95 
 96     upper_left_track = build_Alignment_track(vupper,wleft)
 97     bottom_right_track = build_Alignment_track(vbotm,wright)
 98     return upper_left_track+edge+bottom_right_track
 99 
100 def trackToString(v,w,track):
101     vi = 0
102     wj = 0
103     outv = ''
104     outw = ''
105     score = 0
106     for i in track:
107         if i == '|':
108             outv += v[vi]
109             outw += '-'
110             score += indel
111             vi += 1    
112         elif i == '-':
113             outv += '-'
114             outw += w[wj]
115             score += indel
116             wj += 1            
117         else:
118             outv += v[vi]
119             outw += w[wj]
120             score += Grade(v[vi], w[wj])
121             vi += 1
122             wj += 1
123             
124     return [score,outv,outw]
125 
126 def LinearSpaceAlignment(v,w):
127     track = build_Alignment_track(v,w)
128     [score,outv, outw] = trackToString(v,w,track)
129     print(score)
130     print(outv)
131     print(outw)
132 
133 if __name__ == '__main__':
134     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
135     [matrix,symbolList] = ImportScoreMatrix()
136     v = dataset[0]
137     w = dataset[1]
138     LinearSpaceAlignment(v,w)
Linear-Space Alignment

猜你喜欢

转载自www.cnblogs.com/lokwongho/p/9688047.html