-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathProgDynOptim.py
executable file
·268 lines (229 loc) · 11.2 KB
/
ProgDynOptim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#!/usr/bin/env python3
#*****************************************************************************
# Name: MTG-Link
# Description: Local assembly tool for linked-reads data
# Copyright (C) 2020 INRAE
# Author: Anne Guichard
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#*****************************************************************************
"""Module 'ProgDynOptim.py': class of an optimized dynamic programmation
The module 'ProgDynOptim.py' contains the class of an optimized dynamic programmation, that is used to determine the alignment between two sequences overlapping.
But we determine the alignment after the seed sequence so it is equivalent to a global alignment.
"""
class DynamicMatrixOptim:
"""The class 'DynamicMatrixOptim' contains all the attributes, properties and methods to create a DynamicMatrixOptim object.
Optimisation of DynamicMatrix to get the edit distance of the global alignment of sequences G (genome) and R (read)
The score system {match, mismatch, gap} is fixed to get the edit distance : {0, 1, 1}
Defines global alignment functions
Optimisations:
* constrained in a band of width 2*dmax+1
* keeping in memory only 2 lines of size 2*dmax+3 (+ 3 because: size of band = 2*dmax+1, +1 first cell and +1 last cell (both fixed to MAX value)
* does not backtrack, instead stores for each cell, the starting position of G (optional for the "OLC" program)
"""
def __init__(self, G, R, dmax):
'''Defines and stores initial values'''
self.G = G
self.R = R
self.match = 0
self.mismatch = 1
self.gap = 1
self.dmax = dmax
self.MAX = 150
# These two lines store the edit distances.
## First line (prevLine here): gap penalties.
self.prevLine = [i - (self.dmax+1) for i in range(2*self.dmax+3)]
self.nextLine = [0 for i in range(2*self.dmax+3)]
# Set both the first and last cells to MAX value (to stop recurrence at the extremities of the line).
self.prevLine[0] = self.MAX
self.prevLine[2*self.dmax+2] = self.MAX
self.nextLine[0] = self.MAX
self.nextLine[2*self.dmax+2] = self.MAX
# # These two lines store the starting position of R.
# self.prevStart = [- self.dmax + i-1 for i in range(2*self.dmax+3)]
# self.nextStart = [- self.dmax + i-1 for i in range(2*self.dmax+3)]
def score(self, s, t):
'''Method to get the alignment score of two nucleotides (s and t)'''
if s == t: return self.match
else: return self.mismatch
def getEditDistanceAndGenomePosition(self):
'''Method to return the edit distance (optimal score on last line = scoreMin)
and beginning position of the alignment on the sequence G (0-based) (optional for the "OLC" program)'''
'''It "fills" the matrix in global mode, but constrained in a band of width 2*dmax+1 and keeping in memory only 2 lines of size 2*dmax+3'''
# Loop on genome positions.
for i in range(1,len(self.G)+1): #i-1: position in genome (i: line position on full matrix in basic ProgDyn_SuffixPrefix)
j = i - self.dmax #j-1: postion in read (j: beginning position in self.nextLine)
p = 1 #p: position in self.prevLine
# At the beginning, we have lots of cells that extend beyond the matrix on the left, and that we don't need to fill.
while j < 0:
j += 1
p += 1
continue
# Start filling the two lines from the cells that are really in the matrix.
while p <= 2*self.dmax+1 and j >= 0:
# First column: gap penalties.
if j == 0:
self.nextLine[p] = i
j += 1
p += 1
continue
# Case where R is shorter than G (e.g. |R| < |overlap| + dmax).
if j > len(self.R):
self.nextLine[p] = self.MAX
break
# Compute the 3 possible scores.
## Note: segment of 2*dmax+3 is shifted of one position to the right at each iteration of i (e.g. prevLine and nextLine not aligned).
diag = self.prevLine[p] + self.score(self.G[i-1],self.R[j-1])
vert = self.prevLine[p+1] + self.gap
hori = self.nextLine[p-1] + self.gap
# Get the minimal value among the 3 possible scores.
if diag <= vert and diag <= hori: #'<=': gives the priority to diagonal if ex-aequo in scores
self.nextLine[p] = diag
# self.nextStart[p] = self.prevStart[p]
elif vert < diag and vert <= hori:
self.nextLine[p] = vert
# self.nextStart[p] = self.prevStart[p+1]
else:
self.nextLine[p] = hori
# self.nextStart[p] = self.nextStart[p-1]
j += 1
p += 1
# If the minimum value of nextLine > dmax, stop the alignment.
if min(self.nextLine) > self.dmax:
return None, None
# Once nextLine is filled, prevLine is replaced by nextLine (same for start positions).
## ATTENTION: self.prev = self.next (if self.next is modified then self.prev if modified as well).
self.prevLine = list(self.nextLine) #or self.prev=self.next[:]
# self.prevStart = list(self.nextStart)
# Determinate the dmax for this overlap (dmax variable, whose value depends on the overlap size).
dmax_overlap = round(i / 10)
if min(self.nextLine) > dmax_overlap:
return None, None
# Find where to start the backtracking, searching the optimal score on the last line (e.g. find p with optimal score).
## ATTENTION: start from the bottom-right corner, and goes back to the left to find pmin.
if len(self.R) < len(self.G):
pmin = 2*self.dmax + 1 - (len(self.G) - len(self.R))
if pmin < 0: pmin = 0 #ATTN: pmin can't be negative
else:
pmin = 2*self.dmax + 1
scoreMin = self.nextLine[pmin]
for p in range(2*self.dmax, 0, -1):
if self.nextLine[p] < scoreMin:
scoreMin = self.nextLine[p]
pmin = p
# If edit distance > dmax, return 'None, None'.
if scoreMin > self.dmax:
return None, None
# # Get the beginning position of the alignment/overlap on G (0-based).
# posG = -self.nextStart[pmin] #self.nextStart[pmin]: beginning position of the alignment on R
# Get jMin to get the position on the read that corresponds to the end of the overlap (0-based).
jMin = len(self.G) - self.dmax + pmin - 1
return scoreMin, jMin
# ## Tests
# # Basic test with no alignment possible (too many InDels/Subs for short overlap)
# print("\n")
# print("###########")
# print("Indels dans R with user-defined parameter dmax = 3")
# print("No alignment")
# print("###########")
# S = "GCGCTGCTTAATATGATCGATCGATCGAATCGACTAG"
# R = "AATATGATCGATCGATGGAAATCCACTAGTCC"
# i = 9
# s = 10
# print(S[(i+s):])
# print(R[s:])
# dm = DynamicMatrixOptim(S[(i+s):], R[s:], 3)
# # # NB: uncomment the prevStart/nextStart implementation lines to get posG
# # # dist, posG, posR = dm.getEditDistanceAndGenomePosition()
# # # print(f"Best edit distance of {dist} at position {posG} on Genome. Extension begins at position {posR} on Read.")
# dist, posR = dm.getEditDistanceAndGenomePosition()
# if posR is not None:
# posExt = posR + s
# else:
# posExt = None
# print(f"Best edit distance of {dist}. Extension begins at position {posR} on R[s:]. Extension begins at position {posExt} on R.")
# #RESULTS:
# '''Best edit distance of None. Extension begins at position None on R[s:]. Extension begins at position None on R.'''
# Overlap sur G = 18 donc dmax_overlap = 2
# # Basic test with alignment possible
# print("\n")
# print("###########")
# print("Indels dans R with user-defined parameter dmax = 3")
# print("Alignment")
# print("###########")
# S = "GCGCTGCTTAATATGATCGATCGATGGAATCGACTAG"
# R = "AATATGATCGATCGATGGAAATCCACTAGTCC"
# i = 9
# s = 10
# print(S[(i+s):])
# print(R[s:])
# dm = DynamicMatrixOptim(S[(i+s):], R[s:], 3)
# dist, posR = dm.getEditDistanceAndGenomePosition()
# if posR is not None:
# posExt = posR + s
# else:
# posExt = None
# print(f"Best edit distance of {dist}. Extension begins at position {posR} on R[s:]. Extension begins at position {posExt} on R.")
# #RESULTS:
# '''Best edit distance of 2. Extension begins at position 19 on R[s:]. Extension begins at position 29 on R.'''
# #Overlap sur G = 18 donc dmax_overlap = 2
# #ALIGNMENT:
# ATCGATGGAA TC G ACTAG
# |||||||||| - || . |||||
# ATCGATGGAA A TC C ACTAG TCC
# # |R| <= |G|
# print("\n")
# print("###########")
# print("|R| <= |G|")
# print("###########")
# S = "GCGCTGCTTAATATGATCGATCGATCGAACTAGTCCTAAT"
# R = "AATATGATCGATCGATGAACTAGTCC"
# i = 9
# s = 10
# print(S[(i+s):])
# print(R[s:])
# dm = DynamicMatrixOptim(S[(i+s):], R[s:], 3)
# dist, posR = dm.getEditDistanceAndGenomePosition()
# if posR is not None:
# posExt = posR + s
# else:
# posExt = None
# print(f"Best edit distance of {dist}. Extension begins at position {posR} on R[s:]. Extension begins at position {posExt} on R.")
# #RESULTS:
# '''Best edit distance of None. Extension begins at position None on R[s:]. Extension begins at position None on R.'''
# # |G| <= |R|
# print("\n")
# print("###########")
# print("|G| <= |R|")
# print("###########")
# S = "GCGCTGCTTAATATGATCGATCGATGATACTAG"
# R = "AATATGATCGATCGATGATCTAGTCC"
# i = 9
# s = 10
# print(S[(i+s):])
# print(R[s:])
# dm = DynamicMatrixOptim(S[(i+s):], R[s:], 7)
# dist, posR = dm.getEditDistanceAndGenomePosition()
# if posR is not None:
# posExt = posR + s
# else:
# posExt = None
# print(f"Best edit distance of {dist}. Extension begins at position {posR} on R[s:]. Extension begins at position {posExt} on R.")
# #RESULTS:
# '''Best edit distance of 1. Extension begins at position 13 on R[s:]. Extension begins at position 23 on R.'''
# #Overlap sur G = 14 donc dmax_overlap = 1
# #ALIGNMENT:
# ATCGATGAT A CTAG
# ||||||||| - ||||
# ATCGATGAT CTAG TCC