Skip to content
Snippets Groups Projects
Commit 9f9b734d authored by Mika Cankosyan's avatar Mika Cankosyan
Browse files

nussinov implementation + bit of optimization (not finished)

parents
No related branches found
No related tags found
No related merge requests found
'''
notes
'''
# constants
AU_SCORE = 1
GC_SCORE = 1.5
GU_SCORE = 0.5
def is_valid_rna_sequence(rna_sequence):
valid_chars = {'A', 'U', 'G', 'C'}
return set(rna_sequence).issubset(valid_chars)
def is_match(nucleotide1, nucleotide2):
pairings = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G'}
return nucleotide2 == pairings[nucleotide1]
# returns -1 if not match, positive score if match
def match_score(nucleotide1, nucleotide2):
scores = {
'A': {'A': -1, 'U': AU_SCORE, 'G': -1, 'C': -1},
'U': {'A': AU_SCORE, 'U': -1, 'G': GU_SCORE, 'C': -1},
'G': {'A': -1, 'U': GU_SCORE, 'G': -1, 'C': GC_SCORE},
'C': {'A': -1, 'U': -1, 'G': GC_SCORE, 'C': -1}
}
return scores[nucleotide1][nucleotide2]
def print_2d_array(array):
for row in array:
print(' '.join(f'{num:3}' for num in row))
# returns a list consisting of '(', ')', and '-' characters
def bt_to_coded_list(bt, i, j):
length = j - i + 1
og_i = i
coded_list = list('-' * length)
while (bt[i][j] != -1):
if (bt[i][j] == "left"):
j -= 1
elif (bt[i][j] == "down"):
i += 1
elif (bt[i][j] == "match"):
coded_list[i - og_i] = '('
coded_list[j - og_i] = ')'
i += 1
j -= 1
else: # bifurcation
return coded_list[:i] + bt_to_coded_list(bt, i, bt[i][j]) \
+ bt_to_coded_list(bt, bt[i][j] + 1, j) + coded_list[j + 1:]
return coded_list
# returns dictionary of the base pairs
def bt_to_base_pairs(bt, i, j, base_pairs):
length = j - i + 1
while (bt[i][j] != -1):
if (bt[i][j] == "left"):
j -= 1
elif (bt[i][j] == "down"):
i += 1
elif (bt[i][j] == "match"):
base_pairs[i] = j
i += 1
j -= 1
else: # bifurcation
bt_to_base_pairs(bt, i, bt[i][j], base_pairs)
bt_to_base_pairs(bt, bt[i][j] + 1, j, base_pairs)
return base_pairs
return base_pairs
# takes in rna primary sequence
# returns dynamic programming table and backtrace table
# error if length < 1
# if there's a "tie" between bifurcations, chooses the smallest k giving the tied max
# if there's a "tie" between two or more of left, down, match, and bifurcation, chooses based on precedence in that order
def original(rna_sequence):
length = len(rna_sequence)
dp_table = [[0 for i in range(length)] for j in range(length)]
bt = [[0 for i in range(length)] for j in range(length)]
# i is row, j is col
i = length - 1
j = 0
done = False
while (not done):
while (i != length and j != length):
if (i >= j):
dp_table[i][j] = 0
bt[i][j] = -1
else:
left = dp_table[i][j - 1]
down = dp_table[i + 1][j]
match = None
if (is_match(rna_sequence[i], rna_sequence[j])):
match = dp_table[i + 1][j - 1] + 1
else:
match = -1
bifurcation = -1
k = None
for K in range(1 + 1, j):
current = dp_table[i][K] + dp_table[K + 1][j]
if (current > bifurcation):
bifurcation = current
k = K
max = None
if (left >= down and left >= match and left >= bifurcation):
max = left
bt[i][j] = "left"
elif (down >= left and down >= match and down >= bifurcation):
max = down
bt[i][j] = "down"
elif (match >= left and match >= down and match >= bifurcation):
max = match
bt[i][j] = "match"
else:
max = bifurcation
bt[i][j] = k
dp_table[i][j] = max
i += 1
j += 1
if (i == length):
if (j == length):
i = 0
j = 1
else:
i -= j + 1
j = 0
elif (i != 0):
j = length - i + 1
i = 0
else:
done = True
return dp_table, bt
# takes in rna primary sequence
# returns dynamic programming table and backtrace table
def optimized(rna_sequence):
length = len(rna_sequence)
dp_table = [[0 for i in range(length)] for j in range(length)]
bt = [[0 for i in range(length)] for j in range(length)]
# i is row, j is col
i = length - 1
j = 0
done = False
while (not done):
while (i != length and j != length):
if (i >= j):
dp_table[i][j] = 0
bt[i][j] = -1
else:
left = dp_table[i][j - 1]
down = dp_table[i + 1][j]
match = None
match_score1 = match_score(rna_sequence[i], rna_sequence[j])
if (match_score1 != -1):
match = dp_table[i + 1][j - 1] + match_score1
else:
match = -1
bifurcation = -1
k = None
for K in range(1 + 1, j):
current = dp_table[i][K] + dp_table[K + 1][j]
if (current > bifurcation):
bifurcation = current
k = K
max = None
if (left >= down and left >= match and left >= bifurcation):
max = left
bt[i][j] = "left"
elif (down >= left and down >= match and down >= bifurcation):
max = down
bt[i][j] = "down"
elif (match >= left and match >= down and match >= bifurcation):
max = match
bt[i][j] = "match"
else:
max = bifurcation
bt[i][j] = k
dp_table[i][j] = max
i += 1
j += 1
if (i == length):
if (j == length):
i = 0
j = 1
else:
i -= j + 1
j = 0
elif (i != 0):
j = length - i + 1
i = 0
else:
done = True
return dp_table, bt
rna_sequence = input("enter the rna sequence: ") # will only work if it's a string consisting of AUCG
if (not is_valid_rna_sequence(rna_sequence)):
print("didn't enter valid rna sequence (string of uppercase A U G C with no spaces), please run again")
exit()
choice = input("original or optimized: ") # will only work if "original" or "optimized"
if (choice == "original"):
dp_table, bt = original(rna_sequence)
elif (choice == "optimized"):
dp_table, bt = optimized(rna_sequence)
else:
print("didn't enter valid choice, please run again")
exit()
print_2d_array(dp_table)
coded_list = bt_to_coded_list(bt, 0, len(rna_sequence) - 1)
base_pairs = bt_to_base_pairs(bt, 0, len(rna_sequence) - 1, {})
print(''.join(coded_list))
print(base_pairs)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment