nussinov implementation + bit of optimization (not finished)

9f9b734d · Mika Cankosyan · 9f9b734d · 9f9b734d
Commit 9f9b734d authored 3 months ago by Mika Cankosyan
--- a/README.txt
+++ b/README.txt
--- a/nussinov.py
+++ b/nussinov.py
+'''
+notes
+'''
+
+# constants
+AU_SCORE = 1
+GC_SCORE = 1.5
+GU_SCORE = 0.5
+
+def is_valid_rna_sequence(rna_sequence):
+    valid_chars = {'A', 'U', 'G', 'C'}
+    return set(rna_sequence).issubset(valid_chars)
+
+def is_match(nucleotide1, nucleotide2):
+    pairings = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G'}
+    return nucleotide2 == pairings[nucleotide1] 
+
+# returns -1 if not match, positive score if match
+def match_score(nucleotide1, nucleotide2):
+    scores = {
+        'A': {'A': -1, 'U': AU_SCORE, 'G': -1, 'C': -1},
+        'U': {'A': AU_SCORE, 'U': -1, 'G': GU_SCORE, 'C': -1},
+        'G': {'A': -1, 'U': GU_SCORE, 'G': -1, 'C': GC_SCORE},
+        'C': {'A': -1, 'U': -1, 'G': GC_SCORE, 'C': -1}
+    }
+    return scores[nucleotide1][nucleotide2]
+
+def print_2d_array(array):
+    for row in array:
+        print(' '.join(f'{num:3}' for num in row))
+
+# returns a list consisting of '(', ')', and '-' characters
+def bt_to_coded_list(bt, i, j):
+    length = j - i + 1
+    og_i = i
+    coded_list = list('-' * length)
+    while (bt[i][j] != -1):
+        if (bt[i][j] == "left"):
+            j -= 1
+        elif (bt[i][j] == "down"):
+            i += 1
+        elif (bt[i][j] == "match"):
+            coded_list[i - og_i] = '('
+            coded_list[j - og_i] = ')'
+            i += 1
+            j -= 1
+        else: # bifurcation
+            return coded_list[:i] + bt_to_coded_list(bt, i, bt[i][j]) \
+                + bt_to_coded_list(bt, bt[i][j] + 1, j) + coded_list[j + 1:]
+    return coded_list
+
+# returns dictionary of the base pairs
+def bt_to_base_pairs(bt, i, j, base_pairs):
+    length = j - i + 1
+    while (bt[i][j] != -1):
+        if (bt[i][j] == "left"):
+            j -= 1
+        elif (bt[i][j] == "down"):
+            i += 1
+        elif (bt[i][j] == "match"):
+            base_pairs[i] = j
+            i += 1
+            j -= 1
+        else: # bifurcation
+            bt_to_base_pairs(bt, i, bt[i][j], base_pairs)
+            bt_to_base_pairs(bt, bt[i][j] + 1, j, base_pairs)
+            return base_pairs
+    return base_pairs
+
+# takes in rna primary sequence
+# returns dynamic programming table and backtrace table
+# error if length < 1
+# if there's a "tie" between bifurcations, chooses the smallest k giving the tied max
+# if there's a "tie" between two or more of left, down, match, and bifurcation, chooses based on precedence in that order
+def original(rna_sequence):
+    length = len(rna_sequence)
+    dp_table = [[0 for i in range(length)] for j in range(length)]
+    bt = [[0 for i in range(length)] for j in range(length)]
+    # i is row, j is col
+    i = length - 1
+    j = 0
+    done = False
+    while (not done):
+        while (i != length and j != length):
+            if (i >= j):
+                dp_table[i][j] = 0
+                bt[i][j] = -1
+            else:
+                left = dp_table[i][j - 1]
+                down = dp_table[i + 1][j]
+                match = None
+                if (is_match(rna_sequence[i], rna_sequence[j])):
+                    match = dp_table[i + 1][j - 1] + 1
+                else:
+                    match = -1
+                bifurcation = -1
+                k = None
+                for K in range(1 + 1, j):
+                    current = dp_table[i][K] + dp_table[K + 1][j]
+                    if (current > bifurcation):
+                        bifurcation = current
+                        k = K
+                max = None
+                if (left >= down and left >= match and left >= bifurcation):
+                    max = left
+                    bt[i][j] = "left"
+                elif (down >= left and down >= match and down >= bifurcation):
+                    max = down
+                    bt[i][j] = "down"
+                elif (match >= left and match >= down and match >= bifurcation):
+                    max = match
+                    bt[i][j] = "match"
+                else:
+                    max = bifurcation
+                    bt[i][j] = k
+                dp_table[i][j] = max
+            i += 1
+            j += 1
+        if (i == length):
+            if (j == length):
+                i = 0
+                j = 1
+            else:    
+                i -= j + 1
+                j = 0
+        elif (i != 0):
+            j = length - i + 1
+            i = 0
+        else:
+            done = True
+    return dp_table, bt
+
+# takes in rna primary sequence
+# returns dynamic programming table and backtrace table
+def optimized(rna_sequence):
+    length = len(rna_sequence)
+    dp_table = [[0 for i in range(length)] for j in range(length)]
+    bt = [[0 for i in range(length)] for j in range(length)]
+    # i is row, j is col
+    i = length - 1
+    j = 0
+    done = False
+    while (not done):
+        while (i != length and j != length):
+            if (i >= j):
+                dp_table[i][j] = 0
+                bt[i][j] = -1
+            else:
+                left = dp_table[i][j - 1]
+                down = dp_table[i + 1][j]
+                match = None
+                match_score1 = match_score(rna_sequence[i], rna_sequence[j])
+                if (match_score1 != -1):
+                    match = dp_table[i + 1][j - 1] + match_score1
+                else:
+                    match = -1
+                bifurcation = -1
+                k = None
+                for K in range(1 + 1, j):
+                    current = dp_table[i][K] + dp_table[K + 1][j]
+                    if (current > bifurcation):
+                        bifurcation = current
+                        k = K
+                max = None
+                if (left >= down and left >= match and left >= bifurcation):
+                    max = left
+                    bt[i][j] = "left"
+                elif (down >= left and down >= match and down >= bifurcation):
+                    max = down
+                    bt[i][j] = "down"
+                elif (match >= left and match >= down and match >= bifurcation):
+                    max = match
+                    bt[i][j] = "match"
+                else:
+                    max = bifurcation
+                    bt[i][j] = k
+                dp_table[i][j] = max
+            i += 1
+            j += 1
+        if (i == length):
+            if (j == length):
+                i = 0
+                j = 1
+            else:    
+                i -= j + 1
+                j = 0
+        elif (i != 0):
+            j = length - i + 1
+            i = 0
+        else:
+            done = True
+    return dp_table, bt
+
+rna_sequence = input("enter the rna sequence: ") # will only work if it's a string consisting of AUCG
+if (not is_valid_rna_sequence(rna_sequence)):
+    print("didn't enter valid rna sequence (string of uppercase A U G C with no spaces), please run again")
+    exit()
+choice = input("original or optimized: ") # will only work if "original" or "optimized"
+if (choice == "original"):
+    dp_table, bt = original(rna_sequence)
+elif (choice == "optimized"):
+    dp_table, bt = optimized(rna_sequence)
+else:
+    print("didn't enter valid choice, please run again")
+    exit()
+print_2d_array(dp_table)
+coded_list = bt_to_coded_list(bt, 0, len(rna_sequence) - 1)
+base_pairs = bt_to_base_pairs(bt, 0, len(rna_sequence) - 1, {})
+print(''.join(coded_list))
+print(base_pairs)
\ No newline at end of file