From 2f9e22d1d4d42d209faa4ef39b7a12f390a79278 Mon Sep 17 00:00:00 2001
From: Emma Westerhoff <emma.westerhoff@students.olin.edu>
Date: Thu, 14 Sep 2017 11:58:55 -0400
Subject: [PATCH 1/6] uploading code for week 1

---
 gene_finder.py | 95 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 77 insertions(+), 18 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 3b1e7dd..396600e 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 """
-YOUR HEADER COMMENT HERE
+First project for Olin Software Design Fall 2017
 
-@author: YOUR NAME HERE
+@author: Emma Westerhoff
 
 """
 
@@ -30,9 +30,15 @@ def get_complement(nucleotide):
     >>> get_complement('C')
     'G'
     """
-    # TODO: implement this
-    pass
-
+    nucleotide_inputs = ['A', 'T', 'C', 'G']
+    nucleotide_complements = ['T', 'A', 'G', 'C']
+    i = 0
+    complement = 'x' #lets the user know the complement was incorrectly computed
+    while i < len(nucleotide_inputs):
+        if nucleotide_inputs[i] == nucleotide:
+            complement = nucleotide_complements[i]
+        i = i + 1
+    return complement
 
 def get_reverse_complement(dna):
     """ Computes the reverse complementary sequence of DNA for the specfied DNA
@@ -45,9 +51,15 @@ def get_reverse_complement(dna):
     >>> get_reverse_complement("CCGCGTTCA")
     'TGAACGCGG'
     """
-    # TODO: implement this
-    pass
-
+    reverse = ''
+    i = 0
+    length = len(dna)
+    while i < length:
+        letter = dna[length - 1 -i] #moves backwards along the string
+        pair = get_complement(letter) #finds complement
+        reverse = reverse + pair
+        i = i+1
+    return reverse
 
 def rest_of_ORF(dna):
     """ Takes a DNA sequence that is assumed to begin with a start
@@ -61,10 +73,30 @@ def rest_of_ORF(dna):
     'ATG'
     >>> rest_of_ORF("ATGAGATAGG")
     'ATGAGA'
+    >>> rest_of_ORF("ATTTCGGGT")
+    'ATTTCGGGT'
     """
-    # TODO: implement this
-    pass
-
+    stop_codons = ['TAG', 'TGA', 'TAA']
+    codons = []
+    n = 3
+    return_string = ''
+    for i in range(0, len(dna), n):
+        codons.append(dna[i:i+n])
+
+        #clean this up: why did i get index errors?
+        #redo: ''.join instead of for loops
+        #try/except clauses?
+    for c in range(0, len(codons)):
+        for s in range(0, len(stop_codons)):
+            if codons[c] == stop_codons[s]:
+                codons = codons[:c]
+                for a in codons:
+                    return_string = return_string+a
+                return return_string
+
+    for a in codons:
+        return_string = return_string+a
+    return return_string
 
 def find_all_ORFs_oneframe(dna):
     """ Finds all non-nested open reading frames in the given DNA
@@ -79,8 +111,23 @@ def find_all_ORFs_oneframe(dna):
     >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
     ['ATGCATGAATGTAGA', 'ATGTGCCC']
     """
-    # TODO: implement this
-    pass
+    start_codon = 'ATG'
+    codons = []
+    n = 3
+    ORFS = []
+    c=0
+
+    for i in range(0, len(dna), n):
+        codons.append(dna[i:i+n])
+
+    while c in range(0, len(codons)):
+        if codons[c] == start_codon:
+                dna_sequence = rest_of_ORF(''.join(codons[c:]))
+                ORFS.append(dna_sequence)
+                c = c + len(dna_sequence)
+        c = c+1
+
+    return ORFS
 
 
 def find_all_ORFs(dna):
@@ -96,9 +143,17 @@ def find_all_ORFs(dna):
     >>> find_all_ORFs("ATGCATGAATGTAG")
     ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
     """
-    # TODO: implement this
-    pass
+    return_list = []
+
+    for i in range (0,3):
+        #cases are coming through that occur in the same frame
+        orfs = find_all_ORFs_oneframe(dna[i:])
+        for o in orfs:
+            result = ''.join(o)
+            if result != '':
+                return_list.append(result)
 
+    return return_list
 
 def find_all_ORFs_both_strands(dna):
     """ Finds all non-nested open reading frames in the given DNA sequence on both
@@ -109,8 +164,11 @@ def find_all_ORFs_both_strands(dna):
     >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
     ['ATGCGAATG', 'ATGCTACATTCGCAT']
     """
-    # TODO: implement this
-    pass
+    return_list = []
+
+    return_list.append(''.join(find_all_ORFs(dna)))
+    return_list.append(''.join(find_all_ORFs(get_reverse_complement(dna))))
+    return return_list
 
 
 def longest_ORF(dna):
@@ -163,4 +221,5 @@ def gene_finder(dna):
 
 if __name__ == "__main__":
     import doctest
-    doctest.testmod()
+    doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True)
+    #print(find_all_ORFs_oneframe('ATGCATGAATGTAGATAGATGTGCCC'))

From 75183c2623765aeeb724e6945fb06f83fe6e88c2 Mon Sep 17 00:00:00 2001
From: Emma Westerhoff <emma.westerhoff@students.olin.edu>
Date: Thu, 14 Sep 2017 12:03:14 -0400
Subject: [PATCH 2/6] updating part 1 files for readability

---
 gene_finder.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 396600e..8b4f8d1 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -54,6 +54,7 @@ def get_reverse_complement(dna):
     reverse = ''
     i = 0
     length = len(dna)
+
     while i < length:
         letter = dna[length - 1 -i] #moves backwards along the string
         pair = get_complement(letter) #finds complement
@@ -79,23 +80,18 @@ def rest_of_ORF(dna):
     stop_codons = ['TAG', 'TGA', 'TAA']
     codons = []
     n = 3
-    return_string = ''
+
     for i in range(0, len(dna), n):
         codons.append(dna[i:i+n])
 
-        #clean this up: why did i get index errors?
-        #redo: ''.join instead of for loops
-        #try/except clauses?
     for c in range(0, len(codons)):
         for s in range(0, len(stop_codons)):
             if codons[c] == stop_codons[s]:
                 codons = codons[:c]
-                for a in codons:
-                    return_string = return_string+a
+                return_string = ''.join(codons)
                 return return_string
 
-    for a in codons:
-        return_string = return_string+a
+    return_string = ''.join(codons)
     return return_string
 
 def find_all_ORFs_oneframe(dna):
@@ -124,8 +120,8 @@ def find_all_ORFs_oneframe(dna):
         if codons[c] == start_codon:
                 dna_sequence = rest_of_ORF(''.join(codons[c:]))
                 ORFS.append(dna_sequence)
-                c = c + len(dna_sequence)
-        c = c+1
+                c = c + len(dna_sequence) #skips over the rest of the sequence
+        c = c+1 #if I'm missing a permutation, this might be a problem.
 
     return ORFS
 
@@ -150,9 +146,8 @@ def find_all_ORFs(dna):
         orfs = find_all_ORFs_oneframe(dna[i:])
         for o in orfs:
             result = ''.join(o)
-            if result != '':
+            if result != '': #if there are no permutations in a run through
                 return_list.append(result)
-
     return return_list
 
 def find_all_ORFs_both_strands(dna):
@@ -221,5 +216,5 @@ def gene_finder(dna):
 
 if __name__ == "__main__":
     import doctest
-    doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True)
+    doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True)
     #print(find_all_ORFs_oneframe('ATGCATGAATGTAGATAGATGTGCCC'))

From 25623e83e579b3eba47a5fab0222eba94c2ff14d Mon Sep 17 00:00:00 2001
From: Emma Westerhoff <emma.westerhoff@students.olin.edu>
Date: Thu, 14 Sep 2017 17:01:13 -0400
Subject: [PATCH 3/6] adding second week code

---
 gene_finder.py | 66 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 8b4f8d1..bfe0c67 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -37,7 +37,7 @@ def get_complement(nucleotide):
     while i < len(nucleotide_inputs):
         if nucleotide_inputs[i] == nucleotide:
             complement = nucleotide_complements[i]
-        i = i + 1
+        i += 1
     return complement
 
 def get_reverse_complement(dna):
@@ -59,7 +59,7 @@ def get_reverse_complement(dna):
         letter = dna[length - 1 -i] #moves backwards along the string
         pair = get_complement(letter) #finds complement
         reverse = reverse + pair
-        i = i+1
+        i += 1
     return reverse
 
 def rest_of_ORF(dna):
@@ -111,7 +111,7 @@ def find_all_ORFs_oneframe(dna):
     codons = []
     n = 3
     ORFS = []
-    c=0
+    c = 0
 
     for i in range(0, len(dna), n):
         codons.append(dna[i:i+n])
@@ -120,8 +120,8 @@ def find_all_ORFs_oneframe(dna):
         if codons[c] == start_codon:
                 dna_sequence = rest_of_ORF(''.join(codons[c:]))
                 ORFS.append(dna_sequence)
-                c = c + len(dna_sequence) #skips over the rest of the sequence
-        c = c+1 #if I'm missing a permutation, this might be a problem.
+                c += len(dna_sequence) #skips over the rest of the sequence
+        c += 1 #if I'm missing a permutation, this might be a problem.
 
     return ORFS
 
@@ -172,9 +172,15 @@ def longest_ORF(dna):
     >>> longest_ORF("ATGCGAATGTAGCATCAAA")
     'ATGCTACATTCGCAT'
     """
-    # TODO: implement this
-    pass
+    orfs = find_all_ORFs_both_strands(dna)
+
+    longest_size=len(max(orfs,key=len))
+
+    for o in orfs:
+        if(longest_size==len(o)):
+            longest = o
 
+    return longest
 
 def longest_ORF_noncoding(dna, num_trials):
     """ Computes the maximum length of the longest ORF over num_trials shuffles
@@ -183,8 +189,14 @@ def longest_ORF_noncoding(dna, num_trials):
         dna: a DNA sequence
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
-    # TODO: implement this
-    pass
+    lengths = []
+    for rand in range(0, num_trials):
+        new_sequence = shuffle_string(dna)
+        leng = len(longest_ORF(new_sequence))
+        lengths.append(leng)
+
+    maximum = max(lengths)
+    return maximum
 
 
 def coding_strand_to_AA(dna):
@@ -201,8 +213,21 @@ def coding_strand_to_AA(dna):
         >>> coding_strand_to_AA("ATGCCCGCTTT")
         'MPA'
     """
-    # TODO: implement this
-    pass
+    n = 3
+    codons = []
+    acids = ''
+
+    for i in range(0, len(dna), n):
+        codons.append(dna[i:i+n])
+
+    if len(codons[-1]) < 3:
+        codons.pop(-1)
+
+    for c in codons:
+        amino = aa_table[c]
+        acids += ''.join(amino)
+
+    return acids
 
 
 def gene_finder(dna):
@@ -212,9 +237,22 @@ def gene_finder(dna):
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
     # TODO: implement this
-    pass
+
+    threshold = longest_ORF_noncoding(dna, 4)
+    #change this to 1500 or so later
+    dna_orfs = find_all_ORFs_both_strands(dna)
+
+    for snip in dna_orfs:
+        if len(snip) < threshold:
+            dna_orfs.remove(snip)
+
+    amino_string = coding_strand_to_AA(''.join(dna_orfs))
+    print(amino_string)
+    return amino_string
 
 if __name__ == "__main__":
     import doctest
-    doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True)
-    #print(find_all_ORFs_oneframe('ATGCATGAATGTAGATAGATGTGCCC'))
+    #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)
+    dna = load_seq("./data/X73525.fa")
+    #longest_ORF_noncoding(dna, 4)
+    gene_finder(dna)

From e81dbdac8830959fd4e576bbf1f8c9364cd7d303 Mon Sep 17 00:00:00 2001
From: Emma Westerhoff <ewesterhoff@olin.edu>
Date: Sun, 24 Sep 2017 14:58:32 -0400
Subject: [PATCH 4/6] updating gene finder code, adding nitrogenase finder

---
 gene_finder.py        | 21 +++++++++--------
 nitrogenase_finder.py | 54 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 10 deletions(-)
 create mode 100644 nitrogenase_finder.py

diff --git a/gene_finder.py b/gene_finder.py
index bfe0c67..f3dbd85 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -10,7 +10,6 @@
 from amino_acids import aa, codons, aa_table   # you may find these useful
 from load import load_seq
 
-
 def shuffle_string(s):
     """Shuffles the characters in the input string
         NOTE: this is a helper function, you do not
@@ -236,23 +235,25 @@ def gene_finder(dna):
         dna: a DNA sequence
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
-    # TODO: implement this
-
-    threshold = longest_ORF_noncoding(dna, 4)
+    threshold = longest_ORF_noncoding(dna, 15)
     #change this to 1500 or so later
     dna_orfs = find_all_ORFs_both_strands(dna)
+    amino_sequences = []
+    longs = []
 
     for snip in dna_orfs:
-        if len(snip) < threshold:
-            dna_orfs.remove(snip)
+        if len(snip) > threshold:
+            longs.append(snip)
+            print(snip)
 
-    amino_string = coding_strand_to_AA(''.join(dna_orfs))
-    print(amino_string)
-    return amino_string
+    for string in dna_orfs:
+        amino_sequences.append(coding_strand_to_AA(''.join(string)))
+        print('hey')
+    #print(amino_sequences)
+    return amino_sequences
 
 if __name__ == "__main__":
     import doctest
     #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)
     dna = load_seq("./data/X73525.fa")
-    #longest_ORF_noncoding(dna, 4)
     gene_finder(dna)
diff --git a/nitrogenase_finder.py b/nitrogenase_finder.py
new file mode 100644
index 0000000..e543c57
--- /dev/null
+++ b/nitrogenase_finder.py
@@ -0,0 +1,54 @@
+"""
+Extension on first project for Olin Software Design Fall 2017
+
+@author: Emma Westerhoff
+
+"""
+
+from load import load_nitrogenase_seq, load_metagenome
+
+def longest_common_substring(string1, string2):
+    """ Computes the longest common substring using dynamic programming
+
+        string1, string2: the two strings to compare
+        returns: the longest run of characters found in both strings ('' if
+                 none); on a tie, the run that ends first in string1
+
+    >>> longest_common_substring('abcdefgqwertyuiop', 'xyabcdjipqwertyuiop')
+    'qwertyuiop'
+    """
+    x = len(string1)
+    y = len(string2)
+
+    #L[i][j] = length of the common run ending at string1[i] and string2[j]
+    L = [[0]*(y) for a in range(x)]
+    z = 0 #length of the longest common run found so far
+    ret = ''
+
+    for i in range(0, x):
+        for j in range(0, y):
+            if string1[i] == string2[j]:
+                if i == 0 or j == 0:
+                    L[i][j] = 1 #first row/column: the run starts here
+                else:
+                    L[i][j] = L[i-1][j-1] + 1
+                if L[i][j] > z:
+                    z = L[i][j]
+                    #the run covers string1[i-z+1] through string1[i]
+                    ret = string1[i-z+1:i+1]
+    return ret
+
+def nitrogen_fixation(x):
+    #TODO: implement this (placeholder stub: ignores x and returns None for now)
+    pass
+
+if __name__ == "__main__":
+    nitrogenase = load_nitrogenase_seq()
+    metagenome = load_metagenome()
+    #metagenome is of form [('some info', 'actual sequence')]
+    #transform metagenome to proper form
+    
+    #import doctest
+    #doctest.run_docstring_examples(longest_common_substring, globals(), verbose=True)
+    #longest = longest_common_substring(nitrogenase, metagenome)
+    #print(longest)

From 00b83e1089697cf134d923b8f7e884043ddee3b6 Mon Sep 17 00:00:00 2001
From: Emma Westerhoff <ewesterhoff@olin.edu>
Date: Sun, 24 Sep 2017 16:19:04 -0400
Subject: [PATCH 5/6] updating code for ninja checkin

---
 gene_finder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index f3dbd85..6415844 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -244,12 +244,12 @@ def gene_finder(dna):
     for snip in dna_orfs:
         if len(snip) > threshold:
             longs.append(snip)
-            print(snip)
 
     for string in dna_orfs:
         amino_sequences.append(coding_strand_to_AA(''.join(string)))
-        print('hey')
+
     #print(amino_sequences)
+    #print(len(amino_sequences))
     return amino_sequences
 
 if __name__ == "__main__":

From aadb20e39fc2c39063d0bcf31fc1d851f26a8d29 Mon Sep 17 00:00:00 2001
From: Emma Westerhoff <ewesterhoff@olin.edu>
Date: Sun, 24 Sep 2017 16:57:39 -0400
Subject: [PATCH 6/6] final code

---
 gene_finder.py        | 41 +++++++++++++++++++++++++----------------
 nitrogenase_finder.py |  2 +-
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 6415844..077e3e1 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -160,10 +160,16 @@ def find_all_ORFs_both_strands(dna):
     """
     return_list = []
 
-    return_list.append(''.join(find_all_ORFs(dna)))
-    return_list.append(''.join(find_all_ORFs(get_reverse_complement(dna))))
-    return return_list
+    all_orfs_one = find_all_ORFs(dna)
+    all_orfs_two = find_all_ORFs(get_reverse_complement(dna))
+
+    for o in all_orfs_one:
+        return_list.append(o)
 
+    for a in all_orfs_two:
+        return_list.append(a)
+    #print(return_list)
+    return return_list
 
 def longest_ORF(dna):
     """ Finds the longest ORF on both strands of the specified DNA and returns it
@@ -173,13 +179,16 @@ def longest_ORF(dna):
     """
     orfs = find_all_ORFs_both_strands(dna)
 
-    longest_size=len(max(orfs,key=len))
+    #longest_size=len(max(orfs,key=len))
+
+    longest_length = 0
 
     for o in orfs:
-        if(longest_size==len(o)):
-            longest = o
+        if len(o) > longest_length:
+            longest_seq = o
+            longest_length = len(o)
 
-    return longest
+    return longest_seq
 
 def longest_ORF_noncoding(dna, num_trials):
     """ Computes the maximum length of the longest ORF over num_trials shuffles
@@ -189,13 +198,16 @@ def longest_ORF_noncoding(dna, num_trials):
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
     lengths = []
+    max_length = 0
     for rand in range(0, num_trials):
         new_sequence = shuffle_string(dna)
         leng = len(longest_ORF(new_sequence))
-        lengths.append(leng)
+        if(leng > max_length):
+            max_length = leng
 
-    maximum = max(lengths)
-    return maximum
+    # maximum = max(lengths)
+    print(max_length)
+    return max_length
 
 
 def coding_strand_to_AA(dna):
@@ -235,7 +247,7 @@ def gene_finder(dna):
         dna: a DNA sequence
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
-    threshold = longest_ORF_noncoding(dna, 15)
+    threshold = longest_ORF_noncoding(dna, 400)
     #change this to 1500 or so later
     dna_orfs = find_all_ORFs_both_strands(dna)
     amino_sequences = []
@@ -243,12 +255,9 @@ def gene_finder(dna):
 
     for snip in dna_orfs:
         if len(snip) > threshold:
-            longs.append(snip)
-
-    for string in dna_orfs:
-        amino_sequences.append(coding_strand_to_AA(''.join(string)))
+            amino_sequences.append(coding_strand_to_AA(snip))
 
-    #print(amino_sequences)
+    print(amino_sequences)
     #print(len(amino_sequences))
     return amino_sequences
 
diff --git a/nitrogenase_finder.py b/nitrogenase_finder.py
index e543c57..4b43f50 100644
--- a/nitrogenase_finder.py
+++ b/nitrogenase_finder.py
@@ -47,7 +47,7 @@ def nitrogen_fixation(x):
     metagenome = load_metagenome()
     #metagenome is of form [('some info', 'actual sequence')]
     #transform metagenome to proper form
-    
+
     #import doctest
     #doctest.run_docstring_examples(longest_common_substring, globals(), verbose=True)
     #longest = longest_common_substring(nitrogenase, metagenome)