From 2f9e22d1d4d42d209faa4ef39b7a12f390a79278 Mon Sep 17 00:00:00 2001 From: Emma Westerhoff Date: Thu, 14 Sep 2017 11:58:55 -0400 Subject: [PATCH 1/6] uploading code for week 1 --- gene_finder.py | 95 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 18 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 3b1e7dd..396600e 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """ -YOUR HEADER COMMENT HERE +First project for Olin Software Design Fall 2017 -@author: YOUR NAME HERE +@author: Emma Westerhoff """ @@ -30,9 +30,15 @@ def get_complement(nucleotide): >>> get_complement('C') 'G' """ - # TODO: implement this - pass - + nucleotide_inputs = ['A', 'T', 'C', 'G'] + nucleotide_complements = ['T', 'A', 'G', 'C'] + i = 0 + complement = 'x' #lets the user know the complement was incorrectly computed + while i < len(nucleotide_inputs): + if nucleotide_inputs[i] == nucleotide: + complement = nucleotide_complements[i] + i = i + 1 + return complement def get_reverse_complement(dna): """ Computes the reverse complementary sequence of DNA for the specfied DNA @@ -45,9 +51,15 @@ def get_reverse_complement(dna): >>> get_reverse_complement("CCGCGTTCA") 'TGAACGCGG' """ - # TODO: implement this - pass - + reverse = '' + i = 0 + length = len(dna) + while i < length: + letter = dna[length - 1 -i] #moves backwards along the string + pair = get_complement(letter) #finds complement + reverse = reverse + pair + i = i+1 + return reverse def rest_of_ORF(dna): """ Takes a DNA sequence that is assumed to begin with a start @@ -61,10 +73,30 @@ def rest_of_ORF(dna): 'ATG' >>> rest_of_ORF("ATGAGATAGG") 'ATGAGA' + >>> rest_of_ORF("ATTTCGGGT") + 'ATTTCGGGT' """ - # TODO: implement this - pass - + stop_codons = ['TAG', 'TGA', 'TAA'] + codons = [] + n = 3 + return_string = '' + for i in range(0, len(dna), n): + codons.append(dna[i:i+n]) + + #clean this up: why did i get index errors? + #redo: ''.join instead of for loops + #try/except clauses? + for c in range(0, len(codons)): + for s in range(0, len(stop_codons)): + if codons[c] == stop_codons[s]: + codons = codons[:c] + for a in codons: + return_string = return_string+a + return return_string + + for a in codons: + return_string = return_string+a + return return_string def find_all_ORFs_oneframe(dna): """ Finds all non-nested open reading frames in the given DNA @@ -79,8 +111,23 @@ def find_all_ORFs_oneframe(dna): >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") ['ATGCATGAATGTAGA', 'ATGTGCCC'] """ - # TODO: implement this - pass + start_codon = 'ATG' + codons = [] + n = 3 + ORFS = [] + c=0 + + for i in range(0, len(dna), n): + codons.append(dna[i:i+n]) + + while c in range(0, len(codons)): + if codons[c] == start_codon: + dna_sequence = rest_of_ORF(''.join(codons[c:])) + ORFS.append(dna_sequence) + c = c + len(dna_sequence) + c = c+1 + + return ORFS def find_all_ORFs(dna): @@ -96,9 +143,17 @@ def find_all_ORFs(dna): >>> find_all_ORFs("ATGCATGAATGTAG") ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] """ - # TODO: implement this - pass + return_list = [] + + for i in range (0,3): + #cases are coming through that occur in the same frame + orfs = find_all_ORFs_oneframe(dna[i:]) + for o in orfs: + result = ''.join(o) + if result != '': + return_list.append(result) + return return_list def find_all_ORFs_both_strands(dna): """ Finds all non-nested open reading frames in the given DNA sequence on both @@ -109,8 +164,11 @@ def find_all_ORFs_both_strands(dna): >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") ['ATGCGAATG', 'ATGCTACATTCGCAT'] """ - # TODO: implement this - pass + return_list = [] + + return_list.append(''.join(find_all_ORFs(dna))) + return_list.append(''.join(find_all_ORFs(get_reverse_complement(dna)))) + return return_list def longest_ORF(dna): @@ -163,4 +221,5 @@ def gene_finder(dna): if __name__ == "__main__": import doctest - doctest.testmod() + doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True) + #print(find_all_ORFs_oneframe('ATGCATGAATGTAGATAGATGTGCCC')) From 75183c2623765aeeb724e6945fb06f83fe6e88c2 Mon Sep 17 00:00:00 2001 From: Emma Westerhoff Date: Thu, 14 Sep 2017 12:03:14 -0400 Subject: [PATCH 2/6] updating part 1 files for readability --- gene_finder.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 396600e..8b4f8d1 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -54,6 +54,7 @@ def get_reverse_complement(dna): reverse = '' i = 0 length = len(dna) + while i < length: letter = dna[length - 1 -i] #moves backwards along the string pair = get_complement(letter) #finds complement @@ -79,23 +80,18 @@ def rest_of_ORF(dna): stop_codons = ['TAG', 'TGA', 'TAA'] codons = [] n = 3 - return_string = '' + for i in range(0, len(dna), n): codons.append(dna[i:i+n]) - #clean this up: why did i get index errors? - #redo: ''.join instead of for loops - #try/except clauses? for c in range(0, len(codons)): for s in range(0, len(stop_codons)): if codons[c] == stop_codons[s]: codons = codons[:c] - for a in codons: - return_string = return_string+a + return_string = ''.join(codons) return return_string - for a in codons: - return_string = return_string+a + return_string = ''.join(codons) return return_string def find_all_ORFs_oneframe(dna): @@ -124,8 +120,8 @@ def find_all_ORFs_oneframe(dna): if codons[c] == start_codon: dna_sequence = rest_of_ORF(''.join(codons[c:])) ORFS.append(dna_sequence) - c = c + len(dna_sequence) - c = c+1 + c = c + len(dna_sequence) #skips over the rest of the sequence + c = c+1 #if I'm missing a permutation, this might be a problem. return ORFS @@ -150,9 +146,8 @@ def find_all_ORFs(dna): orfs = find_all_ORFs_oneframe(dna[i:]) for o in orfs: result = ''.join(o) - if result != '': + if result != '': #if there are no permutations in a run through return_list.append(result) - return return_list def find_all_ORFs_both_strands(dna): @@ -221,5 +216,5 @@ def gene_finder(dna): if __name__ == "__main__": import doctest - doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True) + doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True) #print(find_all_ORFs_oneframe('ATGCATGAATGTAGATAGATGTGCCC')) From 25623e83e579b3eba47a5fab0222eba94c2ff14d Mon Sep 17 00:00:00 2001 From: Emma Westerhoff Date: Thu, 14 Sep 2017 17:01:13 -0400 Subject: [PATCH 3/6] adding second week code --- gene_finder.py | 66 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 8b4f8d1..bfe0c67 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -37,7 +37,7 @@ def get_complement(nucleotide): while i < len(nucleotide_inputs): if nucleotide_inputs[i] == nucleotide: complement = nucleotide_complements[i] - i = i + 1 + i += 1 return complement def get_reverse_complement(dna): @@ -59,7 +59,7 @@ def get_reverse_complement(dna): letter = dna[length - 1 -i] #moves backwards along the string pair = get_complement(letter) #finds complement reverse = reverse + pair - i = i+1 + i += 1 return reverse def rest_of_ORF(dna): @@ -111,7 +111,7 @@ def find_all_ORFs_oneframe(dna): codons = [] n = 3 ORFS = [] - c=0 + c = 0 for i in range(0, len(dna), n): codons.append(dna[i:i+n]) @@ -120,8 +120,8 @@ def find_all_ORFs_oneframe(dna): if codons[c] == start_codon: dna_sequence = rest_of_ORF(''.join(codons[c:])) ORFS.append(dna_sequence) - c = c + len(dna_sequence) #skips over the rest of the sequence - c = c+1 #if I'm missing a permutation, this might be a problem. + c += len(dna_sequence) #skips over the rest of the sequence + c += 1 #if I'm missing a permutation, this might be a problem. return ORFS @@ -172,9 +172,15 @@ def longest_ORF(dna): >>> longest_ORF("ATGCGAATGTAGCATCAAA") 'ATGCTACATTCGCAT' """ - # TODO: implement this - pass + orfs = find_all_ORFs_both_strands(dna) + + longest_size=len(max(orfs,key=len)) + + for o in orfs: + if(longest_size==len(o)): + longest = o + return longest def longest_ORF_noncoding(dna, num_trials): """ Computes the maximum length of the longest ORF over num_trials shuffles @@ -183,8 +189,14 @@ def longest_ORF_noncoding(dna, num_trials): dna: a DNA sequence num_trials: the number of random shuffles returns: the maximum length longest ORF """ - # TODO: implement this - pass + lengths = [] + for rand in range(0, num_trials): + new_sequence = shuffle_string(dna) + leng = len(longest_ORF(new_sequence)) + lengths.append(leng) + + maximum = max(lengths) + return maximum def coding_strand_to_AA(dna): @@ -201,8 +213,21 @@ def coding_strand_to_AA(dna): >>> coding_strand_to_AA("ATGCCCGCTTT") 'MPA' """ - # TODO: implement this - pass + n = 3 + codons = [] + acids = '' + + for i in range(0, len(dna), n): + codons.append(dna[i:i+n]) + + if len(codons[-1]) < 3: + codons.pop(-1) + + for c in codons: + amino = aa_table[c] + acids += ''.join(amino) + + return acids def gene_finder(dna): @@ -212,9 +237,22 @@ def gene_finder(dna): returns: a list of all amino acid sequences coded by the sequence dna. """ # TODO: implement this - pass + + threshold = longest_ORF_noncoding(dna, 4) + #change this to 1500 or so later + dna_orfs = find_all_ORFs_both_strands(dna) + + for snip in dna_orfs: + if len(snip) < threshold: + dna_orfs.remove(snip) + + amino_string = coding_strand_to_AA(''.join(dna_orfs)) + print(amino_string) + return amino_string if __name__ == "__main__": import doctest - doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True) - #print(find_all_ORFs_oneframe('ATGCATGAATGTAGATAGATGTGCCC')) + #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True) + dna = load_seq("./data/X73525.fa") + #longest_ORF_noncoding(dna, 4) + gene_finder(dna) From e81dbdac8830959fd4e576bbf1f8c9364cd7d303 Mon Sep 17 00:00:00 2001 From: Emma Westerhoff Date: Sun, 24 Sep 2017 14:58:32 -0400 Subject: [PATCH 4/6] updating gene finder code, adding nitrogenase finder --- gene_finder.py | 21 +++++++++-------- nitrogenase_finder.py | 54 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 10 deletions(-) create mode 100644 nitrogenase_finder.py diff --git a/gene_finder.py b/gene_finder.py index bfe0c67..f3dbd85 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -10,7 +10,6 @@ from amino_acids import aa, codons, aa_table # you may find these useful from load import load_seq - def shuffle_string(s): """Shuffles the characters in the input string NOTE: this is a helper function, you do not @@ -236,23 +235,25 @@ def gene_finder(dna): dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ - # TODO: implement this - - threshold = longest_ORF_noncoding(dna, 4) + threshold = longest_ORF_noncoding(dna, 15) #change this to 1500 or so later dna_orfs = find_all_ORFs_both_strands(dna) + amino_sequences = [] + longs = [] for snip in dna_orfs: - if len(snip) < threshold: - dna_orfs.remove(snip) + if len(snip) > threshold: + longs.append(snip) + print(snip) - amino_string = coding_strand_to_AA(''.join(dna_orfs)) - print(amino_string) - return amino_string + for string in dna_orfs: + amino_sequences.append(coding_strand_to_AA(''.join(string))) + print('hey') + #print(amino_sequences) + return amino_sequences if __name__ == "__main__": import doctest #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True) dna = load_seq("./data/X73525.fa") - #longest_ORF_noncoding(dna, 4) gene_finder(dna) diff --git a/nitrogenase_finder.py b/nitrogenase_finder.py new file mode 100644 index 0000000..e543c57 --- /dev/null +++ b/nitrogenase_finder.py @@ -0,0 +1,54 @@ +""" +Extension on first project for Olin Software Design Fall 2017 + +@author: Emma Westerhoff + +""" + +from load import load_nitrogenase_seq, load_metagenome + +def longest_common_substring(string1, string2): #s length r, t ength n + """ Computes the longest common substring using dynamic programming + + >>> longest_common_substring('abcdefgqwertyuiop', 'xyabcdjipqwertyuiop') + 'gqwertyuiop' + """ + + x = len(string1) + y = len(string2) + + L = [[None]*(y) for a in range(x)] + + z = 0 + ret = '' + + for i in range(0, x): + for j in range(0, y): + if string1[i] == string2[j]: + if i == 0 or j == 0: + L[i][j] = 0 + else: + L[i][j] = L[i-1][j-1] + 1 + if L[i][j] > z: + z = L[i][j] + ret = string1[i-z:i+1] + #elif L[i][j] == z: + #ret.append(string1[i-z:i+1]) + else: + L[i][j] = 0 + return ret + +def nitrogen_fixation(x): + #TODO: implement this + pass + +if __name__ == "__main__": + nitrogenase = load_nitrogenase_seq() + metagenome = load_metagenome() + #metagenome is of form [('some info', 'actual sequence')] + #transform metagenome to proper form + + #import doctest + #doctest.run_docstring_examples(longest_common_substring, globals(), verbose=True) + #longest = longest_common_substring(nitrogenase, metagenome) + #print(longest) From 00b83e1089697cf134d923b8f7e884043ddee3b6 Mon Sep 17 00:00:00 2001 From: Emma Westerhoff Date: Sun, 24 Sep 2017 16:19:04 -0400 Subject: [PATCH 5/6] updating code for ninja checkin --- gene_finder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index f3dbd85..6415844 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -244,12 +244,12 @@ def gene_finder(dna): for snip in dna_orfs: if len(snip) > threshold: longs.append(snip) - print(snip) for string in dna_orfs: amino_sequences.append(coding_strand_to_AA(''.join(string))) - print('hey') + #print(amino_sequences) + #print(len(amino_sequences)) return amino_sequences if __name__ == "__main__": From aadb20e39fc2c39063d0bcf31fc1d851f26a8d29 Mon Sep 17 00:00:00 2001 From: Emma Westerhoff Date: Sun, 24 Sep 2017 16:57:39 -0400 Subject: [PATCH 6/6] final code --- gene_finder.py | 41 +++++++++++++++++++++++++---------------- nitrogenase_finder.py | 2 +- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 6415844..077e3e1 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -160,10 +160,16 @@ def find_all_ORFs_both_strands(dna): """ return_list = [] - return_list.append(''.join(find_all_ORFs(dna))) - return_list.append(''.join(find_all_ORFs(get_reverse_complement(dna)))) - return return_list + all_orfs_one = find_all_ORFs(dna) + all_orfs_two = find_all_ORFs(get_reverse_complement(dna)) + + for o in all_orfs_one: + return_list.append(o) + for a in all_orfs_two: + return_list.append(a) + #print(return_list) + return return_list def longest_ORF(dna): """ Finds the longest ORF on both strands of the specified DNA and returns it @@ -173,13 +179,16 @@ def longest_ORF(dna): """ orfs = find_all_ORFs_both_strands(dna) - longest_size=len(max(orfs,key=len)) + #longest_size=len(max(orfs,key=len)) + + longest_length = 0 for o in orfs: - if(longest_size==len(o)): - longest = o + if len(o) > longest_length: + longest_seq = o + longest_length = len(o) - return longest + return longest_seq def longest_ORF_noncoding(dna, num_trials): """ Computes the maximum length of the longest ORF over num_trials shuffles @@ -189,13 +198,16 @@ def longest_ORF_noncoding(dna, num_trials): num_trials: the number of random shuffles returns: the maximum length longest ORF """ lengths = [] + max_length = 0 for rand in range(0, num_trials): new_sequence = shuffle_string(dna) leng = len(longest_ORF(new_sequence)) - lengths.append(leng) + if(leng > max_length): + max_length = leng - maximum = max(lengths) - return maximum + # maximum = max(lengths) + print(max_length) + return max_length def coding_strand_to_AA(dna): @@ -235,7 +247,7 @@ def gene_finder(dna): dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ - threshold = longest_ORF_noncoding(dna, 15) + threshold = longest_ORF_noncoding(dna, 400) #change this to 1500 or so later dna_orfs = find_all_ORFs_both_strands(dna) amino_sequences = [] @@ -243,12 +255,9 @@ def gene_finder(dna): for snip in dna_orfs: if len(snip) > threshold: - longs.append(snip) - - for string in dna_orfs: - amino_sequences.append(coding_strand_to_AA(''.join(string))) + amino_sequences.append(coding_strand_to_AA(snip)) - #print(amino_sequences) + print(amino_sequences) #print(len(amino_sequences)) return amino_sequences diff --git a/nitrogenase_finder.py b/nitrogenase_finder.py index e543c57..4b43f50 100644 --- a/nitrogenase_finder.py +++ b/nitrogenase_finder.py @@ -47,7 +47,7 @@ def nitrogen_fixation(x): metagenome = load_metagenome() #metagenome is of form [('some info', 'actual sequence')] #transform metagenome to proper form - + #import doctest #doctest.run_docstring_examples(longest_common_substring, globals(), verbose=True) #longest = longest_common_substring(nitrogenase, metagenome)