-
Notifications
You must be signed in to change notification settings - Fork 18
Turning in MiniProject1 #5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
2f9e22d
75183c2
25623e8
e81dbda
00b83e1
aadb20e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,16 +1,15 @@ | ||
| # -*- coding: utf-8 -*- | ||
| """ | ||
| YOUR HEADER COMMENT HERE | ||
| First project for Olin Software Design Fall 2017 | ||
|
|
||
| @author: YOUR NAME HERE | ||
| @author: Emma Westerhoff | ||
|
|
||
| """ | ||
|
|
||
| import random | ||
| from amino_acids import aa, codons, aa_table # you may find these useful | ||
| from load import load_seq | ||
|
|
||
|
|
||
| def shuffle_string(s): | ||
| """Shuffles the characters in the input string | ||
| NOTE: this is a helper function, you do not | ||
|
|
@@ -30,9 +29,15 @@ def get_complement(nucleotide): | |
| >>> get_complement('C') | ||
| 'G' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| nucleotide_inputs = ['A', 'T', 'C', 'G'] | ||
| nucleotide_complements = ['T', 'A', 'G', 'C'] | ||
| i = 0 | ||
| complement = 'x' #lets the user know the complement was incorrectly computed | ||
| while i < len(nucleotide_inputs): | ||
| if nucleotide_inputs[i] == nucleotide: | ||
| complement = nucleotide_complements[i] | ||
| i += 1 | ||
| return complement | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like how you implemented this function without using any |
||
|
|
||
| def get_reverse_complement(dna): | ||
| """ Computes the reverse complementary sequence of DNA for the specified DNA | ||
|
|
@@ -45,9 +50,16 @@ def get_reverse_complement(dna): | |
| >>> get_reverse_complement("CCGCGTTCA") | ||
| 'TGAACGCGG' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| reverse = '' | ||
| i = 0 | ||
| length = len(dna) | ||
|
|
||
| while i < length: | ||
| letter = dna[length - 1 -i] #moves backwards along the string | ||
| pair = get_complement(letter) #finds complement | ||
| reverse = reverse + pair | ||
| i += 1 | ||
| return reverse | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fancy one line code (pythonic stylistic suggestion) : |
||
|
|
||
| def rest_of_ORF(dna): | ||
| """ Takes a DNA sequence that is assumed to begin with a start | ||
|
|
@@ -61,10 +73,25 @@ def rest_of_ORF(dna): | |
| 'ATG' | ||
| >>> rest_of_ORF("ATGAGATAGG") | ||
| 'ATGAGA' | ||
| >>> rest_of_ORF("ATTTCGGGT") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! Always add your own unit tests 👍 |
||
| 'ATTTCGGGT' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| stop_codons = ['TAG', 'TGA', 'TAA'] | ||
| codons = [] | ||
| n = 3 | ||
|
|
||
| for i in range(0, len(dna), n): | ||
| codons.append(dna[i:i+n]) | ||
|
|
||
| for c in range(0, len(codons)): | ||
| for s in range(0, len(stop_codons)): | ||
| if codons[c] == stop_codons[s]: | ||
| codons = codons[:c] | ||
| return_string = ''.join(codons) | ||
| return return_string | ||
|
|
||
| return_string = ''.join(codons) | ||
| return return_string | ||
|
|
||
| def find_all_ORFs_oneframe(dna): | ||
| """ Finds all non-nested open reading frames in the given DNA | ||
|
|
@@ -79,8 +106,23 @@ def find_all_ORFs_oneframe(dna): | |
| >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") | ||
| ['ATGCATGAATGTAGA', 'ATGTGCCC'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| start_codon = 'ATG' | ||
| codons = [] | ||
| n = 3 | ||
| ORFS = [] | ||
| c = 0 | ||
|
|
||
| for i in range(0, len(dna), n): | ||
| codons.append(dna[i:i+n]) | ||
|
|
||
| while c in range(0, len(codons)): | ||
| if codons[c] == start_codon: | ||
| dna_sequence = rest_of_ORF(''.join(codons[c:])) | ||
| ORFS.append(dna_sequence) | ||
| c += len(dna_sequence) #skips over the rest of the sequence | ||
| c += 1 #if I'm missing a permutation, this might be a problem. | ||
|
|
||
| return ORFS | ||
|
|
||
|
|
||
| def find_all_ORFs(dna): | ||
|
|
@@ -96,9 +138,16 @@ def find_all_ORFs(dna): | |
| >>> find_all_ORFs("ATGCATGAATGTAG") | ||
| ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| return_list = [] | ||
|
|
||
| for i in range (0,3): | ||
| #cases are coming through that occur in the same frame | ||
| orfs = find_all_ORFs_oneframe(dna[i:]) | ||
| for o in orfs: | ||
| result = ''.join(o) | ||
| if result != '': #if there are no permutations in a run through | ||
| return_list.append(result) | ||
| return return_list | ||
|
|
||
| def find_all_ORFs_both_strands(dna): | ||
| """ Finds all non-nested open reading frames in the given DNA sequence on both | ||
|
|
@@ -109,19 +158,37 @@ def find_all_ORFs_both_strands(dna): | |
| >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") | ||
| ['ATGCGAATG', 'ATGCTACATTCGCAT'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| return_list = [] | ||
|
|
||
| all_orfs_one = find_all_ORFs(dna) | ||
| all_orfs_two = find_all_ORFs(get_reverse_complement(dna)) | ||
|
|
||
| for o in all_orfs_one: | ||
| return_list.append(o) | ||
|
|
||
| for a in all_orfs_two: | ||
| return_list.append(a) | ||
| #print(return_list) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please remove commented lines when submitting your final code |
||
| return return_list | ||
|
|
||
def longest_ORF(dna):
    """ Finds the longest ORF on both strands of the specified DNA and returns it
        as a string

        dna: a DNA sequence
        returns: the longest ORF found on either strand, or the empty string
                 when neither strand contains an ORF

    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    >>> longest_ORF("CCC")
    ''
    """
    orfs = find_all_ORFs_both_strands(dna)

    # max() keeps the first of several equally long ORFs, matching a
    # strict "longer than the best so far" scan.  default='' covers
    # sequences with no ORF at all (e.g. an unlucky shuffle), which
    # would otherwise leave the result undefined.
    return max(orfs, key=len, default='')
|
|
||
| def longest_ORF_noncoding(dna, num_trials): | ||
| """ Computes the maximum length of the longest ORF over num_trials shuffles | ||
|
|
@@ -130,8 +197,17 @@ def longest_ORF_noncoding(dna, num_trials): | |
| dna: a DNA sequence | ||
| num_trials: the number of random shuffles | ||
| returns: the maximum length longest ORF """ | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One way to add unit tests for random functions is to use a fixed random seed. Check this documentation https://docs.python.org/3/library/random.html if this sounds interesting to you. |
||
| # TODO: implement this | ||
| pass | ||
| lengths = [] | ||
| max_length = 0 | ||
| for rand in range(0, num_trials): | ||
| new_sequence = shuffle_string(dna) | ||
| leng = len(longest_ORF(new_sequence)) | ||
| if(leng > max_length): | ||
| max_length = leng | ||
|
|
||
| # maximum = max(lengths) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove comment |
||
| print(max_length) | ||
| return max_length | ||
|
|
||
|
|
||
| def coding_strand_to_AA(dna): | ||
|
|
@@ -148,8 +224,21 @@ def coding_strand_to_AA(dna): | |
| >>> coding_strand_to_AA("ATGCCCGCTTT") | ||
| 'MPA' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| n = 3 | ||
| codons = [] | ||
| acids = '' | ||
|
|
||
| for i in range(0, len(dna), n): | ||
| codons.append(dna[i:i+n]) | ||
|
|
||
| if len(codons[-1]) < 3: | ||
| codons.pop(-1) | ||
|
|
||
| for c in codons: | ||
| amino = aa_table[c] | ||
| acids += ''.join(amino) | ||
|
|
||
| return acids | ||
|
|
||
|
|
||
| def gene_finder(dna): | ||
|
|
@@ -158,9 +247,22 @@ def gene_finder(dna): | |
| dna: a DNA sequence | ||
| returns: a list of all amino acid sequences coded by the sequence dna. | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| threshold = longest_ORF_noncoding(dna, 400) | ||
| #change this to 1500 or so later | ||
| dna_orfs = find_all_ORFs_both_strands(dna) | ||
| amino_sequences = [] | ||
| longs = [] | ||
|
|
||
| for snip in dna_orfs: | ||
| if len(snip) > threshold: | ||
| amino_sequences.append(coding_strand_to_AA(snip)) | ||
|
|
||
| print(amino_sequences) | ||
| #print(len(amino_sequences)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove comment |
||
| return amino_sequences | ||
|
|
||
| if __name__ == "__main__": | ||
| import doctest | ||
| doctest.testmod() | ||
| #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True) | ||
| dna = load_seq("./data/X73525.fa") | ||
| gene_finder(dna) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| """ | ||
| Extension on first project for Olin Software Design Fall 2017 | ||
| @author: Emma Westerhoff | ||
| """ | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice! I like the fact that you've learned the concept of dynamic programming. Something to think about: now that you've learned about recursion, what would be the pros and cons of using recursion vs. dynamic programming to tackle this problem? |
||
| from load import load_nitrogenase_seq, load_metagenome | ||
|
|
||
def longest_common_substring(string1, string2):
    """ Computes the longest common substring using dynamic programming

        string1: first string (its characters index the rows of the table)
        string2: second string (its characters index the columns)
        returns: the longest string that occurs contiguously in both
                 inputs; when several share the maximum length, the one
                 that ends earliest in string1 is returned. Returns ''
                 when the inputs have no character in common.

    >>> longest_common_substring('abcdefgqwertyuiop', 'xyabcdjipqwertyuiop')
    'qwertyuiop'
    >>> longest_common_substring('xabc', 'yabc')
    'abc'
    >>> longest_common_substring('a', 'a')
    'a'
    >>> longest_common_substring('abc', 'xyz')
    ''
    """
    best_length = 0
    best_end = 0  # exclusive end index of the best match inside string1

    # previous[j + 1] is the length of the common run ending at
    # string1[i - 1] and string2[j].  The extra leading 0 column means a
    # match on the first row/column needs no special case.  Only the
    # previous row is ever read, so the full matrix is not kept.
    width = len(string2) + 1
    previous = [0] * width

    for i, char1 in enumerate(string1):
        current = [0] * width
        for j, char2 in enumerate(string2):
            if char1 == char2:
                run = previous[j] + 1
                current[j + 1] = run
                if run > best_length:
                    best_length = run
                    best_end = i + 1
        previous = current

    return string1[best_end - best_length:best_end]
|
|
||
| def nitrogen_fixation(x): | ||
| #TODO: implement this | ||
| pass | ||
|
|
||
| if __name__ == "__main__": | ||
| nitrogenase = load_nitrogenase_seq() | ||
| metagenome = load_metagenome() | ||
| #metagenome is of form [('some info', 'actual sequence')] | ||
| #transform metagenome to proper form | ||
|
|
||
| #import doctest | ||
| #doctest.run_docstring_examples(longest_common_substring, globals(), verbose=True) | ||
| #longest = longest_common_substring(nitrogenase, metagenome) | ||
| #print(longest) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be more helpful to add some short description of what this project is about.