From 552de149b21bdf5ceede0607ea3553c220db80f3 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Tue, 17 May 2016 21:15:01 +0200
Subject: [PATCH 01/11] inserito lo script python che a partire da un file di
 allineamenti e un file di annotazioni su SNPs calcola e restituisce alcune
 statistiche.
---
src/input-bam/statistics.py | 264 ++++++++++++++++++++++++++++++++++++
1 file changed, 264 insertions(+)
create mode 100644 src/input-bam/statistics.py
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py
new file mode 100644
index 00000000..7c5df217
--- /dev/null
+++ b/src/input-bam/statistics.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+# - * -coding: utf - 8 - * -
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import progressbar
+import pysam
+
+
+def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir):
+ in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+
+ # return the number of SNPs in the file
+ snp_total_number = file_len(annotation_file)
+ logging.info('Number of Total SNPs: {0}'.format(snp_total_number))
+ ann_file = open(annotation_file)
+
+ # skip first line (comment if the file starts with the header)
+ ann_file.readline()
+
+ # read header info
+ header = ann_file.readline().split(file_delimiter)
+
+ # the fields to read
+ snp_header_index_start = header.index('chromStart')
+ snp_header_index_stop = header.index('chromEnd')
+ snp_header_index_name = header.index('name')
+ snp_header_observed = header.index('observed')
+
+ statistics_file = open(out_dir + 'statistics.txt', 'w')
+
+ # read annotation file
+ snp_valid = 0 # SNPs with almost 1 read aligned
+ snp_not_valid = 0 # SNPs with no read aligned
+ snp_count = 0
+ read_with_first_allele = 0
+ read_with_second_allele = 0
+ read_with_different_allele = 0
+ prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
+ for line in ann_file:
+ snp_count += 1
+ prgbar.update(snp_count)
+
+ splitted_line = line.split(file_delimiter)
+
+ snp_start = int(splitted_line[snp_header_index_start])
+ snp_stop = int(splitted_line[snp_header_index_stop])
+ snp_name = splitted_line[snp_header_index_name]
+ first_observed, second_observed = check_observed(
+ splitted_line[snp_header_observed])
+
+ frequences_observed = {'first': 0, 'second': 0, 'others': 0}
+
+ logging.info('Loaded snp: {0}'.format(snp_name))
+ statistics_file.write('\nLoaded snp: {0}'.format(snp_name))
+ frequences = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
+ count_read = 0
+ for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop):
+
+ logging.debug('Coverage at base {0} = {1}'.format(
+ pileupcolumn.pos, pileupcolumn.n))
+ for pileupread in pileupcolumn.pileups:
+ if not pileupread.is_del and not pileupread.is_refskip:
+ count_read += 1
+ base_value = pileupread.alignment.query_sequence[
+ pileupread.query_position]
+ logging.debug('Base in read = {0}, First observed = {1}, Second observed = {2}'.format(
+ base_value, first_observed, second_observed))
+ frequences[base_value] += 1
+ if base_value == first_observed:
+ frequences_observed['first'] += 1
+ read_with_first_allele += 1
+ elif base_value == second_observed:
+ frequences_observed['second'] += 1
+ read_with_second_allele += 1
+ else:
+ frequences_observed['others'] += 1
+ read_with_different_allele += 1
+ logging.debug(frequences)
+
+ if count_read != 0:
+ snp_valid += 1
+ # number of reads aligned with the SNP
+ logging.info(
+ '\nTotal number of read aligned with SNP: {0}'.format(count_read))
+ statistics_file.write(
+ '\nTotal number of read aligned with SNP: {0}'.format(count_read))
+
+ # number of reads aligned with the SNP in the first allele
+ logging.info('Number of read aligned with the first allele: {0}'.format(
+ frequences_observed['first']))
+ statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format(
+ frequences_observed['first']))
+
+ # reads aligned with the SNP in the first allele over all
+ logging.info('Percentual of read aligned with the first allele: {0}'.format(float(
+ frequences_observed['first']) / count_read * 100))
+ statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float(
+ frequences_observed['first']) / count_read * 100))
+
+ # number of reads aligned with the SNP in the second allele
+ logging.info('Number of read aligned with the second allele: {0}'.format(
+ frequences_observed['second']))
+ statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format(
+ frequences_observed['second']))
+
+ # reads aligned with the SNP in the second allele over all
+ logging.info('Percentual of read aligned with the second allele: {0}'.format(float(
+ frequences_observed['second']) / count_read * 100))
+ statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float(
+ frequences_observed['second']) / count_read * 100))
+
+ # reads aligned with the SNP - no first or second allele
+ logging.info('Number of read aligned with different allele: {0}'.format(
+ frequences_observed['others']))
+ statistics_file.write('\nPercentual of read aligned with different allele:: {0}'.format(
+ float(frequences_observed['others']) / count_read) * 100)
+
+ else:
+ snp_not_valid += 1
+ logging.info(
+ '\nNo reads aligned with this snp: {0}'.format(snp_name))
+ statistics_file.write(
+ '\nNo reads aligned with this snp: {0}'.format(snp_name))
+
+ prgbar.finish()
+ logging.info(
+ '\n*************************************************************')
+ logging.info('Processed {0} SNPs'.format(snp_count))
+ statistics_file.write('\nProcessed {0} SNPs'.format(snp_count))
+
+ logging.info('SNPs with almost one read aligned (%): {0}'.format(
+ float(snp_valid) / snp_count) * 100)
+ statistics_file.write('\nSNPs with almost one read aligned (%): {0}'.format(
+ float(snp_valid) / snp_count) * 100)
+ logging.info('SNPs with no read aligned (%): {0}'.format(
+ float(snp_not_valid) / snp_count) * 100)
+ statistics_file.write('\nSNPs with no read aligned (%): {0}'.format(
+ float(snp_not_valid) / snp_count) * 100)
+
+ logging.info('Total number of read aligned with the first allele: {0}'.format(
+ read_with_first_allele))
+ statistics_file.write(
+ '\nTotal number of read aligned with the first allele: {0}'.format(read_with_first_allele))
+
+ logging.info('Total number of read aligned with the second allele: {0}'.format(
+ read_with_second_allele))
+ statistics_file.write(
+ '\nTotal number of read aligned with the second allele: {0}'.format(read_with_second_allele))
+ logging.info('Total number of read aligned with different allele: {0}'.format(
+ read_with_different_allele))
+ statistics_file.write(
+ '\nTotal number of read aligned with different allele: {0}'.format(read_with_different_allele))
+
+ logging.info('Generation of statistics: Completed')
+ statistics_file.close()
+ ann_file.close()
+ in_sam.close()
+
+
+def main():
+
+ parser = argparse.ArgumentParser(
+ description='Generate statistics using alignment BAM file and SNPs annotation file')
+ parser.add_argument('-b',
+ action='store',
+ dest='fileBAM',
+ help='Alignment file in BAM format.',
+ required=True)
+ parser.add_argument('-a',
+ action='store',
+ dest='annotationFile',
+ help='SNPs location annotation file.',
+ required=True)
+ parser.add_argument('-r',
+ action='store',
+ dest='region',
+ help='Region to be examined in the form: chr:start-stop. Ex. 1:100-200.')
+ parser.add_argument('-o',
+ action='store',
+ dest='output_dir',
+ help='Output (root) directory.',
+ required=True)
+ parser.add_argument('-v',
+ help='increase output verbosity',
+ action='count')
+
+ args = parser.parse_args()
+
+ in_bam_file = args.fileBAM
+ in_region = args.region
+ out_root_dir = args.output_dir
+
+ if args.v is None:
+ log_level = logging.INFO
+ else:
+ log_level = logging.DEBUG
+
+ logging.basicConfig(level=log_level,
+ format='%(levelname)-8s [%(asctime)s] %(message)s',
+ datefmt='%y%m%d %H%M%S')
+
+ if not os.path.exists(out_root_dir):
+ logging.error('Output dir not found.')
+ sys.exit(1)
+
+ logging.info('StatisticsGenerator: Program Started')
+
+ region = check_region(in_region)
+
+ out_dir = out_root_dir + '/'
+
+ if not os.path.exists(out_dir):
+ os.mkdir(out_dir)
+
+ ann_file_delimiter = '\t'
+
+ generate_statistics(args.annotationFile,
+ ann_file_delimiter, region, in_bam_file, out_dir)
+
+ logging.info('StatisticsGenerator: Program Completed')
+
+
+# utility
+
+def check_observed(observed_couple):
+ # observed are in the form A/T, T/-, -/C
+ regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)')
+ couple = regexpr_snp.search(observed_couple)
+ return couple.group(1), couple.group(2)
+
+
+def file_len(file):
+ # return the length (number of rows) of a file
+ with open(file) as f:
+ for i, l in enumerate(f):
+ pass
+ return i
+
+
+def check_region(input_region):
+ # if no region is set, is used all the 1st chromosome (for test purpose)
+ if input_region is not None:
+ regexp_reg = re.compile(r'^(\w+):(\d+)-(\d+)')
+ region = re.search(regexp_reg, input_region)
+ if region is not None:
+ in_chr = region.group(1)
+ in_start = int(region.group(2))
+ in_stop = int(region.group(3))
+ if in_start > in_stop:
+ logging.error('Wrong input region.')
+ sys.exit(1)
+ else:
+ return in_chr, in_start, in_stop
+ else:
+ return '1'
+
+
+if __name__ == '__main__':
+ main()
From 22d3c35bf54d7abdcfadebd28313128904268fe8 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Tue, 17 May 2016 23:03:45 +0200
Subject: [PATCH 02/11] Inserimento script per conversione BAM to WIF. Da
completare!
---
src/input-bam/bamToWif.py | 193 ++++++++++++++++++++++++++++++++++++++
1 file changed, 193 insertions(+)
create mode 100644 src/input-bam/bamToWif.py
diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py
new file mode 100644
index 00000000..961947f3
--- /dev/null
+++ b/src/input-bam/bamToWif.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# - * -coding: utf - 8 - * -
+
+import logging
+import os
+import sys
+import argparse
+import progressbar
+import re
+import pprint
+
+import pysam
+
+
+def check_observed(observed_couple):
+ # observed are in the form A/T, T/-, -/C
+ regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)')
+ couple = regexpr_snp.search(observed_couple)
+ return couple.group(1), couple.group(2)
+
+
+def file_len(file):
+ # return the length (number of rows) of a file
+ with open(file) as f:
+ for i, l in enumerate(f):
+ pass
+ return i
+
+
+def snpsInRead(alignment_file, annotation_file, file_delimiter):
+
+ in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+
+ # return the number of SNPs in the file
+ # snp_total_number = file_len(annotation_file)
+
+ total_fetch_alignment = in_sam.fetch()
+ total_read_number = in_sam.count()
+
+ read_count = 0
+ read_bar = progressbar.ProgressBar(maxval=total_read_number).start()
+ snp_read_list = {}
+ for read in total_fetch_alignment:
+ read_count += 1
+ read_bar.update(read_count)
+
+ read_qname = read.qname
+ read_start = read.reference_start
+ read_end = read.reference_end
+ logging.debug('read {0}, start: {1}, end: {2}'.format(
+ read_qname, read_start, read_end))
+
+ ann_file = open(annotation_file)
+
+ # skip first line (comment if the file starts with the header)
+ # ann_file.readline()
+
+ # Header info - first line
+ header = ann_file.readline().split(file_delimiter)
+
+ # the fields to read
+ snp_header_index_chromosome = header.index('chrom')
+ snp_header_index_start = header.index('chromStart')
+ snp_header_index_stop = header.index('chromEnd')
+ snp_header_index_observed = header.index('observed')
+ snp_read_list[read_qname] = []
+ # snp_bar = progressbar.ProgressBar(maxval=snp_total_number).start()
+ # snp_count = 0
+ for line in ann_file:
+ # snp_count += 1
+ # snp_bar.update(snp_count)
+
+ splitted_line = line.split(file_delimiter)
+
+ snp_chrom = splitted_line[snp_header_index_chromosome]
+ snp_start = int(splitted_line[snp_header_index_start])
+ snp_stop = int(splitted_line[snp_header_index_stop])
+ first_observed, second_observed = check_observed(
+ splitted_line[snp_header_index_observed])
+
+ # Check every read aligned with the SNPs
+ for pileupcolumn in in_sam.pileup(snp_chrom, snp_start, snp_stop):
+ for pileupread in pileupcolumn.pileups:
+ read_snp_name = pileupread.alignment.query_name
+ logging.debug('Found read: {0}'.format(read_snp_name))
+ if read_snp_name == read_qname:
+ if not pileupread.is_del and not pileupread.is_refskip:
+ read_snp_position = pileupread.query_position
+ base_value = pileupread.alignment.query_sequence[
+ pileupread.query_position]
+ if base_value == first_observed:
+ allele = '0'
+ elif base_value == second_observed:
+ allele = '1'
+ else:
+ # skip the SNP if not in first or second allele
+ continue
+ base_quality = pileupread.alignment.query_alignment_qualities[
+ read_snp_position]
+ # if read_snp_name == read_qname and
+ # read_snp_position >= read_start and
+ # read_snp_position <= read_end:
+
+ snp_read_list[read_qname].append(
+ [read_snp_position, base_value, allele, base_quality])
+
+ ann_file.close()
+ snp_read_list[read_qname].append(read.mapping_quality)
+ # snp_bar.finish()
+ read_bar.finish()
+ return snp_read_list
+
+
+def convertBamToWif(snp_read_list, out_dir):
+
+ output_file = open(out_dir + 'alignment.wif', 'w')
+
+ for read, snp_list in snp_read_list.items():
+ if len(snp_list) <= 1:
+ continue
+ else:
+ snp_mapping_quality = snp_list[-1]
+ for snp in snp_list[:-1]:
+ output_file.write('{0} {1} {2} {3} : '.format(
+ snp[0], snp[1], snp[2], snp[3]))
+ output_file.write('# {0} : NA\n'.format(snp_mapping_quality))
+
+
+def main():
+
+ parser = argparse.ArgumentParser(
+ description='Convert an alignment file .bam in a .wif Hapcol compatible file')
+ parser.add_argument('-b',
+ action='store',
+ dest='fileBAM',
+ help='Alignment file in BAM format.',
+ required=True)
+ parser.add_argument('-a',
+ action='store',
+ dest='annotationFile',
+ help='SNPs annotation file.',
+ required=True)
+ parser.add_argument('-o',
+ action='store',
+ dest='output_dir',
+ help='Output (root) directory.',
+ required=True)
+ parser.add_argument('-v',
+ help='increase output verbosity',
+ action='count')
+
+ args = parser.parse_args()
+
+ logging.info("#### STEP 1 - PROCESSING INPUT ARGUMENTS ####")
+
+ in_bam_file = args.fileBAM
+ out_root_dir = args.output_dir
+
+ if args.v == 0:
+ log_level = logging.INFO
+ else:
+ log_level = logging.DEBUG
+
+ logging.basicConfig(level=log_level,
+ format='%(levelname)-8s [%(asctime)s] %(message)s',
+ datefmt='%y%m%d %H%M%S')
+
+ if not os.path.exists(out_root_dir):
+ logging.error('Output dir not found.')
+ sys.exit(1)
+
+ logging.info('BamToWif: Program Started')
+
+ out_dir = out_root_dir + '/'
+
+ if not os.path.exists(out_dir):
+ os.mkdir(out_dir)
+
+ ann_file_delimiter = '\t'
+
+ logging.info("#### STEP 2 - RETREIVING SNPs FOR READS ####")
+
+ snps_list = snpsInRead(in_bam_file, args.annotationFile,
+ ann_file_delimiter)
+
+ logging.info("#### STEP 3 - CONVERTING BAM TO WIF ####")
+
+ convertBamToWif(snps_list, out_dir)
+
+ logging.info('BamToWif: Program Completed')
+
+if __name__ == '__main__':
+ main()
From 9c219b26230df544340d875f828c7d0660d6c530 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Wed, 18 May 2016 21:56:21 +0200
Subject: [PATCH 03/11] qualche correzione ai log
---
src/input-bam/statistics.py | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py
index 7c5df217..9c81e13c 100644
--- a/src/input-bam/statistics.py
+++ b/src/input-bam/statistics.py
@@ -86,7 +86,7 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
snp_valid += 1
# number of reads aligned with the SNP
logging.info(
- '\nTotal number of read aligned with SNP: {0}'.format(count_read))
+ 'Total number of read aligned with SNP: {0}'.format(count_read))
statistics_file.write(
'\nTotal number of read aligned with SNP: {0}'.format(count_read))
@@ -96,7 +96,7 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format(
frequences_observed['first']))
- # reads aligned with the SNP in the first allele over all
+ # reads aligned with the SNP in the first allele
logging.info('Percentual of read aligned with the first allele: {0}'.format(float(
frequences_observed['first']) / count_read * 100))
statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float(
@@ -108,28 +108,36 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format(
frequences_observed['second']))
- # reads aligned with the SNP in the second allele over all
+ # reads aligned with the SNP in the second allele
logging.info('Percentual of read aligned with the second allele: {0}'.format(float(
frequences_observed['second']) / count_read * 100))
statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float(
frequences_observed['second']) / count_read * 100))
- # reads aligned with the SNP - no first or second allele
+ # reads aligned with the SNP - different allele
logging.info('Number of read aligned with different allele: {0}'.format(
frequences_observed['others']))
- statistics_file.write('\nPercentual of read aligned with different allele:: {0}'.format(
+ statistics_file.info('\nNumber of read aligned with different allele: {0}'.format(
+ frequences_observed['others']))
+
+ logging.info('Percentual of read aligned with different allele: {0}'.format(
+ float(frequences_observed['others']) / count_read) * 100)
+ statistics_file.write('\nPercentual of read aligned with different allele: {0}'.format(
float(frequences_observed['others']) / count_read) * 100)
else:
snp_not_valid += 1
logging.info(
- '\nNo reads aligned with this snp: {0}'.format(snp_name))
+ 'No reads aligned with this snp: {0}'.format(snp_name))
statistics_file.write(
'\nNo reads aligned with this snp: {0}'.format(snp_name))
prgbar.finish()
logging.info(
- '\n*************************************************************')
+ '#### TOTAL RESULTS ####')
+ statistics_file.write(
+ '\n#### TOTAL RESULTS ####')
+
logging.info('Processed {0} SNPs'.format(snp_count))
statistics_file.write('\nProcessed {0} SNPs'.format(snp_count))
From dcf3cc3c305cdc637f94cce4acb08de8564e8b93 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Wed, 18 May 2016 21:57:50 +0200
Subject: [PATCH 04/11] qualche correzione ai log
---
src/input-bam/bamToWif.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py
index 961947f3..98f93cf9 100644
--- a/src/input-bam/bamToWif.py
+++ b/src/input-bam/bamToWif.py
@@ -151,7 +151,7 @@ def main():
args = parser.parse_args()
- logging.info("#### STEP 1 - PROCESSING INPUT ARGUMENTS ####")
+ logging.info('#### STEP 1 - PROCESSING INPUT ARGUMENTS ####')
in_bam_file = args.fileBAM
out_root_dir = args.output_dir
@@ -178,12 +178,12 @@ def main():
ann_file_delimiter = '\t'
- logging.info("#### STEP 2 - RETREIVING SNPs FOR READS ####")
+ logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####')
snps_list = snpsInRead(in_bam_file, args.annotationFile,
ann_file_delimiter)
- logging.info("#### STEP 3 - CONVERTING BAM TO WIF ####")
+ logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####')
convertBamToWif(snps_list, out_dir)
From 3bcd3a11072b3c7bd9b90462d712f5787543ea64 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Wed, 18 May 2016 22:16:48 +0200
Subject: [PATCH 05/11] =?UTF-8?q?Aggiunta=20possibilit=C3=A0=20di=20decide?=
=?UTF-8?q?re=20in=20fase=20di=20esecuzione=20dello=20script=20il=20delimi?=
=?UTF-8?q?tatore=20dei=20campi=20dei=20file=20di=20annotazione=20e=20lo?=
=?UTF-8?q?=20skip=20della=20prima=20riga?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
src/input-bam/bamToWif.py | 20 ++++++++++++++++----
src/input-bam/statistics.py | 20 ++++++++++++++++----
2 files changed, 32 insertions(+), 8 deletions(-)
diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py
index 98f93cf9..c7c24e53 100644
--- a/src/input-bam/bamToWif.py
+++ b/src/input-bam/bamToWif.py
@@ -27,7 +27,7 @@ def file_len(file):
return i
-def snpsInRead(alignment_file, annotation_file, file_delimiter):
+def snpsInRead(alignment_file, annotation_file, file_delimiter, skip_first_line):
in_sam = pysam.AlignmentFile(alignment_file, 'rb')
@@ -53,7 +53,8 @@ def snpsInRead(alignment_file, annotation_file, file_delimiter):
ann_file = open(annotation_file)
# skip first line (comment if the file starts with the header)
- # ann_file.readline()
+ if skip_first_line:
+ ann_file.readline()
# Header info - first line
header = ann_file.readline().split(file_delimiter)
@@ -145,6 +146,14 @@ def main():
dest='output_dir',
help='Output (root) directory.',
required=True)
+ parser.add_argument('-skip-first-line',
+ action='store',
+ dest='skip_first_line',
+ help='set this flag if the first line IS NOT the header')
+ parser.add_argument('-file-delimiter',
+ action='store',
+ dest='file_delim',
+ help='set the file delimiter for the annotation file. Default: \\t')
parser.add_argument('-v',
help='increase output verbosity',
action='count')
@@ -176,12 +185,15 @@ def main():
if not os.path.exists(out_dir):
os.mkdir(out_dir)
- ann_file_delimiter = '\t'
+ if args.file_delim is None:
+ ann_file_delimiter = '\t'
+ else:
+ ann_file_delimiter = args.file_delimiter
logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####')
snps_list = snpsInRead(in_bam_file, args.annotationFile,
- ann_file_delimiter)
+ ann_file_delimiter, args.skip_first_line)
logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####')
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py
index 9c81e13c..af123a39 100644
--- a/src/input-bam/statistics.py
+++ b/src/input-bam/statistics.py
@@ -11,7 +11,7 @@
import pysam
-def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir):
+def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir, skip_first_line):
in_sam = pysam.AlignmentFile(alignment_file, 'rb')
# return the number of SNPs in the file
@@ -20,7 +20,8 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
ann_file = open(annotation_file)
# skip first line (comment if the file starts with the header)
- ann_file.readline()
+ if skip_first_line:
+ ann_file.readline()
# read header info
header = ann_file.readline().split(file_delimiter)
@@ -193,6 +194,14 @@ def main():
dest='output_dir',
help='Output (root) directory.',
required=True)
+ parser.add_argument('-skip-first-line',
+ action='store',
+ dest='skip_first_line',
+ help='set this flag if the first line IS NOT the header')
+ parser.add_argument('-file-delimiter',
+ action='store',
+ dest='file_delim',
+ help='set the file delimiter for the annotation file. Default: \\t')
parser.add_argument('-v',
help='increase output verbosity',
action='count')
@@ -225,10 +234,13 @@ def main():
if not os.path.exists(out_dir):
os.mkdir(out_dir)
- ann_file_delimiter = '\t'
+ if args.file_delim is None:
+ ann_file_delimiter = '\t'
+ else:
+ ann_file_delimiter = args.file_delimiter
generate_statistics(args.annotationFile,
- ann_file_delimiter, region, in_bam_file, out_dir)
+ ann_file_delimiter, region, in_bam_file, out_dir, args.skip_first_line)
logging.info('StatisticsGenerator: Program Completed')
From 1f0a0ef6abbcd53904471609d779189700292ffa Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Sun, 22 May 2016 23:43:12 +0200
Subject: [PATCH 06/11] fix sui log
---
src/input-bam/statistics.py | 107 ++++++++++++++----------------------
1 file changed, 41 insertions(+), 66 deletions(-)
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py
index af123a39..a4225062 100644
--- a/src/input-bam/statistics.py
+++ b/src/input-bam/statistics.py
@@ -11,7 +11,7 @@
import pysam
-def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir, skip_first_line):
+def generate_statistics(annotation_file, file_delimiter, region, alignment_file, skip_first_line):
in_sam = pysam.AlignmentFile(alignment_file, 'rb')
# return the number of SNPs in the file
@@ -32,8 +32,6 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
snp_header_index_name = header.index('name')
snp_header_observed = header.index('observed')
- statistics_file = open(out_dir + 'statistics.txt', 'w')
-
# read annotation file
snp_valid = 0 # SNPs with almost 1 read aligned
snp_not_valid = 0 # SNPs with no read aligned
@@ -57,11 +55,10 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
frequences_observed = {'first': 0, 'second': 0, 'others': 0}
logging.info('Loaded snp: {0}'.format(snp_name))
- statistics_file.write('\nLoaded snp: {0}'.format(snp_name))
+
frequences = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
count_read = 0
for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop):
-
logging.debug('Coverage at base {0} = {1}'.format(
pileupcolumn.pos, pileupcolumn.n))
for pileupread in pileupcolumn.pileups:
@@ -88,85 +85,51 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file,
# number of reads aligned with the SNP
logging.info(
'Total number of read aligned with SNP: {0}'.format(count_read))
- statistics_file.write(
- '\nTotal number of read aligned with SNP: {0}'.format(count_read))
# number of reads aligned with the SNP in the first allele
- logging.info('Number of read aligned with the first allele: {0}'.format(
- frequences_observed['first']))
- statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format(
- frequences_observed['first']))
-
- # reads aligned with the SNP in the first allele
- logging.info('Percentual of read aligned with the first allele: {0}'.format(float(
- frequences_observed['first']) / count_read * 100))
- statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float(
- frequences_observed['first']) / count_read * 100))
+ logging.info('Number of read aligned with the first allele: {0} ({1}%)'.format(
+ frequences_observed['first'], round((float(
+ frequences_observed['first']) / count_read) * 100, 2)))
# number of reads aligned with the SNP in the second allele
- logging.info('Number of read aligned with the second allele: {0}'.format(
- frequences_observed['second']))
- statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format(
- frequences_observed['second']))
-
- # reads aligned with the SNP in the second allele
- logging.info('Percentual of read aligned with the second allele: {0}'.format(float(
- frequences_observed['second']) / count_read * 100))
- statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float(
- frequences_observed['second']) / count_read * 100))
+ logging.info('Number of read aligned with the second allele: {0} ({1}%)'.format(
+ frequences_observed['second'], round((float(
+ frequences_observed['second']) / count_read) * 100, 2)))
# reads aligned with the SNP - different allele
- logging.info('Number of read aligned with different allele: {0}'.format(
- frequences_observed['others']))
- statistics_file.info('\nNumber of read aligned with different allele: {0}'.format(
- frequences_observed['others']))
-
- logging.info('Percentual of read aligned with different allele: {0}'.format(
- float(frequences_observed['others']) / count_read) * 100)
- statistics_file.write('\nPercentual of read aligned with different allele: {0}'.format(
- float(frequences_observed['others']) / count_read) * 100)
+ logging.info('Number of read aligned with different allele: {0} ({1}%)'.format(
+ frequences_observed['others'], round((float(
+ frequences_observed['others']) / count_read) * 100, 2)))
else:
snp_not_valid += 1
logging.info(
'No reads aligned with this snp: {0}'.format(snp_name))
- statistics_file.write(
- '\nNo reads aligned with this snp: {0}'.format(snp_name))
prgbar.finish()
logging.info(
'#### TOTAL RESULTS ####')
- statistics_file.write(
- '\n#### TOTAL RESULTS ####')
logging.info('Processed {0} SNPs'.format(snp_count))
- statistics_file.write('\nProcessed {0} SNPs'.format(snp_count))
- logging.info('SNPs with almost one read aligned (%): {0}'.format(
- float(snp_valid) / snp_count) * 100)
- statistics_file.write('\nSNPs with almost one read aligned (%): {0}'.format(
- float(snp_valid) / snp_count) * 100)
- logging.info('SNPs with no read aligned (%): {0}'.format(
- float(snp_not_valid) / snp_count) * 100)
- statistics_file.write('\nSNPs with no read aligned (%): {0}'.format(
- float(snp_not_valid) / snp_count) * 100)
+ logging.info('SNPs with almost one read aligned: {0} ({1}%)'.format(
+ snp_valid, round(((float(
+ snp_valid) / snp_count) * 100), 2)))
+
+ logging.info('SNPs with no read aligned: {0} ({1}%)'.format(
+ snp_not_valid, round(((float(
+ snp_not_valid) / snp_count) * 100), 2)))
logging.info('Total number of read aligned with the first allele: {0}'.format(
read_with_first_allele))
- statistics_file.write(
- '\nTotal number of read aligned with the first allele: {0}'.format(read_with_first_allele))
logging.info('Total number of read aligned with the second allele: {0}'.format(
read_with_second_allele))
- statistics_file.write(
- '\nTotal number of read aligned with the second allele: {0}'.format(read_with_second_allele))
+
logging.info('Total number of read aligned with different allele: {0}'.format(
read_with_different_allele))
- statistics_file.write(
- '\nTotal number of read aligned with different allele: {0}'.format(read_with_different_allele))
logging.info('Generation of statistics: Completed')
- statistics_file.close()
ann_file.close()
in_sam.close()
@@ -195,7 +158,7 @@ def main():
help='Output (root) directory.',
required=True)
parser.add_argument('-skip-first-line',
- action='store',
+ action='store_true',
dest='skip_first_line',
help='set this flag if the first line IS NOT the header')
parser.add_argument('-file-delimiter',
@@ -217,30 +180,28 @@ def main():
else:
log_level = logging.DEBUG
- logging.basicConfig(level=log_level,
- format='%(levelname)-8s [%(asctime)s] %(message)s',
- datefmt='%y%m%d %H%M%S')
-
if not os.path.exists(out_root_dir):
logging.error('Output dir not found.')
sys.exit(1)
- logging.info('StatisticsGenerator: Program Started')
-
- region = check_region(in_region)
-
out_dir = out_root_dir + '/'
if not os.path.exists(out_dir):
os.mkdir(out_dir)
+ prepare_loggers(out_dir, log_level)
+
+ logging.info('StatisticsGenerator: Program Started')
+
+ region = check_region(in_region)
+
if args.file_delim is None:
ann_file_delimiter = '\t'
else:
ann_file_delimiter = args.file_delimiter
generate_statistics(args.annotationFile,
- ann_file_delimiter, region, in_bam_file, out_dir, args.skip_first_line)
+ ann_file_delimiter, region, in_bam_file, args.skip_first_line)
logging.info('StatisticsGenerator: Program Completed')
@@ -262,6 +223,20 @@ def file_len(file):
return i
+def prepare_loggers(out_dir, log_level):
+ logging.basicConfig(filename=out_dir + 'statistics.txt',
+ filemode='w',
+ level=log_level,
+ format='%(levelname)-8s [%(asctime)s] %(message)s',
+ datefmt='%y%m%d %H:%M:%S')
+ console = logging.StreamHandler()
+ console.setLevel(logging.INFO)
+ formatter = logging.Formatter(
+ '[%(levelname)-8s] %(asctime)s - %(message)s')
+ console.setFormatter(formatter)
+ logging.getLogger('').addHandler(console)
+
+
def check_region(input_region):
# if no region is set, is used all the 1st chromosome (for test purpose)
if input_region is not None:
From 1cb00ed95d36f8f7d434389af71b031748bf9d99 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Mon, 13 Jun 2016 23:44:48 +0200
Subject: [PATCH 07/11] nuova versione. Carica in memoria la mappa di SNPs, la
scorre e per ogni read allineato allo snp controlla se questo si allinea con
l'allele di maggioranza, di minoranza o altro.
---
src/input-bam/statistics_v2.py | 220 +++++++++++++++++++++++++++++++++
1 file changed, 220 insertions(+)
create mode 100644 src/input-bam/statistics_v2.py
diff --git a/src/input-bam/statistics_v2.py b/src/input-bam/statistics_v2.py
new file mode 100644
index 00000000..87867e1b
--- /dev/null
+++ b/src/input-bam/statistics_v2.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+# - * -coding: utf - 8 - * -
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import progressbar
+import pysam
+import time
+
+
+def generate_statistics(snp_map, alignment_file):
+ logging.debug('Started method generate_statistics')
+ start_time = time.time()
+
+ in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+
+ snp_total_number = len(snp_map)
+
+ snp_count = 0
+ prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
+ count_first_allele = 0
+ count_second_allele = 0
+ count_other_allele = 0
+
+ read_count = 0
+ for snp_name, snp_values in snp_map.items():
+ snp_count += 1
+ prgbar.update(snp_count)
+ logging.debug('Loaded snp: {0}'.format(snp_name))
+ snp_chrom = snp_values[0]
+ snp_start = snp_values[1]
+ snp_end = snp_values[2]
+ first_observed = snp_values[3]
+ second_observed = snp_values[4]
+ logging.debug(
+ 'Reading chromosome: {0}, Starting: {1} - Ending{2}'.format(snp_chrom, snp_start, snp_end))
+ fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
+ tot_fetch_aln = in_sam.count(snp_chrom, snp_start, snp_end)
+ logging.info(
+ 'Number of reads aligned with snp: {0}'.format(tot_fetch_aln))
+
+ if tot_fetch_aln == 0 or tot_fetch_aln < 10:
+ logging.debug('SNP without at least ten items will be discarded.')
+ continue
+
+ for read in fetch_aln:
+ read_count += 1
+ read_name = read.query_name
+ read_start = read.reference_start
+ read_end = read.reference_end
+ read_sequence = read.query_sequence
+ logging.debug('Loaded read: {0}'.format(read_name))
+
+ if snp_start >= read_start and snp_end <= read_end:
+ logging.debug('In the read: {0}, in the SNP (first): {1}, in the SNP (second): {2}'.format(
+ read_sequence[snp_start:snp_end], first_observed, second_observed))
+ if read_sequence[snp_start:snp_end] == first_observed:
+ count_first_allele += 1
+ elif read_sequence[snp_start:snp_end] == second_observed:
+ count_second_allele += 1
+ elif not read_sequence[snp_start:snp_end].strip():
+ logging.debug('No valid read found.')
+ else:
+ count_other_allele += 1
+ prgbar.finish()
+
+ logging.info('Total number of SNPs: {0}'.format(snp_total_number))
+
+ logging.info('Total number of reads: {0}'.format(read_count))
+
+ logging.info('Total number of read aligned with the first allele: {0} ({1}%)'.format(
+ count_first_allele, (float(count_first_allele) / read_count) * 100))
+
+ logging.info('Total number of read aligned with the second allele: {0} ({1}%)'.format(
+ count_second_allele, (float(count_second_allele) / read_count) * 100))
+
+ logging.info('Total number of read aligned with other allele: {0} ({1}%)'.format(
+ count_other_allele, (float(count_other_allele) / read_count) * 100))
+
+ logging.debug('Finished method generate_statistics in {0} seconds'.format(
+ time.time() - start_time))
+ logging.info('Generation of statistics: Completed')
+ in_sam.close()
+
+
+def main():
+
+ parser = argparse.ArgumentParser(
+ description='Generate statistics using alignment BAM file and SNPs annotation file')
+ parser.add_argument('-b',
+ action='store',
+ dest='fileBAM',
+ help='Alignment file in BAM format.',
+ required=True)
+ parser.add_argument('-a',
+ action='store',
+ dest='annotationFile',
+ help='SNPs location annotation file.',
+ required=True)
+ parser.add_argument('-o',
+ action='store',
+ dest='output_dir',
+ help='Output (root) directory.',
+ required=True)
+ parser.add_argument('-skip-first-line',
+ action='store_true',
+ dest='skip_first_line',
+ help='set this flag if the first line IS NOT the header')
+ parser.add_argument('-file-delimiter',
+ action='store',
+ dest='file_delim',
+ help='set the file delimiter for the annotation file. Default: \\t')
+ parser.add_argument('-v',
+ help='increase output verbosity',
+ dest='verbosity',
+ action='store_true')
+
+ args = parser.parse_args()
+
+ in_bam_file = args.fileBAM
+ out_root_dir = args.output_dir
+
+ if args.verbosity:
+ log_level = logging.DEBUG
+ else:
+ log_level = logging.INFO
+
+ if not os.path.exists(out_root_dir):
+ logging.error('Output dir not found.')
+ sys.exit(1)
+
+ out_dir = out_root_dir + '/'
+
+ if not os.path.exists(out_dir):
+ os.mkdir(out_dir)
+
+ prepare_loggers(log_level)
+
+ logging.info('Program Started')
+
+ if args.file_delim is None:
+ ann_file_delimiter = '\t'
+ else:
+ ann_file_delimiter = args.file_delimiter
+
+ logging.info('#### STEP 1 - Loading SNPs ####')
+
+ snp_map = reading_snp(args.annotationFile,
+ ann_file_delimiter, args.skip_first_line)
+
+ logging.info('#### STEP 2 - Generating Statistics ####')
+
+ generate_statistics(snp_map, in_bam_file)
+
+ logging.info('Program Completed')
+
+
+# utility
+
+def check_observed(observed_couple):
+ # observed are in the form A/T, T/-, -/C
+ regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)')
+ couple = regexpr_snp.search(observed_couple)
+ return couple.group(1), couple.group(2)
+
+
+def prepare_loggers(log_level):
+ logging.basicConfig(level=log_level,
+ format='%(levelname)-8s [%(asctime)s] %(message)s',
+ datefmt='%y%m%d %H:%M:%S')
+
+
+def reading_snp(annotation_file, file_delimiter, flag_first_line):
+
+ logging.debug('Started method reading_snp')
+ start_time = time.time()
+ ann_file = open(annotation_file)
+
+ # skip first line (comment if the file starts with the header)
+ if flag_first_line:
+ ann_file.readline()
+
+ # read header info
+ header = ann_file.readline().split(file_delimiter)
+
+ # the fields to read
+ snp_header_index_chrom = header.index('chrom')
+ snp_header_index_start = header.index('chromStart')
+ snp_header_index_stop = header.index('chromEnd')
+ snp_header_index_name = header.index('name')
+ snp_header_observed = header.index('observed')
+ snp_map = {}
+ for line in ann_file:
+ splitted_line = line.split(file_delimiter)
+
+ snp_chrom = splitted_line[snp_header_index_chrom].split('chr', 1)[1]
+ snp_start = int(splitted_line[snp_header_index_start])
+ snp_stop = int(splitted_line[snp_header_index_stop])
+ snp_name = splitted_line[snp_header_index_name]
+ first_observed, second_observed = check_observed(
+ splitted_line[snp_header_observed])
+ logging.debug('{0},{1},{2},{3}'.format(
+ snp_chrom, snp_start, snp_stop, snp_name))
+ snp_map[snp_name] = [snp_chrom, snp_start, snp_stop,
+ first_observed, second_observed]
+
+ ann_file.close()
+
+ logging.debug('Finished method reading_snp in: {0} seconds'.format(
+ time.time() - start_time))
+
+ return snp_map
+
+
+if __name__ == '__main__':
+ main()
From 4ee1395b1c4aac7ddb311645ac8c9246b5ee6560 Mon Sep 17 00:00:00 2001
From: ddcatania
Date: Tue, 21 Jun 2016 22:59:09 +0200
Subject: [PATCH 08/11] Calcola i due alleli con maggior frequenza e secondo
una certa soglia 'epsilon' classifica gli SNP in omozigote o eterozigote
---
src/input-bam/statistics_v2.py | 210 ++++++++++++++++++---------------
1 file changed, 117 insertions(+), 93 deletions(-)
diff --git a/src/input-bam/statistics_v2.py b/src/input-bam/statistics_v2.py
index 87867e1b..41169dc9 100644
--- a/src/input-bam/statistics_v2.py
+++ b/src/input-bam/statistics_v2.py
@@ -6,85 +6,11 @@
import os
import re
import sys
+from collections import Counter
+import time
import progressbar
import pysam
-import time
-
-
-def generate_statistics(snp_map, alignment_file):
- logging.debug('Started method generate_statistics')
- start_time = time.time()
-
- in_sam = pysam.AlignmentFile(alignment_file, 'rb')
-
- snp_total_number = len(snp_map)
-
- snp_count = 0
- prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
- count_first_allele = 0
- count_second_allele = 0
- count_other_allele = 0
-
- read_count = 0
- for snp_name, snp_values in snp_map.items():
- snp_count += 1
- prgbar.update(snp_count)
- logging.debug('Loaded snp: {0}'.format(snp_name))
- snp_chrom = snp_values[0]
- snp_start = snp_values[1]
- snp_end = snp_values[2]
- first_observed = snp_values[3]
- second_observed = snp_values[4]
- logging.debug(
- 'Reading chromosome: {0}, Starting: {1} - Ending{2}'.format(snp_chrom, snp_start, snp_end))
- fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
- tot_fetch_aln = in_sam.count(snp_chrom, snp_start, snp_end)
- logging.info(
- 'Number of reads aligned with snp: {0}'.format(tot_fetch_aln))
-
- if tot_fetch_aln == 0 or tot_fetch_aln < 10:
- logging.debug('SNP without at least ten items will be discarded.')
- continue
-
- for read in fetch_aln:
- read_count += 1
- read_name = read.query_name
- read_start = read.reference_start
- read_end = read.reference_end
- read_sequence = read.query_sequence
- logging.debug('Loaded read: {0}'.format(read_name))
-
- if snp_start >= read_start and snp_end <= read_end:
- logging.debug('In the read: {0}, in the SNP (first): {1}, in the SNP (second): {2}'.format(
- read_sequence[snp_start:snp_end], first_observed, second_observed))
- if read_sequence[snp_start:snp_end] == first_observed:
- count_first_allele += 1
- elif read_sequence[snp_start:snp_end] == second_observed:
- count_second_allele += 1
- elif not read_sequence[snp_start:snp_end].strip():
- logging.debug('No valid read found.')
- else:
- count_other_allele += 1
- prgbar.finish()
-
- logging.info('Total number of SNPs: {0}'.format(snp_total_number))
-
- logging.info('Total number of reads: {0}'.format(read_count))
-
- logging.info('Total number of read aligned with the first allele: {0} ({1}%)'.format(
- count_first_allele, (float(count_first_allele) / read_count) * 100))
-
- logging.info('Total number of read aligned with the second allele: {0} ({1}%)'.format(
- count_second_allele, (float(count_second_allele) / read_count) * 100))
-
- logging.info('Total number of read aligned with other allele: {0} ({1}%)'.format(
- count_other_allele, (float(count_other_allele) / read_count) * 100))
-
- logging.debug('Finished method generate_statistics in {0} seconds'.format(
- time.time() - start_time))
- logging.info('Generation of statistics: Completed')
- in_sam.close()
def main():
@@ -110,6 +36,10 @@ def main():
action='store_true',
dest='skip_first_line',
help='set this flag if the first line IS NOT the header')
+ parser.add_argument('-epsilon',
+ action='store',
+ dest='threshold',
+ help='set the threshold for Homozygous or Heterozygous. Default: 80')
parser.add_argument('-file-delimiter',
action='store',
dest='file_delim',
@@ -147,6 +77,10 @@ def main():
else:
ann_file_delimiter = args.file_delimiter
+ threshold = 80
+ if args.threshold is not None:
+ threshold = int(args.threshold)
+
logging.info('#### STEP 1 - Loading SNPs ####')
snp_map = reading_snp(args.annotationFile,
@@ -154,20 +88,12 @@ def main():
logging.info('#### STEP 2 - Generating Statistics ####')
- generate_statistics(snp_map, in_bam_file)
+ classify_snp(in_bam_file, snp_map, threshold, out_dir)
logging.info('Program Completed')
# utility
-
-def check_observed(observed_couple):
- # observed are in the form A/T, T/-, -/C
- regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)')
- couple = regexpr_snp.search(observed_couple)
- return couple.group(1), couple.group(2)
-
-
def prepare_loggers(log_level):
logging.basicConfig(level=log_level,
format='%(levelname)-8s [%(asctime)s] %(message)s',
@@ -192,29 +118,127 @@ def reading_snp(annotation_file, file_delimiter, flag_first_line):
snp_header_index_start = header.index('chromStart')
snp_header_index_stop = header.index('chromEnd')
snp_header_index_name = header.index('name')
- snp_header_observed = header.index('observed')
snp_map = {}
for line in ann_file:
+
splitted_line = line.split(file_delimiter)
snp_chrom = splitted_line[snp_header_index_chrom].split('chr', 1)[1]
snp_start = int(splitted_line[snp_header_index_start])
snp_stop = int(splitted_line[snp_header_index_stop])
snp_name = splitted_line[snp_header_index_name]
- first_observed, second_observed = check_observed(
- splitted_line[snp_header_observed])
- logging.debug('{0},{1},{2},{3}'.format(
- snp_chrom, snp_start, snp_stop, snp_name))
- snp_map[snp_name] = [snp_chrom, snp_start, snp_stop,
- first_observed, second_observed]
+
+ snp_map[snp_name] = [snp_chrom, snp_start, snp_stop]
ann_file.close()
logging.debug('Finished method reading_snp in: {0} seconds'.format(
time.time() - start_time))
-
return snp_map
+def classify_snp(alignment_file, snp_map, threshold, out_dir):
+ logging.debug('Started method classifySNP')
+ start_time = time.time()
+ count_homo = 0
+ count_hete = 0
+ snp_total_number = len(snp_map)
+ statistics_file = open(out_dir + 'statistics2.txt', 'w')
+
+ in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+ prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
+ cnt = 0
+
+ for snp_name, snp_values in snp_map.items():
+ statistics_file.write('Reading SNP: {0}\n'.format(snp_name))
+ cnt += 1
+ prgbar.update(cnt)
+ snp_chrom = snp_values[0]
+ snp_start = snp_values[1]
+ snp_end = snp_values[2]
+
+ logging.debug(
+ 'SNP start: {0} - SNP end:{1}'.format(snp_start, snp_end))
+
+ fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
+ count_aln = in_sam.count(snp_chrom, snp_start, snp_end)
+
+ alleles = Counter()
+
+ for read in fetch_aln:
+ read_name = read.query_name
+ logging.debug('Reading read: {0}'.format(read_name))
+ read_sequence = read.query_sequence
+            read_base = read_sequence[snp_start:snp_end]  # BUG(review): slices the read by absolute genomic coordinates; must offset by read.reference_start (and prefer query_alignment_sequence) — as written the slice is empty/wrong for almost every read
+ if not read_base:
+ logging.debug('Empty read - indel')
+ continue
+ logging.debug('Read base: {0}'.format(read_base))
+ alleles[read_base] += 1
+
+ logging.debug(alleles)
+ statistics_file.write('Allele counter for the SNP:\n')
+ for key, value in alleles.items():
+ statistics_file.write('{0} : {1}\n'.format(key, value))
+
+ if alleles:
+ first_allele = alleles.most_common(2)[0]
+ logging.debug(first_allele)
+            second_allele = alleles.most_common(2)[1]  # BUG(review): raises IndexError when only one distinct allele was observed — most_common(2) then returns a single-element list
+ logging.debug(second_allele)
+
+ if (first_allele[1] / count_aln) * 100 >= threshold:
+ statistics_file.write('Snp is Homozygous\n')
+ logging.debug('The snp {0} is: Homozygous'.format(snp_name))
+ count_homo += 1
+ else:
+ statistics_file.write('Snp is Heterozygous\n')
+ logging.debug('The snp {0} is: Heterozygous'.format(snp_name))
+ count_hete += 1
+
+ logging.debug(
+ 'Number of read aligned on SNP: {0}'.format(count_aln))
+ statistics_file.write(
+ 'Number of read aligned on SNP: {0}\n'.format(count_aln))
+
+ out = {}
+ if count_aln > 0:
+ for key in alleles:
+ out[key] = (float(alleles[key]) / count_aln) * 100
+
+ logging.debug(out)
+ statistics_file.write('Base Frequences for the SNP:\n')
+ for key, value in out.items():
+ statistics_file.write('{0} : {1}\n'.format(key, value))
+
+ prgbar.finish()
+ in_sam.close()
+
+ logging.debug(
+ 'Total number of SNPs: {0}'.format(snp_total_number))
+
+ statistics_file.write(
+ 'Total number of SNPs: {0}\n'.format(snp_total_number))
+
+ logging.debug(
+ 'Number of SNPs: Homozygous {0} ({1}%)'.format(
+ count_homo, ((float(count_homo) / snp_total_number) * 100)))
+
+ statistics_file.write('Number of SNPs: Homozygous {0} ({1}%)\n'.format(
+ count_homo, ((float(count_homo) / snp_total_number) * 100)))
+
+ logging.debug(
+ 'Number of SNPs: Heterozygous {0} ({1}%)'.format(
+ count_hete, ((float(count_hete) / snp_total_number) * 100)))
+
+ statistics_file.write('Number of SNPs: Heterozygous {0} ({1}%)\n'.format(
+ count_hete, ((float(count_hete) / snp_total_number) * 100)))
+
+ logging.debug('Finished method classifySNP in: {0} seconds'.format(
+ time.time() - start_time))
+
+ statistics_file.close()
+
+
if __name__ == '__main__':
main()
From d6d64bc084dfdf920bf66524d0718171350941a0 Mon Sep 17 00:00:00 2001
From: Nara88
Date: Thu, 29 Sep 2016 18:39:08 +0200
Subject: [PATCH 09/11] Add files via upload
---
Vcf_conversion.rb | 105 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
create mode 100644 Vcf_conversion.rb
diff --git a/Vcf_conversion.rb b/Vcf_conversion.rb
new file mode 100644
index 00000000..d661f72f
--- /dev/null
+++ b/Vcf_conversion.rb
@@ -0,0 +1,105 @@
+#Methods
+
+def get_line_from_file(path, line)
+result = nil
+ File.open(path, "r") do |f|
+ while line > 0
+ line -= 1
+ result = f.gets
+ result ||= ''
+ result = result.chomp
+ end
+ end
+ return result
+end
+
+def write_header(output_path)
+ formato = "##fileformat=VCFv4.2\n"
+ colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
+ File.open(output_path, "w") { |file| file.write(formato + colonne) }
+end
+
+def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
+ File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") }
+end
+
+def find_alt(snp_line, ref)
+
+ alternatives = snp_line[9].to_s.split "/"
+ alt = "."
+ if (alternatives.length == 2)
+ alternatives.each do |a|
+ if a.to_s != ref
+ alt = a.to_s
+ end
+ end
+ else
+ puts "Error: bihallelic field"
+ ref = "X"
+ end
+
+ return alt
+
+end
+
+
+def create_line(snp_path, current_line, current_father, current_mother, output)
+
+ current_snp_line = current_line + 3
+
+ snp_line = get_line_from_file(snp_path, current_snp_line).split "\t"
+
+ chrom = snp_line[1].to_s
+
+ pos = (snp_line[2].to_i + 1).to_s
+
+ id = "."
+
+ ref = snp_line[7].to_s
+
+ alt = find_alt(snp_line, ref)
+
+ qual = "."
+
+ filter = "PASS"
+
+ info = "."
+
+ format = "GT"
+
+ sample = current_father.to_s + "|" + current_mother.to_s
+
+ write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
+
+end
+
+def write_body(haplotypes, snp, output)
+
+ current_line = 0
+ father = get_line_from_file(haplotypes, 1).split('')
+ mother = get_line_from_file(haplotypes, 2).split('')
+
+ father.each do |i|
+ create_line(snp, current_line, father[current_line], mother[current_line], output)
+ current_line += 1
+ end
+
+end
+
+def create_vcf(haplotypes_path, snp_path, output_path)
+
+ snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3
+
+ if (get_line_from_file(haplotypes_path, 1).length == snp_lines)
+ write_header(output_path)
+ write_body(haplotypes_path, snp_path, output_path)
+ else
+ puts "Error: non-matching files"
+ end
+
+end
+
+
+#Vcf file creation
+
+create_vcf(haplotypes_path, snp_path, output_path)
From 268aa70e34422bbb3f95c29b61e3d802c61e4e92 Mon Sep 17 00:00:00 2001
From: Nara88
Date: Sun, 2 Oct 2016 20:35:50 +0200
Subject: [PATCH 10/11] Delete Vcf_conversion.rb
---
Vcf_conversion.rb | 105 ----------------------------------------------
1 file changed, 105 deletions(-)
delete mode 100644 Vcf_conversion.rb
diff --git a/Vcf_conversion.rb b/Vcf_conversion.rb
deleted file mode 100644
index d661f72f..00000000
--- a/Vcf_conversion.rb
+++ /dev/null
@@ -1,105 +0,0 @@
-#Methods
-
-def get_line_from_file(path, line)
-result = nil
- File.open(path, "r") do |f|
- while line > 0
- line -= 1
- result = f.gets
- result ||= ''
- result = result.chomp
- end
- end
- return result
-end
-
-def write_header(output_path)
- formato = "##fileformat=VCFv4.2\n"
- colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
- File.open(output_path, "w") { |file| file.write(formato + colonne) }
-end
-
-def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
- File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") }
-end
-
-def find_alt(snp_line, ref)
-
- alternatives = snp_line[9].to_s.split "/"
- alt = "."
- if (alternatives.length == 2)
- alternatives.each do |a|
- if a.to_s != ref
- alt = a.to_s
- end
- end
- else
- puts "Error: bihallelic field"
- ref = "X"
- end
-
- return alt
-
-end
-
-
-def create_line(snp_path, current_line, current_father, current_mother, output)
-
- current_snp_line = current_line + 3
-
- snp_line = get_line_from_file(snp_path, current_snp_line).split "\t"
-
- chrom = snp_line[1].to_s
-
- pos = (snp_line[2].to_i + 1).to_s
-
- id = "."
-
- ref = snp_line[7].to_s
-
- alt = find_alt(snp_line, ref)
-
- qual = "."
-
- filter = "PASS"
-
- info = "."
-
- format = "GT"
-
- sample = current_father.to_s + "|" + current_mother.to_s
-
- write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
-
-end
-
-def write_body(haplotypes, snp, output)
-
- current_line = 0
- father = get_line_from_file(haplotypes, 1).split('')
- mother = get_line_from_file(haplotypes, 2).split('')
-
- father.each do |i|
- create_line(snp, current_line, father[current_line], mother[current_line], output)
- current_line += 1
- end
-
-end
-
-def create_vcf(haplotypes_path, snp_path, output_path)
-
- snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3
-
- if (get_line_from_file(haplotypes_path, 1).length == snp_lines)
- write_header(output_path)
- write_body(haplotypes_path, snp_path, output_path)
- else
- puts "Error: non-matching files"
- end
-
-end
-
-
-#Vcf file creation
-
-create_vcf(haplotypes_path, snp_path, output_path)
From 5f32b743f7ea22e09f4c8264daec38947569ab07 Mon Sep 17 00:00:00 2001
From: Nara88
Date: Sun, 2 Oct 2016 20:38:59 +0200
Subject: [PATCH 11/11] src/output-vcf/Vcf_conversion.rb
---
src/Vcf_conversion.rb | 105 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
create mode 100644 src/Vcf_conversion.rb
diff --git a/src/Vcf_conversion.rb b/src/Vcf_conversion.rb
new file mode 100644
index 00000000..d661f72f
--- /dev/null
+++ b/src/Vcf_conversion.rb
@@ -0,0 +1,105 @@
+#Methods
+
+def get_line_from_file(path, line)
+result = nil
+ File.open(path, "r") do |f|
+ while line > 0
+ line -= 1
+ result = f.gets
+ result ||= ''
+ result = result.chomp
+ end
+ end
+ return result
+end
+
+def write_header(output_path)
+ formato = "##fileformat=VCFv4.2\n"
+ colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
+ File.open(output_path, "w") { |file| file.write(formato + colonne) }
+end
+
+def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
+ File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") }
+end
+
+def find_alt(snp_line, ref)
+
+ alternatives = snp_line[9].to_s.split "/"
+ alt = "."
+ if (alternatives.length == 2)
+ alternatives.each do |a|
+ if a.to_s != ref
+ alt = a.to_s
+ end
+ end
+ else
+    puts "Error: field is not biallelic"
+    ref = "X" # NOTE(review): no effect — ref is a local copy; the caller's ref is unchanged
+ end
+
+ return alt
+
+end
+
+
+def create_line(snp_path, current_line, current_father, current_mother, output)
+
+ current_snp_line = current_line + 3
+
+ snp_line = get_line_from_file(snp_path, current_snp_line).split "\t"
+
+ chrom = snp_line[1].to_s
+
+ pos = (snp_line[2].to_i + 1).to_s
+
+ id = "."
+
+ ref = snp_line[7].to_s
+
+ alt = find_alt(snp_line, ref)
+
+ qual = "."
+
+ filter = "PASS"
+
+ info = "."
+
+ format = "GT"
+
+ sample = current_father.to_s + "|" + current_mother.to_s
+
+ write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
+
+end
+
+def write_body(haplotypes, snp, output)
+
+ current_line = 0
+ father = get_line_from_file(haplotypes, 1).split('')
+ mother = get_line_from_file(haplotypes, 2).split('')
+
+ father.each do |i|
+ create_line(snp, current_line, father[current_line], mother[current_line], output)
+ current_line += 1
+ end
+
+end
+
+def create_vcf(haplotypes_path, snp_path, output_path)
+
+ snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3
+
+ if (get_line_from_file(haplotypes_path, 1).length == snp_lines)
+ write_header(output_path)
+ write_body(haplotypes_path, snp_path, output_path)
+ else
+ puts "Error: non-matching files"
+ end
+
+end
+
+
+#Vcf file creation
+
+create_vcf(haplotypes_path, snp_path, output_path) # BUG(review): none of these variables is defined anywhere in this script — running it raises NameError; they should come from ARGV