From d47bbe4fd8b8dfc5b87331e2fbf0b57d7ceeccad Mon Sep 17 00:00:00 2001 From: Alexander Regueiro Date: Thu, 8 May 2025 01:36:50 +0100 Subject: [PATCH 1/5] Remove .fa files --- yleaf/.gitignore | 1 + yleaf/data/hg19/chrY.fa | 0 yleaf/data/hg19/full_reference.fa | 0 yleaf/data/hg38/chrY.fa | 1 - yleaf/data/hg38/full_reference.fa | 0 5 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 yleaf/.gitignore delete mode 100644 yleaf/data/hg19/chrY.fa delete mode 100644 yleaf/data/hg19/full_reference.fa delete mode 100644 yleaf/data/hg38/chrY.fa delete mode 100644 yleaf/data/hg38/full_reference.fa diff --git a/yleaf/.gitignore b/yleaf/.gitignore new file mode 100644 index 0000000..c455754 --- /dev/null +++ b/yleaf/.gitignore @@ -0,0 +1 @@ +data/**/*.fa diff --git a/yleaf/data/hg19/chrY.fa b/yleaf/data/hg19/chrY.fa deleted file mode 100644 index e69de29..0000000 diff --git a/yleaf/data/hg19/full_reference.fa b/yleaf/data/hg19/full_reference.fa deleted file mode 100644 index e69de29..0000000 diff --git a/yleaf/data/hg38/chrY.fa b/yleaf/data/hg38/chrY.fa deleted file mode 100644 index 8b13789..0000000 --- a/yleaf/data/hg38/chrY.fa +++ /dev/null @@ -1 +0,0 @@ - diff --git a/yleaf/data/hg38/full_reference.fa b/yleaf/data/hg38/full_reference.fa deleted file mode 100644 index e69de29..0000000 From d2554b2e6d2f6dce36e91da395a8b4853aa1f377 Mon Sep 17 00:00:00 2001 From: Alexander Regueiro Date: Thu, 8 May 2025 01:39:34 +0100 Subject: [PATCH 2/5] Quote paths in command arguments --- yleaf/Yleaf.py | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/yleaf/Yleaf.py b/yleaf/Yleaf.py index 39b8659..8cf048f 100644 --- a/yleaf/Yleaf.py +++ b/yleaf/Yleaf.py @@ -91,14 +91,14 @@ def run_vcf( safe_create_dir(sample_vcf_folder, args.force) sample_vcf_file_txt = sample_vcf_folder / (sample_vcf_file.name.replace(".vcf.gz", ".txt")) - cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n' {sample_vcf_file} > {sample_vcf_file_txt}" + cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n' '{sample_vcf_file}' > '{sample_vcf_file_txt}'" call_command(cmd) pileupfile = pd.read_csv(sample_vcf_file_txt, dtype=str, header=None, sep="\t") # remove sample_vcf_file_txt - cmd = f"rm {sample_vcf_file_txt}" + cmd = f"rm '{sample_vcf_file_txt}'" call_command(cmd) pileupfile.columns = ['chr', 'pos', 'refbase', 'altbase', 'reads'] @@ -255,7 +255,7 @@ def main_vcf_split( ): # first sort the vcf file sorted_vcf_file = base_out_folder / (vcf_file.name.replace(".vcf.gz", ".sorted.vcf.gz")) - cmd = f"bcftools sort -O z -o {sorted_vcf_file} {vcf_file}" + cmd = f"bcftools sort -O z -o '{sorted_vcf_file}' '{vcf_file}'" try: call_command(cmd) except SystemExit: @@ -263,11 +263,11 @@ def main_vcf_split( return None # next index the vcf file - cmd = f"bcftools index -f {sorted_vcf_file}" + cmd = f"bcftools index -f '{sorted_vcf_file}'" call_command(cmd) # get chromosome annotation using bcftools query -f '%CHROM\n' sorted.vcf.gz | uniq - cmd = f"bcftools query -f '%CHROM\n' {sorted_vcf_file} | uniq" + cmd = f"bcftools query -f '%CHROM\n' '{sorted_vcf_file}' | uniq" process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout, stderr = process.communicate() if process.returncode != 0: @@ -293,21 +293,21 @@ def main_vcf_split( # filter the vcf file using the reference bed file filtered_vcf_file = base_out_folder / "filtered_vcf_files" / ( sorted_vcf_file.name.replace(".sorted.vcf.gz", ".filtered.vcf.gz")) - cmd = f"bcftools view -O z -R {new_position_bed_file} {sorted_vcf_file} > {filtered_vcf_file}" + cmd = f"bcftools view -O z -R '{new_position_bed_file}' '{sorted_vcf_file}' > '{filtered_vcf_file}'" call_command(cmd) # remover temp_position_bed.bed - cmd = f"rm {new_position_bed_file}" + cmd = f"rm '{new_position_bed_file}'" call_command(cmd) # remove sorted.vcf.gz and sorted.vcf.gz.csi - cmd = f"rm {sorted_vcf_file}" + cmd = f"rm '{sorted_vcf_file}'" call_command(cmd) - cmd = f"rm {sorted_vcf_file}.csi" + cmd = f"rm '{sorted_vcf_file}.csi'" call_command(cmd) # check number of samples in the vcf file - cmd = f"bcftools query -l {filtered_vcf_file} | wc -l" + cmd = f"bcftools query -l '{filtered_vcf_file}' | wc -l" process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout, stderr = process.communicate() if process.returncode != 0: @@ -319,7 +319,7 @@ def main_vcf_split( # split the vcf file into separate files for each sample split_vcf_folder = base_out_folder / (vcf_file.name.replace(".vcf.gz", "_split")) safe_create_dir(split_vcf_folder, args.force) - cmd = f"bcftools +split {filtered_vcf_file} -Oz -o {split_vcf_folder}" + cmd = f"bcftools +split '{filtered_vcf_file}' -Oz -o '{split_vcf_folder}'" call_command(cmd) sample_vcf_files = get_files_with_extension(split_vcf_folder, '.vcf.gz') elif num_samples == 1: @@ -559,26 +559,24 @@ def main_fastq( fastq_file2 = Path(str(fastq_file).replace("_R1", "_R2")) LOG.info(f"Starting with running for {fastq_file} and {fastq_file2}") sam_file = bam_folder / "temp_fastq_sam.sam" - fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} {reference} {fastq_file} {fastq_file2} > {sam_file}" + fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} '{reference}' '{fastq_file}' '{fastq_file2}' > '{sam_file}'" call_command(fastq_cmd) bam_file = bam_folder / (fastq_file.name.rsplit("_R1", 1)[0] + ".bam") - cmd = "samtools view -@ {} -bS {} | samtools sort -@ {} -m 2G -o {}".format(args.threads, sam_file, - args.threads, bam_file) + cmd = f"samtools view -@ {args.threads} -bS '{sam_file}' | samtools sort -@ '{arg.threads}' -m 2G -o '{bam_file}'" call_command(cmd) - cmd = "samtools index -@ {} {}".format(args.threads, bam_file) + cmd = f"samtools index -@ {arg.threads} '{bam_file}'" call_command(cmd) os.remove(sam_file) elif "_R1" not in str(fastq_file) and "_R2" not in str(fastq_file) and ".gz" in str(fastq_file): LOG.info(f"Starting with running for {fastq_file}") sam_file = bam_folder / "temp_fastq_sam.sam" - fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} {reference} {fastq_file} > {sam_file}" + fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} '{reference}' '{fastq_file}' > '{sam_file}'" call_command(fastq_cmd) bam_file = bam_folder / (fastq_file.name.rsplit(".", 1)[0] + ".bam") - cmd = "samtools view -@ {} -bS {} | samtools sort -@ {} -m 2G -o {}".format(args.threads, sam_file, - args.threads, bam_file) + cmd = f"samtools view -@ {args.threads} -bS '{sam_file}' | samtools sort -@ {args.threads} -m 2G -o '{bam_file}'" call_command(cmd) - cmd = "samtools index -@ {} {}".format(args.threads, bam_file) + cmd = f"samtools index -@ {args.threads} '{bam_file}'" call_command(cmd) os.remove(sam_file) args.bamfile = bam_folder @@ -697,11 +695,11 @@ def samtools( if is_bam_pathfile: if not any([Path(str(path_file) + ".bai").exists(), Path(str(path_file).rstrip(".bam") + '.bai').exists()]): - cmd = "samtools index -@{} {}".format(args.threads, path_file) + cmd = f"samtools index -@{args.threads} '{path_file}'" call_command(cmd) else: if not any([Path(str(path_file) + ".crai").exists(), Path(str(path_file).rstrip(".cram") + '.crai').exists()]): - cmd = "samtools index -@{} {}".format(args.threads, path_file) + cmd = f"samtools index -@{args.threads} '{path_file}'" call_command(cmd) header, mapped, unmapped = chromosome_table(path_file, output_folder, output_folder.name) @@ -732,7 +730,7 @@ def chromosome_table( output = path_folder / (file_name + '.chr') tmp_output = path_folder / "tmp.txt" f = open(tmp_output, "w") - cmd = f"samtools idxstats {path_file}" + cmd = f"samtools idxstats '{path_file}'" call_command(cmd, stdout_location=f) df_chromosome = pd.read_table(tmp_output, header=None) @@ -818,8 +816,8 @@ def execute_mpileup( cmd += f" -l {str(bed)}" if reference is not None: - cmd += f" -f {str(reference)}" - cmd += f" -AQ{quality_thresh}q1 {str(bam_file)} > {str(pileupfile)}" + cmd += f" -f '{str(reference)}'" + cmd += f" -AQ{quality_thresh}q1 '{str(bam_file)}' > '{str(pileupfile)}'" call_command(cmd) @@ -1191,7 +1189,7 @@ def predict_haplogroup( ): if use_old: script = yleaf_constants.SRC_FOLDER / "old_predict_haplogroup.py" - cmd = "python {} -i {} -o {}".format(script, path_file, output) + cmd = "python '{script}' -i '{path_file}' -o '{output}'" call_command(cmd) else: from yleaf import predict_haplogroup From c0a3261e118a30e0a4dd8e7fbadd8876bcb9cabc Mon Sep 17 00:00:00 2001 From: Alexander Regueiro Date: Thu, 8 May 2025 01:41:25 +0100 Subject: [PATCH 3/5] Fix spelling/typos --- yleaf/predict_haplogroup.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/yleaf/predict_haplogroup.py b/yleaf/predict_haplogroup.py index 2ceeb9a..5af62d3 100644 --- a/yleaf/predict_haplogroup.py +++ b/yleaf/predict_haplogroup.py @@ -8,7 +8,7 @@ License: GNU General Public License v3 or later A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt. -Autor: Bram van Wersch +Author: Bram van Wersch """ import argparse @@ -192,9 +192,9 @@ def read_yleaf_out_file( def get_most_likely_haplotype( tree: Tree, haplotype_dict: Dict[str, HgMarkersLinker], - treshold: float + threshold: float, ) -> Tuple[str, str, int, int, int, int, int]: - """Get the most specific haplogroup with a score above the treshold. The score is build up from 3 parts: + """Get the most specific haplogroup with a score above the threshold. The score is build up from 3 parts: QC1: This score indicates whether the predicted haplogroup follows the expected backbone of the haplogroup tree structure (i.e. if haplogroup E is predicted the markers defining: A0-T, A1, A1b, BT, CT, DE should be @@ -220,7 +220,7 @@ def get_most_likely_haplotype( in the haplogroup prediction. First all possible haplogroups are sorted based on the depth in the tree. This is the order haplogroup scores will - be calculated to determine if they are above the treshold. As soon as the treshold is reached the function returns. + be calculated to determine if they are above the threshold. As soon as the threshold is reached the function returns. """ sorted_depth_haplotypes = sorted(haplotype_dict.keys(), key=lambda k: tree.get(k).depth, reverse=True) covered_nodes = set() @@ -256,27 +256,27 @@ def get_most_likely_haplotype( qc1_score = get_qc1_score(path, haplotype_dict) - # if any of the scores are below treshold, the total can not be above so ignore - if qc1_score < treshold: + # if any of the scores are below threshold, the total can not be above so ignore + if qc1_score < threshold: continue if haplotype_dict[node.name].nr_total == 0: qc2_score = 0 else: qc2_score = haplotype_dict[node.name].nr_derived / haplotype_dict[node.name].nr_total - if qc2_score < treshold: + if qc2_score < threshold: continue if qc3_score[1] == 0: qc3_score = 0 else: qc3_score = qc3_score[0] / qc3_score[1] - if qc3_score < treshold: + if qc3_score < threshold: continue total_score = product([qc1_score, qc2_score, qc3_score]) # if above filter we found the hit - if total_score > treshold: + if total_score > threshold: ancestral_children = get_ancestral_children(node, haplotype_dict, tree) best_score = (node.name, ancestral_children, qc1_score, qc2_score, qc3_score, total_score, node.depth) break From 60dfafebf315822acb7dfba436117eaa657d6b7b Mon Sep 17 00:00:00 2001 From: Alexander Regueiro Date: Thu, 8 May 2025 01:44:56 +0100 Subject: [PATCH 4/5] Fix type annotations --- yleaf/predict_haplogroup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) mode change 100644 => 100755 yleaf/predict_haplogroup.py diff --git a/yleaf/predict_haplogroup.py b/yleaf/predict_haplogroup.py old mode 100644 new mode 100755 index 5af62d3..6b74e2c --- a/yleaf/predict_haplogroup.py +++ b/yleaf/predict_haplogroup.py @@ -15,14 +15,14 @@ import multiprocessing from functools import partial from pathlib import Path -from typing import Set, Dict, Iterator, List, Any, Union, Tuple +from typing import Set, Dict, Iterator, List, Any, Union, Tuple, Optional import logging from yleaf import yleaf_constants from yleaf.tree import Tree, Node -BACKBONE_GROUPS: Set = set() -MAIN_HAPLO_GROUPS: Set = set() +BACKBONE_GROUPS: Set[str] = set() +MAIN_HAPLO_GROUPS: Set[str] = set() QC1_SCORE_CACHE: Dict[str, float] = {} DEFAULT_MIN_SCORE = 0.95 @@ -101,7 +101,7 @@ def main_predict_haplogroup( return [haplotype_dict, best_haplotype_score, folder] -def main(namespace: argparse.Namespace = None): +def main(namespace: Optional[argparse.Namespace] = None): """Main entry point for prediction script""" LOG.info("Starting haplogroup prediction...") if namespace is None: From 8827807eb791e28ea348fdebf2f0c5de173e85c2 Mon Sep 17 00:00:00 2001 From: Alexander Regueiro Date: Thu, 8 May 2025 02:05:22 +0100 Subject: [PATCH 5/5] Fix issue with global vars in multiprocessing --- yleaf/Yleaf.py | 0 yleaf/predict_haplogroup.py | 10 +++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) mode change 100644 => 100755 yleaf/Yleaf.py diff --git a/yleaf/Yleaf.py b/yleaf/Yleaf.py old mode 100644 new mode 100755 diff --git a/yleaf/predict_haplogroup.py b/yleaf/predict_haplogroup.py index 6b74e2c..594d657 100755 --- a/yleaf/predict_haplogroup.py +++ b/yleaf/predict_haplogroup.py @@ -81,8 +81,15 @@ def get_state(self) -> str: def main_predict_haplogroup( namespace: argparse.Namespace, + backbone_groups: Set[str], + main_haplo_groups: Set[str], folder: Path, ): + # set these global vars because we might be inside a new process + global BACKBONE_GROUPS, MAIN_HAPLO_GROUPS + BACKBONE_GROUPS = backbone_groups + MAIN_HAPLO_GROUPS = main_haplo_groups + # make sure to reset this for each sample global QC1_SCORE_CACHE QC1_SCORE_CACHE = {} @@ -113,7 +120,7 @@ def main(namespace: Optional[argparse.Namespace] = None): final_table = [] with multiprocessing.Pool(processes=threads) as p: - predictions = p.map(partial(main_predict_haplogroup, namespace), read_input_folder(in_folder)) + predictions = p.map(partial(main_predict_haplogroup, namespace, BACKBONE_GROUPS, MAIN_HAPLO_GROUPS), read_input_folder(in_folder)) for haplotype_dict, best_haplotype_score, folder in predictions: if haplotype_dict is None: @@ -146,6 +153,7 @@ def get_arguments() -> argparse.Namespace: def read_backbone_groups(): """Read some basic data that is always needed""" global BACKBONE_GROUPS, MAIN_HAPLO_GROUPS + with open(yleaf_constants.DATA_FOLDER / yleaf_constants.HG_PREDICTION_FOLDER / "major_tables/Intermediates.txt") as f: for line in f: if "~" in line: