1 change: 1 addition & 0 deletions yleaf/.gitignore
@@ -0,0 +1 @@
data/**/*.fa
48 changes: 23 additions & 25 deletions yleaf/Yleaf.py
100644 → 100755
@@ -91,14 +91,14 @@ def run_vcf(
safe_create_dir(sample_vcf_folder, args.force)

sample_vcf_file_txt = sample_vcf_folder / (sample_vcf_file.name.replace(".vcf.gz", ".txt"))
cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n' {sample_vcf_file} > {sample_vcf_file_txt}"
cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n' '{sample_vcf_file}' > '{sample_vcf_file_txt}'"
call_command(cmd)

pileupfile = pd.read_csv(sample_vcf_file_txt,
dtype=str, header=None, sep="\t")

# remove sample_vcf_file_txt
cmd = f"rm {sample_vcf_file_txt}"
cmd = f"rm '{sample_vcf_file_txt}'"
call_command(cmd)

pileupfile.columns = ['chr', 'pos', 'refbase', 'altbase', 'reads']
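Note on the quoting pattern used throughout this diff: wrapping each path in single quotes guards against spaces, but a path that itself contains a single quote would still break the command. A minimal sketch of a more defensive alternative using the standard-library shlex.quote (a hypothetical helper, not part of this PR):

    import shlex
    from pathlib import Path

    def quoted_bcftools_query(vcf: Path, out_txt: Path) -> str:
        # shlex.quote escapes every shell metacharacter, including an
        # embedded single quote, which manual '...' wrapping does not survive
        fmt = r"'%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n'"
        return (f"bcftools query -f {fmt} {shlex.quote(str(vcf))}"
                f" > {shlex.quote(str(out_txt))}")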
@@ -255,19 +255,19 @@ def main_vcf_split(
):
# first sort the vcf file
sorted_vcf_file = base_out_folder / (vcf_file.name.replace(".vcf.gz", ".sorted.vcf.gz"))
cmd = f"bcftools sort -O z -o {sorted_vcf_file} {vcf_file}"
cmd = f"bcftools sort -O z -o '{sorted_vcf_file}' '{vcf_file}'"
try:
call_command(cmd)
except SystemExit:
LOG.error(f"Failed to sort the vcf file {vcf_file.name}. Skipping...")
return None

# next index the vcf file
cmd = f"bcftools index -f {sorted_vcf_file}"
cmd = f"bcftools index -f '{sorted_vcf_file}'"
call_command(cmd)

# get chromosome annotation using bcftools query -f '%CHROM\n' sorted.vcf.gz | uniq
cmd = f"bcftools query -f '%CHROM\n' {sorted_vcf_file} | uniq"
cmd = f"bcftools query -f '%CHROM\n' '{sorted_vcf_file}' | uniq"
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
@@ -293,21 +293,21 @@ main_vcf_split(
# filter the vcf file using the reference bed file
filtered_vcf_file = base_out_folder / "filtered_vcf_files" / (
sorted_vcf_file.name.replace(".sorted.vcf.gz", ".filtered.vcf.gz"))
cmd = f"bcftools view -O z -R {new_position_bed_file} {sorted_vcf_file} > {filtered_vcf_file}"
cmd = f"bcftools view -O z -R '{new_position_bed_file}' '{sorted_vcf_file}' > '{filtered_vcf_file}'"
call_command(cmd)

# remover temp_position_bed.bed
cmd = f"rm {new_position_bed_file}"
cmd = f"rm '{new_position_bed_file}'"
call_command(cmd)

# remove sorted.vcf.gz and sorted.vcf.gz.csi
cmd = f"rm {sorted_vcf_file}"
cmd = f"rm '{sorted_vcf_file}'"
call_command(cmd)
cmd = f"rm {sorted_vcf_file}.csi"
cmd = f"rm '{sorted_vcf_file}.csi'"
call_command(cmd)

# check number of samples in the vcf file
cmd = f"bcftools query -l {filtered_vcf_file} | wc -l"
cmd = f"bcftools query -l '{filtered_vcf_file}' | wc -l"
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
@@ -319,7 +319,7 @@ def main_vcf_split(
# split the vcf file into separate files for each sample
split_vcf_folder = base_out_folder / (vcf_file.name.replace(".vcf.gz", "_split"))
safe_create_dir(split_vcf_folder, args.force)
cmd = f"bcftools +split {filtered_vcf_file} -Oz -o {split_vcf_folder}"
cmd = f"bcftools +split '{filtered_vcf_file}' -Oz -o '{split_vcf_folder}'"
call_command(cmd)
sample_vcf_files = get_files_with_extension(split_vcf_folder, '.vcf.gz')
elif num_samples == 1:
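The sample count a few lines up shells out to `bcftools query -l ... | wc -l` through Popen with shell=True; the same number can be obtained by letting Python count the lines, which drops both the pipe and the shell quoting. A sketch assuming bcftools is on PATH (not part of this PR):

    import subprocess

    def count_vcf_samples(vcf_path: str) -> int:
        # `bcftools query -l` prints one sample name per line
        result = subprocess.run(
            ["bcftools", "query", "-l", vcf_path],
            capture_output=True, text=True, check=True,
        )
        return len(result.stdout.splitlines())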
@@ -559,26 +559,24 @@ def main_fastq(
fastq_file2 = Path(str(fastq_file).replace("_R1", "_R2"))
LOG.info(f"Starting with running for {fastq_file} and {fastq_file2}")
sam_file = bam_folder / "temp_fastq_sam.sam"
fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} {reference} {fastq_file} {fastq_file2} > {sam_file}"
fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} '{reference}' '{fastq_file}' '{fastq_file2}' > '{sam_file}'"
call_command(fastq_cmd)
bam_file = bam_folder / (fastq_file.name.rsplit("_R1", 1)[0] + ".bam")
cmd = "samtools view -@ {} -bS {} | samtools sort -@ {} -m 2G -o {}".format(args.threads, sam_file,
args.threads, bam_file)
cmd = f"samtools view -@ {args.threads} -bS '{sam_file}' | samtools sort -@ '{arg.threads}' -m 2G -o '{bam_file}'"
call_command(cmd)
cmd = "samtools index -@ {} {}".format(args.threads, bam_file)
cmd = f"samtools index -@ {arg.threads} '{bam_file}'"
call_command(cmd)
os.remove(sam_file)
elif "_R1" not in str(fastq_file) and "_R2" not in str(fastq_file) and ".gz" in str(fastq_file):
LOG.info(f"Starting with running for {fastq_file}")
sam_file = bam_folder / "temp_fastq_sam.sam"

fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} {reference} {fastq_file} > {sam_file}"
fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} '{reference}' '{fastq_file}' > '{sam_file}'"
call_command(fastq_cmd)
bam_file = bam_folder / (fastq_file.name.rsplit(".", 1)[0] + ".bam")
cmd = "samtools view -@ {} -bS {} | samtools sort -@ {} -m 2G -o {}".format(args.threads, sam_file,
args.threads, bam_file)
cmd = f"samtools view -@ {args.threads} -bS '{sam_file}' | samtools sort -@ {args.threads} -m 2G -o '{bam_file}'"
call_command(cmd)
cmd = "samtools index -@ {} {}".format(args.threads, bam_file)
cmd = f"samtools index -@ {args.threads} '{bam_file}'"
call_command(cmd)
os.remove(sam_file)
args.bamfile = bam_folder
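The view-then-sort pipelines above interpolate paths into a shell string; the same two-stage pipeline can be wired with list-style arguments, which avoids shell quoting altogether. A hedged sketch (helper name is illustrative, not part of this PR):

    import subprocess

    def sam_to_sorted_bam(sam_file: str, bam_file: str, threads: int) -> None:
        # samtools view emits BAM on stdout; samtools sort consumes it on stdin
        view = subprocess.Popen(
            ["samtools", "view", "-@", str(threads), "-bS", sam_file],
            stdout=subprocess.PIPE,
        )
        subprocess.run(
            ["samtools", "sort", "-@", str(threads), "-m", "2G", "-o", bam_file],
            stdin=view.stdout, check=True,
        )
        view.stdout.close()  # release the parent's copy of the pipe
        view.wait()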
@@ -697,11 +695,11 @@ def samtools(

if is_bam_pathfile:
if not any([Path(str(path_file) + ".bai").exists(), Path(str(path_file).rstrip(".bam") + '.bai').exists()]):
cmd = "samtools index -@{} {}".format(args.threads, path_file)
cmd = f"samtools index -@{args.threads} '{path_file}'"
call_command(cmd)
else:
if not any([Path(str(path_file) + ".crai").exists(), Path(str(path_file).rstrip(".cram") + '.crai').exists()]):
cmd = "samtools index -@{} {}".format(args.threads, path_file)
cmd = f"samtools index -@{args.threads} '{path_file}'"
call_command(cmd)
header, mapped, unmapped = chromosome_table(path_file, output_folder, output_folder.name)
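One caveat the surrounding context carries over: str(path_file).rstrip(".bam") strips any run of trailing characters drawn from the set {., b, a, m}, not the literal suffix, so "sample.bam" becomes "sample" but "lamb.bam" becomes "l". A suffix-safe existence check sketched with pathlib (hypothetical helper, not part of this PR):

    from pathlib import Path

    def has_index(path_file: Path, index_ext: str) -> bool:
        # covers both conventions: sample.bam.bai and sample.bai
        return (Path(str(path_file) + index_ext).exists()
                or path_file.with_suffix(index_ext).exists())

For a BAM the call would be has_index(path_file, ".bai"), for a CRAM has_index(path_file, ".crai").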

@@ -732,7 +730,7 @@ def chromosome_table(
output = path_folder / (file_name + '.chr')
tmp_output = path_folder / "tmp.txt"
f = open(tmp_output, "w")
cmd = f"samtools idxstats {path_file}"
cmd = f"samtools idxstats '{path_file}'"
call_command(cmd, stdout_location=f)
df_chromosome = pd.read_table(tmp_output, header=None)

@@ -818,8 +816,8 @@ def execute_mpileup(
cmd += f" -l {str(bed)}"

if reference is not None:
cmd += f" -f {str(reference)}"
cmd += f" -AQ{quality_thresh}q1 {str(bam_file)} > {str(pileupfile)}"
cmd += f" -f '{str(reference)}'"
cmd += f" -AQ{quality_thresh}q1 '{str(bam_file)}' > '{str(pileupfile)}'"
call_command(cmd)


@@ -1191,7 +1189,7 @@ def predict_haplogroup(
):
if use_old:
script = yleaf_constants.SRC_FOLDER / "old_predict_haplogroup.py"
cmd = "python {} -i {} -o {}".format(script, path_file, output)
cmd = "python '{script}' -i '{path_file}' -o '{output}'"
call_command(cmd)
else:
from yleaf import predict_haplogroup
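Invoking the helper script through a bare python picks up whatever interpreter is first on PATH, which may differ from the one running Yleaf. A sketch that pins the current interpreter and sidesteps quoting via list arguments (an alternative, not what this PR does):

    import subprocess
    import sys

    def run_old_predictor(script: str, path_file: str, output: str) -> None:
        # sys.executable is the interpreter executing Yleaf itself
        subprocess.run(
            [sys.executable, script, "-i", path_file, "-o", output],
            check=True,
        )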
Empty file removed yleaf/data/hg19/chrY.fa
Empty file.
Empty file removed yleaf/data/hg19/full_reference.fa
Empty file.
1 change: 0 additions & 1 deletion yleaf/data/hg38/chrY.fa

This file was deleted.

Empty file removed yleaf/data/hg38/full_reference.fa
Empty file.
36 changes: 22 additions & 14 deletions yleaf/predict_haplogroup.py
100644 → 100755
@@ -8,21 +8,21 @@
License: GNU General Public License v3 or later
A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt.

Autor: Bram van Wersch
Author: Bram van Wersch
"""

import argparse
import multiprocessing
from functools import partial
from pathlib import Path
from typing import Set, Dict, Iterator, List, Any, Union, Tuple
from typing import Set, Dict, Iterator, List, Any, Union, Tuple, Optional
import logging

from yleaf import yleaf_constants
from yleaf.tree import Tree, Node

BACKBONE_GROUPS: Set = set()
MAIN_HAPLO_GROUPS: Set = set()
BACKBONE_GROUPS: Set[str] = set()
MAIN_HAPLO_GROUPS: Set[str] = set()
QC1_SCORE_CACHE: Dict[str, float] = {}

DEFAULT_MIN_SCORE = 0.95
@@ -81,8 +81,15 @@ def get_state(self) -> str:

def main_predict_haplogroup(
namespace: argparse.Namespace,
backbone_groups: Set[str],
main_haplo_groups: Set[str],
folder: Path,
):
# set these global vars because we might be inside a new process
global BACKBONE_GROUPS, MAIN_HAPLO_GROUPS
BACKBONE_GROUPS = backbone_groups
MAIN_HAPLO_GROUPS = main_haplo_groups

# make sure to reset this for each sample
global QC1_SCORE_CACHE
QC1_SCORE_CACHE = {}
@@ -101,7 +108,7 @@ def main_predict_haplogroup(
return [haplotype_dict, best_haplotype_score, folder]


def main(namespace: argparse.Namespace = None):
def main(namespace: Optional[argparse.Namespace] = None):
"""Main entry point for prediction script"""
LOG.info("Starting haplogroup prediction...")
if namespace is None:
@@ -113,7 +120,7 @@ def main(namespace: argparse.Namespace = None):
final_table = []

with multiprocessing.Pool(processes=threads) as p:
predictions = p.map(partial(main_predict_haplogroup, namespace), read_input_folder(in_folder))
predictions = p.map(partial(main_predict_haplogroup, namespace, BACKBONE_GROUPS, MAIN_HAPLO_GROUPS), read_input_folder(in_folder))

for haplotype_dict, best_haplotype_score, folder in predictions:
if haplotype_dict is None:
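The extra parameters added to main_predict_haplogroup exist because worker processes do not necessarily share globals mutated in the parent: under the "spawn" start method (the default on macOS and Windows) each worker re-imports the module, so BACKBONE_GROUPS and MAIN_HAPLO_GROUPS would come up as empty sets again. A standalone illustration of the failure mode and the fix (not part of this PR):

    import multiprocessing as mp

    SHARED: set = set()

    def worker_broken(x: int) -> bool:
        return x in SHARED  # empty under spawn: the module was re-imported

    def worker_fixed(shared: set, x: int) -> bool:
        return x in shared  # state travels as an explicit argument

    if __name__ == "__main__":
        SHARED.update({1, 2, 3})  # mutation happens only in the parent
        with mp.get_context("spawn").Pool(2) as pool:
            print(pool.map(worker_broken, [1, 2]))            # [False, False]
            print(pool.starmap(worker_fixed,
                               [(SHARED, 1), (SHARED, 2)]))   # [True, True]

Passing the sets as arguments works because the pool pickles them into each task, exactly as the partial(...) call above now does for the two marker sets.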
@@ -146,6 +153,7 @@ def get_arguments() -> argparse.Namespace:
def read_backbone_groups():
"""Read some basic data that is always needed"""
global BACKBONE_GROUPS, MAIN_HAPLO_GROUPS

with open(yleaf_constants.DATA_FOLDER / yleaf_constants.HG_PREDICTION_FOLDER / "major_tables/Intermediates.txt") as f:
for line in f:
if "~" in line:
@@ -192,9 +200,9 @@ def read_yleaf_out_file(
def get_most_likely_haplotype(
tree: Tree,
haplotype_dict: Dict[str, HgMarkersLinker],
treshold: float
threshold: float,
) -> Tuple[str, str, int, int, int, int, int]:
"""Get the most specific haplogroup with a score above the treshold. The score is build up from 3 parts:
"""Get the most specific haplogroup with a score above the threshold. The score is build up from 3 parts:
QC1: This score indicates whether the predicted haplogroup follows the
expected backbone of the haplogroup tree structure (i.e. if haplogroup E
is predicted the markers defining: A0-T, A1, A1b, BT, CT, DE should be
@@ -220,7 +228,7 @@ in the haplogroup prediction.
in the haplogroup prediction.

First all possible haplogroups are sorted based on the depth in the tree. This is the order haplogroup scores will
be calculated to determine if they are above the treshold. As soon as the treshold is reached the function returns.
be calculated to determine if they are above the threshold. As soon as the threshold is reached, the function returns.
"""
sorted_depth_haplotypes = sorted(haplotype_dict.keys(), key=lambda k: tree.get(k).depth, reverse=True)
covered_nodes = set()
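The early continues in the loop below rest on an observation from the docstring: each QC score lies in [0, 1], so their product can never exceed the smallest factor, and a single sub-threshold score already rules the candidate out. A worked sketch of that short-circuit (names are illustrative, not the module's API):

    from typing import Optional

    def combined_score(qc1: float, qc2: float, qc3: float,
                       threshold: float) -> Optional[float]:
        # a product of values <= 1 is bounded by its smallest factor,
        # so any sub-threshold component makes the total sub-threshold too
        if min(qc1, qc2, qc3) < threshold:
            return None
        return qc1 * qc2 * qc3

    combined_score(0.99, 0.98, 0.97, threshold=0.95)  # ~0.9411

Note the converse does not hold: with a threshold of 0.95, three passing scores of 0.99, 0.98 and 0.97 multiply to about 0.941, which is why the loop still compares the final total_score against the threshold afterwards.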
@@ -256,27 +264,27 @@

qc1_score = get_qc1_score(path, haplotype_dict)

# if any of the scores are below treshold, the total can not be above so ignore
if qc1_score < treshold:
# if any of the scores are below the threshold, the total cannot be above it, so ignore
if qc1_score < threshold:
continue

if haplotype_dict[node.name].nr_total == 0:
qc2_score = 0
else:
qc2_score = haplotype_dict[node.name].nr_derived / haplotype_dict[node.name].nr_total
if qc2_score < treshold:
if qc2_score < threshold:
continue
if qc3_score[1] == 0:
qc3_score = 0
else:
qc3_score = qc3_score[0] / qc3_score[1]
if qc3_score < treshold:
if qc3_score < threshold:
continue

total_score = product([qc1_score, qc2_score, qc3_score])

# if above filter we found the hit
if total_score > treshold:
if total_score > threshold:
ancestral_children = get_ancestral_children(node, haplotype_dict, tree)
best_score = (node.name, ancestral_children, qc1_score, qc2_score, qc3_score, total_score, node.depth)
break