1 change: 1 addition & 0 deletions yleaf/.gitignore
@@ -0,0 +1 @@
data/**/*.fa
48 changes: 23 additions & 25 deletions yleaf/Yleaf.py
100644 → 100755
@@ -91,14 +91,14 @@ def run_vcf(
safe_create_dir(sample_vcf_folder, args.force)

sample_vcf_file_txt = sample_vcf_folder / (sample_vcf_file.name.replace(".vcf.gz", ".txt"))
cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n' {sample_vcf_file} > {sample_vcf_file_txt}"
cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n' '{sample_vcf_file}' > '{sample_vcf_file_txt}'"
call_command(cmd)

pileupfile = pd.read_csv(sample_vcf_file_txt,
dtype=str, header=None, sep="\t")

# remove sample_vcf_file_txt
cmd = f"rm {sample_vcf_file_txt}"
cmd = f"rm '{sample_vcf_file_txt}'"
call_command(cmd)

pileupfile.columns = ['chr', 'pos', 'refbase', 'altbase', 'reads']
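Note on the quoting pattern used throughout this diff: wrapping each path in single quotes guards against spaces, but a path that itself contains a single quote would still break the command. A minimal sketch of a more defensive alternative using the standard-library shlex.quote (a hypothetical helper, not part of this PR):

    import shlex
    from pathlib import Path

    def quoted_bcftools_query(vcf: Path, out_txt: Path) -> str:
        # shlex.quote escapes every shell metacharacter, including an
        # embedded single quote, which manual '...' wrapping does not survive
        fmt = r"'%CHROM\t%POS\t%REF\t%ALT[\t%AD]\n'"
        return (f"bcftools query -f {fmt} {shlex.quote(str(vcf))}"
                f" > {shlex.quote(str(out_txt))}")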
@@ -255,19 +255,19 @@ def main_vcf_split(
):
# first sort the vcf file
sorted_vcf_file = base_out_folder / (vcf_file.name.replace(".vcf.gz", ".sorted.vcf.gz"))
cmd = f"bcftools sort -O z -o {sorted_vcf_file} {vcf_file}"
cmd = f"bcftools sort -O z -o '{sorted_vcf_file}' '{vcf_file}'"
try:
call_command(cmd)
except SystemExit:
LOG.error(f"Failed to sort the vcf file {vcf_file.name}. Skipping...")
return None

# next index the vcf file
cmd = f"bcftools index -f {sorted_vcf_file}"
cmd = f"bcftools index -f '{sorted_vcf_file}'"
call_command(cmd)

# get chromosome annotation using bcftools query -f '%CHROM\n' sorted.vcf.gz | uniq
cmd = f"bcftools query -f '%CHROM\n' {sorted_vcf_file} | uniq"
cmd = f"bcftools query -f '%CHROM\n' '{sorted_vcf_file}' | uniq"
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
@@ -293,21 +293,21 @@ main_vcf_split(
# filter the vcf file using the reference bed file
filtered_vcf_file = base_out_folder / "filtered_vcf_files" / (
sorted_vcf_file.name.replace(".sorted.vcf.gz", ".filtered.vcf.gz"))
cmd = f"bcftools view -O z -R {new_position_bed_file} {sorted_vcf_file} > {filtered_vcf_file}"
cmd = f"bcftools view -O z -R '{new_position_bed_file}' '{sorted_vcf_file}' > '{filtered_vcf_file}'"
call_command(cmd)

# remover temp_position_bed.bed
cmd = f"rm {new_position_bed_file}"
cmd = f"rm '{new_position_bed_file}'"
call_command(cmd)

# remove sorted.vcf.gz and sorted.vcf.gz.csi
cmd = f"rm {sorted_vcf_file}"
cmd = f"rm '{sorted_vcf_file}'"
call_command(cmd)
cmd = f"rm {sorted_vcf_file}.csi"
cmd = f"rm '{sorted_vcf_file}.csi'"
call_command(cmd)

# check number of samples in the vcf file
cmd = f"bcftools query -l {filtered_vcf_file} | wc -l"
cmd = f"bcftools query -l '{filtered_vcf_file}' | wc -l"
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
@@ -319,7 +319,7 @@ def main_vcf_split(
# split the vcf file into separate files for each sample
split_vcf_folder = base_out_folder / (vcf_file.name.replace(".vcf.gz", "_split"))
safe_create_dir(split_vcf_folder, args.force)
cmd = f"bcftools +split {filtered_vcf_file} -Oz -o {split_vcf_folder}"
cmd = f"bcftools +split '{filtered_vcf_file}' -Oz -o '{split_vcf_folder}'"
call_command(cmd)
sample_vcf_files = get_files_with_extension(split_vcf_folder, '.vcf.gz')
elif num_samples == 1:
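The sample count a few lines up shells out to `bcftools query -l ... | wc -l` through Popen with shell=True; the same number can be obtained by letting Python count the lines, which drops both the pipe and the shell quoting. A sketch assuming bcftools is on PATH (not part of this PR):

    import subprocess

    def count_vcf_samples(vcf_path: str) -> int:
        # `bcftools query -l` prints one sample name per line
        result = subprocess.run(
            ["bcftools", "query", "-l", vcf_path],
            capture_output=True, text=True, check=True,
        )
        return len(result.stdout.splitlines())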
@@ -559,26 +559,24 @@ def main_fastq(
fastq_file2 = Path(str(fastq_file).replace("_R1", "_R2"))
LOG.info(f"Starting with running for {fastq_file} and {fastq_file2}")
sam_file = bam_folder / "temp_fastq_sam.sam"
fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} {reference} {fastq_file} {fastq_file2} > {sam_file}"
fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} '{reference}' '{fastq_file}' '{fastq_file2}' > '{sam_file}'"
call_command(fastq_cmd)
bam_file = bam_folder / (fastq_file.name.rsplit("_R1", 1)[0] + ".bam")
cmd = "samtools view -@ {} -bS {} | samtools sort -@ {} -m 2G -o {}".format(args.threads, sam_file,
args.threads, bam_file)
cmd = f"samtools view -@ {args.threads} -bS '{sam_file}' | samtools sort -@ '{arg.threads}' -m 2G -o '{bam_file}'"
call_command(cmd)
cmd = "samtools index -@ {} {}".format(args.threads, bam_file)
cmd = f"samtools index -@ {arg.threads} '{bam_file}'"
call_command(cmd)
os.remove(sam_file)
elif "_R1" not in str(fastq_file) and "_R2" not in str(fastq_file) and ".gz" in str(fastq_file):
LOG.info(f"Starting with running for {fastq_file}")
sam_file = bam_folder / "temp_fastq_sam.sam"

fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} {reference} {fastq_file} > {sam_file}"
fastq_cmd = f"minimap2 -ax sr -k14 -w7 -t {args.threads} '{reference}' '{fastq_file}' > '{sam_file}'"
call_command(fastq_cmd)
bam_file = bam_folder / (fastq_file.name.rsplit(".", 1)[0] + ".bam")
cmd = "samtools view -@ {} -bS {} | samtools sort -@ {} -m 2G -o {}".format(args.threads, sam_file,
args.threads, bam_file)
cmd = f"samtools view -@ {args.threads} -bS '{sam_file}' | samtools sort -@ {args.threads} -m 2G -o '{bam_file}'"
call_command(cmd)
cmd = "samtools index -@ {} {}".format(args.threads, bam_file)
cmd = f"samtools index -@ {args.threads} '{bam_file}'"
call_command(cmd)
os.remove(sam_file)
args.bamfile = bam_folder
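The view-then-sort pipelines above interpolate paths into a shell string; the same two-stage pipeline can be wired with list-style arguments, which avoids shell quoting altogether. A hedged sketch (helper name is illustrative, not part of this PR):

    import subprocess

    def sam_to_sorted_bam(sam_file: str, bam_file: str, threads: int) -> None:
        # samtools view emits BAM on stdout; samtools sort consumes it on stdin
        view = subprocess.Popen(
            ["samtools", "view", "-@", str(threads), "-bS", sam_file],
            stdout=subprocess.PIPE,
        )
        subprocess.run(
            ["samtools", "sort", "-@", str(threads), "-m", "2G", "-o", bam_file],
            stdin=view.stdout, check=True,
        )
        view.stdout.close()  # release the parent's copy of the pipe
        view.wait()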
@@ -697,11 +695,11 @@ def samtools(

if is_bam_pathfile:
if not any([Path(str(path_file) + ".bai").exists(), Path(str(path_file).rstrip(".bam") + '.bai').exists()]):
cmd = "samtools index -@{} {}".format(args.threads, path_file)
cmd = f"samtools index -@{args.threads} '{path_file}'"
call_command(cmd)
else:
if not any([Path(str(path_file) + ".crai").exists(), Path(str(path_file).rstrip(".cram") + '.crai').exists()]):
cmd = "samtools index -@{} {}".format(args.threads, path_file)
cmd = f"samtools index -@{args.threads} '{path_file}'"
call_command(cmd)
header, mapped, unmapped = chromosome_table(path_file, output_folder, output_folder.name)
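One caveat the surrounding context carries over: str(path_file).rstrip(".bam") strips any run of trailing characters drawn from the set {., b, a, m}, not the literal suffix, so "sample.bam" becomes "sample" but "lamb.bam" becomes "l". A suffix-safe existence check sketched with pathlib (hypothetical helper, not part of this PR):

    from pathlib import Path

    def has_index(path_file: Path, index_ext: str) -> bool:
        # covers both conventions: sample.bam.bai and sample.bai
        return (Path(str(path_file) + index_ext).exists()
                or path_file.with_suffix(index_ext).exists())

For a BAM the call would be has_index(path_file, ".bai"), for a CRAM has_index(path_file, ".crai").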

@@ -732,7 +730,7 @@ def chromosome_table(
output = path_folder / (file_name + '.chr')
tmp_output = path_folder / "tmp.txt"
f = open(tmp_output, "w")
cmd = f"samtools idxstats {path_file}"
cmd = f"samtools idxstats '{path_file}'"
call_command(cmd, stdout_location=f)
df_chromosome = pd.read_table(tmp_output, header=None)

@@ -818,8 +816,8 @@ def execute_mpileup(
cmd += f" -l {str(bed)}"

if reference is not None:
cmd += f" -f {str(reference)}"
cmd += f" -AQ{quality_thresh}q1 {str(bam_file)} > {str(pileupfile)}"
cmd += f" -f '{str(reference)}'"
cmd += f" -AQ{quality_thresh}q1 '{str(bam_file)}' > '{str(pileupfile)}'"
call_command(cmd)


@@ -1191,7 +1189,7 @@ def predict_haplogroup(
):
if use_old:
script = yleaf_constants.SRC_FOLDER / "old_predict_haplogroup.py"
cmd = "python {} -i {} -o {}".format(script, path_file, output)
cmd = "python '{script}' -i '{path_file}' -o '{output}'"
call_command(cmd)
else:
from yleaf import predict_haplogroup
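Invoking the helper script through a bare python picks up whatever interpreter is first on PATH, which may differ from the one running Yleaf. A sketch that pins the current interpreter and sidesteps quoting via list arguments (an alternative, not what this PR does):

    import subprocess
    import sys

    def run_old_predictor(script: str, path_file: str, output: str) -> None:
        # sys.executable is the interpreter executing Yleaf itself
        subprocess.run(
            [sys.executable, script, "-i", path_file, "-o", output],
            check=True,
        )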
Empty file removed yleaf/data/hg19/chrY.fa
Empty file.
Empty file removed yleaf/data/hg19/full_reference.fa
Empty file.
1 change: 0 additions & 1 deletion yleaf/data/hg38/chrY.fa

This file was deleted.

Empty file removed yleaf/data/hg38/full_reference.fa
Empty file.
36 changes: 22 additions & 14 deletions yleaf/predict_haplogroup.py
100644 → 100755
@@ -8,21 +8,21 @@
License: GNU General Public License v3 or later
A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt.

Autor: Bram van Wersch
Author: Bram van Wersch
"""

import argparse
import multiprocessing
from functools import partial
from pathlib import Path
from typing import Set, Dict, Iterator, List, Any, Union, Tuple
from typing import Set, Dict, Iterator, List, Any, Union, Tuple, Optional
import logging

from yleaf import yleaf_constants
from yleaf.tree import Tree, Node

BACKBONE_GROUPS: Set = set()
MAIN_HAPLO_GROUPS: Set = set()
BACKBONE_GROUPS: Set[str] = set()
MAIN_HAPLO_GROUPS: Set[str] = set()
QC1_SCORE_CACHE: Dict[str, float] = {}

DEFAULT_MIN_SCORE = 0.95
@@ -81,8 +81,15 @@ def get_state(self) -> str:

def main_predict_haplogroup(
namespace: argparse.Namespace,
backbone_groups: Set[str],
main_haplo_groups: Set[str],
folder: Path,
):
# set these global vars because we might be inside a new process
global BACKBONE_GROUPS, MAIN_HAPLO_GROUPS
BACKBONE_GROUPS = backbone_groups
MAIN_HAPLO_GROUPS = main_haplo_groups

# make sure to reset this for each sample
global QC1_SCORE_CACHE
QC1_SCORE_CACHE = {}
@@ -101,7 +108,7 @@ def main_predict_haplogroup(
return [haplotype_dict, best_haplotype_score, folder]


def main(namespace: argparse.Namespace = None):
def main(namespace: Optional[argparse.Namespace] = None):
"""Main entry point for prediction script"""
LOG.info("Starting haplogroup prediction...")
if namespace is None:
@@ -113,7 +120,7 @@ def main(namespace: argparse.Namespace = None):
final_table = []

with multiprocessing.Pool(processes=threads) as p:
predictions = p.map(partial(main_predict_haplogroup, namespace), read_input_folder(in_folder))
predictions = p.map(partial(main_predict_haplogroup, namespace, BACKBONE_GROUPS, MAIN_HAPLO_GROUPS), read_input_folder(in_folder))

for haplotype_dict, best_haplotype_score, folder in predictions:
if haplotype_dict is None:
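The extra parameters added to main_predict_haplogroup exist because worker processes do not necessarily share globals mutated in the parent: under the "spawn" start method (the default on macOS and Windows) each worker re-imports the module, so BACKBONE_GROUPS and MAIN_HAPLO_GROUPS would come up as empty sets again. A standalone illustration of the failure mode and the fix (not part of this PR):

    import multiprocessing as mp

    SHARED: set = set()

    def worker_broken(x: int) -> bool:
        return x in SHARED  # empty under spawn: the module was re-imported

    def worker_fixed(shared: set, x: int) -> bool:
        return x in shared  # state travels as an explicit argument

    if __name__ == "__main__":
        SHARED.update({1, 2, 3})  # mutation happens only in the parent
        with mp.get_context("spawn").Pool(2) as pool:
            print(pool.map(worker_broken, [1, 2]))            # [False, False]
            print(pool.starmap(worker_fixed,
                               [(SHARED, 1), (SHARED, 2)]))   # [True, True]

Passing the sets as arguments works because the pool pickles them into each task, exactly as the partial(...) call above now does for the two marker sets.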
@@ -146,6 +153,7 @@ def get_arguments() -> argparse.Namespace:
def read_backbone_groups():
"""Read some basic data that is always needed"""
global BACKBONE_GROUPS, MAIN_HAPLO_GROUPS

with open(yleaf_constants.DATA_FOLDER / yleaf_constants.HG_PREDICTION_FOLDER / "major_tables/Intermediates.txt") as f:
for line in f:
if "~" in line:
@@ -192,9 +200,9 @@ def read_yleaf_out_file(
def get_most_likely_haplotype(
tree: Tree,
haplotype_dict: Dict[str, HgMarkersLinker],
treshold: float
threshold: float,
) -> Tuple[str, str, int, int, int, int, int]:
"""Get the most specific haplogroup with a score above the treshold. The score is build up from 3 parts:
"""Get the most specific haplogroup with a score above the threshold. The score is build up from 3 parts:
QC1: This score indicates whether the predicted haplogroup follows the
expected backbone of the haplogroup tree structure (i.e. if haplogroup E
is predicted the markers defining: A0-T, A1, A1b, BT, CT, DE should be
@@ -220,7 +228,7 @@ in the haplogroup prediction.
in the haplogroup prediction.

First all possible haplogroups are sorted based on the depth in the tree. This is the order haplogroup scores will
be calculated to determine if they are above the treshold. As soon as the treshold is reached the function returns.
be calculated to determine if they are above the threshold. As soon as the threshold is reached, the function returns.
"""
sorted_depth_haplotypes = sorted(haplotype_dict.keys(), key=lambda k: tree.get(k).depth, reverse=True)
covered_nodes = set()
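The early continues in the loop below rest on an observation from the docstring: each QC score lies in [0, 1], so their product can never exceed the smallest factor, and a single sub-threshold score already rules the candidate out. A worked sketch of that short-circuit (names are illustrative, not the module's API):

    from typing import Optional

    def combined_score(qc1: float, qc2: float, qc3: float,
                       threshold: float) -> Optional[float]:
        # a product of values <= 1 is bounded by its smallest factor,
        # so any sub-threshold component makes the total sub-threshold too
        if min(qc1, qc2, qc3) < threshold:
            return None
        return qc1 * qc2 * qc3

    combined_score(0.99, 0.98, 0.97, threshold=0.95)  # ~0.9411

Note the converse does not hold: with a threshold of 0.95, three passing scores of 0.99, 0.98 and 0.97 multiply to about 0.941, which is why the loop still compares the final total_score against the threshold afterwards.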
@@ -256,27 +264,27 @@

qc1_score = get_qc1_score(path, haplotype_dict)

# if any of the scores are below treshold, the total can not be above so ignore
if qc1_score < treshold:
# if any of the scores are below the threshold, the total cannot be above it, so ignore
if qc1_score < threshold:
continue

if haplotype_dict[node.name].nr_total == 0:
qc2_score = 0
else:
qc2_score = haplotype_dict[node.name].nr_derived / haplotype_dict[node.name].nr_total
if qc2_score < treshold:
if qc2_score < threshold:
continue
if qc3_score[1] == 0:
qc3_score = 0
else:
qc3_score = qc3_score[0] / qc3_score[1]
if qc3_score < treshold:
if qc3_score < threshold:
continue

total_score = product([qc1_score, qc2_score, qc3_score])

# if above filter we found the hit
if total_score > treshold:
if total_score > threshold:
ancestral_children = get_ancestral_children(node, haplotype_dict, tree)
best_score = (node.name, ancestral_children, qc1_score, qc2_score, qc3_score, total_score, node.depth)
break