diff --git a/.github/skip_nf_test.json b/.github/skip_nf_test.json index d7443725..8b5688a8 100644 --- a/.github/skip_nf_test.json +++ b/.github/skip_nf_test.json @@ -1,5 +1,9 @@ { "conda": [ + "modules/msk/custom/fingerprintvcfparser", + "modules/msk/custom/fingerprintcontamination", + "modules/msk/custom/fingerprintcombine", + "modules/msk/custom/fingerprintcorrelation", "modules/msk/calculatenoise", "modules/msk/ppflagfixer", "modules/msk/facets", @@ -28,6 +32,7 @@ "modules/msk/phylowgs/parsecnvs", "modules/msk/pvmaf/concat", "modules/msk/pvmaf/tagtraceback", + "subworkflows/msk/fingerprint_gbcms", "subworkflows/msk/genome_nexus", "modules/msk/oncokb/mafannotate" ], diff --git a/modules/msk/custom/fingerprintcombine/environment.yml b/modules/msk/custom/fingerprintcombine/environment.yml new file mode 100644 index 00000000..8a3b7591 --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: +- conda-forge +- bioconda +dependencies: +- conda-forge::r-argparse=2.2.5 +- conda-forge::r-data.table=1.17.8 +- conda-forge::r-dplyr=1.1.4 +- conda-forge::r-plyr=1.8.9 +- conda-forge::r-tidyverse=2.0.0 diff --git a/modules/msk/custom/fingerprintcombine/main.nf b/modules/msk/custom/fingerprintcombine/main.nf new file mode 100644 index 00000000..ffb406d5 --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/main.nf @@ -0,0 +1,57 @@ +process CUSTOM_FINGERPRINTCOMBINE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66': + 'community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66' }" + //' oras://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:d96a65055f79744c': + + + input: + tuple val(meta), path(fp_tsv), val(sample), val(genome_build), val(patient) + path(liftover_loci_mapping) + + output: + tuple val(meta), path("*DPfilter_ALL_FP.txt") , emit: combined_fp_tsv + tuple val("${task.process}"), val('complete_FP_table.R'), val('0.1.0'), emit: versions_fingerprintcombine, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + declare -a fp_tsv_list + declare -a sample_list + declare -a genome_build_list + declare -a patient_list + fp_tsv_list=(${fp_tsv.join(' ')}) + sample_list=(${sample.join(' ')}) + genome_build_list=(${genome_build.join(' ')}) + patient_list=(${patient.join(' ')}) + echo -e "sample_id\tgenome_build\tfp_tsv\tpatient" > input.tsv + for i in \$(seq 0 1 \$((\${#fp_tsv_list[@]}-1)) ) ; do + fp_tsv=\${fp_tsv_list[i]} + sample=\${sample_list[i]} + genome=\${genome_build_list[i]} + patient=\${patient_list[i]} + echo -e "\$sample\t\$genome\t\$fp_tsv\t\$patient" + done >> input.tsv + + complete_FP_table.R \\ + -i input.tsv \\ + -l $liftover_loci_mapping \\ + $args + """ + + stub: + def args = task.ext.args ?: '' + + """ + echo $args + + touch XDPfilter_ALL_FP.txt + """ +} diff --git a/modules/msk/custom/fingerprintcombine/meta.yml b/modules/msk/custom/fingerprintcombine/meta.yml new file mode 100644 index 00000000..7ed95b68 --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/meta.yml @@ -0,0 +1,82 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_fingerprintcombine" +description: | + A module to combine multiple fingerprint TSV 
files into a single comprehensive + table, with optional liftover of loci coordinates. +keywords: + - fingerprint + - qc + - loci + - tsv + - correlation +tools: + - "custom": + description: "A custom R script to combine fingerprint TSV files" + homepage: "https://github.com/mskcc-omics-workflows/modules/tree/main/modules/msk/custom/fingerprintcombine/meta.yml" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - fp_tsv: + type: file + description: | + Fingerprint TSV files to be combined. + Structure: [ val(sample), val(genome_build), path(fp_tsv) ] + ontologies: [] + - sample: + type: string + description: Sample identifier corresponding to each fingerprint TSV file. + - genome_build: + type: string + description: + Genome build (e.g., hg19, hg38) corresponding to each fingerprint + TSV file. + - - liftover_loci_mapping: + type: file + description: | + A TSV file mapping original loci to liftover loci. + Format: original_chr, original_pos, liftover_chr, liftover_pos + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV +output: + combined_fp_tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*DPfilter_ALL_FP.txt": + type: file + description: Wide table combining all input fingerprint TSV files. 
+ pattern: "*DPfilter_ALL_FP.txt" + ontologies: + - edam: http://edamontology.org/format_3750 # TSV + versions_fingerprintcombine: + - - ${task.process}: + type: string + description: The name of the process + - complete_FP_table.R: + type: string + description: The name of the tool + - 0.1.0: + type: string + description: Version of the custom script +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - complete_FP_table.R: + type: string + description: The name of the tool + - 0.1.0: + type: string + description: Version of the custom script +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/custom/fingerprintcombine/resources/usr/bin/complete_FP_table.R b/modules/msk/custom/fingerprintcombine/resources/usr/bin/complete_FP_table.R new file mode 100755 index 00000000..f551459e --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/resources/usr/bin/complete_FP_table.R @@ -0,0 +1,110 @@ +#! /usr/bin/env Rscript + +#------------------------------------------------------------------------------- +# Script: complete_FP_table.R +# Author: Erika Gedvilaite +# Date: 2025-09-23 +# Version: 0.1.0 +# +# Description: This script takes in standard fingerprint tables and combines +# them into a single, wide table for downstream plotting and analysis. 
+# +# Annotation: +# - Input table should have four columns: sample_id, genome_build, fp_tsv, patient +# - Genome build should be either "hg19" or "hg38" or "GRCh37" or "GRCh38" +# (case insensitive) +# +#------------------------------------------------------------------------------- + + +rm(list=ls()) + +library(argparse, quietly = T) +library(plyr, quietly = T) +library(dplyr, quietly = T) +library(data.table, quietly = T) +library(tidyverse, quietly = T) + +`%notin%` <- Negate(`%in%`) +`%notlike%` <- Negate(`%like%`) + +parser = ArgumentParser(description = 'Generate FP tables for plotting') +parser$add_argument('-i', '--input_table', required = TRUE, + help = 'Input table with paths to individual fingerprint TSV files, sample ids, and genome build') +parser$add_argument('-o', '--analysis_folder', required = FALSE, default = ".", + help = 'Output folder') +parser$add_argument('-l', '--loci_mapper', required = TRUE, + help = 'Loci mapper file') +parser$add_argument('-d', '--depth_filter', required = FALSE, default = 20, + help = 'Depth filter to apply to individual fingerprint TSV files (default: 20)') +args = parser$parse_args() + + + +message("Reading in Liftover file") + +hg19_hg38_mapper = fread(args$loci_mapper,header = T) +hg19_hg38_mapper$Loci_hg19 = paste(hg19_hg38_mapper$GRCH37_CHROM,hg19_hg38_mapper$GRCH37_POS,sep=":") +hg19_hg38_mapper$Loci_hg38 = paste(hg19_hg38_mapper$GRCH38_CHROM,hg19_hg38_mapper$GRCH38_POS,sep=":") +hg19_hg38_mapper = hg19_hg38_mapper %>% select(Loci_hg19, Loci_hg38) %>% unique() + +message("Loading Samples") +input_table = fread(args$input_table, header = T) %>% arrange(patient, sample_id) +for (i in 1:nrow(input_table)){ + sample = input_table$sample_id[i] + genome_build = input_table$genome_build[i] + print(genome_build) + if (tolower(genome_build) %notin% c("hg19","grch37","hg38","grch38")){ + stop(paste0("Genome build not recognized: ", genome_build, ". 
Must be in the following list: hg19, hg38, grch37, grch38 (case will be ignored).")) + } + file = input_table$fp_tsv[i] + if (!file.exists(file)){ + stop(paste0("File does not exist: ", input_table$fp_tsv[i])) + } + temp_dataset <- fread(file, header = T, sep="\t") + colnames(temp_dataset) = c("Locus", "Count", "Genotype","VAF") + temp_dataset = separate(temp_dataset, Count, into = c(NA,'DP1',NA,'DP2'), remove = F) + temp_dataset$DP2[is.na(temp_dataset$DP2)==T] <- 0 + temp_dataset$DP = as.numeric(temp_dataset$DP1) + as.numeric(temp_dataset$DP2) + temp_dataset = temp_dataset[temp_dataset$DP >= args$depth_filter,] ## keeping loci >= 20 dp by default + temp_dataset$VAF[is.na(temp_dataset$VAF)==T] <- 0 + #temp_dataset$Sample = sample #only loci with DP >= depth filter will have Sample info + temp_dataset$Sample <- rep(sample, nrow(temp_dataset)) + temp_dataset = temp_dataset %>% select("Locus","Genotype","Sample","VAF") + temp_dataset$Locus = str_replace(temp_dataset$Locus,"chr","") + + if (tolower(genome_build) %in% c("hg19","grch37")){ + temp_dataset = merge(hg19_hg38_mapper, temp_dataset, by.x = "Loci_hg19", by.y = "Locus", all.x = T) + temp_dataset$VAF[is.na(temp_dataset$VAF)==T] <- 0 + } else if (tolower(genome_build) %in% c("hg38","grch38")){ + temp_dataset = merge(hg19_hg38_mapper, temp_dataset, by.x = "Loci_hg38", by.y = "Locus", all.x = T) + temp_dataset$VAF[is.na(temp_dataset$VAF)==T] <- 0 + } + + if (!exists("all_gbcm")){ + all_gbcm = temp_dataset + } else { + all_gbcm = rbind(all_gbcm, temp_dataset) + } +} +all_gbcm = all_gbcm[is.na(all_gbcm$Sample)==F,] # filters out loci that don't have Sample info (i.e. 
loci not passing DP filter) +all_gbcm$VAF = round(as.numeric(all_gbcm$VAF), 5) + +wide_all_gbcm = all_gbcm %>% pivot_wider(names_from = Sample, values_from = c(Genotype, VAF)) + +message("Creating final GBCM file") + +all_fp_gbcm_final = merge(hg19_hg38_mapper, wide_all_gbcm,all.x = T) + +if (!dir.exists(args$analysis_folder)) { + dir.create(args$analysis_folder, recursive = TRUE) +} else { + print(paste("Directory already exists:", args$analysis_folder)) +} + +message(paste("Output file: ", args$analysis_folder,"/",args$depth_filter,"DPfilter_ALL_FP.txt", sep="")) + +all_fp_gbcm_final <- apply(all_fp_gbcm_final,2,as.character) +write.table(all_fp_gbcm_final, file = paste(args$analysis_folder,"/",args$depth_filter,"DPfilter_ALL_FP.txt", sep=""), append = F, sep = "\t", row.names = F, quote = F) + +message("FP file completed") diff --git a/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv b/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv new file mode 100644 index 00000000..0339b805 --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv @@ -0,0 +1,10 @@ +GRCH37_CHROM GRCH37_POS GRCH38_CHROM GRCH38_POS +MT192765.1 197 MT192765.1 199 +MT192765.1 4788 MT192765.1 4900 +MT192765.1 8236 MT192765.1 8257 +MT192765.1 10506 MT192765.1 10528 +MT192765.1 11037 MT192765.1 11059 +MT192765.1 15009 MT192765.1 15500 +MT192765.1 18807 MT192765.1 18929 +MT192765.1 23813 MT192765.1 24835 +MT192765.1 24103 MT192765.1 25125 diff --git a/modules/msk/custom/fingerprintcombine/tests/main.nf.test b/modules/msk/custom/fingerprintcombine/tests/main.nf.test new file mode 100644 index 00000000..0cb6e4d9 --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/tests/main.nf.test @@ -0,0 +1,105 @@ +// nf-core modules test custom/fingerprintcombine +nextflow_process { + + name "Test Process CUSTOM_FINGERPRINTCOMBINE" + script "../main.nf" + process "CUSTOM_FINGERPRINTCOMBINE" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag 
"custom" + tag "custom/fingerprintcombine" + tag "gbcms" + tag "custom/fingerprintvcfparser" + + test("sarscov2 - bam") { + + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = Channel.of( + [ + [ id:'test', sample:'test', pool:'mypool' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + [ + [ id:'test2', sample:'test2', pool:'mypool' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + ) + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("CUSTOM_FINGERPRINTVCFPARSER"){ + script "../../fingerprintvcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv + .map{ meta, tsv -> + println meta + [[id:meta.pool], tsv, meta.id, "hg19","default"] + }.groupTuple(by:[0]) + input[1] = file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id:"testsample"], + [file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: 
true)], + ["testsample"], + ["hg19"], + ["default"] + ] + input[1] = file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/custom/fingerprintcombine/tests/main.nf.test.snap b/modules/msk/custom/fingerprintcombine/tests/main.nf.test.snap new file mode 100644 index 00000000..85f90edd --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/tests/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "testsample" + }, + "XDPfilter_ALL_FP.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "CUSTOM_FINGERPRINTCOMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ], + "combined_fp_tsv": [ + [ + { + "id": "testsample" + }, + "XDPfilter_ALL_FP.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fingerprintcombine": [ + [ + "CUSTOM_FINGERPRINTCOMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-17T20:26:07.925718004" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "mypool" + }, + "0DPfilter_ALL_FP.txt:md5,66113c255cf1f52e27802183764a406d" + ] + ], + "1": [ + [ + "CUSTOM_FINGERPRINTCOMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ], + "combined_fp_tsv": [ + [ + { + "id": "mypool" + }, + "0DPfilter_ALL_FP.txt:md5,66113c255cf1f52e27802183764a406d" + ] + ], + "versions_fingerprintcombine": [ + [ + "CUSTOM_FINGERPRINTCOMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-17T20:25:58.985229402" + } +} \ No newline at end of file diff --git a/modules/msk/custom/fingerprintcombine/tests/nextflow.config b/modules/msk/custom/fingerprintcombine/tests/nextflow.config new file mode 100644 index 
00000000..583ce385 --- /dev/null +++ b/modules/msk/custom/fingerprintcombine/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'CUSTOM_FINGERPRINTCOMBINE' { + ext.args = "-d 0" + } +} diff --git a/modules/msk/custom/fingerprintcontamination/environment.yml b/modules/msk/custom/fingerprintcontamination/environment.yml new file mode 100644 index 00000000..21c00633 --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - numpy=2.3.3 + - pandas=2.3.2 diff --git a/modules/msk/custom/fingerprintcontamination/main.nf b/modules/msk/custom/fingerprintcontamination/main.nf new file mode 100644 index 00000000..4d48deda --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/main.nf @@ -0,0 +1,39 @@ +process CUSTOM_FINGERPRINTCONTAMINATION { // Runs calculate_contamination.py on a tumor fingerprint table (and a normal one, when given) and emits <prefix>.contamination.tsv + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ //'oras://community.wave.seqera.io/library/numpy_pandas:1f8cb70bfdb82865': + 'docker://community.wave.seqera.io/library/numpy_pandas:f27ed83387b3c038': + 'community.wave.seqera.io/library/numpy_pandas:f27ed83387b3c038' }" + + input: + tuple val(meta), path(fp_tumor), path(fp_normal) + + output: + tuple val(meta), path("*.contamination.tsv") , emit: contamination_tsv + tuple val("${task.process}"), val('calculate_contamination.py'), eval('calculate_contamination.py --version | cut -f 2 -d" "'), emit: versions_fingerprintvcfparser, topic: versions // the script only defines --version; a bare -v fails and the pipe through cut hides it, leaving an empty version. NOTE(review): emit name looks copied from fingerprintvcfparser; kept so existing callers do not break + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + calculate_contamination.py \\ + -t ${fp_tumor} \\ + -n ${fp_normal ?: fp_tumor} \\ + -o ${prefix}.contamination.tsv \\ + ${args} + + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.contamination.tsv + + """ +} diff --git a/modules/msk/custom/fingerprintcontamination/meta.yml b/modules/msk/custom/fingerprintcontamination/meta.yml new file mode 100644 index 00000000..162fff70 --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/meta.yml @@ -0,0 +1,63 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_fingerprintcontamination" +description: "Calculate major and minor contamination from fingerprint tables" +version: "0.1.0" +keywords: + - fingerprint + - contamination + - qc +tools: + - "pandas": + description: "Python Data Analysis Library" + homepage: "https://pandas.pydata.org/" + documentation: "https://pandas.pydata.org/docs/" + identifier: biotools:pandas + - "numpy": + description: "Scientific computing library for Python" + homepage: "https://numpy.org/" + documentation: "https://numpy.org/doc/" + + identifier: biotools:numpy +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - fp_tumor: + type: file + description: Fingerprint table file for tumor sample + pattern: "*.fp.tsv" + ontologies: + - edam: "http://edamontology.org/format_3750" # TSV + - edam: http://edamontology.org/format_3475 # TSV + - fp_normal: + type: file + description: Fingerprint table file for normal sample + pattern: "*.fp.tsv" + ontologies: + - edam: "http://edamontology.org/format_3750" + + - edam: http://edamontology.org/format_3475 # TSV +output: + contamination_tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.contamination.tsv": + type: file + description: Contamination results table + pattern: "*.contamination.tsv" + ontologies: + - edam: "http://edamontology.org/format_3750" # TSV + - edam: http://edamontology.org/format_3475 # TSV + versions_fingerprintvcfparser: + - - ${task.process}: {} + - calculate_contamination.py: {} + - 'calculate_contamination.py -v | cut -f 2 -d" ': {} +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/custom/fingerprintcontamination/resources/usr/bin/calculate_contamination.py b/modules/msk/custom/fingerprintcontamination/resources/usr/bin/calculate_contamination.py new file mode 100755 index 00000000..23febaff --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/resources/usr/bin/calculate_contamination.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python + + +""" +Calculates contamination from fingerprint table +""" + +__author__ = "Hanan Salim" +__email__ = "salimh@mskcc.org" +__contributors__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +__version__ = "0.1.0" +__status__ = "Dev" + +import argparse +import pandas as pd +import numpy as np +import os +import sys + +def major_contamination(tumor, depth_filter): + tumor_filtered = get_coverage(tumor, depth_filter) + + homozygous = ['AA','CC','GG','TT','A','C','G','T'] + heterozygous = ~tumor_filtered['Genotype'].isin(homozygous) + + try: + 
return sum(heterozygous)/tumor_filtered.shape[0] + except Exception as e: + return 0 + +def get_coverage(file, depth_filter): + #print(file['Alleles'].str.split(' ', expand=True)) + file[['A1', 'A2']] = file['Alleles'].str.split(' ', expand=True) + + A1_count = list(file['A1'].str.split(':', expand=True)[1]) + A2_count = list(file['A2'].str.split(':', expand=True)[1]) + A1_int = list(map(int, A1_count)) + A2_int = list(map(int, A2_count)) + + file['coverage'] = list(map(lambda x, y: x + y, A1_int, A2_int)) + + filtered_data = file[file['coverage'] > depth_filter] + + return(filtered_data) + +def minor_contamination(normal, tumor, depth_filter): + homozygous_sites = normal.index[normal['MAF'] < .10] + tumor_homozygous = tumor.loc[[i for i in homozygous_sites if i in tumor.index]] + tumor_homozygous_filtered = get_coverage(tumor_homozygous, depth_filter) + + return tumor_homozygous_filtered['MAF'].mean() + +def main(): + parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), description='Calculate major and minor contamination') + + parser.add_argument('-t','--tumor', + required=True, + help='Tumor fingerprint table file') + + parser.add_argument('-n','--normal', + required=True, + help='Normal fingerprint table file') + + parser.add_argument('-o','--output', + required=True, + help='Output file for contamination results') + + parser.add_argument('-d','--depthfilter', + required=False, + default=20, + type=int, + help='Depth filter for coverage (default: 20)' + ) + + parser.add_argument('--version', + action='version', + version='%(prog)s ' + __version__ + ) + + args = parser.parse_args() + + fields = ['Position', 'Alleles', 'Genotype', 'MAF'] + + tumor = pd.read_csv(args.tumor, sep='\t',names=fields,header=0) + tumor = tumor[~tumor['Position'].str.contains('X|Y', na=False)] + tumor = tumor.set_index('Position') + normal = pd.read_csv(args.normal, sep='\t',names=fields,header=0) + normal = normal[~normal['Position'].str.contains('X|Y', na=False)] + 
normal = normal.set_index('Position') + + major_contam = major_contamination(tumor, depth_filter=args.depthfilter) + minor_contam = minor_contamination(normal, tumor, depth_filter=args.depthfilter) + + with open(args.output,'w') as f: + f.write("Tumor\tNormal\tMajor_Contamination\tMinor_Contamination\n") + f.write("{}\t{}\t{:.4f}\t{:.4f}\n".format( + os.path.basename(args.tumor), + os.path.basename(args.normal), + major_contam, + minor_contam)) + +if __name__== "__main__": + main() diff --git a/modules/msk/custom/fingerprintcontamination/tests/main.nf.test b/modules/msk/custom/fingerprintcontamination/tests/main.nf.test new file mode 100644 index 00000000..bb89f9e0 --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/tests/main.nf.test @@ -0,0 +1,108 @@ +nextflow_process { + + name "Test Process CUSTOM_FINGERPRINTCONTAMINATION" + script "../main.nf" + process "CUSTOM_FINGERPRINTCONTAMINATION" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag "custom" + tag "custom/fingerprintcontamination" + tag "gbcms" + tag "custom/fingerprintvcfparser" + + test("sarscov2 - bam") { + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("CUSTOM_FINGERPRINTVCFPARSER"){ + script "../../fingerprintvcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = 
CUSTOM_FINGERPRINTVCFPARSER.out.tsv.map{ meta, tsv -> [meta,tsv,[]]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("CUSTOM_FINGERPRINTVCFPARSER"){ + script "../../fingerprintvcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv.map{ meta, tsv -> [meta,tsv,[]]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/custom/fingerprintcontamination/tests/main.nf.test.snap b/modules/msk/custom/fingerprintcontamination/tests/main.nf.test.snap new file mode 100644 index 00000000..233a4680 --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "CUSTOM_FINGERPRINTCONTAMINATION", + "calculate_contamination.py", + "" + ] + ], + "contamination_tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + 
"test.contamination.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fingerprintvcfparser": [ + [ + "CUSTOM_FINGERPRINTCONTAMINATION", + "calculate_contamination.py", + "" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-17T13:12:25.869022442" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303" + ] + ], + "1": [ + [ + "CUSTOM_FINGERPRINTCONTAMINATION", + "calculate_contamination.py", + "" + ] + ], + "contamination_tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303" + ] + ], + "versions_fingerprintvcfparser": [ + [ + "CUSTOM_FINGERPRINTCONTAMINATION", + "calculate_contamination.py", + "" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-17T13:12:16.153445117" + } +} \ No newline at end of file diff --git a/modules/msk/custom/fingerprintcontamination/tests/nextflow.config b/modules/msk/custom/fingerprintcontamination/tests/nextflow.config new file mode 100644 index 00000000..fbd2b2d0 --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/tests/nextflow.config @@ -0,0 +1,10 @@ +process { + + withName: 'CUSTOM_FINGERPRINTCONTAMINATION' { + ext.args = "-d 0" + } + + withName: 'CUSTOM_FINGERPRINTVCFPARSER' { + ext.args = "-d 0" + } +} diff --git a/modules/msk/custom/fingerprintcontamination/tests/stash_main.nf.test_stash b/modules/msk/custom/fingerprintcontamination/tests/stash_main.nf.test_stash new file mode 100644 index 00000000..ba4496a5 --- /dev/null +++ b/modules/msk/custom/fingerprintcontamination/tests/stash_main.nf.test_stash @@ -0,0 +1,113 @@ +nextflow_process { + + name "Test Process CUSTOM_FINGERPRINTCONTAMINATION" + script "../main.nf" + process "CUSTOM_FINGERPRINTCONTAMINATION" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + 
tag "custom" + tag "custom/fingerprintcontamination" + + test("homo sapiens - chr 22 bam") { + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + + params{ + input = "NA12878_GIAB.chr22.vcf" + } + + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file("https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/illumina/bam/NA12878.chr22.bam", checkIfExists:true), + file("https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/illumina/bam/NA12878.chr22.bam.bai", checkIfExists:true), + file("$baseDir/modules/msk/custom/fingerprintcontamination/tests/NA12878_GIAB.chr22.vcf", checkIfExists:true), + "variant_file.vcf" + ] + input[1] = file(params.test_data_mskcc['calculate_noise']['test_chr22_fa'], checkIfExists: true) + input[2] = file(params.test_data_mskcc['calculate_noise']['test_chr22_fa_fai'], checkIfExists: true) + //input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + //input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("CUSTOM_FINGERPRINTVCFPARSER"){ + script "../../fingerprintvcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv.map{ meta, tsv -> [meta,tsv,[]]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], 
checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("CUSTOM_FINGERPRINTVCFPARSER"){ + script "../../fingerprintvcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv.map{ meta, tsv -> [meta,tsv,[]]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/custom/fingerprintcorrelation/environment.yml b/modules/msk/custom/fingerprintcorrelation/environment.yml new file mode 100644 index 00000000..acabcada --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/environment.yml @@ -0,0 +1,18 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-argparse=2.3.1 + - conda-forge::r-data.table=1.17.8 + - conda-forge::r-dplyr=1.1.4 + - conda-forge::r-ggforce=0.5.0 + - conda-forge::r-ggiraph=0.8.12 + - conda-forge::r-gtools=3.9.5 + - conda-forge::r-htmlwidgets=1.6.4 + - conda-forge::r-plotly=4.11.0 + - conda-forge::r-plyr=1.8.9 + - conda-forge::r-reshape2=1.4.4 + - conda-forge::r-scales=1.4.0 + - conda-forge::r-tidyverse=2.0.0 diff --git a/modules/msk/custom/fingerprintcorrelation/main.nf b/modules/msk/custom/fingerprintcorrelation/main.nf new file mode 100644 index 00000000..1eec8ff8 --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/main.nf @@ -0,0 +1,45 @@ +process CUSTOM_FINGERPRINTCORRELATION { // Runs plot_gbcm.R on a combined fingerprint table and collects the PDF/HTML heatmaps plus the observations and correlations tables + tag "${meta.id ?: 'batch'}" // same fallback the script and stub use for prefix + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-ggforce_pruned:5c045bc9fea1dbd5': + 'community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-ggforce_pruned:5c045bc9fea1dbd5' }" + // 'oras://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-ggforce_pruned:8211a2010a4712ea': + + input: + tuple val(meta), path(combined_fp_tsv) + val(filter_term) + + output: + tuple val(meta), path("*.pdf") , emit: heatmap_pdf + tuple val(meta), path("*.html") , emit: heatmap_html + tuple val(meta), path("*_observations.tab") , emit: observations_tab + tuple val(meta), path("*_correlations.tab") , emit: correlations_tab + tuple val("${task.process}"), val('plot_gbcm.R'), val("0.1.0"), topic: versions, emit: versions_fingerprintcorrelation // NOTE(review): the plot_gbcm.R header in this changeset says Version: 0.2.0 - confirm which value is intended + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = meta.id ?: "batch" + def filter_args = (filter_term && filter_term != "") ? "-p ${filter_term} -f" : "" // -p/-f are only passed when a filter term is supplied + """ + plot_gbcm.R \\ + -t ${combined_fp_tsv} \\ + -o ./ \\ + ${filter_args} ${args} + """ + + stub: + def args = task.ext.args ?: '' + def prefix = meta.id ?: "batch" + """ + touch ${prefix}.pdf + touch ${prefix}.html + touch ${prefix}_observations.tab + touch ${prefix}_correlations.tab + """ +} diff --git a/modules/msk/custom/fingerprintcorrelation/meta.yml b/modules/msk/custom/fingerprintcorrelation/meta.yml new file mode 100644 index 00000000..8e5e1d37 --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/meta.yml @@ -0,0 +1,51 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_fingerprintcorrelation" +description: null +keywords: + - sort + - example + - genomics +tools: + - "custom": + description: "" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: null + identifier: null + +input: + - - meta: {} + - combined_fp_tsv: {} +output: + heatmap_pdf: + - - meta: {} + - 
"*_gbcm_sample-to-sample4.pdf": {} + heatmap_html: + - - meta: {} + - "*_interactive4.html": {} + observations_tab: + - - meta: {} + - "*_observations.tab": {} + versions_fingerprintcorrelation: + - - ${task.process}: + type: string + description: The name of the process + - plot_gbcm.R: {} + - 0.1.0: {} +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - plot_gbcm.R: + type: string + description: The name of the tool + - 0.1.0: + type: eval + description: The expression to obtain the version of the tool +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/custom/fingerprintcorrelation/resources/usr/bin/plot_gbcm.R b/modules/msk/custom/fingerprintcorrelation/resources/usr/bin/plot_gbcm.R new file mode 100755 index 00000000..7e985759 --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/resources/usr/bin/plot_gbcm.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript + +#------------------------------------------------------------------------------- +# Script: plot_gbcm.R +# Author: Hanan Salim +# Date: 2026-02-09 +# Version: 0.2.0 +# +# Description: This script takes in a wide fingerprinting table pertaining +# to multiple samples and plots in pdf and html formats. +# Additionally, a table with the number of observations for each correlation +# is also written to an output file. 
+# +#------------------------------------------------------------------------------- + +rm(list=ls()) + +library(argparse, quietly = T) +library(plyr, quietly = T) +library(dplyr, quietly = T) +library(data.table, quietly = T) +library(tidyverse, quietly = T) +library(scales, quietly = T) +library(ggforce, quietly = T) +library(gtools, quietly = T) +library(htmlwidgets) +library(ggiraph) + + +`%notin%` <- Negate(`%in%`) +`%notlike%` <- Negate(`%like%`) + + +#function to size the dots +calculate_point_size <- function(x,y) { + n_x <- length(unique(x)) + n_y <- length(unique(y)) + + #define your plot size (in inches) + plot_width_in <- 20 + plot_height_in <- 20 + + #convert to mm (1 inch = 25.4 mm) + plot_width_mm <- plot_width_in * 25.4 + plot_height_mm <- plot_height_in * 25.4 + + #calculate tile size in mm + tile_width_mm <- plot_width_mm / n_x + tile_height_mm <- plot_height_mm / n_y + + #max circle diameter (fits inside smallest tile dimension) + max_diameter_mm <- min(tile_width_mm, tile_height_mm) + + #approximate max point size for geom_point (radius in mm) + max_point_size <- max_diameter_mm + + return(max_point_size) +} + + +#function to create static plots +static_plot <- function(data, max_point_size) { + n = length(unique(data$Var1)) + legend_size = max_point_size * n * .4 + + axis_text_size = if (n < 25) 14 else 10 + + p <- ggplot(data, aes(x = Var1, y = Var2)) + + geom_tile(color = "gray50", linewidth = 0.25, fill = NA) + + geom_point_interactive( + aes(size = log2_size, + fill = value, + tooltip = paste0( + "x: ", Var1, "\n", + "y: ", Var2, "\n", + "Loci Overlap: ", size, "\n", + "Correlation: ", round(value, 2) + )), + shape = 21, + color="NA" + ) + + scale_x_discrete(limits = sort(levels(data$Var1))) + + scale_y_discrete(limits = rev(sort(levels(data$Var2)))) + + scale_fill_viridis_c( + name = "Correlation", + option = "viridis", + direction = -1, + alpha = 0.75, + begin = 0, + end = 1, + limits = c(-1, 1), + breaks = seq(-1, 1, by = .25), + guide = 
guide_colorbar(direction = "vertical", + title.position = "top", + barheight = unit(legend_size, "mm"), + barwidth = unit(legend_size*.05, "mm") + )) + + scale_size_continuous( + limits = c(0, 14.2), #known max of log2(size) + range = c(0, max_point_size), + breaks = seq(2, 14, by = 4), + name = "Loci Overlap (log2)", + guide = guide_legend(direction = "vertical", + title.position = "top", + keyheight = unit(legend_size/4, "mm"), + override.aes = list( + color = "black", + stroke = 0.5 + )) + ) + + labs(title = title) + + theme_minimal() + + theme( + text = element_text(family = "Courier"), + panel.grid = element_blank(), + axis.text.x = element_text(angle = 90, hjust = 1, size = 10, color = "black"), + axis.text.y = element_text(size = 10, color = "black"), + axis.title = element_blank(), + plot.title = element_text(hjust = 0.5, size = 24, margin = margin(b = 15)), + legend.position = "right", + legend.box = "horizontal", + legend.box.just = "left", + legend.title.align = 0.5, + legend.spacing.x = unit(1, "cm"), + aspect.ratio = 1 + ) + + return(p) +} + + +parser = ArgumentParser(description = 'create correlation plots for a given sample') + +parser$add_argument('-t', '--table', required = TRUE, + help = 'summary table') + +parser$add_argument('-o', '--analysis_folder', required = TRUE, + help = 'output folder') + +parser$add_argument('-p', '--pool', required = FALSE, + default = "fp_plots", + help = 'pool ID') + +parser$add_argument('-f', '--filter', + action = "store_true", + default = FALSE, + help = "create pool levelel plots instead of extended plots" +) + +args = parser$parse_args() + +fingerprints = fread(args$table, sep = '\t') +outdir = args$analysis_folder +sample = args$pool + + +#format data +fingerprints <- fingerprints %>% select(-contains(c('Loci_hg19', 'Loci_hg38'))) +cols <- grep("VAF", names(fingerprints), value = TRUE) +fingerprints <- fingerprints[, ..cols] + +for ( col in 1:ncol(fingerprints)){ + colnames(fingerprints)[col] <- sub("VAF_", "", 
colnames(fingerprints)[col]) +} + +title = paste("Pool:", sample,"; ", nrow(fingerprints)," Loci used",sep = "") + +fp_matrix <- data.matrix(fingerprints) +fp_matrix = cor(as.matrix(fp_matrix), method = c("pearson"), use = "pairwise.complete.obs") + +fp_long <- reshape2::melt(fp_matrix) +observations = crossprod(!is.na(fingerprints)) +obs_long <- reshape2::melt(observations) +final <- data.frame(fp_long, size = obs_long$value) + +#calculate log2 size column +final$log2_size <- log2(final$size) + +if (args$filter) { + + if (identical(args$pool, "fp_plots")) { + message("A pool ID is required to create pool level plots") + quit(status = 1) + } + + message("Creating pool level plots") + type="pool" + + final = final %>% filter(grepl(args$pool, Var1) & grepl(args$pool, Var2)) + final = droplevels(final) + +} else { + message("Creating extended plots") + type="extended" +} + +#get max point size +max_point_size = calculate_point_size(final$Var1, final$Var2) + +#create static plot +s <- static_plot(final, max_point_size) +ggsave(paste(outdir,"/",sample,"_", type, '.pdf', sep = ""), plot = s, width = 25, height = 25, units = "in", device = cairo_pdf) + +#create interactive plot +i = girafe(ggobj = s, width_svg = 25, height_svg = 25, + options = list(opts_tooltip(css = "padding:5pt; font-size:16pt; color:white; background-color:black;"))) +saveWidget(i, paste(outdir,"/",sample,"_", type,'.html', sep = ""), selfcontained = TRUE) + +#save tables +write.table(observations, paste(outdir,"/",sample, '_observations.tab', sep = ''), sep = '\t') +write.table(fp_matrix, paste(outdir,"/",sample, '_correlations.tab', sep = ''), sep = '\t') diff --git a/modules/msk/custom/fingerprintcorrelation/tests/main.nf.test b/modules/msk/custom/fingerprintcorrelation/tests/main.nf.test new file mode 100644 index 00000000..8142af7b --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/tests/main.nf.test @@ -0,0 +1,113 @@ +nextflow_process { + + name "Test Process 
CUSTOM_FINGERPRINTCORRELATION" + script "../main.nf" + process "CUSTOM_FINGERPRINTCORRELATION" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag "custom" + tag "custom/fingerprintcorrelation" + tag "custom/fingerprintcombine" + tag "gbcms" + tag "custom/fingerprintvcfparser" + + test("sarscov2 - bam") { + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = Channel.of( + [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + [ + [ id:'test2', sample:'test2' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + ) + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("CUSTOM_FINGERPRINTVCFPARSER"){ + script "../../fingerprintvcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + run("CUSTOM_FINGERPRINTCOMBINE"){ + script "../../fingerprintcombine/main.nf" + process { + """ + input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv + .map{ meta, tsv -> + def meta2 = [id:meta.pool] + [[id:meta.pool], tsv, meta.id, "hg19", "default"] + }.groupTuple(by:[0]) + input[1] = file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + } + when { + process { + """ + input[0] = 
CUSTOM_FINGERPRINTCOMBINE.out.combined_fp_tsv + input[1] = "" + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out.correlations_tab, + process.out.observations_tab, + process.out.versions_fingerprintcorrelation + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[id:'thispool'], file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true)] + input[1] = "" + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out.correlations_tab, + process.out.observations_tab, + process.out.versions_fingerprintcorrelation + ).match() } + ) + } + + } + +} diff --git a/modules/msk/custom/fingerprintcorrelation/tests/main.nf.test.snap b/modules/msk/custom/fingerprintcorrelation/tests/main.nf.test.snap new file mode 100644 index 00000000..13fe33ac --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/tests/main.nf.test.snap @@ -0,0 +1,66 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + [ + [ + { + "id": "thispool" + }, + "thispool_correlations.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "thispool" + }, + "thispool_observations.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + "CUSTOM_FINGERPRINTCORRELATION", + "plot_gbcm.R", + "0.1.0" + ] + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-11T12:20:48.405942771" + }, + "sarscov2 - bam": { + "content": [ + [ + [ + { + "id": null + }, + "fp_plots_correlations.tab:md5,dbc55d8829950501d3ed2db9a832165c" + ] + ], + [ + [ + { + "id": null + }, + "fp_plots_observations.tab:md5,858d6d115a4da81652bb98dcc8b8077f" + ] + ], + [ + [ + "CUSTOM_FINGERPRINTCORRELATION", + "plot_gbcm.R", + "0.1.0" + ] + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-11T12:20:41.807879336" + } +} \ No newline at end of file diff --git 
a/modules/msk/custom/fingerprintcorrelation/tests/nextflow.config b/modules/msk/custom/fingerprintcorrelation/tests/nextflow.config new file mode 100644 index 00000000..b676d906 --- /dev/null +++ b/modules/msk/custom/fingerprintcorrelation/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: 'CUSTOM_FINGERPRINTCOMBINE' { + ext.args = "-d 0" + } + withName: 'CUSTOM_FINGERPRINTVCFPARSER' { + ext.args = "-d 0" + } +} diff --git a/modules/msk/custom/fingerprintvcfparser/environment.yml b/modules/msk/custom/fingerprintvcfparser/environment.yml new file mode 100644 index 00000000..a5547b5c --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::pysam=0.23.3" diff --git a/modules/msk/custom/fingerprintvcfparser/main.nf b/modules/msk/custom/fingerprintvcfparser/main.nf new file mode 100644 index 00000000..b5924ce8 --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/main.nf @@ -0,0 +1,41 @@ +process CUSTOM_FINGERPRINTVCFPARSER { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pysam:0.23.0--py39hdd5828d_0': + 'biocontainers/pysam:0.23.0--py39hdd5828d_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("${prefix}.fp.tsv") , emit: tsv + tuple val("${task.process}"), val('parse_fingerprint_vcf.py'), eval('parse_fingerprint_vcf.py -v | cut -f 2 -d" "'), emit: versions_fingerprintvcfparser, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + parse_fingerprint_vcf.py \\ + --input ${vcf} \\ + --output ${prefix}.fp.tsv \\ + --samplename ${prefix} \\ + $args + + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + echo $args + + touch ${prefix}.fp.tsv + + """ +} diff --git a/modules/msk/custom/fingerprintvcfparser/meta.yml b/modules/msk/custom/fingerprintvcfparser/meta.yml new file mode 100644 index 00000000..922f1504 --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/meta.yml @@ -0,0 +1,58 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_fingerprintvcfparser" +description: + Custom script to parse fingerprint VCF files, generated by the GBCMS + module. +keywords: + - custom + - fingerprint + - vcf + - pysam +tools: + - "custom": + description: + "Pysam is a Python module for reading and manipulating SAM/BAM/VCF/BCF + files. It's a lightweight wrapper of the htslib C-API, the same one that powers + samtools, bcftools, and tabix." + homepage: "https://pysam.readthedocs.io/en/latest/api.html" + documentation: "https://pysam.readthedocs.io/en/latest/api.html" + tool_dev_url: "https://github.com/pysam-developers/pysam" + licence: ["MIT"] + identifier: biotools:pysam + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - vcf: + type: file + description: Fasta file containing scaffold + pattern: "*.vcf" + ontologies: + - edam: http://edamontology.org/format_3016 # VCF +output: + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}.fp.tsv: + type: file + description: + Tab-separated values (TSV) file containing parsed fingerprint + data + pattern: "${prefix}.fp.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + + versions_fingerprintvcfparser: + - - ${task.process}: {} + - parse_fingerprint_vcf.py: {} + - 'parse_fingerprint_vcf.py -v | cut -f 2 -d" ': {} +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/custom/fingerprintvcfparser/resources/usr/bin/parse_fingerprint_vcf.py b/modules/msk/custom/fingerprintvcfparser/resources/usr/bin/parse_fingerprint_vcf.py new file mode 100755 index 00000000..b4ddd044 --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/resources/usr/bin/parse_fingerprint_vcf.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +import argparse + +""" +Converts fingerprint vcf to a formatted table +""" + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.1.0" +__status__ = "Dev" + +import sys, os +from pysam import VariantFile # version >= 0.15.2 +from itertools import groupby + +def usage(): + parser = argparse.ArgumentParser(prog='parse_fingerprint_vcf.py') + parser.add_argument('--input','-i', help = 'input file', required = True) + parser.add_argument('--samplename','-n', help = 'sample name', required = True) + parser.add_argument('--output','-o', help = 'output file', required = True) + parser.add_argument('--depth-filter','-d', default = 20, type = int, help = 'minimum read depth for outputting a minor allele frequency [default = 20]') + parser.add_argument('--version','-v',action='version',version='%(prog)s ' + __version__, help="Show program's version number and exit.") + return 
parser.parse_args() + +def main(): + args = usage() + + fp_out_list = [] + + vcf_in = VariantFile(args.input, "r") + for vcf_rec in vcf_in.fetch(): + ref_allele = vcf_rec.ref + alt_allele = vcf_rec.alts[0] + ref_allele_count = vcf_rec.samples[args.samplename]["RD"] + alt_allele_count = vcf_rec.samples[args.samplename]["AD"] + if ref_allele_count >= alt_allele_count and ref_allele_count > 0: + maf = alt_allele_count / float(ref_allele_count + alt_allele_count) + if maf < .1: + genotype = ref_allele*2 + else: + genotype = ref_allele + alt_allele + elif alt_allele_count > ref_allele_count: + maf = ref_allele_count / float(ref_allele_count + alt_allele_count) + if maf < .1: + genotype = alt_allele*2 + #else: genotype = alt_allele + ref_allele + else: + genotype = ref_allele + alt_allele + elif ref_allele_count == 0: + genotype = "--" + else: + genotype = ref_allele + alt_allele + if ref_allele_count + alt_allele_count < args.depth_filter or genotype == "--": + maf = "" + + + formatted_counts = "{}:{} {}:{}".format(ref_allele,ref_allele_count,alt_allele,alt_allele_count) + + locus = "{}:{}".format(vcf_rec.chrom,vcf_rec.pos) + depth = vcf_rec.samples[args.samplename]["DP"] + + fp_out_list += [[locus,formatted_counts, genotype, maf]] + + with open(args.output,'w') as f: + f.write("\t".join(['Locus', args.samplename + '_Counts', args.samplename + '_Genotypes', args.samplename + '_MinorAlleleFreq']) + "\n") + for i in fp_out_list: + f.write("\t".join([str(j) for j in i]) + "\n") + +if __name__ == "__main__": + main() diff --git a/modules/msk/custom/fingerprintvcfparser/tests/main.nf.test b/modules/msk/custom/fingerprintvcfparser/tests/main.nf.test new file mode 100644 index 00000000..e4454cba --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/tests/main.nf.test @@ -0,0 +1,75 @@ +// nf-core modules test custom/fingerprintvcfparser +nextflow_process { + + name "Test Process CUSTOM_FINGERPRINTVCFPARSER" + script "../main.nf" + process "CUSTOM_FINGERPRINTVCFPARSER" + + 
tag "modules" + tag "modules_msk" + tag "custom" + tag "custom/fingerprintvcfparser" + tag "gbcms" + + test("sarscov2 - vcf") { + config "./nextflow.config" + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + } + when { + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - vcf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/custom/fingerprintvcfparser/tests/main.nf.test.snap b/modules/msk/custom/fingerprintvcfparser/tests/main.nf.test.snap new file mode 100644 index 00000000..fb734f9f --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "sarscov2 - vcf": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,9fa9a081f17ee52f03463c96d46a23aa" + ] + ], + "1": [ + [ + "CUSTOM_FINGERPRINTVCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ], + "tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + 
"test.fp.tsv:md5,9fa9a081f17ee52f03463c96d46a23aa" + ] + ], + "versions_fingerprintvcfparser": [ + [ + "CUSTOM_FINGERPRINTVCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-17T13:02:44.951823372" + }, + "sarscov2 - vcf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fp.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "CUSTOM_FINGERPRINTVCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fp.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fingerprintvcfparser": [ + [ + "CUSTOM_FINGERPRINTVCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-17T13:02:51.967429606" + } +} \ No newline at end of file diff --git a/modules/msk/custom/fingerprintvcfparser/tests/nextflow.config b/modules/msk/custom/fingerprintvcfparser/tests/nextflow.config new file mode 100644 index 00000000..f2cf46a3 --- /dev/null +++ b/modules/msk/custom/fingerprintvcfparser/tests/nextflow.config @@ -0,0 +1,3 @@ +params { + enable_conda = false +} diff --git a/modules/msk/gbcms/main.nf b/modules/msk/gbcms/main.nf index 38922559..bd0d8dbf 100644 --- a/modules/msk/gbcms/main.nf +++ b/modules/msk/gbcms/main.nf @@ -12,7 +12,7 @@ process GBCMS { output: tuple val(meta), path('*.{vcf,maf}'), emit: variant_file - path "versions.yml" , emit: versions + tuple val("${task.process}"), val('gbcms'), eval("GetBaseCountsMultiSample --help | grep -oP '[0-9]\\.[0-9]\\.[0-9]'"), emit: versions_gbcms, topic: versions when: task.ext.when == null || task.ext.when @@ -44,10 +44,6 @@ process GBCMS { --output ${output} \\ --bam $sample:${bam} $args - cat <<-END_VERSIONS > versions.yml - "${task.process}": - GetBaseCountsMultiSample: \$(echo 
\$(GetBaseCountsMultiSample --help) | grep -oP '[0-9]\\.[0-9]\\.[0-9]') - END_VERSIONS """ stub: @@ -56,9 +52,5 @@ process GBCMS { """ touch variant_file.maf - cat <<-END_VERSIONS > versions.yml - "${task.process}": - GetBaseCountsMultiSample: 1.2.5 - END_VERSIONS """ } diff --git a/modules/msk/gbcms/meta.yml b/modules/msk/gbcms/meta.yml index 170a3e3c..a782f77a 100644 --- a/modules/msk/gbcms/meta.yml +++ b/modules/msk/gbcms/meta.yml @@ -14,10 +14,9 @@ tools: in a given VCF file or MAF file" homepage: "https://github.com/msk-access/GetBaseCountsMultiSample" documentation: "https://github.com/msk-access/GetBaseCountsMultiSample/blob/master/README.md" - identifier: "" + input: - # Only when we have meta - - meta: type: map description: | @@ -29,45 +28,65 @@ input: Input bam file, in the format of SAMPLE_NAME:BAM_FILE. This paramter need to be specified at least once pattern: "*.bam" + ontologies: [] - bambai: type: file description: Index of Bam pattern: "*.bai" + ontologies: [] - variant_file: type: file description: Input variant file in TCGA maf format. --maf or --vcf need to be specified at least once. But --maf and --vcf are mutually exclusive pattern: "*.{maf,vcf}" + ontologies: [] - output: type: string description: Output file - - - fasta: - type: file - description: Input reference sequence file - pattern: "*.fasta" - - - fastafai: - type: file - description: Index of the reference Fasta - pattern: "*.fai" + - fasta: + type: file + description: Input reference sequence file + pattern: "*.fasta" + ontologies: [] + - fastafai: + type: file + description: Index of the reference Fasta + pattern: "*.fai" + + ontologies: [] output: - - variant_file: - - meta: - type: file - description: - base counts in multiple BAM files for all the sites in a given - VCF file or MAF file - pattern: "*.{vcf,maf}" + variant_file: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` - "*.{vcf,maf}": type: file description: base counts in multiple BAM files for all the sites in a given VCF file or MAF file pattern: "*.{vcf,maf}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: [] + versions_gbcms: + - - ${task.process}: + type: string + description: The name of the process + - gbcms: + type: string + description: The name of the tool + - GetBaseCountsMultiSample --help | grep -oP '[0-9]\\.[0-9]\\.[0-9]': {} +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - gbcms: + type: string + description: The name of the tool + - GetBaseCountsMultiSample --help | grep -oP '[0-9]\\.[0-9]\\.[0-9]': + type: eval + description: The expression to obtain the version of the tool authors: - "@buehlere" diff --git a/modules/msk/gbcms/tests/main.nf.test.snap b/modules/msk/gbcms/tests/main.nf.test.snap index 31b547e2..60ff40c5 100644 --- a/modules/msk/gbcms/tests/main.nf.test.snap +++ b/modules/msk/gbcms/tests/main.nf.test.snap @@ -12,7 +12,11 @@ ] ], "1": [ - "versions.yml:md5,a94265ed3bc4b5631d85b9b9b5d2b7e5" + [ + "GBCMS", + "gbcms", + "1.2.4" + ] ], "variant_file": [ [ @@ -23,15 +27,19 @@ "variant_file.vcf:md5,28c8df33c7ea5ed5d1cf9997d8e00ffa" ] ], - "versions": [ - "versions.yml:md5,a94265ed3bc4b5631d85b9b9b5d2b7e5" + "versions_gbcms": [ + [ + "GBCMS", + "gbcms", + "1.2.4" + ] ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-02-13T17:19:51.302342" + "timestamp": "2025-12-17T12:07:13.813792199" } } \ No newline at end of file diff --git a/subworkflows/msk/fingerprint_gbcms/main.nf b/subworkflows/msk/fingerprint_gbcms/main.nf new file mode 100644 index 00000000..bc63c22f --- /dev/null +++ b/subworkflows/msk/fingerprint_gbcms/main.nf @@ -0,0 +1,74 @@ +include { GBCMS } from '../../../modules/msk/gbcms/main' +include 
{ CUSTOM_FINGERPRINTVCFPARSER } from '../../../modules/msk/custom/fingerprintvcfparser/main' +include { CUSTOM_FINGERPRINTCONTAMINATION } from '../../../modules/msk/custom/fingerprintcontamination/main' +include { FINGERPRINT_GBCMS_BATCH } from '../fingerprint_gbcms_batch/main' + +workflow FINGERPRINT_GBCMS { + + take: + ch_bam // channel: [ val(meta), [ bam ] ] + ch_bai // channel: [ val(meta), [ bai ] ] + ch_fp_tsv // channel: [ val(meta), [ tsv ] ] + ch_fp_loci_vcf // channel: [ val(meta), [ vcf ] ] + ch_liftover_loci_mapping // channel: [ liftover_loci_mapping ] + ch_fasta // channel: [ fasta ] + ch_fastafai // channel: [ fastafai ] + default_genome // channel: [ genome ] + run_correlation + + main: + + println ch_fp_loci_vcf.getClass() + println ch_fasta.getClass() + + GBCMS( + ch_bam + .combine(ch_bai, by:[0]) + .combine(ch_fp_loci_vcf.map{ if ( [it].flatten().size() > 1){ it[1] } else { it }}.first()) + .map{ meta, bam, bai, vcf -> [ meta, bam, bai, vcf, meta.id + ".fp.vcf" ] }.view(), + ch_fasta.first(), + ch_fastafai.first() + //ch_fasta.view().map{ if (it[0] instanceof Map){ it[1] } else { it }}.first(), + //ch_fastafai.view().map{ if (it[0] instanceof Map){ it[1] } else { it }}.first() + ) + + + + CUSTOM_FINGERPRINTVCFPARSER ( GBCMS.out.variant_file ) + + all_fps = CUSTOM_FINGERPRINTVCFPARSER.out.tsv.mix(ch_fp_tsv) + + paired_fps = all_fps + .filter{ meta, tsv -> meta.case_id != null && meta.control_id != null && meta.id == meta.case_id } + .combine(all_fps) + .filter{ meta1, fp1, meta2, fp2 -> + meta1.control_id == meta2.id + }.map{ meta1, fp1, meta2, fp2 -> + [ meta1, fp1, fp2] + } + + unpaired_fps = all_fps + .filter{ meta, tsv -> meta.id != meta.case_id || meta.control_id == null } + .map{ meta, tsv -> [ meta, tsv, [] ] } + + CUSTOM_FINGERPRINTCONTAMINATION ( paired_fps.mix(unpaired_fps).view() ) + + if (run_correlation) { + FINGERPRINT_GBCMS_BATCH ( + all_fps, + ch_liftover_loci_mapping, + default_genome, + [] + ) + combined_fp_tsv = 
FINGERPRINT_GBCMS_BATCH.out.combined_fp_tsv + } else { + combined_fp_tsv = Channel.empty() + } + + emit: + fp_tsv_from_bam = CUSTOM_FINGERPRINTVCFPARSER.out.tsv // channel: [ val(meta), tsv ] + fp_tsv = all_fps // channel: [ val(meta), tsv ] + contamination_tsv = CUSTOM_FINGERPRINTCONTAMINATION.out.contamination_tsv // channel: [ val(meta), contamination_tsv ] + combined_fp_tsv = combined_fp_tsv // channel: [ tsv ] + +} diff --git a/subworkflows/msk/fingerprint_gbcms/meta.yml b/subworkflows/msk/fingerprint_gbcms/meta.yml new file mode 100644 index 00000000..8deac1af --- /dev/null +++ b/subworkflows/msk/fingerprint_gbcms/meta.yml @@ -0,0 +1,65 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fingerprint_gbcms" +description: | + Get base counts for all fingerprinting sites from BAM/CRAM/SAM files using the GBCMS module, + and parse the resulting VCF files into standardized TSV format using a custom parser. +keywords: + - fingerprint + - fingerprinting + - loci + - vcf + - bam + - qc +components: + - gbcms + - custom/fingerprintvcfparser + - custom/fingerprintcontamination + - fingerprint_gbcms_batch +input: + - ch_bam: + type: file + description: | + The input channel containing the BAM/CRAM/SAM files + Structure: [ val(meta), path(bam) ] + pattern: "*.{bam/cram/sam}" + - ch_bai: + type: file + description: | + The input channel containing the BAM index files (BAI/CSI) + Structure: [ val(meta), path(bai) ] # or path(csi) + pattern: "*.{bai/csi}" + - ch_fp_vcf: + type: file + description: | + Channel containing fingerprint VCF files + Structure: [ val(meta), path(vcf) ] + pattern: "*.vcf" + - ch_fasta: + type: file + description: | + Channel containing reference FASTA files + Structure: [ path(fasta) ] + pattern: "*.{fasta,fa}" + - ch_fastafai: + type: file + description: | + Channel containing reference FASTA index files + Structure: [ path(fasta.fai) ] + pattern: "*.{fasta,fa}.fai" 
+output:
+  - fp_tsv_from_bam:
+      type: file
+      description: "Fingerprint TSV files parsed from the GBCMS VCFs. Structure: [ val(meta), path(tsv) ]"
+  - fp_tsv:
+      type: file
+      description: "Parsed fingerprint TSV files mixed with those supplied in ch_fp_tsv. Structure: [ val(meta), path(tsv) ]"
+  - contamination_tsv:
+      type: file
+      description: "Contamination TSV emitted by custom/fingerprintcontamination. Structure: [ val(meta), path(contamination_tsv) ]"
+  - combined_fp_tsv:
+      type: file
+      description: "Combined fingerprint table from fingerprint_gbcms_batch; empty unless run_correlation is true. Structure: [ val(meta), path(combined_fp_tsv) ]"
+authors:
+  - "@anoronh4"
+maintainers:
+  - "@anoronh4"
diff --git a/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test
new file mode 100644
index 00000000..171495b5
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test
@@ -0,0 +1,62 @@
+nextflow_workflow {
+
+    name "Test Subworkflow FINGERPRINT_GBCMS"
+    script "../main.nf"
+    config "./nextflow.config"
+    workflow "FINGERPRINT_GBCMS"
+
+    tag "subworkflows"
+    tag "subworkflows_msk"
+    tag "subworkflows/fingerprint_gbcms"
+    tag "subworkflows/fingerprint_gbcms_batch"
+    tag "gbcms"
+    tag "custom/fingerprintvcfparser"
+    tag "custom/fingerprintcontamination"
+    tag "custom/fingerprintcombine"
+    tag "custom/fingerprintcorrelation"
+
+    // Two samples without case_id/control_id, an empty ch_fp_tsv, and run_correlation enabled.
+    test("sarscov2 - bam") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of(
+                    [
+                        [ id:'test', sample:'test' ], // meta map
+                        file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true)
+                    ],
+                    [
+                        [ id:'test2', sample:'test2' ], // meta map
+                        file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true)
+                    ],
+                )
+                input[1] = Channel.of(
+                    [
+                        [ id:'test', sample:'test' ], // meta map
+                        file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true)
+                    ],
+                    [
+                        [ id:'test2', sample:'test2' ], // meta map
+                        file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true)
+                    ],
+                )
+                input[2] = Channel.empty()
+                input[3] = Channel.of(file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true))
+                input[4] = [file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true)]
+                input[5] = Channel.of(file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true))
+                input[6] = Channel.of(file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true))
+                input[7] = "hg19"
+                input[8] = true
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success},
+                { assert snapshot(workflow.out).match()}
+            )
+        }
+    }
+}
diff --git a/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test.snap b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test.snap
new file mode 100644
index 00000000..414032c9
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test.snap
@@ -0,0 +1,125 @@
+{
+    "sarscov2 - bam": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "sample": "test"
+                        },
+                        "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "sample": "test2"
+                        },
+                        "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2"
+                    ]
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test",
+                            "sample": "test"
+                        },
+                        "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "sample": "test2"
+                        },
+                        "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2"
+                    ]
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test",
+                            "sample": "test"
+                        },
+                        "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "sample": "test2"
+                        },
+                        "test2.contamination.tsv:md5,2eb950d4d5e0f9b4f7ae53d41d22fb5f"
+                    ]
+                ],
+                "3": [
+                    [
+                        {
+                            "id": "defaultbatch"
+                        },
+                        "0DPfilter_ALL_FP.txt:md5,2b376a207fd1bd6bec55fa765e3a3947"
+                    ]
+                ],
+                "combined_fp_tsv": [
+                    [
+                        {
+                            "id": "defaultbatch"
+                        },
+                        "0DPfilter_ALL_FP.txt:md5,2b376a207fd1bd6bec55fa765e3a3947"
+                    ]
+                ],
+                "contamination_tsv": [
+                    [
+                        {
+                            "id": "test",
+                            "sample": "test"
+                        },
+                        "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "sample": "test2"
+                        },
+                        "test2.contamination.tsv:md5,2eb950d4d5e0f9b4f7ae53d41d22fb5f"
+                    ]
+                ],
+                "fp_tsv": [
+                    [
+                        {
+                            "id": "test",
+                            "sample": "test"
+                        },
+
"test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "sample": "test2"
+                        },
+                        "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2"
+                    ]
+                ],
+                "fp_tsv_from_bam": [
+                    [
+                        {
+                            "id": "test",
+                            "sample": "test"
+                        },
+                        "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "sample": "test2"
+                        },
+                        "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.3"
+        },
+        "timestamp": "2026-02-10T14:58:55.67145979"
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/msk/fingerprint_gbcms/tests/nextflow.config b/subworkflows/msk/fingerprint_gbcms/tests/nextflow.config
new file mode 100644
index 00000000..17e225ec
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms/tests/nextflow.config
@@ -0,0 +1,13 @@
+process {
+    withName: 'CUSTOM_FINGERPRINTCOMBINE' {
+        ext.args = "-d 0"
+    }
+
+    withName: 'CUSTOM_FINGERPRINTCONTAMINATION' {
+        ext.args = "-d 0"
+    }
+
+    withName: 'CUSTOM_FINGERPRINTVCFPARSER' {
+        ext.args = "-d 0"
+    }
+}
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/main.nf b/subworkflows/msk/fingerprint_gbcms_batch/main.nf
new file mode 100644
index 00000000..19a19169
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/main.nf
@@ -0,0 +1,34 @@
+include { CUSTOM_FINGERPRINTCOMBINE } from '../../../modules/msk/custom/fingerprintcombine/main'
+include { CUSTOM_FINGERPRINTCORRELATION } from '../../../modules/msk/custom/fingerprintcorrelation/main'
+
+// Group fingerprint TSVs into batches (meta.pool, or 'defaultbatch' when unset), combine each batch
+// with CUSTOM_FINGERPRINTCOMBINE and pass the combined table to CUSTOM_FINGERPRINTCORRELATION.
+workflow FINGERPRINT_GBCMS_BATCH {
+
+    take:
+    ch_fp                    // channel: [ val(meta), [ tsv ] ]
+    ch_liftover_loci_mapping // channel: [ liftover_loci_mapping ]
+    default_genome           // value: genome build used for samples whose meta has no genome
+    filter_terms             // channel: filterterm
+
+    main:
+
+    CUSTOM_FINGERPRINTCOMBINE(
+        ch_fp
+            .map{ meta, tsv ->
+                // Batch key: the sample's pool, or one shared batch when no pool is set
+                def meta2 = [id: (meta.pool ?: 'defaultbatch')]
+                // Patient falls back to sample and then to id, so the value handed to the module is never null
+                [meta2, tsv, meta.id, meta.genome ?: default_genome, meta.patient ?: meta.sample ?: meta.id ]
+            }.groupTuple(by:[0]),
+        ch_liftover_loci_mapping.first()
+    )
+
+    CUSTOM_FINGERPRINTCORRELATION(
+        CUSTOM_FINGERPRINTCOMBINE.out.combined_fp_tsv,
+        filter_terms.unique()
+    )
+
+    emit:
+    combined_fp_tsv = CUSTOM_FINGERPRINTCOMBINE.out.combined_fp_tsv // channel: [ val(meta), tsv ]
+}
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/meta.yml b/subworkflows/msk/fingerprint_gbcms_batch/meta.yml
new file mode 100644
index 00000000..1646d1c9
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/meta.yml
@@ -0,0 +1,44 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "fingerprint_gbcms_batch"
+description: "Subworkflow to combine and compare Fingerprint files from different samples"
+keywords:
+  - fingerprint
+  - qc
+  - liftover
+  - batch
+  - pool
+components:
+  - custom/fingerprintcombine
+  - custom/fingerprintcorrelation
+input:
+  - ch_fp:
+      type: file
+      description: |
+        The input channel containing one fingerprint file per sample
+        Structure: [ val(meta), path(fp_tsv) ]
+      pattern: "*.fp.tsv"
+  - ch_liftover_loci_mapping:
+      type: file
+      description: |
+        The input channel containing the loci mapping file for liftover
+        Structure: [ path(loci_mapping.tsv) ]
+      pattern: "*.tsv"
+  - default_genome:
+      type: string
+      description: |
+        Genome build used for samples whose meta has no `genome`
+  - filter_terms:
+      type: string
+      description: |
+        Channel of filter terms forwarded to the custom/fingerprintcorrelation module
+output:
+  - combined_fp_tsv:
+      type: file
+      description: |
+        Channel containing combined fingerprint TSV file
+        Structure: [ val(meta), path(combined_fp_tsv) ]
+      pattern: "*DPfilter_ALL_FP.txt"
+authors:
+  - "@anoronh4"
+maintainers:
+  - "@anoronh4"
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test
new file mode 100644
index 00000000..d705e2c7
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test
@@ -0,0 +1,76 @@
+// nf-core subworkflows test fingerprint_gbcms_batch
+nextflow_workflow {
+
+
name "Test Subworkflow FINGERPRINT_GBCMS_BATCH"
+    script "../main.nf"
+    config "./nextflow.config"
+    workflow "FINGERPRINT_GBCMS_BATCH"
+
+    tag "subworkflows"
+    tag "subworkflows_msk"
+    tag "subworkflows/fingerprint_gbcms_batch"
+    tag "gbcms"
+    tag "custom/fingerprintvcfparser"
+    tag "custom/fingerprintcombine"
+    tag "custom/fingerprintcorrelation"
+
+    // Setup runs GBCMS and CUSTOM_FINGERPRINTVCFPARSER to produce the fingerprint TSVs fed to the subworkflow.
+    test("sarscov2 - bam - single_end") {
+
+        setup {
+            run("GBCMS"){
+                script "../../../../modules/msk/gbcms/main.nf"
+                process {
+                    """
+                    input[0] = Channel.of(
+                        [
+                            [ id:'test', sample:'test' ], // meta map
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
+                            "variant_file.vcf"
+                        ],
+                        [
+                            [ id:'test2', sample:'test2' ], // meta map
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
+                            "variant_file.vcf"
+                        ],
+                    )
+                    input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                    input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
+                    """
+                }
+            }
+            run("CUSTOM_FINGERPRINTVCFPARSER"){
+                script "../../../../modules/msk/custom/fingerprintvcfparser/main.nf"
+                process {
+                    """
+                    input[0] = GBCMS.out.variant_file
+                    """
+                }
+            }
+        }
+
+        when {
+            workflow {
+                """
+                input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv
+                input[1] = [file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true)]
+                input[2] = "hg19"
+                input[3] = Channel.empty()
+                // NOTE(review): filter_terms is empty, so presumably CUSTOM_FINGERPRINTCORRELATION gets no input here - confirm
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(workflow.out).match() }
+            )
+        }
+
+    }
+}
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test.snap b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test.snap
new file mode 100644
index 00000000..3441d510
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test.snap
@@ -0,0 +1,29 @@
+{
+    "sarscov2 - bam - single_end": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "defaultbatch"
+                        },
+                        "0DPfilter_ALL_FP.txt:md5,66113c255cf1f52e27802183764a406d"
+                    ]
+                ],
+                "combined_fp_tsv": [
+                    [
+                        {
+                            "id": "defaultbatch"
+                        },
+                        "0DPfilter_ALL_FP.txt:md5,66113c255cf1f52e27802183764a406d"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.2"
+        },
+        "timestamp": "2025-12-17T13:30:57.724162129"
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/tests/nextflow.config b/subworkflows/msk/fingerprint_gbcms_batch/tests/nextflow.config
new file mode 100644
index 00000000..583ce385
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: 'CUSTOM_FINGERPRINTCOMBINE' {
+        ext.args = "-d 0"
+    }
+}