Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
15c15f6
first commit for custom/fingerprintvcfparser
anoronh4 Aug 12, 2025
785cd7c
update module and nf-test
anoronh4 Aug 12, 2025
7f47996
set enable_conda to false
anoronh4 Aug 14, 2025
af7d902
update meta.yml
anoronh4 Aug 14, 2025
ed05eb0
remove TODO lines
anoronh4 Aug 14, 2025
5dc010e
remove more TODO lines
anoronh4 Aug 14, 2025
112ac67
add module to skipped nf-tests for conda
anoronh4 Aug 14, 2025
e09936f
bugfix
anoronh4 Aug 14, 2025
a8805ae
add subworkflow for generating fingerprints with gbcms
anoronh4 Aug 14, 2025
c9cf10f
add subworkflow to skipped nf-tests for conda
anoronh4 Aug 15, 2025
e3a7bed
add fingerprint contamination module
anoronh4 Sep 25, 2025
95548d1
add custom/fingerprintcombine module
anoronh4 Sep 26, 2025
b6e5229
add subworkflow fingerprint_gbcms_batch
anoronh4 Sep 26, 2025
257e8f9
update subworkflow to designate a genome to each sample fingerprint file
anoronh4 Sep 26, 2025
33c6cbd
remove tag attribute on custom/fingerprintcombine
anoronh4 Sep 26, 2025
404b8ef
update fingerprint_gbcms subworkflow, including fingerprint_gbcms_bat…
anoronh4 Sep 26, 2025
ebf5800
skip conda tests for fingerprint modules and subworkflows
anoronh4 Sep 26, 2025
eb9e073
update version output of contamination script
anoronh4 Sep 26, 2025
e0cf039
update snapshot
anoronh4 Sep 26, 2025
6691063
update snapshot
anoronh4 Sep 26, 2025
de1c5f6
exclude X, Y chromosomes from contamination calculations
anoronh4 Nov 5, 2025
a461d1e
set index of table to 'Position'
anoronh4 Nov 6, 2025
a280d30
fixed filtering of table by index labels
anoronh4 Nov 7, 2025
04ff5ad
fix indentation
anoronh4 Nov 7, 2025
e5fb9ed
fix file formatting and spacing
anoronh4 Dec 17, 2025
0c3c341
add meta map to custom/fingerprintcombine
anoronh4 Dec 17, 2025
3cc109e
add custom/fingerprintcorrelation
anoronh4 Dec 18, 2025
70ffc3f
Merge branch 'develop' into module/fingerprintparser
anoronh4 Dec 18, 2025
54fff1b
fix failing test
anoronh4 Dec 18, 2025
53a03e7
exclude one more module from conda tests
anoronh4 Dec 18, 2025
8a596c9
update custom/fingerprintcorrelation to output table with correlation…
anoronh4 Dec 19, 2025
51047f1
add grouping logic for ordering samples in fingerprintcombine
Jan 9, 2026
f45b4d9
add logic to handle exception for when denominator of fraction is zero
Jan 9, 2026
18db500
fix failing nf-tests
anoronh4 Jan 9, 2026
d6ecf3c
put process tag in double quotes
Jan 30, 2026
6d6c0e1
change method of adding a column in order to handle empty table
Jan 30, 2026
acfa784
change output channel to include mix of run-computed and previously-c…
Jan 30, 2026
1939aca
Merge branch 'module/fingerprintparser' of github.com:mskcc-omics-wor…
Jan 30, 2026
baff4ae
updates
anoronh4 Feb 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/skip_nf_test.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"conda": [
"modules/msk/custom/fingerprintvcfparser",
"modules/msk/custom/fingerprintcontamination",
"modules/msk/custom/fingerprintcombine",
"modules/msk/custom/fingerprintcorrelation",
"modules/msk/calculatenoise",
"modules/msk/ppflagfixer",
"modules/msk/facets",
Expand Down Expand Up @@ -28,6 +32,7 @@
"modules/msk/phylowgs/parsecnvs",
"modules/msk/pvmaf/concat",
"modules/msk/pvmaf/tagtraceback",
"subworkflows/msk/fingerprint_gbcms",
"subworkflows/msk/genome_nexus",
"modules/msk/oncokb/mafannotate"
],
Expand Down
11 changes: 11 additions & 0 deletions modules/msk/custom/fingerprintcombine/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- conda-forge::r-argparse=2.2.5
- conda-forge::r-data.table=1.17.8
- conda-forge::r-dplyr=1.1.4
- conda-forge::r-plyr=1.8.9
- conda-forge::r-tidyverse=2.0.0
57 changes: 57 additions & 0 deletions modules/msk/custom/fingerprintcombine/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
 * CUSTOM_FINGERPRINTCOMBINE
 *
 * Combines several per-sample fingerprint TSV files into one wide table by
 * running complete_FP_table.R.
 *
 * Inputs
 *   meta                   - Groovy map; meta.id is used as the task tag.
 *   fp_tsv                 - one or more fingerprint TSV files.
 *   sample                 - sample id(s), parallel to fp_tsv.
 *   genome_build           - genome build(s), parallel to fp_tsv.
 *   patient                - patient id(s), parallel to fp_tsv.
 *   liftover_loci_mapping  - loci mapping table passed to the script with -l.
 *
 * Outputs
 *   combined_fp_tsv             - [ meta, *DPfilter_ALL_FP.txt ]
 *   versions_fingerprintcombine - [ process name, script name, script version ] (topic: versions)
 */
process CUSTOM_FINGERPRINTCOMBINE {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    // Alternative oras:// image that was left commented out in the original module:
    //   oras://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:d96a65055f79744c
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'docker://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66':
        'community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66' }"

    input:
    tuple val(meta), path(fp_tsv), val(sample), val(genome_build), val(patient)
    path(liftover_loci_mapping)

    output:
    tuple val(meta), path("*DPfilter_ALL_FP.txt")                                    , emit: combined_fp_tsv
    tuple val("${task.process}"), val('complete_FP_table.R'), val('0.1.0'), emit: versions_fingerprintcombine, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''

    // A group holding a single sample may arrive as a scalar rather than a list,
    // so every tuple element is normalised to a list of strings first.
    def fp_tsv_items       = fp_tsv       instanceof List ? fp_tsv.collect       { it.toString() } : [fp_tsv.toString()]
    def sample_items       = sample       instanceof List ? sample.collect       { it.toString() } : [sample.toString()]
    def genome_build_items = genome_build instanceof List ? genome_build.collect { it.toString() } : [genome_build.toString()]
    def patient_items      = patient      instanceof List ? patient.collect      { it.toString() } : [patient.toString()]

    // The rows of input.tsv are assembled by index, so the lists must line up.
    def item_counts = [fp_tsv_items, sample_items, genome_build_items, patient_items].collect { it.size() }
    if (item_counts.unique().size() != 1) {
        error "CUSTOM_FINGERPRINTCOMBINE: fp_tsv, sample, genome_build and patient must have the same number of elements, got ${item_counts}"
    }

    // Single-quote each element so that whitespace or shell metacharacters in an
    // identifier cannot split it into several bash array entries.
    def to_bash_array = { items -> items.collect { "'" + it.replace("'", "'\\''") + "'" }.join(' ') }
    def fp_tsv_array       = to_bash_array(fp_tsv_items)
    def sample_array       = to_bash_array(sample_items)
    def genome_build_array = to_bash_array(genome_build_items)
    def patient_array      = to_bash_array(patient_items)
    """
    declare -a fp_tsv_list
    declare -a sample_list
    declare -a genome_build_list
    declare -a patient_list
    fp_tsv_list=(${fp_tsv_array})
    sample_list=(${sample_array})
    genome_build_list=(${genome_build_array})
    patient_list=(${patient_array})
    echo -e "sample_id\tgenome_build\tfp_tsv\tpatient" > input.tsv
    for i in \$(seq 0 1 \$((\${#fp_tsv_list[@]}-1)) ) ; do
        fp_tsv=\${fp_tsv_list[i]}
        sample=\${sample_list[i]}
        genome=\${genome_build_list[i]}
        patient=\${patient_list[i]}
        echo -e "\$sample\t\$genome\t\$fp_tsv\t\$patient"
    done >> input.tsv

    complete_FP_table.R \\
        -i input.tsv \\
        -l $liftover_loci_mapping \\
        $args
    """

    stub:
    def args = task.ext.args ?: ''

    """
    echo $args

    touch XDPfilter_ALL_FP.txt
    """
}
82 changes: 82 additions & 0 deletions modules/msk/custom/fingerprintcombine/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "custom_fingerprintcombine"
description: |
  A module to combine multiple fingerprint TSV files into a single wide table.
  A loci mapping file is used to place hg19/GRCh37 and hg38/GRCh38 loci on a
  common set of rows.
keywords:
  - fingerprint
  - qc
  - loci
  - tsv
  - correlation
tools:
  - "custom":
      description: "A custom R script to combine fingerprint TSV files"
      homepage: "https://github.com/mskcc-omics-workflows/modules/tree/main/modules/msk/custom/fingerprintcombine/meta.yml"
      identifier: ""
input:
  # The four elements after `meta` are parallel lists: element i of each one
  # describes the same sample.
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. [ id:'test' ]
    - fp_tsv:
        type: file
        description: |
          Fingerprint TSV files to be combined, one per sample, in the same
          order as `sample`, `genome_build` and `patient`.
        ontologies: []
    - sample:
        type: string
        description: Sample identifier corresponding to each fingerprint TSV file.
    - genome_build:
        type: string
        description:
          Genome build (hg19, hg38, GRCh37 or GRCh38, case insensitive)
          corresponding to each fingerprint TSV file.
    - patient:
        type: string
        description:
          Patient identifier corresponding to each fingerprint TSV file.
          Samples are ordered by patient and then by sample identifier.
  - - liftover_loci_mapping:
        type: file
        description: |
          A TSV file mapping GRCh37 loci to GRCh38 loci.
          Required columns: GRCH37_CHROM, GRCH37_POS, GRCH38_CHROM, GRCH38_POS
        pattern: "*.tsv"
        ontologies:
          - edam: http://edamontology.org/format_3475 # TSV
output:
  combined_fp_tsv:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. [ id:'test' ]
      - "*DPfilter_ALL_FP.txt":
          type: file
          description: Wide table combining all input fingerprint TSV files.
          pattern: "*DPfilter_ALL_FP.txt"
          ontologies:
            - edam: http://edamontology.org/format_3475 # TSV
  versions_fingerprintcombine:
    - - ${task.process}:
          type: string
          description: The name of the process
      - complete_FP_table.R:
          type: string
          description: The name of the tool
      - 0.1.0:
          type: string
          description: Version of the custom script
topics:
  versions:
    - - ${task.process}:
          type: string
          description: The name of the process
      - complete_FP_table.R:
          type: string
          description: The name of the tool
      - 0.1.0:
          type: string
          description: Version of the custom script
authors:
  - "@anoronh4"
maintainers:
  - "@anoronh4"
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#! /usr/bin/env Rscript

#-------------------------------------------------------------------------------
# Script: complete_FP_table.R
# Author: Erika Gedvilaite
# Date: 2025-09-23
# Version: 0.1.0
#
# Description: This script takes in standard fingerprint tables and combines
#              them into a single, wide table for downstream plotting and analysis.
#
# Annotation:
#   - Input table must have four columns: sample_id, genome_build, fp_tsv, patient
#   - Genome build should be either "hg19" or "hg38" or "GRCh37" or "GRCh38"
#     (case insensitive)
#   - Loci mapper must have the columns GRCH37_CHROM, GRCH37_POS, GRCH38_CHROM,
#     GRCH38_POS
#   - Output is written to <analysis_folder>/<depth_filter>DPfilter_ALL_FP.txt
#
#-------------------------------------------------------------------------------

library(argparse, quietly = T)
library(plyr, quietly = T)
library(dplyr, quietly = T)
library(data.table, quietly = T)
library(tidyverse, quietly = T)

`%notin%` <- Negate(`%in%`)

parser = ArgumentParser(description = 'Generate FP tables for plotting')
parser$add_argument('-i', '--input_table', required = TRUE,
                    help = 'Input table with paths to individual fingerprint TSV files, sample ids, and genome build')
parser$add_argument('-o', '--analysis_folder', required = FALSE, default = ".",
                    help = 'Output folder')
parser$add_argument('-l', '--loci_mapper', required = TRUE,
                    help = 'Loci mapper file')
# An explicit type keeps a value given on the command line numeric; without it
# the depth comparison below could be done on strings.
parser$add_argument('-d', '--depth_filter', required = FALSE, default = 20, type = "double",
                    help = 'Depth filter to apply to individual fingerprint TSV files (default: 20)')
args = parser$parse_args()

depth_filter = suppressWarnings(as.numeric(args$depth_filter))
if (length(depth_filter) != 1 || is.na(depth_filter)) {
    stop(paste0("Depth filter must be a single number, got: ", args$depth_filter))
}

message("Reading in Liftover file")

hg19_hg38_mapper = fread(args$loci_mapper, header = T)
mapper_columns = c("GRCH37_CHROM", "GRCH37_POS", "GRCH38_CHROM", "GRCH38_POS")
missing_mapper_columns = setdiff(mapper_columns, colnames(hg19_hg38_mapper))
if (length(missing_mapper_columns) > 0) {
    stop(paste0("Loci mapper file is missing required column(s): ",
                paste(missing_mapper_columns, collapse = ", ")))
}
# Loci are keyed as "<chrom>:<pos>" for each build.
hg19_hg38_mapper$Loci_hg19 = paste(hg19_hg38_mapper$GRCH37_CHROM, hg19_hg38_mapper$GRCH37_POS, sep = ":")
hg19_hg38_mapper$Loci_hg38 = paste(hg19_hg38_mapper$GRCH38_CHROM, hg19_hg38_mapper$GRCH38_POS, sep = ":")
hg19_hg38_mapper = hg19_hg38_mapper %>% select(Loci_hg19, Loci_hg38) %>% unique()

message("Loading Samples")
input_table = fread(args$input_table, header = T)
input_columns = c("sample_id", "genome_build", "fp_tsv", "patient")
missing_input_columns = setdiff(input_columns, colnames(input_table))
if (length(missing_input_columns) > 0) {
    stop(paste0("Input table is missing required column(s): ",
                paste(missing_input_columns, collapse = ", ")))
}
if (nrow(input_table) == 0) {
    stop("Input table contains no samples.")
}
input_table = input_table %>% arrange(patient, sample_id)

hg19_builds = c("hg19", "grch37")
hg38_builds = c("hg38", "grch38")

# One long-format table per sample; they are bound together after the loop.
per_sample_tables = vector("list", nrow(input_table))
for (i in seq_len(nrow(input_table))) {
    sample = input_table$sample_id[i]
    genome_build = input_table$genome_build[i]
    print(genome_build)
    build = tolower(genome_build)
    if (build %notin% c(hg19_builds, hg38_builds)) {
        stop(paste0("Genome build not recognized: ", genome_build, ". Must be in the following list: hg19, hg38, grch37, grch38 (case will be ignored)."))
    }
    fp_path = input_table$fp_tsv[i]
    if (!file.exists(fp_path)) {
        stop(paste0("File does not exist: ", fp_path))
    }
    temp_dataset <- fread(fp_path, header = T, sep = "\t")
    colnames(temp_dataset) = c("Locus", "Count", "Genotype", "VAF")
    # Count is split on non-alphanumeric characters; the 2nd and 4th pieces are
    # kept as the two depth values and summed into DP.
    temp_dataset = separate(temp_dataset, Count, into = c(NA, 'DP1', NA, 'DP2'), remove = F)
    temp_dataset$DP2[is.na(temp_dataset$DP2) == T] <- 0
    temp_dataset$DP = as.numeric(temp_dataset$DP1) + as.numeric(temp_dataset$DP2)
    temp_dataset = temp_dataset[temp_dataset$DP >= depth_filter, ] ## keeping loci >= 20 dp by default
    temp_dataset$VAF[is.na(temp_dataset$VAF) == T] <- 0
    # rep() rather than plain assignment so that a table with zero rows is handled.
    temp_dataset$Sample <- rep(sample, nrow(temp_dataset))
    temp_dataset = temp_dataset %>% select("Locus", "Genotype", "Sample", "VAF")
    # Only a leading "chr" prefix is stripped.
    temp_dataset$Locus = str_replace(temp_dataset$Locus, "^chr", "")

    # Join on the locus column that matches this sample's genome build.
    mapper_key = if (build %in% hg19_builds) "Loci_hg19" else "Loci_hg38"
    temp_dataset = merge(hg19_hg38_mapper, temp_dataset, by.x = mapper_key, by.y = "Locus", all.x = T)
    temp_dataset$VAF[is.na(temp_dataset$VAF) == T] <- 0

    per_sample_tables[[i]] = temp_dataset
}
# merge() puts the join column first, so hg19 and hg38 tables have Loci_hg19 and
# Loci_hg38 in different positions. Binding by name keeps each locus in its own
# column when a batch mixes genome builds.
all_gbcm = rbindlist(per_sample_tables, use.names = TRUE)
all_gbcm = all_gbcm[is.na(all_gbcm$Sample) == F, ] # filters out loci that don't have Sample info (i.e. loci not passing DP filter)
all_gbcm$VAF = round(as.numeric(all_gbcm$VAF), 5)

wide_all_gbcm = all_gbcm %>% pivot_wider(names_from = Sample, values_from = c(Genotype, VAF))

message("Creating final GBCM file")

all_fp_gbcm_final = merge(hg19_hg38_mapper, wide_all_gbcm, all.x = T)

if (!dir.exists(args$analysis_folder)) {
    dir.create(args$analysis_folder, recursive = TRUE)
} else {
    print(paste("Directory already exists:", args$analysis_folder))
}

output_file = paste(args$analysis_folder, "/", depth_filter, "DPfilter_ALL_FP.txt", sep = "")
message(paste("Output file: ", output_file, sep = ""))

# as.matrix() gives the same character matrix that apply(x, 2, as.character)
# builds internally, but keeps its two dimensions when there is only one row.
all_fp_gbcm_final <- as.matrix(all_fp_gbcm_final)
write.table(all_fp_gbcm_final, file = output_file, append = F, sep = "\t", row.names = F, quote = F)

message("FP file completed")
10 changes: 10 additions & 0 deletions modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
GRCH37_CHROM GRCH37_POS GRCH38_CHROM GRCH38_POS
MT192765.1 197 MT192765.1 199
MT192765.1 4788 MT192765.1 4900
MT192765.1 8236 MT192765.1 8257
MT192765.1 10506 MT192765.1 10528
MT192765.1 11037 MT192765.1 11059
MT192765.1 15009 MT192765.1 15500
MT192765.1 18807 MT192765.1 18929
MT192765.1 23813 MT192765.1 24835
MT192765.1 24103 MT192765.1 25125
105 changes: 105 additions & 0 deletions modules/msk/custom/fingerprintcombine/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// nf-core modules test custom/fingerprintcombine
//
// nf-test specification for the CUSTOM_FINGERPRINTCOMBINE process defined in ../main.nf.
// It contains two tests: a full run that chains GBCMS -> CUSTOM_FINGERPRINTVCFPARSER ->
// CUSTOM_FINGERPRINTCOMBINE, and a stub run.
nextflow_process {

name "Test Process CUSTOM_FINGERPRINTCOMBINE"
script "../main.nf"
process "CUSTOM_FINGERPRINTCOMBINE"
config "./nextflow.config"

// Tags include the upstream modules that the setup block runs.
tag "modules"
tag "modules_msk"
tag "custom"
tag "custom/fingerprintcombine"
tag "gbcms"
tag "custom/fingerprintvcfparser"

// Full run: two samples ('test' and 'test2') sharing pool 'mypool' are passed through
// GBCMS and CUSTOM_FINGERPRINTVCFPARSER, then grouped by pool and combined.
test("sarscov2 - bam") {

setup {
// Step 1: run GBCMS on the same sarscov2 BAM/VCF for both samples.
run("GBCMS"){
script "../../../gbcms/main.nf"
process {
"""
input[0] = Channel.of(
[
[ id:'test', sample:'test', pool:'mypool' ], // meta map
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
"variant_file.vcf"
],
[
[ id:'test2', sample:'test2', pool:'mypool' ], // meta map
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
"variant_file.vcf"
],
)
input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
"""
}
}
// Step 2: feed the GBCMS variant_file output into the fingerprint VCF parser.
run("CUSTOM_FINGERPRINTVCFPARSER"){
script "../../fingerprintvcfparser/main.nf"
process {
"""
input[0] = GBCMS.out.variant_file
"""
}
}
}

when {
// Each parser TSV is re-keyed to [id: pool] with sample = meta.id, genome build "hg19"
// and patient "default"; groupTuple on the first element then gathers both samples
// into a single input tuple. input[1] is the loci mapping fixture next to this file.
process {
"""
input[0] = CUSTOM_FINGERPRINTVCFPARSER.out.tsv
.map{ meta, tsv ->
println meta
[[id:meta.pool], tsv, meta.id, "hg19","default"]
}.groupTuple(by:[0])
input[1] = file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true)
"""
}
}

then {
// The process must succeed and all of its outputs must match the stored snapshot.
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}

}


// Stub run: executed with -stub, so a BAM file is passed in the fp_tsv slot
// instead of a real fingerprint TSV.
test("sarscov2 - bam - stub") {

options "-stub"

when {
// Single-element lists for fp_tsv, sample, genome_build and patient.
process {
"""
input[0] = [
[id:"testsample"],
[file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true)],
["testsample"],
["hg19"],
["default"]
]
input[1] = file("$baseDir/modules/msk/custom/fingerprintcombine/tests/loci_mapping.tsv", checkIfExists:true)
"""
}
}

then {
// Same assertions as the full run, against the stub outputs.
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}

}

}
Loading