From 3e69f06432f62b7924d2e043ef4768c5d09bf614 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 3 May 2023 22:55:16 +0000 Subject: [PATCH 01/58] fix: ensure workflow resources are always in root data directory --- .../NF_RCP-F/workflow_code/nextflow.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config index bcf31521..7fc584c2 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config @@ -81,17 +81,17 @@ manifest { def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.gldsAccession}/Resource_Usage/execution_timeline_${trace_timestamp}.html" + file = "${params.output}/${params.gldsAccession}/Resource_Usage/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.gldsAccession}/Resource_Usage/execution_report_${trace_timestamp}.html" + file = "${params.output}/${params.gldsAccession}/Resource_Usage/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.gldsAccession}/Resource_Usage/execution_trace_${trace_timestamp}.txt" + file = "${params.output}/${params.gldsAccession}/Resource_Usage/execution_trace_${trace_timestamp}.txt" } dag { enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram - file = "${params.gldsAccession}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" + file = "${params.output}/${params.gldsAccession}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" } From 9662c14ffc0324d653215405308782f0848d20ae Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 8 May 2023 16:28:43 +0000 Subject: [PATCH 02/58] feat: update version from 1.0.3 to 1.0.4 --- .../Workflow_Documentation/NF_RCP-F/README.md | 20 
+++++++++---------- .../NF_RCP-F/workflow_code/nextflow.config | 11 +++++----- RNAseq/Workflow_Documentation/README.md | 2 +- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md index 7dfb3f32..a173b0cf 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md @@ -101,9 +101,9 @@ All files required for utilizing the NF_RCP-F GeneLab workflow for processing RN copy of latest NF_RCP-F version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: ```bash -wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_RCP-F_1.0.3/NF_RCP-F_1.0.3.zip +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_RCP-F_1.0.4/NF_RCP-F_1.0.4.zip -unzip NF_RCP-F_1.0.3.zip +unzip NF_RCP-F_1.0.4.zip ```
@@ -115,10 +115,10 @@ unzip NF_RCP-F_1.0.3.zip Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210). To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_RCP-F workflow: -> Note: This command should be run in the location containing the `NF_RCP-F_1.0.3` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. Depending on your network speed, fetching the images will take ~20 minutes. +> Note: This command should be run in the location containing the `NF_RCP-F_1.0.4` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. Depending on your network speed, fetching the images will take ~20 minutes. ```bash -bash NF_RCP-F_1.0.3/bin/prepull_singularity.sh NF_RCP-F_1.0.3/config/software/by_docker_image.config +bash NF_RCP-F_1.0.4/bin/prepull_singularity.sh NF_RCP-F_1.0.4/config/software/by_docker_image.config ``` @@ -134,7 +134,7 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity ### 4. Run the Workflow -While in the location containing the `NF_RCP-F_1.0.3` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_RCP-F workflow: +While in the location containing the `NF_RCP-F_1.0.4` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_RCP-F workflow: > Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.
@@ -142,7 +142,7 @@ While in the location containing the `NF_RCP-F_1.0.3` directory that was downloa #### 4a. Approach 1: Run the workflow on a GeneLab RNAseq dataset with automatic retrieval of Ensembl reference fasta and gtf files ```bash -nextflow run NF_RCP-F_1.0.3/main.nf \ +nextflow run NF_RCP-F_1.0.4/main.nf \ -profile singularity \ --gldsAccession GLDS-194 ``` @@ -154,7 +154,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \ > Note: The `--ref_source` and `--ensemblVersion` parameters should match the reference source and version number of the local reference fasta and gtf files used ```bash -nextflow run NF_RCP-F_1.0.3/main.nf \ +nextflow run NF_RCP-F_1.0.4/main.nf \ -profile singularity \ --gldsAccession GLDS-194 \ --ensemblVersion 107 \ @@ -170,7 +170,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \ > Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md). ```bash -nextflow run NF_RCP-F_1.0.3/main.nf \ +nextflow run NF_RCP-F_1.0.4/main.nf \ -profile singularity \ --runsheetPath ``` @@ -179,7 +179,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \ **Required Parameters For All Approaches:** -* `NF_RCP-F_1.0.3/main.nf` - Instructs Nextflow to run the NF_RCP-F workflow +* `NF_RCP-F_1.0.4/main.nf` - Instructs Nextflow to run the NF_RCP-F workflow * `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow @@ -225,7 +225,7 @@ nextflow run NF_RCP-F_1.0.3/main.nf \ All parameters listed above and additional optional arguments for the RCP workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command: ```bash -nextflow run NF_RCP-F_1.0.3/main.nf --help +nextflow run NF_RCP-F_1.0.4/main.nf --help ``` See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all 
nextflow workflows. diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config index 7fc584c2..aa1e5710 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config @@ -15,6 +15,7 @@ profiles { charliecloud.enabled = false // Address issue: https://github.com/nextflow-io/nextflow/issues/1210 process { + containerOptions = "--no-home" errorStrategy = { task.exitStatus == 255 ? 'retry' : 'terminate' } @@ -74,24 +75,24 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '>=22.10.1' - version = '1.0.3' + version = '1.0.4' } // Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.output}/${params.gldsAccession}/Resource_Usage/execution_timeline_${trace_timestamp}.html" + file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.output}/${params.gldsAccession}/Resource_Usage/execution_report_${trace_timestamp}.html" + file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.output}/${params.gldsAccession}/Resource_Usage/execution_trace_${trace_timestamp}.txt" + file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/execution_trace_${trace_timestamp}.txt" } dag { enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram - file = "${params.output}/${params.gldsAccession}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" + file = "${params.outputDir}/${params.gldsAccession}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" } diff --git a/RNAseq/Workflow_Documentation/README.md 
b/RNAseq/Workflow_Documentation/README.md index ead4736c..8c7dc6ef 100644 --- a/RNAseq/Workflow_Documentation/README.md +++ b/RNAseq/Workflow_Documentation/README.md @@ -8,7 +8,7 @@ GeneLab has wrapped each step of the pipeline into a workflow with validation an |Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| -|*[GL-DPPD-7101-F.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md)|[NF_RCP-F_1.0.3](NF_RCP-F)|22.10.1| +|*[GL-DPPD-7101-F.md](../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md)|[NF_RCP-F_1.0.4](NF_RCP-F)|22.10.1| *Current GeneLab Pipeline/Workflow Implementation From 3b7e0bab4017e90481359c48f9cf7c8837ed54d2 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 8 May 2023 16:36:09 +0000 Subject: [PATCH 03/58] feat: allow trim-galore! to autodetect adapter type Prior, --illumina was hard coded. Now, neither --illumina nor --nextera are supplied. As per the Trim Galore [Manual](https://github.com/FelixKrueger/TrimGalore/blob/0.6.7/Docs/Trim_Galore_User_Guide.md#adapter-auto-detection): " If no sequence was supplied, Trim Galore will attempt to auto-detect the adapter which has been used. For this it will analyse the first 1 million sequences of the first specified file and attempt to find the first 12 or 13bp of the following standard adapters: ``` Illumina: AGATCGGAAGAGC Small RNA: TGGAATTCTCGG Nextera: CTGTCTCTTATA ``` If no adapter contamination can be detected within the first 1 million sequences, or in case of a tie between several different adapters, Trim Galore defaults to `--illumina`, as long as the Illumina adapter sequence was one of the options. If there was a tie between the Nextera and small RNA adapter, the default is `--nextera`. The auto-detection results are shown on screen and printed to the trimming report for future reference. 
" --- .../NF_RCP-F/workflow_code/modules/quality.nf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf index 0d84a93d..9e60c283 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/quality.nf @@ -65,14 +65,9 @@ process TRIMGALORE { path("versions.txt"), emit: version script: - /* - * comments -> --ilumina # if adapters are not illumina, replace with adapters - * --paired # only for PE studies, # if SE use only single read file - */ """ trim_galore --gzip \ --cores $task.cpus \ - --illumina \ --phred33 \ ${ meta.paired_end ? '--paired' : '' } \ $reads \ From 8a158b1bc77679e14b4ce976f366350139e1669c Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 8 May 2023 19:18:52 +0000 Subject: [PATCH 04/58] refactor: remove deprecated tests and test settings --- .../config/tests/test_glds194.config | 15 ----------- .../config/tests/test_glds207.config | 11 -------- .../config/tests/test_glds251.config | 12 --------- .../config/tests/test_glds48.config | 15 ----------- .../config/tests/test_glds91.config | 15 ----------- .../config/tests/test_nonGLDS.config | 20 --------------- .../NF_RCP-F/workflow_code/nextflow.config | 25 ------------------- 7 files changed, 113 deletions(-) delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config delete mode 100644 
RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config deleted file mode 100644 index 00976bf2..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds194.config +++ /dev/null @@ -1,15 +0,0 @@ -// Should be loaded by every RNASeq process. -params { - /* - Parameters that MUST be supplied - */ - gldsAccession = 'GLDS-194' // GeneLab Data Accession Number, e.g. GLDS-104 - use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail - - /* - DEBUG parameters, should NOT be overwritten for production processing runs - */ - genomeSubsample = 19 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome) - truncateTo = 300 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file. - -} diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config deleted file mode 100644 index a4a26252..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds207.config +++ /dev/null @@ -1,11 +0,0 @@ -// Should be loaded by every RNASeq process. -params { - /* - Parameters that MUST be supplied - */ - gldsAccession = 'GLDS-207' // GeneLab Data Accession Number, e.g. GLDS-104 - use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail - - truncateTo = 100 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file. 
- -} diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config deleted file mode 100644 index 0085e1f9..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds251.config +++ /dev/null @@ -1,12 +0,0 @@ -// Should be loaded by every RNASeq process. -params { - /* - Parameters that MUST be supplied - */ - gldsAccession = 'GLDS-251' // GeneLab Data Accession Number, e.g. GLDS-104 - use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail - - genomeSubsample = 5 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome) - truncateTo = 300 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file. - -} diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config deleted file mode 100644 index 049b4527..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds48.config +++ /dev/null @@ -1,15 +0,0 @@ -// Should be loaded by every RNASeq process. -params { - /* - Parameters that MUST be supplied - */ - gldsAccession = 'GLDS-48' // GeneLab Data Accession Number, e.g. GLDS-104 - use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail - - /* - DEBUG parameters, should NOT be overwritten for production processing runs - */ - genomeSubsample = 19 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome) - truncateTo = 600 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file. 
- -} diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config deleted file mode 100644 index a667c13e..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_glds91.config +++ /dev/null @@ -1,15 +0,0 @@ -// Should be loaded by every RNASeq process. -params { - /* - Parameters that MUST be supplied - */ - gldsAccession = 'GLDS-91' // GeneLab Data Accession Number, e.g. GLDS-104 - use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail - - /* - DEBUG parameters, should NOT be overwritten for production processing runs - */ - genomeSubsample = 21 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome) - truncateTo = 600 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file. - -} diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config deleted file mode 100644 index 5e72e005..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/tests/test_nonGLDS.config +++ /dev/null @@ -1,20 +0,0 @@ -// Should be loaded by every RNASeq process. -params { - /* - Parameters that MUST be supplied - */ - gldsAccession = 'CustomAnalysis' // GeneLab Data Accession Number, e.g. 
GLDS-104 - use_dummy_gene_counts = true // Use random gene counts for Deseq2, this addresses an issue where low/zero gene counts causes DGE analysis to fail - - /* - Parameters that CAN be overwritten - */ - runsheetPath = "${projectDir}/test_assets/CustomAnalysis_bulkRNASeq_v1_runsheet.csv" - - /* - DEBUG parameters, should NOT be overwritten for production processing runs - */ - genomeSubsample = 19 // Subsamples the reference fasta and gtf to a single sequence (often representing a single chromosome) - truncateTo = 600 // Subsamples the raw reads files to the specified number of reads for EACH raw reads file. - -} diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config index aa1e5710..364a00f2 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/nextflow.config @@ -39,34 +39,9 @@ profiles { includeConfig 'config/software/by_docker_image.config' } - test_nonGLDS { - includeConfig 'config/executor/default_CI_test.config' - includeConfig 'config/tests/test_nonGLDS.config' - } - test { includeConfig 'config/executor/default_CI_test.config' } - - test91 { - includeConfig 'config/executor/default_CI_test.config' - includeConfig 'config/tests/test_glds91.config' - } - - test194 { - includeConfig 'config/executor/default_CI_test.config' - includeConfig 'config/tests/test_glds194.config' - } - - test207 { - includeConfig 'config/executor/default_CI_test.config' - includeConfig 'config/tests/test_glds207.config' - } - - test251 { - includeConfig 'config/executor/default_CI_test.config' - includeConfig 'config/tests/test_glds251.config' - } } manifest { From 03618d9e776a81a4b3a4847c91021438e00ac9c6 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 9 May 2023 20:39:54 +0000 Subject: [PATCH 05/58] feat: add NF_RCP plugin for dp_tools update --- .../bin/dp_tools__NF_RCP/__init__.py | 
9 + .../bin/dp_tools__NF_RCP/checks.py | 1537 +++++++++++++++++ .../bin/dp_tools__NF_RCP/config.yaml | 1308 ++++++++++++++ .../bin/dp_tools__NF_RCP/protocol.py | 960 ++++++++++ .../bin/dp_tools__NF_RCP/schemas.py | 33 + 5 files changed, 3847 insertions(+) create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py new file mode 100644 index 00000000..5faa427c --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/__init__.py @@ -0,0 +1,9 @@ +from pathlib import Path + +# Import for access at the module level +from . import checks +from . import protocol +from . 
import schemas + +# Set config path +config = Path(__file__).parent / "config.yaml" \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py new file mode 100644 index 00000000..885d2160 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py @@ -0,0 +1,1537 @@ +from collections import defaultdict +import copy +import enum +import gzip +import itertools +import logging +import math +from pathlib import Path +from statistics import mean +import string +import subprocess +from typing import Callable, Dict, Union +from importlib.metadata import files + +import pandas as pd + +from dp_tools.core.entity_model import Dataset, Sample, multiqc_run_to_dataframes + +log = logging.getLogger(__name__) + +from dp_tools.core.check_model import FlagCode, FlagEntry, FlagEntryWithOutliers + + +def r_style_make_names(s: str) -> str: + """Recreates R's make.names function for individual strings. + This function is often used to create syntactically valid names in R which are then saved in R outputs. + Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/make.names + + Args: + s (str): A string to convert + + Returns: + str: A string converted in the same way as R's make.names function + """ + EXTRA_WHITELIST_CHARACTERS = "_ΩπϴλθijkuΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψω_µ" # Note: there are two "μμ" like characters one is greek letter mu, the other is the micro sign + VALID_CHARACTERS = string.ascii_letters + string.digits + "." + EXTRA_WHITELIST_CHARACTERS + REPLACEMENT_CHAR = "." 
+ new_string_chars = list() + for char in s: + if char in VALID_CHARACTERS: + new_string_chars.append(char) + else: + new_string_chars.append(REPLACEMENT_CHAR) + return "".join(new_string_chars) + + +# adapted from reference: https://stackoverflow.com/questions/56048627/round-floats-in-a-nested-dictionary-recursively +# used to round values for easier to read messages +def formatfloat(x): + return "%.3g" % float(x) + + +def pformat(original_dictionary, function): + dictionary = copy.deepcopy( + original_dictionary + ) # we don't want to override original values + if isinstance(dictionary, dict): + new_dict = dict() + for k, v in dictionary.items(): + new_dict[k] = function(v) if isinstance(v, float) else pformat(v, function) + return new_dict + return dictionary + + +def convert_nan_to_zero(input: Dict[str, Union[float, int]]) -> Dict: + """Convert any Nan into zero""" + output = dict() + for key, value in input.items(): + output[key] = value if not math.isnan(value) else 0 + return output + + +## Functions that use the following syntax to merge values from general stats: +# "stat1 + stat2" should search and sum the stats +# TODO: refine dict typehint +def stat_string_to_value(stat_string: str, mqcData: dict) -> float: + """ "stat1 + stat2" should search and sum the stats""" + sum = float(0) + direct_keys = stat_string.split(" + ") + for direct_key in direct_keys: + print(direct_key) + sum += mqcData[direct_key] + return float(sum) + + +## Dataframe and Series specific helper functions +def nonNull(df: pd.DataFrame) -> bool: + # negation since it checks if any are null + return ~df.isnull().any(axis=None) + + +def nonNegative(df: pd.DataFrame) -> bool: + """This ignores null values, use nonNull to validate that condition""" + return ((df >= 0) | (df.isnull())).all(axis=None) + + +def onlyAllowedValues(df: pd.DataFrame, allowed_values: list) -> bool: + """This ignores null values, use nonNull to validate that condition""" + return ((df.isin(allowed_values)) | 
(df.isnull())).all(axis=None) + + +def check_forward_and_reverse_reads_counts_match( + sample: Sample, reads_key_1: str, reads_key_2: str +) -> FlagEntry: + # data specific preprocess + count_fwd_reads = float( + sample.compile_multiqc_data([reads_key_1])["general_stats"]["FastQC"][ + "total_sequences" + ] + ) + count_rev_reads = float( + sample.compile_multiqc_data([reads_key_2])["general_stats"]["FastQC"][ + "total_sequences" + ] + ) + + # check logic + if count_fwd_reads == count_rev_reads: + code = FlagCode.GREEN + message = ( + f"Forward and reverse read counts match at " + f"{int(count_rev_reads)} sequences " + ) + else: + code = FlagCode.HALT + message = ( + f"Forward and reverse read counts do not " + f"match: forward_Count:{int(count_fwd_reads)}, " + f"reverse_Count:{int(count_rev_reads)}" + ) + + return {"code": code, "message": message} + + +def check_file_exists(file: Path) -> FlagEntry: + # check logic + if file.is_file(): + code = FlagCode.GREEN + message = f"File exists: {file.name} " + else: + code = FlagCode.HALT + message = f"Missing file: {file.name} expected at {str(file)} " + + return {"code": code, "message": message} + + +def check_fastqgz_file_contents(file: Path, count_lines_to_check: int) -> FlagEntry: + """Check fastqgz by: + 1. Decompressing as a stream of lines. + 2. Affirming expected headers (every 4th line) look correct. + + :param file: Input fastqGZ file path + :type file: Path + :param count_lines_to_check: Maximum number of lines to check. 
Setting this to a negative value will remove the limit + :type count_lines_to_check: int + :return: A required fields-only flag entry dictionary + :rtype: FlagEntry + """ + + lines_with_issues: list[int] = list() + + # check logic + # truncated files raise EOFError + # catch this as HALT3 + try: + with gzip.open(file, "rb") as f: + for i, byte_line in enumerate(f): + # checks if lines counted equals the limit input + if i + 1 == count_lines_to_check: + log.debug( + f"Reached {count_lines_to_check} lines, ending line check" + ) + break + + line = byte_line.decode() + # every fourth line should be an identifier + expected_identifier_line = i % 4 == 0 + # check if line is actually an identifier line + if expected_identifier_line and line[0] != "@": + lines_with_issues.append(i + 1) + # update every 2,000,000 reads + if i % 2_000_000 == 0: + log.debug(f"Checked {i} lines for {file}") + pass + + if not len(lines_with_issues) == 0: + code = FlagCode.HALT + message = ( + f"Following decompressed fastqGZ lines have issues: {lines_with_issues}" + ) + else: + code = FlagCode.GREEN + message = f"First {count_lines_to_check} lines checked found no issues. This means headers lines were identifiable and no decompression errors occured." + except (EOFError, gzip.BadGzipFile): + code = FlagCode.HALT + message = ( + f"Error during decompression, likely a compression or truncation issue." 
+ ) + + return {"code": code, "message": message} + +def check_gzip_file_integrity(file: Path, gzip_bin: Path = Path("gzip")) -> FlagEntry: + """ Check gzip file integrity using 'gzip -t' as per https://www.gnu.org/software/gzip/manual/gzip.html """ + output = subprocess.run( + [str(gzip_bin), "-t", str(file)], capture_output=True + ) + stdout_string = output.stdout.decode() + if stdout_string == "": + code = FlagCode.GREEN + message = f"Gzip integrity test raised no issues" + else: + code = FlagCode.HALT + message = ( + f"Gzip integrity test failed on this file with output: {stdout_string}" + ) + return {"code": code, "message": message} + +def check_bam_file_integrity( + file: Path, samtools_bin: Path = Path("samtools") +) -> FlagEntry: + """Uses http://www.htslib.org/doc/samtools-quickcheck.html""" + # data specific preprocess + + # check logic + output = subprocess.run( + [str(samtools_bin), "quickcheck", "-v", str(file)], capture_output=True + ) + stdout_string = output.stdout.decode() + if stdout_string == "": + code = FlagCode.GREEN + message = f"Samtools quickcheck raised no issues" + else: + code = FlagCode.HALT + message = ( + f"Samtools quickcheck failed on this file with output: {stdout_string}" + ) + return {"code": code, "message": message} + + +def check_thresholds( + multiqc_inputs: list[Path], mqc_key: str, stat_string: str, thresholds: list[dict] +) -> FlagEntry: + # data specific preprocess + data = multiqc_run_to_dataframes(multiqc_inputs) + value = stat_string_to_value(stat_string, data["general_stats"][mqc_key]) + + # check logic + # Assuming GREEN unless reassigned + code = FlagCode.GREEN + for threshold in thresholds: + match threshold["type"]: + case "lower": + if value < threshold["value"]: + code = ( + FlagCode[threshold["code"]] + if code < FlagCode[threshold["code"]] + else code + ) + + if code == FlagCode.GREEN: + message = f"Value: ({value}) did not breech any configured thresholds" + else: + message = f"Value: ({value}) breeched 
configured thresholds" + return {"code": code, "message": message} + + +def check_metadata_attributes_exist( + dataset: Dataset, expected_attrs: list[str] +) -> FlagEntry: + missing_metadata_fields = list(set(expected_attrs) - set(dataset.metadata)) + + # check if any missing_metadata_fields are present + # check logic + if not missing_metadata_fields: + code = FlagCode.GREEN + message = f"All expected metadata keys found: Expected {expected_attrs}, Found {set(dataset.metadata)}" + else: + code = FlagCode.HALT + message = f"Missing dataset metadata (source from Runsheet): {missing_metadata_fields}" + return {"code": code, "message": message} + + +def check_for_outliers( + dataset: Dataset, + data_asset_keys: list[str], + mqc_module: str, + mqc_plot: str, + mqc_keys: list[str], + thresholds: list[dict], +) -> FlagEntryWithOutliers: + # assume code is GREEN until outliers detected + code = FlagCode.GREEN + # dataframe extraction + compiled_mqc_data = dataset.compile_multiqc_data(data_asset_keys=data_asset_keys) + + if mqc_plot == "general_stats": + df = compiled_mqc_data["general_stats"][mqc_module] + else: + df = compiled_mqc_data["plots"][mqc_module][mqc_plot] + + def default_to_regular(d): + if isinstance(d, defaultdict): + d = {k: default_to_regular(v) for k, v in d.items()} + return d + + # track for outliers + outliers: dict[str, dict[str, dict[str, str]]] = defaultdict( + lambda: defaultdict(dict) + ) + + # override if mqc_keys is a special value + if mqc_keys == ["_ALL"]: + mqc_keys = df.columns + + for mqc_key in mqc_keys: + for threshold in thresholds: + if threshold["middle_fcn"] == "mean": + middle = df[mqc_key].mean() + elif threshold["middle_fcn"] == "median": + middle = df[mqc_key].median() + else: + raise ValueError( + f"Cannot compute middle from supplied middle_fcn name: {threshold['middle_fcn']}. Must supply either 'median' or 'mean'" + ) + + # bail if standard deviation == 0 + # e.g. 
if all values are identical (and thus has no outliers) + if df[mqc_key].std() == 0: + continue + + # compute difference + df_diffs = df[mqc_key] - middle + + # compute as number of standard deviations + df_diffs_in_std = df_diffs / df[mqc_key].std() + + # add to outlier tracker if over the threshold + for key, value in df_diffs_in_std.iteritems(): + # if an outlier + if abs(value) > threshold["stdev_threshold"]: + # track it + outliers[key][mqc_module][mqc_key] = value + # elevate code if current code is lower severity + if code < FlagCode[threshold["code"]]: + code = FlagCode[threshold["code"]] + + # convert defaultdict to regular for all reporting + outliers = default_to_regular(outliers) + # check logic + if code == FlagCode.GREEN: + message = f"No outliers found for {mqc_keys} in {mqc_plot} part of {mqc_module} multiQC module" + else: + message = ( + f"Outliers found in {mqc_module} multiQC module as follows: {outliers}" + ) + return {"code": code, "message": message, "outliers": outliers} + + +def _check_expected_files_exist( + input_dir: Path, expected_extensions: list[str], parent_dir_is_filename: bool = True +): + if parent_dir_is_filename: + fname = input_dir.name + expected_files = [input_dir / f"{fname}{ext}" for ext in expected_extensions] + missing_files = list() + for expected_file in expected_files: + if not expected_file.is_file(): + missing_files.append(str(expected_file)) + + expected_file_str = [str(f) for f in expected_files] + return missing_files, expected_file_str + + +def check_genebody_coverage_output(input_dir: Path): + EXPECTED_EXTENSIONS = [ + ".geneBodyCoverage.r", + ".geneBodyCoverage.txt", + ".geneBodyCoverage.curves.pdf", + ] + + missing_files, expected_file_str = _check_expected_files_exist( + input_dir, expected_extensions=EXPECTED_EXTENSIONS + ) + + if not missing_files: + code = FlagCode.GREEN + message = f"All output from geneBody coverage found: {expected_file_str}" + else: + code = FlagCode.HALT + message = f"Missing output 
from geneBody coverage: {missing_files}. Expected: {expected_file_str}" + return {"code": code, "message": message} + + +def check_inner_distance_output(input_dir: Path): + EXPECTED_EXTENSIONS = [ + ".inner_distance_plot.r", + ".inner_distance_freq.txt", + ".inner_distance.txt", + ".inner_distance_plot.pdf", + ] + + missing_files, expected_file_str = _check_expected_files_exist( + input_dir, expected_extensions=EXPECTED_EXTENSIONS + ) + + if not missing_files: + code = FlagCode.GREEN + message = f"All output from inner distance found: {expected_file_str}" + else: + code = FlagCode.HALT + message = f"Missing output from inner distance: {missing_files}. Expected: {expected_file_str}" + return {"code": code, "message": message} + + +def check_strandedness_assessable_from_infer_experiment( + dataset: Dataset, + stranded_assessment_range: dict[str, float], + unstranded_assessment_range: dict[str, float], + valid_dominant_strandedness_assessments: list[str], +) -> FlagEntry: + # data specific preprocess + def get_median_strandedness( + dataset: Dataset, + ) -> dict[str, float]: + + df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ + "Infer experiment" + ].fillna( + 0 + ) # Nan is a zero for this MultiQC table + + median_strandedness = df.median().to_dict() + + return median_strandedness + + median_strandedness = get_median_strandedness(dataset) + + # check if dominant assessment is valid + strand_assessment: str = max( + median_strandedness, key=lambda k: median_strandedness[k] + ) + + # flag based on thresholds + assessment_value: float = median_strandedness[strand_assessment] + + is_stranded: bool = ( + stranded_assessment_range["max"] + > assessment_value + > stranded_assessment_range["min"] + ) + is_unstranded: bool = ( + unstranded_assessment_range["max"] + > assessment_value + > unstranded_assessment_range["min"] + ) + + def determine_samples_outside_range( + dataset: Dataset, min: float, max: float + ) -> list[str]: + df = 
dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ + "Infer experiment" + ].fillna( + 0 + ) # Nan is a zero for this MultiQC table + + return df.index[df[strand_assessment].between(min, max) == False].to_list() + + # Catalog and flag any samples outside of range + # flags based on samples that are out of the assessment range + samples_outside_range: list[str] + if is_stranded: + samples_outside_range = determine_samples_outside_range( + dataset, + stranded_assessment_range["min"], + stranded_assessment_range["max"], + ) + elif is_unstranded: + samples_outside_range = determine_samples_outside_range( + dataset, + unstranded_assessment_range["min"], + unstranded_assessment_range["max"], + ) + else: # this means that the strandedness is ambiguous + samples_outside_range = list() + + # check logic + if strand_assessment not in valid_dominant_strandedness_assessments: + code = FlagCode.HALT + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is invalid for processing. 
Valid assessments: {valid_dominant_strandedness_assessments}" + elif not samples_outside_range and any([is_stranded, is_unstranded]): + code = FlagCode.GREEN + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with no individual samples outside the assessment range" + elif samples_outside_range and any([is_stranded, is_unstranded]): + code = FlagCode.RED + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with samples outside the assessment range: {samples_outside_range}" + else: + code = FlagCode.HALT + message = ( + f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is ambiguous due to being inside range " + f"({stranded_assessment_range['min']}-{unstranded_assessment_range['max']})" + ) + + return {"code": code, "message": message} + + +def check_rsem_counts_and_unnormalized_tables_parity( + rsem_table_path: Path, deseq2_table_path: Path +) -> FlagEntry: + # data specific preprocess + df_rsem = pd.read_csv(rsem_table_path) + df_deseq2 = pd.read_csv(deseq2_table_path) + + # return halt flag if column labels not conserved + if not set(df_deseq2.columns) == set(df_rsem.columns): + unique_to_deseq2 = set(df_deseq2.columns) - set(df_rsem.columns) + unique_to_rsem = set(df_rsem.columns) - set(df_deseq2.columns) + return { + "code": FlagCode.HALT, + "message": f"Columns do not match: unique to rsem: {unique_to_rsem}. unique to deseq2: {unique_to_deseq2}.", + } + + # rearrange columns to the same order + df_deseq2 = df_deseq2[df_rsem.columns] + + # check logic + if df_deseq2.equals(df_rsem): + code = FlagCode.GREEN + message = f"Tables of unnormalized counts match." + else: + code = FlagCode.HALT + message = ( + f"Tables of unnormalized counts have same columns but values do not match." 
+ ) + return {"code": code, "message": message} + + +def check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables( + unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] +) -> FlagEntry: + STAR_COUNT_MODES = ["unstranded", "sense", "antisense"] + # data specific preprocess + df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) + + # based on which column matches the first entry + # all columns must match with the same strand column + strand_assessment: str = None # type: ignore + samples_with_issues: dict[str, list[str]] = { + "Not in aggregate table": list(), + "Sample counts mismatch": list(), + } + for sample, path in samplewise_tables.items(): + # check if samples exist as a column + if sample not in df_agg: + samples_with_issues["Not in aggregate table"].append(sample) + break + + # load + df_samp = pd.read_csv( + path, sep="\t", names=STAR_COUNT_MODES, index_col=0 + ).filter( + regex="^(?!N_.*).*", axis="rows" + ) # filter out N_* entries + + # check if the values match for any of the count modes + # unstranded, sense, antisense + # for remaining samples, only check the match for the first count mode + # TODO: Fix rare false postive related to zero counts, in those cases the strand_assessment can be prematurely determined which causes other samples to be compared with an inappropriate assessment + for count_mode in STAR_COUNT_MODES: + # make sure to sort indicies + if df_agg[sample].sort_index().equals(df_samp[count_mode].sort_index()): + # assign strand assessment if first sample + if strand_assessment is None: + strand_assessment = count_mode + + if strand_assessment == count_mode: + # no issues found (i.e. 
counts match with a consistent count mode column), break out + break + else: # no break + samples_with_issues["Sample counts mismatch"].append(sample) + + # check logic + if not any([issue_type for issue_type in samples_with_issues.values()]): + code = FlagCode.GREEN + message = ( + f"All samples accounted for and with matching counts " + f"between samplewise and aggregate table using strand assessment: '{strand_assessment}'" + ) + else: + code = FlagCode.HALT + message = f"Identified issues: {samples_with_issues}" + return {"code": code, "message": message} + + +def check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables( + unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] +) -> FlagEntry: + # data specific preprocess + df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) + + # based on which column matches the first entry + # TODO: LOW PRIORITY, fix this typehint + samples_with_issues: dict[str, Union[list[str], list[tuple[str, list[str]]]]] = { + "Not in aggregate table": list(), # type: ignore + "Sample counts mismatch": list(), # type: ignore + } + for sample, path in samplewise_tables.items(): + # check if samples exist as a column + if sample not in df_agg: + samples_with_issues["Not in aggregate table"].append(sample) + break + + # load + df_samp = pd.read_csv(path, sep="\t", index_col=0) + + # check if values match + if geneID_with_mismatched_counts := ( + list(df_agg.loc[df_agg[sample] != df_samp["expected_count"]].index) + ): + samples_with_issues["Sample counts mismatch"].append( + (sample, geneID_with_mismatched_counts) + ) + + # check logic + if not any([issue_type for issue_type in samples_with_issues.values()]): + code = FlagCode.GREEN + message = f"All samples accounted for and with matching counts between samplewise and aggregate table" + else: + code = FlagCode.HALT + message = f"Identified issues: {samples_with_issues}" + return {"code": code, "message": message} + + +def 
check_sample_table_against_runsheet( + runsheet: Path, sampleTable: Path, all_samples_required: bool +) -> FlagEntry: + """Check the sample table includes all samples as denoted in the runsheet. + + Args: + runsheet (Path): csv file used for processing, the index denotes all samples + sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) + all_samples_required (bool): denotes if all samples must be shared or if a subset of samples from the runsheet is okay. + + Returns: + FlagEntry: A check result + """ + # data specific preprocess + df_rs = pd.read_csv(runsheet, index_col="Sample Name").sort_index() + df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() + + extra_samples: dict[str, set[str]] = { + "unique_to_runsheet": set(df_rs.index) - set(df_sample.index), + "unique_to_sampleTable": set(df_sample.index) - set(df_rs.index), + } + + # check logic + if any( + [ + (extra_samples["unique_to_runsheet"] and all_samples_required), + (extra_samples["unique_to_sampleTable"]), + ] + ): + code = FlagCode.HALT + message = f"Samples mismatched: {[f'{entry}:{v}' for entry, v in extra_samples.items() if v]}" + else: + code = FlagCode.GREEN + message = f"All samples accounted for based on runsheet (All samples required?: {all_samples_required})" + return {"code": code, "message": message} + + +class GroupFormatting(enum.Enum): + r_make_names = enum.auto() + ampersand_join = enum.auto() + + +def utils_runsheet_to_expected_groups( + runsheet: Path, + formatting: GroupFormatting = GroupFormatting.ampersand_join, + limit_to_samples: list = None, + map_to_lists: bool = False, +) -> Union[dict[str, str], dict[str, list[str]]]: + df_rs = ( + pd.read_csv(runsheet, index_col="Sample Name", dtype=str) + .filter(regex="^Factor Value\[.*\]") + .sort_index() + ) # using only Factor Value columns + + if limit_to_samples: + df_rs = df_rs.filter(items=limit_to_samples, axis="rows") + + match formatting: + case 
GroupFormatting.r_make_names: + expected_conditions_based_on_runsheet = ( + df_rs.apply(lambda x: "...".join(x), axis="columns") + .apply(r_style_make_names) # join factors with '...' + .to_dict() + ) # reformat entire group in the R style + case GroupFormatting.ampersand_join: + expected_conditions_based_on_runsheet = df_rs.apply( + lambda x: f"({' & '.join(x)})", axis="columns" + ).to_dict() + case _: + raise ValueError( + f"Formatting method invalid, must be one of the following: {list(GroupFormatting)}" + ) + + # convert from {sample: group} dict + # to {group: [samples]} dict + if map_to_lists: + unique_groups = set(expected_conditions_based_on_runsheet.values()) + reformatted_dict: dict[str, list[str]] = dict() + for query_group in unique_groups: + reformatted_dict[query_group] = [ + sample + for sample, group in expected_conditions_based_on_runsheet.items() + if group == query_group + ] + expected_conditions_based_on_runsheet: dict[str, list[str]] = reformatted_dict + + return expected_conditions_based_on_runsheet + + +def check_sample_table_for_correct_group_assignments( + runsheet: Path, sampleTable: Path +) -> FlagEntry: + """Check the sample table is assigned to the correct experimental group. + An experimental group is defined by the Factor Value columns found in the runsheet. 
+ + Args: + runsheet (Path): csv file used for processing, includes metadata used for experimental group designation + sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) + + Returns: + FlagEntry: A check result + """ + df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() + # data specific preprocess + df_rs = ( + pd.read_csv(runsheet, index_col="Sample Name", dtype=str) # Ensure no factor value columns are misinterpreted as numeric + .filter(regex="^Factor Value\[.*\]") + .loc[df_sample.index] # ensure only sampleTable groups are checked + .sort_index() + ) # using only Factor Value columns + + # TODO: refactor with utils_runsheet_to_expected_groups + expected_conditions_based_on_runsheet = df_rs.apply( + lambda x: "...".join(x), axis="columns" + ).apply( # join factors with '...' + r_style_make_names + ) # reformat entire group in the R style + + mismatched_rows = expected_conditions_based_on_runsheet != df_sample["condition"] + + # check logic + if not any(mismatched_rows): + code = FlagCode.GREEN + message = f"Conditions are formatted and assigned correctly based on runsheet for all {len(df_sample)} samples in sample table: {list(df_sample.index)}" + else: + code = FlagCode.HALT + mismatch_description = ( + df_sample[mismatched_rows]["condition"] + + " <--SAMPLETABLE : RUNSHEET--> " + + expected_conditions_based_on_runsheet[mismatched_rows] + ).to_dict() + message = f"Mismatch in expected conditions based on runsheet for these rows: {mismatch_description}" + return {"code": code, "message": message} + + +def check_contrasts_table_headers(contrasts_table: Path, runsheet: Path) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + df_contrasts = pd.read_csv(contrasts_table, index_col=0) + + # 
check logic + differences = set(expected_comparisons).symmetric_difference( + set(df_contrasts.columns) + ) + if not differences: + code = FlagCode.GREEN + message = f"Contrasts header includes expected comparisons as determined runsheet Factor Value Columns: {set(expected_comparisons)}" + else: + code = FlagCode.HALT + message = f"Contrasts header does not match expected comparisons as determined runsheet Factor Value Columns: {differences}" + return {"code": code, "message": message} + + +def check_contrasts_table_rows(contrasts_table: Path, **_) -> FlagEntry: + # data specific preprocess + df_contrasts = pd.read_csv(contrasts_table, index_col=0) + + def _get_groups_from_comparisions(s: str) -> set[str]: + """Converts '(G1)v(G2)' + into G1...G2 where G1 and G2 are renamed as per the r make names function + + Args: + s (str): Input that fits this format: '(G1)v(G2)' + + Returns: + str: Reformatted string + """ + g1, g2 = s.split(")v(") + # remove parens and reformat with r make names style + g1 = r_style_make_names(g1[1:].replace(" & ", "...")) + g2 = r_style_make_names(g2[:-1].replace(" & ", "...")) + return {g1, g2} + + bad_columns: dict[str, dict[str, set]] = dict() + for (col_name, col_series) in df_contrasts.iteritems(): + expected_values = _get_groups_from_comparisions(col_name) + if not expected_values == set(col_series): + bad_columns[col_name] = { + "expected": expected_values, + "actual": set(col_series), + } + + # check logic + if not bad_columns: + code = FlagCode.GREEN + message = f"Contrasts column and rows match expected formatting" + else: + code = FlagCode.HALT + message = f"Contrasts columns {bad_columns} have unexpected values" + return {"code": code, "message": message} + + +def check_dge_table_annotation_columns_exist( + dge_table: Path, organism: str, **_ +) -> FlagEntry: + REQUIRED_ANNOTATION_KEYS = { + "SYMBOL", + "GENENAME", + "REFSEQ", + "ENTREZID", + "STRING_id", + "GOSLIM_IDS", + } + MASTER_ANNOTATION_KEY = {"_DEFAULT": "ENSEMBL", 
"Arabidopsis thaliana": "TAIR"} + + df_dge = pd.read_csv(dge_table) + + required_columns = REQUIRED_ANNOTATION_KEYS.union( + {MASTER_ANNOTATION_KEY.get(organism, MASTER_ANNOTATION_KEY["_DEFAULT"])} + ) + + missing_columns = required_columns - set(df_dge.columns) + # check logic + if not missing_columns: + code = FlagCode.GREEN + message = f"Found all required annotation columns: {required_columns}" + else: + code = FlagCode.HALT + message = ( + f"Missing the following required annotation columns: {missing_columns}" + ) + return {"code": code, "message": message} + + +def check_dge_table_sample_columns_exist( + dge_table: Path, samples: set[str], **_ +) -> FlagEntry: + # data specific preprocess + df_dge = pd.read_csv(dge_table) + + missing_sample_columns = samples - set(df_dge.columns) + + # check logic + if not missing_sample_columns: + code = FlagCode.GREEN + message = f"All samplewise columns present" + else: + code = FlagCode.HALT + message = f"Missing these sample count columns: {missing_sample_columns}" + return {"code": code, "message": message} + + +def check_dge_table_sample_columns_constraints( + dge_table: Path, samples: set[str], **_ +) -> FlagEntry: + MINIMUM_COUNT = 0 + # data specific preprocess + df_dge = pd.read_csv(dge_table)[samples] + + column_meets_constraints = df_dge.apply( + lambda col: all(col >= MINIMUM_COUNT), axis="rows" + ) + + # check logic + contraint_description = f"All counts are greater or equal to {MINIMUM_COUNT}" + if all(column_meets_constraints): + code = FlagCode.GREEN + message = ( + f"All values in columns: {samples} met constraint: {contraint_description}" + ) + else: + code = FlagCode.HALT + message = ( + f"These columns {list(column_meets_constraints.index[~column_meets_constraints])} " + f"fail the contraint: {contraint_description}." 
+ ) + return {"code": code, "message": message} + + +def check_dge_table_group_columns_exist( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet) + expected_columns = { + "".join(comb) + for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) + } + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All group summary statistic columns (Prefixes: {GROUP_PREFIXES}) present. {sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = f"Missing these group summary statistic columns (Prefixes: {GROUP_PREFIXES}): {sorted(list(missing_cols))}" + return {"code": code, "message": message} + + +def check_dge_table_group_columns_constraints( + dge_table: Path, runsheet: Path, samples: set[str], **_ +) -> FlagEntry: + FLOAT_TOLERANCE = ( + 0.001 # Percent allowed difference due to float precision differences + ) + # data specific preprocess + GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet) + query_columns = { + "".join(comb) + for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) + } + + expected_group_lists = utils_runsheet_to_expected_groups( + runsheet, map_to_lists=True, limit_to_samples=samples + ) + df_dge = pd.read_csv(dge_table) + + # issue trackers + issues: dict[str, list[str]] = { + f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [], + f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [], + } + + group: str + sample_set: list[str] + for group, sample_set in expected_group_lists.items(): + abs_percent_differences = abs( + (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns")) + / df_dge[sample_set].mean(axis="columns") 
+ * 100 + ) + if any(abs_percent_differences > FLOAT_TOLERANCE): + issues[ + f"mean computation deviates by more than {FLOAT_TOLERANCE} percent" + ].append(group) + + abs_percent_differences = abs( + (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns")) + / df_dge[sample_set].mean(axis="columns") + * 100 + ) + if any(abs_percent_differences > FLOAT_TOLERANCE): + issues[ + f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent" + ].append(group) + + # check logic + contraint_description = f"Group mean and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accomodate minor float related differences )" + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = f"All values in columns: {query_columns} met constraint: {contraint_description}" + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" + f"fail the contraint: {contraint_description}." + ) + return {"code": code, "message": message} + + +def check_dge_table_comparison_statistical_columns_exist( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + COMPARISON_PREFIXES = ["Log2fc_", "Stat_", "P.value_", "Adj.p.value_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + expected_columns = { + "".join(comb) + for comb in itertools.product(COMPARISON_PREFIXES, expected_comparisons) + } + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All comparision summary statistic columns (Prefixes: {COMPARISON_PREFIXES}) present. 
{sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = f"Missing these comparision summary statistic columns (Prefixes: {COMPARISON_PREFIXES}): {sorted(list(missing_cols))}" + return {"code": code, "message": message} + + +def utils_common_constraints_on_dataframe( + df: pd.DataFrame, constraints: tuple[tuple[set, dict], ...] +) -> dict: + + issues: dict[str, list[str]] = { + "Failed non null constraint": list(), + "Failed non negative constraint": list(), + } + + for (col_set, col_constraints) in constraints: + # this will avoid overriding the original constraints dictionary + # which is likely used in the check message + col_constraints = col_constraints.copy() + + # limit to only columns of interest + query_df = df[col_set] + for (colname, colseries) in query_df.iteritems(): + # check non null constraint + if col_constraints.pop("nonNull", False) and nonNull(colseries) == False: + issues["Failed non null constraint"].append(colname) + # check non negative constraint + if ( + col_constraints.pop("nonNegative", False) + and nonNegative(colseries) == False + ): + issues["Failed non negative constraint"].append(colname) + # check allowed values constraint + if allowedValues := col_constraints.pop("allowedValues", False): + if onlyAllowedValues(colseries, allowedValues) == False: + issues["Failed non negative constraint"].append(colname) + + # raise exception if there are unhandled constraint keys + if col_constraints: + raise ValueError(f"Unhandled constraint types: {col_constraints}") + + return issues + + +def check_dge_table_group_statistical_columns_constraints( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + + resolved_constraints = ( + ({f"Log2fc_{comp}" for comp in expected_comparisons}, {"nonNull": True}), + ({f"Stat_{comp}" 
for comp in expected_comparisons}, {"nonNull": True}), + # can be removed from analysis before p-value and adj-p-value assessed + # ref: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na + ( + {f"P.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": True, "nonNull": False}, + ), + ( + {f"Adj.p.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": True, "nonNull": False}, + ), + ) + + df_dge = pd.read_csv(dge_table) + + # issue trackers + # here: {prefix+constraint: [failed_columns]} + issues: dict[str, list[str]] = dict() + + issues = utils_common_constraints_on_dataframe(df_dge, resolved_constraints) + + # check logic + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = f"All values in columns met constraint: {resolved_constraints}" + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" f"fail the contraint: {resolved_constraints}." + ) + return {"code": code, "message": message} + + +def check_dge_table_fixed_statistical_columns_exist(dge_table: Path, **_) -> FlagEntry: + # data specific preprocess + fixed_stats_columns = { + "All.mean": {"nonNull": True, "nonNegative": True}, + "All.stdev": {"nonNull": True, "nonNegative": True}, + "LRT.p.value": {"nonNull": False, "nonNegative": True}, + } + expected_columns = set(fixed_stats_columns) + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All dataset summary stat columns present. 
{sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = ( + f"Missing these dataset summary stat columns: {sorted(list(missing_cols))}" + ) + return {"code": code, "message": message} + + +def check_dge_table_fixed_statistical_columns_constraints( + dge_table: Path, **_ +) -> FlagEntry: + # data specific preprocess + fixed_stats_columns = ( + ({"All.mean", "All.stdev"}, {"nonNull": True, "nonNegative": True}), + ({"LRT.p.value"}, {"nonNull": False, "nonNegative": True}), + ) + + df_dge = pd.read_csv(dge_table) + + # issue trackers + # here: {prefix+constraint: [failed_columns]} + issues: dict[str, list[str]] = dict() + + issues = utils_common_constraints_on_dataframe(df_dge, fixed_stats_columns) + + # check logic + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = f"All values in columns met constraint: {fixed_stats_columns}" + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" f"fail the contraint: {fixed_stats_columns}." 
+ ) + return {"code": code, "message": message} + + +def check_dge_table_log2fc_within_reason( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 10 # Percent + LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT = 50 # Percent + + # TODO: discuss, this might even be fine to lower quite a bit + # e.g THRESHOLD_PERCENT_MEANS_DIFFERENCE = 1 # percent + THRESHOLD_PERCENT_MEANS_DIFFERENCE = 50 # percent + + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + df_dge = pd.read_csv(dge_table) + + # Track error messages + err_msg_yellow = "" + all_suspect_signs: dict[int, dict[str, float]] = dict() + for comparision in expected_comparisons: + query_column = f"Log2fc_{comparision}" + group1_mean_col = ( + "Group.Mean_" + comparision.split(")v(")[0] + ")" + ) # Uses parens and adds them back to prevent slicing on 'v' within factor names + group2_mean_col = "Group.Mean_" + "(" + comparision.split(")v(")[1] + computed_log2fc = (df_dge[group1_mean_col] / df_dge[group2_mean_col]).apply( + math.log, args=[2] + ) + abs_percent_difference = abs( + ((computed_log2fc - df_dge[query_column]) / df_dge[query_column]) * 100 + ) + percent_within_tolerance = ( + mean( + abs_percent_difference + < LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD + ) + * 100 + ) + # flag if not enough within tolerance + if percent_within_tolerance < LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT: + err_msg_yellow += ( + f"For comparison: '{comparision}' {percent_within_tolerance:.2f} % of genes have absolute percent differences " + f"(between log2fc direct computation and DESeq2's approach) " + f"less than {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD} % which does not met the minimum percentage " + f"({LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT} %) of genes required. 
" + f"This may indicate misassigned or misaligned columns. " + ) + + #### sign based checks + + # filter to genes with based on groups means + abs_percent_differences = ( + abs( + (df_dge[group1_mean_col] - df_dge[group2_mean_col]) + / df_dge[group2_mean_col] + ) + * 100 + ) + df_dge_filtered = df_dge.loc[ + abs_percent_differences > THRESHOLD_PERCENT_MEANS_DIFFERENCE + ] + + df_dge_filtered["positive_sign_expected"] = ( + df_dge[group1_mean_col] - df_dge[group2_mean_col] > 0 + ) + + df_dge_filtered["matches_expected_sign"] = ( + (df_dge[query_column] > 0) & df_dge_filtered["positive_sign_expected"] + ) | ((df_dge[query_column] < 0) & ~df_dge_filtered["positive_sign_expected"]) + + all_suspect_signs = all_suspect_signs | df_dge_filtered.loc[ + df_dge_filtered["matches_expected_sign"] == False + ][[group1_mean_col, group2_mean_col, query_column]].to_dict("index") + + if all_suspect_signs: + code = FlagCode.RED + message = f"At least one log2fc sign is suspect, the following log2fc compared to actual group means: {all_suspect_signs}" + elif err_msg_yellow: + code = FlagCode.YELLOW + message = ( + f"All log2fc not within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% " + f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than " + f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. " + ) + else: + code = FlagCode.GREEN + message = ( + f"All log2fc within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% " + f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than " + f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. 
Additionally, for comparisons with mean differences " + f"greater than {THRESHOLD_PERCENT_MEANS_DIFFERENCE}% all have reasonable log2fc signs" + ) + + return {"code": code, "message": message} + + +def check_viz_table_columns_exist(dge_table: Path, runsheet: Path, **_) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + viz_pairwise_columns_prefixes = ( + ( + {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons}, + {"nonNull": False}, + ), + ( + {f"Sig.1_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Sig.05_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Log2_P.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": False, "nonNull": False}, + ), + ( + {f"Updown_{comp}" for comp in expected_comparisons}, + {"allowedValues": [1, 0, -1], "nonNull": True}, + ), + ) + + expected_columns = set( + itertools.chain(*[c1 for c1, _ in viz_pairwise_columns_prefixes]) + ) + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All viz specific comparison columns present. 
{sorted(list(expected_columns))}" + else: + code = FlagCode.HALT + message = f"Missing these viz specific comparison columns: {sorted(list(missing_cols))}" + return {"code": code, "message": message} + + +def check_viz_table_columns_constraints( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + viz_pairwise_columns_constraints = ( + ( + {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons}, + {"nonNull": False}, + ), + ( + {f"Sig.1_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Sig.05_{comp}" for comp in expected_comparisons}, + {"allowedValues": [False, True], "nonNull": False}, + ), + ( + {f"Log2_P.value_{comp}" for comp in expected_comparisons}, + {"nonNegative": False, "nonNull": False}, + ), + ( + {f"Updown_{comp}" for comp in expected_comparisons}, + {"allowedValues": [1, 0, -1], "nonNull": True}, + ), + ) + + df_viz = pd.read_csv(dge_table) + + # issue trackers + # here: {prefix+constraint: [failed_columns]} + issues: dict[str, list[str]] = dict() + + issues = utils_common_constraints_on_dataframe( + df_viz, viz_pairwise_columns_constraints + ) + + # check logic + if not any([issue_type for issue_type in issues.values()]): + code = FlagCode.GREEN + message = ( + f"All values in columns met constraint: {viz_pairwise_columns_constraints}" + ) + else: + code = FlagCode.HALT + message = ( + f"Issues found {issues} that" + f"fail the contraint: {viz_pairwise_columns_constraints}." 
+ ) + return {"code": code, "message": message} + + +def check_viz_pca_table_index_and_columns_exist( + pca_table: Path, samples: set[str] +) -> FlagEntry: + EXPECTED_VIS_PCA_COLUMNS = {"PC1", "PC2", "PC3"} + err_msg = "" + # data specific preprocess + df = pd.read_csv(pca_table, index_col=0) + + # check all samples included + if missing_samples := samples - set(df.index): + err_msg += f"Missing samples in index: {missing_samples}" + + # check all expected columns exist + if missing_cols := EXPECTED_VIS_PCA_COLUMNS - set(df.columns): + err_msg += f"Missing expected columns: {missing_cols}" + + if not err_msg: + code = FlagCode.GREEN + message = f"PCA Table has all the samples in the index and these columns exist: {EXPECTED_VIS_PCA_COLUMNS}" + else: + code = FlagCode.HALT + message = err_msg + + return {"code": code, "message": message} + + +def utils_formatting_list(l: list[str], spaces: int = 2) -> str: + """Reformats list to print friendly multi line string. + + Example: + Reformatting a list of samples:: + + l = ['groundControl_1','groundControl_2','spaceFlight_1','spaceFlight-2'] + print(f"Samples: \n{utils_formatting_list(l)}") + + Args: + l (list): A list of strings to format + spaces (int): Number of leading spaces per line + + Returns: + str: Print friendly multiline string + """ + leading_spaces = " " * spaces + return "\n".join([f"{leading_spaces}- {item}" for item in l]) + + +def utils_rsem_counts_table_to_dataframe( + counts_table: Path, describe: bool = True +) -> pd.DataFrame: + df = pd.read_csv(counts_table, index_col=0).rename_axis("geneID") + if describe: + print(f"Loaded rsem counts table:") + print(f" Samples: \n{utils_formatting_list(list(df.columns), spaces = 4)}") + print(f" Number of Genes: {len(df)}") + return df + + +def utils_get_asset(asset_name: str) -> Path: + [p] = (p for p in files("dp_tools") if p.name == asset_name) + return p.locate() + + +def check_ERCC_subgroup_representation(unnormalizedCountTable: Path, **_) -> FlagEntry: + 
"""Check ERCC subgroup representation is robust. + Specifically, counts the dataset wide ERCC IDs then categorizes each subgroup + by the number of represented ERCC IDs in that subgroup. + Finally, generates a Flag result by comparison to thresholds. + + Args: + counts_table (Path): RSEM unnormalized counts table + + Returns: + FlagEntry: Result of the check. + """ + MINIMUM_GREEN = 21 + MINIMUM_YELLOW = 19 + MINIMUM_RED = 0 + MINIMUM_HALT = 0 + + # data specific preprocess + df_counts = utils_rsem_counts_table_to_dataframe(unnormalizedCountTable) + + ercc_file = utils_get_asset("cms_095046.txt") + df_ercc = pd.read_csv(ercc_file, sep="\t") + + # filter to only ercc genes + df_counts = df_counts.loc[df_counts.index.isin(df_ercc["ERCC ID"])] + + # filter to only genes with at least one count (i.e. ERCC genes represented in the dataset) + df_counts = df_counts.loc[df_counts.sum(axis="columns") > 0] + + # merge to ercc table data including subgroup + df_counts = df_counts.merge(df_ercc, left_index=True, right_on="ERCC ID") + + # generate subgroup counts + df_subgroup_counts = df_counts["subgroup"].value_counts().sort_index() + + green_key = f"green level subgroups: > {MINIMUM_GREEN} ERCC represented" + yellow_key = ( + f"yellow level subgroups: {MINIMUM_YELLOW}-{MINIMUM_GREEN} ERCC represented" + ) + red_key = f"red level subgroups: {MINIMUM_RED}-{MINIMUM_YELLOW} ERCC represented" + halt_key = f"halt level subgroups: < {MINIMUM_HALT} ERCC represented" + + # classify each representation count + representation_category: dict[str, dict[str,int]] = { + green_key: df_subgroup_counts.loc[df_subgroup_counts > MINIMUM_GREEN].to_dict(), + yellow_key: + df_subgroup_counts.loc[ + df_subgroup_counts.between(MINIMUM_YELLOW, MINIMUM_GREEN) + ].to_dict() + , + red_key: + df_subgroup_counts.loc[ + df_subgroup_counts.between( + MINIMUM_RED, MINIMUM_YELLOW, inclusive="left" + ) + ].to_dict() + , + halt_key: df_subgroup_counts.loc[df_subgroup_counts < MINIMUM_HALT].to_dict(), + } + + # 
check logic
+    if representation_category[halt_key]:
+        code = FlagCode.HALT
+        message = (
+            f"Dataset wide ERCC representation is not robust: {representation_category}"
+        )
+    elif representation_category[red_key]:
+        code = FlagCode.RED
+        message = (
+            f"Dataset wide ERCC representation is not robust: {representation_category}"
+        )
+    elif representation_category[yellow_key]:
+        code = FlagCode.YELLOW
+        message = (
+            f"Dataset wide ERCC representation is not robust: {representation_category}"
+        )
+    else:
+        code = FlagCode.GREEN
+        message = (
+            f"Dataset wide ERCC representation is robust: {representation_category}"
+        )
+    return {"code": code, "message": message}
+
+
+def check_sample_in_multiqc_report(
+    samples: list[str],
+    multiqc_report_path: Path,
+    name_reformat_func: Callable = lambda s: s,
+) -> FlagEntry:
+    """Determines if the query samples are present in the multiqc report.
+
+    This is achieved by checking the 'multiqc_sources.txt' table, 'Sample Name' column.
+    An optional name_reformat_function can be supplied to address sample name changes that occur in the multiqc report.
+    An example being the renaming of Sample '-' characters to '_' for certain RSeQC modules.
+
+    :param samples: Query sample names to check for presence
+    :type samples: list[str]
+    :param multiqc_report_path: MultiQC report directory
+    :type multiqc_report_path: Path
+    :param name_reformat_func: A function applied to the multiQC sample names before searching against query sample names, defaults to not renaming the multiQC sample names
+    :type name_reformat_func: Callable, optional
+    :return: Flag Entry denoting successful or failing results. 
Includes description of query sample names and any missing samples + :rtype: FlagEntry + """ + # Load multiQC sources table and retrieve set of samples + [sources_table] = multiqc_report_path.glob("**/multiqc_sources.txt") + multiQC_samples = list(pd.read_csv(sources_table, sep="\t")["Sample Name"]) + + # Transform multiQC samples using name_reformat_func + reformatted_multiQC_samples = [name_reformat_func(s) for s in multiQC_samples] + + # Check for any missing reformatted sample names. + # Also track extra samples, these are not errors but should be reported as well. + missing_samples = set(samples) - set(reformatted_multiQC_samples) + + # check logic + if len(missing_samples) == 0: + code = FlagCode.GREEN + message = f"Found all query samples after reformatting multiQC sample names. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" + else: + code = FlagCode.HALT + message = f"Missing the following query samples: {missing_samples}. 
Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" + return {"code": code, "message": message} \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml new file mode 100644 index 00000000..20163de3 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/config.yaml @@ -0,0 +1,1308 @@ +# TOP LEVEL +NAME: "bulkRNASeq" +VERSION: "1" + +# anchors for reuse +_anchors: + rawDataDir: &rawDataDir "00-RawData" + trimDataDir: &trimDataDir "01-TG_Preproc" + alignDataDir: &alignDataDir "02-STAR_Alignment" + countsDataDir: &countsDataDir "03-RSEM_Counts" + normCountsDataDir: &normCountsDataDir "04-DESeq2_NormCounts" + DGEDataDir: &DGEDataDir "05-DESeq2_DGE" + rseqcDataDir: &rseqcDataDir "RSeQC_Analyses" # DISCUSS: Should this be renamed to "RSeQC_Analyses" for consistent casing? -J.O. , this has been renamed and differs from the recent bash based processings + ERCCAnalysisDir: &ERCCAnalysisDir "ERCC_Analysis" + FastQC_Reports: &FastQC_Reports "FastQC_Reports" + neverPublished: &neverPublished + subcategory: null + subdirectory: null + publish to repo: false + include subdirectory in table: false + table order: -1 + +Staging: + General: + Required Metadata: + From ISA: + # - ISA Field Name: Study Assay Measurement Type + # ISA Table Source: Investigation + # Investigation Subtable: STUDY ASSAYS + # Runsheet Column Name: Study Assay Measurement Type + # Processing Usage: >- + # Mapping to the appropriate processing pipeline for the assay. 
+ # Example: transcription profiling + + # - ISA Field Name: Study Assay Technology Type + # ISA Table Source: Investigation + # Investigation Subtable: STUDY ASSAYS + # Runsheet Column Name: Study Assay Technology Type + # Processing Usage: >- + # Mapping to the appropriate processing pipeline for the assay. + # Example: DNA microarray + + # - ISA Field Name: Study Assay Technology Platform + # ISA Table Source: Investigation + # Investigation Subtable: STUDY ASSAYS + # Runsheet Column Name: Study Assay Technology Platform + # Processing Usage: >- + # Mapping to the appropriate processing pipeline for the assay. + # Example: Affymetrix + + - ISA Field Name: Study Protocol Type + ISA Table Source: Investigation + Investigation Subtable: STUDY PROTOCOLS + # will return a boolean indicating if any of the following includes + True If Includes At Least One: + - spike-in quality control role + - spike-in protocol + - spike-in control + - spike-in control protocol + Runsheet Column Name: has_ERCC + Processing Usage: >- + Indicates is ERCC spike-in has been added. This can be automatically + determined from the ISA archive as well based on 'Study Protocol Name' and 'Study Protocol Type' + Example: 'TRUE' + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. 
+ Example: Arabidopsis thaliana + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + ISA Table Source: Assay + Runsheet Column Name: paired_end + Remapping: {"PAIRED":true, "Paired":true, "SINGLE":false} + Processing Usage: >- + Indicates if the sequencing was paired end. This controls how a variety of tools are invoked + including in-house written scripts. + Example: 'TRUE' + + # this entry denotes the following: + # retrive from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped to each filename + - ISA Field Name: + - Parameter Value[Merged Sequence Data File] + - Characteristics[Merged Sequence Data File] + - Raw Data File + ISA Table Source: Assay + Multiple Values Per Entry: true + Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + Runsheet Column Name: + - {'name':'read1_path', 'index':0} + - {'name':'read2_path', 'index':1, 'optional':true} + GLDS URL Mapping: true + Processing Usage: >- + Location to the raw data fastq file. May be a url or local path. + Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...' + + - ISA Field Name: Factor Value[{factor_name}] + ISA Table Source: [Assay, Sample] + Runsheet Column Name: Factor Value[{factor_name}] + Matches Multiple Columns: true + Match Regex: "Factor Value\\[.*\\]" + Append Column Following: "Unit" + Processing Usage: >- + Factor values in a study. 
Used to assign experimental groups for each sample. + Note: On the runsheet, a subsequent 'Unit' Column value will be + suffix-concatenated if it exists. + Example: Basal Control + + - ISA Field Name: Unit + ISA Table Source: [Assay, Sample] + Runsheet Column Name: null + Matches Multiple Columns: true + Autoload: false # handled by factor value loading above + Processing Usage: >- + Unit to be suffix-concatenated onto prior Factor value columns. + Example: day + + From User: + # Removed since unused by Processing via the runsheet + # - Runsheet Column Name: GLDS + # Processing Usage: >- + # The GLDS accession number + # Example: GLDS-205 + + - Runsheet Column Name: read1_path + # used to generate candidate file names for searching GLDS repository filelisting + Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"] + Processing Usage: >- + The location of either the forward reads (paired end) or only reads file (single end) + raw fastq file. Can be either a url or local path. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). + Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + + + - Runsheet Column Name: read2_path + Data Asset Keys: ["raw reverse reads fastq GZ"] + Processing Usage: >- + The location of either the reverse reads (paired end) + raw fastq file. Can be either a url or local path. + For single end studies, this should be an empty string. + Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI + may be used to retrieve urls given the array data filename (sourced from ISA archive). 
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "transcription profiling" + technology: "RNA Sequencing (RNA-Seq)" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_rna_seq_" + + # configuration related to updating investigation file + # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # LEADCAP_organism should be the studied organisms scientific name with a leading cap + Post Processing Add Study Protocol: + GeneLab RNAseq data processing protocol::{LEADCAP_organism} V1 + +data assets: + runsheet: + processed location: + - "Metadata" + - "{dataset}_bulkRNASeq_v1_runsheet.csv" + + tags: + - raw + + resource categories: *neverPublished + + ISA Archive: + processed location: + - "Metadata" + - "*-ISA.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report" + + tags: + - raw + + resource categories: *neverPublished + + raw MultiQC directory ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "raw_multiqc_report.zip" + + tags: + - raw + + resource categories: &MergedSequenceData_MultiQCReports + subcategory: Merged Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 1 + + raw forward reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R1_raw.fastq.gz" + + tags: + - raw + + resource categories: &MergedSequenceData_Fastq + subcategory: Merged Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 0 + + raw reverse reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_R2_raw.fastq.gz" + + tags: + - raw + 
+ resource categories: *MergedSequenceData_Fastq + + raw reads fastq GZ: + processed location: + - *rawDataDir + - "Fastq" + - "{sample}_raw.fastq.gz" + + tags: + - raw + + resource categories: *MergedSequenceData_Fastq + + raw forward reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + # J.Oribello: We should revisit this, fastQC includes some unique (not parsed + # into multiQC) relevant information like the actual overrepresented sequence strings + raw reverse reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC HTML: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.html" + + tags: + - raw + + resource categories: *neverPublished + + raw forward reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R1_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reverse reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_R2_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + raw reads fastQC ZIP: + processed location: + - *rawDataDir + - *FastQC_Reports + - "{sample}_raw_fastqc.zip" + + tags: + - raw + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + trimmed fastQC MultiQC directory ZIP: + processed location: + - *trimDataDir + - *FastQC_Reports + - "trimmed_multiqc_report.zip" + + tags: + - processed + + resource categories: &TrimmedSequenceData_MultiQCReports + subcategory: Trimmed Sequence Data + subdirectory: Multiqc Reports + publish to repo: true + include 
subdirectory in table: true + table order: 4 + + trimmed forward reads fastq GZ: &trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R1_trimmed.fastq.gz" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Fastq + publish to repo: true + include subdirectory in table: false + table order: 3 + + trimmed reverse reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_R2_trimmed.fastq.gz" + + tags: + - processed + + trimmed reads fastq GZ: + <<: *trimmedFastqGZ + processed location: + - *trimDataDir + - "Fastq" + - "{sample}_trimmed.fastq.gz" + + tags: + - processed + + trimmed forward reads fastQC HTML: &trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.html" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.html" + + tags: + - processed + + trimmed reads fastQC HTML: + <<: *trimmedForwardReadsFastQCHTML + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.html" + + tags: + - processed + + trimmed forward reads fastQC ZIP: &trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R1_trimmed_fastqc.zip" + + tags: + - processed + + resource categories: *neverPublished + + trimmed reverse reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_R2_trimmed_fastqc.zip" + + tags: + - processed + + trimmed reads fastQC ZIP: + <<: *trimmedForwardReadsFastQCZIP + processed location: + - *trimDataDir + - *FastQC_Reports + - "{sample}_trimmed_fastqc.zip" + + tags: + - processed + + trimming MultiQC directory: + processed location: + - *trimDataDir + - 
&trimmingReportsDir "Trimming_Reports" + - "trimming_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + forward reads trimming report: &trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R1_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + resource categories: + subcategory: Trimmed Sequence Data + subdirectory: Trimming Reports + publish to repo: true + include subdirectory in table: true + table order: 5 + + reverse reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_R2_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + reads trimming report: + <<: *trimmedForwardReadsFastQCTrimmingReport + processed location: + - *trimDataDir + - *trimmingReportsDir + - "{sample}_raw.fastq.gz_trimming_report.txt" + + tags: + - processed + + aligned MultiQC directory: + processed location: + - *alignDataDir + - "align_multiqc_report" + + resource categories: *neverPublished + + tags: + - processed + + aligned MultiQC directory ZIP: + processed location: + - *alignDataDir + - "align_multiqc_report.zip" + + tags: + - processed + + resource categories: &AlignedSequenceData_MultiQCReports + subcategory: Aligned Sequence Data # RENAME: from 'Aligned sequence data'. For consistency with Title casing across the board + subdirectory: MultiQC Reports # RENAME: from 'MultiQC Reports'. 
For consistency with Title casing across the board + publish to repo: true + include subdirectory in table: true + table order: 8 + + aligned ToTranscriptome Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.toTranscriptome.out.bam" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignedData + subcategory: Aligned Sequence Data + subdirectory: Aligned Data + publish to repo: true + include subdirectory in table: false + table order: 6 + + aligned SortedByCoord Bam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord.out.bam" + + tags: + - processed + + resource categories: *neverPublished + + aligned SortedByCoord ResortedBam: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned SortedByCoord ResortedBamIndex: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Aligned.sortedByCoord_sorted.out.bam.bai" + + tags: + - processed + + resource categories: *AlignedSequenceData_AlignedData + + aligned log Final: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.final.out" + + tags: + - processed + + resource categories: &AlignedSequenceData_AlignmentLogs + subcategory: Aligned Sequence Data + subdirectory: Alignment Logs + publish to repo: true + include subdirectory in table: true + table order: 7 + + aligned log Progress: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.progress.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned log Full: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_Log.out" + + tags: + - processed + + resource categories: *neverPublished + + aligned sjTab: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_SJ.out.tab" + + tags: + - processed + + resource categories: 
*AlignedSequenceData_AlignedData + + genebody coverage MultiQC directory: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + genebody coverage MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "geneBody_cov_multiqc_report.zip" + + tags: + - processed + + resource categories: &RSeQC_MultiQCReports + subcategory: RSeQC + subdirectory: MultiQC Reports + publish to repo: true + include subdirectory in table: true + table order: 9 + + infer experiment MultiQC directory: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + infer experiment MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "infer_exp_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + inner distance MultiQC directory: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + inner distance MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "inner_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + read distribution MultiQC directory: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report" + + tags: + - processed + + resource categories: *neverPublished + + read distribution MultiQC directory ZIP: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "read_dist_multiqc_report.zip" + + tags: + - processed + + resource categories: *RSeQC_MultiQCReports + + genebody coverage out: + processed location: + - *rseqcDataDir + - "02_geneBody_coverage" + - "{sample}" + + tags: + - processed + + # TODO: DISCUSS 
Consider this for directories that are handled the same but should validate contents + # is directory: true + # contents: + # - ["{sample}.geneBodyCoverage.r"] + # - ["{sample}.geneBodyCoverage.txt"] + # - ["{sample}.geneBodyCoverage.curves.pdf"] + + resource categories: *neverPublished + + infer experiment out: + processed location: + - *rseqcDataDir + - "03_infer_experiment" + - "{sample}_infer_expt.out" + + tags: + - processed + + resource categories: *neverPublished + + inner distance out: + processed location: + - *rseqcDataDir + - "04_inner_distance" + - "{sample}" + + tags: + - processed + + resource categories: *neverPublished + + read distribution out: + processed location: + - *rseqcDataDir + - "05_read_distribution" + - "{sample}_read_dist.out" + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report" # RENAMED from count_multiqc_report as of 4/14/2022 + + tags: + - processed + + resource categories: *neverPublished + + RSEM counts MultiQC directory ZIP: + processed location: + - *countsDataDir + - "RSEM_count_multiqc_report.zip" + + tags: + - processed + + resource categories: &RawCountsData_MultiQCReports + subcategory: Raw Counts Data + subdirectory: Multiqc Reports + publish to repo: true + include subdirectory in table: true + table order: 11 + + star number non-zero count genes table: + processed location: + - *alignDataDir + - "STAR_NumNonZeroGenes.csv" + + tags: + - processed + + resource categories: *neverPublished + + star unnormalized counts table: + processed location: + - *alignDataDir + - "STAR_Unnormalized_Counts.csv" + + tags: + - processed + + resource categories: &RawCountsTables + subcategory: Raw Counts Tables + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 12 + + rsem number non-zero count genes table: + processed location: + - *countsDataDir + - "RSEM_NumNonZeroGenes.csv" 
+ + tags: + - processed + + resource categories: *neverPublished + + rsem unnormalized counts table: + processed location: + - *countsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED from 'Unnormalized_Counts.csv' + + tags: + - processed + + resource categories: *RawCountsTables + + sample reads per gene table: + processed location: + - *alignDataDir + - "{sample}" + - "{sample}_ReadsPerGene.out.tab" + + tags: + - processed + + resource categories: *neverPublished # TODO: Discuss, should this be repo published? In what way? + + sample gene counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.genes.results" + + tags: + - processed + + resource categories: &RawCountsData_CountData + subcategory: Raw Counts Data + subdirectory: Count Data + publish to repo: true + include subdirectory in table: false + table order: 10 + + sample isoform counts table: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. + - "{sample}.isoforms.results" + + tags: + - processed + + resource categories: *RawCountsData_CountData + + sample counts stats directory: + processed location: + - *countsDataDir + # Removed - "{sample}", DISCUSS: Since this directory contains multiple files per sample, should this be nested in sample-wise dirs consistent with STAR and RSeQC. J.O. 
+ - "{sample}.stat" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "Normalized_Counts.csv" + + tags: + - processed + + resource categories: &normalizedCountsData + subcategory: Normalized Counts Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 13 + + ERCC normalized DESeq2 normalized counts table: + processed location: + - *normCountsDataDir + - "ERCC_Normalized_Counts.csv" + + tags: + - processed + + resource categories: *normalizedCountsData + + sample table: + processed location: + - *DGEDataDir + - "SampleTable.csv" + + tags: + - processed + + resource categories: &DGEAnalysisData + subcategory: Differential Expression Analysis Data + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 14 + + ERCC sample table: + processed location: + - *DGEDataDir + - &erccSubDir "ERCC_NormDGE" + - "ERCCnorm_SampleTable.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 unnormalized counts table: + processed location: + - *normCountsDataDir + - "RSEM_Unnormalized_Counts.csv" # RENAMED: from "Unnormalized_Counts.csv" for clarity + + tags: + - processed + + resource categories: *neverPublished # DISCUSS: temporary name clash resolution for publishables + + DESeq2 contrasts table: + processed location: + - *DGEDataDir + - "contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 contrasts table: + processed location: + - *DGEDataDir + - *erccSubDir + - "ERCCnorm_contrasts.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - "differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + ERCC normalized DESeq2 annotated DGE table: + processed location: + - *DGEDataDir + - 
*erccSubDir + - "ERCCnorm_differential_expression.csv" + + tags: + - processed + + resource categories: *DGEAnalysisData + + DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - "visualization_output_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 annotated DGE extended for viz table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_output_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - "visualization_PCA_table.csv" + + tags: + - processed + + resource categories: *neverPublished + + ERCC normalized DESeq2 viz PCA table: + processed location: + - *DGEDataDir + - *erccSubDir + - "visualization_PCA_table_ERCCnorm.csv" + + tags: + - processed + + resource categories: *neverPublished + + + ERCC analysis HTML: + processed location: + - *ERCCAnalysisDir + - "ERCC_analysis.html" + + tags: + - processed + + conditional on dataset: + - has_ERCC: [True] + + resource categories: + subcategory: ERCC Analyses + subdirectory: "" + publish to repo: true + include subdirectory in table: false + table order: 15 + + # NOTE: this is while the ERCC analysis sits outside the full pipeline and + # once incoporated, it should be validated for existence! + validate exists: false + +# Assets that are no longer generated by the latest pipeline +Archived Data Assets: + + # DISCUSS: When Trim Galore MQC if made clearer, publishing this should be revisited + # Currently this only reports the direct cutadapt related trimming and misses Trim-Galore + # Specific metrics. 
+ # - Jonathan Oribello + trimming MultiQC directory ZIP: + processed location: + - *trimDataDir + - *trimmingReportsDir + - "trimming_multiqc_report.zip" + + tags: + - processed + + resource categories: *neverPublished + + +data asset sets: + # These assets are not generated in the workflow, but are generated after the workflow + PUTATIVE: + - "ERCC analysis HTML" + glds metadata: + - "ISA Archive" + has ercc: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + demuliplexed paired end raw data: + - "runsheet" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + qc reports for paired end raw data: + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + paired end trimmed reads: + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + qc reports for paired end trimmed reads data: + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimming MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + demuliplexed single end raw data: + - "runsheet" + - "raw reads fastq GZ" + qc reports for single end raw data: + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + single end trimmed reads: + - "trimmed reads fastq GZ" + qc reports for single end trimmed reads data: + - "trimmed reads fastQC HTML" + - "trimmed reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - 
"trimming MultiQC directory" + - "reads trimming report" + STAR alignments: + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "sample reads per gene table" + - "star number non-zero count genes table" + - "star unnormalized counts table" + RSeQC output for paired end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + RSeQC output for single end data: + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + RSEM counts: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + is single end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw reads fastq GZ" + - "raw reads fastQC HTML" + - "raw reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed reads fastq GZ" + - "trimmed reads fastQC HTML" + - "trimmed 
reads fastQC ZIP" + - "trimming MultiQC directory" + - "reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + is paired end full: + - "runsheet" + - "ISA Archive" + - "raw MultiQC directory" + - "raw MultiQC directory ZIP" + - "raw forward reads fastq GZ" + - "raw reverse reads fastq GZ" + - "raw forward reads fastQC HTML" + - "raw reverse reads fastQC HTML" + - "raw forward reads fastQC ZIP" + - "raw reverse reads fastQC ZIP" + - "trimmed fastQC MultiQC directory" + - "trimmed fastQC MultiQC directory ZIP" + - "trimmed forward reads fastq GZ" + - "trimmed reverse reads fastq GZ" + - "trimmed forward reads fastQC HTML" + - "trimmed reverse reads fastQC HTML" + - "trimmed forward reads fastQC ZIP" + - "trimmed reverse reads fastQC ZIP" + - "trimming 
MultiQC directory" + - "forward reads trimming report" + - "reverse reads trimming report" + - "aligned MultiQC directory" + - "aligned MultiQC directory ZIP" + - "aligned ToTranscriptome Bam" + - "aligned SortedByCoord Bam" + - "aligned SortedByCoord ResortedBam" + - "aligned SortedByCoord ResortedBamIndex" + - "aligned log Final" + - "aligned log Progress" + - "aligned log Full" + - "aligned sjTab" + - "genebody coverage MultiQC directory" + - "genebody coverage MultiQC directory ZIP" + - "infer experiment MultiQC directory" + - "infer experiment MultiQC directory ZIP" + - "inner distance MultiQC directory" + - "inner distance MultiQC directory ZIP" + - "read distribution MultiQC directory" + - "read distribution MultiQC directory ZIP" + - "genebody coverage out" + - "infer experiment out" + - "inner distance out" + - "read distribution out" + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "star number non-zero count genes table" + - "star unnormalized counts table" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" + - "sample reads per gene table" + - "sample gene counts table" + - "sample isoform counts table" + - "sample counts stats directory" + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + DGE Output: + - "DESeq2 normalized counts table" + - "sample table" + - "DESeq2 unnormalized counts table" + - "DESeq2 contrasts table" + - "DESeq2 annotated DGE table" + - "DESeq2 annotated DGE extended for viz table" + - "DESeq2 viz PCA table" + ERCC DGE Output: + - "ERCC normalized DESeq2 normalized counts table" + - "ERCC sample table" + - "ERCC normalized DESeq2 contrasts table" + - "ERCC normalized DESeq2 annotated DGE table" + - "ERCC normalized DESeq2 annotated DGE extended for viz table" + - "ERCC normalized DESeq2 
viz PCA table" + # NOTE: Not part of NF_WF yet - "ERCC analysis HTML" + RSEM Output: + - "RSEM counts MultiQC directory" + - "RSEM counts MultiQC directory ZIP" + - "rsem number non-zero count genes table" + - "rsem unnormalized counts table" \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py new file mode 100644 index 00000000..3b30a6c8 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py @@ -0,0 +1,960 @@ +from pathlib import Path +import re +from typing import Union +import yaml +import logging + +from dp_tools.core.entity_model import Dataset + +log = logging.getLogger(__name__) + +from dp_tools.core.check_model import ValidationProtocol + +CONFIG = { + "Metadata-check_metadata_attributes_exist": { + "expected_attrs": ["paired_end", "has_ERCC", "organism"] + }, + "Raw Reads-check_for_outliers": { + "mqc_module": "FastQC", + "mqc_plot": "general_stats", + "mqc_keys": [ + "percent_gc", + "avg_sequence_length", + "total_sequences", + "percent_duplicates", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "Trim Reads-check_for_outliers": { + "mqc_module": "FastQC", + "mqc_plot": "general_stats", + "mqc_keys": [ + "percent_gc", + "avg_sequence_length", + "total_sequences", + "percent_duplicates", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "Raw Reads By Sample-check_fastqgz_file_contents": { + "count_lines_to_check": 200000000 + }, + "Trim Reads By Sample-check_fastqgz_file_contents": { + "count_lines_to_check": 200000000 + }, + "STAR Alignments By Sample-check_thresholds-Mapped": { + "mqc_key": "STAR", + 
"stat_string": "uniquely_mapped_percent + multimapped_percent", + "thresholds": [ + {"code": "YELLOW", "type": "lower", "value": 70}, + {"code": "RED", "type": "lower", "value": 50}, + ], + }, + "STAR Alignments By Sample-check_thresholds-MultiMapped": { + "mqc_key": "STAR", + "stat_string": "multimapped_toomany_percent + multimapped_percent", + "thresholds": [ + {"code": "YELLOW", "type": "lower", "value": 30}, + {"code": "RED", "type": "lower", "value": 15}, + ], + }, + "STAR Alignments-check_for_outliers": { + "mqc_module": "STAR", + "mqc_plot": "general_stats", + "mqc_keys": [ + "uniquely_mapped_percent", + "avg_mapped_read_length", + "mismatch_rate", + "deletion_rate", + "deletion_length", + "insertion_rate", + "insertion_length", + "multimapped_percent", + "multimapped_toomany_percent", + "unmapped_mismatches_percent", + "unmapped_tooshort_percent", + "unmapped_other_percent", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-geneBody_coverage": { + "mqc_module": "RSeQC", + "mqc_plot": "Gene Body Coverage", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-infer_experiment": { + "mqc_module": "RSeQC", + "mqc_plot": "Infer experiment", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-inner_distance": { + "mqc_module": "RSeQC", + "mqc_plot": "Inner Distance", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-read_distribution": { + 
"mqc_module": "RSeQC", + "mqc_plot": "Read Distribution", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_strandedness_assessable_from_infer_experiment": { + "stranded_assessment_range": {"max": 100, "min": 75}, + "unstranded_assessment_range": {"min": 40, "max": 60}, + "valid_dominant_strandedness_assessments": [ + "Sense (% Tags)", + "Antisense (% Tags)", + ], + }, + "RSEM Counts-check_for_outliers": { + "mqc_module": "Rsem", + "mqc_plot": "general_stats", + "mqc_keys": [ + "Unalignable", + "Alignable", + "Filtered", + "Total", + "alignable_percent", + "Unique", + "Multi", + "Uncertain", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, +} + + +def validate_bulkRNASeq( + dataset: Dataset, + config_path: Path = None, + run_args: dict = None, + report_args: dict = None, + protocol_args: dict = None, + defer_run: bool = False, +) -> Union[ValidationProtocol, ValidationProtocol.Report]: + + if config_path is not None: + with open(config_path, "r") as f: + config = yaml.safe_load(f) + else: + config = CONFIG + + if run_args is None: + run_args = dict() + + if report_args is None: + report_args = dict() + + if protocol_args is None: + protocol_args = dict() + # init validation protocol + vp = ValidationProtocol(**protocol_args) + # fmt: on + with vp.component_start( + name=dataset.name, + description="Validate processing from trim reads through differential gene expression output", + ): + + with vp.component_start( + name="Metadata", description="Metadata file validation" + ): + with vp.payload(payloads=[{"dataset": dataset}]): + vp.add( + check_metadata_attributes_exist, + config=config["Metadata-check_metadata_attributes_exist"], + ) + + with vp.component_start( + name="Raw Reads", description="Raw Reads 
Outliers Detection" + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["raw reads fastQC ZIP"], + } + ] + if not dataset.metadata["paired_end"] + else [ + { + "dataset": dataset, + "data_asset_keys": [ + "raw forward reads fastQC ZIP", + ], + }, + { + "dataset": dataset, + "data_asset_keys": [ + "raw reverse reads fastQC ZIP", + ], + }, + ] + ): + vp.add( + check_for_outliers, config=config["Raw Reads-check_for_outliers"] + ) + + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "raw MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_raw|_R1_raw|_R2_raw$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in raw reads multiQC report", + ) + + with vp.component_start( + name="Trim Reads", description="Trimmed Reads Outliers Detection" + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["trimmed reads fastQC ZIP"], + } + ] + if not dataset.metadata["paired_end"] + else [ + { + "dataset": dataset, + "data_asset_keys": [ + "trimmed forward reads fastQC ZIP", + ], + }, + { + "dataset": dataset, + "data_asset_keys": [ + "trimmed reverse reads fastQC ZIP", + ], + }, + ] + ): + vp.add( + check_for_outliers, config=config["Trim Reads-check_for_outliers"] + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "trimmed fastQC MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_R1|_R2$", "", s + ), + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "trimming MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_raw|_R1_raw|_R2_raw$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check that all samples are present in the 
trimmed FastQC and trimming report multiQC reports", + ) + with vp.component_start( + name="STAR Alignments", + description="Dataset wide checks including outliers detection", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["aligned log Final"], + } + ] + ): + vp.add( + check_for_outliers, + config=config["STAR Alignments-check_for_outliers"], + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "aligned MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in STAR multiQC report", + ) + + with vp.component_start( + name="RSeQC", + description="RSeQC submodule outliers checking and other submodule specific dataset wide checks", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["genebody coverage out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in geneBody Coverage", + config=config["RSeQC-check_for_outliers-geneBody_coverage"], + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["infer experiment out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in infer experiment", + config=config["RSeQC-check_for_outliers-infer_experiment"], + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["inner distance out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in inner distance", + config=config["RSeQC-check_for_outliers-inner_distance"], + skip=(not dataset.metadata["paired_end"]), + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["read distribution out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in read distribution", + config=config["RSeQC-check_for_outliers-read_distribution"], + ) + + with 
vp.payload(payloads=[{"dataset": dataset}]): + vp.add( + check_strandedness_assessable_from_infer_experiment, + config=config[ + "RSeQC-check_strandedness_assessable_from_infer_experiment" + ], + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "genebody coverage MultiQC directory" + ].path, + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "infer experiment MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_infer_expt$", "", s + ), + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "read distribution MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_read_dist$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSeQC multiQC reports", + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "inner distance MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSeQC inner distance multiQC report (paired end only)", + skip=(not dataset.metadata["paired_end"]), + ) + with vp.component_start( + name="RSEM Counts", + description="Dataset wide checks including outliers detection", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["sample counts stats directory"], + } + ] + ): + vp.add( + check_for_outliers, config=config["RSEM Counts-check_for_outliers"] + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "RSEM counts MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSEM multiQC report", + ) + with 
vp.component_start( + name="Unnormalized Gene Counts", + description="Validate normalization related output", + ): + + with vp.payload( + payloads=[ + { + "unnormalizedCountTable": lambda: dataset.data_assets[ + "star unnormalized counts table" + ].path, + "samplewise_tables": lambda: { + s.name: s.data_assets["sample reads per gene table"].path + for s in dataset.samples.values() + }, + }, + ] + ): + vp.add( + check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables + ) + with vp.payload( + payloads=[ + { + "unnormalizedCountTable": lambda: dataset.data_assets[ + "rsem unnormalized counts table" + ].path, + "samplewise_tables": lambda: { + s.name: s.data_assets["sample gene counts table"].path + for s in dataset.samples.values() + }, + }, + ] + ): + vp.add( + check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables + ) + vp.add( + check_ERCC_subgroup_representation, + skip=(not dataset.metadata["has_ERCC"]), + ) + + with vp.component_start( + name="DGE Metadata", + description="", + ): + + with vp.component_start( + name="Sample Table", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "sampleTable": lambda: dataset.data_assets[ + "sample table" + ].path, + } + ] + ): + vp.add( + check_sample_table_against_runsheet, + config={"all_samples_required": True}, + ) + vp.add(check_sample_table_for_correct_group_assignments) + + with vp.component_start( + name="Contrasts Tables", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "contrasts_table": lambda: dataset.data_assets[ + "DESeq2 contrasts table" + ].path, + } + ] + ): + vp.add(check_contrasts_table_headers) + vp.add(check_contrasts_table_rows) + + with vp.component_start( + name="DGE Metadata ERCC", + description="", + skip=(not dataset.metadata["has_ERCC"]), + ): + + with vp.component_start( + name="Sample Table", + 
description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "sampleTable": lambda: dataset.data_assets[ + "ERCC sample table" + ].path, + } + ] + ): + vp.add( + check_sample_table_against_runsheet, + config={"all_samples_required": False}, + ) + vp.add(check_sample_table_for_correct_group_assignments) + + with vp.component_start( + name="Contrasts Tables", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "contrasts_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 contrasts table" + ].path, + } + ] + ): + vp.add(check_contrasts_table_headers) + vp.add(check_contrasts_table_rows) + + with vp.component_start( + name="DGE Output", + description="", + ): + with vp.payload( + payloads=[ + { + "rsem_table_path": lambda: dataset.data_assets[ + "rsem unnormalized counts table" + ].path, + "deseq2_table_path": lambda: dataset.data_assets[ + "DESeq2 unnormalized counts table" + ].path, + } + ] + ): + vp.add( + check_rsem_counts_and_unnormalized_tables_parity, + skip=( + "rsem unnormalized counts table" not in dataset.data_assets + or "DESeq2 unnormalized counts table" not in dataset.data_assets + ), + ) + + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set(dataset.samples), + "dge_table": lambda: dataset.data_assets[ + "DESeq2 annotated DGE table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + 
vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + + with vp.component_start( + name="Viz Tables", + description="Extended from the dge tables", + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set(dataset.samples), + "dge_table": lambda: dataset.data_assets[ + "DESeq2 annotated DGE extended for viz table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + vp.add(check_viz_table_columns_exist) + vp.add(check_viz_table_columns_constraints) + + with vp.payload( + payloads=[ + { + "samples": lambda: set(dataset.samples), + "pca_table": lambda: dataset.data_assets[ + "DESeq2 viz PCA table" + ].path, + } + ] + ): + vp.add(check_viz_pca_table_index_and_columns_exist) + + with vp.component_start( + name="DGE Output ERCC", + description="", + skip=(not dataset.metadata["has_ERCC"]), + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "dge_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 annotated DGE table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + 
vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + + with vp.component_start( + name="Viz Tables", + description="Extended from the dge tables", + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "dge_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 annotated DGE extended for viz table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + vp.add(check_viz_table_columns_exist) + vp.add(check_viz_table_columns_constraints) + + with vp.payload( + payloads=[ + { + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "pca_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 viz PCA table" + ].path, + } + ] + ): + vp.add(check_viz_pca_table_index_and_columns_exist) + + for sample in dataset.samples.values(): + with vp.component_start( + name=sample.name, 
description="Samples level checks" + ): + with vp.component_start( + name="Raw Reads By Sample", description="Raw reads" + ): + with vp.payload( + payloads=( + [ + { + "file": lambda: sample.data_assets[ + "raw forward reads fastq GZ" + ].path + }, + { + "file": lambda: sample.data_assets[ + "raw reverse reads fastq GZ" + ].path + }, + ] + if dataset.metadata["paired_end"] + else [ + { + "file": lambda: sample.data_assets[ + "raw reads fastq GZ" + ].path + }, + ] + ) + ): + vp.add( + check_fastqgz_file_contents, + config=config[ + "Raw Reads By Sample-check_fastqgz_file_contents" + ], + ) + vp.add( + check_gzip_file_integrity, + ) + with vp.payload( + payloads=[ + { + "sample": sample, + "reads_key_1": "raw forward reads fastQC ZIP", + "reads_key_2": "raw reverse reads fastQC ZIP", + }, + ], + ): + vp.add( + check_forward_and_reverse_reads_counts_match, + skip=(not dataset.metadata["paired_end"]), + ) + with vp.component_start( + name="Trimmed Reads By Sample", description="Trimmed reads" + ): + with vp.payload( + payloads=( + [ + { + "file": lambda: sample.data_assets[ + "trimmed forward reads fastq GZ" + ].path + }, + { + "file": lambda: sample.data_assets[ + "trimmed reverse reads fastq GZ" + ].path + }, + ] + if dataset.metadata["paired_end"] + else [ + { + "file": lambda: sample.data_assets[ + "trimmed reads fastq GZ" + ].path + } + ] + ) + ): + vp.add(check_file_exists, description="Check reads files exist") + vp.add( + check_fastqgz_file_contents, + config=config[ + "Trim Reads By Sample-check_fastqgz_file_contents" + ], + ) + + with vp.payload( + payloads=[ + { + "sample": sample, + "reads_key_1": "trimmed forward reads fastQC ZIP", + "reads_key_2": "trimmed reverse reads fastQC ZIP", + }, + ], + ): + vp.add( + check_forward_and_reverse_reads_counts_match, + skip=(not dataset.metadata["paired_end"]), + ) + + with vp.component_start( + name="STAR Alignments By Sample", + description="STAR Alignment outputs", + ): + + with vp.payload( + payloads=[ + { + 
"file": lambda: sample.data_assets[ + "aligned ToTranscriptome Bam" + ].path, + }, + { + "file": lambda: sample.data_assets[ + "aligned SortedByCoord Bam" + ].path, + }, + ] + ): + vp.add( + check_bam_file_integrity, + config={ + "samtools_bin": "samtools" + }, # assumes accessible on path already + ) + + with vp.payload( + payloads=[ + { + "multiqc_inputs": lambda: [ + sample.data_assets["aligned log Final"].path + ], + }, + ] + ): + vp.add( + check_thresholds, + config=config[ + "STAR Alignments By Sample-check_thresholds-Mapped" + ], + description="Check that mapping rates are reasonable, specifically most reads map to the target genome", + ) + vp.add( + check_thresholds, + config=config[ + "STAR Alignments By Sample-check_thresholds-MultiMapped" + ], + description="Check that mapping rates are reasonable, specifically that a considerable amount of reads multimap to the target genome", + ) + + with vp.component_start( + name="RSeQC By Sample", + description="RNASeq QA outputs", + ): + with vp.component_start( + name="geneBody_coverage", + description="Assess integrity of transcripts and library prep signatures", + ): + with vp.payload( + payloads=[ + { + "input_dir": lambda: sample.data_assets[ + "genebody coverage out" + ].path + }, + ] + ): + vp.add(check_genebody_coverage_output) + with vp.component_start( + name="inner_distance", + description="Reports on distance between mate reads based on gene annotations", + skip=(not dataset.metadata["paired_end"]), + ): + with vp.payload( + payloads=[ + { + "input_dir": lambda: sample.data_assets[ + "inner distance out" + ].path + }, + ] + ): + vp.add(check_inner_distance_output) + # return protocol object without running or generating a report + if defer_run: + return vp + + vp.run(**run_args) + + # return report + return vp.report(**report_args, combine_with_flags=dataset.loaded_assets_dicts) \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py 
b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py new file mode 100644 index 00000000..f12de761 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/schemas.py @@ -0,0 +1,33 @@ +import pandas as pd + +class runsheet: # Bad casing since we will use the class definition itself for all static methods + + @staticmethod + def check_single_value(column: pd.Series, error_msg: str, errors: list[str]) -> None: + if len(column.unique()) != 1: + errors.append(error_msg) + + @staticmethod + def check_read2_path_populated_if_paired_end(df: pd.DataFrame, errors: list[str]) -> None: + if (("read2_path" in df.columns and df['paired_end'][0] == True) or + ("read2_path" not in df.columns and df['paired_end'][0] == False)): + return + else: + errors.append("Expected 'read2_path' to be populated only if paired_end is True") + + @staticmethod + def validate(df_runsheet: pd.DataFrame) -> bool: + errors = [] + + # Check for single value in specified columns + runsheet.check_single_value(df_runsheet['has_ERCC'], "Dataset level columns do NOT contain one unique value for 'has_ERCC'", errors) + runsheet.check_single_value(df_runsheet['organism'], "Dataset level columns do NOT contain one unique value for 'organism'", errors) + runsheet.check_single_value(df_runsheet['paired_end'], "Dataset level columns do NOT contain one unique value for 'paired_end'", errors) + + # Check for 'read2_path' population if paired_end is True + runsheet.check_read2_path_populated_if_paired_end(df_runsheet, errors) + + if errors: + raise ValueError("\n".join(errors)) + else: + return True From f15c09965b3b69ed1a67da891871a56731dd3f6c Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 9 May 2023 20:40:33 +0000 Subject: [PATCH 06/58] feat: convert from dp_tools 1.1.8 style usage to 1.3.2 (plugin) --- .../config/software/by_docker_image.config | 6 ++-- .../modules/RUNSHEET_FROM_GLDS.nf | 28 +++++++++++++++++++ 
.../NF_RCP-F/workflow_code/stage_analysis.nf | 15 ++++++---- 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config index 0166b683..724890c0 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config @@ -45,12 +45,12 @@ process { container = "quay.io/biocontainers/rsem:1.3.1--pl526r341h4f16992_0" } - withName: 'RNASEQ_RUNSHEET_FROM_GLDS|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|SOFTWARE_VERSIONS' { - container = "quay.io/j_81/dp_tools:1.1.8" + withName: 'RUNSHEET_FROM_GLDS|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|SOFTWARE_VERSIONS' { + container = "quay.io/j_81/dp_tools:1.3.2" } withLabel: 'VV' { - container = "quay.io/j_81/dp_tools:1.1.8" + container = "quay.io/j_81/dp_tools:1.3.2" } withName: 'GET_MAX_READ_LENGTH|ASSESS_STRANDEDNESS' { diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf new file mode 100644 index 00000000..e5d003f7 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/RUNSHEET_FROM_GLDS.nf @@ -0,0 +1,28 @@ +process RUNSHEET_FROM_GLDS { + // Downloads isa Archive and creates runsheet using GeneLab API + tag "${ gldsAccession }" + publishDir "${ params.outputDir }/${ gldsAccession }/Metadata", + pattern: "*.{zip,csv}", + mode: params.publish_dir_mode + + input: + // TEMP: RESTORE ONCE OSD SUPPORT ADDED val(osdAccession) + val(gldsAccession) + path(dp_tools_plugin) + + output: + path("${ gldsAccession }_*_v?_runsheet.csv"), emit: runsheet + path("*.zip"), emit: isaArchive + + 
script: + def injects = params.biomart_attribute ? "--inject biomart_attribute='${ params.biomart_attribute }'" : '' + """ + + dpt-get-isa-archive --accession ${ gldsAccession } + ls ${dp_tools_plugin} + + dpt-isa-to-runsheet --accession ${ gldsAccession } \ + --plugin-dir ${dp_tools_plugin} \ + --isa-archive *.zip ${ injects } + """ +} \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf index 5fc69522..4593f022 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/stage_analysis.nf @@ -15,10 +15,10 @@ def mutate_to_single_end(it) { } // Import process from separate module file -include { RNASEQ_RUNSHEET_FROM_GLDS as GENERATE_RUNSHEET; - GENERATE_METASHEET; +include { GENERATE_METASHEET; STAGE_RAW_READS; get_runsheet_paths } from'./modules/genelab.nf' +include { RUNSHEET_FROM_GLDS } from './modules/RUNSHEET_FROM_GLDS.nf' /************************************************** * ACTUAL WORKFLOW ******************************** @@ -31,9 +31,12 @@ workflow staging{ sample_limit = params.limitSamplesTo ? params.limitSamplesTo : -1 // -1 in take means no limit if (!params.runsheetPath) { - ch_glds_accession | GENERATE_RUNSHEET - GENERATE_RUNSHEET.out.runsheet | set{ ch_runsheet } - GENERATE_METASHEET( GENERATE_RUNSHEET.out.isazip, GENERATE_RUNSHEET.out.runsheet ) + RUNSHEET_FROM_GLDS( + ch_glds_accession, + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin + ) + RUNSHEET_FROM_GLDS.out.runsheet | set{ ch_runsheet } + GENERATE_METASHEET( RUNSHEET_FROM_GLDS.out.isaArchive, RUNSHEET_FROM_GLDS.out.runsheet ) } else { ch_runsheet = channel.fromPath(params.runsheetPath) } @@ -86,7 +89,7 @@ workflow staging{ emit: raw_reads = stageLocal ? STAGE_RAW_READS.out : null - isa = params.runsheetPath ? 
null : GENERATE_RUNSHEET.out.isazip + isa = params.runsheetPath ? null : RUNSHEET_FROM_GLDS.out.isaArchive runsheet = ch_runsheet metasheet = params.runsheetPath ? null : GENERATE_METASHEET.out.metasheet } From 36e0e69dfbc632df26274e5f5cf729f6053e1354 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 9 May 2023 20:41:31 +0000 Subject: [PATCH 07/58] feat: 48 fast test validates runsheet migration Prior, runsheet via 1.1.8 Now, runsheet via 1.3.2 + plugin Identical output expected and attained --- .../NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test | 1 + .../NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test index 7c6438b7..17dd250a 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test @@ -2,6 +2,7 @@ nextflow_pipeline { name "Test Workflow main.nf" script "main.nf" + tag "core" test("GLDS-48:Mouse,SingleEnd,NonERCC: Should run without failures") { diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap index b3f56516..eb4ad0e6 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/main.nf.GLDS48.test.snap @@ -61,7 +61,7 @@ "Mmus_C57-6J_LVR_GC_C_Rep5_M40_infer_expt.out:md5,cf4b9b80507493cd8232113620b4a765", "Mmus_C57-6J_LVR_GC_I_Rep2_M32_read_dist.out:md5,fe7bfe2aa0a2774fa2754ee0485c8b6e" ], - "timestamp": "2023-01-25T23:47:22+0000" + "timestamp": "2023-05-09T20:33:40+0000" }, "GLDS-48:Mouse,SingleEnd,NonERCC: Should run without failures": { "content": [ @@ -126,6 +126,6 @@ 
"Mmus_C57-6J_LVR_GC_C_Rep5_M40_infer_expt.out:md5,cf4b9b80507493cd8232113620b4a765", "Mmus_C57-6J_LVR_GC_I_Rep2_M32_read_dist.out:md5,fe7bfe2aa0a2774fa2754ee0485c8b6e" ], - "timestamp": "2023-01-25T23:47:23+0000" + "timestamp": "2023-05-09T20:33:40+0000" } } \ No newline at end of file From b3684a4c1db5df06eab20916ef7e130c410c147c Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 10 May 2023 23:46:07 +0000 Subject: [PATCH 08/58] feat: finish migration to updated dp_tools --- .../GL-DPPD-7101-F.md | 2 +- .../bin/dp_tools__NF_RCP/protocol.py | 41 +++++- .../config/software/by_docker_image.config | 4 +- .../NF_RCP-F/workflow_code/main.nf | 8 +- .../NF_RCP-F/workflow_code/modules/vv.nf | 122 ++++++++---------- 5 files changed, 102 insertions(+), 75 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 363d9a36..e1a83b6e 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -122,7 +122,7 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform |tximport|1.27.1|[https://github.com/mikelove/tximport](https://github.com/mikelove/tximport)| |tidyverse|1.3.1|[https://www.tidyverse.org](https://www.tidyverse.org)| |stringr|1.4.1|[https://github.com/tidyverse/stringr](https://github.com/tidyverse/stringr)| -|dp_tools|1.1.8|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| +|dp_tools|1.3.3|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| |pandas|1.5.0|[https://github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas)| |seaborn|0.12.0|[https://seaborn.pydata.org/](https://seaborn.pydata.org/)| |matplotlib|3.6.0|[https://matplotlib.org/stable](https://matplotlib.org/stable)| diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py 
b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py index 3b30a6c8..ef6f9699 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py @@ -10,6 +10,8 @@ from dp_tools.core.check_model import ValidationProtocol +from .checks import * + CONFIG = { "Metadata-check_metadata_attributes_exist": { "expected_attrs": ["paired_end", "has_ERCC", "organism"] @@ -150,8 +152,27 @@ }, } - -def validate_bulkRNASeq( +# Manual kept in sync for now +COMPONENTS_LIST = [ + "Metadata", # for raw reads V&V + "Raw Reads", # for raw reads V&V + "Raw Reads By Sample", # for raw reads V&V + "Trim Reads", # for trim reads V&V + "Trimmed Reads By Sample", # for trim reads V&V + "STAR Alignments", # for star alignment V&V + "STAR Alignments By Sample", # for star alignment V&V + "RSeQC By Sample", # for RSeQC V&V + "RSeQC", # for RSeQC V&V + "RSEM Counts", # for after RSEM V&V + "Unnormalized Gene Counts", # for after RSEM V&V + "DGE Metadata", # for post DGE + "DGE Metadata ERCC", # for post DGE + "DGE Output", # for post DGE + "DGE Output ERCC", # for post DGE +] + + +def validate( dataset: Dataset, config_path: Path = None, run_args: dict = None, @@ -174,6 +195,22 @@ def validate_bulkRNASeq( if protocol_args is None: protocol_args = dict() + + # Modify protocol_args to convert run_components to skip_components based on COMPONENTS_LIST + if ( + "run_components" in protocol_args + and protocol_args.get("run_components") is not None + ): + protocol_args["skip_components"] = [ + c for c in COMPONENTS_LIST if c not in protocol_args["run_components"] + ] + # Check if any run components are not in COMPONENTS_LIST + if set(protocol_args["run_components"]) - set(COMPONENTS_LIST): + raise ValueError( + f"run_components contains components not in COMPONENTS_LIST. 
Unique to run_components: {set(protocol_args['run_components']) - set(COMPONENTS_LIST)}. All Components: {COMPONENTS_LIST}" + ) + del protocol_args["run_components"] + # init validation protocol vp = ValidationProtocol(**protocol_args) # fmt: on diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config index 724890c0..34cc3370 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_docker_image.config @@ -46,11 +46,11 @@ process { } withName: 'RUNSHEET_FROM_GLDS|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|SOFTWARE_VERSIONS' { - container = "quay.io/j_81/dp_tools:1.3.2" + container = "quay.io/j_81/dp_tools:1.3.3" } withLabel: 'VV' { - container = "quay.io/j_81/dp_tools:1.3.2" + container = "quay.io/j_81/dp_tools:1.3.3" } withName: 'GET_MAX_READ_LENGTH|ASSESS_STRANDEDNESS' { diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf index 1862ef81..916f3d88 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf @@ -240,6 +240,7 @@ workflow { RAW_FASTQC.out.fastqc | map { it -> [ it[1], it[2] ] } | flatten | collect, RAW_MULTIQC.out.zipped_report, RAW_MULTIQC.out.unzipped_report, + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin ) VV_TRIMMED_READS( ch_meta, STAGING.out.runsheet, @@ -250,13 +251,15 @@ workflow { TRIMGALORE.out.reports | collect, TRIM_MULTIQC.out.zipped_report, TRIM_MULTIQC.out.unzipped_report, + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin ) VV_STAR_ALIGNMENTS( STAGING.out.runsheet, ALIGN_STAR.out.publishables | collect, QUANTIFY_STAR_GENES.out.publishables | collect, ALIGN_MULTIQC.out.zipped_report, 
ALIGN_MULTIQC.out.unzipped_report, - STRANDEDNESS.out.bam_bed | collect + STRANDEDNESS.out.bam_bed | collect, + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin ) VV_RSEQC( ch_meta, STAGING.out.runsheet, @@ -265,12 +268,14 @@ workflow { STRANDEDNESS.out.infer_experiment_multiqc, STRANDEDNESS.out.inner_distance_multiqc, STRANDEDNESS.out.read_distribution_multiqc, + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin ) VV_RSEM_COUNTS( STAGING.out.runsheet, COUNT_ALIGNED.out.only_counts | collect, QUANTIFY_RSEM_GENES.out.publishables, COUNT_MULTIQC.out.zipped_report, COUNT_MULTIQC.out.unzipped_report, + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin ) VV_DESEQ2_ANALYSIS( ch_meta, STAGING.out.runsheet, @@ -281,6 +286,7 @@ workflow { DGE_BY_DESEQ2.out.dge, DGE_BY_DESEQ2.out.norm_counts_ercc | ifEmpty( { file("NO_FILES.placeholder") }), DGE_BY_DESEQ2.out.dge_ercc | ifEmpty( { file("NO_FILES.placeholder") }), + "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin ) // Software Version Capturing diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf index ae929f44..17842f69 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/vv.nf @@ -19,11 +19,12 @@ process VV_RAW_READS { input: val(meta) - path("VV_INPUT/Metadata/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC + path("VV_INPUT/Metadata/*") // runsheet path("VV_INPUT/00-RawData/Fastq/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC path("VV_INPUT/00-RawData/FastQC_Reports/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC path("VV_INPUT/00-RawData/FastQC_Reports/*") // While files from processing are staged, we instead 
want to use the files located in the publishDir for QC path("VV_INPUT/00-RawData/FastQC_Reports/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC + path(dp_tools__NF_RCP) output: val(meta) @@ -38,21 +39,17 @@ process VV_RAW_READS { """ # move from VV_INPUT to task directory # This allows detection as output files for publishing - mv VV_INPUT/* . + mv VV_INPUT/* . || true # Run V&V unless user requests to skip V&V - if ${ !params.skipVV} ; then - VV_data_assets.py --root-path . \\ - --accession ${ params.gldsAccession } \\ - --runsheet-path Metadata/*_runsheet.csv \\ - --data-asset-sets \\ - ${ meta.paired_end ? "'demuliplexed paired end raw data'" : "'demuliplexed single end raw data'"} \\ - ${ meta.paired_end ? "'qc reports for paired end raw data'" : "'qc reports for single end raw data'"} \\ + if ${ !params.skipVV } ; then + dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\ + --data-asset-key-sets \\ + ${ meta.paired_end ? "'demuliplexed paired end raw data,qc reports for paired end raw data'" : "'demuliplexed single end raw data,qc reports for single end raw data'"} \\ --run-components \\ - 'Metadata' \\ - 'Raw Reads' \\ - 'Raw Reads By Sample' \\ - --max-flag-code ${ params.max_flag_code } + 'Metadata,Raw Reads,Raw Reads By Sample' \\ + --max-flag-code ${ params.max_flag_code } \\ + --output VV_log.tsv fi """ } @@ -79,6 +76,7 @@ process VV_TRIMMED_READS { path("VV_INPUT/01-TG_Preproc/Trimming_Reports/*") // trimming reports path("VV_INPUT/01-TG_Preproc/Trimming_Reports/*") // trimming reports multiqc zipped report path("VV_INPUT/01-TG_Preproc/Trimming_Reports/*") // trimming reports multiqc unzipped report + path(dp_tools__NF_RCP) output: path("01-TG_Preproc/Fastq"), emit: VVed_trimmed_reads @@ -92,20 +90,18 @@ process VV_TRIMMED_READS { """ # move from VV_INPUT to task directory # This allows detection as output files for publishing - mv VV_INPUT/* . + mv VV_INPUT/* . 
|| true + # Run V&V unless user requests to skip V&V - if ${ !params.skipVV} ; then - VV_data_assets.py --root-path . \\ - --accession ${ params.gldsAccession } \\ - --runsheet-path Metadata/*_runsheet.csv \\ - --data-asset-sets \\ - ${ meta.paired_end ? "'paired end trimmed reads'" : "'single end trimmed reads'"} \\ - ${ meta.paired_end ? "'qc reports for paired end trimmed reads data'" : "'qc reports for single end trimmed reads data'"} \\ + if ${ !params.skipVV } ; then + dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\ + --data-asset-key-sets \\ + ${ meta.paired_end ? "'paired end trimmed reads,qc reports for paired end trimmed reads data'" : "'single end trimmed reads,qc reports for single end trimmed reads data'"} \\ --run-components \\ - 'Trim Reads' \\ - 'Trimmed Reads By Sample' \\ - --max-flag-code ${ params.max_flag_code } + 'Trim Reads,Trimmed Reads By Sample' \\ + --max-flag-code ${ params.max_flag_code } \\ + --output VV_log.tsv fi """ } @@ -129,6 +125,7 @@ process VV_STAR_ALIGNMENTS { path("VV_INPUT/02-STAR_Alignment/*") // zipped multiqc report path("VV_INPUT/02-STAR_Alignment/*") // unzipped multiqc report path("VV_INPUT/02-STAR_Alignment/*") // reindexed, sorted bam/bed files + path(dp_tools__NF_RCP) output: path("02-STAR_Alignment") @@ -138,20 +135,18 @@ process VV_STAR_ALIGNMENTS { """ # move from VV_INPUT to task directory # This allows detection as output files for publishing - mv VV_INPUT/* . + mv VV_INPUT/* . || true sort_into_subdirectories_by_sample.py 02-STAR_Alignment 02-STAR_Alignment '_*' # Run V&V unless user requests to skip V&V - if ${ !params.skipVV} ; then - VV_data_assets.py --root-path . \\ - --accession ${ params.gldsAccession } \\ - --runsheet-path Metadata/*_runsheet.csv \\ - --data-asset-sets \\ + if ${ !params.skipVV } ; then + dpt validation run ${dp_tools__NF_RCP} . 
Metadata/*_runsheet.csv \\ + --data-asset-key-sets \\ 'STAR alignments' \\ --run-components \\ - 'STAR Alignments' \\ - 'STAR Alignments By Sample' \\ - --max-flag-code ${ params.max_flag_code } + 'STAR Alignments,STAR Alignments By Sample' \\ + --max-flag-code ${ params.max_flag_code } \\ + --output VV_log.tsv fi """ @@ -176,7 +171,7 @@ process VV_RSEQC { path("VV_INPUT/RSeQC_Analyses/03_infer_experiment/*") // genebody multiqc path("VV_INPUT/RSeQC_Analyses/04_inner_distance/*") // genebody multiqc path("VV_INPUT/RSeQC_Analyses/05_read_distribution/*") // genebody multiqc - + path(dp_tools__NF_RCP) output: path("RSeQC_Analyses") @@ -186,7 +181,7 @@ process VV_RSEQC { """ # move from VV_INPUT to task directory # This allows detection as output files for publishing - mv VV_INPUT/* . + mv VV_INPUT/* . || true sort_into_subdirectories_by_sample.py RSeQC_Analyses RSeQC_Analyses/02_geneBody_coverage '.geneBodyCoverage.txt' sort_into_subdirectories_by_sample.py RSeQC_Analyses RSeQC_Analyses/02_geneBody_coverage '.geneBodyCoverage.curves.pdf' sort_into_subdirectories_by_sample.py RSeQC_Analyses RSeQC_Analyses/02_geneBody_coverage '.geneBodyCoverage.r' @@ -199,18 +194,15 @@ process VV_RSEQC { # These are not in sub directories: sort_into_subdirectories_by_sample.py RSeQC_Analyses/05_read_distribution RSeQC_Analyses/05_read_distribution '_read_dist.out' mv RSeQC_Analyses/*_read_dist.out RSeQC_Analyses/05_read_distribution - # Run V&V unless user requests to skip V&V - if ${ !params.skipVV} ; then - VV_data_assets.py --root-path . \\ - --accession ${ params.gldsAccession } \\ - --runsheet-path Metadata/*_runsheet.csv \\ - --data-asset-sets \\ + if ${ !params.skipVV } ; then + dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\ + --data-asset-key-sets \\ ${ meta.paired_end ? 
"'RSeQC output for paired end data'" : "'RSeQC output for single end data'"} \\ --run-components \\ - 'RSeQC' \\ - 'RSeQC By Sample' \\ - --max-flag-code ${ params.max_flag_code } + 'RSeQC,RSeQC By Sample' \\ + --max-flag-code ${ params.max_flag_code } \\ + --output VV_log.tsv fi # Remove all placeholder files and empty directories to prevent publishing @@ -239,7 +231,7 @@ process VV_RSEM_COUNTS { path("VV_INPUT/03-RSEM_Counts/*") // RSEM dataset output path("VV_INPUT/03-RSEM_Counts/*") // zipped multiqc report path("VV_INPUT/03-RSEM_Counts/*") // unzipped multiqc report - + path(dp_tools__NF_RCP) output: path("03-RSEM_Counts") @@ -249,18 +241,17 @@ process VV_RSEM_COUNTS { """ # move from VV_INPUT to task directory # This allows detection as output files for publishing - mv VV_INPUT/* . + mv VV_INPUT/* . || true # Run V&V unless user requests to skip V&V - if ${ !params.skipVV} ; then - VV_data_assets.py --root-path . \\ - --accession ${ params.gldsAccession } \\ - --runsheet-path Metadata/*_runsheet.csv \\ - --data-asset-sets \\ + if ${ !params.skipVV } ; then + dpt validation run ${dp_tools__NF_RCP} . 
Metadata/*_runsheet.csv \\ + --data-asset-key-sets \\ 'RSEM counts' \\ --run-components \\ 'RSEM Counts' \\ - --max-flag-code ${ params.max_flag_code } + --max-flag-code ${ params.max_flag_code } \\ + --output VV_log.tsv fi """ } @@ -285,9 +276,9 @@ process VV_DESEQ2_ANALYSIS { path("VV_INPUT/03-RSEM_Counts/*") // unzipped multiqc report path("VV_INPUT/04-DESeq2_NormCounts/*") // norm counts files path("VV_INPUT/05-DESeq2_DGE/*") // dge files - path("VV_INPUT/04-DESeq2_NormCounts/*") // ERCC norm counts files + path("VV_INPUT/ 04-DESeq2_NormCounts/*") // ERCC norm counts files path("VV_INPUT/05-DESeq2_DGE/ERCC_NormDGE/*") // ERCC dge files - + path(dp_tools__NF_RCP) output: path("04-DESeq2_NormCounts") @@ -298,25 +289,18 @@ process VV_DESEQ2_ANALYSIS { """ # move from VV_INPUT to task directory # This allows detection as output files for publishing - mv VV_INPUT/* . + mv VV_INPUT/* . || true # Run V&V unless user requests to skip V&V - if ${ !params.skipVV} ; then - VV_data_assets.py --root-path . \\ - --accession ${ params.gldsAccession } \\ - --runsheet-path Metadata/*_runsheet.csv \\ - --data-asset-sets \\ - 'RSEM Output' \\ - 'DGE Output' \\ - ${ meta.has_ercc ? "'ERCC DGE Output'" : ''} \\ + if ${ !params.skipVV } ; then + dpt validation run ${dp_tools__NF_RCP} . Metadata/*_runsheet.csv \\ + --data-asset-key-sets \\ + 'RSEM Output,DGE Output${ meta.has_ercc ? ",ERCC DGE Output" : ''}' \\ --run-components \\ - 'DGE Metadata' \\ - ${ meta.has_ercc ? "'DGE Metadata ERCC'" : '' } \\ - 'DGE Output' \\ - ${ meta.has_ercc ? "'DGE Output ERCC'" : '' } \\ - --max-flag-code ${ params.max_flag_code } + 'DGE Metadata${ meta.has_ercc ? ",DGE Metadata ERCC" : '' },DGE Output${ meta.has_ercc ? ",DGE Output ERCC" : '' }' \\ + --max-flag-code ${ params.max_flag_code } \\ + --output VV_log.tsv fi - # Remove all placeholder files and empty directories to prevent publishing find . -type f,l -name *.placeholder -delete find . 
-empty -type d -delete From dca4fdad7518ac9ead3ee2e4c5f57ac0fe25c715 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 11 May 2023 17:51:20 +0000 Subject: [PATCH 09/58] fix: bind sample at definition Prior, `sample` was global within the lambda functions and thus resulted in last `sample` being used for all checks (the last state of `sample`). Now, `sample` is bound at lambda definition rather than at runtime (using global). Validated in 194 and 48 tests. --- .../bin/dp_tools__NF_RCP/protocol.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py index ef6f9699..5eaa896a 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/protocol.py @@ -813,12 +813,12 @@ def validate( payloads=( [ { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "raw forward reads fastq GZ" ].path }, { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "raw reverse reads fastq GZ" ].path }, @@ -826,7 +826,7 @@ def validate( if dataset.metadata["paired_end"] else [ { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "raw reads fastq GZ" ].path }, @@ -862,12 +862,12 @@ def validate( payloads=( [ { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "trimmed forward reads fastq GZ" ].path }, { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "trimmed reverse reads fastq GZ" ].path }, @@ -875,7 +875,7 @@ def validate( if dataset.metadata["paired_end"] else [ { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "trimmed reads 
fastq GZ" ].path } @@ -912,12 +912,12 @@ def validate( with vp.payload( payloads=[ { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "aligned ToTranscriptome Bam" ].path, }, { - "file": lambda: sample.data_assets[ + "file": lambda sample=sample: sample.data_assets[ "aligned SortedByCoord Bam" ].path, }, @@ -933,7 +933,7 @@ def validate( with vp.payload( payloads=[ { - "multiqc_inputs": lambda: [ + "multiqc_inputs": lambda sample=sample: [ sample.data_assets["aligned log Final"].path ], }, @@ -965,7 +965,7 @@ def validate( with vp.payload( payloads=[ { - "input_dir": lambda: sample.data_assets[ + "input_dir": lambda sample=sample: sample.data_assets[ "genebody coverage out" ].path }, @@ -980,7 +980,7 @@ def validate( with vp.payload( payloads=[ { - "input_dir": lambda: sample.data_assets[ + "input_dir": lambda sample=sample: sample.data_assets[ "inner distance out" ].path }, From 2a5655297a6b547f1d154de1198bdbe9ebe3c4c0 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 25 May 2023 21:53:55 +0000 Subject: [PATCH 10/58] docs[dppd]: update DPPD with workflow update Reference workflow commit: 3b7e0ba --- RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index e1a83b6e..7db21261 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -43,6 +43,7 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform - Fixed rare edge case where groupwise mean and standard deviations could become misassociated to incorrect groups. This had affected [step 9f](#9f-prepare-genelab-dge-tables-with-annotations-on-datasets-with-ercc-spike-in) and [step 9i](#9i-prepare-genelab-dge-tables-with-annotations-on-datasets-without-ercc-spike-in). 
+- [Step 2a](#2a-trimfilter-raw-data) adapter type argument removed in favor of using the built in TrimGalore! adapter [autodetection](https://github.com/FelixKrueger/TrimGalore/blob/0.6.7/Docs/Trim_Galore_User_Guide.md#adapter-auto-detection). --- # Table of contents @@ -204,7 +205,6 @@ trim_galore --gzip \ --path_to_cutadapt /path/to/cutadapt \ --cores NumberOfThreads \ --phred33 \ - --illumina \ # if adapters are not illumina, replace with adapters used --output_dir /path/to/TrimGalore/output/directory \ --paired \ # only for PE studies, remove this parameter if raw data are SE sample1_R1_raw.fastq.gz sample1_R2_raw.fastq.gz sample2_R1_raw.fastq.gz sample2_R2_raw.fastq.gz @@ -218,7 +218,6 @@ trim_galore --gzip \ - `--path_to_cutadapt` - specify path to cutadapt software if it is not in your `$PATH` - `--cores` - specify the number of threads available on the server node to perform trimming - `--phred33` - instructs cutadapt to use ASCII+33 quality scores as Phred scores for quality trimming -- `--illumina` - defines the adapter sequence to be trimmed as the first 13bp of the Illumina universal adapter `AGATCGGAAGAGC` - `--output_dir` - the output directory to store results - `--paired` - indicates paired-end reads - both reads, forward (R1) and reverse (R2) must pass length threshold or else both reads are removed - `sample1_R1_raw.fastq.gz sample1_R2_raw.fastq.gz sample2_R1_raw.fastq.gz sample2_R2_raw.fastq.gz` – the input reads are specified as a positional argument, paired-end read files are listed pairwise such that the forward reads (*R1_raw.fastq.gz) are immediately followed by the respective reverse reads (*R2_raw.fastq.gz) for each sample From 3b5ceca79ef6c096b66873afafedc2eca1d928b5 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 22:45:22 +0000 Subject: [PATCH 11/58] ci: add github action --- .github/workflows/ci.yml | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 
.github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..1ac1f747 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,52 @@ +name: CI +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + push: + branches: + - DEV_NF_RCP-F + +env: + NXF_ANSI_LOG: false + +jobs: + test: + name: Run pipeline with test data + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "22.10.1" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@v3 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Hash Github Workspace + id: hash_workspace + run: | + echo "digest=$(echo RNA_3.10.1_${{ github.workspace }} | md5sum | cut -c 1-25)" >> $GITHUB_OUTPUT + + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: ${{ steps.hash_workspace.outputs.digest }} + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: nf-core/test-datasets + ref: rnaseq3 + path: test-datasets/ + + - name: Run pipeline with test data + run: | + ls test-datasets + nf-test test ${GITHUB_WORKSPACE} \ No newline at end of file From da02ccc7324325a1a7b9569f925b92d439405940 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 22:51:02 +0000 Subject: [PATCH 12/58] ci: add nf-test install --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1ac1f747..ec5c5ece 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,10 @@ jobs: with: version: "${{ matrix.NXF_VER }}" + - name: Install nf-test + run: | + curl -fsSL https://code.askimed.com/install/nf-test | bash + - name: Hash Github Workspace id: hash_workspace run: | 
From 56c8eca4bc810f972887e3dc36e6965c8a42a81d Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 22:53:07 +0000 Subject: [PATCH 13/58] ci: fix install of nf-test --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec5c5ece..1264a076 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: - name: Install nf-test run: | curl -fsSL https://code.askimed.com/install/nf-test | bash + chmod u+x nf-test - name: Hash Github Workspace id: hash_workspace @@ -53,4 +54,4 @@ jobs: - name: Run pipeline with test data run: | ls test-datasets - nf-test test ${GITHUB_WORKSPACE} \ No newline at end of file + ./nf-test test ${GITHUB_WORKSPACE} \ No newline at end of file From 486e60240faeaa68f478c55173986bff76a44aca Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 22:55:14 +0000 Subject: [PATCH 14/58] ci: debugging --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1264a076..0ae3db3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,4 +54,6 @@ jobs: - name: Run pipeline with test data run: | ls test-datasets + ls + ls ${GITHUB_WORKSPACE} ./nf-test test ${GITHUB_WORKSPACE} \ No newline at end of file From e43dd3fe79d5b91b70b35572492701758cb1425b Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 22:59:02 +0000 Subject: [PATCH 15/58] ci: debugging, changing test launch location --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0ae3db3c..b99b8208 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,4 +56,4 @@ jobs: ls test-datasets ls ls ${GITHUB_WORKSPACE} - ./nf-test test ${GITHUB_WORKSPACE} \ No newline at end of file + ./nf-test test 
${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/*.test \ No newline at end of file From ce2fbf3cf76275b30c54383c67b4da2c71b105c9 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 23:01:18 +0000 Subject: [PATCH 16/58] ci: debugging, changing test launch location --- .github/workflows/ci.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b99b8208..eea57161 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,7 +53,5 @@ jobs: - name: Run pipeline with test data run: | - ls test-datasets - ls - ls ${GITHUB_WORKSPACE} - ./nf-test test ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/*.test \ No newline at end of file + cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ + ./nf-test test tests/*.test \ No newline at end of file From 66e1da5fd02057dc91c435e0d23e7975ef3777af Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Mon, 10 Jul 2023 23:04:36 +0000 Subject: [PATCH 17/58] ci[debug]: nf-test path availability --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eea57161..98374b8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,9 +27,12 @@ jobs: version: "${{ matrix.NXF_VER }}" - name: Install nf-test + id: nf-test run: | curl -fsSL https://code.askimed.com/install/nf-test | bash chmod u+x nf-test + echo "::set-output name=bin_path::$(pwd)/nf-test" + - name: Hash Github Workspace id: hash_workspace @@ -54,4 +57,4 @@ jobs: - name: Run pipeline with test data run: | cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ - ./nf-test test tests/*.test \ No newline at end of file + ${{ steps.nf-test.outputs.bin_path}} test tests/*.test \ No newline at end of file From b27388faf2c67104bb88d7af3b78c21328022a2b 
Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 17:06:52 +0000 Subject: [PATCH 18/58] ci[debug]: get test data from fork --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 98374b8f..6288b454 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,11 +50,12 @@ jobs: if: steps.cache-testdata.outputs.cache-hit != 'true' uses: actions/checkout@v3 with: - repository: nf-core/test-datasets - ref: rnaseq3 + repository: J-81/test-datasets-extended + ref: NF_RCP-F path: test-datasets/ - name: Run pipeline with test data run: | + ls test-datasets/ cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ ${{ steps.nf-test.outputs.bin_path}} test tests/*.test \ No newline at end of file From 1092e80ab2b16fde06241e290a7254a45521a74a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 17:46:37 +0000 Subject: [PATCH 19/58] ci[debug]: Add tests for modules actions --- .github/workflows/ci.yml | 43 ++++- .../workflow_code/modules/{ => dge}/dge.nf | 2 - .../tests/config/nftest_modules.yml | 3 + .../tests/modules/dge.nf.test.snap | 89 ---------- .../tests/modules/{ => dge}/dge.nf.test | 31 ++-- .../tests/modules/dge/dge.nf.test.snap | 155 ++++++++++++++++++ 6 files changed, 214 insertions(+), 109 deletions(-) rename RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/{ => dge}/dge.nf (98%) create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap rename RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/{ => dge}/dge.nf.test (85%) create mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6288b454..20c97eb4 
100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,14 +9,35 @@ env: NXF_ANSI_LOG: false jobs: + changes: + name: Check for changes + runs-on: ubuntu-latest + outputs: + # Expose matched filters as job 'modules' output variable + modules: ${{ steps.filter.outputs.changes }} + steps: + - uses: actions/checkout@v3 + + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: "RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml" + test: - name: Run pipeline with test data + name: ${{ matrix.tags }} ${{ matrix.profile }} runs-on: ubuntu-latest + needs: changes + if: needs.changes.outputs.modules != '[]' strategy: matrix: NXF_VER: - "22.10.1" - "latest-everything" + profile: + - "docker" + - "singularity" + tags: ["${{ fromJson(needs.changes.outputs.modules) }}"] + steps: - name: Check out pipeline code uses: actions/checkout@v3 @@ -26,6 +47,12 @@ jobs: with: version: "${{ matrix.NXF_VER }}" + - name: Set up Singularity + if: matrix.profile == 'singularity' + uses: eWaterCycle/setup-singularity@v5 + with: + singularity-version: 3.7.1 + - name: Install nf-test id: nf-test run: | @@ -54,8 +81,18 @@ jobs: ref: NF_RCP-F path: test-datasets/ - - name: Run pipeline with test data + # Test the module + - name: Run nf-test run: | ls test-datasets/ + echo "ls done" cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ - ${{ steps.nf-test.outputs.bin_path}} test tests/*.test \ No newline at end of file + ${{ steps.nf-test.outputs.bin_path}} test \ + --profile=${{ matrix.profile }} \ + --tag ${{ matrix.tags }} \ + --tap=test.tap + + - uses: pcolby/tap-summary@v1 + with: + path: >- + test.tap \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf similarity index 98% rename from RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf rename to 
RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf index 00b09b05..2a21232e 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf @@ -46,7 +46,6 @@ process DGE_BY_DESEQ2 { --annotation_file_path ${annotation_file} \\ --extended_table_output_prefix "dge_output/"\\ --extended_table_output_suffix ".csv" \\ - --verbose if ${ meta.has_ercc ? 'true' : 'false'} then @@ -61,7 +60,6 @@ process DGE_BY_DESEQ2 { --annotation_file_path ${annotation_file} \\ --extended_table_output_prefix "dge_output_ercc/"\\ --extended_table_output_suffix "_ERCCnorm.csv" \\ - --verbose fi """ } diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml new file mode 100644 index 00000000..92ce1aa4 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml @@ -0,0 +1,3 @@ +fastqc: + - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/** + - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/** \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap deleted file mode 100644 index 6d0ff5be..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap +++ /dev/null @@ -1,89 +0,0 @@ -{ - "Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION": { - "content": [ - [ - [ - "contrasts.csv:md5,66d74b686885ffd4eccdf55823c0e3ce", - "SampleTable.csv:md5,bfd18bbc7d34e41c23f0c9107f5d75c9", - "differential_expression.csv:md5,00cf45e546529c81c0a43ae1b8495a59", - "visualization_output_table.csv:md5,58578caedc33e6a0230ba80abe61f0d5", - 
"visualization_PCA_table.csv:md5,5c461d35b12d5946c2105f705a03c6d3" - ] - ], - [ - [ - "Normalized_Counts.csv:md5,b4ba348d5446f8ba546a46b966087c1b", - "RSEM_Unnormalized_Counts.csv:md5,931c6070b5e19909929c5a217713500b" - ] - ], - [ - - ], - [ - - ], - [ - "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } - ], - "timestamp": "2023-01-25T20:29:00+0000" - }, - "ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION": { - "content": [ - [ - [ - "contrasts.csv:md5,af3bef64a768dd6220b6a143d2fbb1bc", - "SampleTable.csv:md5,0b64b62678b9903bda2a431129cf52af", - "differential_expression.csv:md5,e33ffaa350a90f7dd0f4607292db68de", - "visualization_output_table.csv:md5,89c4b8722bf2a8fe25c6fcfa915e5c56", - "visualization_PCA_table.csv:md5,c19f946356e520bd9bf68606d639f21c" - ] - ], - [ - [ - "Normalized_Counts.csv:md5,c148732be1d0b1bb61278bfef612f07b", - "RSEM_Unnormalized_Counts.csv:md5,fd101e235076c3ae66c513bc96017b33" - ] - ], - [ - "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } - ], - "timestamp": "2023-01-25T20:29:00+0000" - }, - "ISSUE_55_ON_Mock:Should run without failures": { - "content": [ - [ - [ - "contrasts.csv:md5,b99c88e9c92f1d1588727df47523c4ad", - "SampleTable.csv:md5,229c988f09fbfeca182da7011f6f93b4", - "differential_expression.csv:md5,f58b6f602598a0c25379afd0c5e87a71", - "visualization_output_table.csv:md5,d056472d2ac135cad9ee4d9f33bde387", - "visualization_PCA_table.csv:md5,1293b99878d7a7eb0e02dc6a38e33d39" - ] - ], - [ - [ - "Normalized_Counts.csv:md5,393160aee08165165ccd2b8579a45161", - "RSEM_Unnormalized_Counts.csv:md5,6759e0e7ec07960691d3913b3877c129" - ] - ], - [ - "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": 
true, - "dp_tools_version": "1.1.8" - } - ], - "timestamp": "2023-01-25T20:29:00+0000" - } -} \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test similarity index 85% rename from RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test rename to RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index d43f32c4..fe30615d 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -5,6 +5,7 @@ nextflow_process { process "DGE_BY_DESEQ2" test("Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION") { + tag 'dev' when { params { @@ -14,8 +15,8 @@ nextflow_process { process { """ // define inputs of the process here. Example: - input[0] = file("test-datasets-extended/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets-extended/testdata/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") @@ -37,7 +38,7 @@ nextflow_process { } - test("ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION") { + test("GLDS-423 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION") { when { params { @@ -47,10 +48,10 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets-extended/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets-extended/testdata/GLDS-321/03-RSEM_Counts/*.genes.results") - input[2] = [ primary_keytype:'TAIR' ] - input[3] = file("https://figshare.com/ndownloader/files/36597132") + input[0] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.4/GLDS-423/Metadata/GLDS-423_bulkRNASeq_vTECHREPMOCK_runsheet.csv") + input[1] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.4/GLDS-423/03-RSEM_Counts/*.genes.results") + input[2] = [ primary_keytype:'ENSEMBL' ] + input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") """ } @@ -61,8 +62,8 @@ nextflow_process { assert snapshot( process.out.dge, process.out.norm_counts, - // NON_ERCC process.out.dge_ercc, - // NON_ERCC process.out.norm_counts_ercc, + process.out.dge_ercc, + process.out.norm_counts_ercc, process.out.version, ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated! ).match() @@ -70,7 +71,7 @@ nextflow_process { } - test("ISSUE_55_ON_Mock:Should run without failures") { + test("ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION") { when { params { @@ -80,10 +81,10 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets-extended/testdata/mocks/overlapping_samplenames/MOCK_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets-extended/testdata/mocks/overlapping_samplenames/*.genes.results") - input[2] = [ primary_keytype:'ENSEMBL' ] - input[3] = file("https://figshare.com/ndownloader/files/36597114") + input[0] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-321/03-RSEM_Counts/*.genes.results") + input[2] = [ primary_keytype:'TAIR' ] + input[3] = file("https://figshare.com/ndownloader/files/36597132") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") """ } @@ -103,4 +104,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap new file mode 100644 index 00000000..5bf55599 --- /dev/null +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap @@ -0,0 +1,155 @@ +{ + "Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION": { + "content": [ + [ + [ + "contrasts.csv:md5,66d74b686885ffd4eccdf55823c0e3ce", + "SampleTable.csv:md5,bfd18bbc7d34e41c23f0c9107f5d75c9", + "differential_expression.csv:md5,00cf45e546529c81c0a43ae1b8495a59", + "visualization_output_table.csv:md5,58578caedc33e6a0230ba80abe61f0d5", + "visualization_PCA_table.csv:md5,5c461d35b12d5946c2105f705a03c6d3" + ] + ], + [ + [ + "Normalized_Counts.csv:md5,b4ba348d5446f8ba546a46b966087c1b", + "RSEM_Unnormalized_Counts.csv:md5,931c6070b5e19909929c5a217713500b" + ] + ], + [ + + ], + [ + + ], + [ + "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" + ], + { + "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, + "dp_tools_version": "1.1.8" + } + ], + "timestamp": 
"2023-06-15T17:38:07+0000" + }, + "ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION": { + "content": [ + [ + [ + "contrasts.csv:md5,1b63fd558a509d430609c5d824f7c090", + "SampleTable.csv:md5,308d0c4af88b557ef857f3be8eb2a339", + "differential_expression.csv:md5,de5fe517a08851b0a4817c459ce95d8a", + "visualization_output_table.csv:md5,7f72442d40a030d50211db3ab6885dfb", + "visualization_PCA_table.csv:md5,c19f946356e520bd9bf68606d639f21c" + ] + ], + [ + [ + "Normalized_Counts.csv:md5,c148732be1d0b1bb61278bfef612f07b", + "RSEM_Unnormalized_Counts.csv:md5,fd101e235076c3ae66c513bc96017b33" + ] + ], + [ + "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" + ], + { + "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, + "dp_tools_version": "1.1.8" + } + ], + "timestamp": "2023-06-15T17:38:07+0000" + }, + "Baseline_ON_GLDS-194 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION": { + "content": [ + [ + [ + "contrasts.csv:md5,66d74b686885ffd4eccdf55823c0e3ce", + "SampleTable.csv:md5,bfd18bbc7d34e41c23f0c9107f5d75c9", + "differential_expression.csv:md5,00cf45e546529c81c0a43ae1b8495a59", + "visualization_output_table.csv:md5,58578caedc33e6a0230ba80abe61f0d5", + "visualization_PCA_table.csv:md5,5c461d35b12d5946c2105f705a03c6d3" + ] + ], + [ + [ + "Normalized_Counts.csv:md5,b4ba348d5446f8ba546a46b966087c1b", + "RSEM_Unnormalized_Counts.csv:md5,931c6070b5e19909929c5a217713500b" + ] + ], + [ + + ], + [ + + ], + [ + "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" + ], + { + "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, + "dp_tools_version": "1.1.8" + } + ], + "timestamp": "2023-06-15T17:38:07+0000" + }, + "GLDS-423 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION": { + "content": [ + [ + [ + "contrasts.csv:md5,6ec6d3af26f8d482b64112b88d4e8417", + "SampleTable.csv:md5,e007d8494abcd37c46c012fabe91d2e7", + "differential_expression.csv:md5,76fd2a34d35cd3110198a0b22b381846", 
+ "visualization_output_table.csv:md5,0427f37905a2b55a595bfbeefddff1ac", + "visualization_PCA_table.csv:md5,24e62c90c57550de57bac0b35c3a0121" + ] + ], + [ + [ + "Normalized_Counts.csv:md5,0ce1b39d28e7cce080e0661c56ed5f76", + "RSEM_Unnormalized_Counts.csv:md5,5e3ee6c36e6ebefa1bd947c2fa586b99" + ] + ], + [ + + ], + [ + + ], + [ + "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" + ], + { + "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, + "dp_tools_version": "1.1.8" + } + ], + "timestamp": "2023-06-15T17:38:07+0000" + }, + "ISSUE_55_ON_Mock:Should run without failures": { + "content": [ + [ + [ + "contrasts.csv:md5,b99c88e9c92f1d1588727df47523c4ad", + "SampleTable.csv:md5,229c988f09fbfeca182da7011f6f93b4", + "differential_expression.csv:md5,f58b6f602598a0c25379afd0c5e87a71", + "visualization_output_table.csv:md5,d056472d2ac135cad9ee4d9f33bde387", + "visualization_PCA_table.csv:md5,1293b99878d7a7eb0e02dc6a38e33d39" + ] + ], + [ + [ + "Normalized_Counts.csv:md5,393160aee08165165ccd2b8579a45161", + "RSEM_Unnormalized_Counts.csv:md5,6759e0e7ec07960691d3913b3877c129" + ] + ], + [ + "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a" + ], + { + "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, + "dp_tools_version": "1.1.8" + } + ], + "timestamp": "2023-06-15T17:38:07+0000" + } +} \ No newline at end of file From 2dd8fcea96e75a38983455b479a72cb28ceafa1f Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 17:51:28 +0000 Subject: [PATCH 20/58] ci[debug]: Add tag --- .../NF_RCP-F/workflow_code/tests/config/nftest_modules.yml | 2 +- .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml index 92ce1aa4..25b92aa4 100644 --- 
a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml @@ -1,3 +1,3 @@ -fastqc: +dge: - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/** - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/** \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index fe30615d..8b742780 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -3,6 +3,7 @@ nextflow_process { name "Test Process DGE_BY_DESEQ2" script "modules/dge.nf" process "DGE_BY_DESEQ2" + tag 'dge' test("Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION") { tag 'dev' From 11a22f011efa06a68defcccedfe5f06e0eb703d5 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 17:55:17 +0000 Subject: [PATCH 21/58] ci[debug]: Adjust pathing in test --- .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 8b742780..286c8882 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -1,7 +1,7 @@ nextflow_process { name "Test Process DGE_BY_DESEQ2" - script "modules/dge.nf" + script "../modules/dge.nf" process "DGE_BY_DESEQ2" tag 'dge' @@ -16,8 +16,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("testdata/NF_RCP-F/1.0.3/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("testdata/NF_RCP-F/1.0.3/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") From d15b39bcc7d8b41826acee376559267140529b8a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 17:58:42 +0000 Subject: [PATCH 22/58] ci[debug]: Assess pathing --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20c97eb4..2b1c78f7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,7 +85,7 @@ jobs: - name: Run nf-test run: | ls test-datasets/ - echo "ls done" + echo "ls done for $(pwd)" cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ ${{ steps.nf-test.outputs.bin_path}} test \ --profile=${{ matrix.profile }} \ From 7ba75d0e88d1acd60ad3eaf55e1cfd622b7e445b Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:07:53 +0000 Subject: [PATCH 23/58] ci[debug]: Assess pathing --- .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 286c8882..7e0234cc 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -1,7 +1,7 @@ 
nextflow_process { name "Test Process DGE_BY_DESEQ2" - script "../modules/dge.nf" + script "modules/dge/dge.nf" process "DGE_BY_DESEQ2" tag 'dge' From 4e317f2148615b8e647937f646f4bf7fe9102679 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:22:01 +0000 Subject: [PATCH 24/58] ci[debug]: Update test asset pathing --- .../workflow_code/tests/modules/dge/dge.nf.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 7e0234cc..5e9d8edb 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -16,8 +16,8 @@ nextflow_process { process { """ // define inputs of the process here. Example: - input[0] = file("testdata/NF_RCP-F/1.0.3/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("testdata/NF_RCP-F/1.0.3/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") @@ -49,8 +49,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.4/GLDS-423/Metadata/GLDS-423_bulkRNASeq_vTECHREPMOCK_runsheet.csv") - input[1] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.4/GLDS-423/03-RSEM_Counts/*.genes.results") + input[0] = file("testdata/NF_RCP-F/GLDS-423/Metadata/GLDS-423_bulkRNASeq_vTECHREPMOCK_runsheet.csv") + input[1] = file("testdata/NF_RCP-F/GLDS-423/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") @@ -82,8 +82,8 @@ nextflow_process { process { """ // define inputs of the process here. Example: - input[0] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets-extended/testdata/NF_RCP-F/1.0.3/GLDS-321/03-RSEM_Counts/*.genes.results") + input[0] = file("testdata/NF_RCP-F/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") + input[1] = file("testdata/NF_RCP-F/GLDS-321/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'TAIR' ] input[3] = file("https://figshare.com/ndownloader/files/36597132") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") From 33c63ce1d95735362c9defdb6801b27fb4c7a7e3 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:27:31 +0000 Subject: [PATCH 25/58] ci[debug]: Update test asset pathing --- .github/workflows/ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2b1c78f7..319e9c22 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,13 +79,11 @@ jobs: with: repository: J-81/test-datasets-extended ref: NF_RCP-F - path: test-datasets/ + path: ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets/ # Test the module - name: Run nf-test run: | - ls test-datasets/ - echo "ls done 
for $(pwd)" cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ ${{ steps.nf-test.outputs.bin_path}} test \ --profile=${{ matrix.profile }} \ From 5d35b6194a80c84e5670c0574f3d3f5234005ac6 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:34:47 +0000 Subject: [PATCH 26/58] ci[debug]: Update test asset pathing --- .github/workflows/ci.yml | 3 +++ .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 319e9c22..84f52590 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,6 +85,9 @@ jobs: - name: Run nf-test run: | cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ + ls -l + pwd + echo 'done' ${{ steps.nf-test.outputs.bin_path}} test \ --profile=${{ matrix.profile }} \ --tag ${{ matrix.tags }} \ diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 5e9d8edb..6e2d4b18 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -3,10 +3,9 @@ nextflow_process { name "Test Process DGE_BY_DESEQ2" script "modules/dge/dge.nf" process "DGE_BY_DESEQ2" - tag 'dge' test("Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION") { - tag 'dev' + tag 'dge' when { params { From 47bebcd55046a569f96c3a9db069d7467e29ab67 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:40:42 +0000 Subject: [PATCH 27/58] ci[debug]: Update test asset pathing --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 84f52590..d75ba8dc 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -70,7 +70,7 @@ jobs: id: cache-testdata uses: actions/cache@v3 with: - path: test-datasets/ + path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets/ key: ${{ steps.hash_workspace.outputs.digest }} - name: Check out test data @@ -79,7 +79,7 @@ jobs: with: repository: J-81/test-datasets-extended ref: NF_RCP-F - path: ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets/ + path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets/ # Test the module - name: Run nf-test From 7d2cc24bc1a02c091d5f3eba61a99fbf133df74e Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:44:55 +0000 Subject: [PATCH 28/58] ci[debug]: Update test asset pathing --- .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 6e2d4b18..16916910 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -15,8 +15,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") From 1e6a9cbfe28fe0fe72e0af458b1fd84475ece9cc Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:50:19 +0000 Subject: [PATCH 29/58] ci[debug]: Assess pathing --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d75ba8dc..da0128fc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -86,6 +86,7 @@ jobs: run: | cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ ls -l + ls -l test-datasets/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv pwd echo 'done' ${{ steps.nf-test.outputs.bin_path}} test \ From 53363e42c87445fc92c108b7cf7267f99da55c5a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 18:55:12 +0000 Subject: [PATCH 30/58] ci[debug]: Assess pathing --- .github/workflows/ci.yml | 6 +++--- .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da0128fc..2b7c1585 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,7 +70,7 @@ jobs: id: cache-testdata uses: actions/cache@v3 with: - path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets/ + path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets key: ${{ steps.hash_workspace.outputs.digest }} - 
name: Check out test data @@ -79,14 +79,14 @@ jobs: with: repository: J-81/test-datasets-extended ref: NF_RCP-F - path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets/ + path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets # Test the module - name: Run nf-test run: | cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ ls -l - ls -l test-datasets/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv + ls -l test-datasets pwd echo 'done' ${{ steps.nf-test.outputs.bin_path}} test \ diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 16916910..7f38c164 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -15,8 +15,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("test-datasets/test-datasets-extended/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets/test-datasets-extended/testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") From 1b6e325e9060e880092ec841cb60b98d10ce517a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:04:32 +0000 Subject: [PATCH 31/58] ci[debug]: Update pathing --- .../NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 7f38c164..16916910 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -15,8 +15,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets/test-datasets-extended/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets/test-datasets-extended/testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") From b5013e9fde298af6935208ab4568bfd2451a5285 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:09:00 +0000 Subject: [PATCH 32/58] ci[debug]: Update pathing --- .../workflow_code/tests/modules/dge/dge.nf.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 16916910..e5966972 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -15,8 +15,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") - input[1] = file("test-datasets/testdata/NF_RCP-F/GLDS-194/03-RSEM_Counts/*.genes.results") + input[0] = file("test-datasets/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets/testdata/GLDS-194/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") @@ -48,8 +48,8 @@ nextflow_process { process { """ // define inputs of the process here. Example: - input[0] = file("testdata/NF_RCP-F/GLDS-423/Metadata/GLDS-423_bulkRNASeq_vTECHREPMOCK_runsheet.csv") - input[1] = file("testdata/NF_RCP-F/GLDS-423/03-RSEM_Counts/*.genes.results") + input[0] = file("testdata/GLDS-423/Metadata/GLDS-423_bulkRNASeq_vTECHREPMOCK_runsheet.csv") + input[1] = file("testdata/GLDS-423/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'ENSEMBL' ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") @@ -81,8 +81,8 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("testdata/NF_RCP-F/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") - input[1] = file("testdata/NF_RCP-F/GLDS-321/03-RSEM_Counts/*.genes.results") + input[0] = file("testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") + input[1] = file("testdata/GLDS-321/03-RSEM_Counts/*.genes.results") input[2] = [ primary_keytype:'TAIR' ] input[3] = file("https://figshare.com/ndownloader/files/36597132") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") From 574eb79756d9c61112df6ea2f999f14671a1b154 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:15:25 +0000 Subject: [PATCH 33/58] ci[debug]: Bump and attempt to resolve eof error on command.sh --- .../NF_RCP-F/workflow_code/modules/dge/dge.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf index 2a21232e..9a8ee724 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf @@ -61,5 +61,7 @@ process DGE_BY_DESEQ2 { --extended_table_output_prefix "dge_output_ercc/"\\ --extended_table_output_suffix "_ERCCnorm.csv" \\ fi + + echo "done" """ } From fc89c5e9df118629635f7848689262c7847afe65 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:20:20 +0000 Subject: [PATCH 34/58] ci[debug]: Bump and attempt to resolve eof error on command.sh --- .../NF_RCP-F/workflow_code/modules/dge/dge.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf index 9a8ee724..14fc0710 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf +++ 
b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf @@ -45,7 +45,7 @@ process DGE_BY_DESEQ2 { --dge_output_prefix "dge_output/" \\ --annotation_file_path ${annotation_file} \\ --extended_table_output_prefix "dge_output/"\\ - --extended_table_output_suffix ".csv" \\ + --extended_table_output_suffix ".csv" if ${ meta.has_ercc ? 'true' : 'false'} then @@ -59,7 +59,7 @@ process DGE_BY_DESEQ2 { --dge_output_prefix "dge_output_ercc/ERCCnorm_" \\ --annotation_file_path ${annotation_file} \\ --extended_table_output_prefix "dge_output_ercc/"\\ - --extended_table_output_suffix "_ERCCnorm.csv" \\ + --extended_table_output_suffix "_ERCCnorm.csv" fi echo "done" From 3362429bcd71bb286d1abd972d9fa4093bc183bb Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:39:15 +0000 Subject: [PATCH 35/58] ci[clean]: Remove extra prints --- .github/workflows/ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2b7c1585..5275ba59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,10 +85,6 @@ jobs: - name: Run nf-test run: | cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ - ls -l - ls -l test-datasets - pwd - echo 'done' ${{ steps.nf-test.outputs.bin_path}} test \ --profile=${{ matrix.profile }} \ --tag ${{ matrix.tags }} \ From 09726a5e631ccc3789912564fc2682a88cae30ba Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:42:44 +0000 Subject: [PATCH 36/58] ci: only run based on filter compared to last commit --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5275ba59..13cd88f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,7 @@ jobs: - uses: dorny/paths-filter@v2 id: filter with: + base: 'DEV_NF_RCP-F' filters: 
"RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml" test: From 150001939f3db91635aa4004e7ffd4c847259b87 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 19:45:08 +0000 Subject: [PATCH 37/58] feat: remove extra print --- .../NF_RCP-F/workflow_code/modules/dge/dge.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf index 14fc0710..638b8922 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf @@ -61,7 +61,5 @@ process DGE_BY_DESEQ2 { --extended_table_output_prefix "dge_output_ercc/"\\ --extended_table_output_suffix "_ERCCnorm.csv" fi - - echo "done" """ } From 6719218995a485f6fc92fc0c2d89456ff201b88d Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 21:37:58 +0000 Subject: [PATCH 38/58] feat: add summary.txt output and rework tests ### Prior * Tests used dge table md5sum for testing, this was subject to numerical instability driven differences causing test failure. ### Now * summary.txt files are output from the `summary` function in R. 
This naturally reduces precision in summary statistics allowing detection of meaningful numeric differences (non numerical stability related) --- .../bin/dge_annotation_R_scripts.zip | Bin 11850 -> 11685 bytes .../dge_annotation_R_scripts/Perform_DGE.Rmd | 89 ++++++++++-------- .../dge_annotation_workflow.R | 13 --- .../NF_RCP-F/workflow_code/modules/dge/dge.nf | 3 + .../tests/modules/dge/dge.nf.test | 9 +- .../tests/modules/dge/dge.nf.test.snap | 30 ++---- 6 files changed, 67 insertions(+), 77 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts.zip b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts.zip index 3e0230d8523657deee489abe6e8e004c7e2b090f..fc85919065fb7c29a0ee4bb2b7d40f94616326a3 100644 GIT binary patch delta 7401 zcmZ9RRZyMH(uMbijk^SQ*I>bd1=k?KLLj(XAZV}`*Wm8%?hxE%;{NKe6mh zmKL0r83`2ck+hfzESdbBy9G=QKJT47sq!8N$9- z$C}k;K@_#U^2~JIARwRG$|U*4)Q~5}GSHQ&ObKq%t(gDI?a)9*3e(U?!CT`i3{gCc{Ui|(y8X2P+% zPA6}iUt247Yc;{7Qx)f(Lo9@vSJ*N=5G8LMAoghyn^7QhW3I zih%6uG1)O$WbaY=^VZ7g&+_zi3K^bSSC;K42 z_iUUw9o7=6Ayny!TPWbzou=YL;)DnzP^;4)hfB9_0Pns1(Kdp2-ma=~Z&q_4?5!vX z;2ud9vUQ(qxvdbghVao1p8cEsT{HoDZ?*Mq?W&7jmm0{m#>tVB(-o26vHcdx*72k} z5IpFC2fNqbCm5UR7`n6?#QE)TvM2>idU4O8fO1q~3GZDpkINDtIVzj08=X1htt*2< z4O!hH^^P<<&(Tw3l)JV$i^PDw~HebrIW&@-U$rP$xd@k>R+c-93Do<4ZRd6ZjhRy{>plUj)i| zkLGqVa9{5#Zh*o$<#){Es^&2Z7We{kXe2R5R@{ot@Ry;RD$`E_-LXq+CB+-S#0Sjs zfi}3=(6fiY6qGYJ-h(=V@QTCMXe8kvK`uPeXr|v9Puxw(c%qO(P}@SaLcbBm5(dT% zu$wLIM!)PpvKd-&3Tx$8cz0e`oRPvSiYoS`IP{aiz!kDLg_)2_i-Fs&c5_0*mPb|e zIFocz#6!WSv~S^Ca3J*F8PNy8@n)-+KwYVo#Jed>p(LM07B|i&pDLQjm057kUymn) z!$ZXv{eo=1H(K#ER9d3hJm1?9r@Xfxw2v|DygX7%e4)W(k-$@|q22T3rFuOugA0+S zTY&s_`Dk6(=c=2Bc9ikJ7vv&ys}^^4kX&Yft|-tvxB<~$3oiQj0Gl>5QqQvrnFAQbJ?IPH$8R|al_Ay)p(x;fvJGYh|w9-Z?2w4h#l@EPZA!} zqEps{p|<{sZ`|sWt)c#U^N_}X@)m9h%>l6x<%_T%siWc++GYTk{}V{#3lDZwaB)`9 z(}Fzsk0tvx%27Jt8Fbd7mJ>qesiULZE-e*19C*ZDAlKms60u=a_@tELUb+v^XpraDD26BJ)-vTKQNKi% 
znN|eF0%cW^!5d&sr}72+68s6V7Bu|Mky${SjFMMd0AV##MwA#G?_LLVPsnXw=T22% zh9e@R6uQpQlfq5$&5&6msFu6CC3dKUdHGMB;>C6BAQxZY>3#VSM<3n=O!ol8C>>+l zl5WYGS44n%_DUkrxt0jy zhr=HEQ#>{5v|})|0RFI8(xAQ)VuMS~eTi+sCx7+8r1^>YV+Fgpmaqw?<1mjxd80P5MlXw$9IB6UE=>d&7W96jsSHx0NwAjdvBhOzIAJFW)2%z z*RKPw((1U-zgd&rh${P@!{cNk+>dRSGf49bScoaYK*jV{-C4ltJv#5E0d9-)Li_KA znLeOQQ%PXoh+)o2&NQ`xW+2N-zX{i%gK<`OZ;o@xbf6YbRi-$W(p7uAgyY}?d^_$g zuUibJrH-G;69THCzrGYbx}a;{5LgHItDBmR9>60&;n51Q~0Dz;FTtx^DqG`ktLml<7m#>4TY-idcTcF zNr2;m#3?vb)ygbxgk2B*z8)^fM+B3H&|A{nTronp?0%_fzHL^_C(==%m-OUoSAi|d z@i%YKq*^SG(Q+O%&B`0;-mo+BG~#>WxkS%K%BAfmekgTz{4yn8E$mj^-VTt3`Q`Rx z;xRm4cin8fbep4=rMx-#yeG(k?d5Rr@M!KNVJG8q^hDAsX~5}%{KFCxj=eBXEOa`~ zDUZ&u)jv?5zoA9}l@TQNo-0l)VQ(}*?CLzVjCnH1TlSC->6ulth#iB!-{ay~li${@ z)PA)>4zV$~M~O3VP+(sOdU*l0>Y`YN2CHYsTK9;6-s-PSOiTmk>-V5AZClJ}Sy~a>KUml^ z9c|M+GHlSnC_5A9ci-c8xXrgi>}6+zU`wiMcXFrG7#SAUaNpM4a8tx&QjfjX7$4fp zWXa!B>Tgyn&2HJhq5@GP54zwfjV*8zM@c&JPJv!9L?uq&giQFgAt;r;lQ&+UN30nq zA%uUvejJ(b_qV=YzfUL*ia#_ zmrw!;C#nS1=1pE6+Wx5TS{q8W%xCCUhZ#Alp>%QWeK^d97 zn@}GU8Oz!sKDzz`2&rsSna@Jh0LhN^&}o&rx+OqpmCcb5<|!K;JcETV+8aDnAZa6d zH8{fZ_KFpaS`Mq_FmvV4(kCaYHa^CJkR;~jw+gCA%Sis7pSfh@IdldhFft~4n@ARb zsF)JM&Va3ZH6R%$mLl}bV1cy#L`JbxKDk~I<%*|F)_Z6A2821RppOuGK>6+iJ%TX0 z)k0DZb#O(gANPi4aPF(A zxs$D0eBj1D*VAKC%=KcV+w&#!#(v8LWuuwYbF$m>{-Zz_4DtJYkLm7zu3x|T)Zd|J z*5d@SE`QT4yZN3X(VplhDjl%WTn2qX#^MMzG_UhSGXV0~Lw zi3`III_~&|hJ{{Spp_h&= zACKm=cUkOr7`kn=^ujnuBXTmoC?XR8DmI$fCLvClxFfNz4WiylLQMm?T6aM=KYr@m zw~--KiZpJ(A~07iBTj7$&Jd&%XLuvydi=uV?3B|+h7%Rg19xgTBB^OIP?Q!eA5idD zw(-OWc$u*k3)sI|7bz(WG_t~q}qFaCVy9d zIJ2vwJ5};$b^q)@+~ud?6a3-BjJ4`r6-`B*%m|S`ZRr&wNj|G$ zltxa3J-ejzMOr#iy^m^f6J2gv04!w}vM-;J**6g(YT1`Fn?g{Iw#2R4hto1(S@W?e z_q1z;8MGcJq;aL9mz7J4h}G-c_ltUxl^yT*+z&L4HuQy4qFoT;-v0knCt5hVyWT#)PV0Rd6Udb6eG-N`_E9YW@XeOQo<(G=%i zU+yfyM#KqEJYcR^x+^d`?u*NjuZ zxpga5Zi|7@(hsgA_0Ry`CWpo)GV>PZr=jyY5nJZI^>4BygUA%5j)mjg#lIYm6J1yo z{0TfOkcoB`EE{FYaG4!QF4m{WD9<4DX;rX&y=%aG8|zCh_q%jlD*{q~+5OFGwUdYtW*6E#fGTj|zrsd_#hnYYHw#%QrQwt`Fl{PYw320$JY=1^;4I_-Nl 
zj$SM4F;33_rn)Ms+=WaTvge`5h<-6u=bs71(M!K(o&eM$bW_;Xzzai_93?aKsoJ4P;_bRcJS3;Iw;SzI>_fol2N)v zQRPu8G`EoW%qcn!^J|u|^;PGnJ;8j|JG-}q=v&Lh#%Z>lO6fjdqS`U?4CEsaMRw+c zGY8l<6DQqYiLp}%J?F@mkSD#I+Bpd6n_jym=T+zVcyLq}$=FY%cYnj0+M09_wO3?} z?%mMB0276mVIo9V3+&Cdl0Ngc*F?EbxR55B`QTy&C|iQE3^@WMST(uvyhyM|t-#OC z()I)ylR%*;$lBtpTai1t^+|lfNak03CS$8_CEMAL2>mxXw<>WceiE=G-!C*sdtZOx z@49@A7nk;APSd z^?PO5x0>q+)-sqR)G5>pTiepLt&VJzi;tFntDT}75?wpamoiND z9bi8fGD(;vbe)E#iT95&A_dVF3ijI*!|N5x z@Jz0GDcrUjcB>Ja{yqp!?3tq6Jz|*kcR+%#HV*N@my}9f>ysw=(duu`jd!68BZSzq zCq$^hPAn+wmy0he-f6H}f4o()Ni?uqemk=yF$SD~ zzYZI@I=-r|xP+42El^VLoS@ZJEm~U_qumUc+Xm!{OUf#mG4I z{>W@{o5Q&1en|ZGt7g|0uNn_hd1)!Y0qSyB-y$j3@x2b(9&AN&byg*f6%|BDEn)g? z{*Fk{x_L#Bd8HeCXY;-X3YWm#%V^s&6;Ha|`3nB;96UPrNJ9o?DF6xqf!uLpN$G?D z`<8O!zw!SE*qUqK4ta3B_A2>!QzfDpj4*EDRvtf~lrq)l=bW^q6byaGK`48#Xe^Le zWAXYJk)9Xz;q%)niyu@s_ZK^Nk7FG21@q>;Na3DrI_zTk3aH$I(Z-67S+16Ul*R*r`ocM86C=d^TRC46>yn_h(5)7u6fbAp&& zn}>Tta+tN2rin^HT1His97ErxR^1NRyNS-lZnN_`SzKxau+Y*b8oaT?gWGo-5tU6V@ zd=W{E$v1#ThBG86?#L%ER1$b9M=!;M29l@+My38TwC)xq8zW?( zE2@TTKv=*|vQeTJ?E>lVhvkn|6VOdV_pHA2rYa1bjINC0G-s?Tk-G%Ml{Sj*yqUoG z5j%Zm2;+!}OZew%ARFfnYra1gDOqrB016w`c@4B5DEC_(HY!aJ*9!7pAJWsu`?ylWq<@yPpe5m-b zb~N^_mv`ZUF;9(M)rK5PUTn=5V~*HCKU$PyzC%YC%RzZM2$yg?(HV~~TOOOmNZmK_ z8#m8$eF&e%idA6`^|{FWn#k-KzR=yYC&H=@JGyFMSy4^w!0m2dE33cUIS+f%041Pn zR|d<&drd^g%umFqO}n!MRlcG{>2ei_z9d>KWO&=aUREIf1^p--O(WE>;|>+3lIrEL z##3&~)a>y%vnPN2WyR8e5ta3;6;wSlbVNRK*l?zu6IK54pVP}sFiu#2`0?u9NQ>Jp ze68ikvr!IeJaWdU`x9+QcVMuf@(?F~j^xssXyiKfcG&ljF~Wi85Ai&r?qDurwtP=9#WC7k zeZz!F%{8T)H?eu{%_O|IhT?xtRy4Y?(PH&8a(*H9fAZ8E`fLdci!BY zH#J?=UDdx{yY`o^{b8*#{Rsm!HAOgh6u>`gqv%r-8XHZ2>9~2W;hed<*9QO$Ja;ev z@bBXv9qzwWNPzW!=}csidjDJ3)ZhgV_b-;h2$cK}Y;gyQ0RK(fQKbJ5h5PUXht(Cw zuVd8feAzH&dy%~L;3K(?52#Nc2t#YVww)Io73Rh)iV*kvjtorjJ zw5NWgn0{y5oqMvlG;{HjE>AhtD=!F}NI8aC{-&l$ZTW}8SMrQ&rR!8KM5H;qGfbFA zff}vU(qFn>cLbS}h0AgtGmxfjW#G71X@E#U$J za9L=FZ*%WvA7hNq+fs?^Iv$G?XB3@u9t5r}6A_EuDtk+>i}th2kDR>Cj>~Q7h*81r 
zxq-LcH@#cO2kQqn=KFSZ70>QZ?H(S*c@Gg0?_>@XeWC((%H1pz7}=lUgwPw?r?JgK zYo)X!nd@)H+X6u@8hF2iQ{H9LGs2P)G$=|NJKE_~P8GGsJF)xFsvof_g?6s82_fji zRkgpp8y*%^MgQuABlYA*^oPjn(7U7*3f+nv*l5R({vc%to?$b|HpZ9c4m|%+Zj$6{ z>V)p2=c5NZ>i>n1(y15D=>Xg4wuM?I3GD}JsmS+aG)(_swT%&QJnV}9&8$8SheAar z9<7`$-R1lwmtu{tm_E%9hxmT!v}p*V12q_l3W4SP##{CPr z8q&5sfXGCrc$m4ji|!=D69%Kq@70js` zU$x#iQDDjr$kBr$1Yv}2#w^NtKK7J8)m39#!;~)m-t11WTJ`eNFHeiTT7?{4t9j|w zsl4n=zYzTP{6S1(09}SS0W<_;%r9 z2%~SUDbW+(E41@AVkGr`I6-!F3%12hKTAV2rc8QclWjg;s^-GSQwN2rs`wrg?cliM zC{LQ!JZk{Ac|!_Xn~16%!NHF4YDC5=-o$RaJA%- z9!`x-o5b8{-;lGWbOf9^fq(5@!k<5S`7nmhKWv2e{L(d#R!Dzb~ z!VvuXHJoJVEuUM06p4)S;xaHe>E`BjPvV`f%C!yKvIB~}{5#|5 zXz0Tt5$B_=BII08na-+50R!fs1y|i#+0D&5j5xcc_c4lFy^!v0@p#u-Gfmuo?aF#j=8C4p|i!$!sGsg_KF&`(i@7DbGj(F8n4x1Df-0W4) zPIo(85K0g@8sdb?JA)NOr2C57@#%N!VIvhC8)0ji+ZO*YXROwx_?Xi8Qhh6h`gzeP zJ^k-5w2-wc;q4D6%Ev&X%DZs(&U!0nl{cc3T9Ei85iYFyUD@T|ne~wU^S<+ZsU1V% zN7bo443~a;u5La}YYqe{J2WLN<~`T6J);iHk|-P`UmQO5F$7=yp_5%vXtWvIFL%Eb zeD|o|vuBf-PShhU%tXV(caYkUu1Et3FO-w)T!^CFpqyUr4ae_T`uzDKgUcpQvT^$w z3^s2AwK%6{im4jx$_o52=y&W^^~gmzP*~=ujdu=XRT`Z(;fuKfz&kX_8DAI9+yn@l zZg3cYkUAdQdAGR`+(3r4KsnATFJ#WX>1iKv+pZ)PntI4#MgGkkZr4+8o?D zPSaY@pLa5(mN~KLLJgNO_ur>&)6|s^*D^ffQj+ zY5DN{^QG{Uj~Il*i5syhWL|V~g`dS|X`~z6G^ESdd*HtsT>7v7M;f^j8b|$8+P8>Foe~L`hkFQR)hJB? 
z-*<3K+(LtV&|Bv%P_B5{TP#|}yq+Ok!kxnar zisY|Y@Bk_d?LMxOV6w-f&qwyVl^$CLiT=^OGUAvJH-u}PT-GIN>w2zg6oLhLb)f1c z6mV)BASU0BBGnC5oi4-4)SS`!h`B}5C;;i$I@m3#bIc3M!3mjlO10QLc*GvW}dF1cd@*^x>m*7)ox zxv}NnA{8c_R z&ReW5kynvJY&hX#g2%!zzZ)B5O(rv<$RCUc6Ed#4qFV~X{3~I&4)(W?^0pZcNC=oJ zAk4W9I)N}|A|C+J%Z(k$8hePOMfoUth)VX3f=5QYfy<_v2#eNapq_jAI^BD^zikk- zaX9UMaB=U2Brim61f)naudGQ0tgf+Tb&qi%CJ-PPja>;J(E2kwaQvrL*|+3V zbZwwiozUkaog4wArQL><93H!MBF6{P51qCxPTJIgSmVoZO*f9qar)R*2W%xG623$irAbq2Fer(hTv7WIH6Chs#)yuDX8#d304 zs_J~x%Sl9~FJ~#gEav;*X;K1eqQEu2fGI-g5`48{r$4jdsqfos1i2$Q%r@=_QEt7d ze5R#$5tw0*^vc4E%XRm<2T0jt@?NXmd>7wy-BFURiC=J#=6QK&vfls4Ul2R8vY%y` zRti}&u^IyJfM+p)cl_ntKLz2d?-~D&jZbHmp+NIItu$B zk;%pfaaZ@8#5OSiR_)%gpsko`7DV{ied}XW_YdE#8BZeE*CKMvI~08J(8%#}X$SF} zu_3~Sk>;nXkJQiCznskB#wyS-exsm=M7N^C$OJ301WT?TDBLR(-bd&HcXp`4 zajzlGg%XR=5@crrdbE^flnP;MotZM?b#6~%TGH0Gil(()!pi^_LW;uQDd+};k#L$6 zt_NhjD8vye&vX7{Zkyfv9vsLrSjDcU>Xeqtld04*FYQrnfeJKbZWs6v%mb`54x{$w z&JTfl)e;m5cj{5JNM?@u9i*-hZpeBOj})Y2`}>peU%ppMYjV>SkpSW5gs(hj`!&n= z&Wl3d9VvSPz`Fr|3+|n7%Y&}k$ox0^{cn^%4ZxBJ9Q!Tw|LeMG9mxHNZp@7nj<)gD zsNvovTgrhAF(e+P+f^@P{o`1V|My`RRU#_+uqHPxCuNi*g0v*e5<(gL%o(IZR69sd zRrBumYQ@#&954OwFA=DbIyK(U;`a+Zo^F1)=V$rnjs@pK3#$+G-4)vu6lZ}uj<Q(#n{bPfo*G+P8H&W_ z5#!Vfl` z=L+aGoyPtgPrc!W-AcHy%kT{P#T9}D<&vnvSL#wlSxSB{rw>{e?yf7FZ%%eYu-aXCMMUnXxyQ8f0_6=!~Wpo_ciT?#?WB;t$JQt|96c`@U#8 z;lyEZ8;n*v)K-q{RXXl(u#ed;oN?y+_1|qQ&)uz(s38|~Pp7TGOLmkj@a{i1idfr1 zYg&Gml<7`S!d^ycG@+r<+*c5q$2pK_PAGypj2UV=UgNnFx?J|2-5{?;m+%^^>Gf39e=U6H8j&G!agDFUS)3`({nOv$DOBxE12H7 zs^0EbFfvfCj?wbxcSuc#)V2}B>gJh7c7KdV|Bqo43)-zhgcO$jtaDgX{*T1jt&xLg zvefgJItZmRAU5--DG+o>dT^H>}ELIX=NuKXNta8M8;fRD?D^8 zu12-oBS@b1M%Jz+C4ozPc3{s~%As#veOh!b09x+Npm3_FC5d!pEAFP-U;Zw3$^F4ndT3<2om_RPG7qRp`2;vZl`Sb z&t!|W8p^I$-$i+l(P0|%T`Ys1VW1R-1Sm?Q%;S~8ooZ&_kKc9sFETjc$Z7ne2grT+ zOxN3sG6bkul+hS)Mhbn>;o5Z)=h7ofkrM=O4X(4WwjtMrDtDy_N#ATFN=J@ZPP1Ap ztuG&%^!KC-j{Pr`EKqhFpTA!XzM*hG6hz{eTr0h-2(IH-*7Q$+48lB)39=+R{$X+T%l;rrGgob(Y);rY!`YZc%A 
zXu06W)iy*H>A-{ukGTHgqluZmx!}Z-s>jy(?+*n?5$l*$%}nkaCc)4gEodA|j+Hc6 zq%{^P(!=nCRw!X~WK2pM>w1}Jn*S;w_2y3hh-ox^Bi-FG8;VG8Rc?GgO=_D!T!0B* zioU0*XjHHY%H}&2Q-Y zN^NmbBi$9kcuMTd{)+5asl=~97!R`#t2Iqc)1JIf?*8#>hXX*twZ>URZ^86V8}1eN zQ?I;1?FnDUgzM$wc!?-6bT0GjMEn3Jw~Ky9_B$H@I^fs8&TbStW84&*VAw=(!}qib zzJ#!RI~2)p#W+mzWw%JbXFB}qjy5+3Z4)CwC-|jH_OPnUSkJWGaen;<@Ch#v6uyCagv8XK<}!+^ta|Ucg*-8Au1s zOGJ^sSbVomD1SY+-|#CySbIbtg&;CCDUSwQ=DAh{2>0~&Zun4q#bf%IVbIBSj{VW9 zz=l(@vcqn^U=`2EWF73=XN_rE(jYm_ZITu|#Df_IE6axwNf-EtLhw({aBpFx zN|qC9xC5k2ebsNf!t}tbT&#zgnGnzB3GUjKagN<74eH|=46#N=SKBl5lbkNPTzRZ8 ze)++9!!V8}bSoH{FKeQ2iF;reafG6$1zcm{yhzqpvY0EPtcsheNzK_%D|%O%qrnrW zjjG>)gsTYxkG_DTUkko;kZI%3QKf%}7}brWI1W1ScujI~HJOzi30QqtXB;z3oz|G4 z>&yWhQu68%2yV*nP?`Sidp=&%&b3$D5N=cn$%oiipoFWM2MIZxjlbVPCk2JKD#Y(V zdrQ{ePs(^_wnO&n{l3kr-XTa~$6r6u>CZ`Np@dEAf?xD@obCA?MCI;E1YNV9A74yf zm_*ip0+X`#Kv1nUzw6nDcXm&Z_59&hZqZi5>q01jwUF|sI1ZxF%vT5eMnbQz3uc0x zfB`>(6kMp#AgPc5$5)|zL%fR8xd3h4EaHUGkRz6kb(<=EK0sgG(TYij!R{qf<(=HJ zP9oep;(vSVSpUmh*C7h&!Jq&F0QUGPQ?$a6|82DYKZZP1*YN8eAFlt==Z*1tsl-4q z#Ub!qh&`;1v8^oFL#JkLj5A8IHk-yyux!fq{l!QSIZ3_%W}$L|p|SECU3|6LF`*Q7V;M_V4UG$J+gN{K~|MvQZH;24DO zrf`zkiH2Pu(v4CTbEK3_?Z9mQ5e>=(hP^p#w=omHI1~3VJTIMqq%CWEFb@6~Z3CnM`zMx-_q=aw%cig&KONZ(8* z22B<}h|g(L>q_EA9WdDw{+Z(ONollww$q%?ah5^uo^$Xvl}>@5yEP@YK2Y#=5iQv8 z?{R?3ZljRpZE3&L{h=^)ckYEQ0@dQ!Y!$<5&O>=?8%~8?`TIsqiA}V>^}>$LKoEI{ zg9p-tNIq^%&$Mhhs9Jvh+*=mAaA5v2$s24Ua=phjC^_*QM3W4BZ9`%Dnoh=VeIRLR z8`JawQIF4#GOKV6cZJkK(nJQt790#Cwh&e{-AOo-lAyYL;3$xvxSrFUBkLmESulk` z0wNg3+3SQ(C><=eX939Ir4jx<2AGh%qS}{0t}|l`E-nISKQC0gjK%<_b*U?@Y{SXE z0_jCZxtFQrYH218VrY%&vfySpdMHp4l)>Wa$!cUe2{)D^!AZ9gZ-f1{o=uz#(=f;G z_Wim`4VmTTll{>#zl+88^M^;)(g{q{%(T_dwXR3?-Lfwn2&KK{nVSBHb?hqxUb$Mka2VZj1v5rv5JmWA`@s2VqV>!qS^ zKCTgpF4bI*?YW&H!*eT~U6!e~xx^mGj4>0g58W;C1-qE^uO+Qm)nPbr**qKL< zmFnL}jag(XW0Aw`ez~qzl;VcTo|pr{^?v#WPrj?wTzI(UWeMPy5Z~r2eqFx&io?AI z%X$u`QK0~1{G#}VC7H@CTiHP^<@Tax$78K*xia!C`w90EnMkk${a+X*%6(hkS*1hy zxKnZc&jDCm*5_D^1zC#YD*aGMn6MGWp90(aTI5eZr*kL=nj+9Ph?|4=Zz%ioq_Z`} 
zSC|4mPni!~Zj6{{v>3)c+@tWY+u-K>wfl zWy%n ```{r, setup, include=FALSE} knitr::opts_knit$set(root.dir = params$work_dir) library(knitr) ``` -```{r libary-loading, message = params$verbose, warning = params$verbose} +```{r libary-loading} # allow more flexibility in download time # useful for slower connections where the default of 60 seconds might be exceeded options(timeout=600) -# Import libraries (tximport, DESeq2, tidyverse, Risa) +# Import libraries (tximport, DESeq2, tidyverse) library(tximport) library(DESeq2) library(stringr) params +SUMMARY_FILE_PATH <- params$SUMMARY_FILE_PATH yaml::write_yaml(params, "last_params.yml") -``` -```{r validate_params} -# assert either runsheet_path OR isa_path supplied in params -if (!xor(!is.null(params$runsheet_path), !is.null(params$isa_path))) { - stop("Must supply EITHER runsheet_path or isa_path in params") -} +# END:NON_DPPD + +# START:ONLY_DPPD +# params <- c( +# runsheet_path = "/path/to/runsheet", # Used for downloading +# input_gene_results_dir = "/path/to/genes_results_files", # Location of the gene results files +# primary_keytype = "", # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR) +# normalization = "", # ENUM like, supports "ERCC-groupB" and "default" +# normalized_counts_output_prefix = "", # Output prefix for normalized counts files +# dge_output_prefix = "" # Output prefix for DGE files +# ) +# END:ONLY_DPPD ``` ### 2. Load Study Metadata -```{r runsheet-to-compare_df, include=(!is.null(params$runsheet_path)), eval=(!is.null(params$runsheet_path))} +```{r runsheet-to-compare_df} +#' Calculate the square of a number +#' +#' This function takes a numeric input and returns its square. +#' +#' @param x Numeric value to be squared. +#' +#' @return The square of the input value. 
+#' +#' @examples +#' square(2) +#' # Output: 4 +#' +#' square(-3) +#' # Output: 9 +#' compare_csv_from_runsheet <- function(runsheet_path) { df = read.csv(runsheet_path) # get only Factor Value columns @@ -64,25 +85,6 @@ compare_csv <- compare_csv_from_runsheet(params$runsheet_path) #DT::datatable(compare_csv, caption = "Data Frame of parsed runsheet filtered to required columns") ``` -```{r isa-to-compare_df, include=(!is.null(params$isa_path)), eval=(!is.null(params$isa_path))} -# TODO: Remove this route, ISA zip support will be dropped as of DPPD-7101-F -library(Risa) - -compare_csv_from_isa_archive <- function(isa_path) { - td = tempdir() - unzip(isa_path, exdir = td) - isa <- Risa::readISAtab(path = td) - n = as.numeric(which(isa@assay.technology.types == "RNA Sequencing (RNA-Seq)")) - isa_tabs <- isa@assay.tabs[[n]]@assay.file - factors <- as.data.frame(isa@factors[[1]], stringsAsFactors = FALSE) - colnames(factors) <- paste("factor",1:dim(factors)[2], sep = "_") - return(data.frame(sample_id = isa_tabs$`Sample Name`, factors)) -} -# Loading metadata from isa archive -compare_csv <- compare_csv_from_isa_archive(params$isa_path) -#DT::datatable(compare_csv, caption = "Data Frame of parsed isa archive filtered to required metadata") -``` - ```{r compare_df-to-study_df} study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]] @@ -130,8 +132,7 @@ files <- list.files( ## Reorder the *genes.results files to match the ordering of the ISA samples -# Replace spaces in sample names from ISA with "_", consistent with runsheet generation -samples = str_replace_all(rownames(study), " ", "_") +samples = rownames(study) reordering <- sapply(samples, function(x)grep(paste0("Rsem_gene_counts/", x,".genes.results$"), files, value=FALSE)) files <- files[reordering] names(files) <- samples @@ -335,12 +336,12 @@ output_table_1$LRT.p.value <- res_1_lrt@listData$padj ```{r wald-test-iteration} ## Iterate through 
Wald Tests to generate pairwise comparisons of all groups for (i in 1:dim(contrasts)[2]){ - res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i])) - res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)] - colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i])) - output_table_1<-cbind(output_table_1,res_1) - rm(res_1) + res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i])) + res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)] + colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i])) + output_table_1<-cbind(output_table_1,res_1) } + ``` ```{r} @@ -385,6 +386,16 @@ write.csv( sampleTable, file = paste0(params$dge_output_prefix, "SampleTable.csv") ) + +# Create summary file based on output_table_1 +output <- capture.output(summary(output_table_1)) + +# Open file connection +conn <- file(paste0(params$dge_output_prefix, "summary.txt"), "w") + +# Write the captured output to the file +writeLines(output, conn) + # DT::datatable(head(output_table_1, n = 30), # caption = "First 30 rows of differential gene expression table", # extensions = "FixedColumns", diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R index fedf44d6..1d5965e0 100755 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R @@ -4,10 +4,6 @@ library("here") library("cli") parser <- OptionParser() -parser <- add_option(parser, c("-v", "--verbose"), - action 
= "store_true", - default = FALSE, help = "Print extra output [default]" -) parser <- add_option(parser, c("--skip_perform_dge"), action = "store_true", default = FALSE, help = "Skips running the DGE, this can be used when the output from the DGE already exist", @@ -43,9 +39,6 @@ parser <- add_option(parser, c("--DEBUG_MODE_ADD_DUMMY_COUNTS"), default = FALSE, action = "store_true", help = "Replaces all gene counts with random values from 0 to 5000", ) -parser <- add_option(parser, c("--isa_path"), - help = "ISA Archive path, one of two allowed metadata inputs, exactly one metadata input must be supplied", -) parser <- add_option(parser, c("--runsheet_path"), help = "runsheet csv path, one of two allowed metadata inputs, exactly one metadata input must be supplied", ) @@ -69,14 +62,11 @@ if (!args$skip_perform_dge) { cli_alert_warning("Running Perform_DGE.Rmd") rmarkdown::render(here("dge_annotation_R_scripts", "Perform_DGE.Rmd"), output_dir = args$work_dir, - quiet = !args$verbose, params = list( work_dir = args$work_dir, - verbose = args$verbose, input_gene_results_dir = args$input_gene_results_dir, primary_keytype = args$primary_keytype, runsheet_path = args$runsheet_path, - isa_path = args$isa_path, normalization = args$normalization, dge_output_prefix = args$dge_output_prefix, normalized_counts_output_prefix = args$normalized_counts_output_prefix, @@ -93,7 +83,6 @@ if (!args$skip_gene_annotation) { cli_alert_warning("Running Add_Gene_Annotations.Rmd") rmarkdown::render(here("dge_annotation_R_scripts", "Add_Gene_Annotations.Rmd"), output_dir = args$work_dir, - quiet = !args$verbose, params = list( input_table_path = paste0(args$dge_output_prefix, "differential_expression_no_annotations.csv"), work_dir = args$work_dir, @@ -111,7 +100,6 @@ if (!args$skip_gene_annotation) { cli_alert_warning("Running Extend_DGE_Table.Rmd") rmarkdown::render(here("dge_annotation_R_scripts", "Extend_DGE_Table.Rmd"), output_dir = args$work_dir, - quiet = !args$verbose, params = list( 
input_table_path = paste0(args$dge_output_prefix, "differential_expression.csv"), work_dir = args$work_dir, @@ -128,7 +116,6 @@ if (!args$skip_gene_annotation) { cli_alert_warning("Running Generate_PCA_Table.Rmd") rmarkdown::render(here("dge_annotation_R_scripts", "Generate_PCA_Table.Rmd"), output_dir = args$work_dir, - quiet = !args$verbose, params = list( input_table_path = paste0(args$normalized_counts_output_prefix, "Normalized_Counts.csv"), work_dir = args$work_dir, diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf index 638b8922..84118d9b 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf @@ -28,6 +28,9 @@ process DGE_BY_DESEQ2 { path("dge_output_ercc/visualization_output_table_ERCCnorm.csv"), path("dge_output_ercc/visualization_PCA_table_ERCCnorm.csv"), optional: true, emit: dge_ercc + path("dge_output/summary.txt"), emit: summary + path("dge_output_ercc/ERCCnorm_summary.txt"), optional: true, emit: summary_ercc + path("versions.txt"), emit: version script: diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index e5966972..5c068fcc 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -17,7 +17,7 @@ nextflow_process { // define inputs of the process here. 
Example: input[0] = file("test-datasets/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv") input[1] = file("test-datasets/testdata/GLDS-194/03-RSEM_Counts/*.genes.results") - input[2] = [ primary_keytype:'ENSEMBL' ] + input[2] = [ primary_keytype:'ENSEMBL', has_ercc:true ] input[3] = file("https://figshare.com/ndownloader/files/36597114") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") """ @@ -27,12 +27,11 @@ nextflow_process { then { assert process.success assert snapshot( - process.out.dge, + process.out.summary, process.out.norm_counts, - process.out.dge_ercc, + process.out.summary_ercc, process.out.norm_counts_ercc, - process.out.version, - ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated! + process.out.version ).match() } diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap index 5bf55599..24de1f5e 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap @@ -2,13 +2,7 @@ "Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION": { "content": [ [ - [ - "contrasts.csv:md5,66d74b686885ffd4eccdf55823c0e3ce", - "SampleTable.csv:md5,bfd18bbc7d34e41c23f0c9107f5d75c9", - "differential_expression.csv:md5,00cf45e546529c81c0a43ae1b8495a59", - "visualization_output_table.csv:md5,58578caedc33e6a0230ba80abe61f0d5", - "visualization_PCA_table.csv:md5,5c461d35b12d5946c2105f705a03c6d3" - ] + "summary.txt:md5,6c202fd3c11a747e40a49d1369e8875f" ], [ [ @@ -17,20 +11,16 @@ ] ], [ - + "ERCCnorm_summary.txt:md5,1f77ed6cd1a8435038859b0361f6b047" ], [ - + "ERCC_Normalized_Counts.csv:md5,b1d9d5a546a23b6709a9f8c60548b6a7" ], [ - "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" - ], - { - 
"Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } + "versions.txt:md5,1865b6b8900d83ec7881f58fe301da11" + ] ], - "timestamp": "2023-06-15T17:38:07+0000" + "timestamp": "2023-07-11T21:35:03+0000" }, "ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION": { "content": [ @@ -57,7 +47,7 @@ "dp_tools_version": "1.1.8" } ], - "timestamp": "2023-06-15T17:38:07+0000" + "timestamp": "2023-07-11T21:35:02+0000" }, "Baseline_ON_GLDS-194 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION": { "content": [ @@ -90,7 +80,7 @@ "dp_tools_version": "1.1.8" } ], - "timestamp": "2023-06-15T17:38:07+0000" + "timestamp": "2023-07-11T21:35:02+0000" }, "GLDS-423 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION": { "content": [ @@ -123,7 +113,7 @@ "dp_tools_version": "1.1.8" } ], - "timestamp": "2023-06-15T17:38:07+0000" + "timestamp": "2023-07-11T21:35:02+0000" }, "ISSUE_55_ON_Mock:Should run without failures": { "content": [ @@ -150,6 +140,6 @@ "dp_tools_version": "1.1.8" } ], - "timestamp": "2023-06-15T17:38:07+0000" + "timestamp": "2023-07-11T21:35:02+0000" } } \ No newline at end of file From ac5dbcebfd7a76379f8ed4c2bca3b5bcacf54dbf Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 21:42:10 +0000 Subject: [PATCH 39/58] ci[fix]: update tap location --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 13cd88f8..90409875 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -94,4 +94,4 @@ jobs: - uses: pcolby/tap-summary@v1 with: path: >- - test.tap \ No newline at end of file + ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test.tap \ No newline at end of file From 16ecc58a7ce040cb89d8eb8cbff6298be83207d5 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 21:45:54 +0000 Subject: [PATCH 40/58] 
ci: bump to trigger tests --- .../NF_RCP-F/workflow_code/modules/dge/dge.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf index 84118d9b..1bc7017d 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf @@ -64,5 +64,6 @@ process DGE_BY_DESEQ2 { --extended_table_output_prefix "dge_output_ercc/"\\ --extended_table_output_suffix "_ERCCnorm.csv" fi + """ } From 5e8416eb019420e712067fb88382bbe1057c7984 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 23:03:38 +0000 Subject: [PATCH 41/58] ci: update outputs format; update tag --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 90409875..a6a1122f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,7 +25,7 @@ jobs: filters: "RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml" test: - name: ${{ matrix.tags }} ${{ matrix.profile }} + name: ${{ matrix.tags }} ${{ matrix.profile }} ${{ matrix.NXF_VER }} runs-on: ubuntu-latest needs: changes if: needs.changes.outputs.modules != '[]' @@ -59,7 +59,7 @@ jobs: run: | curl -fsSL https://code.askimed.com/install/nf-test | bash chmod u+x nf-test - echo "::set-output name=bin_path::$(pwd)/nf-test" + echo "bin_path=$(pwd)/nf-test" >> $GITHUB_OUTPUT - name: Hash Github Workspace From 35c98236f5b674e719ba2dc686ae64a4266cee84 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 23:05:12 +0000 Subject: [PATCH 42/58] test: update dge tests --- .../tests/modules/dge/dge.nf.test | 49 +------ .../tests/modules/dge/dge.nf.test.snap | 125 ++---------------- 2 files changed, 18 insertions(+), 156 deletions(-) diff --git 
a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test index 5c068fcc..0954b609 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test @@ -4,7 +4,7 @@ nextflow_process { script "modules/dge/dge.nf" process "DGE_BY_DESEQ2" - test("Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION") { + test("GLDS-194") { tag 'dge' when { @@ -37,40 +37,8 @@ nextflow_process { } - test("GLDS-423 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION") { - - when { - params { - // define parameters here. Example: - use_dummy_gene_counts = true - } - process { - """ - // define inputs of the process here. Example: - input[0] = file("testdata/GLDS-423/Metadata/GLDS-423_bulkRNASeq_vTECHREPMOCK_runsheet.csv") - input[1] = file("testdata/GLDS-423/03-RSEM_Counts/*.genes.results") - input[2] = [ primary_keytype:'ENSEMBL' ] - input[3] = file("https://figshare.com/ndownloader/files/36597114") - input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") - """ - } - } - - then { - assert process.success - assert snapshot( - process.out.dge, - process.out.norm_counts, - process.out.dge_ercc, - process.out.norm_counts_ercc, - process.out.version, - ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated! - ).match() - } - - } - - test("ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION") { + test("GLDS-321:55_.ISSUE") { + tag 'dge' when { params { @@ -80,9 +48,9 @@ nextflow_process { process { """ // define inputs of the process here. 
Example: - input[0] = file("testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") - input[1] = file("testdata/GLDS-321/03-RSEM_Counts/*.genes.results") - input[2] = [ primary_keytype:'TAIR' ] + input[0] = file("test-datasets/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv") + input[1] = file("test-datasets/testdata/GLDS-321/03-RSEM_Counts/*.genes.results") + input[2] = [ primary_keytype:'TAIR', has_ercc:false ] input[3] = file("https://figshare.com/ndownloader/files/36597132") input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip") """ @@ -92,12 +60,9 @@ nextflow_process { then { assert process.success assert snapshot( - process.out.dge, + process.out.summary, process.out.norm_counts, - // NON_ERCC process.out.dge_ercc, - // NON_ERCC process.out.norm_counts_ercc, process.out.version, - ['Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints' : true, 'dp_tools_version' : '1.1.8'] // MANUALLY Validated! ).match() } diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap index 24de1f5e..8223ca2a 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap @@ -1,37 +1,8 @@ { - "Baseline_ON_GLDS-194:Should run without failures AND PASS VV VALIDATION": { + "GLDS-321:55_.ISSUE": { "content": [ [ - "summary.txt:md5,6c202fd3c11a747e40a49d1369e8875f" - ], - [ - [ - "Normalized_Counts.csv:md5,b4ba348d5446f8ba546a46b966087c1b", - "RSEM_Unnormalized_Counts.csv:md5,931c6070b5e19909929c5a217713500b" - ] - ], - [ - "ERCCnorm_summary.txt:md5,1f77ed6cd1a8435038859b0361f6b047" - ], - [ - "ERCC_Normalized_Counts.csv:md5,b1d9d5a546a23b6709a9f8c60548b6a7" - ], - [ - "versions.txt:md5,1865b6b8900d83ec7881f58fe301da11" - ] - ], - "timestamp": "2023-07-11T21:35:03+0000" - }, - 
"ISSUE_55_ON_GLDS-321:Should run without failures AND PASS VV VALIDATION": { - "content": [ - [ - [ - "contrasts.csv:md5,1b63fd558a509d430609c5d824f7c090", - "SampleTable.csv:md5,308d0c4af88b557ef857f3be8eb2a339", - "differential_expression.csv:md5,de5fe517a08851b0a4817c459ce95d8a", - "visualization_output_table.csv:md5,7f72442d40a030d50211db3ab6885dfb", - "visualization_PCA_table.csv:md5,c19f946356e520bd9bf68606d639f21c" - ] + "summary.txt:md5,2ae67caf20a32f00b87e0f340a4c505b" ], [ [ @@ -41,24 +12,14 @@ ], [ "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } + ] ], - "timestamp": "2023-07-11T21:35:02+0000" + "timestamp": "2023-07-11T22:30:32+0000" }, - "Baseline_ON_GLDS-194 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION": { + "GLDS-194": { "content": [ [ - [ - "contrasts.csv:md5,66d74b686885ffd4eccdf55823c0e3ce", - "SampleTable.csv:md5,bfd18bbc7d34e41c23f0c9107f5d75c9", - "differential_expression.csv:md5,00cf45e546529c81c0a43ae1b8495a59", - "visualization_output_table.csv:md5,58578caedc33e6a0230ba80abe61f0d5", - "visualization_PCA_table.csv:md5,5c461d35b12d5946c2105f705a03c6d3" - ] + "summary.txt:md5,6c202fd3c11a747e40a49d1369e8875f" ], [ [ @@ -67,79 +28,15 @@ ] ], [ - - ], - [ - - ], - [ - "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } - ], - "timestamp": "2023-07-11T21:35:02+0000" - }, - "GLDS-423 TECH REP VERSION:Should run without failures AND PASS VV VALIDATION": { - "content": [ - [ - [ - "contrasts.csv:md5,6ec6d3af26f8d482b64112b88d4e8417", - "SampleTable.csv:md5,e007d8494abcd37c46c012fabe91d2e7", - "differential_expression.csv:md5,76fd2a34d35cd3110198a0b22b381846", - "visualization_output_table.csv:md5,0427f37905a2b55a595bfbeefddff1ac", - 
"visualization_PCA_table.csv:md5,24e62c90c57550de57bac0b35c3a0121" - ] - ], - [ - [ - "Normalized_Counts.csv:md5,0ce1b39d28e7cce080e0661c56ed5f76", - "RSEM_Unnormalized_Counts.csv:md5,5e3ee6c36e6ebefa1bd947c2fa586b99" - ] - ], - [ - - ], - [ - - ], - [ - "versions.txt:md5,5fac4f3186014a43b8aa3b41d66b2311" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } - ], - "timestamp": "2023-07-11T21:35:02+0000" - }, - "ISSUE_55_ON_Mock:Should run without failures": { - "content": [ - [ - [ - "contrasts.csv:md5,b99c88e9c92f1d1588727df47523c4ad", - "SampleTable.csv:md5,229c988f09fbfeca182da7011f6f93b4", - "differential_expression.csv:md5,f58b6f602598a0c25379afd0c5e87a71", - "visualization_output_table.csv:md5,d056472d2ac135cad9ee4d9f33bde387", - "visualization_PCA_table.csv:md5,1293b99878d7a7eb0e02dc6a38e33d39" - ] + "ERCCnorm_summary.txt:md5,1f77ed6cd1a8435038859b0361f6b047" ], [ - [ - "Normalized_Counts.csv:md5,393160aee08165165ccd2b8579a45161", - "RSEM_Unnormalized_Counts.csv:md5,6759e0e7ec07960691d3913b3877c129" - ] + "ERCC_Normalized_Counts.csv:md5,b1d9d5a546a23b6709a9f8c60548b6a7" ], [ - "versions.txt:md5,6e364ecf476a7729d5edd52335fb074a" - ], - { - "Passes bulkRNASeq.checks.check_dge_table_group_columns_constraints": true, - "dp_tools_version": "1.1.8" - } + "versions.txt:md5,1865b6b8900d83ec7881f58fe301da11" + ] ], - "timestamp": "2023-07-11T21:35:02+0000" + "timestamp": "2023-07-11T22:30:32+0000" } } \ No newline at end of file From 802f9b1701315a286155a6d8ccc7d043fcd28e3d Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 23:18:27 +0000 Subject: [PATCH 43/58] feat: git ignore *.pyc and .nextflow folders --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..c05e0a71 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.nextflow +*.pyc \ No newline at end of file From 
abc9f7e24a584ed1fdcabc58dfe6f8284f9b990a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 23:18:53 +0000 Subject: [PATCH 44/58] feat: add gitpod yaml --- .gitpod.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .gitpod.yml diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 00000000..cd0f7c9f --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,13 @@ +image: nfcore/gitpod:latest + +vscode: + extensions: + - ms-python.python + - eamodio.gitlens + - GitHub.copilot + - REditorSupport.r + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - mechatroner.rainbow-csv # Highlight columns in csv files in different colors + - nextflow.nextflow # Nextflow syntax highlighting + - oderwat.indent-rainbow # Highlight indentation level + - streetsidesoftware.code-spell-checker # Spelling checker for source code From af0b7160eb3cc537680e3d73fe9173b7d92796b9 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 23:43:37 +0000 Subject: [PATCH 45/58] feat: remove deprecated conda support files --- .../config/software/by_conda_yml.config | 60 --- .../NF_RCP-F/workflow_code/envs/AST.yml | 13 - .../workflow_code/envs/RNAseq_Rtools.yml | 378 ------------------ .../NF_RCP-F/workflow_code/envs/VV.yml | 13 - .../workflow_code/envs/download_tools.yml | 10 - .../NF_RCP-F/workflow_code/envs/dp_tools.yml | 16 - .../workflow_code/envs/ercc_analysis.yml | 22 - .../NF_RCP-F/workflow_code/envs/fastqc.yml | 5 - .../workflow_code/envs/genelab_utils.yml | 10 - .../NF_RCP-F/workflow_code/envs/isatools.yml | 6 - .../NF_RCP-F/workflow_code/envs/main.yml | 8 - .../NF_RCP-F/workflow_code/envs/multiqc.yml | 6 - .../NF_RCP-F/workflow_code/envs/python.yml | 7 - .../NF_RCP-F/workflow_code/envs/r_deseq2.yml | 8 - .../workflow_code/envs/rnaseq_v1.0_modify.yml | 322 --------------- .../NF_RCP-F/workflow_code/envs/rsem.yml | 7 - .../NF_RCP-F/workflow_code/envs/samtools.yml | 5 - 
.../workflow_code/envs/samtools_rseqc.yml | 10 - .../NF_RCP-F/workflow_code/envs/star.yml | 6 - .../workflow_code/envs/trim_galore.yml | 5 - .../workflow_code/envs/ucsc_gtf_Pred_BED.yml | 10 - 21 files changed, 927 deletions(-) delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml delete mode 100644 
RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml delete mode 100644 RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config deleted file mode 100644 index faca1b05..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/config/software/by_conda_yml.config +++ /dev/null @@ -1,60 +0,0 @@ -// Config that specifies packaged conda yml files for each process - -process { - withName: 'TO_PRED|TO_BED' { - conda = "${projectDir}/envs/ucsc_gtf_Pred_BED.yml" - } - - withName: 'INFER_EXPERIMENT|SORT_INDEX_BAM|GENEBODY_COVERAGE|INNER_DISTANCE|READ_DISTRIBUTION' { - conda = "${projectDir}/envs/samtools_rseqc.yml" - } - - withName: 'DGE_BY_DESEQ2|QUANTIFY_GENES' { - conda = "${projectDir}/envs/RNAseq_Rtools.yml" - } - - withName: 'FASTQC' { - conda = "${projectDir}/envs/fastqc.yml" - } - - withName: 'MULTIQC' { - conda = "${projectDir}/envs/multiqc.yml" - } - - withName: 'TRIMGALORE' { - conda = "${projectDir}/envs/trim_galore.yml" - } - - withName: 'DOWNLOAD_GENOME_ANNOTATIONS|GENERATE_METASHEET' { - conda = "${projectDir}/envs/download_tools.yml" - } - - withName: 'RNASEQ_RUNSHEET_FROM_GLDS' { - conda = "${projectDir}/envs/dp_tools.yml" - } - - withName: 'BUILD_STAR|ALIGN_STAR' { - conda = "${projectDir}/envs/star.yml" - } - - withName: 'BUILD_RSEM|COUNT_ALIGNED' { - conda = "${projectDir}/envs/rsem.yml" - } - - withName: 'SUBSAMPLE_GENOME' { - conda = "${projectDir}/envs/samtools.yml" - } - - withName: 'POST_PROCESSING|SOFTWARE_VERSIONS' { - conda = "${projectDir}/envs/genelab_utils.yml" - } - - withLabel: 'VV' { - conda = "${projectDir}/envs/dp_tools.yml" - } - - withName: 'GET_MAX_READ_LENGTH|ASSESS_STRANDEDNESS' { - conda = "${projectDir}/envs/python.yml" - } - -} diff --git 
a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml deleted file mode 100644 index 5b4107da..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/AST.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: AST -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python=3.8 - - pandas=1.2 - - isatools - - peppy - - pip - - pip: - - git+https://github.com/J-81/Analysis_Staging.git@0.4.0-beta.7 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml deleted file mode 100644 index dcdf990a..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/RNAseq_Rtools.yml +++ /dev/null @@ -1,378 +0,0 @@ -name: RNAseq_Rtools -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=1_gnu - - _r-mutex=1.0.1=anacondar_1 - - binutils_impl_linux-64=2.36.1=h193b22a_2 - - binutils_linux-64=2.36=hf3e587d_6 - - bioconductor-affy=1.72.0=r41h5c21468_1 - - bioconductor-affyio=1.64.0=r41h5c21468_1 - - bioconductor-annotate=1.72.0=r41hdfd78af_0 - - bioconductor-annotationdbi=1.56.1=r41hdfd78af_0 - - bioconductor-annotationhub=3.2.0=r41hdfd78af_0 - - bioconductor-biobase=2.54.0=r41h5c21468_1 - - bioconductor-biocfilecache=2.2.0=r41hdfd78af_0 - - bioconductor-biocgenerics=0.40.0=r41hdfd78af_0 - - bioconductor-biocparallel=1.28.3=r41h619a076_0 - - bioconductor-biocversion=3.14.0=r41hdfd78af_0 - - bioconductor-biocviews=1.62.0=r41hdfd78af_0 - - bioconductor-biostrings=2.62.0=r41h5c21468_1 - - bioconductor-complexheatmap=2.10.0=r41hdfd78af_0 - - bioconductor-delayedarray=0.20.0=r41h5c21468_1 - - bioconductor-deseq2=1.34.0=r41h619a076_1 - - bioconductor-enhancedvolcano=1.12.0=r41hdfd78af_0 - - bioconductor-genefilter=1.76.0=r41ha086028_1 - - bioconductor-geneplotter=1.72.0=r41hdfd78af_0 - - 
bioconductor-genomeinfodb=1.30.0=r41hdfd78af_0 - - bioconductor-genomeinfodbdata=1.2.7=r41hdfd78af_1 - - bioconductor-genomicranges=1.46.1=r41h5c21468_0 - - bioconductor-graph=1.72.0=r41h5c21468_1 - - bioconductor-impute=1.68.0=r41h77f299f_1 - - bioconductor-interactivedisplaybase=1.32.0=r41hdfd78af_0 - - bioconductor-iranges=2.28.0=r41h5c21468_1 - - bioconductor-keggrest=1.34.0=r41hdfd78af_0 - - bioconductor-limma=3.50.1=r41h5c21468_0 - - bioconductor-massspecwavelet=1.60.0=r41h5c21468_1 - - bioconductor-matrixgenerics=1.6.0=r41hdfd78af_0 - - bioconductor-mscoreutils=1.6.1=r41h619a076_0 - - bioconductor-msfeatures=1.2.0=r41hdfd78af_0 - - bioconductor-msnbase=2.20.4=r41h619a076_0 - - bioconductor-mzid=1.32.0=r41hdfd78af_0 - - bioconductor-mzr=2.28.0=r41h619a076_1 - - bioconductor-org.at.tair.db=3.14.0=r41hdfd78af_0 - - bioconductor-org.ce.eg.db=3.14.0=r41hdfd78af_0 - - bioconductor-org.dm.eg.db=3.14.0=r41hdfd78af_0 - - bioconductor-org.eck12.eg.db=3.14.0=r41hdfd78af_0 - - bioconductor-org.hs.eg.db=3.14.0=r41hdfd78af_0 - - bioconductor-org.mm.eg.db=3.14.0=r41hdfd78af_0 - - bioconductor-org.sc.sgd.db=3.14.0=r41hdfd78af_0 - - bioconductor-panther.db=1.0.11=r41hdfd78af_1 - - bioconductor-pcamethods=1.86.0=r41h619a076_1 - - bioconductor-preprocesscore=1.56.0=r41h5c21468_1 - - bioconductor-protgenerics=1.26.0=r41hdfd78af_0 - - bioconductor-rbgl=1.70.0=r41h619a076_1 - - bioconductor-rhdf5lib=1.16.0=r41h5c21468_1 - - bioconductor-risa=1.36.0=r41h619a076_1 - - bioconductor-s4vectors=0.32.3=r41h5c21468_0 - - bioconductor-stringdb=2.6.0=r41hdfd78af_0 - - bioconductor-summarizedexperiment=1.24.0=r41hdfd78af_0 - - bioconductor-tximport=1.22.0=r41hdfd78af_0 - - bioconductor-vsn=3.62.0=r41h5c21468_1 - - bioconductor-xcms=3.16.1=r41h619a076_0 - - bioconductor-xvector=0.34.0=r41h5c21468_1 - - bioconductor-zlibbioc=1.40.0=r41h5c21468_1 - - bwidget=1.9.14=ha770c72_1 - - bzip2=1.0.8=h7f98852_4 - - c-ares=1.18.1=h7f98852_0 - - ca-certificates=2022.5.18.1=ha878542_0 - - 
cairo=1.16.0=ha00ac49_1009 - - curl=7.81.0=h494985f_0 - - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - - font-ttf-inconsolata=3.000=h77eed37_0 - - font-ttf-source-code-pro=2.038=h77eed37_0 - - font-ttf-ubuntu=0.83=hab24e00_0 - - fontconfig=2.13.96=ha180cfb_0 - - fonts-conda-ecosystem=1=0 - - fonts-conda-forge=1=0 - - freetype=2.10.4=h0708190_1 - - fribidi=1.0.10=h36c2ea0_0 - - gcc_impl_linux-64=9.4.0=h03d3576_13 - - gcc_linux-64=9.4.0=h391b98a_6 - - gettext=0.19.8.1=h73d1719_1008 - - gfortran_impl_linux-64=9.4.0=h0003116_13 - - gfortran_linux-64=9.4.0=hf0ab688_6 - - gmp=6.2.1=h58526e2_0 - - graphite2=1.3.13=h58526e2_1001 - - gsl=2.7=he838d99_0 - - gxx_impl_linux-64=9.4.0=h03d3576_13 - - gxx_linux-64=9.4.0=h0316aca_6 - - harfbuzz=3.4.0=hb4a5f5f_0 - - hdf4=4.2.15=h10796ff_3 - - hdf5=1.12.1=nompi_h7f166f4_103 - - icu=69.1=h9c3ff4c_0 - - jbig=2.1=h7f98852_2003 - - jpeg=9e=h7f98852_0 - - kernel-headers_linux-64=2.6.32=he073ed8_15 - - keyutils=1.6.1=h166bdaf_0 - - krb5=1.19.2=h08a2579_4 - - ld_impl_linux-64=2.36.1=hea4e1c9_2 - - lerc=3.0=h9c3ff4c_0 - - libblas=3.9.0=13_linux64_openblas - - libcblas=3.9.0=13_linux64_openblas - - libcurl=7.81.0=h494985f_0 - - libdeflate=1.10=h7f98852_0 - - libedit=3.1.20191231=he28a2e2_2 - - libev=4.33=h516909a_1 - - libffi=3.4.2=h7f98852_5 - - libgcc-devel_linux-64=9.4.0=hd854feb_13 - - libgcc-ng=11.2.0=h1d223b6_13 - - libgfortran-ng=11.2.0=h69a702a_13 - - libgfortran5=11.2.0=h5c6108e_13 - - libglib=2.70.2=h174f98d_4 - - libgomp=11.2.0=h1d223b6_13 - - libiconv=1.16=h516909a_0 - - liblapack=3.9.0=13_linux64_openblas - - libnetcdf=4.8.1=nompi_hb3fd0d9_101 - - libnghttp2=1.47.0=he49606f_0 - - libopenblas=0.3.18=pthreads_h8fe5266_0 - - libpng=1.6.37=h21135ba_2 - - libsanitizer=9.4.0=h79bfe98_13 - - libssh2=1.10.0=ha35d2d1_2 - - libstdcxx-devel_linux-64=9.4.0=hd854feb_13 - - libstdcxx-ng=11.2.0=he4da1e4_13 - - libtiff=4.3.0=h542a066_3 - - libuuid=2.32.1=h7f98852_1000 - - libwebp-base=1.2.2=h7f98852_1 - - libxcb=1.13=h7f98852_1004 - - 
libxml2=2.9.12=h885dcf4_1 - - libzip=1.8.0=h1c5bbd1_1 - - libzlib=1.2.11=h36c2ea0_1013 - - lz4-c=1.9.3=h9c3ff4c_1 - - make=4.3=hd18ef5c_1 - - ncurses=6.2=h58526e2_4 - - openssl=3.0.3=h166bdaf_0 - - pandoc=2.17.1.1=ha770c72_0 - - pango=1.50.3=h9967ed3_0 - - pcre=8.45=h9c3ff4c_0 - - pcre2=10.37=h032f7d1_0 - - pixman=0.40.0=h36c2ea0_0 - - proj=8.2.1=h277dcde_0 - - pthread-stubs=0.4=h36c2ea0_1001 - - r-ash=1.0_15=r41h859d828_1006 - - r-askpass=1.1=r41hcfec24a_2 - - r-assertthat=0.2.1=r41hc72bb7e_2 - - r-backports=1.4.1=r41hcfec24a_0 - - r-base=4.1.2=h2553ce4_1 - - r-base64enc=0.1_3=r41hcfec24a_1004 - - r-beeswarm=0.4.0=r41hcfec24a_1 - - r-bh=1.78.0_0=r41hc72bb7e_0 - - r-biocmanager=1.30.16=r41hc72bb7e_0 - - r-bit=4.0.4=r41hcfec24a_0 - - r-bit64=4.0.5=r41hcfec24a_0 - - r-bitops=1.0_7=r41hcfec24a_0 - - r-blob=1.2.2=r41hc72bb7e_0 - - r-brew=1.0_7=r41hc72bb7e_0 - - r-brio=1.1.3=r41hcfec24a_0 - - r-broom=0.7.12=r41hc72bb7e_0 - - r-bslib=0.3.1=r41hc72bb7e_0 - - r-cachem=1.0.6=r41hcfec24a_0 - - r-cairo=1.5_14=r41hcfec24a_0 - - r-callr=3.7.0=r41hc72bb7e_0 - - r-catools=1.18.2=r41h03ef668_0 - - r-cellranger=1.1.0=r41hc72bb7e_1004 - - r-chron=2.3_56=r41hcfec24a_0 - - r-circlize=0.4.14=r41hc72bb7e_0 - - r-cli=3.3.0=r41h7525677_0 - - r-clipr=0.8.0=r41hc72bb7e_0 - - r-clue=0.3_60=r41hcfec24a_0 - - r-cluster=2.1.2=r41h859d828_0 - - r-codetools=0.2_18=r41hc72bb7e_0 - - r-collections=0.3.5=r41hcfec24a_0 - - r-colorspace=2.0_3=r41h06615bd_0 - - r-commonmark=1.7=r41hcfec24a_1002 - - r-cpp11=0.4.2=r41hc72bb7e_0 - - r-crayon=1.5.0=r41hc72bb7e_0 - - r-crosstalk=1.2.0=r41hc72bb7e_0 - - r-curl=4.3.2=r41hcfec24a_0 - - r-cyclocomp=1.1.0=r41hc72bb7e_1004 - - r-data.table=1.14.2=r41hcfec24a_0 - - r-dbi=1.1.2=r41hc72bb7e_0 - - r-dbplyr=2.1.1=r41hc72bb7e_0 - - r-deoptimr=1.0_10=r41hc72bb7e_0 - - r-desc=1.4.0=r41hc72bb7e_0 - - r-diffobj=0.3.5=r41hcfec24a_0 - - r-digest=0.6.29=r41h03ef668_0 - - r-doparallel=1.0.17=r41hc72bb7e_0 - - r-dplyr=1.0.7=r41h03ef668_0 - - r-dt=0.21=r41hc72bb7e_0 - - 
r-dtplyr=1.2.1=r41hc72bb7e_0 - - r-ellipsis=0.3.2=r41hcfec24a_0 - - r-evaluate=0.15=r41hc72bb7e_0 - - r-extrafont=0.17=r41ha770c72_1002 - - r-extrafontdb=1.0=r41hc72bb7e_1003 - - r-fansi=1.0.2=r41hcfec24a_0 - - r-farver=2.1.0=r41h03ef668_0 - - r-fastmap=1.1.0=r41h03ef668_0 - - r-filelock=1.0.2=r41hcfec24a_1002 - - r-fontawesome=0.2.2=r41hc72bb7e_0 - - r-forcats=0.5.1=r41hc72bb7e_0 - - r-foreach=1.5.2=r41hc72bb7e_0 - - r-formatr=1.11=r41hc72bb7e_0 - - r-fs=1.5.2=r41h03ef668_0 - - r-futile.logger=1.4.3=r41hc72bb7e_1003 - - r-futile.options=1.0.1=r41hc72bb7e_1002 - - r-gargle=1.2.0=r41hc72bb7e_0 - - r-generics=0.1.2=r41hc72bb7e_0 - - r-getopt=1.20.3=r41ha770c72_2 - - r-getoptlong=1.0.5=r41hc72bb7e_0 - - r-ggalt=0.4.0=r41ha770c72_2 - - r-ggbeeswarm=0.6.0=r41ha770c72_1003 - - r-ggdendro=0.1.23=r41hc72bb7e_0 - - r-ggfortify=0.4.14=r41hc72bb7e_0 - - r-ggplot2=3.3.5=r41hc72bb7e_0 - - r-ggrastr=1.0.1=r41hc72bb7e_0 - - r-ggrepel=0.9.1=r41h03ef668_0 - - r-globaloptions=0.1.2=r41ha770c72_0 - - r-glue=1.6.2=r41h06615bd_0 - - r-googledrive=2.0.0=r41hc72bb7e_0 - - r-googlesheets4=1.0.0=r41h785f33e_0 - - r-gplots=3.1.1=r41hc72bb7e_0 - - r-gridextra=2.3=r41hc72bb7e_1003 - - r-gsubfn=0.7=r41hc72bb7e_1002 - - r-gtable=0.3.0=r41hc72bb7e_3 - - r-gtools=3.9.2=r41hcfec24a_0 - - r-hash=3.0.1=r41hc72bb7e_2 - - r-haven=2.4.3=r41h2713e49_0 - - r-here=1.0.1=r41hc72bb7e_0 - - r-hexbin=1.28.2=r41h8da6f51_0 - - r-highr=0.9=r41hc72bb7e_0 - - r-hms=1.1.1=r41hc72bb7e_0 - - r-htmltools=0.5.2=r41h03ef668_0 - - r-htmlwidgets=1.5.4=r41hc72bb7e_0 - - r-httpuv=1.6.5=r41h03ef668_0 - - r-httr=1.4.2=r41hc72bb7e_0 - - r-ids=1.0.1=r41hc72bb7e_1 - - r-igraph=1.2.11=r41he0372cf_0 - - r-isoband=0.2.5=r41h03ef668_0 - - r-iterators=1.0.14=r41hc72bb7e_0 - - r-jquerylib=0.1.4=r41hc72bb7e_0 - - r-jsonlite=1.8.0=r41h06615bd_0 - - r-kernsmooth=2.23_20=r41h742201e_0 - - r-knitr=1.37=r41hc72bb7e_1 - - r-labeling=0.4.2=r41hc72bb7e_1 - - r-lambda.r=1.2.4=r41hc72bb7e_1 - - r-languageserver=0.3.12=r41h06615bd_0 - - 
r-later=1.2.0=r41h03ef668_0 - - r-lattice=0.20_45=r41hcfec24a_0 - - r-lazyeval=0.2.2=r41hcfec24a_2 - - r-lifecycle=1.0.1=r41hc72bb7e_0 - - r-lintr=3.0.0=r41hc72bb7e_0 - - r-locfit=1.5_9.5=r41h06615bd_0 - - r-lubridate=1.8.0=r41h03ef668_0 - - r-magrittr=2.0.2=r41hcfec24a_0 - - r-maldiquant=1.21=r41h7f98852_0 - - r-maps=3.4.0=r41hcfec24a_0 - - r-mass=7.3_55=r41hcfec24a_0 - - r-matrix=1.4_0=r41he454529_0 - - r-matrixstats=0.61.0=r41hcfec24a_0 - - r-memoise=2.0.1=r41hc72bb7e_0 - - r-mgcv=1.8_39=r41h0154571_0 - - r-mime=0.12=r41hcfec24a_0 - - r-modelr=0.1.8=r41hc72bb7e_0 - - r-munsell=0.5.0=r41hc72bb7e_1004 - - r-ncdf4=1.19=r41h186726c_0 - - r-nlme=3.1_155=r41h859d828_0 - - r-openssl=2.0.0=r41h1f3e0c5_0 - - r-optparse=1.7.1=r41hc72bb7e_0 - - r-pillar=1.7.0=r41hc72bb7e_0 - - r-pkgconfig=2.0.3=r41hc72bb7e_1 - - r-pkgload=1.2.4=r41h03ef668_0 - - r-plogr=0.2.0=r41hc72bb7e_1003 - - r-plotly=4.10.0=r41hc72bb7e_0 - - r-plotrix=3.8_2=r41hc72bb7e_0 - - r-plyr=1.8.6=r41h03ef668_1 - - r-png=0.1_7=r41hcfec24a_1004 - - r-praise=1.0.0=r41hc72bb7e_1005 - - r-prettyunits=1.1.1=r41hc72bb7e_1 - - r-processx=3.5.2=r41hcfec24a_0 - - r-progress=1.2.2=r41hc72bb7e_2 - - r-proj4=1.0_11=r41h0ae476a_0 - - r-promises=1.2.0.1=r41h03ef668_0 - - r-proto=1.0.0=r41ha770c72_2003 - - r-ps=1.6.0=r41hcfec24a_0 - - r-purrr=0.3.4=r41hcfec24a_1 - - r-r.cache=0.15.0=r41hc72bb7e_0 - - r-r.methodss3=1.8.2=r41hc72bb7e_0 - - r-r.oo=1.25.0=r41hc72bb7e_0 - - r-r.utils=2.11.0=r41hc72bb7e_0 - - r-r6=2.5.1=r41hc72bb7e_0 - - r-ragg=1.2.2=r41hc1f6985_0 - - r-rann=2.6.1=r41h03ef668_2 - - r-rappdirs=0.3.3=r41hcfec24a_0 - - r-rcolorbrewer=1.1_2=r41h785f33e_1003 - - r-rcpp=1.0.8=r41h03ef668_0 - - r-rcpparmadillo=0.10.8.1.0=r41h306847c_0 - - r-rcurl=1.98_1.6=r41hcfec24a_0 - - r-readr=2.1.2=r41h03ef668_0 - - r-readxl=1.3.1=r41h2713e49_4 - - r-rematch=1.0.1=r41hc72bb7e_1004 - - r-rematch2=2.1.2=r41hc72bb7e_1 - - r-remotes=2.4.2=r41hc72bb7e_0 - - r-repr=1.1.4=r41h785f33e_0 - - r-reprex=2.0.1=r41hc72bb7e_0 - - 
r-reshape2=1.4.4=r41h03ef668_1 - - r-rex=1.2.1=r41hc72bb7e_0 - - r-rjson=0.2.21=r41h03ef668_0 - - r-rlang=0.4.12=r41hcfec24a_0 - - r-rmarkdown=2.12=r41hc72bb7e_0 - - r-robustbase=0.93_9=r41h52d45c5_0 - - r-roxygen2=7.2.0=r41h7525677_0 - - r-rprojroot=2.0.2=r41hc72bb7e_0 - - r-rsqlite=2.2.8=r41h03ef668_0 - - r-rstudioapi=0.13=r41hc72bb7e_0 - - r-rttf2pt1=1.3.10=r41hcfec24a_0 - - r-runit=0.4.32=r41hc72bb7e_1002 - - r-rvest=1.0.2=r41hc72bb7e_0 - - r-sass=0.4.0=r41h03ef668_0 - - r-scales=1.1.1=r41hc72bb7e_0 - - r-selectr=0.4_2=r41hc72bb7e_1 - - r-shape=1.4.6=r41ha770c72_0 - - r-shiny=1.7.1=r41h785f33e_0 - - r-snow=0.4_4=r41hc72bb7e_0 - - r-sourcetools=0.1.7=r41h03ef668_1002 - - r-sqldf=0.4_11=r41hc72bb7e_2 - - r-stringdist=0.9.8=r41hcfec24a_1 - - r-stringi=1.7.6=r41h337692f_1 - - r-stringr=1.4.0=r41hc72bb7e_2 - - r-styler=1.7.0=r41hc72bb7e_0 - - r-survival=3.3_1=r41h06615bd_0 - - r-sys=3.4=r41hcfec24a_0 - - r-systemfonts=1.0.4=r41hef9c87a_0 - - r-testthat=3.1.2=r41h03ef668_0 - - r-textshaping=0.3.6=r41hcb6d10c_0 - - r-tibble=3.1.6=r41hcfec24a_0 - - r-tidyr=1.2.0=r41h03ef668_0 - - r-tidyselect=1.1.1=r41hc72bb7e_0 - - r-tidyverse=1.3.1=r41hc72bb7e_0 - - r-tinytex=0.37=r41hc72bb7e_0 - - r-tzdb=0.2.0=r41h03ef668_0 - - r-utf8=1.2.2=r41hcfec24a_0 - - r-uuid=1.0_3=r41hcfec24a_0 - - r-vctrs=0.3.8=r41hcfec24a_1 - - r-vipor=0.4.5=r41hc72bb7e_1003 - - r-viridislite=0.4.0=r41hc72bb7e_0 - - r-vroom=1.5.7=r41h03ef668_0 - - r-waldo=0.3.1=r41hc72bb7e_0 - - r-waveslim=1.8.2=r41h859d828_2 - - r-withr=2.5.0=r41hc72bb7e_0 - - r-xfun=0.30=r41h7525677_0 - - r-xml=3.99_0.9=r41h06615bd_0 - - r-xml2=1.3.3=r41h03ef668_0 - - r-xmlparsedata=1.0.5=r41hc72bb7e_0 - - r-xtable=1.8_4=r41hc72bb7e_3 - - r-yaml=2.3.5=r41h06615bd_0 - - readline=8.1=h46c0cb4_0 - - sed=4.8=he412f7d_0 - - sqlite=3.37.0=h9cd32fc_0 - - sysroot_linux-64=2.12=he073ed8_15 - - tk=8.6.12=h27826a3_0 - - tktable=2.10=hb7b940f_3 - - xorg-kbproto=1.0.7=h7f98852_1002 - - xorg-libice=1.0.10=h7f98852_0 - - xorg-libsm=1.2.3=hd9c2040_1000 - 
- xorg-libx11=1.7.2=h7f98852_0 - - xorg-libxau=1.0.9=h7f98852_0 - - xorg-libxdmcp=1.1.3=h7f98852_0 - - xorg-libxext=1.3.4=h7f98852_1 - - xorg-libxrender=0.9.10=h7f98852_1003 - - xorg-libxt=1.2.1=h7f98852_2 - - xorg-renderproto=0.11.1=h7f98852_1002 - - xorg-xextproto=7.3.0=h7f98852_1002 - - xorg-xproto=7.0.31=h7f98852_1007 - - xz=5.2.5=h516909a_1 - - zlib=1.2.11=h36c2ea0_1013 - - zstd=1.5.2=ha95c52a_0 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml deleted file mode 100644 index bc39c7c0..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/VV.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: VV -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python=3.8 - - pandas=1.2 - - samtools - - isatools - - pip - - pip: - - git+https://github.com/J-81/JDO_V-V.git@0.6.0-beta.3 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml deleted file mode 100644 index be88ce26..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/download_tools.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: download_tools -channels: - - anaconda - - conda-forge - - bioconda - - defaults -dependencies: - - tqdm=4.59 - - requests=2.25 - - pandas=1.2 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml deleted file mode 100644 index 03494e70..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/dp_tools.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: dp_tools -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python=3.10 # required for in-house codebase compatibility - - pandas - - samtools - - isatools - - multiqc - - schema - - pytest - - pip - - pip: - - git+https://github.com/J-81/dp_tools.git@1.0.7rc2 \ 
No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml deleted file mode 100644 index 2adf7e89..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ercc_analysis.yml +++ /dev/null @@ -1,22 +0,0 @@ -# source: https://raw.githubusercontent.com/J-81/gl_dockerfiles/6bb3de66396b98e1533119e203a26ed3a8abcdc8/assets/conda.yaml -channels: - - conda-forge - - bioconda - - r - - defaults -dependencies: - - python==3.10 - - jupyter - - seaborn - - plotly - - matplotlib - - pandas - - scikit-learn - - statsmodels - - papermill - - r-base==4.1.2 - - bioconductor-deseq2=1.34.0 - - r-tidyverse==1.3.1 - - r-plotly - - r-knitr - - r-irkernel diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml deleted file mode 100644 index 9362200d..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/fastqc.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - defaults -dependencies: - - fastqc diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml deleted file mode 100644 index b7bfe567..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/genelab_utils.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: genelab_utils -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python=3.8 - - pandas - - tabulate - - openpyxl diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml deleted file mode 100644 index 26c52ade..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/isatools.yml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults 
-dependencies: - - isatools diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml deleted file mode 100644 index 3d965dbe..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/main.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: main -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - nextflow=20.07 - - python=3.8 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml deleted file mode 100644 index 944c1bbf..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/multiqc.yml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - multiqc diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml deleted file mode 100644 index e46f23db..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/python.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: python -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - python=3.8 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml deleted file mode 100644 index 6add00db..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/r_deseq2.yml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - R - - r-biocmanager - - r-rnetcdf diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml deleted file mode 100644 index 4caada28..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rnaseq_v1.0_modify.yml +++ /dev/null @@ -1,322 
+0,0 @@ -name: rnaseq_v1.0_modify -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=1_gnu - - _r-mutex=1.0.1=anacondar_1 - - alsa-lib=1.2.3=h516909a_0 - - bcftools=1.10.2=h4f4756c_3 - - binutils_impl_linux-64=2.35.1=h193b22a_1 - - binutils_linux-64=2.35=hc3fd857_29 - - bioconductor-biobase=2.50.0=r40h037d062_0 - - bioconductor-biocgenerics=0.36.0=r40_0 - - bioconductor-biocparallel=1.24.0=r40h5f743cb_0 - - bioconductor-biostrings=2.58.0=r40h037d062_0 - - bioconductor-delayedarray=0.16.0=r40h037d062_0 - - bioconductor-ebseq=1.30.0=r40_0 - - bioconductor-genomeinfodb=1.26.0=r40_0 - - bioconductor-genomeinfodbdata=1.2.4=r40_0 - - bioconductor-genomicalignments=1.26.0=r40h037d062_0 - - bioconductor-genomicranges=1.42.0=r40h037d062_0 - - bioconductor-iranges=2.24.0=r40h037d062_0 - - bioconductor-matrixgenerics=1.2.0=r40_0 - - bioconductor-noiseq=2.34.0=r40_0 - - bioconductor-rhtslib=1.22.0=r40h037d062_0 - - bioconductor-rsamtools=2.6.0=r40h5f743cb_0 - - bioconductor-rtracklayer=1.50.0=r40h9bb0e53_1 - - bioconductor-s4vectors=0.28.0=r40h037d062_0 - - bioconductor-summarizedexperiment=1.20.0=r40_0 - - bioconductor-xvector=0.30.0=r40h037d062_0 - - bioconductor-zlibbioc=1.36.0=r40h037d062_0 - - brotlipy=0.7.0=py38h8df0ef7_1001 - - bwidget=1.9.14=ha770c72_0 - - bx-python=0.8.9=py38hb90e610_2 - - bzip2=1.0.8=h7f98852_4 - - c-ares=1.17.1=h36c2ea0_0 - - ca-certificates=2021.1.19=h06a4308_0 - - cairo=1.16.0=h9f066cc_1006 - - certifi=2020.12.5=py38h578d9bd_1 - - cffi=1.14.4=py38ha65f79e_1 - - chardet=4.0.0=py38h578d9bd_1 - - click=7.1.2=pyh9f0ad1d_0 - - coloredlogs=15.0=py38h578d9bd_0 - - colormath=3.0.0=py_2 - - cryptography=3.3.1=py38h2b97feb_1 - - curl=7.71.1=he644dc0_8 - - cutadapt=3.2=py38h0213d0e_0 - - cycler=0.10.0=py_2 - - decorator=4.4.2=py_0 - - dnaio=0.5.0=py38h0213d0e_0 - - fastqc=0.11.9=0 - - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - - font-ttf-inconsolata=2.001=hab24e00_0 - - 
font-ttf-source-code-pro=2.030=hab24e00_0 - - font-ttf-ubuntu=0.83=hab24e00_0 - - fontconfig=2.13.1=h7e3eb15_1002 - - fonts-conda-ecosystem=1=0 - - fonts-conda-forge=1=0 - - freetype=2.10.4=h7ca028e_0 - - fribidi=1.0.10=h36c2ea0_0 - - future=0.18.2=py38h578d9bd_3 - - gcc_impl_linux-64=9.3.0=h28f5a38_17 - - gcc_linux-64=9.3.0=h7247604_29 - - gettext=0.19.8.1=h0b5b191_1005 - - gfortran_impl_linux-64=9.3.0=h2bb4189_17 - - gfortran_linux-64=9.3.0=ha1c937c_29 - - giflib=5.2.1=h36c2ea0_2 - - graphite2=1.3.13=h58526e2_1001 - - gsl=2.6=he838d99_1 - - gxx_impl_linux-64=9.3.0=h53cdd4c_17 - - gxx_linux-64=9.3.0=h0d07fa4_29 - - harfbuzz=2.7.2=ha5b49bf_1 - - htslib=1.10.2=hd3b49d5_1 - - humanfriendly=9.1=py38h578d9bd_0 - - icu=67.1=he1b5a44_0 - - idna=2.10=pyh9f0ad1d_0 - - importlib-metadata=3.4.0=py38h578d9bd_0 - - isa-l=2.30.0=h36c2ea0_0 - - jinja2=2.11.2=pyh9f0ad1d_0 - - jpeg=9d=h36c2ea0_0 - - kernel-headers_linux-64=2.6.32=h77966d4_13 - - kiwisolver=1.3.1=py38h1fd1430_1 - - krb5=1.17.2=h926e7f8_0 - - lcms2=2.11=hcbb858e_1 - - ld_impl_linux-64=2.35.1=hea4e1c9_1 - - libblas=3.8.0=17_openblas - - libcblas=3.8.0=17_openblas - - libcurl=7.71.1=hcdd3856_8 - - libdeflate=1.6=h516909a_0 - - libedit=3.1.20191231=he28a2e2_2 - - libev=4.33=h516909a_1 - - libffi=3.3=h58526e2_2 - - libgcc-devel_linux-64=9.3.0=hfd08b2a_17 - - libgcc-ng=9.3.0=h5dbcf3e_17 - - libgfortran-ng=9.3.0=he4bcb1c_17 - - libgfortran5=9.3.0=he4bcb1c_17 - - libglib=2.66.4=h164308a_1 - - libgomp=9.3.0=h5dbcf3e_17 - - libiconv=1.16=h516909a_0 - - liblapack=3.8.0=17_openblas - - libnghttp2=1.41.0=h8cfc5f6_2 - - libopenblas=0.3.10=pthreads_h4812303_5 - - libpng=1.6.37=h21135ba_2 - - libssh2=1.9.0=hab1572f_5 - - libstdcxx-devel_linux-64=9.3.0=h4084dd6_17 - - libstdcxx-ng=9.3.0=h2ae2ef3_17 - - libtiff=4.2.0=hdc55705_0 - - libuuid=2.32.1=h7f98852_1000 - - libwebp-base=1.1.0=h36c2ea0_3 - - libxcb=1.13=h7f98852_1003 - - libxml2=2.9.10=h68273f3_2 - - lz4-c=1.9.3=h9c3ff4c_0 - - lzo=2.10=h516909a_1000 - - lzstring=1.0.4=py_1001 
- - make=4.3=hd18ef5c_1 - - markdown=3.3.3=pyh9f0ad1d_0 - - markupsafe=1.1.1=py38h497a2fe_3 - - matplotlib-base=3.3.3=py38h5c7f4ab_0 - - multiqc=1.9=py_1 - - mysql-connector-c=6.1.11=h6eb9d5d_1007 - - ncurses=6.2=h58526e2_4 - - networkx=2.5=py_0 - - numpy=1.19.5=py38h18fd61f_1 - - olefile=0.46=pyh9f0ad1d_1 - - openjdk=11.0.8=hacce0ff_0 - - openssl=1.1.1i=h7f98852_0 - - pandas=1.2.0=py38h51da96c_0 - - pango=1.42.4=h69149e4_5 - - pcre=8.44=he1b5a44_0 - - pcre2=10.36=h032f7d1_0 - - perl=5.26.2=h36c2ea0_1008 - - perl-app-cpanminus=1.7044=pl526_1 - - perl-carp=1.38=pl526_3 - - perl-constant=1.33=pl526_1 - - perl-cpan-meta=2.150010=pl526_0 - - perl-cpan-meta-requirements=2.140=pl526_0 - - perl-cpan-meta-yaml=0.018=pl526_0 - - perl-data-dumper=2.173=pl526_0 - - perl-encode=2.88=pl526_1 - - perl-exporter=5.72=pl526_1 - - perl-extutils-cbuilder=0.280230=pl526_1 - - perl-extutils-makemaker=7.36=pl526_1 - - perl-extutils-manifest=1.72=pl526_0 - - perl-extutils-parsexs=3.35=pl526_0 - - perl-file-path=2.16=pl526_0 - - perl-file-temp=0.2304=pl526_2 - - perl-getopt-long=2.50=pl526_1 - - perl-ipc-cmd=1.02=pl526_0 - - perl-json-pp=4.04=pl526_0 - - perl-locale-maketext-simple=0.21=pl526_2 - - perl-module-build=0.4224=pl526_3 - - perl-module-corelist=5.20190524=pl526_0 - - perl-module-load=0.32=pl526_1 - - perl-module-load-conditional=0.68=pl526_2 - - perl-module-metadata=1.000036=pl526_0 - - perl-params-check=0.38=pl526_1 - - perl-parent=0.236=pl526_1 - - perl-perl-ostype=1.010=pl526_1 - - perl-scalar-list-utils=1.52=pl526h516909a_0 - - perl-text-abbrev=1.02=pl526_0 - - perl-text-parsewords=3.30=pl526_0 - - perl-version=0.9924=pl526_0 - - pigz=2.3.4=hed695b0_1 - - pillow=8.1.0=py38h357d4e7_1 - - pip=20.3.3=pyhd8ed1ab_0 - - pixman=0.40.0=h36c2ea0_0 - - pthread-stubs=0.4=h36c2ea0_1001 - - pybigwig=0.3.17=py38h55f8d50_2 - - pybktree=1.1=pyh9f0ad1d_0 - - pycparser=2.20=pyh9f0ad1d_2 - - pyopenssl=20.0.1=pyhd8ed1ab_0 - - pyparsing=2.4.7=pyh9f0ad1d_0 - - pysam=0.16.0.1=py38hbdc2ae9_1 - - 
pysocks=1.7.1=py38h578d9bd_3 - - python=3.8.6=hffdb5ce_4_cpython - - python-dateutil=2.8.1=py_0 - - python-lzo=1.12=py38h86e1cee_1003 - - python_abi=3.8=1_cp38 - - pytz=2020.5=pyhd8ed1ab_0 - - pyyaml=5.3.1=py38h497a2fe_2 - - qualimap=2.2.2d=1 - - r-assertthat=0.2.1=r40h6115d3f_2 - - r-backports=1.2.1=r40hcfec24a_0 - - r-base=4.0.3=ha43b4e8_3 - - r-bh=1.75.0_0=r40hc72bb7e_0 - - r-bibtex=0.4.2.3=r40hcdcec82_0 - - r-bitops=1.0_6=r40hcdcec82_1004 - - r-blockmodeling=1.0.0=r40h580db52_1 - - r-brio=1.1.0=r40h9e2df91_1 - - r-callr=3.5.1=r40h142f84f_0 - - r-catools=1.18.1=r40h03ef668_0 - - r-cli=2.2.0=r40hc72bb7e_0 - - r-codetools=0.2_18=r40hc72bb7e_0 - - r-crayon=1.3.4=r40h6115d3f_1003 - - r-desc=1.2.0=r40h6115d3f_1003 - - r-diffobj=0.3.3=r40hcfec24a_0 - - r-digest=0.6.27=r40h1b71b39_0 - - r-doparallel=1.0.16=r40h142f84f_0 - - r-dorng=1.8.2=r40h6115d3f_1 - - r-ellipsis=0.3.1=r40hcdcec82_0 - - r-evaluate=0.14=r40h6115d3f_2 - - r-fansi=0.4.1=r40hcdcec82_1 - - r-foreach=1.5.1=r40h142f84f_0 - - r-formatr=1.7=r40h6115d3f_2 - - r-futile.logger=1.4.3=r40h6115d3f_1003 - - r-futile.options=1.0.1=r40h6115d3f_1002 - - r-getopt=1.20.3=r40_2 - - r-glue=1.4.2=r40hcdcec82_0 - - r-gplots=3.1.1=r40hc72bb7e_0 - - r-gtools=3.8.2=r40hcdcec82_1 - - r-iterators=1.0.13=r40h142f84f_0 - - r-jsonlite=1.7.2=r40hcfec24a_0 - - r-kernsmooth=2.23_18=r40h742201e_0 - - r-lambda.r=1.2.4=r40h6115d3f_1 - - r-lattice=0.20_41=r40hcfec24a_2 - - r-lifecycle=0.2.0=r40h6115d3f_1 - - r-magrittr=2.0.1=r40h9e2df91_1 - - r-matrix=1.3_2=r40he454529_0 - - r-matrixstats=0.57.0=r40hcfec24a_0 - - r-optparse=1.6.6=r40h6115d3f_1 - - r-pillar=1.4.7=r40hc72bb7e_0 - - r-pkgbuild=1.2.0=r40hc72bb7e_0 - - r-pkgconfig=2.0.3=r40h6115d3f_1 - - r-pkgload=1.1.0=r40h0357c0b_0 - - r-pkgmaker=0.32.2=r40h142f84f_0 - - r-praise=1.0.0=r40h6115d3f_1004 - - r-prettyunits=1.1.1=r40h6115d3f_1 - - r-processx=3.4.5=r40hcfec24a_0 - - r-ps=1.5.0=r40hcfec24a_0 - - r-r6=2.5.0=r40hc72bb7e_0 - - r-rcurl=1.98_1.2=r40hcdcec82_1 - - 
r-registry=0.5_1=r40h6115d3f_2 - - r-rematch2=2.1.2=r40h6115d3f_1 - - r-rlang=0.4.10=r40hcfec24a_0 - - r-rngtools=1.5=r40h6115d3f_1 - - r-rprojroot=2.0.2=r40hc72bb7e_0 - - r-rstudioapi=0.13=r40hc72bb7e_0 - - r-snow=0.4_3=r40h6115d3f_1002 - - r-stringi=1.5.3=r40h604b29c_0 - - r-stringr=1.4.0=r40h6115d3f_2 - - r-testthat=3.0.1=r40h03ef668_0 - - r-tibble=3.0.4=r40h0eb13af_0 - - r-utf8=1.1.4=r40hcdcec82_1003 - - r-vctrs=0.3.6=r40hcfec24a_0 - - r-waldo=0.2.3=r40hc72bb7e_0 - - r-withr=2.3.0=r40h6115d3f_0 - - r-xml=3.99_0.5=r40hcfec24a_0 - - r-xtable=1.8_4=r40h6115d3f_3 - - r-zeallot=0.1.0=r40h6115d3f_1002 - - readline=8.0=he28a2e2_2 - - regex=2020.11.13=py38h497a2fe_1 - - requests=2.25.1=pyhd3deb0d_0 - - rsem=1.3.3=pl526hfbaaabd_1 - - rseqc=4.0.0=py38h0213d0e_0 - - samtools=1.10=h2e538c0_3 - - scipy=1.6.0=py38hb2138dd_0 - - sed=4.8=he412f7d_0 - - seqtk=1.3=hed695b0_2 - - setuptools=49.6.0=py38h578d9bd_3 - - simplejson=3.17.2=py38h497a2fe_2 - - six=1.15.0=pyh9f0ad1d_0 - - spectra=0.0.11=py_1 - - sqlite=3.34.0=h74cdb3f_0 - - star=2.7.7a=0 - - sysroot_linux-64=2.12=h77966d4_13 - - tk=8.6.10=h21135ba_1 - - tktable=2.10=hb7b940f_3 - - tornado=6.1=py38h497a2fe_1 - - trim-galore=0.6.6=0 - - ucsc-bigwigsummary=377=h446ed27_1 - - umi_tools=1.1.1=py38h0213d0e_1 - - urllib3=1.26.2=pyhd8ed1ab_0 - - wheel=0.36.2=pyhd3deb0d_0 - - xopen=1.0.1=py38h578d9bd_1 - - xorg-fixesproto=5.0=h14c3975_1002 - - xorg-inputproto=2.3.2=h7f98852_1002 - - xorg-kbproto=1.0.7=h7f98852_1002 - - xorg-libice=1.0.10=h516909a_0 - - xorg-libsm=1.2.3=h84519dc_1000 - - xorg-libx11=1.6.12=h516909a_0 - - xorg-libxau=1.0.9=h7f98852_0 - - xorg-libxdmcp=1.1.3=h7f98852_0 - - xorg-libxext=1.3.4=h516909a_0 - - xorg-libxfixes=5.0.3=h516909a_1004 - - xorg-libxi=1.7.10=h516909a_0 - - xorg-libxrender=0.9.10=h516909a_1002 - - xorg-libxt=1.2.0=h516909a_0 - - xorg-libxtst=1.2.3=h516909a_1002 - - xorg-recordproto=1.14.2=h516909a_1002 - - xorg-renderproto=0.11.1=h14c3975_1002 - - xorg-xextproto=7.3.0=h7f98852_1002 - - 
xorg-xproto=7.0.31=h7f98852_1007 - - xz=5.2.5=h516909a_1 - - yaml=0.2.5=h516909a_0 - - zipp=3.4.0=py_0 - - zlib=1.2.11=h516909a_1010 - - zstd=1.4.8=ha95c52a_1 - - pip: - - appdirs==1.4.4 - - attrs==20.3.0 - - beautifulsoup4==4.9.3 - - biopython==1.78 - - cached-property==1.5.2 - - deepdiff==5.2.2 - - et-xmlfile==1.0.1 - - fs==2.4.12 - - isatools==0.11.0 - - iso8601==0.1.13 - - jdcal==1.4.1 - - jsonschema==3.2.0 - - lxml==4.6.2 - - mzml2isa==1.0.3 - - openpyxl==2.6.4 - - ordered-set==4.0.2 - - progressbar2==3.53.1 - - pronto==0.12.2 - - pyrsistent==0.17.3 - - python-utils==2.5.5 - - soupsieve==2.1 -prefix: /home/joribello/anaconda3/envs/rnaseq_v1.0_modify diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml deleted file mode 100644 index cd5c013a..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/rsem.yml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - rsem - - R diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml deleted file mode 100644 index 0509dd42..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - defaults -dependencies: - - samtools diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml deleted file mode 100644 index 3c4afe5e..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/samtools_rseqc.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: samtools_rseqc -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - python - - rseqc - - samtools >= 1.13 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml 
b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml deleted file mode 100644 index 7f0da546..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/star.yml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - defaults -dependencies: - - star=2.7.8a # pinned due to a bug in 2.7.9a in outputting transcriptTome.bam, should update once the bug is fixed in a newer release - - python=3.8 diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml deleted file mode 100644 index dd50a690..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/trim_galore.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - defaults -dependencies: - - trim-galore diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml deleted file mode 100644 index 6ae38a48..00000000 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/envs/ucsc_gtf_Pred_BED.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: ucsc_gtf_Pred_BED -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - python - - ucsc-gtftogenepred - - ucsc-genepredtobed From cbf605576f5ef7297c701ca0627014fe47a5fdfd Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 11 Jul 2023 23:45:48 +0000 Subject: [PATCH 46/58] feat: rename and reformat dge to DGE_BY_DESEQ2 --- .../modules/{dge/dge.nf => DGE_BY_DESEQ2/main.nf} | 0 .../NF_RCP-F/workflow_code/tests/config/nftest_modules.yml | 6 +++--- .../dge.nf.test => DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test} | 6 +++--- .../DGE_BY_DESEQ2.nf.test.snap} | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/{dge/dge.nf => DGE_BY_DESEQ2/main.nf} (100%) rename RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/{dge/dge.nf.test 
=> DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test} (95%) rename RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/{dge/dge.nf.test.snap => DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap} (100%) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf similarity index 100% rename from RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/dge.nf rename to RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml index 25b92aa4..f2722c8e 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/config/nftest_modules.yml @@ -1,3 +1,3 @@ -dge: - - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge/** - - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/** \ No newline at end of file +DGE_BY_DESEQ2: + - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/** + - RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/** \ No newline at end of file diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test similarity index 95% rename from RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test rename to RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test index 0954b609..443c7188 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test +++ 
b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test @@ -1,11 +1,11 @@ nextflow_process { name "Test Process DGE_BY_DESEQ2" - script "modules/dge/dge.nf" + script "modules/DGE_BY_DESEQ2/main.nf" process "DGE_BY_DESEQ2" test("GLDS-194") { - tag 'dge' + tag 'DGE_BY_DESEQ2' when { params { @@ -38,7 +38,7 @@ nextflow_process { } test("GLDS-321:55_.ISSUE") { - tag 'dge' + tag 'DGE_BY_DESEQ2' when { params { diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap similarity index 100% rename from RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge/dge.nf.test.snap rename to RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap From 6a0d105ccb67b6626e6640ac80b8a4597a8b5581 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 17:59:51 +0000 Subject: [PATCH 47/58] test --- .../NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf index 1bc7017d..25e5a59a 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf @@ -64,6 +64,6 @@ process DGE_BY_DESEQ2 { --extended_table_output_prefix "dge_output_ercc/"\\ --extended_table_output_suffix "_ERCCnorm.csv" fi - + # bump """ } From 70488e6ab4180e4c8450cf32bc386ab090d9393d Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 19:00:51 +0000 Subject: [PATCH 48/58] fix: bump to unreleased new version for act support Enables usage of act to run actions via cli --- 
.github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6a1122f..f1baeae3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,7 +44,7 @@ jobs: uses: actions/checkout@v3 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@be72b1dc0f932cea69aef64479ac863a86516c0c with: version: "${{ matrix.NXF_VER }}" From 62fe7683d9abf572df9be57900d07f305df97bda Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 19:44:58 +0000 Subject: [PATCH 49/58] feat: update spell ignore --- .codespellignore | 5 +++++ .github/workflows/check_typos.yml | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 .codespellignore diff --git a/.codespellignore b/.codespellignore new file mode 100644 index 00000000..24717958 --- /dev/null +++ b/.codespellignore @@ -0,0 +1,5 @@ +RNAseq +OTU +otu +groupD +groupd \ No newline at end of file diff --git a/.github/workflows/check_typos.yml b/.github/workflows/check_typos.yml index 600f3d3d..1bea1476 100644 --- a/.github/workflows/check_typos.yml +++ b/.github/workflows/check_typos.yml @@ -15,8 +15,9 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v3 - - uses: codespell-project/actions-codespell@master + - uses: codespell-project/codespell-problem-matcher@v1 + - uses: codespell-project/actions-codespell@v2.0 with: check_filenames: true - skip: "*.yml,*.cff,*.js,*.lock" - ignore_words_list: RNAseq + skip: "*.yml,*.cff,*.js,*.lock,*.pdf,*.ipynb" + ignore_words_file: ".codespellignore" From a83006ba91b1209e1857fefd96e9ff950ebb0cdc Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 19:45:15 +0000 Subject: [PATCH 50/58] feat: rework to only check links on create due to slower speed --- .github/workflows/markdown-link-check.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/markdown-link-check.yml 
b/.github/workflows/markdown-link-check.yml index 04a60dc9..ab39ff62 100644 --- a/.github/workflows/markdown-link-check.yml +++ b/.github/workflows/markdown-link-check.yml @@ -1,6 +1,9 @@ name: Check Markdown links -on: push +on: + create: # runs when a reference (branch or tag) is created + tags: + - v* # wildcard can be used to match tag patterns, this example matches tags like v1.0, v2.3.4, etc. jobs: markdown-link-check: From ee188ebcf04f940f0225fd360f88ce5c224ee4fe Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 20:06:29 +0000 Subject: [PATCH 51/58] docs: update CHANGELOG --- RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md b/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md index eef3c524..a89e3ab6 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md @@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- Github action support for CI testing (a83006ba91b1209e1857fefd96e9ff950ebb0cdc) + +### Fixed +- Workflow usage files will all follow output directory set by workflow user (3e69f06432f62b7924d2e043ef4768c5d09bf614) +### Changed +- TrimGalore! 
will now use autodetect for adaptor type (3b7e0bab4017e90481359c48f9cf7c8837ed54d2) +- V&V migrated from dp_tools version 1.1.8 to 1.3.2 including: + - Migration of V&V protocol code to this codebase instead of dp_tools (b3684a4c1db5df06eab20916ef7e130c410c147c) + - Fix for sample wise checks reusing same samples (dca4fdad7518ac9ead3ee2e4c5f57ac0fe25c715) + ## [1.0.3](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP-F_1.0.3/RNAseq/Workflow_Documentation/NF_RCP-F) - 2023-01-25 ### Added From 16cbbebcf0333be3d7c77932ec027daacb3f9030 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 23:55:56 +0000 Subject: [PATCH 52/58] feat: add minimal size full pipeline --- .../workflows/ci_minimal_full_pipeline.yml | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 .github/workflows/ci_minimal_full_pipeline.yml diff --git a/.github/workflows/ci_minimal_full_pipeline.yml b/.github/workflows/ci_minimal_full_pipeline.yml new file mode 100644 index 00000000..4d1c9446 --- /dev/null +++ b/.github/workflows/ci_minimal_full_pipeline.yml @@ -0,0 +1,77 @@ +name: CI Minimal Dataset Full Pipeline +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + create: # runs when a reference (branch or tag) is created + +env: + NXF_ANSI_LOG: false + +jobs: + Minimal_Dataset_Full_Pipeline: + name: ${{ matrix.tags }} ${{ matrix.profile }} ${{ matrix.NXF_VER }} + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "22.10.1" + - "latest-everything" + profile: + - "docker" + - "singularity" + + steps: + - name: Check out pipeline code + uses: actions/checkout@v3 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@be72b1dc0f932cea69aef64479ac863a86516c0c + with: + version: "${{ matrix.NXF_VER }}" + + - name: Set up Singularity + if: matrix.profile == 'singularity' + uses: eWaterCycle/setup-singularity@v5 + with: + singularity-version: 3.7.1 + + - name: Install 
nf-test + id: nf-test + run: | + curl -fsSL https://code.askimed.com/install/nf-test | bash + chmod u+x nf-test + echo "bin_path=$(pwd)/nf-test" >> $GITHUB_OUTPUT + + + - name: Hash Github Workspace + id: hash_workspace + run: | + echo "digest=$(echo RNA_3.10.1_${{ github.workspace }} | md5sum | cut -c 1-25)" >> $GITHUB_OUTPUT + + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets + key: ${{ steps.hash_workspace.outputs.digest }} + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: J-81/test-datasets-extended + ref: NF_RCP-F + path: RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test-datasets + + # Test the module + - name: Run nf-test on minimal core test datasets + run: | + cd ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/ + ${{ steps.nf-test.outputs.bin_path}} test \ + --profile=${{ matrix.profile }} \ + --tag core \ + --tap=test.tap + + - uses: pcolby/tap-summary@v1 + with: + path: >- + ${GITHUB_WORKSPACE}/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/test.tap \ No newline at end of file From ed053bd08bdbcd83fb636f14046c5bc7928d3344 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Tue, 8 Aug 2023 23:59:09 +0000 Subject: [PATCH 53/58] fix[ci]: update nextflow setup version --- .github/workflows/ci_minimal_full_pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_minimal_full_pipeline.yml b/.github/workflows/ci_minimal_full_pipeline.yml index 4d1c9446..c43bb094 100644 --- a/.github/workflows/ci_minimal_full_pipeline.yml +++ b/.github/workflows/ci_minimal_full_pipeline.yml @@ -24,7 +24,7 @@ jobs: uses: actions/checkout@v3 - name: Install Nextflow - uses: nf-core/setup-nextflow@be72b1dc0f932cea69aef64479ac863a86516c0c + uses: nf-core/setup-nextflow@v1.3.0 with: version: "${{ matrix.NXF_VER 
}}" From 4dd300cd5a33b5375b011b325030ce56f2489720 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 9 Aug 2023 00:05:55 +0000 Subject: [PATCH 54/58] fix[ci]: Test location --- .github/workflows/ci_minimal_full_pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_minimal_full_pipeline.yml b/.github/workflows/ci_minimal_full_pipeline.yml index c43bb094..0aca9c8f 100644 --- a/.github/workflows/ci_minimal_full_pipeline.yml +++ b/.github/workflows/ci_minimal_full_pipeline.yml @@ -69,7 +69,8 @@ jobs: ${{ steps.nf-test.outputs.bin_path}} test \ --profile=${{ matrix.profile }} \ --tag core \ - --tap=test.tap + --tap=test.tap \ + tests/*.test - uses: pcolby/tap-summary@v1 with: From 85754aaf32f933a19e00776ff83434f5b5414db6 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 9 Aug 2023 02:13:25 +0000 Subject: [PATCH 55/58] ci: trigger on any new branch or tag --- .github/workflows/markdown-link-check.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/markdown-link-check.yml b/.github/workflows/markdown-link-check.yml index ab39ff62..774d6745 100644 --- a/.github/workflows/markdown-link-check.yml +++ b/.github/workflows/markdown-link-check.yml @@ -2,9 +2,7 @@ name: Check Markdown links on: create: # runs when a reference (branch or tag) is created - tags: - - v* # wildcard can be used to match tag patterns, this example matches tags like v1.0, v2.3.4, etc. 
- + jobs: markdown-link-check: runs-on: ubuntu-latest From 43f5b5f1733c2ffca97b0924e2b50164e0a6c70b Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 9 Aug 2023 02:15:02 +0000 Subject: [PATCH 56/58] ci: trigger on all pushes --- .github/workflows/ci_minimal_full_pipeline.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci_minimal_full_pipeline.yml b/.github/workflows/ci_minimal_full_pipeline.yml index 0aca9c8f..d0b4c44b 100644 --- a/.github/workflows/ci_minimal_full_pipeline.yml +++ b/.github/workflows/ci_minimal_full_pipeline.yml @@ -1,7 +1,6 @@ name: CI Minimal Dataset Full Pipeline # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - create: # runs when a reference (branch or tag) is created +on: push env: NXF_ANSI_LOG: false From 43f1147208bf81185f06681e74c50b6530ae3adf Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 9 Aug 2023 02:16:06 +0000 Subject: [PATCH 57/58] ci: trigger on all pushes --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f1baeae3..18c89177 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: CI +name: CI Updated Modules Testing # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors on: push: From a8c65d3d8f94c5e7b965cb0f332d82a07cbd0b50 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 9 Aug 2023 02:16:45 +0000 Subject: [PATCH 58/58] fix: importing DGE_BY_DESEQ2 --- RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf index 916f3d88..477b1929 100644 --- 
a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf @@ -22,7 +22,7 @@ include { BUILD_STAR; CONCAT_ERCC; QUANTIFY_STAR_GENES; QUANTIFY_RSEM_GENES } from './modules/genome.nf' -include { DGE_BY_DESEQ2 } from './modules/dge.nf' +include { DGE_BY_DESEQ2 } from './modules/DGE_BY_DESEQ2' include { VV_RAW_READS; VV_TRIMMED_READS; VV_STAR_ALIGNMENTS;