diff --git a/.github/skip_nf_test.json b/.github/skip_nf_test.json index 58ef7e41..adf0afbe 100644 --- a/.github/skip_nf_test.json +++ b/.github/skip_nf_test.json @@ -1,5 +1,9 @@ { "conda": [ + "modules/msk/fingerprint/vcfparser", + "modules/msk/fingerprint/contamination", + "modules/msk/fingerprint/combine", + "modules/msk/fingerprint/correlation", "modules/msk/calculatenoise", "modules/msk/ppflagfixer", "modules/msk/facets", @@ -30,6 +34,7 @@ "modules/msk/phylowgs/parsecnvs", "modules/msk/pvmaf/concat", "modules/msk/pvmaf/tagtraceback", + "subworkflows/msk/fingerprint_gbcms", "modules/msk/oncokb/mafannotate", "subworkflows/msk/genome_nexus", "subworkflows/msk/netmhcstabandpan", diff --git a/modules/msk/fingerprint/combine/environment.yml b/modules/msk/fingerprint/combine/environment.yml new file mode 100644 index 00000000..8a3b7591 --- /dev/null +++ b/modules/msk/fingerprint/combine/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: +- conda-forge +- bioconda +dependencies: +- conda-forge::r-argparse=2.2.5 +- conda-forge::r-data.table=1.17.8 +- conda-forge::r-dplyr=1.1.4 +- conda-forge::r-plyr=1.8.9 +- conda-forge::r-tidyverse=2.0.0 diff --git a/modules/msk/fingerprint/combine/main.nf b/modules/msk/fingerprint/combine/main.nf new file mode 100644 index 00000000..25e0cae6 --- /dev/null +++ b/modules/msk/fingerprint/combine/main.nf @@ -0,0 +1,57 @@ +process FINGERPRINT_COMBINE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66': + 'community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66' }" + //' oras://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:d96a65055f79744c': + + + input: + tuple val(meta), path(fp_tsv), val(sample), val(genome_build), val(patient) + path(liftover_loci_mapping) + + output: + tuple val(meta), path("*DPfilter_ALL_FP.txt") , emit: combined_fp_tsv + tuple val("${task.process}"), val('complete_FP_table.R'), val('0.1.0'), emit: versions_combine, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + declare -a fp_tsv_list + declare -a sample_list + declare -a genome_build_list + declare -a patient_list + fp_tsv_list=(${fp_tsv.join(' ')}) + sample_list=(${sample.join(' ')}) + genome_build_list=(${genome_build.join(' ')}) + patient_list=(${patient.join(' ')}) + echo -e "sample_id\tgenome_build\tfp_tsv\tpatient" > input.tsv + for i in \$(seq 0 1 \$((\${#fp_tsv_list[@]}-1)) ) ; do + fp_tsv=\${fp_tsv_list[i]} + sample=\${sample_list[i]} + genome=\${genome_build_list[i]} + patient=\${patient_list[i]} + echo -e "\$sample\t\$genome\t\$fp_tsv\t\$patient" + done >> input.tsv + + complete_FP_table.R \\ + -i input.tsv \\ + -l $liftover_loci_mapping \\ + $args + """ + + stub: + def args = task.ext.args ?: '' + + """ + echo $args + + touch XDPfilter_ALL_FP.txt + """ +} diff --git a/modules/msk/fingerprint/combine/meta.yml b/modules/msk/fingerprint/combine/meta.yml new file mode 100644 index 00000000..726cb86d --- /dev/null +++ b/modules/msk/fingerprint/combine/meta.yml @@ -0,0 +1,71 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fingerprint_combine" +description: | + A module to combine multiple fingerprint TSV files into a single comprehensive + 
table, with optional liftover of loci coordinates. +keywords: + - fingerprint + - qc + - loci + - tsv + - correlation +tools: + - "complete_FP_table.R": + description: "A custom R script to combine fingerprint TSV files" + homepage: "https://github.com/mskcc-omics-workflows/modules/tree/main/modules/msk/fingerprint/combine/meta.yml" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - fp_tsv: + type: file + description: | + Fingerprint TSV files to be combined. + Structure: [ val(sample), val(genome_build), path(fp_tsv) ] + ontologies: [] + - sample: + type: string + description: Sample identifier corresponding to each fingerprint TSV file. + - genome_build: + type: string + description: + Genome build (e.g., hg19, hg38) corresponding to each fingerprint + TSV file. + - - liftover_loci_mapping: + type: file + description: | + A TSV file mapping original loci to liftover loci. + Format: original_chr, original_pos, liftover_chr, liftover_pos + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV +output: + combined_fp_tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*DPfilter_ALL_FP.txt": + type: file + description: Wide table combining all input fingerprint TSV files. 
+ pattern: "*DPfilter_ALL_FP.txt" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + versions_combine: + - - ${task.process}: + type: string + description: The name of the process + - complete_FP_table.R: + type: string + description: The name of the tool + - 0.1.0: + type: string + description: Version of the custom script +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/fingerprint/combine/resources/usr/bin/complete_FP_table.R b/modules/msk/fingerprint/combine/resources/usr/bin/complete_FP_table.R new file mode 100755 index 00000000..f551459e --- /dev/null +++ b/modules/msk/fingerprint/combine/resources/usr/bin/complete_FP_table.R @@ -0,0 +1,110 @@ +#! /usr/bin/env Rscript + +#------------------------------------------------------------------------------- +# Script: complete_FP_table.R +# Author: Erika Gedvilaite +# Date: 2025-09-23 +# Version: 0.1.0 +# +# Description: This script takes in standard fingerprint tables and combines +# them into a single, wide table for downstream plotting and analysis. 
+# +# Annotation: +# - Input table should have four columns: sample_id, genome_build, fp_tsv, patient +# - Genome build should be either "hg19" or "hg38" or "GRCh37" or "GRCh38" +# (case insensitive) +# +#------------------------------------------------------------------------------- + + +rm(list=ls()) + +library(argparse, quietly = T) +library(plyr, quietly = T) +library(dplyr, quietly = T) +library(data.table, quietly = T) +library(tidyverse, quietly = T) + +`%notin%` <- Negate(`%in%`) +`%notlike%` <- Negate(`%like%`) + +parser = ArgumentParser(description = 'Generate FP tables for plotting') +parser$add_argument('-i', '--input_table', required = TRUE, + help = 'Input table with paths to individual fingerprint TSV files, sample ids, and genome build') +parser$add_argument('-o', '--analysis_folder', required = FALSE, default = ".", + help = 'Output folder') +parser$add_argument('-l', '--loci_mapper', required = TRUE, + help = 'Loci mapper file') +parser$add_argument('-d', '--depth_filter', required = FALSE, default = 20, + help = 'Depth filter to apply to individual fingerprint TSV files (default: 20)') +args = parser$parse_args() + + + +message("Reading in Liftover file") + +hg19_hg38_mapper = fread(args$loci_mapper,header = T) +hg19_hg38_mapper$Loci_hg19 = paste(hg19_hg38_mapper$GRCH37_CHROM,hg19_hg38_mapper$GRCH37_POS,sep=":") +hg19_hg38_mapper$Loci_hg38 = paste(hg19_hg38_mapper$GRCH38_CHROM,hg19_hg38_mapper$GRCH38_POS,sep=":") +hg19_hg38_mapper = hg19_hg38_mapper %>% select(Loci_hg19, Loci_hg38) %>% unique() + +message("Loading Samples") +input_table = fread(args$input_table, header = T) %>% arrange(patient, sample_id) +for (i in 1:nrow(input_table)){ + sample = input_table$sample_id[i] + genome_build = input_table$genome_build[i] + print(genome_build) + if (tolower(genome_build) %notin% c("hg19","grch37","hg38","grch38")){ + stop(paste0("Genome build not recognized: ", genome_build, ". 
Must be in the following list: hg19, hg38, grch37, grch38 (case will be ignored).")) + } + file = input_table$fp_tsv[i] + if (!file.exists(file)){ + stop(paste0("File does not exist: ", input_table$fp_tsv[i])) + } + temp_dataset <- fread(file, header = T, sep="\t") + colnames(temp_dataset) = c("Locus", "Count", "Genotype","VAF") + temp_dataset = separate(temp_dataset, Count, into = c(NA,'DP1',NA,'DP2'), remove = F) + temp_dataset$DP2[is.na(temp_dataset$DP2)==T] <- 0 + temp_dataset$DP = as.numeric(temp_dataset$DP1) + as.numeric(temp_dataset$DP2) + temp_dataset = temp_dataset[temp_dataset$DP >= args$depth_filter,] ## keeping loci >= 20 dp by default + temp_dataset$VAF[is.na(temp_dataset$VAF)==T] <- 0 + #temp_dataset$Sample = sample #only loci with DP >= depth filter will have Sample info + temp_dataset$Sample <- rep(sample, nrow(temp_dataset)) + temp_dataset = temp_dataset %>% select("Locus","Genotype","Sample","VAF") + temp_dataset$Locus = str_replace(temp_dataset$Locus,"chr","") + + if (tolower(genome_build) %in% c("hg19","grch37")){ + temp_dataset = merge(hg19_hg38_mapper, temp_dataset, by.x = "Loci_hg19", by.y = "Locus", all.x = T) + temp_dataset$VAF[is.na(temp_dataset$VAF)==T] <- 0 + } else if (tolower(genome_build) %in% c("hg38","grch38")){ + temp_dataset = merge(hg19_hg38_mapper, temp_dataset, by.x = "Loci_hg38", by.y = "Locus", all.x = T) + temp_dataset$VAF[is.na(temp_dataset$VAF)==T] <- 0 + } + + if (!exists("all_gbcm")){ + all_gbcm = temp_dataset + } else { + all_gbcm = rbind(all_gbcm, temp_dataset) + } +} +all_gbcm = all_gbcm[is.na(all_gbcm$Sample)==F,] # filters out loci that don't have Sample info (i.e. 
loci not passing DP filter) +all_gbcm$VAF = round(as.numeric(all_gbcm$VAF), 5) + +wide_all_gbcm = all_gbcm %>% pivot_wider(names_from = Sample, values_from = c(Genotype, VAF)) + +message("Creating final GBCM file") + +all_fp_gbcm_final = merge(hg19_hg38_mapper, wide_all_gbcm,all.x = T) + +if (!dir.exists(args$analysis_folder)) { + dir.create(args$analysis_folder, recursive = TRUE) +} else { + print(paste("Directory already exists:", args$analysis_folder)) +} + +message(paste("Output file: ", args$analysis_folder,"/",args$depth_filter,"DPfilter_ALL_FP.txt", sep="")) + +all_fp_gbcm_final <- apply(all_fp_gbcm_final,2,as.character) +write.table(all_fp_gbcm_final, file = paste(args$analysis_folder,"/",args$depth_filter,"DPfilter_ALL_FP.txt", sep=""), append = F, sep = "\t", row.names = F, quote = F) + +message("FP file completed") diff --git a/modules/msk/fingerprint/combine/tests/loci_mapping.tsv b/modules/msk/fingerprint/combine/tests/loci_mapping.tsv new file mode 100644 index 00000000..0339b805 --- /dev/null +++ b/modules/msk/fingerprint/combine/tests/loci_mapping.tsv @@ -0,0 +1,10 @@ +GRCH37_CHROM GRCH37_POS GRCH38_CHROM GRCH38_POS +MT192765.1 197 MT192765.1 199 +MT192765.1 4788 MT192765.1 4900 +MT192765.1 8236 MT192765.1 8257 +MT192765.1 10506 MT192765.1 10528 +MT192765.1 11037 MT192765.1 11059 +MT192765.1 15009 MT192765.1 15500 +MT192765.1 18807 MT192765.1 18929 +MT192765.1 23813 MT192765.1 24835 +MT192765.1 24103 MT192765.1 25125 diff --git a/modules/msk/fingerprint/combine/tests/main.nf.test b/modules/msk/fingerprint/combine/tests/main.nf.test new file mode 100644 index 00000000..effbbbf2 --- /dev/null +++ b/modules/msk/fingerprint/combine/tests/main.nf.test @@ -0,0 +1,105 @@ +// nf-core modules test fingerprint/combine +nextflow_process { + + name "Test Process FINGERPRINT_COMBINE" + script "../main.nf" + process "FINGERPRINT_COMBINE" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag "fingerprint" + tag "fingerprint/combine" + tag 
"gbcms" + tag "fingerprint/vcfparser" + + test("sarscov2 - bam") { + + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = Channel.of( + [ + [ id:'test', sample:'test', pool:'mypool' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + [ + [ id:'test2', sample:'test2', pool:'mypool' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + ) + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("FINGERPRINT_VCFPARSER"){ + script "../../vcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = FINGERPRINT_VCFPARSER.out.tsv + .map{ meta, tsv -> + println meta + [[id:meta.pool], tsv, meta.id, "hg19","default"] + }.groupTuple(by:[0]) + input[1] = file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id:"testsample"], + [file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true)], + ["testsample"], + ["hg19"], + ["default"] + ] + input[1] = 
file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/fingerprint/combine/tests/main.nf.test.snap b/modules/msk/fingerprint/combine/tests/main.nf.test.snap new file mode 100644 index 00000000..4016b7bd --- /dev/null +++ b/modules/msk/fingerprint/combine/tests/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "testsample" + }, + "XDPfilter_ALL_FP.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FINGERPRINT_COMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ], + "combined_fp_tsv": [ + [ + { + "id": "testsample" + }, + "XDPfilter_ALL_FP.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_combine": [ + [ + "FINGERPRINT_COMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:50:45.066162946", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "mypool" + }, + "0DPfilter_ALL_FP.txt:md5,66113c255cf1f52e27802183764a406d" + ] + ], + "1": [ + [ + "FINGERPRINT_COMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ], + "combined_fp_tsv": [ + [ + { + "id": "mypool" + }, + "0DPfilter_ALL_FP.txt:md5,66113c255cf1f52e27802183764a406d" + ] + ], + "versions_combine": [ + [ + "FINGERPRINT_COMBINE", + "complete_FP_table.R", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:50:39.126837772", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + } +} \ No newline at end of file diff --git a/modules/msk/fingerprint/combine/tests/nextflow.config b/modules/msk/fingerprint/combine/tests/nextflow.config new file mode 100644 index 00000000..7a504791 --- /dev/null +++ b/modules/msk/fingerprint/combine/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'FINGERPRINT_COMBINE' { + 
ext.args = "-d 0" + } +} diff --git a/modules/msk/fingerprint/contamination/environment.yml b/modules/msk/fingerprint/contamination/environment.yml new file mode 100644 index 00000000..21c00633 --- /dev/null +++ b/modules/msk/fingerprint/contamination/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - numpy=2.3.3 + - pandas=2.3.2 diff --git a/modules/msk/fingerprint/contamination/main.nf b/modules/msk/fingerprint/contamination/main.nf new file mode 100644 index 00000000..a0014547 --- /dev/null +++ b/modules/msk/fingerprint/contamination/main.nf @@ -0,0 +1,39 @@ +process FINGERPRINT_CONTAMINATION { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + //'oras://community.wave.seqera.io/library/numpy_pandas:1f8cb70bfdb82865': + 'docker://community.wave.seqera.io/library/numpy_pandas:f27ed83387b3c038': + 'community.wave.seqera.io/library/numpy_pandas:f27ed83387b3c038' }" + + input: + tuple val(meta), path(fp_tumor), path(fp_normal) + + output: + tuple val(meta), path("*.contamination.tsv") , emit: contamination_tsv + tuple val("${task.process}"), val('calculate_contamination.py'), eval('calculate_contamination.py -v | cut -f 2 -d" "'), emit: versions_contamination, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + calculate_contamination.py \\ + -t ${fp_tumor} \\ + -n ${fp_normal ?: fp_tumor} \\ + -o ${prefix}.contamination.tsv \\ + ${args} + + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.contamination.tsv + + """ +} diff --git a/modules/msk/fingerprint/contamination/meta.yml b/modules/msk/fingerprint/contamination/meta.yml new file mode 100644 index 00000000..08acef21 --- /dev/null +++ b/modules/msk/fingerprint/contamination/meta.yml @@ 
-0,0 +1,63 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fingerprint_contamination" +description: "Calculate major and minor contamination from fingerprint tables" +version: "0.1.0" +keywords: + - fingerprint + - contamination + - qc +tools: + - "pandas": + description: "Python Data Analysis Library" + homepage: "https://pandas.pydata.org/" + documentation: "https://pandas.pydata.org/docs/" + identifier: biotools:pandas + - "numpy": + description: "Scientific computing library for Python" + homepage: "https://numpy.org/" + documentation: "https://numpy.org/doc/" + + identifier: biotools:numpy +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fp_tumor: + type: file + description: Fingerprint table file for tumor sample + pattern: "*.fp.tsv" + ontologies: + - edam: "http://edamontology.org/format_3750" # TSV + - edam: http://edamontology.org/format_3475 # TSV + - fp_normal: + type: file + description: Fingerprint table file for normal sample + pattern: "*.fp.tsv" + ontologies: + - edam: "http://edamontology.org/format_3750" + + - edam: http://edamontology.org/format_3475 # TSV +output: + contamination_tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - "*.contamination.tsv": + type: file + description: Contamination results table + pattern: "*.contamination.tsv" + ontologies: + - edam: "http://edamontology.org/format_3750" # TSV + - edam: http://edamontology.org/format_3475 # TSV + versions_contamination: + - - ${task.process}: {} + - calculate_contamination.py: {} + - 'calculate_contamination.py -v | cut -f 2 -d" ': {} +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/fingerprint/contamination/resources/usr/bin/calculate_contamination.py b/modules/msk/fingerprint/contamination/resources/usr/bin/calculate_contamination.py new file mode 100755 index 00000000..23febaff --- /dev/null +++ b/modules/msk/fingerprint/contamination/resources/usr/bin/calculate_contamination.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python + + +""" +Calculates contamination from fingerprint table +""" + +__author__ = "Hanan Salim" +__email__ = "salimh@mskcc.org" +__contributors__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +__version__ = "0.1.0" +__status__ = "Dev" + +import argparse +import pandas as pd +import numpy as np +import os +import sys + +def major_contamination(tumor, depth_filter): + tumor_filtered = get_coverage(tumor, depth_filter) + + homozygous = ['AA','CC','GG','TT','A','C','G','T'] + heterozygous = ~tumor_filtered['Genotype'].isin(homozygous) + + try: + return sum(heterozygous)/tumor_filtered.shape[0] + except Exception as e: + return 0 + +def get_coverage(file, depth_filter): + #print(file['Alleles'].str.split(' ', expand=True)) + file[['A1', 'A2']] = file['Alleles'].str.split(' ', expand=True) + + A1_count = list(file['A1'].str.split(':', expand=True)[1]) + A2_count = list(file['A2'].str.split(':', expand=True)[1]) + A1_int = list(map(int, A1_count)) + A2_int = list(map(int, A2_count)) + + file['coverage'] = list(map(lambda x, y: x + y, A1_int, A2_int)) + + filtered_data = file[file['coverage'] > depth_filter] + + return(filtered_data) + +def 
minor_contamination(normal, tumor, depth_filter): + homozygous_sites = normal.index[normal['MAF'] < .10] + tumor_homozygous = tumor.loc[[i for i in homozygous_sites if i in tumor.index]] + tumor_homozygous_filtered = get_coverage(tumor_homozygous, depth_filter) + + return tumor_homozygous_filtered['MAF'].mean() + +def main(): + parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), description='Calculate major and minor contamination') + + parser.add_argument('-t','--tumor', + required=True, + help='Tumor fingerprint table file') + + parser.add_argument('-n','--normal', + required=True, + help='Normal fingerprint table file') + + parser.add_argument('-o','--output', + required=True, + help='Output file for contamination results') + + parser.add_argument('-d','--depthfilter', + required=False, + default=20, + type=int, + help='Depth filter for coverage (default: 20)' + ) + + parser.add_argument('-v', '--version', # short alias: the module's version eval calls "-v" + action='version', + version='%(prog)s ' + __version__ + ) + + args = parser.parse_args() + + fields = ['Position', 'Alleles', 'Genotype', 'MAF'] + + tumor = pd.read_csv(args.tumor, sep='\t',names=fields,header=0) + tumor = tumor[~tumor['Position'].str.contains('X|Y', na=False)] + tumor = tumor.set_index('Position') + normal = pd.read_csv(args.normal, sep='\t',names=fields,header=0) + normal = normal[~normal['Position'].str.contains('X|Y', na=False)] + normal = normal.set_index('Position') + + major_contam = major_contamination(tumor, depth_filter=args.depthfilter) + minor_contam = minor_contamination(normal, tumor, depth_filter=args.depthfilter) + + with open(args.output,'w') as f: + f.write("Tumor\tNormal\tMajor_Contamination\tMinor_Contamination\n") + f.write("{}\t{}\t{:.4f}\t{:.4f}\n".format( + os.path.basename(args.tumor), + os.path.basename(args.normal), + major_contam, + minor_contam)) + +if __name__== "__main__": + main() diff --git a/modules/msk/fingerprint/contamination/tests/main.nf.test 
b/modules/msk/fingerprint/contamination/tests/main.nf.test new file mode 100644 index 00000000..ed558200 --- /dev/null +++ b/modules/msk/fingerprint/contamination/tests/main.nf.test @@ -0,0 +1,108 @@ +nextflow_process { + + name "Test Process FINGERPRINT_CONTAMINATION" + script "../main.nf" + process "FINGERPRINT_CONTAMINATION" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag "fingerprint" + tag "fingerprint/contamination" + tag "gbcms" + tag "fingerprint/vcfparser" + + test("sarscov2 - bam") { + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("FINGERPRINT_VCFPARSER"){ + script "../../vcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = FINGERPRINT_VCFPARSER.out.tsv.map{ meta, tsv -> [meta,tsv,[]]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + 
file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("FINGERPRINT_VCFPARSER"){ + script "../../vcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + } + + when { + process { + """ + input[0] = FINGERPRINT_VCFPARSER.out.tsv.map{ meta, tsv -> [meta,tsv,[]]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/fingerprint/contamination/tests/main.nf.test.snap b/modules/msk/fingerprint/contamination/tests/main.nf.test.snap new file mode 100644 index 00000000..6016aace --- /dev/null +++ b/modules/msk/fingerprint/contamination/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FINGERPRINT_CONTAMINATION", + "calculate_contamination.py", + "0.1.0" + ] + ], + "contamination_tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_contamination": [ + [ + "FINGERPRINT_CONTAMINATION", + "calculate_contamination.py", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:55:42.453612823", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303" + ] + ], + "1": [ + [ + "FINGERPRINT_CONTAMINATION", + "calculate_contamination.py", + "0.1.0" + ] + ], + "contamination_tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + 
"test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303" + ] + ], + "versions_contamination": [ + [ + "FINGERPRINT_CONTAMINATION", + "calculate_contamination.py", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:55:33.454910171", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + } +} \ No newline at end of file diff --git a/modules/msk/fingerprint/contamination/tests/nextflow.config b/modules/msk/fingerprint/contamination/tests/nextflow.config new file mode 100644 index 00000000..fe62a31a --- /dev/null +++ b/modules/msk/fingerprint/contamination/tests/nextflow.config @@ -0,0 +1,10 @@ +process { + + withName: 'FINGERPRINT_CONTAMINATION' { + ext.args = "-d 0" + } + + withName: 'FINGERPRINT_VCFPARSER' { + ext.args = "-d 0" + } +} diff --git a/modules/msk/fingerprint/correlation/environment.yml b/modules/msk/fingerprint/correlation/environment.yml new file mode 100644 index 00000000..73a37db1 --- /dev/null +++ b/modules/msk/fingerprint/correlation/environment.yml @@ -0,0 +1,18 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-argparse=2.3.1 + - conda-forge::r-data.table=1.17.8 + - conda-forge::r-dplyr=1.1.4 + - conda-forge::r-ggforce=0.5.0 + - conda-forge::r-ggiraph=0.8.12 + - conda-forge::r-gtools=3.9.5 + - conda-forge::r-htmlwidgets=1.6.4 + - conda-forge::r-plotly=4.11.0 + - conda-forge::r-plyr=1.8.9 + - conda-forge::r-reshape2=1.4.5 + - conda-forge::r-scales=1.4.0 + - conda-forge::r-tidyverse=2.0.0 diff --git a/modules/msk/fingerprint/correlation/main.nf b/modules/msk/fingerprint/correlation/main.nf new file mode 100644 index 00000000..50f807e3 --- /dev/null +++ b/modules/msk/fingerprint/correlation/main.nf @@ -0,0 +1,46 @@ +process FINGERPRINT_CORRELATION { + tag {"$prefix"} + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 
'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-ggforce_pruned:5c045bc9fea1dbd5': + 'community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-ggforce_pruned:5c045bc9fea1dbd5' } " + // 'oras://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-ggforce_pruned:8211a2010a4712ea': + + input: + tuple val(meta), path(combined_fp_tsv) + val(filter_term) + + output: + tuple val(meta), path("*.pdf") , emit: heatmap_pdf + tuple val(meta), path("*.html") , emit: heatmap_html + tuple val(meta), path("*_observations.tab") , emit: observations_tab + tuple val(meta), path("*_correlations.tab") , emit: correlations_tab + tuple val("${task.process}"), val('plot_gbcm.R'), val("0.1.0"), topic: versions, emit: versions_correlation + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = meta.id ?: "batch" + def pool_arg = "-p ${prefix}" + filter_args = (filter_term && filter_term != "") ? 
pool_arg + " -f" : pool_arg + """ + plot_gbcm.R \\ + -t ${combined_fp_tsv} \\ + -o ./ \\ + ${filter_args} + """ + + stub: + def args = task.ext.args ?: '' + prefix = meta.id ?: "batch" + """ + touch ${prefix}.pdf + touch ${prefix}.html + touch ${prefix}_observations.tab + touch ${prefix}_correlations.tab + """ +} diff --git a/modules/msk/fingerprint/correlation/meta.yml b/modules/msk/fingerprint/correlation/meta.yml new file mode 100644 index 00000000..ce40abf4 --- /dev/null +++ b/modules/msk/fingerprint/correlation/meta.yml @@ -0,0 +1,72 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fingerprint_correlation" +description: "Generate fingerprint correlation heatmaps and tables from a combined fingerprint table" +keywords: + - fingerprint + - correlation + - qc + - heatmap +tools: + - "plot_gbcm.R": + description: "In-house R script for fingerprint correlation plotting" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: null + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'pool1' ]` + - combined_fp_tsv: + type: file + description: Combined fingerprint TSV from FINGERPRINT_COMBINE + pattern: "*DPfilter_ALL_FP.txt" + - - filter_term: + type: string + description: Optional filter term to create pool-level plots +output: + heatmap_pdf: + - - meta: + type: map + description: | + Groovy Map containing sample information + - "*.pdf": + type: file + description: PDF heatmap of fingerprint correlations + heatmap_html: + - - meta: + type: map + - "*.html": + type: file + description: Interactive HTML heatmap + observations_tab: + - - meta: + type: map + - "*_observations.tab": + type: file + description: Table of loci overlap observations + correlations_tab: + - - meta: + type: map + - "*_correlations.tab": + type: file + description: Table of pairwise correlations + versions_correlation: + - - ${task.process}: + type: string + description: The name of the process + - plot_gbcm.R: + type: string + description: The name of the tool + - 0.1.0: + type: string + description: Version of the custom script +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/fingerprint/correlation/resources/usr/bin/plot_gbcm.R b/modules/msk/fingerprint/correlation/resources/usr/bin/plot_gbcm.R new file mode 100755 index 00000000..176b388b --- /dev/null +++ b/modules/msk/fingerprint/correlation/resources/usr/bin/plot_gbcm.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript + +#------------------------------------------------------------------------------- +# Script: plot_gbcm.R +# Author: Hanan Salim +# Date: 2026-02-09 +# Version: 0.2.0 +# +# Description: This script takes in a wide fingerprinting table pertaining +# to multiple samples and plots in pdf and html formats. +# Additionally, a table with the number of observations for each correlation +# is also written to an output file. 
+# +#------------------------------------------------------------------------------- + +rm(list=ls()) + +library(argparse, quietly = T) +library(plyr, quietly = T) +library(dplyr, quietly = T) +library(data.table, quietly = T) +library(tidyverse, quietly = T) +library(scales, quietly = T) +library(ggforce, quietly = T) +library(gtools, quietly = T) +library(htmlwidgets) +library(ggiraph) + + +`%notin%` <- Negate(`%in%`) +`%notlike%` <- Negate(`%like%`) + + +#function to size the dots +calculate_point_size <- function(x,y) { + n_x <- length(unique(x)) + n_y <- length(unique(y)) + + #define your plot size (in inches) + plot_width_in <- 20 + plot_height_in <- 20 + + #convert to mm (1 inch = 25.4 mm) + plot_width_mm <- plot_width_in * 25.4 + plot_height_mm <- plot_height_in * 25.4 + + #calculate tile size in mm + tile_width_mm <- plot_width_mm / n_x + tile_height_mm <- plot_height_mm / n_y + + #max circle diameter (fits inside smallest tile dimension) + max_diameter_mm <- min(tile_width_mm, tile_height_mm) + + #approximate max point size for geom_point (radius in mm) + max_point_size <- max_diameter_mm + + return(max_point_size) +} + + +#function to create static plots +static_plot <- function(data, max_point_size) { + n = length(unique(data$Var1)) + legend_size = max_point_size * n * .4 + + axis_text_size = if (n < 25) 14 else 10 + + p <- ggplot(data, aes(x = Var1, y = Var2)) + + geom_tile(color = "gray50", linewidth = 0.25, fill = NA) + + geom_point_interactive( + aes(size = log2_size, + fill = value, + tooltip = paste0( + "x: ", Var1, "\n", + "y: ", Var2, "\n", + "Loci Overlap: ", size, "\n", + "Correlation: ", round(value, 2) + )), + shape = 21, + color="NA" + ) + + scale_x_discrete(limits = levels(data$Var1)) + + scale_y_discrete(limits = rev(levels(data$Var2))) + + scale_fill_viridis_c( + name = "Correlation", + option = "viridis", + direction = -1, + alpha = 0.75, + begin = 0, + end = 1, + limits = c(-1, 1), + breaks = seq(-1, 1, by = .25), + guide = 
guide_colorbar(direction = "vertical", + title.position = "top", + barheight = unit(legend_size, "mm"), + barwidth = unit(legend_size*.05, "mm") + )) + + scale_size_continuous( + limits = c(0, 14.2), #known max of log2(size) + range = c(0, max_point_size), + breaks = seq(2, 14, by = 4), + name = "Loci Overlap (log2)", + guide = guide_legend(direction = "vertical", + title.position = "top", + keyheight = unit(legend_size/4, "mm"), + override.aes = list( + color = "black", + stroke = 0.5 + )) + ) + + labs(title = title) + + theme_minimal() + + theme( + text = element_text(family = "Courier"), + panel.grid = element_blank(), + axis.text.x = element_text(angle = 90, hjust = 1, size = 10, color = "black"), + axis.text.y = element_text(size = 10, color = "black"), + axis.title = element_blank(), + plot.title = element_text(hjust = 0.5, size = 24, margin = margin(b = 15)), + legend.position = "right", + legend.box = "horizontal", + legend.box.just = "left", + legend.title.align = 0.5, + legend.spacing.x = unit(1, "cm"), + aspect.ratio = 1 + ) + + return(p) +} + + +parser = ArgumentParser(description = 'create correlation plots for a given sample') + +parser$add_argument('-t', '--table', required = TRUE, + help = 'summary table') + +parser$add_argument('-o', '--analysis_folder', required = TRUE, + help = 'output folder') + +parser$add_argument('-p', '--pool', required = FALSE, + default = "fp_plots", + help = 'pool ID') + +parser$add_argument('-f', '--filter', + action = "store_true", + default = FALSE, + help = "create pool levelel plots instead of extended plots" +) + +args = parser$parse_args() + +fingerprints = fread(args$table, sep = '\t') +outdir = args$analysis_folder +sample = args$pool + + +#format data +fingerprints <- fingerprints %>% select(-contains(c('Loci_hg19', 'Loci_hg38'))) +cols <- grep("VAF", names(fingerprints), value = TRUE) +fingerprints <- fingerprints[, ..cols] + +for ( col in 1:ncol(fingerprints)){ + colnames(fingerprints)[col] <- sub("VAF_", "", 
colnames(fingerprints)[col]) +} + +title = paste("Pool:", sample,"; ", nrow(fingerprints)," Loci used",sep = "") + +fp_matrix <- data.matrix(fingerprints) +fp_matrix = cor(as.matrix(fp_matrix), method = c("pearson"), use = "pairwise.complete.obs") + +fp_long <- reshape2::melt(fp_matrix) +observations = crossprod(!is.na(fingerprints)) +obs_long <- reshape2::melt(observations) +final <- data.frame(fp_long, size = obs_long$value) + +#calculate log2 size column +final$log2_size <- log2(final$size) + +if (args$filter) { + + if (identical(args$pool, "fp_plots")) { + message("A pool ID is required to create pool level plots") + quit(status = 1) + } + + message("Creating pool level plots") + type="pool" + + final = final %>% filter(grepl(args$pool, Var1) & grepl(args$pool, Var2)) + final = droplevels(final) + +} else { + message("Creating extended plots") + type="extended" +} + +#get max point size +max_point_size = calculate_point_size(final$Var1, final$Var2) + +#create static plot +s <- static_plot(final, max_point_size) +ggsave(paste(outdir,"/",sample,"_", type, '.pdf', sep = ""), plot = s, width = 25, height = 25, units = "in", device = cairo_pdf) + +#create interactive plot +i = girafe(ggobj = s, width_svg = 25, height_svg = 25, + options = list(opts_tooltip(css = "padding:5pt; font-size:16pt; color:white; background-color:black;"))) +saveWidget(i, paste(outdir,"/",sample,"_", type,'.html', sep = ""), selfcontained = TRUE) + +#save tables +write.table(observations, paste(outdir,"/",sample, '_observations.tab', sep = ''), sep = '\t') +write.table(fp_matrix, paste(outdir,"/",sample, '_correlations.tab', sep = ''), sep = '\t') diff --git a/modules/msk/fingerprint/correlation/tests/main.nf.test b/modules/msk/fingerprint/correlation/tests/main.nf.test new file mode 100644 index 00000000..c7ff43e8 --- /dev/null +++ b/modules/msk/fingerprint/correlation/tests/main.nf.test @@ -0,0 +1,113 @@ +nextflow_process { + + name "Test Process FINGERPRINT_CORRELATION" + script 
"../main.nf" + process "FINGERPRINT_CORRELATION" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag "fingerprint" + tag "fingerprint/correlation" + tag "fingerprint/combine" + tag "gbcms" + tag "fingerprint/vcfparser" + + test("sarscov2 - bam") { + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = Channel.of( + [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + [ + [ id:'test2', sample:'test2' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + ) + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("FINGERPRINT_VCFPARSER"){ + script "../../vcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + run("FINGERPRINT_COMBINE"){ + script "../../combine/main.nf" + process { + """ + input[0] = FINGERPRINT_VCFPARSER.out.tsv + .map{ meta, tsv -> + def meta2 = [id:meta.pool] + [[id:meta.pool], tsv, meta.id, "hg19", "default"] + }.groupTuple(by:[0]) + input[1] = file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + } + when { + process { + """ + input[0] = FINGERPRINT_COMBINE.out.combined_fp_tsv + input[1] = "" + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( 
+ process.out.correlations_tab, + process.out.observations_tab, + process.out.versions_correlation + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[id:'thispool'], file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true)] + input[1] = "" + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out.correlations_tab, + process.out.observations_tab, + process.out.versions_correlation + ).match() } + ) + } + + } + +} diff --git a/modules/msk/fingerprint/correlation/tests/main.nf.test.snap b/modules/msk/fingerprint/correlation/tests/main.nf.test.snap new file mode 100644 index 00000000..9ccb6af1 --- /dev/null +++ b/modules/msk/fingerprint/correlation/tests/main.nf.test.snap @@ -0,0 +1,66 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + [ + [ + { + "id": "thispool" + }, + "thispool_correlations.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "thispool" + }, + "thispool_observations.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + "FINGERPRINT_CORRELATION", + "plot_gbcm.R", + "0.1.0" + ] + ] + ], + "timestamp": "2026-03-31T11:51:06.685431978", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + }, + "sarscov2 - bam": { + "content": [ + [ + [ + { + "id": null + }, + "batch_correlations.tab:md5,dbc55d8829950501d3ed2db9a832165c" + ] + ], + [ + [ + { + "id": null + }, + "batch_observations.tab:md5,858d6d115a4da81652bb98dcc8b8077f" + ] + ], + [ + [ + "FINGERPRINT_CORRELATION", + "plot_gbcm.R", + "0.1.0" + ] + ] + ], + "timestamp": "2026-03-31T11:51:01.173687804", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + } +} \ No newline at end of file diff --git a/modules/msk/fingerprint/correlation/tests/nextflow.config b/modules/msk/fingerprint/correlation/tests/nextflow.config new file mode 100644 index 00000000..06367761 --- /dev/null +++ 
b/modules/msk/fingerprint/correlation/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: 'FINGERPRINT_COMBINE' { + ext.args = "-d 0" + } + withName: 'FINGERPRINT_VCFPARSER' { + ext.args = "-d 0" + } +} diff --git a/modules/msk/fingerprint/mislabels/environment.yml b/modules/msk/fingerprint/mislabels/environment.yml new file mode 100644 index 00000000..db92c11c --- /dev/null +++ b/modules/msk/fingerprint/mislabels/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-argparse=2.3.1 + - conda-forge::r-data.table=1.17.8 + - conda-forge::r-dplyr=1.1.4 + - conda-forge::r-plyr=1.8.9 + - conda-forge::r-tidyverse=2.0.0 diff --git a/modules/msk/fingerprint/mislabels/main.nf b/modules/msk/fingerprint/mislabels/main.nf new file mode 100644 index 00000000..35989422 --- /dev/null +++ b/modules/msk/fingerprint/mislabels/main.nf @@ -0,0 +1,46 @@ +process FINGERPRINT_MISLABELS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66': + 'community.wave.seqera.io/library/r-argparse_r-data.table_r-dplyr_r-plyr_r-tidyverse:8c0daffb3624cb66' }" + + input: + tuple val(meta), path(correlations_tab), path(observations_tab) + path(sample_sheet) + + output: + tuple val(meta), path("*_unexpected_match.pdf"), emit: unexpected_match_pdf + tuple val(meta), path("*_unexpected_match.txt"), emit: unexpected_match_txt + tuple val(meta), path("*_unexpected_mismatch.pdf"), emit: unexpected_mismatch_pdf + tuple val(meta), path("*_unexpected_mismatch.txt"), emit: unexpected_mismatch_txt + tuple val("${task.process}"), val('unexpected_match_mismatch.R'), val("0.1.0"), emit: versions_mislabels, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + prefix = (prefix && prefix != "") ? prefix : "batch" + """ + unexpected_match_mismatch.R \\ + -r ${prefix} \\ + -o ./ \\ + -i ${sample_sheet} \\ + -c ${correlations_tab} \\ + -n ${observations_tab} \\ + ${args} + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_unexpected_match.pdf + touch ${prefix}_unexpected_match.txt + touch ${prefix}_unexpected_mismatch.pdf + touch ${prefix}_unexpected_mismatch.txt + """ +} diff --git a/modules/msk/fingerprint/mislabels/meta.yml b/modules/msk/fingerprint/mislabels/meta.yml new file mode 100644 index 00000000..f8fe26e7 --- /dev/null +++ b/modules/msk/fingerprint/mislabels/meta.yml @@ -0,0 +1,99 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fingerprint_mislabels" +description: "Identify unexpected matches and mismatches from fingerprint correlations and observations based on patient labels" +version: "0.1.0" +keywords: + - fingerprint + - mislabels + - qc + - sample_swap +tools: + - 
"unexpected_match_mismatch.R": + description: "In-house R script for detecting unexpected sample matches and mismatches." + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: null + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'pool1' ]` + - correlations_tab: + type: file + description: Fingerprint correlations file from FINGERPRINT_CORRELATION + pattern: "*_correlations.tab" + ontologies: + - edam: http://edamontology.org/format_3475 + - observations_tab: + type: file + description: Fingerprint observations file from FINGERPRINT_CORRELATION + pattern: "*_observations.tab" + ontologies: + - edam: http://edamontology.org/format_3475 + - - sample_sheet: + type: file + description: CSV sample sheet with columns sample, patient, is_donor + pattern: "*.csv" + ontologies: + - edam: http://edamontology.org/format_3752 + +output: + unexpected_match_pdf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'pool1' ]` + - "*_unexpected_match.pdf": + type: file + description: PDF plots of unexpected match analysis + pattern: "*_unexpected_match.pdf" + unexpected_match_txt: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'pool1' ]` + - "*_unexpected_match.txt": + type: file + description: Table of flagged unexpected matches + pattern: "*_unexpected_match.txt" + unexpected_mismatch_pdf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'pool1' ]` + - "*_unexpected_mismatch.pdf": + type: file + description: PDF plots of unexpected mismatch analysis + pattern: "*_unexpected_mismatch.pdf" + unexpected_mismatch_txt: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'pool1' ]` + - "*_unexpected_mismatch.txt": + type: file + description: Table of flagged unexpected mismatches + pattern: "*_unexpected_mismatch.txt" + versions_mislabels: + - - ${task.process}: + type: string + description: The name of the process + - unexpected_match_mismatch.R: + type: string + description: The name of the tool + - 0.1.0: + type: eval + description: The version of the tool +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/fingerprint/mislabels/resources/usr/bin/unexpected_match_mismatch.R b/modules/msk/fingerprint/mislabels/resources/usr/bin/unexpected_match_mismatch.R new file mode 100755 index 00000000..2cfbd617 --- /dev/null +++ b/modules/msk/fingerprint/mislabels/resources/usr/bin/unexpected_match_mismatch.R @@ -0,0 +1,317 @@ +#!/usr/bin/env Rscript + +#------------------------------------------------------------------------------- +# Script: unexpected_match_mismatch.R +# Author: Erika Gedvilaite +# Date: 2026-03-10 +# Version: 0.1.0 +# +# Description: This script takes in fingerprint corrations and observation +# counts and identifies patient mismatches and matches based on patient labels. 
+# +#------------------------------------------------------------------------------- + +rm(list=ls()) +library(plyr, quietly = T) +library(dplyr, quietly = T) +library(data.table, quietly = T) +library(tidyverse, quietly = T) +library(argparse, quietly = T) + +`%notin%` <- Negate(`%in%`) +`%notlike%` <- Negate(`%like%`) + +parser = ArgumentParser(description = 'Generate Unexpected Match and Mismatch results for FPv3 (TRACE)') +parser$add_argument('-r', '--run_id', required = TRUE, + help = 'Sequencing Run') +parser$add_argument('-o', '--output_folder', required = TRUE, + help = 'Output folder') +parser$add_argument('-i', '--sample_sheet', required = TRUE, + help = 'Sample Sheet') +parser$add_argument('-c', '--correlations', required = TRUE, + help = 'Path to fingerprint correlations file') +parser$add_argument('-n', '--observations', required = TRUE, + help = 'Path to fingerprint observations file') +args = parser$parse_args() +theme_set(theme_classic()) + +# Helper: build seq() breakpoints for cut(); expands range when all values +# fall within the same integer interval (avoids "invalid number of intervals"). 
+make_corr_breaks <- function(x) { + lo <- floor(min(x, na.rm = TRUE)) + hi <- ceiling(max(x, na.rm = TRUE)) + if (lo == hi) { lo <- lo - 1L; hi <- hi + 1L } + seq(lo, hi, by = 0.1) +} + +# Setting up input collection + +poolID = args$run_id +samplesheetpath = args$sample_sheet +outputpath = args$output_folder + +print(paste("Correlations file: ", args$correlations, sep = "")) +print(paste("Observations file: ", args$observations, sep = "")) +print(paste("Output directory: ",outputpath, sep = "")) +print(paste("Sample Sheet: ",samplesheetpath, sep = "")) +print(paste("Run ID: ",poolID, sep = "")) + + +sample_sheet = read.csv(samplesheetpath,header = T, sep = ",", check.names = F) +sample_sheet = sample_sheet %>% select(sample, patient, is_donor) %>% unique() +sample_sheet$patient = str_pad(sample_sheet$patient, 8, pad = "0") +colnames(sample_sheet) = c("Sample","Patient","IsDonor") + +sample_sheet <- sample_sheet %>% + mutate( + Transplant = case_when(IsDonor == "true" ~ "Donor Found", + TRUE ~ "No Donor Found") + ) + +sample_sheet_transplant = sample_sheet %>% select(Patient, Transplant) %>% unique() +sample_sheet_transplant = sample_sheet_transplant[sample_sheet_transplant$Transplant == "Donor Found",] + +sample_sheet = sample_sheet %>% select(Patient, Sample) %>% unique() +sample_sheet = merge(sample_sheet, sample_sheet_transplant, by = "Patient", all.x = T) +sample_sheet$Transplant[is.na(sample_sheet$Transplant)==T] <- "No Donor Found" + +correlation_f = read.csv(args$correlations, header = T, sep = "\t", check.names = F) + +observations_f = read.csv(args$observations, header = T, sep = "\t", check.names = F) + +correlation_f = as.data.frame(correlation_f) +observations_f = as.data.frame(observations_f) + +correlation_wide_df <- as.data.frame(correlation_f) + +correlation_wide_df$Assay1 <- rownames(correlation_wide_df) +rownames(correlation_wide_df) <- NULL +correlation_wide_df <- correlation_wide_df[, c("Assay1", colnames(correlation_wide_df))] + 
+correlation_wide_df <- correlation_wide_df[, !(names(correlation_wide_df) %in% c("Assay1.1"))] + +correlation_long_df <- melt(setDT(correlation_wide_df), id.vars = c("Assay1"), variable.name = "Sample") + +colnames(correlation_long_df) = c("Sample1", "Sample2", "Correlation") + +correlation_long_df = correlation_long_df %>% select(Sample1, Sample2, Correlation) %>% unique() + +observations_wide_df <- as.data.frame(observations_f) + +observations_wide_df$Assay1 <- rownames(observations_wide_df) +rownames(observations_wide_df) <- NULL +observations_wide_df <- observations_wide_df[, c("Assay1", colnames(observations_wide_df))] + +observations_wide_df <- observations_wide_df[, !(names(observations_wide_df) %in% c("Assay1.1"))] + +observations_long_df <- melt(setDT(observations_wide_df), id.vars = c("Assay1"), variable.name = "Sample") + +colnames(observations_long_df) = c("Sample1", "Sample2", "Observation") + +observations_long_df = observations_long_df %>% select(Sample1, Sample2, Observation) %>% unique() + +correlation_long_df = merge(correlation_long_df, observations_long_df) %>% unique() %>% drop_na() + +correlation_long_df$Correlation = round(correlation_long_df$Correlation,2) + +correlation_long_df = merge(correlation_long_df, sample_sheet, by.x = "Sample1", by.y = "Sample", all.x = T) +correlation_long_df = merge(correlation_long_df, sample_sheet, by.x = "Sample2", by.y = "Sample", all.x = T) + +colnames(correlation_long_df) = c("Sample2", "Sample1", "Correlation", "Observation", "Patient1", "Donor_Status1", "Patient2", "Donor_Status2") + +## Data clean-out +### 1. Remove same sample-to-sample comparison (assume 1 for these) +### 2. Only keeping one pair per match (removing pair duplicates) + +key <- apply(correlation_long_df[, c("Sample1", "Sample2")], 1, function(x) paste(sort(x), collapse = "|")) +correlation_long_df_clean <- correlation_long_df[!duplicated(key), ] + +## Analysis organization +### 1. 
Unexpected match: Sample 1 and Sample 2 are coming from DIFFERENT Patient ID +### 2. Unexpected mismatch: Sample 1 and Sample 2 are coming from the SAME Patient ID + +unexpected_match = correlation_long_df_clean[correlation_long_df_clean$Patient1!=correlation_long_df_clean$Patient2,] +unexpected_mismatch = correlation_long_df_clean[correlation_long_df_clean$Patient1==correlation_long_df_clean$Patient2,] + +## Unexpected match calculation - sample + +unexpected_match_sample = copy(unexpected_match) +unexpected_match_sample$Loci_Status = ifelse(unexpected_match_sample$Observation >= 10, "Loci Pass","Loci Low") +unexpected_match_sample$Donor_Status = ifelse((unexpected_match_sample$Donor_Status1 == "Donor Found" | unexpected_match_sample$Donor_Status2 == "Donor Found"), "Donor Present","No Donor") + +unexpected_match_sample$Pool_mean = round(mean(unexpected_match_sample$Correlation),2) +unexpected_match_sample$Pool_SD = round(sd(unexpected_match_sample$Correlation),2) + +unexpected_match_sample$Cohort_mean = 0.02 +unexpected_match_sample$Cohort_SD = 0.07 + +unexpected_match_sample$Pool_meanplussd = unexpected_match_sample$Pool_mean + unexpected_match_sample$Pool_SD +unexpected_match_sample$Pool_meanplussd = round(unexpected_match_sample$Pool_meanplussd,2) + +unexpected_match_sample$Mean_plusSD = unexpected_match_sample$Cohort_mean+unexpected_match_sample$Cohort_SD +unexpected_match_sample$Mean_plus2SD = unexpected_match_sample$Cohort_mean+2*unexpected_match_sample$Cohort_SD +unexpected_match_sample$Mean_plus25SD = unexpected_match_sample$Cohort_mean+2.5*unexpected_match_sample$Cohort_SD +unexpected_match_sample$Mean_minusSD = unexpected_match_sample$Cohort_mean-unexpected_match_sample$Cohort_SD +unexpected_match_sample$Mean_minus2SD = unexpected_match_sample$Cohort_mean-2*unexpected_match_sample$Cohort_SD +unexpected_match_sample$Mean_minus25SD = unexpected_match_sample$Cohort_mean-2.5*unexpected_match_sample$Cohort_SD + + +unexpected_match_sample$Match_Status = 
ifelse(unexpected_match_sample$Correlation >= unexpected_match_sample$Mean_plus25SD, "Matching","Pass") + +unexpected_match_sample$key = paste(unexpected_match_sample$Sample1, unexpected_match_sample$Sample2, sep=":") + +unexpected_match_sample_intervals_corr = unexpected_match_sample %>% + mutate(interval = cut(Correlation, breaks = make_corr_breaks(Correlation), include.lowest = TRUE)) %>% + count(interval) + +intervals_set <- c("[-1,-0.9]", "(-0.9,-0.8]", "(-0.8,-0.7]", "(-0.7,-0.6]", "(-0.6,-0.5]","(-0.5,-0.4]", "(-0.4,-0.3]", "(-0.3,-0.2]", "(-0.2,-0.1]", "(-0.1,0]", "(0,0.1]", "(0.1,0.2]", "(0.2,0.3]", "(0.3,0.4]", "(0.4,0.5]", "(0.5,0.6]", "(0.6,0.7]", "(0.7,0.8]", "(0.8,0.9]","(0.9,1]") +intervals_df <- data.frame( + interval = intervals_set +) + +intervals_df = merge(intervals_df, unexpected_match_sample_intervals_corr, all.x = T) +intervals_df$n[is.na(intervals_df$n)] <- 0 +intervals_df$percent = round(intervals_df$n/nrow(unexpected_match_sample),digits = 2) + +intervals_df$interval <- factor(intervals_df$interval, levels = c("[-1,-0.9]", "(-0.9,-0.8]", "(-0.8,-0.7]", "(-0.7,-0.6]", "(-0.6,-0.5]","(-0.5,-0.4]", "(-0.4,-0.3]", "(-0.3,-0.2]", "(-0.2,-0.1]", "(-0.1,0]", "(0,0.1]", "(0.1,0.2]", "(0.2,0.3]", "(0.3,0.4]", "(0.4,0.5]", "(0.5,0.6]", "(0.6,0.7]", "(0.7,0.8]", "(0.8,0.9]","(0.9,1]")) + +pdf(file = paste(outputpath,"/",poolID,"_unexpected_match.pdf",sep = ""), width = 10, height = 6) + + +group_colors <- c(Pass = "#D3D3D3", Matching = "#CC6600") + +ggplot(unexpected_match_sample, aes(x = key, y = Correlation)) + + geom_point(aes(colour = Match_Status, shape = Donor_Status), size = 1.0) + + geom_hline(aes(yintercept = Mean_plus25SD, linetype = "Mean+2.5SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_minus25SD, linetype = "Mean-2.5SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_plusSD, linetype = "Mean+SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_minusSD, linetype = "Mean-SD"), size = 0.5) + + 
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) + + labs(colour = "Match Status") + + labs(shape = "Donor Status") + + ylim(-1,1) + + scale_color_manual(values = group_colors) + + labs(linetype = "Limits") + + ggtitle(paste("Pool:",poolID,sep=""),subtitle = "Unexpected Match Overall") + +ggplot(unexpected_match_sample, aes(x = key, y = Correlation)) + + geom_point(aes(colour = Match_Status, shape = Donor_Status), size = 1.0) + + geom_hline(aes(yintercept = Mean_plus25SD, linetype = "Mean+2.5SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_minus25SD, linetype = "Mean-2.5SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_plusSD, linetype = "Mean+SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_minusSD, linetype = "Mean-SD"), size = 0.5) + + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) + + labs(colour = "Match Status") + + labs(shape = "Donor Status") + + ylim(-1,1) + + scale_color_manual(values = group_colors) + + labs(linetype = "Limits") + + facet_wrap(~Patient2, scales = "free_x") + + ggtitle(paste("Pool:",poolID, sep=""),subtitle = "Unexpected Match Overall") + +ggplot(intervals_df, aes(x=interval, y = log10(n))) + + geom_bar(stat = "identity", position = "dodge") + + geom_text(aes(label = paste(n,"\n",percent,sep="")), vjust = -0.5, color = "black")+ + annotate("text", x=1, y=5, label= paste("Threshold Mean + SD: ",unexpected_match_sample$Mean_plusSD,sep=""), hjust = 0) + + annotate("text", x=1, y=4.5, label= paste("Threshold Mean + 2.5*SD: ",unexpected_match_sample$Mean_plus25SD,sep=""), hjust = 0) + + annotate("text", x=1, y=4, label= paste("Threshold Mean - SD: ",unexpected_match_sample$Mean_minusSD,sep=""), hjust = 0) + + annotate("text", x=1, y=3.5, label= paste("Threshold Mean - 2.5SD: ",unexpected_match_sample$Mean_minus25SD,sep=""), hjust = 0) + + annotate("text", x=1, y=3., label= paste("Pool Mean + SD: 
",unexpected_match_sample$Pool_meanplussd,sep=""), hjust = 0) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + xlab("Intervals") + + ylab("log10(Compared Pairs)") + + ggtitle(paste("Pool:",poolID, sep=""),subtitle = "Unexpected Match Intervals") + +dev.off() + +unexpected_match_sample_table = unexpected_match_sample[unexpected_match_sample$Match_Status == "Matching",] + +unexpected_match_sample_table = unexpected_match_sample_table %>% select(Sample1, Sample2, Correlation, Observation, Mean_minusSD, Mean_minus2SD, Mean_minus25SD, Mean_plusSD, Mean_plus2SD, Mean_plus25SD, Match_Status, Loci_Status, Donor_Status) %>% unique() +write.table(unexpected_match_sample_table,file = paste(outputpath,"/",poolID,"_unexpected_match.txt",sep = ""), append = F, quote = F, sep = "\t", row.names = F) + +## Unexpected mismatch calculation - sample + +unexpected_mismatch_sample = copy(unexpected_mismatch) +unexpected_mismatch_sample$Loci_Status = ifelse(unexpected_mismatch_sample$Observation >= 10, "Loci Pass","Loci Low") +unexpected_mismatch_sample$Donor_Status = ifelse((unexpected_mismatch_sample$Donor_Status1 == "Donor Found" | unexpected_mismatch_sample$Donor_Status2 == "Donor Found"), "Donor Present","No Donor") +unexpected_mismatch_sample$Correlation = as.numeric(unexpected_mismatch_sample$Correlation) + +unexpected_mismatch_sample$Pool_mean = round(mean(unexpected_mismatch_sample$Correlation),2) +unexpected_mismatch_sample$Pool_sd = round(sd(unexpected_mismatch_sample$Correlation),2) + +unexpected_mismatch_sample$Pool_meanminussd = unexpected_mismatch_sample$Pool_mean - unexpected_mismatch_sample$Pool_sd + +unexpected_mismatch_sample$Cohort_mean = 0.96 +unexpected_mismatch_sample$Cohort_SD = 0.07 + +unexpected_mismatch_sample$Mean_minus25SD = unexpected_mismatch_sample$Cohort_mean-2.5*unexpected_mismatch_sample$Cohort_SD +unexpected_mismatch_sample$Mean_minusSD = unexpected_mismatch_sample$Cohort_mean-unexpected_mismatch_sample$Cohort_SD 
+unexpected_mismatch_sample$Mean_minus2SD = unexpected_mismatch_sample$Cohort_mean-2*unexpected_mismatch_sample$Cohort_SD + +unexpected_mismatch_sample$Match_Status = ifelse(unexpected_mismatch_sample$Correlation <= unexpected_mismatch_sample$Mean_minus25SD, "Mismatching","Pass") + +unexpected_mismatch_sample$key = paste(unexpected_mismatch_sample$Sample1, unexpected_mismatch_sample$Sample2, sep=":") + +unexpected_mismatch_sample_intervals_corr = unexpected_mismatch_sample %>% + mutate(interval = cut(Correlation, breaks = make_corr_breaks(Correlation), include.lowest = TRUE)) %>% + count(interval) + +intervals_set <- c("[-1,-0.9]", "(-0.9,-0.8]", "(-0.8,-0.7]", "(-0.7,-0.6]", "(-0.6,-0.5]","(-0.5,-0.4]", "(-0.4,-0.3]", "(-0.3,-0.2]", "(-0.2,-0.1]", "(-0.1,0]", "(0,0.1]", "(0.1,0.2]", "(0.2,0.3]", "(0.3,0.4]", "(0.4,0.5]", "(0.5,0.6]", "(0.6,0.7]", "(0.7,0.8]", "(0.8,0.9]","(0.9,1]") +intervals_df <- data.frame( + interval = intervals_set +) + +intervals_df = merge(intervals_df, unexpected_mismatch_sample_intervals_corr, all.x = T) +intervals_df$n[is.na(intervals_df$n)] <- 0 +intervals_df$percent = round(intervals_df$n/nrow(unexpected_mismatch_sample),digits = 2) + +intervals_df$interval <- factor(intervals_df$interval, levels = c("[-1,-0.9]", "(-0.9,-0.8]", "(-0.8,-0.7]", "(-0.7,-0.6]", "(-0.6,-0.5]","(-0.5,-0.4]", "(-0.4,-0.3]", "(-0.3,-0.2]", "(-0.2,-0.1]", "(-0.1,0]", "(0,0.1]", "(0.1,0.2]", "(0.2,0.3]", "(0.3,0.4]", "(0.4,0.5]", "(0.5,0.6]", "(0.6,0.7]", "(0.7,0.8]", "(0.8,0.9]","(0.9,1]")) + + +pdf(file = paste(outputpath,"/",poolID,"_unexpected_mismatch.pdf",sep = ""), width = 10, height = 6) + +group_colors <- c(Pass = "#D3D3D3", Mismatching = "#CC6600") + + +ggplot(unexpected_mismatch_sample, aes(x = key, y = Correlation)) + + geom_point(aes(colour = Match_Status, shape = Donor_Status), size = 1.0) + + geom_hline(aes(yintercept = Mean_minus25SD, linetype = "Mean-2.5SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_minusSD, linetype = "Mean-SD"), size = 
0.5) + + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) + + labs(colour = "Match Status") + + labs(shape = "Donor Status") + + ylim(0,1) + + scale_color_manual(values = group_colors) + + labs(linetype = "Limits") + + ggtitle(paste("Pool:",poolID,sep=""),subtitle = "Unexpected Mismatch Overall") + +ggplot(unexpected_mismatch_sample, aes(x = key, y = Correlation)) + + geom_point(aes(colour = Match_Status, shape = Donor_Status), size = 1.0) + + geom_hline(aes(yintercept = Mean_minus25SD, linetype = "Mean-2.5SD"), size = 0.5) + + geom_hline(aes(yintercept = Mean_minusSD, linetype = "Mean-SD"), size = 0.5) + + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) + + labs(colour = "Match Status") + + labs(shape = "Donor Status") + + ylim(0,1) + + scale_color_manual(values = group_colors) + + labs(linetype = "Limits") + + facet_wrap(~Patient2, scales = "free_x") + + ggtitle(paste("Pool:",poolID,sep=""),subtitle = "Unexpected Mismatch Overall") + +ggplot(intervals_df, aes(x=interval, y = log10(n))) + + geom_bar(stat = "identity", position = "dodge") + + geom_text(aes(label = paste(n,"\n",percent,sep="")), vjust = -0.5, color = "black")+ + annotate("text", x=1, y=4.5, label= paste("Threshold Mean - SD: ",unexpected_mismatch_sample$Mean_minusSD,sep=""), hjust = 0) + + annotate("text", x=1, y=4, label= paste("Threshold Mean - 2.5SD: ",unexpected_mismatch_sample$Mean_minus25SD,sep=""), hjust = 0) + + annotate("text", x=1, y=3.5, label= paste("Pool Mean + SD: ",unexpected_mismatch_sample$Pool_meanminussd,sep=""), hjust = 0) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + ggtitle(paste("Pool:",poolID,sep=""),subtitle = "Unexpected Mismatch Intervals") + + xlab("Intervals") + + ylab("log10(Compared Pairs)") + +dev.off() + +unexpected_mismatch_sample_table = unexpected_mismatch_sample[unexpected_mismatch_sample$Match_Status == "Mismatching",] + 
+unexpected_mismatch_sample_table = unexpected_mismatch_sample_table %>% select(Sample1, Sample2, Correlation, Observation, Mean_minusSD, Mean_minus2SD, Mean_minus25SD, Loci_Status, Donor_Status) %>% unique() +write.table(unexpected_mismatch_sample_table,file = paste(outputpath,"/",poolID,"_unexpected_mismatch.txt",sep = ""), append = F, quote = F, sep = "\t", row.names = F) diff --git a/modules/msk/fingerprint/mislabels/tests/correlations.tab b/modules/msk/fingerprint/mislabels/tests/correlations.tab new file mode 100644 index 00000000..e69de29b diff --git a/modules/msk/fingerprint/mislabels/tests/main.nf.test b/modules/msk/fingerprint/mislabels/tests/main.nf.test new file mode 100644 index 00000000..41e205f0 --- /dev/null +++ b/modules/msk/fingerprint/mislabels/tests/main.nf.test @@ -0,0 +1,125 @@ +nextflow_process { + + name "Test Process FINGERPRINT_MISLABELS" + script "../main.nf" + process "FINGERPRINT_MISLABELS" + config "./nextflow.config" + + tag "modules" + tag "modules_msk" + tag "fingerprint" + tag "fingerprint/mislabels" + tag "fingerprint/correlation" + tag "fingerprint/combine" + tag "gbcms" + tag "fingerprint/vcfparser" + + test("sarscov2 - bam") { + setup { + run("GBCMS") { + script "../../../gbcms/main.nf" + process { + """ + input[0] = Channel.of( + [ + [ id:'test', sample:'test' ], + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + [ + [ id:'test2', sample:'test2' ], + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ], + ) + 
input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + run("FINGERPRINT_VCFPARSER") { + script "../../vcfparser/main.nf" + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + run("FINGERPRINT_COMBINE") { + script "../../combine/main.nf" + process { + """ + input[0] = FINGERPRINT_VCFPARSER.out.tsv + .map{ meta, tsv -> + def meta2 = [id:meta.pool] + [[id:meta.pool], tsv, meta.id, "hg19", "default"] + }.groupTuple(by:[0]) + input[1] = file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true) + """ + } + } + run("FINGERPRINT_CORRELATION") { + script "../../correlation/main.nf" + process { + """ + input[0] = FINGERPRINT_COMBINE.out.combined_fp_tsv + input[1] = "" + """ + } + } + } + + when { + process { + """ + input[0] = FINGERPRINT_CORRELATION.out.correlations_tab + .join(FINGERPRINT_CORRELATION.out.observations_tab) + input[1] = file("$baseDir/modules/msk/fingerprint/mislabels/tests/sample_sheet.csv", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.unexpected_match_txt, + process.out.unexpected_mismatch_txt, + process.out.versions_mislabels + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id:'thispool'], + file("$baseDir/modules/msk/fingerprint/mislabels/tests/correlations.tab", checkIfExists: true), + file("$baseDir/modules/msk/fingerprint/mislabels/tests/observations.tab", checkIfExists: true) + ] + input[1] = file("$baseDir/modules/msk/fingerprint/mislabels/tests/sample_sheet.csv", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/fingerprint/mislabels/tests/main.nf.test.snap 
b/modules/msk/fingerprint/mislabels/tests/main.nf.test.snap new file mode 100644 index 00000000..5fe6fb12 --- /dev/null +++ b/modules/msk/fingerprint/mislabels/tests/main.nf.test.snap @@ -0,0 +1,123 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_match.pdf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_match.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_mismatch.pdf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_mismatch.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + "FINGERPRINT_MISLABELS", + "unexpected_match_mismatch.R", + "0.1.0" + ] + ], + "unexpected_match_pdf": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_match.pdf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unexpected_match_txt": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_match.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unexpected_mismatch_pdf": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_mismatch.pdf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unexpected_mismatch_txt": [ + [ + { + "id": "thispool" + }, + "thispool_unexpected_mismatch.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_mislabels": [ + [ + "FINGERPRINT_MISLABELS", + "unexpected_match_mismatch.R", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:51:32.987116913", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + }, + "sarscov2 - bam": { + "content": [ + [ + [ + { + "id": null + }, + "null_unexpected_match.txt:md5,14af9ffece921578088528e9c1663886" + ] + ], + [ + [ + { + "id": null + }, + "null_unexpected_mismatch.txt:md5,d5c8b66fd71b1e4b20ee403d822cd7b9" + ] + ], + [ + [ + "FINGERPRINT_MISLABELS", + "unexpected_match_mismatch.R", + "0.1.0" + ] + ] + ], + "timestamp": 
"2026-03-31T11:51:27.508330482", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + } +} \ No newline at end of file diff --git a/modules/msk/fingerprint/mislabels/tests/nextflow.config b/modules/msk/fingerprint/mislabels/tests/nextflow.config new file mode 100644 index 00000000..06367761 --- /dev/null +++ b/modules/msk/fingerprint/mislabels/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: 'FINGERPRINT_COMBINE' { + ext.args = "-d 0" + } + withName: 'FINGERPRINT_VCFPARSER' { + ext.args = "-d 0" + } +} diff --git a/modules/msk/fingerprint/mislabels/tests/observations.tab b/modules/msk/fingerprint/mislabels/tests/observations.tab new file mode 100644 index 00000000..e69de29b diff --git a/modules/msk/fingerprint/mislabels/tests/sample_sheet.csv b/modules/msk/fingerprint/mislabels/tests/sample_sheet.csv new file mode 100644 index 00000000..19476b3f --- /dev/null +++ b/modules/msk/fingerprint/mislabels/tests/sample_sheet.csv @@ -0,0 +1,3 @@ +sample,patient,is_donor +test,1,false +test2,2,false diff --git a/modules/msk/fingerprint/vcfparser/environment.yml b/modules/msk/fingerprint/vcfparser/environment.yml new file mode 100644 index 00000000..a5547b5c --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::pysam=0.23.3" diff --git a/modules/msk/fingerprint/vcfparser/main.nf b/modules/msk/fingerprint/vcfparser/main.nf new file mode 100644 index 00000000..a82ddd9f --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/main.nf @@ -0,0 +1,41 @@ +process FINGERPRINT_VCFPARSER { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pysam:0.23.0--py39hdd5828d_0': + 'biocontainers/pysam:0.23.0--py39hdd5828d_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("${prefix}.fp.tsv") , emit: tsv + tuple val("${task.process}"), val('parse_fingerprint_vcf.py'), eval('parse_fingerprint_vcf.py -v | cut -f 2 -d" "'), emit: versions_vcfparser, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + parse_fingerprint_vcf.py \\ + --input ${vcf} \\ + --output ${prefix}.fp.tsv \\ + --samplename ${prefix} \\ + $args + + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + echo $args + + touch ${prefix}.fp.tsv + + """ +} diff --git a/modules/msk/fingerprint/vcfparser/meta.yml b/modules/msk/fingerprint/vcfparser/meta.yml new file mode 100644 index 00000000..15034410 --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/meta.yml @@ -0,0 +1,56 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fingerprint_vcfparser" +description: + Custom script to parse fingerprint VCF files, generated by the GBCMS + module. +keywords: + - fingerprint + - vcf + - pysam +tools: + - "parse_fingerprint_vcf.py": + description: + "Pysam is a Python module for reading and manipulating SAM/BAM/VCF/BCF + files. It's a lightweight wrapper of the htslib C-API, the same one that powers + samtools, bcftools, and tabix." + homepage: "https://pysam.readthedocs.io/en/latest/api.html" + documentation: "https://pysam.readthedocs.io/en/latest/api.html" + tool_dev_url: "https://github.com/pysam-developers/pysam" + licence: ["MIT"] + identifier: biotools:pysam + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - vcf: + type: file + description: VCF file output from GBCMS + pattern: "*.vcf" + ontologies: + - edam: http://edamontology.org/format_3016 # VCF +output: + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}.fp.tsv: + type: file + description: + Tab-separated values (TSV) file containing parsed fingerprint + data + pattern: "${prefix}.fp.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + versions_vcfparser: + - - ${task.process}: {} + - parse_fingerprint_vcf.py: {} + - 'parse_fingerprint_vcf.py -v | cut -f 2 -d" "': {} +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/msk/fingerprint/vcfparser/resources/usr/bin/parse_fingerprint_vcf.py b/modules/msk/fingerprint/vcfparser/resources/usr/bin/parse_fingerprint_vcf.py new file mode 100755 index 00000000..b4ddd044 --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/resources/usr/bin/parse_fingerprint_vcf.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +import argparse + +""" +Converts fingerprint vcf to a formatted table +""" + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.1.0" +__status__ = "Dev" + +import sys, os +from pysam import VariantFile # version >= 0.15.2 +from itertools import groupby + +def usage(): + parser = argparse.ArgumentParser(prog='parse_fingerprint_vcf.py') + parser.add_argument('--input','-i', help = 'input file', required = True) + parser.add_argument('--samplename','-n', help = 'sample name', required = True) + parser.add_argument('--output','-o', help = 'output file', required = True) + parser.add_argument('--depth-filter','-d', default = 20, type = int, help = 'minimum read depth for outputting a minor allele frequency [default = 20]') + parser.add_argument('--version','-v',action='version',version='%(prog)s ' + __version__, help="Show program's version number and exit.") + return parser.parse_args() + +def main(): + args 
= usage() + + fp_out_list = [] + + vcf_in = VariantFile(args.input, "r") + for vcf_rec in vcf_in.fetch(): + ref_allele = vcf_rec.ref + alt_allele = vcf_rec.alts[0] + ref_allele_count = vcf_rec.samples[args.samplename]["RD"] + alt_allele_count = vcf_rec.samples[args.samplename]["AD"] + if ref_allele_count >= alt_allele_count and ref_allele_count > 0: + maf = alt_allele_count / float(ref_allele_count + alt_allele_count) + if maf < .1: + genotype = ref_allele*2 + else: + genotype = ref_allele + alt_allele + elif alt_allele_count > ref_allele_count: + maf = ref_allele_count / float(ref_allele_count + alt_allele_count) + if maf < .1: + genotype = alt_allele*2 + #else: genotype = alt_allele + ref_allele + else: + genotype = ref_allele + alt_allele + elif ref_allele_count == 0: + genotype = "--" + else: + genotype = ref_allele + alt_allele + if ref_allele_count + alt_allele_count < args.depth_filter or genotype == "--": + maf = "" + + + formatted_counts = "{}:{} {}:{}".format(ref_allele,ref_allele_count,alt_allele,alt_allele_count) + + locus = "{}:{}".format(vcf_rec.chrom,vcf_rec.pos) + depth = vcf_rec.samples[args.samplename]["DP"] + + fp_out_list += [[locus,formatted_counts, genotype, maf]] + + with open(args.output,'w') as f: + f.write("\t".join(['Locus', args.samplename + '_Counts', args.samplename + '_Genotypes', args.samplename + '_MinorAlleleFreq']) + "\n") + for i in fp_out_list: + f.write("\t".join([str(j) for j in i]) + "\n") + +if __name__ == "__main__": + main() diff --git a/modules/msk/fingerprint/vcfparser/tests/main.nf.test b/modules/msk/fingerprint/vcfparser/tests/main.nf.test new file mode 100644 index 00000000..d18ec635 --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/tests/main.nf.test @@ -0,0 +1,75 @@ +// nf-core modules test fingerprint/vcfparser +nextflow_process { + + name "Test Process FINGERPRINT_VCFPARSER" + script "../main.nf" + process "FINGERPRINT_VCFPARSER" + + tag "modules" + tag "modules_msk" + tag "fingerprint" + tag 
"fingerprint/vcfparser" + tag "gbcms" + + test("sarscov2 - vcf") { + config "./nextflow.config" + setup { + run("GBCMS"){ + script "../../../gbcms/main.nf" + process { + """ + input[0] = [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + "variant_file.vcf" + ] + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + """ + } + } + } + when { + process { + """ + input[0] = GBCMS.out.variant_file + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - vcf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/fingerprint/vcfparser/tests/main.nf.test.snap b/modules/msk/fingerprint/vcfparser/tests/main.nf.test.snap new file mode 100644 index 00000000..21623821 --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "sarscov2 - vcf": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,9fa9a081f17ee52f03463c96d46a23aa" + ] + ], + "1": [ + [ + "FINGERPRINT_VCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ], + "tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,9fa9a081f17ee52f03463c96d46a23aa" + ] + ], + "versions_vcfparser": [ + [ + 
"FINGERPRINT_VCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:56:37.863052588", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + }, + "sarscov2 - vcf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fp.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FINGERPRINT_VCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fp.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_vcfparser": [ + [ + "FINGERPRINT_VCFPARSER", + "parse_fingerprint_vcf.py", + "0.1.0" + ] + ] + } + ], + "timestamp": "2026-03-31T11:56:43.764600116", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.01.1" + } + } +} \ No newline at end of file diff --git a/modules/msk/fingerprint/vcfparser/tests/nextflow.config b/modules/msk/fingerprint/vcfparser/tests/nextflow.config new file mode 100644 index 00000000..f2cf46a3 --- /dev/null +++ b/modules/msk/fingerprint/vcfparser/tests/nextflow.config @@ -0,0 +1,3 @@ +params { + enable_conda = false +} diff --git a/modules/msk/gbcms/main.nf b/modules/msk/gbcms/main.nf index 38922559..bd0d8dbf 100644 --- a/modules/msk/gbcms/main.nf +++ b/modules/msk/gbcms/main.nf @@ -12,7 +12,7 @@ process GBCMS { output: tuple val(meta), path('*.{vcf,maf}'), emit: variant_file - path "versions.yml" , emit: versions + tuple val("${task.process}"), val('gbcms'), eval("GetBaseCountsMultiSample --help | grep -oP '[0-9]\\.[0-9]\\.[0-9]'"), emit: versions_gbcms, topic: versions when: task.ext.when == null || task.ext.when @@ -44,10 +44,6 @@ process GBCMS { --output ${output} \\ --bam $sample:${bam} $args - cat <<-END_VERSIONS > versions.yml - "${task.process}": - GetBaseCountsMultiSample: \$(echo \$(GetBaseCountsMultiSample --help) | grep -oP '[0-9]\\.[0-9]\\.[0-9]') - END_VERSIONS """ stub: @@ -56,9 +52,5 @@ process GBCMS { """ touch variant_file.maf - cat 
<<-END_VERSIONS > versions.yml - "${task.process}": - GetBaseCountsMultiSample: 1.2.5 - END_VERSIONS """ } diff --git a/modules/msk/gbcms/meta.yml b/modules/msk/gbcms/meta.yml index 170a3e3c..a782f77a 100644 --- a/modules/msk/gbcms/meta.yml +++ b/modules/msk/gbcms/meta.yml @@ -14,10 +14,9 @@ tools: in a given VCF file or MAF file" homepage: "https://github.com/msk-access/GetBaseCountsMultiSample" documentation: "https://github.com/msk-access/GetBaseCountsMultiSample/blob/master/README.md" - identifier: "" + input: - # Only when we have meta - - meta: type: map description: | @@ -29,45 +28,65 @@ input: Input bam file, in the format of SAMPLE_NAME:BAM_FILE. This paramter need to be specified at least once pattern: "*.bam" + ontologies: [] - bambai: type: file description: Index of Bam pattern: "*.bai" + ontologies: [] - variant_file: type: file description: Input variant file in TCGA maf format. --maf or --vcf need to be specified at least once. But --maf and --vcf are mutually exclusive pattern: "*.{maf,vcf}" + ontologies: [] - output: type: string description: Output file - - - fasta: - type: file - description: Input reference sequence file - pattern: "*.fasta" - - - fastafai: - type: file - description: Index of the reference Fasta - pattern: "*.fai" + - fasta: + type: file + description: Input reference sequence file + pattern: "*.fasta" + ontologies: [] + - fastafai: + type: file + description: Index of the reference Fasta + pattern: "*.fai" + + ontologies: [] output: - - variant_file: - - meta: - type: file - description: - base counts in multiple BAM files for all the sites in a given - VCF file or MAF file - pattern: "*.{vcf,maf}" + variant_file: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` - "*.{vcf,maf}": type: file description: base counts in multiple BAM files for all the sites in a given VCF file or MAF file pattern: "*.{vcf,maf}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: [] + versions_gbcms: + - - ${task.process}: + type: string + description: The name of the process + - gbcms: + type: string + description: The name of the tool + - GetBaseCountsMultiSample --help | grep -oP '[0-9]\\.[0-9]\\.[0-9]': {} +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - gbcms: + type: string + description: The name of the tool + - GetBaseCountsMultiSample --help | grep -oP '[0-9]\\.[0-9]\\.[0-9]': + type: eval + description: The expression to obtain the version of the tool authors: - "@buehlere" diff --git a/modules/msk/gbcms/tests/main.nf.test.snap b/modules/msk/gbcms/tests/main.nf.test.snap index 31b547e2..60ff40c5 100644 --- a/modules/msk/gbcms/tests/main.nf.test.snap +++ b/modules/msk/gbcms/tests/main.nf.test.snap @@ -12,7 +12,11 @@ ] ], "1": [ - "versions.yml:md5,a94265ed3bc4b5631d85b9b9b5d2b7e5" + [ + "GBCMS", + "gbcms", + "1.2.4" + ] ], "variant_file": [ [ @@ -23,15 +27,19 @@ "variant_file.vcf:md5,28c8df33c7ea5ed5d1cf9997d8e00ffa" ] ], - "versions": [ - "versions.yml:md5,a94265ed3bc4b5631d85b9b9b5d2b7e5" + "versions_gbcms": [ + [ + "GBCMS", + "gbcms", + "1.2.4" + ] ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-02-13T17:19:51.302342" + "timestamp": "2025-12-17T12:07:13.813792199" } } \ No newline at end of file diff --git a/subworkflows/msk/fingerprint_gbcms/main.nf b/subworkflows/msk/fingerprint_gbcms/main.nf new file mode 100644 index 00000000..16bbf38b --- /dev/null +++ b/subworkflows/msk/fingerprint_gbcms/main.nf @@ -0,0 +1,51 @@ +include { GBCMS } from '../../../modules/msk/gbcms/main' +include 
{ FINGERPRINT_VCFPARSER } from '../../../modules/msk/fingerprint/vcfparser/main' +include { FINGERPRINT_CONTAMINATION } from '../../../modules/msk/fingerprint/contamination/main' + +workflow FINGERPRINT_GBCMS { + + take: + ch_bam // channel: [ val(meta), [ bam ] ] + ch_bai // channel: [ val(meta), [ bai ] ] + ch_fp_tsv // channel: [ val(meta), [ tsv ] ] + ch_fp_loci_vcf // channel: [ val(meta), [ vcf ] ] + ch_fasta // channel: [ fasta ] + ch_fastafai // channel: [ fastafai ] + + main: + + + GBCMS( + ch_bam + .combine(ch_bai, by:[0]) + .combine(ch_fp_loci_vcf.map{ if ( [it].flatten().size() > 1){ it[1] } else { it }}.first()) + .map{ meta, bam, bai, vcf -> [ meta, bam, bai, vcf, meta.id + ".fp.vcf" ] }, + ch_fasta.first(), + ch_fastafai.first() + ) + + FINGERPRINT_VCFPARSER ( GBCMS.out.variant_file ) + + all_fps = FINGERPRINT_VCFPARSER.out.tsv.mix(ch_fp_tsv) + + paired_fps = all_fps + .filter{ meta, tsv -> meta.case_id != null && meta.control_id != null && meta.id == meta.case_id } + .combine(all_fps) + .filter{ meta1, fp1, meta2, fp2 -> + meta1.control_id == meta2.id + }.map{ meta1, fp1, meta2, fp2 -> + [ meta1, fp1, fp2] + } + + unpaired_fps = all_fps + .filter{ meta, tsv -> ! 
meta.control_id } + .map{ meta, tsv -> [ meta, tsv, [] ] } + + FINGERPRINT_CONTAMINATION ( paired_fps.mix(unpaired_fps) ) + + emit: + fp_tsv_from_bam = FINGERPRINT_VCFPARSER.out.tsv // channel: [ val(meta), tsv ] + fp_tsv = all_fps // channel: [ val(meta), tsv ] + contamination_tsv = FINGERPRINT_CONTAMINATION.out.contamination_tsv // channel: [ val(meta), contamination_tsv ] + +} diff --git a/subworkflows/msk/fingerprint_gbcms/meta.yml b/subworkflows/msk/fingerprint_gbcms/meta.yml new file mode 100644 index 00000000..0f2da6e7 --- /dev/null +++ b/subworkflows/msk/fingerprint_gbcms/meta.yml @@ -0,0 +1,64 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fingerprint_gbcms" +description: | + Get base counts for all fingerprinting sites from BAM/CRAM/SAM files using the GBCMS module, + and parse the resulting VCF files into standardized TSV format using a custom parser. +keywords: + - fingerprint + - fingerprinting + - loci + - vcf + - bam + - qc +components: + - gbcms + - fingerprint/vcfparser + - fingerprint/contamination +input: + - ch_bam: + type: file + description: | + The input channel containing the BAM/CRAM/SAM files + Structure: [ val(meta), path(bam) ] + pattern: "*.{bam,cram,sam}" + - ch_bai: + type: file + description: | + The input channel containing the BAM index files (BAI/CSI) + Structure: [ val(meta), path(bai) ] # or path(csi) + pattern: "*.{bai,csi}" + - ch_fp_loci_vcf: + type: file + description: | + Channel containing the fingerprint loci VCF file + Structure: [ val(meta), path(vcf) ] + pattern: "*.vcf" + - ch_fasta: + type: file + description: | + Channel containing reference FASTA files + Structure: [ path(fasta) ] + pattern: "*.{fasta,fa}" + - ch_fastafai: + type: file + description: | + Channel containing reference FASTA index files + Structure: [ path(fasta.fai) ] + pattern: "*.{fasta,fa}.fai" +output: + - fp_tsv: + type: file + description: | + Channel containing standardized 
fingerprint TSV files + Structure: [ val(meta), path(tsv) ] + pattern: "*.fp.tsv" + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test new file mode 100644 index 00000000..11c1db5a --- /dev/null +++ b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test @@ -0,0 +1,55 @@ +nextflow_workflow { + + name "Test Subworkflow FINGERPRINT_GBCMS" + script "../main.nf" + config "./nextflow.config" + workflow "FINGERPRINT_GBCMS" + + tag "subworkflows" + tag "subworkflows_msk" + tag "subworkflows/fingerprint_gbcms" + tag "gbcms" + tag "fingerprint/vcfparser" + tag "fingerprint/contamination" + + test("sarscov2 - bam") { + + when { + workflow { + """ + input[0] = Channel.of( + [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true) + ], + [ + [ id:'test2', sample:'test2' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true) + ], + ) + input[1] = Channel.of( + [ + [ id:'test', sample:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true) + ], + [ + [ id:'test2', sample:'test2' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true) + ], + ) + input[2] = Channel.empty() + input[3] = Channel.of(file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true)) + input[4] = Channel.of(file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)) + input[5] = Channel.of(file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)) + """ + } + } + + then { + assertAll( + { assert 
workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } +} diff --git a/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test.snap b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test.snap new file mode 100644 index 00000000..76692254 --- /dev/null +++ b/subworkflows/msk/fingerprint_gbcms/tests/main.nf.test.snap @@ -0,0 +1,109 @@ +{ + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206" + ], + [ + { + "id": "test2", + "sample": "test2" + }, + "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2" + ] + ], + "1": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206" + ], + [ + { + "id": "test2", + "sample": "test2" + }, + "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2" + ] + ], + "2": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303" + ], + [ + { + "id": "test2", + "sample": "test2" + }, + "test2.contamination.tsv:md5,2eb950d4d5e0f9b4f7ae53d41d22fb5f" + ] + ], + "contamination_tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.contamination.tsv:md5,5b533c60b8eff1f4d2c5fe58a8262303" + ], + [ + { + "id": "test2", + "sample": "test2" + }, + "test2.contamination.tsv:md5,2eb950d4d5e0f9b4f7ae53d41d22fb5f" + ] + ], + "fp_tsv": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206" + ], + [ + { + "id": "test2", + "sample": "test2" + }, + "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2" + ] + ], + "fp_tsv_from_bam": [ + [ + { + "id": "test", + "sample": "test" + }, + "test.fp.tsv:md5,c467328eb3c7fb534b555b83b0227206" + ], + [ + { + "id": "test2", + "sample": "test2" + }, + "test2.fp.tsv:md5,c3fbcee584048e9bc4fc93bc6ca487d2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "26.01.1" + }, + "timestamp": "2026-02-26T22:35:39.868458916" + } +} \ No newline at end of file 
diff --git a/subworkflows/msk/fingerprint_gbcms/tests/nextflow.config b/subworkflows/msk/fingerprint_gbcms/tests/nextflow.config
new file mode 100644
index 00000000..75bb2dd5
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms/tests/nextflow.config
@@ -0,0 +1,9 @@
+process {
+    withName: 'FINGERPRINT_CONTAMINATION' {
+        ext.args = "-d 0"
+    }
+
+    withName: 'FINGERPRINT_VCFPARSER' {
+        ext.args = "-d 0"
+    }
+}
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/main.nf b/subworkflows/msk/fingerprint_gbcms_batch/main.nf
new file mode 100644
index 00000000..7a56fd51
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/main.nf
@@ -0,0 +1,111 @@
+include { FINGERPRINT_COMBINE as FINGERPRINT_COMBINE_ALL } from '../../../modules/msk/fingerprint/combine/main'
+include { FINGERPRINT_COMBINE as FINGERPRINT_COMBINE_POOLS } from '../../../modules/msk/fingerprint/combine/main'
+include { FINGERPRINT_COMBINE as FINGERPRINT_COMBINE_PATIENTS } from '../../../modules/msk/fingerprint/combine/main'
+include { FINGERPRINT_CORRELATION as FINGERPRINT_CORRELATION_ALL } from '../../../modules/msk/fingerprint/correlation/main'
+include { FINGERPRINT_CORRELATION as FINGERPRINT_CORRELATION_POOLS } from '../../../modules/msk/fingerprint/correlation/main'
+include { FINGERPRINT_CORRELATION as FINGERPRINT_CORRELATION_PATIENTS } from '../../../modules/msk/fingerprint/correlation/main'
+include { FINGERPRINT_MISLABELS } from '../../../modules/msk/fingerprint/mislabels/main'
+
+//
+// Combine per-sample fingerprint tables into an all-sample table, one table per pool and
+// one table per patient, correlate each combined table, and run the mislabel check on the
+// all-sample correlations.
+//
+// Meta keys read from ch_fp: id, sample, patient, pool, genome, is_donor.
+//
+workflow FINGERPRINT_GBCMS_BATCH {
+
+    take:
+    ch_fp                    // channel: [ val(meta), tsv ]
+    ch_liftover_loci_mapping // channel: [ liftover_loci_mapping ]
+    default_genome           // value: genome build used when meta.genome is unset
+    ch_pool                  // channel: [ poolid ]
+    ch_patients              // channel: [ patientid ]
+
+    main:
+
+    // Sample sheet for the mislabel check: one CSV row per sample whose meta has a patient.
+    // is_donor falls back to false when meta.is_donor is unset.
+    ch_sample_sheet = ch_fp
+        .filter { meta, tsv -> meta.patient != null }
+        .map { meta, tsv ->
+            def is_donor = meta.is_donor != null ? meta.is_donor : false
+            "${meta.sample},${meta.patient},${is_donor}\n"
+        }
+        .collectFile(
+            name: 'sample_sheet.csv',
+            seed: 'sample,patient,is_donor\n',
+            newLine: false,
+            sort: true
+        )
+
+    // All samples combined into a single group
+    FINGERPRINT_COMBINE_ALL(
+        ch_fp
+            .map { meta, tsv ->
+                [[id:"all"], tsv, meta.id, meta.genome ?: default_genome, meta.patient ?: meta.sample]
+            }.groupTuple(by:[0]),
+        ch_liftover_loci_mapping.first()
+    )
+
+    // Samples grouped by pool. Ids are compared as strings, as in the patient branch, so a
+    // numeric meta.pool still matches a string pool id; samples without a pool never match.
+    FINGERPRINT_COMBINE_POOLS(
+        ch_fp
+            .combine(ch_pool.unique())
+            .filter { meta, tsv, pool ->
+                meta.pool != null && pool.toString() == meta.pool.toString()
+            }
+            .map { meta, tsv, pool ->
+                [[id:pool], tsv, meta.id, meta.genome ?: default_genome, meta.patient ?: meta.sample]
+            }.groupTuple(by:[0]),
+        ch_liftover_loci_mapping.first()
+    )
+
+    // Samples grouped by patient. The null guard keeps samples without a patient from
+    // matching a patient id that is literally "null".
+    FINGERPRINT_COMBINE_PATIENTS(
+        ch_fp
+            .combine(ch_patients.unique())
+            .filter { meta, tsv, patient ->
+                meta.patient != null && patient.toString() == meta.patient.toString()
+            }.map { meta, tsv, patient ->
+                [[id:meta.patient.toString()], tsv, meta.id, meta.genome ?: default_genome, meta.patient ?: meta.sample]
+            }.groupTuple(by:[0]),
+        ch_liftover_loci_mapping.first()
+    )
+
+    FINGERPRINT_CORRELATION_ALL(
+        FINGERPRINT_COMBINE_ALL.out.combined_fp_tsv,
+        []
+    )
+
+    FINGERPRINT_CORRELATION_POOLS(
+        FINGERPRINT_COMBINE_POOLS.out.combined_fp_tsv,
+        []
+    )
+
+    FINGERPRINT_CORRELATION_PATIENTS(
+        FINGERPRINT_COMBINE_PATIENTS.out.combined_fp_tsv,
+        []
+    )
+
+    // Mislabel check on the all-sample correlations. The sample sheet is only passed on when
+    // it has a header plus at least two sample rows, so the check is skipped otherwise.
+    FINGERPRINT_MISLABELS(
+        FINGERPRINT_CORRELATION_ALL.out.correlations_tab
+            .join(FINGERPRINT_CORRELATION_ALL.out.observations_tab),
+        ch_sample_sheet
+            .filter { csv -> csv.readLines().size() >= 3 }
+            .first()
+    )
+
+    emit:
+    combined_fp_tsv_all = FINGERPRINT_COMBINE_ALL.out.combined_fp_tsv // channel: [ val(meta), tsv ]
+    combined_fp_tsv_pools = FINGERPRINT_COMBINE_POOLS.out.combined_fp_tsv // channel: [ val(meta), tsv ]
+    combined_fp_tsv_patients = FINGERPRINT_COMBINE_PATIENTS.out.combined_fp_tsv // channel: [ val(meta), tsv ]
+    unexpected_match_pdf = FINGERPRINT_MISLABELS.out.unexpected_match_pdf // channel: [ val(meta), pdf ]
+    unexpected_match_txt = FINGERPRINT_MISLABELS.out.unexpected_match_txt // channel: [ val(meta), txt ]
+    unexpected_mismatch_pdf = FINGERPRINT_MISLABELS.out.unexpected_mismatch_pdf // channel: [ val(meta), pdf ]
+    unexpected_mismatch_txt = FINGERPRINT_MISLABELS.out.unexpected_mismatch_txt // channel: [ val(meta), txt ]
+}
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/meta.yml b/subworkflows/msk/fingerprint_gbcms_batch/meta.yml
new file mode 100644
index 00000000..832e3761
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/meta.yml
@@ -0,0 +1,88 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "fingerprint_gbcms_batch"
+description: "Subworkflow to combine and compare Fingerprint files from different samples"
+keywords:
+  - fingerprint
+  - qc
+  - liftover
+  - batch
+  - pool
+components:
+  - fingerprint/combine
+  - fingerprint/correlation
+  - fingerprint/mislabels
+input:
+  - ch_fp:
+      type: file
+      description: |
+        The input channel containing one fingerprint file per sample.
+        The meta map is read for id, sample, patient, pool, genome and is_donor.
+        Structure: [ val(meta), path(fp_tsv) ]
+      pattern: "*.fp.tsv"
+  - ch_liftover_loci_mapping:
+      type: file
+      description: |
+        The input channel containing the loci mapping file for liftover
+        Structure: [ path(loci_mapping.tsv) ]
+      pattern: "*.tsv"
+  - default_genome:
+      type: string
+      description: |
+        Genome build used for samples whose meta map has no genome entry
+  - ch_pool:
+      type: string
+      description: |
+        Channel of pool ids; samples are grouped by matching meta.pool
+        Structure: [ val(pool) ]
+  - ch_patients:
+      type: string
+      description: |
+        Channel of patient ids; samples are grouped by matching meta.patient
+        Structure: [ val(patient) ]
+output:
+  - combined_fp_tsv_all:
+      type: file
+      description: |
+        Combined fingerprint table of all samples (meta.id is "all")
+        Structure: [ val(meta), path(combined_fp_tsv) ]
+      pattern: "*DPfilter_ALL_FP.txt"
+  - combined_fp_tsv_pools:
+      type: file
+      description: |
+        Combined fingerprint table per pool (meta.id is the pool id)
+        Structure: [ val(meta), path(combined_fp_tsv) ]
+      pattern: "*DPfilter_ALL_FP.txt"
+  - combined_fp_tsv_patients:
+      type: file
+      description: |
+        Combined fingerprint table per patient (meta.id is the patient id)
+        Structure: [ val(meta), path(combined_fp_tsv) ]
+      pattern: "*DPfilter_ALL_FP.txt"
+  - unexpected_match_pdf:
+      type: file
+      description: |
+        PDF of unexpected matches from FINGERPRINT_MISLABELS; only emitted when at least two samples have a patient
+        Structure: [ val(meta), path(pdf) ]
+      pattern: "*.pdf"
+  - unexpected_match_txt:
+      type: file
+      description: |
+        Table of unexpected matches from FINGERPRINT_MISLABELS; only emitted when at least two samples have a patient
+        Structure: [ val(meta), path(txt) ]
+      pattern: "*_unexpected_match.txt"
+  - unexpected_mismatch_pdf:
+      type: file
+      description: |
+        PDF of unexpected mismatches from FINGERPRINT_MISLABELS; only emitted when at least two samples have a patient
+        Structure: [ val(meta), path(pdf) ]
+      pattern: "*.pdf"
+  - unexpected_mismatch_txt:
+      type: file
+      description: |
+        Table of unexpected mismatches from FINGERPRINT_MISLABELS; only emitted when at least two samples have a patient
+        Structure: [ val(meta), path(txt) ]
+      pattern: "*_unexpected_mismatch.txt"
+authors:
+  - "@anoronh4"
+maintainers:
+  - "@anoronh4"
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test
new file mode 100644
index 00000000..b99f5a6a
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test
@@ -0,0 +1,149 @@
+// nf-core subworkflows test fingerprint_gbcms_batch
+nextflow_workflow {
+
+    name "Test Subworkflow FINGERPRINT_GBCMS_BATCH"
+    script "../main.nf"
+    config "./nextflow.config"
+    workflow "FINGERPRINT_GBCMS_BATCH"
+
+    tag "subworkflows"
+    tag "subworkflows_msk"
+    tag "subworkflows/fingerprint_gbcms_batch"
+    tag "gbcms"
+    tag "fingerprint/vcfparser"
+    tag "fingerprint/combine"
+    tag "fingerprint/correlation"
+    tag "fingerprint/mislabels"
+
+
+    // Neither sample carries meta.patient, so no row passes the patient filter, the sample
+    // sheet never reaches three lines, and the FINGERPRINT_MISLABELS outputs stay empty.
+    test("sarscov2 - bam - single_end - no patient - fingerprintmislabels skipped") {
+
+        setup {
+            run("GBCMS"){
+                script "../../../../modules/msk/gbcms/main.nf"
+                process {
+                    """
+                    input[0] = Channel.of(
+                        [
+                            [ id:'test', sample:'test' ], // meta map
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
+                            "variant_file.vcf"
+                        ],
+                        [
+                            [ id:'test2', sample:'test2' ], // meta map
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
+                            "variant_file.vcf"
+                        ],
+                    )
+                    input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                    input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
+                    """
+                }
+            }
+            run("FINGERPRINT_VCFPARSER"){
+                script "../../../../modules/msk/fingerprint/vcfparser/main.nf"
+                process {
+                    """
+                    input[0] = GBCMS.out.variant_file
+                    """
+                }
+            }
+        }
+
+        // Pool and patient channels are empty, so only the all-sample branch produces output.
+        when {
+            workflow {
+                """
+                input[0] = FINGERPRINT_VCFPARSER.out.tsv
+                input[1] = [file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true)]
+                input[2] = "hg19"
+                input[3] = Channel.empty()
+                input[4] = Channel.empty()
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert workflow.out.unexpected_match_txt == [] },
+                { assert workflow.out.unexpected_mismatch_txt == [] },
+                { assert snapshot(workflow.out.combined_fp_tsv_all).match() }
+            )
+        }
+
+    }
+
+    // Two samples with distinct patients: the sample sheet reaches header + 2 rows, so
+    // FINGERPRINT_MISLABELS runs and emits one match table and one mismatch table.
+    test("sarscov2 - bam - single_end - with patient - fingerprintmislabels runs") {
+
+        setup {
+            run("GBCMS"){
+                script "../../../../modules/msk/gbcms/main.nf"
+                process {
+                    """
+                    input[0] = Channel.of(
+                        [
+                            [ id:'test', sample:'test', patient: 1 ], // meta map
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
+                            "variant_file.vcf"
+                        ],
+                        [
+                            [ id:'test2', sample:'test2', patient: 2 ], // meta map
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true),
+                            "variant_file.vcf"
+                        ],
+                    )
+                    input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                    input[2] = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
+                    """
+                }
+            }
+            run("FINGERPRINT_VCFPARSER"){
+                script "../../../../modules/msk/fingerprint/vcfparser/main.nf"
+                process {
+                    """
+                    input[0] = GBCMS.out.variant_file
+                    """
+                }
+            }
+        }
+
+        // Pool and patient channels are empty, so only the all-sample branch produces output.
+        when {
+            workflow {
+                """
+                input[0] = FINGERPRINT_VCFPARSER.out.tsv
+                input[1] = [file("$baseDir/modules/msk/fingerprint/combine/tests/loci_mapping.tsv", checkIfExists:true)]
+                input[2] = "hg19"
+                input[3] = Channel.empty()
+                input[4] = Channel.empty()
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert workflow.out.unexpected_match_txt.size() == 1 },
+                { assert workflow.out.unexpected_mismatch_txt.size() == 1 },
+                { assert snapshot(
+                    workflow.out.unexpected_match_txt,
+                    workflow.out.unexpected_mismatch_txt
+                ).match() }
+            )
+        }
+
+    }
+}
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test.snap b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test.snap
new file mode 100644
index 00000000..1a655c94
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/tests/main.nf.test.snap
@@ -0,0 +1,44 @@
+{
+    "sarscov2 - bam - single_end - with patient - fingerprintmislabels runs": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "all"
+                    },
+                    "all_unexpected_match.txt:md5,14af9ffece921578088528e9c1663886"
+                ]
+            ],
+            [
+                [
+                    {
+                        "id": "all"
+                    },
+                    "all_unexpected_mismatch.txt:md5,d5c8b66fd71b1e4b20ee403d822cd7b9"
+                ]
+            ]
+        ],
+        "timestamp": "2026-03-26T17:19:28.68144948",
+        "meta": {
+            "nf-test": "0.9.5",
+            "nextflow": "26.01.1"
+        }
+    },
+    "sarscov2 - bam - single_end - no patient - fingerprintmislabels skipped": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "all"
+                    },
+                    "0DPfilter_ALL_FP.txt:md5,2b376a207fd1bd6bec55fa765e3a3947"
+                ]
+            ]
+        ],
+        "timestamp": "2026-03-26T17:19:06.918847051",
+        "meta": {
+            "nf-test": "0.9.5",
+            "nextflow": "26.01.1"
+        }
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/msk/fingerprint_gbcms_batch/tests/nextflow.config b/subworkflows/msk/fingerprint_gbcms_batch/tests/nextflow.config
new file mode 100644
index 00000000..144a1d90
--- /dev/null
+++ b/subworkflows/msk/fingerprint_gbcms_batch/tests/nextflow.config
@@ -0,0 +1,10 @@
+// NOTE(review): the snapshot's 0DPfilter_ALL_FP.txt name suggests -d is the DP filter threshold - confirm.
+process {
+    withName: 'FINGERPRINT_VCFPARSER' {
+        ext.args = "-d 0"
+    }
+
+    withName: 'FINGERPRINT_COMBINE' {
+        ext.args = "-d 0"
+    }
+}