From 62a94dbba05b660426a9c4c99cfa55a53088e1e4 Mon Sep 17 00:00:00 2001 From: mathysgrapotte Date: Mon, 5 May 2025 12:46:09 +0200 Subject: [PATCH 1/3] now compares all-against-all and outputs raw comparison instead of comparing only within transforms. --- .../local/stimulus/compare_tensors/main.nf | 12 +---- subworkflows/local/evaluation/main.nf | 53 +++++++++++++------ 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/modules/local/stimulus/compare_tensors/main.nf b/modules/local/stimulus/compare_tensors/main.nf index d2c1fff..7966c7b 100644 --- a/modules/local/stimulus/compare_tensors/main.nf +++ b/modules/local/stimulus/compare_tensors/main.nf @@ -18,19 +18,9 @@ process STIMULUS_COMPARE_TENSORS { """ stimulus compare-tensors \ ${tensors} \ - -s scores.csv \ + -o "${prefix}_scores.csv" \ ${args} - # Extract first row of scores.csv - header_scores=\$(head -n 1 scores.csv) - - # Add metadata info to output file - echo "${header},\$header_scores" > "${prefix}_scores.csv" - - # Add values - scores=\$(awk 'NR==2 {sub(/[[:space:]]+\$/, "")} NR==2' scores.csv | tr -s '[:blank:]' ',') - echo "${values},\$scores" >> "${prefix}_scores.csv" - cat <<-END_VERSIONS > versions.yml "${task.process}": stimulus: \$(stimulus -v | cut -d ' ' -f 3) diff --git a/subworkflows/local/evaluation/main.nf b/subworkflows/local/evaluation/main.nf index bd0b364..50c7b03 100644 --- a/subworkflows/local/evaluation/main.nf +++ b/subworkflows/local/evaluation/main.nf @@ -41,30 +41,51 @@ workflow EVALUATION_WF { // and the same number of trials, we can estimate the noise across replicates // This is done by comparing the predictions of the alternative models between each other // and then calculatin a summary metric over them (e.g. mean, median, std, etc.) 
- - replicate_predictions = predictions.map{ - meta, prediction -> - [["id": meta.id, - "split_id": meta.split_id, - "transform_id": meta.transform_id, - "n_trials": meta.n_trials ], meta, prediction] - }.groupTuple(by:0) - .map{ - merging_meta, metas, predictions -> - [merging_meta, predictions] + pairs = predictions + .collate(2) + .collect() + .map { items -> + def pairs = [] + // Create all unique combinations using index comparison + (0..<items.size()).each { i -> + (i+1..<items.size()).each { j -> // j > i: each unordered pair is visited once + def meta1 = items[i][0] + def meta2 = items[j][0] + def files = [items[i][1], items[j][1]] + // Only compare different transforms OR different replicates + if(meta1.transform_id != meta2.transform_id || meta1.replicate != meta2.replicate) { + pairs << [ + [ + "id1": meta1.id, + "id2": meta2.id, + "split_id1": meta1.split_id, + "split_id2": meta2.split_id, + "transform_id1": meta1.transform_id, + "transform_id2": meta2.transform_id, + "replicate1": meta1.replicate, + "replicate2": meta2.replicate + ], + // Create unique filenames using both transforms and replicates + files + ] + } + } } + pairs + } + .flatMap { it } + + //pairs.dump(tag: "pairs") - // check if the predictions are at least 2, meta,predictions - replicate_predictions.filter{ - it[1].size() > 1 - }.set{ replicate_predictions } STIMULUS_COMPARE_TENSORS_COSINE( - replicate_predictions + pairs ) cosine_scores = STIMULUS_COMPARE_TENSORS_COSINE.out.csv + cosine_scores.dump(tag: "cosine_scores") + cosine_scores .map { meta, csv -> csv From 755eca613d32a98af97e54b017a5f6b8712589fe Mon Sep 17 00:00:00 2001 From: mathysgrapotte Date: Mon, 5 May 2025 16:41:27 +0200 Subject: [PATCH 2/3] compare_tensors now compare all against all. 
--- modules/local/stimulus/compare_tensors/main.nf | 12 +++++++++++- subworkflows/local/evaluation/main.nf | 5 ----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/modules/local/stimulus/compare_tensors/main.nf b/modules/local/stimulus/compare_tensors/main.nf index 7966c7b..072f6fe 100644 --- a/modules/local/stimulus/compare_tensors/main.nf +++ b/modules/local/stimulus/compare_tensors/main.nf @@ -18,9 +18,19 @@ process STIMULUS_COMPARE_TENSORS { """ stimulus compare-tensors \ ${tensors} \ - -o "${prefix}_scores.csv" \ + -o scores.csv \ ${args} + # Extract first row of scores.csv + header_scores=\$(head -n 1 scores.csv) + + # Add metadata info to output file + echo "${header},\$header_scores" > "${prefix}_scores.csv" + + # Add values + scores=\$(awk 'NR==2 {sub(/[[:space:]]+\$/, "")} NR==2' scores.csv | tr -s '[:blank:]' ',') + echo "${values},\$scores" >> "${prefix}_scores.csv" + cat <<-END_VERSIONS > versions.yml "${task.process}": stimulus: \$(stimulus -v | cut -d ' ' -f 3) diff --git a/subworkflows/local/evaluation/main.nf b/subworkflows/local/evaluation/main.nf index 50c7b03..5e13a82 100644 --- a/subworkflows/local/evaluation/main.nf +++ b/subworkflows/local/evaluation/main.nf @@ -75,17 +75,12 @@ workflow EVALUATION_WF { } .flatMap { it } - //pairs.dump(tag: "pairs") - - STIMULUS_COMPARE_TENSORS_COSINE( pairs ) cosine_scores = STIMULUS_COMPARE_TENSORS_COSINE.out.csv - cosine_scores.dump(tag: "cosine_scores") - cosine_scores .map { meta, csv -> csv From dddffbfb1fcfdfddce979fcabbc886354dcb3f4c Mon Sep 17 00:00:00 2001 From: mathysgrapotte Date: Mon, 16 Jun 2025 15:16:50 +0200 Subject: [PATCH 3/3] feat(hf-integration): merge huggingface datasets integration with new version of stimulus-py --- conf/test_noise_eval.config | 4 ++ modules/local/stimulus/check_model/main.nf | 1 - .../local/stimulus/compare_tensors/main.nf | 2 +- modules/local/stimulus/encode/main.nf | 41 +++++++++++++++++++ modules/local/stimulus/predict/main.nf | 3 +- 
modules/local/stimulus/split_csv/main.nf | 6 +-- modules/local/stimulus/transform_csv/main.nf | 4 +- modules/local/stimulus/tune/main.nf | 1 - subworkflows/local/evaluation/main.nf | 3 +- subworkflows/local/transform_csv/main.nf | 13 ++++-- workflows/deepmodeloptim.nf | 6 +++ 11 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 modules/local/stimulus/encode/main.nf diff --git a/conf/test_noise_eval.config b/conf/test_noise_eval.config index ac10e8e..91d574e 100644 --- a/conf/test_noise_eval.config +++ b/conf/test_noise_eval.config @@ -31,6 +31,10 @@ params { save_data = false } +env { + HF_DATASETS_CACHE = '/tmp/hf_cache' +} + // Limit resources so that this can run on GitHub Actions process { maxRetries = params.max_retries diff --git a/modules/local/stimulus/check_model/main.nf b/modules/local/stimulus/check_model/main.nf index 6f198df..0d2ef05 100644 --- a/modules/local/stimulus/check_model/main.nf +++ b/modules/local/stimulus/check_model/main.nf @@ -19,7 +19,6 @@ process CHECK_MODEL { def args = task.ext.args ?: '' """ stimulus check-model \ - -e ${data_config} \ -d ${data} \ -m ${model} \ -c ${model_config} \ diff --git a/modules/local/stimulus/compare_tensors/main.nf b/modules/local/stimulus/compare_tensors/main.nf index 072f6fe..b0e3d73 100644 --- a/modules/local/stimulus/compare_tensors/main.nf +++ b/modules/local/stimulus/compare_tensors/main.nf @@ -1,5 +1,5 @@ process STIMULUS_COMPARE_TENSORS { - tag "${meta.id}" + tag "${meta.id1}" label 'process_medium' container "docker.io/mathysgrapotte/stimulus-py:dev" diff --git a/modules/local/stimulus/encode/main.nf b/modules/local/stimulus/encode/main.nf new file mode 100644 index 0000000..f993c15 --- /dev/null +++ b/modules/local/stimulus/encode/main.nf @@ -0,0 +1,41 @@ +process ENCODE_CSV { + + tag "${meta.id}" + label 'process_medium' + // TODO: push image to nf-core quay.io + container "docker.io/mathysgrapotte/stimulus-py:dev" + + input: + tuple val(meta), path(data) + tuple val(meta2), 
path(config) + + output: + tuple val(meta2), path("${prefix}_encoded"), emit: encoded + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.split_id}-${meta2.transform_id}" + """ + stimulus encode-csv \ + -d ${data} \ + -y ${config} \ + -o ${prefix}_encoded \ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stimulus: \$(stimulus -v | cut -d ' ' -f 3) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.split_id}-${meta2.transform_id}" // script: is not evaluated on a stub run, so prefix must also be set here for the output declaration + """ + touch ${prefix}_encoded + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stimulus: \$(stimulus -v | cut -d ' ' -f 3) + END_VERSIONS + """ +} diff --git a/modules/local/stimulus/predict/main.nf b/modules/local/stimulus/predict/main.nf index d5be055..fb181bf 100644 --- a/modules/local/stimulus/predict/main.nf +++ b/modules/local/stimulus/predict/main.nf @@ -5,7 +5,7 @@ process STIMULUS_PREDICT { input: tuple val(meta) , path(model), path(model_config), path(weigths) - tuple val(meta2), path(data), path(data_config) + tuple val(meta2), path(data), path(config) output: tuple val(meta), path("${prefix}-pred.safetensors"), emit: predictions @@ -17,7 +17,6 @@ process STIMULUS_PREDICT { """ stimulus predict \ -d ${data} \ - -e ${data_config} \ -m ${model} \ -c ${model_config} \ -w ${weigths} \ diff --git a/modules/local/stimulus/split_csv/main.nf b/modules/local/stimulus/split_csv/main.nf index 9f44855..51b1a1c 100644 --- a/modules/local/stimulus/split_csv/main.nf +++ b/modules/local/stimulus/split_csv/main.nf @@ -10,7 +10,7 @@ process STIMULUS_SPLIT_DATA { tuple val(meta2), path(sub_config) output: - tuple val(meta2), path("${prefix}.csv"), emit: csv_with_split + tuple val(meta2), path("${prefix}_split"), emit: csv_with_split path "versions.yml" , emit: versions script: @@ -19,7 +19,7 @@ process STIMULUS_SPLIT_DATA { 
stimulus split-csv \ -c ${data} \ -y ${sub_config} \ - -o ${prefix}_split cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -30,7 +30,7 @@ 
process STIMULUS_SPLIT_DATA { stub: prefix = task.ext.prefix ?: "${meta.id}-split-${meta2.id}" """ - touch ${prefix}.csv + touch ${prefix}_split cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/stimulus/transform_csv/main.nf b/modules/local/stimulus/transform_csv/main.nf index 787bfe2..285050b 100644 --- a/modules/local/stimulus/transform_csv/main.nf +++ b/modules/local/stimulus/transform_csv/main.nf @@ -11,7 +11,7 @@ process STIMULUS_TRANSFORM_CSV { tuple val(meta2), path(config) output: - tuple val(meta), path("${prefix}.csv"), emit: transformed_data + tuple val(meta), path("${prefix}"), emit: transformed_data path "versions.yml" , emit: versions script: @@ -20,7 +20,7 @@ process STIMULUS_TRANSFORM_CSV { stimulus transform-csv \ -c ${data} \ -y ${config} \ - -o ${prefix}.csv + -o ${prefix} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/stimulus/tune/main.nf b/modules/local/stimulus/tune/main.nf index 520add0..c8fa5c9 100644 --- a/modules/local/stimulus/tune/main.nf +++ b/modules/local/stimulus/tune/main.nf @@ -25,7 +25,6 @@ process STIMULUS_TUNE { stimulus tune \ -d ${transformed_data} \ -m ${model} \ - -e ${data_sub_config} \ -c ${model_config} \ -o ${prefix}-best-model.safetensors \ -bo ${prefix}-best-optimizer.opt \ diff --git a/subworkflows/local/evaluation/main.nf b/subworkflows/local/evaluation/main.nf index 5e13a82..38076a0 100644 --- a/subworkflows/local/evaluation/main.nf +++ b/subworkflows/local/evaluation/main.nf @@ -28,10 +28,9 @@ workflow EVALUATION_WF { // Evaluation mode 1: Predict the data using the best model // and then compare the predictions of 2 different models // - STIMULUS_PREDICT( model, - ch_data.collect() + ch_data.first() // converts a queue channel to a value channel ) ch_versions = ch_versions.mix(STIMULUS_PREDICT.out.versions) predictions = STIMULUS_PREDICT.out.predictions diff --git a/subworkflows/local/transform_csv/main.nf b/subworkflows/local/transform_csv/main.nf 
index 8e02b82..f6a925a 100644 --- a/subworkflows/local/transform_csv/main.nf +++ b/subworkflows/local/transform_csv/main.nf @@ -5,6 +5,7 @@ */ include { STIMULUS_TRANSFORM_CSV } from '../../../modules/local/stimulus/transform_csv' +include { ENCODE_CSV } from '../../../modules/local/stimulus/encode' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -48,17 +49,23 @@ workflow TRANSFORM_CSV_WF { config: [meta, config] } - // run stimulus transform STIMULUS_TRANSFORM_CSV( ch_input.data, ch_input.config ) ch_transformed_data = STIMULUS_TRANSFORM_CSV.out.transformed_data - ch_versions = ch_versions.mix(STIMULUS_TRANSFORM_CSV.out.versions) + + // run stimulus encode + ENCODE_CSV( + ch_transformed_data, + ch_input.config + ) + ch_encoded_data = ENCODE_CSV.out.encoded + ch_versions = ch_versions.mix(ENCODE_CSV.out.versions) emit: - transformed_data = ch_transformed_data + transformed_data = ch_encoded_data versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/deepmodeloptim.nf b/workflows/deepmodeloptim.nf index e7efa9b..d13f032 100644 --- a/workflows/deepmodeloptim.nf +++ b/workflows/deepmodeloptim.nf @@ -16,6 +16,7 @@ include { SPLIT_CSV_WF } from '../subworkflows/local/spli include { TRANSFORM_CSV_WF } from '../subworkflows/local/transform_csv' include { TUNE_WF } from '../subworkflows/local/tune' include { EVALUATION_WF } from '../subworkflows/local/evaluation' +include { ENCODE_CSV } from '../modules/local/stimulus/encode' // // MODULES: Consisting of nf-core/modules @@ -149,6 +150,11 @@ workflow DEEPMODELOPTIM { // Now the data config will not work if passed in full // We need to pass in the split data config, any of them, for the predict modules // This will be changed in the future + ENCODE_CSV( + prediction_data, + TUNE_WF.out.data_config_tmp.first() + ) + prediction_data = ENCODE_CSV.out.encoded prediction_data = prediction_data.combine(TUNE_WF.out.data_config_tmp.first().map{meta,file -> file}) 
EVALUATION_WF( TUNE_WF.out.model_tmp,