diff --git a/conf/test_noise_eval.config b/conf/test_noise_eval.config index ac10e8e..91d574e 100644 --- a/conf/test_noise_eval.config +++ b/conf/test_noise_eval.config @@ -31,6 +31,10 @@ params { save_data = false } +env { + HF_DATASETS_CACHE = '/tmp/hf_cache' +} + // Limit resources so that this can run on GitHub Actions process { maxRetries = params.max_retries diff --git a/modules/local/stimulus/check_model/main.nf b/modules/local/stimulus/check_model/main.nf index 6f198df..0d2ef05 100644 --- a/modules/local/stimulus/check_model/main.nf +++ b/modules/local/stimulus/check_model/main.nf @@ -19,7 +19,6 @@ process CHECK_MODEL { def args = task.ext.args ?: '' """ stimulus check-model \ - -e ${data_config} \ -d ${data} \ -m ${model} \ -c ${model_config} \ diff --git a/modules/local/stimulus/compare_tensors/main.nf b/modules/local/stimulus/compare_tensors/main.nf index d2c1fff..b0e3d73 100644 --- a/modules/local/stimulus/compare_tensors/main.nf +++ b/modules/local/stimulus/compare_tensors/main.nf @@ -1,5 +1,5 @@ process STIMULUS_COMPARE_TENSORS { - tag "${meta.id}" + tag "${meta.id1}" label 'process_medium' container "docker.io/mathysgrapotte/stimulus-py:dev" @@ -18,7 +18,7 @@ process STIMULUS_COMPARE_TENSORS { """ stimulus compare-tensors \ ${tensors} \ - -s scores.csv \ + -o scores.csv \ ${args} # Extract first row of scores.csv diff --git a/modules/local/stimulus/encode/main.nf b/modules/local/stimulus/encode/main.nf new file mode 100644 index 0000000..f993c15 --- /dev/null +++ b/modules/local/stimulus/encode/main.nf @@ -0,0 +1,41 @@ +process ENCODE_CSV { + + tag "${meta.id}" + label 'process_medium' + // TODO: push image to nf-core quay.io + container "docker.io/mathysgrapotte/stimulus-py:dev" + + input: + tuple val(meta), path(data) + tuple val(meta2), path(config) + + output: + tuple val(meta2), path("${prefix}_encoded"), emit: encoded + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: 
"${meta.split_id}-${meta2.transform_id}" + """ + stimulus encode-csv \ + -d ${data} \ + -y ${config} \ + -o ${prefix}_encoded \ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stimulus: \$(stimulus -v | cut -d ' ' -f 3) + END_VERSIONS + """ + + stub: + """ + echo passing check-model stub + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stimulus: \$(stimulus -v | cut -d ' ' -f 3) + END_VERSIONS + """ +} diff --git a/modules/local/stimulus/predict/main.nf b/modules/local/stimulus/predict/main.nf index d5be055..fb181bf 100644 --- a/modules/local/stimulus/predict/main.nf +++ b/modules/local/stimulus/predict/main.nf @@ -5,7 +5,7 @@ process STIMULUS_PREDICT { input: tuple val(meta) , path(model), path(model_config), path(weigths) - tuple val(meta2), path(data), path(data_config) + tuple val(meta2), path(data), path(config) output: tuple val(meta), path("${prefix}-pred.safetensors"), emit: predictions @@ -17,7 +17,6 @@ process STIMULUS_PREDICT { """ stimulus predict \ -d ${data} \ - -e ${data_config} \ -m ${model} \ -c ${model_config} \ -w ${weigths} \ diff --git a/modules/local/stimulus/split_csv/main.nf b/modules/local/stimulus/split_csv/main.nf index 9f44855..51b1a1c 100644 --- a/modules/local/stimulus/split_csv/main.nf +++ b/modules/local/stimulus/split_csv/main.nf @@ -10,7 +10,7 @@ process STIMULUS_SPLIT_DATA { tuple val(meta2), path(sub_config) output: - tuple val(meta2), path("${prefix}.csv"), emit: csv_with_split + tuple val(meta2), path("${prefix}_split"), emit: csv_with_split path "versions.yml" , emit: versions script: @@ -19,7 +19,7 @@ process STIMULUS_SPLIT_DATA { stimulus split-csv \ -c ${data} \ -y ${sub_config} \ - -o ${prefix}.csv + -o ${prefix}_split cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -30,7 +30,7 @@ process STIMULUS_SPLIT_DATA { stub: prefix = task.ext.prefix ?: "${meta.id}-split-${meta2.id}" """ - touch ${prefix}.csv + touch ${prefix}_split cat <<-END_VERSIONS > versions.yml "${task.process}": 
diff --git a/modules/local/stimulus/transform_csv/main.nf b/modules/local/stimulus/transform_csv/main.nf index 787bfe2..285050b 100644 --- a/modules/local/stimulus/transform_csv/main.nf +++ b/modules/local/stimulus/transform_csv/main.nf @@ -11,7 +11,7 @@ process STIMULUS_TRANSFORM_CSV { tuple val(meta2), path(config) output: - tuple val(meta), path("${prefix}.csv"), emit: transformed_data + tuple val(meta), path("${prefix}"), emit: transformed_data path "versions.yml" , emit: versions script: @@ -20,7 +20,7 @@ process STIMULUS_TRANSFORM_CSV { stimulus transform-csv \ -c ${data} \ -y ${config} \ - -o ${prefix}.csv + -o ${prefix} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/stimulus/tune/main.nf b/modules/local/stimulus/tune/main.nf index 520add0..c8fa5c9 100644 --- a/modules/local/stimulus/tune/main.nf +++ b/modules/local/stimulus/tune/main.nf @@ -25,7 +25,6 @@ process STIMULUS_TUNE { stimulus tune \ -d ${transformed_data} \ -m ${model} \ - -e ${data_sub_config} \ -c ${model_config} \ -o ${prefix}-best-model.safetensors \ -bo ${prefix}-best-optimizer.opt \ diff --git a/subworkflows/local/evaluation/main.nf b/subworkflows/local/evaluation/main.nf index bd0b364..38076a0 100644 --- a/subworkflows/local/evaluation/main.nf +++ b/subworkflows/local/evaluation/main.nf @@ -28,10 +28,9 @@ workflow EVALUATION_WF { // Evaluation mode 1: Predict the data using the best model // and then compare the predictions of 2 different models // - STIMULUS_PREDICT( model, - ch_data.collect() + ch_data.first() // converts a queue channel to a value channel ) ch_versions = ch_versions.mix(STIMULUS_PREDICT.out.versions) predictions = STIMULUS_PREDICT.out.predictions @@ -41,26 +40,42 @@ workflow EVALUATION_WF { // and the same number of trials, we can estimate the noise across replicates // This is done by comparing the predictions of the alternative models between each other // and then calculatin a summary metric over them (e.g. mean, median, std, etc.) 
- - replicate_predictions = predictions.map{ - meta, prediction -> - [["id": meta.id, - "split_id": meta.split_id, - "transform_id": meta.transform_id, - "n_trials": meta.n_trials ], meta, prediction] - }.groupTuple(by:0) - .map{ - merging_meta, metas, predictions -> - [merging_meta, predictions] + pairs = predictions + .collate(2) + .collect() + .map { items -> + def pairs = [] + // Create all unique combinations using index comparison + (0..<items.size()).each { i -> + (i+1..<items.size()).each { j -> + def meta1 = items[i][0] + def meta2 = items[j][0] + def files = [items[i][1], items[j][1]] + // Only compare different transforms OR different replicates + if(meta1.transform_id != meta2.transform_id || meta1.replicate != meta2.replicate) { + pairs << [ + [ + "id1": meta1.id, + "id2": meta2.id, + "split_id1": meta1.split_id, + "split_id2": meta2.split_id, + "transform_id1": meta1.transform_id, + "transform_id2": meta2.transform_id, + "replicate1": meta1.replicate, + "replicate2": meta2.replicate + ], + // Create unique filenames using both transforms and replicates + files + ] + } + } } - - // check if the predictions are at least 2, meta,predictions - replicate_predictions.filter{ - it[1].size() > 1 - }.set{ replicate_predictions } + pairs + } + .flatMap { it } STIMULUS_COMPARE_TENSORS_COSINE( - replicate_predictions + pairs ) cosine_scores = STIMULUS_COMPARE_TENSORS_COSINE.out.csv diff --git a/subworkflows/local/transform_csv/main.nf b/subworkflows/local/transform_csv/main.nf index 8e02b82..f6a925a 100644 --- a/subworkflows/local/transform_csv/main.nf +++ b/subworkflows/local/transform_csv/main.nf @@ -5,6 +5,7 @@ */ include { STIMULUS_TRANSFORM_CSV } from '../../../modules/local/stimulus/transform_csv' +include { ENCODE_CSV } from '../../../modules/local/stimulus/encode' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -48,17 +49,23 @@ workflow TRANSFORM_CSV_WF { config: [meta, config] } - // run stimulus transform STIMULUS_TRANSFORM_CSV( ch_input.data, ch_input.config
) ch_transformed_data = STIMULUS_TRANSFORM_CSV.out.transformed_data - ch_versions = ch_versions.mix(STIMULUS_TRANSFORM_CSV.out.versions) + + // run stimulus encode + ENCODE_CSV( + ch_transformed_data, + ch_input.config + ) + ch_encoded_data = ENCODE_CSV.out.encoded + ch_versions = ch_versions.mix(ENCODE_CSV.out.versions) emit: - transformed_data = ch_transformed_data + transformed_data = ch_encoded_data versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/deepmodeloptim.nf b/workflows/deepmodeloptim.nf index e7efa9b..d13f032 100644 --- a/workflows/deepmodeloptim.nf +++ b/workflows/deepmodeloptim.nf @@ -16,6 +16,7 @@ include { SPLIT_CSV_WF } from '../subworkflows/local/spli include { TRANSFORM_CSV_WF } from '../subworkflows/local/transform_csv' include { TUNE_WF } from '../subworkflows/local/tune' include { EVALUATION_WF } from '../subworkflows/local/evaluation' +include { ENCODE_CSV } from '../modules/local/stimulus/encode' // // MODULES: Consisting of nf-core/modules @@ -149,6 +150,11 @@ workflow DEEPMODELOPTIM { // Now the data config will not work if passed in full // We need to pass in the split data config, any of them, for the predict modules // This will be changed in the future + ENCODE_CSV( + prediction_data, + TUNE_WF.out.data_config_tmp.first() + ) + prediction_data = ENCODE_CSV.out.encoded prediction_data = prediction_data.combine(TUNE_WF.out.data_config_tmp.first().map{meta,file -> file}) EVALUATION_WF( TUNE_WF.out.model_tmp,