From 889727ed1ff140d73e471baa50e96ac20d80669c Mon Sep 17 00:00:00 2001 From: mathysgrapotte Date: Fri, 20 Jun 2025 15:04:30 +0200 Subject: [PATCH 1/4] deepmodeloptim now properly splits and use yaml configs. --- .gitignore | 2 + README.md | 18 +----- conf/modules.config | 2 +- modules/local/stimulus/check_model/main.nf | 9 ++- modules/local/stimulus/predict/main.nf | 2 +- modules/local/stimulus/split_split/main.nf | 37 ------------ .../local/stimulus/split_transform/main.nf | 37 ------------ modules/local/stimulus/split_yaml/main.nf | 41 +++++++++++++ modules/local/stimulus/tune/main.nf | 3 +- subworkflows/local/check_model/main.nf | 2 - .../local/split_data_config_split/main.nf | 41 ------------- .../local/split_data_config_transform/main.nf | 40 ------------- .../local/split_data_config_unified/main.nf | 58 +++++++++++++++++++ subworkflows/local/transform_csv/main.nf | 45 +++++++++----- subworkflows/local/tune/main.nf | 31 +++++----- workflows/deepmodeloptim.nf | 32 ++++------ 16 files changed, 166 insertions(+), 234 deletions(-) delete mode 100644 modules/local/stimulus/split_split/main.nf delete mode 100644 modules/local/stimulus/split_transform/main.nf create mode 100644 modules/local/stimulus/split_yaml/main.nf delete mode 100644 subworkflows/local/split_data_config_split/main.nf delete mode 100644 subworkflows/local/split_data_config_transform/main.nf create mode 100644 subworkflows/local/split_data_config_unified/main.nf diff --git a/.gitignore b/.gitignore index 83f79597..cf3c3909 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ bin/.vscode/ .nf-test/ prototype/ *.ipynb +CLAUDE.md +.claude diff --git a/README.md b/README.md index 96742615..65098e9b 100644 --- a/README.md +++ b/README.md @@ -21,24 +21,10 @@ ## Introduction -**nf-core/deepmodeloptim** is a bioinformatics end-to-end pipeline designed to facilitate the testing and development of deep learning models for genomics. +**nf-core/deepmodeloptim** augments your bio data towards an optimal task-specific training set. -Deep learning model development in natural science is an empirical and costly process. Despite the existence of generic tools for the tuning of hyperparameters and the training of the models, the connection between these procedures and the impact coming from the data is often underlooked, or at least not easily automatized. Indeed, researchers must define a pre-processing pipeline, an architecture, find the best parameters for said architecture and iterate over this process, often manually. +Methods in deep learning are vastly equivalent (see neural scaling laws paper), most of the performance is driven by the training data. -Leveraging the power of Nextflow (polyglotism, container integration, scalable on the cloud), this pipeline will help users to 1) automatize the testing of the model, 2) gain useful insights with respect to the learning behaviour of the model, and hence 3) accelerate the development. - -## Pipeline summary - -It takes as input: - -- A dataset -- A configuration file to describe the data pre-processing steps to be performed -- An user defined PyTorch model -- A configuration file describing the range of parameters for the PyTorch model - -It then transforms the data according to all possible pre-processing steps, finds the best architecture parameters for each of the transformed datasets, performs sanity checks on the models and train a minimal deep learning version for each dataset/architecture. - -Those experiments are then compiled into an intuitive report, making it easier for scientists to pick the best design choice to be sent to large scale training. diff --git a/conf/modules.config b/conf/modules.config index d08bffcc..d979f24d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -90,7 +90,7 @@ process { // main config // ============================================================================== - withName: "STIMULUS_SPLIT_TRANSFORM" { + withName: "STIMULUS_SPLIT_YAML" { publishDir = [ path: { "${params.outdir}/configs/${meta.id}" }, mode: params.publish_dir_mode, diff --git a/modules/local/stimulus/check_model/main.nf b/modules/local/stimulus/check_model/main.nf index 0d2ef053..e95fea1c 100644 --- a/modules/local/stimulus/check_model/main.nf +++ b/modules/local/stimulus/check_model/main.nf @@ -6,11 +6,10 @@ process CHECK_MODEL { container "docker.io/mathysgrapotte/stimulus-py:dev" input: - tuple val(meta), path(data_config) - tuple val(meta2), path(data) - tuple val(meta3), path(model) - tuple val(meta4), path(model_config) - tuple val(meta5), path(initial_weights) + tuple val(meta1), path(data) + tuple val(meta2), path(model) + tuple val(meta3), path(model_config) + tuple val(meta4), path(initial_weights) output: stdout emit: standardout diff --git a/modules/local/stimulus/predict/main.nf b/modules/local/stimulus/predict/main.nf index fb181bf3..3de92e0d 100644 --- a/modules/local/stimulus/predict/main.nf +++ b/modules/local/stimulus/predict/main.nf @@ -5,7 +5,7 @@ process STIMULUS_PREDICT { input: tuple val(meta) , path(model), path(model_config), path(weigths) - tuple val(meta2), path(data), path(config) + tuple val(meta2), path(data) output: tuple val(meta), path("${prefix}-pred.safetensors"), emit: predictions diff --git a/modules/local/stimulus/split_split/main.nf b/modules/local/stimulus/split_split/main.nf deleted file mode 100644 index e7aaa6bb..00000000 --- a/modules/local/stimulus/split_split/main.nf +++ /dev/null @@ -1,37 +0,0 @@ - -process STIMULUS_SPLIT_SPLIT { - - tag "$meta.id" - label 'process_low' - // TODO: push image to nf-core quay.io - container "docker.io/mathysgrapotte/stimulus-py:dev" - - input: - tuple val(meta), path(data_config) - - output: - tuple val(meta), path ("*.yaml"), emit: sub_config - path "versions.yml" , emit: versions - - script: - """ - stimulus split-split -y ${data_config} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - stimulus: \$(stimulus -v | cut -d ' ' -f 3) - END_VERSIONS - """ - - stub: - """ - touch test_0.yaml - touch test_1.yaml - touch test_2.yaml - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - stimulus: \$(stimulus -v | cut -d ' ' -f 3) - END_VERSIONS - """ -} diff --git a/modules/local/stimulus/split_transform/main.nf b/modules/local/stimulus/split_transform/main.nf deleted file mode 100644 index 58ccc10f..00000000 --- a/modules/local/stimulus/split_transform/main.nf +++ /dev/null @@ -1,37 +0,0 @@ - -process STIMULUS_SPLIT_TRANSFORM { - - tag "$meta.id" - label 'process_low' - // TODO: push image to nf-core quay.io - container "docker.io/mathysgrapotte/stimulus-py:dev" - - input: - tuple val(meta), path(data_config) - - output: - tuple val(meta), path ("*.yaml"), emit: sub_config - path "versions.yml" , emit: versions - - script: - """ - stimulus split-transforms -y ${data_config} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - stimulus: \$(stimulus -v | cut -d ' ' -f 3) - END_VERSIONS - """ - - stub: - """ - touch test_0.yaml - touch test_1.yaml - touch test_2.yaml - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - stimulus: \$(stimulus -v | cut -d ' ' -f 3) - END_VERSIONS - """ -} diff --git a/modules/local/stimulus/split_yaml/main.nf b/modules/local/stimulus/split_yaml/main.nf new file mode 100644 index 00000000..2938df23 --- /dev/null +++ b/modules/local/stimulus/split_yaml/main.nf @@ -0,0 +1,41 @@ +process STIMULUS_SPLIT_YAML { + + tag "$meta.id" + label 'process_low' + // TODO: push image to nf-core quay.io + container "docker.io/mathysgrapotte/stimulus-py:dev" + + input: + tuple val(meta), path(data_config) + + output: + tuple val(meta), path("*_encode.yaml") , emit: encode_config + tuple val(meta), path("*_split.yaml") , emit: split_config + tuple val(meta), path("*_transform.yaml") , emit: transform_config + path "versions.yml" , emit: versions + + script: + """ + stimulus split-yaml -y ${data_config} --out-dir ./ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stimulus: \$(stimulus -v | cut -d ' ' -f 3) + END_VERSIONS + """ + + stub: + def prefix = data_config.baseName + """ + touch ${prefix}_encode.yaml + touch ${prefix}_RandomSplit_70-30_split.yaml + touch ${prefix}_noise_std0.1_transform.yaml + touch ${prefix}_noise_std0.2_transform.yaml + touch ${prefix}_noise_std0.3_transform.yaml + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stimulus: \$(stimulus -v | cut -d ' ' -f 3) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/stimulus/tune/main.nf b/modules/local/stimulus/tune/main.nf index c8fa5c96..fc81c79a 100644 --- a/modules/local/stimulus/tune/main.nf +++ b/modules/local/stimulus/tune/main.nf @@ -4,7 +4,7 @@ process STIMULUS_TUNE { container "docker.io/mathysgrapotte/stimulus-py:dev" input: - tuple val(meta), path(transformed_data), path(data_sub_config) + tuple val(meta), path(transformed_data) tuple val(meta2), path(model), path(model_config), path(initial_weights) output: @@ -15,7 +15,6 @@ process STIMULUS_TUNE { path "versions.yml" , emit: versions // now we need to output these in this format for the predict module - thiw will have to be changed! tuple val(meta), path(model), path("best_config.json"), path("${prefix}-best-model.safetensors"), emit: model_tmp - tuple val(meta), path(data_sub_config) , emit: data_config_tmp script: prefix = task.ext.prefix ?: meta.id diff --git a/subworkflows/local/check_model/main.nf b/subworkflows/local/check_model/main.nf index 599d61e8..4cc83606 100644 --- a/subworkflows/local/check_model/main.nf +++ b/subworkflows/local/check_model/main.nf @@ -17,7 +17,6 @@ workflow CHECK_MODEL_WF { take: ch_data - ch_data_config ch_model ch_model_config ch_initial_weights @@ -27,7 +26,6 @@ workflow CHECK_MODEL_WF { ch_versions = Channel.empty() CHECK_MODEL( - ch_data_config, ch_data, ch_model, ch_model_config, diff --git a/subworkflows/local/split_data_config_split/main.nf b/subworkflows/local/split_data_config_split/main.nf deleted file mode 100644 index d1ac5a81..00000000 --- a/subworkflows/local/split_data_config_split/main.nf +++ /dev/null @@ -1,41 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT NF-CORE MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { STIMULUS_SPLIT_SPLIT } from '../../../modules/local/stimulus/split_split' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow SPLIT_DATA_CONFIG_SPLIT_WF { - take: - ch_data_config - - main: - - ch_versions = Channel.empty() - - STIMULUS_SPLIT_SPLIT( ch_data_config ) - ch_versions = ch_versions.mix(STIMULUS_SPLIT_SPLIT.out.versions) - - // transpose - // and add sub config id called split_id - ch_yaml_sub_config = STIMULUS_SPLIT_SPLIT.out.sub_config - .transpose() - .map { meta, yaml -> [ meta + [split_id: yaml.baseName], yaml] } - - emit: - sub_config = ch_yaml_sub_config - versions = ch_versions // channel: [ versions.yml ] -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ diff --git a/subworkflows/local/split_data_config_transform/main.nf b/subworkflows/local/split_data_config_transform/main.nf deleted file mode 100644 index 11057110..00000000 --- a/subworkflows/local/split_data_config_transform/main.nf +++ /dev/null @@ -1,40 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT NF-CORE MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { STIMULUS_SPLIT_TRANSFORM } from '../../../modules/local/stimulus/split_transform' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow SPLIT_DATA_CONFIG_TRANSFORM_WF { - take: - ch_data_config - - main: - - ch_versions = Channel.empty() - - STIMULUS_SPLIT_TRANSFORM( ch_data_config ) - ch_versions = ch_versions.mix(STIMULUS_SPLIT_TRANSFORM.out.versions) - // transpose - // and add transform_id to meta - ch_yaml_sub_config = STIMULUS_SPLIT_TRANSFORM.out.sub_config - .transpose() - .map { meta,yaml -> [ meta + [transform_id: yaml.baseName], yaml] } - - emit: - sub_config = ch_yaml_sub_config - versions = ch_versions // channel: [ versions.yml ] -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ diff --git a/subworkflows/local/split_data_config_unified/main.nf b/subworkflows/local/split_data_config_unified/main.nf new file mode 100644 index 00000000..9132f776 --- /dev/null +++ b/subworkflows/local/split_data_config_unified/main.nf @@ -0,0 +1,58 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { STIMULUS_SPLIT_YAML } from '../../../modules/local/stimulus/split_yaml' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow SPLIT_DATA_CONFIG_UNIFIED_WF { + take: + ch_data_config + + main: + + ch_versions = Channel.empty() + + STIMULUS_SPLIT_YAML( ch_data_config ) + ch_versions = ch_versions.mix(STIMULUS_SPLIT_YAML.out.versions) + + // Process split configs - transpose and add split_id to meta + ch_split_configs = STIMULUS_SPLIT_YAML.out.split_config + .transpose() + .map { meta, yaml -> + // Extract split info from descriptive filename + def split_id = yaml.baseName.replaceAll(/.*_([^_]+_[^_]+)_split$/, '$1') + [ meta + [split_id: split_id], yaml] + } + + // Process transform configs - transpose and add transform_id to meta + ch_transform_configs = STIMULUS_SPLIT_YAML.out.transform_config + .transpose() + .map { meta, yaml -> + // Extract transform info from descriptive filename + def transform_id = yaml.baseName.replaceAll(/.*_([^_]+_[^_]+)_transform$/, '$1') + [ meta + [transform_id: transform_id], yaml] + } + + // Encoding configs don't need transposition as there's only one per input + ch_encoding_configs = STIMULUS_SPLIT_YAML.out.encode_config + + emit: + split_config = ch_split_configs // channel: [ meta + [split_id: split_id], yaml ] + transform_config = ch_transform_configs // channel: [ meta + [transform_id: transform_id], yaml ] + encode_config = ch_encoding_configs // channel: [ meta, yaml ] + versions = ch_versions // channel: [ versions.yml ] +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ \ No newline at end of file diff --git a/subworkflows/local/transform_csv/main.nf b/subworkflows/local/transform_csv/main.nf index f6a925a5..3280eab2 100644 --- a/subworkflows/local/transform_csv/main.nf +++ b/subworkflows/local/transform_csv/main.nf @@ -18,6 +18,7 @@ workflow TRANSFORM_CSV_WF { take: ch_split_data ch_config_transform + ch_config_encode main: @@ -33,22 +34,19 @@ workflow TRANSFORM_CSV_WF { // combine data vs configs based on common key: split_id ch_input = ch_split_data - .map { meta, data -> - [[split_id: meta.split_id], meta, data] - } - .combine( - ch_config_transform.map { meta, config -> - [[split_id: meta.split_id], meta, config] - } - ,by: 0 - ) - .multiMap{ key, meta_data, data, meta_config, config -> + .combine(ch_config_transform, by: []) + .map { meta_data, data, meta_config, config -> def meta = meta_data + [transform_id: meta_config.transform_id] - data: - [meta, data] - config: - [meta, config] + [ + data: [meta, data], + config: [meta, config] + ] + } + .multiMap { item -> + data: item.data + config: item.config } + // run stimulus transform STIMULUS_TRANSFORM_CSV( ch_input.data, @@ -56,10 +54,25 @@ workflow TRANSFORM_CSV_WF { ) ch_transformed_data = STIMULUS_TRANSFORM_CSV.out.transformed_data + ch_encode_input = ch_transformed_data + .combine(ch_config_encode, by: []) + .map { meta_data, data, meta_config, config -> + def meta = meta_data + [encode_id: meta_config.encode_id] + [ + data: [meta, data], + config: [meta, config] + ] + } + .multiMap { item -> + data: item.data + config: item.config + } + + // run stimulus encode ENCODE_CSV( - ch_transformed_data, - ch_input.config + ch_encode_input.data, + ch_encode_input.config ) ch_encoded_data = ENCODE_CSV.out.encoded ch_versions = ch_versions.mix(ENCODE_CSV.out.versions) diff --git a/subworkflows/local/tune/main.nf b/subworkflows/local/tune/main.nf index 50faca63..803ff367 100644 --- a/subworkflows/local/tune/main.nf +++ b/subworkflows/local/tune/main.nf @@ -16,7 +16,6 @@ include { CUSTOM_MODIFY_MODEL_CONFIG } from '../../../modules/local/custom/modif workflow TUNE_WF { take: ch_transformed_data - ch_yaml_sub_config ch_model ch_model_config ch_initial_weights @@ -38,28 +37,33 @@ workflow TUNE_WF { ch_versions = ch_versions.mix(CUSTOM_MODIFY_MODEL_CONFIG.out.versions) ch_model_config = CUSTOM_MODIFY_MODEL_CONFIG.out.config + // ch_input = ch_split_data + // .combine(ch_config_transform, by: []) + // .map { meta_data, data, meta_config, config -> + // def meta = meta_data + [transform_id: meta_config.transform_id] + // [ + // data: [meta, data], + // config: [meta, config] + // ] + // } + // .multiMap { item -> + // data: item.data + // config: item.config + // } + + // ch_transformed_data.view() ch_tune_input = ch_transformed_data - .map { meta, data -> - [[split_id: meta.split_id, transform_id: meta.transform_id], meta, data] - } - .combine( - ch_yaml_sub_config.map { meta, config -> - [[split_id: meta.split_id, transform_id: meta.transform_id], config] - } - ,by: 0 - ) .combine(ch_model.map{it[1]}) .combine(ch_model_config) .combine(ch_initial_weights) // when initial_weights is empty .map{it[1]} will return [], and not properly combined .combine(tune_replicates) - .multiMap { key, meta, data, data_config, model, meta_model_config, model_config, meta_weights, initial_weights, n_replicate -> + .multiMap { meta, data, model, meta_model_config, model_config, meta_weights, initial_weights, n_replicate -> def meta_new = meta + [replicate: n_replicate] + [n_trials: meta_model_config.n_trials] data: - [meta_new, data, data_config] + [meta_new, data] model: [meta_new, model, model_config, initial_weights] } - // run stimulus tune STIMULUS_TUNE( ch_tune_input.data, @@ -78,7 +82,6 @@ workflow TUNE_WF { versions = ch_versions // channel: [ versions.yml ] // these are temporaly needed for predict, it will be changed in the future! model_tmp = STIMULUS_TUNE.out.model_tmp - data_config_tmp = STIMULUS_TUNE.out.data_config_tmp } /* diff --git a/workflows/deepmodeloptim.nf b/workflows/deepmodeloptim.nf index d13f032e..f07b86e5 100644 --- a/workflows/deepmodeloptim.nf +++ b/workflows/deepmodeloptim.nf @@ -10,8 +10,7 @@ include { softwareVersionsToYAML } from '../subworkflows/nf-core/ut include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_deepmodeloptim_pipeline' include { CHECK_MODEL_WF } from '../subworkflows/local/check_model' include { PREPROCESS_IBIS_BEDFILE_TO_STIMULUS } from '../subworkflows/local/preprocess_ibis_bedfile_to_stimulus' -include { SPLIT_DATA_CONFIG_SPLIT_WF } from '../subworkflows/local/split_data_config_split' -include { SPLIT_DATA_CONFIG_TRANSFORM_WF } from '../subworkflows/local/split_data_config_transform' +include { SPLIT_DATA_CONFIG_UNIFIED_WF } from '../subworkflows/local/split_data_config_unified' include { SPLIT_CSV_WF } from '../subworkflows/local/split_csv' include { TRANSFORM_CSV_WF } from '../subworkflows/local/transform_csv' include { TUNE_WF } from '../subworkflows/local/tune' @@ -72,11 +71,13 @@ workflow DEEPMODELOPTIM { } // ============================================================================== - // split meta yaml split config file into individual yaml files + // split meta yaml config file into individual component yaml files // ============================================================================== - SPLIT_DATA_CONFIG_SPLIT_WF( ch_data_config ) - ch_yaml_sub_config_split = SPLIT_DATA_CONFIG_SPLIT_WF.out.sub_config + SPLIT_DATA_CONFIG_UNIFIED_WF( ch_data_config ) + ch_yaml_split_config = SPLIT_DATA_CONFIG_UNIFIED_WF.out.split_config + ch_yaml_transform_config = SPLIT_DATA_CONFIG_UNIFIED_WF.out.transform_config + ch_yaml_encode_config = SPLIT_DATA_CONFIG_UNIFIED_WF.out.encode_config // ============================================================================== // split csv data file @@ -84,24 +85,18 @@ workflow DEEPMODELOPTIM { SPLIT_CSV_WF( ch_data, - ch_yaml_sub_config_split + ch_yaml_split_config ) ch_split_data = SPLIT_CSV_WF.out.split_data - // ============================================================================== - // split meta yaml transform config file into individual yaml files - // ============================================================================== - - SPLIT_DATA_CONFIG_TRANSFORM_WF( ch_yaml_sub_config_split ) - ch_yaml_sub_config = SPLIT_DATA_CONFIG_TRANSFORM_WF.out.sub_config - // ============================================================================== // transform csv file // ============================================================================== TRANSFORM_CSV_WF( ch_split_data, - ch_yaml_sub_config + ch_yaml_transform_config, + ch_yaml_encode_config ) ch_transformed_data = TRANSFORM_CSV_WF.out.transformed_data @@ -115,11 +110,9 @@ workflow DEEPMODELOPTIM { // we sort the channel so that we always get the same input, as the default order // of the channel depends on which process finishes first (run in parallel) ch_check_input_data = ch_transformed_data.toSortedList().flatten().buffer(size:2).first() - ch_check_input_config = ch_yaml_sub_config.toSortedList().flatten().buffer(size:2).first() CHECK_MODEL_WF ( ch_check_input_data, - ch_check_input_config, ch_model, ch_model_config, ch_initial_weights @@ -135,7 +128,6 @@ workflow DEEPMODELOPTIM { TUNE_WF( ch_transformed_data, - ch_yaml_sub_config, ch_model, ch_model_config, ch_initial_weights, @@ -147,15 +139,11 @@ workflow DEEPMODELOPTIM { // Evaluation // ============================================================================== - // Now the data config will not work if passed in full - // We need to pass in the split data config, any of them, for the predict modules - // This will be changed in the future ENCODE_CSV( prediction_data, - TUNE_WF.out.data_config_tmp.first() + ch_yaml_encode_config ) prediction_data = ENCODE_CSV.out.encoded - prediction_data = prediction_data.combine(TUNE_WF.out.data_config_tmp.first().map{meta,file -> file}) EVALUATION_WF( TUNE_WF.out.model_tmp, prediction_data From 5e87aa7c93b4ddd1c18b256a06e3032aa5924f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Thu, 26 Jun 2025 14:19:30 +0200 Subject: [PATCH 2/4] don't modify n_trials if tune_trials_range is null --- .../local/custom/modify_model_config/main.nf | 6 +++++- .../main.nf | 18 +++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/modules/local/custom/modify_model_config/main.nf b/modules/local/custom/modify_model_config/main.nf index 2ccb2cc1..aa9367f3 100644 --- a/modules/local/custom/modify_model_config/main.nf +++ b/modules/local/custom/modify_model_config/main.nf @@ -20,7 +20,11 @@ process CUSTOM_MODIFY_MODEL_CONFIG { meta_updated = meta + ["n_trials": "${n_trials}"] """ # substitte the line containing n_trials in the config file with n_trials: \${n_trials} - awk -v n_trials=${n_trials} '/n_trials: [0-9]+/ {gsub(/n_trials: [0-9]+/, "n_trials: " n_trials)}1' ${config} > ${prefix}.yaml + if [ "${n_trials}" = "[]" ]; then + cp "${config}" "${prefix}.yaml" + else + awk -v n_trials="${n_trials}" '/n_trials: [0-9]+/ {gsub(/n_trials: [0-9]+/, "n_trials: " n_trials)}1' "${config}" > "${prefix}.yaml" + fi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/utils_nfcore_deepmodeloptim_pipeline/main.nf b/subworkflows/local/utils_nfcore_deepmodeloptim_pipeline/main.nf index c5c3cd66..a5abc83e 100644 --- a/subworkflows/local/utils_nfcore_deepmodeloptim_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_deepmodeloptim_pipeline/main.nf @@ -110,12 +110,16 @@ workflow PIPELINE_INITIALISATION { // range = validate_range(params.tune_trials_range) - val_tune_trials_range = Channel.from(range) - .map { rangeStr -> - def (min, max, step) = rangeStr.tokenize(',')*.toInteger() - (min..max).step(step).toList() - } - .flatten() + if (range) { + val_tune_trials_range = Channel.from(range) + .map { rangeStr -> + def (min, max, step) = rangeStr.tokenize(',')*.toInteger() + (min..max).step(step).toList() + } + .flatten() + } else { + val_tune_trials_range = [] + } // // Create the channels for the number of replicates // @@ -217,7 +221,7 @@ def validateInputSamplesheet(input) { def validate_range(range) { if (range == null) { - return "1,1,1" + return range } def (min, max, step) = range.tokenize(',')*.toInteger() if (min > max) { From 1d517605944f904fb3deaca05f72fb3358afb75e Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Thu, 26 Jun 2025 12:22:10 +0000 Subject: [PATCH 3/4] [automated] Fix code linting --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 65098e9b..6dca052a 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,9 @@ ## Introduction -**nf-core/deepmodeloptim** augments your bio data towards an optimal task-specific training set. - -Methods in deep learning are vastly equivalent (see neural scaling laws paper), most of the performance is driven by the training data. +**nf-core/deepmodeloptim** augments your bio data towards an optimal task-specific training set. +Methods in deep learning are vastly equivalent (see neural scaling laws paper), most of the performance is driven by the training data. From a2d0a973e13517db0dce858c39945804bb2b15da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlia=20Mir=20Pedrol?= Date: Thu, 26 Jun 2025 14:25:03 +0200 Subject: [PATCH 4/4] fix pre-commit linting --- modules/local/stimulus/split_yaml/main.nf | 2 +- subworkflows/local/split_data_config_unified/main.nf | 8 ++++---- subworkflows/local/transform_csv/main.nf | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/stimulus/split_yaml/main.nf b/modules/local/stimulus/split_yaml/main.nf index 2938df23..630c1536 100644 --- a/modules/local/stimulus/split_yaml/main.nf +++ b/modules/local/stimulus/split_yaml/main.nf @@ -38,4 +38,4 @@ process STIMULUS_SPLIT_YAML { stimulus: \$(stimulus -v | cut -d ' ' -f 3) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/subworkflows/local/split_data_config_unified/main.nf b/subworkflows/local/split_data_config_unified/main.nf index 9132f776..7a5d5f2c 100644 --- a/subworkflows/local/split_data_config_unified/main.nf +++ b/subworkflows/local/split_data_config_unified/main.nf @@ -26,13 +26,13 @@ workflow SPLIT_DATA_CONFIG_UNIFIED_WF { // Process split configs - transpose and add split_id to meta ch_split_configs = STIMULUS_SPLIT_YAML.out.split_config .transpose() - .map { meta, yaml -> + .map { meta, yaml -> // Extract split info from descriptive filename def split_id = yaml.baseName.replaceAll(/.*_([^_]+_[^_]+)_split$/, '$1') - [ meta + [split_id: split_id], yaml] + [ meta + [split_id: split_id], yaml] } - // Process transform configs - transpose and add transform_id to meta + // Process transform configs - transpose and add transform_id to meta ch_transform_configs = STIMULUS_SPLIT_YAML.out.transform_config .transpose() .map { meta, yaml -> @@ -55,4 +55,4 @@ workflow SPLIT_DATA_CONFIG_UNIFIED_WF { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ \ No newline at end of file +*/ diff --git a/subworkflows/local/transform_csv/main.nf b/subworkflows/local/transform_csv/main.nf index 3280eab2..abb89706 100644 --- a/subworkflows/local/transform_csv/main.nf +++ b/subworkflows/local/transform_csv/main.nf @@ -46,7 +46,7 @@ workflow TRANSFORM_CSV_WF { data: item.data config: item.config } - + // run stimulus transform STIMULUS_TRANSFORM_CSV( ch_input.data,