cellgeni · Claptar · Apr 30, 2025 · Apr 30, 2025 · Apr 30, 2025 · May 1, 2025
diff --git a/modules/sanger/anndatautils/toh5ad/Dockerfile b/modules/sanger/anndatautils/toh5ad/Dockerfile
@@ -0,0 +1,37 @@
+FROM python:3.11-slim
+
+
+# Build-time only: keeps apt/debconf from prompting during the build without
+# leaving DEBIAN_FRONTEND set in the environment of running containers
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Location of the virtual environment; its bin/ directory goes first on PATH so
+# that `python` and installed console scripts resolve to the environment
+ENV VENV_PATH="/env"
+ENV PATH="${VENV_PATH}/bin:$PATH"
+
+
+# Update and install system dependencies (no recommended extras); the apt lists
+# are removed in the same layer so they never end up in the image
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    procps \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install uv package manager (no pip download cache kept in the layer)
+RUN pip install --no-cache-dir uv
+
+# Create a new environment with uv and install packages.
+# --python points uv at the new environment explicitly, so the install does not
+# depend on uv discovering it (VIRTUAL_ENV is not set in this image)
+RUN uv venv "${VENV_PATH}" && \
+    uv pip install --no-cache-dir --python "${VENV_PATH}/bin/python" \
+    scanpy \
+    numpy \
+    pandas \
+    muon \
+    scikit-network \
+    jupyterlab \
+    notebook \
+    tqdm \
+    ipywidgets \
+    papermill
+
+# Copy Dockerfile to the container; the mode is set at copy time (requires
+# BuildKit) instead of in a separate chmod layer
+COPY --chmod=755 Dockerfile /docker/
diff --git a/modules/sanger/anndatautils/toh5ad/main.nf b/modules/sanger/anndatautils/toh5ad/main.nf
@@ -0,0 +1,32 @@
+
+
+// Converts one sample's input to an AnnData .h5ad file by running
+// convert_to_h5ad.py, and records the anndata and scanpy versions in versions.yml
+process ANNDATAUTILS_TOH5AD {
+    tag "Converting ${sample_id}'s file to .h5ad"
+    container 'docker://quay.io/cellgeni/metacells-python:latest'
+
+    input:
+        // sample_id names the output file; the input path is staged with the name pattern 'input/*'
+        tuple val(sample_id), path(input, name: 'input/*')
+        // forwarded unchanged to the script's --delimiter option
+        val(delimiter)
+    output:
+        tuple val(sample_id), path("${sample_id}.h5ad"), emit: h5ad
+        path "versions.yml", emit: versions
+    script:
+        """
+        convert_to_h5ad.py \
+            --input "${input}" \
+            --sample_id "${sample_id}" \
+            --delimiter "${delimiter}" \
+            --output "${sample_id}.h5ad"
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            anndata: \$( python -c "import anndata; print(anndata.__version__)" )
+            scanpy: \$( python -c "import scanpy; print(scanpy.__version__)" )
+        END_VERSIONS
+        """
+    // Stub run: only creates empty placeholder outputs, no conversion is performed
+    stub:
+        """
+        touch "${sample_id}.h5ad"
+        touch versions.yml
+        """
+}
diff --git a/modules/sanger/anndatautils/toh5ad/meta.yml b/modules/sanger/anndatautils/toh5ad/meta.yml
@@ -0,0 +1,67 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "anndatautils_toh5ad"
+description: A module that converts AnnData-convertible files to an AnnData object and saves it as an .h5ad file
+keywords:
+  - anndata
+  - h5ad
+  - conversion
+tools:
+  - anndata:
+      description: |
+        Anndata is a Python package for handling annotated data
+        matrices in memory and on disk, positioned between pandas
+        and xarray. anndata offers a broad range of computationally
+        efficient features including, among others, sparse data support,
+        lazy operations, and a PyTorch interface.
+      homepage: https://anndata.readthedocs.io/en/stable/
+      documentation: https://anndata.readthedocs.io/en/stable/
+      doi: 10.21105/joss.04371
+      license: BSD-3-Clause
+  - scanpy:
+      description: |
+        Scanpy is a scalable toolkit for analyzing single-cell gene
+        expression data built jointly with anndata. It includes preprocessing,
+        visualization, clustering, trajectory inference and differential expression testing.
+        The Python-based implementation efficiently deals with datasets of more than one million cells.
+      homepage: https://scanpy.readthedocs.io/en/stable/
+      documentation: https://scanpy.readthedocs.io/en/stable/
+      doi: 10.1186/s13059-017-1382-0
+      license: BSD-3-Clause
+
+input:
+  - - sample_id:
+        type: string
+        description: |
+          Sample ID stored in obs["sample"] of the AnnData object, appended to obs_names when a delimiter is given, and used to name the output file
+    - input:
+        type: [ "file" , "directory" ]
+        description: |
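+          Input to be converted to an AnnData object: a 10x .h5 file, an .mtx file, or a directory containing 10x matrix.mtx, barcodes.tsv and features.tsv files. The first element of the tuple is the sample ID, and the second element is the file path.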
+  - - delimiter:
+        type: string
+        description: |
+          Delimiter used to create obs_names in the format barcode + delimiter + sample_id
+
+output:
+  - h5ad:
+    - sample_id:
+        type: string
+        description: |
+          Sample ID that was stored in obs["sample"] of the AnnData object (and appended to obs_names when a delimiter was given)
+    - "*.h5ad":
+        type: file
+        description: .h5ad file containing the AnnData object
+        pattern: "*.h5ad"
+  - versions:
+    - versions.yml:
+        type: file
+        description: |
+          YML file containing the versions of the software used to create the AnnData object
+        pattern: "versions.yml"
+
+
+authors:
+  - "@claptar"
+maintainers:
+  - "@claptar"
diff --git a/modules/sanger/anndatautils/toh5ad/module.config b/modules/sanger/anndatautils/toh5ad/module.config
@@ -0,0 +1,12 @@
+// Resource and publishing settings for the ANNDATAUTILS_TOH5AD process
+process {
+    // withName selectors are regular expressions: the leading ".*" lets the
+    // selector match the process name with any prefix in front of it
+    // (a bare leading "*" is a dangling quantifier, not a valid pattern)
+    withName: ".*ANNDATAUTILS_TOH5AD" {
+        queue      = 'normal'
+        cpus       = 4
+        // 16 GB on the first attempt, plus 32 GB for every further attempt
+        memory     = { 16.GB + 32.GB * (task.attempt - 1) }
+        publishDir = [
+            mode     : params.publish_mode,
+            path     : "${outputDir}/adata/raw",
+            overwrite: true
+        ]
+    }
+}
diff --git a/modules/sanger/anndatautils/toh5ad/resources/usr/bin/convert_to_h5ad.py b/modules/sanger/anndatautils/toh5ad/resources/usr/bin/convert_to_h5ad.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import logging
+import argparse
+import scanpy as sc
+
+
+# Set up logging: INFO level and above, formatted as "LEVEL: message"
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(levelname)s: %(message)s",
+    stream=sys.stdout,  # Send log records to stdout (the logging default is stderr)
+)
+
+
+def init_parser() -> argparse.ArgumentParser:
+    """
+    Initialise argument parser for the script
+    """
+    parser = argparse.ArgumentParser(
+        description="Script validates sample and annotation tables and splits annotation table into separate celltypes"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        metavar="<path>",
+        help="Specify a path to input file",
+        required=True,
+    )
+    parser.add_argument(
+        "--sample_id",
+        type=str,
+        metavar="<str>",
+        default=None,
+        help="Specify sample name for the file",
+    )
+    parser.add_argument(
+        "--output",
+        metavar="<path>",
+        type=str,
+        help="Specify a path to output .h5ad file",
+        default=10,
+    )
+    parser.add_argument(
+        "--delimiter",
+        type=str,
+        metavar="<str>",
+        default=None,
+        help="Specify sample delimiter",
+    )
+
+    return parser
+
+
+def check_10x_mtx_files(directory: str) -> bool:
+    """
+    Check if the directory contains the required 10x mtx files
+    """
+    required_files = ["matrix.mtx", "barcodes.tsv", "features.tsv"]
+    for file in required_files:
+        filepath = os.path.join(directory, file)
+        if not os.path.isfile(filepath) and not os.path.isfile(filepath + ".gz"):
+            logging.error(f"Missing required file: {file}")
+            return False
+    return True
+
+
+def main() -> None:
+    # Parse arguments
+    parser = init_parser()
+    args = parser.parse_args()
+
+    # Check if input is directory
+    if os.path.isdir(args.input):
+        # return error if any of the required files are missing
+        if not check_10x_mtx_files(args.input):
+            raise FileNotFoundError(
+                "The specified directory does not contain the required files: matrix.mtx, barcodes.tsv, and features.tsv"
+            )
+
+        # load 10x mtx file
+        logging.info("Loading 10x mtx file to AnnData object")
+        adata = sc.read_10x_mtx(
+            args.input,
+            var_names="gene_symbols",
+            gex_only=True,
+        )
+    else:
+        # get file extension
+        _, extension = os.path.splitext(args.input)
+
+        # read file based on extension using case match
+        match extension:
+            case ".h5":
+                logging.info("Loading .h5 file to AnnData object")
+                adata = sc.read_10x_h5(args.input, gex_only=True)
+            case ".mtx":
+                logging.info("Loading .mtx file to AnnData object")
+                adata = sc.read_mtx(args.input).T
+            case ".zarr":
+                logging.info("Loading .zarr file to AnnData object")
+                raise NotImplementedError(
+                    "Loading .zarr file is not implemented yet. Please provide a path to .h5 file or .mtx file."
+                )
+            case _:
+                raise ValueError(
+                    "Unsupported file format. Please provide a path to .h5 file, .mtx file or .mtx file directory."
+                )
+
+    # Add sample name to obs
+    logging.info("Adding sample name to obs")
+    adata.obs["sample"] = args.sample_id
+
+    # Add delimiter to obs index if specified
+    if args.delimiter:
+        logging.info("Adding delimiter to obs index")
+        adata.obs["barcode"] = adata.obs.index
+        adata.obs.index = adata.obs["barcode"] + args.delimiter + adata.obs["sample"]
+        adata.obs.index.name = "barcode_sample"
+
+    # Save adata abject
+    logging.info("Saving AnnData object to .h5ad file")
+    adata.write_h5ad(args.output)
+
+
+# Run the conversion only when the file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()
diff --git a/modules/sanger/anndatautils/toh5ad/tests/main.nf.test b/modules/sanger/anndatautils/toh5ad/tests/main.nf.test
@@ -0,0 +1,114 @@
+// nf-test specification for the ANNDATAUTILS_TOH5AD process: converts a .h5 file
+// and a 10x mtx directory for real, then checks both inputs in a -stub run
+nextflow_process {
+
+    name "Test Process ANNDATAUTILS_TOH5AD"
+    script "../main.nf"
+    process "ANNDATAUTILS_TOH5AD"
+
+    tag "modules"
+    tag "modules_sanger"
+    tag "anndatautils" 
+    tag "anndatautils/toh5ad"
+
+    test("Conversion of .h5 file to .h5ad") {
+        when {
+            process {
+                """
+                input[0] = ['pbmc_1k_h5', file(params.test_data_base + "anndatautils/pbmc_1k.h5")]
+                input[1] = "___"
+                """
+            }
+        }
+
+        then {
+            // basic run health
+            assert process.success
+            assert process.exitStatus == 0
+
+            // h5ad files: the tuple holds the sample ID and a file named after it
+            assert process.out.h5ad
+            assert process.out.h5ad.get(0).get(0) ==~ "pbmc_1k_h5"
+            assert process.out.h5ad.get(0).get(1) ==~ ".*/pbmc_1k_h5.h5ad"
+
+            // versions.yml
+            assert process.out.versions
+            assert process.out.versions.get(0) ==~ ".*/versions.yml"
+
+            // Snapshot all output channels of a process
+            assert snapshot(process.out).match()
+
+        }
+
+    }
+
+    test("Conversion of 10x .mtx file to .h5ad") {
+        when {
+            process {
+                """
+                input[0] = ['pbmc_1k_mtx', file(params.test_data_base + "anndatautils/raw_feature_bc_matrix")]
+                input[1] = "___"
+                """
+            }
+        }
+
+        then {
+            // basic run health
+            assert process.success
+            assert process.exitStatus == 0
+
+            // h5ad files: the tuple holds the sample ID and a file named after it
+            assert process.out.h5ad
+            assert process.out.h5ad.get(0).get(0) ==~ "pbmc_1k_mtx"
+            assert process.out.h5ad.get(0).get(1) ==~ ".*/pbmc_1k_mtx.h5ad"
+
+            // versions.yml
+            assert process.out.versions
+            assert process.out.versions.get(0) ==~ ".*/versions.yml"
+
+            // Snapshot all output channels of a process
+            assert snapshot(process.out).match()
+
+        }
+
+    }
+
+    test("Run -stub for .mtx and .h5 files") {
+
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = Channel.fromList([
+                    ['pbmc_1k_h5', file(params.test_data_base + "anndatautils/pbmc_1k.h5")],
+                    ['pbmc_1k_mtx', file(params.test_data_base + "anndatautils/raw_feature_bc_matrix")]
+                ])
+                input[1] = Channel.fromList(["___", "___"])
+                """
+            }
+        }
+
+        then {
+            // basic run health: one successful task per input tuple
+            assert process.success
+            assert process.exitStatus == 0
+            assert process.trace.succeeded().size() == 2
+
+            // h5ad files
+            assert process.out.h5ad
+            assert process.out.h5ad.size() == 2
+
+            // versions.yml: one file emitted per task
+            assert process.out.versions
+            with(process.out.versions) {
+                assert size() == 2
+                assert get(0) ==~ ".*/versions.yml"
+                assert get(1) ==~ ".*/versions.yml"
+            }
+
+            // Snapshot all output channels of a process
+            assert snapshot(process.out).match()
+
+        }
+
+    }
+
+}