Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
8efeb53
add ANNDATAUTILS_TOH5AD module with configuration and metadata
Apr 30, 2025
de529da
refactor input type in meta.yml and add initial tests for ANNDATAUTIL…
Apr 30, 2025
822a9b8
refactor meta.yml by removing unused tool entries and cleaning up inp…
Apr 30, 2025
0891679
add convert_to_h5ad script for loading and saving AnnData objects wit…
May 1, 2025
602d7e5
add Dockerfile for setting up Python environment with necessary depen…
May 1, 2025
0948d95
Add raw data files and update tests for h5ad conversion
May 1, 2025
460e82e
Enhance ANNDATAUTILS_TOH5AD process to emit versions.yml and h5ad output
May 1, 2025
a10b328
Refactor meta.yml to standardize keywords and improve input descripti…
May 1, 2025
f18c87f
Update ANNDATAUTILS_TOH5AD process to use wildcard for name matching …
May 1, 2025
fb2cfd5
Refactor tests for ANNDATAUTILS_TOH5AD process to improve clarity and…
May 2, 2025
24213a8
Fix quoting in script and stub sections of ANNDATAUTILS_TOH5AD proces…
May 2, 2025
fa36589
Add initial configuration for nf-test module with tests directory and…
May 2, 2025
292f451
Fix process name matching for ANNDATAUTILS_TOH5AD to ensure accurate …
May 2, 2025
1ddd9fe
Add raw data files for PBMC 1K dataset
May 2, 2025
0634482
Update nf-test configuration: set testsDir to current directory and a…
May 2, 2025
80c5f4c
Refactor ANNDATAUTILS_TOH5AD process name matching and update test da…
May 2, 2025
4e34040
Fix indentation and structure in meta.yml for input and output defini…
Claptar May 3, 2025
9c207ec
Update modules/sanger/anndatautils/toh5ad/meta.yml
Claptar May 3, 2025
e37e4c4
Changed typo in directory name
Claptar May 3, 2025
5ed50e3
Added tool description to meta.yml file
Claptar May 3, 2025
77ac7ce
fix doi line
Claptar May 3, 2025
992558b
Removed TODO comment from tests
Claptar May 3, 2025
7353d82
Fix output emit syntax in ANNDATAUTILS_TOH5AD process
Claptar May 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions modules/sanger/anndatautils/toh5ad/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Python 3.11 image with a uv-managed virtual environment holding scanpy,
# muon and Jupyter tooling.
FROM python:3.11-slim


# non-interactive mode
ENV DEBIAN_FRONTEND=noninteractive
ENV VENV_PATH="/env"
# `uv pip install` picks its target environment from VIRTUAL_ENV (or a .venv in
# the working directory), not from PATH, so point it at the venv explicitly.
ENV VIRTUAL_ENV="${VENV_PATH}"
# Put the venv first on PATH so `python` resolves to it at runtime.
ENV PATH="${VENV_PATH}/bin:$PATH"


# Update and install system dependencies
# --no-install-recommends keeps optional extras out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        build-essential \
        procps \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install uv package manager
RUN pip install --no-cache-dir uv

# Create a new environment with uv and install packages
# NOTE(review): packages are unpinned, so rebuilds may resolve different
# versions - consider pinning for reproducibility.
RUN uv venv "${VENV_PATH}" && \
    uv pip install --no-cache-dir \
        scanpy \
        numpy \
        pandas \
        muon \
        scikit-network \
        jupyterlab \
        notebook \
        tqdm \
        ipywidgets \
        papermill

# Copy Dockerfile to the container
# (keeps the build recipe inside the image, readable by every user)
COPY Dockerfile /docker/
RUN chmod -R 755 /docker
32 changes: 32 additions & 0 deletions modules/sanger/anndatautils/toh5ad/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@


// Converts one input (file or directory) to <sample_id>.h5ad by calling
// convert_to_h5ad.py, and records the anndata/scanpy versions in versions.yml.
process ANNDATAUTILS_TOH5AD {
    tag "Converting ${sample_id}'s file to .h5ad"
    container 'docker://quay.io/cellgeni/metacells-python:latest'

    input:
    // The input is staged inside an `input/` sub-directory of the task work dir.
    tuple val(sample_id), path(input, name: 'input/*')
    // Passed to the script as --delimiter.
    val(delimiter)

    output:
    tuple val(sample_id), path("${sample_id}.h5ad"), emit: h5ad
    path "versions.yml", emit: versions

    script:
    """
    convert_to_h5ad.py \
        --input "${input}" \
        --sample_id "${sample_id}" \
        --delimiter "${delimiter}" \
        --output "${sample_id}.h5ad"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        anndata: \$( python -c "import anndata; print(anndata.__version__)" )
        scanpy: \$( python -c "import scanpy; print(scanpy.__version__)" )
    END_VERSIONS
    """

    stub:
    // The stub writes the same versions.yml content as the real script, so a
    // stub run never publishes an empty versions file.
    """
    touch "${sample_id}.h5ad"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        anndata: \$( python -c "import anndata; print(anndata.__version__)" )
        scanpy: \$( python -c "import scanpy; print(scanpy.__version__)" )
    END_VERSIONS
    """
}
67 changes: 67 additions & 0 deletions modules/sanger/anndatautils/toh5ad/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
# Module metadata: tools used, input/output channel layout and maintainers.
name: "anndatautils_toh5ad"
description: A module to convert AnnData convertible files to AnnData object and save it as .h5ad file
keywords:
  - anndata
  - h5ad
  - conversion
tools:
  - anndata:
      description: |
        Anndata is a Python package for handling annotated data
        matrices in memory and on disk, positioned between pandas
        and xarray. anndata offers a broad range of computationally
        efficient features including, among others, sparse data support,
        lazy operations, and a PyTorch interface.
      homepage: https://anndata.readthedocs.io/en/stable/
      documentation: https://anndata.readthedocs.io/en/stable/
      doi: "10.21105/joss.04371"
      licence: ["BSD-3-Clause"]
  - scanpy:
      description: |
        Scanpy is a scalable toolkit for analyzing single-cell gene
        expression data built jointly with anndata. It includes preprocessing,
        visualization, clustering, trajectory inference and differential expression testing.
        The Python-based implementation efficiently deals with datasets of more than one million cells.
      homepage: https://scanpy.readthedocs.io/en/stable/
      documentation: https://scanpy.readthedocs.io/en/stable/
      doi: "10.1186/s13059-017-1382-0"
      licence: ["BSD-3-Clause"]

input:
  # First input channel: tuple of (sample_id, input)
  - - sample_id:
        type: string
        description: |
          Sample ID. It is stored in the `sample` column of `.obs`, appended to the
          barcodes in obs_names when a delimiter is given, and used to name the
          output file (`<sample_id>.h5ad`).
    - input:
        # The schema takes a single type; directories are accepted as well (see description).
        type: file
        description: |
          File or directory to be converted to an AnnData object: a 10x `.h5` file,
          a `.mtx` file, or a directory containing the 10x `matrix.mtx`, `barcodes.tsv`
          and `features.tsv` files (optionally gzipped).
          The first element of the tuple is the sample ID, and the second element is this path.
  # Second input channel: plain value
  - - delimiter:
        type: string
        description: |
          Delimiter used to create obs_names in the format barcode + delimiter + sample_id.
          If it is empty, obs_names are left unchanged.

output:
  - h5ad:
      - sample_id:
          type: string
          description: |
            Sample ID that was passed in the input tuple
      - "*.h5ad":
          type: file
          description: .h5ad file containing the AnnData object
          pattern: "*.h5ad"
  - versions:
      - versions.yml:
          type: file
          description: |
            YML file containing the versions of the software used to create the AnnData object
          pattern: "versions.yml"

authors:
  - "@claptar"
maintainers:
  - "@claptar"
12 changes: 12 additions & 0 deletions modules/sanger/anndatautils/toh5ad/module.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Resource and publishing settings for the ANNDATAUTILS_TOH5AD process.
process {
    // withName takes a regular expression, not a glob: ".*" allows any prefix
    // in front of the process name, whereas a bare leading "*" is not a valid regex.
    withName: '.*ANNDATAUTILS_TOH5AD' {
        queue = 'normal'
        cpus = 4
        // 16 GB on the first attempt, plus 32 GB for every further attempt.
        // NOTE(review): no errorStrategy/maxRetries is set in this block, so the
        // increase only applies if retries are enabled elsewhere - confirm.
        memory = { 16.GB + 32.GB * (task.attempt - 1) }
        // NOTE(review): params.publish_mode and outputDir are not defined in this
        // file - verify that the including configuration provides them.
        publishDir = [
            mode : params.publish_mode,
            path : "${outputDir}/adata/raw",
            overwrite: true
        ]
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3

import os
import sys
import logging
import argparse
import scanpy as sc


# Configure the root logger: INFO and above, "LEVEL: message" layout
logging.basicConfig(
    stream=sys.stdout,  # Send records to stdout instead of a file
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)


def init_parser() -> argparse.ArgumentParser:
    """
    Initialise argument parser for the script

    Returns:
        Parser with the required --input and --output paths and the optional
        --sample_id and --delimiter strings (both default to None).
    """
    parser = argparse.ArgumentParser(
        description="Script loads a 10x .h5 file, a .mtx file or a 10x mtx directory to AnnData object and saves it as .h5ad file"
    )
    parser.add_argument(
        "--input",
        type=str,
        metavar="<path>",
        help="Specify a path to input file",
        required=True,
    )
    parser.add_argument(
        "--sample_id",
        type=str,
        metavar="<str>",
        default=None,
        help="Specify sample name for the file",
    )
    # The output path has no sensible default, so it must be given explicitly
    parser.add_argument(
        "--output",
        metavar="<path>",
        type=str,
        help="Specify a path to output .h5ad file",
        required=True,
    )
    parser.add_argument(
        "--delimiter",
        type=str,
        metavar="<str>",
        default=None,
        help="Specify sample delimiter",
    )

    return parser


def check_10x_mtx_files(directory: str) -> bool:
    """
    Check if the directory contains the required 10x mtx files

    The matrix, barcodes and features files may each be plain or gzipped; the
    legacy ``genes.tsv`` name is accepted in place of ``features.tsv``.

    Args:
        directory: path to the directory to inspect

    Returns:
        True if every required file is present, False otherwise (the first
        missing file is logged as an error).
    """
    # Required file -> file names that satisfy the requirement
    required_files = {
        "matrix.mtx": ("matrix.mtx", "matrix.mtx.gz"),
        "barcodes.tsv": ("barcodes.tsv", "barcodes.tsv.gz"),
        "features.tsv": ("features.tsv", "features.tsv.gz", "genes.tsv"),
    }
    for file, accepted_names in required_files.items():
        is_present = any(
            os.path.isfile(os.path.join(directory, name)) for name in accepted_names
        )
        if not is_present:
            logging.error(f"Missing required file: {file}")
            return False
    return True


def main() -> None:
# Parse arguments
parser = init_parser()
args = parser.parse_args()

# Check if input is directory
if os.path.isdir(args.input):
# return error if any of the required files are missing
if not check_10x_mtx_files(args.input):
raise FileNotFoundError(
"The specified directory does not contain the required files: matrix.mtx, barcodes.tsv, and features.tsv"
)

# load 10x mtx file
logging.info("Loading 10x mtx file to AnnData object")
adata = sc.read_10x_mtx(
args.input,
var_names="gene_symbols",
gex_only=True,
)
else:
# get file extension
_, extension = os.path.splitext(args.input)

# read file based on extension using case match
match extension:
case ".h5":
logging.info("Loading .h5 file to AnnData object")
adata = sc.read_10x_h5(args.input, gex_only=True)
case ".mtx":
logging.info("Loading .mtx file to AnnData object")
adata = sc.read_mtx(args.input).T
case ".zarr":
logging.info("Loading .zarr file to AnnData object")
raise NotImplementedError(
"Loading .zarr file is not implemented yet. Please provide a path to .h5 file or .mtx file."
)
case _:
raise ValueError(
"Unsupported file format. Please provide a path to .h5 file, .mtx file or .mtx file directory."
)

# Add sample name to obs
logging.info("Adding sample name to obs")
adata.obs["sample"] = args.sample_id

# Add delimiter to obs index if specified
if args.delimiter:
logging.info("Adding delimiter to obs index")
adata.obs["barcode"] = adata.obs.index
adata.obs.index = adata.obs["barcode"] + args.delimiter + adata.obs["sample"]
adata.obs.index.name = "barcode_sample"

# Save adata abject
logging.info("Saving AnnData object to .h5ad file")
adata.write_h5ad(args.output)


if __name__ == "__main__":
main()
114 changes: 114 additions & 0 deletions modules/sanger/anndatautils/toh5ad/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// nf-test specification for ANNDATAUTILS_TOH5AD: one real run for a 10x .h5
// file, one for a 10x mtx directory, and one -stub run with both inputs.
nextflow_process {

    name "Test Process ANNDATAUTILS_TOH5AD"
    script "../main.nf"
    process "ANNDATAUTILS_TOH5AD"

    tag "modules"
    tag "modules_sanger"
    tag "anndatautils"
    tag "anndatautils/toh5ad"

    test("Conversion of .h5 file to .h5ad") {
        when {
            process {
                """
                input[0] = ['pbmc_1k_h5', file(params.test_data_base + "anndatautils/pbmc_1k.h5")]
                input[1] = "___"
                """
            }
        }

        then {
            // basic run health
            assert process.success
            assert process.exitStatus == 0

            // h5ad files (dots are escaped so they only match a literal ".")
            assert process.out.h5ad
            assert process.out.h5ad.get(0).get(0) == "pbmc_1k_h5"
            assert process.out.h5ad.get(0).get(1) ==~ ".*/pbmc_1k_h5\\.h5ad"

            // versions.yml
            assert process.out.versions
            assert process.out.versions.get(0) ==~ ".*/versions\\.yml"

            // Snapshot all output channels of a process
            assert snapshot(process.out).match()

        }

    }

    test("Conversion of 10x .mtx file to .h5ad") {
        when {
            process {
                """
                input[0] = ['pbmc_1k_mtx', file(params.test_data_base + "anndatautils/raw_feature_bc_matrix")]
                input[1] = "___"
                """
            }
        }

        then {
            // basic run health
            assert process.success
            assert process.exitStatus == 0

            // h5ad files (dots are escaped so they only match a literal ".")
            assert process.out.h5ad
            assert process.out.h5ad.get(0).get(0) == "pbmc_1k_mtx"
            assert process.out.h5ad.get(0).get(1) ==~ ".*/pbmc_1k_mtx\\.h5ad"

            // versions.yml
            assert process.out.versions
            assert process.out.versions.get(0) ==~ ".*/versions\\.yml"

            // Snapshot all output channels of a process
            assert snapshot(process.out).match()

        }

    }

    test("Run -stub for .mtx and .h5 files") {

        options "-stub"
        when {
            process {
                """
                input[0] = Channel.fromList([
                    ['pbmc_1k_h5', file(params.test_data_base + "anndatautils/pbmc_1k.h5")],
                    ['pbmc_1k_mtx', file(params.test_data_base + "anndatautils/raw_feature_bc_matrix")]
                ])
                input[1] = Channel.fromList(["___", "___"])
                """
            }
        }

        then {
            // basic run health
            assert process.success
            assert process.exitStatus == 0
            // one task per input tuple
            assert process.trace.succeeded().size() == 2

            // h5ad files
            assert process.out.h5ad
            assert process.out.h5ad.size() == 2

            // versions.yml
            assert process.out.versions
            with(process.out.versions) {
                assert size() == 2
                assert get(0) ==~ ".*/versions\\.yml"
                assert get(1) ==~ ".*/versions\\.yml"
            }

            // Snapshot all output channels of a process
            assert snapshot(process.out).match()

        }

    }

}
Loading