Merge pull request #1 from eporetsky/docker-merge

eporetsky · web-flow · commit d4478234da4e · 2025-04-21T23:58:07.000-07:00
# MutClust v0.1.2

## Major Changes
- Added Docker support for easy deployment and reproducibility
- Improved test coverage and stability
- Updated documentation with Docker usage examples
- Updated dependency specifications
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,18 @@
+.git
+.gitignore
+.pytest_cache
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+venv
+.env
+.venv
+*.egg-info
+dist
+build
+.DS_Store
+*.swp
+*.swo 
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,33 @@
+FROM ubuntu:20.04
+
+# Avoid interactive prompts from apt during the build
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies, then drop the apt package lists to keep the layer small
+RUN apt-get update && apt-get install -y \
+    python3.9 \
+    python3.9-dev \
+    python3-pip \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Point python3, python and pip at the Python 3.9 interpreter installed above
+RUN ln -sf /usr/bin/python3.9 /usr/bin/python3 && \
+    ln -sf /usr/bin/python3.9 /usr/bin/python && \
+    ln -sf /usr/bin/pip3 /usr/bin/pip
+
+# Create a working directory
+WORKDIR /app
+
+# Copy the project files (paths listed in .dockerignore are excluded)
+COPY . /app/
+
+# Install MutClust and its dependencies; use "python3 -m pip" so the upgraded pip is the one that runs
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir .
+
+# Create a directory for mounting data
+RUN mkdir /data
+
+# Run the mutclust CLI; arguments given to "docker run" are passed through to it
+ENTRYPOINT ["mutclust"]
diff --git a/README.md b/README.md
@@ -32,6 +32,20 @@ cd mutclust
 pip install .
 ```
 
+### Docker Installation
+
+For users who prefer containerized deployment, MutClust is available as a Docker container:
+
+```bash
+# Build the container
+docker build -t mutclust .
+
+# Run MutClust with your data
+docker run -v /path/to/your/data:/data mutclust --expression /data/your_expression.tsv --output /data/results
+```
+
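+The image is based on Ubuntu 20.04 with Python 3.9 and installs MutClust together with all of its dependencies. Mount your data directory to `/data` inside the container so that your input files are readable and results written under `/data` persist on the host.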
+
 ---
 
 ## Usage
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "MutClust"
-version = "0.1.1"
+version = "0.1.2"
 authors = [
     {name = "Elly Poretsky", email = "eporetsky@plantapp.org"},
 ]
@@ -15,16 +15,17 @@ keywords = ["bioinformatics", "coexpression", "mutual rank", "clustering", "leid
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
-    "Operating System :: OS Independent",
+    "Operating System :: POSIX :: Linux",
     "Topic :: Scientific/Engineering :: Bio-Informatics"
 ]
 dependencies = [
-    "numpy",
-    "pandas",
-    "pynetcor",
-    "python-igraph",
-    "goatools",
-    'importlib-metadata; python_version<"3.10"',
+    "numpy==2.0.2",
+    "pandas==2.2.3",
+    "pynetcor==0.1.1",
+    "python-igraph==0.11.8",
+    "goatools==1.4.12",
+    "scikit-learn==1.6.1",
+    'importlib-metadata==8.6.1; python_version<"3.10"',
 ]
 requires-python = ">=3.9"
 
@@ -34,17 +35,17 @@ find = { include = ["mutclust"]}
 
 [project.optional-dependencies]
 dev = [
-    "pytest",          # For running tests
-    "pytest-cov",      # For test coverage reports
-    "black",           # For code formatting
-    "flake8",          # For linting
-    "mypy",            # For type checking
-    "pre-commit"       # For managing pre-commit hooks
+    "pytest>=7.0.0",          # For running tests
+    "pytest-cov>=3.0.0",      # For test coverage reports
+    "black>=22.0.0",          # For code formatting
+    "flake8>=4.0.0",          # For linting
+    "mypy>=0.900",            # For type checking
+    "pre-commit>=2.0.0"       # For managing pre-commit hooks
 ]
 docs = [
-    "sphinx",          # For generating documentation
-    "sphinx-rtd-theme" # For the ReadTheDocs theme
+    "sphinx>=4.0.0",          # For generating documentation
+    "sphinx-rtd-theme>=1.0.0" # For the ReadTheDocs theme
 ]
 
 [project.scripts]
-mutclust = "mutclust.__main__:main"
+mutclust = "mutclust.__main__:main"
diff --git a/tests/test_pca_analysis.py b/tests/test_pca_analysis.py
@@ -0,0 +1,73 @@
+import pytest
+import pandas as pd
+import numpy as np
+from mutclust.pca_analysis import calculate_eigen_genes
+
+def test_empty_cluster_error():
+    """An empty cluster must make calculate_eigen_genes raise ValueError.
+
+    The error message is expected to match
+    "Cannot perform PCA on empty clusters".
+    """
+    # One gene measured in two samples.
+    samples = {'Sample1': [1.0], 'Sample2': [2.0]}
+    expr = pd.DataFrame(samples, index=['Gene1'])
+    # The second cluster is deliberately empty.
+    clusters = [['Gene1'], []]
+
+    with pytest.raises(ValueError, match="Cannot perform PCA on empty clusters"):
+        calculate_eigen_genes(expr, clusters)
+
+def test_single_gene_clusters():
+    """Single-gene clusters: each eigen-gene must equal that gene's expression.
+
+    Column 'Cluster_i' of the result is compared against the expression row
+    of the only gene in cluster i.
+    """
+    expr = pd.DataFrame(
+        {'Sample1': [1.0, 2.0], 'Sample2': [3.0, 4.0]},
+        index=['Gene1', 'Gene2'],
+    )
+    clusters = [['Gene1'], ['Gene2']]
+
+    result = calculate_eigen_genes(expr, clusters)
+
+    # Compare each cluster column against its single member gene.
+    for column, gene in (('Cluster_0', 'Gene1'), ('Cluster_1', 'Gene2')):
+        observed = result[column].values
+        expected = expr.loc[gene].values
+        assert np.allclose(observed, expected, rtol=1e-5)
+
+def test_large_dataset_parallel():
+    """Check output type, shape and column names on a 100-gene x 50-sample dataset.
+
+    Ten clusters of ten genes each are passed in; the result must be a DataFrame
+    of shape (n_samples, 10) with 'Cluster_i' columns whose values are not all
+    zero. Parallel execution itself is not asserted here.
+    """
+    n_genes = 100
+    n_samples = 50
+    n_clusters = 10
+
+    # Local generator: reproducible without touching NumPy's global RNG state.
+    rng = np.random.default_rng(42)
+    expression_data = pd.DataFrame(
+        rng.standard_normal((n_genes, n_samples)),
+        index=[f'Gene{i}' for i in range(n_genes)],
+        columns=[f'Sample{i}' for i in range(n_samples)],
+    )
+    gene_clusters = [
+        [f'Gene{i}' for i in range(start, start + 10)]
+        for start in range(0, n_genes, 10)
+    ]
+    expected_columns = [f'Cluster_{i}' for i in range(n_clusters)]
+
+    eigen_genes = calculate_eigen_genes(expression_data, gene_clusters)
+
+    assert isinstance(eigen_genes, pd.DataFrame)
+    assert eigen_genes.shape == (n_samples, n_clusters)
+    # Compare as plain lists so a mismatch is a clean assertion failure.
+    assert list(eigen_genes.columns) == expected_columns
+    for name in expected_columns:
+        assert len(eigen_genes[name]) == n_samples
+        assert not np.allclose(eigen_genes[name], 0)