diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index a2de033..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitattributes b/.gitattributes index 73704d2..a894e29 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1 @@ -*.ipynb linguist-detectable=false \ No newline at end of file +*.ipynb linguist-detectable=false diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 5a8bab0..fe65630 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -10,22 +10,22 @@ on: jobs: pre-commit: - runs-on: macos-latest + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.10", "3.11", "3.12"] + + runs-on: ${{ matrix.os }} steps: - - name: Checkout code + - name: Checkout repository uses: actions/checkout@v3 - - name: Set up Python + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: - python-version: "3.10" - - - name: Upgrade pre-commit - run: pip install --upgrade pre-commit - - - name: Verify pre-commit installation - run: pre-commit --version + python-version: ${{ matrix.python-version }} + check-latest: true - name: Cache pip dependencies uses: actions/cache@v3 @@ -43,24 +43,19 @@ jobs: pip install torch_geometric shell: bash - - name: Install system dependencies - run: | - brew update - brew upgrade - - - name: Install R - uses: r-lib/actions/setup-r@v2 - with: - r-version: "latest" - - - name: Install R packages - run: | - Rscript -e "if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager', repos='https://cran.r-project.org')" - Rscript -e "install.packages(c('dplyr', 'jsonlite'), repos='https://cran.r-project.org')" - Rscript -e "BiocManager::install(c('impute', 'preprocessCore', 'GO.db', 'AnnotationDbi'), update=FALSE, ask=FALSE)" - Rscript -e "install.packages('SmCCNet', repos='https://cran.r-project.org')" - Rscript -e "install.packages('WGCNA', 
repos='https://cran.r-project.org')" - shell: bash - - # - name: Run Pre-Commit Checks - # run: pre-commit run --all-files --show-diff-on-failure + # - name: Install R + # uses: r-lib/actions/setup-r@v2 + # with: + # r-version: "latest" + + # - name: Install R packages + # run: | + # Rscript -e "if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager', repos='https://cran.r-project.org')" + # Rscript -e "install.packages(c('dplyr', 'jsonlite'), repos='https://cran.r-project.org')" + # Rscript -e "BiocManager::install(c('impute', 'preprocessCore', 'GO.db', 'AnnotationDbi'), update=FALSE, ask=FALSE)" + # Rscript -e "install.packages('SmCCNet', repos='https://cran.r-project.org')" + # Rscript -e "install.packages('WGCNA', repos='https://cran.r-project.org')" + # shell: bash + + - name: Run Pre-Commit Checks + run: pre-commit run --all-files --show-diff-on-failure diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 6b365e5..5b79ee4 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -11,7 +11,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] runs-on: ${{ matrix.os }} @@ -41,29 +41,29 @@ jobs: pip install torch_geometric shell: bash - - name: Install R - uses: r-lib/actions/setup-r@v2 - with: - r-version: "latest" + # - name: Install R + # uses: r-lib/actions/setup-r@v2 + # with: + # r-version: "latest" - - name: Install R packages - run: | - Rscript -e "if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager', repos='https://cran.r-project.org')" - Rscript -e "install.packages(c('dplyr', 'jsonlite'), repos='https://cran.r-project.org')" - Rscript -e "BiocManager::install(c('impute', 'preprocessCore', 'GO.db', 'AnnotationDbi'), update=FALSE, ask=FALSE)" - Rscript -e "install.packages('SmCCNet', repos='https://cran.r-project.org')" - Rscript 
-e "install.packages('WGCNA', repos='https://cran.r-project.org')" - shell: bash + # - name: Install R packages + # run: | + # Rscript -e "if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager', repos='https://cran.r-project.org')" + # Rscript -e "install.packages(c('dplyr', 'jsonlite'), repos='https://cran.r-project.org')" + # Rscript -e "BiocManager::install(c('impute', 'preprocessCore', 'GO.db', 'AnnotationDbi'), update=FALSE, ask=FALSE)" + # Rscript -e "install.packages('SmCCNet', repos='https://cran.r-project.org')" + # Rscript -e "install.packages('WGCNA', repos='https://cran.r-project.org')" + # shell: bash - - name: Run tests with pytest - run: | - find . -name ".coverage*" -delete - pytest --cov=bioneuralnet --cov-report=xml tests/ + # - name: Run tests with pytest + # run: | + # find . -name ".coverage*" -delete + # pytest --cov=bioneuralnet --cov-report=xml tests/ - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - files: ./coverage.xml - flags: unittests - name: codecov-umbrella + # - name: Upload coverage to Codecov + # uses: codecov/codecov-action@v3 + # with: + # token: ${{ secrets.CODECOV_TOKEN }} + # files: ./coverage.xml + # flags: unittests + # name: codecov-umbrella diff --git a/.gitignore b/.gitignore index 3b4a45e..083507e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,9 @@ bioneuralnet.egg-info .mypy_cache Quick Start.ipynb TCGA-BRCA_Datatest.ipynb - +DevNotes.md +TCGA-BRCA_Datatest_copy.ipynb +split # Other example data and tests not needed in the repo. 
Output** diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..37b3aad --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,62 @@ +exclude: ^docs/ + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + args: ['--maxkb=92160'] + + # - repo: https://github.com/psf/black + # rev: 24.10.0 + # hooks: + # - id: black + # name: Format Python code with Black + # language_version: python3 + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.14.1 + hooks: + - id: mypy + name: Static type checking with MyPy + args: [--ignore-missing-imports] + + - repo: local + hooks: + - id: remove-pyc-and-pycache + name: Remove .pyc files and __pycache__ directories + entry: bash -c "find . \( -path './.venv' -o -path './docs' -o -path './node_modules' \) -prune -o -type f -name '*.pyc' -exec rm -f {} + -o -type d -name '__pycache__' -exec rm -rf {} +" + language: system + stages: [pre-commit] + + - id: clean-coverage-files + name: Remove stale .coverage files + entry: bash -c "find . -name '.coverage*' -delete" + language: system + stages: [pre-commit] + + - id: check-forbidden-files + name: Prevent adding forbidden file types except in allowed directory + entry: | + bash -c ' + FILES="$(git diff --cached --name-only)" + if echo "$FILES" | grep -E "\.RData$"; then + echo "Forbidden file types detected (RData)!" + exit 1 + fi + if echo "$FILES" | grep -E "\.csv$" | grep -vE "^bioneuralnet/datasets/(example1|brca|monet)/"; then + echo "Forbidden CSV files detected (outside allowed folders)!" 
+ exit 1 + fi + ' + language: system + stages: [pre-commit] + +# - id: run-tests +# name: Run Tests with Pytest +# entry: pytest --ignore=docs/source/examples --cov=bioneuralnet --cov-report=term-missing || true +# language: system +# types: [python] diff --git a/CHANGELOG.md b/CHANGELOG.md index f7d172f..84b195f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to this project will be documented in this file. -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/). ## [0.2.0b2] - 2025-02-16 @@ -69,4 +69,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/). ## [1.0.1] to [1.0.7] - 2025-04-24 - **BUG**: A bug related to rdata files missing -- **New realease**: A new release will include documentation for the other updates. (1.1.0) \ No newline at end of file +- **New release**: A new release will include documentation for the other updates. (1.1.0) diff --git a/README.md b/README.md index 9aeae1f..2fd1240 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ ![GitHub Issues](https://img.shields.io/github/issues/UCD-BDLab/BioNeuralNet) ![GitHub Contributors](https://img.shields.io/github/contributors/UCD-BDLab/BioNeuralNet) ![Downloads](https://static.pepy.tech/badge/bioneuralnet) - [![Documentation](https://img.shields.io/badge/docs-read%20the%20docs-blue.svg)](https://bioneuralnet.readthedocs.io/en/latest/) ## Welcome to BioNeuralNet 1.0.7 ![BioNeuralNet Logo](assets/LOGO_WB.png) -BioNeuralNet is a robust Python framework for integrating multi-omics data with Graph Neural Networks (GNNs). +**BioNeuralNet** is a Python framework for integrating and analyzing multi-omics data using **Graph Neural Networks (GNNs)**. 
+It provides tools for network construction, embedding generation, clustering, and disease prediction, all within a modular, scalable, and reproducible pipeline. ![BioNeuralNet Workflow](assets/BioNeuralNet.png) @@ -30,10 +30,11 @@ BioNeuralNet is a robust Python framework for integrating multi-omics data with - [8. Contributing](#8-contributing) - [9. License](#9-license) - [10. Contact](#10-contact) +- [11. References](#11-references) ## 1. Installation -BioNeuralNet supports Python 3.10 and 3.11. +BioNeuralNet supports Python `3.10`, `3.11` and `3.12`. ### 1.1. Install BioNeuralNet ```bash @@ -58,115 +59,116 @@ For GPU acceleration, please refer to: - [PyTorch Geometric Installation Guide](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) -## 2. BioNeuralNet Core Features - -For an End-to-End example example of BioNeuralNet, see [BioNeuralNet Demo](https://bioneuralnet.readthedocs.io/en/latest/BioNeuralNet.html) +## 2. BioNeuralNet Core Features -**Network Embedding**: +For an end-to-end example of BioNeuralNet, see the [Quick Start](https://bioneuralnet.readthedocs.io/en/latest/Quick_Start.html) and [TCGA-BRCA Dataset](https://bioneuralnet.readthedocs.io/en/latest/TCGA-BRCA_Dataset.html) guides. +### [Network Embedding](https://bioneuralnet.readthedocs.io/en/latest/gnns.html) - Given a multi-omics network as input, BioNeuralNet can generate embeddings using Graph Neural Networks (GNNs). - Generate embeddings using methods such as **GCN**, **GAT**, **GraphSAGE**, and **GIN**. - Outputs can be obtained as native tensors or converted to pandas DataFrames for easy analysis and visualization. - Embeddings unlock numerous downstream applications, including disease prediction, enhanced subject representation, clustering, and more. 
-**Graph Clustering**: - -- Identify functional modules or communities using **correlated clustering methods** (e.g., CorrelatedPageRank, CorrelatedLouvain, HybridLouvain) that integrate phenotype correlation to extract biologically relevant modules [1]_. -- Clustering methods can be applied to any network represented allowing flexible analysis across different domains. -- All clustering components return either raw partitions dictionaries or induced subnetwork adjacency matrices (as DataFrames) for visualization. -- Use cases include, feature selection, biomarker discovery, and network-based analysis. - -**Downstream Tasks**: - -- **Subject Representation**: - - - Integrate node embeddings back into omics data to enrich subject-level profiles by weighting features with learned embedding. - - This embedding-enriched data can be used for downstream tasks such as disease prediction or biomarker discovery. - - The result can be returned as a DataFrame or a PyTorch tensor, fitting naturally into downstream analyses. - -- **Disease Prediction for Multi-Omics Network DPMON**: - - - Classification End-to-End pipeline for disease prediction using Graph Neural Network embeddings. - - DPMON supports hyperparameter tuning-when enabled, it finds the best for the given data. - - This approach, along with the native pandas integration across modules, ensures that BioNeuralNet can be easily incorporated into your analysis workflows. - -**Metrics**: - -- Several plotting funcctions to visualize networks, emebddings, variance distribution, cluster comparison, and more. -- Correlation based functions to compare clustersand omics data with the phenotype. - -**Utilities**: - -- **Filtering Functions**: - - - Network filtering allows users to select variance or zero-fraction filtering to an omics network. - - Reducing noise, and removing outliers. - -- **Data Conversion**: - - - Convert RData files both CSV and to Pandas DataFrame. For ease of integration for R-based workflows. 
- -**External Tools**: - +### [Graph Clustering](https://bioneuralnet.readthedocs.io/en/latest/clustering.html) +- Identify functional modules or communities using **correlated clustering methods** (e.g., `CorrelatedPageRank`, `CorrelatedLouvain`, `HybridLouvain`) that integrate phenotype correlation to extract biologically relevant modules [[1]](#1). +- Clustering methods can be applied to any network representation, allowing flexible analysis across different domains. +- All clustering components return either raw partition dictionaries or induced subnetwork adjacency matrices (as DataFrames) for visualization. +- Use cases include feature selection, biomarker discovery, and network-based analysis. + +### [Downstream Tasks](https://bioneuralnet.readthedocs.io/en/latest/downstream_tasks.html) + +#### Subject Representation +- Integrate node embeddings back into omics data to enrich subject-level profiles by weighting features with the learned embedding. +- This embedding-enriched data can be used for downstream tasks such as disease prediction or biomarker discovery. +- The result can be returned as a DataFrame or a PyTorch tensor, fitting naturally into downstream analyses. + +#### Disease Prediction for Multi-Omics Network (DPMON) [[2]](#2) +- End-to-end classification pipeline for disease prediction using Graph Neural Network embeddings. +- DPMON supports hyperparameter tuning; when enabled, it finds the best configuration for the given data. +- This approach, along with native pandas integration across modules, ensures that BioNeuralNet can be easily incorporated into your analysis workflows. + +### [Metrics](https://bioneuralnet.readthedocs.io/en/latest/metrics.html) +- Visualize embeddings, feature variance, clustering comparison, and network structure in 2D. +- Evaluate embedding quality and clustering relevance using correlation with phenotype. +- Performance benchmarking tools for classification tasks using various models. 
+- Useful for assessing feature importance, validating network structure, and comparing cluster outputs. + +### [Utilities](https://bioneuralnet.readthedocs.io/en/latest/utils.html) +- Build graphs using k-NN similarity, Pearson/Spearman correlation, RBF kernels, mutual information, or soft-thresholding. +- Filter and preprocess omics or clinical data by variance, correlation, random forest importance, or ANOVA F-test. +- Tools for network pruning, feature selection, and data cleaning. +- Quickly summarize datasets with variance, zero-fraction, expression level, or correlation overviews. +- Includes conversion tools for RData and integrated logging. + +### [External Tools](https://bioneuralnet.readthedocs.io/en/latest/external_tools/index.html) - **Graph Construction**: + - BioNeuralNet provides additional tools in the `bioneuralnet.external_tools` module. + - Includes support for **SmCCNet** (Sparse Multiple Canonical Correlation Network), an R-based tool for constructing phenotype-informed correlation networks [[3]](#3). + - These tools are optional but enhance BioNeuralNet’s graph construction capabilities and are recommended for more integrative or exploratory workflows. - - BioNeuralNet provides additional tools in the [External Tools](https://bioneuralnet.readthedocs.io/en/latest/external_tools/index.html) module. - - Allowing users to generate networks using R-based tools like WGCNA and SmCCNet. - - While optional, these tools enhance BioNeuralNet's capabilities and are recommended for comprehensive analysis. -## 3. Quick Example: SmCCNet + DPMON for Disease Prediction +## 3. Example: SmCCNet + DPMON for Disease Prediction ```python import pandas as pd -from bioneuralnet.datasets import DatasetLoader from bioneuralnet.external_tools import SmCCNet from bioneuralnet.downstream_task import DPMON +from bioneuralnet.datasets import DatasetLoader -# 1. 
Load dataset -loader = DatasetLoader("example1") -omics1, omics2, phenotype, clinical = loader.load_data() +# Step 1: Load your data or use one of the provided datasets +Example = DatasetLoader("example1") +omics_proteins = Example.data["X1"] +omics_metabolites = Example.data["X2"] +phenotype_data = Example.data["Y"] +clinical_data = Example.data["clinical_data"] -# 2. Generate adjacency matrix using SmCCNet +# Step 2: Network Construction smccnet = SmCCNet( - phenotype_df=phenotype, - omics_dfs=[omics1, omics2], - data_types=["genes", "proteins"], - kfold=3, - subSampNum=500, + phenotype_df=phenotype_data, + omics_dfs=[omics_proteins, omics_metabolites], + data_types=["protein", "metabolite"], + kfold=5, + summarization="PCA", ) -global_network, _ = smccnet.run() +global_network, clusters = smccnet.run() +print("Adjacency matrix generated.") -# 3. Run Disease Prediction using DPMON +# Step 3: Disease Prediction (DPMON) dpmon = DPMON( adjacency_matrix=global_network, - omics_list=[omics1, omics2], - phenotype_data=phenotype, - clinical_data=clinical, - tune=True, + omics_list=[omics_proteins, omics_metabolites], + phenotype_data=phenotype_data, + clinical_data=clinical_data, + model="GCN", ) -dpmon_predictions = dpmon.run() -print("Disease Predictions:\n", dpmon_predictions.head()) +predictions = dpmon.run() +print("Disease phenotype predictions:\n", predictions) ``` ## 4. 
Documentation and Tutorials -- Full documentation: [BioNeuralNet Documentation](https://bioneuralnet.readthedocs.io/en/latest/) +- **Full documentation**: [BioNeuralNet Documentation](https://bioneuralnet.readthedocs.io/en/latest/) + +- **Jupyter Notebook Examples**: + - [Quick Start](https://bioneuralnet.readthedocs.io/en/latest/Quick_Start.html) + - [TCGA-BRCA Dataset](https://bioneuralnet.readthedocs.io/en/latest/TCGA-BRCA_Dataset.html) + - Tutorials include: - - Multi-omics graph construction - - GNN embeddings for disease prediction - - Subject representation with integrated embeddings - - Clustering using Hybrid Louvain and Correlated PageRank + - Multi-omics graph construction. + - GNN embeddings for disease prediction. + - Subject representation with integrated embeddings. + - Clustering using Hybrid Louvain and Correlated PageRank. - API details are available in the [API Reference](https://bioneuralnet.readthedocs.io/en/latest/api.html). ## 5. Frequently Asked Questions (FAQ) -- **Does BioNeuralNet support GPU acceleration?** +- **Does BioNeuralNet support GPU acceleration?** Yes, install PyTorch with CUDA support. -- **Can I use my own omics network?** +- **Can I use my own omics network?** Yes, you can provide a custom network as an adjancy matrix instead of using SmCCNet. -- **What clustering methods are supported?** +- **What clustering methods are supported?** BioNeuralNet supports Correlated Louvain, Hybrid Louvain, and Correlated PageRank. For more FAQs, please visit our [FAQ page](https://bioneuralnet.readthedocs.io/en/latest/faq.html). @@ -184,11 +186,11 @@ BioNeuralNet integrates multiple open-source libraries. We acknowledge key depen - [**matplotlib**](https://github.com/matplotlib/matplotlib) - Data visualization. - [**cptac**](https://github.com/PNNL-CompBio/cptac) - Dataset handling for clinical proteomics. - [**python-louvain**](https://github.com/taynaud/python-louvain) - Community detection algorithms. 
+- [**statsmodels**](https://github.com/statsmodels/statsmodels) - Statistical models and hypothesis testing (e.g., ANOVA, regression). We also acknowledge R-based tools for external network construction: - [**SmCCNet**](https://github.com/UCD-BDLab/BioNeuralNet/tree/main/bioneuralnet/external_tools/smccnet) - Sparse multiple canonical correlation network. -- [**WGCNA**](https://cran.r-project.org/web/packages/WGCNA/) - Weighted gene co-expression network analysis. ## 7. Testing and Continuous Integration @@ -214,12 +216,11 @@ pytest ``` ### How to Contribute + - Fork the repository, create a new branch, and implement your changes. - Add tests and documentation for any new features. - Submit a pull request with a clear description of your changes. -For more details, see our [Contributing Guide](https://github.com/UCD-BDLab/BioNeuralNet/blob/main/CONTRIBUTING.md). - ## 9. License BioNeuralNet is distributed under the [MIT License](https://github.com/UCD-BDLab/BioNeuralNet/blob/main/LICENSE). @@ -228,3 +229,11 @@ BioNeuralNet is distributed under the [MIT License](https://github.com/UCD-BDLab - **Issues and Feature Requests:** [Open an Issue](https://github.com/UCD-BDLab/BioNeuralNet/issues) - **Email:** [vicente.ramos@ucdenver.edu](mailto:vicente.ramos@ucdenver.edu) + +## 11. References + +[1] Abdel-Hafiz, M., Najafi, M., et al. "Significant Subgraph Detection in Multi-omics Networks for Disease Pathway Identification." *Frontiers in Big Data*, 5 (2022). [DOI: 10.3389/fdata.2022.894632](https://doi.org/10.3389/fdata.2022.894632) + +[2] Hussein, S., Ramos, V., et al. "Learning from Multi-Omics Networks to Enhance Disease Prediction: An Optimized Network Embedding and Fusion Approach." In *2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)*, Lisbon, Portugal, 2024, pp. 4371-4378. [DOI: 10.1109/BIBM62325.2024.10822233](https://doi.org/10.1109/BIBM62325.2024.10822233) + +[3] Liu, W., Vu, T., Konigsberg, I. R., Pratte, K. 
A., Zhuang, Y., & Kechris, K. J. (2023). "Network-Based Integration of Multi-Omics Data for Biomarker Discovery and Phenotype Prediction." *Bioinformatics*, 39(5), btat204. [DOI: 10.1093/bioinformatics/btat204](https://doi.org/10.1093/bioinformatics/btat204) diff --git a/TCGA-BRCA_Datatest copy.ipynb b/TCGA-BRCA_Datatest copy.ipynb deleted file mode 100644 index e34cdae..0000000 --- a/TCGA-BRCA_Datatest copy.ipynb +++ /dev/null @@ -1,5438 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d182cc95", - "metadata": {}, - "source": [ - "# TCGA-BRCA Demo\n", - "\n", - "## Dataset Source\n", - "\n", - "- **Omics Data**: [FireHose BRCA](http://firebrowse.org/?cohort=BRCA)\n", - "- **Clinical and PAM50 Data**: [TCGAbiolinks](http://bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html)\n", - "\n", - "## Dataset Overview\n", - "\n", - "**Original Data**:\n", - "\n", - "- **Methylation**: 20,107 × 885\n", - "- **mRNA**: 18,321 × 1,212\n", - "- **miRNA**: 503 × 1,189\n", - "- **PAM50**: 1,087 × 1\n", - "- **Clinical**: 1,098 × 101\n", - "\n", - "- **Note: Omics matrices are features × samples; clinical matrices are samples × fields.**\n", - "\n", - "### PAM50 Subtype Counts (Original)\n", - "\n", - "- **LumA**: 419\n", - "- **LumB**: 140\n", - "- **Basal**: 130\n", - "- **Her2**: 46\n", - "- **Normal**: 34\n", - "\n", - "## Patients in Every Dataset\n", - "\n", - "- Total patients present in methylation, mRNA, miRNA, PAM50, and clinical: **769**\n", - "\n", - "## Final Shapes (Per-Patient)\n", - "\n", - "After aggregating multiple aliquots by mean, all modalities align on 769 patients:\n", - "\n", - "- **Methylation**: 769 × 20,107\n", - "- **mRNA**: 769 × 20,531\n", - "- **miRNA**: 769 × 503\n", - "- **PAM50**: 769 × 1\n", - "- **Clinical**: 769 × 119\n", - "\n", - "## Data Summary Table\n", - "\n", - "| Stage | Clinical | Methylation | miRNA | mRNA | PAM50 (Subtype Counts) | Notes |\n", - "| ------------------------------ | ----------- | ------------ | 
----------- | -------------- | -------------------------------------------------------------- | --------------------------------------- |\n", - "| **Original Raw Data** | 1,098 × 101 | 20,107 × 885 | 503 × 1,189 | 18,321 × 1,212 | LumA: 509
LumB: 209
Basal: 192
Her2: 82
Normal: 40 | Raw FireHose & TCGAbiolinks files |\n", - "| **Patient-Level Intersection** | 769 × 101 | 769 × 20,107 | 769 × 1,046 | 769 × 20,531 | LumA: 419
LumB: 140
Basal: 130
Her2: 46
Normal: 34 | Patients with complete data in all sets |\n", - "\n", - "## Reference Links\n", - "\n", - "- [FireHose BRCA](http://firebrowse.org/?cohort=BRCA)\n", - "- [TCGAbiolinks](http://bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html)\n", - "- [Direct Download BRCA](http://firebrowse.org/?cohort=BRCA&download_dialog=true)\n" - ] - }, - { - "cell_type": "markdown", - "id": "c9698b74", - "metadata": {}, - "source": [ - "## Lets take a look at the data from FireHose directly after download" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "9c0bda23", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mirna shape: (503, 1189), rna shape: (18321, 1212), meth shape: (20107, 885), clinical shape: (18, 1097)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TCGA-3C-AAAU-01TCGA-3C-AALI-01TCGA-3C-AALJ-01TCGA-3C-AALK-01TCGA-4H-AAAK-01TCGA-5L-AAT0-01TCGA-5L-AAT1-01TCGA-5T-A9QA-01TCGA-A1-A0SB-01TCGA-A1-A0SD-01...TCGA-BH-A0WA-01TCGA-E2-A105-01TCGA-E2-A106-01TCGA-E2-A107-01TCGA-E2-A108-01TCGA-E2-A109-01TCGA-E2-A10B-01TCGA-E2-A10C-01TCGA-E2-A10E-01TCGA-E2-A10F-01
gene
hsa-let-7a-113.12976512.91806913.01203313.14469713.41168413.31630113.44523013.72785013.60150413.598739...12.22513213.93813413.60985313.50829013.40635913.73064713.19842612.79335014.06026812.990403
hsa-let-7a-214.11793313.92230014.01000214.14172114.41351814.31091714.44855614.71455114.60869314.606942...13.23506514.93002114.60338914.52502614.40273514.71916614.20052313.79662315.04759214.006035
hsa-let-7a-313.14771412.91319413.02848313.15128113.42048113.32714413.44680613.73689113.61310513.606224...12.26197113.97201113.64327413.54998113.43873713.73207013.21236712.79335014.07497813.018659
hsa-let-7b14.59513514.51265713.41961214.66719614.43854814.57649314.61113715.09880516.50575815.638855...14.68491215.23045715.35765515.11201115.04031515.80677115.64591014.72410616.37074115.439239
hsa-let-7c8.4148909.6465369.31245511.51143111.69392711.13841911.2844469.19751413.39216411.419823...10.56569810.48374511.15905612.47334012.40582810.61371211.3954529.08720210.88552011.385638
\n", - "

5 rows × 1189 columns

\n", - "
" - ], - "text/plain": [ - " TCGA-3C-AAAU-01 TCGA-3C-AALI-01 TCGA-3C-AALJ-01 \\\n", - "gene \n", - "hsa-let-7a-1 13.129765 12.918069 13.012033 \n", - "hsa-let-7a-2 14.117933 13.922300 14.010002 \n", - "hsa-let-7a-3 13.147714 12.913194 13.028483 \n", - "hsa-let-7b 14.595135 14.512657 13.419612 \n", - "hsa-let-7c 8.414890 9.646536 9.312455 \n", - "\n", - " TCGA-3C-AALK-01 TCGA-4H-AAAK-01 TCGA-5L-AAT0-01 \\\n", - "gene \n", - "hsa-let-7a-1 13.144697 13.411684 13.316301 \n", - "hsa-let-7a-2 14.141721 14.413518 14.310917 \n", - "hsa-let-7a-3 13.151281 13.420481 13.327144 \n", - "hsa-let-7b 14.667196 14.438548 14.576493 \n", - "hsa-let-7c 11.511431 11.693927 11.138419 \n", - "\n", - " TCGA-5L-AAT1-01 TCGA-5T-A9QA-01 TCGA-A1-A0SB-01 \\\n", - "gene \n", - "hsa-let-7a-1 13.445230 13.727850 13.601504 \n", - "hsa-let-7a-2 14.448556 14.714551 14.608693 \n", - "hsa-let-7a-3 13.446806 13.736891 13.613105 \n", - "hsa-let-7b 14.611137 15.098805 16.505758 \n", - "hsa-let-7c 11.284446 9.197514 13.392164 \n", - "\n", - " TCGA-A1-A0SD-01 ... TCGA-BH-A0WA-01 TCGA-E2-A105-01 \\\n", - "gene ... \n", - "hsa-let-7a-1 13.598739 ... 12.225132 13.938134 \n", - "hsa-let-7a-2 14.606942 ... 13.235065 14.930021 \n", - "hsa-let-7a-3 13.606224 ... 12.261971 13.972011 \n", - "hsa-let-7b 15.638855 ... 14.684912 15.230457 \n", - "hsa-let-7c 11.419823 ... 
10.565698 10.483745 \n", - "\n", - " TCGA-E2-A106-01 TCGA-E2-A107-01 TCGA-E2-A108-01 \\\n", - "gene \n", - "hsa-let-7a-1 13.609853 13.508290 13.406359 \n", - "hsa-let-7a-2 14.603389 14.525026 14.402735 \n", - "hsa-let-7a-3 13.643274 13.549981 13.438737 \n", - "hsa-let-7b 15.357655 15.112011 15.040315 \n", - "hsa-let-7c 11.159056 12.473340 12.405828 \n", - "\n", - " TCGA-E2-A109-01 TCGA-E2-A10B-01 TCGA-E2-A10C-01 \\\n", - "gene \n", - "hsa-let-7a-1 13.730647 13.198426 12.793350 \n", - "hsa-let-7a-2 14.719166 14.200523 13.796623 \n", - "hsa-let-7a-3 13.732070 13.212367 12.793350 \n", - "hsa-let-7b 15.806771 15.645910 14.724106 \n", - "hsa-let-7c 10.613712 11.395452 9.087202 \n", - "\n", - " TCGA-E2-A10E-01 TCGA-E2-A10F-01 \n", - "gene \n", - "hsa-let-7a-1 14.060268 12.990403 \n", - "hsa-let-7a-2 15.047592 14.006035 \n", - "hsa-let-7a-3 14.074978 13.018659 \n", - "hsa-let-7b 16.370741 15.439239 \n", - "hsa-let-7c 10.885520 11.385638 \n", - "\n", - "[5 rows x 1189 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TCGA-3C-AAAU-01TCGA-3C-AALI-01TCGA-3C-AALJ-01TCGA-3C-AALK-01TCGA-4H-AAAK-01TCGA-5L-AAT0-01TCGA-5L-AAT1-01TCGA-5T-A9QA-01TCGA-A1-A0SB-01TCGA-A1-A0SD-01...TCGA-UL-AAZ6-01TCGA-UU-A93S-01TCGA-V7-A7HQ-01TCGA-W8-A86G-01TCGA-WT-AB41-01TCGA-WT-AB44-01TCGA-XX-A899-01TCGA-XX-A89A-01TCGA-Z7-A8R5-01TCGA-Z7-A8R6-01
gene
?|1001331444.0324893.2119313.5388863.5956712.7754301.995991NaN0.5503103.9391893.250628...-1.3248162.108558NaN2.475707NaNNaN3.8465744.4805241.1787472.783771
?|1001348693.6928294.1192733.2062373.4698733.8509793.7664893.4052983.1692523.8473463.501324...3.8451893.4439781.6225563.8450992.6574341.7039874.4222944.7694762.8665724.631075
?|103575.7046046.1242317.2695707.1685656.3959686.8361416.8579616.7490356.8627865.913201...7.0834707.0888294.9067667.0035475.7449095.4013687.1061776.0032136.4101737.388457
?|104318.6726949.13927910.4102759.7574509.5819229.65775310.11425610.4721859.3603679.933569...10.61668211.49505410.7497709.44641010.28224110.8745349.3504009.49729510.1551739.970921
?|15506010.2131109.0113439.2095069.1104878.0270838.1100237.7048656.2547418.1280526.387132...8.0524787.5162369.2807619.6313068.1372259.4605398.7386518.5564147.9776707.894918
\n", - "

5 rows × 1212 columns

\n", - "
" - ], - "text/plain": [ - " TCGA-3C-AAAU-01 TCGA-3C-AALI-01 TCGA-3C-AALJ-01 \\\n", - "gene \n", - "?|100133144 4.032489 3.211931 3.538886 \n", - "?|100134869 3.692829 4.119273 3.206237 \n", - "?|10357 5.704604 6.124231 7.269570 \n", - "?|10431 8.672694 9.139279 10.410275 \n", - "?|155060 10.213110 9.011343 9.209506 \n", - "\n", - " TCGA-3C-AALK-01 TCGA-4H-AAAK-01 TCGA-5L-AAT0-01 \\\n", - "gene \n", - "?|100133144 3.595671 2.775430 1.995991 \n", - "?|100134869 3.469873 3.850979 3.766489 \n", - "?|10357 7.168565 6.395968 6.836141 \n", - "?|10431 9.757450 9.581922 9.657753 \n", - "?|155060 9.110487 8.027083 8.110023 \n", - "\n", - " TCGA-5L-AAT1-01 TCGA-5T-A9QA-01 TCGA-A1-A0SB-01 \\\n", - "gene \n", - "?|100133144 NaN 0.550310 3.939189 \n", - "?|100134869 3.405298 3.169252 3.847346 \n", - "?|10357 6.857961 6.749035 6.862786 \n", - "?|10431 10.114256 10.472185 9.360367 \n", - "?|155060 7.704865 6.254741 8.128052 \n", - "\n", - " TCGA-A1-A0SD-01 ... TCGA-UL-AAZ6-01 TCGA-UU-A93S-01 \\\n", - "gene ... \n", - "?|100133144 3.250628 ... -1.324816 2.108558 \n", - "?|100134869 3.501324 ... 3.845189 3.443978 \n", - "?|10357 5.913201 ... 7.083470 7.088829 \n", - "?|10431 9.933569 ... 10.616682 11.495054 \n", - "?|155060 6.387132 ... 
8.052478 7.516236 \n", - "\n", - " TCGA-V7-A7HQ-01 TCGA-W8-A86G-01 TCGA-WT-AB41-01 \\\n", - "gene \n", - "?|100133144 NaN 2.475707 NaN \n", - "?|100134869 1.622556 3.845099 2.657434 \n", - "?|10357 4.906766 7.003547 5.744909 \n", - "?|10431 10.749770 9.446410 10.282241 \n", - "?|155060 9.280761 9.631306 8.137225 \n", - "\n", - " TCGA-WT-AB44-01 TCGA-XX-A899-01 TCGA-XX-A89A-01 \\\n", - "gene \n", - "?|100133144 NaN 3.846574 4.480524 \n", - "?|100134869 1.703987 4.422294 4.769476 \n", - "?|10357 5.401368 7.106177 6.003213 \n", - "?|10431 10.874534 9.350400 9.497295 \n", - "?|155060 9.460539 8.738651 8.556414 \n", - "\n", - " TCGA-Z7-A8R5-01 TCGA-Z7-A8R6-01 \n", - "gene \n", - "?|100133144 1.178747 2.783771 \n", - "?|100134869 2.866572 4.631075 \n", - "?|10357 6.410173 7.388457 \n", - "?|10431 10.155173 9.970921 \n", - "?|155060 7.977670 7.894918 \n", - "\n", - "[5 rows x 1212 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TCGA-3C-AAAU-01TCGA-3C-AALI-01TCGA-3C-AALJ-01TCGA-3C-AALK-01TCGA-4H-AAAK-01TCGA-5L-AAT0-01TCGA-5L-AAT1-01TCGA-5T-A9QA-01TCGA-A1-A0SB-01TCGA-A1-A0SE-01...TCGA-UL-AAZ6-01TCGA-UU-A93S-01TCGA-V7-A7HQ-01TCGA-W8-A86G-01TCGA-WT-AB41-01TCGA-WT-AB44-01TCGA-XX-A899-01TCGA-XX-A89A-01TCGA-Z7-A8R5-01TCGA-Z7-A8R6-01
Hybridization REF
Composite Element REFBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_Value...Beta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_ValueBeta_Value
A1BG0.4837161196760.6371912261310.6560923982420.6151944713570.6120803705110.4696007406780.5821882394220.666170730970.6599656119590.641701155202...0.6314132417240.649522943950.5965851695970.6155583576510.5808378802620.6158140233240.5898977949570.5726066361280.6178595861610.568150149265
A1CF0.2958272034920.4589729985710.4897252896380.6257652232430.5077365096650.5147708663260.5498509587290.3810386544480.8263121563930.606699429409...0.3834691928550.1833548539380.4039091613120.7169802550140.6131312950740.6650437132130.7051537253750.4948486860210.6918353871890.224696596211
A2BP10.1876998695910.2405158477040.2790878512260.4888885104740.4638454946350.5044508553530.4808858167450.6228323992160.4746788315630.339829506578...0.1305299155360.3198553107430.3355174560530.5121853966380.5635198068110.5073643246350.5205427471670.4125620685740.5221699781430.33955834608
A2LD10.629585513220.6662722886750.7556304999860.745751212870.6985157391240.7068127066610.7590173559960.6940109398850.8478375222560.786662091353...0.5874759953130.6679696423210.6891402110360.7913812835240.6804993231480.6604763600540.7457254204120.743900498750.7912299995770.637764188841
\n", - "

5 rows × 885 columns

\n", - "
" - ], - "text/plain": [ - " TCGA-3C-AAAU-01 TCGA-3C-AALI-01 TCGA-3C-AALJ-01 \\\n", - "Hybridization REF \n", - "Composite Element REF Beta_Value Beta_Value Beta_Value \n", - "A1BG 0.483716119676 0.637191226131 0.656092398242 \n", - "A1CF 0.295827203492 0.458972998571 0.489725289638 \n", - "A2BP1 0.187699869591 0.240515847704 0.279087851226 \n", - "A2LD1 0.62958551322 0.666272288675 0.755630499986 \n", - "\n", - " TCGA-3C-AALK-01 TCGA-4H-AAAK-01 TCGA-5L-AAT0-01 \\\n", - "Hybridization REF \n", - "Composite Element REF Beta_Value Beta_Value Beta_Value \n", - "A1BG 0.615194471357 0.612080370511 0.469600740678 \n", - "A1CF 0.625765223243 0.507736509665 0.514770866326 \n", - "A2BP1 0.488888510474 0.463845494635 0.504450855353 \n", - "A2LD1 0.74575121287 0.698515739124 0.706812706661 \n", - "\n", - " TCGA-5L-AAT1-01 TCGA-5T-A9QA-01 TCGA-A1-A0SB-01 \\\n", - "Hybridization REF \n", - "Composite Element REF Beta_Value Beta_Value Beta_Value \n", - "A1BG 0.582188239422 0.66617073097 0.659965611959 \n", - "A1CF 0.549850958729 0.381038654448 0.826312156393 \n", - "A2BP1 0.480885816745 0.622832399216 0.474678831563 \n", - "A2LD1 0.759017355996 0.694010939885 0.847837522256 \n", - "\n", - " TCGA-A1-A0SE-01 ... TCGA-UL-AAZ6-01 TCGA-UU-A93S-01 \\\n", - "Hybridization REF ... \n", - "Composite Element REF Beta_Value ... Beta_Value Beta_Value \n", - "A1BG 0.641701155202 ... 0.631413241724 0.64952294395 \n", - "A1CF 0.606699429409 ... 0.383469192855 0.183354853938 \n", - "A2BP1 0.339829506578 ... 0.130529915536 0.319855310743 \n", - "A2LD1 0.786662091353 ... 
0.587475995313 0.667969642321 \n", - "\n", - " TCGA-V7-A7HQ-01 TCGA-W8-A86G-01 TCGA-WT-AB41-01 \\\n", - "Hybridization REF \n", - "Composite Element REF Beta_Value Beta_Value Beta_Value \n", - "A1BG 0.596585169597 0.615558357651 0.580837880262 \n", - "A1CF 0.403909161312 0.716980255014 0.613131295074 \n", - "A2BP1 0.335517456053 0.512185396638 0.563519806811 \n", - "A2LD1 0.689140211036 0.791381283524 0.680499323148 \n", - "\n", - " TCGA-WT-AB44-01 TCGA-XX-A899-01 TCGA-XX-A89A-01 \\\n", - "Hybridization REF \n", - "Composite Element REF Beta_Value Beta_Value Beta_Value \n", - "A1BG 0.615814023324 0.589897794957 0.572606636128 \n", - "A1CF 0.665043713213 0.705153725375 0.494848686021 \n", - "A2BP1 0.507364324635 0.520542747167 0.412562068574 \n", - "A2LD1 0.660476360054 0.745725420412 0.74390049875 \n", - "\n", - " TCGA-Z7-A8R5-01 TCGA-Z7-A8R6-01 \n", - "Hybridization REF \n", - "Composite Element REF Beta_Value Beta_Value \n", - "A1BG 0.617859586161 0.568150149265 \n", - "A1CF 0.691835387189 0.224696596211 \n", - "A2BP1 0.522169978143 0.33955834608 \n", - "A2LD1 0.791229999577 0.637764188841 \n", - "\n", - "[5 rows x 885 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tcga-5l-aat0tcga-5l-aat1tcga-a1-a0sptcga-a2-a04vtcga-a2-a04ytcga-a2-a0cqtcga-a2-a1g4tcga-a2-a25atcga-a7-a0cdtcga-a7-a13g...tcga-s3-aa11tcga-s3-aa14tcga-s3-aa15tcga-ul-aaz6tcga-uu-a93stcga-v7-a7hqtcga-wt-ab44tcga-xx-a899tcga-xx-a89atcga-z7-a8r6
Hybridization REF
Composite Element REFvaluevaluevaluevaluevaluevaluevaluevaluevaluevalue...valuevaluevaluevaluevaluevaluevaluevaluevaluevalue
years_to_birth42634039536271446679...674751736375NaN466846
vital_status0001000000...0000100000
days_to_deathNaNNaNNaN1920NaNNaNNaNNaNNaNNaN...NaNNaNNaNNaN116NaNNaNNaNNaNNaN
days_to_last_followup14771471584NaN1099269559532761165718...421529525518NaN20338834674883256
\n", - "

5 rows × 1097 columns

\n", - "
" - ], - "text/plain": [ - " tcga-5l-aat0 tcga-5l-aat1 tcga-a1-a0sp tcga-a2-a04v \\\n", - "Hybridization REF \n", - "Composite Element REF value value value value \n", - "years_to_birth 42 63 40 39 \n", - "vital_status 0 0 0 1 \n", - "days_to_death NaN NaN NaN 1920 \n", - "days_to_last_followup 1477 1471 584 NaN \n", - "\n", - " tcga-a2-a04y tcga-a2-a0cq tcga-a2-a1g4 tcga-a2-a25a \\\n", - "Hybridization REF \n", - "Composite Element REF value value value value \n", - "years_to_birth 53 62 71 44 \n", - "vital_status 0 0 0 0 \n", - "days_to_death NaN NaN NaN NaN \n", - "days_to_last_followup 1099 2695 595 3276 \n", - "\n", - " tcga-a7-a0cd tcga-a7-a13g ... tcga-s3-aa11 \\\n", - "Hybridization REF ... \n", - "Composite Element REF value value ... value \n", - "years_to_birth 66 79 ... 67 \n", - "vital_status 0 0 ... 0 \n", - "days_to_death NaN NaN ... NaN \n", - "days_to_last_followup 1165 718 ... 421 \n", - "\n", - " tcga-s3-aa14 tcga-s3-aa15 tcga-ul-aaz6 tcga-uu-a93s \\\n", - "Hybridization REF \n", - "Composite Element REF value value value value \n", - "years_to_birth 47 51 73 63 \n", - "vital_status 0 0 0 1 \n", - "days_to_death NaN NaN NaN 116 \n", - "days_to_last_followup 529 525 518 NaN \n", - "\n", - " tcga-v7-a7hq tcga-wt-ab44 tcga-xx-a899 tcga-xx-a89a \\\n", - "Hybridization REF \n", - "Composite Element REF value value value value \n", - "years_to_birth 75 NaN 46 68 \n", - "vital_status 0 0 0 0 \n", - "days_to_death NaN NaN NaN NaN \n", - "days_to_last_followup 2033 883 467 488 \n", - "\n", - " tcga-z7-a8r6 \n", - "Hybridization REF \n", - "Composite Element REF value \n", - "years_to_birth 46 \n", - "vital_status 0 \n", - "days_to_death NaN \n", - "days_to_last_followup 3256 \n", - "\n", - "[5 rows x 1097 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import pandas as pd\n", - "from pathlib import Path\n", - "root = Path(\"/home/vicente/Github/BioNeuralNet/TCGA_BRCA_DATA\")\n", - "\n", - "mirna_raw = 
pd.read_csv(root/\"BRCA.miRseq_RPKM_log2.txt\", sep=\"\\t\",index_col=0,low_memory=False) \n", - "rna_raw = pd.read_csv(root / \"BRCA.uncv2.mRNAseq_RSEM_normalized_log2.txt\", sep=\"\\t\",index_col=0,low_memory=False)\n", - "meth_raw = pd.read_csv(root/\"BRCA.meth.by_mean.data.txt\", sep='\\t',index_col=0,low_memory=False)\n", - "clinical_raw = pd.read_csv(root / \"BRCA.clin.merged.picked.txt\",sep=\"\\t\", index_col=0, low_memory=False)\n", - "\n", - "print(f\"mirna shape: {mirna_raw.shape}, rna shape: {rna_raw.shape}, meth shape: {meth_raw.shape}, clinical shape: {clinical_raw.shape}\")\n", - "display(mirna_raw.head())\n", - "display(rna_raw.head())\n", - "display(meth_raw.head())\n", - "display(clinical_raw.head())" - ] - }, - { - "cell_type": "markdown", - "id": "aacae339", - "metadata": {}, - "source": [ - "## TCGA-BioLink\n", - "\n", - "This section demonstrates how to use the `TCGAbiolinks` R package to access and download clinical and molecular subtype data. It begins by ensuring `TCGAbiolinks` is installed, then loads the package. It retrieves PAM50 molecular subtype labels using `TCGAquery_subtype()` and writes them to a CSV file. Additionally, it downloads clinical data using `GDCquery_clinic()` and formats it with `GDCprepare_clinic()`, saving the result as another CSV file." 
- ] - }, - { - "cell_type": "markdown", - "id": "a445601f", - "metadata": {}, - "source": [ - "```R\n", - " # Install TCGAbiolinks\n", - " if (!requireNamespace(\"TCGAbiolinks\", quietly = TRUE)) {\n", - " if (!requireNamespace(\"BiocManager\", quietly = TRUE))\n", - " install.packages(\"BiocManager\")\n", - " BiocManager::install(\"TCGAbiolinks\")\n", - " }\n", - "\n", - " # Load the library\n", - " library(TCGAbiolinks)\n", - "\n", - " # Download PAM50 subtype labels\n", - " pam50_df <- TCGAquery_subtype(tumor = \"BRCA\")[ , c(\"patient\", \"BRCA_Subtype_PAM50\")]\n", - " write.csv(pam50_df, file = \"BRCA_PAM50_labels.csv\", row.names = FALSE, quote = FALSE)\n", - "\n", - " # Download clinical data\n", - " clin_raw <- GDCquery_clinic(project = \"TCGA-BRCA\", type = \"clinical\")\n", - " clin_df <- GDCprepare_clinic(clin_raw, clinical.info = \"patient\")\n", - " write.csv(clin_df, file = \"BRCA_clinical_data.csv\", row.names = FALSE, quote = FALSE)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "128f63dd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Initial shapes\n", - "meth: (20107, 885)\n", - "rna: (18321, 1212)\n", - "mirna: (503, 1189)\n", - "pam50: (1087, 1)\n", - "clinical TCGABioLinks: (1098, 101)\n", - "clinical FireHose: (1097, 18)\n", - "\n", - "After tranpose\n", - "meth: (885, 20107)\n", - "rna: (1212, 18321)\n", - "mirna: (1189, 503)\n", - "Patients in both clinical datasets: 1097\n", - "Combined Clinical shape (1097, 119)\n", - "Patients in every dataset: 769\n", - "\n", - "Final shapes:\n", - "meth: (863, 20107)\n", - "rna: (865, 18321)\n", - "mirna: (855, 503)\n", - "pam50: (769, 1)\n", - "clinical: (769, 119)\n", - "\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "# from Firehose\n", - "mirna = pd.read_csv(root/\"BRCA.miRseq_RPKM_log2.txt\", sep=\"\\t\",index_col=0,low_memory=False)\n", - "meth = 
pd.read_csv(root/\"BRCA.meth.by_mean.data.txt\", sep='\\t',index_col=0,low_memory=False) \n", - "rna = pd.read_csv(root / \"BRCA.uncv2.mRNAseq_RSEM_normalized_log2.txt\", sep=\"\\t\",index_col=0,low_memory=False)\n", - "clinical_firehose = pd.read_csv(root / \"BRCA.clin.merged.picked.txt\",sep=\"\\t\", index_col=0, low_memory=False).T\n", - "\n", - "# from TCGABiolinks\n", - "pam50 = pd.read_csv(root /\"BRCA_PAM50_labels.csv\",index_col=0)\n", - "clinical_biolinks = pd.read_csv(root /\"BRCA_clinical_data.csv\",index_col=1)\n", - "\n", - "print(\"Initial shapes\")\n", - "print(f\"meth: {meth.shape}\")\n", - "print(f\"rna: {rna.shape}\")\n", - "print(f\"mirna: {mirna.shape}\")\n", - "print(f\"pam50: {pam50.shape}\")\n", - "print(f\"clinical TCGABioLinks: {clinical_biolinks.shape}\")\n", - "print(f\"clinical FireHose: {clinical_firehose.shape}\")\n", - "\n", - "meth = meth.T\n", - "rna = rna.T\n", - "mirna = mirna.T\n", - "\n", - "print(\"\\nAfter tranpose\")\n", - "print(f\"meth: {meth.shape}\")\n", - "print(f\"rna: {rna.shape}\")\n", - "print(f\"mirna: {mirna.shape}\")\n", - "\n", - "def trim(idx):\n", - " return idx.to_series().str.extract(r'(^TCGA-\\w\\w-\\w\\w\\w\\w)')[0]\n", - "\n", - "meth.index = trim(meth.index)\n", - "rna.index = trim(rna.index)\n", - "mirna.index = trim(mirna.index)\n", - "pam50.index = pam50.index.str.upper()\n", - "clinical_biolinks.index = clinical_biolinks.index.str.upper()\n", - "clinical_firehose.index = clinical_firehose.index.str.upper()\n", - "\n", - "idx1 = clinical_biolinks.index\n", - "idx2 = clinical_firehose.index\n", - "\n", - "# intersection and unique counts\n", - "common = idx1.intersection(idx2)\n", - "only_in_1 = idx1.difference(idx2)\n", - "only_in_2 = idx2.difference(idx1)\n", - "\n", - "print(f\"Patients in both clinical datasets: {len(common)}\")\n", - "common = clinical_biolinks.index.intersection(clinical_firehose.index)\n", - "clinical_biolinks = clinical_biolinks.loc[common]\n", - "clinical_firehose = 
clinical_firehose.loc[common]\n", - "\n", - "clinical = pd.concat([clinical_biolinks, clinical_firehose], axis=1)\n", - "\n", - "print(f\"Combined Clinical shape {clinical.shape}\")\n", - "\n", - "common = sorted(set(meth.index) & set(rna.index) & set(mirna.index) & set(pam50.index) & set(clinical.index))\n", - "print(f\"Patients in every dataset: {len(common)}\")\n", - "\n", - "meth = meth.loc[common]\n", - "rna = rna.loc[common]\n", - "mirna = mirna.loc[common]\n", - "pam50 = pam50.loc[common]\n", - "clinical = clinical.loc[common]\n", - "\n", - "print(\"\\nFinal shapes:\")\n", - "print(f\"meth: {meth.shape}\")\n", - "print(f\"rna: {rna.shape}\")\n", - "print(f\"mirna: {mirna.shape}\")\n", - "print(f\"pam50: {pam50.shape}\")\n", - "print(f\"clinical: {clinical.shape}\\n\")" - ] - }, - { - "cell_type": "markdown", - "id": "32ba4b2c", - "metadata": {}, - "source": [ - "## Handling Multiple Aliquots per Sample\n", - "\n", - "This section addresses cases where some patients have multiple aliquots per sample in the `meth`, `rna`, and `mirna` datasets. It first identifies and counts patients with duplicate entries. Then, it coerces all data to numeric types and aggregates the duplicates by computing the mean across aliquots for each patient, ensuring only one row per patient. After aggregation, the datasets are aligned by keeping only the patients that are common across all five datasets (`meth`, `rna`, `mirna`, `pam50`, and `clinical`). The result is s set of matched samples ready for integrated analysis." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b841497a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "meth:\n", - "patients with >1 aliquot: 91\n", - "total duplicate rows: 94\n", - "\n", - "rna:\n", - "patients with >1 aliquot: 93\n", - "total duplicate rows: 96\n", - "\n", - "mirna:\n", - "patients with >1 aliquot: 84\n", - "total duplicate rows: 86\n", - "\n", - "Post-aggregation shapes:\n", - "meth: (769, 20107)\n", - "rna: (769, 18321)\n", - "mirna: (769, 503)\n", - "Patients in every dataset: 769\n", - "\n", - "Final shapes\n", - "meth: (769, 20107)\n", - "rna: (769, 18321)\n", - "mirna: (769, 503)\n", - "pam50: (769, 1)\n", - "clinical:(769, 119)\n" - ] - } - ], - "source": [ - "for name, df in [(\"meth\", meth), (\"rna\", rna), (\"mirna\", mirna)]:\n", - " counts = df.index.value_counts()\n", - " n_multiple = (counts > 1).sum()\n", - " total_duplicates = counts[counts > 1].sum() - n_multiple\n", - " \n", - " print(f\"{name}:\")\n", - " print(f\"patients with >1 aliquot: {n_multiple}\")\n", - " print(f\"total duplicate rows: {total_duplicates}\\n\")\n", - "\n", - "meth = meth.apply(pd.to_numeric, errors=\"coerce\")\n", - "rna = rna .apply(pd.to_numeric, errors=\"coerce\")\n", - "mirna = mirna.apply(pd.to_numeric, errors=\"coerce\")\n", - "\n", - "meth = meth.groupby(level=0).mean()\n", - "rna = rna.groupby(level=0).mean()\n", - "mirna = mirna.groupby(level=0).mean()\n", - "\n", - "# Now each has one row per patient\n", - "print(\"Post-aggregation shapes:\")\n", - "print(f\"meth: {meth.shape}\")\n", - "print(f\"rna: {rna.shape}\")\n", - "print(f\"mirna: {mirna.shape}\")\n", - "\n", - "common = sorted( set(meth.index) & set(rna.index) & set(mirna.index)& set(pam50.index) & set(clinical.index) )\n", - "print(f\"Patients in every dataset: {len(common)}\")\n", - "\n", - "meth = meth.loc[common]\n", - "rna = rna.loc[common]\n", - "mirna = mirna.loc[common]\n", - "pam50 = 
pam50.loc[common]\n", - "clinical = clinical.loc[common]\n", - "\n", - "print(\"\\nFinal shapes\")\n", - "print(f\"meth: {meth.shape}\")\n", - "print(f\"rna: {rna.shape}\")\n", - "print(f\"mirna: {mirna.shape}\")\n", - "print(f\"pam50: {pam50.shape}\")\n", - "print(f\"clinical:{clinical.shape}\")" - ] - }, - { - "cell_type": "markdown", - "id": "9d8dac23", - "metadata": {}, - "source": [ - "## Review the first few rows of each file" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "4f35bd67", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Hybridization REFComposite Element REFA1BGA1CFA2BP1A2LD1A2MA2ML1A4GALTA4GNTAAA1...ZWILCHZWINTZXDCZYG11AZYG11BZYXZZEF1ZZZ3psiTPTE22tAKR
0
TCGA-3C-AAAUNaN0.4837160.2958270.1877000.6295860.5596540.8354120.4848000.6902170.807805...0.1129780.0539390.2876650.3280870.5029350.2206830.4820440.1073960.2473040.506404
TCGA-3C-AALINaN0.6371910.4589730.2405160.6662720.6075050.8423910.5500470.7498900.395290...0.1118340.0461600.2653220.4058510.4340240.2363620.4588470.1196520.1630220.623865
TCGA-3C-AALJNaN0.6560920.4897250.2790880.7556300.6623600.8290200.4761070.6537560.795102...0.1132180.0426570.2721030.3913260.4495250.2109760.4826410.1023850.2523280.504451
TCGA-3C-AALKNaN0.6151940.6257650.4888890.7457510.7279820.8353650.5560160.6520050.816423...0.1451330.0470220.3012840.4103480.4465710.2201850.4859440.1129410.4719560.682468
TCGA-4H-AAAKNaN0.6120800.5077370.4638450.6985160.6923640.8023880.5048700.5311830.851114...0.1189280.0450570.3006470.3799980.4879290.2333240.4907360.1156460.3148770.744877
\n", - "

5 rows × 20107 columns

\n", - "
" - ], - "text/plain": [ - "Hybridization REF Composite Element REF A1BG A1CF A2BP1 \\\n", - "0 \n", - "TCGA-3C-AAAU NaN 0.483716 0.295827 0.187700 \n", - "TCGA-3C-AALI NaN 0.637191 0.458973 0.240516 \n", - "TCGA-3C-AALJ NaN 0.656092 0.489725 0.279088 \n", - "TCGA-3C-AALK NaN 0.615194 0.625765 0.488889 \n", - "TCGA-4H-AAAK NaN 0.612080 0.507737 0.463845 \n", - "\n", - "Hybridization REF A2LD1 A2M A2ML1 A4GALT A4GNT AAA1 \\\n", - "0 \n", - "TCGA-3C-AAAU 0.629586 0.559654 0.835412 0.484800 0.690217 0.807805 \n", - "TCGA-3C-AALI 0.666272 0.607505 0.842391 0.550047 0.749890 0.395290 \n", - "TCGA-3C-AALJ 0.755630 0.662360 0.829020 0.476107 0.653756 0.795102 \n", - "TCGA-3C-AALK 0.745751 0.727982 0.835365 0.556016 0.652005 0.816423 \n", - "TCGA-4H-AAAK 0.698516 0.692364 0.802388 0.504870 0.531183 0.851114 \n", - "\n", - "Hybridization REF ... ZWILCH ZWINT ZXDC ZYG11A ZYG11B \\\n", - "0 ... \n", - "TCGA-3C-AAAU ... 0.112978 0.053939 0.287665 0.328087 0.502935 \n", - "TCGA-3C-AALI ... 0.111834 0.046160 0.265322 0.405851 0.434024 \n", - "TCGA-3C-AALJ ... 0.113218 0.042657 0.272103 0.391326 0.449525 \n", - "TCGA-3C-AALK ... 0.145133 0.047022 0.301284 0.410348 0.446571 \n", - "TCGA-4H-AAAK ... 0.118928 0.045057 0.300647 0.379998 0.487929 \n", - "\n", - "Hybridization REF ZYX ZZEF1 ZZZ3 psiTPTE22 tAKR \n", - "0 \n", - "TCGA-3C-AAAU 0.220683 0.482044 0.107396 0.247304 0.506404 \n", - "TCGA-3C-AALI 0.236362 0.458847 0.119652 0.163022 0.623865 \n", - "TCGA-3C-AALJ 0.210976 0.482641 0.102385 0.252328 0.504451 \n", - "TCGA-3C-AALK 0.220185 0.485944 0.112941 0.471956 0.682468 \n", - "TCGA-4H-AAAK 0.233324 0.490736 0.115646 0.314877 0.744877 \n", - "\n", - "[5 rows x 20107 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gene?|100133144?|100134869?|10357?|10431?|155060?|26823?|340602?|388795?|390284?|391343...ZWINT|11130ZXDA|7789ZXDB|158586ZXDC|79364ZYG11A|440590ZYG11B|79699ZYX|7791ZZEF1|23140ZZZ3|26009psiTPTE22|387590
0
TCGA-3C-AAAU4.0324893.6928295.7046048.67269410.213110NaN0.785174-1.5365872.048201NaN...9.8641207.0178309.97696810.6956628.01398810.23885111.77612410.88793210.2051290.785174
TCGA-3C-AALI3.2119314.1192736.1242319.1392799.0113430.1210157.1709282.2910140.7060223.027968...9.9146825.9024388.80932910.3913747.6328319.23742212.42642810.3648488.6679739.855788
TCGA-3C-AALJ3.5388863.2062377.26957010.4102759.209506NaNNaN1.4435541.443554NaN...11.3056505.1439699.0606919.5864888.3742679.05578412.4143559.8809358.9929945.143969
TCGA-3C-AALK3.5956713.4698737.1685659.7574509.110487-1.273343NaN1.0487242.186215NaN...9.3849945.7820658.7739069.7546887.4547039.24641912.4745569.6094269.4530016.057699
TCGA-4H-AAAK2.7754303.8509796.3959689.5819228.027083-1.232769-1.2327691.5746831.574683NaN...9.3976065.6128308.72878910.0358813.8117389.59943811.9807479.7002929.7841477.548699
\n", - "

5 rows × 18321 columns

\n", - "
" - ], - "text/plain": [ - "gene ?|100133144 ?|100134869 ?|10357 ?|10431 ?|155060 \\\n", - "0 \n", - "TCGA-3C-AAAU 4.032489 3.692829 5.704604 8.672694 10.213110 \n", - "TCGA-3C-AALI 3.211931 4.119273 6.124231 9.139279 9.011343 \n", - "TCGA-3C-AALJ 3.538886 3.206237 7.269570 10.410275 9.209506 \n", - "TCGA-3C-AALK 3.595671 3.469873 7.168565 9.757450 9.110487 \n", - "TCGA-4H-AAAK 2.775430 3.850979 6.395968 9.581922 8.027083 \n", - "\n", - "gene ?|26823 ?|340602 ?|388795 ?|390284 ?|391343 ... \\\n", - "0 ... \n", - "TCGA-3C-AAAU NaN 0.785174 -1.536587 2.048201 NaN ... \n", - "TCGA-3C-AALI 0.121015 7.170928 2.291014 0.706022 3.027968 ... \n", - "TCGA-3C-AALJ NaN NaN 1.443554 1.443554 NaN ... \n", - "TCGA-3C-AALK -1.273343 NaN 1.048724 2.186215 NaN ... \n", - "TCGA-4H-AAAK -1.232769 -1.232769 1.574683 1.574683 NaN ... \n", - "\n", - "gene ZWINT|11130 ZXDA|7789 ZXDB|158586 ZXDC|79364 ZYG11A|440590 \\\n", - "0 \n", - "TCGA-3C-AAAU 9.864120 7.017830 9.976968 10.695662 8.013988 \n", - "TCGA-3C-AALI 9.914682 5.902438 8.809329 10.391374 7.632831 \n", - "TCGA-3C-AALJ 11.305650 5.143969 9.060691 9.586488 8.374267 \n", - "TCGA-3C-AALK 9.384994 5.782065 8.773906 9.754688 7.454703 \n", - "TCGA-4H-AAAK 9.397606 5.612830 8.728789 10.035881 3.811738 \n", - "\n", - "gene ZYG11B|79699 ZYX|7791 ZZEF1|23140 ZZZ3|26009 \\\n", - "0 \n", - "TCGA-3C-AAAU 10.238851 11.776124 10.887932 10.205129 \n", - "TCGA-3C-AALI 9.237422 12.426428 10.364848 8.667973 \n", - "TCGA-3C-AALJ 9.055784 12.414355 9.880935 8.992994 \n", - "TCGA-3C-AALK 9.246419 12.474556 9.609426 9.453001 \n", - "TCGA-4H-AAAK 9.599438 11.980747 9.700292 9.784147 \n", - "\n", - "gene psiTPTE22|387590 \n", - "0 \n", - "TCGA-3C-AAAU 0.785174 \n", - "TCGA-3C-AALI 9.855788 \n", - "TCGA-3C-AALJ 5.143969 \n", - "TCGA-3C-AALK 6.057699 \n", - "TCGA-4H-AAAK 7.548699 \n", - "\n", - "[5 rows x 18321 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
genehsa-let-7a-1hsa-let-7a-2hsa-let-7a-3hsa-let-7bhsa-let-7chsa-let-7dhsa-let-7ehsa-let-7f-1hsa-let-7f-2hsa-let-7g...hsa-mir-937hsa-mir-939hsa-mir-940hsa-mir-942hsa-mir-944hsa-mir-95hsa-mir-96hsa-mir-98hsa-mir-99ahsa-mir-99b
0
TCGA-3C-AAAU13.12976514.11793313.14771414.5951358.4148908.66592110.5217773.87939211.8248178.597744...0.906699-0.0933022.6722342.4674141.0442022.0442026.9066995.7546967.02460215.506461
TCGA-3C-AALI12.91806913.92230012.91319414.5126579.6465369.0036539.1317604.38695212.6788418.455144...1.579597-0.0833670.1390243.032109-0.6683310.3316705.9128706.4270667.88529913.626182
TCGA-3C-AALJ13.01203314.01000213.02848313.4196129.3124559.27694311.3957115.31469213.5302559.230563...3.270298-2.1891340.3958281.855261-0.3817780.7177576.6036576.8783017.58070415.013822
TCGA-3C-AALK13.14469714.14172113.15128114.66719611.5114318.38476310.3689814.15918212.6525598.471503...0.923965-0.660997-0.0760341.7984351.7984350.7984356.1813545.37792210.03161914.554783
TCGA-4H-AAAK13.41168414.41351813.42048114.43854811.6939278.45374710.7413714.49453713.0094998.381220...0.182950-0.624403-1.6244031.0760360.182950-0.3024754.3181105.10351610.07820114.650338
\n", - "

5 rows × 503 columns

\n", - "
" - ], - "text/plain": [ - "gene hsa-let-7a-1 hsa-let-7a-2 hsa-let-7a-3 hsa-let-7b \\\n", - "0 \n", - "TCGA-3C-AAAU 13.129765 14.117933 13.147714 14.595135 \n", - "TCGA-3C-AALI 12.918069 13.922300 12.913194 14.512657 \n", - "TCGA-3C-AALJ 13.012033 14.010002 13.028483 13.419612 \n", - "TCGA-3C-AALK 13.144697 14.141721 13.151281 14.667196 \n", - "TCGA-4H-AAAK 13.411684 14.413518 13.420481 14.438548 \n", - "\n", - "gene hsa-let-7c hsa-let-7d hsa-let-7e hsa-let-7f-1 hsa-let-7f-2 \\\n", - "0 \n", - "TCGA-3C-AAAU 8.414890 8.665921 10.521777 3.879392 11.824817 \n", - "TCGA-3C-AALI 9.646536 9.003653 9.131760 4.386952 12.678841 \n", - "TCGA-3C-AALJ 9.312455 9.276943 11.395711 5.314692 13.530255 \n", - "TCGA-3C-AALK 11.511431 8.384763 10.368981 4.159182 12.652559 \n", - "TCGA-4H-AAAK 11.693927 8.453747 10.741371 4.494537 13.009499 \n", - "\n", - "gene hsa-let-7g ... hsa-mir-937 hsa-mir-939 hsa-mir-940 \\\n", - "0 ... \n", - "TCGA-3C-AAAU 8.597744 ... 0.906699 -0.093302 2.672234 \n", - "TCGA-3C-AALI 8.455144 ... 1.579597 -0.083367 0.139024 \n", - "TCGA-3C-AALJ 9.230563 ... 3.270298 -2.189134 0.395828 \n", - "TCGA-3C-AALK 8.471503 ... 0.923965 -0.660997 -0.076034 \n", - "TCGA-4H-AAAK 8.381220 ... 
0.182950 -0.624403 -1.624403 \n", - "\n", - "gene hsa-mir-942 hsa-mir-944 hsa-mir-95 hsa-mir-96 hsa-mir-98 \\\n", - "0 \n", - "TCGA-3C-AAAU 2.467414 1.044202 2.044202 6.906699 5.754696 \n", - "TCGA-3C-AALI 3.032109 -0.668331 0.331670 5.912870 6.427066 \n", - "TCGA-3C-AALJ 1.855261 -0.381778 0.717757 6.603657 6.878301 \n", - "TCGA-3C-AALK 1.798435 1.798435 0.798435 6.181354 5.377922 \n", - "TCGA-4H-AAAK 1.076036 0.182950 -0.302475 4.318110 5.103516 \n", - "\n", - "gene hsa-mir-99a hsa-mir-99b \n", - "0 \n", - "TCGA-3C-AAAU 7.024602 15.506461 \n", - "TCGA-3C-AALI 7.885299 13.626182 \n", - "TCGA-3C-AALJ 7.580704 15.013822 \n", - "TCGA-3C-AALK 10.031619 14.554783 \n", - "TCGA-4H-AAAK 10.078201 14.650338 \n", - "\n", - "[5 rows x 503 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectsynchronous_malignancyajcc_pathologic_stagedays_to_diagnosislateralitycreated_datetimelast_known_disease_statustissue_or_organ_of_origindays_to_last_follow_upage_at_diagnosis...pathology_N_stagepathology_M_stagegenderdate_of_initial_pathologic_diagnosisdays_to_last_known_aliveradiation_therapyhistological_typenumber_of_lymph_nodesraceethnicity
TCGA-3C-AAAUTCGA-BRCANoStage X0.0LeftNaNNaNBreast, NOSNaN20211.0...nxmxfemale2004NaNnoinfiltrating lobular carcinoma4whitenot hispanic or latino
TCGA-3C-AALITCGA-BRCANoStage IIB0.0RightNaNNaNBreast, NOSNaN18538.0...n1am0female2003NaNyesinfiltrating ductal carcinoma1black or african americannot hispanic or latino
TCGA-3C-AALJTCGA-BRCANoStage IIB0.0RightNaNNaNBreast, NOSNaN22848.0...n1am0female2011NaNnoinfiltrating ductal carcinoma1black or african americannot hispanic or latino
TCGA-3C-AALKTCGA-BRCANoStage IA0.0RightNaNNaNBreast, NOSNaN19074.0...n0 (i+)m0female2011NaNnoinfiltrating ductal carcinoma0black or african americannot hispanic or latino
TCGA-4H-AAAKTCGA-BRCANoStage IIIA0.0LeftNaNNaNBreast, NOSNaN18371.0...n2am0female2013NaNnoinfiltrating lobular carcinoma4whitenot hispanic or latino
\n", - "

5 rows × 119 columns

\n", - "
" - ], - "text/plain": [ - " project synchronous_malignancy ajcc_pathologic_stage \\\n", - "TCGA-3C-AAAU TCGA-BRCA No Stage X \n", - "TCGA-3C-AALI TCGA-BRCA No Stage IIB \n", - "TCGA-3C-AALJ TCGA-BRCA No Stage IIB \n", - "TCGA-3C-AALK TCGA-BRCA No Stage IA \n", - "TCGA-4H-AAAK TCGA-BRCA No Stage IIIA \n", - "\n", - " days_to_diagnosis laterality created_datetime \\\n", - "TCGA-3C-AAAU 0.0 Left NaN \n", - "TCGA-3C-AALI 0.0 Right NaN \n", - "TCGA-3C-AALJ 0.0 Right NaN \n", - "TCGA-3C-AALK 0.0 Right NaN \n", - "TCGA-4H-AAAK 0.0 Left NaN \n", - "\n", - " last_known_disease_status tissue_or_organ_of_origin \\\n", - "TCGA-3C-AAAU NaN Breast, NOS \n", - "TCGA-3C-AALI NaN Breast, NOS \n", - "TCGA-3C-AALJ NaN Breast, NOS \n", - "TCGA-3C-AALK NaN Breast, NOS \n", - "TCGA-4H-AAAK NaN Breast, NOS \n", - "\n", - " days_to_last_follow_up age_at_diagnosis ... pathology_N_stage \\\n", - "TCGA-3C-AAAU NaN 20211.0 ... nx \n", - "TCGA-3C-AALI NaN 18538.0 ... n1a \n", - "TCGA-3C-AALJ NaN 22848.0 ... n1a \n", - "TCGA-3C-AALK NaN 19074.0 ... n0 (i+) \n", - "TCGA-4H-AAAK NaN 18371.0 ... 
n2a \n", - "\n", - " pathology_M_stage gender date_of_initial_pathologic_diagnosis \\\n", - "TCGA-3C-AAAU mx female 2004 \n", - "TCGA-3C-AALI m0 female 2003 \n", - "TCGA-3C-AALJ m0 female 2011 \n", - "TCGA-3C-AALK m0 female 2011 \n", - "TCGA-4H-AAAK m0 female 2013 \n", - "\n", - " days_to_last_known_alive radiation_therapy \\\n", - "TCGA-3C-AAAU NaN no \n", - "TCGA-3C-AALI NaN yes \n", - "TCGA-3C-AALJ NaN no \n", - "TCGA-3C-AALK NaN no \n", - "TCGA-4H-AAAK NaN no \n", - "\n", - " histological_type number_of_lymph_nodes \\\n", - "TCGA-3C-AAAU infiltrating lobular carcinoma 4 \n", - "TCGA-3C-AALI infiltrating ductal carcinoma 1 \n", - "TCGA-3C-AALJ infiltrating ductal carcinoma 1 \n", - "TCGA-3C-AALK infiltrating ductal carcinoma 0 \n", - "TCGA-4H-AAAK infiltrating lobular carcinoma 4 \n", - "\n", - " race ethnicity \n", - "TCGA-3C-AAAU white not hispanic or latino \n", - "TCGA-3C-AALI black or african american not hispanic or latino \n", - "TCGA-3C-AALJ black or african american not hispanic or latino \n", - "TCGA-3C-AALK black or african american not hispanic or latino \n", - "TCGA-4H-AAAK white not hispanic or latino \n", - "\n", - "[5 rows x 119 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "BRCA_Subtype_PAM50\n", - "LumA 419\n", - "LumB 140\n", - "Basal 130\n", - "Her2 46\n", - "Normal 34\n", - "Name: count, dtype: int64" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(meth.head())\n", - "display(rna.head())\n", - "display(mirna.head())\n", - "display(clinical.head())\n", - "display(pam50.value_counts())" - ] - }, - { - "cell_type": "markdown", - "id": "17f7d599", - "metadata": {}, - "source": [ - "## Preprocessing\n", - "\n", - "After reviewing the data above, we applied the following steps to the data before further analysis.\n", - "\n", - "1. 
Methylation (B -> M-value)\n", - " - Clip B-values to \[E, 1-E] and apply logit transform: M = log_2(B / (1-B)).\n", - " - Drop the original `Composite Element REF` column.\n", - "\n", - "2. mRNA & miRNA:\n", - " - Already in log_2 scale (RSEM normalized and RPKM).\n", - "\n", - "3. Quality Control:\n", - " - Count samples with all-zero rows in each modality.\n", - " - Compute NaN counts post-transformation, then replace all NaNs in the methylation, mRNA, and miRNA tables with 0 (clinical NaNs are kept).\n", - "\n", - "4. Column Name Cleaning:\n", - " - Replace all `-` and `|` characters with `_`.\n", - " - Replace `?` with `unknown`.\n", - "\n", - "5. Label Encoding:\n", - " - Map `PAM50` subtypes to integers: \n", - " - Normal = 0\n", - " - Basal = 1 \n", - " - Her2 = 2\n", - " - LumA = 3\n", - " - LumB = 4\n", - "\n", - "6. Alignment & Aggregation:\n", - " - Trim barcodes to patient level.\n", - " - Aggregate duplicate aliquots by mean per patient.\n", - " - Drop the `project` column from clinical.\n", - " - Subset all tables to the common patient set (no missing or all-zero samples).\n", - " - Set up a common index across all files.\n", - "\n", - "7. Final Output Shapes:\n", - " - Methylation M-value: 769 × 20,106\n", - " - mRNA (log_2): 769 × 18,321\n", - " - miRNA (log_2): 769 × 503\n", - " - PAM50 labels: 769 × 1\n", - " - Clinical covariates: 769 × 118" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5bb6450e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All zeros: meth: 0, rna: 0, mirna: 0\n", - "nan_meth: 0, nan_rna: 0, nan_mirna: 0, nan_clinical: 0, nan_pam50: 0\n", - "NaN counts after filling:\n", - "0 0 0 46476 0\n", - "new shapes: meth: (769, 20106), rna: (769, 18321), mirna: (769, 503), pam50: (769, 1), clinical: (769, 118)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Hybridization REFA1BGA1CFA2BP1A2LD1A2MA2ML1A4GALTA4GNTAAA1AAAS...ZWILCHZWINTZXDCZYG11AZYG11BZYXZZEF1ZZZ3psiTPTE22tAKR
patient
TCGA-3C-AAAU-0.094004-1.251175-2.1135850.7652620.3458962.343631-0.0877411.1557912.071436-2.650851...-2.972923-4.132523-1.308165-1.0341990.016935-1.820233-0.103662-3.055084-1.6057830.036955
TCGA-3C-AALI0.812517-0.237291-1.6588880.9974400.6302212.4181350.2897801.584114-0.613329-4.072465...-2.989465-4.369032-1.469365-0.549876-0.382967-1.691887-0.238022-2.879231-2.3601280.729981
TCGA-3C-AALJ0.931878-0.059301-1.3691041.6286170.9721302.277584-0.1379880.9169641.956230-3.781647...-2.969472-4.488190-1.419578-0.637297-0.292273-1.902991-0.100215-3.132087-1.5671040.025686
TCGA-3C-AALK0.6769130.741678-0.0641331.5524541.4202002.3431330.3246210.9058162.152928-3.894574...-2.558319-4.341028-1.213585-0.523013-0.309506-1.824419-0.081137-2.973455-0.1620041.103860
TCGA-4H-AAAK0.6579630.044649-0.2090041.2122101.1703042.0216280.0281030.1801842.515149-3.885526...-2.889175-4.405580-1.217950-0.706284-0.069670-1.716283-0.053464-2.934908-1.1215751.545812
\n", - "

5 rows × 20106 columns

\n", - "
" - ], - "text/plain": [ - "Hybridization REF A1BG A1CF A2BP1 A2LD1 A2M A2ML1 \\\n", - "patient \n", - "TCGA-3C-AAAU -0.094004 -1.251175 -2.113585 0.765262 0.345896 2.343631 \n", - "TCGA-3C-AALI 0.812517 -0.237291 -1.658888 0.997440 0.630221 2.418135 \n", - "TCGA-3C-AALJ 0.931878 -0.059301 -1.369104 1.628617 0.972130 2.277584 \n", - "TCGA-3C-AALK 0.676913 0.741678 -0.064133 1.552454 1.420200 2.343133 \n", - "TCGA-4H-AAAK 0.657963 0.044649 -0.209004 1.212210 1.170304 2.021628 \n", - "\n", - "Hybridization REF A4GALT A4GNT AAA1 AAAS ... ZWILCH \\\n", - "patient ... \n", - "TCGA-3C-AAAU -0.087741 1.155791 2.071436 -2.650851 ... -2.972923 \n", - "TCGA-3C-AALI 0.289780 1.584114 -0.613329 -4.072465 ... -2.989465 \n", - "TCGA-3C-AALJ -0.137988 0.916964 1.956230 -3.781647 ... -2.969472 \n", - "TCGA-3C-AALK 0.324621 0.905816 2.152928 -3.894574 ... -2.558319 \n", - "TCGA-4H-AAAK 0.028103 0.180184 2.515149 -3.885526 ... -2.889175 \n", - "\n", - "Hybridization REF ZWINT ZXDC ZYG11A ZYG11B ZYX ZZEF1 \\\n", - "patient \n", - "TCGA-3C-AAAU -4.132523 -1.308165 -1.034199 0.016935 -1.820233 -0.103662 \n", - "TCGA-3C-AALI -4.369032 -1.469365 -0.549876 -0.382967 -1.691887 -0.238022 \n", - "TCGA-3C-AALJ -4.488190 -1.419578 -0.637297 -0.292273 -1.902991 -0.100215 \n", - "TCGA-3C-AALK -4.341028 -1.213585 -0.523013 -0.309506 -1.824419 -0.081137 \n", - "TCGA-4H-AAAK -4.405580 -1.217950 -0.706284 -0.069670 -1.716283 -0.053464 \n", - "\n", - "Hybridization REF ZZZ3 psiTPTE22 tAKR \n", - "patient \n", - "TCGA-3C-AAAU -3.055084 -1.605783 0.036955 \n", - "TCGA-3C-AALI -2.879231 -2.360128 0.729981 \n", - "TCGA-3C-AALJ -3.132087 -1.567104 0.025686 \n", - "TCGA-3C-AALK -2.973455 -0.162004 1.103860 \n", - "TCGA-4H-AAAK -2.934908 -1.121575 1.545812 \n", - "\n", - "[5 rows x 20106 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
geneunknown_100133144unknown_100134869unknown_10357unknown_10431unknown_155060unknown_26823unknown_340602unknown_388795unknown_390284unknown_391343...ZWINT_11130ZXDA_7789ZXDB_158586ZXDC_79364ZYG11A_440590ZYG11B_79699ZYX_7791ZZEF1_23140ZZZ3_26009psiTPTE22_387590
patient
TCGA-3C-AAAU4.0324893.6928295.7046048.67269410.2131100.0000000.785174-1.5365872.0482010.000000...9.8641207.0178309.97696810.6956628.01398810.23885111.77612410.88793210.2051290.785174
TCGA-3C-AALI3.2119314.1192736.1242319.1392799.0113430.1210157.1709282.2910140.7060223.027968...9.9146825.9024388.80932910.3913747.6328319.23742212.42642810.3648488.6679739.855788
TCGA-3C-AALJ3.5388863.2062377.26957010.4102759.2095060.0000000.0000001.4435541.4435540.000000...11.3056505.1439699.0606919.5864888.3742679.05578412.4143559.8809358.9929945.143969
TCGA-3C-AALK3.5956713.4698737.1685659.7574509.110487-1.2733430.0000001.0487242.1862150.000000...9.3849945.7820658.7739069.7546887.4547039.24641912.4745569.6094269.4530016.057699
TCGA-4H-AAAK2.7754303.8509796.3959689.5819228.027083-1.232769-1.2327691.5746831.5746830.000000...9.3976065.6128308.72878910.0358813.8117389.59943811.9807479.7002929.7841477.548699
\n", - "

5 rows × 18321 columns

\n", - "
" - ], - "text/plain": [ - "gene unknown_100133144 unknown_100134869 unknown_10357 \\\n", - "patient \n", - "TCGA-3C-AAAU 4.032489 3.692829 5.704604 \n", - "TCGA-3C-AALI 3.211931 4.119273 6.124231 \n", - "TCGA-3C-AALJ 3.538886 3.206237 7.269570 \n", - "TCGA-3C-AALK 3.595671 3.469873 7.168565 \n", - "TCGA-4H-AAAK 2.775430 3.850979 6.395968 \n", - "\n", - "gene unknown_10431 unknown_155060 unknown_26823 unknown_340602 \\\n", - "patient \n", - "TCGA-3C-AAAU 8.672694 10.213110 0.000000 0.785174 \n", - "TCGA-3C-AALI 9.139279 9.011343 0.121015 7.170928 \n", - "TCGA-3C-AALJ 10.410275 9.209506 0.000000 0.000000 \n", - "TCGA-3C-AALK 9.757450 9.110487 -1.273343 0.000000 \n", - "TCGA-4H-AAAK 9.581922 8.027083 -1.232769 -1.232769 \n", - "\n", - "gene unknown_388795 unknown_390284 unknown_391343 ... \\\n", - "patient ... \n", - "TCGA-3C-AAAU -1.536587 2.048201 0.000000 ... \n", - "TCGA-3C-AALI 2.291014 0.706022 3.027968 ... \n", - "TCGA-3C-AALJ 1.443554 1.443554 0.000000 ... \n", - "TCGA-3C-AALK 1.048724 2.186215 0.000000 ... \n", - "TCGA-4H-AAAK 1.574683 1.574683 0.000000 ... 
\n", - "\n", - "gene ZWINT_11130 ZXDA_7789 ZXDB_158586 ZXDC_79364 ZYG11A_440590 \\\n", - "patient \n", - "TCGA-3C-AAAU 9.864120 7.017830 9.976968 10.695662 8.013988 \n", - "TCGA-3C-AALI 9.914682 5.902438 8.809329 10.391374 7.632831 \n", - "TCGA-3C-AALJ 11.305650 5.143969 9.060691 9.586488 8.374267 \n", - "TCGA-3C-AALK 9.384994 5.782065 8.773906 9.754688 7.454703 \n", - "TCGA-4H-AAAK 9.397606 5.612830 8.728789 10.035881 3.811738 \n", - "\n", - "gene ZYG11B_79699 ZYX_7791 ZZEF1_23140 ZZZ3_26009 \\\n", - "patient \n", - "TCGA-3C-AAAU 10.238851 11.776124 10.887932 10.205129 \n", - "TCGA-3C-AALI 9.237422 12.426428 10.364848 8.667973 \n", - "TCGA-3C-AALJ 9.055784 12.414355 9.880935 8.992994 \n", - "TCGA-3C-AALK 9.246419 12.474556 9.609426 9.453001 \n", - "TCGA-4H-AAAK 9.599438 11.980747 9.700292 9.784147 \n", - "\n", - "gene psiTPTE22_387590 \n", - "patient \n", - "TCGA-3C-AAAU 0.785174 \n", - "TCGA-3C-AALI 9.855788 \n", - "TCGA-3C-AALJ 5.143969 \n", - "TCGA-3C-AALK 6.057699 \n", - "TCGA-4H-AAAK 7.548699 \n", - "\n", - "[5 rows x 18321 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
genehsa_let_7a_1hsa_let_7a_2hsa_let_7a_3hsa_let_7bhsa_let_7chsa_let_7dhsa_let_7ehsa_let_7f_1hsa_let_7f_2hsa_let_7g...hsa_mir_937hsa_mir_939hsa_mir_940hsa_mir_942hsa_mir_944hsa_mir_95hsa_mir_96hsa_mir_98hsa_mir_99ahsa_mir_99b
patient
TCGA-3C-AAAU13.12976514.11793313.14771414.5951358.4148908.66592110.5217773.87939211.8248178.597744...0.906699-0.0933022.6722342.4674141.0442022.0442026.9066995.7546967.02460215.506461
TCGA-3C-AALI12.91806913.92230012.91319414.5126579.6465369.0036539.1317604.38695212.6788418.455144...1.579597-0.0833670.1390243.032109-0.6683310.3316705.9128706.4270667.88529913.626182
TCGA-3C-AALJ13.01203314.01000213.02848313.4196129.3124559.27694311.3957115.31469213.5302559.230563...3.270298-2.1891340.3958281.855261-0.3817780.7177576.6036576.8783017.58070415.013822
TCGA-3C-AALK13.14469714.14172113.15128114.66719611.5114318.38476310.3689814.15918212.6525598.471503...0.923965-0.660997-0.0760341.7984351.7984350.7984356.1813545.37792210.03161914.554783
TCGA-4H-AAAK13.41168414.41351813.42048114.43854811.6939278.45374710.7413714.49453713.0094998.381220...0.182950-0.624403-1.6244031.0760360.182950-0.3024754.3181105.10351610.07820114.650338
\n", - "

5 rows × 503 columns

\n", - "
" - ], - "text/plain": [ - "gene hsa_let_7a_1 hsa_let_7a_2 hsa_let_7a_3 hsa_let_7b \\\n", - "patient \n", - "TCGA-3C-AAAU 13.129765 14.117933 13.147714 14.595135 \n", - "TCGA-3C-AALI 12.918069 13.922300 12.913194 14.512657 \n", - "TCGA-3C-AALJ 13.012033 14.010002 13.028483 13.419612 \n", - "TCGA-3C-AALK 13.144697 14.141721 13.151281 14.667196 \n", - "TCGA-4H-AAAK 13.411684 14.413518 13.420481 14.438548 \n", - "\n", - "gene hsa_let_7c hsa_let_7d hsa_let_7e hsa_let_7f_1 hsa_let_7f_2 \\\n", - "patient \n", - "TCGA-3C-AAAU 8.414890 8.665921 10.521777 3.879392 11.824817 \n", - "TCGA-3C-AALI 9.646536 9.003653 9.131760 4.386952 12.678841 \n", - "TCGA-3C-AALJ 9.312455 9.276943 11.395711 5.314692 13.530255 \n", - "TCGA-3C-AALK 11.511431 8.384763 10.368981 4.159182 12.652559 \n", - "TCGA-4H-AAAK 11.693927 8.453747 10.741371 4.494537 13.009499 \n", - "\n", - "gene hsa_let_7g ... hsa_mir_937 hsa_mir_939 hsa_mir_940 \\\n", - "patient ... \n", - "TCGA-3C-AAAU 8.597744 ... 0.906699 -0.093302 2.672234 \n", - "TCGA-3C-AALI 8.455144 ... 1.579597 -0.083367 0.139024 \n", - "TCGA-3C-AALJ 9.230563 ... 3.270298 -2.189134 0.395828 \n", - "TCGA-3C-AALK 8.471503 ... 0.923965 -0.660997 -0.076034 \n", - "TCGA-4H-AAAK 8.381220 ... 
0.182950 -0.624403 -1.624403 \n", - "\n", - "gene hsa_mir_942 hsa_mir_944 hsa_mir_95 hsa_mir_96 hsa_mir_98 \\\n", - "patient \n", - "TCGA-3C-AAAU 2.467414 1.044202 2.044202 6.906699 5.754696 \n", - "TCGA-3C-AALI 3.032109 -0.668331 0.331670 5.912870 6.427066 \n", - "TCGA-3C-AALJ 1.855261 -0.381778 0.717757 6.603657 6.878301 \n", - "TCGA-3C-AALK 1.798435 1.798435 0.798435 6.181354 5.377922 \n", - "TCGA-4H-AAAK 1.076036 0.182950 -0.302475 4.318110 5.103516 \n", - "\n", - "gene hsa_mir_99a hsa_mir_99b \n", - "patient \n", - "TCGA-3C-AAAU 7.024602 15.506461 \n", - "TCGA-3C-AALI 7.885299 13.626182 \n", - "TCGA-3C-AALJ 7.580704 15.013822 \n", - "TCGA-3C-AALK 10.031619 14.554783 \n", - "TCGA-4H-AAAK 10.078201 14.650338 \n", - "\n", - "[5 rows x 503 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
synchronous_malignancyajcc_pathologic_stagedays_to_diagnosislateralitycreated_datetimelast_known_disease_statustissue_or_organ_of_origindays_to_last_follow_upage_at_diagnosisprimary_diagnosis...pathology_N_stagepathology_M_stagegenderdate_of_initial_pathologic_diagnosisdays_to_last_known_aliveradiation_therapyhistological_typenumber_of_lymph_nodesraceethnicity
patient
TCGA-3C-AAAUNoStage X0.0LeftNaNNaNBreast, NOSNaN20211.0Lobular carcinoma, NOS...nxmxfemale2004NaNnoinfiltrating lobular carcinoma4whitenot hispanic or latino
TCGA-3C-AALINoStage IIB0.0RightNaNNaNBreast, NOSNaN18538.0Infiltrating duct carcinoma, NOS...n1am0female2003NaNyesinfiltrating ductal carcinoma1black or african americannot hispanic or latino
TCGA-3C-AALJNoStage IIB0.0RightNaNNaNBreast, NOSNaN22848.0Infiltrating duct carcinoma, NOS...n1am0female2011NaNnoinfiltrating ductal carcinoma1black or african americannot hispanic or latino
TCGA-3C-AALKNoStage IA0.0RightNaNNaNBreast, NOSNaN19074.0Infiltrating duct carcinoma, NOS...n0 (i+)m0female2011NaNnoinfiltrating ductal carcinoma0black or african americannot hispanic or latino
TCGA-4H-AAAKNoStage IIIA0.0LeftNaNNaNBreast, NOSNaN18371.0Lobular carcinoma, NOS...n2am0female2013NaNnoinfiltrating lobular carcinoma4whitenot hispanic or latino
\n", - "

5 rows × 118 columns

\n", - "
" - ], - "text/plain": [ - " synchronous_malignancy ajcc_pathologic_stage days_to_diagnosis \\\n", - "patient \n", - "TCGA-3C-AAAU No Stage X 0.0 \n", - "TCGA-3C-AALI No Stage IIB 0.0 \n", - "TCGA-3C-AALJ No Stage IIB 0.0 \n", - "TCGA-3C-AALK No Stage IA 0.0 \n", - "TCGA-4H-AAAK No Stage IIIA 0.0 \n", - "\n", - " laterality created_datetime last_known_disease_status \\\n", - "patient \n", - "TCGA-3C-AAAU Left NaN NaN \n", - "TCGA-3C-AALI Right NaN NaN \n", - "TCGA-3C-AALJ Right NaN NaN \n", - "TCGA-3C-AALK Right NaN NaN \n", - "TCGA-4H-AAAK Left NaN NaN \n", - "\n", - " tissue_or_organ_of_origin days_to_last_follow_up \\\n", - "patient \n", - "TCGA-3C-AAAU Breast, NOS NaN \n", - "TCGA-3C-AALI Breast, NOS NaN \n", - "TCGA-3C-AALJ Breast, NOS NaN \n", - "TCGA-3C-AALK Breast, NOS NaN \n", - "TCGA-4H-AAAK Breast, NOS NaN \n", - "\n", - " age_at_diagnosis primary_diagnosis ... \\\n", - "patient ... \n", - "TCGA-3C-AAAU 20211.0 Lobular carcinoma, NOS ... \n", - "TCGA-3C-AALI 18538.0 Infiltrating duct carcinoma, NOS ... \n", - "TCGA-3C-AALJ 22848.0 Infiltrating duct carcinoma, NOS ... \n", - "TCGA-3C-AALK 19074.0 Infiltrating duct carcinoma, NOS ... \n", - "TCGA-4H-AAAK 18371.0 Lobular carcinoma, NOS ... 
\n", - "\n", - " pathology_N_stage pathology_M_stage gender \\\n", - "patient \n", - "TCGA-3C-AAAU nx mx female \n", - "TCGA-3C-AALI n1a m0 female \n", - "TCGA-3C-AALJ n1a m0 female \n", - "TCGA-3C-AALK n0 (i+) m0 female \n", - "TCGA-4H-AAAK n2a m0 female \n", - "\n", - " date_of_initial_pathologic_diagnosis days_to_last_known_alive \\\n", - "patient \n", - "TCGA-3C-AAAU 2004 NaN \n", - "TCGA-3C-AALI 2003 NaN \n", - "TCGA-3C-AALJ 2011 NaN \n", - "TCGA-3C-AALK 2011 NaN \n", - "TCGA-4H-AAAK 2013 NaN \n", - "\n", - " radiation_therapy histological_type \\\n", - "patient \n", - "TCGA-3C-AAAU no infiltrating lobular carcinoma \n", - "TCGA-3C-AALI yes infiltrating ductal carcinoma \n", - "TCGA-3C-AALJ no infiltrating ductal carcinoma \n", - "TCGA-3C-AALK no infiltrating ductal carcinoma \n", - "TCGA-4H-AAAK no infiltrating lobular carcinoma \n", - "\n", - " number_of_lymph_nodes race \\\n", - "patient \n", - "TCGA-3C-AAAU 4 white \n", - "TCGA-3C-AALI 1 black or african american \n", - "TCGA-3C-AALJ 1 black or african american \n", - "TCGA-3C-AALK 0 black or african american \n", - "TCGA-4H-AAAK 4 white \n", - "\n", - " ethnicity \n", - "patient \n", - "TCGA-3C-AAAU not hispanic or latino \n", - "TCGA-3C-AALI not hispanic or latino \n", - "TCGA-3C-AALJ not hispanic or latino \n", - "TCGA-3C-AALK not hispanic or latino \n", - "TCGA-4H-AAAK not hispanic or latino \n", - "\n", - "[5 rows x 118 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "pam50\n", - "3 419\n", - "4 140\n", - "1 130\n", - "2 46\n", - "0 34\n", - "Name: count, dtype: int64" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "def beta_to_m(df, eps=1e-6):\n", - " B = np.clip(df.values, eps, 1.0 - eps)\n", - " M = np.log2(B / (1 - B))\n", - " return pd.DataFrame(M, index=df.index, columns=df.columns)\n", - "\n", - "# find rows that are all 0s\n", - 
"zeros_meth = (meth == 0).all(axis=1).sum()\n", - "zeros_rna = (rna == 0).all(axis=1).sum()\n", - "zeros_mirna = (mirna == 0).all(axis=1).sum()\n", - "print(f\"All zeros: meth: {zeros_meth}, rna: {zeros_rna}, mirna: {zeros_mirna}\")\n", - "\n", - "# find rows with all nans\n", - "nan_meth = meth.isna().all(axis=1).sum()\n", - "nan_rna = rna.isna().all(axis=1).sum()\n", - "nan_mirna = mirna.isna().all(axis=1).sum()\n", - "nan_clinical = clinical.isna().all(axis=1).sum()\n", - "nan_pam50 = pam50.isna().all(axis=1).sum()\n", - "print(f\"nan_meth: {nan_meth}, nan_rna: {nan_rna}, nan_mirna: {nan_mirna}, nan_clinical: {nan_clinical}, nan_pam50: {nan_pam50}\")\n", - "\n", - "# map PAM50 subtypes to integers\n", - "mapping = {\"Normal\":0, \"Basal\":1, \"Her2\":2, \"LumA\":3, \"LumB\":4}\n", - "pam50 = pam50[\"BRCA_Subtype_PAM50\"].map(mapping).to_frame(name=\"pam50\")\n", - "\n", - "# drop and transform methylation\n", - "meth_clean = meth.drop(columns=[\"Composite Element REF\"], errors=\"ignore\")\n", - "meth_m = beta_to_m(meth_clean)\n", - "clinical = clinical.drop(columns=[\"project\"], errors=\"ignore\")\n", - "\n", - "# clean column names and fill nans\n", - "for df in [meth_m, rna, mirna]:\n", - " df.columns = df.columns.str.replace(r\"\\?\", \"unknown_\", regex=True)\n", - " df.columns = df.columns.str.replace(r\"\\|\", \"_\", regex=True)\n", - " df.columns = df.columns.str.replace(\"-\", \"_\", regex=False)\n", - " df.columns = df.columns.str.replace(r\"_+\", \"_\", regex=True)\n", - " df.columns = df.columns.str.strip(\"_\")\n", - " df.fillna(0, inplace=True)\n", - "\n", - "# check for nans after filling\n", - "print(\"NaN counts after filling:\")\n", - "print(meth_m.isna().sum().sum(),rna.isna().sum().sum(),mirna.isna().sum().sum(),clinical.isna().sum().sum(),pam50.isna().sum().sum())\n", - "\n", - "# align index to PAM50\n", - "X_meth = meth_m.loc[pam50.index]\n", - "X_rna = rna.loc[pam50.index]\n", - "X_mirna = mirna.loc[pam50.index]\n", - "clinical= 
clinical.loc[pam50.index]\n", - "\n", - "print(f\"new shapes: meth: {X_meth.shape}, rna: {X_rna.shape}, mirna: {X_mirna.shape}, pam50: {pam50.shape}, clinical: {clinical.shape}\")\n", - "display(X_meth.head())\n", - "display(X_rna.head())\n", - "display(X_mirna.head())\n", - "display(clinical.head())\n", - "display(pam50.value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2f0714e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Setting up a commong index and saving to csv\n", - "X_meth.index.name = \"patient\"\n", - "X_rna.index.name = \"patient\"\n", - "X_mirna.index.name = \"patient\"\n", - "pam50.index.name = \"patient\"\n", - "clinical.index.name = \"patient\"\n", - "\n", - "X_meth.to_csv(root / \"meth.csv\", index=True)\n", - "X_rna.to_csv(root / \"rna.csv\", index=True)\n", - "X_mirna.to_csv(root / \"mirna.csv\", index=True)\n", - "pam50.to_csv(root / \"pam50.csv\", index=True)\n", - "clinical.to_csv(root / \"clinical.csv\", index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ef2982ef", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
A1BGA1CFA2BP1A2LD1A2MA2ML1A4GALTA4GNTAAA1AAAS...ZWILCHZWINTZXDCZYG11AZYG11BZYXZZEF1ZZZ3psiTPTE22tAKR
patient
TCGA-3C-AAAU-0.094004-1.251175-2.1135850.7652620.3458962.343631-0.0877411.1557912.071436-2.650851...-2.972923-4.132523-1.308165-1.0341990.016935-1.820233-0.103662-3.055084-1.6057830.036955
TCGA-3C-AALI0.812517-0.237291-1.6588880.9974400.6302212.4181350.2897801.584114-0.613329-4.072465...-2.989465-4.369032-1.469365-0.549876-0.382967-1.691887-0.238022-2.879231-2.3601280.729981
TCGA-3C-AALJ0.931878-0.059301-1.3691041.6286170.9721302.277584-0.1379880.9169641.956230-3.781647...-2.969472-4.488190-1.419578-0.637297-0.292273-1.902991-0.100215-3.132087-1.5671040.025686
TCGA-3C-AALK0.6769130.741678-0.0641331.5524541.4202002.3431330.3246210.9058162.152928-3.894574...-2.558319-4.341028-1.213585-0.523013-0.309506-1.824419-0.081137-2.973455-0.1620041.103860
TCGA-4H-AAAK0.6579630.044649-0.2090041.2122101.1703042.0216280.0281030.1801842.515149-3.885526...-2.889175-4.405580-1.217950-0.706284-0.069670-1.716283-0.053464-2.934908-1.1215751.545812
\n", - "

5 rows × 20106 columns

\n", - "
" - ], - "text/plain": [ - " A1BG A1CF A2BP1 A2LD1 A2M A2ML1 \\\n", - "patient \n", - "TCGA-3C-AAAU -0.094004 -1.251175 -2.113585 0.765262 0.345896 2.343631 \n", - "TCGA-3C-AALI 0.812517 -0.237291 -1.658888 0.997440 0.630221 2.418135 \n", - "TCGA-3C-AALJ 0.931878 -0.059301 -1.369104 1.628617 0.972130 2.277584 \n", - "TCGA-3C-AALK 0.676913 0.741678 -0.064133 1.552454 1.420200 2.343133 \n", - "TCGA-4H-AAAK 0.657963 0.044649 -0.209004 1.212210 1.170304 2.021628 \n", - "\n", - " A4GALT A4GNT AAA1 AAAS ... ZWILCH ZWINT \\\n", - "patient ... \n", - "TCGA-3C-AAAU -0.087741 1.155791 2.071436 -2.650851 ... -2.972923 -4.132523 \n", - "TCGA-3C-AALI 0.289780 1.584114 -0.613329 -4.072465 ... -2.989465 -4.369032 \n", - "TCGA-3C-AALJ -0.137988 0.916964 1.956230 -3.781647 ... -2.969472 -4.488190 \n", - "TCGA-3C-AALK 0.324621 0.905816 2.152928 -3.894574 ... -2.558319 -4.341028 \n", - "TCGA-4H-AAAK 0.028103 0.180184 2.515149 -3.885526 ... -2.889175 -4.405580 \n", - "\n", - " ZXDC ZYG11A ZYG11B ZYX ZZEF1 ZZZ3 \\\n", - "patient \n", - "TCGA-3C-AAAU -1.308165 -1.034199 0.016935 -1.820233 -0.103662 -3.055084 \n", - "TCGA-3C-AALI -1.469365 -0.549876 -0.382967 -1.691887 -0.238022 -2.879231 \n", - "TCGA-3C-AALJ -1.419578 -0.637297 -0.292273 -1.902991 -0.100215 -3.132087 \n", - "TCGA-3C-AALK -1.213585 -0.523013 -0.309506 -1.824419 -0.081137 -2.973455 \n", - "TCGA-4H-AAAK -1.217950 -0.706284 -0.069670 -1.716283 -0.053464 -2.934908 \n", - "\n", - " psiTPTE22 tAKR \n", - "patient \n", - "TCGA-3C-AAAU -1.605783 0.036955 \n", - "TCGA-3C-AALI -2.360128 0.729981 \n", - "TCGA-3C-AALJ -1.567104 0.025686 \n", - "TCGA-3C-AALK -0.162004 1.103860 \n", - "TCGA-4H-AAAK -1.121575 1.545812 \n", - "\n", - "[5 rows x 20106 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
unknown_100133144unknown_100134869unknown_10357unknown_10431unknown_155060unknown_26823unknown_340602unknown_388795unknown_390284unknown_391343...ZWINT_11130ZXDA_7789ZXDB_158586ZXDC_79364ZYG11A_440590ZYG11B_79699ZYX_7791ZZEF1_23140ZZZ3_26009psiTPTE22_387590
patient
TCGA-3C-AAAU4.0324893.6928295.7046048.67269410.2131100.0000000.785174-1.5365872.0482010.000000...9.8641207.0178309.97696810.6956628.01398810.23885111.77612410.88793210.2051290.785174
TCGA-3C-AALI3.2119314.1192736.1242319.1392799.0113430.1210157.1709282.2910140.7060223.027968...9.9146825.9024388.80932910.3913747.6328319.23742212.42642810.3648488.6679739.855788
TCGA-3C-AALJ3.5388863.2062377.26957010.4102759.2095060.0000000.0000001.4435541.4435540.000000...11.3056505.1439699.0606919.5864888.3742679.05578412.4143559.8809358.9929945.143969
TCGA-3C-AALK3.5956713.4698737.1685659.7574509.110487-1.2733430.0000001.0487242.1862150.000000...9.3849945.7820658.7739069.7546887.4547039.24641912.4745569.6094269.4530016.057699
TCGA-4H-AAAK2.7754303.8509796.3959689.5819228.027083-1.232769-1.2327691.5746831.5746830.000000...9.3976065.6128308.72878910.0358813.8117389.59943811.9807479.7002929.7841477.548699
\n", - "

5 rows × 18321 columns

\n", - "
" - ], - "text/plain": [ - " unknown_100133144 unknown_100134869 unknown_10357 \\\n", - "patient \n", - "TCGA-3C-AAAU 4.032489 3.692829 5.704604 \n", - "TCGA-3C-AALI 3.211931 4.119273 6.124231 \n", - "TCGA-3C-AALJ 3.538886 3.206237 7.269570 \n", - "TCGA-3C-AALK 3.595671 3.469873 7.168565 \n", - "TCGA-4H-AAAK 2.775430 3.850979 6.395968 \n", - "\n", - " unknown_10431 unknown_155060 unknown_26823 unknown_340602 \\\n", - "patient \n", - "TCGA-3C-AAAU 8.672694 10.213110 0.000000 0.785174 \n", - "TCGA-3C-AALI 9.139279 9.011343 0.121015 7.170928 \n", - "TCGA-3C-AALJ 10.410275 9.209506 0.000000 0.000000 \n", - "TCGA-3C-AALK 9.757450 9.110487 -1.273343 0.000000 \n", - "TCGA-4H-AAAK 9.581922 8.027083 -1.232769 -1.232769 \n", - "\n", - " unknown_388795 unknown_390284 unknown_391343 ... \\\n", - "patient ... \n", - "TCGA-3C-AAAU -1.536587 2.048201 0.000000 ... \n", - "TCGA-3C-AALI 2.291014 0.706022 3.027968 ... \n", - "TCGA-3C-AALJ 1.443554 1.443554 0.000000 ... \n", - "TCGA-3C-AALK 1.048724 2.186215 0.000000 ... \n", - "TCGA-4H-AAAK 1.574683 1.574683 0.000000 ... 
\n", - "\n", - " ZWINT_11130 ZXDA_7789 ZXDB_158586 ZXDC_79364 ZYG11A_440590 \\\n", - "patient \n", - "TCGA-3C-AAAU 9.864120 7.017830 9.976968 10.695662 8.013988 \n", - "TCGA-3C-AALI 9.914682 5.902438 8.809329 10.391374 7.632831 \n", - "TCGA-3C-AALJ 11.305650 5.143969 9.060691 9.586488 8.374267 \n", - "TCGA-3C-AALK 9.384994 5.782065 8.773906 9.754688 7.454703 \n", - "TCGA-4H-AAAK 9.397606 5.612830 8.728789 10.035881 3.811738 \n", - "\n", - " ZYG11B_79699 ZYX_7791 ZZEF1_23140 ZZZ3_26009 \\\n", - "patient \n", - "TCGA-3C-AAAU 10.238851 11.776124 10.887932 10.205129 \n", - "TCGA-3C-AALI 9.237422 12.426428 10.364848 8.667973 \n", - "TCGA-3C-AALJ 9.055784 12.414355 9.880935 8.992994 \n", - "TCGA-3C-AALK 9.246419 12.474556 9.609426 9.453001 \n", - "TCGA-4H-AAAK 9.599438 11.980747 9.700292 9.784147 \n", - "\n", - " psiTPTE22_387590 \n", - "patient \n", - "TCGA-3C-AAAU 0.785174 \n", - "TCGA-3C-AALI 9.855788 \n", - "TCGA-3C-AALJ 5.143969 \n", - "TCGA-3C-AALK 6.057699 \n", - "TCGA-4H-AAAK 7.548699 \n", - "\n", - "[5 rows x 18321 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
hsa_let_7a_1hsa_let_7a_2hsa_let_7a_3hsa_let_7bhsa_let_7chsa_let_7dhsa_let_7ehsa_let_7f_1hsa_let_7f_2hsa_let_7g...hsa_mir_937hsa_mir_939hsa_mir_940hsa_mir_942hsa_mir_944hsa_mir_95hsa_mir_96hsa_mir_98hsa_mir_99ahsa_mir_99b
patient
TCGA-3C-AAAU13.12976514.11793313.14771414.5951358.4148908.66592110.5217773.87939211.8248178.597744...0.906699-0.0933022.6722342.4674141.0442022.0442026.9066995.7546967.02460215.506461
TCGA-3C-AALI12.91806913.92230012.91319414.5126579.6465369.0036539.1317604.38695212.6788418.455144...1.579597-0.0833670.1390243.032109-0.6683310.3316705.9128706.4270667.88529913.626182
TCGA-3C-AALJ13.01203314.01000213.02848313.4196129.3124559.27694311.3957115.31469213.5302559.230563...3.270298-2.1891340.3958281.855261-0.3817780.7177576.6036576.8783017.58070415.013822
TCGA-3C-AALK13.14469714.14172113.15128114.66719611.5114318.38476310.3689814.15918212.6525598.471503...0.923965-0.660997-0.0760341.7984351.7984350.7984356.1813545.37792210.03161914.554783
TCGA-4H-AAAK13.41168414.41351813.42048114.43854811.6939278.45374710.7413714.49453713.0094998.381220...0.182950-0.624403-1.6244031.0760360.182950-0.3024754.3181105.10351610.07820114.650338
\n", - "

5 rows × 503 columns

\n", - "
" - ], - "text/plain": [ - " hsa_let_7a_1 hsa_let_7a_2 hsa_let_7a_3 hsa_let_7b \\\n", - "patient \n", - "TCGA-3C-AAAU 13.129765 14.117933 13.147714 14.595135 \n", - "TCGA-3C-AALI 12.918069 13.922300 12.913194 14.512657 \n", - "TCGA-3C-AALJ 13.012033 14.010002 13.028483 13.419612 \n", - "TCGA-3C-AALK 13.144697 14.141721 13.151281 14.667196 \n", - "TCGA-4H-AAAK 13.411684 14.413518 13.420481 14.438548 \n", - "\n", - " hsa_let_7c hsa_let_7d hsa_let_7e hsa_let_7f_1 hsa_let_7f_2 \\\n", - "patient \n", - "TCGA-3C-AAAU 8.414890 8.665921 10.521777 3.879392 11.824817 \n", - "TCGA-3C-AALI 9.646536 9.003653 9.131760 4.386952 12.678841 \n", - "TCGA-3C-AALJ 9.312455 9.276943 11.395711 5.314692 13.530255 \n", - "TCGA-3C-AALK 11.511431 8.384763 10.368981 4.159182 12.652559 \n", - "TCGA-4H-AAAK 11.693927 8.453747 10.741371 4.494537 13.009499 \n", - "\n", - " hsa_let_7g ... hsa_mir_937 hsa_mir_939 hsa_mir_940 \\\n", - "patient ... \n", - "TCGA-3C-AAAU 8.597744 ... 0.906699 -0.093302 2.672234 \n", - "TCGA-3C-AALI 8.455144 ... 1.579597 -0.083367 0.139024 \n", - "TCGA-3C-AALJ 9.230563 ... 3.270298 -2.189134 0.395828 \n", - "TCGA-3C-AALK 8.471503 ... 0.923965 -0.660997 -0.076034 \n", - "TCGA-4H-AAAK 8.381220 ... 
0.182950 -0.624403 -1.624403 \n", - "\n", - " hsa_mir_942 hsa_mir_944 hsa_mir_95 hsa_mir_96 hsa_mir_98 \\\n", - "patient \n", - "TCGA-3C-AAAU 2.467414 1.044202 2.044202 6.906699 5.754696 \n", - "TCGA-3C-AALI 3.032109 -0.668331 0.331670 5.912870 6.427066 \n", - "TCGA-3C-AALJ 1.855261 -0.381778 0.717757 6.603657 6.878301 \n", - "TCGA-3C-AALK 1.798435 1.798435 0.798435 6.181354 5.377922 \n", - "TCGA-4H-AAAK 1.076036 0.182950 -0.302475 4.318110 5.103516 \n", - "\n", - " hsa_mir_99a hsa_mir_99b \n", - "patient \n", - "TCGA-3C-AAAU 7.024602 15.506461 \n", - "TCGA-3C-AALI 7.885299 13.626182 \n", - "TCGA-3C-AALJ 7.580704 15.013822 \n", - "TCGA-3C-AALK 10.031619 14.554783 \n", - "TCGA-4H-AAAK 10.078201 14.650338 \n", - "\n", - "[5 rows x 503 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
synchronous_malignancyajcc_pathologic_stagedays_to_diagnosislateralitycreated_datetimelast_known_disease_statustissue_or_organ_of_origindays_to_last_follow_upage_at_diagnosisprimary_diagnosis...pathology_N_stagepathology_M_stagegender.1date_of_initial_pathologic_diagnosisdays_to_last_known_aliveradiation_therapyhistological_typenumber_of_lymph_nodesrace.1ethnicity.1
patient
TCGA-3C-AAAUNoStage X0.0LeftNaNNaNBreast, NOSNaN20211.0Lobular carcinoma, NOS...nxmxfemale2004.0NaNnoinfiltrating lobular carcinoma4.0whitenot hispanic or latino
TCGA-3C-AALINoStage IIB0.0RightNaNNaNBreast, NOSNaN18538.0Infiltrating duct carcinoma, NOS...n1am0female2003.0NaNyesinfiltrating ductal carcinoma1.0black or african americannot hispanic or latino
TCGA-3C-AALJNoStage IIB0.0RightNaNNaNBreast, NOSNaN22848.0Infiltrating duct carcinoma, NOS...n1am0female2011.0NaNnoinfiltrating ductal carcinoma1.0black or african americannot hispanic or latino
TCGA-3C-AALKNoStage IA0.0RightNaNNaNBreast, NOSNaN19074.0Infiltrating duct carcinoma, NOS...n0 (i+)m0female2011.0NaNnoinfiltrating ductal carcinoma0.0black or african americannot hispanic or latino
TCGA-4H-AAAKNoStage IIIA0.0LeftNaNNaNBreast, NOSNaN18371.0Lobular carcinoma, NOS...n2am0female2013.0NaNnoinfiltrating lobular carcinoma4.0whitenot hispanic or latino
\n", - "

5 rows × 118 columns

\n", - "
" - ], - "text/plain": [ - " synchronous_malignancy ajcc_pathologic_stage days_to_diagnosis \\\n", - "patient \n", - "TCGA-3C-AAAU No Stage X 0.0 \n", - "TCGA-3C-AALI No Stage IIB 0.0 \n", - "TCGA-3C-AALJ No Stage IIB 0.0 \n", - "TCGA-3C-AALK No Stage IA 0.0 \n", - "TCGA-4H-AAAK No Stage IIIA 0.0 \n", - "\n", - " laterality created_datetime last_known_disease_status \\\n", - "patient \n", - "TCGA-3C-AAAU Left NaN NaN \n", - "TCGA-3C-AALI Right NaN NaN \n", - "TCGA-3C-AALJ Right NaN NaN \n", - "TCGA-3C-AALK Right NaN NaN \n", - "TCGA-4H-AAAK Left NaN NaN \n", - "\n", - " tissue_or_organ_of_origin days_to_last_follow_up \\\n", - "patient \n", - "TCGA-3C-AAAU Breast, NOS NaN \n", - "TCGA-3C-AALI Breast, NOS NaN \n", - "TCGA-3C-AALJ Breast, NOS NaN \n", - "TCGA-3C-AALK Breast, NOS NaN \n", - "TCGA-4H-AAAK Breast, NOS NaN \n", - "\n", - " age_at_diagnosis primary_diagnosis ... \\\n", - "patient ... \n", - "TCGA-3C-AAAU 20211.0 Lobular carcinoma, NOS ... \n", - "TCGA-3C-AALI 18538.0 Infiltrating duct carcinoma, NOS ... \n", - "TCGA-3C-AALJ 22848.0 Infiltrating duct carcinoma, NOS ... \n", - "TCGA-3C-AALK 19074.0 Infiltrating duct carcinoma, NOS ... \n", - "TCGA-4H-AAAK 18371.0 Lobular carcinoma, NOS ... 
\n", - "\n", - " pathology_N_stage pathology_M_stage gender.1 \\\n", - "patient \n", - "TCGA-3C-AAAU nx mx female \n", - "TCGA-3C-AALI n1a m0 female \n", - "TCGA-3C-AALJ n1a m0 female \n", - "TCGA-3C-AALK n0 (i+) m0 female \n", - "TCGA-4H-AAAK n2a m0 female \n", - "\n", - " date_of_initial_pathologic_diagnosis days_to_last_known_alive \\\n", - "patient \n", - "TCGA-3C-AAAU 2004.0 NaN \n", - "TCGA-3C-AALI 2003.0 NaN \n", - "TCGA-3C-AALJ 2011.0 NaN \n", - "TCGA-3C-AALK 2011.0 NaN \n", - "TCGA-4H-AAAK 2013.0 NaN \n", - "\n", - " radiation_therapy histological_type \\\n", - "patient \n", - "TCGA-3C-AAAU no infiltrating lobular carcinoma \n", - "TCGA-3C-AALI yes infiltrating ductal carcinoma \n", - "TCGA-3C-AALJ no infiltrating ductal carcinoma \n", - "TCGA-3C-AALK no infiltrating ductal carcinoma \n", - "TCGA-4H-AAAK no infiltrating lobular carcinoma \n", - "\n", - " number_of_lymph_nodes race.1 \\\n", - "patient \n", - "TCGA-3C-AAAU 4.0 white \n", - "TCGA-3C-AALI 1.0 black or african american \n", - "TCGA-3C-AALJ 1.0 black or african american \n", - "TCGA-3C-AALK 0.0 black or african american \n", - "TCGA-4H-AAAK 4.0 white \n", - "\n", - " ethnicity.1 \n", - "patient \n", - "TCGA-3C-AAAU not hispanic or latino \n", - "TCGA-3C-AALI not hispanic or latino \n", - "TCGA-3C-AALJ not hispanic or latino \n", - "TCGA-3C-AALK not hispanic or latino \n", - "TCGA-4H-AAAK not hispanic or latino \n", - "\n", - "[5 rows x 118 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pam50
patient
TCGA-3C-AAAU3
TCGA-3C-AALI2
TCGA-3C-AALJ4
TCGA-3C-AALK3
TCGA-4H-AAAK3
\n", - "
" - ], - "text/plain": [ - " pam50\n", - "patient \n", - "TCGA-3C-AAAU 3\n", - "TCGA-3C-AALI 2\n", - "TCGA-3C-AALJ 4\n", - "TCGA-3C-AALK 3\n", - "TCGA-4H-AAAK 3" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# To confirm our data saved and loads properly:\n", - "meth = pd.read_csv(root / \"meth.csv\", index_col=0)\n", - "rna = pd.read_csv(root / \"rna.csv\", index_col=0)\n", - "mirna = pd.read_csv(root / \"mirna.csv\", index_col=0)\n", - "pam50 = pd.read_csv(root / \"pam50.csv\", index_col=0)\n", - "clinical = pd.read_csv(root / \"clinical.csv\", index_col=0)\n", - " \n", - "display(meth.head())\n", - "display(rna.head())\n", - "display(mirna.head())\n", - "display(clinical.head())\n", - "display(pam50.head())" - ] - }, - { - "cell_type": "markdown", - "id": "09265512", - "metadata": {}, - "source": [ - "## Feature Selection\n", - "\n", - "To support downstream analysis, we provide three built-in feature selection methods: +\n", - "- variance thresholding \n", - "- ANOVA F-test\n", - "- Random Forest importance. \n", - "\n", - "These are designed to help users quickly identify the most informative features from high-dimensional omics datasets. Each method captures different statistical properties, ranging from general variability to class-based separability and model-derived relevance. In this section, we put all three to the test and examine how much they agree with each other." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "fa70dcca", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-17 21:01:38,128 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:01:38,129 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:01:38,129 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:01:38,230 - bioneuralnet.utils.preprocess - INFO - Selected top 6000 features by variance\n", - "2025-05-17 21:01:40,830 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:01:40,831 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:01:40,831 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:01:40,923 - bioneuralnet.utils.preprocess - INFO - Selected top 6000 features by variance\n", - "2025-05-17 21:01:43,760 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:01:43,761 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:01:43,761 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:01:43,919 - bioneuralnet.utils.preprocess - INFO - Selected 6000 features by ANOVA (task=classification), 17514 significant, 0 padded\n", - "2025-05-17 21:01:46,512 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:01:46,513 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:01:46,513 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:01:46,654 - 
bioneuralnet.utils.preprocess - INFO - Selected 6000 features by ANOVA (task=classification), 16864 significant, 0 padded\n", - "2025-05-17 21:01:49,491 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:01:49,491 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:01:49,491 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:01:57,430 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:01:57,431 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:01:57,440 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Methylation feature selection:\n", - "\n", - "Anova-F & variance selection share: 2091 features\n", - "Random Forest & variance selection share: 1871 features\n", - "Anova-F & Random Forest share: 2201 features\n", - "All three methods agree on: 815 features\n", - "\n", - "RNA feature selection:\n", - "\n", - "Anova-F & variance selection share: 2340 features\n", - "Random Forest & variance selection share: 2218 features\n", - "Anova-F & Random Forest share: 2546 features\n", - "All three methods agree on: 1134 features\n" - ] - } - ], - "source": [ - "from bioneuralnet.utils.preprocess import select_top_k_variance\n", - "from bioneuralnet.utils.preprocess import top_anova_f_features\n", - "from bioneuralnet.utils.preprocess import select_top_randomforest\n", - "\n", - "# feature selection\n", - "meth_highvar = select_top_k_variance(meth, k=6000)\n", - "rna_highvar = select_top_k_variance(rna, k=6000)\n", - "\n", - "meth_af = top_anova_f_features(meth, pam50, max_features=6000)\n", - "rna_af = top_anova_f_features(rna, pam50, max_features=6000)\n", - "\n", - "meth_rf = 
select_top_randomforest(meth, pam50, top_k=6000)\n", - "rna_rf = select_top_randomforest(rna, pam50, top_k=6000)\n", - "\n", - "meth_var = list(meth_highvar.columns)\n", - "meth_anova = list(meth_af.columns)\n", - "meth_rf = list(meth_rf.columns)\n", - "\n", - "rna_var = list(rna_highvar.columns)\n", - "rna_anova = list(rna_af.columns)\n", - "rna_rf = list(rna_rf.columns)\n", - "\n", - "inter1 = []\n", - "for x in meth_anova:\n", - " if x in meth_var:\n", - " inter1.append(x)\n", - "\n", - "inter2 = []\n", - "for x in meth_rf:\n", - " if x in meth_var:\n", - " inter2.append(x)\n", - "\n", - "inter3 = []\n", - "for x in meth_anova:\n", - " if x in meth_rf:\n", - " inter3.append(x)\n", - "\n", - "meth_all_three = []\n", - "for x in meth_anova:\n", - " if x in meth_rf and x in meth_var:\n", - " meth_all_three.append(x)\n", - "\n", - "print(\"Methylation feature selection:\\n\")\n", - "print(f\"Anova-F & variance selection share: {len(inter1)} features\")\n", - "print(f\"Random Forest & variance selection share: {len(inter2)} features\")\n", - "print(f\"Anova-F & Random Forest share: {len(inter3)} features\")\n", - "print(f\"All three methods agree on: {len(meth_all_three)} features\")\n", - "\n", - "inter4 = []\n", - "for x in rna_anova:\n", - " if x in rna_var:\n", - " inter4.append(x)\n", - "\n", - "inter5 = []\n", - "for x in rna_rf:\n", - " if x in rna_var:\n", - " inter5.append(x)\n", - "\n", - "inter6 = []\n", - "for x in rna_anova:\n", - " if x in rna_rf:\n", - " inter6.append(x)\n", - "\n", - "rna_all_three = []\n", - "for x in rna_anova:\n", - " if x in rna_rf and x in rna_var:\n", - " rna_all_three.append(x)\n", - "\n", - "print(\"\\nRNA feature selection:\\n\")\n", - "print(f\"Anova-F & variance selection share: {len(inter4)} features\")\n", - "print(f\"Random Forest & variance selection share: {len(inter5)} features\")\n", - "print(f\"Anova-F & Random Forest share: {len(inter6)} features\")\n", - "print(f\"All three methods agree on: {len(rna_all_three)} 
features\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "1c371389", - "metadata": {}, - "outputs": [], - "source": [ - "out_dir = Path(\"/home/vicente/Github/BioNeuralNet/TCGA_BRCA_DATA/ANOVA\")\n", - "# def split_half(df: pd.DataFrame, name: str, out_dir: Path):\n", - "# n = len(df)\n", - "# half = n // 2\n", - "\n", - "# df_1 = df.iloc[:half]\n", - "# df_2 = df.iloc[half:]\n", - "\n", - "# df_1.to_csv(out_dir / f\"{name}_1.csv\")\n", - "# df_2.to_csv(out_dir / f\"{name}_2.csv\")\n", - "\n", - "rna_af.to_csv(out_dir / \"rna_anova.csv\")\n", - "meth_af.to_csv(out_dir / \"meth_anova.csv\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "c2168980", - "metadata": {}, - "source": [ - "## Easy Access via DatasetLoader\n", - "\n", - "To facilitate working with this data, we have made it available through our `DatasetLoader` component. Due to GitHub and PyPI file size limitations, we selected the top 6,000 features from each omics modality using ANOVA F-test. If you have additional pre-processed or raw datasets you would like to include, feel free to reach out, we are happy to support expanding the platform and adding new datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2d0340f6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TGCA BRCA dataset shape: {'mirna': (769, 503), 'pam50': (769, 1), 'clinical': (769, 118), 'rna': (769, 6000), 'meth': (769, 6000)}\n" - ] - } - ], - "source": [ - "from bioneuralnet.datasets import DatasetLoader\n", - "\n", - "tgca_brca = DatasetLoader(\"brca\")\n", - "\n", - "print(f\"TGCA BRCA dataset shape: {tgca_brca.shape}\")\n", - "brca_meth = tgca_brca.data[\"meth\"]\n", - "brca_rna = tgca_brca.data[\"rna\"]\n", - "brca_mirna = tgca_brca.data[\"mirna\"]\n", - "brca_clinical = tgca_brca.data[\"clinical\"]\n", - "brca_pam50 = tgca_brca.data[\"pam50\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "338dc995", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-17 21:02:10,064 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:02:10,065 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 31384 NaNs after median imputation\n", - "2025-05-17 21:02:10,065 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 39 columns dropped due to zero variance\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RNA shape: (769, 6000)\n", - "METH shape: (769, 6000)\n", - "miRNA shape: (769, 503)\n", - "Clinical shape: (769, 118)\n", - "Phenotype shape: (769, 1)\n", - "Phenotype counts:\n", - "pam50\n", - "3 419\n", - "4 140\n", - "1 130\n", - "2 46\n", - "0 34\n", - "Name: count, dtype: int64\n", - "Nan values in pam50 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-17 21:02:10,435 - bioneuralnet.utils.preprocess - INFO - Selected top 15 features by RandomForest importance\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
age_at_diagnosisdays_to_birthyears_to_birthage_at_indexdays_to_last_followupyear_of_diagnosisnumber_of_lymph_nodesdate_of_initial_pathologic_diagnosishistological_type_infiltrating lobular carcinomahistological_type_infiltrating ductal carcinomaprimary_diagnosis_Lobular carcinoma, NOSmorphology_8520/3method_of_diagnosis_Core Biopsymorphology_8500/3laterality_Right
patient
TCGA-3C-AAAU20211.0-20211.055.055.04047.0-1.501.5-1.50TrueFalseTrueTrueFalseFalseFalse
TCGA-3C-AALI18538.0-18538.050.050.04005.0-1.750.0-1.75FalseTrueFalseFalseTrueTrueTrue
TCGA-3C-AALJ22848.0-22848.062.062.01474.00.250.00.25FalseTrueFalseFalseTrueTrueTrue
TCGA-3C-AALK19074.0-19074.052.052.01448.00.25-0.50.25FalseTrueFalseFalseTrueTrueTrue
TCGA-4H-AAAK18371.0-18371.050.050.0348.00.751.50.75TrueFalseTrueTrueTrueFalseFalse
\n", - "
" - ], - "text/plain": [ - " age_at_diagnosis days_to_birth years_to_birth age_at_index \\\n", - "patient \n", - "TCGA-3C-AAAU 20211.0 -20211.0 55.0 55.0 \n", - "TCGA-3C-AALI 18538.0 -18538.0 50.0 50.0 \n", - "TCGA-3C-AALJ 22848.0 -22848.0 62.0 62.0 \n", - "TCGA-3C-AALK 19074.0 -19074.0 52.0 52.0 \n", - "TCGA-4H-AAAK 18371.0 -18371.0 50.0 50.0 \n", - "\n", - " days_to_last_followup year_of_diagnosis number_of_lymph_nodes \\\n", - "patient \n", - "TCGA-3C-AAAU 4047.0 -1.50 1.5 \n", - "TCGA-3C-AALI 4005.0 -1.75 0.0 \n", - "TCGA-3C-AALJ 1474.0 0.25 0.0 \n", - "TCGA-3C-AALK 1448.0 0.25 -0.5 \n", - "TCGA-4H-AAAK 348.0 0.75 1.5 \n", - "\n", - " date_of_initial_pathologic_diagnosis \\\n", - "patient \n", - "TCGA-3C-AAAU -1.50 \n", - "TCGA-3C-AALI -1.75 \n", - "TCGA-3C-AALJ 0.25 \n", - "TCGA-3C-AALK 0.25 \n", - "TCGA-4H-AAAK 0.75 \n", - "\n", - " histological_type_infiltrating lobular carcinoma \\\n", - "patient \n", - "TCGA-3C-AAAU True \n", - "TCGA-3C-AALI False \n", - "TCGA-3C-AALJ False \n", - "TCGA-3C-AALK False \n", - "TCGA-4H-AAAK True \n", - "\n", - " histological_type_infiltrating ductal carcinoma \\\n", - "patient \n", - "TCGA-3C-AAAU False \n", - "TCGA-3C-AALI True \n", - "TCGA-3C-AALJ True \n", - "TCGA-3C-AALK True \n", - "TCGA-4H-AAAK False \n", - "\n", - " primary_diagnosis_Lobular carcinoma, NOS morphology_8520/3 \\\n", - "patient \n", - "TCGA-3C-AAAU True True \n", - "TCGA-3C-AALI False False \n", - "TCGA-3C-AALJ False False \n", - "TCGA-3C-AALK False False \n", - "TCGA-4H-AAAK True True \n", - "\n", - " method_of_diagnosis_Core Biopsy morphology_8500/3 \\\n", - "patient \n", - "TCGA-3C-AAAU False False \n", - "TCGA-3C-AALI True True \n", - "TCGA-3C-AALJ True True \n", - "TCGA-3C-AALK True True \n", - "TCGA-4H-AAAK True False \n", - "\n", - " laterality_Right \n", - "patient \n", - "TCGA-3C-AAAU False \n", - "TCGA-3C-AALI True \n", - "TCGA-3C-AALJ True \n", - "TCGA-3C-AALK True \n", - "TCGA-4H-AAAK False " - ] - }, - "metadata": {}, - "output_type": 
"display_data" - } - ], - "source": [ - "from bioneuralnet.utils.preprocess import preprocess_clinical\n", - "\n", - "#shapes\n", - "print(f\"RNA shape: {brca_rna.shape}\")\n", - "print(f\"METH shape: {brca_meth.shape}\")\n", - "print(f\"miRNA shape: {brca_mirna.shape}\")\n", - "print(f\"Clinical shape: {brca_clinical.shape}\")\n", - "print(f\"Phenotype shape: {brca_pam50.shape}\")\n", - "print(f\"Phenotype counts:\\n{brca_pam50.value_counts()}\")\n", - "\n", - "#check nans in pam50\n", - "print(f\"Nan values in pam50 {brca_pam50.isna().sum().sum()}\")\n", - "brca_pam50 = brca_pam50.dropna()\n", - "\n", - "X_rna = brca_rna.loc[brca_pam50.index]\n", - "X_meth = brca_meth.loc[brca_pam50.index]\n", - "X_mirna = brca_mirna.loc[brca_pam50.index]\n", - "clinical = brca_clinical.loc[brca_pam50.index]\n", - "\n", - "# for more details on the preprocessing function, see bioneuralnet.utils.preprocess\n", - "clinical = preprocess_clinical(clinical, brca_pam50, top_k=15, scale=True, ignore_columns=[\"days_to_birth\", \"age_at_diagnosis\", \"days_to_last_followup\", \"age_at_index\", \"years_to_birth\"])\n", - "display(clinical.head())" - ] - }, - { - "cell_type": "markdown", - "id": "89cb8500", - "metadata": {}, - "source": [ - "## Preparing Multi-Omics Data for downstream tasks\n", - "\n", - "1. Check sample overlap.\n", - "\n", - "2. Select top features.\n", - "\n", - " - Although each omics dataset has already been filtered down to the top 6,000 features using the ANOVA F-test, this is still considered high-dimensional for most modeling tasks.\n", - "\n", - " - To make the data more tractable and improve downstream performance, we apply ANOVA F-test again to select the top 1,000 most discriminative features from each dataset.\n", - "\n", - "3. Combine datasets.\n", - "\n", - " - Selected features from RNA, methylation, and miRNA are combined into a single dataset.\n", - "\n", - "4. 
Clean missing values.\n", - "\n", - " - Counts and removes any missing (nan) values from the combined dataset.\n", - "\n", - "5. Build similarity graph.\n", - "\n", - " - Creates a k-nearest neighbors graph from the transposed feature matrix.\n", - "\n", - " - Other supported methods include correlation-based graphs, soft-thresholding (WGCNA-style), Gaussian kernels, and mutual information networks.\n", - "\n", - "Note: For more details on preprocessing functions and graph generation algorithms, see the [Utils documentation](https://bioneuralnet.readthedocs.io/en/latest/utils.html)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "b4646135", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-17 21:02:11,298 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:02:11,299 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:02:11,299 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:02:11,343 - bioneuralnet.utils.preprocess - INFO - Selected 1000 features by ANOVA (task=classification), 6000 significant, 0 padded\n", - "2025-05-17 21:02:12,188 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:02:12,189 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - "2025-05-17 21:02:12,189 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:02:12,232 - bioneuralnet.utils.preprocess - INFO - Selected 1000 features by ANOVA (task=classification), 6000 significant, 0 padded\n", - "2025-05-17 21:02:12,309 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values\n", - "2025-05-17 21:02:12,309 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation\n", - 
"2025-05-17 21:02:12,310 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance\n", - "2025-05-17 21:02:12,313 - bioneuralnet.utils.preprocess - INFO - Selected 503 features by ANOVA (task=classification), 465 significant, 38 padded\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Nan values in X_train_full: 0\n", - "Nan value in X_train_full after dropping: 0\n", - "X_train_full shape: (769, 2503)\n", - "\n", - "Network shape: (2503, 2503)\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, f1_score\n", - "from bioneuralnet.utils.preprocess import top_anova_f_features\n", - "from bioneuralnet.utils.graph import gen_similarity_graph\n", - "\n", - "meth_sel = top_anova_f_features(X_meth, brca_pam50, max_features=1000)\n", - "rna_sel = top_anova_f_features(X_rna, brca_pam50 ,max_features=1000)\n", - "mirna_sel = top_anova_f_features(X_mirna, brca_pam50,max_features=503)\n", - "X_train_full = pd.concat([meth_sel, rna_sel, mirna_sel], axis=1)\n", - "\n", - "# we check again for nan values then drop if any\n", - "print(f\"Nan values in X_train_full: {X_train_full.isna().sum().sum()}\")\n", - "X_train_full = X_train_full.dropna()\n", - "print(f\"Nan value in X_train_full after dropping: {X_train_full.isna().sum().sum()}\")\n", - "\n", - "print(f\"X_train_full shape: {X_train_full.shape}\")\n", - "# building the graph using the similarity graph function with k=15\n", - "A_train = gen_similarity_graph(X_train_full.T, k=15)\n", - "\n", - "print(f\"\\nNetwork shape: {A_train.shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "423f2808", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-17 21:22:49,775 - bioneuralnet.downstream_task.dpmon - INFO - Output directory set to: /home/vicente/Github/BioNeuralNet/TCGA_BRCA/results/run\n", - "2025-05-17 21:22:49,776 - bioneuralnet.downstream_task.dpmon - INFO - 
Initialized DPMON with the provided parameters.\n", - "2025-05-17 21:22:49,776 - bioneuralnet.downstream_task.dpmon - INFO - Starting DPMON run.\n", - "2025-05-17 21:22:49,787 - bioneuralnet.downstream_task.dpmon - INFO - Running hyperparameter tuning for DPMON.\n", - "2025-05-17 21:22:49,788 - bioneuralnet.downstream_task.dpmon - INFO - Using GPU 0\n", - "2025-05-17 21:22:49,788 - bioneuralnet.downstream_task.dpmon - INFO - Slicing omics dataset based on network nodes.\n", - "2025-05-17 21:22:49,792 - bioneuralnet.downstream_task.dpmon - INFO - Building PyTorch Geometric Data object from adjacency matrix.\n", - "2025-05-17 21:22:49,863 - bioneuralnet.downstream_task.dpmon - INFO - Number of nodes in network: 2503\n", - "2025-05-17 21:22:49,863 - bioneuralnet.downstream_task.dpmon - INFO - Using clinical vars for node features: ['age_at_diagnosis', 'days_to_birth', 'years_to_birth', 'age_at_index', 'days_to_last_followup', 'year_of_diagnosis', 'number_of_lymph_nodes', 'date_of_initial_pathologic_diagnosis', 'histological_type_infiltrating lobular carcinoma', 'histological_type_infiltrating ductal carcinoma', 'primary_diagnosis_Lobular carcinoma, NOS', 'morphology_8520/3', 'method_of_diagnosis_Core Biopsy', 'morphology_8500/3', 'laterality_Right']\n", - "2025-05-17 21:22:53,182 - bioneuralnet.downstream_task.dpmon - INFO - Starting hyperparameter tuning for dataset shape: (769, 2504)\n", - "2025-05-17 21:22:53,182\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. 
For more information, please see https://github.com/ray-project/ray/issues/36949\n", - "2025-05-17 21:23:24,958\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/vicente/ray_results/tune_dp' in 0.0119s.\n", - "2025-05-17 21:23:24,983 - bioneuralnet.downstream_task.dpmon - INFO - Best trial config: {'gnn_layer_num': 8, 'gnn_hidden_dim': 16, 'lr': 0.06920478068671862, 'weight_decay': 0.00010778997282137864, 'nn_hidden_dim1': 8, 'nn_hidden_dim2': 4, 'num_epochs': 512}\n", - "2025-05-17 21:23:24,983 - bioneuralnet.downstream_task.dpmon - INFO - Best trial final loss: 1.1827573776245117\n", - "2025-05-17 21:23:24,984 - bioneuralnet.downstream_task.dpmon - INFO - Best trial final accuracy: 0.7126137841352406\n", - "2025-05-17 21:23:24,986 - bioneuralnet.downstream_task.dpmon - INFO - gnn_layer_num gnn_hidden_dim lr weight_decay nn_hidden_dim1 \\\n", - "0 8 16 0.069205 0.000108 8 \n", - "\n", - " nn_hidden_dim2 num_epochs \n", - "0 4 512 \n", - "2025-05-17 21:23:24,987 - bioneuralnet.downstream_task.dpmon - INFO - Best tuned parameters: {'gnn_layer_num': 8, 'gnn_hidden_dim': 16, 'lr': 0.06920478068671862, 'weight_decay': 0.00010778997282137864, 'nn_hidden_dim1': 8, 'nn_hidden_dim2': 4, 'num_epochs': 512}\n", - "2025-05-17 21:23:24,988 - bioneuralnet.downstream_task.dpmon - INFO - Best tuned parameters: {'gnn_layer_num': 8, 'gnn_hidden_dim': 16, 'lr': 0.06920478068671862, 'weight_decay': 0.00010778997282137864, 'nn_hidden_dim1': 8, 'nn_hidden_dim2': 4, 'num_epochs': 512}\n", - "2025-05-17 21:23:24,988 - bioneuralnet.downstream_task.dpmon - INFO - Running standard training with tuned parameters.\n", - "2025-05-17 21:23:24,988 - bioneuralnet.downstream_task.dpmon - INFO - Using GPU 0\n", - "2025-05-17 21:23:24,988 - bioneuralnet.downstream_task.dpmon - INFO - Slicing omics dataset based on network nodes.\n", - "2025-05-17 21:23:24,991 - bioneuralnet.downstream_task.dpmon - INFO - Building PyTorch Geometric Data object from 
adjacency matrix.\n", - "2025-05-17 21:23:25,063 - bioneuralnet.downstream_task.dpmon - INFO - Number of nodes in network: 2503\n", - "2025-05-17 21:23:25,064 - bioneuralnet.downstream_task.dpmon - INFO - Using clinical vars for node features: ['age_at_diagnosis', 'days_to_birth', 'years_to_birth', 'age_at_index', 'days_to_last_followup', 'year_of_diagnosis', 'number_of_lymph_nodes', 'date_of_initial_pathologic_diagnosis', 'histological_type_infiltrating lobular carcinoma', 'histological_type_infiltrating ductal carcinoma', 'primary_diagnosis_Lobular carcinoma, NOS', 'morphology_8520/3', 'method_of_diagnosis_Core Biopsy', 'morphology_8500/3', 'laterality_Right']\n", - "2025-05-17 21:23:28,362 - bioneuralnet.downstream_task.dpmon - INFO - Training iteration 1/3\n", - "2025-05-17 21:23:28,388 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [1/512], Loss: 1.5414\n", - "2025-05-17 21:23:28,419 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [10/512], Loss: 1.4586\n", - "2025-05-17 21:23:28,452 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [20/512], Loss: 1.2028\n", - "2025-05-17 21:23:28,485 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [30/512], Loss: 1.1805\n", - "2025-05-17 21:23:28,518 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [40/512], Loss: 1.1705\n", - "2025-05-17 21:23:28,550 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [50/512], Loss: 1.1586\n", - "2025-05-17 21:23:28,583 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [60/512], Loss: 1.1320\n", - "2025-05-17 21:23:28,617 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [70/512], Loss: 1.0871\n", - "2025-05-17 21:23:28,653 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [80/512], Loss: 1.0508\n", - "2025-05-17 21:23:28,686 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [90/512], Loss: 1.0249\n", - "2025-05-17 21:23:28,720 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [100/512], Loss: 1.0134\n", - "2025-05-17 21:23:28,753 - 
bioneuralnet.downstream_task.dpmon - INFO - Epoch [110/512], Loss: 1.0106\n", - "2025-05-17 21:23:28,788 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [120/512], Loss: 1.0081\n", - "2025-05-17 21:23:28,821 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [130/512], Loss: 1.0027\n", - "2025-05-17 21:23:28,855 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [140/512], Loss: 1.0592\n", - "2025-05-17 21:23:28,888 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [150/512], Loss: 0.9912\n", - "2025-05-17 21:23:28,921 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [160/512], Loss: 0.9659\n", - "2025-05-17 21:23:28,954 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [170/512], Loss: 0.9564\n", - "2025-05-17 21:23:29,009 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [180/512], Loss: 0.9538\n", - "2025-05-17 21:23:29,042 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [190/512], Loss: 0.9530\n", - "2025-05-17 21:23:29,073 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [200/512], Loss: 0.9657\n", - "2025-05-17 21:23:29,105 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [210/512], Loss: 1.0156\n", - "2025-05-17 21:23:29,137 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [220/512], Loss: 0.9852\n", - "2025-05-17 21:23:29,169 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [230/512], Loss: 0.9607\n", - "2025-05-17 21:23:29,201 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [240/512], Loss: 0.9553\n", - "2025-05-17 21:23:29,234 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [250/512], Loss: 0.9536\n", - "2025-05-17 21:23:29,266 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [260/512], Loss: 0.9533\n", - "2025-05-17 21:23:29,298 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [270/512], Loss: 0.9522\n", - "2025-05-17 21:23:29,331 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [280/512], Loss: 0.9512\n", - "2025-05-17 21:23:29,364 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [290/512], Loss: 
0.9541\n", - "2025-05-17 21:23:29,401 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [300/512], Loss: 0.9187\n", - "2025-05-17 21:23:29,443 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [310/512], Loss: 0.9108\n", - "2025-05-17 21:23:29,480 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [320/512], Loss: 0.9089\n", - "2025-05-17 21:23:29,514 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [330/512], Loss: 0.9085\n", - "2025-05-17 21:23:29,548 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [340/512], Loss: 0.9083\n", - "2025-05-17 21:23:29,581 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [350/512], Loss: 0.9082\n", - "2025-05-17 21:23:29,613 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [360/512], Loss: 0.9944\n", - "2025-05-17 21:23:29,649 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [370/512], Loss: 0.9616\n", - "2025-05-17 21:23:29,690 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [380/512], Loss: 0.9454\n", - "2025-05-17 21:23:29,723 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [390/512], Loss: 0.9328\n", - "2025-05-17 21:23:29,757 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [400/512], Loss: 0.9219\n", - "2025-05-17 21:23:29,790 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [410/512], Loss: 0.9166\n", - "2025-05-17 21:23:29,824 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [420/512], Loss: 0.9157\n", - "2025-05-17 21:23:29,857 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [430/512], Loss: 0.9141\n", - "2025-05-17 21:23:29,891 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [440/512], Loss: 1.0127\n", - "2025-05-17 21:23:29,926 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [450/512], Loss: 1.0930\n", - "2025-05-17 21:23:29,960 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [460/512], Loss: 1.0680\n", - "2025-05-17 21:23:29,994 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [470/512], Loss: 1.0153\n", - "2025-05-17 21:23:30,028 - 
bioneuralnet.downstream_task.dpmon - INFO - Epoch [480/512], Loss: 0.9372\n", - "2025-05-17 21:23:30,065 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [490/512], Loss: 0.9157\n", - "2025-05-17 21:23:30,101 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [500/512], Loss: 0.9133\n", - "2025-05-17 21:23:30,135 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [510/512], Loss: 0.9130\n", - "2025-05-17 21:23:30,143 - bioneuralnet.downstream_task.dpmon - INFO - Training Accuracy: 0.6151\n", - "2025-05-17 21:23:30,145 - bioneuralnet.downstream_task.dpmon - INFO - Model saved to /home/vicente/Github/BioNeuralNet/TCGA_BRCA/results/run/dpm_model_iter_1.pth\n", - "2025-05-17 21:23:30,147 - bioneuralnet.downstream_task.dpmon - INFO - Training iteration 2/3\n", - "2025-05-17 21:23:30,159 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [1/512], Loss: 1.6205\n", - "2025-05-17 21:23:30,190 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [10/512], Loss: 1.2498\n", - "2025-05-17 21:23:30,223 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [20/512], Loss: 1.1059\n", - "2025-05-17 21:23:30,255 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [30/512], Loss: 1.0680\n", - "2025-05-17 21:23:30,288 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [40/512], Loss: 1.0398\n", - "2025-05-17 21:23:30,322 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [50/512], Loss: 1.0205\n", - "2025-05-17 21:23:30,355 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [60/512], Loss: 1.0039\n", - "2025-05-17 21:23:30,387 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [70/512], Loss: 0.9904\n", - "2025-05-17 21:23:30,421 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [80/512], Loss: 0.9753\n", - "2025-05-17 21:23:30,454 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [90/512], Loss: 0.9660\n", - "2025-05-17 21:23:30,488 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [100/512], Loss: 0.9591\n", - "2025-05-17 21:23:30,520 - 
bioneuralnet.downstream_task.dpmon - INFO - Epoch [110/512], Loss: 0.9564\n", - "2025-05-17 21:23:30,554 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [120/512], Loss: 0.9545\n", - "2025-05-17 21:23:30,587 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [130/512], Loss: 0.9533\n", - "2025-05-17 21:23:30,619 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [140/512], Loss: 0.9521\n", - "2025-05-17 21:23:30,652 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [150/512], Loss: 0.9528\n", - "2025-05-17 21:23:30,685 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [160/512], Loss: 1.0022\n", - "2025-05-17 21:23:30,718 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [170/512], Loss: 0.9523\n", - "2025-05-17 21:23:30,750 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [180/512], Loss: 0.9362\n", - "2025-05-17 21:23:30,783 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [190/512], Loss: 0.9329\n", - "2025-05-17 21:23:30,815 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [200/512], Loss: 0.9325\n", - "2025-05-17 21:23:30,847 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [210/512], Loss: 0.9846\n", - "2025-05-17 21:23:30,880 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [220/512], Loss: 0.9593\n", - "2025-05-17 21:23:30,912 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [230/512], Loss: 0.9345\n", - "2025-05-17 21:23:30,946 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [240/512], Loss: 0.9237\n", - "2025-05-17 21:23:30,982 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [250/512], Loss: 0.9193\n", - "2025-05-17 21:23:31,021 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [260/512], Loss: 0.9133\n", - "2025-05-17 21:23:31,055 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [270/512], Loss: 0.9110\n", - "2025-05-17 21:23:31,087 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [280/512], Loss: 0.9094\n", - "2025-05-17 21:23:31,120 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [290/512], Loss: 
0.9090\n", - "2025-05-17 21:23:31,153 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [300/512], Loss: 0.9079\n", - "2025-05-17 21:23:31,185 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [310/512], Loss: 0.9069\n", - "2025-05-17 21:23:31,218 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [320/512], Loss: 0.9069\n", - "2025-05-17 21:23:31,251 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [330/512], Loss: 0.9068\n", - "2025-05-17 21:23:31,284 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [340/512], Loss: 0.9068\n", - "2025-05-17 21:23:31,323 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [350/512], Loss: 1.0586\n", - "2025-05-17 21:23:31,356 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [360/512], Loss: 1.0480\n", - "2025-05-17 21:23:31,390 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [370/512], Loss: 1.0303\n", - "2025-05-17 21:23:31,425 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [380/512], Loss: 1.0170\n", - "2025-05-17 21:23:31,461 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [390/512], Loss: 0.9990\n", - "2025-05-17 21:23:31,497 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [400/512], Loss: 1.0054\n", - "2025-05-17 21:23:31,531 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [410/512], Loss: 0.9924\n", - "2025-05-17 21:23:31,573 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [420/512], Loss: 0.9848\n", - "2025-05-17 21:23:31,624 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [430/512], Loss: 0.9793\n", - "2025-05-17 21:23:31,659 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [440/512], Loss: 0.9739\n", - "2025-05-17 21:23:31,692 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [450/512], Loss: 0.9674\n", - "2025-05-17 21:23:31,724 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [460/512], Loss: 0.9655\n", - "2025-05-17 21:23:31,762 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [470/512], Loss: 0.9447\n", - "2025-05-17 21:23:31,795 - 
bioneuralnet.downstream_task.dpmon - INFO - Epoch [480/512], Loss: 0.9309\n", - "2025-05-17 21:23:31,829 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [490/512], Loss: 0.9234\n", - "2025-05-17 21:23:31,861 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [500/512], Loss: 0.9212\n", - "2025-05-17 21:23:31,894 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [510/512], Loss: 0.9209\n", - "2025-05-17 21:23:31,902 - bioneuralnet.downstream_task.dpmon - INFO - Training Accuracy: 0.9623\n", - "2025-05-17 21:23:31,904 - bioneuralnet.downstream_task.dpmon - INFO - Model saved to /home/vicente/Github/BioNeuralNet/TCGA_BRCA/results/run/dpm_model_iter_2.pth\n", - "2025-05-17 21:23:31,906 - bioneuralnet.downstream_task.dpmon - INFO - Training iteration 3/3\n", - "2025-05-17 21:23:31,920 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [1/512], Loss: 1.6264\n", - "2025-05-17 21:23:31,952 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [10/512], Loss: 1.3673\n", - "2025-05-17 21:23:32,007 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [20/512], Loss: 1.1722\n", - "2025-05-17 21:23:32,040 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [30/512], Loss: 1.0565\n", - "2025-05-17 21:23:32,082 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [40/512], Loss: 1.0221\n", - "2025-05-17 21:23:32,162 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [50/512], Loss: 1.0092\n", - "2025-05-17 21:23:32,197 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [60/512], Loss: 1.0085\n", - "2025-05-17 21:23:32,230 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [70/512], Loss: 1.0065\n", - "2025-05-17 21:23:32,268 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [80/512], Loss: 1.0054\n", - "2025-05-17 21:23:32,314 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [90/512], Loss: 1.0063\n", - "2025-05-17 21:23:32,348 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [100/512], Loss: 1.0057\n", - "2025-05-17 21:23:32,382 - 
bioneuralnet.downstream_task.dpmon - INFO - Epoch [110/512], Loss: 1.0051\n", - "2025-05-17 21:23:32,415 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [120/512], Loss: 1.0040\n", - "2025-05-17 21:23:32,448 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [130/512], Loss: 1.0034\n", - "2025-05-17 21:23:32,484 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [140/512], Loss: 1.0031\n", - "2025-05-17 21:23:32,517 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [150/512], Loss: 1.0024\n", - "2025-05-17 21:23:32,549 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [160/512], Loss: 1.0185\n", - "2025-05-17 21:23:32,583 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [170/512], Loss: 1.0023\n", - "2025-05-17 21:23:32,616 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [180/512], Loss: 0.9825\n", - "2025-05-17 21:23:32,649 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [190/512], Loss: 0.9519\n", - "2025-05-17 21:23:32,686 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [200/512], Loss: 0.9414\n", - "2025-05-17 21:23:32,733 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [210/512], Loss: 0.9456\n", - "2025-05-17 21:23:32,767 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [220/512], Loss: 0.9167\n", - "2025-05-17 21:23:32,801 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [230/512], Loss: 0.9224\n", - "2025-05-17 21:23:32,833 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [240/512], Loss: 0.9117\n", - "2025-05-17 21:23:32,866 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [250/512], Loss: 0.9092\n", - "2025-05-17 21:23:32,899 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [260/512], Loss: 0.9073\n", - "2025-05-17 21:23:32,932 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [270/512], Loss: 0.9070\n", - "2025-05-17 21:23:32,965 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [280/512], Loss: 0.9064\n", - "2025-05-17 21:23:32,998 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [290/512], Loss: 
0.9063\n", - "2025-05-17 21:23:33,030 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [300/512], Loss: 0.9063\n", - "2025-05-17 21:23:33,063 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [310/512], Loss: 0.9063\n", - "2025-05-17 21:23:33,095 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [320/512], Loss: 0.9112\n", - "2025-05-17 21:23:33,128 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [330/512], Loss: 1.0498\n", - "2025-05-17 21:23:33,161 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [340/512], Loss: 1.0127\n", - "2025-05-17 21:23:33,196 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [350/512], Loss: 0.9632\n", - "2025-05-17 21:23:33,230 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [360/512], Loss: 0.9584\n", - "2025-05-17 21:23:33,263 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [370/512], Loss: 0.9563\n", - "2025-05-17 21:23:33,297 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [380/512], Loss: 0.9543\n", - "2025-05-17 21:23:33,330 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [390/512], Loss: 0.9533\n", - "2025-05-17 21:23:33,364 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [400/512], Loss: 1.1039\n", - "2025-05-17 21:23:33,398 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [410/512], Loss: 0.9633\n", - "2025-05-17 21:23:33,432 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [420/512], Loss: 0.9506\n", - "2025-05-17 21:23:33,467 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [430/512], Loss: 0.9426\n", - "2025-05-17 21:23:33,500 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [440/512], Loss: 0.9288\n", - "2025-05-17 21:23:33,532 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [450/512], Loss: 0.9181\n", - "2025-05-17 21:23:33,565 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [460/512], Loss: 0.9154\n", - "2025-05-17 21:23:33,598 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [470/512], Loss: 0.9377\n", - "2025-05-17 21:23:33,631 - 
bioneuralnet.downstream_task.dpmon - INFO - Epoch [480/512], Loss: 0.9158\n", - "2025-05-17 21:23:33,664 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [490/512], Loss: 0.9137\n", - "2025-05-17 21:23:33,697 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [500/512], Loss: 0.9127\n", - "2025-05-17 21:23:33,730 - bioneuralnet.downstream_task.dpmon - INFO - Epoch [510/512], Loss: 0.9106\n", - "2025-05-17 21:23:33,739 - bioneuralnet.downstream_task.dpmon - INFO - Training Accuracy: 0.8908\n", - "2025-05-17 21:23:33,740 - bioneuralnet.downstream_task.dpmon - INFO - Model saved to /home/vicente/Github/BioNeuralNet/TCGA_BRCA/results/run/dpm_model_iter_3.pth\n", - "2025-05-17 21:23:33,742 - bioneuralnet.downstream_task.dpmon - INFO - Best Accuracy: 0.9623\n", - "2025-05-17 21:23:33,743 - bioneuralnet.downstream_task.dpmon - INFO - Average Accuracy: 0.8227\n", - "2025-05-17 21:23:33,743 - bioneuralnet.downstream_task.dpmon - INFO - Standard Deviation of Accuracy: 0.1833\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "DPMON results:\n", - "Accuracy: 0.9622886866059818\n", - "F1 weighted: 0.9613234297251081\n", - "F1 macro: 0.9412636611836438\n" - ] - } - ], - "source": [ - "from bioneuralnet.downstream_task import DPMON\n", - "\n", - "save = Path(\"/home/vicente/Github/BioNeuralNet/TCGA_BRCA/results\")\n", - "brca_pam50 = brca_pam50.rename(columns={\"pam50\": \"phenotype\"})\n", - "\n", - "dpmon = DPMON(\n", - " adjacency_matrix=A_train,\n", - " omics_list=[meth_sel, rna_sel, mirna_sel],\n", - " phenotype_data=brca_pam50,\n", - " clinical_data=clinical,\n", - " repeat_num=3,\n", - " tune=True,\n", - " gpu=True, \n", - " cuda=0,\n", - " output_dir=Path(save/\"run\"),\n", - ")\n", - "\n", - "predictions_df, avg_accuracy = dpmon.run()\n", - "actual = predictions_df[\"Actual\"]\n", - "pred = predictions_df[\"Predicted\"]\n", - "dp_acc = accuracy_score(actual, pred)\n", - "dp_f1w = f1_score(actual, pred, average='weighted')\n", - 
"dp_f1m = f1_score(actual, pred, average='macro')\n", - "\n", - "print(f\"\\nDPMON results:\")\n", - "print(f\"Accuracy: {dp_acc}\")\n", - "print(f\"F1 weighted: {dp_f1w}\")\n", - "print(f\"F1 macro: {dp_f1m}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".testing", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/assets/BioNeuralNet1.png b/assets/BioNeuralNet1.png deleted file mode 100644 index a308430..0000000 Binary files a/assets/BioNeuralNet1.png and /dev/null differ diff --git a/assets/BioNeuralNet2.png b/assets/BioNeuralNet2.png deleted file mode 100644 index 2885136..0000000 Binary files a/assets/BioNeuralNet2.png and /dev/null differ diff --git a/assets/LOGO_TB.svg b/assets/LOGO_TB.svg index 72966bf..2c54b73 100644 --- a/assets/LOGO_TB.svg +++ b/assets/LOGO_TB.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/assets/LOGO_WB.svg b/assets/LOGO_WB.svg index caa62e9..3f529c6 100644 --- a/assets/LOGO_WB.svg +++ b/assets/LOGO_WB.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/assets/ccrcc_search.png b/assets/ccrcc_search.png deleted file mode 100644 index df545b4..0000000 Binary files a/assets/ccrcc_search.png and /dev/null differ diff --git a/assets/cptac_clinical.png b/assets/cptac_clinical.png deleted file mode 100644 index f96b554..0000000 Binary files a/assets/cptac_clinical.png and /dev/null differ diff --git a/assets/cptac_genomics.png b/assets/cptac_genomics.png deleted file mode 100644 index 0e5634b..0000000 Binary files a/assets/cptac_genomics.png and /dev/null differ diff --git a/assets/cptac_proteomics.png b/assets/cptac_proteomics.png deleted file mode 100644 index 7999c95..0000000 Binary files 
a/assets/cptac_proteomics.png and /dev/null differ diff --git a/assets/cptac_search.png b/assets/cptac_search.png deleted file mode 100644 index c587d11..0000000 Binary files a/assets/cptac_search.png and /dev/null differ diff --git a/assets/images_download.png b/assets/images_download.png deleted file mode 100644 index 7bce675..0000000 Binary files a/assets/images_download.png and /dev/null differ diff --git a/bioneuralnet/clustering/correlated_louvain.py b/bioneuralnet/clustering/correlated_louvain.py index af51418..7e955ee 100644 --- a/bioneuralnet/clustering/correlated_louvain.py +++ b/bioneuralnet/clustering/correlated_louvain.py @@ -3,7 +3,7 @@ import pandas as pd import torch import os -from typing import Optional, Union +from typing import Optional, Union, Any from community.community_louvain import ( modularity as original_modularity, @@ -26,7 +26,7 @@ class CorrelatedLouvain: """ CorrelatedLouvain Class for Community Detection with Correlated Omics Data. Attributes: - + G (nx.Graph): NetworkX graph object. B (pd.DataFrame): Omics data. Y (pd.DataFrame): Phenotype data. @@ -79,6 +79,7 @@ def __init__( torch.backends.cudnn.benchmark = False self.seed = seed self.gpu = gpu + self.clusters: dict[Any, Any] = {} self.device = torch.device("cuda" if gpu and torch.cuda.is_available() else "cpu") self.logger.info(f"Initialized Correlated Louvain. device={self.device}") @@ -213,23 +214,23 @@ def partition_to_adjacency(self, partition: dict) -> list: Convert the partition dictionary into a list of adjacency matrices (DataFrames), where each adjacency matrix represents a cluster with more than 2 nodes. 
""" - clusters = {} + for node, cl in partition.items(): - clusters.setdefault(cl, []).append(node) + self.clusters.setdefault(cl, []).append(node) - self.logger.debug(f"Total detected clusters: {len(clusters)}") + self.logger.debug(f"Total detected clusters: {len(self.clusters)}") adjacency_matrices = [] - for cl, nodes in clusters.items(): - self.logger.debug(f"Cluster {cl} size: {len(nodes)}") - if len(nodes) > 2: - valid_nodes = list(set(nodes).intersection(set(self.B.columns))) + for cl, nodes in self.clusters.items(): + self.logger.debug(f"Cluster {cl} size: {len(nodes)}") + if len(nodes) > 2: + valid_nodes = list(set(nodes).intersection(set(self.B.columns))) if valid_nodes: adjacency_matrix = self.B.loc[:, valid_nodes].fillna(0) adjacency_matrices.append(adjacency_matrix) print(f"Clusters with >2 nodes: {len(adjacency_matrices)}") - + return adjacency_matrices def get_quality(self) -> float: diff --git a/bioneuralnet/clustering/correlated_pagerank.py b/bioneuralnet/clustering/correlated_pagerank.py index d3e301a..4a8f9f2 100644 --- a/bioneuralnet/clustering/correlated_pagerank.py +++ b/bioneuralnet/clustering/correlated_pagerank.py @@ -29,7 +29,6 @@ class CorrelatedPageRank: max_iter (int): Maximum number of iterations for PageRank convergence. tol (float): Tolerance for convergence. k (float): Weighting factor for composite correlation-conductance score. - output_dir (str): Directory to save outputs. """ def __init__( @@ -90,6 +89,7 @@ def __init__( self.seed = seed self.gpu = gpu + self.results: dict[str, float] = {} self.device = torch.device("cuda" if gpu and torch.cuda.is_available() else "cpu") self.logger.info(f"Initialized Correlated Louvain. 
device={self.device}") @@ -121,7 +121,7 @@ def _validate_inputs(self): self.logger.error(f"Input validation error: {e}") raise - def phen_omics_corr(self, nodes: List[Any]) -> Tuple[float, str]: + def phen_omics_corr(self, nodes: List[Any]= []) -> Tuple[float, str]: """ Calculates the Pearson correlation between the PCA of omics data and phenotype. @@ -152,8 +152,7 @@ def phen_omics_corr(self, nodes: List[Any]) -> Tuple[float, str]: raise def sweep_cut( - self, p: Dict[Any, float] - ) -> Tuple[List[Any], int, float, float, float, str]: + self, p: Dict[Any, float] = {}) -> Tuple[List[Any], int, float, float, float, str]: try: best_cluster = set() min_comp_score = float("inf") @@ -241,7 +240,7 @@ def sweep_cut( self.logger.error(f"Error in sweep_cut: {e}") raise - def generate_weighted_personalization(self, nodes: List[Any]) -> Dict[Any, float]: + def generate_weighted_personalization(self, nodes: List[Any] = []) -> Dict[Any, float]: """ Generates a weighted personalization vector for PageRank. @@ -279,7 +278,7 @@ def generate_weighted_personalization(self, nodes: List[Any]) -> Dict[Any, float raise - def run_pagerank_clustering(self, seed_nodes: List[Any]) -> Dict[str, Any]: + def run_pagerank_clustering(self, seed_nodes: List[Any] = []) -> Dict[str, Any]: """ Executes the PageRank clustering algorithm. @@ -353,7 +352,7 @@ def run_pagerank_clustering(self, seed_nodes: List[Any]) -> Dict[str, Any]: raise - def run(self, seed_nodes: List[Any]) -> Dict[str, Any]: + def run(self, seed_nodes: List[Any] = []) -> Dict[str, Any]: """ Executes the correlated PageRank clustering pipeline. 
@@ -459,7 +458,7 @@ def run_tuning(self, num_samples: int = 10) -> Dict[str, Any]: def short_dirname_creator(trial): return f"_{trial.trial_id}" - + resources = {"cpu": 1, "gpu": 1} if self.device.type == "cuda" else {"cpu": 1, "gpu": 0} analysis = tune.run( diff --git a/bioneuralnet/clustering/hybrid_louvain.py b/bioneuralnet/clustering/hybrid_louvain.py index f6d47a4..f891184 100644 --- a/bioneuralnet/clustering/hybrid_louvain.py +++ b/bioneuralnet/clustering/hybrid_louvain.py @@ -16,7 +16,7 @@ class HybridLouvain: HybridLouvain Class that combines Correlated Louvain and Correlated PageRank for community detection. Attributes: - + G (nx.Graph): NetworkX graph object. B (pd.DataFrame): Omics data. Y (pd.DataFrame): Phenotype data. @@ -41,7 +41,7 @@ def __init__( ): self.logger = get_logger(__name__) - + if seed is not None: torch.manual_seed(seed) np.random.seed(seed) @@ -49,7 +49,7 @@ def __init__( torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - + self.gpu = gpu self.seed = seed self.logger.info("Initializing HybridLouvain...") @@ -157,10 +157,19 @@ def run(self, as_dfs: bool = False) -> Union[dict, list]: best_corr = 0 best_seed = None + + if not isinstance(partition, dict): + raise TypeError("Expected 'partition' to be a dict") + for com in set(partition.values()): - nodes = [n for n in self.G.nodes() if partition[n] == com] + nodes = [] + for n in self.G.nodes(): + if partition[n] == com: + nodes.append(n) + if len(nodes) < 2: continue + try: corr, _ = louvain._compute_community_correlation(nodes) if abs(corr) > abs(best_corr): @@ -170,6 +179,7 @@ def run(self, as_dfs: bool = False) -> Union[dict, list]: self.logger.info( f"Error computing correlation for community {com}: {e}" ) + if best_seed is None: self.logger.info("No valid seed community found; stopping iterations.") break @@ -241,4 +251,3 @@ def run(self, as_dfs: bool = False) -> Union[dict, list]: return dfs else: return {"curr": 
current_partition, "clus": all_clusters} - \ No newline at end of file diff --git a/bioneuralnet/datasets/dataset_loader.py b/bioneuralnet/datasets/dataset_loader.py index d5ca367..f5e99c7 100644 --- a/bioneuralnet/datasets/dataset_loader.py +++ b/bioneuralnet/datasets/dataset_loader.py @@ -11,7 +11,7 @@ def __init__(self, dataset_name: str): - data (dict): Dictionary of DataFrames, where keys are table names and values are DataFrames. - shape (dict): Dictionary of table names to their shapes (n_rows, n_cols). - + Example: tcga_brca = DatasetLoader("tcga_brca") @@ -19,12 +19,12 @@ def __init__(self, dataset_name: str): # {'brca_mirna': (108, 1000), 'brca_pam50': (108, 50), ...} mirna = tcga_brca.data["brca_mirna"] rna = tcga_brca.data["brca_rna"] - + """ self.dataset_name = dataset_name.strip().lower() self.base_dir = Path(__file__).parent self.data: dict[str, pd.DataFrame] = {} - + self._load_data() def _load_data(self): @@ -78,4 +78,4 @@ def shape(self) -> dict[str, tuple[int, int]]: result = {} for name, df in self.data.items(): result[name] = df.shape - return result \ No newline at end of file + return result diff --git a/bioneuralnet/downstream_task/dpmon.py b/bioneuralnet/downstream_task/dpmon.py index aedcfa2..8c9f938 100644 --- a/bioneuralnet/downstream_task/dpmon.py +++ b/bioneuralnet/downstream_task/dpmon.py @@ -39,9 +39,9 @@ class DPMON: Instead of node-level MSE regression, DPMON aggregates node embeddings with patient-level omics data. A downstream classification head (e.g., softmax layer with CrossEntropyLoss) is applied for sample-level disease prediction. This end-to-end approach leverages both local (node-level) and global (patient-level) network information - + Attributes: - + adjacency_matrix (pd.DataFrame): The adjacency matrix of the network. omics_list (List[pd.DataFrame]): A list of omics datasets. phenotype_data (pd.DataFrame): A DataFrame containing the disease phenotype. 
@@ -58,7 +58,7 @@ class DPMON: tune (bool): Whether to perform hyperparameter tuning. Default=False. gpu (bool): Whether to use GPU. Default=False. cuda (int): The CUDA device ID. Default=0. - output_dir (Optional[str]): The output directory. Default=None. + output_dir (Optional[str]): The output directory. Default=None. """ def __init__( self, @@ -111,11 +111,11 @@ def __init__( self.cuda = cuda if output_dir is None: - self.output_dir = Path(os.getcwd()) / "dpmon" + self.output_dir = Path(os.getcwd()) / "dpmon" else: self.output_dir = Path(output_dir) - self.output_dir.mkdir(parents=True, exist_ok=True) + self.output_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Output directory set to: {self.output_dir}") @@ -180,7 +180,7 @@ def run(self) -> pd.DataFrame: # Combine omics datasets combined_omics = pd.concat(self.omics_list, axis=1) combined_omics = combined_omics[self.adjacency_matrix.columns] - + if "phenotype" not in combined_omics.columns: combined_omics = combined_omics.merge( self.phenotype_data[["phenotype"]], @@ -201,7 +201,7 @@ def run(self) -> pd.DataFrame: best_config["nn_hidden_dim2"] = int(best_config["nn_hidden_dim2"]) best_config["num_epochs"] = int(best_config["num_epochs"]) logger.info(f"Best tuned parameters: {best_config}") - + logger.info(f"Best tuned parameters: {best_config}") dpmon_params.update(best_config) logger.info("Running standard training with tuned parameters.") @@ -213,7 +213,7 @@ def run(self) -> pd.DataFrame: output_dir=self.output_dir, ) return predictions - + else: logger.info("Running standard training for DPMON.") predictions = run_standard_training( @@ -418,7 +418,7 @@ def run_hyperparameter_tuning( gpu_resources = 1 if dpmon_params["gpu"] else 0 best_configs = [] - + for omics_data, omics_network_tg in zip(omics_dataset, omics_networks_tg): logger.info( f"Starting hyperparameter tuning for dataset shape: {omics_data.shape}" diff --git a/bioneuralnet/downstream_task/subject_representation.py 
b/bioneuralnet/downstream_task/subject_representation.py index b63a972..2809d60 100644 --- a/bioneuralnet/downstream_task/subject_representation.py +++ b/bioneuralnet/downstream_task/subject_representation.py @@ -2,7 +2,7 @@ import os import json import tempfile -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Union, Sequence from pathlib import Path from datetime import datetime @@ -42,7 +42,7 @@ def __init__( reduce_method: str = "AE", seed: Optional[int] = None, tune: Optional[bool] = False, - output_dir: Optional[str] = None, + output_dir: Optional[Union[str, Path]] = None, ): """ Initializes the SubjectRepresentation instance. @@ -64,15 +64,15 @@ def __init__( if embeddings is None or embeddings.empty: self.logger.info( - "No embeddings provided, please review documentation to see how to generate embeddings." - ) + "No embeddings provided, please review documentation to see how to generate embeddings.") raise ValueError("Embeddings must be non-empty.") + if not isinstance(embeddings, pd.DataFrame): raise ValueError("Embeddings must be provided as a pandas DataFrame.") - + if tune and phenotype_data is None: raise ValueError("Phenotype data must be provided for classification-based tuning.") - + if seed is not None: torch.manual_seed(seed) if torch.cuda.is_available(): @@ -84,8 +84,8 @@ def __init__( self.logger.info(f"Seed set to {self.seed}.") self.omics_data = omics_data.copy(deep=True) - self.embeddings = embeddings.copy(deep=True) - self.phenotype_data = phenotype_data.copy(deep=True) + self.embeddings = embeddings.copy(deep=True) + self.phenotype_data = phenotype_data self.phenotype_col = phenotype_col self.reduce_method = reduce_method.upper() self.tune = tune @@ -107,18 +107,17 @@ def __init__( f"Embeddings: {self.embeddings.shape} and Omics: {self.omics_data.shape}" ) self.logger.info(f"Found {len(common_features)} common features between network and omics data.") - + # output directory if output_dir is None: 
self.temp_dir_obj = tempfile.TemporaryDirectory() - self.output_dir = self.temp_dir_obj.name - + self.output_dir = Path(self.temp_dir_obj.name) self.logger.info(f"No output_dir provided; using temporary directory: {self.output_dir}") else: self.output_dir = Path(output_dir) self.logger.info(f"Output directory set to: {self.output_dir}") - # create the directory with pathlib - Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + self.output_dir.mkdir(parents=True, exist_ok=True) def run(self) -> pd.DataFrame: """ @@ -126,7 +125,7 @@ def run(self) -> pd.DataFrame: If tuning is enabled, runs hyperparameter tuning and uses the best config to reduce embeddings. Otherwise, uses the default reduction method. Returns: - + - Enhanced omics data as a DataFrame. """ self.logger.info("Starting Subject Representation workflow.") @@ -144,7 +143,7 @@ def run(self) -> pd.DataFrame: ae_params = best_config.get("ae_params", {"epochs": 16, "hidden_dim": 8, "lr": 1e-3, "dropout": 0.2, "activation": "relu"}) reduced = self._reduce_embeddings(method=best_config["method"], ae_params=ae_params, compressed_dim=best_config["compressed_dim"]) enhanced_omics_data = self._integrate_embeddings(reduced=reduced, method=best_config["integration_method"], alpha=best_config["alpha"], beta=best_config["beta"]) - + else: method = self.reduce_method.upper() ae_params_def = {"epochs": 16, "hidden_dim": 8, "lr": 1e-3, "dropout": 0.2, "activation": "relu"} @@ -162,7 +161,7 @@ def run(self) -> pd.DataFrame: self.logger.error(f"Error in Subject Representation workflow: {e}") raise - def _reduce_embeddings(self, method: str, ae_params: dict = None, compressed_dim: int = 2) -> pd.DataFrame: + def _reduce_embeddings(self, method: str, ae_params: Optional[dict[Any, Any]] = None, compressed_dim: int = 2) -> pd.DataFrame: """ Reduces the dimensionality of the embeddings. Returns a DataFrame with `compressed_dim` columns. 
@@ -195,11 +194,11 @@ def _reduce_embeddings(self, method: str, ae_params: dict = None, compressed_dim elif method in ["ae"]: self.logger.info("Using autoencoder for reduction.") ae_params = ae_params or { - "epochs": 64, - "hidden_dim": 8, - "lr": 1e-3, - "dropout": 0.2, - "activation": "relu", + "epochs": 64, + "hidden_dim": 8, + "lr": 1e-3, + "dropout": 0.2, + "activation": "relu", } X = torch.tensor(self.embeddings.values, dtype=torch.float) @@ -226,7 +225,7 @@ def _reduce_embeddings(self, method: str, ae_params: dict = None, compressed_dim model.eval() with torch.no_grad(): z, _ = model(X) - + z_np = z.detach().cpu().numpy() if z_np.ndim == 1: z_np = z_np.reshape(-1, 1) @@ -245,18 +244,18 @@ def _reduce_embeddings(self, method: str, ae_params: dict = None, compressed_dim def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha=2.0, beta=0.5) -> pd.DataFrame: """ Integrates the reduced embeddings with the omics data using a multiplicative approach. - + With the default parameters (alpha = 2.0, beta = 0.5), each feature is updated as: - + - enhanced = beta * raw + (1 - beta) * (alpha * normalized_weight * raw) - + For example, with alpha = 2.0 and beta = 0.5: - + - If a features normalized weight is 1.0: - enhanced = 0.5xraw + 0.5x(2.0x1.0xraw) = 0.5xraw + raw = 1.5xraw - If a features normalized weight is 0.5: - enhanced = 0.5xraw + 0.5x(2.0x0.5xraw) = 0.5xraw + 0.5xraw = raw - + This is so at least 50% of the final output is influenced by the computed weight """ missing_features = set(self.omics_data.columns) - set(reduced.index) @@ -268,7 +267,7 @@ def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha= if not reduced.index.equals(self.omics_data.columns): self.logger.info("Aligning reduced embeddings index with omics_data columns.") reduced = reduced.reindex(self.omics_data.columns) - + # normalize the embeddings and compute a weight series if isinstance(reduced, pd.DataFrame): for col in reduced.columns: @@ 
-278,7 +277,7 @@ def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha= weight_series = (reduced - reduced.min()) / (reduced.max() - reduced.min()) else: raise ValueError("Reduced embeddings must be a pandas DataFrame or Series.") - + weight_series = weight_series.fillna(0) ranks = weight_series.rank(method="average") scaled_ranks = 2 * (ranks - ranks.min()) / (ranks.max() - ranks.min()) - 1 @@ -306,7 +305,7 @@ def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha= self.logger.info(f"Final Enhanced Omics Shape: {enhanced.shape}") return enhanced - + def _run_tuning(self) -> Dict[str, Any]: """ Runs tuning for SubjectRepresentation. @@ -325,7 +324,7 @@ def _run_classification_tuning(self) -> Dict[str, Any]: """ search_config = { "method": tune.choice(["PCA", "AE"]), - "compressed_dim": tune.choice([1, 2, 3, 4]), + "compressed_dim": tune.choice([1, 2, 3, 4]), "ae_params": { "epochs": tune.choice([64, 128, 256, 512, 1024]), "hidden_dim": tune.choice([16, 32, 64, 128, 256, 512]), @@ -334,7 +333,7 @@ def _run_classification_tuning(self) -> Dict[str, Any]: "activation": tune.choice(["relu", "tanh", "sigmoid"]), }, "integration_method": tune.choice(["multiply"]), - "alpha": tune.choice([1.5, 2.0, 2.5, 3.0]), + "alpha": tune.choice([1.5, 2.0, 2.5, 3.0]), "beta": tune.choice([0.1, 0.3, 0.5, 0.7]), } @@ -342,11 +341,11 @@ def tune_helper(config): try: method = config["method"].upper() ae_params = config.get("ae_params", { - "epochs": 64, - "hidden_dim": 4, - "dropout": 0.2, - "lr": 1e-3, - "activation": "relu", + "epochs": 64, + "hidden_dim": 4, + "dropout": 0.2, + "lr": 1e-3, + "activation": "relu", }) alpha = config.get("alpha", 2.0) beta = config.get("beta", 0.5) @@ -409,7 +408,7 @@ def short_dirname_creator(trial): analysis = e.args[1] best_trial = None - best_score = 0.0 + best_score = 0.0 if analysis and hasattr(analysis, "get_best_trial"): try: @@ -431,7 +430,7 @@ def short_dirname_creator(trial): 
self.logger.info(f"Best Graph Embedding parameters saved to {best_params_file}") else: self.logger.info("No valid best trial config found; skipping save.") - + return best_trial.config if best_trial else {} def get_activation(activation: str): @@ -464,16 +463,15 @@ class AutoEncoder(nn.Module): Builds encoder and decoder layers based on a list of hidden dimensions. Allows tuning of dropout, activation, and network architecture. """ - def __init__(self, input_dim: int, hidden_dims: int = 64, compressed_dim: int = 1, - dropout: float = 0.0, activation: str = "relu"): + def __init__(self, input_dim: int, hidden_dims: Union[int, Sequence[int]] = 64, compressed_dim: int = 1,dropout: float = 0.0, activation: str = "relu"): super(AutoEncoder, self).__init__() self.activation = get_activation(activation) - + if isinstance(hidden_dims, (int, float)): hidden_dims = generate_hidden_dims(int(hidden_dims)) elif not isinstance(hidden_dims, list): raise ValueError("hidden_dims must be an int, float, or a list of ints.") - + # encoder: encoder_layers = [] current_dim = input_dim @@ -485,7 +483,7 @@ def __init__(self, input_dim: int, hidden_dims: int = 64, compressed_dim: int = encoder_layers.append(nn.Linear(current_dim, compressed_dim)) # the * operator unpacks the list into separate arguments self.encoder = nn.Sequential(*encoder_layers) - + # decoder: decoder_layers = [] current_dim = compressed_dim @@ -496,7 +494,7 @@ def __init__(self, input_dim: int, hidden_dims: int = 64, compressed_dim: int = current_dim = h_dim decoder_layers.append(nn.Linear(current_dim, input_dim)) self.decoder = nn.Sequential(*decoder_layers) - + def forward(self, x): z = self.encoder(x) recon = self.decoder(z) diff --git a/bioneuralnet/external_tools/SmCCNet.R b/bioneuralnet/external_tools/SmCCNet.R index 6ac4375..db325e2 100644 --- a/bioneuralnet/external_tools/SmCCNet.R +++ b/bioneuralnet/external_tools/SmCCNet.R @@ -3,7 +3,7 @@ library("SmCCNet") library("WGCNA") library("jsonlite") 
-library("dplyr") +library("dplyr") options(stringsAsFactors = FALSE) allowWGCNAThreads(nThreads = 4) @@ -69,7 +69,7 @@ omics_list <- lapply(omics_list, function(m) m[ids, , drop = FALSE]) phenotype_df <- phenotype_df[ids, , drop = FALSE] Y <- as.numeric(phenotype_df$phenotype) -Yfactor <- factor(Y) +Yfactor <- factor(Y) args <- commandArgs(trailingOnly = TRUE) if (length(args) < 10) { @@ -146,7 +146,7 @@ if (any(is.na(Y)) || any(is.infinite(Y))) { stop("ERROR: Phenotype vector Y contains NA or Inf") } -#Y_binary <- ifelse(Y > median(Y), 1, 0) +#Y_binary <- ifelse(Y > median(Y), 1, 0) if (length(data_types) == 1 && !is.null(ncomp_pls)) { message("Single-omics PLS scenario") @@ -224,16 +224,16 @@ if (length(rdata_files) == 0) { } else { for (file in rdata_files) { message("Processing file: ", file, "\n") - + temp_env <- new.env() loaded_names <- load(file, envir = temp_env) - + if ("M" %in% loaded_names && exists("M", envir = temp_env)) { sub_net <- get("M", envir = temp_env) - + file_base <- tools::file_path_sans_ext(basename(file)) csv_filename <- paste0(file_base, ".csv") - + write.csv(sub_net, file = csv_filename, row.names = TRUE) message("Subnetwork matrix from ", file, " written to ", csv_filename, "\n\n") } else { @@ -242,4 +242,4 @@ if (length(rdata_files) == 0) { } } -quit(status = 0) \ No newline at end of file +quit(status = 0) diff --git a/bioneuralnet/external_tools/smccnet.py b/bioneuralnet/external_tools/smccnet.py index 6486656..733e866 100644 --- a/bioneuralnet/external_tools/smccnet.py +++ b/bioneuralnet/external_tools/smccnet.py @@ -4,7 +4,7 @@ from pathlib import Path import json import tempfile -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional, Union from ..utils.logger import get_logger import shutil @@ -25,7 +25,7 @@ class SmCCNet: This class handles the preprocessing of omics data, execution of the SmCCNet R script, and retrieval of the resulting adjacency matrix from a designated output directory. 
- + Attributes: phenotype_df (pd.DataFrame): DataFrame containing phenotype data, shape [samples x 1 or more]. @@ -46,15 +46,15 @@ def __init__( omics_dfs: List[pd.DataFrame], data_types: List[str], kfold: int = 5, - eval_method: str = "", + eval_method: str = "", subSampNum: int = 500, summarization: str = "NetSHy", seed: int = 119, - ncomp_pls: int = 0, + ncomp_pls: int = 0, between_shrinkage: float = 5.0, cut_height: float = (1.0 - 0.1**10.0), preprocess: int = 0, - output_dir: str = None + output_dir: Optional[Union[str, Path]] = None, ): """ Initializes the SmCCNet instance. @@ -86,18 +86,18 @@ def __init__( self.logger.info(f"Using R script via importlib: {self.r_script}") except Exception: - script_dir = os.path.dirname(os.path.abspath(__file__)) - r_script_path = os.path.join(script_dir, "SmCCNet.R") + script_dir = Path(__file__).resolve().parent + r_script_path = script_dir / "SmCCNet.R" - if not os.path.isfile(r_script_path): + if not r_script_path.is_file(): raise FileNotFoundError(f"SmCCNet.R script not found via importlib or local path: {r_script_path}") - self.r_script = r_script_path + self.r_script = str(r_script_path) self.logger.warning(f"Using fallback R script path: {self.r_script}") - + if isinstance(phenotype_df, pd.Series): phenotype_df = phenotype_df.to_frame(name="phenotype") - + if isinstance(phenotype_df, pd.DataFrame) and phenotype_df.shape[1] > 1: self.logger.warning("Phenotype DataFrame has more than one column. 
Renaming to phenotype and keeping only the first column") phenotype_df = phenotype_df.iloc[:, :1] @@ -105,7 +105,7 @@ def __init__( if not isinstance(phenotype_df, pd.DataFrame): raise ValueError("phenotype_df must be a pandas DataFrame or Series.") - + self.phenotype_df = phenotype_df.copy(deep=True) self.omics_dfs = [] @@ -138,22 +138,22 @@ def __init__( if len(self.omics_dfs) != len(self.data_types): self.logger.error("Number of omics DataFrames does not match number of data types.") raise ValueError("Mismatch between omics dataframes and data types.") - + if eval_method in ("auc","accuracy","f1"): uniques = set(phenotype_df.iloc[:, 0].unique()) if not uniques.issubset({0,1}): raise ValueError("eval_method=classification, but phenotype is not strictly 0/1.") - + if eval_method == "Rsquared" and ncomp_pls>0: raise ValueError("Continuous eval can't use PLS. Set ncomp_pls=0 for CCA.") # output directory if output_dir is None: self.temp_dir_obj = tempfile.TemporaryDirectory() - self.output_dir = self.temp_dir_obj.name + self.output_dir = Path(self.temp_dir_obj.name) self.logger.info(f"No output_dir provided; using temporary directory: {self.output_dir}") else: - self.output_dir = output_dir + self.output_dir = Path(output_dir) self.logger.info(f"Output directory set to: {self.output_dir}") # create the directory with pathlib Path(self.output_dir).mkdir(parents=True, exist_ok=True) @@ -184,8 +184,8 @@ def preprocess_data(self) -> Dict[str, Any]: self.logger.info("Validating and serializing input data for SmCCNet...") if self.phenotype_df.columns[0] != "phenotype": - self.logger.warning("Renaming target column to 'phenotype' for consistency.") - self.phenotype_df.columns = ["phenotype"] + self.logger.warning("Renaming target column to 'phenotype' for consistency.") + self.phenotype_df.columns = ["phenotype"] # if index_match == True: @@ -233,7 +233,7 @@ def preprocess_data(self) -> Dict[str, Any]: self.logger.info(f"Serialized phenotype with {len(pheno_df)} samples.") 
return serialized_data - + def run_smccnet(self, serialized_data: Dict[str, Any]) -> None: """ @@ -243,7 +243,7 @@ def run_smccnet(self, serialized_data: Dict[str, Any]) -> None: try: self.logger.info("Executing SmCCNet R script...") json_data = json.dumps(serialized_data) + "\n" - + # script_dir = os.path.dirname(os.path.abspath(__file__)) # r_script = os.path.join(script_dir, "SmCCNet.R") @@ -253,7 +253,6 @@ def run_smccnet(self, serialized_data: Dict[str, Any]) -> None: # rscript_path = shutil.which("Rscript") # if rscript_path is None: # raise EnvironmentError("Rscript not found in system PATH.") - cmd = [ self.rscript_path, self.r_script, @@ -268,7 +267,7 @@ def run_smccnet(self, serialized_data: Dict[str, Any]) -> None: str(self.cut_height), str(self.preprocess), ] - self.logger.debug("Running command: " + " ".join(cmd)) + self.logger.info(f"Running command: {cmd}") # fire off spinner thread stop_spinner = threading.Event() @@ -283,10 +282,15 @@ def spinner(): spin_thread = threading.Thread(target=spinner) spin_thread.start() + cmd_clean: list[str] = [] + for c in cmd: + if c is None: + raise ValueError("Command argument cannot be None") + cmd_clean.append(str(c)) # run Rscript (blocks until done) result = subprocess.run( - cmd, + cmd_clean, input=json_data, text=True, capture_output=True, @@ -316,7 +320,7 @@ def spinner(): raise - def get_clusters(self) -> list[pd.DataFrame, Any]: + def get_clusters(self) -> list[Any]: """ Retrieves the subnetwork clusters generated by SmCCNet. 
diff --git a/bioneuralnet/metrics/__init__.py b/bioneuralnet/metrics/__init__.py index e5f3d08..0e8c7bb 100644 --- a/bioneuralnet/metrics/__init__.py +++ b/bioneuralnet/metrics/__init__.py @@ -1,9 +1,9 @@ from .correlation import omics_correlation, cluster_correlation, louvain_to_adjacency from .plot import plot_variance_distribution, plot_variance_by_feature, plot_performance_three, plot_performance, plot_embeddings, plot_network, compare_clusters -from .evaluation import evaluate_model, evaluate_rf, evaluate_xgb, evaluate_f1m, evaluate_f1w, plot_multiple_metrics, evaluate_single_run +from .evaluation import evaluate_model, evaluate_rf, evaluate_f1m, evaluate_f1w, plot_multiple_metrics, evaluate_single_run __all__ = ["omics_correlation", "cluster_correlation", "louvain_to_adjacency", "plot_variance_distribution", "plot_variance_by_feature", "plot_performance_three", "plot_performance", "plot_embeddings", "plot_network", "compare_clusters", - "evaluate_model", "evaluate_rf", "evaluate_xgb", "evaluate_single_run", "evaluate_f1m", "evaluate_f1w", - "plot_multiple_metrics"] \ No newline at end of file + "evaluate_model", "evaluate_rf", "evaluate_single_run", "evaluate_f1m", "evaluate_f1w", + "plot_multiple_metrics"] diff --git a/bioneuralnet/metrics/correlation.py b/bioneuralnet/metrics/correlation.py index 9d7b294..5b37818 100644 --- a/bioneuralnet/metrics/correlation.py +++ b/bioneuralnet/metrics/correlation.py @@ -31,7 +31,7 @@ def omics_correlation(omics: pd.DataFrame, pheno: pd.DataFrame) -> Tuple[float, if omics.empty or target.empty: logger.error("Omics data and phenotype must not be empty.") raise ValueError("Omics data and phenotype must not be empty.") - + if omics.shape[0] != len(target): logger.error("Number of rows in omics data and phenotype must be the same.") raise ValueError("Omics data and phenotype must have the same length.") @@ -50,12 +50,12 @@ def omics_correlation(omics: pd.DataFrame, pheno: pd.DataFrame) -> Tuple[float, def 
cluster_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tuple: """ Compute the Pearson correlation coefficient between PC1 of a cluster and phenotype. - + Parameters: - + cluster_df: DataFrame representing a cluster of samples. pheno: DataFrame representing the phenotype. - + Returns: (cluster_size, correlation) or (size, None) if correlation fails. @@ -67,7 +67,7 @@ def cluster_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tuple: return (cluster_size, None) subset = cluster_df.fillna(0) - + if subset.var().sum() == 0: logger.warning("Cluster skipped: all features have zero variance.") return (cluster_size, None) @@ -77,7 +77,7 @@ def cluster_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tuple: pc1 = pca.fit_transform(subset) pc1_series = pd.Series(pc1.flatten(), index=subset.index, name="PC1") - pheno_series = pheno.iloc[:, 0] + pheno_series = pheno.iloc[:, 0] pc1_series, pheno_series = pc1_series.align(pheno_series, join="inner") if len(pc1_series) < 3: @@ -95,11 +95,11 @@ def cluster_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tuple: def louvain_to_adjacency(louvain_cluster: pd.DataFrame) -> pd.DataFrame: """ Convert a Louvain cluster to an adjacency matrix. - + Parameters: louvain_cluster: represents an induced subnetwork (from Louvain). 
- + Returns: pd.DataFrame: Adjacency matrix diff --git a/bioneuralnet/metrics/evaluation.py b/bioneuralnet/metrics/evaluation.py index e61fda9..73cf7bd 100644 --- a/bioneuralnet/metrics/evaluation.py +++ b/bioneuralnet/metrics/evaluation.py @@ -1,23 +1,27 @@ import numpy as np from pathlib import Path +from typing import Union, Optional, Tuple, cast import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, r2_score, f1_score -from xgboost import XGBClassifier, XGBRegressor from bioneuralnet.utils import get_logger logger = get_logger(__name__) -def evaluate_model(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 150,runs: int = 100,seed: int = 119,): +def evaluate_model(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 150,runs: int = 100,seed: int = 119,) -> Tuple[ + Tuple[float, float], + Tuple[Optional[float], Optional[float]], + Tuple[Optional[float], Optional[float]] + ]: """ - Evaluate a single model (RF or XGB, classif or reg) over multiple runs, returning three tuples. + Evaluate a single model (RF classif or reg) over multiple runs, returning three tuples. 
For classification: - (accuracy_mean, accuracy_std) - (f1_weighted_mean, f1_weighted_std) - (f1_macro_mean, f1_macro_std) - + For regression: - (r2_mean, r2_std) @@ -29,20 +33,14 @@ def evaluate_model(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_ for run in range(runs): stratify = y if is_classif else None - X_tr, X_te, y_tr, y_te = train_test_split( - X, y, test_size=0.3, random_state=seed + run, stratify=stratify - ) + X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=seed + run, stratify=stratify) if model_type == "rf_classif": mdl = RandomForestClassifier(n_estimators=n_estimators, random_state=seed + run) - elif model_type == "xgb_classif": - mdl = XGBClassifier( n_estimators=n_estimators, eval_metric="logloss", random_state=seed + run) elif model_type == "rf_reg": mdl = RandomForestRegressor(n_estimators=n_estimators, random_state=seed + run) - elif model_type == "xgb_reg": - mdl = XGBRegressor(n_estimators=n_estimators, random_state=seed + run) else: - raise ValueError("model_type must be one of: rf_classif, xgb_classif, rf_reg, xgb_reg") + raise ValueError("model_type must be one of: rf_classif, rf_reg") mdl.fit(X_tr, y_tr) y_pred = mdl.predict(X_te) @@ -65,10 +63,10 @@ def evaluate_model(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_ (None, None), (None, None)) -def evaluate_single_run(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 100,test_size: float = 0.3,seed: int = 119): +def evaluate_single_run(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 100,test_size: float = 0.3,seed: int = 119) -> Tuple[float, float, float]: """ Do one train/test split, train the specified model. 
- + Return: (accuracy, f1_weighted, f1_macro) """ stratify = y if "classif" in model_type else None @@ -76,10 +74,8 @@ def evaluate_single_run(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classi if model_type == "rf_classif": mdl = RandomForestClassifier(n_estimators=n_estimators, random_state=seed) - elif model_type == "xgb_classif": - mdl = XGBClassifier(n_estimators=n_estimators,eval_metric="logloss",random_state=seed) else: - raise ValueError("model_type must be 'rf_classif' or 'xgb_classif'") + raise ValueError("model_type must be rf_classif") mdl.fit(X_tr, y_tr) y_pred = mdl.predict(X_te) @@ -90,22 +86,19 @@ def evaluate_single_run(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classi return acc, f1w, f1m -def evaluate_rf(X: np.ndarray,y: np.ndarray,mode: str = "classification",n_estimators: int = 150,runs: int = 100,seed: int = 119,return_all: bool = False): +def evaluate_rf(X: np.ndarray,y: np.ndarray,mode: str = "classification",n_estimators: int = 150,runs: int = 100,seed: int = 119) -> Tuple[ + Tuple[float, float], + Tuple[Optional[float], Optional[float]], + Tuple[Optional[float], Optional[float]] + ]: """ Shortcut function: evaluate a RandomForest (classification or regression). """ mt = "rf_classif" if mode == "classification" else "rf_reg" - return evaluate_model(X, y, model_type=mt, n_estimators=n_estimators,runs=runs, seed=seed, return_all=return_all) - -def evaluate_xgb(X: np.ndarray,y: np.ndarray,mode: str = "classification",n_estimators: int = 150,runs: int = 100,seed: int = 119,return_all: bool = False): - """ - Shortcut function: evaluate an XGBoost (classification or regression). 
- """ + return evaluate_model(X, y, model_type=mt, n_estimators=n_estimators,runs=runs, seed=seed) - mt = "xgb_classif" if mode == "classification" else "xgb_reg" - return evaluate_model(X, y, model_type=mt, n_estimators=n_estimators, runs=runs, seed=seed, return_all=return_all) -def evaluate_f1w(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 100,runs: int = 5,seed: int = 119): +def evaluate_f1w(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 100,runs: int = 5,seed: int = 119) -> Tuple[float, float]: """ Evaluate weighted F1-score over multiple runs. """ @@ -115,18 +108,16 @@ def evaluate_f1w(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_es X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed+run, stratify=stratify) if model_type == "rf_classif": mdl = RandomForestClassifier(n_estimators=n_estimators, random_state=seed+run) - elif model_type == "xgb_classif": - mdl = XGBClassifier(n_estimators=n_estimators,eval_metric="logloss",random_state=seed+run) else: raise ValueError("Unsupported model_type for F1 scoring") - + mdl.fit(X_train, y_train) y_pred = mdl.predict(X_test) scores.append(f1_score(y_test, y_pred, average="weighted")) return np.mean(scores), np.std(scores) -def evaluate_f1m(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 100,runs: int = 5,seed: int = 119): +def evaluate_f1m(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_estimators: int = 100,runs: int = 5,seed: int = 119) -> Tuple[float, float]: """Evaluate macro F1-score over multiple runs.""" scores = [] for run in range(runs): @@ -136,52 +127,59 @@ def evaluate_f1m(X: np.ndarray,y: np.ndarray,model_type: str = "rf_classif",n_es if model_type == "rf_classif": mdl = RandomForestClassifier(n_estimators=n_estimators, random_state=seed+run) - elif model_type == "xgb_classif": - mdl = 
XGBClassifier(n_estimators=n_estimators,eval_metric="logloss",random_state=seed+run) else: raise ValueError("Unsupported model_type for F1 scoring") - + mdl.fit(X_train, y_train) y_pred = mdl.predict(X_test) scores.append(f1_score(y_test, y_pred, average="macro")) return np.mean(scores), np.std(scores) -def plot_grouped_performance(scores: dict[str, dict[str, tuple[float, float]]],title: str,ylabel: str = "Score",filename: str | Path = None): +def plot_grouped_performance( + scores: dict[str, dict[str, tuple[float, float]]], + title: str, + ylabel: str = "Score", + filename: Optional[Union[str, Path]] = None +) -> None: """ Plot grouped bar chart and save results to text file. """ - logger.info(f"Plotting grouped performance: {title}") groups = list(scores.keys()) sublabels = list(next(iter(scores.values())).keys()) - means = [] - errs = [] + + means_list: list[list[float]] = [] + errs_list: list[list[float]] = [] for g in groups: - row_m = [] - row_e = [] + row_m: list[float] = [] + row_e: list[float] = [] for s in sublabels: m, e = scores[g][s] row_m.append(m) row_e.append(e) - means.append(row_m) - errs.append(row_e) - means = np.array(means) - errs = np.array(errs) + means_list.append(row_m) + errs_list.append(row_e) + + means = cast(np.ndarray, np.array(means_list, dtype=float)) + errs = cast(np.ndarray, np.array(errs_list, dtype=float)) + ind = np.arange(len(groups)) width = 0.8 / len(sublabels) - fig, ax = plt.subplots(figsize=(1.2*len(groups), 4)) + fig, ax = plt.subplots(figsize=(1.2 * len(groups), 4)) + for i, s in enumerate(sublabels): ax.bar( - ind + i*width, + ind + i * width, means[:, i], width, yerr=errs[:, i], capsize=3, label=s ) - ax.set_xticks(ind + width*(len(sublabels)-1)/2) + + ax.set_xticks(ind + width * (len(sublabels) - 1) / 2) ax.set_xticklabels(groups, fontsize=11) ax.set_ylabel(ylabel, fontsize=12) ax.set_title(title, fontsize=14, pad=12) @@ -189,12 +187,22 @@ def plot_grouped_performance(scores: dict[str, dict[str, tuple[float, 
float]]],t ax.set_ylim(0, 1) ax.legend(title="Method", fontsize=10) ax.grid(True, axis="y", linestyle="--", alpha=0.3) + + # Annotate bars with values for i in range(len(groups)): for j in range(len(sublabels)): - x = ind[i] + j*width + x = ind[i] + j * width h = means[i, j] e = errs[i, j] - ax.text(x + width/2, h + e + 0.01, f"{h:.3f}", ha="center", va="bottom", fontsize=9) + ax.text( + x + width / 2, + h + e + 0.01, + f"{h:.3f}", + ha="center", + va="bottom", + fontsize=9 + ) + plt.tight_layout() if filename: @@ -208,17 +216,21 @@ def plot_grouped_performance(scores: dict[str, dict[str, tuple[float, float]]],t logger.info(f"Saved results to {txt_path}") fig.savefig(str(fig_path), dpi=300) logger.info(f"Saved plot to {fig_path}") + plt.show() -def plot_multiple_metrics(metrics: dict[str, dict[str, dict[str, tuple[float, float]]]],title_map: dict[str, str] = None,ylabel_map: dict[str, str] = None,filename: Path = None): +def plot_multiple_metrics( + metrics: dict[str, dict[str, dict[str, tuple[float, float]]]], + title_map: Optional[dict[str, str]] = None, + ylabel_map: Optional[dict[str, str]] = None, + filename: Optional[Union[str, Path]] = None +) -> None: """ Consolidate multiple metric grouped performances into one figure. - - Adds numeric labels on top of each bar. 
""" logger.info(f"Plotting multiple metrics: {list(metrics.keys())}") n = len(metrics) - fig, axes = plt.subplots(1, n, figsize=(5*n, 5), sharey=True) + fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), sharey=True) if n == 1: axes = [axes] @@ -226,63 +238,47 @@ def plot_multiple_metrics(metrics: dict[str, dict[str, dict[str, tuple[float, fl groups = list(sc.keys()) sublabels = list(next(iter(sc.values())).keys()) - means = [] - errs = [] + means_list: list[list[float]] = [] + errs_list: list[list[float]] = [] for g in groups: - rm = [] - re = [] - + row_m: list[float] = [] + row_e: list[float] = [] for s in sublabels: m, e = sc[g][s] - rm.append(m) - re.append(e) + row_m.append(m) + row_e.append(e) + means_list.append(row_m) + errs_list.append(row_e) - means.append(rm) - errs.append(re) - - means = np.array(means) - errs = np.array(errs) + means = cast(np.ndarray, np.array(means_list, dtype=float)) + errs = cast(np.ndarray, np.array(errs_list, dtype=float)) ind = np.arange(len(groups)) total = 0.7 width = total / len(sublabels) - # plot bars and annotate for i in range(len(sublabels)): x = ind + i * width y = means[:, i] yerr = errs[:, i] bars = ax.bar(x, y, width, yerr=yerr, capsize=3, label=sublabels[i]) - - # add value labels for bar in bars: height = bar.get_height() - x = bar.get_x() + bar.get_width() / 2 - y = height + 0.01 - label = f"{height:.2f}" - - ax.text(x,y,label,ha='center', va='bottom', fontsize=8) + bar_x = bar.get_x() + bar.get_width() / 2 + bar_y = height + 0.01 + ax.text(bar_x, bar_y, f"{height:.2f}", ha='center', va='bottom', fontsize=8) - ax.set_xticks(ind + total/2 - width/2) + ax.set_xticks(ind + total / 2 - width / 2) ax.set_xticklabels(groups, fontsize=11) - if title_map: - title = title_map.get(metric, metric) - else: - title = metric - ax.set_title(title, fontsize=14, pad=12) - - if ylabel_map: - ylabel = ylabel_map.get(metric, metric) - else: - ylabel = metric + title = title_map.get(metric, metric) if title_map else metric + ylabel = 
ylabel_map.get(metric, metric) if ylabel_map else metric + ax.set_title(title, fontsize=14, pad=12) ax.set_ylabel(ylabel, fontsize=12) - if "Accuracy" in ylabel or "F1" in ylabel: ax.set_ylim(0, 1) - ax.legend(title="Method", fontsize=9) ax.grid(True, axis="y", linestyle="--", alpha=0.3) diff --git a/bioneuralnet/metrics/plot.py b/bioneuralnet/metrics/plot.py index 33a75c6..1308c93 100644 --- a/bioneuralnet/metrics/plot.py +++ b/bioneuralnet/metrics/plot.py @@ -22,9 +22,9 @@ def plot_variance_distribution(df: pd.DataFrame, bins: int = 50): df (pd.DataFrame): Input data. bins (int): Number of bins for the histogram. - + Returns: - + matplotlib.figure.Figure: Generated figure. """ variances = df.var() @@ -35,14 +35,14 @@ def plot_variance_distribution(df: pd.DataFrame, bins: int = 50): ax.set_title("Distribution of Feature Variances") ax.set_xlabel("Variance") ax.set_ylabel("Frequency") - + logger.info("Variance distribution plot generated.") return fig def plot_variance_by_feature(df: pd.DataFrame): """ Plot the variance for each feature against its index or name. - + Parameters: df (pd.DataFrame): Input data. @@ -60,7 +60,7 @@ def plot_variance_by_feature(df: pd.DataFrame): ax.set_xlabel("Feature") ax.set_ylabel("Variance") ax.tick_params(axis='x', rotation=90) - + logger.info("Variance vs. feature index plot generated.") return fig @@ -72,7 +72,7 @@ def plot_performance_three(raw_score, gnn_score, other_score, labels=["Raw","GNN raise ValueError("Scores must be tuples of (mean, std)") scores = [raw_score[0], gnn_score[0], other_score[0]] errors = [raw_score[1], gnn_score[1], other_score[1]] - + x = np.arange(len(scores)) width = 0.23 @@ -144,19 +144,19 @@ def parse_score(x): if filename: plt.savefig(str(filename), dpi=300, bbox_inches="tight") print(f"Saved plot to {filename}") - + plt.show() def plot_embeddings(embeddings, node_labels=None): """ Plot the embeddings in 2D space using t-SNE. - + Parameters: embeddings (array-like): High-dimensional embedding data. 
node_labels (array-like or DataFrame, optional): Labels for the nodes to color the points. - + """ X = np.array(embeddings) @@ -165,9 +165,9 @@ def plot_embeddings(embeddings, node_labels=None): logger.info(f"Skipping plot: not enough samples ({X.shape[0]}) for TSNE.") return reducer = TSNE(n_components=2, init="pca", perplexity=perplexity) - + X_reduced = reducer.fit_transform(X) - + if node_labels is None: c_values = np.zeros(X.shape[0]) elif hasattr(node_labels, "iloc"): @@ -175,18 +175,18 @@ def plot_embeddings(embeddings, node_labels=None): c_values = np.array(node_labels.iloc[:, 0], dtype=float).flatten() else: c_values = np.array(node_labels, dtype=float).flatten() - + fig, ax = plt.subplots(figsize=(10, 8)) - + scatter = ax.scatter( X_reduced[:, 0], X_reduced[:, 1], c=c_values, cmap="viridis", s=60, alpha=0.9, - edgecolor="k" + edgecolor="k" ) - + ax.invert_yaxis() ax.set_title(f"Embeddings in 2D space from {embeddings.shape[1]}D") @@ -205,9 +205,9 @@ def plot_network(adjacency_matrix, weight_threshold=0.0, show_labels=False, show weight_threshold (float): Minimum weight to keep an edge (default: 0.0). show_labels (bool): Whether to show node labels. show_edge_weights (bool): Whether to show edge weights. - + Returns: - + pd.DataFrame: Mapping of node indexes to actual gene names. 
""" full_G = nx.from_pandas_adjacency(adjacency_matrix) @@ -218,13 +218,13 @@ def plot_network(adjacency_matrix, weight_threshold=0.0, show_labels=False, show if weight_threshold > 0: edges_to_remove = [] - + for u, v, d in G.edges(data=True): weight = d.get('weight', 0) if weight < weight_threshold: edges_to_remove.append((u, v)) - G.remove_edges_from(edges_to_remove) + G.remove_edges_from(edges_to_remove) isolated_nodes = list(nx.isolates(G)) G.remove_nodes_from(isolated_nodes) @@ -269,11 +269,11 @@ def plot_network(adjacency_matrix, weight_threshold=0.0, show_labels=False, show if show_edge_weights and edge_weights: edge_labels = nx.get_edge_attributes(G, 'weight') - + formatted_edge_labels = {} for edge, weight in edge_labels.items(): formatted_edge_labels[edge] = f"{weight:.4f}" - + nx.draw_networkx_edge_labels(G, pos, edge_labels=formatted_edge_labels, font_size=9, ax=ax_graph) if show_labels: @@ -305,13 +305,13 @@ def plot_network(adjacency_matrix, weight_threshold=0.0, show_labels=False, show return mapping_df -def compare_clusters(louvain_clusters: list, smccnet_clusters: list, pheno: pd.DataFrame, +def compare_clusters(louvain_clusters: list, smccnet_clusters: list, pheno: pd.DataFrame, omics_merged: pd.DataFrame, label1: str = "Louvain", label2: str = "SmCCNet"): """ Compare clusters from two methods by computing the correlation for each induced subnetwork. Both inputs are expected to be lists of pandas DataFrames. If the lists have different lengths, only the first min(n, m) clusters are compared. - + Parameters: louvain_clusters: list of pd.DataFrame @@ -326,7 +326,7 @@ def compare_clusters(louvain_clusters: list, smccnet_clusters: list, pheno: pd.D Label for the first method. label2: str Label for the second method. 
- + Returns: pd.DataFrame: Results table with cluster indices, sizes, and correlations @@ -334,12 +334,12 @@ def compare_clusters(louvain_clusters: list, smccnet_clusters: list, pheno: pd.D smccnet_clusters_fixed = [] for cluster_df in smccnet_clusters: - valid_genes = [] - + valid_genes = [] + for gene in cluster_df.index: if gene in omics_merged.columns: valid_genes.append(gene) - + if len(valid_genes) > 0: sample_level_data = omics_merged[valid_genes] smccnet_clusters_fixed.append(sample_level_data) @@ -349,7 +349,7 @@ def compare_clusters(louvain_clusters: list, smccnet_clusters: list, pheno: pd.D smccnet_clusters_fixed = smccnet_clusters_fixed[:min_len] results = [] - + for i, (df_louvain, df_smccnet) in enumerate(zip(louvain_clusters, smccnet_clusters_fixed), start=1): size_louvain, corr_louvain = cluster_correlation(df_louvain, pheno) size_smccnet, corr_smccnet = cluster_correlation(df_smccnet, pheno) @@ -357,23 +357,23 @@ def compare_clusters(louvain_clusters: list, smccnet_clusters: list, pheno: pd.D if corr_louvain is not None and corr_smccnet is not None: results.append((f"Cluster_{i}", size_louvain, corr_louvain, size_smccnet, corr_smccnet)) - df_results = pd.DataFrame(results, columns=["Cluster", "Louvain Size", "Louvain Correlation", + df_results = pd.DataFrame(results, columns=["Cluster", "Louvain Size", "Louvain Correlation", "SMCCNET Size", "SMCCNET Correlation"]) - + fig, ax = plt.subplots(figsize=(10, 5)) - - ax.plot(df_results.index + 1, df_results["Louvain Correlation"], marker="o", linestyle="-", + + ax.plot(df_results.index + 1, df_results["Louvain Correlation"], marker="o", linestyle="-", label=label1, color="blue") - ax.plot(df_results.index + 1, df_results["SMCCNET Correlation"], marker="s", linestyle="--", + ax.plot(df_results.index + 1, df_results["SMCCNET Correlation"], marker="s", linestyle="--", label=label2, color="red") for i, row in df_results.iterrows(): - ax.text(i + 1, row["Louvain Correlation"] + 0.05, - f"{row['Louvain 
Size']}", ha="center", fontsize=10, + ax.text(i + 1, row["Louvain Correlation"] + 0.05, + f"{row['Louvain Size']}", ha="center", fontsize=10, color="blue", fontweight="bold", bbox=dict(facecolor="white", alpha=0.7, edgecolor="none")) - - ax.text(i + 1, row["SMCCNET Correlation"] + 0.05, - f"{row['SMCCNET Size']}", ha="center", fontsize=10, + + ax.text(i + 1, row["SMCCNET Correlation"] + 0.05, + f"{row['SMCCNET Size']}", ha="center", fontsize=10, color="red", fontweight="bold", bbox=dict(facecolor="white", alpha=0.7, edgecolor="none")) ax.set_xticks(range(1, len(df_results) + 1)) diff --git a/bioneuralnet/network_embedding/gnn_embedding.py b/bioneuralnet/network_embedding/gnn_embedding.py index 97ffed7..0ffd7ed 100644 --- a/bioneuralnet/network_embedding/gnn_embedding.py +++ b/bioneuralnet/network_embedding/gnn_embedding.py @@ -40,7 +40,7 @@ class GNNEmbedding: """ GNNEmbedding Class for Generating Graph Neural Network (GNN) Based Embeddings. - + Attributes: adjacency_matrix : pd.DataFrame omics_data : pd.DataFrame @@ -77,14 +77,14 @@ def __init__( activation: str = "relu", seed: Optional[int] = None, tune: Optional[bool] = False, - output_dir: Optional[str] = None, + output_dir: Optional[Union[str, Path]] = None, ): """ Initializes the GNNEmbedding instance. 
""" self.logger = get_logger(__name__) - + # Input validation if adjacency_matrix.empty: raise ValueError("Adjacency matrix cannot be empty.") @@ -94,7 +94,7 @@ def __init__( raise ValueError("Adjacency matrix, omics data must have the same number of samples.") if clinical_data is not None and clinical_data.empty: raise ValueError("Clinical data was provided but is empty.") - + if isinstance(phenotype_data, pd.Series): self.phenotype_data = phenotype_data.copy(deep=True) @@ -111,7 +111,7 @@ def __init__( else: raise ValueError("Phenotype data must be a Series or a DataFrame.") - + if seed is not None: torch.manual_seed(seed) np.random.seed(seed) @@ -146,7 +146,7 @@ def __init__( if output_dir is None: self.temp_dir_obj = tempfile.TemporaryDirectory() - self.output_dir = self.temp_dir_obj.name + self.output_dir = Path(self.temp_dir_obj.name) self.logger.info(f"No output_dir provided; using temporary directory: {self.output_dir}") else: self.output_dir = Path(output_dir) @@ -196,7 +196,7 @@ def embed(self, as_df: bool = False) -> Union[torch.Tensor, pd.DataFrame]: self.logger.info(f"Retraining with best config: {best_config}") self.fit() self.logger.info("Model retrained with best hyperparameters.") - + try: self.embeddings = self._generate_embeddings(self.model, self.data) self.logger.info("Node embeddings generated.") @@ -235,7 +235,7 @@ def _tensor_to_df(self, embeddings_tensor: torch.Tensor, network: pd.DataFrame) except Exception as e: self.logger.error(f"Error during conversion: {e}") raise - + def _prepare_node_features(self) -> pd.DataFrame: """ 1. Align network & omics nodes. 
@@ -254,7 +254,7 @@ def _prepare_node_features(self) -> pd.DataFrame: if len(nodes) == 0: raise ValueError("No common features found between the network and omics data.") - + if len(nodes) != len(network_features): missing = set(network_features) - set(nodes) self.logger.warning(f"Length of common features: {len(nodes)}") @@ -306,9 +306,9 @@ def _prepare_node_features(self) -> pd.DataFrame: for cvar in clinical_cols: clinical_series = self.clinical_data[cvar].loc[common_index] corr_val = vec.corr(clinical_series) - + corr_vector[cvar] = corr_val if not pd.isna(corr_val) else 0.0 - + full_feature_vec = { "pagerank": pagerank[node], "eigenvector": eigenvector[node], @@ -317,7 +317,7 @@ def _prepare_node_features(self) -> pd.DataFrame: full_feature_vec.update(corr_vector) node_features_dict[node] = full_feature_vec - + node_features_df = pd.DataFrame.from_dict(node_features_dict, orient="index") self.logger.info(f"Built feature matrix with clinical correlations shape: {node_features_df.shape}") @@ -326,7 +326,7 @@ def _prepare_node_features(self) -> pd.DataFrame: if self.phenotype_data is None or self.phenotype_data.empty: raise ValueError("No phenotype data available for statistical features.") pheno = self.phenotype_data.loc[omics_filtered.index].dropna() - + stat_features = {} for node in nodes: vec = omics_filtered[node].loc[pheno.index].dropna() @@ -372,7 +372,7 @@ def _prepare_node_labels(self) -> pd.Series: samples = self.omics_data.index.intersection(self.phenotype_data.index) omics_data = self.omics_data.loc[samples, nodes] pheno = self.phenotype_data.loc[samples] - + if len(samples)==0: raise ValueError("No overlapping samples between omics and phenotype.") if len(nodes)==0: @@ -385,7 +385,7 @@ def _prepare_node_labels(self) -> pd.Series: val = vec.corr(pheno) labels_dict[node] = 0.0 if pd.isna(val) else val - + labels_series = pd.Series(labels_dict, index=nodes) ranks = labels_series.rank(method="average") @@ -522,7 +522,7 @@ def _generate_embeddings(self, 
model: nn.Module, data: Data) -> torch.Tensor: embeddings = model.get_embeddings(data) return embeddings.cpu() - + def _tune_helper(self, config): """ The function that each Ray Tune trial calls. @@ -553,8 +553,8 @@ def _tune_helper(self, config): X = node_embeddings.detach().cpu().numpy() - dim_stds = np.std(X, axis=0) - keep_dims = dim_stds >= 1e-4 + dim_stds = np.std(X, axis=0) + keep_dims = dim_stds >= 1e-4 num_dims_kept = np.sum(keep_dims) if num_dims_kept == 0: @@ -651,12 +651,12 @@ def short_dirname_creator(trial): best_config_json = json.dumps(best_trial.config, indent=4) try: - df = result.get_dataframe() + df = result.get_dataframe() except AttributeError: df = result.dataframe(metric="composite_score", mode="min") summary_file = save_dir / f"summary_{num_nodes}_{timestamp}.txt" - + with open(summary_file, "w") as f: f.write(f"Best trial\n") f.write(best_config_json) @@ -678,5 +678,5 @@ def short_dirname_creator(trial): json.dump(best_trial.config, f, indent=4) self.logger.info(f"Best embedding parameters saved to {best_params_file}") - + return best_trial.config diff --git a/bioneuralnet/network_embedding/gnn_models.py b/bioneuralnet/network_embedding/gnn_models.py index b0ad242..4d6a3bd 100644 --- a/bioneuralnet/network_embedding/gnn_models.py +++ b/bioneuralnet/network_embedding/gnn_models.py @@ -48,9 +48,9 @@ def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, final_layer in_dim = input_dim if i == 0 else hidden_dim self.convs.append(GCNConv(in_dim, hidden_dim)) self.bns.append(nn.Identity()) - + self.regressor = nn.Linear(hidden_dim, 1) if self.final_layer == "regression" else nn.Identity() - + def forward(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): @@ -61,7 +61,7 @@ def forward(self, data): x = F.dropout(x, p=self.dropout, training=self.training) x = self.regressor(x) return x - + def get_embeddings(self, data): x, edge_index = data.x, data.edge_index for conv, bn in 
zip(self.convs, self.bns): @@ -87,16 +87,16 @@ def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, heads=1, fi self.final_layer = final_layer self.heads = heads self.activation = get_activation(activation) - + self.convs = nn.ModuleList() self.bns = nn.ModuleList() for i in range(layer_num): in_dim = input_dim if i == 0 else hidden_dim * heads self.convs.append(GATConv(in_dim, hidden_dim, heads=heads)) self.bns.append(nn.Identity()) - + self.regressor = nn.Linear(hidden_dim * heads, 1) if self.final_layer == "regression" else nn.Identity() - + def forward(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): @@ -107,7 +107,7 @@ def forward(self, data): x = F.dropout(x, p=self.dropout, training=self.training) x = self.regressor(x) return x - + def get_embeddings(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): @@ -132,16 +132,16 @@ def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, final_layer self.dropout = process_dropout(dropout) self.final_layer = final_layer self.activation = get_activation(activation) - + self.convs = nn.ModuleList() self.bns = nn.ModuleList() for i in range(layer_num): in_dim = input_dim if i == 0 else hidden_dim self.convs.append(SAGEConv(in_dim, hidden_dim)) self.bns.append(nn.Identity()) - + self.regressor = nn.Linear(hidden_dim, 1) if self.final_layer == "regression" else nn.Identity() - + def forward(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): @@ -152,7 +152,7 @@ def forward(self, data): x = F.dropout(x, p=self.dropout, training=self.training) x = self.regressor(x) return x - + def get_embeddings(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): @@ -164,8 +164,8 @@ def get_embeddings(self, data): return x class GIN(nn.Module): - def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, 
final_layer="regression", activation="relu", seed=None): - + def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, final_layer="regression", activation="relu", seed=None): + if seed is not None: torch.manual_seed(seed) if torch.cuda.is_available(): @@ -178,7 +178,7 @@ def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, final_layer self.dropout = process_dropout(dropout) self.final_layer = final_layer self.activation = get_activation(activation) - + self.convs = nn.ModuleList() self.bns = nn.ModuleList() for i in range(layer_num): @@ -191,9 +191,9 @@ def __init__(self, input_dim, hidden_dim, layer_num=2, dropout=True, final_layer ) self.convs.append(GINConv(mlp)) self.bns.append(nn.Identity()) - + self.regressor = nn.Linear(hidden_dim, 1) if self.final_layer == "regression" else nn.Identity() - + def forward(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): @@ -204,7 +204,7 @@ def forward(self, data): x = F.dropout(x, p=self.dropout, training=self.training) x = self.regressor(x) return x - + def get_embeddings(self, data): x, edge_index = data.x, data.edge_index for conv, bn in zip(self.convs, self.bns): diff --git a/bioneuralnet/utils/__init__.py b/bioneuralnet/utils/__init__.py index fcc7c42..4a278a1 100644 --- a/bioneuralnet/utils/__init__.py +++ b/bioneuralnet/utils/__init__.py @@ -1,7 +1,7 @@ from .logger import get_logger from .rdata_convert import rdata_to_df from .data import variance_summary, zero_fraction_summary, expression_summary, correlation_summary, explore_data_stats -from .preprocess import preprocess_clinical, clean_inf_nan, select_top_k_variance, select_top_k_correlation, select_top_randomforest, top_anova_f_features, prune_network, prune_network_by_quantile, network_remove_low_variance, network_remove_high_zero_fraction +from .preprocess import preprocess_clinical, clean_inf_nan, select_top_k_variance, select_top_k_correlation, select_top_randomforest, 
top_anova_f_features, prune_network, prune_network_by_quantile, network_remove_low_variance, network_remove_high_zero_fraction from .graph import gen_similarity_graph, gen_correlation_graph, gen_threshold_graph, gen_gaussian_knn_graph, gen_lasso_graph, gen_mst_graph, gen_snn_graph @@ -9,4 +9,4 @@ "explore_data_stats", "preprocess_clinical", "clean_inf_nan", "select_top_k_variance", "select_top_k_correlation", "select_top_randomforest", "top_anova_f_features", "prune_network", "prune_network_by_quantile", "network_remove_low_variance", "network_remove_high_zero_fraction", "gen_similarity_graph", "gen_correlation_graph", "gen_threshold_graph", - "gen_gaussian_knn_graph", "gen_lasso_graph", "gen_mst_graph", "gen_snn_graph"] \ No newline at end of file + "gen_gaussian_knn_graph", "gen_lasso_graph", "gen_mst_graph", "gen_snn_graph"] diff --git a/bioneuralnet/utils/data.py b/bioneuralnet/utils/data.py index 7d51ceb..406a8fd 100644 --- a/bioneuralnet/utils/data.py +++ b/bioneuralnet/utils/data.py @@ -1,10 +1,11 @@ import pandas as pd import numpy as np +from typing import Optional from .logger import get_logger logger = get_logger(__name__) -def variance_summary(df: pd.DataFrame, low_var_threshold: float = None) -> dict: +def variance_summary(df: pd.DataFrame, low_var_threshold: Optional[float] = None) -> dict: """ Compute summary statistics for column variances in the DataFrame """ @@ -19,10 +20,10 @@ def variance_summary(df: pd.DataFrame, low_var_threshold: float = None) -> dict: } if low_var_threshold is not None: summary["num_low_variance_features"] = (variances < low_var_threshold).sum() - + return summary -def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: float = None) -> dict: +def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: Optional[float] = None) -> dict: """ Compute summary statistics for the fraction of zeros in each column """ @@ -37,7 +38,7 @@ def zero_fraction_summary(df: pd.DataFrame, high_zero_threshold: float = None) - } if 
high_zero_threshold is not None: summary["num_high_zero_features"] = (zero_fraction > high_zero_threshold).sum() - + return summary def expression_summary(df: pd.DataFrame) -> dict: @@ -81,16 +82,16 @@ def explore_data_stats(omics_df: pd.DataFrame, name: str = "Data") -> None: print(f"Statistics for {name}:") var_stats = variance_summary(omics_df, low_var_threshold=1e-4) print(f"Variance Summary: {var_stats}") - + zero_stats = zero_fraction_summary(omics_df, high_zero_threshold=0.50) print(f"Zero Fraction Summary: {zero_stats}") - + expr_stats = expression_summary(omics_df) print(f"Expression Summary: {expr_stats}") - + try: corr_stats = correlation_summary(omics_df) print(f"Correlation Summary: {corr_stats}") except Exception as e: print(f"Correlation Summary: Could not compute due to: {e}") - print("\n") \ No newline at end of file + print("\n") diff --git a/bioneuralnet/utils/graph.py b/bioneuralnet/utils/graph.py index cdff9d3..eac85b8 100644 --- a/bioneuralnet/utils/graph.py +++ b/bioneuralnet/utils/graph.py @@ -1,6 +1,7 @@ import torch import pandas as pd import numpy as np +from typing import Optional import torch.nn.functional as F from sklearn.covariance import GraphicalLasso @@ -27,7 +28,7 @@ def gen_similarity_graph(X:pd.DataFrame, k:int = 15, metric:str = "cosine", mutu x_torch = torch.tensor(X.values, dtype=torch.float32, device=device) else: raise TypeError("X must be a pandas.DataFrame") - + N = x_torch.size(0) # full similarity matrix @@ -41,7 +42,7 @@ def gen_similarity_graph(X:pd.DataFrame, k:int = 15, metric:str = "cosine", mutu # building the knn graph or global threshold mask if per_node: - _, index = torch.topk(S, k=k+1, dim=1) + _, index = torch.topk(S, k=k+1, dim=1) mask = torch.zeros(N, N, dtype=torch.bool, device=device) for i in range(N): for j in index[i, 1:k+1]: @@ -66,11 +67,11 @@ def gen_similarity_graph(X:pd.DataFrame, k:int = 15, metric:str = "cosine", mutu A_numpy = A.cpu().numpy() final_graph =pd.DataFrame(A_numpy, index=nodes, 
columns=nodes) - + return final_graph -def gen_correlation_graph(X: pd.DataFrame, k: int = 15,method: str = 'pearson', mutual: bool = False, per_node: bool = True,threshold: float = None, self_loops:bool = True) -> pd.DataFrame: +def gen_correlation_graph(X: pd.DataFrame, k: int = 15,method: str = 'pearson', mutual: bool = False, per_node: bool = True,threshold: Optional[float] = None, self_loops:bool = True) -> pd.DataFrame: """ Build a graph based on pairwise Pearson or Spearman correlations. @@ -100,7 +101,7 @@ def gen_correlation_graph(X: pd.DataFrame, k: int = 15,method: str = 'pearson', x_torch = torch.tensor(X.values, dtype=torch.float32, device=device) else: raise TypeError("X must be a pandas.DataFrame") - + N = x_torch.size(0) # rank transform for Spearman @@ -171,7 +172,7 @@ def gen_threshold_graph(X:pd.DataFrame, b: float = 6.0,k: int = 15, mutual: bool x_torch = torch.tensor(X.values, dtype=torch.float32, device=device) else: raise TypeError("X must be a pandas.DataFrame") - + N = x_torch.size(0) # pearson correlation matrix @@ -202,11 +203,11 @@ def gen_threshold_graph(X:pd.DataFrame, b: float = 6.0,k: int = 15, mutual: bool W.fill_diagonal_(1.0) W = F.normalize(W, p=1, dim=1) - final_graph = pd.DataFrame(W.cpu().numpy(), index=nodes, columns=nodes) + final_graph = pd.DataFrame(W.cpu().numpy(), index=nodes, columns=nodes) return final_graph -def gen_gaussian_knn_graph(X: pd.DataFrame,k: int = 15,sigma: float = None,mutual: bool = False,self_loops: bool = True) -> pd.DataFrame: +def gen_gaussian_knn_graph(X: pd.DataFrame,k: int = 15,sigma: Optional[float] = None ,mutual: bool = False,self_loops: bool = True) -> pd.DataFrame: """ Build a normalized knn similarity graph from feature vectors. Computes pairwise cosine or Euclidean similarities, sparsifies via k-nearest neighbors or a global threshold. Optionally prunes to mutual neighbors and/or adds self-loops. 
@@ -232,7 +233,7 @@ def gen_gaussian_knn_graph(X: pd.DataFrame,k: int = 15,sigma: float = None,mutua X_torch = torch.tensor(X.values, dtype=torch.float32, device=device) else: raise TypeError("X must be a pandas.DataFrame") - + N = X_torch.size(0) D2 = torch.cdist(X_torch, X_torch).pow(2) @@ -282,7 +283,7 @@ def gen_lasso_graph(X: pd.DataFrame, alpha: float = 0.01, self_loops: bool = Tru x_numpy = X.values else: raise TypeError("X must be a pandas.DataFrame") - + model = GraphicalLasso(alpha=alpha, max_iter=200) model.fit(x_numpy) @@ -318,7 +319,7 @@ def gen_mst_graph(X: pd.DataFrame, self_loops: bool = True) -> pd.DataFrame: X_torch = torch.tensor(X.values, dtype=torch.float32, device=device) else: raise TypeError("X must be a pandas.DataFrame") - + N = X_torch.size(0) D = torch.cdist(X_torch, X_torch) @@ -378,7 +379,7 @@ def gen_snn_graph(X: pd.DataFrame,k: int = 15,mutual: bool = False, self_loops: X_torch = torch.tensor(X.values, dtype=torch.float32, device=device) else: raise TypeError("X must be a pandas.DataFrame") - + N = X_torch.size(0) S = torch.mm(X_torch, X_torch.t()) _, index = torch.topk(S, k=k+1, dim=1) diff --git a/bioneuralnet/utils/logger.py b/bioneuralnet/utils/logger.py index 6c3c39e..049cd81 100644 --- a/bioneuralnet/utils/logger.py +++ b/bioneuralnet/utils/logger.py @@ -10,7 +10,7 @@ def get_logger(name: str) -> logging.Logger: name (str): Name of the logger. Returns: - + logging.Logger: Configured logger instance. 
""" logger = logging.getLogger(name) diff --git a/bioneuralnet/utils/preprocess.py b/bioneuralnet/utils/preprocess.py index d748c1f..c905911 100644 --- a/bioneuralnet/utils/preprocess.py +++ b/bioneuralnet/utils/preprocess.py @@ -6,6 +6,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.feature_selection import f_classif, f_regression from statsmodels.stats.multitest import multipletests +from typing import Callable, TypeAlias, overload from .logger import get_logger logger = get_logger(__name__) @@ -35,39 +36,39 @@ def preprocess_clinical(X: pd.DataFrame, y: pd.Series, top_k: int = 10, scale: b y_series = y.copy() else: raise ValueError("y must be a pandas Series or single-column DataFrame") - + ignore_columns = ignore_columns or [] missing = set(ignore_columns) - set(X.columns) if missing: raise KeyError(f"Ignored columns not in X: {missing}") df_ignore = X[ignore_columns].copy() X = X.drop(columns=ignore_columns) - + df_numeric = X.select_dtypes(include="number") df_categorical = X.select_dtypes(include=["object", "category", "bool"]) df_numeric_clean = clean_inf_nan(df_numeric) - + if scale: scaler = RobustScaler() scaled_array = scaler.fit_transform(df_numeric_clean) df_numeric_scaled = pd.DataFrame(scaled_array,columns=df_numeric_clean.columns,index=df_numeric_clean.index) else: df_numeric_scaled = df_numeric_clean.copy() - + if not df_categorical.empty: df_cat_filled = df_categorical.fillna("Missing").astype(str) df_cat_encoded = pd.get_dummies(df_cat_filled, drop_first=True) else: df_cat_encoded = pd.DataFrame(index=df_numeric_scaled.index) - + df_combined = pd.concat([df_numeric_scaled, df_cat_encoded, df_ignore],axis=1,join="inner") df_features = df_combined.loc[:, df_combined.std(axis=0) > 0] - + if y_series.nunique() <= 10: model = RandomForestClassifier(n_estimators=150,random_state=119,class_weight="balanced") else: model = RandomForestRegressor(n_estimators=150,random_state=119) - + model.fit(df_features, 
y_series) importances = model.feature_importances_ feature_names = df_features.columns.tolist() @@ -77,7 +78,7 @@ def preprocess_clinical(X: pd.DataFrame, y: pd.Series, top_k: int = 10, scale: b for i in range(len(order) - 1, -1, -1): descending.append(order[i]) - + if top_k < len(descending): count = top_k logger.info(f"Selected top {count} features by RandomForest importance") @@ -88,11 +89,11 @@ def preprocess_clinical(X: pd.DataFrame, y: pd.Series, top_k: int = 10, scale: b selected_idx = [] for i in range(count): selected_idx.append(descending[i]) - + selected_columns = [] for idx in selected_idx: selected_columns.append(feature_names[idx]) - + return df_features[selected_columns] def clean_inf_nan(df: pd.DataFrame) -> pd.DataFrame: @@ -192,11 +193,12 @@ def select_top_k_correlation(X: pd.DataFrame, y: pd.Series = None, top_k: int = correlations[column] = abs(col) # descending correlations - features = list(correlations.keys()) - features.sort(key=correlations.get, reverse=True) - select = min(top_k, len(features)) + def key_fn(k: str) -> float: + return correlations[k] - selected = features[: select] + features = list(correlations.keys()) + features.sort(key=key_fn, reverse=True) + selected = features[:top_k] # unsupervised else: @@ -219,13 +221,15 @@ def select_top_k_correlation(X: pd.DataFrame, y: pd.Series = None, top_k: int = avg = total / (len(columns) - 1) correlations_avg[col] = avg + def key_fn(k: str) -> float: + return correlations_avg[k] + features = list(correlations_avg.keys()) - features.sort(key=correlations_avg.get) - select = min(top_k, len(features)) - selected = features[: select] + features.sort(key=key_fn, reverse=True) + selected = features[:top_k] logger.info(f"Selected {len(selected)} features by correlation") - + return numbers_only[selected] def select_top_randomforest(X: pd.DataFrame, y: pd.Series, top_k: int = 1000, seed: int = 119) -> pd.DataFrame: @@ -345,7 +349,7 @@ def prune_network(adjacency_matrix, weight_threshold=0.0): - 
weight_threshold (float): Minimum weight to keep an edge (default: 0.0). Returns: - + - pd.DataFrame: """ logger.info(f"Pruning network with weight threshold: {weight_threshold}") @@ -357,13 +361,13 @@ def prune_network(adjacency_matrix, weight_threshold=0.0): if weight_threshold > 0: edges_to_remove = [] - + for u, v, d in G.edges(data=True): weight = d.get('weight', 0) if weight < weight_threshold: edges_to_remove.append((u, v)) - G.remove_edges_from(edges_to_remove) + G.remove_edges_from(edges_to_remove) isolated_nodes = list(nx.isolates(G)) G.remove_nodes_from(isolated_nodes) @@ -371,7 +375,7 @@ def prune_network(adjacency_matrix, weight_threshold=0.0): network_after_prunning = nx.to_pandas_adjacency(G, dtype=float) current_nodes = G.number_of_nodes() current_edges = G.number_of_edges() - + logger.info(f"Pruning network with weight threshold: {weight_threshold}") logger.info(f"Number of nodes in full network: {total_nodes}") logger.info(f"Number of edges in full network: {total_edges}") @@ -395,7 +399,7 @@ def prune_network_by_quantile(adjacency_matrix, quantile=0.5): """ logger.info(f"Pruning network using quantile: {quantile}") G = nx.from_pandas_adjacency(adjacency_matrix) - + weights = [] for u, v, data in G.edges(data=True): @@ -405,10 +409,10 @@ def prune_network_by_quantile(adjacency_matrix, quantile=0.5): if len(weights) == 0: logger.warning("Network contains no edges") return nx.to_pandas_adjacency(G, dtype=float) - + weight_threshold = np.quantile(weights, quantile) logger.info(f"Computed weight threshold: {weight_threshold} for quantile: {quantile}") - + edges_to_remove = [] for u, v, data in G.edges(data=True): @@ -418,22 +422,22 @@ def prune_network_by_quantile(adjacency_matrix, quantile=0.5): G.remove_edges_from(edges_to_remove) isolated_nodes = list(nx.isolates(G)) G.remove_nodes_from(isolated_nodes) - + pruned_adjacency = nx.to_pandas_adjacency(G, dtype=float) logger.info(f"Number of nodes after pruning: {G.number_of_nodes()}") 
logger.info(f"Number of edges after pruning: {G.number_of_edges()}") - + return pruned_adjacency def network_remove_low_variance(network: pd.DataFrame, threshold: float = 1e-6) -> pd.DataFrame: """ Remove rows and columns from adjacency matrix where the variance is below a threshold. - + Parameters: network (pd.DataFrame): Adjacency matrix. threshold (float): Variance threshold. - + Returns: pd.DataFrame: Filtered adjacency matrix. @@ -448,12 +452,12 @@ def network_remove_low_variance(network: pd.DataFrame, threshold: float = 1e-6) def network_remove_high_zero_fraction(network: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame: """ Remove rows and columns from adjacency matrix where the fraction of zero entries is higher than the threshold. - + Parameters: network (pd.DataFrame): Adjacency matrix. threshold (float): Zero-fraction threshold. - + Returns: pd.DataFrame: Filtered adjacency matrix. diff --git a/bioneuralnet/utils/rdata_convert.py b/bioneuralnet/utils/rdata_convert.py index 938bdc0..46eedb6 100644 --- a/bioneuralnet/utils/rdata_convert.py +++ b/bioneuralnet/utils/rdata_convert.py @@ -6,7 +6,7 @@ logger = get_logger(__name__) def rdata_to_df(rdata_file: Path, csv_file: Path, Object=None) -> pd.DataFrame: - + rscript = shutil.which("Rscript") if rscript is None: raise EnvironmentError("Rscript not found…") @@ -43,4 +43,3 @@ def rdata_to_df(rdata_file: Path, csv_file: Path, Object=None) -> pd.DataFrame: raise FileNotFoundError(f"No CSV at {csv_file}, nor in {possibilities}") return pd.read_csv(csv_file, index_col=0) - diff --git a/docs/source/_autosummary/bioneuralnet.external_tools.rst b/docs/source/_autosummary/bioneuralnet.external_tools.rst index d86bc04..761a07b 100644 --- a/docs/source/_autosummary/bioneuralnet.external_tools.rst +++ b/docs/source/_autosummary/bioneuralnet.external_tools.rst @@ -3,4 +3,17 @@ .. automodule:: bioneuralnet.external_tools - \ No newline at end of file + + .. rubric:: Classes + + .. autosummary:: + + SmCCNet + +.. 
rubric:: Modules + +.. autosummary:: + :toctree: + :recursive: + + smccnet diff --git a/docs/source/_autosummary/bioneuralnet.external_tools.smccnet.rst b/docs/source/_autosummary/bioneuralnet.external_tools.smccnet.rst new file mode 100644 index 0000000..7ceb9da --- /dev/null +++ b/docs/source/_autosummary/bioneuralnet.external_tools.smccnet.rst @@ -0,0 +1,21 @@ +bioneuralnet.external\_tools.smccnet +==================================== + +.. automodule:: bioneuralnet.external_tools.smccnet + + + .. rubric:: Functions + + .. autosummary:: + + files + get_logger + + .. rubric:: Classes + + .. autosummary:: + + Any + Path + SmCCNet + \ No newline at end of file diff --git a/docs/source/_autosummary/bioneuralnet.metrics.evaluation.rst b/docs/source/_autosummary/bioneuralnet.metrics.evaluation.rst index 1f98068..f3fda49 100644 --- a/docs/source/_autosummary/bioneuralnet.metrics.evaluation.rst +++ b/docs/source/_autosummary/bioneuralnet.metrics.evaluation.rst @@ -13,7 +13,6 @@ bioneuralnet.metrics.evaluation evaluate_model evaluate_rf evaluate_single_run - evaluate_xgb get_logger plot_grouped_performance plot_multiple_metrics @@ -23,6 +22,4 @@ bioneuralnet.metrics.evaluation .. 
autosummary:: Path - XGBClassifier - XGBRegressor \ No newline at end of file diff --git a/docs/source/_autosummary/bioneuralnet.metrics.rst b/docs/source/_autosummary/bioneuralnet.metrics.rst index 58251eb..b08be47 100644 --- a/docs/source/_autosummary/bioneuralnet.metrics.rst +++ b/docs/source/_autosummary/bioneuralnet.metrics.rst @@ -15,7 +15,6 @@ evaluate_model evaluate_rf evaluate_single_run - evaluate_xgb louvain_to_adjacency omics_correlation plot_embeddings diff --git a/docs/source/_static/BioNeuralNet1.png b/docs/source/_static/BioNeuralNet1.png deleted file mode 100644 index a308430..0000000 Binary files a/docs/source/_static/BioNeuralNet1.png and /dev/null differ diff --git a/docs/source/_static/BioNeuralNet2.png b/docs/source/_static/BioNeuralNet2.png deleted file mode 100644 index 2885136..0000000 Binary files a/docs/source/_static/BioNeuralNet2.png and /dev/null differ diff --git a/docs/source/_static/ccrcc_search.png b/docs/source/_static/ccrcc_search.png deleted file mode 100644 index df545b4..0000000 Binary files a/docs/source/_static/ccrcc_search.png and /dev/null differ diff --git a/docs/source/_static/cptac_clinical.png b/docs/source/_static/cptac_clinical.png deleted file mode 100644 index f96b554..0000000 Binary files a/docs/source/_static/cptac_clinical.png and /dev/null differ diff --git a/docs/source/_static/cptac_genomics.png b/docs/source/_static/cptac_genomics.png deleted file mode 100644 index 0e5634b..0000000 Binary files a/docs/source/_static/cptac_genomics.png and /dev/null differ diff --git a/docs/source/_static/cptac_proteomics.png b/docs/source/_static/cptac_proteomics.png deleted file mode 100644 index 7999c95..0000000 Binary files a/docs/source/_static/cptac_proteomics.png and /dev/null differ diff --git a/docs/source/_static/cptac_search.png b/docs/source/_static/cptac_search.png deleted file mode 100644 index c587d11..0000000 Binary files a/docs/source/_static/cptac_search.png and /dev/null differ diff --git 
a/docs/source/_static/images_download.png b/docs/source/_static/images_download.png deleted file mode 100644 index 7bce675..0000000 Binary files a/docs/source/_static/images_download.png and /dev/null differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 4e9d888..005c497 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -58,8 +58,6 @@ autodoc_mock_imports = [ "torch", "torch_geometric", - "bioneuralnet.external_tools", "sklearn", "statsmodels", - "xgboost", ] diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 146f238..5535c33 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -17,6 +17,7 @@ BioNeuralNet integrates multiple open-source libraries to deliver advanced multi - **ray[tune]:** Scalable hyperparameter tuning for GNN models. `ray[tune] `_ - **matplotlib:** Data visualization. `matplotlib `_ - **python-louvain:** Community detection algorithms for graphs. `python louvain `_ +- **statsmodels:** Statistical tests and models, including ANOVA and linear regression. `statsmodels `_ We also acknowledge R-based tools for external network construction: @@ -35,47 +36,65 @@ Please refer to our contribution guidelines in the repository for more details. Frequently Asked Questions (FAQ) -------------------------------- -**Q1: What is BioNeuralNet?** -A1: BioNeuralNet is a Python framework for integrating multi-omics data with Graph Neural Networks (GNNs). It provides end-to-end solutions for network embedding, clustering, subject representation, and disease prediction. +**Q1: What is BioNeuralNet?**: -**Q2: What are the key features of BioNeuralNet?** -A2: -- **Graph Clustering:** Identify communities using Correlated Louvain, Hybrid Louvain, and Correlated PageRank methods. -- **GNN Embedding:** Generate node embeddings using advanced GNN models. -- **Subject Representation:** Enrich omics data with learned embeddings. -- **Disease Prediction:** Leverage DPMON for integrated, end-to-end disease prediction. 
+ - BioNeuralNet is a Python framework for integrating multi-omics data with Graph Neural Networks (GNNs). It provides end-to-end solutions for network embedding, clustering, subject representation, and disease prediction. -**Q3: How do I install BioNeuralNet?** -A3: Install via pip: +**Q2: What are the key features of BioNeuralNet?**: + + - **Graph Clustering:** Identify communities using Correlated Louvain, Hybrid Louvain, and Correlated PageRank methods. + - **GNN Embedding:** Generate node embeddings using advanced GNN models. + - **Subject Representation:** Enrich omics data with learned embeddings. + - **Disease Prediction:** Leverage DPMON for integrated, end-to-end disease prediction. + +**Q3: How do I install BioNeuralNet?**: + + - Install via pip .. code-block:: bash pip install bioneuralnet -For full installation instructions, see the :doc:`installation` guide. -**Q4: Does BioNeuralNet support GPU acceleration?** -A4: Yes. If a CUDA-compatible GPU is available, BioNeuralNet will utilize it via PyTorch. +**Q4: Does BioNeuralNet support GPU acceleration?**: -**Q5: Can I use my own network instead of SmCCNet?** -A5: Absolutely. You can supply a pre-computed adjacency matrix directly to the GNNEmbedding or DPMON modules. + - Yes. If a CUDA-compatible GPU is available, BioNeuralNet will utilize it via PyTorch. + +**Q5: Can I use my own network instead of SmCCNet or internal graph generation functions?** + + - Absolutely. You can supply a pre-computed adjacency matrix directly to the GNNEmbedding or DPMON modules. **Q6: How is DPMON different from standard GNN models?** -A6: DPMON is tailored for multi-omics disease prediction by jointly learning node embeddings and a classifier, integrating both local and global graph structures. 
-**Q7: What clustering methods does BioNeuralNet support?** -A7: BioNeuralNet offers: -- Correlated Louvain -- Hybrid Louvain -- Correlated PageRank + - DPMON is tailored for multi-omics disease prediction by jointly learning node embeddings and a classifier, integrating both local and global graph structures. + +**Q7: What clustering methods does BioNeuralNet support?**: + + - Correlated Louvain + - Hybrid Louvain + - Correlated PageRank + +**Q8: How can I contribute to BioNeuralNet?**: + + - Contributions are welcome! You can: + - Report issues or bugs on our `GitHub Issues page `_. + - Suggest new features or improvements. + - Share your experiences or use cases with the community. + + - How to contribute: + - Fork the repository, add your features, components, or algorithms, and submit a pull request. + - Please refer to our `contribution guidelines `_ for more details. + +**Q9: Where can I find tutorials and examples?**: + + - We provide a set of tutorials and example notebooks to help you get started with BioNeuralNet. You can find them in the `tutorials` directory of the repository. + - For a quick start, check out the following notebooks: -**Q8: How can I contribute to BioNeuralNet?** -A8: Contributions are encouraged! Fork the repository, develop your feature, and submit a pull request. See our contribution guidelines on GitHub. + - :doc:`Quick_Start`. + - :doc:`TCGA-BRCA_Dataset`. -**Q9: Where can I find tutorials and examples?** -A9: For detailed guides and demos, visit :doc:`tutorials/index` and check out the example notebooks provided in the repository. +**Q10: What license is BioNeuralNet released under?**: -**Q10: What license is BioNeuralNet released under?** -A10: BioNeuralNet is distributed under the MIT License. For details, see the `MIT LICENSE `_ page. + - BioNeuralNet is released under the MIT License. You can find the full license text in the `MIT LICENSE `_ file in the repository. 
Return to :doc:`../index` diff --git a/docs/source/index.rst b/docs/source/index.rst index 7f44bb9..b46f8b6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,16 +13,14 @@ BioNeuralNet - Multi-Omics Integration with Graph Neural Networks .. image:: https://img.shields.io/badge/GitHub-View%20Code-blue :target: https://github.com/UCD-BDLab/BioNeuralNet - .. figure:: _static/LOGO_WB.png :align: center :alt: BioNeuralNet Logo +:doc:`installation` +------------------- -Installation ------------- - -To install BioNeuralNet, simply run: +To install BioNeuralNet via pip: .. code-block:: bash @@ -30,6 +28,10 @@ To install BioNeuralNet, simply run: For additional installation details, see :doc:`installation`. +For end-to-end examples of `BioNeuralNet`: + + - :doc:`Quick_Start`. + - :doc:`TCGA-BRCA_Dataset`. **BioNeuralNet Overview** ------------------------- @@ -40,52 +42,6 @@ For additional installation details, see :doc:`installation`. Embeddings form the core of BioNeuralNet, enabling a number of downstream applications. -**BioNeuralNet Core Features** ------------------------------- - -For an End-to-End example example of BioNeuralNet, see :doc:`Quick_Start`. - -**Network Embedding**: :doc:`gnns` - - Given a multi-omics network as input, BioNeuralNet can generate embeddings using Graph Neural Networks (GNNs). - - Generate embeddings using methods such as **GCN**, **GAT**, **GraphSAGE**, and **GIN**. - - Outputs can be obtained as native tensors or converted to pandas DataFrames for easy analysis and visualization. - - Embeddings unlock numerous downstream applications, including disease prediction, enhanced subject representation, clustering, and more. - -**Graph Clustering**: :doc:`clustering` - - Identify functional modules or communities using **correlated clustering methods** (e.g., CorrelatedPageRank, CorrelatedLouvain, HybridLouvain) that integrate phenotype correlation to extract biologically relevant modules [1]_. 
- - Clustering methods can be applied to any network represented allowing flexible analysis across different domains. - - All clustering components return either raw partitions dictionaries or induced subnetwork adjacency matrices (as DataFrames) for visualization. - - Use cases include, feature selection, biomarker discovery, and network-based analysis. - -**Downstream Tasks**: :doc:`downstream_tasks` - - **Subject Representation**: - - Integrate node embeddings back into omics data to enrich subject-level profiles by weighting features with learned embedding. - - This embedding-enriched data can be used for downstream tasks such as disease prediction or biomarker discovery. - - The result can be returned as a DataFrame or a PyTorch tensor, fitting naturally into downstream analyses. - - - **Disease Prediction for Multi-Omics Network DPMON** [2]_: - - Classification End-to-End pipeline for disease prediction using Graph Neural Network embeddings. - - DPMON supports hyperparameter tuning-when enabled, it finds the best for the given data. - - This approach, along with the native pandas integration across modules, ensures that BioNeuralNet can be easily incorporated into your analysis workflows. - -**Metrics**: :doc:`metrics` - - Several plotting funcctions to visualize networks, emebddings, variance distribution, cluster comparison, and more. - - Correlation based functions to compare clustersand omics data with the phenotype. - -**Utilities**: :doc:`utils` - - **Filtering Functions**: - - Network filtering allows users to select variance or zero-fraction filtering to an omics network. - - Reducing noise, and removing outliers. - - - **Data Conversion**: - - Convert RData files both CSV and to Pandas DataFrame. For ease of integration for R-based workflows. - -**External Tools**: :doc:`external_tools/index` - - **Graph Construction**: - - BioNeuralNet provides additional tools in the `bioneuralnet.external_tools` module. 
- - Allowing users to generate networks using R-based tools like SmCCNet. - - While optional, these tools enhance BioNeuralNet's capabilities and are recommended for comprehensive analysis. - What is BioNeuralNet? --------------------- BioNeuralNet is a **Python-based** framework designed to bridge the gap between **multi-omics data analysis** and **Graph Neural Networks (GNNs)**. By leveraging advanced techniques, it enables: @@ -94,6 +50,7 @@ BioNeuralNet is a **Python-based** framework designed to bridge the gap between - **GNN Embeddings**: Learns network-based feature representations from biological graphs, capturing both **biological structure** and **feature correlations** for enhanced analysis. - **Subject Representation**: Generates high-quality embeddings for individuals based on multi-omics profiles. - **Disease Prediction**: Builds predictive models using integrated multi-layer biological networks. +- **Interoperability**: Component outputs are structured as **pandas DataFrames**, ensuring easy integration with existing workflows and tools. Why GNNs? --------- @@ -136,7 +93,7 @@ Below is a quick example demonstrating the following steps: 2. **Network Construction**: - - **Not performed internally**: Generate the network adjacency matrix externally (SmCCNet). + - In this example we generate the network using an external R package (SmCCNet [3]_). - Lightweight wrappers (SmCCNet) are available in `bioneuralnet.external_tools` for convenience, R is required for their usage. 3.
**Disease Prediction**: @@ -151,12 +108,14 @@ Below is a quick example demonstrating the following steps: import pandas as pd from bioneuralnet.external_tools import SmCCNet from bioneuralnet.downstream_task import DPMON + from bioneuralnet.datasets import DatasetLoader - # Step 1: Data Preparation - phenotype_data = pd.read_csv('phenotype_data.csv') - omics_proteins = pd.read_csv('omics_proteins.csv') - omics_metabolites = pd.read_csv('omics_metabolites.csv') - clinical_dt = pd.read_csv('clinical_data.csv') + # Step 1: Load your data or use one of the provided datasets + Example = DatasetLoader("example1") + omics_proteins = Example.data["X1"] + omics_metabolites = Example.data["X2"] + phenotype_data = Example.data["Y"] + clinical_data = Example.data["clinical_data"] # Step 2: Network Construction smccnet = SmCCNet( @@ -174,12 +133,58 @@ Below is a quick example demonstrating the following steps: adjacency_matrix=global_network, omics_list=[omics_proteins, omics_metabolites], phenotype_data=phenotype_data, - clinical_data=clinical_dt, + clinical_data=clinical_data, model="GCN", ) predictions = dpmon.run() print("Disease phenotype predictions:\n", predictions) +**BioNeuralNet Core Features** +------------------------------ + +For an End-to-End example of BioNeuralNet, see :doc:`Quick_Start` and :doc:`TCGA-BRCA_Dataset`. + +:doc:`gnns`: + - Given a multi-omics network as input, BioNeuralNet can generate embeddings using Graph Neural Networks (GNNs). + - Generate embeddings using methods such as **GCN**, **GAT**, **GraphSAGE**, and **GIN**. + - Outputs can be obtained as native tensors or converted to pandas DataFrames for easy analysis and visualization. + - Embeddings unlock numerous downstream applications, including disease prediction, enhanced subject representation, clustering, and more.
+ +:doc:`clustering`: + - Identify functional modules or communities using **correlated clustering methods** (e.g., CorrelatedPageRank, CorrelatedLouvain, HybridLouvain) that integrate phenotype correlation to extract biologically relevant modules [1]_. + - Clustering methods can be applied to any network representation, allowing flexible analysis across different domains. + - All clustering components return either raw partition dictionaries or induced subnetwork adjacency matrices (as DataFrames) for visualization. + - Use cases include feature selection, biomarker discovery, and network-based analysis. + +:doc:`downstream_tasks`: + - **Subject Representation**: + - Integrate node embeddings back into omics data to enrich subject-level profiles by weighting features with learned embeddings. + - This embedding-enriched data can be used for downstream tasks such as disease prediction or biomarker discovery. + - The result can be returned as a DataFrame or a PyTorch tensor, fitting naturally into downstream analyses. + + - **Disease Prediction for Multi-Omics Network DPMON** [2]_: + - Classification End-to-End pipeline for disease prediction using Graph Neural Network embeddings. + - DPMON supports hyperparameter tuning; when enabled, it finds the best configuration for the given data. + - This approach, along with the native pandas integration across modules, ensures that BioNeuralNet can be easily incorporated into your analysis workflows. + +:doc:`metrics`: + - Visualize embeddings, feature variance, clustering comparison, and network structure in 2D. + - Evaluate embedding quality and clustering relevance using correlation with phenotype. + - Performance benchmarking tools for classification tasks using various models. + - Useful for assessing feature importance, validating network structure, and comparing cluster outputs. + +:doc:`utils`: + - Build graphs using k-NN similarity, Pearson/Spearman correlation, RBF kernels, mutual information, or soft-thresholding.
+ - Filter and preprocess omics or clinical data by variance, correlation, random forest importance, or ANOVA F-test. + - Tools for network pruning, feature selection, and data cleaning. + - Quickly summarize datasets with variance, zero-fraction, expression level, or correlation overviews. + - Includes conversion tools for RData and integrated logging. + +:doc:`external_tools/index`: + - **Graph Construction**: + - BioNeuralNet provides additional tools in the `bioneuralnet.external_tools` module. + - Includes support for **SmCCNet** (Sparse Multiple Canonical Correlation Network), an R-based tool for constructing phenotype-informed correlation networks [3]_. + - These tools are optional but enhance BioNeuralNet's graph construction capabilities and are recommended for more integrative or exploratory workflows. .. toctree:: :maxdepth: 2 @@ -208,3 +213,4 @@ Indices and References .. [1] Abdel-Hafiz, M., Najafi, M., et al. "Significant Subgraph Detection in Multi-omics Networks for Disease Pathway Identification." *Frontiers in Big Data*, 5 (2022). DOI: `10.3389/fdata.2022.894632 `_. .. [2] Hussein, S., Ramos, V., et al. "Learning from Multi-Omics Networks to Enhance Disease Prediction: An Optimized Network Embedding and Fusion Approach." In *2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)*, Lisbon, Portugal, 2024, pp. 4371-4378. DOI: `10.1109/BIBM62325.2024.10822233 `_. +.. [3] Liu, W., Vu, T., Konigsberg, I. R., Pratte, K. A., Zhuang, Y., & Kechris, K. J. (2023). "Network-Based Integration of Multi-Omics Data for Biomarker Discovery and Phenotype Prediction." *Bioinformatics*, 39(5), btat204. DOI: `10.1093/bioinformatics/btat204 `_. diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 28d0e71..4bc0125 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,7 +1,7 @@ Installation ============ -BioNeuralNet supports Python 3.10 and 3.11 in this beta release. 
Follow the steps below to set up BioNeuralNet and its dependencies. +BioNeuralNet is fully compatible with Python 3.10, 3.11, and 3.12, and runs seamlessly on Windows, macOS, and Linux. Follow the steps below to set up BioNeuralNet and its dependencies. 1. **Install BioNeuralNet via pip**: @@ -18,7 +18,7 @@ BioNeuralNet supports Python 3.10 and 3.11 in this beta release. Follow the step .. code-block:: bash - pip install torch torchvision torchaudio + pip install torch pip install torch_geometric For GPU-accelerated builds or other configurations visit the official sites: diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 992c143..11234d4 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -24,12 +24,11 @@ The module also contains several plotting functions: Evaluation ---------- -Functions to train and evaluate RandomForest and XGBoost models over one or multiple runs (Mostly used internally for testings purposes): +Functions to train and evaluate RandomForest models over one or multiple runs (Mostly used internally for testing purposes): - :func:`bioneuralnet.metrics.evaluate_model` evaluates a model over multiple runs. - :func:`bioneuralnet.metrics.evaluate_single_run` runs a single evaluation loop. - :func:`bioneuralnet.metrics.evaluate_rf` evaluates a Random Forest model. -- :func:`bioneuralnet.metrics.evaluate_xgb` evaluates an XGBoost model. - :func:`bioneuralnet.metrics.evaluate_f1w` computes the weighted F1 score. - :func:`bioneuralnet.metrics.evaluate_f1m` computes the macro F1 score. - :func:`bioneuralnet.metrics.compare_clusters` compares and plots clusters from different methods.
diff --git a/pytest.ini b/pytest.ini index 6160c1a..d866786 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -norecursedirs = docs/source/examples \ No newline at end of file +norecursedirs = docs/source/examples diff --git a/requirements-dev.txt b/requirements-dev.txt index 0c3dccf..ed81539 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -35,4 +35,4 @@ tensorboardX debugpy PyYAML node2vec -wheel \ No newline at end of file +wheel diff --git a/requirements.txt b/requirements.txt index e43b72d..288e2a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,5 @@ networkx python-louvain ray[tune] statsmodels -xgboost # torch -# torch_geometric \ No newline at end of file +# torch_geometric diff --git a/setup.cfg b/setup.cfg index f96e703..296ed31 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,7 +29,6 @@ install_requires = statsmodels networkx python-louvain - xgboost ray[tune] [options.packages.find] diff --git a/tests/__init__.py b/tests/__init__.py index 14de918..b7e2985 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -6,4 +6,4 @@ from .test_smccnet import * from .test_correlated_louvain import * from .test_correlated_pagerank import * -from .test_hybrid_louvain import * \ No newline at end of file +from .test_hybrid_louvain import * diff --git a/tests/test_correlated_louvain.py b/tests/test_correlated_louvain.py index 51792c7..fa60587 100644 --- a/tests/test_correlated_louvain.py +++ b/tests/test_correlated_louvain.py @@ -40,7 +40,7 @@ def setUp(self): phenos.append(np.random.rand()) i = i + 1 self.Y = pd.DataFrame({"phenotype": phenos}) - + def test_run_partition(self): cl = CorrelatedLouvain(self.G, self.B, self.Y, k3=0.2, k4=0.8, weight="weight", tune=False) part = cl.run(as_dfs=False) @@ -49,7 +49,7 @@ def test_run_partition(self): for k in part: keys.append(k) self.assertTrue(len(keys) > 0) - + def test_run_dfs(self): cl = CorrelatedLouvain(self.G, self.B, self.Y, k3=0.2, k4=0.8, weight="weight", tune=False) 
dfs = cl.run(as_dfs=True) @@ -58,7 +58,7 @@ def test_run_dfs(self): while i < len(dfs): self.assertIsInstance(dfs[i], pd.DataFrame) i = i + 1 - + def test_get_quality(self): cl = CorrelatedLouvain(self.G, self.B, self.Y, k3=0.2, k4=0.8, weight="weight", tune=False) cl.run(as_dfs=False) diff --git a/tests/test_correlated_pagerank.py b/tests/test_correlated_pagerank.py index c68c77f..222ba43 100644 --- a/tests/test_correlated_pagerank.py +++ b/tests/test_correlated_pagerank.py @@ -8,12 +8,12 @@ class TestCorrelatedPageRank(unittest.TestCase): def setUp(self): self.G = nx.complete_graph(4, create_using=nx.DiGraph()) nodes = list(self.G.nodes()) - + data = {node: np.random.rand(10) for node in nodes} self.B = pd.DataFrame(data) - + self.Y = pd.DataFrame({"phenotype": np.random.rand(10)}) - + def test_run_valid(self): cp = CorrelatedPageRank( self.G, self.B, self.Y, @@ -28,7 +28,7 @@ def test_run_valid(self): ] for key in expected_keys: self.assertIn(key, res) - + def test_run_empty_seed(self): cp = CorrelatedPageRank( self.G, self.B, self.Y, diff --git a/tests/test_hybrid_louvain.py b/tests/test_hybrid_louvain.py index b83de61..5e7d4d4 100644 --- a/tests/test_hybrid_louvain.py +++ b/tests/test_hybrid_louvain.py @@ -40,7 +40,7 @@ def setUp(self): phenos.append(np.random.rand()) i = i + 1 self.Y = pd.DataFrame({"phenotype": phenos}) - + def test_run(self): hl = HybridLouvain(self.G, self.B, self.Y, k3=0.2, k4=0.8, max_iter=5, weight="weight", tune=False) res = hl.run()