From 0f5527e6acc44fd170c5009c56a517d4d5acbe3f Mon Sep 17 00:00:00 2001
From: Mustafa Sadiq <mustafa.sadiq@outlook.com>
Date: Sun, 7 Dec 2025 22:55:14 -0500
Subject: [PATCH] add-tcga-paad-dataset-sadiq5

---
 pyhealth/datasets/__init__.py                 |   1 +
 pyhealth/datasets/configs/tcga_paad.yaml      |  22 ++
 pyhealth/datasets/tcga_paad.py                | 271 ++++++++++++++++++
 test-resources/tcga_paad/PAAD_clinical.csv    |   4 +
 test-resources/tcga_paad/PAAD_mutations.csv   |   4 +
 .../tcga_paad/tcga_paad_clinical-pyhealth.csv |   4 +
 .../tcga_paad_mutations-pyhealth.csv          |   4 +
 tests/core/test_tcga_paad.py                  |  57 ++++
 8 files changed, 367 insertions(+)
 create mode 100644 pyhealth/datasets/configs/tcga_paad.yaml
 create mode 100644 pyhealth/datasets/tcga_paad.py
 create mode 100644 test-resources/tcga_paad/PAAD_clinical.csv
 create mode 100644 test-resources/tcga_paad/PAAD_mutations.csv
 create mode 100644 test-resources/tcga_paad/tcga_paad_clinical-pyhealth.csv
 create mode 100644 test-resources/tcga_paad/tcga_paad_mutations-pyhealth.csv
 create mode 100644 tests/core/test_tcga_paad.py

diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
index 7d6a65f16..e4bdd3da2 100644
--- a/pyhealth/datasets/__init__.py
+++ b/pyhealth/datasets/__init__.py
@@ -67,6 +67,7 @@ def __init__(self, *args, **kwargs):
 from .bmd_hs import BMDHSDataset
 from .support2 import Support2Dataset
 from .tcga_prad import TCGAPRADDataset
+from .tcga_paad import TCGAPAADDataset
 from .splitter import (
     split_by_patient,
     split_by_patient_conformal,
diff --git a/pyhealth/datasets/configs/tcga_paad.yaml b/pyhealth/datasets/configs/tcga_paad.yaml
new file mode 100644
index 000000000..f83cf2780
--- /dev/null
+++ b/pyhealth/datasets/configs/tcga_paad.yaml
@@ -0,0 +1,22 @@
+version: "1.0"
+tables:
+  mutations:
+    file_path: "tcga_paad_mutations-pyhealth.csv"
+    patient_id: "patient_id"
+    timestamp: null
+    attributes:
+    - "hugo_symbol"
+    - "variant_classification"
+    - "variant_type"
+    - "hgvsc"
+    - "hgvsp"
+    - "tumor_sample_barcode"
+  clinical:
+    file_path: "tcga_paad_clinical-pyhealth.csv"
+    patient_id: "patient_id"
+    timestamp: null
+    attributes:
+    - "age_at_diagnosis"
+    - "vital_status"
+    - "days_to_death"
+    - "tumor_stage"
diff --git a/pyhealth/datasets/tcga_paad.py b/pyhealth/datasets/tcga_paad.py
new file mode 100644
index 000000000..4b9a7b791
--- /dev/null
+++ b/pyhealth/datasets/tcga_paad.py
@@ -0,0 +1,271 @@
+"""TCGA-PAAD dataset for PyHealth.
+
+This module provides the TCGAPAADDataset class for loading and processing
+TCGA Pancreatic Adenocarcinoma (PAAD) data for machine learning tasks.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import List, Optional
+
+import pandas as pd
+
+from .base_dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+class TCGAPAADDataset(BaseDataset):
+    """TCGA Pancreatic Adenocarcinoma (PAAD) dataset.
+
+    The Cancer Genome Atlas (TCGA) PAAD dataset contains multi-omics data
+    for pancreatic adenocarcinoma patients, including somatic mutations,
+    clinical data, and survival outcomes. This dataset enables cancer
+    survival prediction and mutation analysis tasks.
+
+    Dataset is available at:
+    https://portal.gdc.cancer.gov/projects/TCGA-PAAD
+
+    Args:
+        root: Root directory of the raw data containing the TCGA-PAAD files.
+        tables: Optional list of additional tables to load beyond defaults.
+        dataset_name: Optional name of the dataset. Defaults to "tcga_paad".
+        config_path: Optional path to the configuration file. If not provided,
+            uses the default config in the configs directory.
+
+    Attributes:
+        root: Root directory of the raw data.
+        dataset_name: Name of the dataset.
+        config_path: Path to the configuration file.
+
+    Examples:
+        >>> from pyhealth.datasets import TCGAPAADDataset
+        >>> dataset = TCGAPAADDataset(root="/path/to/tcga_paad")
+        >>> dataset.stats()
+        >>> samples = dataset.set_task()
+        >>> print(samples[0])
+    """
+
+    def __init__(
+        self,
+        root: str,
+        tables: Optional[List[str]] = None,
+        dataset_name: Optional[str] = None,
+        config_path: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        if config_path is None:
+            logger.info("No config path provided, using default config")
+            config_path = Path(__file__).parent / "configs" / "tcga_paad.yaml"
+
+        # Build only the missing CSVs so an existing file is never overwritten.
+        for csv_name, prepare in (
+            ("tcga_paad_mutations-pyhealth.csv", self._prepare_mutations),
+            ("tcga_paad_clinical-pyhealth.csv", self._prepare_clinical),
+        ):
+            if not os.path.exists(os.path.join(root, csv_name)):
+                logger.info("Preparing TCGA-PAAD metadata: %s", csv_name)
+                prepare(root)
+
+        # Defaults first; repeats are dropped so no table is listed twice.
+        tables = list(dict.fromkeys(["mutations", "clinical"] + list(tables or [])))
+        super().__init__(
+            root=root,
+            tables=tables,
+            dataset_name=dataset_name or "tcga_paad",
+            config_path=config_path,
+            **kwargs,
+        )
+
+    @staticmethod
+    def prepare_metadata(root: str) -> None:
+        """Write both standardized PyHealth CSVs for the TCGA-PAAD dataset.
+
+        The mutations file is converted first, then the clinical file.
+
+        Args:
+            root: Root directory containing the TCGA-PAAD files.
+        """
+        owner = TCGAPAADDataset
+        # Mutations first, then clinical.
+        for convert in (owner._prepare_mutations, owner._prepare_clinical):
+            convert(root)
+
+    @staticmethod
+    def _prepare_mutations(root: str) -> None:
+        """Convert the raw mutations file in ``root`` to the standardized CSV.
+
+        Picks the first candidate file that exists, maps MAF column names to
+        the lower-case names used by the dataset config, derives
+        ``patient_id`` from the tumor sample barcode, and writes
+        ``tcga_paad_mutations-pyhealth.csv``. When no raw file is found, a
+        header-only placeholder CSV is written instead.
+
+        Args:
+            root: Root directory containing the TCGA-PAAD files.
+        """
+        # Candidate raw files, in priority order.
+        possible_files = [
+            "PAAD_mutations.csv",
+            "TCGA.PAAD.mutect.maf",
+            "TCGA.PAAD.mutect.maf.gz",
+            "PAAD.maf",
+            "PAAD.maf.gz",
+            "mutations.maf",
+        ]
+        # Columns of the standardized CSV, in output order.
+        output_cols = [
+            "patient_id",
+            "hugo_symbol",
+            "variant_classification",
+            "variant_type",
+            "hgvsc",
+            "hgvsp",
+            "tumor_sample_barcode",
+        ]
+        output_path = os.path.join(root, "tcga_paad_mutations-pyhealth.csv")
+
+        candidates = (os.path.join(root, fname) for fname in possible_files)
+        raw_file = next((p for p in candidates if os.path.exists(p)), None)
+
+        if raw_file is None:
+            logger.warning(
+                f"No raw TCGA-PAAD mutations file found in {root}. "
+                "Please download from GDC portal or use TCGAmutations R package."
+            )
+            # Create empty placeholder
+            pd.DataFrame(columns=output_cols).to_csv(output_path, index=False)
+            return
+
+        logger.info(f"Processing TCGA-PAAD mutations file: {raw_file}")
+
+        # .maf and .gz inputs are tab-separated with "#" comment lines (pandas
+        # infers gzip from the extension); anything else is a plain CSV.
+        if raw_file.endswith((".maf", ".gz")):
+            df = pd.read_csv(raw_file, sep="\t", comment="#", low_memory=False)
+        else:
+            df = pd.read_csv(raw_file, low_memory=False)
+
+        # Source columns in priority order. When several map to the same name
+        # (e.g. HGVSp_Short and HGVSp), only the first one present is renamed,
+        # so the frame never ends up with duplicate column labels.
+        column_mapping = {
+            "Hugo_Symbol": "hugo_symbol",
+            "Variant_Classification": "variant_classification",
+            "Variant_Type": "variant_type",
+            "HGVSc": "hgvsc",
+            "HGVSp_Short": "hgvsp",
+            "HGVSp": "hgvsp",
+            "Tumor_Sample_Barcode": "tumor_sample_barcode",
+        }
+        rename_dict = {}
+        taken = set(df.columns)
+        for source, target in column_mapping.items():
+            if source in df.columns and target not in taken:
+                rename_dict[source] = target
+                taken.add(target)
+        df = df.rename(columns=rename_dict)
+
+        # Extract patient_id from tumor_sample_barcode (first 12 characters)
+        if "tumor_sample_barcode" in df.columns:
+            df["patient_id"] = df["tumor_sample_barcode"].str[:12]
+        else:
+            # No barcode column: fall back to the row index as identifier.
+            df["patient_id"] = df.index.astype(str)
+
+        # reindex keeps the expected columns and adds any missing one as an
+        # empty column, so the CSV header is the same for every input.
+        df_out = df.reindex(columns=output_cols)
+
+        df_out.to_csv(output_path, index=False)
+        logger.info(f"Saved {len(df_out)} mutations to {output_path}")
+
+    @staticmethod
+    def _prepare_clinical(root: str) -> None:
+        """Convert the raw clinical file in ``root`` to the standardized CSV.
+
+        Picks the first candidate file that exists, maps the known TCGA
+        column-name variants to the names used by the dataset config, keeps
+        the first row seen for each ``patient_id``, and writes
+        ``tcga_paad_clinical-pyhealth.csv``. When no raw file is found, a
+        header-only placeholder CSV is written instead.
+
+        Args:
+            root: Root directory containing the TCGA-PAAD files.
+        """
+        # Candidate raw files, in priority order.
+        possible_files = [
+            "PAAD_clinical.csv",
+            "clinical.tsv",
+            "clinical.csv",
+            "nationwidechildrens.org_clinical_patient_paad.txt",
+        ]
+        # Columns of the standardized CSV, in output order.
+        output_cols = [
+            "patient_id",
+            "age_at_diagnosis",
+            "vital_status",
+            "days_to_death",
+            "tumor_stage",
+        ]
+        output_path = os.path.join(root, "tcga_paad_clinical-pyhealth.csv")
+
+        candidates = (os.path.join(root, fname) for fname in possible_files)
+        raw_file = next((p for p in candidates if os.path.exists(p)), None)
+        if raw_file is None:
+            logger.warning(
+                f"No raw TCGA-PAAD clinical file found in {root}. "
+                "Please download from GDC portal."
+            )
+            # Create empty placeholder
+            pd.DataFrame(columns=output_cols).to_csv(output_path, index=False)
+            return
+
+        logger.info(f"Processing TCGA-PAAD clinical file: {raw_file}")
+
+        # .tsv/.txt exports are tab-separated; everything else is read as CSV.
+        sep = "\t" if raw_file.endswith((".tsv", ".txt")) else ","
+        df = pd.read_csv(raw_file, sep=sep, low_memory=False)
+
+        # Alternative source names in priority order (TCGA uses various naming
+        # conventions). A column already carrying the standard name wins; otherwise
+        # only the first alternative present is renamed, avoiding duplicate labels.
+        column_mapping = {
+            "submitter_id": "patient_id",
+            "bcr_patient_barcode": "patient_id",
+            "case_id": "patient_id",
+            "age_at_initial_pathologic_diagnosis": "age_at_diagnosis",
+            "ajcc_pathologic_stage": "tumor_stage",
+            "pathologic_stage": "tumor_stage",
+        }
+        rename_dict = {}
+        taken = set(df.columns)
+        for source, target in column_mapping.items():
+            if source in df.columns and target not in taken:
+                rename_dict[source] = target
+                taken.add(target)
+        df = df.rename(columns=rename_dict)
+
+        # If patient_id doesn't exist, create from index
+        if "patient_id" not in df.columns:
+            df["patient_id"] = df.index.astype(str)
+
+        # reindex keeps the expected columns and adds any missing one as an
+        # empty column, so the CSV header is the same for every input.
+        df_out = df.reindex(columns=output_cols)
+        df_out = df_out.drop_duplicates(subset=["patient_id"])
+
+        df_out.to_csv(output_path, index=False)
+        logger.info(f"Saved {len(df_out)} clinical records to {output_path}")
+
+    @property
+    def default_task(self):
+        """Return a new ``CancerSurvivalPrediction`` task instance.
+
+        The task class is imported on access, not at module import time.
+        A fresh instance is created every time the property is read.
+        """
+        from pyhealth.tasks import CancerSurvivalPrediction
+
+        return CancerSurvivalPrediction()
diff --git a/test-resources/tcga_paad/PAAD_clinical.csv b/test-resources/tcga_paad/PAAD_clinical.csv
new file mode 100644
index 000000000..66297d133
--- /dev/null
+++ b/test-resources/tcga_paad/PAAD_clinical.csv
@@ -0,0 +1,4 @@
+submitter_id,age_at_diagnosis,vital_status,days_to_death,tumor_stage
+TCGA-AB-1234,23000,Alive,,Stage II
+TCGA-AB-5678,25000,Dead,300,Stage III
+TCGA-AB-9012,28000,Alive,,Stage II
diff --git a/test-resources/tcga_paad/PAAD_mutations.csv b/test-resources/tcga_paad/PAAD_mutations.csv
new file mode 100644
index 000000000..4d0ebac95
--- /dev/null
+++ b/test-resources/tcga_paad/PAAD_mutations.csv
@@ -0,0 +1,4 @@
+Hugo_Symbol,Variant_Classification,Variant_Type,HGVSc,HGVSp_Short,Tumor_Sample_Barcode
+KRAS,Missense_Mutation,SNP,c.35G>T,p.G12V,TCGA-AB-1234-01A-01D-1234-08
+TP53,Nonsense_Mutation,SNP,c.743G>A,p.R248Q,TCGA-AB-5678-01A-01D-1234-08
+SMAD4,Frame_Shift_Del,DEL,c.123_124del,p.L41fs,TCGA-AB-9012-01A-01D-1234-08
diff --git a/test-resources/tcga_paad/tcga_paad_clinical-pyhealth.csv b/test-resources/tcga_paad/tcga_paad_clinical-pyhealth.csv
new file mode 100644
index 000000000..cd3162f63
--- /dev/null
+++ b/test-resources/tcga_paad/tcga_paad_clinical-pyhealth.csv
@@ -0,0 +1,4 @@
+patient_id,age_at_diagnosis,vital_status,days_to_death,tumor_stage
+TCGA-AB-1234,23000,Alive,,Stage II
+TCGA-AB-5678,25000,Dead,300.0,Stage III
+TCGA-AB-9012,28000,Alive,,Stage II
diff --git a/test-resources/tcga_paad/tcga_paad_mutations-pyhealth.csv b/test-resources/tcga_paad/tcga_paad_mutations-pyhealth.csv
new file mode 100644
index 000000000..638d940a1
--- /dev/null
+++ b/test-resources/tcga_paad/tcga_paad_mutations-pyhealth.csv
@@ -0,0 +1,4 @@
+patient_id,hugo_symbol,variant_classification,variant_type,hgvsc,hgvsp,tumor_sample_barcode
+TCGA-AB-1234,KRAS,Missense_Mutation,SNP,c.35G>T,p.G12V,TCGA-AB-1234-01A-01D-1234-08
+TCGA-AB-5678,TP53,Nonsense_Mutation,SNP,c.743G>A,p.R248Q,TCGA-AB-5678-01A-01D-1234-08
+TCGA-AB-9012,SMAD4,Frame_Shift_Del,DEL,c.123_124del,p.L41fs,TCGA-AB-9012-01A-01D-1234-08
diff --git a/tests/core/test_tcga_paad.py b/tests/core/test_tcga_paad.py
new file mode 100644
index 000000000..81f9cb2a0
--- /dev/null
+++ b/tests/core/test_tcga_paad.py
@@ -0,0 +1,57 @@
+"""
+Unit tests for the TCGAPAADDataset, mirroring PRAD tests style.
+"""
+import unittest
+from pathlib import Path
+
+from pyhealth.datasets import TCGAPAADDataset
+from pyhealth.tasks import CancerSurvivalPrediction
+
+
+class TestTCGAPAADDataset(unittest.TestCase):
+    """Exercises TCGAPAADDataset against the bundled tcga_paad fixtures."""
+
+    @classmethod
+    def setUpClass(cls):
+        repo_root = Path(__file__).parent.parent.parent
+        cls.test_resources = repo_root / "test-resources" / "tcga_paad"
+
+    def setUp(self):
+        # A fresh dataset is built for every test method.
+        self.dataset = TCGAPAADDataset(root=str(self.test_resources))
+
+    def test_dataset_initialization(self):
+        self.assertIsNotNone(self.dataset)
+        self.assertEqual(self.dataset.dataset_name, "tcga_paad")
+
+    def test_stats(self):
+        # Smoke test: passes as long as stats() does not raise.
+        self.dataset.stats()
+
+    def test_get_patient(self):
+        patient = self.dataset.get_patient("TCGA-AB-1234")
+        self.assertIsNotNone(patient)
+        self.assertEqual(patient.patient_id, "TCGA-AB-1234")
+
+    def test_get_mutation_events(self):
+        patient = self.dataset.get_patient("TCGA-AB-1234")
+        events = patient.get_events(event_type="mutations")
+        self.assertGreaterEqual(len(events), 1)
+
+    def test_get_clinical_events(self):
+        patient = self.dataset.get_patient("TCGA-AB-1234")
+        events = patient.get_events(event_type="clinical")
+        self.assertEqual(len(events), 1)
+
+    def test_default_task(self):
+        task = self.dataset.default_task
+        self.assertIsInstance(task, CancerSurvivalPrediction)
+
+    def test_set_task_survival(self):
+        task = CancerSurvivalPrediction()
+        samples = self.dataset.set_task(task)
+        self.assertGreater(len(samples), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()