From dfcd1159feb47524f79e748560c8bff1a80369da Mon Sep 17 00:00:00 2001
From: Aadithya Anumala <aadianumala@Aadithyas-MacBook-Pro.local>
Date: Sun, 7 Dec 2025 23:26:27 -0500
Subject: [PATCH 1/3] add gdsc dataset + unit tests

---
 pyhealth/datasets/__init__.py         |  1 +
 pyhealth/datasets/configs/gdsc.yaml   | 15 ++++++
 pyhealth/datasets/gdsc.py             | 40 ++++++++++++++++
 tests/todo/test_datasets/test_gdsc.py | 67 +++++++++++++++++++++++++++
 4 files changed, 123 insertions(+)
 create mode 100644 pyhealth/datasets/configs/gdsc.yaml
 create mode 100644 pyhealth/datasets/gdsc.py
 create mode 100644 tests/todo/test_datasets/test_gdsc.py

diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
index 7d6a65f16..e680f9dc9 100644
--- a/pyhealth/datasets/__init__.py
+++ b/pyhealth/datasets/__init__.py
@@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs):
 from .dreamt import DREAMTDataset
 from .ehrshot import EHRShotDataset
 from .eicu import eICUDataset
+from .gdsc import GDSCDataset
 from .isruc import ISRUCDataset
 from .medical_transcriptions import MedicalTranscriptionsDataset
 from .mimic3 import MIMIC3Dataset
diff --git a/pyhealth/datasets/configs/gdsc.yaml b/pyhealth/datasets/configs/gdsc.yaml
new file mode 100644
index 000000000..80589a4c8
--- /dev/null
+++ b/pyhealth/datasets/configs/gdsc.yaml
@@ -0,0 +1,15 @@
+version: "1.0"
+tables:
+  drug_info:  # drug-level metadata table (one attribute list, no patient/time keys mapped)
+    file_path: "drug_info_gdsc.csv"
+    patient_id: null  # NOTE(review): no patient id column is mapped -- confirm the loader accepts null here
+    timestamp: null  # no timestamp column is mapped for this table
+    attributes:
+    - "drug_id"
+    - "Name"
+    - "Synonyms"
+    - "Targets"
+    - "Target pathway"
+    - "PubCHEM"
+    - "Sample Size"
+    - "Count"
\ No newline at end of file
diff --git a/pyhealth/datasets/gdsc.py b/pyhealth/datasets/gdsc.py
new file mode 100644
index 000000000..a3c24dd11
--- /dev/null
+++ b/pyhealth/datasets/gdsc.py
@@ -0,0 +1,40 @@
+import logging
+from pathlib import Path
+from typing import List, Optional
+from .base_dataset import BaseDataset
+
+import polars as pl
+
+logger = logging.getLogger(__name__)
+
+
+class GDSCDataset(BaseDataset):
+
+    def __init__(
+        self,
+        root: str,
+        tables: List[str],
+        dataset_name: Optional[str] = None,
+        config_path: Optional[str] = None,
+        **kwargs
+    ) -> None:
+        """
+        Initializes the GDSC Dataset with the specified parameters.
+
+        Args:
+            root (str): The root directory where the dataset is stored.
+            tables (List[str]): Names of the tables to load. The bundled default config defines "drug_info".
+            dataset_name (Optional[str]): The name of the dataset. Defaults to "gdsc".
+            config_path (Optional[str]): The path to the configuration file. If not provided, a default config is used.
+            **kwargs: Extra keyword arguments forwarded unchanged to BaseDataset.
+        """
+        if config_path is None:
+            logger.info("No config path provided, using default config")
+            config_path = Path(__file__).parent / "configs" / "gdsc.yaml"
+        super().__init__(
+            root=root,
+            tables=tables,
+            dataset_name=dataset_name or "gdsc",
+            config_path=config_path,
+            **kwargs
+        )
diff --git a/tests/todo/test_datasets/test_gdsc.py b/tests/todo/test_datasets/test_gdsc.py
new file mode 100644
index 000000000..4b5d7a3bd
--- /dev/null
+++ b/tests/todo/test_datasets/test_gdsc.py
@@ -0,0 +1,67 @@
+import unittest
+
+from pyhealth.datasets import GDSCDataset
+import os
+import sys
+
+current = os.path.dirname(os.path.realpath(__file__))  # directory containing this test file
+repo_root = os.path.dirname(os.path.dirname(os.path.dirname(current)))  # three directory levels above it
+sys.path.append(repo_root)  # NOTE(review): runs after the pyhealth import above, so it cannot affect that import
+
+
+class TestsGDSCDataset(unittest.TestCase):
+    """Checks that GDSCDataset parses the ``drug_info`` table of the demo data."""
+
+    DATASET_NAME = "gdsc-demo"
+    # Use the raw-content host: a github.com ".../blob/..." URL returns an HTML page, not the CSV.
+    ROOT = "https://raw.githubusercontent.com/svshah4/extending-cadre/main/data/input/"
+    TABLES = ["drug_info"]
+    REFRESH_CACHE = True
+
+    @classmethod
+    def setUpClass(cls):
+        """Builds the dataset once, at test time, so load errors surface as test errors."""
+        cls.dataset = GDSCDataset(
+            dataset_name=cls.DATASET_NAME,
+            root=cls.ROOT,
+            tables=cls.TABLES,
+        )
+
+    def test_drug_info(self):
+        """Tests that a drug entry from drug_info_gdsc.csv is parsed correctly."""
+
+        # Pick a deterministic row that should always exist
+        selected_drug_id = "1242"
+        expected = {
+            "Name": "(5Z)-7-Oxozeaenol",
+            "Synonyms": "5Z-7-Oxozeaenol, LL-Z1640-2",
+            "Targets": "TAK1",
+            "Target pathway": "Other, kinases",
+            "PubCHEM": "9863776",
+            "Sample Size": "945",
+            "Count": "266",
+        }
+
+        # NOTE(review): assumes dataset.tables["drug_info"] is a Polars DataFrame with
+        # string-typed columns -- confirm against BaseDataset.
+        drug_df = self.dataset.tables["drug_info"]
+
+        # Basic checks
+        self.assertGreater(len(drug_df), 0)
+        self.assertIn("drug_id", drug_df.columns)
+        self.assertIn("Name", drug_df.columns)
+
+        # Row lookup
+        row = drug_df.filter(pl.col("drug_id") == selected_drug_id)
+        self.assertEqual(1, len(row), "Expected exactly one matched drug entry.")
+
+        row = row.to_dicts()[0]
+
+        # Field-level checks; subTest reports every mismatching column, not just the first
+        for column, expected_value in expected.items():
+            with self.subTest(column=column):
+                self.assertEqual(expected_value, row[column])
+
+
+if __name__ == "__main__":  # run the suite when this file is executed directly
+    unittest.main(verbosity=2)
\ No newline at end of file

From cc342a5b60903ac6aaf1a4d06867969537bf4e32 Mon Sep 17 00:00:00 2001
From: Aadithya Anumala <aadianumala@Aadithyas-MacBook-Pro.local>
Date: Mon, 8 Dec 2025 00:04:42 -0500
Subject: [PATCH 2/3] add missing polars import to gdsc test

---
 tests/todo/test_datasets/test_gdsc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/todo/test_datasets/test_gdsc.py b/tests/todo/test_datasets/test_gdsc.py
index 4b5d7a3bd..2739063d0 100644
--- a/tests/todo/test_datasets/test_gdsc.py
+++ b/tests/todo/test_datasets/test_gdsc.py
@@ -1,6 +1,7 @@
 import unittest
 
 from pyhealth.datasets import GDSCDataset
+import polars as pl
 import os
 import sys
 

From 5e49041d6defd3d37f8fc59d7b3c0887c2e80545 Mon Sep 17 00:00:00 2001
From: Aadithya Anumala <aadianumala@Aadithyas-MacBook-Pro.local>
Date: Mon, 8 Dec 2025 00:07:23 -0500
Subject: [PATCH 3/3] add detail to the gdscdataset docstring

---
 pyhealth/datasets/gdsc.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyhealth/datasets/gdsc.py b/pyhealth/datasets/gdsc.py
index a3c24dd11..a41f73505 100644
--- a/pyhealth/datasets/gdsc.py
+++ b/pyhealth/datasets/gdsc.py
@@ -19,7 +19,14 @@ def __init__(
         **kwargs
     ) -> None:
         """
-        Initializes the GDSC Dataset with the specified parameters.
+        Initializes the GDSC (Genomics of Drug Sensitivity in Cancer) Dataset with the specified parameters.
+        The GDSC drug_info table is a drug-centric metadata table that describes compounds screened across
+        the Genomics of Drug Sensitivity in Cancer (GDSC) cell-line drug-sensitivity project.
+        Typical columns include unique drug identifiers, canonical names, alternate names/synonyms,
+        molecular or protein targets, higher-level pathways targeted, external chemical identifiers
+        (e.g., PubChem CID), and bookkeeping counts such as sample sizes or number of experiments.
+        The broader GDSC resource pairs this drug metadata with measured drug response (e.g., IC50)
+        across hundreds to thousands of cancer cell lines, enabling pharmacogenomic analyses.
 
         Args:
             root (str): The root directory where the dataset is stored.