From dfcd1159feb47524f79e748560c8bff1a80369da Mon Sep 17 00:00:00 2001 From: Aadithya Anumala Date: Sun, 7 Dec 2025 23:26:27 -0500 Subject: [PATCH 1/3] add gdsc dataset + unit tests --- pyhealth/datasets/__init__.py | 1 + pyhealth/datasets/configs/gdsc.yaml | 15 ++++++ pyhealth/datasets/gdsc.py | 40 ++++++++++++++++ tests/todo/test_datasets/test_gdsc.py | 67 +++++++++++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 pyhealth/datasets/configs/gdsc.yaml create mode 100644 pyhealth/datasets/gdsc.py create mode 100644 tests/todo/test_datasets/test_gdsc.py diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py index 7d6a65f16..e680f9dc9 100644 --- a/pyhealth/datasets/__init__.py +++ b/pyhealth/datasets/__init__.py @@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs): from .dreamt import DREAMTDataset from .ehrshot import EHRShotDataset from .eicu import eICUDataset +from .gdsc import GDSCDataset from .isruc import ISRUCDataset from .medical_transcriptions import MedicalTranscriptionsDataset from .mimic3 import MIMIC3Dataset diff --git a/pyhealth/datasets/configs/gdsc.yaml b/pyhealth/datasets/configs/gdsc.yaml new file mode 100644 index 000000000..80589a4c8 --- /dev/null +++ b/pyhealth/datasets/configs/gdsc.yaml @@ -0,0 +1,15 @@ +version: "1.0" +tables: + drug_info: + file_path: "drug_info_gdsc.csv" + patient_id: null + timestamp: null + attributes: + - "drug_id" + - "Name" + - "Synonyms" + - "Targets" + - "Target pathway" + - "PubCHEM" + - "Sample Size" + - "Count" \ No newline at end of file diff --git a/pyhealth/datasets/gdsc.py b/pyhealth/datasets/gdsc.py new file mode 100644 index 000000000..a3c24dd11 --- /dev/null +++ b/pyhealth/datasets/gdsc.py @@ -0,0 +1,40 @@ +import logging +from pathlib import Path +from typing import List, Optional +from .base_dataset import BaseDataset + +import polars as pl + +logger = logging.getLogger(__name__) + + +class GDSCDataset(BaseDataset): + + def __init__( + self, + root: str, + tables: List[str], + dataset_name: Optional[str] = None, + config_path: Optional[str] = None, + **kwargs + ) -> None: + """ + Initializes the GDSC Dataset with the specified parameters. + + Args: + root (str): The root directory where the dataset is stored. + tables (List[str]): A list of additional tables to include. + dataset_name (Optional[str]): The name of the dataset. Defaults to "gdsc". + config_path (Optional[str]): The path to the configuration file. If not provided, a default config is used. + """ + if config_path is None: + logger.info("No config path provided, using default config") + config_path = Path(__file__).parent / "configs" / "gdsc.yaml" + super().__init__( + root=root, + tables=tables, + dataset_name=dataset_name or "gdsc", + config_path=config_path, + **kwargs + ) + return diff --git a/tests/todo/test_datasets/test_gdsc.py b/tests/todo/test_datasets/test_gdsc.py new file mode 100644 index 000000000..4b5d7a3bd --- /dev/null +++ b/tests/todo/test_datasets/test_gdsc.py @@ -0,0 +1,67 @@ +import unittest + +from pyhealth.datasets import GDSCDataset +import os +import sys + +current = os.path.dirname(os.path.realpath(__file__)) +repo_root = os.path.dirname(os.path.dirname(os.path.dirname(current))) +sys.path.append(repo_root) + + +class TestsGDSCDataset(unittest.TestCase): + DATASET_NAME = "gdsc-demo" + ROOT = "https://github.com/svshah4/extending-cadre/blob/main/data/input/" + TABLES = ["drug_info"] + REFRESH_CACHE = True + + dataset = GDSCDataset( + dataset_name=DATASET_NAME, + root=ROOT, + tables=TABLES, + ) + + def setUp(self): + pass + + def test_drug_info(self): + """Tests that a drug entry from drug_info_gdsc.csv is parsed correctly.""" + + # Pick a deterministic row that should always exist + selected_drug_id = "1242" + + expected_name = "(5Z)-7-Oxozeaenol" + expected_synonyms = "5Z-7-Oxozeaenol, LL-Z1640-2" + expected_targets = "TAK1" + expected_pathway = "Other, kinases" + expected_pubchem = "9863776" + expected_sample_size = "945" + expected_count = "266" + + # dataset.tables["drug_info"] should be a Polars DataFrame + drug_df = self.dataset.tables["drug_info"] + + # Basic checks + self.assertTrue(len(drug_df) > 0) + self.assertIn("drug_id", drug_df.columns) + self.assertIn("Name", drug_df.columns) + + # Row lookup + row = drug_df.filter(pl.col("drug_id") == selected_drug_id) + + self.assertEqual(1, len(row), "Expected exactly one matched drug entry.") + + row = row.to_dicts()[0] + + # Field-level checks + self.assertEqual(expected_name, row["Name"]) + self.assertEqual(expected_synonyms, row["Synonyms"]) + self.assertEqual(expected_targets, row["Targets"]) + self.assertEqual(expected_pathway, row["Target pathway"]) + self.assertEqual(expected_pubchem, row["PubCHEM"]) + self.assertEqual(expected_sample_size, row["Sample Size"]) + self.assertEqual(expected_count, row["Count"]) + + +if __name__ == "__main__": + unittest.main(verbosity=2) \ No newline at end of file From cc342a5b60903ac6aaf1a4d06867969537bf4e32 Mon Sep 17 00:00:00 2001 From: Aadithya Anumala Date: Mon, 8 Dec 2025 00:04:42 -0500 Subject: [PATCH 2/3] add polars dependency --- tests/todo/test_datasets/test_gdsc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/todo/test_datasets/test_gdsc.py b/tests/todo/test_datasets/test_gdsc.py index 4b5d7a3bd..2739063d0 100644 --- a/tests/todo/test_datasets/test_gdsc.py +++ b/tests/todo/test_datasets/test_gdsc.py @@ -1,6 +1,7 @@ import unittest from pyhealth.datasets import GDSCDataset +import polars as pl import os import sys From 5e49041d6defd3d37f8fc59d7b3c0887c2e80545 Mon Sep 17 00:00:00 2001 From: Aadithya Anumala Date: Mon, 8 Dec 2025 00:07:23 -0500 Subject: [PATCH 3/3] add detail to the gdscdataset docstring --- pyhealth/datasets/gdsc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pyhealth/datasets/gdsc.py b/pyhealth/datasets/gdsc.py index a3c24dd11..a41f73505 100644 --- a/pyhealth/datasets/gdsc.py +++ b/pyhealth/datasets/gdsc.py @@ -19,7 +19,14 @@ def __init__( **kwargs ) -> None: """ - Initializes the GDSC Dataset with the specified parameters. + Initializes the GDSC (Genomics of Drug Sensitivity in Cancer) Dataset with the specified parameters. + The GDSC drug_info table is a drug-centric metadata table that describes compounds screened across + the Genomics of Drug Sensitivity in Cancer (GDSC) cell-line drug-sensitivity project. + Typical columns include unique drug identifiers, canonical names, alternate names/synonyms, + molecular or protein targets, higher-level pathways targeted, external chemical identifiers + (e.g., PubChem CID), and bookkeeping counts such as sample sizes or number of experiments. + The broader GDSC resource pairs these drug metadata with measured drug response (e.g., IC50) + across hundreds to thousands of cancer cell lines, enabling pharmacogenomic analyses. Args: root (str): The root directory where the dataset is stored.