diff --git a/biasanalyzer/api.py b/biasanalyzer/api.py index 57d947c..c7a59aa 100644 --- a/biasanalyzer/api.py +++ b/biasanalyzer/api.py @@ -40,7 +40,7 @@ def set_config(self, config_file_path: str): except ValidationError as ex: notify_users(f"configuration yaml file is not valid with validation error: {ex}", level="error") - def set_root_omop(self): + def set_root_omop(self, read_only=True): if not self.config: notify_users( "no valid configuration to set root OMOP CDM data. " @@ -62,7 +62,7 @@ def set_root_omop(self): self.bias_db = BiasDatabase(":memory:", omop_db_url=db_url) elif db_type == "duckdb": db_path = self.config["root_omop_cdm_database"].get("database", ":memory:") - self.omop_cdm_db = OMOPCDMDatabase(db_path) + self.omop_cdm_db = OMOPCDMDatabase(db_path, read_only=read_only) self.bias_db = BiasDatabase(":memory:", omop_db_url=db_path) else: notify_users(f"Unsupported database type: {db_type}") diff --git a/biasanalyzer/cohort.py b/biasanalyzer/cohort.py index 979eac5..bd0968a 100644 --- a/biasanalyzer/cohort.py +++ b/biasanalyzer/cohort.py @@ -10,7 +10,7 @@ from biasanalyzer.concept import ConceptHierarchy from biasanalyzer.config import load_cohort_creation_config from biasanalyzer.database import BiasDatabase, OMOPCDMDatabase -from biasanalyzer.models import CohortDefinition, DOMAIN_MAPPING +from biasanalyzer.models import DOMAIN_MAPPING, CohortDefinition from biasanalyzer.utils import clean_string, hellinger_distance, notify_users @@ -60,7 +60,7 @@ def get_concept_stats( Get cohort concept statistics such as concept prevalence """ if concept_type not in DOMAIN_MAPPING: - raise ValueError(f'input concept_type {concept_type} is not a valid concept type to get concept stats') + raise ValueError(f"input concept_type {concept_type} is not a valid concept type to get concept stats") cohort_stats = self.bias_db.get_cohort_concept_stats( self.cohort_id, diff --git a/biasanalyzer/database.py b/biasanalyzer/database.py index 44e75d1..5f18b3d 100644 --- a/biasanalyzer/database.py +++ b/biasanalyzer/database.py @@ -1,6 +1,5 @@ # ruff: noqa: S608 import gc -import platform from datetime import datetime from typing import Optional @@ -322,13 +321,13 @@ class OMOPCDMDatabase: _instance = None # indicating a singleton with only one instance of the class ever created _database_type = None - def __new__(cls, *args, **kwargs): + def __new__(cls, db_url, read_only=True): if cls._instance is None: cls._instance = super().__new__(cls) - cls._instance._initialize(*args, **kwargs) # Initialize only once + cls._instance._initialize(db_url, read_only=read_only) # Initialize only once return cls._instance - def _initialize(self, db_url): + def _initialize(self, db_url, read_only=True): if db_url.endswith(".duckdb"): # close any potential global connections if any for obj in gc.get_objects(): # pragma: no cover @@ -340,11 +339,8 @@ def _initialize(self, db_url): # Handle DuckDB connection try: - if platform.system().lower() == "windows": # pragma: no cover - # it is critical to set duckdb connection to be read-only on windows platform - self.engine = duckdb.connect(db_url, read_only=True) - else: - self.engine = duckdb.connect(db_url) + # it is critical to set duckdb connection to be read-only on windows and Mac platforms + self.engine = duckdb.connect(db_url, read_only=read_only) notify_users(f"Connected to the DuckDB database: {db_url}.") except duckdb.Error as e: # pragma: no cover notify_users(f"Failed to connect to DuckDB: {e}", level="error") @@ -573,4 +569,4 @@ def close(self): else: self.engine.dispose() # pragma: no cover OMOPCDMDatabase._instance = None - notify_users("Connection to the OMOP CDM database closed.") \ No newline at end of file + notify_users("Connection to the OMOP CDM database closed.") diff --git a/poetry.lock b/poetry.lock index 28cf8c9..1fa9bb1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -12,9 +12,6 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} - [[package]] name = "appnope" version = "0.1.4" @@ -273,7 +270,7 @@ description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.10\"" +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -635,7 +632,7 @@ description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version < \"3.12\"" +markers = "python_version <= \"3.11\"" files = [ {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, @@ -728,71 +725,103 @@ files = [ [[package]] name = "pandas" -version = "2.0.3" +version = "2.3.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, - {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, - {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, - {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, - {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, - {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, - {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, - {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, - {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, - {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, - {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, + {file = "pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c"}, + {file = "pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a"}, + {file = "pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1"}, + {file = "pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838"}, + {file = "pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250"}, + {file = "pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4"}, + {file = "pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826"}, + {file = "pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523"}, + {file = "pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45"}, + {file = "pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66"}, + {file = "pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b"}, + {file = "pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791"}, + {file = "pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151"}, + {file = "pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c"}, + {file = "pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53"}, + {file = "pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35"}, + {file = "pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908"}, + {file = "pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89"}, + {file = "pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98"}, + {file = "pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084"}, + {file = "pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b"}, + {file = "pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713"}, + {file = "pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8"}, + {file = "pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d"}, + {file = "pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac"}, + {file = "pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c"}, + {file = "pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493"}, + {file = "pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee"}, + {file = "pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5"}, + {file = "pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21"}, + {file = "pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78"}, + {file = "pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110"}, + {file = "pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86"}, + {file = "pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc"}, + {file = "pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0"}, + {file = "pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593"}, + {file = "pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c"}, + {file = "pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b"}, + {file = "pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6"}, + {file = "pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3"}, + {file = "pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5"}, + {file = "pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec"}, + {file = "pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7"}, + {file = "pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450"}, + {file = "pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5"}, + {file = "pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788"}, + {file = "pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87"}, + {file = "pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2"}, + {file = "pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8"}, + {file = "pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff"}, + {file = "pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29"}, + {file = "pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73"}, + {file = "pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9"}, + {file = "pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa"}, + {file = "pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b"}, ] [package.dependencies] numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, - {version = ">=1.21.0", markers = "python_version == \"3.10\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" -tzdata = ">=2022.1" +tzdata = ">=2022.7" [package.extras] -all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] -aws = ["s3fs (>=2021.08.0)"] -clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] -compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] -computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2021.07.0)"] -gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] -hdf5 = ["tables (>=3.6.1)"] -html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] -mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] -spss = ["pyreadstat (>=1.1.2)"] -sql-other = ["SQLAlchemy (>=1.4.16)"] -test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.6.3)"] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] [[package]] name = "parso" @@ -883,6 +912,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -1233,7 +1263,7 @@ description = "Fundamental algorithms for scientific computing in Python" optional = false python-versions = "<3.12,>=3.8" groups = ["main"] -markers = "python_version < \"3.12\"" +markers = "python_version <= \"3.11\"" files = [ {file = "scipy-1.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7354fd7527a4b0377ce55f286805b34e8c54b91be865bac273f527e1b839019"}, {file = "scipy-1.10.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4b3f429188c66603a1a5c549fb414e4d3bdc2a24792e061ffbd607d3d75fd84e"}, @@ -1453,7 +1483,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.10\"" +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -1577,5 +1607,5 @@ files = [ [metadata] lock-version = "2.1" -python-versions = ">=3.8.10,<3.13" -content-hash = "a588906e6e90991a51af15f24a95ed2841c834dc72b79872e843b54e889f56c5" +python-versions = ">=3.9,<3.13" +content-hash = "31223cf102fa64b291d8ddbff67377aa0e1c9a03368c30fa6d949b4486cee7a5" diff --git a/pyproject.toml b/pyproject.toml index 34f0901..7037a2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,9 +10,9 @@ include = [ {path = "biasanalyzer/sql_templates/*.sql", format=["sdist", "wheel"]} ] [tool.poetry.dependencies] -python = ">=3.8.10,<3.13" +python = ">=3.9,<3.13" duckdb = "^1.1.1" -pandas = "2.0.3" +pandas = "^2.1.4" scipy = [ {version = ">=1.10.1,<1.11", markers = "python_version<'3.12'"}, diff --git a/scripts/ingest_csvs_to_omop_duckdb.py b/scripts/ingest_csvs_to_omop_duckdb.py index 633ecc6..c77f4b8 100644 --- a/scripts/ingest_csvs_to_omop_duckdb.py +++ b/scripts/ingest_csvs_to_omop_duckdb.py @@ -11,45 +11,30 @@ """ import argparse -import sys import csv +import sys import time from pathlib import Path import duckdb - FILENAME_STEM_TO_TABLE_NAME_MAPPING = { # 'demographics': 'person' # 'conditions': 'condition_occurrence' # 'drugs': 'drug_exposure' # 'procedures': 'procedure_occurrence' # 'visits': 'visit_occurrence' - 'observations': 'observation' + "observations": "observation" } COLUMN_MAPPINGS = { - "person": { - "deid_pat_id": "person_id" - }, - "condition_occurrence": { - "deid_pat_id": "person_id" - }, - "drug_exposure": { - "deid_pat_id": "person_id" - }, - "procedure_occurrence": { - "deid_pat_id": "person_id" - }, - "visit_occurrence": { - "deid_pat_id": "person_id" - }, - "observation": { - "deid_pat_id": "person_id" - }, - "measurement": { - "deid_pat_id": "person_id" - }, + "person": {"deid_pat_id": "person_id"}, + "condition_occurrence": {"deid_pat_id": "person_id"}, + "drug_exposure": {"deid_pat_id": "person_id"}, + "procedure_occurrence": {"deid_pat_id": "person_id"}, + "visit_occurrence": {"deid_pat_id": "person_id"}, + "observation": {"deid_pat_id": "person_id"}, + "measurement": {"deid_pat_id": "person_id"}, } OMOP_TABLE_SCHEMAS = { @@ -71,7 +56,7 @@ "race_source_value", "race_source_concept_id", "ethnicity_source_value", - "ethnicity_source_concept_id" + "ethnicity_source_concept_id", ], "condition_occurrence": [ "condition_occurrence_id", @@ -89,9 +74,9 @@ "visit_detail_id", "condition_source_value", "condition_source_concept_id", - "condition_status_source_value" + "condition_status_source_value", ], - 'drug_exposure': [ + "drug_exposure": [ "drug_exposure_id", "person_id", "drug_concept_id", @@ -114,9 +99,9 @@ "drug_source_value", "drug_source_concept_id", "route_source_value", - "dose_unit_source_value" + "dose_unit_source_value", ], - 'procedure_occurrence': [ + "procedure_occurrence": [ "procedure_occurrence_id", "person_id", "procedure_concept_id", @@ -132,9 +117,9 @@ "visit_detail_id", "procedure_source_value", "procedure_source_concept_id", - "modifier_source_value" + "modifier_source_value", ], - 'visit_occurrence': [ + "visit_occurrence": [ "visit_occurrence_id", "person_id", "visit_concept_id", @@ -151,9 +136,9 @@ "admitted_from_source_value", "discharged_to_concept_id", "discharged_to_source_value", - "preceding_visit_occurrence_id" + "preceding_visit_occurrence_id", ], - 'observation': [ + "observation": [ "observation_id", "person_id", "observation_concept_id", @@ -174,28 +159,29 @@ "qualifier_source_value", "value_source_value", "observation_event_id", - "obs_event_field_concept_id" - ] + "obs_event_field_concept_id", + ], } + def load_csv_to_duckdb(con, csv_path: Path, table_name: str): """Load a single CSV file into DuckDB.""" t0 = time.time() print(f"loading {table_name} from {csv_path}") # read and normalize header - with open(csv_path, "r", newline="") as f: + with open(csv_path, newline="") as f: reader = csv.reader(f) raw_header = next(reader) # normalize: lower case + strip quotes/spaces - raw_header = [h.strip().replace('"', '') for h in raw_header] + raw_header = [h.strip().replace('"', "") for h in raw_header] header = [h.lower() for h in raw_header] - print(f'normalized header: {header}') + print(f"normalized header: {header}") mapping = COLUMN_MAPPINGS.get(table_name, {}) final_cols = [mapping.get(col, col) for col in header] - print(f'mapped header: {final_cols}') + print(f"mapped header: {final_cols}") expected = OMOP_TABLE_SCHEMAS.get(table_name, []) final_set = set(final_cols) @@ -209,7 +195,7 @@ def load_csv_to_duckdb(con, csv_path: Path, table_name: str): extra = final_set - set(expected) if extra: print(f"WARNING: Extra columns in CSV for {table_name}: {sorted(extra)}") - print(f"Extra columns will NOT be ingested.") + print("Extra columns will NOT be ingested.") select_clauses = [] for orig, new in zip(raw_header, final_cols): @@ -217,7 +203,7 @@ def load_csv_to_duckdb(con, csv_path: Path, table_name: str): # skip extra columns entirely continue if orig != new: - select_clauses.append(f'{orig} AS {new}') + select_clauses.append(f"{orig} AS {new}") else: select_clauses.append(orig) @@ -268,9 +254,13 @@ def main(): required=False, help="Directory containing OMOP vocabulary CSVs (concept, concept_relationship, etc.)", ) - parser.add_argument("--output", type=Path, - default=Path("Y:/OMOP_duckdb/omop.duckdb"), - required=False, help="Output DuckDB file path") + parser.add_argument( + "--output", + type=Path, + default=Path("Y:/OMOP_duckdb/omop.duckdb"), + required=False, + help="Output DuckDB file path", + ) args = parser.parse_args() diff --git a/tests/conftest.py b/tests/conftest.py index e71d1ed..c377057 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -273,7 +273,7 @@ def test_db(): # mock configuration file bias = BIAS(config_file_path=config_file) - bias.set_root_omop() + bias.set_root_omop(read_only=False) yield bias # Provide the connection to the test diff --git a/tests/query_based/test_cohort_creation.py b/tests/query_based/test_cohort_creation.py index 45c349c..e2fa59a 100644 --- a/tests/query_based/test_cohort_creation.py +++ b/tests/query_based/test_cohort_creation.py @@ -86,10 +86,10 @@ def test_cohort_creation_baseline(caplog, test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 5) - assert_equal(patient_ids, {'106', '108', '110', '111', '112'}) + assert_equal(patient_ids, {"106", "108", "110", "111", "112"}) # select two patients to check for cohort_start_date and cohort_end_date automatically computed - patient_106 = next(item for item in cohort.data if item["subject_id"] == '106') - patient_108 = next(item for item in cohort.data if item["subject_id"] == '108') + patient_106 = next(item for item in cohort.data if item["subject_id"] == "106") + patient_108 = next(item for item in cohort.data if item["subject_id"] == "108") # Replace dates with actual values from your test data assert_equal( @@ -127,7 +127,7 @@ def test_cohort_creation_study(test_db): assert cohort.data is not None, "Cohort creation wrongly returned None data" patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 4) - assert_equal(patient_ids, {'108', '110', '111', '112'}) + assert_equal(patient_ids, {"108", "110", "111", "112"}) def test_cohort_creation_study2(caplog, test_db): @@ -155,7 +155,7 @@ def test_cohort_creation_study2(caplog, test_db): assert cohort.data is not None, "Cohort creation wrongly returned None data" patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 1) - assert_equal(patient_ids, {'106'}) + assert_equal(patient_ids, {"106"}) def test_cohort_creation_all(caplog, test_db): @@ -191,7 +191,7 @@ def test_cohort_creation_all(caplog, test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) print(f"patient_ids: {patient_ids}", flush=True) assert_equal(len(patient_ids), 2) - assert_equal(patient_ids, {'108', '110'}) + assert_equal(patient_ids, {"108", "110"}) def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db): @@ -214,7 +214,7 @@ def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) print(f"patient_ids: {patient_ids}", flush=True) assert_equal(len(patient_ids), 2) - assert_equal(patient_ids, {'108', '110'}) + assert_equal(patient_ids, {"108", "110"}) def test_cohort_creation_mixed_domains(test_db): @@ -242,7 +242,7 @@ def test_cohort_creation_mixed_domains(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) print(f"patient_ids: {patient_ids}", flush=True) assert_equal(len(patient_ids), 3) - assert_equal(patient_ids, {'1', '2', '6'}) + assert_equal(patient_ids, {"1", "2", "6"}) start_dates = [item["cohort_start_date"] for item in cohort.data] assert_equal(len(start_dates), 3) assert_equal(start_dates, [datetime.date(2020, 6, 1), datetime.date(2020, 6, 1), datetime.date(2018, 1, 1)]) @@ -356,10 +356,10 @@ def test_cohort_creation_negative_instance(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 6) # Female patients 1, 2, 3, 5 - assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'}) + assert_equal(patient_ids, {"1", "2", "3", "5", "6", "7"}) # Verify dates for a specific patient (e.g., patient 1 with last diabetes diagnosis) - patient_1 = next(item for item in cohort.data if item["subject_id"] == '1') + patient_1 = next(item for item in cohort.data if item["subject_id"] == "1") assert_equal( patient_1["cohort_start_date"], datetime.date(2020, 6, 1), @@ -392,10 +392,10 @@ def test_cohort_creation_offset(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 6) # Female patients 1, 2, 3, 5 - assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'}) + assert_equal(patient_ids, {"1", "2", "3", "5", "6", "7"}) # Verify dates for a specific patient (e.g., patient 1 with offset) - patient_1 = next(item for item in cohort.data if item["subject_id"] == '1') + patient_1 = next(item for item in cohort.data if item["subject_id"] == "1") # Diabetes on 2020-06-01: -730 days = 2018-06-02, +180 days = 2020-11-28 assert_equal( patient_1["cohort_start_date"], @@ -435,10 +435,10 @@ def test_cohort_creation_negative_instance_offset(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 6) - assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'}) + assert_equal(patient_ids, {"1", "2", "3", "5", "6", "7"}) # Verify dates for a specific patient (e.g., patient 1 with last diabetes and offset) - patient_1 = next(item for item in cohort.data if item["subject_id"] == '1') + patient_1 = next(item for item in cohort.data if item["subject_id"] == "1") # Last diabetes on 2020-06-01: +180 days = 2020-11-28 assert_equal( patient_1["cohort_start_date"], diff --git a/tests/query_based/test_hierarchical_prevalence.py b/tests/query_based/test_hierarchical_prevalence.py index 4541883..e9a4099 100644 --- a/tests/query_based/test_hierarchical_prevalence.py +++ b/tests/query_based/test_hierarchical_prevalence.py @@ -1,7 +1,6 @@ import pytest -from numpy.ma.testutils import assert_equal - from biasanalyzer.concept import ConceptHierarchy +from numpy.ma.testutils import assert_equal def test_cohort_concept_hierarchical_prevalence(test_db, caplog): @@ -28,7 +27,7 @@ def test_cohort_concept_hierarchical_prevalence(test_db, caplog): # test the cohort does not have procedure_occurrence related concepts cohort_stat, _ = cohort.get_concept_stats(concept_type="procedure_occurrence") - assert_equal(cohort_stat, {'procedure_occurrence': []}) + assert_equal(cohort_stat, {"procedure_occurrence": []}) concept_stats, _ = cohort.get_concept_stats(vocab="ICD10CM", print_concept_hierarchy=True) assert concept_stats is not None, "Failed to fetch concept stats" diff --git a/tests/test_database.py b/tests/test_database.py index 97ab8a8..9901331 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -168,8 +168,9 @@ def test_get_cohort_concept_stats_handles_exception(caplog): db = BiasDatabase(":memory:") db.omop_cdm_db_url = "duckdb" qry_builder = CohortQueryBuilder(cohort_creation=False) + with pytest.raises(ValueError): - db.get_cohort_concept_stats(123, qry_builder) + db.get_cohort_concept_stats(123, qry_builder, concept_type="dummy") def test_get_cohort_attributes_handles_exception():