From 9982a8867880f5c40b985f751401df4184bdb4e6 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:03:13 -0700 Subject: [PATCH 1/7] Define dependencies in pyproject.toml Moves dependency constraints to pyproject.toml. Makes requirements.txt a lockfile. --- .github/workflows/lint-format.yml | 2 +- .github/workflows/test-unit.yml | 4 +-- README.md | 8 ++---- pyproject.toml | 42 ++++++++++++++++++++++++++++--- requirements-dev.txt | 7 ------ requirements.txt | 2 ++ 6 files changed, 46 insertions(+), 19 deletions(-) delete mode 100644 requirements-dev.txt diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml index 4c7f70fe0..f455d0b78 100644 --- a/.github/workflows/lint-format.yml +++ b/.github/workflows/lint-format.yml @@ -49,7 +49,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install black flake8 -c requirements-dev.txt + pip install black flake8 - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 38a9e55ae..d11d78b30 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -16,8 +16,8 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements-dev.txt - pip install -e . + pip install -r requirements.txt + pip install --group dev -e . - name: Running Tests env: CDISC_LIBRARY_API_KEY: fakekey12341234 diff --git a/README.md b/README.md index 4340249dc..4b7806a43 100644 --- a/README.md +++ b/README.md @@ -646,11 +646,7 @@ These steps should be run before running any tests or core commands using the no - Install the requirements: - ```bash - python -m pip install -r requirements-dev.txt - ``` - - Run this from the root directory. + `pip install -e . 
&& pip install --group dev` # From the root directory ### Creating an executable version @@ -724,7 +720,7 @@ py -m twine upload --repository {repository_name} dist/* This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. -Both dependencies are added to _requirements-dev.txt_. +Both dependencies are added to the `dev` dependency group in _pyproject.toml_. Setting up `pre-commit` requires one extra step. After installing it you have to run: diff --git a/pyproject.toml b/pyproject.toml index d0da6d416..191c5a3aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,49 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version", "dependencies"] +dynamic = ["version"] description = "Open source offering of the cdisc rules engine" readme = "PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] +dependencies = [ + "business_rules_enhanced >=1.4.8", + "cachetools >=6.1.0", + "cdisc-library-client >=0.1.6", + "click >=8.1.7, <8.3.0", + "dask[dataframe,array] >=2024.6.0, <2024.8.1", + "fastparquet >=2024.2.0", + "importlib-metadata >=8.5.0", + "jsonata-python >=0.6.0", + "jsonpath-ng >=1.6.1, <1.8.0", + "jsonschema >=4.18.5", + "lxml >=5.2.1", + "numpy >=1.26.0", + "odmlib >=0.1.4", + "openpyxl >=3.1.5", + "pandas >=2.1.4, <2.2.0", + "psutil >=6.1.1", + "pyinstaller >=6.11.0", + "pympler >=1.1", + "pyreadstat >=1.2.7, <1.2.9", + "python-dotenv >=1.0.0", + "pyyaml >=6.0.2", + "redis >=4.5.0", + "requests >=2.32.3", + "setuptools >=75.6.0", + "titlecase >=2.4.1", +] + +[dependency-groups] +dev = [ + "black >=24.10.0", + "flake8 >=6.1.0", + "pre-commit >=2.20.0", + "pytest >=7.4.0, <8.0.0", + "pytest-asyncio >=0.21.0", + "pytest-cov >=6.0.0", +] [project.urls] "Homepage" = 
"https://github.com/cdisc-org/cdisc-rules-engine" @@ -26,5 +63,4 @@ include-package-data = true py-modules = ["version"] [tool.setuptools.dynamic] -version = { attr = "version.__version__" } -dependencies = {file = ["requirements.txt"]} \ No newline at end of file +version = { attr = "version.__version__" } \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index ac709f651..000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ --r requirements.txt -black==24.10.0 -flake8==6.1.0 -pre-commit==2.20.0 -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4481051f6..61556482d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Lockfile: exact pinned versions for reproducible installs. +# Dependency constraints are defined in pyproject.toml. business_rules_enhanced==1.4.8 cachetools==6.1.0 cdisc-library-client==0.1.6 From 55c516076ada93f6978d0997f516650339c24166 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:08:54 -0700 Subject: [PATCH 2/7] Support click 8.3.0 Fixes an incompatibility caused by click 8.3.0, which passes the default value as-is. 
--- core.py | 2 -- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/core.py b/core.py index ea1ff2ce6..8b162a66b 100644 --- a/core.py +++ b/core.py @@ -357,7 +357,6 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-s", "--standard", required=True, - default=None, help="CDISC standard to validate against", envvar="PRODUCT", ) @@ -365,7 +364,6 @@ def load_custom_dotenv_from_data_options(ctx, param, value): "-v", "--version", required=True, - default=None, help="Standard version to validate against", envvar="VERSION", ) diff --git a/pyproject.toml b/pyproject.toml index 191c5a3aa..6a3c2891e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "business_rules_enhanced >=1.4.8", "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", - "click >=8.1.7, <8.3.0", + "click >=8.1.7", "dask[dataframe,array] >=2024.6.0, <2024.8.1", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", From 92772d8256b2b6088b81aeea691731e7701fd8ae Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:09:40 -0700 Subject: [PATCH 3/7] Support pyreadstat 1.2.9 Fixes an incompatibility caused by pyreadstat 1.2.9, which changed original_variable_type from 'NULL' to None --- cdisc_rules_engine/services/datasetxpt_metadata_reader.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index 02c1a5ff1..dd68ccabb 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -61,7 +61,7 @@ def read(self) -> dict: "variable_labels": list(metadata.column_labels), "variable_names": list(metadata.column_names), "variable_formats": [ - "" if data_type == "NULL" else data_type + "" if (data_type == "NULL" or data_type is None) else data_type for data_type in 
metadata.original_variable_types.values() ], "variable_name_to_label_map": metadata.column_names_to_labels, diff --git a/pyproject.toml b/pyproject.toml index 6a3c2891e..336321cc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "psutil >=6.1.1", "pyinstaller >=6.11.0", "pympler >=1.1", - "pyreadstat >=1.2.7, <1.2.9", + "pyreadstat >=1.2.7", "python-dotenv >=1.0.0", "pyyaml >=6.0.2", "redis >=4.5.0", From 9e55d8a0d15147f22b8bacd7f8e162ac0061fdbe Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:11:02 -0700 Subject: [PATCH 4/7] Support jsonpath-ng 1.8.0 Works around a behavior change in jsonpath-ng 1.8.0 where Child.str gets wrapped in parentheses. --- .../services/data_services/usdm_data_service.py | 15 +++++++++++++-- pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 83052f2e6..5b3cc6a99 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -417,8 +417,19 @@ def __read_node_metadata( } @staticmethod - def __get_full_path(node: DatumInContext): - return f"{node.full_path}".replace(".[", "[") + def __get_full_path(node: DatumInContext) -> str: + parts = [] + current = node + while current is not None and current.context is not None: + parts.append(str(current.path)) + current = current.context + result = "" + for part in reversed(parts): + if part.startswith("["): + result += part + else: + result = (result + "." 
if result else "") + part + return result def __get_datasets_content_index(self) -> List[dict]: """ diff --git a/pyproject.toml b/pyproject.toml index 336321cc3..dae7f8488 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", - "jsonpath-ng >=1.6.1, <1.8.0", + "jsonpath-ng >=1.6.1", "jsonschema >=4.18.5", "lxml >=5.2.1", "numpy >=1.26.0", From 7a81922c43f1b4c559530d7b4d3a6603a8539ed7 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:11:47 -0700 Subject: [PATCH 5/7] Support dask 2024.8.1 Fixes tokenization errors when using dask 2024.8.1+. Starting with this version, dask enforces that tokens remain stable across pickle round-trips (dask/dask#11320). Capturing self in a lambda fails this check because instance objects can have non-deterministic pickle representations. Since calculate_variable_value_length is already a static method, replacing self with the class name is enough to remove the capture. 
--- .../dataset_builders/contents_define_vlm_dataset_builder.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py index fb2374b9c..328a90e48 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_vlm_dataset_builder.py @@ -66,7 +66,7 @@ def build(self): data_contents_with_vlm["variable_value_length"] = data_contents_with_vlm.data[ ["variable_value", "define_vlm_data_type"] ].apply( - lambda row: self.calculate_variable_value_length( + lambda row: ValuesDatasetBuilder.calculate_variable_value_length( row["variable_value"], row["define_vlm_data_type"] ), axis=1, diff --git a/pyproject.toml b/pyproject.toml index dae7f8488..6ac810c53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2024.8.1", + "dask[dataframe,array] >=2024.6.0, <2025.4.0", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From 0b5e61743e7b83ef92baa8f31133e231168536cd Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:12:31 -0700 Subject: [PATCH 6/7] Support dask 2025.4.0 Dask 2025.4.0 optimizes multiple DataFrames together, which exposes division mismatches and causes dask to throw an error. 
This change removes a source of repartitioning, preserving the divisions when assigning a pandas series to a dask dataframe. --- cdisc_rules_engine/models/dataset/dask_dataset.py | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 7a6449d31..8cb84e470 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -81,9 +81,9 @@ def __setitem__(self, key, value): array_values = da.from_array(value, chunks=tuple(chunks)) self._data[key] = array_values elif isinstance(value, pd.Series): - self._data = self._data.reset_index() - self._data = self._data.set_index("index") - self._data[key] = value + chunks = self._data.map_partitions(lambda x: len(x)).compute().to_numpy() + array_values = da.from_array(value.values, chunks=tuple(chunks)) + self._data[key] = array_values elif isinstance(value, dd.DataFrame): for column in value: self._data[column] = value[column] diff --git a/pyproject.toml b/pyproject.toml index 6ac810c53..cd75e8122 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "cachetools >=6.1.0", "cdisc-library-client >=0.1.6", "click >=8.1.7", - "dask[dataframe,array] >=2024.6.0, <2025.4.0", + "dask[dataframe,array] >=2024.6.0", "fastparquet >=2024.2.0", "importlib-metadata >=8.5.0", "jsonata-python >=0.6.0", From a646ccfa167e68fdb23e17569b6e86dada8c15c6 Mon Sep 17 00:00:00 2001 From: Filipp Shpomer Date: Tue, 28 Apr 2026 17:43:41 -0700 Subject: [PATCH 7/7] Support pandas 2.2.0 Fixes a unit test to support pandas 2.2.0+. The pandas release fixes a sorting bug with https://github.com/pandas-dev/pandas/pull/54611. This commit changes the expected results accordingly. 
--- pyproject.toml | 2 +- .../test_dataset_metadata_define_dataset_builder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cd75e8122..d2c89a6ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "numpy >=1.26.0", "odmlib >=0.1.4", "openpyxl >=3.1.5", - "pandas >=2.1.4, <2.2.0", + "pandas >=2.1.4, <3.0.0", "psutil >=6.1.1", "pyinstaller >=6.11.0", "pympler >=1.1", diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index aff6c25e8..350c2e8dc 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -149,7 +149,7 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): expected_results["dm.xpt"], expected_results["ae.xpt"], ] - ).astype(object) + ).astype(object).sort_values("dataset_location").reset_index(drop=True) result_df = result.data[expected_df.columns].reset_index(drop=True)