From 1a1dbb4a7ccb2641166d0a36b832213c3d33f528 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Tue, 31 Mar 2026 09:57:34 -0300 Subject: [PATCH 1/4] Using outer join for define.xml metadata. Making dataset+define builder consistent with dataset+define+library bilder. --- ...with_define_and_library_dataset_builder.py | 13 +- ...es_metadata_with_define_dataset_builder.py | 26 +++- ...with_define_and_library_dataset_builder.py | 19 ++- ...es_metadata_with_define_dataset_builder.py | 141 ++++++++++++++++++ 4 files changed, 188 insertions(+), 11 deletions(-) create mode 100644 tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index ee00d2cad..e686c0beb 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -64,7 +64,14 @@ def build(self): define_metadata.data, left_on="variable_name", right_on="define_variable_name", - how="left", + how="outer", + ) + # Add united variable name column to use for library merge + merged_data["united_variable_name"] = merged_data[ + "variable_name" + ].where( + merged_data["variable_name"].notna(), + merged_data["define_variable_name"], ) # Second merge: add library metadata final_dataframe = merged_data.merge( @@ -81,9 +88,9 @@ def build(self): ] ], how="left", - left_on="variable_name", + left_on="united_variable_name", right_on="library_variable_name", - ).fillna("") + ).drop(columns=["united_variable_name"]).fillna("") final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = ( final_dataframe.apply( diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index 25237c09f..e52c8586a 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -41,9 +41,31 @@ def build(self): define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata ) - return content_metadata.merge( + + final_dataframe = content_metadata.merge( define_metadata.data, left_on="variable_name", right_on="define_variable_name", - how="left", + how="outer", + ).fillna("") + + dataset_contents = self.get_dataset_contents() + final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = ( + final_dataframe.apply( + lambda row: self.get_variable_null_stats( + row["variable_name"], dataset_contents + ), + axis=1, + result_type="expand", + ) ) + + return final_dataframe + + def get_variable_null_stats( + self, variable: str, content: DatasetInterface + ) -> tuple[bool, bool]: + if variable not in content: + return True, True + series = content[variable].mask(content[variable] == "") + return series.isnull().any(), series.isnull().all() diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index 674e01176..d6941277e 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -224,11 +224,18 @@ def test_build_combined_metadata( assert aeterm_row["variable_has_empty_values"] assert not aeterm_row["variable_is_empty"] - assert len(result) == 3 + # We need to check that the rest of the variables are coming from define.xml + assert len(result[ + (result["variable_name"] == "") & result["define_variable_name"].notna() + ]) == 34 + assert len(result) == 37 + + # Check that library metadata merged correctly. AESEQ is present only in define metadata. for _, row in result.iterrows(): - assert row["library_variable_name"] != "" - assert row["library_variable_role"] in ["Identifier", "Topic"] - assert row["library_variable_core"] == "Req" - assert row["library_variable_ccode"] in ["C49487", "C69256", "C41331"] - assert row["library_variable_has_codelist"] in [True, True, True] + if (row["variable_name"] != "" or row["define_variable_name"] == "AESEQ"): + assert row["library_variable_name"] != "" + assert row["library_variable_role"] in ["Identifier", "Topic"] + assert row["library_variable_core"] == "Req" + assert row["library_variable_ccode"] in ["C49487", "C69256", "C41331", "C25364"] + assert row["library_variable_has_codelist"] in [True, True, True] diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py new file mode 100644 index 000000000..4702d09d1 --- /dev/null +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py @@ -0,0 +1,141 @@ +from unittest.mock import MagicMock, patch +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata +from cdisc_rules_engine.services.cache.in_memory_cache_service import ( + InMemoryCacheService, +) +from cdisc_rules_engine.services.data_services import LocalDataService +from pathlib import Path +import pandas as pd +from cdisc_rules_engine.dataset_builders.variables_metadata_with_define_dataset_builder import ( + VariablesMetadataWithDefineDatasetBuilder, +) + +resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources") +test_define_file_path: Path = resources_path.joinpath("test_defineV21-SDTM.xml") + + +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata" +) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_define_xml_contents" +) +def test_build_combined_metadata( + mock_get_define_xml, + mock_get_variables_metadata, + mock_get_dataset, +): + """Test that the builder correctly combines two metadata sources""" + # Load actual define XML content + with open(test_define_file_path, "rb") as f: + define_data = f.read() + mock_get_define_xml.return_value = define_data + + # Setup variables metadata + mock_get_variables_metadata.return_value = pd.DataFrame.from_dict( + { + "variable_name": ["STUDYID", "USUBJID", "AETERM"], + "variable_label": ["Study ID", "Subject ID", "AE Term"], + "variable_size": [16, 16, 200], + "variable_order_number": [1, 2, 9], + "variable_data_type": ["Char", "Char", "Char"], + } + ) + + # Setup dataset contents + mock_get_dataset.return_value = PandasDataset( + pd.DataFrame.from_dict( + { + "STUDYID": ["STUDY1", "STUDY1", "STUDY1"], + "USUBJID": ["SUBJ1", "", "SUBJ3"], + "AETERM": ["Headache", "Nausea", ""], + } + ) + ) + + # Create builder + builder = VariablesMetadataWithDefineDatasetBuilder( + rule=None, + data_service=LocalDataService(MagicMock(), MagicMock(), MagicMock()), + cache_service=InMemoryCacheService(), + rule_processor=None, + data_processor=None, + dataset_path=str(test_define_file_path), + datasets=[], + dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), + define_xml_path=str(test_define_file_path), + standard="sdtmig", + standard_version="3-4", + standard_substandard=None, + ) + + result = builder.build() + + expected_columns = { + "variable_name", + "variable_label", + "variable_size", + "variable_order_number", + "variable_data_type", + "define_variable_name", + "define_variable_label", + "define_variable_data_type", + "define_variable_is_collected", + "define_variable_role", + "define_variable_size", + "define_variable_ccode", + "define_variable_format", + "define_variable_allowed_terms", + "define_variable_origin_type", + "define_variable_has_no_data", + "define_variable_order_number", + "define_variable_length", + "define_variable_has_codelist", + "define_variable_codelist_coded_values", + "define_variable_codelist_coded_codes", + "define_variable_mandatory", + "define_variable_has_comment", + "define_variable_has_method", + "variable_has_empty_values", + "variable_is_empty", + } + assert set(result.columns.tolist()) == expected_columns + + core_variables = ["STUDYID", "USUBJID", "AETERM"] + for var in core_variables: + row = result[result["variable_name"] == var].iloc[0] + assert row["define_variable_name"] == var + assert row["variable_has_empty_values"] == (var in ["USUBJID", "AETERM"]) + + studyid_row = result[result["variable_name"] == "STUDYID"].iloc[0] + assert studyid_row["variable_size"] == 16.0 + assert studyid_row["variable_order_number"] == 1.0 + assert studyid_row["variable_data_type"] == "Char" + assert studyid_row["define_variable_role"] == "Identifier" + assert not studyid_row["variable_has_empty_values"] + assert not studyid_row["variable_is_empty"] + + usubjid_row = result[result["variable_name"] == "USUBJID"].iloc[0] + assert usubjid_row["variable_size"] == 16.0 + assert usubjid_row["variable_order_number"] == 2.0 + assert usubjid_row["variable_data_type"] == "Char" + assert usubjid_row["define_variable_role"] == "Identifier" + assert usubjid_row["variable_has_empty_values"] + assert not usubjid_row["variable_is_empty"] + + aeterm_row = result[result["variable_name"] == "AETERM"].iloc[0] + assert aeterm_row["variable_size"] == 200.0 + assert aeterm_row["variable_order_number"] == 9.0 + assert aeterm_row["variable_data_type"] == "Char" + assert aeterm_row["define_variable_role"] == "Topic" + assert aeterm_row["variable_has_empty_values"] + assert not aeterm_row["variable_is_empty"] + + # We need to check that the rest of the variables are coming from define.xml (variable_name is NaN) + assert len(result[ + (result["variable_name"] == "") & result["define_variable_name"].notna() + ]) == 34 + + assert len(result) == 37 From ec3d65bb7d31be14560f1e3ff19e5d3118b1c252 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Tue, 31 Mar 2026 11:14:38 -0300 Subject: [PATCH 2/4] Updating the test, as now contents is read when building a dataset with define.xml --- tests/unit/test_rules_engine.py | 40 +++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 5de91eb06..c11911cdd 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -1442,7 +1442,7 @@ def test_validate_dataset_metadata_against_define_xml( pd.DataFrame.from_dict( { "variable_name": [ - "TEST", + "TEST2", ], "variable_label": [ "TEST Label", @@ -1494,20 +1494,32 @@ def test_validate_variable_metadata_against_define_xml( """ mock_get_define_xml_variables_metadata.return_value = variable_metadata mock_get_variables_metadata.return_value = dataset_mock - dataset_metadata = SDTMDatasetMetadata( - name="AE", - first_record={"DOMAIN": "AE"}, - filename="test", - full_path="CDISC01/test", - ) - validation_result: List[dict] = RulesEngine( - standard="sdtmig" - ).validate_single_dataset( - dataset_metadata=dataset_metadata, - rule=define_xml_variable_validation_rule, - datasets=[dataset_metadata], + df = PandasDataset( + pd.DataFrame.from_dict( + { + "TEST": ["TEST", "TEST", "TEST"], + "TEST2": ["TEST", "TEST", "TEST"], + } + ) ) - assert validation_result == expected_validation_result + with patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=df, + ): + dataset_metadata = SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + filename="test", + full_path="CDISC01/test/ae.xpt", + ) + validation_result: List[dict] = RulesEngine( + standard="sdtmig" + ).validate_single_dataset( + dataset_metadata=dataset_metadata, + rule=define_xml_variable_validation_rule, + datasets=[dataset_metadata], + ) + assert validation_result == expected_validation_result @pytest.mark.parametrize( From 9ebe4c6283ac7a86821e918fa48cb4fb3f15273f Mon Sep 17 00:00:00 2001 From: Dmitry Date: Tue, 31 Mar 2026 11:49:40 -0300 Subject: [PATCH 3/4] Formatting with black --- ...with_define_and_library_dataset_builder.py | 42 ++++++++++--------- ...with_define_and_library_dataset_builder.py | 20 ++++++--- ...es_metadata_with_define_dataset_builder.py | 11 +++-- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index e686c0beb..33f1bf110 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -67,30 +67,32 @@ def build(self): how="outer", ) # Add united variable name column to use for library merge - merged_data["united_variable_name"] = merged_data[ - "variable_name" - ].where( + merged_data["united_variable_name"] = merged_data["variable_name"].where( merged_data["variable_name"].notna(), merged_data["define_variable_name"], ) # Second merge: add library metadata - final_dataframe = merged_data.merge( - library_data[ - [ - "library_variable_name", - "library_variable_label", - "library_variable_data_type", - "library_variable_role", - "library_variable_core", - "library_variable_has_codelist", - "library_variable_ccode", - "library_variable_order_number", - ] - ], - how="left", - left_on="united_variable_name", - right_on="library_variable_name", - ).drop(columns=["united_variable_name"]).fillna("") + final_dataframe = ( + merged_data.merge( + library_data[ + [ + "library_variable_name", + "library_variable_label", + "library_variable_data_type", + "library_variable_role", + "library_variable_core", + "library_variable_has_codelist", + "library_variable_ccode", + "library_variable_order_number", + ] + ], + how="left", + left_on="united_variable_name", + right_on="library_variable_name", + ) + .drop(columns=["united_variable_name"]) + .fillna("") + ) final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = ( final_dataframe.apply( diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index d6941277e..6341890c4 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -225,17 +225,27 @@ def test_build_combined_metadata( assert not aeterm_row["variable_is_empty"] # We need to check that the rest of the variables are coming from define.xml - assert len(result[ - (result["variable_name"] == "") & result["define_variable_name"].notna() - ]) == 34 + assert ( + len( + result[ + (result["variable_name"] == "") & result["define_variable_name"].notna() + ] + ) + == 34 + ) assert len(result) == 37 # Check that library metadata merged correctly. AESEQ is present only in define metadata. for _, row in result.iterrows(): - if (row["variable_name"] != "" or row["define_variable_name"] == "AESEQ"): + if row["variable_name"] != "" or row["define_variable_name"] == "AESEQ": assert row["library_variable_name"] != "" assert row["library_variable_role"] in ["Identifier", "Topic"] assert row["library_variable_core"] == "Req" - assert row["library_variable_ccode"] in ["C49487", "C69256", "C41331", "C25364"] + assert row["library_variable_ccode"] in [ + "C49487", + "C69256", + "C41331", + "C25364", + ] assert row["library_variable_has_codelist"] in [True, True, True] diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py index 4702d09d1..6dcf68f1b 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py @@ -134,8 +134,13 @@ def test_build_combined_metadata( assert not aeterm_row["variable_is_empty"] # We need to check that the rest of the variables are coming from define.xml (variable_name is NaN) - assert len(result[ - (result["variable_name"] == "") & result["define_variable_name"].notna() - ]) == 34 + assert ( + len( + result[ + (result["variable_name"] == "") & result["define_variable_name"].notna() + ] + ) + == 34 + ) assert len(result) == 37 From 6033b20f429aa8e6636726ffe8ec42cef38e9482 Mon Sep 17 00:00:00 2001 From: dmitrymk Date: Mon, 4 May 2026 11:36:09 -0300 Subject: [PATCH 4/4] Updated the new test according to the changes in the main branch --- ...test_variables_metadata_with_define_dataset_builder.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py index 6dcf68f1b..c39f2a0bc 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py @@ -62,9 +62,11 @@ def test_build_combined_metadata( cache_service=InMemoryCacheService(), rule_processor=None, data_processor=None, - dataset_path=str(test_define_file_path), - datasets=[], - dataset_metadata=SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}), + dataset_metadata=SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + full_path=str(test_define_file_path), + ), define_xml_path=str(test_define_file_path), standard="sdtmig", standard_version="3-4",