diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index 930c55953..5203cd56b 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -64,26 +64,35 @@ def build(self): define_metadata.data, left_on="variable_name", right_on="define_variable_name", - how="left", + how="outer", + ) + # Add united variable name column to use for library merge + merged_data["united_variable_name"] = merged_data["variable_name"].where( + merged_data["variable_name"].notna(), + merged_data["define_variable_name"], ) # Second merge: add library metadata - final_dataframe = merged_data.merge( - library_data[ - [ - "library_variable_name", - "library_variable_label", - "library_variable_data_type", - "library_variable_role", - "library_variable_core", - "library_variable_has_codelist", - "library_variable_ccode", - "library_variable_order_number", - ] - ], - how="left", - left_on="variable_name", - right_on="library_variable_name", - ).fillna("") + final_dataframe = ( + merged_data.merge( + library_data[ + [ + "library_variable_name", + "library_variable_label", + "library_variable_data_type", + "library_variable_role", + "library_variable_core", + "library_variable_has_codelist", + "library_variable_ccode", + "library_variable_order_number", + ] + ], + how="left", + left_on="united_variable_name", + right_on="library_variable_name", + ) + .drop(columns=["united_variable_name"]) + .fillna("") + ) final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = ( final_dataframe.apply( diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py index 3c406deea..36f565c8f 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_dataset_builder.py @@ -41,9 +41,31 @@ def build(self): define_metadata: DatasetInterface = self.dataset_implementation.from_records( variable_metadata ) - return content_metadata.merge( + + final_dataframe = content_metadata.merge( define_metadata.data, left_on="variable_name", right_on="define_variable_name", - how="left", + how="outer", + ).fillna("") + + dataset_contents = self.get_dataset_contents() + final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = ( + final_dataframe.apply( + lambda row: self.get_variable_null_stats( + row["variable_name"], dataset_contents + ), + axis=1, + result_type="expand", + ) ) + + return final_dataframe + + def get_variable_null_stats( + self, variable: str, content: DatasetInterface + ) -> tuple[bool, bool]: + if variable not in content: + return True, True + series = content[variable].mask(content[variable] == "") + return series.isnull().any(), series.isnull().all() diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index 341ddb0f0..3129a11a5 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -226,11 +226,28 @@ def test_build_combined_metadata( assert aeterm_row["variable_has_empty_values"] assert not aeterm_row["variable_is_empty"] - assert len(result) == 3 + # We need to check that the rest of the variables are coming from define.xml + assert ( + len( + result[ + (result["variable_name"] == "") & result["define_variable_name"].notna() + ] + ) + == 34 + ) + + assert len(result) == 37 + # Check that library metadata merged correctly. AESEQ is present only in define metadata. for _, row in result.iterrows(): - assert row["library_variable_name"] != "" - assert row["library_variable_role"] in ["Identifier", "Topic"] - assert row["library_variable_core"] == "Req" - assert row["library_variable_ccode"] in ["C49487", "C69256", "C41331"] - assert row["library_variable_has_codelist"] in [True, True, True] + if row["variable_name"] != "" or row["define_variable_name"] == "AESEQ": + assert row["library_variable_name"] != "" + assert row["library_variable_role"] in ["Identifier", "Topic"] + assert row["library_variable_core"] == "Req" + assert row["library_variable_ccode"] in [ + "C49487", + "C69256", + "C41331", + "C25364", + ] + assert row["library_variable_has_codelist"] in [True, True, True] diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py new file mode 100644 index 000000000..c39f2a0bc --- /dev/null +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_dataset_builder.py @@ -0,0 +1,148 @@ +from unittest.mock import MagicMock, patch +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata +from cdisc_rules_engine.services.cache.in_memory_cache_service import ( + InMemoryCacheService, +) +from cdisc_rules_engine.services.data_services import LocalDataService +from pathlib import Path +import pandas as pd +from cdisc_rules_engine.dataset_builders.variables_metadata_with_define_dataset_builder import ( + VariablesMetadataWithDefineDatasetBuilder, +) + +resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources") +test_define_file_path: Path = resources_path.joinpath("test_defineV21-SDTM.xml") + + +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata" +) +@patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_define_xml_contents" +) +def test_build_combined_metadata( + mock_get_define_xml, + mock_get_variables_metadata, + mock_get_dataset, +): + """Test that the builder correctly combines two metadata sources""" + # Load actual define XML content + with open(test_define_file_path, "rb") as f: + define_data = f.read() + mock_get_define_xml.return_value = define_data + + # Setup variables metadata + mock_get_variables_metadata.return_value = pd.DataFrame.from_dict( + { + "variable_name": ["STUDYID", "USUBJID", "AETERM"], + "variable_label": ["Study ID", "Subject ID", "AE Term"], + "variable_size": [16, 16, 200], + "variable_order_number": [1, 2, 9], + "variable_data_type": ["Char", "Char", "Char"], + } + ) + + # Setup dataset contents + mock_get_dataset.return_value = PandasDataset( + pd.DataFrame.from_dict( + { + "STUDYID": ["STUDY1", "STUDY1", "STUDY1"], + "USUBJID": ["SUBJ1", "", "SUBJ3"], + "AETERM": ["Headache", "Nausea", ""], + } + ) + ) + + # Create builder + builder = VariablesMetadataWithDefineDatasetBuilder( + rule=None, + data_service=LocalDataService(MagicMock(), MagicMock(), MagicMock()), + cache_service=InMemoryCacheService(), + rule_processor=None, + data_processor=None, + dataset_metadata=SDTMDatasetMetadata( + name="AE", + first_record={"DOMAIN": "AE"}, + full_path=str(test_define_file_path), + ), + define_xml_path=str(test_define_file_path), + standard="sdtmig", + standard_version="3-4", + standard_substandard=None, + ) + + result = builder.build() + + expected_columns = { + "variable_name", + "variable_label", + "variable_size", + "variable_order_number", + "variable_data_type", + "define_variable_name", + "define_variable_label", + "define_variable_data_type", + "define_variable_is_collected", + "define_variable_role", + "define_variable_size", + "define_variable_ccode", + "define_variable_format", + "define_variable_allowed_terms", + "define_variable_origin_type", + "define_variable_has_no_data", + "define_variable_order_number", + "define_variable_length", + "define_variable_has_codelist", + "define_variable_codelist_coded_values", + "define_variable_codelist_coded_codes", + "define_variable_mandatory", + "define_variable_has_comment", + "define_variable_has_method", + "variable_has_empty_values", + "variable_is_empty", + } + assert set(result.columns.tolist()) == expected_columns + + core_variables = ["STUDYID", "USUBJID", "AETERM"] + for var in core_variables: + row = result[result["variable_name"] == var].iloc[0] + assert row["define_variable_name"] == var + assert row["variable_has_empty_values"] == (var in ["USUBJID", "AETERM"]) + + studyid_row = result[result["variable_name"] == "STUDYID"].iloc[0] + assert studyid_row["variable_size"] == 16.0 + assert studyid_row["variable_order_number"] == 1.0 + assert studyid_row["variable_data_type"] == "Char" + assert studyid_row["define_variable_role"] == "Identifier" + assert not studyid_row["variable_has_empty_values"] + assert not studyid_row["variable_is_empty"] + + usubjid_row = result[result["variable_name"] == "USUBJID"].iloc[0] + assert usubjid_row["variable_size"] == 16.0 + assert usubjid_row["variable_order_number"] == 2.0 + assert usubjid_row["variable_data_type"] == "Char" + assert usubjid_row["define_variable_role"] == "Identifier" + assert usubjid_row["variable_has_empty_values"] + assert not usubjid_row["variable_is_empty"] + + aeterm_row = result[result["variable_name"] == "AETERM"].iloc[0] + assert aeterm_row["variable_size"] == 200.0 + assert aeterm_row["variable_order_number"] == 9.0 + assert aeterm_row["variable_data_type"] == "Char" + assert aeterm_row["define_variable_role"] == "Topic" + assert aeterm_row["variable_has_empty_values"] + assert not aeterm_row["variable_is_empty"] + + # We need to check that the rest of the variables are coming from define.xml (variable_name is NaN) + assert ( + len( + result[ + (result["variable_name"] == "") & result["define_variable_name"].notna() + ] + ) + == 34 + ) + + assert len(result) == 37 diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index bf06570ca..da79273d4 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -1482,7 +1482,7 @@ def test_validate_dataset_metadata_against_define_xml( pd.DataFrame.from_dict( { "variable_name": [ - "TEST", + "TEST2", ], "variable_label": [ "TEST Label", @@ -1543,13 +1543,25 @@ def test_validate_variable_metadata_against_define_xml( full_path="CDISC01/test", ) mock_get_datasets.return_value = [dataset_metadata] - validation_result: List[dict] = RulesEngine( - standard="sdtmig" - ).validate_single_dataset( - dataset_metadata=dataset_metadata, - rule=define_xml_variable_validation_rule, + df = PandasDataset( + pd.DataFrame.from_dict( + { + "TEST": ["TEST", "TEST", "TEST"], + "TEST2": ["TEST", "TEST", "TEST"], + } + ) ) - assert validation_result == expected_validation_result + with patch( + "cdisc_rules_engine.services.data_services.LocalDataService.get_dataset", + return_value=df, + ): + validation_result: List[dict] = RulesEngine( + standard="sdtmig" + ).validate_single_dataset( + dataset_metadata=dataset_metadata, + rule=define_xml_variable_validation_rule, + ) + assert validation_result == expected_validation_result @pytest.mark.parametrize(