Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -64,26 +64,35 @@ def build(self):
define_metadata.data,
left_on="variable_name",
right_on="define_variable_name",
how="left",
how="outer",
)
# Add united variable name column to use for library merge
merged_data["united_variable_name"] = merged_data["variable_name"].where(
merged_data["variable_name"].notna(),
merged_data["define_variable_name"],
)
# Second merge: add library metadata
final_dataframe = merged_data.merge(
library_data[
[
"library_variable_name",
"library_variable_label",
"library_variable_data_type",
"library_variable_role",
"library_variable_core",
"library_variable_has_codelist",
"library_variable_ccode",
"library_variable_order_number",
]
],
how="left",
left_on="variable_name",
right_on="library_variable_name",
).fillna("")
final_dataframe = (
merged_data.merge(
library_data[
[
"library_variable_name",
"library_variable_label",
"library_variable_data_type",
"library_variable_role",
"library_variable_core",
"library_variable_has_codelist",
"library_variable_ccode",
"library_variable_order_number",
]
],
how="left",
left_on="united_variable_name",
right_on="library_variable_name",
)
.drop(columns=["united_variable_name"])
.fillna("")
)

final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = (
final_dataframe.apply(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,31 @@ def build(self):
define_metadata: DatasetInterface = self.dataset_implementation.from_records(
variable_metadata
)
return content_metadata.merge(

final_dataframe = content_metadata.merge(
define_metadata.data,
left_on="variable_name",
right_on="define_variable_name",
how="left",
how="outer",
).fillna("")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we convert the NaNs to an empty string "", we lose the information about whether the column was missing or merely empty. I think it would be better to handle this in a different way. Please let me know if I am misunderstanding the real intent here.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Ramil, I did it for consistency with the metadata+define+library dataset builder, which already had fillna("") in it.

What is the difference between a missing column and an empty one? In my understanding a variable comes either from the dataset (metadata) or from define, so missing values logically appear only when the variable is present in define alone. I might be wrong here.


dataset_contents = self.get_dataset_contents()
final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = (
final_dataframe.apply(
lambda row: self.get_variable_null_stats(
row["variable_name"], dataset_contents
),
axis=1,
result_type="expand",
)
)

return final_dataframe

def get_variable_null_stats(
    self, variable: str, content: DatasetInterface
) -> tuple[bool, bool]:
    """Return (has_empty_values, is_empty) flags for *variable* in *content*.

    A column absent from the dataset counts as entirely empty, so both
    flags are True. Otherwise empty strings are converted to nulls so the
    two kinds of "no value" are counted together.
    """
    if variable not in content:
        # Missing column: every value is "empty" by definition.
        return True, True
    # Treat "" the same as NaN, then test for any/all missing values.
    nulls = content[variable].mask(content[variable] == "").isnull()
    return nulls.any(), nulls.all()
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,28 @@ def test_build_combined_metadata(
assert aeterm_row["variable_has_empty_values"]
assert not aeterm_row["variable_is_empty"]

assert len(result) == 3
# We need to check that the rest of the variables are coming from define.xml
assert (
len(
result[
(result["variable_name"] == "") & result["define_variable_name"].notna()
]
)
== 34
)

assert len(result) == 37

# Check that library metadata merged correctly. AESEQ is present only in define metadata.
for _, row in result.iterrows():
assert row["library_variable_name"] != ""
assert row["library_variable_role"] in ["Identifier", "Topic"]
assert row["library_variable_core"] == "Req"
assert row["library_variable_ccode"] in ["C49487", "C69256", "C41331"]
assert row["library_variable_has_codelist"] in [True, True, True]
if row["variable_name"] != "" or row["define_variable_name"] == "AESEQ":
assert row["library_variable_name"] != ""
assert row["library_variable_role"] in ["Identifier", "Topic"]
assert row["library_variable_core"] == "Req"
assert row["library_variable_ccode"] in [
"C49487",
"C69256",
"C41331",
"C25364",
]
assert row["library_variable_has_codelist"] in [True, True, True]
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from unittest.mock import MagicMock, patch
from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
from cdisc_rules_engine.services.cache.in_memory_cache_service import (
InMemoryCacheService,
)
from cdisc_rules_engine.services.data_services import LocalDataService
from pathlib import Path
import pandas as pd
from cdisc_rules_engine.dataset_builders.variables_metadata_with_define_dataset_builder import (
VariablesMetadataWithDefineDatasetBuilder,
)

resources_path: Path = Path(__file__).parent.parent.parent.joinpath("resources")
test_define_file_path: Path = resources_path.joinpath("test_defineV21-SDTM.xml")


# NOTE: @patch decorators are applied bottom-up, so the bottom-most patch
# (get_define_xml_contents) maps to the FIRST mock parameter below.
@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset")
@patch(
    "cdisc_rules_engine.services.data_services.LocalDataService.get_variables_metadata"
)
@patch(
    "cdisc_rules_engine.services.data_services.LocalDataService.get_define_xml_contents"
)
def test_build_combined_metadata(
    mock_get_define_xml,
    mock_get_variables_metadata,
    mock_get_dataset,
):
    """Test that the builder correctly combines two metadata sources"""
    # Load actual define XML content from the test resources directory
    with open(test_define_file_path, "rb") as f:
        define_data = f.read()
    mock_get_define_xml.return_value = define_data

    # Setup variables metadata: three variables present in the dataset itself
    mock_get_variables_metadata.return_value = pd.DataFrame.from_dict(
        {
            "variable_name": ["STUDYID", "USUBJID", "AETERM"],
            "variable_label": ["Study ID", "Subject ID", "AE Term"],
            "variable_size": [16, 16, 200],
            "variable_order_number": [1, 2, 9],
            "variable_data_type": ["Char", "Char", "Char"],
        }
    )

    # Setup dataset contents: USUBJID and AETERM each contain one empty cell,
    # which drives the variable_has_empty_values assertions further down
    mock_get_dataset.return_value = PandasDataset(
        pd.DataFrame.from_dict(
            {
                "STUDYID": ["STUDY1", "STUDY1", "STUDY1"],
                "USUBJID": ["SUBJ1", "", "SUBJ3"],
                "AETERM": ["Headache", "Nausea", ""],
            }
        )
    )

    # Create builder; rule/processor collaborators are unused by build()
    # in this scenario, so they are passed as None
    builder = VariablesMetadataWithDefineDatasetBuilder(
        rule=None,
        data_service=LocalDataService(MagicMock(), MagicMock(), MagicMock()),
        cache_service=InMemoryCacheService(),
        rule_processor=None,
        data_processor=None,
        dataset_metadata=SDTMDatasetMetadata(
            name="AE",
            first_record={"DOMAIN": "AE"},
            full_path=str(test_define_file_path),
        ),
        define_xml_path=str(test_define_file_path),
        standard="sdtmig",
        standard_version="3-4",
        standard_substandard=None,
    )

    result = builder.build()

    # The combined frame must expose dataset metadata, define metadata, and
    # the two emptiness flags computed from the dataset contents
    expected_columns = {
        "variable_name",
        "variable_label",
        "variable_size",
        "variable_order_number",
        "variable_data_type",
        "define_variable_name",
        "define_variable_label",
        "define_variable_data_type",
        "define_variable_is_collected",
        "define_variable_role",
        "define_variable_size",
        "define_variable_ccode",
        "define_variable_format",
        "define_variable_allowed_terms",
        "define_variable_origin_type",
        "define_variable_has_no_data",
        "define_variable_order_number",
        "define_variable_length",
        "define_variable_has_codelist",
        "define_variable_codelist_coded_values",
        "define_variable_codelist_coded_codes",
        "define_variable_mandatory",
        "define_variable_has_comment",
        "define_variable_has_method",
        "variable_has_empty_values",
        "variable_is_empty",
    }
    assert set(result.columns.tolist()) == expected_columns

    # Variables present in the dataset must match their define counterparts,
    # and the empty-values flag must reflect the blank cells mocked above
    core_variables = ["STUDYID", "USUBJID", "AETERM"]
    for var in core_variables:
        row = result[result["variable_name"] == var].iloc[0]
        assert row["define_variable_name"] == var
        assert row["variable_has_empty_values"] == (var in ["USUBJID", "AETERM"])

    studyid_row = result[result["variable_name"] == "STUDYID"].iloc[0]
    assert studyid_row["variable_size"] == 16.0
    assert studyid_row["variable_order_number"] == 1.0
    assert studyid_row["variable_data_type"] == "Char"
    assert studyid_row["define_variable_role"] == "Identifier"
    assert not studyid_row["variable_has_empty_values"]
    assert not studyid_row["variable_is_empty"]

    usubjid_row = result[result["variable_name"] == "USUBJID"].iloc[0]
    assert usubjid_row["variable_size"] == 16.0
    assert usubjid_row["variable_order_number"] == 2.0
    assert usubjid_row["variable_data_type"] == "Char"
    assert usubjid_row["define_variable_role"] == "Identifier"
    assert usubjid_row["variable_has_empty_values"]
    assert not usubjid_row["variable_is_empty"]

    aeterm_row = result[result["variable_name"] == "AETERM"].iloc[0]
    assert aeterm_row["variable_size"] == 200.0
    assert aeterm_row["variable_order_number"] == 9.0
    assert aeterm_row["variable_data_type"] == "Char"
    assert aeterm_row["define_variable_role"] == "Topic"
    assert aeterm_row["variable_has_empty_values"]
    assert not aeterm_row["variable_is_empty"]

    # Check that the remaining variables come from define.xml only: an outer
    # merge leaves variable_name blank while define_variable_name is populated
    assert (
        len(
            result[
                (result["variable_name"] == "") & result["define_variable_name"].notna()
            ]
        )
        == 34
    )

    # 3 dataset variables + 34 define-only variables
    assert len(result) == 37
26 changes: 19 additions & 7 deletions tests/unit/test_rules_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1482,7 +1482,7 @@ def test_validate_dataset_metadata_against_define_xml(
pd.DataFrame.from_dict(
{
"variable_name": [
"TEST",
"TEST2",
],
"variable_label": [
"TEST Label",
Expand Down Expand Up @@ -1543,13 +1543,25 @@ def test_validate_variable_metadata_against_define_xml(
full_path="CDISC01/test",
)
mock_get_datasets.return_value = [dataset_metadata]
validation_result: List[dict] = RulesEngine(
standard="sdtmig"
).validate_single_dataset(
dataset_metadata=dataset_metadata,
rule=define_xml_variable_validation_rule,
df = PandasDataset(
pd.DataFrame.from_dict(
{
"TEST": ["TEST", "TEST", "TEST"],
"TEST2": ["TEST", "TEST", "TEST"],
}
)
)
assert validation_result == expected_validation_result
with patch(
"cdisc_rules_engine.services.data_services.LocalDataService.get_dataset",
return_value=df,
):
validation_result: List[dict] = RulesEngine(
standard="sdtmig"
).validate_single_dataset(
dataset_metadata=dataset_metadata,
rule=define_xml_variable_validation_rule,
)
assert validation_result == expected_validation_result


@pytest.mark.parametrize(
Expand Down
Loading