diff --git a/cdisc_rules_engine/constants/__init__.py b/cdisc_rules_engine/constants/__init__.py
index 77d0a6aea..afe9572f7 100644
--- a/cdisc_rules_engine/constants/__init__.py
+++ b/cdisc_rules_engine/constants/__init__.py
@@ -14,7 +14,7 @@
     " SAS\\s{5}.{8}SASDATA .{16}\\s{24}.{16}(?P.{16})\\s{16}.{40}"
 )
 
-NULL_FLAVORS = ["", None, {None}, [], {}, np.nan]
+NULL_FLAVORS = ["", None, {}, {None}, [], [None], np.nan]
 
 KNOWN_REPORT_EXTENSIONS = [".json", ".xlsx", ".xls"]
 
diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py
index a7485a4f6..7d3ab9897 100644
--- a/cdisc_rules_engine/operations/distinct.py
+++ b/cdisc_rules_engine/operations/distinct.py
@@ -35,7 +35,7 @@ def _execute_operation(self):
             data = result[self.params.target].unique()
             if len(data) > 0 and isinstance(data[0], bytes):
                 data = data.astype(str)
-            result = set(data)
+            result = list(data)
         else:
             grouped = result.groupby(
                 self.params.grouping, as_index=False, group_keys=False
@@ -52,7 +52,9 @@ def get_existing_column_names(group):
                     ),
                     axis=1,
                 )
-                return pd.Series({operation_id: set(values.dropna().unique())})
+                return pd.Series(
+                    {operation_id: list(values.dropna().sort_index().unique())}
+                )
 
             result = grouped.apply(get_existing_column_names).reset_index()
         elif isinstance(result.data, pd.DataFrame):
@@ -65,7 +67,7 @@ def get_existing_column_names(group):
                 .unique()
                 .rename({self.params.target: self.params.operation_id})
             )
-            result = result.apply(set).to_frame().reset_index()
+            result = result.apply(list).to_frame().reset_index()
         return result
 
     def _get_referenced_datasets(self):
@@ -76,4 +78,4 @@ def _get_referenced_datasets(self):
         return referenced_datasets
 
     def _unique_values_for_column(self, column):
-        return pd.Series({self.params.operation_id: set(column.unique())})
+        return pd.Series({self.params.operation_id: list(column.unique())})
diff --git a/tests/unit/test_operations/test_distinct.py b/tests/unit/test_operations/test_distinct.py
index 0e0f68b77..e64f2d598 100644
--- a/tests/unit/test_operations/test_distinct.py
+++ b/tests/unit/test_operations/test_distinct.py
@@ -16,11 +16,11 @@
     [
         (
             PandasDataset.from_dict({"values": [11, 12, 12, 5, 18, 9]}),
-            {5, 9, 11, 12, 18},
+            [11, 12, 5, 18, 9],
         ),
         (
             DaskDataset.from_dict({"values": [11, 12, 12, 5, 18, 9]}),
-            {5, 9, 11, 12, 18},
+            [11, 12, 5, 18, 9],
         ),
     ],
 )
@@ -44,14 +44,14 @@ def test_distinct(data, expected, operation_params: OperationParams):
             PandasDataset.from_dict(
                 {"values": [11, 12, 12, 5, 18, 9], "patient": [1, 2, 2, 1, 2, 1]}
             ),
-            {1: {5, 9, 11}, 2: {12, 18}},
+            {1: [11, 5, 9], 2: [12, 18]},
             None,
         ),
         (
            DaskDataset.from_dict(
                 {"values": [11, 12, 12, 5, 18, 9], "patient": [1, 2, 2, 1, 2, 1]}
             ),
-            {1: {5, 9, 11}, 2: {12, 18}},
+            {1: [11, 5, 9], 2: [12, 18]},
             None,
         ),
         (
@@ -62,7 +62,7 @@ def test_distinct(data, expected, operation_params: OperationParams):
                     "subject": [1, 2, 2, 1, 2, 3],
                 }
             ),
-            {1: {5, 9, 11}, 2: {12, 18}, 3: None},
+            {1: [11, 5, 9], 2: [12, 18], 3: None},
             ["subject"],
         ),
         (
@@ -73,7 +73,7 @@ def test_distinct(data, expected, operation_params: OperationParams):
                     "subject": [1, 2, 2, 1, 2, 3],
                 }
             ),
-            {1: {5, 9, 11}, 2: {12, 18}, 3: None},
+            {1: [11, 5, 9], 2: [12, 18], 3: None},
             ["subject"],
         ),
     ],
@@ -110,7 +110,7 @@ def test_grouped_distinct(
                 "scat": ["a", "a", "a", "a", "a", "b"],
             }
         ),
-        {1: {5, 11}, 2: {12}},
+        {1: [11, 5], 2: [12]},
         None,
         {"cat": 1, "scat": "a"},
     ),
@@ -123,7 +123,7 @@ def test_grouped_distinct(
                 "scat": ["a", "a", "a", "a", "a", "b"],
             }
         ),
-        {1: {5, 11}, 2: {12}},
+        {1: [11, 5], 2: [12]},
         None,
         {"cat": 1, "scat": "a"},
     ),
@@ -137,7 +137,7 @@ def test_grouped_distinct(
                 "subject": [1, 2, 2, 1, 2, 3],
             }
         ),
-        {1: {5, 11}, 2: {12}, 3: None},
+        {1: [11, 5], 2: [12], 3: None},
         ["subject"],
         {"cat": 1, "scat": "a"},
     ),
@@ -151,7 +151,7 @@ def test_grouped_distinct(
                 "subject": [1, 2, 2, 1, 2, 3],
             }
         ),
-        {1: {5, 11}, 2: {12}, 3: None},
+        {1: [11, 5], 2: [12], 3: None},
        ["subject"],
        {"cat": 1, "scat": "a"},
     ),
@@ -195,7 +195,7 @@ def test_filtered_grouped_distinct(
                 "LBCAT": ["CAT1", "CAT2"],
             }
         ),
-        {"LBTEST", "LBSEQ"},
+        ["LBTEST", "LBSEQ"],
     ),
     (
         DaskDataset.from_dict(
@@ -211,7 +211,7 @@ def test_filtered_grouped_distinct(
                 "LBCAT": ["CAT1", "CAT2"],
             }
         ),
-        {"LBTEST", "LBSEQ"},
+        ["LBTEST", "LBSEQ"],
     ),
 ],
 )
@@ -262,7 +262,7 @@ def mock_get_dataset(dataset_name, **kwargs):
                 "LBCAT": ["CAT1", "CAT2"],
             }
         ),
-        {1: {"LBTEST", "LBSEQ"}, 2: {"LBTEST", "LBSEQ", "LBCAT"}},
+        {1: ["LBTEST", "LBSEQ"], 2: ["LBTEST", "LBSEQ", "LBCAT"]},
         ["subject"],
     ),
     (
@@ -281,7 +281,7 @@ def mock_get_dataset(dataset_name, **kwargs):
                 "LBCAT": ["CAT1", "CAT2"],
             }
         ),
-        {1: {"LBTEST", "LBSEQ"}, 2: {"LBTEST", "LBSEQ", "LBCAT"}},
+        {1: ["LBTEST", "LBSEQ"], 2: ["LBTEST", "LBSEQ", "LBCAT"]},
         ["subject"],
     ),
 ],
diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py
index e0f0ed23a..18e73e10b 100644
--- a/tests/unit/test_utilities/test_rule_processor.py
+++ b/tests/unit/test_utilities/test_rule_processor.py
@@ -498,7 +498,7 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation):
     assert result["$max_aestdy"][0] == df["AESTDY"].max()
     assert result["$min_aestdy"][0] == df["AESTDY"].min()
     assert result["$avg_aestdy"][0] == df["AESTDY"].mean()
-    assert result["$unique_aestdy"].equals(pd.Series([{11, 12, 40, 59}] * len(df)))
+    assert result["$unique_aestdy"].equals(pd.Series([[11, 12, 40, 59]] * len(df)))
 
 
 @pytest.mark.parametrize("dataset_implementation", [PandasDataset, DaskDataset])
@@ -603,22 +603,22 @@ def test_perform_rule_operation_with_grouping(
                 200,
             ],
             "$unique_aestdy": [
-                {
+                [
                     10,
                     40,
-                },
-                {
+                ],
+                [
                     11,
                     59,
-                },
-                {
+                ],
+                [
                     10,
                     40,
-                },
-                {
+                ],
+                [
                     11,
                     59,
-                },
+                ],
             ],
         }
     )