cdisc-org · SFJohnson24 · May 7, 2026 · May 5, 2026 · May 6, 2026 · May 6, 2026
diff --git a/cdisc_rules_engine/constants/__init__.py b/cdisc_rules_engine/constants/__init__.py
@@ -14,7 +14,7 @@
     "  SAS\\s{5}.{8}SASDATA .{16}\\s{24}.{16}(?P<modified_date>.{16})\\s{16}.{40}"
 )
 
-NULL_FLAVORS = ["", None, {None}, [], {}, np.nan]
+NULL_FLAVORS = ["", None, {}, {None}, [], [None], np.nan]
 
 KNOWN_REPORT_EXTENSIONS = [".json", ".xlsx", ".xls"]
 

diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py
@@ -35,7 +35,7 @@ def _execute_operation(self):
                 data = result[self.params.target].unique()
             if len(data) > 0 and isinstance(data[0], bytes):
                 data = data.astype(str)
-            result = set(data)
+            result = list(data)
         else:
             grouped = result.groupby(
                 self.params.grouping, as_index=False, group_keys=False
@@ -52,7 +52,9 @@ def get_existing_column_names(group):
                         ),
                         axis=1,
                     )
-                    return pd.Series({operation_id: set(values.dropna().unique())})
+                    return pd.Series(
+                        {operation_id: list(values.dropna().sort_index().unique())}
+                    )
 
                 result = grouped.apply(get_existing_column_names).reset_index()
             elif isinstance(result.data, pd.DataFrame):
@@ -65,7 +67,7 @@ def get_existing_column_names(group):
                     .unique()
                     .rename({self.params.target: self.params.operation_id})
                 )
-                result = result.apply(set).to_frame().reset_index()
+                result = result.apply(list).to_frame().reset_index()
         return result
 
     def _get_referenced_datasets(self):
@@ -76,4 +78,4 @@ def _get_referenced_datasets(self):
         return referenced_datasets
 
     def _unique_values_for_column(self, column):
-        return pd.Series({self.params.operation_id: set(column.unique())})
+        return pd.Series({self.params.operation_id: list(column.unique())})
diff --git a/tests/unit/test_operations/test_distinct.py b/tests/unit/test_operations/test_distinct.py
@@ -16,11 +16,11 @@
     [
         (
             PandasDataset.from_dict({"values": [11, 12, 12, 5, 18, 9]}),
-            {5, 9, 11, 12, 18},
+            [11, 12, 5, 18, 9],
         ),
         (
             DaskDataset.from_dict({"values": [11, 12, 12, 5, 18, 9]}),
-            {5, 9, 11, 12, 18},
+            [11, 12, 5, 18, 9],
         ),
     ],
 )
@@ -44,14 +44,14 @@ def test_distinct(data, expected, operation_params: OperationParams):
             PandasDataset.from_dict(
                 {"values": [11, 12, 12, 5, 18, 9], "patient": [1, 2, 2, 1, 2, 1]}
             ),
-            {1: {5, 9, 11}, 2: {12, 18}},
+            {1: [11, 5, 9], 2: [12, 18]},
             None,
         ),
         (
             DaskDataset.from_dict(
                 {"values": [11, 12, 12, 5, 18, 9], "patient": [1, 2, 2, 1, 2, 1]}
             ),
-            {1: {5, 9, 11}, 2: {12, 18}},
+            {1: [11, 5, 9], 2: [12, 18]},
             None,
         ),
         (
@@ -62,7 +62,7 @@ def test_distinct(data, expected, operation_params: OperationParams):
                     "subject": [1, 2, 2, 1, 2, 3],
                 }
             ),
-            {1: {5, 9, 11}, 2: {12, 18}, 3: None},
+            {1: [11, 5, 9], 2: [12, 18], 3: None},
             ["subject"],
         ),
         (
@@ -73,7 +73,7 @@ def test_distinct(data, expected, operation_params: OperationParams):
                     "subject": [1, 2, 2, 1, 2, 3],
                 }
             ),
-            {1: {5, 9, 11}, 2: {12, 18}, 3: None},
+            {1: [11, 5, 9], 2: [12, 18], 3: None},
             ["subject"],
         ),
     ],
@@ -110,7 +110,7 @@ def test_grouped_distinct(
                     "scat": ["a", "a", "a", "a", "a", "b"],
                 }
             ),
-            {1: {5, 11}, 2: {12}},
+            {1: [11, 5], 2: [12]},
             None,
             {"cat": 1, "scat": "a"},
         ),
@@ -123,7 +123,7 @@ def test_grouped_distinct(
                     "scat": ["a", "a", "a", "a", "a", "b"],
                 }
             ),
-            {1: {5, 11}, 2: {12}},
+            {1: [11, 5], 2: [12]},
             None,
             {"cat": 1, "scat": "a"},
         ),
@@ -137,7 +137,7 @@ def test_grouped_distinct(
                     "subject": [1, 2, 2, 1, 2, 3],
                 }
             ),
-            {1: {5, 11}, 2: {12}, 3: None},
+            {1: [11, 5], 2: [12], 3: None},
             ["subject"],
             {"cat": 1, "scat": "a"},
         ),
@@ -151,7 +151,7 @@ def test_grouped_distinct(
                     "subject": [1, 2, 2, 1, 2, 3],
                 }
             ),
-            {1: {5, 11}, 2: {12}, 3: None},
+            {1: [11, 5], 2: [12], 3: None},
             ["subject"],
             {"cat": 1, "scat": "a"},
         ),
@@ -195,7 +195,7 @@ def test_filtered_grouped_distinct(
                     "LBCAT": ["CAT1", "CAT2"],
                 }
             ),
-            {"LBTEST", "LBSEQ"},
+            ["LBTEST", "LBSEQ"],
         ),
         (
             DaskDataset.from_dict(
@@ -211,7 +211,7 @@ def test_filtered_grouped_distinct(
                     "LBCAT": ["CAT1", "CAT2"],
                 }
             ),
-            {"LBTEST", "LBSEQ"},
+            ["LBTEST", "LBSEQ"],
         ),
     ],
 )
@@ -262,7 +262,7 @@ def mock_get_dataset(dataset_name, **kwargs):
                     "LBCAT": ["CAT1", "CAT2"],
                 }
             ),
-            {1: {"LBTEST", "LBSEQ"}, 2: {"LBTEST", "LBSEQ", "LBCAT"}},
+            {1: ["LBTEST", "LBSEQ"], 2: ["LBTEST", "LBSEQ", "LBCAT"]},
             ["subject"],
         ),
         (
@@ -281,7 +281,7 @@ def mock_get_dataset(dataset_name, **kwargs):
                     "LBCAT": ["CAT1", "CAT2"],
                 }
             ),
-            {1: {"LBTEST", "LBSEQ"}, 2: {"LBTEST", "LBSEQ", "LBCAT"}},
+            {1: ["LBTEST", "LBSEQ"], 2: ["LBTEST", "LBSEQ", "LBCAT"]},
             ["subject"],
         ),
     ],

diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py
@@ -498,7 +498,7 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation):
     assert result["$max_aestdy"][0] == df["AESTDY"].max()
     assert result["$min_aestdy"][0] == df["AESTDY"].min()
     assert result["$avg_aestdy"][0] == df["AESTDY"].mean()
-    assert result["$unique_aestdy"].equals(pd.Series([{11, 12, 40, 59}] * len(df)))
+    assert result["$unique_aestdy"].equals(pd.Series([[11, 12, 40, 59]] * len(df)))
 
 
 @pytest.mark.parametrize("dataset_implementation", [PandasDataset, DaskDataset])
@@ -603,22 +603,22 @@ def test_perform_rule_operation_with_grouping(
                     200,
                 ],
                 "$unique_aestdy": [
-                    {
+                    [
                         10,
                         40,
-                    },
-                    {
+                    ],
+                    [
                         11,
                         59,
-                    },
-                    {
+                    ],
+                    [
                         10,
                         40,
-                    },
-                    {
+                    ],
+                    [
                         11,
                         59,
-                    },
+                    ],
                 ],
             }
         )