From 39ddfeb276fd77cff56d9afee50e41e553935a97 Mon Sep 17 00:00:00 2001
From: Ansuman Patra <hellothere4604@gmail.com>
Date: Tue, 17 Mar 2026 17:13:06 +0530
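Subject: [PATCH 1/4] FIX: Three dataset robustness gaps in OpenMLDataset and
 create_dataset

* _get_repr_body_fields: tolerate missing or NaN "NumberOfFeatures" and
  "NumberOfInstances" qualities instead of raising.
* get_data: raise a descriptive ValueError when the target column is absent
  or was dropped as a row_id/ignore attribute.
* create_dataset: allow default_target_attribute=None for datasets without a
  natural target column.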
---
 openml/datasets/dataset.py   | 27 ++++++++++++++++++++++-----
 openml/datasets/functions.py |  6 +++---
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 59d6205ba..db84e5366 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -297,8 +297,10 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
         """Collect all information to display in the __repr__ body."""
         # Obtain number of features in accordance with lazy loading.
         n_features: int | None = None
-        if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None:
-            n_features = int(self._qualities["NumberOfFeatures"])
+        if self._qualities is not None:
+            n_features_quality = self._qualities.get("NumberOfFeatures")
+            if n_features_quality is not None and not pd.isna(n_features_quality):
+                n_features = int(n_features_quality)
         elif self._features is not None:
             n_features = len(self._features)
 
@@ -312,14 +314,17 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
             "Pickle file": (
                 str(self.data_pickle_file) if self.data_pickle_file is not None else None
             ),
-            "# of features": n_features,
         }
+        if n_features is not None:
+            fields["# of features"] = n_features
         if self.upload_date is not None:
             fields["Upload Date"] = self.upload_date.replace("T", " ")
         if self.dataset_id is not None:
             fields["OpenML URL"] = self.openml_url
-        if self._qualities is not None and self._qualities["NumberOfInstances"] is not None:
-            fields["# of instances"] = int(self._qualities["NumberOfInstances"])
+        if self._qualities is not None:
+            n_instances_quality = self._qualities.get("NumberOfInstances")
+            if n_instances_quality is not None and not pd.isna(n_instances_quality):
+                fields["# of instances"] = int(n_instances_quality)
 
         # determines the order in which the information will be printed
         order = [
@@ -794,6 +799,18 @@ def get_data(  # noqa: C901
             )
 
         target_name = target_names[0]
+        if target_name not in data.columns:
+            available_columns = list(data.columns)
+            if target_name in to_exclude:
+                raise ValueError(
+                    f"Target column '{target_name}' was removed because it is listed as a row_id "
+                    "or ignore attribute. Available columns after filtering: "
+                    f"{available_columns}"
+                )
+            raise ValueError(
+                f"Target column '{target_name}' does not exist in this dataset. Available "
+                f"columns: {available_columns}"
+            )
         x = data.drop(columns=[target_name])
         y = data[target_name].squeeze()
 
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 432938520..d9ff17803 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -600,8 +600,7 @@ def create_dataset(  # noqa: C901, PLR0912, PLR0915
     # Edit: Found it could also be like ["True", "False"]
     attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"],
     data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix,
-    # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None
-    default_target_attribute: str,
+    default_target_attribute: str | None,
     ignore_attribute: str | list[str] | None,
     citation: str,
     row_id_attribute: str | None = None,
@@ -645,7 +644,8 @@ def create_dataset(  # noqa: C901, PLR0912, PLR0915
         passing ``attributes='auto'``.
         The target feature is indicated as meta-data of the dataset.
     default_target_attribute : str
-        The default target attribute, if it exists.
+        The default target attribute, if it exists. Use ``None`` for unsupervised datasets
+        (e.g. clustering, anomaly detection) where no natural target column exists.
         Can have multiple values, comma separated.
     ignore_attribute : str | list
         Attributes that should be excluded in modelling,

From e32a82bd521c018050f3b7dbb2a0a86b0dc1fa03 Mon Sep 17 00:00:00 2001
From: Ansuman Patra <hellothere4604@gmail.com>
Date: Tue, 17 Mar 2026 17:36:38 +0530
Subject: [PATCH 2/4] FIX: reduce branch count in get_data to satisfy PLR0912

Made-with: Cursor
---
 openml/datasets/dataset.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index db84e5366..ca2dd8023 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -802,15 +802,17 @@ def get_data(  # noqa: C901
         if target_name not in data.columns:
             available_columns = list(data.columns)
             if target_name in to_exclude:
-                raise ValueError(
+                msg = (
                     f"Target column '{target_name}' was removed because it is listed as a row_id "
                     "or ignore attribute. Available columns after filtering: "
                     f"{available_columns}"
                 )
-            raise ValueError(
-                f"Target column '{target_name}' does not exist in this dataset. Available "
-                f"columns: {available_columns}"
-            )
+            else:
+                msg = (
+                    f"Target column '{target_name}' does not exist in this dataset. Available "
+                    f"columns: {available_columns}"
+                )
+            raise ValueError(msg)
         x = data.drop(columns=[target_name])
         y = data[target_name].squeeze()
 

From ad6203b04cceb21b855b5daa36578a082af832a1 Mon Sep 17 00:00:00 2001
From: Ansuman Patra <82107832+phantom-712@users.noreply.github.com>
Date: Tue, 17 Mar 2026 18:10:34 +0530
Subject: [PATCH 3/4] FIX: pre-commit error in get_data target-column check

---
 openml/datasets/dataset.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index ca2dd8023..77e57c58c 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -801,17 +801,13 @@ def get_data(  # noqa: C901
         target_name = target_names[0]
         if target_name not in data.columns:
             available_columns = list(data.columns)
-            if target_name in to_exclude:
-                msg = (
-                    f"Target column '{target_name}' was removed because it is listed as a row_id "
-                    "or ignore attribute. Available columns after filtering: "
-                    f"{available_columns}"
-                )
-            else:
-                msg = (
-                    f"Target column '{target_name}' does not exist in this dataset. Available "
-                    f"columns: {available_columns}"
-                )
+            msg = (
+                f"Target column '{target_name}' was removed because it is listed as a "
+                f"row_id or ignore attribute. Available columns after filtering: {available_columns}"
+                if target_name in to_exclude
+                else f"Target column '{target_name}' does not exist in this dataset. "
+                f"Available columns: {available_columns}"
+            )
             raise ValueError(msg)
         x = data.drop(columns=[target_name])
         y = data[target_name].squeeze()

From ee8a741cb7bcb248955dcb4e48a7cc3af3c02905 Mon Sep 17 00:00:00 2001
From: Ansuman Patra <82107832+phantom-712@users.noreply.github.com>
Date: Tue, 17 Mar 2026 19:29:37 +0530
Subject: [PATCH 4/4] FIX: split overlong f-string to satisfy line-length rule

---
 openml/datasets/dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 77e57c58c..2274fe1bb 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -803,7 +803,8 @@ def get_data(  # noqa: C901
             available_columns = list(data.columns)
             msg = (
                 f"Target column '{target_name}' was removed because it is listed as a "
-                f"row_id or ignore attribute. Available columns after filtering: {available_columns}"
+                f"row_id or ignore attribute. "
+                f"Available columns after filtering: {available_columns}"
                 if target_name in to_exclude
                 else f"Target column '{target_name}' does not exist in this dataset. "
                 f"Available columns: {available_columns}"