openml · phantom-712 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -297,8 +297,10 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
         """Collect all information to display in the __repr__ body."""
         # Obtain number of features in accordance with lazy loading.
         n_features: int | None = None
-        if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None:
-            n_features = int(self._qualities["NumberOfFeatures"])
-        elif self._features is not None:
+        if self._qualities is not None:
+            n_features_quality = self._qualities.get("NumberOfFeatures")
+            if n_features_quality is not None and not pd.isna(n_features_quality):
+                n_features = int(n_features_quality)
+        if n_features is None and self._features is not None:  # quality missing or NaN
             n_features = len(self._features)
 
@@ -312,14 +314,17 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
             "Pickle file": (
                 str(self.data_pickle_file) if self.data_pickle_file is not None else None
             ),
-            "# of features": n_features,
         }
+        if n_features is not None:
+            fields["# of features"] = n_features
         if self.upload_date is not None:
             fields["Upload Date"] = self.upload_date.replace("T", " ")
         if self.dataset_id is not None:
             fields["OpenML URL"] = self.openml_url
-        if self._qualities is not None and self._qualities["NumberOfInstances"] is not None:
-            fields["# of instances"] = int(self._qualities["NumberOfInstances"])
+        if self._qualities is not None:
+            n_instances_quality = self._qualities.get("NumberOfInstances")
+            if n_instances_quality is not None and not pd.isna(n_instances_quality):
+                fields["# of instances"] = int(n_instances_quality)
 
         # determines the order in which the information will be printed
         order = [
@@ -794,6 +799,17 @@ def get_data(  # noqa: C901
             )
 
         target_name = target_names[0]
+        if target_name not in data.columns:
+            available_columns = list(data.columns)
+            msg = (
+                f"Target column '{target_name}' was removed because it is listed as a "
+                f"row_id or ignore attribute. "
+                f"Available columns after filtering: {available_columns}"
+                if target_name in to_exclude
+                else f"Target column '{target_name}' does not exist in this dataset. "
+                f"Available columns: {available_columns}"
+            )
+            raise ValueError(msg)
         x = data.drop(columns=[target_name])
         y = data[target_name].squeeze()
 

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -600,8 +600,7 @@ def create_dataset(  # noqa: C901, PLR0912, PLR0915
     # Edit: Found it could also be like ["True", "False"]
     attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"],
     data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix,
-    # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None
-    default_target_attribute: str,
+    default_target_attribute: str | None,
     ignore_attribute: str | list[str] | None,
     citation: str,
     row_id_attribute: str | None = None,
@@ -645,7 +644,8 @@ def create_dataset(  # noqa: C901, PLR0912, PLR0915
         passing ``attributes='auto'``.
         The target feature is indicated as meta-data of the dataset.
-    default_target_attribute : str
-        The default target attribute, if it exists.
+    default_target_attribute : str | None
+        The default target attribute, if it exists. Use ``None`` for unsupervised datasets
+        (e.g. clustering, anomaly detection) where no natural target column exists.
         Can have multiple values, comma separated.
     ignore_attribute : str | list
         Attributes that should be excluded in modelling,