From 39ddfeb276fd77cff56d9afee50e41e553935a97 Mon Sep 17 00:00:00 2001 From: Ansuman Patra Date: Tue, 17 Mar 2026 17:13:06 +0530 Subject: [PATCH 1/4] FIX: Three dataset robustness gaps in OpenMLDataset and create_dataset --- openml/datasets/dataset.py | 27 ++++++++++++++++++++++----- openml/datasets/functions.py | 6 +++--- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 59d6205ba..db84e5366 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -297,8 +297,10 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]: """Collect all information to display in the __repr__ body.""" # Obtain number of features in accordance with lazy loading. n_features: int | None = None - if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None: - n_features = int(self._qualities["NumberOfFeatures"]) + if self._qualities is not None: + n_features_quality = self._qualities.get("NumberOfFeatures") + if n_features_quality is not None and not pd.isna(n_features_quality): + n_features = int(n_features_quality) elif self._features is not None: n_features = len(self._features) @@ -312,14 +314,17 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]: "Pickle file": ( str(self.data_pickle_file) if self.data_pickle_file is not None else None ), - "# of features": n_features, } + if n_features is not None: + fields["# of features"] = n_features if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace("T", " ") if self.dataset_id is not None: fields["OpenML URL"] = self.openml_url - if self._qualities is not None and self._qualities["NumberOfInstances"] is not None: - fields["# of instances"] = int(self._qualities["NumberOfInstances"]) + if self._qualities is not None: + n_instances_quality = self._qualities.get("NumberOfInstances") + if n_instances_quality is not None and not pd.isna(n_instances_quality): + fields["# of instances"] = int(n_instances_quality) # determines the order in which the information will be printed order = [ @@ -794,6 +799,18 @@ def get_data( # noqa: C901 ) target_name = target_names[0] + if target_name not in data.columns: + available_columns = list(data.columns) + if target_name in to_exclude: + raise ValueError( + f"Target column '{target_name}' was removed because it is listed as a row_id " + "or ignore attribute. Available columns after filtering: " + f"{available_columns}" + ) + raise ValueError( + f"Target column '{target_name}' does not exist in this dataset. Available " + f"columns: {available_columns}" + ) x = data.drop(columns=[target_name]) y = data[target_name].squeeze() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 432938520..d9ff17803 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -600,8 +600,7 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915 # Edit: Found it could also be like ["True", "False"] attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"], data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix, - # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None - default_target_attribute: str, + default_target_attribute: str | None, ignore_attribute: str | list[str] | None, citation: str, row_id_attribute: str | None = None, @@ -645,7 +644,8 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915 passing ``attributes='auto'``. The target feature is indicated as meta-data of the dataset. default_target_attribute : str - The default target attribute, if it exists. + The default target attribute, if it exists. Use ``None`` for unsupervised datasets + (e.g. clustering, anomaly detection) where no natural target column exists. Can have multiple values, comma separated. ignore_attribute : str | list Attributes that should be excluded in modelling, From e32a82bd521c018050f3b7dbb2a0a86b0dc1fa03 Mon Sep 17 00:00:00 2001 From: Ansuman Patra Date: Tue, 17 Mar 2026 17:36:38 +0530 Subject: [PATCH 2/4] FIX: reduce branch count in get_data to satisfy PLR0912 Made-with: Cursor --- openml/datasets/dataset.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index db84e5366..ca2dd8023 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -802,15 +802,17 @@ def get_data( # noqa: C901 if target_name not in data.columns: available_columns = list(data.columns) if target_name in to_exclude: - raise ValueError( + msg = ( f"Target column '{target_name}' was removed because it is listed as a row_id " "or ignore attribute. Available columns after filtering: " f"{available_columns}" ) - raise ValueError( - f"Target column '{target_name}' does not exist in this dataset. Available " - f"columns: {available_columns}" - ) + else: + msg = ( + f"Target column '{target_name}' does not exist in this dataset. Available " + f"columns: {available_columns}" + ) + raise ValueError(msg) x = data.drop(columns=[target_name]) y = data[target_name].squeeze() From ad6203b04cceb21b855b5daa36578a082af832a1 Mon Sep 17 00:00:00 2001 From: Ansuman Patra <82107832+phantom-712@users.noreply.github.com> Date: Tue, 17 Mar 2026 18:10:34 +0530 Subject: [PATCH 3/4] fixed pre-commit error --- openml/datasets/dataset.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index ca2dd8023..77e57c58c 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -801,17 +801,13 @@ def get_data( # noqa: C901 target_name = target_names[0] if target_name not in data.columns: available_columns = list(data.columns) - if target_name in to_exclude: - msg = ( - f"Target column '{target_name}' was removed because it is listed as a row_id " - "or ignore attribute. Available columns after filtering: " - f"{available_columns}" - ) - else: - msg = ( - f"Target column '{target_name}' does not exist in this dataset. Available " - f"columns: {available_columns}" - ) + msg = ( + f"Target column '{target_name}' was removed because it is listed as a " + f"row_id or ignore attribute. Available columns after filtering: {available_columns}" + if target_name in to_exclude + else f"Target column '{target_name}' does not exist in this dataset. " + f"Available columns: {available_columns}" + ) raise ValueError(msg) x = data.drop(columns=[target_name]) y = data[target_name].squeeze() From ee8a741cb7bcb248955dcb4e48a7cc3af3c02905 Mon Sep 17 00:00:00 2001 From: Ansuman Patra <82107832+phantom-712@users.noreply.github.com> Date: Tue, 17 Mar 2026 19:29:37 +0530 Subject: [PATCH 4/4] fixed according to max number of characters in a line rule --- openml/datasets/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 77e57c58c..2274fe1bb 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -803,7 +803,8 @@ def get_data( # noqa: C901 available_columns = list(data.columns) msg = ( f"Target column '{target_name}' was removed because it is listed as a " - f"row_id or ignore attribute. Available columns after filtering: {available_columns}" + f"row_id or ignore attribute. " + f"Available columns after filtering: {available_columns}" if target_name in to_exclude else f"Target column '{target_name}' does not exist in this dataset. " f"Available columns: {available_columns}"