diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 59d6205ba..2274fe1bb 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -297,8 +297,10 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]: """Collect all information to display in the __repr__ body.""" # Obtain number of features in accordance with lazy loading. n_features: int | None = None - if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None: - n_features = int(self._qualities["NumberOfFeatures"]) + if self._qualities is not None: + n_features_quality = self._qualities.get("NumberOfFeatures") + if n_features_quality is not None and not pd.isna(n_features_quality): + n_features = int(n_features_quality) elif self._features is not None: n_features = len(self._features) @@ -312,14 +314,17 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]: "Pickle file": ( str(self.data_pickle_file) if self.data_pickle_file is not None else None ), - "# of features": n_features, } + if n_features is not None: + fields["# of features"] = n_features if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace("T", " ") if self.dataset_id is not None: fields["OpenML URL"] = self.openml_url - if self._qualities is not None and self._qualities["NumberOfInstances"] is not None: - fields["# of instances"] = int(self._qualities["NumberOfInstances"]) + if self._qualities is not None: + n_instances_quality = self._qualities.get("NumberOfInstances") + if n_instances_quality is not None and not pd.isna(n_instances_quality): + fields["# of instances"] = int(n_instances_quality) # determines the order in which the information will be printed order = [ @@ -794,6 +799,17 @@ def get_data( # noqa: C901 ) target_name = target_names[0] + if target_name not in data.columns: + available_columns = list(data.columns) + msg = ( + f"Target column '{target_name}' was removed because it is listed as a " + f"row_id or ignore attribute. " + f"Available columns after filtering: {available_columns}" + if target_name in to_exclude + else f"Target column '{target_name}' does not exist in this dataset. " + f"Available columns: {available_columns}" + ) + raise ValueError(msg) x = data.drop(columns=[target_name]) y = data[target_name].squeeze() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 432938520..d9ff17803 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -600,8 +600,7 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915 # Edit: Found it could also be like ["True", "False"] attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"], data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix, - # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None - default_target_attribute: str, + default_target_attribute: str | None, ignore_attribute: str | list[str] | None, citation: str, row_id_attribute: str | None = None, @@ -645,7 +644,8 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915 passing ``attributes='auto'``. The target feature is indicated as meta-data of the dataset. default_target_attribute : str - The default target attribute, if it exists. + The default target attribute, if it exists. Use ``None`` for unsupervised datasets + (e.g. clustering, anomaly detection) where no natural target column exists. Can have multiple values, comma separated. ignore_attribute : str | list Attributes that should be excluded in modelling,