Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,10 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
"""Collect all information to display in the __repr__ body."""
# Obtain number of features in accordance with lazy loading.
n_features: int | None = None
if self._qualities is not None and self._qualities["NumberOfFeatures"] is not None:
n_features = int(self._qualities["NumberOfFeatures"])
if self._qualities is not None:
n_features_quality = self._qualities.get("NumberOfFeatures")
if n_features_quality is not None and not pd.isna(n_features_quality):
n_features = int(n_features_quality)
elif self._features is not None:
n_features = len(self._features)

Expand All @@ -312,14 +314,17 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
"Pickle file": (
str(self.data_pickle_file) if self.data_pickle_file is not None else None
),
"# of features": n_features,
}
if n_features is not None:
fields["# of features"] = n_features
if self.upload_date is not None:
fields["Upload Date"] = self.upload_date.replace("T", " ")
if self.dataset_id is not None:
fields["OpenML URL"] = self.openml_url
if self._qualities is not None and self._qualities["NumberOfInstances"] is not None:
fields["# of instances"] = int(self._qualities["NumberOfInstances"])
if self._qualities is not None:
n_instances_quality = self._qualities.get("NumberOfInstances")
if n_instances_quality is not None and not pd.isna(n_instances_quality):
fields["# of instances"] = int(n_instances_quality)

# determines the order in which the information will be printed
order = [
Expand Down Expand Up @@ -794,6 +799,17 @@ def get_data( # noqa: C901
)

target_name = target_names[0]
if target_name not in data.columns:
available_columns = list(data.columns)
msg = (
f"Target column '{target_name}' was removed because it is listed as a "
f"row_id or ignore attribute. "
f"Available columns after filtering: {available_columns}"
if target_name in to_exclude
else f"Target column '{target_name}' does not exist in this dataset. "
f"Available columns: {available_columns}"
)
raise ValueError(msg)
x = data.drop(columns=[target_name])
y = data[target_name].squeeze()

Expand Down
6 changes: 3 additions & 3 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,8 +600,7 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915
# Edit: Found it could also be like ["True", "False"]
attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"],
data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix,
# TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None
default_target_attribute: str,
default_target_attribute: str | None,
ignore_attribute: str | list[str] | None,
citation: str,
row_id_attribute: str | None = None,
Expand Down Expand Up @@ -645,7 +644,8 @@ def create_dataset( # noqa: C901, PLR0912, PLR0915
passing ``attributes='auto'``.
The target feature is indicated as meta-data of the dataset.
default_target_attribute : str | None
The default target attribute, if it exists. Use ``None`` for unsupervised datasets
(e.g. clustering, anomaly detection) where no natural target column exists.
When given as a string, it can have multiple values, comma separated.
ignore_attribute : str | list
Attributes that should be excluded in modelling,
Expand Down