⚠ This page is served via a proxy. Original site: https://github.com
This service does not collect credentials or authentication data.
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 3 additions & 35 deletions openml/base.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,22 @@
# License: BSD 3-Clause
from __future__ import annotations

import re
import webbrowser
from abc import ABC, abstractmethod
from collections.abc import Iterable, Sequence
from collections.abc import Sequence

import xmltodict

import openml._api_calls
import openml.config
from openml.utils import ReprMixin

from .utils import _get_rest_api_type_alias, _tag_openml_base


class OpenMLBase(ABC):
class OpenMLBase(ReprMixin, ABC):
"""Base object for functionality that is shared across entities."""

def __repr__(self) -> str:
body_fields = self._get_repr_body_fields()
return self._apply_repr_template(body_fields)

@property
@abstractmethod
def id(self) -> int | None:
Expand Down Expand Up @@ -60,34 +56,6 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | N
"""
# Should be implemented in the base class.

def _apply_repr_template(
self,
body_fields: Iterable[tuple[str, str | int | list[str] | None]],
) -> str:
"""Generates the header and formats the body for string representation of the object.

Parameters
----------
body_fields: List[Tuple[str, str]]
A list of (name, value) pairs to display in the body of the __repr__.
"""
# We add spaces between capitals, e.g. ClassificationTask -> Classification Task
name_with_spaces = re.sub(
r"(\w)([A-Z])",
r"\1 \2",
self.__class__.__name__[len("OpenML") :],
)
header_text = f"OpenML {name_with_spaces}"
header = f"{header_text}\n{'=' * len(header_text)}\n"

_body_fields: list[tuple[str, str | int | list[str]]] = [
(k, "None" if v is None else v) for k, v in body_fields
]
longest_field_name_length = max(len(name) for name, _ in _body_fields)
field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields)
return header + body

@abstractmethod
def _to_dict(self) -> dict[str, dict]:
"""Creates a dictionary representation of self.
Expand Down
32 changes: 29 additions & 3 deletions openml/datasets/data_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
if TYPE_CHECKING:
from IPython.lib import pretty

from openml.utils import ReprMixin

class OpenMLDataFeature: # noqa: PLW1641

class OpenMLDataFeature(ReprMixin):
"""
Data Feature (a.k.a. Attribute) object.

Expand Down Expand Up @@ -74,11 +76,35 @@ def __init__( # noqa: PLR0913
self.number_missing_values = number_missing_values
self.ontologies = ontologies

def __repr__(self) -> str:
return f"[{self.index} - {self.name} ({self.data_type})]"
def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
"""Collect all information to display in the __repr__ body."""
fields: dict[str, int | str | None] = {
"Index": self.index,
"Name": self.name,
"Data Type": self.data_type,
}

order = [
"Index",
"Name",
"Data Type",
]
return [(key, fields[key]) for key in order if key in fields]

def __eq__(self, other: Any) -> bool:
return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__

def __hash__(self) -> int:
return hash(
(
self.index,
self.name,
self.data_type,
tuple(self.nominal_values) if self.nominal_values is not None else None,
self.number_missing_values,
tuple(self.ontologies) if self.ontologies is not None else None,
)
)

def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None: # noqa: ARG002
pp.text(str(self))
40 changes: 13 additions & 27 deletions openml/setups/setup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# License: BSD 3-Clause
from __future__ import annotations

from collections.abc import Sequence
from typing import Any

import openml.config
import openml.flows
from openml.utils import ReprMixin


class OpenMLSetup:
class OpenMLSetup(ReprMixin):
"""Setup object (a.k.a. Configuration).

Parameters
Expand Down Expand Up @@ -43,30 +45,21 @@ def _to_dict(self) -> dict[str, Any]:
else None,
}

def __repr__(self) -> str:
header = "OpenML Setup"
header = f"{header}\n{'=' * len(header)}\n"

fields = {
def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
"""Collect all information to display in the __repr__ body."""
fields: dict[str, int | str | None] = {
"Setup ID": self.setup_id,
"Flow ID": self.flow_id,
"Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
"# of Parameters": (
len(self.parameters) if self.parameters is not None else float("nan")
),
"# of Parameters": (len(self.parameters) if self.parameters is not None else "nan"),
}

# determines the order in which the information will be printed
order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"]
_fields = [(key, fields[key]) for key in order if key in fields]

longest_field_name_length = max(len(name) for name, _ in _fields)
field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
return header + body
return [(key, fields[key]) for key in order if key in fields]


class OpenMLParameter:
class OpenMLParameter(ReprMixin):
"""Parameter object (used in setup).

Parameters
Expand Down Expand Up @@ -123,11 +116,9 @@ def _to_dict(self) -> dict[str, Any]:
"value": self.value,
}

def __repr__(self) -> str:
header = "OpenML Parameter"
header = f"{header}\n{'=' * len(header)}\n"

fields = {
def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
"""Collect all information to display in the __repr__ body."""
fields: dict[str, int | str | None] = {
"ID": self.id,
"Flow ID": self.flow_id,
# "Flow Name": self.flow_name,
Expand Down Expand Up @@ -156,9 +147,4 @@ def __repr__(self) -> str:
parameter_default,
parameter_value,
]
_fields = [(key, fields[key]) for key in order if key in fields]

longest_field_name_length = max(len(name) for name, _ in _fields)
field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
return header + body
return [(key, fields[key]) for key in order if key in fields]
44 changes: 43 additions & 1 deletion openml/tasks/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@

import pickle
from collections import OrderedDict
from collections.abc import Sequence
from pathlib import Path
from typing import Any
from typing_extensions import NamedTuple

import arff # type: ignore
import numpy as np

from openml.utils import ReprMixin


class Split(NamedTuple):
"""A single split of a dataset."""
Expand All @@ -18,7 +21,7 @@ class Split(NamedTuple):
test: np.ndarray


class OpenMLSplit: # noqa: PLW1641
class OpenMLSplit(ReprMixin):
"""OpenML Split object.

This class manages train-test splits for a dataset across multiple
Expand Down Expand Up @@ -63,6 +66,22 @@ def __init__(
self.folds = len(self.split[0])
self.samples = len(self.split[0][0])

def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
"""Collect all information to display in the __repr__ body."""
fields = {
"Name": self.name,
"Description": (
self.description if len(self.description) <= 80 else self.description[:77] + "..."
),
"Repeats": self.repeats,
"Folds": self.folds,
"Samples": self.samples,
}

order = ["Name", "Description", "Repeats", "Folds", "Samples"]

return [(key, fields[key]) for key in order if key in fields]

def __eq__(self, other: Any) -> bool:
if (
(not isinstance(self, type(other)))
Expand Down Expand Up @@ -90,6 +109,29 @@ def __eq__(self, other: Any) -> bool:
return False
return True

def __hash__(self) -> int:
split_items = []
for repetition in sorted(self.split):
for fold in sorted(self.split[repetition]):
for sample in sorted(self.split[repetition][fold]):
train, test = self.split[repetition][fold][sample]
split_items.append(
(
repetition,
fold,
sample,
hash(train.tobytes()),
hash(test.tobytes()),
)
)
return hash(
(
self.name,
self.description,
tuple(split_items),
)
)

@classmethod
def _from_arff_file(cls, filename: Path) -> OpenMLSplit: # noqa: C901, PLR0912
repetitions = None
Expand Down
58 changes: 57 additions & 1 deletion openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from __future__ import annotations

import contextlib
import re
import shutil
import warnings
from collections.abc import Callable, Mapping, Sized
from abc import ABC, abstractmethod
from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
from functools import wraps
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload
Expand Down Expand Up @@ -470,3 +472,57 @@ def update(self, length: int) -> None:
self._progress_bar.update(length)
if self._progress_bar.total <= self._progress_bar.n:
self._progress_bar.close()


class ReprMixin(ABC):
    """A mixin class that provides a customizable string representation for OpenML objects.

    This mixin standardizes the __repr__ output format across OpenML classes.
    Classes inheriting from this mixin should implement the
    _get_repr_body_fields method to specify which fields to display.
    """

    def __repr__(self) -> str:
        """Render the fields from ``_get_repr_body_fields`` with the shared template."""
        body_fields = self._get_repr_body_fields()
        return self._apply_repr_template(body_fields)

    @abstractmethod
    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
        """Collect all information to display in the __repr__ body.

        Returns
        -------
        body_fields : Sequence[tuple[str, str | int | list[str] | None]]
            A sequence of (name, value) pairs to display in the body of the __repr__.
            E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
            Each pair is rendered on its own line; a ``None`` value is shown
            as the text ``None`` and any other value is formatted as-is.
        """
        # Must be implemented by every concrete subclass.

    def _apply_repr_template(
        self,
        body_fields: Iterable[tuple[str, str | int | list[str] | None]],
    ) -> str:
        """Generate the header and format the body of the object's string representation.

        Parameters
        ----------
        body_fields : Iterable[tuple[str, str | int | list[str] | None]]
            (name, value) pairs to display in the body of the __repr__.

        Returns
        -------
        str
            An underlined ``OpenML <Class Name>`` header followed by one
            ``name: value`` line per field, with names right-padded with dots
            to the length of the longest name. When ``body_fields`` is empty
            only the header and its underline are returned.
        """
        class_name = self.__class__.__name__
        # Strip the prefix only when it is actually there; slicing blindly
        # would eat the first six characters of any class not named ``OpenML...``.
        if class_name.startswith("OpenML"):
            class_name = class_name[len("OpenML") :]
        # We add spaces between capitals, e.g. ClassificationTask -> Classification Task
        name_with_spaces = re.sub(r"(\w)([A-Z])", r"\1 \2", class_name)
        header_text = f"OpenML {name_with_spaces}"
        lines = [header_text, "=" * len(header_text)]

        _body_fields: list[tuple[str, str | int | list[str]]] = [
            (k, "None" if v is None else v) for k, v in body_fields
        ]
        # ``max`` raises ValueError on an empty sequence, so skip the body
        # entirely when there is nothing to show.
        if _body_fields:
            longest_field_name_length = max(len(name) for name, _ in _body_fields)
            field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
            lines.extend(
                field_line_format.format(name, value) for name, value in _body_fields
            )
        return "\n".join(lines)
Loading