diff --git a/openml/__init__.py b/openml/__init__.py
index c49505eb9..c9a90e45a 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -18,6 +18,8 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+from openml._get import get
+
 from . import (
     _api_calls,
     config,
@@ -120,4 +122,13 @@ def populate_cache(
     "utils",
     "_api_calls",
     "__version__",
+    "get",
 ]
+
+
+def __getattr__(name: str):
+    # called only for attributes not found by normal lookup;
+    # dispatch unknown names to the global get utility
+    if name in __all__:
+        return globals()[name]
+    return get(name)
diff --git a/openml/_get.py b/openml/_get.py
new file mode 100644
index 000000000..0c5e9739e
--- /dev/null
+++ b/openml/_get.py
@@ -0,0 +1,11 @@
+"""Global get dispatch utility."""
+
+# currently just a forward to models,
+# to discuss and possibly generalize
+# todo: add a global get utility here;
+# in general, e.g., datasets will not have the same names as models etc.
+from __future__ import annotations
+
+from openml.models import get
+
+__all__ = ["get"]
diff --git a/openml/base/__init__.py b/openml/base/__init__.py
new file mode 100644
index 000000000..76a88c42b
--- /dev/null
+++ b/openml/base/__init__.py
@@ -0,0 +1,6 @@
+"""Module of base classes."""
+
+from openml.base._base import OpenMLBase
+from openml.base._base_pkg import _BasePkg
+
+__all__ = ["_BasePkg", "OpenMLBase"]
diff --git a/openml/base.py b/openml/base/_base.py
similarity index 98%
rename from openml/base.py
rename to openml/base/_base.py
index fbfb9dfc8..de2b387bf 100644
--- a/openml/base.py
+++ b/openml/base/_base.py
@@ -10,8 +10,7 @@
 import openml._api_calls
 import openml.config
-
-from .utils import _get_rest_api_type_alias, _tag_openml_base
+from openml.utils import _get_rest_api_type_alias, _tag_openml_base
 
 
 class OpenMLBase(ABC):
diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py
new file mode 100644
index 000000000..729619fa9
--- /dev/null
+++ b/openml/base/_base_pkg.py
@@ -0,0 +1,118 @@
+"""Base Packager class."""
+
+from __future__ import annotations
+
+import inspect
+import sys
+import textwrap
+from pathlib import Path
+
+from skbase.base import BaseObject
+from skbase.utils.dependencies import _check_estimator_deps
+
+
+class _BasePkg(BaseObject):
+    _tags = {
+        "python_dependencies": None,
+        "python_version": None,
+        # package register and manifest
+        "pkg_id": None,  # object id contained, "__multiple" if multiple
+        "pkg_obj": "reference",  # or "code"
+        "pkg_obj_type": None,  # openml API type
+        "pkg_compression": "zlib",  # compression
+        "pkg_pypi_name": None,  # PyPI package name of objects
+    }
+
+    def __init__(self):
+        super().__init__()
+
+    def materialize(self):
+        try:
+            _check_estimator_deps(obj=self)
+        except ModuleNotFoundError as e:
+            # prettier message, so the reference is to the pkg_id
+            # currently, we cannot simply pass the object name to skbase
+            # in the error message, so this is a hack
+            # todo: fix this in scikit-base
+            msg = str(e)
+            if len(msg) > 11:
+                msg = msg[11:]
+            raise ModuleNotFoundError(msg) from e
+
+        return self._materialize()
+
+    def _materialize(self):
+        raise NotImplementedError("abstract method")
+
+    def serialize(self):
+        cls_str = class_to_source(type(self))
+        compress_method = self.get_tag("pkg_compression")
+        if compress_method in [None, "None"]:
+            return cls_str
+
+        # import the configured compression module by name (e.g., zlib) and compress
+        import importlib
+
+        compressor = importlib.import_module(compress_method)
+        return compressor.compress(cls_str.encode("utf-8"))
+
+
+def _has_source(obj) -> bool:
+    """Return True if inspect.getsource(obj) should succeed."""
+    module_name = getattr(obj, "__module__", None)
+    if not module_name or module_name not in sys.modules:
+        return False
+
+    module = sys.modules[module_name]
+    file = getattr(module, "__file__", None)
+    if not file:
+        return False
+
+    return Path(file).suffix == ".py"
+
+
+def class_to_source(cls) -> str:
+    """Return the full source definition of a Python class as a string.
+
+    Parameters
+    ----------
+    cls : class
+        the class to serialize
+
+    Returns
+    -------
+    str : complete definition of cls, as str.
+        Imports are not contained or serialized.
+    """
+
+    # Fast path: class has retrievable source
+    if _has_source(cls):
+        source = inspect.getsource(cls)
+        return textwrap.dedent(source)
+
+    # Fallback for dynamically created classes
+    lines = []
+
+    bases = [base.__name__ for base in cls.__bases__ if base is not object]
+    base_str = f"({', '.join(bases)})" if bases else ""
+    lines.append(f"class {cls.__name__}{base_str}:")
+
+    body_added = False
+
+    for name, value in cls.__dict__.items():
+        if name.startswith("__") and name.endswith("__"):
+            continue
+
+        if inspect.isfunction(value):
+            if _has_source(value):
+                method_src = inspect.getsource(value)
+                method_src = textwrap.indent(textwrap.dedent(method_src), "    ")
+                lines.append(method_src)
+            else:
+                lines.append(f"    def {name}(self): ...")
+            body_added = True
+        else:
+            lines.append(f"    {name} = {value!r}")
+            body_added = True
+
+    if not body_added:
+        lines.append("    pass")
+
+    return "\n".join(lines)
diff --git a/openml/models/__init__.py b/openml/models/__init__.py
new file mode 100644
index 000000000..ae833fc63
--- /dev/null
+++ b/openml/models/__init__.py
@@ -0,0 +1,5 @@
+"""Module with packaging adapters."""
+
+from openml.models._get import get
+
+__all__ = ["get"]
diff --git a/openml/models/_get.py b/openml/models/_get.py
new file mode 100644
index 000000000..7762b8013
--- /dev/null
+++ b/openml/models/_get.py
@@ -0,0 +1,62 @@
+"""Model retrieval utility."""
+
+from __future__ import annotations
+
+from functools import lru_cache
+
+
+def get(id: str):
+    """Retrieve model object with unique identifier.
+
+    Parameters
+    ----------
+    id : str
+        unique identifier of object to retrieve
+
+    Returns
+    -------
+    class
+        retrieved object
+
+    Raises
+    ------
+    ModuleNotFoundError
+        if dependencies of object to retrieve are not satisfied
+    """
+    id_lookup = _id_lookup()
+    obj = id_lookup.get(id)
+    if obj is None:
+        raise ValueError(f"Error in openml.get, object with package id {id} does not exist.")
+    return obj(id).materialize()
+
+
+# todo: need to generalize this later to more types
+# currently intentionally retrieves only classifiers
+# todo: replace this, optionally, by database backend
+def _id_lookup(obj_type=None):
+    return _id_lookup_cached(obj_type=obj_type).copy()
+
+
+@lru_cache
+def _id_lookup_cached(obj_type=None):
+    all_objs = _all_objects(obj_type=obj_type)
+
+    lookup_dict = {}
+    for obj in all_objs:
+        obj_index = obj.get_class_tag("pkg_id")
+        if obj_index != "__multiple":
+            lookup_dict[obj_index] = obj
+        else:
+            obj_all_ids = obj.contained_ids()
+            lookup_dict.update({obj_id: obj for obj_id in obj_all_ids})
+
+    return lookup_dict
+
+
+@lru_cache
+def _all_objects(obj_type=None):
+    from skbase.lookup import all_objects
+
+    from openml.models.apis._classifier import _ModelPkgClassifier
+
+    return all_objects(object_types=_ModelPkgClassifier, package_name="openml", return_names=False)
diff --git a/openml/models/apis/__init__.py b/openml/models/apis/__init__.py
new file mode 100644
index 000000000..f560dcf6f
--- /dev/null
+++ b/openml/models/apis/__init__.py
@@ -0,0 +1,5 @@
+"""Module with packaging adapters."""
+
+from openml.models.apis._classifier import _ModelPkgClassifier
+
+__all__ = ["_ModelPkgClassifier"]
diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py
new file mode 100644
index 000000000..c1198ee32
--- /dev/null
+++ b/openml/models/apis/_classifier.py
@@ -0,0 +1,25 @@
+"""Base package for sklearn classifiers."""
+
+from __future__ import annotations
+
+from openml.models.base import _OpenmlModelPkg
+
+
+class _ModelPkgClassifier(_OpenmlModelPkg):
+    _tags = {
+        # tags specific to API type
+        "pkg_obj_type": "classifier",
+    }
+
+    def get_obj_tags(self):
+        """Return tags of the object as a dictionary."""
+        return {}  # this needs to be implemented
+
+    def get_obj_param_names(self):
+        """Return parameter names of the object as a list.
+
+        Returns
+        -------
+        list of str
+            names of object parameters
+        """
+        return list(self.materialize()().get_params().keys())
diff --git a/openml/models/base/__init__.py b/openml/models/base/__init__.py
new file mode 100644
index 000000000..a60e1e404
--- /dev/null
+++ b/openml/models/base/__init__.py
@@ -0,0 +1,5 @@
+"""Module with packaging adapters."""
+
+from openml.models.base._base import _OpenmlModelPkg
+
+__all__ = ["_OpenmlModelPkg"]
diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py
new file mode 100644
index 000000000..13166cfe2
--- /dev/null
+++ b/openml/models/base/_base.py
@@ -0,0 +1,65 @@
+"""Base model package class."""
+
+from __future__ import annotations
+
+from openml.base import _BasePkg
+
+
+class _OpenmlModelPkg(_BasePkg):
+    _obj = None
+    _obj_dict = {}
+
+    def __init__(self, id=None):
+        super().__init__()
+
+        pkg_id = self.get_tag("pkg_id")
+        if pkg_id == "__multiple":
+            self._obj = self._obj_dict.get(id, None)
+
+    @classmethod
+    def contained_ids(cls):
+        """Return list of ids of objects contained in this package.
+
+        Returns
+        -------
+        ids : list of str
+            list of unique identifiers of objects contained in this package
+        """
+        pkg_id = cls.get_class_tag("pkg_id")
+        if pkg_id != "__multiple":
+            return [cls.get_class_tag("pkg_id")]
+        return list(cls._obj_dict.keys())
+
+    def _materialize(self):
+        pkg_obj = self.get_tag("pkg_obj")
+
+        _obj = self._obj
+
+        if _obj is None:
+            raise ValueError(
+                "Error in materialize. "
+                "Either _materialize must be implemented, or "
+                "the _obj attribute must not be None."
+            )
+
+        if pkg_obj == "reference":
+            from skbase.utils.dependencies import _safe_import
+
+            obj_loc = self._obj
+            pkg_name = self.get_tag("pkg_pypi_name")
+
+            return _safe_import(obj_loc, pkg_name=pkg_name)
+
+        if pkg_obj == "code":
+            # assumption: the stored code binds the object under the package's pkg_id
+            namespace: dict = {}
+            exec(self._obj, namespace)
+            return namespace[self.get_tag("pkg_id")]
+
+        # elif pkg_obj == "craft":
+        # identify and call appropriate craft method
+
+        raise ValueError(
+            'Error in package tag "pkg_obj", '
+            'must be one of "reference", "code", "craft", '
+            f"but found value {pkg_obj}, of type {type(pkg_obj)}"
+        )
diff --git a/openml/models/classification/__init__.py b/openml/models/classification/__init__.py
new file mode 100644
index 000000000..e547a50cf
--- /dev/null
+++ b/openml/models/classification/__init__.py
@@ -0,0 +1 @@
+"""Sklearn classification models."""
diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py
new file mode 100644
index 000000000..c4d926e72
--- /dev/null
+++ b/openml/models/classification/auto_sklearn.py
@@ -0,0 +1,15 @@
+"""Auto-sklearn classifier."""
+
+from __future__ import annotations
+
+from openml.models.apis import _ModelPkgClassifier
+
+
+class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier):
+    _tags = {
+        "pkg_id": "AutoSklearnClassifier",
+        "python_dependencies": "auto-sklearn",
+        "pkg_pypi_name": "auto-sklearn",
+    }
+
+    _obj = "autosklearn.classification.AutoSklearnClassifier"
diff --git a/openml/models/classification/scikit_learn.py b/openml/models/classification/scikit_learn.py
new file mode 100644
index 000000000..dd05d3d46
--- /dev/null
+++ b/openml/models/classification/scikit_learn.py
@@ -0,0 +1,227 @@
+"""Scikit-learn estimators."""
+
+from __future__ import annotations
+
+from openml.models.apis import _ModelPkgClassifier
+
+
+class OpenmlPkg__Sklearn(_ModelPkgClassifier):
+    _tags = {
+        "pkg_id": "__multiple",
+        "python_dependencies": "scikit-learn",
+        "pkg_pypi_name": "scikit-learn",
+    }
+
+    # obtained via utils._indexing._preindex_sklearn
+    # todo: automate generation
+    # todo: include version bounds for availability
+    # todo: test generated index against actual index
+    _obj_dict = {
+        "ARDRegression": "sklearn.linear_model._bayes.ARDRegression",
+        "AdaBoostClassifier": "sklearn.ensemble._weight_boosting.AdaBoostClassifier",
+        "AdaBoostRegressor": "sklearn.ensemble._weight_boosting.AdaBoostRegressor",
+        "AdditiveChi2Sampler": "sklearn.kernel_approximation.AdditiveChi2Sampler",
+        "AffinityPropagation": "sklearn.cluster._affinity_propagation.AffinityPropagation",
+        "AgglomerativeClustering": "sklearn.cluster._agglomerative.AgglomerativeClustering",
+        "BaggingClassifier": "sklearn.ensemble._bagging.BaggingClassifier",
+        "BaggingRegressor": "sklearn.ensemble._bagging.BaggingRegressor",
+        "BayesianGaussianMixture": "sklearn.mixture._bayesian_mixture.BayesianGaussianMixture",
+        "BayesianRidge": "sklearn.linear_model._bayes.BayesianRidge",
+        "BernoulliNB": "sklearn.naive_bayes.BernoulliNB",
+        "BernoulliRBM": "sklearn.neural_network._rbm.BernoulliRBM",
+        "Binarizer": "sklearn.preprocessing._data.Binarizer",
+
"Birch": "sklearn.cluster._birch.Birch", + "BisectingKMeans": "sklearn.cluster._bisect_k_means.BisectingKMeans", + "CCA": "sklearn.cross_decomposition._pls.CCA", + "CalibratedClassifierCV": "sklearn.calibration.CalibratedClassifierCV", + "CategoricalNB": "sklearn.naive_bayes.CategoricalNB", + "ClassifierChain": "sklearn.multioutput.ClassifierChain", + "ColumnTransformer": "sklearn.compose._column_transformer.ColumnTransformer", + "ComplementNB": "sklearn.naive_bayes.ComplementNB", + "CountVectorizer": "sklearn.feature_extraction.text.CountVectorizer", + "DBSCAN": "sklearn.cluster._dbscan.DBSCAN", + "DecisionTreeClassifier": "sklearn.tree._classes.DecisionTreeClassifier", + "DecisionTreeRegressor": "sklearn.tree._classes.DecisionTreeRegressor", + "DictVectorizer": "sklearn.feature_extraction._dict_vectorizer.DictVectorizer", + "DictionaryLearning": "sklearn.decomposition._dict_learning.DictionaryLearning", + "DummyClassifier": "sklearn.dummy.DummyClassifier", + "DummyRegressor": "sklearn.dummy.DummyRegressor", + "ElasticNet": "sklearn.linear_model._coordinate_descent.ElasticNet", + "ElasticNetCV": "sklearn.linear_model._coordinate_descent.ElasticNetCV", + "EllipticEnvelope": "sklearn.covariance._elliptic_envelope.EllipticEnvelope", + "EmpiricalCovariance": "sklearn.covariance._empirical_covariance.EmpiricalCovariance", + "ExtraTreeClassifier": "sklearn.tree._classes.ExtraTreeClassifier", + "ExtraTreeRegressor": "sklearn.tree._classes.ExtraTreeRegressor", + "ExtraTreesClassifier": "sklearn.ensemble._forest.ExtraTreesClassifier", + "ExtraTreesRegressor": "sklearn.ensemble._forest.ExtraTreesRegressor", + "FactorAnalysis": "sklearn.decomposition._factor_analysis.FactorAnalysis", + "FastICA": "sklearn.decomposition._fastica.FastICA", + "FeatureAgglomeration": "sklearn.cluster._agglomerative.FeatureAgglomeration", + "FeatureHasher": "sklearn.feature_extraction._hash.FeatureHasher", + "FeatureUnion": "sklearn.pipeline.FeatureUnion", + "FixedThresholdClassifier": "sklearn.model_selection._classification_threshold.FixedThresholdClassifier", + "FrozenEstimator": "sklearn.frozen._frozen.FrozenEstimator", + "FunctionTransformer": "sklearn.preprocessing._function_transformer.FunctionTransformer", + "GammaRegressor": "sklearn.linear_model._glm.glm.GammaRegressor", + "GaussianMixture": "sklearn.mixture._gaussian_mixture.GaussianMixture", + "GaussianNB": "sklearn.naive_bayes.GaussianNB", + "GaussianProcessClassifier": "sklearn.gaussian_process._gpc.GaussianProcessClassifier", + "GaussianProcessRegressor": "sklearn.gaussian_process._gpr.GaussianProcessRegressor", + "GaussianRandomProjection": "sklearn.random_projection.GaussianRandomProjection", + "GenericUnivariateSelect": "sklearn.feature_selection._univariate_selection.GenericUnivariateSelect", + "GradientBoostingClassifier": "sklearn.ensemble._gb.GradientBoostingClassifier", + "GradientBoostingRegressor": "sklearn.ensemble._gb.GradientBoostingRegressor", + "GraphicalLasso": "sklearn.covariance._graph_lasso.GraphicalLasso", + "GraphicalLassoCV": "sklearn.covariance._graph_lasso.GraphicalLassoCV", + "GridSearchCV": "sklearn.model_selection._search.GridSearchCV", + "HDBSCAN": "sklearn.cluster._hdbscan.hdbscan.HDBSCAN", + "HashingVectorizer": "sklearn.feature_extraction.text.HashingVectorizer", + "HistGradientBoostingClassifier": "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier", + "HistGradientBoostingRegressor": "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor", + 
"HuberRegressor": "sklearn.linear_model._huber.HuberRegressor", + "IncrementalPCA": "sklearn.decomposition._incremental_pca.IncrementalPCA", + "IsolationForest": "sklearn.ensemble._iforest.IsolationForest", + "Isomap": "sklearn.manifold._isomap.Isomap", + "IsotonicRegression": "sklearn.isotonic.IsotonicRegression", + "KBinsDiscretizer": "sklearn.preprocessing._discretization.KBinsDiscretizer", + "KMeans": "sklearn.cluster._kmeans.KMeans", + "KNNImputer": "sklearn.impute._knn.KNNImputer", + "KNeighborsClassifier": "sklearn.neighbors._classification.KNeighborsClassifier", + "KNeighborsRegressor": "sklearn.neighbors._regression.KNeighborsRegressor", + "KNeighborsTransformer": "sklearn.neighbors._graph.KNeighborsTransformer", + "KernelCenterer": "sklearn.preprocessing._data.KernelCenterer", + "KernelDensity": "sklearn.neighbors._kde.KernelDensity", + "KernelPCA": "sklearn.decomposition._kernel_pca.KernelPCA", + "KernelRidge": "sklearn.kernel_ridge.KernelRidge", + "LabelBinarizer": "sklearn.preprocessing._label.LabelBinarizer", + "LabelEncoder": "sklearn.preprocessing._label.LabelEncoder", + "LabelPropagation": "sklearn.semi_supervised._label_propagation.LabelPropagation", + "LabelSpreading": "sklearn.semi_supervised._label_propagation.LabelSpreading", + "Lars": "sklearn.linear_model._least_angle.Lars", + "LarsCV": "sklearn.linear_model._least_angle.LarsCV", + "Lasso": "sklearn.linear_model._coordinate_descent.Lasso", + "LassoCV": "sklearn.linear_model._coordinate_descent.LassoCV", + "LassoLars": "sklearn.linear_model._least_angle.LassoLars", + "LassoLarsCV": "sklearn.linear_model._least_angle.LassoLarsCV", + "LassoLarsIC": "sklearn.linear_model._least_angle.LassoLarsIC", + "LatentDirichletAllocation": "sklearn.decomposition._lda.LatentDirichletAllocation", + "LedoitWolf": "sklearn.covariance._shrunk_covariance.LedoitWolf", + "LinearDiscriminantAnalysis": "sklearn.discriminant_analysis.LinearDiscriminantAnalysis", + "LinearRegression": "sklearn.linear_model._base.LinearRegression", + "LinearSVC": "sklearn.svm._classes.LinearSVC", + "LinearSVR": "sklearn.svm._classes.LinearSVR", + "LocalOutlierFactor": "sklearn.neighbors._lof.LocalOutlierFactor", + "LocallyLinearEmbedding": "sklearn.manifold._locally_linear.LocallyLinearEmbedding", + "LogisticRegression": "sklearn.linear_model._logistic.LogisticRegression", + "LogisticRegressionCV": "sklearn.linear_model._logistic.LogisticRegressionCV", + "MDS": "sklearn.manifold._mds.MDS", + "MLPClassifier": "sklearn.neural_network._multilayer_perceptron.MLPClassifier", + "MLPRegressor": "sklearn.neural_network._multilayer_perceptron.MLPRegressor", + "MaxAbsScaler": "sklearn.preprocessing._data.MaxAbsScaler", + "MeanShift": "sklearn.cluster._mean_shift.MeanShift", + "MinCovDet": "sklearn.covariance._robust_covariance.MinCovDet", + "MinMaxScaler": "sklearn.preprocessing._data.MinMaxScaler", + "MiniBatchDictionaryLearning": "sklearn.decomposition._dict_learning.MiniBatchDictionaryLearning", + "MiniBatchKMeans": "sklearn.cluster._kmeans.MiniBatchKMeans", + "MiniBatchNMF": "sklearn.decomposition._nmf.MiniBatchNMF", + "MiniBatchSparsePCA": "sklearn.decomposition._sparse_pca.MiniBatchSparsePCA", + "MissingIndicator": "sklearn.impute._base.MissingIndicator", + "MultiLabelBinarizer": "sklearn.preprocessing._label.MultiLabelBinarizer", + "MultiOutputClassifier": "sklearn.multioutput.MultiOutputClassifier", + "MultiOutputRegressor": "sklearn.multioutput.MultiOutputRegressor", + "MultiTaskElasticNet": "sklearn.linear_model._coordinate_descent.MultiTaskElasticNet", + 
"MultiTaskElasticNetCV": "sklearn.linear_model._coordinate_descent.MultiTaskElasticNetCV", + "MultiTaskLasso": "sklearn.linear_model._coordinate_descent.MultiTaskLasso", + "MultiTaskLassoCV": "sklearn.linear_model._coordinate_descent.MultiTaskLassoCV", + "MultinomialNB": "sklearn.naive_bayes.MultinomialNB", + "NMF": "sklearn.decomposition._nmf.NMF", + "NearestCentroid": "sklearn.neighbors._nearest_centroid.NearestCentroid", + "NearestNeighbors": "sklearn.neighbors._unsupervised.NearestNeighbors", + "NeighborhoodComponentsAnalysis": "sklearn.neighbors._nca.NeighborhoodComponentsAnalysis", + "Normalizer": "sklearn.preprocessing._data.Normalizer", + "NuSVC": "sklearn.svm._classes.NuSVC", + "NuSVR": "sklearn.svm._classes.NuSVR", + "Nystroem": "sklearn.kernel_approximation.Nystroem", + "OAS": "sklearn.covariance._shrunk_covariance.OAS", + "OPTICS": "sklearn.cluster._optics.OPTICS", + "OneClassSVM": "sklearn.svm._classes.OneClassSVM", + "OneHotEncoder": "sklearn.preprocessing._encoders.OneHotEncoder", + "OneVsOneClassifier": "sklearn.multiclass.OneVsOneClassifier", + "OneVsRestClassifier": "sklearn.multiclass.OneVsRestClassifier", + "OrdinalEncoder": "sklearn.preprocessing._encoders.OrdinalEncoder", + "OrthogonalMatchingPursuit": "sklearn.linear_model._omp.OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV": "sklearn.linear_model._omp.OrthogonalMatchingPursuitCV", + "OutputCodeClassifier": "sklearn.multiclass.OutputCodeClassifier", + "PCA": "sklearn.decomposition._pca.PCA", + "PLSCanonical": "sklearn.cross_decomposition._pls.PLSCanonical", + "PLSRegression": "sklearn.cross_decomposition._pls.PLSRegression", + "PLSSVD": "sklearn.cross_decomposition._pls.PLSSVD", + "PassiveAggressiveClassifier": "sklearn.linear_model._passive_aggressive.PassiveAggressiveClassifier", + "PassiveAggressiveRegressor": "sklearn.linear_model._passive_aggressive.PassiveAggressiveRegressor", + "PatchExtractor": "sklearn.feature_extraction.image.PatchExtractor", + "Perceptron": "sklearn.linear_model._perceptron.Perceptron", + "Pipeline": "sklearn.pipeline.Pipeline", + "PoissonRegressor": "sklearn.linear_model._glm.glm.PoissonRegressor", + "PolynomialCountSketch": "sklearn.kernel_approximation.PolynomialCountSketch", + "PolynomialFeatures": "sklearn.preprocessing._polynomial.PolynomialFeatures", + "PowerTransformer": "sklearn.preprocessing._data.PowerTransformer", + "QuadraticDiscriminantAnalysis": "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis", + "QuantileRegressor": "sklearn.linear_model._quantile.QuantileRegressor", + "QuantileTransformer": "sklearn.preprocessing._data.QuantileTransformer", + "RANSACRegressor": "sklearn.linear_model._ransac.RANSACRegressor", + "RBFSampler": "sklearn.kernel_approximation.RBFSampler", + "RFE": "sklearn.feature_selection._rfe.RFE", + "RFECV": "sklearn.feature_selection._rfe.RFECV", + "RadiusNeighborsClassifier": "sklearn.neighbors._classification.RadiusNeighborsClassifier", + "RadiusNeighborsRegressor": "sklearn.neighbors._regression.RadiusNeighborsRegressor", + "RadiusNeighborsTransformer": "sklearn.neighbors._graph.RadiusNeighborsTransformer", + "RandomForestClassifier": "sklearn.ensemble._forest.RandomForestClassifier", + "RandomForestRegressor": "sklearn.ensemble._forest.RandomForestRegressor", + "RandomTreesEmbedding": "sklearn.ensemble._forest.RandomTreesEmbedding", + "RandomizedSearchCV": "sklearn.model_selection._search.RandomizedSearchCV", + "RegressorChain": "sklearn.multioutput.RegressorChain", + "Ridge": "sklearn.linear_model._ridge.Ridge", + "RidgeCV": 
"sklearn.linear_model._ridge.RidgeCV", + "RidgeClassifier": "sklearn.linear_model._ridge.RidgeClassifier", + "RidgeClassifierCV": "sklearn.linear_model._ridge.RidgeClassifierCV", + "RobustScaler": "sklearn.preprocessing._data.RobustScaler", + "SGDClassifier": "sklearn.linear_model._stochastic_gradient.SGDClassifier", + "SGDOneClassSVM": "sklearn.linear_model._stochastic_gradient.SGDOneClassSVM", + "SGDRegressor": "sklearn.linear_model._stochastic_gradient.SGDRegressor", + "SVC": "sklearn.svm._classes.SVC", + "SVR": "sklearn.svm._classes.SVR", + "SelectFdr": "sklearn.feature_selection._univariate_selection.SelectFdr", + "SelectFpr": "sklearn.feature_selection._univariate_selection.SelectFpr", + "SelectFromModel": "sklearn.feature_selection._from_model.SelectFromModel", + "SelectFwe": "sklearn.feature_selection._univariate_selection.SelectFwe", + "SelectKBest": "sklearn.feature_selection._univariate_selection.SelectKBest", + "SelectPercentile": "sklearn.feature_selection._univariate_selection.SelectPercentile", + "SelfTrainingClassifier": "sklearn.semi_supervised._self_training.SelfTrainingClassifier", + "SequentialFeatureSelector": "sklearn.feature_selection._sequential.SequentialFeatureSelector", + "ShrunkCovariance": "sklearn.covariance._shrunk_covariance.ShrunkCovariance", + "SimpleImputer": "sklearn.impute._base.SimpleImputer", + "SkewedChi2Sampler": "sklearn.kernel_approximation.SkewedChi2Sampler", + "SparseCoder": "sklearn.decomposition._dict_learning.SparseCoder", + "SparsePCA": "sklearn.decomposition._sparse_pca.SparsePCA", + "SparseRandomProjection": "sklearn.random_projection.SparseRandomProjection", + "SpectralBiclustering": "sklearn.cluster._bicluster.SpectralBiclustering", + "SpectralClustering": "sklearn.cluster._spectral.SpectralClustering", + "SpectralCoclustering": "sklearn.cluster._bicluster.SpectralCoclustering", + "SpectralEmbedding": "sklearn.manifold._spectral_embedding.SpectralEmbedding", + "SplineTransformer": "sklearn.preprocessing._polynomial.SplineTransformer", + "StackingClassifier": "sklearn.ensemble._stacking.StackingClassifier", + "StackingRegressor": "sklearn.ensemble._stacking.StackingRegressor", + "StandardScaler": "sklearn.preprocessing._data.StandardScaler", + "TSNE": "sklearn.manifold._t_sne.TSNE", + "TargetEncoder": "sklearn.preprocessing._target_encoder.TargetEncoder", + "TfidfTransformer": "sklearn.feature_extraction.text.TfidfTransformer", + "TfidfVectorizer": "sklearn.feature_extraction.text.TfidfVectorizer", + "TheilSenRegressor": "sklearn.linear_model._theil_sen.TheilSenRegressor", + "TransformedTargetRegressor": "sklearn.compose._target.TransformedTargetRegressor", + "TruncatedSVD": "sklearn.decomposition._truncated_svd.TruncatedSVD", + "TunedThresholdClassifierCV": "sklearn.model_selection._classification_threshold.TunedThresholdClassifierCV", + "TweedieRegressor": "sklearn.linear_model._glm.glm.TweedieRegressor", + "VarianceThreshold": "sklearn.feature_selection._variance_threshold.VarianceThreshold", + "VotingClassifier": "sklearn.ensemble._voting.VotingClassifier", + "VotingRegressor": "sklearn.ensemble._voting.VotingRegressor", + } diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py new file mode 100644 index 000000000..b320fcabf --- /dev/null +++ b/openml/models/classification/xgboost.py @@ -0,0 +1,15 @@ +"""Xgboost classifier.""" + +from __future__ import annotations + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): + _tags = { + 
"pkg_id": "XGBClassifier", + "python_dependencies": "xgboost", + "pkg_pypi_name": "xgboost", + } + + _obj = "xgboost.XGBClassifier" diff --git a/openml/utils/__init__.py b/openml/utils/__init__.py new file mode 100644 index 000000000..83e379222 --- /dev/null +++ b/openml/utils/__init__.py @@ -0,0 +1,35 @@ +"""Utilities module.""" + +from openml.utils._openml import ( + ProgressBar, + _create_cache_directory, + _create_cache_directory_for_id, + _create_lockfiles_dir, + _delete_entity, + _get_cache_dir_for_id, + _get_cache_dir_for_key, + _get_rest_api_type_alias, + _list_all, + _remove_cache_dir_for_id, + _tag_entity, + _tag_openml_base, + extract_xml_tags, + thread_safe_if_oslo_installed, +) + +__all__ = [ + "ProgressBar", + "_create_cache_directory", + "_create_cache_directory_for_id", + "_create_lockfiles_dir", + "_delete_entity", + "_get_cache_dir_for_id", + "_get_cache_dir_for_key", + "_get_rest_api_type_alias", + "_list_all", + "_remove_cache_dir_for_id", + "_tag_entity", + "_tag_openml_base", + "extract_xml_tags", + "thread_safe_if_oslo_installed", +] diff --git a/openml/utils/_indexing/__init__.py b/openml/utils/_indexing/__init__.py new file mode 100644 index 000000000..80b82550d --- /dev/null +++ b/openml/utils/_indexing/__init__.py @@ -0,0 +1 @@ +"""Utilities module for indexing third party libraries.""" diff --git a/openml/utils/_indexing/_preindex_sklearn.py b/openml/utils/_indexing/_preindex_sklearn.py new file mode 100644 index 000000000..bf4f8130b --- /dev/null +++ b/openml/utils/_indexing/_preindex_sklearn.py @@ -0,0 +1,137 @@ +"""Registry lookup methods - scikit-learn estimators.""" + +# adapted from the sktime utility of the same name +# copyright: sktime developers, BSD-3-Clause License (see LICENSE file) +from __future__ import annotations + +__author__ = ["fkiraly"] +# all_estimators is also based on the sklearn utility of the same name + +from skbase.lookup import all_objects + + +def _all_sklearn_estimators_locdict(package_name="sklearn", serialized=False): + """Dictionary of all scikit-learn estimators in sktime and sklearn. + + Parameters + ---------- + package_name : str, optional (default="sklearn") + The package from which to retrieve the sklearn estimators. + This is an import name, e.g., ``"sklearn"``, not a PEP 440 package identifier, + e.g., ``"scikit-learn"``. + + serialized : bool, optional (default=False) + If True, returns a serialized version of the dict, via + ``openml.utils._inmemory._dict.serialize_dict``. + If False, returns the dict directly. + + Returns + ------- + loc_dict : dict + A dictionary with: + + * keys: str, estimator class name, e.g., ``RandomForestClassifier`` + * values: str, public import path of the estimator, e.g., + ``sklearn.ensemble.RandomForestClassifier`` + """ + all_ests = _all_sklearn_estimators( + package_name=package_name, + return_names=False, + ) + + loc_dict = {est.__name__: f"{est.__module__}.{est.__name__}" for est in all_ests} + + if serialized: + from openml.utils._inmemory._dict import serialize_dict + + loc_dict = serialize_dict(loc_dict, name="sklearn_estimators_loc_dict") + + return loc_dict + + +def _all_sklearn_estimators( + package_name="sklearn", + return_names=True, + as_dataframe=False, + suppress_import_stdout=True, +): + """List all scikit-learn objects in a given package. + + This function retrieves all sklearn objects inheriting from ``BaseEstimator``, + from the import location given by ``package_name``. + + Not included are: the base classes themselves, classes defined in test modules. 
+
+    Parameters
+    ----------
+    package_name : str, optional (default="sklearn")
+        The package from which to retrieve the sklearn estimators.
+        This is an import name, e.g., ``"sklearn"``, not a PEP 440 package identifier,
+        e.g., ``"scikit-learn"``.
+
+    return_names : bool, optional (default=True)
+
+        if True, estimator class name is included in the ``all_estimators``
+        return in the order: name, estimator class, either as
+        a tuple or as pandas.DataFrame columns
+
+        if False, estimator class name is removed from the ``all_estimators`` return.
+
+    as_dataframe : bool, optional (default=False)
+
+        True: ``all_estimators`` will return a ``pandas.DataFrame`` with named
+        columns for all of the attributes being returned.
+
+        False: ``all_estimators`` will return a list (either a list of
+        estimators or a list of tuples, see Returns)
+
+    suppress_import_stdout : bool, optional (default=True)
+        whether to suppress stdout printout upon import.
+
+    Returns
+    -------
+    all_estimators will return one of the following:
+
+        1. list of estimators, if ``return_names=False``
+
+        2. list of tuples (estimator name, estimator class),
+        if ``return_names=True``
+
+        3. ``pandas.DataFrame`` if ``as_dataframe=True``, with one column of
+        estimator class names and one column of estimator classes
+
+        Entries are the estimators matching the query, in alphabetical order
+        of estimator name.
+    """
+    from sklearn.base import BaseEstimator
+
+    MODULES_TO_IGNORE_SKLEARN = [
+        "array_api_compat",
+        "tests",
+        "experimental",
+        "conftest",
+    ]
+
+    return all_objects(
+        object_types=BaseEstimator,
+        package_name=package_name,
+        modules_to_ignore=MODULES_TO_IGNORE_SKLEARN,
+        as_dataframe=as_dataframe,
+        return_names=return_names,
+        suppress_import_stdout=suppress_import_stdout,
+    )
diff --git a/openml/utils/_inmemory/__init__.py b/openml/utils/_inmemory/__init__.py
new file mode 100644
index 000000000..07bdfba5a
--- /dev/null
+++ b/openml/utils/_inmemory/__init__.py
@@ -0,0 +1 @@
+"""Utilities module for serializing and deserializing in-memory objects."""
diff --git a/openml/utils/_inmemory/_dict.py b/openml/utils/_inmemory/_dict.py
new file mode 100644
index 000000000..c27e78dd7
--- /dev/null
+++ b/openml/utils/_inmemory/_dict.py
@@ -0,0 +1,57 @@
+"""Utilities module for serializing and deserializing dicts."""
+
+from __future__ import annotations
+
+
+def serialize_dict(d, mode="eval", name="d"):
+    """Serialize a dict as an executable Python code snippet.
+
+    To deserialize, simply execute the code snippet in a Python environment.
+
+    Command for deserialization:
+
+    * if ``mode == "eval"``, use ``deserialized = eval(code_snippet)``
+    * if ``mode == "exec"``, use ``exec(code_snippet)`` and then access the dict
+      via the variable given by ``name``
+
+    Parameters
+    ----------
+    d : dict
+        The dictionary to serialize.
+ + mode : str, "eval" or "exec", default="eval" + The mode of serialization. + + * If ``"eval"``, the returned code snippet is an expression that evaluates to the dict. + * If ``"exec"``, the returned code snippet is a series of statements that assign the dict + to a variable named ``name``. + + name : str, default="d" + The variable name to assign the dict to. + Only used if mode is ``"exec"``. + + Returns + ------- + code_snippet : str + A string containing the Python code snippet that recreates the dict ``d``, + assigned to the specified variable name ``name``. + + Example + ------- + >>> my_dict = {'a': 'apple', 'b': 'banana'} + >>> serialized_dict = serialize_dict(my_dict, name="my_dict") + >>> deserialized_dict = eval(serialized_dict) + >>> assert deserialized_dict == my_dict + """ + + def dq(s): + # Escape backslashes and double quotes for valid Python strings + return s.replace("\\", "\\\\").replace('"', '\\"') + + if mode == "eval": + lines = ["{"] + else: # mode == "exec" + lines = [f"{name} = {{"] + for k, v in d.items(): + lines.append(f' "{dq(k)}": "{dq(v)}",') + lines.append("}") + return "\n".join(lines) diff --git a/openml/utils.py b/openml/utils/_openml.py similarity index 99% rename from openml/utils.py rename to openml/utils/_openml.py index 7e72e7aee..f20aedcca 100644 --- a/openml/utils.py +++ b/openml/utils/_openml.py @@ -17,8 +17,7 @@ import openml import openml._api_calls import openml.exceptions - -from . import config +from openml import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: diff --git a/pyproject.toml b/pyproject.toml index 14309c2d5..f10b89de3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "minio", "pyarrow", "tqdm", # For MinIO download progress bars + "scikit-base", ] requires-python = ">=3.10,<3.15" maintainers = [
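
Usage sketch (illustration only, not part of the patch): assuming this branch is installed together with scikit-learn and xgboost, the entry points introduced by the diff can be exercised as below; "RandomForestClassifier" is one of the ids registered in OpenmlPkg__Sklearn._obj_dict.

import openml

# resolve a registered id to the underlying estimator class via the package registry
rf_cls = openml.get("RandomForestClassifier")
clf = rf_cls(n_estimators=10)

# the module-level __getattr__ dispatches unknown attribute names to the same lookup;
# this raises ModuleNotFoundError if xgboost is not installed
xgb_cls = openml.XGBClassifier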