Skip to content

Changes show_models() function to return a dictionary of models in ensemble #1321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 25, 2021
151 changes: 143 additions & 8 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1833,16 +1833,151 @@ def get_models_with_weights(self):

return self.ensemble_.get_models_with_weights(self.models_)

def show_models(self):
models_with_weights = self.get_models_with_weights()
def show_models(self) -> Dict[int, Any]:
""" Returns a dictionary containing dictionaries of ensemble models.

with io.StringIO() as sio:
sio.write("[")
for weight, model in models_with_weights:
sio.write("(%f, %s),\n" % (weight, model))
sio.write("]")
Each model in the ensemble can be accessed by giving its ``model_id`` as key.

return sio.getvalue()
A model dictionary contains the following:

* ``"model_id"`` - The id given to a model by ``autosklearn``.
* ``"rank"`` - The rank of the model based on it's ``"cost"``.
* ``"cost"`` - The loss of the model on the validation set.
* ``"ensemble_weight"`` - The weight given to the model in the ensemble.
* ``"voting_model"`` - The ``cv_voting_ensemble`` model (for 'cv' resampling).
* ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble`` (for 'cv' resampling).
* ``"data_preprocessor"`` - The preprocessor used on the data.
* ``"balancing"`` - The balancing used on the data (for classification).
* ``"feature_preprocessor"`` - The preprocessor for features types.
* ``"classifier"`` or ``"regressor"`` - The autosklearn wrapped classifier or regressor.
* ``"sklearn_classifier"`` or ``"sklearn_regressor"`` - The sklearn classifier or regressor.

**Example**

.. code-block:: python

import sklearn.datasets
import sklearn.metrics
import autosklearn.regression

X, y = sklearn.datasets.load_diabetes(return_X_y=True)

automl = autosklearn.regression.AutoSklearnRegressor(
time_left_for_this_task=120
)
automl.fit(X_train, y_train, dataset_name='diabetes')

ensemble_dict = automl.show_models()
print(ensemble_dict)

Output:

.. code-block:: text

{
25: {'model_id': 25.0,
'rank': 1,
'cost': 0.43667876507897496,
'ensemble_weight': 0.38,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
'feature_preprocessor': <autosklearn.pipeline.components....>,
'regressor': <autosklearn.pipeline.components.regression....>,
'sklearn_regressor': SGDRegressor(alpha=0.0006517033225329654,...)
},
6: {'model_id': 6.0,
'rank': 2,
'cost': 0.4550418898836528,
'ensemble_weight': 0.3,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
'feature_preprocessor': <autosklearn.pipeline.components....>,
'regressor': <autosklearn.pipeline.components.regression....>,
'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788,...)
}...
}

Returns
-------
Dict(int, Any) : dictionary of length = number of models in the ensemble
A dictionary of models in the ensemble, where ``model_id`` is the key.

"""

ensemble_dict = {}

def has_key(rv, key):
return rv.additional_info and key in rv.additional_info

table_dict = {}
for rkey, rval in self.runhistory_.data.items():
if has_key(rval, 'num_run'):
model_id = rval.additional_info['num_run']
table_dict[model_id] = {
'model_id': model_id,
'cost': rval.cost
}

# Checking if the dictionary is empty
if not table_dict:
raise RuntimeError('No model found. Try increasing \'time_left_for_this_task\'.')

for i, weight in enumerate(self.ensemble_.weights_):
(_, model_id, _) = self.ensemble_.identifiers_[i]
table_dict[model_id]['ensemble_weight'] = weight

table = pd.DataFrame.from_dict(table_dict, orient='index')
table.sort_values(by='cost', inplace=True)

# Checking which resampling strategy is chosen and selecting the appropriate models
is_cv = (self._resampling_strategy == "cv")
models = self.cv_models_ if is_cv else self.models_

rank = 1 # Initializing rank for the first model
for (_, model_id, _), model in models.items():
model_dict = {} # Declaring model dictionary

# Inserting model_id, rank, cost and ensemble weight
model_dict['model_id'] = table.loc[model_id]['model_id'].astype(int)
model_dict['rank'] = rank
model_dict['cost'] = table.loc[model_id]['cost']
model_dict['ensemble_weight'] = table.loc[model_id]['ensemble_weight']
rank += 1 # Incrementing rank by 1 for the next model

# The steps in the models pipeline are as follows:
# 'data_preprocessor': DataPreprocessor,
# 'balancing': Balancing,
# 'feature_preprocessor': FeaturePreprocessorChoice,
# 'classifier'/'regressor': ClassifierChoice/RegressorChoice (autosklearn wrapped model)

# For 'cv' (cross validation) strategy
if is_cv:
# Voting model created by cross validation
cv_voting_ensemble = model
model_dict['voting_model'] = cv_voting_ensemble

# List of models, each trained on one cv fold
cv_models = []
for cv_model in cv_voting_ensemble.estimators_:
estimator = dict(cv_model.steps)

# Adding sklearn model to the model dictionary
model_type, autosklearn_wrapped_model = cv_model.steps[-1]
estimator[f'sklearn_{model_type}'] = autosklearn_wrapped_model.choice.estimator
cv_models.append(estimator)
model_dict['estimators'] = cv_models

# For any other strategy
else:
steps = dict(model.steps)
model_dict.update(steps)

# Adding sklearn model to the model dictionary
model_type, autosklearn_wrapped_model = model.steps[-1]
model_dict[f'sklearn_{model_type}'] = autosklearn_wrapped_model.choice.estimator

# Insterting model_dict in the ensemble dictionary
ensemble_dict[model_id] = model_dict

return ensemble_dict

def _create_search_space(self, tmp_dir, backend, datamanager,
include=None,
Expand Down
65 changes: 63 additions & 2 deletions autosklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,13 +492,74 @@ def score(self, X, y):
return self.automl_.score(X, y)

def show_models(self):
"""Return a representation of the final ensemble found by auto-sklearn.
""" Returns a dictionary containing dictionaries of ensemble models.

Each model in the ensemble can be accessed by giving its ``model_id`` as key.

A model dictionary contains the following:

* ``"model_id"`` - The id given to a model by ``autosklearn``.
* ``"rank"`` - The rank of the model based on it's ``"cost"``.
* ``"cost"`` - The loss of the model on the validation set.
* ``"ensemble_weight"`` - The weight given to the model in the ensemble.
* ``"voting_model"`` - The ``cv_voting_ensemble`` model (for 'cv' resampling).
* ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble`` (for 'cv' resampling).
* ``"data_preprocessor"`` - The preprocessor used on the data.
* ``"balancing"`` - The balancing used on the data (for classification).
* ``"feature_preprocessor"`` - The preprocessor for features types.
* ``"classifier"`` or ``"regressor"`` - The autosklearn wrapped classifier or regressor.
* ``"sklearn_classifier"`` or ``"sklearn_regressor"`` - The sklearn classifier or regressor.

**Example**

.. code-block:: python

import sklearn.datasets
import sklearn.metrics
import autosklearn.regression

X, y = sklearn.datasets.load_diabetes(return_X_y=True)

automl = autosklearn.regression.AutoSklearnRegressor(
time_left_for_this_task=120
)
automl.fit(X_train, y_train, dataset_name='diabetes')

ensemble_dict = automl.show_models()
print(ensemble_dict)

Output:

.. code-block:: text

{
25: {'model_id': 25.0,
'rank': 1,
'cost': 0.43667876507897496,
'ensemble_weight': 0.38,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
'feature_preprocessor': <autosklearn.pipeline.components....>,
'regressor': <autosklearn.pipeline.components.regression....>,
'sklearn_regressor': SGDRegressor(alpha=0.0006517033225329654,...)
},
6: {'model_id': 6.0,
'rank': 2,
'cost': 0.4550418898836528,
'ensemble_weight': 0.3,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
'feature_preprocessor': <autosklearn.pipeline.components....>,
'regressor': <autosklearn.pipeline.components.regression....>,
'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788,...)
}...
}

Returns
-------
str
Dict(int, Any) : dictionary of length = number of models in the ensemble
A dictionary of models in the ensemble, where ``model_id`` is the key.

"""

return self.automl_.show_models()

def get_models_with_weights(self):
Expand Down
Loading