automl · charlesfu4 · Mar 17, 2020 · Mar 17, 2020 · Mar 17, 2020 · Mar 17, 2020
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -32,6 +32,7 @@
 from autosklearn.smbo import AutoMLSMBO
 from autosklearn.util.hash import hash_array_or_matrix
 from autosklearn.metrics import f1_macro, accuracy, r2
+from autosklearn.constants import MULTIOUTPUT_REGRESSION
 from autosklearn.constants import *
 
 
@@ -1059,7 +1060,8 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
 class AutoMLRegressor(BaseAutoML):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
+        # Map the label returned by ``type_of_target`` to a task constant.
+        # ``fit`` raises ValueError for any label missing from this table,
+        # so 1-D targets ('continuous', or 'multiclass'/'binary' when the
+        # values happen to be integral) must still resolve to REGRESSION.
+        self._task_mapping = {'continuous-multioutput': MULTIOUTPUT_REGRESSION,
+                              'continuous': REGRESSION,
+                              'multiclass': REGRESSION,
+                              'binary': REGRESSION}
+
     def fit(
         self,
         X: np.ndarray,
@@ -1073,27 +1075,62 @@ def fit(
         load_models: bool = True,
     ):
         X, y = super()._perform_input_checks(X, y)
-        _n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
-        if _n_outputs > 1:
-            raise NotImplementedError(
-                'Multi-output regression is not implemented.')
+        if X_test is not None:
+            X_test, y_test = self._perform_input_checks(X_test, y_test)
+            if len(y.shape) != len(y_test.shape):
+                raise ValueError('Target value shapes do not match: %s vs %s'
+                                 % (y.shape, y_test.shape))
+
+        y_task = type_of_target(y)
+        task = self._task_mapping.get(y_task)
+
+        if task is None:
+            raise ValueError('Cannot work on data of type %s' % y_task)
+
         if metric is None:
             metric = r2
         return super().fit(
             X, y,
             X_test=X_test,
             y_test=y_test,
-            task=REGRESSION,
+            task=task,
             metric=metric,
             feat_type=feat_type,
             dataset_name=dataset_name,
             only_return_configuration_space=only_return_configuration_space,
             load_models=load_models,
         )
-
     def fit_ensemble(self, y, task=None, metric=None, precision='32',
                      dataset_name=None, ensemble_nbest=None,
                      ensemble_size=None):
+        """Process ``y`` and delegate to the parent's ``fit_ensemble``.
+
+        ``y`` is passed through ``_process_targets``; the per-output unique
+        values and their counts it returns are stored on the instance as
+        ``_target`` / ``_n_targets`` only if those attributes are not set
+        yet. All other arguments are forwarded unchanged.
+        """
-        y = super()._check_y(y)
+        # NOTE(review): ``_process_targets`` replaces each target value by
+        # its index among the sorted unique values, so the parent receives
+        # indices rather than the original values -- confirm this is
+        # intended for regression targets.
+        y, _target, _n_targets = self._process_targets(y)
+        if not hasattr(self, '_target'):
+            self._target = _target
+        if not hasattr(self, '_n_targets'):
+            self._n_targets = _n_targets
+
         return super().fit_ensemble(y, task, metric, precision, dataset_name,
                                     ensemble_nbest, ensemble_size)
+
+    def _process_targets(self, y):
+        """Encode each target column as indices into its sorted unique values.
+
+        ``y`` is first passed through the parent's ``_check_y`` and then
+        copied, so the caller's array is not modified. As a side effect
+        ``self._n_outputs`` is set to 1 for a 1-D ``y`` and to
+        ``y.shape[1]`` otherwise.
+
+        Returns
+        -------
+        y : np.ndarray
+            Copy of the targets in which every value is replaced by its
+            position in the sorted unique values of its column.
+        _target : list of np.ndarray
+            Sorted unique values, one array per output.
+        _n_targets : np.ndarray of int
+            Number of unique values per output.
+        """
+        y = super()._check_y(y)
+        self._n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
+
+        y = np.copy(y)
+
+        _target = []
+        _n_targets = []
+
+        if self._n_outputs == 1:
+            target_k, y = np.unique(y, return_inverse=True)
+            _target.append(target_k)
+            _n_targets.append(target_k.shape[0])
+        else:
+            for k in range(self._n_outputs):
+                target_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
+                _target.append(target_k)
+                _n_targets.append(target_k.shape[0])
+
+        # ``np.int`` was only an alias of the builtin ``int`` and no longer
+        # exists in NumPy >= 1.24; the builtin yields the same dtype.
+        _n_targets = np.array(_n_targets, dtype=int)
+
+        return y, _target, _n_targets
diff --git a/autosklearn/constants.py b/autosklearn/constants.py
@@ -4,8 +4,9 @@
 MULTICLASS_CLASSIFICATION = 2
 MULTILABEL_CLASSIFICATION = 3
 REGRESSION = 4
+MULTIOUTPUT_REGRESSION = 5
 
-REGRESSION_TASKS = [REGRESSION]
+REGRESSION_TASKS = [REGRESSION, MULTIOUTPUT_REGRESSION]
 CLASSIFICATION_TASKS = [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION,
                         MULTILABEL_CLASSIFICATION]
 
@@ -15,10 +16,12 @@
     {BINARY_CLASSIFICATION: 'binary.classification',
      MULTICLASS_CLASSIFICATION: 'multiclass.classification',
      MULTILABEL_CLASSIFICATION: 'multilabel.classification',
-     REGRESSION: 'regression'}
+     REGRESSION: 'regression',
+     MULTIOUTPUT_REGRESSION: 'multioutput.regression'}
 
 STRING_TO_TASK_TYPES = \
     {'binary.classification': BINARY_CLASSIFICATION,
      'multiclass.classification': MULTICLASS_CLASSIFICATION,
      'multilabel.classification': MULTILABEL_CLASSIFICATION,
-     'regression': REGRESSION}
+     'regression': REGRESSION,
+     'multioutput.regression': MULTIOUTPUT_REGRESSION}
diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py
@@ -746,15 +746,15 @@ def fit(self, X, y,
         X : array-like or sparse matrix of shape = [n_samples, n_features]
             The training input samples.
 
-        y : array-like, shape = [n_samples]
+        y : array-like, shape = [n_samples] or [n_samples, n_targets]
             The regression target.
 
         X_test : array-like or sparse matrix of shape = [n_samples, n_features]
             Test data input samples. Will be used to save test predictions for
             all models. This allows to evaluate the performance of Auto-sklearn
             over time.
 
-        y_test : array-like, shape = [n_samples]
+        y_test : array-like, shape = [n_samples] or [n_samples, n_targets]
             The regression target. Will be used to calculate the test error
             of all models. This allows to evaluate the performance of
             Auto-sklearn over time.
@@ -782,9 +782,7 @@ def fit(self, X, y,
         # type of data is compatible with auto-sklearn. Legal target
         # types are: continuous, binary, multiclass.
         target_type = type_of_target(y)
-        if target_type in ['multiclass-multioutput',
-                           'multilabel-indicator',
-                           'continuous-multioutput',
+        if target_type in ['multilabel-indicator',
                            'unknown',
                            ]:
             raise ValueError("regression with data of type %s is not"

diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py
@@ -12,6 +12,7 @@
     REGRESSION_TASKS,
     MULTILABEL_CLASSIFICATION,
     MULTICLASS_CLASSIFICATION,
+    MULTIOUTPUT_REGRESSION
 )
 from autosklearn.pipeline.implementations.util import (
     convert_multioutput_multiclass_to_multilabel
@@ -183,12 +184,21 @@ def _get_model(self):
                                      random_state=self.seed,
                                      init_params=self._init_params)
         else:
-            dataset_properties = {
-                'task': self.task_type,
-                'sparse': self.datamanager.info['is_sparse'] == 1,
-                'multilabel': self.task_type == MULTILABEL_CLASSIFICATION,
-                'multiclass': self.task_type == MULTICLASS_CLASSIFICATION,
-            }
+            if self.task_type in REGRESSION_TASKS:
+                dataset_properties = {
+                    'task': self.task_type,
+                    'sparse': self.datamanager.info['is_sparse'] == 1,
+                    'multioutput': self.task_type == MULTIOUTPUT_REGRESSION,
+                }
+            else:
+                dataset_properties = {
+                    'task': self.task_type,
+                    'sparse': self.datamanager.info['is_sparse'] == 1,
+                    'multilabel': self.task_type == MULTILABEL_CLASSIFICATION,
+                    'multiclass': self.task_type == MULTICLASS_CLASSIFICATION,
+                }
+
+
             model = self.model_class(config=self.configuration,
                                      dataset_properties=dataset_properties,
                                      random_state=self.seed,
@@ -412,19 +422,18 @@ def send_warnings_to_log(message, category, filename, lineno,
 
         if len(Y_pred.shape) == 1:
             Y_pred = Y_pred.reshape((-1, 1))
-
+        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
         return Y_pred
 
     def _ensure_prediction_array_sizes(self, prediction, Y_train):
         num_classes = self.datamanager.info['label_num']
-
-        if self.task_type == MULTICLASS_CLASSIFICATION and \
-                prediction.shape[1] < num_classes:
+        multi_type = [MULTICLASS_CLASSIFICATION, MULTIOUTPUT_REGRESSION]
+        if self.task_type in multi_type and prediction.shape[1] < num_classes:
             if Y_train is None:
                 raise ValueError('Y_train must not be None!')
             classes = list(np.unique(Y_train))
-
             mapping = dict()
+
             for class_number in range(num_classes):
                 if class_number in classes:
                     index = classes.index(class_number)

diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py
@@ -278,7 +278,7 @@ def predict(self, X):
 
         Returns
         -------
-        array, shape = (n_samples,)
+        array, shape = (n_samples, ) or shape = (n_samples, n_targets)
             Returns the predicted values
 
         Notes

diff --git a/autosklearn/pipeline/components/regression/__init__.py b/autosklearn/pipeline/components/regression/__init__.py
@@ -30,8 +30,11 @@ def get_components(cls):
 
     @classmethod
     def get_available_components(cls, data_prop,
+                                 dataset_properties=None,
                                  include=None,
                                  exclude=None):
+        if dataset_properties is None:
+            dataset_properties = {}
         available_comp = cls.get_components()
         components_dict = OrderedDict()
 
@@ -59,6 +62,9 @@ def get_available_components(cls, data_prop,
 
             if entry.get_properties()['handles_regression'] is False:
                 continue
+            if dataset_properties.get('multioutput') is True and available_comp[name]. \
+                    get_properties()['handles_multioutput'] is False:
+                continue
             components_dict[name] = entry
 
         return components_dict

diff --git a/autosklearn/pipeline/components/regression/adaboost.py b/autosklearn/pipeline/components/regression/adaboost.py
@@ -49,8 +49,8 @@ def get_properties(dataset_properties=None):
                 'name': 'AdaBoost Regressor',
                 'handles_regression': True,
                 'handles_classification': False,
-                'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': True,
                 'is_deterministic': True,
                 'input': (DENSE, SPARSE, UNSIGNED_DATA),
                 'output': (PREDICTIONS, )}

diff --git a/autosklearn/pipeline/components/regression/ard_regression.py b/autosklearn/pipeline/components/regression/ard_regression.py
@@ -66,6 +66,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': False,
                 'prefers_data_normalized': True,
                 'is_deterministic': True,
                 'input': (DENSE, UNSIGNED_DATA),

diff --git a/autosklearn/pipeline/components/regression/decision_tree.py b/autosklearn/pipeline/components/regression/decision_tree.py
@@ -71,6 +71,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': True,
                 'is_deterministic': False,
                 'input': (DENSE, SPARSE, UNSIGNED_DATA),
                 'output': (PREDICTIONS,)}

diff --git a/autosklearn/pipeline/components/regression/extra_trees.py b/autosklearn/pipeline/components/regression/extra_trees.py
@@ -115,6 +115,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': True,
                 'is_deterministic': True,
                 'input': (DENSE, SPARSE, UNSIGNED_DATA),
                 'output': (PREDICTIONS,)}

diff --git a/autosklearn/pipeline/components/regression/gaussian_process.py b/autosklearn/pipeline/components/regression/gaussian_process.py
@@ -55,6 +55,7 @@ def get_properties(dataset_properties=None):
                 'handles_regression': True,
                 'handles_classification': False,
                 'handles_multiclass': False,
+                'handles_multioutput': True,
                 'handles_multilabel': False,
                 'is_deterministic': True,
                 'input': (DENSE, UNSIGNED_DATA),

diff --git a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py
@@ -39,6 +39,7 @@ def get_properties(dataset_properties=None):
                 'handles_regression': True,
                 'handles_classification': False,
                 'handles_multiclass': False,
+                'handles_multioutput': True,
                 'handles_multilabel': False,
                 'is_deterministic': True,
                 'input': (DENSE, SPARSE, UNSIGNED_DATA),

diff --git a/autosklearn/pipeline/components/regression/liblinear_svr.py b/autosklearn/pipeline/components/regression/liblinear_svr.py
@@ -57,6 +57,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': False,
                 'is_deterministic': False,
                 'input': (SPARSE, DENSE, UNSIGNED_DATA),
                 'output': (PREDICTIONS,)}

diff --git a/autosklearn/pipeline/components/regression/libsvm_svr.py b/autosklearn/pipeline/components/regression/libsvm_svr.py
@@ -108,6 +108,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': False,
                 'prefers_data_normalized': True,
                 'is_deterministic': True,
                 'input': (SPARSE, DENSE, UNSIGNED_DATA),

diff --git a/autosklearn/pipeline/components/regression/random_forest.py b/autosklearn/pipeline/components/regression/random_forest.py
@@ -96,6 +96,7 @@ def get_properties(dataset_properties=None):
                 'handles_regression': True,
                 'handles_classification': False,
                 'handles_multiclass': False,
+                'handles_multioutput': True,
                 'handles_multilabel': False,
                 'prefers_data_normalized': False,
                 'is_deterministic': True,

diff --git a/autosklearn/pipeline/components/regression/ridge_regression.py b/autosklearn/pipeline/components/regression/ridge_regression.py
@@ -46,6 +46,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': True,
                 'prefers_data_normalized': True,
                 'is_deterministic': True,
                 'input': (SPARSE, DENSE, UNSIGNED_DATA),

diff --git a/autosklearn/pipeline/components/regression/sgd.py b/autosklearn/pipeline/components/regression/sgd.py
@@ -128,6 +128,7 @@ def get_properties(dataset_properties=None):
                 'handles_classification': False,
                 'handles_multiclass': False,
                 'handles_multilabel': False,
+                'handles_multioutput': False,
                 'is_deterministic': True,
                 'handles_sparse': True,
                 'input': (DENSE, SPARSE, UNSIGNED_DATA),

diff --git a/autosklearn/pipeline/constants.py b/autosklearn/pipeline/constants.py
@@ -4,8 +4,9 @@
 MULTICLASS_CLASSIFICATION = 2
 MULTILABEL_CLASSIFICATION = 3
 REGRESSION = 4
+MULTIOUTPUT_REGRESSION = 5
 
-REGRESSION_TASKS = [REGRESSION]
+REGRESSION_TASKS = [REGRESSION, MULTIOUTPUT_REGRESSION]
 CLASSIFICATION_TASKS = [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION,
                         MULTILABEL_CLASSIFICATION]
 
@@ -15,26 +16,28 @@
     {BINARY_CLASSIFICATION: "binary.classification",
      MULTICLASS_CLASSIFICATION: "multiclass.classification",
      MULTILABEL_CLASSIFICATION: "multilabel.classification",
-     REGRESSION: "regression"}
+     REGRESSION: "regression",
+     MULTIOUTPUT_REGRESSION: "multioutput.regression"}
 
 STRING_TO_TASK_TYPES = \
     {"binary.classification": BINARY_CLASSIFICATION,
      "multiclass.classification": MULTICLASS_CLASSIFICATION,
      "multilabel.classification": MULTILABEL_CLASSIFICATION,
-     "regression": REGRESSION}
+     "regression": REGRESSION,
+     "multioutput.regression": MULTIOUTPUT_REGRESSION}
 
-DENSE = 5
-SPARSE = 6
-PREDICTIONS = 7
-INPUT = 8
+DENSE = 6
+SPARSE = 7
+PREDICTIONS = 8
+INPUT = 9
 
-SIGNED_DATA = 9
-UNSIGNED_DATA = 10
+SIGNED_DATA = 10
+UNSIGNED_DATA = 11
 
 DATASET_PROPERTIES_TO_STRING = \
     {DENSE:         'dense',
      SPARSE:        'sparse',
      PREDICTIONS:   'predictions',
      INPUT:         'input',
      SIGNED_DATA:   'signed data',
-     UNSIGNED_DATA: 'unsigned data'}
+     UNSIGNED_DATA: 'unsigned data'}
-Original file line number
+Diff line change
@@ Expand Up / @@ -278,7 +278,7 @@ def predict(self, X): @@
             Returns
             -------
-            array, shape = (n_samples,)
+            array, shape = (n_samples, ) or shape = (n_samples, n_targets)
                 Returns the predicted values
             Notes
@@ Expand Down @@