def inverse_transform(self, X):
    """
    Inverse transform the given data. Assumes that fit (or fit_transform)
    has already been called, so that ``self.transformed_cols_`` describes
    which output columns every feature definition produced.

    X       the data to inverse transform: a 2-d array-like supporting
            ``X[:, a:b]`` slicing, or a pandas DataFrame (as returned by
            the mapper when ``df_out=True``). Its columns must be in the
            order produced by transform.

    Returns a pandas DataFrame with one column per original input column.
    An empty DataFrame is returned when no feature has been recorded.

    NOTE(review): only explicitly selected features are recorded in
    ``transformed_cols_``; any trailing columns of X produced by a
    ``default`` transformer are ignored here.
    """
    # DataFrames cannot be sliced positionally with X[:, a:b]; work on the
    # underlying values and keep the index so the rows stay labelled.
    if isinstance(X, pd.DataFrame):
        index = X.index
        data = X.values
    else:
        index = None
        data = X

    restored_parts = []
    start = 0
    for columns, transformers, transformed_cols in self.transformed_cols_:
        # Each feature owns a contiguous run of output columns whose width
        # equals the number of names it produced during transform.
        stop = start + len(transformed_cols)
        chunk = data[:, start:stop]

        # A feature declared without a transformer is passed through
        # unchanged by the mapper, so there is nothing to invert.
        if transformers is None:
            restored = chunk
        else:
            restored = transformers.inverse_transform(chunk)

        # A selector may be a single column name or a list of names;
        # wrapping a list in another list would create a MultiIndex.
        names = list(columns) if isinstance(columns, list) else [columns]

        restored_parts.append(
            pd.DataFrame(restored, columns=names, index=index))
        start = stop

    # pd.concat refuses an empty list; keep the "nothing fitted" result an
    # empty frame.
    if not restored_parts:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(restored_parts, axis=1)
def test_inverse_transform_multicolumn():
    """
    Round-trip three string columns through a mapper that mixes
    LabelEncoder and LabelBinarizer, and check that inverse_transform
    gives back a DataFrame equal to the input.
    """
    source = pd.DataFrame({
        'colA': list('ynyyn'),
        'colB': list('abcab'),
        'colC': list('sttts'),
    })
    features = [
        ('colA', LabelEncoder()),
        ('colB', LabelBinarizer()),
        ('colC', LabelEncoder()),
    ]
    mapper = DataFrameMapper(features)

    # Encode, then immediately decode the encoded matrix.
    round_tripped = mapper.inverse_transform(mapper.fit_transform(source))

    assert isinstance(round_tripped, pd.DataFrame)
    assert round_tripped.equals(source)