From 1b4edd9e9a7de56a25259b288150d06ece9701fd Mon Sep 17 00:00:00 2001 From: Erik Jan de Vries Date: Tue, 14 Nov 2017 09:21:02 +0100 Subject: [PATCH 1/2] DataFrameMapper.inverse_transform() for simple transformations --- sklearn_pandas/dataframe_mapper.py | 37 ++++++++++++++++++++++++++++++ tests/test_dataframe_mapper.py | 31 ++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 596aa76..94e1d53 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -110,6 +110,7 @@ def __init__(self, features, default=False, sparse=False, df_out=False, self.df_out = df_out self.input_df = input_df self.transformed_names_ = [] + self.transformed_cols_ = [] if (df_out and (sparse or default)): raise ValueError("Can not use df_out with sparse or default") @@ -268,6 +269,7 @@ def transform(self, X): """ extracted = [] self.transformed_names_ = [] + self.transformed_cols_ = [] for columns, transformers, options in self.built_features: input_df = options.get('input_df', self.input_df) # columns could be a string or list of @@ -282,6 +284,10 @@ def transform(self, X): alias = options.get('alias') self.transformed_names_ += self.get_names( columns, transformers, Xt, alias) + + self.transformed_cols_ += [ + (columns, transformers, + self.get_names(columns, transformers, Xt, alias)) ] # handle features not explicitly selected if self.built_default is not False: @@ -328,3 +334,34 @@ def transform(self, X): index=index) else: return stacked + + + def inverse_transform(self, X): + """ + Inverse transform the given data. Assumes that fit has already been called. + + X the data to inverse transform + """ + + X_inv = pd.DataFrame() + # We will populate the inverse transformed dataframe column by column + + # Let's keep track of the column we've processed + prev_col = 0 + for columns, transformers, transformed_cols in self.transformed_cols_: + # Determine the column number of the last column in X corresponding to + # the original column we're computing + last_col = prev_col + len(transformed_cols) + + # Inverse transform the columns in X for the current transformer + col_inv = pd.DataFrame(transformers.inverse_transform(X[:, prev_col:last_col]), + columns = [columns]) + + # Append the inverse transformed column to the output data frame + X_inv = pd.concat([X_inv, col_inv], axis = 1) + + # For the next iteration, update the last column processed + prev_col = last_col + + + return X_inv diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 75da4fd..9f59818 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -19,7 +19,7 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction import DictVectorizer from sklearn.preprocessing import ( - Imputer, StandardScaler, OneHotEncoder, LabelBinarizer) + Imputer, StandardScaler, OneHotEncoder, LabelBinarizer, LabelEncoder) from sklearn.feature_selection import SelectKBest, chi2 from sklearn.base import BaseEstimator, TransformerMixin import sklearn.decomposition @@ -829,3 +829,32 @@ def test_direct_cross_validation(iris_dataframe): scores = sklearn_cv_score(pipeline, data, labels) assert scores.mean() > 0.96 assert (scores.std() * 2) < 0.04 + + +def test_inverse_transform_simple(): + df = pd.DataFrame({'colA': list('ynyyn'), 'colB': list('abcab')}) + mapper = DataFrameMapper([ + ('colA', LabelEncoder()), + ('colB', LabelEncoder()), + ]) + + transformed = mapper.fit_transform(df) + restored = mapper.inverse_transform(transformed) + + assert isinstance(restored, pd.DataFrame) + assert restored.equals(df) + + +def test_inverse_transform_multicolumn(): + df = pd.DataFrame({'colA': list('ynyyn'), 'colB': list('abcab'), 'colC': list('sttts')}) + mapper = DataFrameMapper([ + ('colA', LabelEncoder()), + ('colB', LabelBinarizer()), + ('colC', LabelEncoder()), + ]) + + transformed = mapper.fit_transform(df) + restored = mapper.inverse_transform(transformed) + + assert isinstance(restored, pd.DataFrame) + assert restored.equals(df) From 58812acbdae32c8155ce326f8274aa0f675e4a25 Mon Sep 17 00:00:00 2001 From: Erik Jan de Vries Date: Tue, 14 Nov 2017 09:21:02 +0100 Subject: [PATCH 2/2] DataFrameMapper.inverse_transform() for simple transformations --- sklearn_pandas/dataframe_mapper.py | 37 ++++++++++++++++++++++++++++++ tests/test_dataframe_mapper.py | 33 +++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 596aa76..25aac78 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -110,6 +110,7 @@ def __init__(self, features, default=False, sparse=False, df_out=False, self.df_out = df_out self.input_df = input_df self.transformed_names_ = [] + self.transformed_cols_ = [] if (df_out and (sparse or default)): raise ValueError("Can not use df_out with sparse or default") @@ -268,6 +269,7 @@ def transform(self, X): """ extracted = [] self.transformed_names_ = [] + self.transformed_cols_ = [] for columns, transformers, options in self.built_features: input_df = options.get('input_df', self.input_df) # columns could be a string or list of @@ -283,6 +285,10 @@ def transform(self, X): self.transformed_names_ += self.get_names( columns, transformers, Xt, alias) + self.transformed_cols_ += [ + (columns, transformers, + self.get_names(columns, transformers, Xt, alias))] + # handle features not explicitly selected if self.built_default is not False: unsel_cols = self._unselected_columns(X) @@ -328,3 +334,34 @@ def transform(self, X): index=index) else: return stacked + + def inverse_transform(self, X): + """ + Inverse transform the given data. Assumes that fit has already been + called. + + X the data to inverse transform + """ + + X_inv = pd.DataFrame() + # We will populate the inverse transformed dataframe column by column + + # Let's keep track of the column we've processed + prev_col = 0 + for columns, transformers, transformed_cols in self.transformed_cols_: + # Determine the column number of the last column in X + # corresponding to the original column we're computing + last_col = prev_col + len(transformed_cols) + + # Inverse transform the columns in X for the current transformer + col_inv = pd.DataFrame(transformers.inverse_transform( + X[:, prev_col:last_col]), + columns=[columns]) + + # Append the inverse transformed column to the output data frame + X_inv = pd.concat([X_inv, col_inv], axis=1) + + # For the next iteration, update the last column processed + prev_col = last_col + + return X_inv diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 75da4fd..3276db8 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -19,7 +19,7 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction import DictVectorizer from sklearn.preprocessing import ( - Imputer, StandardScaler, OneHotEncoder, LabelBinarizer) + Imputer, StandardScaler, OneHotEncoder, LabelBinarizer, LabelEncoder) from sklearn.feature_selection import SelectKBest, chi2 from sklearn.base import BaseEstimator, TransformerMixin import sklearn.decomposition @@ -829,3 +829,34 @@ def test_direct_cross_validation(iris_dataframe): scores = sklearn_cv_score(pipeline, data, labels) assert scores.mean() > 0.96 assert (scores.std() * 2) < 0.04 + + +def test_inverse_transform_simple(): + df = pd.DataFrame({'colA': list('ynyyn'), 'colB': list('abcab')}) + mapper = DataFrameMapper([ + ('colA', LabelEncoder()), + ('colB', LabelEncoder()), + ]) + + transformed = mapper.fit_transform(df) + restored = mapper.inverse_transform(transformed) + + assert isinstance(restored, pd.DataFrame) + assert restored.equals(df) + + +def test_inverse_transform_multicolumn(): + df = pd.DataFrame({'colA': list('ynyyn'), + 'colB': list('abcab'), + 'colC': list('sttts')}) + mapper = DataFrameMapper([ + ('colA', LabelEncoder()), + ('colB', LabelBinarizer()), + ('colC', LabelEncoder()), + ]) + + transformed = mapper.fit_transform(df) + restored = mapper.inverse_transform(transformed) + + assert isinstance(restored, pd.DataFrame) + assert restored.equals(df)