From a13f6a8b6a0d5cf452dc4ebe11cd5763ee0a9e5b Mon Sep 17 00:00:00 2001 From: Varun Sriram Date: Tue, 28 Apr 2020 03:08:10 -0400 Subject: [PATCH 1/4] support None column name --- sklearn_pandas/dataframe_mapper.py | 29 ++++++++++++++++++++++++++--- tests/test_dataframe_mapper.py | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index f530521..7420890 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -75,7 +75,8 @@ def __init__(self, features, default=False, sparse=False, df_out=False, features a list of tuples with features definitions. The first element is the pandas column selector. This can - be a string (for one column) or a list of strings. + be a string (for one column), a list of strings, or None + (for all columns). The second element is an object that supports sklearn's transform interface, or a list of such objects. The third element is optional and, if present, must be @@ -162,13 +163,32 @@ def __setstate__(self, state): self.built_default = state.get('built_default', self.default) self.transformed_names_ = state.get('transformed_names_', []) + def _build_cols(self, X, cols): + """ + Build columns, replacing None sentinel with all cols of X. + + X a Pandas dataframe; the table to select columns from + cols a string or list of strings representing the columns + to select. if None, will be converted to a list of + all columns in X. + + Returns a numpy array with the data from the selected columns + """ + if cols is None: + if isinstance(X, DataWrapper): + cols = list(X.df.columns) + else: + cols = list(X.columns) + return cols + def _get_col_subset(self, X, cols, input_df=False): """ Get a subset of columns from the given table X. X a Pandas dataframe; the table to select columns from cols a string or list of strings representing the columns - to select + to select. if None, will be converted to a list of + all columns in X. Returns a numpy array with the data from the selected columns """ @@ -178,6 +198,9 @@ def _get_col_subset(self, X, cols, input_df=False): else: return_vector = False + # None is a sentinel to select all columns + cols = self._build_cols(X, cols) + # Needed when using the cross-validation compatibility # layer for sklearn<0.16.0. # Will be dropped on sklearn-pandas 2.0. @@ -308,7 +331,7 @@ def _transform(self, X, y=None, do_fit=False): alias = options.get('alias') self.transformed_names_ += self.get_names( - columns, transformers, Xt, alias) + self._build_cols(X, columns), transformers, Xt, alias) # handle features not explicitly selected if self.built_default is not False: diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 95adcfb..2f768f8 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -257,6 +257,20 @@ def test_complex_df(complex_dataframe): assert len(transformed[c]) == len(df[c]) +def test_none_all_col_sentinel(complex_dataframe): + """ + Get a dataframe from a complex mapped dataframe returning all cols + without spec. + """ + df = complex_dataframe + mapper = DataFrameMapper([(None, None)], df_out=True) + transformed = mapper.fit_transform(df) + print(transformed) + assert len(transformed) == len(complex_dataframe) + for c in df.columns: + assert len(transformed[c]) == len(df[c]) + + def test_numeric_column_names(complex_dataframe): """ Get a dataframe from a complex mapped dataframe with numeric column names From 4bea0b20ae48b27ad93d4b71a672f69ea683dbc8 Mon Sep 17 00:00:00 2001 From: Varun Sriram Date: Tue, 28 Apr 2020 08:55:35 -0400 Subject: [PATCH 2/4] bug fix --- sklearn_pandas/dataframe_mapper.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 7420890..0d0689e 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -249,7 +249,7 @@ def fit(self, X, y=None): _call_fit(self.built_default.fit, Xt, y) return self - def get_names(self, columns, transformer, x, alias=None): + def get_names(self, columns, transformer, x, alias=None, mode=None): """ Return verbose names for the transformed columns. @@ -257,6 +257,9 @@ def get_names(self, columns, transformer, x, alias=None): transformer transformer - can be a TransformerPipeline x transformed columns (numpy.ndarray) alias base name to use for the selected columns + mode if not None, either "nonecols" (cols is None + indicating to use all) or "nonecolstransforms" + (cols and transformer is None) """ if alias is not None: name = alias @@ -280,11 +283,20 @@ def get_names(self, columns, transformer, x, alias=None): # Otherwise use the only estimator present else: names = _get_feature_names(transformer) - if names is not None and len(names) == num_cols: - return ['%s_%s' % (name, o) for o in names] - # otherwise, return name concatenated with '_1', '_2', etc. + + if mode == "nonecolstransforms": + return columns + elif mode == "nonecols": + if names is not None and len(names) == num_cols: + return [str(o) for o in names] + else: + return [str(o) for o in range(num_cols)] else: - return [name + '_' + str(o) for o in range(num_cols)] + if names is not None and len(names) == num_cols: + return ['%s_%s' % (name, o) for o in names] + # otherwise, return name concatenated with '_1', '_2', etc. + else: + return [name + '_' + str(o) for o in range(num_cols)] else: return [name] @@ -330,8 +342,14 @@ def _transform(self, X, y=None, do_fit=False): extracted.append(_handle_feature(Xt)) alias = options.get('alias') + mode = None + if columns is None and transformers is None: + mode = "nonecolstransforms" + elif columns is None: + mode = "nonecols" self.transformed_names_ += self.get_names( - self._build_cols(X, columns), transformers, Xt, alias) + self._build_cols(X, columns), transformers, Xt, alias, + mode) # handle features not explicitly selected if self.built_default is not False: From a1ac205b722d01e0d8560db54db408232bdd6afd Mon Sep 17 00:00:00 2001 From: Varun Sriram Date: Tue, 28 Apr 2020 12:08:59 -0400 Subject: [PATCH 3/4] int to float cast if np.nan exists --- sklearn_pandas/dataframe_mapper.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 0d0689e..89ea64a 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -404,6 +404,13 @@ def _transform(self, X, y=None, do_fit=False): index=index) # preserve types for col, dtype in zip(self.transformed_names_, dtypes): + # this ensures that int types with null values are + # correctly cast to float + if ((np.issubdtype(df_out[col].values.dtype, np.floating) and + np.issubdtype(dtype, np.integer)) and + not np.isfinite(df_out[col].values).all()): + dtype = np.float64 + df_out[col] = df_out[col].astype(dtype) return df_out else: From 5ee3d59a2e7183ff81242357db3ff70f745d001d Mon Sep 17 00:00:00 2001 From: Varun Sriram Date: Tue, 28 Apr 2020 13:27:08 -0400 Subject: [PATCH 4/4] bug fix --- sklearn_pandas/dataframe_mapper.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index 89ea64a..1971536 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -37,10 +37,12 @@ def _build_feature(columns, transformers, options={}): return (columns, _build_transformer(transformers), options) -def _get_feature_names(estimator): +def _get_feature_names(estimator, x): """ Attempt to extract feature names based on a given estimator """ + if isinstance(x, pd.DataFrame): + return list(x.columns) if hasattr(estimator, 'classes_'): return estimator.classes_ elif hasattr(estimator, 'get_feature_names'): @@ -255,7 +257,8 @@ def get_names(self, columns, transformer, x, alias=None, mode=None): columns name (or list of names) of the original column(s) transformer transformer - can be a TransformerPipeline - x transformed columns (numpy.ndarray) + x transformed columns (numpy.ndarray or + pd.DataFrame) alias base name to use for the selected columns mode if not None, either "nonecols" (cols is None indicating to use all) or "nonecolstransforms" @@ -278,11 +281,11 @@ def get_names(self, columns, transformer, x, alias=None, mode=None): if isinstance(transformer, TransformerPipeline): inverse_steps = transformer.steps[::-1] estimators = (estimator for name, estimator in inverse_steps) - names_steps = (_get_feature_names(e) for e in estimators) + names_steps = (_get_feature_names(e, x) for e in estimators) names = next((n for n in names_steps if n is not None), None) # Otherwise use the only estimator present else: - names = _get_feature_names(transformer) + names = _get_feature_names(transformer, x) if mode == "nonecolstransforms": return columns @@ -298,6 +301,20 @@ def get_names(self, columns, transformer, x, alias=None, mode=None): else: return [name + '_' + str(o) for o in range(num_cols)] else: + if isinstance(transformer, TransformerPipeline): + inverse_steps = transformer.steps[::-1] + estimators = (estimator for name, estimator in inverse_steps) + names_steps = (_get_feature_names(e, x) for e in estimators) + names = next((n for n in names_steps if n is not None), None) + # Otherwise use the only estimator present + else: + names = _get_feature_names(transformer, x) + + if mode == "nonecols": + if names is not None and len(names) == num_cols: + return [str(o) for o in names] + else: + return [str(o) for o in range(num_cols)] return [name] def get_dtypes(self, extracted):