From 162873d3df52a438ef4b79a23f079cf7566ebaa3 Mon Sep 17 00:00:00 2001 From: Paulo Castro Date: Fri, 16 Mar 2018 11:35:01 -0300 Subject: [PATCH 1/3] DOC: update the pandas.core.resample.Resampler.fillna docstring --- pandas/core/resample.py | 124 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 120 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 004d572375234..909d772cf70a2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -624,18 +624,134 @@ def backfill(self, limit=None): def fillna(self, method, limit=None): """ - Fill missing values + Fill the new missing values in the resampled data using different + methods. + + In statistics, imputation is the process of replacing missing data with + substituted values [1]_. When resampling data, missing values may + appear (e.g., when the resampling frequency is higher than the original + frequency). + + The backward fill ('bfill') will replace NaN values that appeared in + the resampled data with the next value in the original sequence. The + forward fill ('ffill'), on the other hand, will replace NaN values + that appeared in the resampled data with the previous value in the + original sequence. Missing values that existed in the orginal data will + not be modified. Parameters ---------- method : str, method of resampling ('ffill', 'bfill') + Method to use for filling holes in resampled data + * ffill: use previous valid observation to fill gap (forward + fill). + * bfill: use next valid observation to fill gap (backward + fill). limit : integer, optional - limit of how many values to fill + Limit of how many values to fill. + + Returns + ------- + Series, DataFrame + An upsampled Series or DataFrame with backward or forwards filled + NaN values. + + Examples + -------- + + Resampling a Series: + + >>> s = pd.Series([1, 2, 3], + ... 
index=pd.date_range('20180101', periods=3, freq='h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 02:00:00 3 + Freq: H, dtype: int64 + + >>> s.resample('30min').fillna("bfill") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + >>> s.resample('15min').fillna("bfill", limit=2) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 NaN + 2018-01-01 00:30:00 2.0 + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:15:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 01:45:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 15T, dtype: float64 + + >>> s.resample('30min').fillna("ffill") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 2 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + Resampling a DataFrame that has missing values: + + >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, + ... index=pd.date_range('20180101', periods=3, + ... freq='h')) + >>> df + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('30min').fillna("bfill") + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:30:00 NaN 3 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:30:00 6.0 5 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('15min').fillna("bfill", limit=2) + a b + 2018-01-01 00:00:00 2.0 1.0 + 2018-01-01 00:15:00 NaN NaN + 2018-01-01 00:30:00 NaN 3.0 + 2018-01-01 00:45:00 NaN 3.0 + 2018-01-01 01:00:00 NaN 3.0 + 2018-01-01 01:15:00 NaN NaN + 2018-01-01 01:30:00 6.0 5.0 + 2018-01-01 01:45:00 6.0 5.0 + 2018-01-01 02:00:00 6.0 5.0 + + >>> df.resample('30min').fillna("ffill") + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:30:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:30:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 See Also -------- - Series.fillna - DataFrame.fillna + backfill : Backward fill NaN values in the resampled data. 
+ pad : Forward fill NaN values in the resampled data. + bfill : Alias of backfill. + ffill: Alias of pad. + nearest : Fill NaN values in the resampled data + with nearest neighbor starting from center. + pandas.Series.fillna : Fill NaN values in the Series using the + specified method, which can be 'bfill' and 'ffill'. + pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the + specified method, which can be 'bfill' and 'ffill'. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) """ return self._upsample(method, limit=limit) From 7160e0dd22bb39a1b61892447c62a1a2e84b5708 Mon Sep 17 00:00:00 2001 From: Paulo Castro Date: Fri, 16 Mar 2018 14:01:47 -0300 Subject: [PATCH 2/3] DOC: make suggested corrections and added more useful examples --- pandas/core/resample.py | 97 +++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 28 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 909d772cf70a2..00420b80280b0 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -624,35 +624,33 @@ def backfill(self, limit=None): def fillna(self, method, limit=None): """ - Fill the new missing values in the resampled data using different - methods. + Fill missing values introduced by upsampling. In statistics, imputation is the process of replacing missing data with substituted values [1]_. When resampling data, missing values may appear (e.g., when the resampling frequency is higher than the original frequency). - The backward fill ('bfill') will replace NaN values that appeared in - the resampled data with the next value in the original sequence. The - forward fill ('ffill'), on the other hand, will replace NaN values - that appeared in the resampled data with the previous value in the - original sequence. Missing values that existed in the orginal data will + Missing values that existed in the original data will not be modified.
Parameters ---------- - method : str, method of resampling ('ffill', 'bfill') + method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'} Method to use for filling holes in resampled data - * ffill: use previous valid observation to fill gap (forward - fill). - * bfill: use next valid observation to fill gap (backward - fill). + + * 'pad': use previous valid observation to fill gap (forward + fill). + * 'backfill': use next valid observation to fill gap. + * 'nearest': use nearest valid observation to fill gap. + * 'ffill': same as 'pad'. + * 'bfill': same as 'backfill'. limit : integer, optional - Limit of how many values to fill. + Limit of how many consecutive values to fill. Returns ------- - Series, DataFrame + Series or DataFrame An upsampled Series or DataFrame with backward or forwards filled NaN values. @@ -669,7 +667,17 @@ def fillna(self, method, limit=None): 2018-01-01 02:00:00 3 Freq: H, dtype: int64 - >>> s.resample('30min').fillna("bfill") + Without filling the missing values you get: + + >>> s.resample("30min").asfreq() + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:30:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + >>> s.resample('30min').fillna("backfill") 2018-01-01 00:00:00 1 2018-01-01 00:30:00 2 2018-01-01 01:00:00 2 @@ -677,7 +685,7 @@ def fillna(self, method, limit=None): 2018-01-01 02:00:00 3 Freq: 30T, dtype: int64 - >>> s.resample('15min').fillna("bfill", limit=2) + >>> s.resample('15min').fillna("backfill", limit=2) 2018-01-01 00:00:00 1.0 2018-01-01 00:15:00 NaN 2018-01-01 00:30:00 2.0 @@ -689,7 +697,7 @@ def fillna(self, method, limit=None): 2018-01-01 02:00:00 3.0 Freq: 15T, dtype: float64 - >>> s.resample('30min').fillna("ffill") + >>> s.resample('30min').fillna("pad") 2018-01-01 00:00:00 1 2018-01-01 00:30:00 1 2018-01-01 01:00:00 2 @@ -697,7 +705,50 @@ def fillna(self, method, limit=None): 2018-01-01 02:00:00 3 Freq: 30T, dtype: int64 - Resampling a DataFrame that has 
missing values: + >>> s.resample('30min').fillna("nearest") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + Resamping a Series that has missing values: + + >>> sm = pd.Series([1, None, 3], + ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> sm + 2018-01-01 00:00:00 1.0 + 2018-01-01 01:00:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: H, dtype: float64 + + >>> sm.resample('30min').fillna('backfill') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + >>> sm.resample('30min').fillna('pad') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 1.0 + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + >>> sm.resample('30min').fillna('nearest') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + Resampling a DataFrame that has missing values works similar as for + Series column-by-column: >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, ... index=pd.date_range('20180101', periods=3, @@ -728,20 +779,10 @@ def fillna(self, method, limit=None): 2018-01-01 01:45:00 6.0 5.0 2018-01-01 02:00:00 6.0 5.0 - >>> df.resample('30min').fillna("ffill") - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 00:30:00 2.0 1 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 01:30:00 NaN 3 - 2018-01-01 02:00:00 6.0 5 - See Also -------- backfill : Backward fill NaN values in the resampled data. pad : Forward fill NaN values in the resampled data. - bfill : Alias of backfill. - ffill: Alias of pad. nearest : Fill NaN values in the resampled data with nearest neighbor starting from center. 
pandas.Series.fillna : Fill NaN values in the Series using the From 39e69ba20f50892a1f85ba99f03c54a3523581d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Mar 2018 14:49:10 -0500 Subject: [PATCH 3/3] Updates [ci skip] [ci skip] --- pandas/core/resample.py | 55 +++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 00420b80280b0..b3ab90fd67de4 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -639,24 +639,32 @@ def fillna(self, method, limit=None): method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'} Method to use for filling holes in resampled data - * 'pad': use previous valid observation to fill gap (forward - fill). - * 'backfill': use next valid observation to fill gap. + * 'pad' or 'ffill': use previous valid observation to fill gap + (forward fill). + * 'backfill' or 'bfill': use next valid observation to fill gap. * 'nearest': use nearest valid observation to fill gap. - * 'ffill': same as 'pad'. - * 'bfill': same as 'backfill'. + limit : integer, optional - Limit of how many consecutive values to fill. + Limit of how many consecutive missing values to fill. Returns ------- Series or DataFrame - An upsampled Series or DataFrame with backward or forwards filled - NaN values. + An upsampled Series or DataFrame with missing values filled. - Examples + See Also -------- + backfill : Backward fill NaN values in the resampled data. + pad : Forward fill NaN values in the resampled data. + nearest : Fill NaN values in the resampled data + with nearest neighbor starting from center. + pandas.Series.fillna : Fill NaN values in the Series using the + specified method, which can be 'bfill' and 'ffill'. + pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the + specified method, which can be 'bfill' and 'ffill'. 
+ Examples + -------- Resampling a Series: >>> s = pd.Series([1, 2, 3], @@ -713,7 +721,7 @@ def fillna(self, method, limit=None): 2018-01-01 02:00:00 3 Freq: 30T, dtype: int64 - Resamping a Series that has missing values: + Missing values present before the upsampling are not affected. >>> sm = pd.Series([1, None, 3], ... index=pd.date_range('20180101', periods=3, freq='h')) @@ -747,8 +755,8 @@ def fillna(self, method, limit=None): 2018-01-01 02:00:00 3.0 Freq: 30T, dtype: float64 - Resampling a DataFrame that has missing values works similar as for - Series column-by-column: + DataFrame resampling is done column-wise. All the same options are + available. >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, ... index=pd.date_range('20180101', periods=3, @@ -767,29 +775,6 @@ def fillna(self, method, limit=None): 2018-01-01 01:30:00 6.0 5 2018-01-01 02:00:00 6.0 5 - >>> df.resample('15min').fillna("bfill", limit=2) - a b - 2018-01-01 00:00:00 2.0 1.0 - 2018-01-01 00:15:00 NaN NaN - 2018-01-01 00:30:00 NaN 3.0 - 2018-01-01 00:45:00 NaN 3.0 - 2018-01-01 01:00:00 NaN 3.0 - 2018-01-01 01:15:00 NaN NaN - 2018-01-01 01:30:00 6.0 5.0 - 2018-01-01 01:45:00 6.0 5.0 - 2018-01-01 02:00:00 6.0 5.0 - - See Also - -------- - backfill : Backward fill NaN values in the resampled data. - pad : Forward fill NaN values in the resampled data. - nearest : Fill NaN values in the resampled data - with nearest neighbor starting from center. - pandas.Series.fillna : Fill NaN values in the Series using the - specified method, which can be 'bfill' and 'ffill'. - pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the - specified method, which can be 'bfill' and 'ffill'. - References ---------- .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)