diff --git a/doc/source/release.rst b/doc/source/release.rst index 930f100fd86dc..c24ff6ab0ab30 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -67,6 +67,9 @@ pandas 0.13 - A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object to yield a ``float64`` dtyped Series. This is frequency conversion. + - Timedelta64 supports ``fillna/ffill/bfill`` with an integer interpreted as seconds, + or a ``timedelta`` (:issue:`3371`) + - Datetime64 supports ``ffill/bfill`` - Performance improvements with ``__getitem__`` on ``DataFrames`` with when the key is a column - Support for using a ``DatetimeIndex/PeriodsIndex`` directly in a datelike calculation @@ -154,6 +157,8 @@ pandas 0.13 - Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`) - The ``method`` argument of ``NDFrame.replace()`` is valid again, so that a a list can be passed to ``to_replace`` (:issue:`4743`). + - Provide automatic dtype conversions on _reduce operations (:issue:`3371`) + - Exclude non-numerics if mixed types with datelike in _reduce operations (:issue:`3371`) **Internal Refactoring** diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index e9540365c3dee..1175fe1c5a3a5 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1195,6 +1195,15 @@ issues). ``idxmin, idxmax`` are supported as well. df.min().idxmax() df.min(axis=1).idxmin() +You can use ``fillna`` on timedeltas. Integers will be interpreted as seconds. You can +pass a timedelta to get a particular value. + +.. ipython:: python + + y.fillna(0) + y.fillna(10) + y.fillna(timedelta(days=-1,seconds=5)) + .. 
_timeseries.timedeltas_convert: Time Deltas & Conversions diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 6b8b7e73f3ac4..a38ff2fa6d457 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -195,6 +195,7 @@ Enhancements - NaN handing in get_dummies (:issue:`4446`) with `dummy_na` .. ipython:: python + # previously, nan was erroneously counted as 2 here # now it is not counted at all get_dummies([1, 2, np.nan]) @@ -237,10 +238,17 @@ Enhancements from pandas import offsets td + offsets.Minute(5) + offsets.Milli(5) - - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and - ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set - the bandwidth, and to gkde.evaluate() to specify the indicies at which it - is evaluated, respecttively. See scipy docs. + - ``fillna`` is now supported for timedeltas + + .. ipython:: python + + td.fillna(0) + td.fillna(timedelta(days=1,seconds=5)) + + - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and + ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set + the bandwidth, and to gkde.evaluate() to specify the indices at which it + is evaluated, respectively. See scipy docs. .. 
_whatsnew_0130.refactoring: diff --git a/pandas/core/common.py b/pandas/core/common.py index 8b9ba4d5eea16..54197e86c961d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -705,6 +705,54 @@ def diff(arr, n, axis=0): return out_arr +def _coerce_scalar_to_timedelta_type(r): + # kludgy here until we have a timedelta scalar + # handle the numpy < 1.7 case + + if is_integer(r): + r = timedelta(microseconds=r/1000) + + if _np_version_under1p7: + if not isinstance(r, timedelta): + raise AssertionError("Invalid type for timedelta scalar: %s" % type(r)) + if compat.PY3: + # convert to microseconds in timedelta64 + r = np.timedelta64(int(r.total_seconds()*1e9 + r.microseconds*1000)) + else: + return r + + if isinstance(r, timedelta): + r = np.timedelta64(r) + elif not isinstance(r, np.timedelta64): + raise AssertionError("Invalid type for timedelta scalar: %s" % type(r)) + return r.astype('timedelta64[ns]') + +def _coerce_to_dtypes(result, dtypes): + """ given a dtypes and a result set, coerce the result elements to the dtypes """ + if len(result) != len(dtypes): + raise AssertionError("_coerce_to_dtypes requires equal len arrays") + + def conv(r,dtype): + try: + if isnull(r): + pass + elif dtype == _NS_DTYPE: + r = Timestamp(r) + elif dtype == _TD_DTYPE: + r = _coerce_scalar_to_timedelta_type(r) + elif dtype == np.bool_: + r = bool(r) + elif dtype.kind == 'f': + r = float(r) + elif dtype.kind == 'i': + r = int(r) + except: + pass + + return r + + return np.array([ conv(r,dtype) for r, dtype in zip(result,dtypes) ]) + def _infer_dtype_from_scalar(val): """ interpret the dtype from a scalar, upcast floats and ints return the new value and the dtype """ @@ -1288,7 +1336,7 @@ def _possibly_cast_to_timedelta(value, coerce=True): # coercion compatability if coerce == 'compat' and _np_version_under1p7: - def convert(td, type): + def convert(td, dtype): # we have an array with a non-object dtype if hasattr(td,'item'): @@ -1317,6 +1365,7 @@ def convert(td, type): # < 
1.7 coercion if not is_list_like(value): value = np.array([ value ]) + dtype = value.dtype return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a3eb3ea54c784..52d3a15d8d184 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,7 +23,8 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, - _infer_dtype_from_scalar, _values_from_object) + _infer_dtype_from_scalar, _values_from_object, + _coerce_to_dtypes, _DATELIKE_DTYPES) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, @@ -4235,11 +4236,24 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, axis = self._get_axis_number(axis) f = lambda x: op(x, axis=axis, skipna=skipna, **kwds) labels = self._get_agg_axis(axis) + + # exclude timedelta/datetime unless we are uniform types + if axis == 1 and self._is_mixed_type and len(set(self.dtypes) & _DATELIKE_DTYPES): + numeric_only = True + if numeric_only is None: try: values = self.values result = f(values) except Exception as e: + + # try by-column first + if filter_type is None and axis == 0: + try: + return self.apply(f).iloc[0] + except: + pass + if filter_type is None or filter_type == 'numeric': data = self._get_numeric_data() elif filter_type == 'bool': @@ -4273,9 +4287,11 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, result = result.astype(np.float64) elif filter_type == 'bool' and notnull(result).all(): result = result.astype(np.bool_) - # otherwise, accept it except (ValueError, TypeError): - pass + + # try to coerce to the original dtypes item by item if we can + if axis == 0: + result = com._coerce_to_dtypes(result, self.dtypes) return Series(result, index=labels) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 
3bdfd98127bb7..58e1fbc4f177d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -20,12 +20,9 @@ _infer_dtype_from_scalar, _maybe_promote, ABCSeries) - - def is_dictlike(x): return isinstance(x, (dict, com.ABCSeries)) - def _single_replace(self, to_replace, method, inplace, limit): orig_dtype = self.dtype result = self if inplace else self.copy() @@ -1906,7 +1903,21 @@ def abs(self): abs: type of caller """ obj = np.abs(self) - obj = com._possibly_cast_to_timedelta(obj, coerce=False) + + # numpy < 1.7 workaround: abs may come back as m8[us]; cast it back to m8[ns] + if com._np_version_under1p7: + if self.ndim == 1: + if obj.dtype == 'm8[us]': + obj = obj.astype('m8[ns]') + elif self.ndim == 2: + def f(x): + if x.dtype == 'm8[us]': + x = x.astype('m8[ns]') + return x + + if 'm8[us]' in obj.dtypes.values: + obj = obj.apply(f) + return obj def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e27430b06c45c..1716980813cea 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,6 +1,6 @@ import itertools import re -from datetime import datetime +from datetime import datetime, timedelta import copy from collections import defaultdict @@ -41,6 +41,7 @@ class Block(PandasObject): is_integer = False is_complex = False is_datetime = False + is_timedelta = False is_bool = False is_object = False is_sparse = False @@ -326,6 +327,8 @@ def _maybe_downcast(self, blocks, downcast=None): # unless indicated if downcast is None and self.is_float: return blocks + elif downcast is None and (self.is_timedelta or self.is_datetime): + return blocks result_blocks = [] for b in blocks: @@ -485,6 +488,10 @@ def _try_cast_result(self, result, dtype=None): # may need to change the dtype here return _possibly_downcast_to_dtype(result, dtype) + def _try_operate(self, values): + """ return a version to operate on as the input """ + return values + def _try_coerce_args(self, values, other): """ provide coercion to our input 
arguments """ return values, other @@ -703,8 +710,11 @@ def interpolate(self, method='pad', axis=0, inplace=False, else: return [self.copy()] + fill_value = self._try_fill(fill_value) values = self.values if inplace else self.values.copy() + values = self._try_operate(values) values = com.interpolate_2d(values, method, axis, limit, fill_value) + values = self._try_coerce_result(values) blocks = [ make_block(values, self.items, self.ref_items, ndim=self.ndim, klass=self.__class__, fastpath=True) ] return self._maybe_downcast(blocks, downcast) @@ -1008,6 +1018,57 @@ def _try_cast(self, element): def should_store(self, value): return com.is_integer_dtype(value) and value.dtype == self.dtype +class TimeDeltaBlock(IntBlock): + is_timedelta = True + _can_hold_na = True + is_numeric = False + + def _try_fill(self, value): + """ if we are a NaT, return the actual fill value """ + if isinstance(value, type(tslib.NaT)) or isnull(value): + value = tslib.iNaT + elif isinstance(value, np.timedelta64): + pass + elif com.is_integer(value): + # coerce to seconds of timedelta + value = np.timedelta64(int(value*1e9)) + elif isinstance(value, timedelta): + value = np.timedelta64(value) + + return value + + def _try_operate(self, values): + """ return a version to operate on """ + return values.view('i8') + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args / try_operate """ + if isinstance(result, np.ndarray): + if result.dtype.kind in ['i','f','O']: + result = result.astype('m8[ns]') + elif isinstance(result, np.integer): + result = np.timedelta64(result) + return result + + def should_store(self, value): + return issubclass(value.dtype.type, np.timedelta64) + + def to_native_types(self, slicer=None, na_rep=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isnull(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + 
na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (-mask).ravel() + rvalues.flat[imask] = np.array([lib.repr_timedelta64(val) + for val in values.ravel()[imask]], dtype=object) + return rvalues.tolist() class BoolBlock(NumericBlock): is_bool = True @@ -1216,6 +1277,10 @@ def _try_cast(self, element): except: return element + def _try_operate(self, values): + """ return a version to operate on """ + return values.view('i8') + def _try_coerce_args(self, values, other): """ provide coercion to our input arguments we are going to compare vs i8, so coerce to integer @@ -1236,17 +1301,20 @@ def _try_coerce_result(self, result): if result.dtype == 'i8': result = tslib.array_to_datetime( result.astype(object).ravel()).reshape(result.shape) + elif result.dtype.kind in ['i','f','O']: + result = result.astype('M8[ns]') elif isinstance(result, (np.integer, np.datetime64)): result = lib.Timestamp(result) return result def _try_fill(self, value): """ if we are a NaT, return the actual fill value """ - if isinstance(value, type(tslib.NaT)): + if isinstance(value, type(tslib.NaT)) or isnull(value): value = tslib.iNaT return value def fillna(self, value, inplace=False, downcast=None): + # straight putmask here values = self.values if inplace else self.values.copy() mask = com.isnull(self.values) value = self._try_fill(value) @@ -1267,12 +1335,9 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): na_rep = 'NaT' rvalues[mask] = na_rep imask = (-mask).ravel() - if self.dtype == 'datetime64[ns]': - rvalues.flat[imask] = np.array( - [Timestamp(val)._repr_base for val in values.ravel()[imask]], dtype=object) - elif self.dtype == 'timedelta64[ns]': - rvalues.flat[imask] = np.array([lib.repr_timedelta64(val) - for val in values.ravel()[imask]], dtype=object) + rvalues.flat[imask] = np.array( + [Timestamp(val)._repr_base for val in values.ravel()[imask]], dtype=object) + return rvalues.tolist() def should_store(self, value): @@ -1551,6 +1616,8 @@ def make_block(values, items, 
ref_items, klass=None, ndim=None, dtype=None, fast klass = SparseBlock elif issubclass(vtype, np.floating): klass = FloatBlock + elif issubclass(vtype, np.integer) and issubclass(vtype, np.timedelta64): + klass = TimeDeltaBlock elif issubclass(vtype, np.integer) and not issubclass(vtype, np.datetime64): klass = IntBlock elif dtype == np.bool_: @@ -3404,12 +3471,13 @@ def _lcd_dtype(l): have_float = len(counts[FloatBlock]) > 0 have_complex = len(counts[ComplexBlock]) > 0 have_dt64 = len(counts[DatetimeBlock]) > 0 + have_td64 = len(counts[TimeDeltaBlock]) > 0 have_sparse = len(counts[SparseBlock]) > 0 have_numeric = have_float or have_complex or have_int if (have_object or (have_bool and have_numeric) or - (have_numeric and have_dt64)): + (have_numeric and (have_dt64 or have_td64))): return np.dtype(object) elif have_bool: return np.dtype(bool) @@ -3432,6 +3500,8 @@ def _lcd_dtype(l): elif have_dt64 and not have_float and not have_complex: return np.dtype('M8[ns]') + elif have_td64 and not have_float and not have_complex: + return np.dtype('m8[ns]') elif have_complex: return np.dtype('c16') else: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 3e247caae9c42..3a185ca83604d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -287,8 +287,7 @@ def nanmin(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf') # numpy 1.6.1 workaround in Python 3.x - if (values.dtype == np.object_ - and sys.version_info[0] >= 3): # pragma: no cover + if (values.dtype == np.object_ and compat.PY3): if values.ndim > 1: apply_ax = axis if axis is not None else 0 result = np.apply_along_axis(builtins.min, apply_ax, values) @@ -311,8 +310,7 @@ def nanmax(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf') # numpy 1.6.1 workaround in Python 3.x - if (values.dtype == np.object_ - and sys.version_info[0] >= 3): # pragma: no cover + if (values.dtype == 
np.object_ and compat.PY3): if values.ndim > 1: apply_ax = axis if axis is not None else 0 diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 39334e95e8c81..e0bbc1a4e64c1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -480,6 +480,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_object = 1 # objects[i] = val.astype('O') break + elif util.is_timedelta64_object(val): + seen_object = 1 + break elif util.is_integer_object(val): seen_int = 1 floats[i] = val diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index bb76547da0c28..cefe15952d329 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3232,25 +3232,43 @@ def test_operators_timedelta64(self): result = diffs.max(axis=1) self.assert_((result == diffs['A']).all() == True) - # abs ###### THIS IS BROKEN NOW ###### (results are dtype=timedelta64[us] - # even though fixed in series - #result = np.abs(df['A']-df['B']) - #result = diffs.abs() - #expected = DataFrame(dict(A = df['A']-df['C'], - # B = df['B']-df['A'])) - #assert_frame_equal(result,expected) + # abs + result = diffs.abs() + expected = DataFrame(dict(A = df['A']-df['C'], + B = df['B']-df['A'])) + assert_frame_equal(result,expected) # mixed frame mixed = diffs.copy() mixed['C'] = 'foo' mixed['D'] = 1 mixed['E'] = 1. 
+ mixed['F'] = Timestamp('20130101') - # this is ok + # results in an object array result = mixed.min() + expected = Series([com._coerce_scalar_to_timedelta_type(timedelta(seconds=5*60+5)), + com._coerce_scalar_to_timedelta_type(timedelta(days=-1)), + 'foo', + 1, + 1.0, + Timestamp('20130101')], + index=mixed.columns) + assert_series_equal(result,expected) - # this is not + # excludes non-numerics (datelike and object columns) result = mixed.min(axis=1) + expected = Series([1, 1, 1.],index=[0, 1, 2]) + assert_series_equal(result,expected) + + # works when only those columns are selected + result = mixed[['A','B']].min(1) + expected = Series([ timedelta(days=-1) ] * 3) + assert_series_equal(result,expected) + + result = mixed[['A','B']].min() + expected = Series([ timedelta(seconds=5*60+5), timedelta(days=-1) ],index=['A','B']) + assert_series_equal(result,expected) # GH 3106 df = DataFrame({'time' : date_range('20130102',periods=5), diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 4d86e8ae4a25b..b0911ed10be20 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2405,6 +2405,78 @@ def test_timedelta64_functions(self): expected = Series([timedelta(1)], dtype='timedelta64[ns]') assert_series_equal(result, expected) + def test_timedelta_fillna(self): + if com._np_version_under1p7: + raise nose.SkipTest("timedelta broken in np 1.6.1") + + #GH 3371 + from datetime import timedelta + + s = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130102'),Timestamp('20130103 9:01:01')]) + td = s.diff() + + # reg fillna + result = td.fillna(0) + expected = Series([timedelta(0),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) + assert_series_equal(result,expected) + + # interpreted as seconds + result = td.fillna(1) + expected = Series([timedelta(seconds=1),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) + assert_series_equal(result,expected) + + result = td.fillna(timedelta(days=1,seconds=1)) + expected = 
Series([timedelta(days=1,seconds=1),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) + assert_series_equal(result,expected) + + result = td.fillna(np.timedelta64(int(1e9))) + expected = Series([timedelta(seconds=1),timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)]) + assert_series_equal(result,expected) + + from pandas import tslib + result = td.fillna(tslib.NaT) + expected = Series([tslib.NaT,timedelta(0),timedelta(1),timedelta(days=1,seconds=9*3600+60+1)],dtype='m8[ns]') + assert_series_equal(result,expected) + + # ffill + td[2] = np.nan + result = td.ffill() + expected = td.fillna(0) + expected[0] = np.nan + assert_series_equal(result,expected) + + # bfill + td[2] = np.nan + result = td.bfill() + expected = td.fillna(0) + expected[2] = timedelta(days=1,seconds=9*3600+60+1) + assert_series_equal(result,expected) + + def test_datetime64_fillna(self): + + s = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130102'),Timestamp('20130103 9:01:01')]) + s[2] = np.nan + + # reg fillna + result = s.fillna(Timestamp('20130104')) + expected = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130104'),Timestamp('20130103 9:01:01')]) + assert_series_equal(result,expected) + + from pandas import tslib + result = s.fillna(tslib.NaT) + expected = s + assert_series_equal(result,expected) + + # ffill + result = s.ffill() + expected = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130103 9:01:01')]) + assert_series_equal(result,expected) + + # bfill + result = s.bfill() + expected = Series([Timestamp('20130101'),Timestamp('20130101'),Timestamp('20130103 9:01:01'),Timestamp('20130103 9:01:01')]) + assert_series_equal(result,expected) + def test_sub_of_datetime_from_TimeSeries(self): from pandas.core import common as com from datetime import datetime @@ -4205,16 +4277,14 @@ def test_reindex_corner(self): def test_reindex_pad(self): - s = Series(np.arange(10), np.arange(10)) + s = 
Series(np.arange(10)) s2 = s[::2] reindexed = s2.reindex(s.index, method='pad') reindexed2 = s2.reindex(s.index, method='ffill') assert_series_equal(reindexed, reindexed2) - # used platform int above, need to pass int explicitly here per #1219 - expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], dtype=int, - index=np.arange(10)) + expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) assert_series_equal(reindexed, expected) # GH4604 @@ -4624,7 +4694,7 @@ def test_replace_with_single_list(self): assert_series_equal(s, ser) def test_replace_mixed_types(self): - s = Series(np.arange(5)) + s = Series(np.arange(5),dtype='int64') def check_replace(to_rep, val, expected): sc = s.copy()