From eb1536b4303b36116b5f0c86e2f28bd17454385d Mon Sep 17 00:00:00 2001 From: Christopher Kotfila Date: Tue, 11 Nov 2014 08:17:45 -0500 Subject: [PATCH 01/14] Issue with to_latex and MultiIndex column format This is a potential resolution to https://github.com/pydata/pandas/issues/8336 It borrows the same code flow from _get_formatted_index in pandas.core.format --- pandas/core/format.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 89973754a861c..9eb1af7a78033 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -581,14 +581,20 @@ def get_col_type(dtype): strcols = self._to_str_columns() if self.index and isinstance(self.frame.index, MultiIndex): - clevels = self.frame.columns.nlevels strcols.pop(0) - name = any(self.frame.columns.names) - for i, lev in enumerate(self.frame.index.levels): - lev2 = lev.format(name=name) - width = len(lev2[0]) - lev3 = [' ' * width] * clevels + lev2 - strcols.insert(i, lev3) + + + fmt = self._get_formatter('__index__') + fmt_index = self.frame.index.format(sparsify=self.sparsify, + adjoin=False, + names=False, + formatter=fmt) + + for i, lev in enumerate(fmt_index): + width = len(lev[0]) + lev2 = [width * ' ' if l == '' else l for l in lev] + lev2.insert(0, width * ' ') + strcols.insert(i, lev2) if column_format is None: dtypes = self.frame.dtypes.values From e366f688630782c344e61bbb77012580d45a1efb Mon Sep 17 00:00:00 2001 From: Christopher Kotfila Date: Tue, 11 Nov 2014 09:12:04 -0500 Subject: [PATCH 02/14] Must include MultiIndex Column names --- pandas/core/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 9eb1af7a78033..145dbd59109df 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -587,7 +587,7 @@ def get_col_type(dtype): fmt = self._get_formatter('__index__') fmt_index = self.frame.index.format(sparsify=self.sparsify, 
adjoin=False, - names=False, + names=True, formatter=fmt) for i, lev in enumerate(fmt_index): From 8bae0d42db5e4fbd93407a016b35267f830a4ace Mon Sep 17 00:00:00 2001 From: dsm054 Date: Sun, 22 Mar 2015 11:38:18 -0400 Subject: [PATCH 03/14] BUG: ensure we use group sizes, not group counts, in transform (GH9697) --- doc/source/whatsnew/v0.16.1.txt | 2 ++ pandas/core/groupby.py | 2 +- pandas/tests/test_groupby.py | 13 +++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index bf01d3b21f3fa..5801d1b811790 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -31,3 +31,5 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 73439fb1e535d..6d98b3b99021b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2453,7 +2453,7 @@ def _transform_fast(self, func): if isinstance(func, compat.string_types): func = getattr(self,func) values = func().values - counts = self.count().values + counts = self.size().values values = np.repeat(values, com._ensure_platform_int(counts)) return self._set_result_index_ordered(Series(values)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 79ebb80fc9ebb..e7001eb09f20c 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1058,6 +1058,19 @@ def test_transform_function_aliases(self): expected = self.df.groupby('A')['C'].transform(np.mean) assert_series_equal(result, expected) + def test_transform_length(self): + # GH 9697 + df = pd.DataFrame({'col1':[1,1,2,2], 'col2':[1,2,3,np.nan]}) + expected = pd.Series([3.0]*4) + def nsum(x): + return np.nansum(x) + results = [df.groupby('col1').transform(sum)['col2'], + df.groupby('col1')['col2'].transform(sum), + 
df.groupby('col1').transform(nsum)['col2'], + df.groupby('col1')['col2'].transform(nsum)] + for result in results: + assert_series_equal(result, expected) + def test_with_na(self): index = Index(np.arange(10)) From 25baa7e89d312f353c2e7a32230e7ece534b8c82 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 22 Mar 2015 22:53:56 +0100 Subject: [PATCH 04/14] DOC: correct currenmoduel for API stringmethods Now we use Series.str this is not in the submodule anymore --- doc/source/api.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 58ea517d055a0..33577a5badc54 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -517,8 +517,6 @@ String handling strings and apply several methods to it. These can be acccessed like ``Series.str.``. -.. currentmodule:: pandas.core.strings - .. autosummary:: :toctree: generated/ :template: autosummary/accessor_method.rst From 29377bef771908124c0b3e5f1469fddd9c3b93a7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 22 Mar 2015 17:54:55 -0400 Subject: [PATCH 05/14] DOC: add v0.16.1.txt to whatsnew --- doc/source/whatsnew.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 4f5991bc20ed9..d05c19a5e4bea 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.16.1.txt + .. include:: whatsnew/v0.16.0.txt .. 
include:: whatsnew/v0.15.2.txt From 61086a770615e5a5874254c6153ab36269631fda Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 22 Mar 2015 23:33:59 +0100 Subject: [PATCH 06/14] DOC: add automatic content to 0.16.1 whatsnew file --- doc/source/whatsnew/v0.16.1.txt | 35 +++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index bf01d3b21f3fa..f01bca17fa83a 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -3,30 +3,45 @@ v0.16.1 (April ??, 2015) ------------------------ -This is a minor bug-fix release from 0.16.0 and includes a small number of API changes, several new features, -enhancements, and performance improvements along with a large number of bug fixes. We recommend that all -users upgrade to this version. +This is a minor bug-fix release from 0.16.0 and includes a large number of +bug fixes along with several new features, enhancements, and performance improvements. +We recommend that all users upgrade to this version. + +.. contents:: What's new in v0.16.1 + :local: + :backlinks: none + + +.. _whatsnew_0161.enhancements: + +Enhancements +~~~~~~~~~~~~ + + + + -- :ref:`Enhancements ` -- :ref:`API Changes ` -- :ref:`Performance Improvements ` -- :ref:`Bug Fixes ` .. _whatsnew_0161.api: API changes ~~~~~~~~~~~ -.. _whatsnew_0161.enhancements: -Enhancements -~~~~~~~~~~~~ + + + .. _whatsnew_0161.performance: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ + + + + + .. 
_whatsnew_0161.bug_fixes: Bug Fixes From 3fb908b9fe6daee78ea362c42ad8c9b355cf6784 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 22 Mar 2015 22:28:44 -0700 Subject: [PATCH 07/14] DOC: add DataFrame.assign to API docs --- doc/source/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index 33577a5badc54..b617009fe2f13 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -861,6 +861,7 @@ Combining / joining / merging :toctree: generated/ DataFrame.append + DataFrame.assign DataFrame.join DataFrame.merge DataFrame.update From 9dc9f0caba22fcb925e4124376d9c72327b1ee9e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Mar 2015 09:19:49 +0100 Subject: [PATCH 08/14] DOC: try to use matplotlib style (mpl >= 1.4) in all docs --- doc/source/10min.rst | 8 +++++--- doc/source/categorical.rst | 1 - doc/source/computation.rst | 6 +++++- doc/source/cookbook.rst | 7 ++++++- doc/source/faq.rst | 6 +++++- doc/source/groupby.rst | 8 ++++++-- doc/source/sparse.rst | 4 ---- doc/source/visualization.rst | 10 +++++++--- 8 files changed, 34 insertions(+), 16 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index c98f41973e1ee..1f59c38d75f93 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -12,7 +12,11 @@ from pandas import options import pandas as pd np.set_printoptions(precision=4, suppress=True) - options.display.mpl_style='default' + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + options.display.mpl_style = 'default' options.display.max_rows=15 #### portions of this were borrowed from the @@ -695,8 +699,6 @@ Plotting import matplotlib.pyplot as plt plt.close('all') - from pandas import options - options.display.mpl_style='default' .. 
ipython:: python diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 7fe04af716cec..d03e0fb117c5c 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -13,7 +13,6 @@ from pandas import * import pandas as pd np.set_printoptions(precision=4, suppress=True) - options.display.mpl_style='default' options.display.max_rows=15 diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 759675c51b960..4b0fe39d929a9 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -10,9 +10,13 @@ import pandas.util.testing as tm randn = np.random.randn np.set_printoptions(precision=4, suppress=True) + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + options.display.mpl_style = 'default' import matplotlib.pyplot as plt plt.close('all') - options.display.mpl_style='default' options.display.max_rows=15 Computational tools diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 6a3751cf7a0b8..0e6386955a653 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -17,7 +17,12 @@ np.random.seed(123456) pd.options.display.max_rows=15 - pd.options.display.mpl_style='default' + + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + pd.options.display.mpl_style = 'default' np.set_printoptions(precision=4, suppress=True) diff --git a/doc/source/faq.rst b/doc/source/faq.rst index de88b436198dd..467ec02b55f20 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -21,7 +21,11 @@ Frequently Asked Questions (FAQ) from pandas.tseries.offsets import * import matplotlib.pyplot as plt plt.close('all') - options.display.mpl_style='default' + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + options.display.mpl_style = 'default' from pandas.compat import lrange diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index db19e0de3d788..7ad2641dec52a 100644 --- a/doc/source/groupby.rst +++ 
b/doc/source/groupby.rst @@ -12,7 +12,11 @@ np.set_printoptions(precision=4, suppress=True) import matplotlib.pyplot as plt plt.close('all') - options.display.mpl_style='default' + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + options.display.mpl_style = 'default' from pandas.compat import zip ***************************** @@ -346,7 +350,7 @@ A single group can be selected using ``GroupBy.get_group()``: .. ipython:: python grouped.get_group('bar') - + Or for an object grouped on multiple columns: .. ipython:: python diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index e72ee6b709282..79def066f0710 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -10,9 +10,6 @@ import pandas.util.testing as tm randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - plt.close('all') - options.display.mpl_style='default' options.display.max_rows = 15 ********************** @@ -222,4 +219,3 @@ row and columns coordinates of the matrix. Note that this will consume a signifi ss_dense = SparseSeries.from_coo(A, dense_index=True) ss_dense - diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 852397c355361..9d4cba2e5ee8c 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -31,10 +31,14 @@ We use the standard convention for referencing the matplotlib API: import matplotlib.pyplot as plt -.. versionadded:: 0.11.0 +The plots in this document are made using matplotlib's ``ggplot`` style (new in version 1.4): -The plots in this document are made using matplotlib's ``ggplot`` style (new in version 1.4). -If your version of matplotlib is 1.3 or lower, setting the ``display.mpl_style`` to ``'default'`` +.. 
code-block:: python + + import matplotlib + matplotlib.style.use('ggplot') + +If your version of matplotlib is 1.3 or lower, you can set ``display.mpl_style`` to ``'default'`` with ``pd.options.display.mpl_style = 'default'`` to produce more appealing plots. When set, matplotlib's ``rcParams`` are changed (globally!) to nicer-looking settings. From 03f9af07a080dfaf2d5d8ad541addb00e220c89e Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Wed, 25 Mar 2015 07:41:19 -0400 Subject: [PATCH 09/14] TST: Fix dateutil version check --- pandas/tseries/tests/test_tslib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 79adabafb7044..e452ddee9d8db 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -167,7 +167,7 @@ def test_repr(self): # dateutil zone change (only matters for repr) import dateutil - if dateutil.__version__ >= LooseVersion('2.3') and dateutil.__version__ <= LooseVersion('2.4'): + if dateutil.__version__ >= LooseVersion('2.3') and dateutil.__version__ <= LooseVersion('2.4.0'): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] else: timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/America/Los_Angeles'] From f7c7ee0e92c61870626256eea93f64fda940cb95 Mon Sep 17 00:00:00 2001 From: flying-sheep Date: Tue, 24 Mar 2015 13:47:31 +0100 Subject: [PATCH 10/14] Document how to drop duplicate indices fixes #9708 --- doc/source/indexing.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 5079b4fa8ad6f..1f50a9c85343c 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1137,6 +1137,16 @@ should be taken instead. df2.drop_duplicates(['a','b']) df2.drop_duplicates(['a','b'], take_last=True) +An easier way to drop duplicates on the index than to temporarily forgo it is +``groupby(level=0)`` combined with ``first()`` or ``last()``. 
+ +.. ipython:: python + + df3 = df2.set_index('b') + df3 + df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b') + df3.groupby(level=0).first() + .. _indexing.dictionarylike: Dictionary-like :meth:`~pandas.DataFrame.get` method From beb0812cbcfc75b5776ba64c01c24eb57ebc84c7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Mar 2015 18:55:43 -0400 Subject: [PATCH 11/14] DOC: edit indexing.rst a bit --- doc/source/indexing.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 1f50a9c85343c..fc074802353ee 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1137,16 +1137,17 @@ should be taken instead. df2.drop_duplicates(['a','b']) df2.drop_duplicates(['a','b'], take_last=True) -An easier way to drop duplicates on the index than to temporarily forgo it is -``groupby(level=0)`` combined with ``first()`` or ``last()``. +An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``. .. ipython:: python df3 = df2.set_index('b') df3 - df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b') df3.groupby(level=0).first() + # a bit more verbose + df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b') + .. _indexing.dictionarylike: Dictionary-like :meth:`~pandas.DataFrame.get` method From a4772028fef7f660cf6d94129a1a9a6f3852e441 Mon Sep 17 00:00:00 2001 From: Sam Zhang Date: Sat, 21 Mar 2015 01:26:06 +0000 Subject: [PATCH 12/14] BUG: datetime/timedelta Series quantile() call Changes to be committed: modified: pandas/core/series.py modified: pandas/tests/test_series.py Fixes global reference to iNaT (should be tslib.iNaT) in series._maybe_box. Adds test `test_datetime_timedelta_quantiles` to check for proper return value in test_series.py. 
Issue #9675 --- doc/source/whatsnew/v0.16.1.txt | 13 +++++++++++++ pandas/core/series.py | 2 +- pandas/tests/test_series.py | 5 +++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 1e570768b5a7a..d130879b85475 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -47,4 +47,17 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + + + + - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) + + + + + + + + +- Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7e3b21be13525..68f3a6032402f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2095,7 +2095,7 @@ def _maybe_box(self, func, dropna=False): boxer = com.i8_boxer(self) if len(values) == 0: - return boxer(iNaT) + return boxer(tslib.iNaT) values = values.view('i8') result = func(values) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index ae2ed4eaca2f4..9b5e36974553b 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -6837,6 +6837,11 @@ def test_repeat(self): def test_unique_data_ownership(self): # it works! 
#1807 Series(Series(["a", "c", "b"]).unique()).sort() + + def test_datetime_timedelta_quantiles(self): + # covers #9694 + self.assertTrue(pd.isnull(Series([],dtype='M8[ns]').quantile(.5))) + self.assertTrue(pd.isnull(Series([],dtype='m8[ns]').quantile(.5))) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 64c3921ef3c862fb17a1c5cdc540cedb261b83f4 Mon Sep 17 00:00:00 2001 From: Christopher Kotfila Date: Tue, 11 Nov 2014 08:17:45 -0500 Subject: [PATCH 13/14] Issue with to_latex and MultiIndex column format This is a potential resolution to https://github.com/pydata/pandas/issues/8336 It borrows the same code flow from _get_formatted_index in pandas.core.format --- pandas/core/format.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index b21ca9050ffd0..654601efc8269 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -608,14 +608,20 @@ def get_col_type(dtype): strcols = self._to_str_columns() if self.index and isinstance(self.frame.index, MultiIndex): - clevels = self.frame.columns.nlevels strcols.pop(0) - name = any(self.frame.columns.names) - for i, lev in enumerate(self.frame.index.levels): - lev2 = lev.format(name=name) - width = len(lev2[0]) - lev3 = [' ' * width] * clevels + lev2 - strcols.insert(i, lev3) + + + fmt = self._get_formatter('__index__') + fmt_index = self.frame.index.format(sparsify=self.sparsify, + adjoin=False, + names=False, + formatter=fmt) + + for i, lev in enumerate(fmt_index): + width = len(lev[0]) + lev2 = [width * ' ' if l == '' else l for l in lev] + lev2.insert(0, width * ' ') + strcols.insert(i, lev2) if column_format is None: dtypes = self.frame.dtypes.values From 079b19c5f0b4ea03c690b6cbbabf4f34704c10b3 Mon Sep 17 00:00:00 2001 From: Christopher Kotfila Date: Tue, 11 Nov 2014 09:12:04 -0500 Subject: [PATCH 14/14] Must include MultiIndex Column names --- pandas/core/format.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 654601efc8269..78f06b04dc7fd 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -614,7 +614,7 @@ def get_col_type(dtype): fmt = self._get_formatter('__index__') fmt_index = self.frame.index.format(sparsify=self.sparsify, adjoin=False, - names=False, + names=True, formatter=fmt) for i, lev in enumerate(fmt_index):