From c3064e669faba7f1d9da0f3846ffa9a01432a190 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Fri, 14 Jun 2019 22:40:57 +0530 Subject: [PATCH 01/23] BUG:Sanity check on merge parameters for correct exception #26824 --- pandas/core/reshape/merge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1a80b35629356..259804328eef2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,6 +1089,9 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n + if not self.right_on: + raise ValueError('both left_on and right_on ' + 'should be passed') elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1096,6 +1099,9 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n + if not self.left_on: + raise ValueError('both left_on and right_on ' + 'should be passed') if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From c862ec0a8e4206969474e2847b2771b775e67efe Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 12:17:21 +0530 Subject: [PATCH 02/23] added tests --- pandas/tests/reshape/merge/test_merge.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b487f865b68a4..d632577a29654 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1763,3 +1763,18 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) + + +def test_merge_correct_exception(): + # GH26824 + df1 = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + df2 = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'both left_on and right_on should be passed' + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', left_on='A') From 0df85f03c6aa34a9580f82d429617b7168b85f9c Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 13:26:21 +0530 Subject: [PATCH 03/23] updated tests for better coverage --- pandas/tests/reshape/merge/test_merge.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d632577a29654..a925737d883da 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1765,7 +1765,8 @@ def test_merge_equal_cat_dtypes2(): tm.assert_frame_equal(result, expected, check_categorical=False) -def test_merge_correct_exception(): +@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) +def test_merge_correct_exception(merge_type): # GH26824 df1 = DataFrame({ 'A': [1, 2, 3, 4, 5, 6], @@ -1776,5 +1777,9 @@ def test_merge_correct_exception(): 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] }) msg = 'both left_on and right_on should be passed' - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', left_on='A') + if merge_type == 'left_on': + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', left_on='A') + if merge_type == 'right_on': + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', right_on='A') From dea59fa8f721450e3f133934c1023b45efd6d5a9 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 19:20:23 +0530 Subject: [PATCH 04/23] requested changes on the test --- pandas/tests/reshape/merge/test_merge.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a925737d883da..c384548ba692e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1766,7 +1766,7 @@ def test_merge_equal_cat_dtypes2(): @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) -def test_merge_correct_exception(merge_type): +def test_missing_on_raises(merge_type): # GH26824 df1 = DataFrame({ 'A': [1, 2, 3, 4, 5, 6], @@ -1777,9 +1777,6 @@ def test_merge_correct_exception(merge_type): 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] }) msg = 'both left_on and right_on should be passed' - if merge_type == 'left_on': - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', left_on='A') - if merge_type == 'right_on': - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', right_on='A') + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', **kwargs) From 53aee366401dc0deeffde43091a8df3c617b5270 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 19:42:23 +0530 Subject: [PATCH 05/23] updated whatsnew v0250rst --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 207d16afd350f..5968cf123a894 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -99,6 +99,7 @@ Other Enhancements - Error message for missing required imports now includes the original import error's text (:issue:`23868`) - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) +- :func:`pandas.merge` now raises ``ValueError`` when either of ``left_on`` or ``right_on`` is not provided (:issue:`26855`) .. _whatsnew_0250.api_breaking: From 639694744d7cd14ed6af905398180747ce7ff247 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 20:09:51 +0530 Subject: [PATCH 06/23] requested changes --- pandas/core/reshape/merge.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 259804328eef2..d1fa88e1ab585 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,9 +1089,7 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - if not self.right_on: - raise ValueError('both left_on and right_on ' - 'should be passed') + self.right_on = self.right_on or [None] * len(self.left_on) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1099,9 +1097,7 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - if not self.left_on: - raise ValueError('both left_on and right_on ' - 'should be passed') + self.left_on = self.left_on or [None] * len(self.right_on) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From 11b1894bf9abd781a6408dad8df86ff66bc066e8 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 22:52:39 +0530 Subject: [PATCH 07/23] further changes --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d1fa88e1ab585..0965ca2c2f99a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,7 +1089,7 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - self.right_on = self.right_on or [None] * len(self.left_on) + self.right_on = self.right_on or [None] * (n + 1) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1097,7 +1097,7 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - self.left_on = self.left_on or [None] * len(self.right_on) + self.left_on = self.left_on or [None] * (n + 1) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From 7b4aa22fce94423c4f26dd6c07decbc67e74eb0c Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 22:54:33 +0530 Subject: [PATCH 08/23] updated test --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c384548ba692e..810a87ba30506 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1776,7 +1776,7 @@ def test_missing_on_raises(merge_type): 'A': [1, 2, 4, 5, 7, 8], 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] }) - msg = 'both left_on and right_on should be passed' + msg = 'must equal' kwargs = {merge_type: 'A'} with pytest.raises(ValueError, match=msg): pd.merge(df1, df2, how='left', **kwargs) From 1e2b27662f3771dc5811b17254d51b07c277f7fc Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Fri, 14 Jun 2019 22:40:57 +0530 Subject: [PATCH 09/23] BUG:Sanity check on merge parameters for correct exception #26824 --- pandas/core/reshape/merge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1a80b35629356..259804328eef2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,6 +1089,9 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n + if not self.right_on: + raise ValueError('both left_on and right_on ' + 'should be passed') elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1096,6 +1099,9 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n + if not self.left_on: + raise ValueError('both left_on and right_on ' + 'should be passed') if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From 0f88c32dcc138907bb6e7031ab57e31427263aec Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 12:17:21 +0530 Subject: [PATCH 10/23] added tests --- pandas/tests/reshape/merge/test_merge.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b487f865b68a4..d632577a29654 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1763,3 +1763,18 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) + + +def test_merge_correct_exception(): + # GH26824 + df1 = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + df2 = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'both left_on and right_on should be passed' + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', left_on='A') From a9905bdd2f90e48cf141edf6e9005d70ff98676d Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 13:26:21 +0530 Subject: [PATCH 11/23] updated tests for better coverage --- pandas/tests/reshape/merge/test_merge.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d632577a29654..a925737d883da 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1765,7 +1765,8 @@ def test_merge_equal_cat_dtypes2(): tm.assert_frame_equal(result, expected, check_categorical=False) -def test_merge_correct_exception(): +@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) +def test_merge_correct_exception(merge_type): # GH26824 df1 = DataFrame({ 'A': [1, 2, 3, 4, 5, 6], @@ -1776,5 +1777,9 @@ def test_merge_correct_exception(): 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] }) msg = 'both left_on and right_on should be passed' - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', left_on='A') + if merge_type == 'left_on': + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', left_on='A') + if merge_type == 'right_on': + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', right_on='A') From a939d8e50195b3fb49fc75a472e1fa6cfd846058 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 19:20:23 +0530 Subject: [PATCH 12/23] requested changes on the test --- pandas/tests/reshape/merge/test_merge.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a925737d883da..c384548ba692e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1766,7 +1766,7 @@ def test_merge_equal_cat_dtypes2(): @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) -def test_merge_correct_exception(merge_type): +def test_missing_on_raises(merge_type): # GH26824 df1 = DataFrame({ 'A': [1, 2, 3, 4, 5, 6], @@ -1777,9 +1777,6 @@ def test_merge_correct_exception(merge_type): 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] }) msg = 'both left_on and right_on should be passed' - if merge_type == 'left_on': - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', left_on='A') - if merge_type == 'right_on': - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', right_on='A') + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', **kwargs) From 83f8cda17968d7ef070bb08cc87992435d26c416 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 19:42:23 +0530 Subject: [PATCH 13/23] updated whatsnew v0250rst --- doc/source/whatsnew/v0.25.0.rst | 785 -------------------------------- 1 file changed, 785 deletions(-) delete mode 100644 doc/source/whatsnew/v0.25.0.rst diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst deleted file mode 100644 index b458b0f998255..0000000000000 --- a/doc/source/whatsnew/v0.25.0.rst +++ /dev/null @@ -1,785 +0,0 @@ -.. _whatsnew_0250: - -What's New in 0.25.0 (April XX, 2019) -------------------------------------- - -.. warning:: - - Starting with the 0.25.x series of releases, pandas only supports Python 3.5 and higher. - See :ref:`install.dropping-27` for more details. - -.. warning:: - - `Panel` has been fully removed. For N-D labeled data structures, please - use `xarray `_ - -{{ header }} - -These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog -including other versions of pandas. - - -Enhancements -~~~~~~~~~~~~ - -.. _whatsnew_0250.enhancements.agg_relabel: - -Groupby Aggregation with Relabeling -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has added special groupby behavior, known as "named aggregation", for naming the -output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`). - -.. ipython:: python - - animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) - animals - animals.groupby("kind").agg( - min_height=pd.NamedAgg(column='height', aggfunc='min'), - max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), - ) - -Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` -should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer -what the arguments to the function are, but plain tuples are accepted as well. - -.. ipython:: python - - animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), - average_weight=('weight', np.mean), - ) - -Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" -approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). - -A similar approach is now available for Series groupby objects as well. Because there's no need for -column selection, the values can just be the functions to apply - -.. ipython:: python - - animals.groupby("kind").height.agg( - min_height="min", - max_height="max", - ) - - -This type of aggregation is the recommended alternative to the deprecated behavior when passing -a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). - -See :ref:`groupby.aggregate.named` for more. - - -.. _whatsnew_0250.enhancements.multi_index_repr: - -Better repr for MultiIndex -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Printing of :class:`MultiIndex` instances now shows tuples of each row and ensures -that the tuple items are vertically aligned, so it's now easier to understand -the structure of the ``MultiIndex``. (:issue:`13480`): - -The repr now looks like this: - -.. ipython:: python - - pd.MultiIndex.from_product([['a', 'abc'], range(500)]) - -Previously, outputting a :class:`MultiIndex` printed all the ``levels`` and -``codes`` of the ``MultiIndex``, which was visually unappealing and made -the output more difficult to navigate. For example (limiting the range to 5): - -.. code-block:: ipython - - In [1]: pd.MultiIndex.from_product([['a', 'abc'], range(5)]) - Out[1]: MultiIndex(levels=[['a', 'abc'], [0, 1, 2, 3]], - ...: codes=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]]) - -In the new repr, all values will be shown, if the number of rows is smaller -than :attr:`options.display.max_seq_items` (default: 100 items). Horizontally, -the output will truncate, if it's wider than :attr:`options.display.width` -(default: 80 characters). - - -.. _whatsnew_0250.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ -- :func:`DataFrame.plot` keywords ``logy``, ``logx`` and ``loglog`` can now accept the value ``'sym'`` for symlog scaling. (:issue:`24867`) -- Added support for ISO week year format ('%G-%V-%u') when parsing datetimes using :meth:`to_datetime` (:issue:`16607`) -- Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) -- :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) -- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) -- :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter which is passed to underlying calls to :meth:`DataFrame.groupby` to speed up grouping categorical data. (:issue:`24923`) -- ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) -- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) -- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behavior of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) -- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) -- :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`) -- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) -- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`) -- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) -- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) -- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) -- :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) -- :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -- Error message for missing required imports now includes the original import error's text (:issue:`23868`) -- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) -- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) -- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) - -.. _whatsnew_0250.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew_0250.api_breaking.utc_offset_indexing: - - -Indexing with date strings with UTC offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Indexing a :class:`DataFrame` or :class:`Series` with a :class:`DatetimeIndex` with a -date string with a UTC offset would previously ignore the UTC offset. Now, the UTC offset -is respected in indexing. (:issue:`24076`, :issue:`16785`) - -.. ipython:: python - - df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) - df - -*Previous Behavior*: - -.. code-block:: ipython - - In [3]: df['2019-01-01 00:00:00+04:00':'2019-01-01 01:00:00+04:00'] - Out[3]: - 0 - 2019-01-01 00:00:00-08:00 0 - -*New Behavior*: - -.. ipython:: python - - df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] - - -.. _whatsnew_0250.api_breaking.multi_indexing: - - -``MultiIndex`` constructed from levels and codes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Constructing a :class:`MultiIndex` with ``NaN`` levels or codes value < -1 was allowed previously. -Now, construction with codes value < -1 is not allowed and ``NaN`` levels' corresponding codes -would be reassigned as -1. (:issue:`19387`) - -*Previous Behavior*: - -.. code-block:: ipython - - In [1]: pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - ...: codes=[[0, -1, 1, 2, 3, 4]]) - ...: - Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]) - - In [2]: pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) - Out[2]: MultiIndex(levels=[[1, 2]], - codes=[[0, -2]]) - -*New Behavior*: - -.. ipython:: python - :okexcept: - - pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]) - pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) - - -.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: - -``GroupBy.apply`` on ``DataFrame`` evaluates first group only once -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The implementation of :meth:`DataFrameGroupBy.apply() ` -previously evaluated the supplied function consistently twice on the first group -to infer if it is safe to use a fast code path. Particularly for functions with -side effects, this was an undesired behavior and may have led to surprises. (:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, :issue:`20084`, :issue:`21417`) - -Now every group is evaluated only a single time. - -.. ipython:: python - - df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) - df - - def func(group): - print(group.name) - return group - -*Previous Behavior*: - -.. code-block:: python - - In [3]: df.groupby('a').apply(func) - x - x - y - Out[3]: - a b - 0 x 1 - 1 y 2 - -*New Behavior*: - -.. ipython:: python - - df.groupby("a").apply(func) - - -Concatenating Sparse Values -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When passed DataFrames whose values are sparse, :func:`concat` will now return a -:class:`Series` or :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`25702`). - -.. ipython:: python - - df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) - -*Previous Behavior*: - -.. code-block:: ipython - - In [2]: type(pd.concat([df, df])) - pandas.core.sparse.frame.SparseDataFrame - -*New Behavior*: - -.. ipython:: python - - type(pd.concat([df, df])) - - -This now matches the existing behavior of :class:`concat` on ``Series`` with sparse values. -:func:`concat` will continue to return a ``SparseDataFrame`` when all the values -are instances of ``SparseDataFrame``. - -This change also affects routines using :func:`concat` internally, like :func:`get_dummies`, -which now returns a :class:`DataFrame` in all cases (previously a ``SparseDataFrame`` was -returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwise). - -Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will -cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. - -The ``.str``-accessor performs stricter type checks -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Due to the lack of more fine-grained dtypes, :attr:`Series.str` so far only checked whether the data was -of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* the Series; in particular, -``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`, -:meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`. - -*Previous Behavior*: - -.. code-block:: python - - In [1]: s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object) - - In [2]: s - Out[2]: - 0 b'a' - 1 b'ba' - 2 b'cba' - dtype: object - - In [3]: s.str.startswith(b'a') - Out[3]: - 0 True - 1 False - 2 False - dtype: bool - -*New Behavior*: - -.. ipython:: python - :okexcept: - - s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object) - s - s.str.startswith(b'a') - -.. _whatsnew_0250.api_breaking.incompatible_index_unions: - -Incompatible Index Type Unions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When performing :func:`Index.union` operations between objects of incompatible dtypes, -the result will be a base :class:`Index` of dtype ``object``. This behavior holds true for -unions between :class:`Index` objects that previously would have been prohibited. The dtype -of empty :class:`Index` objects will now be evaluated before performing union operations -rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be -considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). - -*Previous Behavior*: - -.. code-block:: python - - In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) - ... - ValueError: can only call with other PeriodIndex-ed objects - - In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) - Out[2]: Int64Index([1, 2, 3], dtype='int64') - -*New Behavior*: - -.. ipython:: python - - pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) - pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) - -``DataFrame`` groupby ffill/bfill no longer return group labels -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The methods ``ffill``, ``bfill``, ``pad`` and ``backfill`` of -:class:`DataFrameGroupBy ` -previously included the group labels in the return value, which was -inconsistent with other groupby transforms. Now only the filled values -are returned. (:issue:`21521`) - -.. ipython:: python - - df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) - df - -*Previous Behavior*: - -.. code-block:: python - - In [3]: df.groupby("a").ffill() - Out[3]: - a b - 0 x 1 - 1 y 2 - -*New Behavior*: - -.. ipython:: python - - df.groupby("a").ffill() - -``DataFrame`` describe on an empty categorical / object column will return top and freq -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When calling :meth:`DataFrame.describe` with an empty categorical / object -column, the 'top' and 'freq' columns were previously omitted, which was inconsistent with -the output for non-empty columns. Now the 'top' and 'freq' columns will always be included, -with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`) - -.. ipython:: python - - df = pd.DataFrame({"empty_col": pd.Categorical([])}) - df - -*Previous Behavior*: - -.. code-block:: python - - In [3]: df.describe() - Out[3]: - empty_col - count 0 - unique 0 - -*New Behavior*: - -.. ipython:: python - - df.describe() - -``__str__`` methods now call ``__repr__`` rather than vice versa -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has until now mostly defined string representations in a Pandas objects's -``__str__``/``__unicode__``/``__bytes__`` methods, and called ``__str__`` from the ``__repr__`` -method, if a specific ``__repr__`` method is not found. This is not needed for Python3. -In Pandas 0.25, the string representations of Pandas objects are now generally -defined in ``__repr__``, and calls to ``__str__`` in general now pass the call on to -the ``__repr__``, if a specific ``__str__`` method doesn't exist, as is standard for Python. -This change is backward compatible for direct usage of Pandas, but if you subclass -Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, -you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). - -.. _whatsnew_0250.api_breaking.deps: - -Increased minimum versions for dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Due to dropping support for Python 2.7, a number of optional dependencies have updated minimum versions (:issue:`25725`, :issue:`24942`, :issue:`25752`). -Independently, some minimum supported versions of dependencies were updated (:issue:`23519`, :issue:`25554`). -If installed, we now require: - -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| numpy | 1.13.3 | X | -+-----------------+-----------------+----------+ -| pytz | 2015.4 | X | -+-----------------+-----------------+----------+ -| python-dateutil | 2.6.1 | X | -+-----------------+-----------------+----------+ -| bottleneck | 1.2.1 | | -+-----------------+-----------------+----------+ -| numexpr | 2.6.2 | | -+-----------------+-----------------+----------+ -| pytest (dev) | 4.0.2 | | -+-----------------+-----------------+----------+ - -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. - -+-----------------+-----------------+ -| Package | Minimum Version | -+=================+=================+ -| beautifulsoup4 | 4.6.0 | -+-----------------+-----------------+ -| fastparquet | 0.2.1 | -+-----------------+-----------------+ -| matplotlib | 2.2.2 | -+-----------------+-----------------+ -| openpyxl | 2.4.8 | -+-----------------+-----------------+ -| pyarrow | 0.9.0 | -+-----------------+-----------------+ -| pytables | 3.4.2 | -+-----------------+-----------------+ -| scipy | 0.19.0 | -+-----------------+-----------------+ -| sqlalchemy | 1.1.4 | -+-----------------+-----------------+ -| xarray | 0.8.2 | -+-----------------+-----------------+ -| xlrd | 1.1.0 | -+-----------------+-----------------+ -| xlsxwriter | 0.9.8 | -+-----------------+-----------------+ -| xlwt | 1.2.0 | -+-----------------+-----------------+ - -See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. - -.. _whatsnew_0250.api.other: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`) -- :class:`Timestamp` and :class:`Timedelta` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`) -- :meth:`Timestamp.strptime` will now rise a ``NotImplementedError`` (:issue:`25016`) -- Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) -- Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) -- The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) -- The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) -- Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) -- The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) -- Removed support of gtk package for clipboards (:issue:`26563`) - -.. _whatsnew_0250.deprecations: - -Deprecations -~~~~~~~~~~~~ - -Sparse Subclasses -^^^^^^^^^^^^^^^^^ - -The ``SparseSeries`` and ``SparseDataFrame`` subclasses are deprecated. Their functionality is better-provided -by a ``Series`` or ``DataFrame`` with sparse values. - -**Previous Way** - -.. ipython:: python - :okwarning: - - df = pd.SparseDataFrame({"A": [0, 0, 1, 2]}) - df.dtypes - -**New Way** - -.. ipython:: python - - df = pd.DataFrame({"A": pd.SparseArray([0, 0, 1, 2])}) - df.dtypes - -The memory usage of the two approaches is identical. See :ref:`sparse.migration` for more (:issue:`19239`). - -Other Deprecations -^^^^^^^^^^^^^^^^^^ - -- The deprecated ``.ix[]`` indexer now raises a more visible ``FutureWarning`` instead of ``DeprecationWarning`` (:issue:`26438`). -- Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`) -- The :attr:`SparseArray.values` attribute is deprecated. You can use ``np.asarray(...)`` or - the :meth:`SparseArray.to_dense` method instead (:issue:`26421`). -- The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`) -- The :meth:`DataFrame.compound` and :meth:`Series.compound` methods are deprecated and will be removed in a future version (:issue:`26405`). -- The internal attributes ``_start``, ``_stop`` and ``_step`` attributes of :class:`RangeIndex` have been deprecated. - Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`). -- The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. - Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). -- :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) -- :meth:`Series.to_sparse`, :meth:`DataFrame.to_sparse`, :meth:`Series.to_dense` and :meth:`DataFrame.to_dense` are deprecated and will be removed in a future version. (:issue:`26557`). - -.. _whatsnew_0250.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- Removed ``Panel`` (:issue:`25047`, :issue:`25191`, :issue:`25231`) -- Removed the previously deprecated ``sheetname`` keyword in :func:`read_excel` (:issue:`16442`, :issue:`20938`) -- Removed the previously deprecated ``TimeGrouper`` (:issue:`16942`) -- Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) -- Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`) -- Removed the previously deprecated ``convert_objects`` (:issue:`11221`) -- Removed the previously deprecated ``select`` method of ``DataFrame`` and ``Series`` (:issue:`17633`) - -.. _whatsnew_0250.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) -- :meth:`DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) -- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is - int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) -- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) -- Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`, :issue:`26722`) -- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) -- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) -- Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) -- Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtypes (:issue:`25708`) -- Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) -- Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) -- Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) -- Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`) -- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero and float ``NaN``; by faster checking the string for the possibility of being a date (:issue:`25754`) -- Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`) -- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`) -- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`) - -.. _whatsnew_0250.bug_fixes: - -Bug Fixes -~~~~~~~~~ - - -Categorical -^^^^^^^^^^^ - -- Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) -- - -Datetimelike -^^^^^^^^^^^^ - -- Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) -- Bug in :func:`to_datetime` which would raise ``InvalidIndexError: Reindexing only valid with uniquely valued Index objects`` when called with ``cache=True``, with ``arg`` including at least two different elements from the set ``{None, numpy.nan, pandas.NaT}`` (:issue:`22305`) -- Bug in :class:`DataFrame` and :class:`Series` where timezone aware data with ``dtype='datetime64[ns]`` was not cast to naive (:issue:`25843`) -- Improved :class:`Timestamp` type checking in various datetime functions to prevent exceptions when using a subclassed ``datetime`` (:issue:`25851`) -- Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`) -- Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) -- Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`) -- Bug in :func:`to_datetime` which raises unhandled ``OverflowError`` when called with mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``error='coerce'`` (:issue:`25512`) -- Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. (:issue:`26675`) -- Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` -- Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) - -Timedelta -^^^^^^^^^ - -- Bug in :func:`TimedeltaIndex.intersection` where for non-monotonic indices in some cases an empty ``Index`` was returned when in fact an intersection existed (:issue:`25913`) -- Bug with comparisons between :class:`Timedelta` and ``NaT`` raising ``TypeError`` (:issue:`26039`) -- Bug when adding or subtracting a :class:`BusinessHour` to a :class:`Timestamp` with the resulting time landing in a following or prior day respectively (:issue:`26381`) -- Bug when comparing a :class:`TimedeltaIndex` against a zero-dimensional numpy array (:issue:`26689`) - -Timezones -^^^^^^^^^ - -- Bug in :func:`DatetimeIndex.to_frame` where timezone aware data would be converted to timezone naive data (:issue:`25809`) -- Bug in :func:`to_datetime` with ``utc=True`` and datetime strings that would apply previously parsed UTC offsets to subsequent arguments (:issue:`24992`) -- Bug in :func:`Timestamp.tz_localize` and :func:`Timestamp.tz_convert` does not propagate ``freq`` (:issue:`25241`) -- Bug in :func:`Series.at` where setting :class:`Timestamp` with timezone raises ``TypeError`` (:issue:`25506`) -- Bug in :func:`DataFrame.update` when updating with timezone aware data would return timezone naive data (:issue:`25807`) -- Bug in :func:`to_datetime` where an uninformative ``RuntimeError`` was raised when passing a naive :class:`Timestamp` with datetime strings with mixed UTC offsets (:issue:`25978`) -- Bug in :func:`to_datetime` with ``unit='ns'`` would drop timezone information from the parsed argument (:issue:`26168`) -- Bug in :func:`DataFrame.join` where joining a timezone aware index with a timezone aware column would result in a column of ``NaN`` (:issue:`26335`) - -Numeric -^^^^^^^ - -- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`) -- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`) -- Bug in :meth:`to_numeric` in which invalid values for ``errors`` were being allowed (:issue:`26466`) -- Bug in :class:`format` in which floating point complex numbers were not being formatted to proper display precision and trimming (:issue:`25514`) -- Bug in error messages in :meth:`DataFrame.corr` and :meth:`Series.corr`. Added the possibility of using a callable. (:issue:`25729`) -- Bug in :meth:`Series.divmod` and :meth:`Series.rdivmod` which would raise an (incorrect) ``ValueError`` rather than return a pair of :class:`Series` objects as result (:issue:`25557`) -- Raises a helpful exception when a non-numeric index is sent to :meth:`interpolate` with methods which require numeric index. (:issue:`21662`) -- Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`) -- Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`) -- -- - -Conversion -^^^^^^^^^^ - -- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. (:issue:`25905`) -- -- - -Strings -^^^^^^^ - -- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) -- Improved error message when passing :class:`Series` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`) -- - - -Interval -^^^^^^^^ - -- Construction of :class:`Interval` is restricted to numeric, :class:`Timestamp` and :class:`Timedelta` endpoints (:issue:`23013`) -- Fixed bug in :class:`Series`/:class:`DataFrame` not displaying ``NaN`` in :class:`IntervalIndex` with missing values (:issue:`25984`) -- - -Indexing -^^^^^^^^ - -- Improved exception message when calling :meth:`DataFrame.iloc` with a list of non-numeric objects (:issue:`25753`). -- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` where ``KeyError`` was not raised for a ``MultiIndex`` when the key was less than or equal to the number of levels in the :class:`MultiIndex` (:issue:`14885`). -- Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`). -- Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). -- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) -- Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) - - -Missing -^^^^^^^ - -- Fixed misleading exception message in :meth:`Series.interpolate` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`). -- Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`) -- - -MultiIndex -^^^^^^^^^^ - -- Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) -- - -I/O -^^^ - -- Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) -- Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) -- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) -- Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) -- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to :class:`Timestamp`, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`) -- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) -- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) -- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) -- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) -- Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`) -- Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) -- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) -- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) -- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) -- Improved the ``col_space`` parameter in :meth:`DataFrame.to_html` to accept a string so CSS length values can be set correctly (:issue:`25941`) -- Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`) -- Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`) -- Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`) -- Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) -- Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) -- :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`) -- Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) -- :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) -- Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). - -Plotting -^^^^^^^^ - -- Fixed bug where :class:`api.extensions.ExtensionArray` could not be used in matplotlib plotting (:issue:`25587`) -- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- Bug in incorrect ticklabel positions when plotting an index that are non-numeric / non-datetime (:issue:`7612`, :issue:`15912`, :issue:`22334`) -- Fixed bug causing plots of :class:`PeriodIndex` timeseries to fail if the frequency is a multiple of the frequency rule code (:issue:`14763`) -- -- -- - -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) -- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) -- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) -- Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) -- Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) -- Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`) -- Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) -- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) -- Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) -- Bug in :meth:`pandas.core.window.Rolling.count` and ``pandas.core.window.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`) -- Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) -- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) -- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`) -- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) -- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`) - -Reshaping -^^^^^^^^^ - -- Bug in :func:`pandas.merge` adds a string of ``None``, if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`). -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`, :issue:`25009`) -- :func:`to_records` now accepts dtypes to its ``column_dtypes`` parameter (:issue:`24895`) -- Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`) -- Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`) -- Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). -- Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) -- bug in :class:`DataFrame` instantiating with a dict of iterators or generators (e.g. ``pd.DataFrame({'A': reversed(range(3))})``) raised an error (:issue:`26349`). -- Bug in :class:`DataFrame` instantiating with a ``range`` (e.g. ``pd.DataFrame(range(3))``) raised an error (:issue:`26342`). -- Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) -- Bug in :func:`Series.apply` failed when the series is a timezone aware :class:`DatetimeIndex` (:issue:`25959`) -- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`) -- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed ``DataFrame`` is sorted on all levels with the initial level sorted last (:issue:`26053`) -- Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`) -- Bug in :func:`DataFrame.pivot_table` with a :class:`IntervalIndex` as pivot index would raise ``TypeError`` (:issue:`25814`) - -Sparse -^^^^^^ - -- Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) -- Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) -- Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`) -- Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) - -Other -^^^^^ - -- Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) -- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). -- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) - -.. _whatsnew_0.250.contributors: - -Contributors -~~~~~~~~~~~~ - -.. contributors:: v0.24.x..HEAD From 962882d0df69d1b934abf65959bff7af70b5e988 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 20:09:51 +0530 Subject: [PATCH 14/23] requested changes --- pandas/core/reshape/merge.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 259804328eef2..d1fa88e1ab585 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,9 +1089,7 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - if not self.right_on: - raise ValueError('both left_on and right_on ' - 'should be passed') + self.right_on = self.right_on or [None] * len(self.left_on) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1099,9 +1097,7 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - if not self.left_on: - raise ValueError('both left_on and right_on ' - 'should be passed') + self.left_on = self.left_on or [None] * len(self.right_on) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From 139d696ad4496b35d6f79c14bdfc923581bd7584 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 22:52:39 +0530 Subject: [PATCH 15/23] further changes --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d1fa88e1ab585..0965ca2c2f99a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,7 +1089,7 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - self.right_on = self.right_on or [None] * len(self.left_on) + self.right_on = self.right_on or [None] * (n + 1) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1097,7 +1097,7 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - self.left_on = self.left_on or [None] * len(self.right_on) + self.left_on = self.left_on or [None] * (n + 1) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From 4e6bd89d1f282598635f6793396e69612e955a2e Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Sat, 15 Jun 2019 22:54:33 +0530 Subject: [PATCH 16/23] updated test --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c384548ba692e..810a87ba30506 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1776,7 +1776,7 @@ def test_missing_on_raises(merge_type): 'A': [1, 2, 4, 5, 7, 8], 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] }) - msg = 'both left_on and right_on should be passed' + msg = 'must equal' kwargs = {merge_type: 'A'} with pytest.raises(ValueError, match=msg): pd.merge(df1, df2, how='left', **kwargs) From a0680e09f0e71e25750cbb67af06feaa08760ee5 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Mon, 17 Jun 2019 21:20:25 +0530 Subject: [PATCH 17/23] Trying original patch --- pandas/core/reshape/merge.py | 8 ++++-- pandas/tests/reshape/merge/test_merge.py | 33 ++++++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0965ca2c2f99a..1e552a218ecf0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,7 +1089,9 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - self.right_on = self.right_on or [None] * (n + 1) + if not self.right_on: + raise ValueError('both "left_on" and "right_on" ' + 'should be passed') elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1097,7 +1099,9 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - self.left_on = self.left_on or [None] * (n + 1) + if not self.left_on: + raise ValueError('both "left_on" and "right_on" ' + 'should be passed') if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 810a87ba30506..9cb5aaec8b6af 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1026,6 +1026,22 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) + def test_missing_on_raises(self, merge_type): + # GH26824 # PR26855 + left = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + right = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'should be passed' + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(left, right, how='left', **kwargs) + def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = pd.DataFrame({'a': [], 'b': [], 'c': []}) @@ -1763,20 +1779,3 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) - - -@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) -def test_missing_on_raises(merge_type): - # GH26824 - df1 = DataFrame({ - 'A': [1, 2, 3, 4, 5, 6], - 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] - }) - df2 = DataFrame({ - 'A': [1, 2, 4, 5, 7, 8], - 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] - }) - msg = 'must equal' - kwargs = {merge_type: 'A'} - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', **kwargs) From 0a894a979cefa513a82a3f8e824ceedcfad08232 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Wed, 19 Jun 2019 20:51:04 +0530 Subject: [PATCH 18/23] Revert "Trying original patch" This reverts commit 78f7f117aaaae5db03f2bf62f32f884412109899. --- pandas/core/reshape/merge.py | 8 ++---- pandas/tests/reshape/merge/test_merge.py | 33 ++++++++++++------------ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1e552a218ecf0..0965ca2c2f99a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,9 +1089,7 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - if not self.right_on: - raise ValueError('both "left_on" and "right_on" ' - 'should be passed') + self.right_on = self.right_on or [None] * (n + 1) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1099,9 +1097,7 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - if not self.left_on: - raise ValueError('both "left_on" and "right_on" ' - 'should be passed') + self.left_on = self.left_on or [None] * (n + 1) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 9cb5aaec8b6af..810a87ba30506 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1026,22 +1026,6 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) - @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) - def test_missing_on_raises(self, merge_type): - # GH26824 # PR26855 - left = DataFrame({ - 'A': [1, 2, 3, 4, 5, 6], - 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] - }) - right = DataFrame({ - 'A': [1, 2, 4, 5, 7, 8], - 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] - }) - msg = 'should be passed' - kwargs = {merge_type: 'A'} - with pytest.raises(ValueError, match=msg): - pd.merge(left, right, how='left', **kwargs) - def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = pd.DataFrame({'a': [], 'b': [], 'c': []}) @@ -1779,3 +1763,20 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) + + +@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) +def test_missing_on_raises(merge_type): + # GH26824 + df1 = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + df2 = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'must equal' + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', **kwargs) From 0cb88439e037a9039a66ec83e48e0c672eb123a2 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Wed, 19 Jun 2019 21:27:42 +0530 Subject: [PATCH 19/23] Trying original patch --- pandas/core/reshape/merge.py | 8 ++++-- pandas/tests/reshape/merge/test_merge.py | 32 ++++++++++++------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0965ca2c2f99a..1680aa0afb033 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,7 +1089,9 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - self.right_on = self.right_on or [None] * (n + 1) + if self.right_on is None: + raise ValueError('both "left_on" and "right_on" ' + 'should be passed') elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1097,7 +1099,9 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - self.left_on = self.left_on or [None] * (n + 1) + if self.left_on is None: + raise ValueError('both "left_on" and "right_on" ' + 'should be passed') if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 810a87ba30506..c62b80be70331 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1026,6 +1026,22 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) + def test_missing_on_raises(merge_type): + # GH26824 + left = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + right = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'should be passed' + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(left, right, how='left', **kwargs) + def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = pd.DataFrame({'a': [], 'b': [], 'c': []}) @@ -1764,19 +1780,3 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) - -@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) -def test_missing_on_raises(merge_type): - # GH26824 - df1 = DataFrame({ - 'A': [1, 2, 3, 4, 5, 6], - 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] - }) - df2 = DataFrame({ - 'A': [1, 2, 4, 5, 7, 8], - 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] - }) - msg = 'must equal' - kwargs = {merge_type: 'A'} - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', **kwargs) From 4e45c758afe080b7692e8986d90866bfce19eba0 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Wed, 19 Jun 2019 21:34:27 +0530 Subject: [PATCH 20/23] Revert "Trying original patch" This reverts commit 0cb88439e037a9039a66ec83e48e0c672eb123a2. --- pandas/core/reshape/merge.py | 8 ++---- pandas/tests/reshape/merge/test_merge.py | 32 ++++++++++++------------ 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1680aa0afb033..0965ca2c2f99a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,9 +1089,7 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - if self.right_on is None: - raise ValueError('both "left_on" and "right_on" ' - 'should be passed') + self.right_on = self.right_on or [None] * (n + 1) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1099,9 +1097,7 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - if self.left_on is None: - raise ValueError('both "left_on" and "right_on" ' - 'should be passed') + self.left_on = self.left_on or [None] * (n + 1) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c62b80be70331..810a87ba30506 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1026,22 +1026,6 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) - @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) - def test_missing_on_raises(merge_type): - # GH26824 - left = DataFrame({ - 'A': [1, 2, 3, 4, 5, 6], - 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] - }) - right = DataFrame({ - 'A': [1, 2, 4, 5, 7, 8], - 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] - }) - msg = 'should be passed' - kwargs = {merge_type: 'A'} - with pytest.raises(ValueError, match=msg): - pd.merge(left, right, how='left', **kwargs) - def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = pd.DataFrame({'a': [], 'b': [], 'c': []}) @@ -1780,3 +1764,19 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) + +@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) +def test_missing_on_raises(merge_type): + # GH26824 + df1 = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + df2 = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'must equal' + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, how='left', **kwargs) From 500e37440fa71e1a86db50ee4c369d1b10bd7336 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Wed, 19 Jun 2019 22:24:28 +0530 Subject: [PATCH 21/23] Trying original patch --- pandas/core/reshape/merge.py | 8 ++++-- pandas/tests/reshape/merge/test_merge.py | 33 ++++++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0965ca2c2f99a..7eec6cfe3f5d7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1089,7 +1089,9 @@ def _validate_specification(self): raise ValueError('len(left_on) must equal the number ' 'of levels in the index of "right"') self.right_on = [None] * n - self.right_on = self.right_on or [None] * (n + 1) + if not self.right_on: + raise ValueError('both "right_on" and "left_on" ' + 'should be passed') elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1097,7 +1099,9 @@ def _validate_specification(self): raise ValueError('len(right_on) must equal the number ' 'of levels in the index of "left"') self.left_on = [None] * n - self.left_on = self.left_on or [None] * (n + 1) + if not self.left_on: + raise ValueError('both "right_on" and "left_on" ' + 'should be passed') if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 810a87ba30506..f8b101878f130 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1026,6 +1026,22 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + @pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) + def test_missing_on_raises(self, merge_type): + # GH26824 + left = DataFrame({ + 'A': [1, 2, 3, 4, 5, 6], + 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] + }) + right = DataFrame({ + 'A': [1, 2, 4, 5, 7, 8], + 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] + }) + msg = 'must equal' + kwargs = {merge_type: 'A'} + with pytest.raises(ValueError, match=msg): + pd.merge(left, right, how='left', **kwargs) + def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = pd.DataFrame({'a': [], 'b': [], 'c': []}) @@ -1763,20 +1779,3 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) - - -@pytest.mark.parametrize('merge_type', ['left_on', 'right_on']) -def test_missing_on_raises(merge_type): - # GH26824 - df1 = DataFrame({ - 'A': [1, 2, 3, 4, 5, 6], - 'B': ['P', 'Q', 'R', 'S', 'T', 'U'] - }) - df2 = DataFrame({ - 'A': [1, 2, 4, 5, 7, 8], - 'C': ['L', 'M', 'N', 'O', 'P', 'Q'] - }) - msg = 'must equal' - kwargs = {merge_type: 'A'} - with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, how='left', **kwargs) From 7770b1daa574575323cb6a46e69e6a66835cadf4 Mon Sep 17 00:00:00 2001 From: Harshit Saxena Date: Wed, 19 Jun 2019 23:12:45 +0530 Subject: [PATCH 22/23] further changes --- pandas/core/reshape/merge.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 251327f0ea760..7eec6cfe3f5d7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1092,7 +1092,6 @@ def _validate_specification(self): if not self.right_on: raise ValueError('both "right_on" and "left_on" ' 'should be passed') - self.right_on = self.right_on or [None] * (n + 1) elif self.right_on is not None: n = len(self.right_on) if self.left_index: @@ -1103,7 +1102,6 @@ def _validate_specification(self): if not self.left_on: raise ValueError('both "right_on" and "left_on" ' 'should be passed') - self.left_on = self.left_on or [None] * (n + 1) if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") From 6b7f5a2ac9647a7ef16a6bc6ff456c7764440051 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 25 Jun 2019 12:18:29 -0500 Subject: [PATCH 23/23] re-revert --- doc/source/whatsnew/v0.25.0.rst | 34 ++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ccfb30fe5ad7a..01aa9ad97c541 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -74,6 +74,36 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca See :ref:`groupby.aggregate.named` for more. +.. _whatsnew_0250.enhancements.multi_index_repr: + +Better repr for MultiIndex +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Printing of :class:`MultiIndex` instances now shows tuples of each row and ensures +that the tuple items are vertically aligned, so it's now easier to understand +the structure of the ``MultiIndex``. (:issue:`13480`): + +The repr now looks like this: + +.. ipython:: python + + pd.MultiIndex.from_product([['a', 'abc'], range(500)]) + +Previously, outputting a :class:`MultiIndex` printed all the ``levels`` and +``codes`` of the ``MultiIndex``, which was visually unappealing and made +the output more difficult to navigate. For example (limiting the range to 5): + +.. code-block:: ipython + + In [1]: pd.MultiIndex.from_product([['a', 'abc'], range(5)]) + Out[1]: MultiIndex(levels=[['a', 'abc'], [0, 1, 2, 3]], + ...: codes=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]]) + +In the new repr, all values will be shown, if the number of rows is smaller +than :attr:`options.display.max_seq_items` (default: 100 items). Horizontally, +the output will truncate, if it's wider than :attr:`options.display.width` +(default: 80 characters). + .. _whatsnew_0250.enhancements.other: Other Enhancements @@ -413,7 +443,7 @@ If installed, we now require: | pytest (dev) | 4.0.2 | | +-----------------+-----------------+----------+ -For `optional libraries `_ the general recommendation is to use the latest version. +For `optional libraries `_ the general recommendation is to use the latest version. The following table lists the lowest version per library that is currently being tested throughout the development of pandas. Optional libraries below the lowest tested version may still work, but are not considered supported. @@ -505,6 +535,8 @@ Other Deprecations - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) +- :meth:`Series.to_sparse`, :meth:`DataFrame.to_sparse`, :meth:`Series.to_dense` and :meth:`DataFrame.to_dense` are deprecated and will be removed in a future version. (:issue:`26557`). + .. _whatsnew_0250.prior_deprecations: