From 12591d1a7571e924baf576c455c3a5f608f3b92c Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Tue, 11 Jun 2019 22:22:12 +0200
Subject: [PATCH 01/18] DOC: add single dtype to NDFrame.to_sql
---
pandas/core/generic.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 19d093dd29457..25e766784e2f1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2493,10 +2493,11 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
chunksize : int, optional
Rows will be written in batches of this size at a time. By default,
all rows will be written at once.
- dtype : dict, optional
- Specifying the datatype for columns. The keys should be the column
- names and the values should be the SQLAlchemy types or strings for
- the sqlite3 legacy mode.
+ dtype : dict or a SQLAchemy type, optional
+ Specifying the datatype for column(s). If a dictionary is used, the keys
+ should be the column names and the values should be the SQLAlchemy types
+ or strings for the sqlite3 legacy mode. If all columns are of the same
+ type, one single value can be used.
method : {None, 'multi', callable}, default None
Controls the SQL insertion clause used:
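The two forms documented above can be exercised as follows. A minimal sketch, assuming an in-memory SQLite engine and a throwaway DataFrame (neither is part of the patch):

    import pandas as pd
    import sqlalchemy

    engine = sqlalchemy.create_engine("sqlite://")  # in-memory database
    df = pd.DataFrame({"a": ["x", "y"], "b": ["1", "2"]})

    # dict form: map column names to SQLAlchemy types
    df.to_sql("demo_dict", engine,
              dtype={"a": sqlalchemy.types.Text, "b": sqlalchemy.types.Text})

    # single-type form: one SQLAlchemy type applied to every column
    df.to_sql("demo_single", engine, dtype=sqlalchemy.types.Text)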
From 4397fc7376ec5fda3cda5b038c836a6f1c7bbf48 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Tue, 11 Jun 2019 22:31:50 +0200
Subject: [PATCH 02/18] Now passes flake8
---
pandas/core/generic.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 25e766784e2f1..271cbaf81db03 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2494,10 +2494,10 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
Rows will be written in batches of this size at a time. By default,
all rows will be written at once.
dtype : dict or a SQLAchemy type, optional
- Specifying the datatype for column(s). If a dictionary is used, the keys
- should be the column names and the values should be the SQLAlchemy types
- or strings for the sqlite3 legacy mode. If all columns are of the same
- type, one single value can be used.
+ Specifying the datatype for columns. If a dictionary is used, the
+ keys should be the column names and the values should be the
+ SQLAlchemy types or strings for the sqlite3 legacy mode. If all
+ columns are of the same type, one single value can be used.
method : {None, 'multi', callable}, default None
Controls the SQL insertion clause used:
From e9b09c6c79efd85a7e9571923e6a5ec1058e38a7 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Thu, 13 Jun 2019 21:57:24 +0200
Subject: [PATCH 03/18] Update pandas/core/generic.py
Co-Authored-By: William Ayd
---
pandas/core/generic.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 271cbaf81db03..f36936b36dc6f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2493,7 +2493,7 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
chunksize : int, optional
Rows will be written in batches of this size at a time. By default,
all rows will be written at once.
- dtype : dict or a SQLAchemy type, optional
+ dtype : dict or scalar, optional
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
SQLAlchemy types or strings for the sqlite3 legacy mode. If all
From d7990f285734b2378b26706c70dc64c21ec11332 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Thu, 13 Jun 2019 21:57:34 +0200
Subject: [PATCH 04/18] Update pandas/core/generic.py
Co-Authored-By: William Ayd
---
pandas/core/generic.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index f36936b36dc6f..509389bb3fce0 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2497,7 +2497,7 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
SQLAlchemy types or strings for the sqlite3 legacy mode. If all
- columns are of the same type, one single value can be used.
+ If a scalar is provided it will be applied to all columns.
method : {None, 'multi', callable}, default None
Controls the SQL insertion clause used:
From 47bafad47930ec5a003dd4b27989e5565a4b886d Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Thu, 13 Jun 2019 22:05:26 +0200
Subject: [PATCH 05/18] Type annotations for pandas.core.generic.to_sql()
---
pandas/core/generic.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 509389bb3fce0..d59484b94465b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6,7 +6,7 @@
import operator
import pickle
from textwrap import dedent
-from typing import Callable, FrozenSet, List, Set
+from typing import Any, Callable, FrozenSet, List, Optional, Set, Union
import warnings
import weakref
@@ -2458,8 +2458,12 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
**kwargs)
- def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
- index_label=None, chunksize=None, dtype=None, method=None):
+ def to_sql(self, name: str,
+ con,
+ schema: Optional[str]=None, if_exists: str='fail', index: bool=True,
+ index_label: Optional[Union[str, List[str]]]=None,
+ chunksize: Optional[int]=None, dtype: Union[dict]=None,
+ method: Union[str, Callable]=None):
"""
Write records stored in a DataFrame to a SQL database.
@@ -2496,8 +2500,8 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
dtype : dict or scalar, optional
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
- SQLAlchemy types or strings for the sqlite3 legacy mode. If all
- If a scalar is provided it will be applied to all columns.
+ SQLAlchemy types or strings for the sqlite3 legacy mode. If a
+ scalar is provided, it will be applied to all columns.
method : {None, 'multi', callable}, default None
Controls the SQL insertion clause used:
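For context on the annotations introduced above: a parameter whose default is None is conventionally spelled Optional[...], which is shorthand for Union[..., None]. A minimal illustration, not taken from the patch:

    from typing import Callable, List, Optional, Union

    # Optional[X] is equivalent to Union[X, None]
    def to_sql_sketch(name: str,
                      schema: Optional[str] = None,
                      index_label: Optional[Union[str, List[str]]] = None,
                      chunksize: Optional[int] = None,
                      method: Optional[Union[str, Callable]] = None) -> None:
        ...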
From 53a364f8120976b50415cfd10ee17c40c00a8560 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Thu, 13 Jun 2019 22:06:33 +0200
Subject: [PATCH 06/18] Aligned documentation for pandas.io.sql.to_sql()
---
pandas/io/sql.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 1e3fe2ade6ab7..92aa87df43a5c 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -419,10 +419,11 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
chunksize : int, default None
If not None, then rows will be written in batches of this size at a
time. If None, all rows will be written at once.
- dtype : single SQLtype or dict of column name to SQL type, default None
- Optional specifying the datatype for columns. The SQL type should
- be a SQLAlchemy type, or a string for sqlite3 fallback connection.
- If all columns are of the same type, one single value can be used.
+ dtype : dict or scalar, optional
+ Specifying the datatype for columns. If a dictionary is used, the
+ keys should be the column names and the values should be the
+ SQLAlchemy types or strings for the sqlite3 legacy mode. If a
+ scalar is provided, it will be applied to all columns.
method : {None, 'multi', callable}, default None
Controls the SQL insertion clause used:
From d535914b548fb0def2126a940fa6b2e87c307c1e Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Thu, 13 Jun 2019 22:10:35 +0200
Subject: [PATCH 07/18] Now passes flake8
---
pandas/core/generic.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index d59484b94465b..3a3bc5be82d1a 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6,7 +6,7 @@
import operator
import pickle
from textwrap import dedent
-from typing import Any, Callable, FrozenSet, List, Optional, Set, Union
+from typing import Callable, FrozenSet, List, Optional, Set, Union
import warnings
import weakref
@@ -2458,12 +2458,12 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
**kwargs)
- def to_sql(self, name: str,
- con,
- schema: Optional[str]=None, if_exists: str='fail', index: bool=True,
- index_label: Optional[Union[str, List[str]]]=None,
- chunksize: Optional[int]=None, dtype: Union[dict]=None,
- method: Union[str, Callable]=None):
+ def to_sql(self, name: str, con,
+ schema: Optional[str] = None, if_exists: str = 'fail',
+ index: bool = True,
+ index_label: Optional[Union[str, List[str]]] = None,
+ chunksize: Optional[int] = None, dtype: Union[dict] = None,
+ method: Union[str, Callable] = None):
"""
Write records stored in a DataFrame to a SQL database.
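The reformatting above follows PEP 8: when a parameter carries both an annotation and a default, the ``=`` takes surrounding spaces, while an unannotated default takes none. A small sketch of the rule flake8 enforces here (not from the patch):

    # unannotated default: no spaces around "="
    def f(schema=None):
        ...

    # annotated default: spaces around "=" are required
    def g(schema: str = None):
        ...

    # annotated default without spaces is what flake8 flags
    def h(schema: str=None):
        ...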
From 680ed87ccedaa0ec64121badff7801d348013bdd Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Sat, 15 Jun 2019 14:11:03 +0200
Subject: [PATCH 08/18] Completed type annotations for generic.NDFrame.to_sql()
---
pandas/core/generic.py | 27 +++++++++++++++++----------
1 file changed, 17 insertions(+), 10 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3a3bc5be82d1a..262aea9d8650b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6,7 +6,7 @@
import operator
import pickle
from textwrap import dedent
-from typing import Callable, FrozenSet, List, Optional, Set, Union
+from typing import Any, Callable, Dict, FrozenSet, Iterator, List, Set, Union
import warnings
import weakref
@@ -34,6 +34,7 @@
from pandas.core.dtypes.missing import isna, notna
import pandas as pd
+from pandas._typing import Dtype
from pandas.core import missing, nanops
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
@@ -48,8 +49,12 @@
from pandas.io.formats.format import DataFrameFormatter, format_percentiles
from pandas.io.formats.printing import pprint_thing
+from pandas.io.sql import SQLTable
from pandas.tseries.frequencies import to_offset
+# mypy confuses the ``bool()`` method of NDFrame
+_bool = bool
+
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = dict()
@@ -2459,11 +2464,13 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
**kwargs)
def to_sql(self, name: str, con,
- schema: Optional[str] = None, if_exists: str = 'fail',
- index: bool = True,
- index_label: Optional[Union[str, List[str]]] = None,
- chunksize: Optional[int] = None, dtype: Union[dict] = None,
- method: Union[str, Callable] = None):
+ schema: str = None, if_exists: str = 'fail',
+ index: _bool = True, index_label: Union[str, List[str]] = None,
+ chunksize: int = None,
+ dtype: Union[Dict[str, Dtype], Dtype] = None,
+ method: Union[str, Callable[[SQLTable, Any, List[str],
+ Iterator[List]], None]] = None
+ ) -> None:
"""
Write records stored in a DataFrame to a SQL database.
@@ -2472,12 +2479,12 @@ def to_sql(self, name: str, con,
Parameters
----------
- name : string
+ name : str
Name of SQL table.
con : sqlalchemy.engine.Engine or sqlite3.Connection
Using SQLAlchemy makes it possible to use any DB supported by that
library. Legacy support is provided for sqlite3.Connection objects.
- schema : string, optional
+ schema : str, optional
Specify the schema (if database flavor supports this). If None, use
default schema.
if_exists : {'fail', 'replace', 'append'}, default 'fail'
@@ -2490,7 +2497,7 @@ def to_sql(self, name: str, con,
index : bool, default True
Write DataFrame index as a column. Uses `index_label` as the column
name in the table.
- index_label : string or sequence, default None
+ index_label : string or sequence, optional
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
A sequence should be given if the DataFrame uses MultiIndex.
@@ -2502,7 +2509,7 @@ def to_sql(self, name: str, con,
keys should be the column names and the values should be the
SQLAlchemy types or strings for the sqlite3 legacy mode. If a
scalar is provided, it will be applied to all columns.
- method : {None, 'multi', callable}, default None
+ method : {None, 'multi', callable}, optional
Controls the SQL insertion clause used:
* None : Uses standard SQL ``INSERT`` clause (one per row).
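The module-level ``_bool = bool`` alias added above exists because NDFrame defines a ``bool()`` method of its own, so a plain ``bool`` annotation written inside the class body can be resolved against that method rather than the builtin, which trips up mypy. A stripped-down sketch of the pattern (class and method names are illustrative, not pandas code):

    _bool = bool  # keep a reference to the builtin before the class body


    class FrameLike:
        def bool(self):
            """Method that shadows the builtin name inside the class."""
            ...

        # a plain `bool` annotation here could be read as the method above;
        # the alias keeps the annotation pointing at the builtin type
        def to_sql(self, index: _bool = True) -> None:
            ...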
From ac443710e0bf72bb43ac4ed4d69f36455e3888ec Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Sat, 15 Jun 2019 14:13:25 +0200
Subject: [PATCH 09/18] Follow docstring guide for parameter types of
pandas.io.sql
---
pandas/io/sql.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 92aa87df43a5c..64a26e0986ea5 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -396,14 +396,14 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
Parameters
----------
frame : DataFrame, Series
- name : string
+ name : str
Name of SQL table.
con : SQLAlchemy connectable(engine/connection) or database string URI
or sqlite3 DBAPI2 connection
Using SQLAlchemy makes it possible to use any DB supported by that
library.
If a DBAPI2 object, only sqlite3 is supported.
- schema : string, default None
+ schema : str, optional
Name of SQL schema in database to write to (if database flavor
supports this). If None, use default schema (default).
if_exists : {'fail', 'replace', 'append'}, default 'fail'
@@ -412,19 +412,19 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
- append: If table exists, insert data. Create if does not exist.
index : boolean, default True
Write DataFrame index as a column.
- index_label : string or sequence, default None
+ index_label : str or sequence, optional
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
A sequence should be given if the DataFrame uses MultiIndex.
- chunksize : int, default None
+ chunksize : int, optional
If not None, then rows will be written in batches of this size at a
time. If None, all rows will be written at once.
dtype : dict or scalar, optional
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
- SQLAlchemy types or strings for the sqlite3 legacy mode. If a
+ SQLAlchemy types or strings for the sqlite3 fallback mode. If a
scalar is provided, it will be applied to all columns.
- method : {None, 'multi', callable}, default None
+ method : {None, 'multi', callable}, optional
Controls the SQL insertion clause used:
- None : Uses standard SQL ``INSERT`` clause (one per row).
From 52a6782ab7f6abc6118085de11d579c8651b6588 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Sat, 15 Jun 2019 15:20:32 +0200
Subject: [PATCH 10/18] Avoid circular import
---
pandas/core/generic.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 262aea9d8650b..00eee9ecda58a 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -49,7 +49,6 @@
from pandas.io.formats.format import DataFrameFormatter, format_percentiles
from pandas.io.formats.printing import pprint_thing
-from pandas.io.sql import SQLTable
from pandas.tseries.frequencies import to_offset
# mypy confuses the ``bool()`` method of NDFrame
@@ -2463,12 +2462,15 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
**kwargs)
+ # TODO: Replace `Callable[[Any, Any, ...` when SQLTable and sqlalchemy
+ # can be imported. SQLTable can't be imported due to circular import.
+ # sqlalchemy can't be imported since it's an optional dependency.
def to_sql(self, name: str, con,
schema: str = None, if_exists: str = 'fail',
index: _bool = True, index_label: Union[str, List[str]] = None,
chunksize: int = None,
dtype: Union[Dict[str, Dtype], Dtype] = None,
- method: Union[str, Callable[[SQLTable, Any, List[str],
+ method: Union[str, Callable[[Any, Any, List[str],
Iterator[List]], None]] = None
) -> None:
"""
From 4cfee2939c4a2d3138b49afd37695cc8e3991d82 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Sat, 15 Jun 2019 15:36:59 +0200
Subject: [PATCH 11/18] Have one parameter per line
Also added Any to con: for consistency.
---
pandas/core/generic.py | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 00eee9ecda58a..be4614d21c114 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2462,12 +2462,17 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
**kwargs)
- # TODO: Replace `Callable[[Any, Any, ...` when SQLTable and sqlalchemy
- # can be imported. SQLTable can't be imported due to circular import.
- # sqlalchemy can't be imported since it's an optional dependency.
- def to_sql(self, name: str, con,
- schema: str = None, if_exists: str = 'fail',
- index: _bool = True, index_label: Union[str, List[str]] = None,
+ # TODO: Replace `con: Any` and `Callable[[Any, Any, ...` when SQLTable
+ # and sqlalchemy can be imported. SQLTable can't be imported due to
+ # circular import. sqlalchemy can't be imported since it's an optional
+ # dependency.
+ def to_sql(self,
+ name: str,
+ con: Any,
+ schema: str = None,
+ if_exists: str = 'fail',
+ index: _bool = True,
+ index_label: Union[str, List[str]] = None,
chunksize: int = None,
dtype: Union[Dict[str, Dtype], Dtype] = None,
method: Union[str, Callable[[Any, Any, List[str],
From 00d6a25555d9da31c4643d423050b0953d72ab05 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Sat, 15 Jun 2019 17:22:33 +0200
Subject: [PATCH 12/18] Closes #11415 reported by @martinburch
---
pandas/core/generic.py | 4 ++--
pandas/io/sql.py | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index be4614d21c114..e1de54d32e19b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2509,8 +2509,8 @@ def to_sql(self,
`index` is True, then the index names are used.
A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
- Rows will be written in batches of this size at a time. By default,
- all rows will be written at once.
+ Specify the number of rows in each batch to be written at a time.
+ By default, all rows will be written at once.
dtype : dict or scalar, optional
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 64a26e0986ea5..fb4afafaeca22 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -417,8 +417,8 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
`index` is True, then the index names are used.
A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
- If not None, then rows will be written in batches of this size at a
- time. If None, all rows will be written at once.
+ Specify the number of rows in each batch to be written at a time.
+ By default, all rows will be written at once.
dtype : dict or scalar, optional
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
From 13633beecc4821a8cba844d620edc8a1fa13e2e6 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Sat, 15 Jun 2019 18:25:55 +0200
Subject: [PATCH 13/18] Use only _typing.Dtype for dtype=
---
pandas/core/generic.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index e1de54d32e19b..e90bb10f1bbd8 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6,7 +6,7 @@
import operator
import pickle
from textwrap import dedent
-from typing import Any, Callable, Dict, FrozenSet, Iterator, List, Set, Union
+from typing import Any, Callable, FrozenSet, Iterator, List, Set, Union
import warnings
import weakref
@@ -2474,7 +2474,7 @@ def to_sql(self,
index: _bool = True,
index_label: Union[str, List[str]] = None,
chunksize: int = None,
- dtype: Union[Dict[str, Dtype], Dtype] = None,
+ dtype: Dtype = None,
method: Union[str, Callable[[Any, Any, List[str],
Iterator[List]], None]] = None
) -> None:
From 82ebc5c6bafe67affd6b6f15bee46bf59a88d0b8 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Mon, 17 Jun 2019 07:59:00 +0200
Subject: [PATCH 14/18] Add type checking only imports
---
pandas/core/generic.py | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index e90bb10f1bbd8..06f9f83625ff4 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6,7 +6,8 @@
import operator
import pickle
from textwrap import dedent
-from typing import Any, Callable, FrozenSet, Iterator, List, Set, Union
+from typing import (
+ TYPE_CHECKING, Callable, FrozenSet, Iterator, List, Set, Union)
import warnings
import weakref
@@ -51,6 +52,12 @@
from pandas.io.formats.printing import pprint_thing
from pandas.tseries.frequencies import to_offset
+if TYPE_CHECKING:
+ import sqlalchemy # noqa: F401
+ import sqlite3 # noqa: F401
+
+ from pandas.io.sql import SQLTable # noqa: F401
+
# mypy confuses the ``bool()`` method of NDFrame
_bool = bool
@@ -2468,15 +2475,19 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
# dependency.
def to_sql(self,
name: str,
- con: Any,
+ con: Union['sqlalchemy.engine.Engine', 'sqlite3.Connection'],
schema: str = None,
if_exists: str = 'fail',
index: _bool = True,
index_label: Union[str, List[str]] = None,
chunksize: int = None,
dtype: Dtype = None,
- method: Union[str, Callable[[Any, Any, List[str],
- Iterator[List]], None]] = None
+ method: Union[str,
+ Callable[['SQLTable',
+ Union['sqlalchemy.engine.Engine',
+ 'sqlite3.Connection'],
+ List[str], Iterator[List]], None]
+ ] = None
) -> None:
"""
Write records stored in a DataFrame to a SQL database.
@@ -2521,7 +2532,7 @@ def to_sql(self,
* None : Uses standard SQL ``INSERT`` clause (one per row).
* 'multi': Pass multiple values in a single ``INSERT`` clause.
- * callable with signature ``(pd_table, conn, keys, data_iter)``.
+ * callable with signature ``(pd_table, con, keys, data_iter)``.
Details and a sample callable implementation can be found in the
section :ref:`insert method `.
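The callable signature referenced above, ``(pd_table, con, keys, data_iter)``, can be satisfied by a user-defined insertion function. A minimal sketch that mirrors the default SQLAlchemy insert path and adds a row count; the function name and the print call are assumptions for illustration, not pandas behaviour:

    def insert_with_count(pd_table, con, keys, data_iter):
        # pd_table is the pandas SQLTable wrapper; pd_table.table is the
        # underlying SQLAlchemy Table (assumes the SQLAlchemy flavor,
        # not the sqlite3 fallback)
        data = [dict(zip(keys, row)) for row in data_iter]
        con.execute(pd_table.table.insert(), data)
        print("wrote {} rows to {}".format(len(data), pd_table.name))

    # usage: df.to_sql("demo", engine, method=insert_with_count)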
From 9eee1cfbe08f18befb73c4766c32ea7f7fc10ad4 Mon Sep 17 00:00:00 2001
From: Oğuzhan Öğreden
Date: Thu, 29 Aug 2019 11:36:56 +0200
Subject: [PATCH 15/18] merge upstream/master
---
.github/FUNDING.yml | 1 +
.github/PULL_REQUEST_TEMPLATE.md | 1 +
.github/SECURITY.md | 1 +
.gitignore | 3 +
.pre-commit-config.yaml | 17 +
.travis.yml | 13 +-
Makefile | 10 +-
README.md | 4 +-
asv_bench/benchmarks/algorithms.py | 146 +-
asv_bench/benchmarks/attrs_caching.py | 6 +-
asv_bench/benchmarks/binary_ops.py | 51 +-
asv_bench/benchmarks/categoricals.py | 125 +-
asv_bench/benchmarks/ctors.py | 55 +-
asv_bench/benchmarks/dtypes.py | 32 +-
asv_bench/benchmarks/eval.py | 31 +-
asv_bench/benchmarks/frame_ctor.py | 26 +-
asv_bench/benchmarks/frame_methods.py | 249 +-
asv_bench/benchmarks/gil.py | 136 +-
asv_bench/benchmarks/groupby.py | 515 +-
.../benchmarks/index_cached_properties.py | 75 +
asv_bench/benchmarks/index_object.py | 93 +-
asv_bench/benchmarks/indexing.py | 186 +-
asv_bench/benchmarks/indexing_engines.py | 59 +-
asv_bench/benchmarks/inference.py | 75 +-
asv_bench/benchmarks/io/csv.py | 322 +-
asv_bench/benchmarks/io/excel.py | 18 +-
asv_bench/benchmarks/io/hdf.py | 106 +-
asv_bench/benchmarks/io/json.py | 240 +-
asv_bench/benchmarks/io/msgpack.py | 17 +-
asv_bench/benchmarks/io/parsers.py | 22 +-
asv_bench/benchmarks/io/pickle.py | 13 +-
asv_bench/benchmarks/io/sas.py | 20 +-
asv_bench/benchmarks/io/sql.py | 161 +-
asv_bench/benchmarks/io/stata.py | 38 +-
asv_bench/benchmarks/join_merge.py | 274 +-
asv_bench/benchmarks/multiindex_object.py | 84 +-
asv_bench/benchmarks/offset.py | 75 +-
asv_bench/benchmarks/pandas_vb_common.py | 39 +-
asv_bench/benchmarks/period.py | 92 +-
asv_bench/benchmarks/plotting.py | 47 +-
asv_bench/benchmarks/reindex.py | 70 +-
asv_bench/benchmarks/replace.py | 46 +-
asv_bench/benchmarks/reshape.py | 166 +-
asv_bench/benchmarks/rolling.py | 80 +-
asv_bench/benchmarks/series_methods.py | 149 +-
asv_bench/benchmarks/sparse.py | 49 +-
asv_bench/benchmarks/stat_ops.py | 47 +-
asv_bench/benchmarks/strings.py | 84 +-
asv_bench/benchmarks/timedelta.py | 59 +-
asv_bench/benchmarks/timeseries.py | 213 +-
asv_bench/benchmarks/timestamp.py | 35 +-
azure-pipelines.yml | 32 +-
ci/azure/posix.yml | 13 +-
ci/azure/windows.yml | 9 +-
ci/build_docs.sh | 56 -
ci/check_git_tags.sh | 28 +
ci/code_checks.sh | 17 +-
ci/deps/azure-35-compat.yaml | 3 +-
ci/deps/azure-36-32bit.yaml | 20 +
ci/deps/azure-36-locale.yaml | 5 +-
ci/deps/azure-36-locale_slow.yaml | 1 +
ci/deps/azure-37-locale.yaml | 7 +-
ci/deps/azure-37-numpydev.yaml | 5 +-
ci/deps/azure-macos-35.yaml | 8 +-
ci/deps/azure-windows-36.yaml | 5 +-
ci/deps/azure-windows-37.yaml | 7 +-
ci/deps/travis-36-cov.yaml | 7 +-
ci/deps/travis-36-doc.yaml | 46 -
ci/deps/travis-36-locale.yaml | 20 +-
ci/deps/travis-36-slow.yaml | 4 +-
ci/deps/travis-37.yaml | 4 +-
ci/print_skipped.py | 33 +-
ci/run_tests.sh | 11 +-
ci/setup_env.sh | 6 +
codecov.yml | 4 +-
doc/logo/pandas_logo.py | 16 +-
doc/make.py | 266 +-
doc/source/conf.py | 348 +-
doc/source/development/contributing.rst | 215 +-
.../development/contributing_docstring.rst | 4 +-
doc/source/development/developer.rst | 60 +-
doc/source/development/extending.rst | 77 +-
doc/source/development/index.rst | 1 +
doc/source/development/internals.rst | 2 +-
doc/source/development/roadmap.rst | 193 +
doc/source/ecosystem.rst | 19 +-
doc/source/getting_started/10min.rst | 44 +-
doc/source/getting_started/basics.rst | 61 +-
.../comparison/comparison_with_r.rst | 14 +-
.../comparison/comparison_with_sas.rst | 38 +-
.../comparison/comparison_with_stata.rst | 46 +-
doc/source/getting_started/dsintro.rst | 67 +-
doc/source/getting_started/overview.rst | 10 +-
doc/source/getting_started/tutorials.rst | 8 +-
doc/source/index.rst.template | 4 +-
doc/source/install.rst | 40 +-
doc/source/reference/arrays.rst | 19 +-
doc/source/reference/extensions.rst | 36 +-
doc/source/reference/frame.rst | 15 +-
doc/source/reference/groupby.rst | 2 +-
doc/source/reference/index.rst | 2 +-
doc/source/reference/indexing.rst | 25 +-
doc/source/reference/io.rst | 11 +-
doc/source/reference/offset_frequency.rst | 2 +-
doc/source/reference/resampling.rst | 2 +-
doc/source/reference/series.rst | 28 +-
doc/source/reference/style.rst | 10 +-
doc/source/reference/window.rst | 7 +-
.../themes/nature_with_gtoc/layout.html | 18 +-
doc/source/user_guide/advanced.rst | 88 +-
doc/source/user_guide/categorical.rst | 30 +-
doc/source/user_guide/computation.rst | 36 +-
doc/source/user_guide/cookbook.rst | 30 +-
doc/source/user_guide/enhancingperf.rst | 48 +-
doc/source/user_guide/gotchas.rst | 4 +-
doc/source/user_guide/groupby.rst | 40 +-
doc/source/user_guide/indexing.rst | 120 +-
doc/source/user_guide/integer_na.rst | 2 +-
doc/source/user_guide/io.rst | 324 +-
doc/source/user_guide/merging.rst | 28 +-
doc/source/user_guide/missing_data.rst | 15 +-
doc/source/user_guide/options.rst | 44 +-
doc/source/user_guide/reshaping.rst | 78 +-
doc/source/user_guide/sparse.rst | 8 +-
doc/source/user_guide/style.ipynb | 16 +-
doc/source/user_guide/text.rst | 29 +-
doc/source/user_guide/timedeltas.rst | 6 +-
doc/source/user_guide/timeseries.rst | 113 +-
doc/source/user_guide/visualization.rst | 75 +-
doc/source/whatsnew/index.rst | 18 +
doc/source/whatsnew/v0.10.0.rst | 8 +-
doc/source/whatsnew/v0.10.1.rst | 4 +-
doc/source/whatsnew/v0.11.0.rst | 54 +-
doc/source/whatsnew/v0.12.0.rst | 8 +-
doc/source/whatsnew/v0.13.0.rst | 16 +-
doc/source/whatsnew/v0.13.1.rst | 6 +-
doc/source/whatsnew/v0.14.0.rst | 16 +-
doc/source/whatsnew/v0.14.1.rst | 4 +-
doc/source/whatsnew/v0.15.0.rst | 27 +-
doc/source/whatsnew/v0.15.1.rst | 2 +-
doc/source/whatsnew/v0.15.2.rst | 15 +-
doc/source/whatsnew/v0.16.0.rst | 48 +-
doc/source/whatsnew/v0.16.1.rst | 14 +-
doc/source/whatsnew/v0.16.2.rst | 8 +-
doc/source/whatsnew/v0.17.0.rst | 50 +-
doc/source/whatsnew/v0.17.1.rst | 6 +-
doc/source/whatsnew/v0.18.0.rst | 34 +-
doc/source/whatsnew/v0.18.1.rst | 20 +-
doc/source/whatsnew/v0.19.0.rst | 20 +-
doc/source/whatsnew/v0.19.1.rst | 4 +-
doc/source/whatsnew/v0.19.2.rst | 4 +-
doc/source/whatsnew/v0.20.0.rst | 96 +-
doc/source/whatsnew/v0.20.2.rst | 6 +-
doc/source/whatsnew/v0.20.3.rst | 2 +-
doc/source/whatsnew/v0.21.0.rst | 68 +-
doc/source/whatsnew/v0.21.1.rst | 10 +-
doc/source/whatsnew/v0.22.0.rst | 6 +-
doc/source/whatsnew/v0.23.0.rst | 78 +-
doc/source/whatsnew/v0.23.1.rst | 16 +-
doc/source/whatsnew/v0.23.2.rst | 12 +-
doc/source/whatsnew/v0.23.3.rst | 2 +-
doc/source/whatsnew/v0.23.4.rst | 10 +-
doc/source/whatsnew/v0.24.0.rst | 134 +-
doc/source/whatsnew/v0.24.1.rst | 12 +-
doc/source/whatsnew/v0.24.2.rst | 10 +-
doc/source/whatsnew/v0.25.0.rst | 607 +-
doc/source/whatsnew/v0.25.1.rst | 119 +
doc/source/whatsnew/v0.25.2.rst | 109 +
doc/source/whatsnew/v0.4.x.rst | 4 +-
doc/source/whatsnew/v0.5.0.rst | 4 +-
doc/source/whatsnew/v0.6.0.rst | 4 +-
doc/source/whatsnew/v0.7.0.rst | 4 +-
doc/source/whatsnew/v0.7.3.rst | 10 +-
doc/source/whatsnew/v0.8.0.rst | 2 +-
doc/source/whatsnew/v1.0.0.rst | 217 +
doc/sphinxext/announce.py | 48 +-
doc/sphinxext/contributors.py | 25 +-
environment.yml | 6 +-
mypy.ini | 9 -
pandas/__init__.py | 206 +-
pandas/_config/__init__.py | 21 +-
pandas/_config/config.py | 153 +-
pandas/_config/dates.py | 12 +-
pandas/_config/display.py | 11 +-
pandas/_config/localization.py | 22 +-
pandas/_libs/__init__.py | 9 +-
pandas/_libs/algos.pyx | 25 -
pandas/_libs/algos_take_helper.pxi.in | 8 +-
pandas/_libs/groupby.pyx | 7 +-
pandas/_libs/groupby_helper.pxi.in | 12 +-
pandas/_libs/hashtable.pxd | 2 +-
pandas/_libs/hashtable.pyx | 4 +-
pandas/_libs/hashtable_class_helper.pxi.in | 10 +-
pandas/_libs/hashtable_func_helper.pxi.in | 2 +-
pandas/_libs/index.pyx | 17 +-
pandas/_libs/internals.pyx | 2 +-
pandas/_libs/interval.pyx | 53 +
pandas/_libs/join.pyx | 20 +-
pandas/_libs/lib.pxd | 1 +
pandas/_libs/lib.pyx | 184 +-
pandas/_libs/missing.pyx | 7 +-
pandas/_libs/parsers.pyx | 29 +-
pandas/_libs/reduction.pyx | 2 +-
pandas/_libs/reshape.pyx | 63 +-
pandas/_libs/src/klib/khash_python.h | 2 +-
pandas/_libs/src/parser/tokenizer.c | 43 +-
pandas/_libs/src/parser/tokenizer.h | 18 +-
pandas/_libs/src/ujson/lib/ultrajson.h | 7 -
pandas/_libs/src/ujson/lib/ultrajsonenc.c | 6 +
pandas/_libs/src/ujson/python/objToJSON.c | 381 +-
pandas/_libs/tslib.pyx | 4 +-
pandas/_libs/tslibs/__init__.py | 6 +-
pandas/_libs/tslibs/c_timestamp.pyx | 18 +-
pandas/_libs/tslibs/conversion.pyx | 9 +
pandas/_libs/tslibs/fields.pyx | 12 +-
pandas/_libs/tslibs/nattype.pyx | 99 +-
pandas/_libs/tslibs/period.pyx | 13 +-
pandas/_libs/tslibs/timedeltas.pyx | 56 +-
pandas/_libs/tslibs/tzconversion.pyx | 4 +
pandas/_libs/tslibs/util.pxd | 3 +-
pandas/_libs/window.pyx | 90 +-
pandas/_typing.py | 43 +-
pandas/_version.py | 123 +-
pandas/api/__init__.py | 2 +-
pandas/api/extensions/__init__.py | 19 +-
pandas/api/types/__init__.py | 17 +-
pandas/arrays/__init__.py | 25 +-
pandas/compat/__init__.py | 46 +-
pandas/compat/_optional.py | 22 +-
pandas/compat/chainmap.py | 7 -
pandas/compat/numpy/__init__.py | 52 +-
pandas/compat/numpy/function.py | 292 +-
pandas/compat/pickle_compat.py | 141 +-
pandas/conftest.py | 380 +-
pandas/core/accessor.py | 66 +-
pandas/core/algorithms.py | 629 +-
pandas/core/api.py | 23 +-
pandas/core/apply.py | 164 +-
pandas/core/arrays/__init__.py | 26 +-
pandas/core/arrays/_ranges.py | 73 +-
pandas/core/arrays/array_.py | 276 -
pandas/core/arrays/base.py | 249 +-
pandas/core/arrays/categorical.py | 839 +--
pandas/core/arrays/datetimelike.py | 440 +-
pandas/core/arrays/datetimes.py | 741 ++-
pandas/core/arrays/integer.py | 377 +-
pandas/core/arrays/interval.py | 471 +-
pandas/core/arrays/numpy_.py | 201 +-
pandas/core/arrays/period.py | 343 +-
pandas/core/arrays/sparse.py | 587 +-
pandas/core/arrays/timedeltas.py | 266 +-
pandas/core/base.py | 422 +-
pandas/core/common.py | 85 +-
pandas/core/computation/align.py | 40 +-
pandas/core/computation/check.py | 5 +-
pandas/core/computation/common.py | 4 +-
pandas/core/computation/engines.py | 37 +-
pandas/core/computation/eval.py | 114 +-
pandas/core/computation/expr.py | 356 +-
pandas/core/computation/expressions.py | 157 +-
pandas/core/computation/ops.py | 247 +-
pandas/core/computation/pytables.py | 213 +-
pandas/core/computation/scope.py | 103 +-
pandas/core/config_init.py | 362 +-
pandas/core/construction.py | 549 ++
pandas/core/dtypes/api.py | 55 +-
pandas/core/dtypes/base.py | 32 +-
pandas/core/dtypes/cast.py | 525 +-
pandas/core/dtypes/common.py | 340 +-
pandas/core/dtypes/concat.py | 272 +-
pandas/core/dtypes/dtypes.py | 318 +-
pandas/core/dtypes/generic.py | 126 +-
pandas/core/dtypes/inference.py | 82 +-
pandas/core/dtypes/missing.py | 139 +-
pandas/core/frame.py | 2770 +++++----
pandas/core/generic.py | 3886 +++++++-----
pandas/core/groupby/__init__.py | 5 +-
pandas/core/groupby/base.py | 164 +-
pandas/core/groupby/categorical.py | 15 +-
pandas/core/groupby/generic.py | 713 ++-
pandas/core/groupby/groupby.py | 725 ++-
pandas/core/groupby/grouper.py | 253 +-
pandas/core/groupby/ops.py | 405 +-
pandas/core/index.py | 28 +-
pandas/core/indexers.py | 236 +
pandas/core/indexes/accessors.py | 112 +-
pandas/core/indexes/api.py | 83 +-
pandas/core/indexes/base.py | 1778 +++---
pandas/core/indexes/category.py | 296 +-
pandas/core/indexes/datetimelike.py | 234 +-
pandas/core/indexes/datetimes.py | 677 +-
pandas/core/indexes/frozen.py | 27 +-
pandas/core/indexes/interval.py | 846 +--
pandas/core/indexes/multi.py | 1277 ++--
pandas/core/indexes/numeric.py | 234 +-
pandas/core/indexes/period.py | 443 +-
pandas/core/indexes/range.py | 291 +-
pandas/core/indexes/timedeltas.py | 325 +-
pandas/core/indexing.py | 1200 ++--
pandas/core/internals/__init__.py | 40 +-
pandas/core/internals/arrays.py | 55 -
pandas/core/internals/blocks.py | 1943 +++---
pandas/core/internals/concat.py | 161 +-
pandas/core/internals/construction.py | 429 +-
pandas/core/internals/managers.py | 800 +--
pandas/core/missing.py | 417 +-
pandas/core/nanops.py | 325 +-
pandas/core/ops.py | 2354 -------
pandas/core/ops/__init__.py | 1234 ++++
pandas/core/ops/array_ops.py | 128 +
pandas/core/ops/docstrings.py | 675 ++
pandas/core/ops/invalid.py | 61 +
pandas/core/ops/methods.py | 249 +
pandas/core/ops/missing.py | 194 +
pandas/core/ops/roperator.py | 61 +
pandas/core/panel.py | 1576 -----
pandas/core/resample.py | 543 +-
pandas/core/reshape/concat.py | 269 +-
pandas/core/reshape/melt.py | 97 +-
pandas/core/reshape/merge.py | 989 +--
pandas/core/reshape/pivot.py | 300 +-
pandas/core/reshape/reshape.py | 349 +-
pandas/core/reshape/tile.py | 186 +-
pandas/core/reshape/util.py | 9 +-
pandas/core/series.py | 1624 +++--
pandas/core/sorting.py | 148 +-
pandas/core/sparse/frame.py | 455 +-
pandas/core/sparse/scipy_sparse.py | 51 +-
pandas/core/sparse/series.py | 390 +-
pandas/core/strings.py | 920 +--
pandas/core/tools/datetimes.py | 522 +-
pandas/core/tools/numeric.py | 50 +-
pandas/core/tools/timedeltas.py | 57 +-
pandas/core/util/hashing.py | 142 +-
pandas/core/window.py | 2657 --------
pandas/core/window/__init__.py | 3 +
pandas/core/window/common.py | 276 +
pandas/core/window/ewm.py | 388 ++
pandas/core/window/expanding.py | 260 +
pandas/core/window/rolling.py | 1939 ++++++
pandas/errors/__init__.py | 11 +-
pandas/io/clipboard/__init__.py | 44 +-
pandas/io/clipboard/clipboards.py | 64 +-
pandas/io/clipboard/exceptions.py | 1 -
pandas/io/clipboard/windows.py | 49 +-
pandas/io/clipboards.py | 64 +-
pandas/io/common.py | 335 +-
pandas/io/date_converters.py | 15 +-
pandas/io/excel/__init__.py | 2 +-
pandas/io/excel/_base.py | 389 +-
pandas/io/excel/_odfreader.py | 180 +
pandas/io/excel/_openpyxl.py | 165 +-
pandas/io/excel/_util.py | 43 +-
pandas/io/excel/_xlrd.py | 38 +-
pandas/io/excel/_xlsxwriter.py | 240 +-
pandas/io/excel/_xlwt.py | 59 +-
pandas/io/feather_format.py | 67 +-
pandas/io/formats/console.py | 17 +-
pandas/io/formats/css.py | 156 +-
pandas/io/formats/csvs.py | 196 +-
pandas/io/formats/excel.py | 456 +-
pandas/io/formats/format.py | 1056 ++--
pandas/io/formats/html.py | 366 +-
pandas/io/formats/latex.py | 168 +-
pandas/io/formats/printing.py | 330 +-
pandas/io/formats/style.py | 451 +-
pandas/io/gbq.py | 81 +-
pandas/io/gcs.py | 13 +-
pandas/io/html.py | 303 +-
pandas/io/json/__init__.py | 15 +-
pandas/io/json/{json.py => _json.py} | 662 +-
.../io/json/{normalize.py => _normalize.py} | 187 +-
.../{table_schema.py => _table_schema.py} | 158 +-
pandas/io/msgpack/__init__.py | 4 +-
pandas/io/msgpack/_packer.pyi | 22 +
pandas/io/msgpack/_packer.pyx | 2 +-
pandas/io/msgpack/_unpacker.pyi | 59 +
pandas/io/msgpack/_unpacker.pyx | 19 +-
pandas/io/msgpack/exceptions.py | 1 -
pandas/io/packers.py | 783 +--
pandas/io/parquet.py | 162 +-
pandas/io/parsers.py | 1708 +++---
pandas/io/pickle.py | 18 +-
pandas/io/pytables.py | 2210 ++++---
pandas/io/s3.py | 30 +-
pandas/io/sas/sas7bdat.py | 326 +-
pandas/io/sas/sas_constants.py | 140 +-
pandas/io/sas/sas_xport.py | 190 +-
pandas/io/sas/sasreader.py | 52 +-
pandas/io/spss.py | 15 +-
pandas/io/sql.py | 679 +-
pandas/io/stata.py | 1263 ++--
pandas/plotting/__init__.py | 109 +-
pandas/plotting/_core.py | 1554 +++--
pandas/plotting/_matplotlib/__init__.py | 90 +-
pandas/plotting/_matplotlib/boxplot.py | 260 +-
pandas/plotting/_matplotlib/compat.py | 12 +-
pandas/plotting/_matplotlib/converter.py | 443 +-
pandas/plotting/_matplotlib/core.py | 663 +-
pandas/plotting/_matplotlib/hist.py | 321 +-
pandas/plotting/_matplotlib/misc.py | 179 +-
pandas/plotting/_matplotlib/style.py | 26 +-
pandas/plotting/_matplotlib/timeseries.py | 157 +-
pandas/plotting/_matplotlib/tools.py | 132 +-
pandas/plotting/_misc.py | 179 +-
pandas/testing.py | 5 +-
pandas/tests/api/test_api.py | 231 +-
pandas/tests/api/test_types.py | 72 +-
pandas/tests/arithmetic/conftest.py | 156 +-
pandas/tests/arithmetic/test_datetime64.py | 1869 +++---
pandas/tests/arithmetic/test_numeric.py | 692 ++-
pandas/tests/arithmetic/test_object.py | 193 +-
pandas/tests/arithmetic/test_period.py | 723 ++-
pandas/tests/arithmetic/test_timedelta64.py | 1185 ++--
pandas/tests/arrays/categorical/common.py | 6 +-
pandas/tests/arrays/categorical/test_algos.py | 78 +-
.../arrays/categorical/test_analytics.py | 144 +-
pandas/tests/arrays/categorical/test_api.py | 286 +-
.../arrays/categorical/test_constructors.py | 293 +-
.../tests/arrays/categorical/test_dtypes.py | 140 +-
.../tests/arrays/categorical/test_indexing.py | 173 +-
.../tests/arrays/categorical/test_missing.py | 42 +-
.../arrays/categorical/test_operators.py | 247 +-
pandas/tests/arrays/categorical/test_repr.py | 116 +-
.../tests/arrays/categorical/test_sorting.py | 38 +-
.../tests/arrays/categorical/test_subclass.py | 13 +-
.../tests/arrays/categorical/test_warnings.py | 12 +-
pandas/tests/arrays/interval/test_interval.py | 73 +-
pandas/tests/arrays/interval/test_ops.py | 56 +-
pandas/tests/arrays/sparse/test_accessor.py | 94 +-
.../tests/arrays/sparse/test_arithmetics.py | 594 +-
pandas/tests/arrays/sparse/test_array.py | 472 +-
pandas/tests/arrays/sparse/test_dtype.py | 182 +-
pandas/tests/arrays/sparse/test_libsparse.py | 266 +-
pandas/tests/arrays/test_array.py | 381 +-
pandas/tests/arrays/test_datetimelike.py | 268 +-
pandas/tests/arrays/test_datetimes.py | 217 +-
pandas/tests/arrays/test_integer.py | 384 +-
pandas/tests/arrays/test_numpy.py | 93 +-
pandas/tests/arrays/test_period.py | 194 +-
pandas/tests/arrays/test_timedeltas.py | 93 +-
pandas/tests/computation/test_compat.py | 15 +-
pandas/tests/computation/test_eval.py | 1209 ++--
pandas/tests/config/test_config.py | 384 +-
pandas/tests/config/test_localization.py | 8 +-
.../dtypes/cast/test_construct_from_scalar.py | 6 +-
.../dtypes/cast/test_construct_ndarray.py | 17 +-
.../dtypes/cast/test_construct_object_arr.py | 6 +-
.../tests/dtypes/cast/test_convert_objects.py | 7 +-
pandas/tests/dtypes/cast/test_downcast.py | 30 +-
.../dtypes/cast/test_find_common_type.py | 154 +-
.../dtypes/cast/test_infer_datetimelike.py | 13 +-
pandas/tests/dtypes/cast/test_infer_dtype.py | 93 +-
pandas/tests/dtypes/cast/test_promote.py | 862 +++
pandas/tests/dtypes/cast/test_upcast.py | 90 +-
pandas/tests/dtypes/test_common.py | 499 +-
pandas/tests/dtypes/test_concat.py | 100 +-
pandas/tests/dtypes/test_dtypes.py | 711 ++-
pandas/tests/dtypes/test_generic.py | 38 +-
pandas/tests/dtypes/test_inference.py | 1025 ++--
pandas/tests/dtypes/test_missing.py | 348 +-
.../extension/arrow/{bool.py => arrays.py} | 90 +-
pandas/tests/extension/arrow/test_bool.py | 20 +-
pandas/tests/extension/arrow/test_string.py | 13 +
pandas/tests/extension/base/__init__.py | 10 +-
pandas/tests/extension/base/base.py | 5 +-
pandas/tests/extension/base/constructors.py | 5 +-
pandas/tests/extension/base/dtype.py | 27 +-
pandas/tests/extension/base/getitem.py | 64 +-
pandas/tests/extension/base/groupby.py | 53 +-
pandas/tests/extension/base/interface.py | 32 +-
pandas/tests/extension/base/io.py | 13 +-
pandas/tests/extension/base/methods.py | 153 +-
pandas/tests/extension/base/missing.py | 55 +-
pandas/tests/extension/base/ops.py | 21 +-
pandas/tests/extension/base/printing.py | 6 +-
pandas/tests/extension/base/reduce.py | 13 +-
pandas/tests/extension/base/reshaping.py | 208 +-
pandas/tests/extension/base/setitem.py | 46 +-
pandas/tests/extension/conftest.py | 27 +-
pandas/tests/extension/decimal/__init__.py | 5 +-
pandas/tests/extension/decimal/array.py | 60 +-
.../tests/extension/decimal/test_decimal.py | 192 +-
pandas/tests/extension/json/__init__.py | 2 +-
pandas/tests/extension/json/array.py | 53 +-
pandas/tests/extension/json/test_json.py | 84 +-
pandas/tests/extension/test_categorical.py | 34 +-
pandas/tests/extension/test_common.py | 31 +-
pandas/tests/extension/test_datetime.py | 106 +-
pandas/tests/extension/test_external_block.py | 34 +-
pandas/tests/extension/test_integer.py | 59 +-
pandas/tests/extension/test_interval.py | 12 +-
pandas/tests/extension/test_numpy.py | 66 +-
pandas/tests/extension/test_period.py | 30 +-
pandas/tests/extension/test_sparse.py | 103 +-
pandas/tests/frame/common.py | 114 +-
pandas/tests/frame/conftest.py | 83 +-
pandas/tests/frame/test_alter_axes.py | 1317 ++--
pandas/tests/frame/test_analytics.py | 2017 +++---
pandas/tests/frame/test_api.py | 277 +-
pandas/tests/frame/test_apply.py | 1013 +--
pandas/tests/frame/test_arithmetic.py | 322 +-
pandas/tests/frame/test_asof.py | 79 +-
.../tests/frame/test_axis_select_reindex.py | 972 +--
pandas/tests/frame/test_block_internals.py | 464 +-
pandas/tests/frame/test_combine_concat.py | 802 +--
pandas/tests/frame/test_constructors.py | 2021 +++---
pandas/tests/frame/test_convert_to.py | 634 +-
pandas/tests/frame/test_dtypes.py | 1253 ++--
pandas/tests/frame/test_duplicates.py | 307 +-
pandas/tests/frame/test_explode.py | 120 +
pandas/tests/frame/test_indexing.py | 2454 ++++----
pandas/tests/frame/test_join.py | 183 +-
pandas/tests/frame/test_missing.py | 766 ++-
pandas/tests/frame/test_mutate_columns.py | 228 +-
pandas/tests/frame/test_nonunique_indexes.py | 497 +-
pandas/tests/frame/test_operators.py | 539 +-
pandas/tests/frame/test_period.py | 101 +-
pandas/tests/frame/test_quantile.py | 424 +-
pandas/tests/frame/test_query_eval.py | 711 +--
pandas/tests/frame/test_rank.py | 176 +-
pandas/tests/frame/test_replace.py | 1157 ++--
pandas/tests/frame/test_repr_info.py | 317 +-
pandas/tests/frame/test_reshape.py | 1021 +--
.../frame/test_sort_values_level_as_str.py | 71 +-
pandas/tests/frame/test_sorting.py | 622 +-
pandas/tests/frame/test_subclass.py | 586 +-
pandas/tests/frame/test_timeseries.py | 590 +-
pandas/tests/frame/test_timezones.py | 176 +-
pandas/tests/frame/test_to_csv.py | 929 +--
pandas/tests/frame/test_validate.py | 19 +-
pandas/tests/generic/test_frame.py | 220 +-
pandas/tests/generic/test_generic.py | 356 +-
.../generic/test_label_or_level_utils.py | 99 +-
pandas/tests/generic/test_series.py | 151 +-
.../tests/groupby/aggregate/test_aggregate.py | 436 +-
pandas/tests/groupby/aggregate/test_cython.py | 216 +-
pandas/tests/groupby/aggregate/test_other.py | 604 +-
pandas/tests/groupby/conftest.py | 106 +-
pandas/tests/groupby/test_apply.py | 451 +-
pandas/tests/groupby/test_bin_groupby.py | 69 +-
pandas/tests/groupby/test_categorical.py | 1120 ++--
pandas/tests/groupby/test_counting.py | 128 +-
pandas/tests/groupby/test_filters.py | 338 +-
pandas/tests/groupby/test_function.py | 1364 +++--
pandas/tests/groupby/test_groupby.py | 1216 ++--
pandas/tests/groupby/test_grouping.py | 675 +-
pandas/tests/groupby/test_index_as_string.py | 70 +-
pandas/tests/groupby/test_nth.py | 524 +-
pandas/tests/groupby/test_rank.py | 566 +-
pandas/tests/groupby/test_timegrouper.py | 851 +--
pandas/tests/groupby/test_transform.py | 823 ++-
pandas/tests/groupby/test_value_counts.py | 48 +-
pandas/tests/groupby/test_whitelist.py | 371 +-
pandas/tests/indexes/common.py | 215 +-
pandas/tests/indexes/conftest.py | 44 +-
pandas/tests/indexes/datetimelike.py | 17 +-
.../indexes/datetimes/test_arithmetic.py | 93 +-
pandas/tests/indexes/datetimes/test_astype.py | 283 +-
.../indexes/datetimes/test_construction.py | 842 ++-
.../indexes/datetimes/test_date_range.py | 706 ++-
.../tests/indexes/datetimes/test_datetime.py | 221 +-
.../indexes/datetimes/test_datetimelike.py | 9 +-
.../tests/indexes/datetimes/test_formats.py | 264 +-
.../tests/indexes/datetimes/test_indexing.py | 605 +-
pandas/tests/indexes/datetimes/test_misc.py | 297 +-
.../tests/indexes/datetimes/test_missing.py | 82 +-
pandas/tests/indexes/datetimes/test_ops.py | 312 +-
.../indexes/datetimes/test_partial_slicing.py | 394 +-
.../indexes/datetimes/test_scalar_compat.py | 272 +-
pandas/tests/indexes/datetimes/test_setops.py | 263 +-
.../tests/indexes/datetimes/test_timezones.py | 964 +--
pandas/tests/indexes/datetimes/test_tools.py | 2089 ++++---
pandas/tests/indexes/interval/test_astype.py | 139 +-
.../indexes/interval/test_construction.py | 259 +-
.../tests/indexes/interval/test_interval.py | 844 ++-
.../indexes/interval/test_interval_new.py | 310 +-
.../indexes/interval/test_interval_range.py | 229 +-
.../indexes/interval/test_interval_tree.py | 117 +-
pandas/tests/indexes/interval/test_setops.py | 61 +-
pandas/tests/indexes/multi/conftest.py | 59 +-
pandas/tests/indexes/multi/test_analytics.py | 203 +-
pandas/tests/indexes/multi/test_astype.py | 8 +-
pandas/tests/indexes/multi/test_compat.py | 10 +-
.../tests/indexes/multi/test_constructor.py | 481 +-
pandas/tests/indexes/multi/test_contains.py | 66 +-
pandas/tests/indexes/multi/test_conversion.py | 164 +-
pandas/tests/indexes/multi/test_copy.py | 35 +-
pandas/tests/indexes/multi/test_drop.py | 86 +-
pandas/tests/indexes/multi/test_duplicates.py | 156 +-
.../tests/indexes/multi/test_equivalence.py | 38 +-
pandas/tests/indexes/multi/test_format.py | 184 +-
pandas/tests/indexes/multi/test_get_set.py | 154 +-
pandas/tests/indexes/multi/test_indexing.py | 224 +-
pandas/tests/indexes/multi/test_integrity.py | 127 +-
pandas/tests/indexes/multi/test_join.py | 50 +-
pandas/tests/indexes/multi/test_missing.py | 61 +-
pandas/tests/indexes/multi/test_monotonic.py | 131 +-
pandas/tests/indexes/multi/test_names.py | 57 +-
.../indexes/multi/test_partial_indexing.py | 46 +-
pandas/tests/indexes/multi/test_reindex.py | 43 +-
pandas/tests/indexes/multi/test_reshape.py | 100 +-
pandas/tests/indexes/multi/test_set_ops.py | 77 +-
pandas/tests/indexes/multi/test_sorting.py | 140 +-
.../tests/indexes/period/test_arithmetic.py | 87 +-
pandas/tests/indexes/period/test_asfreq.py | 203 +-
pandas/tests/indexes/period/test_astype.py | 78 +-
.../tests/indexes/period/test_construction.py | 398 +-
pandas/tests/indexes/period/test_formats.py | 191 +-
pandas/tests/indexes/period/test_indexing.py | 502 +-
pandas/tests/indexes/period/test_ops.py | 220 +-
.../indexes/period/test_partial_slicing.py | 121 +-
pandas/tests/indexes/period/test_period.py | 358 +-
.../tests/indexes/period/test_period_range.py | 70 +-
.../indexes/period/test_scalar_compat.py | 10 +-
pandas/tests/indexes/period/test_setops.py | 381 +-
pandas/tests/indexes/period/test_tools.py | 322 +-
pandas/tests/indexes/test_base.py | 1816 +++---
pandas/tests/indexes/test_category.py | 709 ++-
pandas/tests/indexes/test_common.py | 90 +-
pandas/tests/indexes/test_frozen.py | 4 +-
pandas/tests/indexes/test_numeric.py | 549 +-
pandas/tests/indexes/test_numpy_compat.py | 68 +-
pandas/tests/indexes/test_range.py | 452 +-
pandas/tests/indexes/test_setops.py | 67 +-
.../indexes/timedeltas/test_arithmetic.py | 185 +-
.../tests/indexes/timedeltas/test_astype.py | 79 +-
.../indexes/timedeltas/test_construction.py | 145 +-
.../tests/indexes/timedeltas/test_formats.py | 108 +-
.../tests/indexes/timedeltas/test_indexing.py | 265 +-
pandas/tests/indexes/timedeltas/test_ops.py | 152 +-
.../timedeltas/test_partial_slicing.py | 59 +-
.../indexes/timedeltas/test_scalar_compat.py | 50 +-
.../tests/indexes/timedeltas/test_setops.py | 98 +-
.../indexes/timedeltas/test_timedelta.py | 189 +-
.../timedeltas/test_timedelta_range.py | 51 +-
pandas/tests/indexes/timedeltas/test_tools.py | 156 +-
pandas/tests/indexing/common.py | 158 +-
pandas/tests/indexing/conftest.py | 27 +-
.../tests/indexing/interval/test_interval.py | 226 +-
.../indexing/interval/test_interval_new.py | 110 +-
pandas/tests/indexing/multiindex/conftest.py | 23 +-
.../multiindex/test_chaining_and_caching.py | 29 +-
.../indexing/multiindex/test_datetime.py | 8 +-
.../tests/indexing/multiindex/test_getitem.py | 210 +-
pandas/tests/indexing/multiindex/test_iloc.py | 82 +-
.../indexing/multiindex/test_indexing_slow.py | 53 +-
pandas/tests/indexing/multiindex/test_ix.py | 63 +-
pandas/tests/indexing/multiindex/test_loc.py | 289 +-
.../indexing/multiindex/test_multiindex.py | 82 +-
.../tests/indexing/multiindex/test_partial.py | 143 +-
.../tests/indexing/multiindex/test_set_ops.py | 25 +-
.../tests/indexing/multiindex/test_setitem.py | 454 +-
.../tests/indexing/multiindex/test_slice.py | 574 +-
.../tests/indexing/multiindex/test_sorted.py | 55 +-
pandas/tests/indexing/multiindex/test_xs.py | 192 +-
pandas/tests/indexing/test_callable.py | 167 +-
pandas/tests/indexing/test_categorical.py | 521 +-
.../indexing/test_chaining_and_caching.py | 268 +-
pandas/tests/indexing/test_coercion.py | 919 +--
pandas/tests/indexing/test_datetime.py | 253 +-
pandas/tests/indexing/test_floats.py | 852 ++-
pandas/tests/indexing/test_iloc.py | 437 +-
pandas/tests/indexing/test_indexing.py | 1086 ++--
.../tests/indexing/test_indexing_engines.py | 33 +-
pandas/tests/indexing/test_indexing_slow.py | 5 +-
pandas/tests/indexing/test_ix.py | 301 +-
pandas/tests/indexing/test_loc.py | 952 ++-
pandas/tests/indexing/test_partial.py | 324 +-
pandas/tests/indexing/test_scalar.py | 123 +-
pandas/tests/indexing/test_timedelta.py | 111 +-
pandas/tests/internals/test_internals.py | 941 +--
pandas/tests/io/conftest.py | 41 +-
pandas/tests/io/data/blank.ods | Bin 0 -> 2813 bytes
pandas/tests/io/data/blank_with_header.ods | Bin 0 -> 2893 bytes
pandas/tests/io/data/invalid_value_type.ods | Bin 0 -> 8502 bytes
.../0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack | Bin 4445 -> 0 bytes
.../0.16.2_AMD64_windows_2.7.10.msgpack | Bin 4745 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack | Bin 4745 -> 0 bytes
.../0.16.2_x86_64_darwin_2.7.10.msgpack | Bin 6196 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack | Bin 4745 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack | Bin 6196 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack | Bin 4684 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack | Bin 4684 -> 0 bytes
.../0.17.0_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.17.0_x86_64_darwin_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.17.1_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack | Bin 9300 -> 0 bytes
.../0.17.1_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes
.../0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack | Bin 9300 -> 0 bytes
.../0.17.1_x86_64_darwin_2.7.11.msgpack | Bin 11323 -> 0 bytes
.../0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack | Bin 9300 -> 0 bytes
.../0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack | Bin 10307 -> 0 bytes
.../0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack | Bin 9300 -> 0 bytes
.../0.18.0_AMD64_windows_2.7.11.msgpack | Bin 8386 -> 0 bytes
.../0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack | Bin 8341 -> 0 bytes
.../0.18.0_x86_64_darwin_2.7.11.msgpack | Bin 8386 -> 0 bytes
.../0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack | Bin 8341 -> 0 bytes
.../0.18.1_x86_64_darwin_2.7.12.msgpack | Bin 119258 -> 0 bytes
.../0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack | Bin 119206 -> 0 bytes
.../0.19.2_x86_64_darwin_2.7.12.msgpack | Bin 12325 -> 0 bytes
.../0.20.3_x86_64_darwin_3.5.2.msgpack} | Bin 119196 -> 118654 bytes
.../0.10.1/AMD64_windows_2.7.3.pickle | Bin 4381 -> 0 bytes
.../0.10.1/x86_64_linux_2.7.3.pickle | Bin 4338 -> 0 bytes
.../0.11.0/0.11.0_x86_64_linux_3.3.0.pickle | Bin 8978 -> 0 bytes
.../0.11.0/x86_64_linux_2.7.3.pickle | Bin 4338 -> 0 bytes
.../0.11.0/x86_64_linux_3.3.0.pickle | Bin 5822 -> 0 bytes
.../0.12.0/0.12.0_AMD64_windows_2.7.3.pickle | Bin 8692 -> 0 bytes
.../0.12.0/0.12.0_x86_64_linux_2.7.3.pickle | Bin 8768 -> 0 bytes
.../0.13.0/0.13.0_AMD64_windows_2.7.3.pickle | Bin 7208 -> 0 bytes
.../0.13.0/0.13.0_i686_linux_2.6.5.pickle | Bin 7143 -> 0 bytes
.../0.13.0/0.13.0_i686_linux_2.7.3.pickle | Bin 7123 -> 0 bytes
.../0.13.0/0.13.0_i686_linux_3.2.3.pickle | Bin 10019 -> 0 bytes
.../0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle | Bin 7278 -> 0 bytes
.../0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle | Bin 7445 -> 0 bytes
.../0.13.0/0.13.0_x86_64_linux_2.7.3.pickle | Bin 7278 -> 0 bytes
.../0.13.0/0.13.0_x86_64_linux_2.7.8.pickle | Bin 7639 -> 0 bytes
.../0.13.0/0.13.0_x86_64_linux_3.3.0.pickle | Bin 10049 -> 0 bytes
.../0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle | Bin 8159 -> 0 bytes
.../0.14.0/0.14.0_x86_64_linux_2.7.8.pickle | Bin 9309 -> 0 bytes
.../0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle | Bin 191074 -> 0 bytes
.../0.14.1/0.14.1_x86_64_linux_2.7.8.pickle | Bin 11930 -> 0 bytes
.../0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle | Bin 127687 -> 0 bytes
.../0.15.0/0.15.0_x86_64_linux_2.7.8.pickle | Bin 15162 -> 0 bytes
.../0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle | Bin 14892 -> 0 bytes
.../0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle | Bin 15013 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_2.7.10.pickle | Bin 15173 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_2.7.14.pickle | Bin 132692 -> 0 bytes
.../0.16.2/0.16.2_AMD64_windows_3.4.3.pickle | Bin 13766 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle | Bin 16598 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle | Bin 15013 -> 0 bytes
.../0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle | Bin 15444 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_2.7.10.pickle | Bin 14893 -> 0 bytes
.../0.16.2/0.16.2_x86_64_linux_3.4.3.pickle | Bin 14116 -> 0 bytes
.../0.17.0/0.17.0_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes
.../0.17.0/0.17.0_AMD64_windows_3.4.4.pickle | Bin 16236 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle | Bin 18089 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle | Bin 16026 -> 0 bytes
.../0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle | Bin 129175 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_2.7.11.pickle | Bin 18089 -> 0 bytes
.../0.17.0/0.17.0_x86_64_linux_3.4.4.pickle | Bin 16581 -> 0 bytes
.../0.17.0/0.17.1_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes
.../0.17.1/0.17.1_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes
.../0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle | Bin 18089 -> 0 bytes
.../0.18.0/0.18.0_AMD64_windows_2.7.11.pickle | Bin 16875 -> 0 bytes
.../0.18.0/0.18.0_AMD64_windows_3.5.1.pickle | Bin 14674 -> 0 bytes
.../0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle | Bin 16718 -> 0 bytes
.../0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle | Bin 14671 -> 0 bytes
.../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin 127584 -> 0 bytes
.../0.19.2/0.19.2_AMD64_windows_2.7.14.pickle | Bin 133468 -> 0 bytes
.../0.19.2/0.19.2_x86_64_darwin_2.7.12.pickle | Bin 127525 -> 0 bytes
.../0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle | Bin 132762 -> 0 bytes
.../0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle | Bin 126076 -> 0 bytes
.../0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle | Bin 132857 -> 0 bytes
.../0.20.3_x86_64_darwin_3.5.2.pickle} | Bin 127853 -> 127923 bytes
pandas/tests/io/data/test1.ods | Bin 0 -> 4440 bytes
pandas/tests/io/data/test1.xlsm | Bin 13967 -> 12091 bytes
pandas/tests/io/data/test1.xlsx | Bin 13878 -> 12074 bytes
pandas/tests/io/data/test2.ods | Bin 0 -> 2877 bytes
pandas/tests/io/data/test3.ods | Bin 0 -> 2889 bytes
pandas/tests/io/data/test4.ods | Bin 0 -> 2992 bytes
pandas/tests/io/data/test4.xls | Bin 25600 -> 25600 bytes
pandas/tests/io/data/test4.xlsm | Bin 8022 -> 8360 bytes
pandas/tests/io/data/test4.xlsx | Bin 28216 -> 8344 bytes
pandas/tests/io/data/test5.ods | Bin 0 -> 2906 bytes
pandas/tests/io/data/test5.xls | Bin 20480 -> 20480 bytes
pandas/tests/io/data/test5.xlsm | Bin 8017 -> 8642 bytes
pandas/tests/io/data/test5.xlsx | Bin 8002 -> 8626 bytes
pandas/tests/io/data/test_converters.ods | Bin 0 -> 3287 bytes
.../tests/io/data/test_index_name_pre17.ods | Bin 0 -> 3699 bytes
pandas/tests/io/data/test_multisheet.ods | Bin 0 -> 3797 bytes
pandas/tests/io/data/test_multisheet.xls | Bin 24576 -> 24576 bytes
pandas/tests/io/data/test_multisheet.xlsm | Bin 11148 -> 11313 bytes
pandas/tests/io/data/test_multisheet.xlsx | Bin 11131 -> 11296 bytes
pandas/tests/io/data/test_squeeze.ods | Bin 0 -> 3218 bytes
pandas/tests/io/data/test_squeeze.xls | Bin 26112 -> 26112 bytes
pandas/tests/io/data/test_squeeze.xlsm | Bin 8990 -> 9122 bytes
pandas/tests/io/data/test_squeeze.xlsx | Bin 8972 -> 9106 bytes
pandas/tests/io/data/test_types.ods | Bin 0 -> 3489 bytes
pandas/tests/io/data/test_types.xlsm | Bin 8733 -> 9042 bytes
pandas/tests/io/data/test_types.xlsx | Bin 33769 -> 9010 bytes
pandas/tests/io/data/testdateoverflow.ods | Bin 0 -> 3422 bytes
pandas/tests/io/data/testdtype.ods | Bin 0 -> 3196 bytes
pandas/tests/io/data/testmultiindex.ods | Bin 0 -> 5575 bytes
pandas/tests/io/data/testskiprows.ods | Bin 0 -> 3235 bytes
pandas/tests/io/data/times_1900.ods | Bin 0 -> 3181 bytes
pandas/tests/io/data/times_1904.ods | Bin 0 -> 3215 bytes
pandas/tests/io/data/writertable.odt | Bin 0 -> 10313 bytes
pandas/tests/io/excel/conftest.py | 5 +-
pandas/tests/io/excel/test_odf.py | 38 +
pandas/tests/io/excel/test_openpyxl.py | 98 +-
pandas/tests/io/excel/test_readers.py | 837 +--
pandas/tests/io/excel/test_style.py | 154 +-
pandas/tests/io/excel/test_writers.py | 764 +--
pandas/tests/io/excel/test_xlrd.py | 17 +-
pandas/tests/io/excel/test_xlsxwriter.py | 21 +-
pandas/tests/io/excel/test_xlwt.py | 33 +-
pandas/tests/io/formats/test_console.py | 50 +-
pandas/tests/io/formats/test_css.py | 297 +-
.../tests/io/formats/test_eng_formatting.py | 161 +-
pandas/tests/io/formats/test_format.py | 2768 +++++----
pandas/tests/io/formats/test_printing.py | 111 +-
pandas/tests/io/formats/test_style.py | 1794 +++---
pandas/tests/io/formats/test_to_csv.py | 519 +-
pandas/tests/io/formats/test_to_excel.py | 471 +-
pandas/tests/io/formats/test_to_html.py | 633 +-
pandas/tests/io/formats/test_to_latex.py | 214 +-
.../tests/io/generate_legacy_storage_files.py | 439 +-
pandas/tests/io/json/test_compression.py | 43 +-
.../tests/io/json/test_json_table_schema.py | 854 +--
pandas/tests/io/json/test_normalize.py | 734 ++-
pandas/tests/io/json/test_pandas.py | 1407 +++--
pandas/tests/io/json/test_readlines.py | 83 +-
pandas/tests/io/json/test_ujson.py | 512 +-
pandas/tests/io/msgpack/test_buffer.py | 11 +-
pandas/tests/io/msgpack/test_case.py | 100 +-
pandas/tests/io/msgpack/test_except.py | 11 +-
pandas/tests/io/msgpack/test_extension.py | 50 +-
pandas/tests/io/msgpack/test_format.py | 101 +-
pandas/tests/io/msgpack/test_limits.py | 15 +-
pandas/tests/io/msgpack/test_newspec.py | 58 +-
pandas/tests/io/msgpack/test_obj.py | 33 +-
pandas/tests/io/msgpack/test_pack.py | 86 +-
pandas/tests/io/msgpack/test_read_size.py | 42 +-
pandas/tests/io/msgpack/test_seq.py | 2 +-
pandas/tests/io/msgpack/test_sequnpack.py | 77 +-
pandas/tests/io/msgpack/test_subtype.py | 2 +-
pandas/tests/io/msgpack/test_unpack.py | 20 +-
pandas/tests/io/msgpack/test_unpack_raw.py | 10 +-
pandas/tests/io/parser/conftest.py | 12 +-
pandas/tests/io/parser/test_c_parser_only.py | 289 +-
pandas/tests/io/parser/test_comment.py | 41 +-
pandas/tests/io/parser/test_common.py | 1108 ++--
pandas/tests/io/parser/test_compression.py | 22 +-
pandas/tests/io/parser/test_converters.py | 42 +-
pandas/tests/io/parser/test_dialect.py | 65 +-
pandas/tests/io/parser/test_dtypes.py | 319 +-
pandas/tests/io/parser/test_header.py | 404 +-
pandas/tests/io/parser/test_index_col.py | 111 +-
pandas/tests/io/parser/test_mangle_dupes.py | 104 +-
pandas/tests/io/parser/test_multi_thread.py | 37 +-
pandas/tests/io/parser/test_na_values.py | 384 +-
pandas/tests/io/parser/test_network.py | 126 +-
pandas/tests/io/parser/test_parse_dates.py | 1358 ++--
.../io/parser/test_python_parser_only.py | 96 +-
pandas/tests/io/parser/test_quoting.py | 88 +-
pandas/tests/io/parser/test_read_fwf.py | 237 +-
pandas/tests/io/parser/test_skiprows.py | 182 +-
pandas/tests/io/parser/test_textreader.py | 259 +-
pandas/tests/io/parser/test_unsupported.py | 61 +-
pandas/tests/io/parser/test_usecols.py | 374 +-
pandas/tests/io/pytables/__init__.py | 0
pandas/tests/io/pytables/test_compat.py | 76 +
pandas/tests/io/pytables/test_pytables.py | 5448 +++++++++++++++++
.../io/pytables/test_pytables_missing.py | 14 +
pandas/tests/io/sas/test_sas.py | 11 +-
pandas/tests/io/sas/test_sas7bdat.py | 88 +-
pandas/tests/io/sas/test_xport.py | 17 +-
pandas/tests/io/test_clipboard.py | 191 +-
pandas/tests/io/test_common.py | 291 +-
pandas/tests/io/test_compression.py | 125 +-
pandas/tests/io/test_date_converters.py | 13 +-
pandas/tests/io/test_feather.py | 106 +-
pandas/tests/io/test_gbq.py | 70 +-
pandas/tests/io/test_gcs.py | 65 +-
pandas/tests/io/test_html.py | 663 +-
pandas/tests/io/test_packers.py | 591 +-
pandas/tests/io/test_parquet.py | 413 +-
pandas/tests/io/test_pickle.py | 149 +-
pandas/tests/io/test_pytables.py | 5169 ----------------
pandas/tests/io/test_s3.py | 8 +-
pandas/tests/io/test_spss.py | 7 +-
pandas/tests/io/test_sql.py | 1766 +++---
pandas/tests/io/test_stata.py | 1482 +++--
pandas/tests/plotting/common.py | 155 +-
pandas/tests/plotting/test_backend.py | 89 +
pandas/tests/plotting/test_boxplot_method.py | 307 +-
pandas/tests/plotting/test_converter.py | 186 +-
pandas/tests/plotting/test_datetimelike.py | 647 +-
pandas/tests/plotting/test_frame.py | 1902 +++---
pandas/tests/plotting/test_groupby.py | 38 +-
pandas/tests/plotting/test_hist_method.py | 155 +-
pandas/tests/plotting/test_misc.py | 349 +-
pandas/tests/plotting/test_series.py | 358 +-
pandas/tests/reductions/test_reductions.py | 535 +-
.../tests/reductions/test_stat_reductions.py | 102 +-
pandas/tests/resample/conftest.py | 36 +-
pandas/tests/resample/test_base.py | 95 +-
pandas/tests/resample/test_datetime_index.py | 1277 ++--
pandas/tests/resample/test_period_index.py | 819 +--
pandas/tests/resample/test_resample_api.py | 522 +-
.../tests/resample/test_resampler_grouper.py | 238 +-
pandas/tests/resample/test_time_grouper.py | 241 +-
pandas/tests/resample/test_timedelta.py | 110 +-
pandas/tests/reshape/merge/test_join.py | 759 +--
pandas/tests/reshape/merge/test_merge.py | 2279 ++++---
pandas/tests/reshape/merge/test_merge_asof.py | 1582 +++--
.../merge/test_merge_index_as_string.py | 99 +-
.../tests/reshape/merge/test_merge_ordered.py | 100 +-
pandas/tests/reshape/merge/test_multi.py | 914 +--
pandas/tests/reshape/test_concat.py | 1985 +++---
pandas/tests/reshape/test_cut.py | 366 +-
pandas/tests/reshape/test_melt.py | 1269 ++--
pandas/tests/reshape/test_pivot.py | 2924 +++++----
pandas/tests/reshape/test_qcut.py | 131 +-
pandas/tests/reshape/test_reshape.py | 579 +-
.../tests/reshape/test_union_categoricals.py | 228 +-
pandas/tests/reshape/test_util.py | 16 +-
pandas/tests/scalar/interval/test_interval.py | 123 +-
pandas/tests/scalar/interval/test_ops.py | 26 +-
pandas/tests/scalar/period/test_asfreq.py | 1139 ++--
pandas/tests/scalar/period/test_period.py | 1224 ++--
pandas/tests/scalar/test_nat.py | 328 +-
.../tests/scalar/timedelta/test_arithmetic.py | 243 +-
.../scalar/timedelta/test_construction.py | 285 +-
pandas/tests/scalar/timedelta/test_formats.py | 49 +-
.../tests/scalar/timedelta/test_timedelta.py | 655 +-
.../tests/scalar/timestamp/test_arithmetic.py | 63 +-
.../scalar/timestamp/test_comparisons.py | 51 +-
.../tests/scalar/timestamp/test_rendering.py | 45 +-
.../tests/scalar/timestamp/test_timestamp.py | 640 +-
.../tests/scalar/timestamp/test_timezones.py | 350 +-
.../tests/scalar/timestamp/test_unary_ops.py | 299 +-
pandas/tests/series/common.py | 7 +-
pandas/tests/series/conftest.py | 6 +-
pandas/tests/series/indexing/conftest.py | 2 +-
.../tests/series/indexing/test_alter_index.py | 248 +-
pandas/tests/series/indexing/test_boolean.py | 196 +-
pandas/tests/series/indexing/test_callable.py | 16 +-
pandas/tests/series/indexing/test_datetime.py | 347 +-
pandas/tests/series/indexing/test_indexing.py | 413 +-
pandas/tests/series/indexing/test_loc.py | 32 +-
pandas/tests/series/indexing/test_numeric.py | 155 +-
pandas/tests/series/test_alter_axes.py | 234 +-
pandas/tests/series/test_analytics.py | 957 +--
pandas/tests/series/test_api.py | 320 +-
pandas/tests/series/test_apply.py | 562 +-
pandas/tests/series/test_arithmetic.py | 76 +-
pandas/tests/series/test_asof.py | 58 +-
pandas/tests/series/test_block_internals.py | 12 +-
pandas/tests/series/test_combine_concat.py | 285 +-
pandas/tests/series/test_constructors.py | 790 +--
pandas/tests/series/test_datetime_values.py | 539 +-
pandas/tests/series/test_dtypes.py | 364 +-
pandas/tests/series/test_duplicates.py | 82 +-
pandas/tests/series/test_explode.py | 113 +
pandas/tests/series/test_internals.py | 119 +-
pandas/tests/series/test_io.py | 143 +-
pandas/tests/series/test_missing.py | 1253 ++--
pandas/tests/series/test_operators.py | 324 +-
pandas/tests/series/test_period.py | 129 +-
pandas/tests/series/test_quantile.py | 122 +-
pandas/tests/series/test_rank.py | 463 +-
pandas/tests/series/test_replace.py | 140 +-
pandas/tests/series/test_repr.py | 155 +-
pandas/tests/series/test_sorting.py | 100 +-
pandas/tests/series/test_subclass.py | 60 +-
pandas/tests/series/test_timeseries.py | 572 +-
pandas/tests/series/test_timezones.py | 235 +-
pandas/tests/series/test_ufunc.py | 305 +
pandas/tests/series/test_validate.py | 9 +-
pandas/tests/sparse/frame/conftest.py | 33 +-
pandas/tests/sparse/frame/test_analytics.py | 4 +-
pandas/tests/sparse/frame/test_apply.py | 41 +-
pandas/tests/sparse/frame/test_frame.py | 1075 ++--
pandas/tests/sparse/frame/test_indexing.py | 74 +-
pandas/tests/sparse/frame/test_to_csv.py | 10 +-
.../tests/sparse/frame/test_to_from_scipy.py | 61 +-
pandas/tests/sparse/series/test_indexing.py | 82 +-
pandas/tests/sparse/series/test_series.py | 834 +--
pandas/tests/sparse/test_combine_concat.py | 231 +-
pandas/tests/sparse/test_format.py | 127 +-
pandas/tests/sparse/test_groupby.py | 56 +-
pandas/tests/sparse/test_indexing.py | 754 ++-
pandas/tests/sparse/test_pivot.py | 67 +-
pandas/tests/sparse/test_reshape.py | 4 +-
pandas/tests/test_algos.py | 1363 +++--
pandas/tests/test_base.py | 798 ++-
pandas/tests/test_common.py | 68 +-
pandas/tests/test_downstream.py | 97 +-
pandas/tests/test_errors.py | 24 +-
pandas/tests/test_expressions.py | 298 +-
pandas/tests/test_join.py | 196 +-
pandas/tests/test_lib.py | 37 +-
pandas/tests/test_multilevel.py | 1656 ++---
pandas/tests/test_nanops.py | 815 ++-
pandas/tests/test_optional_dependency.py | 10 +-
pandas/tests/test_register_accessor.py | 46 +-
pandas/tests/test_sorting.py | 275 +-
pandas/tests/test_strings.py | 2605 ++++----
pandas/tests/test_take.py | 204 +-
pandas/tests/test_window.py | 4109 -------------
pandas/tests/tools/test_numeric.py | 371 +-
.../tseries/frequencies/test_freq_code.py | 184 +-
.../tseries/frequencies/test_inference.py | 326 +-
.../tseries/frequencies/test_to_offset.py | 184 +-
pandas/tests/tseries/holiday/test_calendar.py | 38 +-
pandas/tests/tseries/holiday/test_federal.py | 34 +-
pandas/tests/tseries/holiday/test_holiday.py | 289 +-
.../tests/tseries/holiday/test_observance.py | 70 +-
pandas/tests/tseries/offsets/common.py | 18 +-
pandas/tests/tseries/offsets/conftest.py | 10 +-
pandas/tests/tseries/offsets/test_fiscal.py | 613 +-
pandas/tests/tseries/offsets/test_offsets.py | 3989 +++++++-----
.../offsets/test_offsets_properties.py | 68 +-
pandas/tests/tseries/offsets/test_ticks.py | 174 +-
.../tests/tseries/offsets/test_yqm_offsets.py | 1624 +++--
pandas/tests/tslibs/test_api.py | 66 +-
pandas/tests/tslibs/test_array_to_datetime.py | 101 +-
pandas/tests/tslibs/test_ccalendar.py | 15 +-
pandas/tests/tslibs/test_conversion.py | 41 +-
pandas/tests/tslibs/test_fields.py | 31 +
pandas/tests/tslibs/test_libfrequencies.py | 146 +-
pandas/tests/tslibs/test_liboffsets.py | 162 +-
pandas/tests/tslibs/test_normalize_date.py | 25 +-
pandas/tests/tslibs/test_parse_iso8601.py | 76 +-
pandas/tests/tslibs/test_parsing.py | 194 +-
pandas/tests/tslibs/test_period_asfreq.py | 125 +-
pandas/tests/tslibs/test_timedeltas.py | 21 +-
pandas/tests/tslibs/test_timezones.py | 30 +-
pandas/tests/util/test_assert_almost_equal.py | 176 +-
.../util/test_assert_categorical_equal.py | 12 +-
.../util/test_assert_extension_array_equal.py | 25 +-
pandas/tests/util/test_assert_frame_equal.py | 164 +-
pandas/tests/util/test_assert_index_equal.py | 42 +-
.../util/test_assert_interval_array_equal.py | 13 +-
.../util/test_assert_numpy_array_equal.py | 46 +-
.../util/test_assert_produces_warning.py | 9 +-
pandas/tests/util/test_assert_series_equal.py | 75 +-
pandas/tests/util/test_deprecate.py | 27 +-
pandas/tests/util/test_deprecate_kwarg.py | 6 +-
pandas/tests/util/test_hashing.py | 176 +-
pandas/tests/util/test_move.py | 1 +
pandas/tests/util/test_safe_import.py | 13 +-
pandas/tests/util/test_util.py | 5 +-
pandas/tests/util/test_validate_args.py | 27 +-
.../util/test_validate_args_and_kwargs.py | 56 +-
pandas/tests/util/test_validate_kwargs.py | 18 +-
pandas/tests/window/__init__.py | 0
pandas/tests/window/common.py | 23 +
pandas/tests/window/conftest.py | 49 +
pandas/tests/window/test_api.py | 367 ++
pandas/tests/window/test_dtypes.py | 242 +
pandas/tests/window/test_ewm.py | 70 +
pandas/tests/window/test_expanding.py | 115 +
pandas/tests/window/test_grouper.py | 176 +
pandas/tests/window/test_moments.py | 2562 ++++++++
pandas/tests/window/test_pairwise.py | 183 +
pandas/tests/window/test_rolling.py | 336 +
pandas/tests/window/test_timeseries_window.py | 692 +++
pandas/tests/window/test_window.py | 76 +
pandas/tseries/converter.py | 24 +-
pandas/tseries/frequencies.py | 155 +-
pandas/tseries/holiday.py | 135 +-
pandas/tseries/offsets.py | 1204 ++--
pandas/util/__init__.py | 3 +-
pandas/util/_decorators.py | 208 +-
pandas/util/_depr_module.py | 32 +-
pandas/util/_doctools.py | 72 +-
pandas/util/_print_versions.py | 157 +-
pandas/util/_test_decorators.py | 109 +-
pandas/util/_tester.py | 6 +-
pandas/util/_validators.py | 96 +-
pandas/util/testing.py | 1186 ++--
requirements-dev.txt | 3 +-
scripts/download_wheels.py | 20 +-
scripts/find_commits_touching_func.py | 131 +-
scripts/generate_pip_deps_from_conda.py | 61 +-
scripts/merge-pr.py | 146 +-
scripts/tests/conftest.py | 7 +-
scripts/tests/test_validate_docstrings.py | 719 ++-
scripts/validate_docstrings.py | 714 ++-
setup.cfg | 45 +-
setup.py | 828 +--
versioneer.py | 213 +-
1079 files changed, 154703 insertions(+), 119438 deletions(-)
create mode 100644 .github/SECURITY.md
create mode 100644 .pre-commit-config.yaml
create mode 100644 asv_bench/benchmarks/index_cached_properties.py
delete mode 100755 ci/build_docs.sh
create mode 100644 ci/check_git_tags.sh
create mode 100644 ci/deps/azure-36-32bit.yaml
delete mode 100644 ci/deps/travis-36-doc.yaml
create mode 100644 doc/source/development/roadmap.rst
create mode 100644 doc/source/whatsnew/v0.25.1.rst
create mode 100644 doc/source/whatsnew/v0.25.2.rst
create mode 100644 doc/source/whatsnew/v1.0.0.rst
delete mode 100644 mypy.ini
create mode 100644 pandas/_libs/lib.pxd
delete mode 100644 pandas/core/arrays/array_.py
create mode 100644 pandas/core/construction.py
create mode 100644 pandas/core/indexers.py
delete mode 100644 pandas/core/internals/arrays.py
delete mode 100644 pandas/core/ops.py
create mode 100644 pandas/core/ops/__init__.py
create mode 100644 pandas/core/ops/array_ops.py
create mode 100644 pandas/core/ops/docstrings.py
create mode 100644 pandas/core/ops/invalid.py
create mode 100644 pandas/core/ops/methods.py
create mode 100644 pandas/core/ops/missing.py
create mode 100644 pandas/core/ops/roperator.py
delete mode 100644 pandas/core/panel.py
delete mode 100644 pandas/core/window.py
create mode 100644 pandas/core/window/__init__.py
create mode 100644 pandas/core/window/common.py
create mode 100644 pandas/core/window/ewm.py
create mode 100644 pandas/core/window/expanding.py
create mode 100644 pandas/core/window/rolling.py
create mode 100644 pandas/io/excel/_odfreader.py
rename pandas/io/json/{json.py => _json.py} (62%)
rename pandas/io/json/{normalize.py => _normalize.py} (56%)
rename pandas/io/json/{table_schema.py => _table_schema.py} (69%)
create mode 100644 pandas/io/msgpack/_packer.pyi
create mode 100644 pandas/io/msgpack/_unpacker.pyi
create mode 100644 pandas/tests/dtypes/cast/test_promote.py
rename pandas/tests/extension/arrow/{bool.py => arrays.py} (67%)
create mode 100644 pandas/tests/extension/arrow/test_string.py
create mode 100644 pandas/tests/frame/test_explode.py
create mode 100644 pandas/tests/io/data/blank.ods
create mode 100644 pandas/tests/io/data/blank_with_header.ods
create mode 100644 pandas/tests/io/data/invalid_value_type.ods
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack
delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.19.2/0.19.2_x86_64_darwin_2.7.12.msgpack
rename pandas/tests/io/data/legacy_msgpack/{0.19.2/0.19.2_x86_64_darwin_3.6.1.msgpack => 0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack} (92%)
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.12.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle
delete mode 100644 pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle
rename pandas/tests/io/data/legacy_pickle/{0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle => 0.20.3/0.20.3_x86_64_darwin_3.5.2.pickle} (85%)
create mode 100644 pandas/tests/io/data/test1.ods
create mode 100644 pandas/tests/io/data/test2.ods
create mode 100644 pandas/tests/io/data/test3.ods
create mode 100644 pandas/tests/io/data/test4.ods
create mode 100644 pandas/tests/io/data/test5.ods
create mode 100644 pandas/tests/io/data/test_converters.ods
create mode 100644 pandas/tests/io/data/test_index_name_pre17.ods
create mode 100644 pandas/tests/io/data/test_multisheet.ods
create mode 100644 pandas/tests/io/data/test_squeeze.ods
create mode 100644 pandas/tests/io/data/test_types.ods
create mode 100644 pandas/tests/io/data/testdateoverflow.ods
create mode 100644 pandas/tests/io/data/testdtype.ods
create mode 100644 pandas/tests/io/data/testmultiindex.ods
create mode 100644 pandas/tests/io/data/testskiprows.ods
create mode 100644 pandas/tests/io/data/times_1900.ods
create mode 100644 pandas/tests/io/data/times_1904.ods
create mode 100644 pandas/tests/io/data/writertable.odt
create mode 100644 pandas/tests/io/excel/test_odf.py
create mode 100644 pandas/tests/io/pytables/__init__.py
create mode 100644 pandas/tests/io/pytables/test_compat.py
create mode 100644 pandas/tests/io/pytables/test_pytables.py
create mode 100644 pandas/tests/io/pytables/test_pytables_missing.py
delete mode 100644 pandas/tests/io/test_pytables.py
create mode 100644 pandas/tests/plotting/test_backend.py
create mode 100644 pandas/tests/series/test_explode.py
create mode 100644 pandas/tests/series/test_ufunc.py
delete mode 100644 pandas/tests/test_window.py
create mode 100644 pandas/tests/tslibs/test_fields.py
create mode 100644 pandas/tests/window/__init__.py
create mode 100644 pandas/tests/window/common.py
create mode 100644 pandas/tests/window/conftest.py
create mode 100644 pandas/tests/window/test_api.py
create mode 100644 pandas/tests/window/test_dtypes.py
create mode 100644 pandas/tests/window/test_ewm.py
create mode 100644 pandas/tests/window/test_expanding.py
create mode 100644 pandas/tests/window/test_grouper.py
create mode 100644 pandas/tests/window/test_moments.py
create mode 100644 pandas/tests/window/test_pairwise.py
create mode 100644 pandas/tests/window/test_rolling.py
create mode 100644 pandas/tests/window/test_timeseries_window.py
create mode 100644 pandas/tests/window/test_window.py
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index 6912d15abf3d6..944ce9b4fb1f6 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1 +1,2 @@
custom: https://pandas.pydata.org/donate.html
+tidelift: pypi/pandas
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 4e1e9ce017408..7c3870470f074 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,4 +1,5 @@
- [ ] closes #xxxx
- [ ] tests added / passed
+- [ ] passes `black pandas`
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] whatsnew entry
diff --git a/.github/SECURITY.md b/.github/SECURITY.md
new file mode 100644
index 0000000000000..f3b059a5d4f13
--- /dev/null
+++ b/.github/SECURITY.md
@@ -0,0 +1 @@
+To report a security vulnerability to pandas, please go to https://tidelift.com/security and see the instructions there.
diff --git a/.gitignore b/.gitignore
index 56828fa1d9331..e85da9c9b976b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,9 @@ coverage_html_report
# hypothesis test database
.hypothesis/
__pycache__
+# pytest-monkeytype
+monkeytype.sqlite3
+
# OS generated files #
######################
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000..32ffb3330564c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+ - repo: https://github.com/python/black
+ rev: stable
+ hooks:
+ - id: black
+ language_version: python3.7
+ - repo: https://gitlab.com/pycqa/flake8
+ rev: 3.7.7
+ hooks:
+ - id: flake8
+ language: python_venv
+ additional_dependencies: [flake8-comprehensions]
+ - repo: https://github.com/pre-commit/mirrors-isort
+ rev: v4.3.20
+ hooks:
+ - id: isort
+ language: python_venv
diff --git a/.travis.yml b/.travis.yml
index fd59544d9b3c6..79fecc41bec0d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,3 @@
-sudo: false
language: python
python: 3.5
@@ -22,7 +21,7 @@ env:
git:
# for cloning
- depth: 2000
+ depth: false
matrix:
fast_finish: true
@@ -48,17 +47,10 @@ matrix:
env:
- JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
- # In allow_failures
- - dist: trusty
- env:
- - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
allow_failures:
- dist: trusty
env:
- JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
- - dist: trusty
- env:
- - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
before_install:
- echo "before_install"
@@ -71,7 +63,7 @@ before_install:
- pwd
- uname -a
- git --version
- - git tag
+ - ./ci/check_git_tags.sh
# Because travis runs on Google Cloud and has a /etc/boto.cfg,
# it breaks moto import, see:
# https://github.com/spulec/moto/issues/1771
@@ -97,7 +89,6 @@ before_script:
script:
- echo "script start"
- source activate pandas-dev
- - ci/build_docs.sh
- ci/run_tests.sh
after_script:
diff --git a/Makefile b/Makefile
index 956ff52338839..9e69eb7922925 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,6 @@
-tseries: pandas/_libs/lib.pyx pandas/_libs/tslib.pyx pandas/_libs/hashtable.pyx
- python setup.py build_ext --inplace
+.PHONY : develop build clean clean_pyc doc lint-diff black
-.PHONY : develop build clean clean_pyc tseries doc
+all: develop
clean:
-python setup.py clean
@@ -15,8 +14,11 @@ build: clean_pyc
lint-diff:
git diff upstream/master --name-only -- "*.py" | xargs flake8
+black:
+ black . --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)'
+
develop: build
- -python setup.py develop
+ python setup.py develop
doc:
-rm -rf doc/build doc/source/generated
diff --git a/README.md b/README.md
index e8bfd28cc8208..3cde98d3145f2 100644
--- a/README.md
+++ b/README.md
@@ -224,7 +224,7 @@ Most development discussion is taking place on github in this repo. Further, the
All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
-A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas-docs.github.io/pandas-docs-travis/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
+A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
@@ -233,3 +233,5 @@ You can also triage issues which may include reproducing bug reports, or asking
Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
+
+As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/pandas/blob/master/.github/CODE_OF_CONDUCT.md)
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 45ef47fde0a56..7d97f2c740acb 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -2,10 +2,12 @@
import numpy as np
+from pandas._libs import lib
+
import pandas as pd
from pandas.util import testing as tm
-for imp in ['pandas.util', 'pandas.tools.hashing']:
+for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
break
@@ -13,17 +15,32 @@
pass
+class MaybeConvertObjects:
+ def setup(self):
+ N = 10 ** 5
+
+ data = list(range(N))
+ data[0] = pd.NaT
+ data = np.array(data)
+ self.data = data
+
+ def time_maybe_convert_objects(self):
+ lib.maybe_convert_objects(self.data)
+
+
class Factorize:
- params = [[True, False], ['int', 'uint', 'float', 'string']]
- param_names = ['sort', 'dtype']
+ params = [[True, False], ["int", "uint", "float", "string"]]
+ param_names = ["sort", "dtype"]
def setup(self, sort, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
- 'uint': pd.UInt64Index(np.arange(N).repeat(5)),
- 'float': pd.Float64Index(np.random.randn(N).repeat(5)),
- 'string': tm.makeStringIndex(N).repeat(5)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N).repeat(5)),
+ "uint": pd.UInt64Index(np.arange(N).repeat(5)),
+ "float": pd.Float64Index(np.random.randn(N).repeat(5)),
+ "string": tm.makeStringIndex(N).repeat(5),
+ }
self.idx = data[dtype]
def time_factorize(self, sort, dtype):
@@ -32,15 +49,17 @@ def time_factorize(self, sort, dtype):
class FactorizeUnique:
- params = [[True, False], ['int', 'uint', 'float', 'string']]
- param_names = ['sort', 'dtype']
+ params = [[True, False], ["int", "uint", "float", "string"]]
+ param_names = ["sort", "dtype"]
def setup(self, sort, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N)),
- 'uint': pd.UInt64Index(np.arange(N)),
- 'float': pd.Float64Index(np.arange(N)),
- 'string': tm.makeStringIndex(N)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N)),
+ "uint": pd.UInt64Index(np.arange(N)),
+ "float": pd.Float64Index(np.arange(N)),
+ "string": tm.makeStringIndex(N),
+ }
self.idx = data[dtype]
assert self.idx.is_unique
@@ -50,15 +69,17 @@ def time_factorize(self, sort, dtype):
class Duplicated:
- params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
- param_names = ['keep', 'dtype']
+ params = [["first", "last", False], ["int", "uint", "float", "string"]]
+ param_names = ["keep", "dtype"]
def setup(self, keep, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
- 'uint': pd.UInt64Index(np.arange(N).repeat(5)),
- 'float': pd.Float64Index(np.random.randn(N).repeat(5)),
- 'string': tm.makeStringIndex(N).repeat(5)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N).repeat(5)),
+ "uint": pd.UInt64Index(np.arange(N).repeat(5)),
+ "float": pd.Float64Index(np.random.randn(N).repeat(5)),
+ "string": tm.makeStringIndex(N).repeat(5),
+ }
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
@@ -69,15 +90,17 @@ def time_duplicated(self, keep, dtype):
class DuplicatedUniqueIndex:
- params = ['int', 'uint', 'float', 'string']
- param_names = ['dtype']
+ params = ["int", "uint", "float", "string"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N)),
- 'uint': pd.UInt64Index(np.arange(N)),
- 'float': pd.Float64Index(np.random.randn(N)),
- 'string': tm.makeStringIndex(N)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N)),
+ "uint": pd.UInt64Index(np.arange(N)),
+ "float": pd.Float64Index(np.random.randn(N)),
+ "string": tm.makeStringIndex(N),
+ }
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
@@ -87,18 +110,21 @@ def time_duplicated_unique(self, dtype):
class Hashing:
-
def setup_cache(self):
- N = 10**5
+ N = 10 ** 5
df = pd.DataFrame(
- {'strings': pd.Series(tm.makeStringIndex(10000).take(
- np.random.randint(0, 10000, size=N))),
- 'floats': np.random.randn(N),
- 'ints': np.arange(N),
- 'dates': pd.date_range('20110101', freq='s', periods=N),
- 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
- df['categories'] = df['strings'].astype('category')
+ {
+ "strings": pd.Series(
+ tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
+ ),
+ "floats": np.random.randn(N),
+ "ints": np.arange(N),
+ "dates": pd.date_range("20110101", freq="s", periods=N),
+ "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N),
+ }
+ )
+ df["categories"] = df["strings"].astype("category")
df.iloc[10:20] = np.nan
return df
@@ -106,39 +132,55 @@ def time_frame(self, df):
hashing.hash_pandas_object(df)
def time_series_int(self, df):
- hashing.hash_pandas_object(df['ints'])
+ hashing.hash_pandas_object(df["ints"])
def time_series_string(self, df):
- hashing.hash_pandas_object(df['strings'])
+ hashing.hash_pandas_object(df["strings"])
def time_series_float(self, df):
- hashing.hash_pandas_object(df['floats'])
+ hashing.hash_pandas_object(df["floats"])
def time_series_categorical(self, df):
- hashing.hash_pandas_object(df['categories'])
+ hashing.hash_pandas_object(df["categories"])
def time_series_timedeltas(self, df):
- hashing.hash_pandas_object(df['timedeltas'])
+ hashing.hash_pandas_object(df["timedeltas"])
def time_series_dates(self, df):
- hashing.hash_pandas_object(df['dates'])
+ hashing.hash_pandas_object(df["dates"])
class Quantile:
- params = [[0, 0.5, 1],
- ['linear', 'nearest', 'lower', 'higher', 'midpoint'],
- ['float', 'int', 'uint']]
- param_names = ['quantile', 'interpolation', 'dtype']
+ params = [
+ [0, 0.5, 1],
+ ["linear", "nearest", "lower", "higher", "midpoint"],
+ ["float", "int", "uint"],
+ ]
+ param_names = ["quantile", "interpolation", "dtype"]
def setup(self, quantile, interpolation, dtype):
- N = 10**5
- data = {'int': np.arange(N),
- 'uint': np.arange(N).astype(np.uint64),
- 'float': np.random.randn(N)}
+ N = 10 ** 5
+ data = {
+ "int": np.arange(N),
+ "uint": np.arange(N).astype(np.uint64),
+ "float": np.random.randn(N),
+ }
self.idx = pd.Series(data[dtype].repeat(5))
def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)
+class SortIntegerArray:
+ params = [10 ** 3, 10 ** 5]
+
+ def setup(self, N):
+ data = np.arange(N, dtype=float)
+ data[40] = np.nan
+ self.array = pd.array(data, dtype="Int64")
+
+ def time_argsort(self, N):
+ self.array.argsort()
+
+
from .pandas_vb_common import setup # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py
index dd316a2bc88d0..c43e5dfd729aa 100644
--- a/asv_bench/benchmarks/attrs_caching.py
+++ b/asv_bench/benchmarks/attrs_caching.py
@@ -1,5 +1,6 @@
import numpy as np
from pandas import DataFrame
+
try:
from pandas.util import cache_readonly
except ImportError:
@@ -7,7 +8,6 @@
class DataFrameAttributes:
-
def setup(self):
self.df = DataFrame(np.random.randn(10, 6))
self.cur_index = self.df.index
@@ -20,14 +20,12 @@ def time_set_index(self):
class CacheReadonly:
-
def setup(self):
-
class Foo:
-
@cache_readonly
def prop(self):
return 5
+
self.obj = Foo()
def time_cache_readonly(self):
diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
index 26cd66284c41e..fd3324b78f1c3 100644
--- a/asv_bench/benchmarks/binary_ops.py
+++ b/asv_bench/benchmarks/binary_ops.py
@@ -1,6 +1,7 @@
import numpy as np
from pandas import DataFrame, Series, date_range
from pandas.core.algorithms import checked_add_with_arr
+
try:
import pandas.core.computation.expressions as expr
except ImportError:
@@ -9,14 +10,14 @@
class Ops:
- params = [[True, False], ['default', 1]]
- param_names = ['use_numexpr', 'threads']
+ params = [[True, False], ["default", 1]]
+ param_names = ["use_numexpr", "threads"]
def setup(self, use_numexpr, threads):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
- if threads != 'default':
+ if threads != "default":
expr.set_numexpr_threads(threads)
if not use_numexpr:
expr.set_use_numexpr(False)
@@ -39,18 +40,21 @@ def teardown(self, use_numexpr, threads):
class Ops2:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N, N))
self.df2 = DataFrame(np.random.randn(N, N))
- self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max,
- size=(N, N)))
- self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max,
- size=(N, N)))
+ self.df_int = DataFrame(
+ np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N)
+ )
+ )
+ self.df2_int = DataFrame(
+ np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N)
+ )
+ )
self.s = Series(np.random.randn(N))
@@ -90,16 +94,16 @@ def time_frame_series_dot(self):
class Timeseries:
- params = [None, 'US/Eastern']
- param_names = ['tz']
+ params = [None, "US/Eastern"]
+ param_names = ["tz"]
def setup(self, tz):
- N = 10**6
+ N = 10 ** 6
halfway = (N // 2) - 1
- self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz))
+ self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz))
self.ts = self.s[halfway]
- self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz))
+ self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz))
def time_series_timestamp_compare(self, tz):
self.s <= self.ts
@@ -117,10 +121,10 @@ def time_timestamp_ops_diff_with_shift(self, tz):
class AddOverflowScalar:
params = [1, -1, 0]
- param_names = ['scalar']
+ param_names = ["scalar"]
def setup(self, scalar):
- N = 10**6
+ N = 10 ** 6
self.arr = np.arange(N)
def time_add_overflow_scalar(self, scalar):
@@ -128,9 +132,8 @@ def time_add_overflow_scalar(self, scalar):
class AddOverflowArray:
-
def setup(self):
- N = 10**6
+ N = 10 ** 6
self.arr = np.arange(N)
self.arr_rev = np.arange(-N, 0)
self.arr_mixed = np.array([1, -1]).repeat(N / 2)
@@ -144,12 +147,12 @@ def time_add_overflow_arr_mask_nan(self):
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
def time_add_overflow_b_mask_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed,
- b_mask=self.arr_nan_1)
+ checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1)
def time_add_overflow_both_arg_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
- b_mask=self.arr_nan_2)
+ checked_add_with_arr(
+ self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2
+ )
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 790157497ca36..8097118a79d20 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -1,8 +1,8 @@
-import warnings
-
import numpy as np
import pandas as pd
import pandas.util.testing as tm
+import warnings
+
try:
from pandas.api.types import union_categoricals
except ImportError:
@@ -13,13 +13,12 @@
class Concat:
-
def setup(self):
- N = 10**5
- self.s = pd.Series(list('aabbcd') * N).astype('category')
+ N = 10 ** 5
+ self.s = pd.Series(list("aabbcd") * N).astype("category")
- self.a = pd.Categorical(list('aabbcd') * N)
- self.b = pd.Categorical(list('bbcdjk') * N)
+ self.a = pd.Categorical(list("aabbcd") * N)
+ self.b = pd.Categorical(list("bbcdjk") * N)
def time_concat(self):
pd.concat([self.s, self.s])
@@ -29,23 +28,22 @@ def time_union(self):
class Constructor:
-
def setup(self):
- N = 10**5
- self.categories = list('abcde')
+ N = 10 ** 5
+ self.categories = list("abcde")
self.cat_idx = pd.Index(self.categories)
self.values = np.tile(self.categories, N)
self.codes = np.tile(range(len(self.categories)), N)
- self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00',
- periods=N / 10,
- freq='s'))
+ self.datetimes = pd.Series(
+ pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s")
+ )
self.datetimes_with_nat = self.datetimes.copy()
self.datetimes_with_nat.iloc[-1] = pd.NaT
self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
self.values_all_nan = [np.nan] * len(self.values)
- self.values_all_int8 = np.ones(N, 'int8')
+ self.values_all_int8 = np.ones(N, "int8")
self.categorical = pd.Categorical(self.values, self.categories)
self.series = pd.Series(self.categorical)
@@ -80,68 +78,61 @@ def time_existing_series(self):
class ValueCounts:
params = [True, False]
- param_names = ['dropna']
+ param_names = ["dropna"]
def setup(self, dropna):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_value_counts(self, dropna):
self.ts.value_counts(dropna=dropna)
class Repr:
-
def setup(self):
- self.sel = pd.Series(['s1234']).astype('category')
+ self.sel = pd.Series(["s1234"]).astype("category")
def time_rendering(self):
str(self.sel)
class SetCategories:
-
def setup(self):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_set_categories(self):
self.ts.cat.set_categories(self.ts.cat.categories[::2])
class RemoveCategories:
-
def setup(self):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_remove_categories(self):
self.ts.cat.remove_categories(self.ts.cat.categories[::2])
class Rank:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
ncats = 100
self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
- self.s_str_cat = self.s_str.astype('category')
+ self.s_str_cat = pd.Series(self.s_str, dtype="category")
with warnings.catch_warnings(record=True):
- self.s_str_cat_ordered = self.s_str.astype('category',
- ordered=True)
+ str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
+ self.s_str_cat_ordered = self.s_str.astype(str_cat_type)
self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
- self.s_int_cat = self.s_int.astype('category')
+ self.s_int_cat = pd.Series(self.s_int, dtype="category")
with warnings.catch_warnings(record=True):
- self.s_int_cat_ordered = self.s_int.astype('category',
- ordered=True)
+ int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True)
+ self.s_int_cat_ordered = self.s_int.astype(int_cat_type)
def time_rank_string(self):
self.s_str.rank()
@@ -164,28 +155,27 @@ def time_rank_int_cat_ordered(self):
class Isin:
- params = ['object', 'int64']
- param_names = ['dtype']
+ params = ["object", "int64"]
+ param_names = ["dtype"]
def setup(self, dtype):
np.random.seed(1234)
- n = 5 * 10**5
+ n = 5 * 10 ** 5
sample_size = 100
arr = [i for i in np.random.randint(0, n // 10, size=n)]
- if dtype == 'object':
- arr = ['s{:04d}'.format(i) for i in arr]
+ if dtype == "object":
+ arr = ["s{:04d}".format(i) for i in arr]
self.sample = np.random.choice(arr, sample_size)
- self.series = pd.Series(arr).astype('category')
+ self.series = pd.Series(arr).astype("category")
def time_isin_categorical(self, dtype):
self.series.isin(self.sample)
class IsMonotonic:
-
def setup(self):
N = 1000
- self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N))
+ self.c = pd.CategoricalIndex(list("a" * N + "b" * N + "c" * N))
self.s = pd.Series(self.c)
def time_categorical_index_is_monotonic_increasing(self):
@@ -202,9 +192,8 @@ def time_categorical_series_is_monotonic_decreasing(self):
class Contains:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
self.ci = tm.makeCategoricalIndex(N)
self.c = self.ci.values
self.key = self.ci.categories[0]
@@ -218,34 +207,33 @@ def time_categorical_contains(self):
class CategoricalSlicing:
- params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
- param_names = ['index']
+ params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
+ param_names = ["index"]
def setup(self, index):
- N = 10**6
- categories = ['a', 'b', 'c']
+ N = 10 ** 6
+ categories = ["a", "b", "c"]
values = [0] * N + [1] * N + [2] * N
- if index == 'monotonic_incr':
- self.data = pd.Categorical.from_codes(values,
- categories=categories)
- elif index == 'monotonic_decr':
- self.data = pd.Categorical.from_codes(list(reversed(values)),
- categories=categories)
- elif index == 'non_monotonic':
- self.data = pd.Categorical.from_codes([0, 1, 2] * N,
- categories=categories)
+ if index == "monotonic_incr":
+ self.data = pd.Categorical.from_codes(values, categories=categories)
+ elif index == "monotonic_decr":
+ self.data = pd.Categorical.from_codes(
+ list(reversed(values)), categories=categories
+ )
+ elif index == "non_monotonic":
+ self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
else:
- raise ValueError('Invalid index param: {}'.format(index))
+ raise ValueError("Invalid index param: {}".format(index))
self.scalar = 10000
self.list = list(range(10000))
- self.cat_scalar = 'b'
+ self.cat_scalar = "b"
def time_getitem_scalar(self, index):
self.data[self.scalar]
def time_getitem_slice(self, index):
- self.data[:self.scalar]
+ self.data[: self.scalar]
def time_getitem_list_like(self, index):
self.data[[self.scalar]]
@@ -258,9 +246,8 @@ def time_getitem_bool_array(self, index):
class Indexing:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
self.index = pd.CategoricalIndex(range(N), range(N))
self.series = pd.Series(range(N), index=self.index).sort_index()
self.category = self.index[500]
@@ -275,7 +262,7 @@ def time_shallow_copy(self):
self.index._shallow_copy()
def time_align(self):
- pd.DataFrame({'a': self.series, 'b': self.series[:500]})
+ pd.DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index[:750].intersection(self.index[250:])
@@ -287,7 +274,7 @@ def time_reindex(self):
self.index.reindex(self.index[:500])
def time_reindex_missing(self):
- self.index.reindex(['a', 'b', 'c', 'd'])
+ self.index.reindex(["a", "b", "c", "d"])
def time_sort_values(self):
self.index.sort_values(ascending=False)
diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py
index 1c6841a296377..654075292cdf6 100644
--- a/asv_bench/benchmarks/ctors.py
+++ b/asv_bench/benchmarks/ctors.py
@@ -42,22 +42,34 @@ def list_of_lists_with_none(arr):
class SeriesConstructors:
param_names = ["data_fmt", "with_index", "dtype"]
- params = [[no_change,
- list,
- list_of_str,
- gen_of_str,
- arr_dict,
- list_of_tuples,
- gen_of_tuples,
- list_of_lists,
- list_of_tuples_with_none,
- list_of_lists_with_none],
- [False, True],
- ['float', 'int']]
+ params = [
+ [
+ no_change,
+ list,
+ list_of_str,
+ gen_of_str,
+ arr_dict,
+ list_of_tuples,
+ gen_of_tuples,
+ list_of_lists,
+ list_of_tuples_with_none,
+ list_of_lists_with_none,
+ ],
+ [False, True],
+ ["float", "int"],
+ ]
+
+ # Generators get exhausted on use, so run setup before every call
+ number = 1
+ repeat = (3, 250, 10)
def setup(self, data_fmt, with_index, dtype):
- N = 10**4
- if dtype == 'float':
+ if data_fmt in (gen_of_str, gen_of_tuples) and with_index:
+ raise NotImplementedError(
+ "Series constructors do not support " "using generators with indexes"
+ )
+ N = 10 ** 4
+ if dtype == "float":
arr = np.random.randn(N)
else:
arr = np.arange(N)
@@ -69,13 +81,15 @@ def time_series_constructor(self, data_fmt, with_index, dtype):
class SeriesDtypesConstructors:
-
def setup(self):
- N = 10**4
+ N = 10 ** 4
self.arr = np.random.randn(N)
- self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object)
- self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
- Timestamp('20130101')] * N * 10)
+ self.arr_str = np.array(["foo", "bar", "baz"], dtype=object)
+ self.s = Series(
+ [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")]
+ * N
+ * 10
+ )
def time_index_from_array_string(self):
Index(self.arr_str)
@@ -91,9 +105,8 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
-
def setup(self):
- N = 10**4
+ N = 10 ** 4
self.iterables = [tm.makeStringIndex(N), range(20)]
def time_multiindex_from_iterables(self):
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
index 9bfaaa8696009..60800b1f9cae7 100644
--- a/asv_bench/benchmarks/dtypes.py
+++ b/asv_bench/benchmarks/dtypes.py
@@ -2,32 +2,36 @@
import numpy as np
from .pandas_vb_common import (
- numeric_dtypes, datetime_dtypes, string_dtypes, extension_dtypes)
+ numeric_dtypes,
+ datetime_dtypes,
+ string_dtypes,
+ extension_dtypes,
+)
-_numpy_dtypes = [np.dtype(dtype)
- for dtype in (numeric_dtypes +
- datetime_dtypes +
- string_dtypes)]
+_numpy_dtypes = [
+ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes)
+]
_dtypes = _numpy_dtypes + extension_dtypes
class Dtypes:
- params = (_dtypes +
- list(map(lambda dt: dt.name, _dtypes)))
- param_names = ['dtype']
+ params = _dtypes + list(map(lambda dt: dt.name, _dtypes))
+ param_names = ["dtype"]
def time_pandas_dtype(self, dtype):
pandas_dtype(dtype)
class DtypesInvalid:
- param_names = ['dtype']
- params = ['scalar-string', 'scalar-int', 'list-string', 'array-string']
- data_dict = {'scalar-string': 'foo',
- 'scalar-int': 1,
- 'list-string': ['foo'] * 1000,
- 'array-string': np.array(['foo'] * 1000)}
+ param_names = ["dtype"]
+ params = ["scalar-string", "scalar-int", "list-string", "array-string"]
+ data_dict = {
+ "scalar-string": "foo",
+ "scalar-int": 1,
+ "list-string": ["foo"] * 1000,
+ "array-string": np.array(["foo"] * 1000),
+ }
def time_pandas_dtype_invalid(self, dtype):
try:
diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
index be47d35f2cad1..84e94315cc28b 100644
--- a/asv_bench/benchmarks/eval.py
+++ b/asv_bench/benchmarks/eval.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
+
try:
import pandas.core.computation.expressions as expr
except ImportError:
@@ -8,8 +9,8 @@
class Eval:
- params = [['numexpr', 'python'], [1, 'all']]
- param_names = ['engine', 'threads']
+ params = [["numexpr", "python"], [1, "all"]]
+ param_names = ["engine", "threads"]
def setup(self, engine, threads):
self.df = pd.DataFrame(np.random.randn(20000, 100))
@@ -21,44 +22,44 @@ def setup(self, engine, threads):
expr.set_numexpr_threads(1)
def time_add(self, engine, threads):
- pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine)
+ pd.eval("self.df + self.df2 + self.df3 + self.df4", engine=engine)
def time_and(self, engine, threads):
- pd.eval('(self.df > 0) & (self.df2 > 0) & '
- '(self.df3 > 0) & (self.df4 > 0)', engine=engine)
+ pd.eval(
+ "(self.df > 0) & (self.df2 > 0) & " "(self.df3 > 0) & (self.df4 > 0)",
+ engine=engine,
+ )
def time_chained_cmp(self, engine, threads):
- pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine)
+ pd.eval("self.df < self.df2 < self.df3 < self.df4", engine=engine)
def time_mult(self, engine, threads):
- pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine)
+ pd.eval("self.df * self.df2 * self.df3 * self.df4", engine=engine)
def teardown(self, engine, threads):
expr.set_numexpr_threads()
class Query:
-
def setup(self):
- N = 10**6
+ N = 10 ** 6
halfway = (N // 2) - 1
- index = pd.date_range('20010101', periods=N, freq='T')
+ index = pd.date_range("20010101", periods=N, freq="T")
s = pd.Series(index)
self.ts = s.iloc[halfway]
- self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': index},
- index=index)
+ self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index)
data = np.random.randn(N)
self.min_val = data.min()
self.max_val = data.max()
def time_query_datetime_index(self):
- self.df.query('index < @self.ts')
+ self.df.query("index < @self.ts")
def time_query_datetime_column(self):
- self.df.query('dates < @self.ts')
+ self.df.query("dates < @self.ts")
def time_query_with_boolean_selection(self):
- self.df.query('(a >= @self.min_val) & (a <= @self.max_val)')
+ self.df.query("(a >= @self.min_val) & (a <= @self.max_val)")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
index 19c2a913e8494..acfb26bcf5d7c 100644
--- a/asv_bench/benchmarks/frame_ctor.py
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -1,25 +1,23 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range
+
try:
from pandas.tseries.offsets import Nano, Hour
except ImportError:
# For compatibility with older versions
- from pandas.core.datetools import * # noqa
+ from pandas.core.datetools import * # noqa
class FromDicts:
-
def setup(self):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
- frame = DataFrame(np.random.randn(N, K), index=self.index,
- columns=self.columns)
+ frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
- self.dict_list = frame.to_dict(orient='records')
- self.data2 = {i: {j: float(j) for j in range(100)}
- for i in range(2000)}
+ self.dict_list = frame.to_dict(orient="records")
+ self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)}
def time_list_of_dict(self):
DataFrame(self.dict_list)
@@ -42,7 +40,6 @@ def time_nested_dict_int64(self):
class FromSeries:
-
def setup(self):
mi = MultiIndex.from_product([range(100), range(100)])
self.s = Series(np.random.randn(10000), index=mi)
@@ -54,12 +51,12 @@ def time_mi_series(self):
class FromDictwithTimestamp:
params = [Nano(1), Hour(1)]
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
- N = 10**3
+ N = 10 ** 3
np.random.seed(1234)
- idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N)
+ idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N)
df = DataFrame(np.random.randn(N, 10), index=idx)
self.d = df.to_dict()
@@ -70,7 +67,11 @@ def time_dict_with_timestamp_offsets(self, offset):
class FromRecords:
params = [None, 1000]
- param_names = ['nrows']
+ param_names = ["nrows"]
+
+ # Generators get exhausted on use, so run setup before every call
+ number = 1
+ repeat = (3, 250, 10)
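+    # per asv's benchmark-attribute docs, a tuple repeat reads as
+    # (min_repeat, max_repeat, max_time_in_seconds); number = 1 keeps each timed
+    # sample to a single call, matching the note above about re-running setup()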
def setup(self, nrows):
N = 100000
@@ -82,7 +83,6 @@ def time_frame_from_records_generator(self, nrows):
class FromNDArray:
-
def setup(self):
N = 100000
self.data = np.random.randn(N)
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 5b76eeba115a4..e2f6764c76eef 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -1,18 +1,17 @@
+import warnings
import string
import numpy as np
-from pandas import (
- DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range)
+from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range
import pandas.util.testing as tm
class GetNumericData:
-
def setup(self):
self.df = DataFrame(np.random.randn(10000, 25))
- self.df['foo'] = 'bar'
- self.df['bar'] = 'baz'
+ self.df["foo"] = "bar"
+ self.df["bar"] = "baz"
self.df = self.df._consolidate()
def time_frame_get_numeric_data(self):
@@ -20,17 +19,17 @@ def time_frame_get_numeric_data(self):
class Lookup:
-
def setup(self):
- self.df = DataFrame(np.random.randn(10000, 8),
- columns=list('abcdefgh'))
- self.df['foo'] = 'bar'
+ self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh"))
+ self.df["foo"] = "bar"
self.row_labels = list(self.df.index[::10])[:900]
self.col_labels = list(self.df.columns) * 100
self.row_labels_all = np.array(
- list(self.df.index) * len(self.df.columns), dtype='object')
+ list(self.df.index) * len(self.df.columns), dtype="object"
+ )
self.col_labels_all = np.array(
- list(self.df.columns) * len(self.df.index), dtype='object')
+ list(self.df.columns) * len(self.df.index), dtype="object"
+ )
def time_frame_fancy_lookup(self):
self.df.lookup(self.row_labels, self.col_labels)
@@ -40,17 +39,21 @@ def time_frame_fancy_lookup_all(self):
class Reindex:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N * 10, N))
self.idx = np.arange(4 * N, 7 * N)
self.df2 = DataFrame(
- {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
- 1: np.random.randint(0, N, N).astype(np.int16),
- 2: np.random.randint(0, N, N).astype(np.int32),
- 3: np.random.randint(0, N, N).astype(np.int64)}
- [np.random.randint(0, 4)] for c in range(N)})
+ {
+ c: {
+ 0: np.random.randint(0, 2, N).astype(np.bool_),
+ 1: np.random.randint(0, N, N).astype(np.int16),
+ 2: np.random.randint(0, N, N).astype(np.int32),
+ 3: np.random.randint(0, N, N).astype(np.int64),
+ }[np.random.randint(0, 4)]
+ for c in range(N)
+ }
+ )
def time_reindex_axis0(self):
self.df.reindex(self.idx)
@@ -66,18 +69,22 @@ def time_reindex_upcast(self):
class Rename:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N * 10, N))
self.idx = np.arange(4 * N, 7 * N)
self.dict_idx = {k: k for k in self.idx}
self.df2 = DataFrame(
- {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
- 1: np.random.randint(0, N, N).astype(np.int16),
- 2: np.random.randint(0, N, N).astype(np.int32),
- 3: np.random.randint(0, N, N).astype(np.int64)}
- [np.random.randint(0, 4)] for c in range(N)})
+ {
+ c: {
+ 0: np.random.randint(0, 2, N).astype(np.bool_),
+ 1: np.random.randint(0, N, N).astype(np.int16),
+ 2: np.random.randint(0, N, N).astype(np.int32),
+ 3: np.random.randint(0, N, N).astype(np.int64),
+ }[np.random.randint(0, 4)]
+ for c in range(N)
+ }
+ )
def time_rename_single(self):
self.df.rename({0: 0})
@@ -103,19 +110,20 @@ def setup(self):
N = 1000
self.df = DataFrame(np.random.randn(N * 10, N))
self.df2 = DataFrame(np.random.randn(N * 50, 10))
- self.df3 = DataFrame(np.random.randn(N, 5 * N),
- columns=['C' + str(c) for c in range(N * 5)])
+ self.df3 = DataFrame(
+ np.random.randn(N, 5 * N), columns=["C" + str(c) for c in range(N * 5)]
+ )
self.df4 = DataFrame(np.random.randn(N * 1000, 10))
- def time_iteritems(self):
+ def time_items(self):
# (monitor no-copying behaviour)
- if hasattr(self.df, '_item_cache'):
+ if hasattr(self.df, "_item_cache"):
self.df._item_cache.clear()
- for name, col in self.df.iteritems():
+ for name, col in self.df.items():
pass
- def time_iteritems_cached(self):
- for name, col in self.df.iteritems():
+ def time_items_cached(self):
+ for name, col in self.df.items():
pass
def time_iteritems_indexing(self):
@@ -192,7 +200,6 @@ def time_iterrows(self):
class ToString:
-
def setup(self):
self.df = DataFrame(np.random.randn(100, 10))
@@ -201,11 +208,10 @@ def time_to_string_floats(self):
class ToHTML:
-
def setup(self):
nrows = 500
self.df2 = DataFrame(np.random.randn(nrows, 10))
- self.df2[0] = period_range('2000', periods=nrows)
+ self.df2[0] = period_range("2000", periods=nrows)
self.df2[1] = range(nrows)
def time_to_html_mixed(self):
@@ -213,7 +219,6 @@ def time_to_html_mixed(self):
class Repr:
-
def setup(self):
nrows = 10000
data = np.random.randn(nrows, 10)
@@ -238,7 +243,6 @@ def time_frame_repr_wide(self):
class MaskBool:
-
def setup(self):
data = np.random.randn(1000, 500)
df = DataFrame(data)
@@ -254,9 +258,8 @@ def time_frame_mask_floats(self):
class Isnull:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df_no_null = DataFrame(np.random.randn(N, N))
sample = np.array([np.nan, 1.0])
@@ -267,8 +270,20 @@ def setup(self):
data = np.random.choice(sample, (N, N))
self.df_strings = DataFrame(data)
- sample = np.array([NaT, np.nan, None, np.datetime64('NaT'),
- np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd'])
+ sample = np.array(
+ [
+ NaT,
+ np.nan,
+ None,
+ np.datetime64("NaT"),
+ np.timedelta64("NaT"),
+ 0,
+ 1,
+ 2.0,
+ "",
+ "abcd",
+ ]
+ )
data = np.random.choice(sample, (N, N))
self.df_obj = DataFrame(data)
@@ -287,8 +302,8 @@ def time_isnull_obj(self):
class Fillna:
- params = ([True, False], ['pad', 'bfill'])
- param_names = ['inplace', 'method']
+ params = ([True, False], ["pad", "bfill"])
+ param_names = ["inplace", "method"]
def setup(self, inplace, method):
values = np.random.randn(10000, 100)
@@ -301,16 +316,17 @@ def time_frame_fillna(self, inplace, method):
class Dropna:
- params = (['all', 'any'], [0, 1])
- param_names = ['how', 'axis']
+ params = (["all", "any"], [0, 1])
+ param_names = ["how", "axis"]
def setup(self, how, axis):
self.df = DataFrame(np.random.randn(10000, 1000))
- self.df.ix[50:1000, 20:50] = np.nan
- self.df.ix[2000:3000] = np.nan
- self.df.ix[:, 60:70] = np.nan
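+        # .ix is deprecated; record its warnings so they do not clutter the benchmark run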
+ with warnings.catch_warnings(record=True):
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
self.df_mixed = self.df.copy()
- self.df_mixed['foo'] = 'bar'
+ self.df_mixed["foo"] = "bar"
def time_dropna(self, how, axis):
self.df.dropna(how=how, axis=axis)
@@ -322,23 +338,25 @@ def time_dropna_axis_mixed_dtypes(self, how, axis):
class Count:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
self.df = DataFrame(np.random.randn(10000, 1000))
- self.df.ix[50:1000, 20:50] = np.nan
- self.df.ix[2000:3000] = np.nan
- self.df.ix[:, 60:70] = np.nan
+ with warnings.catch_warnings(record=True):
+ self.df.ix[50:1000, 20:50] = np.nan
+ self.df.ix[2000:3000] = np.nan
+ self.df.ix[:, 60:70] = np.nan
self.df_mixed = self.df.copy()
- self.df_mixed['foo'] = 'bar'
+ self.df_mixed["foo"] = "bar"
self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index])
- self.df.columns = MultiIndex.from_arrays([self.df.columns,
- self.df.columns])
- self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index,
- self.df_mixed.index])
- self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns,
- self.df_mixed.columns])
+ self.df.columns = MultiIndex.from_arrays([self.df.columns, self.df.columns])
+ self.df_mixed.index = MultiIndex.from_arrays(
+ [self.df_mixed.index, self.df_mixed.index]
+ )
+ self.df_mixed.columns = MultiIndex.from_arrays(
+ [self.df_mixed.columns, self.df_mixed.columns]
+ )
def time_count_level_multi(self, axis):
self.df.count(axis=axis, level=1)
@@ -348,13 +366,12 @@ def time_count_level_mixed_dtypes_multi(self, axis):
class Apply:
-
def setup(self):
self.df = DataFrame(np.random.randn(1000, 100))
self.s = Series(np.arange(1028.0))
self.df2 = DataFrame({i: self.s for i in range(1028)})
- self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+ self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_apply_user_func(self):
self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)])
@@ -372,11 +389,10 @@ def time_apply_pass_thru(self):
self.df.apply(lambda x: x)
def time_apply_ref_by_name(self):
- self.df3.apply(lambda x: x['A'] + x['B'], axis=1)
+ self.df3.apply(lambda x: x["A"] + x["B"], axis=1)
class Dtypes:
-
def setup(self):
self.df = DataFrame(np.random.randn(1000, 1000))
@@ -385,19 +401,18 @@ def time_frame_dtypes(self):
class Equals:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.float_df = DataFrame(np.random.randn(N, N))
self.float_df_nan = self.float_df.copy()
self.float_df_nan.iloc[-1, -1] = np.nan
- self.object_df = DataFrame('foo', index=range(N), columns=range(N))
+ self.object_df = DataFrame("foo", index=range(N), columns=range(N))
self.object_df_nan = self.object_df.copy()
self.object_df_nan.iloc[-1, -1] = np.nan
self.nonunique_cols = self.object_df.copy()
- self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns)
+ self.nonunique_cols.columns = ["A"] * len(self.nonunique_cols.columns)
self.nonunique_cols_nan = self.nonunique_cols.copy()
self.nonunique_cols_nan.iloc[-1, -1] = np.nan
@@ -422,8 +437,8 @@ def time_frame_object_unequal(self):
class Interpolate:
- params = [None, 'infer']
- param_names = ['downcast']
+ params = [None, "infer"]
+ param_names = ["downcast"]
def setup(self, downcast):
N = 10000
@@ -431,12 +446,16 @@ def setup(self, downcast):
self.df = DataFrame(np.random.randn(N, 100))
self.df.values[::2] = np.nan
- self.df2 = DataFrame({'A': np.arange(0, N),
- 'B': np.random.randint(0, 100, N),
- 'C': np.random.randn(N),
- 'D': np.random.randn(N)})
- self.df2.loc[1::5, 'A'] = np.nan
- self.df2.loc[1::5, 'C'] = np.nan
+ self.df2 = DataFrame(
+ {
+ "A": np.arange(0, N),
+ "B": np.random.randint(0, 100, N),
+ "C": np.random.randn(N),
+ "D": np.random.randn(N),
+ }
+ )
+ self.df2.loc[1::5, "A"] = np.nan
+ self.df2.loc[1::5, "C"] = np.nan
def time_interpolate(self, downcast):
self.df.interpolate(downcast=downcast)
@@ -448,7 +467,7 @@ def time_interpolate_some_good(self, downcast):
class Shift:
# frame shift speedup issue-5609
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
self.df = DataFrame(np.random.rand(10000, 500))
@@ -458,7 +477,6 @@ def time_shift(self, axis):
class Nunique:
-
def setup(self):
self.df = DataFrame(np.random.randn(10000, 1000))
@@ -467,14 +485,17 @@ def time_frame_nunique(self):
class Duplicated:
-
def setup(self):
- n = (1 << 20)
- t = date_range('2015-01-01', freq='S', periods=(n // 64))
+ n = 1 << 20
+ t = date_range("2015-01-01", freq="S", periods=(n // 64))
xs = np.random.randn(n // 64).round(2)
- self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
- 'b': np.random.choice(t, n),
- 'c': np.random.choice(xs, n)})
+ self.df = DataFrame(
+ {
+ "a": np.random.randint(-1 << 8, 1 << 8, n),
+ "b": np.random.choice(t, n),
+ "c": np.random.choice(xs, n),
+ }
+ )
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
def time_frame_duplicated(self):
@@ -487,10 +508,10 @@ def time_frame_duplicated_wide(self):
class XS:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
- self.N = 10**4
+ self.N = 10 ** 4
self.df = DataFrame(np.random.randn(self.N, self.N))
def time_frame_xs(self, axis):
@@ -500,35 +521,38 @@ def time_frame_xs(self, axis):
class SortValues:
params = [True, False]
- param_names = ['ascending']
+ param_names = ["ascending"]
def setup(self, ascending):
- self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB'))
+ self.df = DataFrame(np.random.randn(1000000, 2), columns=list("AB"))
def time_frame_sort_values(self, ascending):
- self.df.sort_values(by='A', ascending=ascending)
+ self.df.sort_values(by="A", ascending=ascending)
class SortIndexByColumns:
-
def setup(self):
N = 10000
K = 10
- self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
- 'key2': tm.makeStringIndex(N).values.repeat(K),
- 'value': np.random.randn(N * K)})
+ self.df = DataFrame(
+ {
+ "key1": tm.makeStringIndex(N).values.repeat(K),
+ "key2": tm.makeStringIndex(N).values.repeat(K),
+ "value": np.random.randn(N * K),
+ }
+ )
def time_frame_sort_values_by_columns(self):
- self.df.sort_values(by=['key1', 'key2'])
+ self.df.sort_values(by=["key1", "key2"])
class Quantile:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
- self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+ self.df = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_frame_quantile(self, axis):
self.df.quantile([0.1, 0.5], axis=axis)
@@ -540,7 +564,8 @@ def setup(self):
self.df = DataFrame(np.random.randn(10, 10000))
def time_frame_get_dtype_counts(self):
- self.df.get_dtype_counts()
+ with warnings.catch_warnings(record=True):
+ self.df.get_dtype_counts()
def time_info(self):
self.df.info()
@@ -548,37 +573,37 @@ def time_info(self):
class NSort:
- params = ['first', 'last', 'all']
- param_names = ['keep']
+ params = ["first", "last", "all"]
+ param_names = ["keep"]
def setup(self, keep):
- self.df = DataFrame(np.random.randn(100000, 3),
- columns=list('ABC'))
+ self.df = DataFrame(np.random.randn(100000, 3), columns=list("ABC"))
def time_nlargest_one_column(self, keep):
- self.df.nlargest(100, 'A', keep=keep)
+ self.df.nlargest(100, "A", keep=keep)
def time_nlargest_two_columns(self, keep):
- self.df.nlargest(100, ['A', 'B'], keep=keep)
+ self.df.nlargest(100, ["A", "B"], keep=keep)
def time_nsmallest_one_column(self, keep):
- self.df.nsmallest(100, 'A', keep=keep)
+ self.df.nsmallest(100, "A", keep=keep)
def time_nsmallest_two_columns(self, keep):
- self.df.nsmallest(100, ['A', 'B'], keep=keep)
+ self.df.nsmallest(100, ["A", "B"], keep=keep)
class Describe:
-
def setup(self):
- self.df = DataFrame({
- 'a': np.random.randint(0, 100, int(1e6)),
- 'b': np.random.randint(0, 100, int(1e6)),
- 'c': np.random.randint(0, 100, int(1e6))
- })
+ self.df = DataFrame(
+ {
+ "a": np.random.randint(0, 100, int(1e6)),
+ "b": np.random.randint(0, 100, int(1e6)),
+ "c": np.random.randint(0, 100, int(1e6)),
+ }
+ )
def time_series_describe(self):
- self.df['a'].describe()
+ self.df["a"].describe()
def time_dataframe_describe(self):
self.df.describe()
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 65a03bfda48c5..0d0b75561d057 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -2,9 +2,19 @@
import pandas.util.testing as tm
from pandas import DataFrame, Series, read_csv, factorize, date_range
from pandas.core.algorithms import take_1d
+
try:
- from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max,
- rolling_var, rolling_skew, rolling_kurt, rolling_std)
+ from pandas import (
+ rolling_median,
+ rolling_mean,
+ rolling_min,
+ rolling_max,
+ rolling_var,
+ rolling_skew,
+ rolling_kurt,
+ rolling_std,
+ )
+
have_rolling_methods = True
except ImportError:
have_rolling_methods = False
@@ -14,6 +24,7 @@
from pandas import algos
try:
from pandas.util.testing import test_parallel
+
have_real_test_parallel = True
except ImportError:
have_real_test_parallel = False
@@ -21,32 +32,36 @@
def test_parallel(num_threads=1):
def wrapper(fname):
return fname
+
return wrapper
+
from .pandas_vb_common import BaseIO
class ParallelGroupbyMethods:
- params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod',
- 'sum', 'var'])
- param_names = ['threads', 'method']
+ params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"])
+ param_names = ["threads", "method"]
def setup(self, threads, method):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- ngroups = 10**3
- df = DataFrame({'key': np.random.randint(0, ngroups, size=N),
- 'data': np.random.randn(N)})
+ N = 10 ** 6
+ ngroups = 10 ** 3
+ df = DataFrame(
+ {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)}
+ )
@test_parallel(num_threads=threads)
def parallel():
- getattr(df.groupby('key')['data'], method)()
+ getattr(df.groupby("key")["data"], method)()
+
self.parallel = parallel
def loop():
- getattr(df.groupby('key')['data'], method)()
+ getattr(df.groupby("key")["data"], method)()
+
self.loop = loop
def time_parallel(self, threads, method):
@@ -60,18 +75,19 @@ def time_loop(self, threads, method):
class ParallelGroups:
params = [2, 4, 8]
- param_names = ['threads']
+ param_names = ["threads"]
def setup(self, threads):
if not have_real_test_parallel:
raise NotImplementedError
- size = 2**22
- ngroups = 10**3
+ size = 2 ** 22
+ ngroups = 10 ** 3
data = Series(np.random.randint(0, ngroups, size=size))
@test_parallel(num_threads=threads)
def get_groups():
data.groupby(data).groups
+
self.get_groups = get_groups
def time_get_groups(self, threads):
@@ -80,19 +96,20 @@ def time_get_groups(self, threads):
class ParallelTake1D:
- params = ['int64', 'float64']
- param_names = ['dtype']
+ params = ["int64", "float64"]
+ param_names = ["dtype"]
def setup(self, dtype):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- df = DataFrame({'col': np.arange(N, dtype=dtype)})
+ N = 10 ** 6
+ df = DataFrame({"col": np.arange(N, dtype=dtype)})
indexer = np.arange(100, len(df) - 100)
@test_parallel(num_threads=2)
def parallel_take1d():
- take_1d(df['col'].values, indexer)
+ take_1d(df["col"].values, indexer)
+
self.parallel_take1d = parallel_take1d
def time_take1d(self, dtype):
@@ -107,14 +124,14 @@ class ParallelKth:
def setup(self):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**7
- k = 5 * 10**5
- kwargs_list = [{'arr': np.random.randn(N)},
- {'arr': np.random.randn(N)}]
+ N = 10 ** 7
+ k = 5 * 10 ** 5
+ kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@test_parallel(num_threads=2, kwargs_list=kwargs_list)
def parallel_kth_smallest(arr):
algos.kth_smallest(arr, k)
+
self.parallel_kth_smallest = parallel_kth_smallest
def time_kth_smallest(self):
@@ -122,81 +139,90 @@ def time_kth_smallest(self):
class ParallelDatetimeFields:
-
def setup(self):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- self.dti = date_range('1900-01-01', periods=N, freq='T')
- self.period = self.dti.to_period('D')
+ N = 10 ** 6
+ self.dti = date_range("1900-01-01", periods=N, freq="T")
+ self.period = self.dti.to_period("D")
def time_datetime_field_year(self):
@test_parallel(num_threads=2)
def run(dti):
dti.year
+
run(self.dti)
def time_datetime_field_day(self):
@test_parallel(num_threads=2)
def run(dti):
dti.day
+
run(self.dti)
def time_datetime_field_daysinmonth(self):
@test_parallel(num_threads=2)
def run(dti):
dti.days_in_month
+
run(self.dti)
def time_datetime_field_normalize(self):
@test_parallel(num_threads=2)
def run(dti):
dti.normalize()
+
run(self.dti)
def time_datetime_to_period(self):
@test_parallel(num_threads=2)
def run(dti):
- dti.to_period('S')
+ dti.to_period("S")
+
run(self.dti)
def time_period_to_datetime(self):
@test_parallel(num_threads=2)
def run(period):
period.to_timestamp()
+
run(self.period)
class ParallelRolling:
- params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std']
- param_names = ['method']
+ params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"]
+ param_names = ["method"]
def setup(self, method):
if not have_real_test_parallel:
raise NotImplementedError
win = 100
arr = np.random.rand(100000)
- if hasattr(DataFrame, 'rolling'):
+ if hasattr(DataFrame, "rolling"):
df = DataFrame(arr).rolling(win)
@test_parallel(num_threads=2)
def parallel_rolling():
getattr(df, method)()
+
self.parallel_rolling = parallel_rolling
elif have_rolling_methods:
- rolling = {'median': rolling_median,
- 'mean': rolling_mean,
- 'min': rolling_min,
- 'max': rolling_max,
- 'var': rolling_var,
- 'skew': rolling_skew,
- 'kurt': rolling_kurt,
- 'std': rolling_std}
+ rolling = {
+ "median": rolling_median,
+ "mean": rolling_mean,
+ "min": rolling_min,
+ "max": rolling_max,
+ "var": rolling_var,
+ "skew": rolling_skew,
+ "kurt": rolling_kurt,
+ "std": rolling_std,
+ }
@test_parallel(num_threads=2)
def parallel_rolling():
rolling[method](arr, win)
+
self.parallel_rolling = parallel_rolling
else:
raise NotImplementedError
@@ -209,30 +235,34 @@ class ParallelReadCSV(BaseIO):
number = 1
repeat = 5
- params = ['float', 'object', 'datetime']
- param_names = ['dtype']
+ params = ["float", "object", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
if not have_real_test_parallel:
raise NotImplementedError
rows = 10000
cols = 50
- data = {'float': DataFrame(np.random.randn(rows, cols)),
- 'datetime': DataFrame(np.random.randn(rows, cols),
- index=date_range('1/1/2000',
- periods=rows)),
- 'object': DataFrame('foo',
- index=range(rows),
- columns=['object%03d'.format(i)
- for i in range(5)])}
-
- self.fname = '__test_{}__.csv'.format(dtype)
+ data = {
+ "float": DataFrame(np.random.randn(rows, cols)),
+ "datetime": DataFrame(
+ np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
+ ),
+ "object": DataFrame(
+ "foo",
+ index=range(rows),
+ columns=["object%03d".format(i) for i in range(5)],
+ ),
+ }
+
+ self.fname = "__test_{}__.csv".format(dtype)
df = data[dtype]
df.to_csv(self.fname)
@test_parallel(num_threads=2)
def parallel_read_csv():
read_csv(self.fname)
+
self.parallel_read_csv = parallel_read_csv
def time_read_csv(self, dtype):
@@ -244,7 +274,7 @@ class ParallelFactorize:
number = 1
repeat = 5
params = [2, 4, 8]
- param_names = ['threads']
+ param_names = ["threads"]
def setup(self, threads):
if not have_real_test_parallel:
@@ -255,10 +285,12 @@ def setup(self, threads):
@test_parallel(num_threads=threads)
def parallel():
factorize(strings)
+
self.parallel = parallel
def loop():
factorize(strings)
+
self.loop = loop
def time_parallel(self, threads):
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 3097ada6d2022..39b07d4734399 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -5,18 +5,55 @@
import numpy as np
from pandas import (
- Categorical, DataFrame, MultiIndex, Series, Timestamp,
- date_range, period_range)
+ Categorical,
+ DataFrame,
+ MultiIndex,
+ Series,
+ Timestamp,
+ date_range,
+ period_range,
+)
import pandas.util.testing as tm
method_blacklist = {
- 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
- 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
- 'var', 'mad', 'describe', 'std', 'quantile'},
- 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
- 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
- 'std'}
+ "object": {
+ "median",
+ "prod",
+ "sem",
+ "cumsum",
+ "sum",
+ "cummin",
+ "mean",
+ "max",
+ "skew",
+ "cumprod",
+ "cummax",
+ "rank",
+ "pct_change",
+ "min",
+ "var",
+ "mad",
+ "describe",
+ "std",
+ "quantile",
+ },
+ "datetime": {
+ "median",
+ "prod",
+ "sem",
+ "cumsum",
+ "sum",
+ "mean",
+ "skew",
+ "cumprod",
+ "cummax",
+ "pct_change",
+ "var",
+ "mad",
+ "describe",
+ "std",
+ },
}
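+# GroupByMethods.setup() consults this mapping to skip method/dtype combinations
+# that are not supported for the given column dtype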
@@ -26,28 +63,31 @@ def setup(self):
self.data = Series(np.random.randn(len(self.labels)))
def time_groupby_apply_dict_return(self):
- self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0],
- 'last': x.values[-1]})
+ self.data.groupby(self.labels).apply(
+ lambda x: {"first": x.values[0], "last": x.values[-1]}
+ )
class Apply:
-
def setup_cache(self):
- N = 10**4
+ N = 10 ** 4
labels = np.random.randint(0, 2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
- df = DataFrame({'key': labels,
- 'key2': labels2,
- 'value1': np.random.randn(N),
- 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)
- })
+ df = DataFrame(
+ {
+ "key": labels,
+ "key2": labels2,
+ "value1": np.random.randn(N),
+ "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
+ }
+ )
return df
def time_scalar_function_multi_col(self, df):
- df.groupby(['key', 'key2']).apply(lambda x: 1)
+ df.groupby(["key", "key2"]).apply(lambda x: 1)
def time_scalar_function_single_col(self, df):
- df.groupby('key').apply(lambda x: 1)
+ df.groupby("key").apply(lambda x: 1)
@staticmethod
def df_copy_function(g):
@@ -56,27 +96,29 @@ def df_copy_function(g):
return g.copy()
def time_copy_function_multi_col(self, df):
- df.groupby(['key', 'key2']).apply(self.df_copy_function)
+ df.groupby(["key", "key2"]).apply(self.df_copy_function)
def time_copy_overhead_single_col(self, df):
- df.groupby('key').apply(self.df_copy_function)
+ df.groupby("key").apply(self.df_copy_function)
class Groups:
- param_names = ['key']
- params = ['int64_small', 'int64_large', 'object_small', 'object_large']
+ param_names = ["key"]
+ params = ["int64_small", "int64_large", "object_small", "object_large"]
def setup_cache(self):
- size = 10**6
- data = {'int64_small': Series(np.random.randint(0, 100, size=size)),
- 'int64_large': Series(np.random.randint(0, 10000, size=size)),
- 'object_small': Series(
- tm.makeStringIndex(100).take(
- np.random.randint(0, 100, size=size))),
- 'object_large': Series(
- tm.makeStringIndex(10000).take(
- np.random.randint(0, 10000, size=size)))}
+ size = 10 ** 6
+ data = {
+ "int64_small": Series(np.random.randint(0, 100, size=size)),
+ "int64_large": Series(np.random.randint(0, 10000, size=size)),
+ "object_small": Series(
+ tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))
+ ),
+ "object_large": Series(
+ tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))
+ ),
+ }
return data
def setup(self, data, key):
@@ -89,7 +131,7 @@ def time_series_groups(self, data, key):
class GroupManyLabels:
params = [1, 1000]
- param_names = ['ncols']
+ param_names = ["ncols"]
def setup(self, ncols):
N = 1000
@@ -103,46 +145,45 @@ def time_sum(self, ncols):
class Nth:
- param_names = ['dtype']
- params = ['float32', 'float64', 'datetime', 'object']
+ param_names = ["dtype"]
+ params = ["float32", "float64", "datetime", "object"]
def setup(self, dtype):
- N = 10**5
+ N = 10 ** 5
# with datetimes (GH7555)
- if dtype == 'datetime':
- values = date_range('1/1/2011', periods=N, freq='s')
- elif dtype == 'object':
- values = ['foo'] * N
+ if dtype == "datetime":
+ values = date_range("1/1/2011", periods=N, freq="s")
+ elif dtype == "object":
+ values = ["foo"] * N
else:
values = np.arange(N).astype(dtype)
key = np.arange(N)
- self.df = DataFrame({'key': key, 'values': values})
+ self.df = DataFrame({"key": key, "values": values})
self.df.iloc[1, 1] = np.nan # insert missing data
def time_frame_nth_any(self, dtype):
- self.df.groupby('key').nth(0, dropna='any')
+ self.df.groupby("key").nth(0, dropna="any")
def time_groupby_nth_all(self, dtype):
- self.df.groupby('key').nth(0, dropna='all')
+ self.df.groupby("key").nth(0, dropna="all")
def time_frame_nth(self, dtype):
- self.df.groupby('key').nth(0)
+ self.df.groupby("key").nth(0)
def time_series_nth_any(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
+ self.df["values"].groupby(self.df["key"]).nth(0, dropna="any")
def time_series_nth_all(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
+ self.df["values"].groupby(self.df["key"]).nth(0, dropna="all")
def time_series_nth(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0)
+ self.df["values"].groupby(self.df["key"]).nth(0)
class DateAttributes:
-
def setup(self):
- rng = date_range('1/1/2000', '12/31/2005', freq='H')
+ rng = date_range("1/1/2000", "12/31/2005", freq="H")
self.year, self.month, self.day = rng.year, rng.month, rng.day
self.ts = Series(np.random.randn(len(rng)), index=rng)
@@ -151,154 +192,167 @@ def time_len_groupby_object(self):
class Int64:
-
def setup(self):
arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5))
i = np.random.choice(len(arr), len(arr) * 5)
arr = np.vstack((arr, arr[i]))
i = np.random.permutation(len(arr))
arr = arr[i]
- self.cols = list('abcde')
+ self.cols = list("abcde")
self.df = DataFrame(arr, columns=self.cols)
- self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10
+ self.df["jim"], self.df["joe"] = np.random.randn(2, len(self.df)) * 10
def time_overflow(self):
self.df.groupby(self.cols).max()
class CountMultiDtype:
-
def setup_cache(self):
n = 10000
- offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
- dates = np.datetime64('now') + offsets
- dates[np.random.rand(n) > 0.5] = np.datetime64('nat')
- offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat')
+ offsets = np.random.randint(n, size=n).astype("timedelta64[ns]")
+ dates = np.datetime64("now") + offsets
+ dates[np.random.rand(n) > 0.5] = np.datetime64("nat")
+ offsets[np.random.rand(n) > 0.5] = np.timedelta64("nat")
value2 = np.random.randn(n)
value2[np.random.rand(n) > 0.5] = np.nan
- obj = np.random.choice(list('ab'), size=n).astype(object)
+ obj = np.random.choice(list("ab"), size=n).astype(object)
obj[np.random.randn(n) > 0.5] = np.nan
- df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'dates': dates,
- 'value2': value2,
- 'value3': np.random.randn(n),
- 'ints': np.random.randint(0, 1000, size=n),
- 'obj': obj,
- 'offsets': offsets})
+ df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "dates": dates,
+ "value2": value2,
+ "value3": np.random.randn(n),
+ "ints": np.random.randint(0, 1000, size=n),
+ "obj": obj,
+ "offsets": offsets,
+ }
+ )
return df
def time_multi_count(self, df):
- df.groupby(['key1', 'key2']).count()
+ df.groupby(["key1", "key2"]).count()
class CountMultiInt:
-
def setup_cache(self):
n = 10000
- df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'ints': np.random.randint(0, 1000, size=n),
- 'ints2': np.random.randint(0, 1000, size=n)})
+ df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "ints": np.random.randint(0, 1000, size=n),
+ "ints2": np.random.randint(0, 1000, size=n),
+ }
+ )
return df
def time_multi_int_count(self, df):
- df.groupby(['key1', 'key2']).count()
+ df.groupby(["key1", "key2"]).count()
def time_multi_int_nunique(self, df):
- df.groupby(['key1', 'key2']).nunique()
+ df.groupby(["key1", "key2"]).nunique()
class AggFunctions:
-
def setup_cache(self):
- N = 10**5
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
- df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)),
- 'key2': fac2.take(np.random.randint(0, 2, size=N)),
- 'value1': np.random.randn(N),
- 'value2': np.random.randn(N),
- 'value3': np.random.randn(N)})
+ N = 10 ** 5
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
+ df = DataFrame(
+ {
+ "key1": fac1.take(np.random.randint(0, 3, size=N)),
+ "key2": fac2.take(np.random.randint(0, 2, size=N)),
+ "value1": np.random.randn(N),
+ "value2": np.random.randn(N),
+ "value3": np.random.randn(N),
+ }
+ )
return df
def time_different_str_functions(self, df):
- df.groupby(['key1', 'key2']).agg({'value1': 'mean',
- 'value2': 'var',
- 'value3': 'sum'})
+ df.groupby(["key1", "key2"]).agg(
+ {"value1": "mean", "value2": "var", "value3": "sum"}
+ )
def time_different_numpy_functions(self, df):
- df.groupby(['key1', 'key2']).agg({'value1': np.mean,
- 'value2': np.var,
- 'value3': np.sum})
+ df.groupby(["key1", "key2"]).agg(
+ {"value1": np.mean, "value2": np.var, "value3": np.sum}
+ )
def time_different_python_functions_multicol(self, df):
- df.groupby(['key1', 'key2']).agg([sum, min, max])
+ df.groupby(["key1", "key2"]).agg([sum, min, max])
def time_different_python_functions_singlecol(self, df):
- df.groupby('key1').agg([sum, min, max])
+ df.groupby("key1").agg([sum, min, max])
class GroupStrings:
-
def setup(self):
- n = 2 * 10**5
- alpha = list(map(''.join, product(ascii_letters, repeat=4)))
+ n = 2 * 10 ** 5
+ alpha = list(map("".join, product(ascii_letters, repeat=4)))
data = np.random.choice(alpha, (n // 5, 4), replace=False)
data = np.repeat(data, 5, axis=0)
- self.df = DataFrame(data, columns=list('abcd'))
- self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
+ self.df = DataFrame(data, columns=list("abcd"))
+ self.df["joe"] = (np.random.randn(len(self.df)) * 10).round(3)
self.df = self.df.sample(frac=1).reset_index(drop=True)
def time_multi_columns(self):
- self.df.groupby(list('abcd')).max()
+ self.df.groupby(list("abcd")).max()
class MultiColumn:
-
def setup_cache(self):
- N = 10**5
+ N = 10 ** 5
key1 = np.tile(np.arange(100, dtype=object), 1000)
key2 = key1.copy()
np.random.shuffle(key1)
np.random.shuffle(key2)
- df = DataFrame({'key1': key1,
- 'key2': key2,
- 'data1': np.random.randn(N),
- 'data2': np.random.randn(N)})
+ df = DataFrame(
+ {
+ "key1": key1,
+ "key2": key2,
+ "data1": np.random.randn(N),
+ "data2": np.random.randn(N),
+ }
+ )
return df
def time_lambda_sum(self, df):
- df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())
+ df.groupby(["key1", "key2"]).agg(lambda x: x.values.sum())
def time_cython_sum(self, df):
- df.groupby(['key1', 'key2']).sum()
+ df.groupby(["key1", "key2"]).sum()
def time_col_select_lambda_sum(self, df):
- df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())
+ df.groupby(["key1", "key2"])["data1"].agg(lambda x: x.values.sum())
def time_col_select_numpy_sum(self, df):
- df.groupby(['key1', 'key2'])['data1'].agg(np.sum)
+ df.groupby(["key1", "key2"])["data1"].agg(np.sum)
class Size:
-
def setup(self):
- n = 10**5
- offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
- dates = np.datetime64('now') + offsets
- self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'value1': np.random.randn(n),
- 'value2': np.random.randn(n),
- 'value3': np.random.randn(n),
- 'dates': dates})
+ n = 10 ** 5
+ offsets = np.random.randint(n, size=n).astype("timedelta64[ns]")
+ dates = np.datetime64("now") + offsets
+ self.df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "value1": np.random.randn(n),
+ "value2": np.random.randn(n),
+ "value3": np.random.randn(n),
+ "dates": dates,
+ }
+ )
self.draws = Series(np.random.randn(n))
- labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
- self.cats = labels.astype('category')
+ labels = Series(["foo", "bar", "baz", "qux"] * (n // 4))
+ self.cats = labels.astype("category")
def time_multi_size(self):
- self.df.groupby(['key1', 'key2']).size()
+ self.df.groupby(["key1", "key2"]).size()
def time_category_size(self):
self.draws.groupby(self.cats).size()
@@ -306,15 +360,47 @@ def time_category_size(self):
class GroupByMethods:
- param_names = ['dtype', 'method', 'application']
- params = [['int', 'float', 'object', 'datetime'],
- ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
- 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
- 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
- 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift',
- 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts',
- 'var'],
- ['direct', 'transformation']]
+ param_names = ["dtype", "method", "application"]
+ params = [
+ ["int", "float", "object", "datetime"],
+ [
+ "all",
+ "any",
+ "bfill",
+ "count",
+ "cumcount",
+ "cummax",
+ "cummin",
+ "cumprod",
+ "cumsum",
+ "describe",
+ "ffill",
+ "first",
+ "head",
+ "last",
+ "mad",
+ "max",
+ "min",
+ "median",
+ "mean",
+ "nunique",
+ "pct_change",
+ "prod",
+ "quantile",
+ "rank",
+ "sem",
+ "shift",
+ "size",
+ "skew",
+ "std",
+ "sum",
+ "tail",
+ "unique",
+ "value_counts",
+ "var",
+ ],
+ ["direct", "transformation"],
+ ]
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
@@ -323,29 +409,28 @@ def setup(self, dtype, method, application):
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
- if dtype == 'int':
+ if dtype == "int":
key = np.random.randint(0, size, size=size)
- elif dtype == 'float':
- key = np.concatenate([np.random.random(ngroups) * 0.1,
- np.random.random(ngroups) * 10.0])
- elif dtype == 'object':
- key = ['foo'] * size
- elif dtype == 'datetime':
- key = date_range('1/1/2011', periods=size, freq='s')
-
- df = DataFrame({'values': values, 'key': key})
-
- if application == 'transform':
- if method == 'describe':
+ elif dtype == "float":
+ key = np.concatenate(
+ [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]
+ )
+ elif dtype == "object":
+ key = ["foo"] * size
+ elif dtype == "datetime":
+ key = date_range("1/1/2011", periods=size, freq="s")
+
+ df = DataFrame({"values": values, "key": key})
+
+ if application == "transform":
+ if method == "describe":
raise NotImplementedError
- self.as_group_method = lambda: df.groupby(
- 'key')['values'].transform(method)
- self.as_field_method = lambda: df.groupby(
- 'values')['key'].transform(method)
+ self.as_group_method = lambda: df.groupby("key")["values"].transform(method)
+ self.as_field_method = lambda: df.groupby("values")["key"].transform(method)
else:
- self.as_group_method = getattr(df.groupby('key')['values'], method)
- self.as_field_method = getattr(df.groupby('values')['key'], method)
+ self.as_group_method = getattr(df.groupby("key")["values"], method)
+ self.as_field_method = getattr(df.groupby("values")["key"], method)
def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()
@@ -356,20 +441,22 @@ def time_dtype_as_field(self, dtype, method, application):
class RankWithTies:
# GH 21237
- param_names = ['dtype', 'tie_method']
- params = [['float64', 'float32', 'int64', 'datetime64'],
- ['first', 'average', 'dense', 'min', 'max']]
+ param_names = ["dtype", "tie_method"]
+ params = [
+ ["float64", "float32", "int64", "datetime64"],
+ ["first", "average", "dense", "min", "max"],
+ ]
def setup(self, dtype, tie_method):
- N = 10**4
- if dtype == 'datetime64':
+ N = 10 ** 4
+ if dtype == "datetime64":
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
else:
data = np.array([1] * N, dtype=dtype)
- self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+ self.df = DataFrame({"values": data, "key": ["foo"] * N})
def time_rank_ties(self, dtype, tie_method):
- self.df.groupby('key').rank(method=tie_method)
+ self.df.groupby("key").rank(method=tie_method)
class Float32:
@@ -382,57 +469,61 @@ def setup(self):
self.df = DataFrame(dict(a=arr, b=arr))
def time_sum(self):
- self.df.groupby(['a'])['b'].sum()
+ self.df.groupby(["a"])["b"].sum()
class Categories:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
arr = np.random.random(N)
- data = {'a': Categorical(np.random.randint(10000, size=N)),
- 'b': arr}
+ data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr}
self.df = DataFrame(data)
- data = {'a': Categorical(np.random.randint(10000, size=N),
- ordered=True),
- 'b': arr}
+ data = {
+ "a": Categorical(np.random.randint(10000, size=N), ordered=True),
+ "b": arr,
+ }
self.df_ordered = DataFrame(data)
- data = {'a': Categorical(np.random.randint(100, size=N),
- categories=np.arange(10000)),
- 'b': arr}
+ data = {
+ "a": Categorical(
+ np.random.randint(100, size=N), categories=np.arange(10000)
+ ),
+ "b": arr,
+ }
self.df_extra_cat = DataFrame(data)
def time_groupby_sort(self):
- self.df.groupby('a')['b'].count()
+ self.df.groupby("a")["b"].count()
def time_groupby_nosort(self):
- self.df.groupby('a', sort=False)['b'].count()
+ self.df.groupby("a", sort=False)["b"].count()
def time_groupby_ordered_sort(self):
- self.df_ordered.groupby('a')['b'].count()
+ self.df_ordered.groupby("a")["b"].count()
def time_groupby_ordered_nosort(self):
- self.df_ordered.groupby('a', sort=False)['b'].count()
+ self.df_ordered.groupby("a", sort=False)["b"].count()
def time_groupby_extra_cat_sort(self):
- self.df_extra_cat.groupby('a')['b'].count()
+ self.df_extra_cat.groupby("a")["b"].count()
def time_groupby_extra_cat_nosort(self):
- self.df_extra_cat.groupby('a', sort=False)['b'].count()
+ self.df_extra_cat.groupby("a", sort=False)["b"].count()
class Datelike:
# GH 14338
- params = ['period_range', 'date_range', 'date_range_tz']
- param_names = ['grouper']
+ params = ["period_range", "date_range", "date_range_tz"]
+ param_names = ["grouper"]
def setup(self, grouper):
- N = 10**4
- rng_map = {'period_range': period_range,
- 'date_range': date_range,
- 'date_range_tz': partial(date_range, tz='US/Central')}
- self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N)
- self.df = DataFrame(np.random.randn(10**4, 2))
+ N = 10 ** 4
+ rng_map = {
+ "period_range": period_range,
+ "date_range": date_range,
+ "date_range_tz": partial(date_range, tz="US/Central"),
+ }
+ self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N)
+ self.df = DataFrame(np.random.randn(10 ** 4, 2))
def time_sum(self, grouper):
self.df.groupby(self.grouper).sum()
@@ -442,11 +533,10 @@ class SumBools:
# GH 2692
def setup(self):
N = 500
- self.df = DataFrame({'ii': range(N),
- 'bb': [True] * N})
+ self.df = DataFrame({"ii": range(N), "bb": [True] * N})
def time_groupby_sum_booleans(self):
- self.df.groupby('ii').sum()
+ self.df.groupby("ii").sum()
class SumMultiLevel:
@@ -455,84 +545,85 @@ class SumMultiLevel:
def setup(self):
N = 50
- self.df = DataFrame({'A': list(range(N)) * 2,
- 'B': range(N * 2),
- 'C': 1}).set_index(['A', 'B'])
+ self.df = DataFrame(
+ {"A": list(range(N)) * 2, "B": range(N * 2), "C": 1}
+ ).set_index(["A", "B"])
def time_groupby_sum_multiindex(self):
self.df.groupby(level=[0, 1]).sum()
class Transform:
-
def setup(self):
n1 = 400
n2 = 250
- index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
- codes=[np.repeat(range(n1), n2).tolist(),
- list(range(n2)) * n1],
- names=['lev1', 'lev2'])
+ index = MultiIndex(
+ levels=[np.arange(n1), tm.makeStringIndex(n2)],
+ codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1],
+ names=["lev1", "lev2"],
+ )
arr = np.random.randn(n1 * n2, 3)
arr[::10000, 0] = np.nan
arr[1::10000, 1] = np.nan
arr[2::10000, 2] = np.nan
- data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
+ data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"])
self.df = data
n = 20000
- self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
- columns=['jim', 'joe', 'jolie'])
+ self.df1 = DataFrame(
+ np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"]
+ )
self.df2 = self.df1.copy()
- self.df2['jim'] = self.df2['joe']
+ self.df2["jim"] = self.df2["joe"]
- self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)),
- columns=['jim', 'joe', 'jolie'])
+ self.df3 = DataFrame(
+ np.random.randint(1, (n / 10), (n, 3)), columns=["jim", "joe", "jolie"]
+ )
self.df4 = self.df3.copy()
- self.df4['jim'] = self.df4['joe']
+ self.df4["jim"] = self.df4["joe"]
def time_transform_lambda_max(self):
- self.df.groupby(level='lev1').transform(lambda x: max(x))
+ self.df.groupby(level="lev1").transform(lambda x: max(x))
def time_transform_ufunc_max(self):
- self.df.groupby(level='lev1').transform(np.max)
+ self.df.groupby(level="lev1").transform(np.max)
def time_transform_multi_key1(self):
- self.df1.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df1.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key2(self):
- self.df2.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df2.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key3(self):
- self.df3.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df3.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key4(self):
- self.df4.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df4.groupby(["jim", "joe"])["jolie"].transform("max")
class TransformBools:
-
def setup(self):
N = 120000
transition_points = np.sort(np.random.choice(np.arange(N), 1400))
transitions = np.zeros(N, dtype=np.bool)
transitions[transition_points] = True
self.g = transitions.cumsum()
- self.df = DataFrame({'signal': np.random.rand(N)})
+ self.df = DataFrame({"signal": np.random.rand(N)})
def time_transform_mean(self):
- self.df['signal'].groupby(self.g).transform(np.mean)
+ self.df["signal"].groupby(self.g).transform(np.mean)
class TransformNaN:
# GH 12737
def setup(self):
- self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10),
- 'B': np.nan,
- 'C': np.nan})
- self.df_nans.loc[4::10, 'B':'C'] = 5
+ self.df_nans = DataFrame(
+ {"key": np.repeat(np.arange(1000), 10), "B": np.nan, "C": np.nan}
+ )
+ self.df_nans.loc[4::10, "B":"C"] = 5
def time_first(self):
- self.df_nans.groupby('key').transform('first')
+ self.df_nans.groupby("key").transform("first")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py
new file mode 100644
index 0000000000000..13b33855569c9
--- /dev/null
+++ b/asv_bench/benchmarks/index_cached_properties.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+
+class IndexCache:
+ number = 1
+ repeat = (3, 100, 20)
+
+ params = [
+ [
+ "DatetimeIndex",
+ "Float64Index",
+ "IntervalIndex",
+ "Int64Index",
+ "MultiIndex",
+ "PeriodIndex",
+ "RangeIndex",
+ "TimedeltaIndex",
+ "UInt64Index",
+ ]
+ ]
+ param_names = ["index_type"]
+
+ def setup(self, index_type):
+ N = 10 ** 5
+ if index_type == "MultiIndex":
+ self.idx = pd.MultiIndex.from_product(
+ [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]]
+ )
+ elif index_type == "DatetimeIndex":
+ self.idx = pd.date_range("1/1/2000", freq="T", periods=N)
+ elif index_type == "Int64Index":
+ self.idx = pd.Index(range(N))
+ elif index_type == "PeriodIndex":
+ self.idx = pd.period_range("1/1/2000", freq="T", periods=N)
+ elif index_type == "RangeIndex":
+ self.idx = pd.RangeIndex(start=0, stop=N)
+ elif index_type == "IntervalIndex":
+ self.idx = pd.IntervalIndex.from_arrays(range(N), range(1, N + 1))
+ elif index_type == "TimedeltaIndex":
+ self.idx = pd.TimedeltaIndex(range(N))
+ elif index_type == "Float64Index":
+ self.idx = pd.Float64Index(range(N))
+ elif index_type == "UInt64Index":
+ self.idx = pd.UInt64Index(range(N))
+ else:
+ raise ValueError
+ assert len(self.idx) == N
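+        # wipe any cached attributes so the timed property lookups recompute their
+        # values instead of returning a cached result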
+ self.idx._cache = {}
+
+ def time_values(self, index_type):
+ self.idx._values
+
+ def time_shape(self, index_type):
+ self.idx.shape
+
+ def time_is_monotonic(self, index_type):
+ self.idx.is_monotonic
+
+ def time_is_monotonic_decreasing(self, index_type):
+ self.idx.is_monotonic_decreasing
+
+ def time_is_monotonic_increasing(self, index_type):
+ self.idx.is_monotonic_increasing
+
+ def time_is_unique(self, index_type):
+ self.idx.is_unique
+
+ def time_engine(self, index_type):
+ self.idx._engine
+
+ def time_inferred_type(self, index_type):
+ self.idx.inferred_type
+
+ def time_is_all_dates(self, index_type):
+ self.idx.is_all_dates
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index 1eedc1a2b3021..49834ae94cc38 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -1,38 +1,48 @@
+import gc
import numpy as np
import pandas.util.testing as tm
-from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex,
- Float64Index, IntervalIndex)
+from pandas import (
+ Series,
+ date_range,
+ DatetimeIndex,
+ Index,
+ RangeIndex,
+ Float64Index,
+ IntervalIndex,
+)
class SetOperations:
- params = (['datetime', 'date_string', 'int', 'strings'],
- ['intersection', 'union', 'symmetric_difference'])
- param_names = ['dtype', 'method']
+ params = (
+ ["datetime", "date_string", "int", "strings"],
+ ["intersection", "union", "symmetric_difference"],
+ )
+ param_names = ["dtype", "method"]
def setup(self, dtype, method):
- N = 10**5
- dates_left = date_range('1/1/2000', periods=N, freq='T')
- fmt = '%Y-%m-%d %H:%M:%S'
+ N = 10 ** 5
+ dates_left = date_range("1/1/2000", periods=N, freq="T")
+ fmt = "%Y-%m-%d %H:%M:%S"
date_str_left = Index(dates_left.strftime(fmt))
int_left = Index(np.arange(N))
str_left = tm.makeStringIndex(N)
- data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]},
- 'date_string': {'left': date_str_left,
- 'right': date_str_left[:-1]},
- 'int': {'left': int_left, 'right': int_left[:-1]},
- 'strings': {'left': str_left, 'right': str_left[:-1]}}
- self.left = data[dtype]['left']
- self.right = data[dtype]['right']
+ data = {
+ "datetime": {"left": dates_left, "right": dates_left[:-1]},
+ "date_string": {"left": date_str_left, "right": date_str_left[:-1]},
+ "int": {"left": int_left, "right": int_left[:-1]},
+ "strings": {"left": str_left, "right": str_left[:-1]},
+ }
+ self.left = data[dtype]["left"]
+ self.right = data[dtype]["right"]
def time_operation(self, dtype, method):
getattr(self.left, method)(self.right)
class SetDisjoint:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
B = N + 20000
self.datetime_left = DatetimeIndex(range(N))
self.datetime_right = DatetimeIndex(range(N, B))
@@ -42,9 +52,8 @@ def time_datetime_difference_disjoint(self):
class Datetime:
-
def setup(self):
- self.dr = date_range('20000101', freq='D', periods=10000)
+ self.dr = date_range("20000101", freq="D", periods=10000)
def time_is_dates_only(self):
self.dr._is_dates_only
@@ -52,12 +61,12 @@ def time_is_dates_only(self):
class Ops:
- params = ['float', 'int']
- param_names = ['dtype']
+ params = ["float", "int"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'}
+ N = 10 ** 6
+ indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
self.index = getattr(tm, indexes[dtype])(N)
def time_add(self, dtype):
@@ -77,10 +86,9 @@ def time_modulo(self, dtype):
class Range:
-
def setup(self):
- self.idx_inc = RangeIndex(start=0, stop=10**7, step=3)
- self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3)
+ self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)
+ self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3)
def time_max(self):
self.idx_inc.max()
@@ -102,7 +110,6 @@ def time_get_loc_dec(self):
class IndexAppend:
-
def setup(self):
N = 10000
@@ -132,19 +139,20 @@ def time_append_obj_list(self):
class Indexing:
- params = ['String', 'Float', 'Int']
- param_names = ['dtype']
+ params = ["String", "Float", "Int"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- self.idx = getattr(tm, 'make{}Index'.format(dtype))(N)
+ N = 10 ** 6
+ self.idx = getattr(tm, "make{}Index".format(dtype))(N)
self.array_mask = (np.arange(N) % 3) == 0
self.series_mask = Series(self.array_mask)
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
- self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half])
- .sort_values())
+ self.non_unique_sorted = (
+ self.sorted[:half].append(self.sorted[:half]).sort_values()
+ )
self.key = self.sorted[N // 4]
def time_boolean_array(self, dtype):
@@ -188,7 +196,7 @@ def time_get_loc(self):
class IntervalIndexMethod:
# GH 24813
- params = [10**3, 10**5]
+ params = [10 ** 3, 10 ** 5]
def setup(self, N):
left = np.append(np.arange(N), np.array(0))
@@ -218,4 +226,21 @@ def time_intersection_both_duplicate(self, N):
self.intv.intersection(self.intv2)
+class GC:
+ params = [1, 2, 5]
+
+ def create_use_drop(self):
+ idx = Index(list(range(1000 * 1000)))
+ idx._engine
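+        # accessing _engine forces the lazily built index engine to be created
+        # before idx goes out of scope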
+
+ def peakmem_gc_instances(self, N):
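+        # asv's peakmem_ prefix reports peak process memory; with gc disabled,
+        # objects reachable only through reference cycles are not reclaimed until
+        # gc is re-enabled, so the figure grows with N if dropped indexes leave cycles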
+ try:
+ gc.disable()
+
+ for _ in range(N):
+ self.create_use_drop()
+ finally:
+ gc.enable()
+
+
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 4c932cf3600e8..84604b8196536 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -2,26 +2,38 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (Series, DataFrame, MultiIndex,
- Int64Index, UInt64Index, Float64Index,
- IntervalIndex, CategoricalIndex,
- IndexSlice, concat, date_range)
+from pandas import (
+ Series,
+ DataFrame,
+ MultiIndex,
+ Int64Index,
+ UInt64Index,
+ Float64Index,
+ IntervalIndex,
+ CategoricalIndex,
+ IndexSlice,
+ concat,
+ date_range,
+ option_context,
+ period_range,
+)
class NumericSeriesIndexing:
params = [
(Int64Index, UInt64Index, Float64Index),
- ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+ ("unique_monotonic_inc", "nonunique_monotonic_inc"),
]
- param_names = ['index_dtype', 'index_structure']
+ param_names = ["index_dtype", "index_structure"]
def setup(self, index, index_structure):
- N = 10**6
+ N = 10 ** 6
indices = {
- 'unique_monotonic_inc': index(range(N)),
- 'nonunique_monotonic_inc': index(
- list(range(55)) + [54] + list(range(55, N - 1))),
+ "unique_monotonic_inc": index(range(N)),
+ "nonunique_monotonic_inc": index(
+ list(range(55)) + [54] + list(range(55, N - 1))
+ ),
}
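+        # the "nonunique" variant repeats the value 54, so it stays monotonic
+        # increasing while no longer being unique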
self.data = Series(np.random.rand(N), index=indices[index_structure])
self.array = np.arange(10000)
@@ -55,16 +67,20 @@ def time_iloc_slice(self, index, index_structure):
self.data.iloc[:800000]
def time_ix_array(self, index, index_structure):
- self.data.ix[self.array]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[self.array]
def time_ix_list_like(self, index, index_structure):
- self.data.ix[[800000]]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[[800000]]
def time_ix_scalar(self, index, index_structure):
- self.data.ix[800000]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[800000]
def time_ix_slice(self, index, index_structure):
- self.data.ix[:800000]
+ with warnings.catch_warnings(record=True):
+ self.data.ix[:800000]
def time_loc_array(self, index, index_structure):
self.data.loc[self.array]
@@ -82,31 +98,37 @@ def time_loc_slice(self, index, index_structure):
class NonNumericSeriesIndexing:
params = [
- ('string', 'datetime'),
- ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+ ("string", "datetime", "period"),
+ ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"),
]
- param_names = ['index_dtype', 'index_structure']
+ param_names = ["index_dtype", "index_structure"]
def setup(self, index, index_structure):
- N = 10**6
- indexes = {'string': tm.makeStringIndex(N),
- 'datetime': date_range('1900', periods=N, freq='s')}
- index = indexes[index]
- if index_structure == 'nonunique_monotonic_inc':
+ N = 10 ** 6
+ if index == "string":
+ index = tm.makeStringIndex(N)
+ elif index == "datetime":
+ index = date_range("1900", periods=N, freq="s")
+ elif index == "period":
+ index = period_range("1900", periods=N, freq="s")
+ index = index.sort_values()
+ assert index.is_unique and index.is_monotonic_increasing
+ if index_structure == "nonunique_monotonic_inc":
index = index.insert(item=index[2], loc=2)[:-1]
+ elif index_structure == "non_monotonic":
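+            # even- then odd-position labels: same elements, no longer monotonic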
+ index = index[::2].append(index[1::2])
+ assert len(index) == N
self.s = Series(np.random.rand(N), index=index)
self.lbl = index[80000]
+ # warm up index mapping
+ self.s[self.lbl]
def time_getitem_label_slice(self, index, index_structure):
- self.s[:self.lbl]
+ self.s[: self.lbl]
def time_getitem_pos_slice(self, index, index_structure):
self.s[:80000]
- def time_get_value(self, index, index_structure):
- with warnings.catch_warnings(record=True):
- self.s.get_value(self.lbl)
-
def time_getitem_scalar(self, index, index_structure):
self.s[self.lbl]
@@ -115,23 +137,19 @@ def time_getitem_list_like(self, index, index_structure):
class DataFrameStringIndexing:
-
def setup(self):
index = tm.makeStringIndex(1000)
columns = tm.makeStringIndex(30)
- self.df = DataFrame(np.random.randn(1000, 30), index=index,
- columns=columns)
+ with warnings.catch_warnings(record=True):
+ self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
self.idx_scalar = index[100]
self.col_scalar = columns[10]
self.bool_indexer = self.df[self.col_scalar] > 0
self.bool_obj_indexer = self.bool_indexer.astype(object)
- def time_get_value(self):
- with warnings.catch_warnings(record=True):
- self.df.get_value(self.idx_scalar, self.col_scalar)
-
def time_ix(self):
- self.df.ix[self.idx_scalar, self.col_scalar]
+ with warnings.catch_warnings(record=True):
+ self.df.ix[self.idx_scalar, self.col_scalar]
def time_loc(self):
self.df.loc[self.idx_scalar, self.col_scalar]
@@ -147,7 +165,6 @@ def time_boolean_rows_object(self):
class DataFrameNumericIndexing:
-
def setup(self):
self.idx_dupe = np.array(range(30)) * 99
self.df = DataFrame(np.random.randn(10000, 5))
@@ -172,13 +189,15 @@ def time_bool_indexer(self):
class Take:
- params = ['int', 'datetime']
- param_names = ['index']
+ params = ["int", "datetime"]
+ param_names = ["index"]
def setup(self, index):
N = 100000
- indexes = {'int': Int64Index(np.arange(N)),
- 'datetime': date_range('2011-01-01', freq='S', periods=N)}
+ indexes = {
+ "int": Int64Index(np.arange(N)),
+ "datetime": date_range("2011-01-01", freq="S", periods=N),
+ }
index = indexes[index]
self.s = Series(np.random.rand(N), index=index)
self.indexer = [True, False, True, True, False] * 20000
@@ -188,35 +207,39 @@ def time_take(self, index):
class MultiIndexing:
-
def setup(self):
mi = MultiIndex.from_product([range(1000), range(1000)])
self.s = Series(np.random.randn(1000000), index=mi)
self.df = DataFrame(self.s)
n = 100000
- self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000),
- n),
- 'B': np.random.choice(range(10, 400), n),
- 'C': np.random.choice(range(1, 150), n),
- 'D': np.random.choice(range(10000, 45000), n),
- 'x': np.random.choice(range(400), n),
- 'y': np.random.choice(range(25), n)})
+ with warnings.catch_warnings(record=True):
+ self.mdt = DataFrame(
+ {
+ "A": np.random.choice(range(10000, 45000, 1000), n),
+ "B": np.random.choice(range(10, 400), n),
+ "C": np.random.choice(range(1, 150), n),
+ "D": np.random.choice(range(10000, 45000), n),
+ "x": np.random.choice(range(400), n),
+ "y": np.random.choice(range(25), n),
+ }
+ )
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
- self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index()
+ self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
def time_series_ix(self):
- self.s.ix[999]
+ with warnings.catch_warnings(record=True):
+ self.s.ix[999]
def time_frame_ix(self):
- self.df.ix[999]
+ with warnings.catch_warnings(record=True):
+ self.df.ix[999]
def time_index_slice(self):
self.mdt.loc[self.idx, :]
class IntervalIndexing:
-
def setup_cache(self):
idx = IntervalIndex.from_breaks(np.arange(1000001))
monotonic = Series(np.arange(1000000), index=idx)
@@ -237,29 +260,30 @@ def time_loc_list(self, monotonic):
class CategoricalIndexIndexing:
- params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
- param_names = ['index']
+ params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
+ param_names = ["index"]
def setup(self, index):
- N = 10**5
- values = list('a' * N + 'b' * N + 'c' * N)
+ N = 10 ** 5
+ values = list("a" * N + "b" * N + "c" * N)
indices = {
- 'monotonic_incr': CategoricalIndex(values),
- 'monotonic_decr': CategoricalIndex(reversed(values)),
- 'non_monotonic': CategoricalIndex(list('abc' * N))}
+ "monotonic_incr": CategoricalIndex(values),
+ "monotonic_decr": CategoricalIndex(reversed(values)),
+ "non_monotonic": CategoricalIndex(list("abc" * N)),
+ }
self.data = indices[index]
self.int_scalar = 10000
self.int_list = list(range(10000))
- self.cat_scalar = 'b'
- self.cat_list = ['a', 'c']
+ self.cat_scalar = "b"
+ self.cat_list = ["a", "c"]
def time_getitem_scalar(self, index):
self.data[self.int_scalar]
def time_getitem_slice(self, index):
- self.data[:self.int_scalar]
+ self.data[: self.int_scalar]
def time_getitem_list_like(self, index):
self.data[[self.int_scalar]]
@@ -278,7 +302,6 @@ def time_get_indexer_list(self, index):
class MethodLookup:
-
def setup_cache(self):
s = Series()
return s
@@ -287,47 +310,44 @@ def time_lookup_iloc(self, s):
s.iloc
def time_lookup_ix(self, s):
- s.ix
+ with warnings.catch_warnings(record=True):
+ s.ix
def time_lookup_loc(self, s):
s.loc
class GetItemSingleColumn:
-
def setup(self):
- self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A'])
+ self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=["A"])
self.df_int_col = DataFrame(np.random.randn(3000, 1))
def time_frame_getitem_single_column_label(self):
- self.df_string_col['A']
+ self.df_string_col["A"]
def time_frame_getitem_single_column_int(self):
self.df_int_col[0]
class AssignTimeseriesIndex:
-
def setup(self):
N = 100000
- idx = date_range('1/1/2000', periods=N, freq='H')
- self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx)
+ idx = date_range("1/1/2000", periods=N, freq="H")
+ self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx)
def time_frame_assign_timeseries_index(self):
- self.df['date'] = self.df.index
+ self.df["date"] = self.df.index
class InsertColumns:
-
def setup(self):
- self.N = 10**3
+ self.N = 10 ** 3
self.df = DataFrame(index=range(self.N))
def time_insert(self):
np.random.seed(1234)
for i in range(100):
- self.df.insert(0, i, np.random.randn(self.N),
- allow_duplicates=True)
+ self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True)
def time_assign_with_setitem(self):
np.random.seed(1234)
@@ -335,4 +355,20 @@ def time_assign_with_setitem(self):
self.df[i] = np.random.randn(self.N)
+class ChainIndexing:
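+    # Time a chained-assignment pattern (boolean filter, then df2["C"] = 1.0)
+    # with the mode.chained_assignment option set to None or "warn".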
+
+ params = [None, "warn"]
+ param_names = ["mode"]
+
+ def setup(self, mode):
+ self.N = 1000000
+
+ def time_chained_indexing(self, mode):
+ with warnings.catch_warnings(record=True):
+ with option_context("mode.chained_assignment", mode):
+ df = DataFrame({"A": np.arange(self.N), "B": "foo"})
+ df2 = df[df.A > self.N // 2]
+ df2["C"] = 1.0
+
+
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 5655701781846..44a22dfa77791 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -5,33 +5,40 @@
def _get_numeric_engines():
engine_names = [
- ('Int64Engine', np.int64), ('Int32Engine', np.int32),
- ('Int16Engine', np.int16), ('Int8Engine', np.int8),
- ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32),
- ('UInt16engine', np.uint16), ('UInt8Engine', np.uint8),
- ('Float64Engine', np.float64), ('Float32Engine', np.float32),
+ ("Int64Engine", np.int64),
+ ("Int32Engine", np.int32),
+ ("Int16Engine", np.int16),
+ ("Int8Engine", np.int8),
+ ("UInt64Engine", np.uint64),
+ ("UInt32Engine", np.uint32),
+ ("UInt16engine", np.uint16),
+ ("UInt8Engine", np.uint8),
+ ("Float64Engine", np.float64),
+ ("Float32Engine", np.float32),
+ ]
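+    # keep only the engine classes actually available in this pandas build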
+ return [
+ (getattr(libindex, engine_name), dtype)
+ for engine_name, dtype in engine_names
+ if hasattr(libindex, engine_name)
]
- return [(getattr(libindex, engine_name), dtype)
- for engine_name, dtype in engine_names
- if hasattr(libindex, engine_name)]
class NumericEngineIndexing:
- params = [_get_numeric_engines(),
- ['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
- ]
- param_names = ['engine_and_dtype', 'index_type']
+ params = [
+ _get_numeric_engines(),
+ ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+ ]
+ param_names = ["engine_and_dtype", "index_type"]
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype
- N = 10**5
+ N = 10 ** 5
values = list([1] * N + [2] * N + [3] * N)
arr = {
- 'monotonic_incr': np.array(values, dtype=dtype),
- 'monotonic_decr': np.array(list(reversed(values)),
- dtype=dtype),
- 'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
+ "monotonic_incr": np.array(values, dtype=dtype),
+ "monotonic_decr": np.array(list(reversed(values)), dtype=dtype),
+ "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype),
}[index_type]
self.data = engine(lambda: arr, len(arr))
@@ -44,21 +51,21 @@ def time_get_loc(self, engine_and_dtype, index_type):
class ObjectEngineIndexing:
- params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
- param_names = ['index_type']
+ params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]
+ param_names = ["index_type"]
def setup(self, index_type):
- N = 10**5
- values = list('a' * N + 'b' * N + 'c' * N)
+ N = 10 ** 5
+ values = list("a" * N + "b" * N + "c" * N)
arr = {
- 'monotonic_incr': np.array(values, dtype=object),
- 'monotonic_decr': np.array(list(reversed(values)), dtype=object),
- 'non_monotonic': np.array(list('abc') * N, dtype=object),
+ "monotonic_incr": np.array(values, dtype=object),
+ "monotonic_decr": np.array(list(reversed(values)), dtype=object),
+ "non_monotonic": np.array(list("abc") * N, dtype=object),
}[index_type]
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
        # code below avoids populating the mapping etc. while timing.
- self.data.get_loc('b')
+ self.data.get_loc("b")
def time_get_loc(self, index_type):
- self.data.get_loc('b')
+ self.data.get_loc("b")
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index 065c82207d251..66ef4f2aec380 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -8,56 +8,57 @@
class NumericInferOps:
# from GH 7332
params = numeric_dtypes
- param_names = ['dtype']
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 5 * 10**5
- self.df = DataFrame({'A': np.arange(N).astype(dtype),
- 'B': np.arange(N).astype(dtype)})
+ N = 5 * 10 ** 5
+ self.df = DataFrame(
+ {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)}
+ )
def time_add(self, dtype):
- self.df['A'] + self.df['B']
+ self.df["A"] + self.df["B"]
def time_subtract(self, dtype):
- self.df['A'] - self.df['B']
+ self.df["A"] - self.df["B"]
def time_multiply(self, dtype):
- self.df['A'] * self.df['B']
+ self.df["A"] * self.df["B"]
def time_divide(self, dtype):
- self.df['A'] / self.df['B']
+ self.df["A"] / self.df["B"]
def time_modulo(self, dtype):
- self.df['A'] % self.df['B']
+ self.df["A"] % self.df["B"]
class DateInferOps:
# from GH 7332
def setup_cache(self):
- N = 5 * 10**5
- df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')})
- df['timedelta'] = df['datetime64'] - df['datetime64']
+ N = 5 * 10 ** 5
+ df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")})
+ df["timedelta"] = df["datetime64"] - df["datetime64"]
return df
def time_subtract_datetimes(self, df):
- df['datetime64'] - df['datetime64']
+ df["datetime64"] - df["datetime64"]
def time_timedelta_plus_datetime(self, df):
- df['timedelta'] + df['datetime64']
+ df["timedelta"] + df["datetime64"]
def time_add_timedeltas(self, df):
- df['timedelta'] + df['timedelta']
+ df["timedelta"] + df["timedelta"]
class ToNumeric:
- params = ['ignore', 'coerce']
- param_names = ['errors']
+ params = ["ignore", "coerce"]
+ param_names = ["errors"]
def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
- self.numstr = self.float.astype('str')
+ self.numstr = self.float.astype("str")
self.str = Series(tm.makeStringIndex(N))
def time_from_float(self, errors):
@@ -72,21 +73,32 @@ def time_from_str(self, errors):
class ToNumericDowncast:
- param_names = ['dtype', 'downcast']
- params = [['string-float', 'string-int', 'string-nint', 'datetime64',
- 'int-list', 'int32'],
- [None, 'integer', 'signed', 'unsigned', 'float']]
+ param_names = ["dtype", "downcast"]
+ params = [
+ [
+ "string-float",
+ "string-int",
+ "string-nint",
+ "datetime64",
+ "int-list",
+ "int32",
+ ],
+ [None, "integer", "signed", "unsigned", "float"],
+ ]
N = 500000
N2 = int(N / 2)
- data_dict = {'string-int': ['1'] * N2 + [2] * N2,
- 'string-nint': ['-1'] * N2 + [2] * N2,
- 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
- dtype='datetime64[D]'), N),
- 'string-float': ['1.1'] * N2 + [2] * N2,
- 'int-list': [1] * N2 + [2] * N2,
- 'int32': np.repeat(np.int32(1), N)}
+ data_dict = {
+ "string-int": ["1"] * N2 + [2] * N2,
+ "string-nint": ["-1"] * N2 + [2] * N2,
+ "datetime64": np.repeat(
+ np.array(["1970-01-01", "1970-01-02"], dtype="datetime64[D]"), N
+ ),
+ "string-float": ["1.1"] * N2 + [2] * N2,
+ "int-list": [1] * N2 + [2] * N2,
+ "int32": np.repeat(np.int32(1), N),
+ }
def setup(self, dtype, downcast):
self.data = self.data_dict[dtype]
@@ -96,10 +108,9 @@ def time_downcast(self, dtype, downcast):
class MaybeConvertNumeric:
-
def setup_cache(self):
- N = 10**6
- arr = np.repeat([2**63], N) + np.arange(N).astype('uint64')
+ N = 10 ** 6
+ arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64")
data = arr.astype(object)
data[1::2] = arr[1::2].astype(str)
data[-1] = -1
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 6beb21883b5ab..4525e504fc4dd 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -4,7 +4,6 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
-from pandas.io.parsers import _parser_defaults
from io import StringIO
from ..pandas_vb_common import BaseIO
@@ -12,27 +11,31 @@
class ToCSV(BaseIO):
- fname = '__test__.csv'
- params = ['wide', 'long', 'mixed']
- param_names = ['kind']
+ fname = "__test__.csv"
+ params = ["wide", "long", "mixed"]
+ param_names = ["kind"]
def setup(self, kind):
wide_frame = DataFrame(np.random.randn(3000, 30))
- long_frame = DataFrame({'A': np.arange(50000),
- 'B': np.arange(50000) + 1.,
- 'C': np.arange(50000) + 2.,
- 'D': np.arange(50000) + 3.})
- mixed_frame = DataFrame({'float': np.random.randn(5000),
- 'int': np.random.randn(5000).astype(int),
- 'bool': (np.arange(5000) % 2) == 0,
- 'datetime': date_range('2001',
- freq='s',
- periods=5000),
- 'object': ['foo'] * 5000})
- mixed_frame.loc[30:500, 'float'] = np.nan
- data = {'wide': wide_frame,
- 'long': long_frame,
- 'mixed': mixed_frame}
+ long_frame = DataFrame(
+ {
+ "A": np.arange(50000),
+ "B": np.arange(50000) + 1.0,
+ "C": np.arange(50000) + 2.0,
+ "D": np.arange(50000) + 3.0,
+ }
+ )
+ mixed_frame = DataFrame(
+ {
+ "float": np.random.randn(5000),
+ "int": np.random.randn(5000).astype(int),
+ "bool": (np.arange(5000) % 2) == 0,
+ "datetime": date_range("2001", freq="s", periods=5000),
+ "object": ["foo"] * 5000,
+ }
+ )
+ mixed_frame.loc[30:500, "float"] = np.nan
+ data = {"wide": wide_frame, "long": long_frame, "mixed": mixed_frame}
self.df = data[kind]
def time_frame(self, kind):
@@ -41,36 +44,39 @@ def time_frame(self, kind):
class ToCSVDatetime(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
def setup(self):
- rng = date_range('1/1/2000', periods=1000)
+ rng = date_range("1/1/2000", periods=1000)
self.data = DataFrame(rng, index=rng)
def time_frame_date_formatting(self):
- self.data.to_csv(self.fname, date_format='%Y%m%d')
+ self.data.to_csv(self.fname, date_format="%Y%m%d")
class ToCSVDatetimeBig(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
timeout = 1500
params = [1000, 10000, 100000]
- param_names = ['obs']
+ param_names = ["obs"]
def setup(self, obs):
- d = '2018-11-29'
- dt = '2018-11-26 11:18:27.0'
- self.data = DataFrame({'dt': [np.datetime64(dt)] * obs,
- 'd': [np.datetime64(d)] * obs,
- 'r': [np.random.uniform()] * obs})
+ d = "2018-11-29"
+ dt = "2018-11-26 11:18:27.0"
+ self.data = DataFrame(
+ {
+ "dt": [np.datetime64(dt)] * obs,
+ "d": [np.datetime64(d)] * obs,
+ "r": [np.random.uniform()] * obs,
+ }
+ )
def time_frame(self, obs):
self.data.to_csv(self.fname)
class StringIORewind:
-
def data(self, stringio_object):
stringio_object.seek(0)
return stringio_object
@@ -78,68 +84,84 @@ def data(self, stringio_object):
class ReadCSVDInferDatetimeFormat(StringIORewind):
- params = ([True, False], ['custom', 'iso8601', 'ymd'])
- param_names = ['infer_datetime_format', 'format']
+ params = ([True, False], ["custom", "iso8601", "ymd"])
+ param_names = ["infer_datetime_format", "format"]
def setup(self, infer_datetime_format, format):
- rng = date_range('1/1/2000', periods=1000)
- formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
- 'iso8601': '%Y-%m-%d %H:%M:%S',
- 'ymd': '%Y%m%d'}
+ rng = date_range("1/1/2000", periods=1000)
+ formats = {
+ "custom": "%m/%d/%Y %H:%M:%S.%f",
+ "iso8601": "%Y-%m-%d %H:%M:%S",
+ "ymd": "%Y%m%d",
+ }
dt_format = formats[format]
- self.StringIO_input = StringIO('\n'.join(
- rng.strftime(dt_format).tolist()))
+ self.StringIO_input = StringIO("\n".join(rng.strftime(dt_format).tolist()))
def time_read_csv(self, infer_datetime_format, format):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo'], parse_dates=['foo'],
- infer_datetime_format=infer_datetime_format)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo"],
+ parse_dates=["foo"],
+ infer_datetime_format=infer_datetime_format,
+ )
class ReadCSVConcatDatetime(StringIORewind):
- iso8601 = '%Y-%m-%d %H:%M:%S'
+ iso8601 = "%Y-%m-%d %H:%M:%S"
def setup(self):
- rng = date_range('1/1/2000', periods=50000, freq='S')
- self.StringIO_input = StringIO('\n'.join(
- rng.strftime(self.iso8601).tolist()))
+ rng = date_range("1/1/2000", periods=50000, freq="S")
+ self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist()))
def time_read_csv(self):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo'], parse_dates=['foo'],
- infer_datetime_format=False)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo"],
+ parse_dates=["foo"],
+ infer_datetime_format=False,
+ )
class ReadCSVConcatDatetimeBadDateValue(StringIORewind):
- params = (['nan', '0', ''],)
- param_names = ['bad_date_value']
+ params = (["nan", "0", ""],)
+ param_names = ["bad_date_value"]
def setup(self, bad_date_value):
- self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000)
+ self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000)
def time_read_csv(self, bad_date_value):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo', 'bar'], parse_dates=['foo'],
- infer_datetime_format=False)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo", "bar"],
+ parse_dates=["foo"],
+ infer_datetime_format=False,
+ )
class ReadCSVSkipRows(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
params = [None, 10000]
- param_names = ['skiprows']
+ param_names = ["skiprows"]
def setup(self, skiprows):
N = 20000
index = tm.makeStringIndex(N)
- df = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N),
- 'string1': ['foo'] * N,
- 'bool1': [True] * N,
- 'int1': np.random.randint(0, N, size=N)},
- index=index)
+ df = DataFrame(
+ {
+ "float1": np.random.randn(N),
+ "float2": np.random.randn(N),
+ "string1": ["foo"] * N,
+ "bool1": [True] * N,
+ "int1": np.random.randint(0, N, size=N),
+ },
+ index=index,
+ )
df.to_csv(self.fname)
def time_skipprows(self, skiprows):
@@ -147,31 +169,31 @@ def time_skipprows(self, skiprows):
class ReadUint64Integers(StringIORewind):
-
def setup(self):
- self.na_values = [2**63 + 500]
- arr = np.arange(10000).astype('uint64') + 2**63
- self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
+ self.na_values = [2 ** 63 + 500]
+ arr = np.arange(10000).astype("uint64") + 2 ** 63
+ self.data1 = StringIO("\n".join(arr.astype(str).tolist()))
arr = arr.astype(object)
arr[500] = -1
- self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))
+ self.data2 = StringIO("\n".join(arr.astype(str).tolist()))
def time_read_uint64(self):
- read_csv(self.data(self.data1), header=None, names=['foo'])
+ read_csv(self.data(self.data1), header=None, names=["foo"])
def time_read_uint64_neg_values(self):
- read_csv(self.data(self.data2), header=None, names=['foo'])
+ read_csv(self.data(self.data2), header=None, names=["foo"])
def time_read_uint64_na_values(self):
- read_csv(self.data(self.data1), header=None, names=['foo'],
- na_values=self.na_values)
+ read_csv(
+ self.data(self.data1), header=None, names=["foo"], na_values=self.na_values
+ )
class ReadCSVThousands(BaseIO):
- fname = '__test__.csv'
- params = ([',', '|'], [None, ','])
- param_names = ['sep', 'thousands']
+ fname = "__test__.csv"
+ params = ([",", "|"], [None, ","])
+ param_names = ["sep", "thousands"]
def setup(self, sep, thousands):
N = 10000
@@ -179,8 +201,8 @@ def setup(self, sep, thousands):
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
df = DataFrame(data)
if thousands is not None:
- fmt = ':{}'.format(thousands)
- fmt = '{' + fmt + '}'
+ fmt = ":{}".format(thousands)
+ fmt = "{" + fmt + "}"
df = df.applymap(lambda x: fmt.format(x))
df.to_csv(self.fname, sep=sep)
@@ -189,57 +211,68 @@ def time_thousands(self, sep, thousands):
class ReadCSVComment(StringIORewind):
-
def setup(self):
- data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
- self.StringIO_input = StringIO('\n'.join(data))
+ data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
+ self.StringIO_input = StringIO("\n".join(data))
def time_comment(self):
- read_csv(self.data(self.StringIO_input), comment='#',
- header=None, names=list('abc'))
+ read_csv(
+ self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
+ )
class ReadCSVFloatPrecision(StringIORewind):
- params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
- param_names = ['sep', 'decimal', 'float_precision']
+ params = ([",", ";"], [".", "_"], [None, "high", "round_trip"])
+ param_names = ["sep", "decimal", "float_precision"]
def setup(self, sep, decimal, float_precision):
- floats = [''.join(random.choice(string.digits) for _ in range(28))
- for _ in range(15)]
- rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
+ floats = [
+ "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15)
+ ]
+ rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n"
data = rows * 5
data = data.format(*floats) * 200 # 1000 x 3 strings csv
self.StringIO_input = StringIO(data)
def time_read_csv(self, sep, decimal, float_precision):
- read_csv(self.data(self.StringIO_input), sep=sep, header=None,
- names=list('abc'), float_precision=float_precision)
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=sep,
+ header=None,
+ names=list("abc"),
+ float_precision=float_precision,
+ )
def time_read_csv_python_engine(self, sep, decimal, float_precision):
- read_csv(self.data(self.StringIO_input), sep=sep, header=None,
- engine='python', float_precision=None, names=list('abc'))
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=sep,
+ header=None,
+ engine="python",
+ float_precision=None,
+ names=list("abc"),
+ )
class ReadCSVCategorical(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
def setup(self):
N = 100000
- group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
- df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
+ group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
+ df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
df.to_csv(self.fname, index=False)
def time_convert_post(self):
read_csv(self.fname).apply(Categorical)
def time_convert_direct(self):
- read_csv(self.fname, dtype='category')
+ read_csv(self.fname, dtype="category")
class ReadCSVParseDates(StringIORewind):
-
def setup(self):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
@@ -247,38 +280,50 @@ def setup(self):
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
"""
- two_cols = ['KORD,19990127'] * 5
+ two_cols = ["KORD,19990127"] * 5
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)
def time_multiple_date(self):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=list(string.digits[:9]),
- parse_dates=[[1, 2], [1, 3]])
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=list(string.digits[:9]),
+ parse_dates=[[1, 2], [1, 3]],
+ )
def time_baseline(self):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- parse_dates=[1],
- names=list(string.digits[:9]))
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ parse_dates=[1],
+ names=list(string.digits[:9]),
+ )
class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
- param_names = ['do_cache']
+ param_names = ["do_cache"]
def setup(self, do_cache):
- data = ('\n'.join('10/{}'.format(year)
- for year in range(2000, 2100)) + '\n') * 10
+ data = (
+ "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n"
+ ) * 10
self.StringIO_input = StringIO(data)
def time_read_csv_cached(self, do_cache):
- # kwds setting here is used to avoid breaking tests in
- # previous version of pandas, because this is api changes
- kwds = {}
- if 'cache_dates' in _parser_defaults:
- kwds['cache_dates'] = do_cache
- read_csv(self.data(self.StringIO_input), header=None,
- parse_dates=[0], **kwds)
+ try:
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ parse_dates=[0],
+ cache_dates=do_cache,
+ )
+ except TypeError:
+ # cache_dates is a new keyword in 0.25
+ pass
class ReadCSVMemoryGrowth(BaseIO):
@@ -301,12 +346,12 @@ def mem_parser_chunks(self):
class ReadCSVParseSpecialDate(StringIORewind):
- params = (['mY', 'mdY', 'hm'],)
- param_names = ['value']
+ params = (["mY", "mdY", "hm"],)
+ param_names = ["value"]
objects = {
- 'mY': '01-2019\n10-2019\n02/2000\n',
- 'mdY': '12/02/2010\n',
- 'hm': '21:34\n'
+ "mY": "01-2019\n10-2019\n02/2000\n",
+ "mdY": "12/02/2010\n",
+ "hm": "21:34\n",
}
def setup(self, value):
@@ -315,33 +360,50 @@ def setup(self, value):
self.StringIO_input = StringIO(data)
def time_read_special_date(self, value):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=['Date'], parse_dates=['Date'])
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=["Date"],
+ parse_dates=["Date"],
+ )
class ParseDateComparison(StringIORewind):
params = ([False, True],)
- param_names = ['cache_dates']
+ param_names = ["cache_dates"]
def setup(self, cache_dates):
count_elem = 10000
- data = '12-02-2010\n' * count_elem
+ data = "12-02-2010\n" * count_elem
self.StringIO_input = StringIO(data)
def time_read_csv_dayfirst(self, cache_dates):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
- dayfirst=True)
+ try:
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=["Date"],
+ parse_dates=["Date"],
+ cache_dates=cache_dates,
+ dayfirst=True,
+ )
+ except TypeError:
+ # cache_dates is a new keyword in 0.25
+ pass
def time_to_datetime_dayfirst(self, cache_dates):
- df = read_csv(self.data(self.StringIO_input),
- dtype={'date': str}, names=['date'])
- to_datetime(df['date'], cache=cache_dates, dayfirst=True)
+ df = read_csv(
+ self.data(self.StringIO_input), dtype={"date": str}, names=["date"]
+ )
+ to_datetime(df["date"], cache=cache_dates, dayfirst=True)
def time_to_datetime_format_DD_MM_YYYY(self, cache_dates):
- df = read_csv(self.data(self.StringIO_input),
- dtype={'date': str}, names=['date'])
- to_datetime(df['date'], cache=cache_dates, format='%d-%m-%Y')
+ df = read_csv(
+ self.data(self.StringIO_input), dtype={"date": str}, names=["date"]
+ )
+ to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y")
from ..pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
index 1decb83f2f723..12e70f84e5203 100644
--- a/asv_bench/benchmarks/io/excel.py
+++ b/asv_bench/benchmarks/io/excel.py
@@ -6,19 +6,21 @@
class Excel:
- params = ['openpyxl', 'xlsxwriter', 'xlwt']
- param_names = ['engine']
+ params = ["openpyxl", "xlsxwriter", "xlwt"]
+ param_names = ["engine"]
def setup(self, engine):
N = 2000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.bio_read = BytesIO()
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
- self.df.to_excel(self.writer_read, sheet_name='Sheet1')
+ self.df.to_excel(self.writer_read, sheet_name="Sheet1")
self.writer_read.save()
self.bio_read.seek(0)
@@ -29,7 +31,7 @@ def time_write_excel(self, engine):
bio_write = BytesIO()
bio_write.seek(0)
writer_write = ExcelWriter(bio_write, engine=engine)
- self.df.to_excel(writer_write, sheet_name='Sheet1')
+ self.df.to_excel(writer_write, sheet_name="Sheet1")
writer_write.save()
diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py
index a5dc28eb9508c..2874a7889156b 100644
--- a/asv_bench/benchmarks/io/hdf.py
+++ b/asv_bench/benchmarks/io/hdf.py
@@ -6,86 +6,92 @@
class HDFStoreDataFrame(BaseIO):
-
def setup(self):
N = 25000
index = tm.makeStringIndex(N)
- self.df = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N)},
- index=index)
- self.df_mixed = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N),
- 'string1': ['foo'] * N,
- 'bool1': [True] * N,
- 'int1': np.random.randint(0, N, size=N)},
- index=index)
+ self.df = DataFrame(
+ {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index
+ )
+ self.df_mixed = DataFrame(
+ {
+ "float1": np.random.randn(N),
+ "float2": np.random.randn(N),
+ "string1": ["foo"] * N,
+ "bool1": [True] * N,
+ "int1": np.random.randint(0, N, size=N),
+ },
+ index=index,
+ )
self.df_wide = DataFrame(np.random.randn(N, 100))
self.start_wide = self.df_wide.index[10000]
self.stop_wide = self.df_wide.index[15000]
- self.df2 = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N)},
- index=date_range('1/1/2000', periods=N))
+ self.df2 = DataFrame(
+ {"float1": np.random.randn(N), "float2": np.random.randn(N)},
+ index=date_range("1/1/2000", periods=N),
+ )
self.start = self.df2.index[10000]
self.stop = self.df2.index[15000]
- self.df_wide2 = DataFrame(np.random.randn(N, 100),
- index=date_range('1/1/2000', periods=N))
- self.df_dc = DataFrame(np.random.randn(N, 10),
- columns=['C%03d' % i for i in range(10)])
+ self.df_wide2 = DataFrame(
+ np.random.randn(N, 100), index=date_range("1/1/2000", periods=N)
+ )
+ self.df_dc = DataFrame(
+ np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)]
+ )
- self.fname = '__test__.h5'
+ self.fname = "__test__.h5"
self.store = HDFStore(self.fname)
- self.store.put('fixed', self.df)
- self.store.put('fixed_mixed', self.df_mixed)
- self.store.append('table', self.df2)
- self.store.append('table_mixed', self.df_mixed)
- self.store.append('table_wide', self.df_wide)
- self.store.append('table_wide2', self.df_wide2)
+ self.store.put("fixed", self.df)
+ self.store.put("fixed_mixed", self.df_mixed)
+ self.store.append("table", self.df2)
+ self.store.append("table_mixed", self.df_mixed)
+ self.store.append("table_wide", self.df_wide)
+ self.store.append("table_wide2", self.df_wide2)
def teardown(self):
self.store.close()
self.remove(self.fname)
def time_read_store(self):
- self.store.get('fixed')
+ self.store.get("fixed")
def time_read_store_mixed(self):
- self.store.get('fixed_mixed')
+ self.store.get("fixed_mixed")
def time_write_store(self):
- self.store.put('fixed_write', self.df)
+ self.store.put("fixed_write", self.df)
def time_write_store_mixed(self):
- self.store.put('fixed_mixed_write', self.df_mixed)
+ self.store.put("fixed_mixed_write", self.df_mixed)
def time_read_store_table_mixed(self):
- self.store.select('table_mixed')
+ self.store.select("table_mixed")
def time_write_store_table_mixed(self):
- self.store.append('table_mixed_write', self.df_mixed)
+ self.store.append("table_mixed_write", self.df_mixed)
def time_read_store_table(self):
- self.store.select('table')
+ self.store.select("table")
def time_write_store_table(self):
- self.store.append('table_write', self.df)
+ self.store.append("table_write", self.df)
def time_read_store_table_wide(self):
- self.store.select('table_wide')
+ self.store.select("table_wide")
def time_write_store_table_wide(self):
- self.store.append('table_wide_write', self.df_wide)
+ self.store.append("table_wide_write", self.df_wide)
def time_write_store_table_dc(self):
- self.store.append('table_dc_write', self.df_dc, data_columns=True)
+ self.store.append("table_dc_write", self.df_dc, data_columns=True)
def time_query_store_table_wide(self):
- self.store.select('table_wide', where="index > self.start_wide and "
- "index < self.stop_wide")
+ self.store.select(
+ "table_wide", where="index > self.start_wide and " "index < self.stop_wide"
+ )
def time_query_store_table(self):
- self.store.select('table', where="index > self.start and "
- "index < self.stop")
+ self.store.select("table", where="index > self.start and " "index < self.stop")
def time_store_repr(self):
repr(self.store)
@@ -99,24 +105,26 @@ def time_store_info(self):
class HDF(BaseIO):
- params = ['table', 'fixed']
- param_names = ['format']
+ params = ["table", "fixed"]
+ param_names = ["format"]
def setup(self, format):
- self.fname = '__test__.h5'
+ self.fname = "__test__.h5"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
- self.df.to_hdf(self.fname, 'df', format=format)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
+ self.df.to_hdf(self.fname, "df", format=format)
def time_read_hdf(self, format):
- read_hdf(self.fname, 'df')
+ read_hdf(self.fname, "df")
def time_write_hdf(self, format):
- self.df.to_hdf(self.fname, 'df', format=format)
+ self.df.to_hdf(self.fname, "df", format=format)
from ..pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 19d11e6610198..fc07f2a484102 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -8,16 +8,20 @@
class ReadJSON(BaseIO):
fname = "__test__.json"
- params = (['split', 'index', 'records'], ['int', 'datetime'])
- param_names = ['orient', 'index']
+ params = (["split", "index", "records"], ["int", "datetime"])
+ param_names = ["orient", "index"]
def setup(self, orient, index):
N = 100000
- indexes = {'int': np.arange(N),
- 'datetime': date_range('20000101', periods=N, freq='H')}
- df = DataFrame(np.random.randn(N, 5),
- columns=['float_{}'.format(i) for i in range(5)],
- index=indexes[index])
+ indexes = {
+ "int": np.arange(N),
+ "datetime": date_range("20000101", periods=N, freq="H"),
+ }
+ df = DataFrame(
+ np.random.randn(N, 5),
+ columns=["float_{}".format(i) for i in range(5)],
+ index=indexes[index],
+ )
df.to_json(self.fname, orient=orient)
def time_read_json(self, orient, index):
@@ -27,121 +31,185 @@ def time_read_json(self, orient, index):
class ReadJSONLines(BaseIO):
fname = "__test_lines__.json"
- params = ['int', 'datetime']
- param_names = ['index']
+ params = ["int", "datetime"]
+ param_names = ["index"]
def setup(self, index):
N = 100000
- indexes = {'int': np.arange(N),
- 'datetime': date_range('20000101', periods=N, freq='H')}
- df = DataFrame(np.random.randn(N, 5),
- columns=['float_{}'.format(i) for i in range(5)],
- index=indexes[index])
- df.to_json(self.fname, orient='records', lines=True)
+ indexes = {
+ "int": np.arange(N),
+ "datetime": date_range("20000101", periods=N, freq="H"),
+ }
+ df = DataFrame(
+ np.random.randn(N, 5),
+ columns=["float_{}".format(i) for i in range(5)],
+ index=indexes[index],
+ )
+ df.to_json(self.fname, orient="records", lines=True)
def time_read_json_lines(self, index):
- read_json(self.fname, orient='records', lines=True)
+ read_json(self.fname, orient="records", lines=True)
def time_read_json_lines_concat(self, index):
- concat(read_json(self.fname, orient='records', lines=True,
- chunksize=25000))
+ concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
def peakmem_read_json_lines(self, index):
- read_json(self.fname, orient='records', lines=True)
+ read_json(self.fname, orient="records", lines=True)
def peakmem_read_json_lines_concat(self, index):
- concat(read_json(self.fname, orient='records', lines=True,
- chunksize=25000))
+ concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
class ToJSON(BaseIO):
fname = "__test__.json"
- params = ['split', 'columns', 'index']
- param_names = ['orient']
+ params = [
+ ["split", "columns", "index", "values", "records"],
+ ["df", "df_date_idx", "df_td_int_ts", "df_int_floats", "df_int_float_str"],
+ ]
+ param_names = ["orient", "frame"]
+
+ def setup(self, orient, frame):
+ N = 10 ** 5
+ ncols = 5
+ index = date_range("20000101", periods=N, freq="H")
+ timedeltas = timedelta_range(start=1, periods=N, freq="s")
+ datetimes = date_range(start=1, periods=N, freq="s")
+ ints = np.random.randint(100000000, size=N)
+ floats = np.random.randn(N)
+ strings = tm.makeStringIndex(N)
+ self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
+ self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
+ self.df_td_int_ts = DataFrame(
+ {
+ "td_1": timedeltas,
+ "td_2": timedeltas,
+ "int_1": ints,
+ "int_2": ints,
+ "ts_1": datetimes,
+ "ts_2": datetimes,
+ },
+ index=index,
+ )
+ self.df_int_floats = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "int_3": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "float_3": floats,
+ },
+ index=index,
+ )
+ self.df_int_float_str = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "str_1": strings,
+ "str_2": strings,
+ },
+ index=index,
+ )
+
+ def time_to_json(self, orient, frame):
+ getattr(self, frame).to_json(self.fname, orient=orient)
+
+ def mem_to_json(self, orient, frame):
+ getattr(self, frame).to_json(self.fname, orient=orient)
+
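+    # The *_wide variants tile the first 100 rows of the frame 1000 times
+    # along the columns to benchmark serialization of very wide frames.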
+ def time_to_json_wide(self, orient, frame):
+ base_df = getattr(self, frame).copy()
+ df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
+ df.to_json(self.fname, orient=orient)
- def setup(self, lines_orient):
- N = 10**5
+ def mem_to_json_wide(self, orient, frame):
+ base_df = getattr(self, frame).copy()
+ df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
+ df.to_json(self.fname, orient=orient)
+
+
+class ToJSONLines(BaseIO):
+
+ fname = "__test__.json"
+
+ def setup(self):
+ N = 10 ** 5
ncols = 5
- index = date_range('20000101', periods=N, freq='H')
- timedeltas = timedelta_range(start=1, periods=N, freq='s')
- datetimes = date_range(start=1, periods=N, freq='s')
+ index = date_range("20000101", periods=N, freq="H")
+ timedeltas = timedelta_range(start=1, periods=N, freq="s")
+ datetimes = date_range(start=1, periods=N, freq="s")
ints = np.random.randint(100000000, size=N)
floats = np.random.randn(N)
strings = tm.makeStringIndex(N)
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
- self.df_td_int_ts = DataFrame({'td_1': timedeltas,
- 'td_2': timedeltas,
- 'int_1': ints,
- 'int_2': ints,
- 'ts_1': datetimes,
- 'ts_2': datetimes},
- index=index)
- self.df_int_floats = DataFrame({'int_1': ints,
- 'int_2': ints,
- 'int_3': ints,
- 'float_1': floats,
- 'float_2': floats,
- 'float_3': floats},
- index=index)
- self.df_int_float_str = DataFrame({'int_1': ints,
- 'int_2': ints,
- 'float_1': floats,
- 'float_2': floats,
- 'str_1': strings,
- 'str_2': strings},
- index=index)
-
- def time_floats_with_int_index(self, orient):
- self.df.to_json(self.fname, orient=orient)
-
- def time_floats_with_dt_index(self, orient):
- self.df_date_idx.to_json(self.fname, orient=orient)
-
- def time_delta_int_tstamp(self, orient):
- self.df_td_int_ts.to_json(self.fname, orient=orient)
-
- def time_float_int(self, orient):
- self.df_int_floats.to_json(self.fname, orient=orient)
-
- def time_float_int_str(self, orient):
- self.df_int_float_str.to_json(self.fname, orient=orient)
-
- def time_floats_with_int_idex_lines(self, orient):
- self.df.to_json(self.fname, orient='records', lines=True)
-
- def time_floats_with_dt_index_lines(self, orient):
- self.df_date_idx.to_json(self.fname, orient='records', lines=True)
-
- def time_delta_int_tstamp_lines(self, orient):
- self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)
-
- def time_float_int_lines(self, orient):
- self.df_int_floats.to_json(self.fname, orient='records', lines=True)
-
- def time_float_int_str_lines(self, orient):
- self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
+ self.df_td_int_ts = DataFrame(
+ {
+ "td_1": timedeltas,
+ "td_2": timedeltas,
+ "int_1": ints,
+ "int_2": ints,
+ "ts_1": datetimes,
+ "ts_2": datetimes,
+ },
+ index=index,
+ )
+ self.df_int_floats = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "int_3": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "float_3": floats,
+ },
+ index=index,
+ )
+ self.df_int_float_str = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "str_1": strings,
+ "str_2": strings,
+ },
+ index=index,
+ )
+
+ def time_floats_with_int_idex_lines(self):
+ self.df.to_json(self.fname, orient="records", lines=True)
+
+ def time_floats_with_dt_index_lines(self):
+ self.df_date_idx.to_json(self.fname, orient="records", lines=True)
+
+ def time_delta_int_tstamp_lines(self):
+ self.df_td_int_ts.to_json(self.fname, orient="records", lines=True)
+
+ def time_float_int_lines(self):
+ self.df_int_floats.to_json(self.fname, orient="records", lines=True)
+
+ def time_float_int_str_lines(self):
+ self.df_int_float_str.to_json(self.fname, orient="records", lines=True)
class ToJSONMem:
-
def setup_cache(self):
df = DataFrame([[1]])
- frames = {
- 'int': df,
- 'float': df.astype(float),
- }
+ frames = {"int": df, "float": df.astype(float)}
return frames
def peakmem_int(self, frames):
- df = frames['int']
+ df = frames["int"]
for _ in range(100_000):
df.to_json()
def peakmem_float(self, frames):
- df = frames['float']
+ df = frames["float"]
for _ in range(100_000):
df.to_json()
diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py
index dc2642d920fd0..d97b4ae13f0bd 100644
--- a/asv_bench/benchmarks/io/msgpack.py
+++ b/asv_bench/benchmarks/io/msgpack.py
@@ -1,3 +1,4 @@
+import warnings
import numpy as np
from pandas import DataFrame, date_range, read_msgpack
import pandas.util.testing as tm
@@ -6,16 +7,18 @@
class MSGPack(BaseIO):
-
def setup(self):
- self.fname = '__test__.msg'
+ self.fname = "__test__.msg"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
- self.df.to_msgpack(self.fname)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
+ with warnings.catch_warnings(record=True):
+ self.df.to_msgpack(self.fname)
def time_read_msgpack(self):
read_msgpack(self.fname)
diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py
index edba0358c821a..c5e099bd44eac 100644
--- a/asv_bench/benchmarks/io/parsers.py
+++ b/asv_bench/benchmarks/io/parsers.py
@@ -2,16 +2,18 @@
try:
from pandas._libs.tslibs.parsing import (
- _concat_date_cols, _does_string_look_like_datetime)
+ _concat_date_cols,
+ _does_string_look_like_datetime,
+ )
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass
-class DoesStringLookLikeDatetime(object):
+class DoesStringLookLikeDatetime:
- params = (['2Q2005', '0.0', '10000'],)
- param_names = ['value']
+ params = (["2Q2005", "0.0", "10000"],)
+ param_names = ["value"]
def setup(self, value):
self.objects = [value] * 1000000
@@ -21,18 +23,20 @@ def time_check_datetimes(self, value):
_does_string_look_like_datetime(obj)
-class ConcatDateCols(object):
+class ConcatDateCols:
- params = ([1234567890, 'AAAA'], [1, 2])
- param_names = ['value', 'dim']
+ params = ([1234567890, "AAAA"], [1, 2])
+ param_names = ["value", "dim"]
def setup(self, value, dim):
count_elem = 10000
if dim == 1:
self.object = (np.array([value] * count_elem),)
if dim == 2:
- self.object = (np.array([value] * count_elem),
- np.array([value] * count_elem))
+ self.object = (
+ np.array([value] * count_elem),
+ np.array([value] * count_elem),
+ )
def time_check_concat(self, value, dim):
_concat_date_cols(self.object)
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
index 74a58bbb946aa..286ac767c02e7 100644
--- a/asv_bench/benchmarks/io/pickle.py
+++ b/asv_bench/benchmarks/io/pickle.py
@@ -6,15 +6,16 @@
class Pickle(BaseIO):
-
def setup(self):
- self.fname = '__test__.pkl'
+ self.fname = "__test__.pkl"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.df.to_pickle(self.fname)
def time_read_pickle(self):
diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py
index 8181f1d41ac70..7ce8ef8c12639 100644
--- a/asv_bench/benchmarks/io/sas.py
+++ b/asv_bench/benchmarks/io/sas.py
@@ -5,15 +5,25 @@
class SAS:
- params = ['sas7bdat', 'xport']
- param_names = ['format']
+ params = ["sas7bdat", "xport"]
+ param_names = ["format"]
def setup(self, format):
        # Read files that are located in 'pandas/tests/io/sas/data'
- files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'}
+ files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"}
file = files[format]
- paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas',
- 'tests', 'io', 'sas', 'data', file]
+ paths = [
+ os.path.dirname(__file__),
+ "..",
+ "..",
+ "..",
+ "pandas",
+ "tests",
+ "io",
+ "sas",
+ "data",
+ file,
+ ]
self.f = os.path.join(*paths)
def time_read_msgpack(self, format):
diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py
index ee48f3bd0a3ab..b80872b17a9e4 100644
--- a/asv_bench/benchmarks/io/sql.py
+++ b/asv_bench/benchmarks/io/sql.py
@@ -8,31 +8,35 @@
class SQL:
- params = ['sqlalchemy', 'sqlite']
- param_names = ['connection']
+ params = ["sqlalchemy", "sqlite"]
+ param_names = ["connection"]
def setup(self, connection):
N = 10000
- con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
- 'sqlite': sqlite3.connect(':memory:')}
- self.table_name = 'test_type'
- self.query_all = 'SELECT * FROM {}'.format(self.table_name)
+ con = {
+ "sqlalchemy": create_engine("sqlite:///:memory:"),
+ "sqlite": sqlite3.connect(":memory:"),
+ }
+ self.table_name = "test_type"
+ self.query_all = "SELECT * FROM {}".format(self.table_name)
self.con = con[connection]
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_to_sql_dataframe(self, connection):
- self.df.to_sql('test1', self.con, if_exists='replace')
+ self.df.to_sql("test1", self.con, if_exists="replace")
def time_read_sql_query(self, connection):
read_sql_query(self.query_all, self.con)
@@ -40,85 +44,98 @@ def time_read_sql_query(self, connection):
class WriteSQLDtypes:
- params = (['sqlalchemy', 'sqlite'],
- ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'])
- param_names = ['connection', 'dtype']
+ params = (
+ ["sqlalchemy", "sqlite"],
+ ["float", "float_with_nan", "string", "bool", "int", "datetime"],
+ )
+ param_names = ["connection", "dtype"]
def setup(self, connection, dtype):
N = 10000
- con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
- 'sqlite': sqlite3.connect(':memory:')}
- self.table_name = 'test_type'
- self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name)
+ con = {
+ "sqlalchemy": create_engine("sqlite:///:memory:"),
+ "sqlite": sqlite3.connect(":memory:"),
+ }
+ self.table_name = "test_type"
+ self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name)
self.con = con[connection]
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_to_sql_dataframe_column(self, connection, dtype):
- self.df[[dtype]].to_sql('test1', self.con, if_exists='replace')
+ self.df[[dtype]].to_sql("test1", self.con, if_exists="replace")
def time_read_sql_query_select_column(self, connection, dtype):
read_sql_query(self.query_col, self.con)
class ReadSQLTable:
-
def setup(self):
N = 10000
- self.table_name = 'test'
- self.con = create_engine('sqlite:///:memory:')
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.table_name = "test"
+ self.con = create_engine("sqlite:///:memory:")
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_read_sql_table_all(self):
read_sql_table(self.table_name, self.con)
def time_read_sql_table_parse_dates(self):
- read_sql_table(self.table_name, self.con, columns=['datetime_string'],
- parse_dates=['datetime_string'])
+ read_sql_table(
+ self.table_name,
+ self.con,
+ columns=["datetime_string"],
+ parse_dates=["datetime_string"],
+ )
class ReadSQLTableDtypes:
- params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']
- param_names = ['dtype']
+ params = ["float", "float_with_nan", "string", "bool", "int", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
N = 10000
- self.table_name = 'test'
- self.con = create_engine('sqlite:///:memory:')
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.table_name = "test"
+ self.con = create_engine("sqlite:///:memory:")
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_read_sql_table_column(self, dtype):
read_sql_table(self.table_name, self.con, columns=[dtype])
diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py
index fff10cf10a4d3..b3ed71af47dc8 100644
--- a/asv_bench/benchmarks/io/stata.py
+++ b/asv_bench/benchmarks/io/stata.py
@@ -7,26 +7,30 @@
class Stata(BaseIO):
- params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty']
- param_names = ['convert_dates']
+ params = ["tc", "td", "tm", "tw", "th", "tq", "ty"]
+ param_names = ["convert_dates"]
def setup(self, convert_dates):
- self.fname = '__test__.dta'
+ self.fname = "__test__.dta"
N = self.N = 100000
C = self.C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(self.N)
- self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
- np.iinfo(np.int8).max - 27, N)
- self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max - 27, N)
- self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min,
- np.iinfo(np.int32).max - 27, N)
- self.df['float32_'] = np.array(np.random.randn(N),
- dtype=np.float32)
- self.convert_dates = {'index': convert_dates}
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(self.N)
+ self.df["int8_"] = np.random.randint(
+ np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N
+ )
+ self.df["int16_"] = np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27, N
+ )
+ self.df["int32_"] = np.random.randint(
+ np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27, N
+ )
+ self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32)
+ self.convert_dates = {"index": convert_dates}
self.df.to_stata(self.fname, self.convert_dates)
def time_read_stata(self, convert_dates):
@@ -42,7 +46,7 @@ def setup(self, convert_dates):
for i in range(10):
missing_data = np.random.randn(self.N)
missing_data[missing_data < 0] = np.nan
- self.df['missing_{0}'.format(i)] = missing_data
+ self.df["missing_{0}".format(i)] = missing_data
self.df.to_stata(self.fname, self.convert_dates)
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index bbaba9909966e..7c899e3dc6ac8 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -2,8 +2,7 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex,
- date_range, concat, merge, merge_asof)
+from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof
try:
from pandas import merge_ordered
@@ -12,16 +11,14 @@
class Append:
-
def setup(self):
- self.df1 = DataFrame(np.random.randn(10000, 4),
- columns=['A', 'B', 'C', 'D'])
+ self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"])
self.df2 = self.df1.copy()
self.df2.index = np.arange(10000, 20000)
self.mdf1 = self.df1.copy()
- self.mdf1['obj1'] = 'bar'
- self.mdf1['obj2'] = 'bar'
- self.mdf1['int1'] = 5
+ self.mdf1["obj1"] = "bar"
+ self.mdf1["obj2"] = "bar"
+ self.mdf1["int1"] = 5
self.mdf1 = self.mdf1._consolidate()
self.mdf2 = self.mdf1.copy()
self.mdf2.index = self.df2.index
@@ -36,15 +33,16 @@ def time_append_mixed(self):
class Concat:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
N = 1000
s = Series(N, index=tm.makeStringIndex(N))
- self.series = [s[i:- i] for i in range(1, 10)] * 50
+ self.series = [s[i:-i] for i in range(1, 10)] * 50
self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
- df = DataFrame({'A': range(N)},
- index=date_range('20130101', periods=N, freq='s'))
+ df = DataFrame(
+ {"A": range(N)}, index=date_range("20130101", periods=N, freq="s")
+ )
self.empty_left = [DataFrame(), df]
self.empty_right = [df, DataFrame()]
self.mixed_ndims = [df, df.head(N // 2)]
@@ -68,14 +66,12 @@ def time_concat_mixed_ndims(self, axis):
class ConcatDataFrames:
params = ([0, 1], [True, False])
- param_names = ['axis', 'ignore_index']
+ param_names = ["axis", "ignore_index"]
def setup(self, axis, ignore_index):
- frame_c = DataFrame(np.zeros((10000, 200),
- dtype=np.float32, order='C'))
+ frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C"))
self.frame_c = [frame_c] * 20
- frame_f = DataFrame(np.zeros((10000, 200),
- dtype=np.float32, order='F'))
+ frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F"))
self.frame_f = [frame_f] * 20
def time_c_ordered(self, axis, ignore_index):
@@ -88,74 +84,78 @@ def time_f_ordered(self, axis, ignore_index):
class Join:
params = [True, False]
- param_names = ['sort']
+ param_names = ["sort"]
def setup(self, sort):
level1 = tm.makeStringIndex(10).values
level2 = tm.makeStringIndex(1000).values
codes1 = np.arange(10).repeat(1000)
codes2 = np.tile(np.arange(1000), 10)
- index2 = MultiIndex(levels=[level1, level2],
- codes=[codes1, codes2])
- self.df_multi = DataFrame(np.random.randn(len(index2), 4),
- index=index2,
- columns=['A', 'B', 'C', 'D'])
+ index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
+ self.df_multi = DataFrame(
+ np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
+ )
self.key1 = np.tile(level1.take(codes1), 10)
self.key2 = np.tile(level2.take(codes2), 10)
- self.df = DataFrame({'data1': np.random.randn(100000),
- 'data2': np.random.randn(100000),
- 'key1': self.key1,
- 'key2': self.key2})
-
- self.df_key1 = DataFrame(np.random.randn(len(level1), 4),
- index=level1,
- columns=['A', 'B', 'C', 'D'])
- self.df_key2 = DataFrame(np.random.randn(len(level2), 4),
- index=level2,
- columns=['A', 'B', 'C', 'D'])
+ self.df = DataFrame(
+ {
+ "data1": np.random.randn(100000),
+ "data2": np.random.randn(100000),
+ "key1": self.key1,
+ "key2": self.key2,
+ }
+ )
+
+ self.df_key1 = DataFrame(
+ np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
+ )
+ self.df_key2 = DataFrame(
+ np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
+ )
shuf = np.arange(100000)
np.random.shuffle(shuf)
self.df_shuf = self.df.reindex(self.df.index[shuf])
def time_join_dataframe_index_multi(self, sort):
- self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort)
+ self.df.join(self.df_multi, on=["key1", "key2"], sort=sort)
def time_join_dataframe_index_single_key_bigger(self, sort):
- self.df.join(self.df_key2, on='key2', sort=sort)
+ self.df.join(self.df_key2, on="key2", sort=sort)
def time_join_dataframe_index_single_key_small(self, sort):
- self.df.join(self.df_key1, on='key1', sort=sort)
+ self.df.join(self.df_key1, on="key1", sort=sort)
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
- self.df_shuf.join(self.df_key2, on='key2', sort=sort)
+ self.df_shuf.join(self.df_key2, on="key2", sort=sort)
class JoinIndex:
-
def setup(self):
N = 50000
- self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)),
- columns=['jim', 'joe'])
- self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)),
- columns=['jolie', 'jolia']).set_index('jolie')
+ self.left = DataFrame(
+ np.random.randint(1, N / 500, (N, 2)), columns=["jim", "joe"]
+ )
+ self.right = DataFrame(
+ np.random.randint(1, N / 500, (N, 2)), columns=["jolie", "jolia"]
+ ).set_index("jolie")
def time_left_outer_join_index(self):
- self.left.join(self.right, on='jim')
+ self.left.join(self.right, on="jim")
class JoinNonUnique:
# outer join of non-unique
# GH 6329
def setup(self):
- date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T')
- daily_dates = date_index.to_period('D').to_timestamp('S', 'S')
+ date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T")
+ daily_dates = date_index.to_period("D").to_timestamp("S", "S")
self.fracofday = date_index.values - daily_dates.values
- self.fracofday = self.fracofday.astype('timedelta64[ns]')
+ self.fracofday = self.fracofday.astype("timedelta64[ns]")
self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0
self.fracofday = Series(self.fracofday, daily_dates)
- index = date_range(date_index.min(), date_index.max(), freq='D')
+ index = date_range(date_index.min(), date_index.max(), freq="D")
self.temp = Series(1.0, index)[self.fracofday.index]
def time_join_non_unique_equal(self):
@@ -165,7 +165,7 @@ def time_join_non_unique_equal(self):
class Merge:
params = [True, False]
- param_names = ['sort']
+ param_names = ["sort"]
def setup(self, sort):
N = 10000
@@ -173,17 +173,25 @@ def setup(self, sort):
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
- self.left = DataFrame({'key': key, 'key2': key2,
- 'value': np.random.randn(80000)})
- self.right = DataFrame({'key': indices[2000:],
- 'key2': indices2[2000:],
- 'value2': np.random.randn(8000)})
-
- self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2),
- 'key2': np.tile(np.arange(250).repeat(10), 4),
- 'value': np.random.randn(10000)})
- self.df2 = DataFrame({'key1': np.arange(500),
- 'value2': np.random.randn(500)})
+ self.left = DataFrame(
+ {"key": key, "key2": key2, "value": np.random.randn(80000)}
+ )
+ self.right = DataFrame(
+ {
+ "key": indices[2000:],
+ "key2": indices2[2000:],
+ "value2": np.random.randn(8000),
+ }
+ )
+
+ self.df = DataFrame(
+ {
+ "key1": np.tile(np.arange(500).repeat(10), 2),
+ "key2": np.tile(np.arange(250).repeat(10), 4),
+ "value": np.random.randn(10000),
+ }
+ )
+ self.df2 = DataFrame({"key1": np.arange(500), "value2": np.random.randn(500)})
self.df3 = self.df[:5000]
def time_merge_2intkey(self, sort):
@@ -193,125 +201,141 @@ def time_merge_dataframe_integer_2key(self, sort):
merge(self.df, self.df3, sort=sort)
def time_merge_dataframe_integer_key(self, sort):
- merge(self.df, self.df2, on='key1', sort=sort)
+ merge(self.df, self.df2, on="key1", sort=sort)
class I8Merge:
- params = ['inner', 'outer', 'left', 'right']
- param_names = ['how']
+ params = ["inner", "outer", "left", "right"]
+ param_names = ["how"]
def setup(self, how):
- low, high, n = -1000, 1000, 10**6
- self.left = DataFrame(np.random.randint(low, high, (n, 7)),
- columns=list('ABCDEFG'))
- self.left['left'] = self.left.sum(axis=1)
- self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
+ low, high, n = -1000, 1000, 10 ** 6
+ self.left = DataFrame(
+ np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")
+ )
+ self.left["left"] = self.left.sum(axis=1)
+ self.right = self.left.sample(frac=1).rename({"left": "right"}, axis=1)
self.right = self.right.reset_index(drop=True)
- self.right['right'] *= -1
+ self.right["right"] *= -1
def time_i8merge(self, how):
merge(self.left, self.right, how=how)
class MergeCategoricals:
-
def setup(self):
self.left_object = DataFrame(
- {'X': np.random.choice(range(0, 10), size=(10000,)),
- 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
+ {
+ "X": np.random.choice(range(0, 10), size=(10000,)),
+ "Y": np.random.choice(["one", "two", "three"], size=(10000,)),
+ }
+ )
self.right_object = DataFrame(
- {'X': np.random.choice(range(0, 10), size=(10000,)),
- 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
+ {
+ "X": np.random.choice(range(0, 10), size=(10000,)),
+ "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
+ }
+ )
self.left_cat = self.left_object.assign(
- Y=self.left_object['Y'].astype('category'))
+ Y=self.left_object["Y"].astype("category")
+ )
self.right_cat = self.right_object.assign(
- Z=self.right_object['Z'].astype('category'))
+ Z=self.right_object["Z"].astype("category")
+ )
def time_merge_object(self):
- merge(self.left_object, self.right_object, on='X')
+ merge(self.left_object, self.right_object, on="X")
def time_merge_cat(self):
- merge(self.left_cat, self.right_cat, on='X')
+ merge(self.left_cat, self.right_cat, on="X")
class MergeOrdered:
-
def setup(self):
groups = tm.makeStringIndex(10).values
- self.left = DataFrame({'group': groups.repeat(5000),
- 'key': np.tile(np.arange(0, 10000, 2), 10),
- 'lvalue': np.random.randn(50000)})
- self.right = DataFrame({'key': np.arange(10000),
- 'rvalue': np.random.randn(10000)})
+ self.left = DataFrame(
+ {
+ "group": groups.repeat(5000),
+ "key": np.tile(np.arange(0, 10000, 2), 10),
+ "lvalue": np.random.randn(50000),
+ }
+ )
+ self.right = DataFrame(
+ {"key": np.arange(10000), "rvalue": np.random.randn(10000)}
+ )
def time_merge_ordered(self):
- merge_ordered(self.left, self.right, on='key', left_by='group')
+ merge_ordered(self.left, self.right, on="key", left_by="group")
class MergeAsof:
- params = [['backward', 'forward', 'nearest']]
- param_names = ['direction']
+ params = [["backward", "forward", "nearest"]]
+ param_names = ["direction"]
def setup(self, direction):
one_count = 200000
two_count = 1000000
df1 = DataFrame(
- {'time': np.random.randint(0, one_count / 20, one_count),
- 'key': np.random.choice(list(string.ascii_uppercase), one_count),
- 'key2': np.random.randint(0, 25, one_count),
- 'value1': np.random.randn(one_count)})
+ {
+ "time": np.random.randint(0, one_count / 20, one_count),
+ "key": np.random.choice(list(string.ascii_uppercase), one_count),
+ "key2": np.random.randint(0, 25, one_count),
+ "value1": np.random.randn(one_count),
+ }
+ )
df2 = DataFrame(
- {'time': np.random.randint(0, two_count / 20, two_count),
- 'key': np.random.choice(list(string.ascii_uppercase), two_count),
- 'key2': np.random.randint(0, 25, two_count),
- 'value2': np.random.randn(two_count)})
-
- df1 = df1.sort_values('time')
- df2 = df2.sort_values('time')
-
- df1['time32'] = np.int32(df1.time)
- df2['time32'] = np.int32(df2.time)
-
- self.df1a = df1[['time', 'value1']]
- self.df2a = df2[['time', 'value2']]
- self.df1b = df1[['time', 'key', 'value1']]
- self.df2b = df2[['time', 'key', 'value2']]
- self.df1c = df1[['time', 'key2', 'value1']]
- self.df2c = df2[['time', 'key2', 'value2']]
- self.df1d = df1[['time32', 'value1']]
- self.df2d = df2[['time32', 'value2']]
- self.df1e = df1[['time', 'key', 'key2', 'value1']]
- self.df2e = df2[['time', 'key', 'key2', 'value2']]
+ {
+ "time": np.random.randint(0, two_count / 20, two_count),
+ "key": np.random.choice(list(string.ascii_uppercase), two_count),
+ "key2": np.random.randint(0, 25, two_count),
+ "value2": np.random.randn(two_count),
+ }
+ )
+
+ df1 = df1.sort_values("time")
+ df2 = df2.sort_values("time")
+
+ df1["time32"] = np.int32(df1.time)
+ df2["time32"] = np.int32(df2.time)
+
+ self.df1a = df1[["time", "value1"]]
+ self.df2a = df2[["time", "value2"]]
+ self.df1b = df1[["time", "key", "value1"]]
+ self.df2b = df2[["time", "key", "value2"]]
+ self.df1c = df1[["time", "key2", "value1"]]
+ self.df2c = df2[["time", "key2", "value2"]]
+ self.df1d = df1[["time32", "value1"]]
+ self.df2d = df2[["time32", "value2"]]
+ self.df1e = df1[["time", "key", "key2", "value1"]]
+ self.df2e = df2[["time", "key", "key2", "value2"]]
def time_on_int(self, direction):
- merge_asof(self.df1a, self.df2a, on='time', direction=direction)
+ merge_asof(self.df1a, self.df2a, on="time", direction=direction)
def time_on_int32(self, direction):
- merge_asof(self.df1d, self.df2d, on='time32', direction=direction)
+ merge_asof(self.df1d, self.df2d, on="time32", direction=direction)
def time_by_object(self, direction):
- merge_asof(self.df1b, self.df2b, on='time', by='key',
- direction=direction)
+ merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction)
def time_by_int(self, direction):
- merge_asof(self.df1c, self.df2c, on='time', by='key2',
- direction=direction)
+ merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction)
def time_multiby(self, direction):
- merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'],
- direction=direction)
+ merge_asof(
+ self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction
+ )
class Align:
-
def setup(self):
- size = 5 * 10**5
- rng = np.arange(0, 10**13, 10**7)
- stamps = np.datetime64('now').view('i8') + rng
+ size = 5 * 10 ** 5
+ rng = np.arange(0, 10 ** 13, 10 ** 7)
+ stamps = np.datetime64("now").view("i8") + rng
idx1 = np.sort(np.random.choice(stamps, size, replace=False))
idx2 = np.sort(np.random.choice(stamps, size, replace=False))
self.ts1 = Series(np.random.randn(size), idx1)
@@ -321,7 +345,7 @@ def time_series_align_int64_index(self):
self.ts1 + self.ts2
def time_series_align_left_monotonic(self):
- self.ts1.align(self.ts2, join='left')
+ self.ts1.align(self.ts2, join="left")
from .pandas_vb_common import setup # noqa: F401
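A toy-sized illustration of the merge_asof call patterns timed in MergeAsof above: an "on" key plus an optional "by" grouping and a search direction. The data here is made up and tiny; both frames must already be sorted on the "on" column.

    import pandas as pd

    left = pd.DataFrame(
        {"time": [1, 5, 10], "key": ["a", "b", "a"], "value1": [1.0, 2.0, 3.0]}
    )
    right = pd.DataFrame(
        {"time": [2, 3, 7], "key": ["a", "b", "a"], "value2": [0.1, 0.2, 0.3]}
    )

    # nearest preceding match on "time" alone
    pd.merge_asof(left, right, on="time", direction="backward")

    # match only within equal "key" groups, taking the closest row in either direction
    pd.merge_asof(left, right, on="time", by="key", direction="nearest")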
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index c979ba6d53a08..eda059a68e8a5 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -6,46 +6,44 @@
class GetLoc:
-
def setup(self):
self.mi_large = MultiIndex.from_product(
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
- names=['one', 'two', 'three'])
+ names=["one", "two", "three"],
+ )
self.mi_med = MultiIndex.from_product(
- [np.arange(1000), np.arange(10), list('A')],
- names=['one', 'two', 'three'])
+ [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"]
+ )
self.mi_small = MultiIndex.from_product(
- [np.arange(100), list('A'), list('A')],
- names=['one', 'two', 'three'])
+ [np.arange(100), list("A"), list("A")], names=["one", "two", "three"]
+ )
def time_large_get_loc(self):
- self.mi_large.get_loc((999, 19, 'Z'))
+ self.mi_large.get_loc((999, 19, "Z"))
def time_large_get_loc_warm(self):
for _ in range(1000):
- self.mi_large.get_loc((999, 19, 'Z'))
+ self.mi_large.get_loc((999, 19, "Z"))
def time_med_get_loc(self):
- self.mi_med.get_loc((999, 9, 'A'))
+ self.mi_med.get_loc((999, 9, "A"))
def time_med_get_loc_warm(self):
for _ in range(1000):
- self.mi_med.get_loc((999, 9, 'A'))
+ self.mi_med.get_loc((999, 9, "A"))
def time_string_get_loc(self):
- self.mi_small.get_loc((99, 'A', 'A'))
+ self.mi_small.get_loc((99, "A", "A"))
def time_small_get_loc_warm(self):
for _ in range(1000):
- self.mi_small.get_loc((99, 'A', 'A'))
+ self.mi_small.get_loc((99, "A", "A"))
class Duplicates:
-
def setup(self):
size = 65536
- arrays = [np.random.randint(0, 8192, size),
- np.random.randint(0, 1024, size)]
+ arrays = [np.random.randint(0, 8192, size), np.random.randint(0, 1024, size)]
mask = np.random.rand(size) < 0.1
self.mi_unused_levels = MultiIndex.from_arrays(arrays)
self.mi_unused_levels = self.mi_unused_levels[mask]
@@ -55,15 +53,25 @@ def time_remove_unused_levels(self):
class Integer:
-
def setup(self):
- self.mi_int = MultiIndex.from_product([np.arange(1000),
- np.arange(1000)],
- names=['one', 'two'])
- self.obj_index = np.array([(0, 10), (0, 11), (0, 12),
- (0, 13), (0, 14), (0, 15),
- (0, 16), (0, 17), (0, 18),
- (0, 19)], dtype=object)
+ self.mi_int = MultiIndex.from_product(
+ [np.arange(1000), np.arange(1000)], names=["one", "two"]
+ )
+ self.obj_index = np.array(
+ [
+ (0, 10),
+ (0, 11),
+ (0, 12),
+ (0, 13),
+ (0, 14),
+ (0, 15),
+ (0, 16),
+ (0, 17),
+ (0, 18),
+ (0, 19),
+ ],
+ dtype=object,
+ )
def time_get_indexer(self):
self.mi_int.get_indexer(self.obj_index)
@@ -73,12 +81,9 @@ def time_is_monotonic(self):
class Duplicated:
-
def setup(self):
n, k = 200, 5000
- levels = [np.arange(n),
- tm.makeStringIndex(n).values,
- 1000 + np.arange(n)]
+ levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
codes = [np.random.choice(n, (k * n)) for lev in levels]
self.mi = MultiIndex(levels=levels, codes=codes)
@@ -87,12 +92,13 @@ def time_duplicated(self):
class Sortlevel:
-
def setup(self):
n = 1182720
low, high = -4096, 4096
- arrs = [np.repeat(np.random.randint(low, high, (n // k)), k)
- for k in [11, 7, 5, 3, 1]]
+ arrs = [
+ np.repeat(np.random.randint(low, high, (n // k)), k)
+ for k in [11, 7, 5, 3, 1]
+ ]
self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)]
a = np.repeat(np.arange(100), 1000)
@@ -111,11 +117,10 @@ def time_sortlevel_one(self):
class Values:
-
def setup_cache(self):
level1 = range(1000)
- level2 = date_range(start='1/1/2012', periods=100)
+ level2 = date_range(start="1/1/2012", periods=100)
mi = MultiIndex.from_product([level1, level2])
return mi
@@ -127,17 +132,18 @@ def time_datetime_level_values_sliced(self, mi):
class CategoricalLevel:
-
def setup(self):
- self.df = DataFrame({
- 'a': np.arange(1_000_000, dtype=np.int32),
- 'b': np.arange(1_000_000, dtype=np.int64),
- 'c': np.arange(1_000_000, dtype=float),
- }).astype({'a': 'category', 'b': 'category'})
+ self.df = DataFrame(
+ {
+ "a": np.arange(1_000_000, dtype=np.int32),
+ "b": np.arange(1_000_000, dtype=np.int64),
+ "c": np.arange(1_000_000, dtype=float),
+ }
+ ).astype({"a": "category", "b": "category"})
def time_categorical_level(self):
- self.df.set_index(['a', 'b'])
+ self.df.set_index(["a", "b"])
from .pandas_vb_common import setup # noqa: F401
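A small, self-contained example of the MultiIndex lookups the GetLoc and Integer benchmarks above exercise; the index here is deliberately tiny.

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_product(
        [np.arange(10), list("AB")], names=["one", "two"]
    )

    # position of a fully-specified key tuple
    mi.get_loc((3, "B"))

    # positions of several keys at once (-1 marks keys that are absent)
    mi.get_indexer([(0, "A"), (1, "B"), (9, "Z")])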
diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py
index 26e344758596f..31c3b6fb6cb60 100644
--- a/asv_bench/benchmarks/offset.py
+++ b/asv_bench/benchmarks/offset.py
@@ -3,42 +3,51 @@
import numpy as np
import pandas as pd
+
try:
import pandas.tseries.holiday # noqa
except ImportError:
pass
hcal = pd.tseries.holiday.USFederalHolidayCalendar()
-# These offests currently raise a NotImplimentedError with .apply_index()
-non_apply = [pd.offsets.Day(),
- pd.offsets.BYearEnd(),
- pd.offsets.BYearBegin(),
- pd.offsets.BQuarterEnd(),
- pd.offsets.BQuarterBegin(),
- pd.offsets.BMonthEnd(),
- pd.offsets.BMonthBegin(),
- pd.offsets.CustomBusinessDay(),
- pd.offsets.CustomBusinessDay(calendar=hcal),
- pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
- pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
- pd.offsets.CustomBusinessMonthEnd(calendar=hcal)]
-other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(),
- pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(),
- pd.offsets.MonthEnd(), pd.offsets.MonthBegin(),
- pd.offsets.DateOffset(months=2, days=2),
- pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(),
- pd.offsets.SemiMonthBegin()]
+# These offsets currently raise a NotImplementedError with .apply_index()
+non_apply = [
+ pd.offsets.Day(),
+ pd.offsets.BYearEnd(),
+ pd.offsets.BYearBegin(),
+ pd.offsets.BQuarterEnd(),
+ pd.offsets.BQuarterBegin(),
+ pd.offsets.BMonthEnd(),
+ pd.offsets.BMonthBegin(),
+ pd.offsets.CustomBusinessDay(),
+ pd.offsets.CustomBusinessDay(calendar=hcal),
+ pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
+ pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
+ pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
+]
+other_offsets = [
+ pd.offsets.YearEnd(),
+ pd.offsets.YearBegin(),
+ pd.offsets.QuarterEnd(),
+ pd.offsets.QuarterBegin(),
+ pd.offsets.MonthEnd(),
+ pd.offsets.MonthBegin(),
+ pd.offsets.DateOffset(months=2, days=2),
+ pd.offsets.BusinessDay(),
+ pd.offsets.SemiMonthEnd(),
+ pd.offsets.SemiMonthBegin(),
+]
offsets = non_apply + other_offsets
class ApplyIndex:
params = other_offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 10000
- self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ self.rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
def time_apply_index(self, offset):
offset.apply_index(self.rng)
@@ -47,13 +56,15 @@ def time_apply_index(self, offset):
class OnOffset:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
- self.dates = [datetime(2016, m, d)
- for m in [10, 11, 12]
- for d in [1, 2, 3, 28, 29, 30, 31]
- if not (m == 11 and d == 31)]
+ self.dates = [
+ datetime(2016, m, d)
+ for m in [10, 11, 12]
+ for d in [1, 2, 3, 28, 29, 30, 31]
+ if not (m == 11 and d == 31)
+ ]
def time_on_offset(self, offset):
for date in self.dates:
@@ -63,11 +74,11 @@ def time_on_offset(self, offset):
class OffsetSeriesArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 1000
- rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
self.data = pd.Series(rng)
def time_add_offset(self, offset):
@@ -78,11 +89,11 @@ def time_add_offset(self, offset):
class OffsetDatetimeIndexArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 1000
- self.data = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ self.data = pd.date_range(start="1/1/2000", periods=N, freq="T")
def time_add_offset(self, offset):
with warnings.catch_warnings(record=True):
@@ -92,11 +103,11 @@ def time_add_offset(self, offset):
class OffestDatetimeArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
self.date = datetime(2011, 1, 1)
- self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.dt64 = np.datetime64("2011-01-01 09:00Z")
def time_apply(self, offset):
offset.apply(self.date)
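A sketch of the split that the non_apply / other_offsets lists above encode, assuming the pandas version this patch targets (where DateOffset.apply_index still exists): offsets in other_offsets support the vectorized .apply_index() on a DatetimeIndex, while, per the comment above, those in non_apply raise NotImplementedError and have to be applied element-wise.

    import pandas as pd

    rng = pd.date_range(start="1/1/2000", periods=5, freq="T")

    # vectorized path exercised by the ApplyIndex benchmark
    pd.offsets.MonthEnd().apply_index(rng)

    # offsets from non_apply fall back to element-wise application
    try:
        pd.offsets.Day().apply_index(rng)
    except NotImplementedError:
        [ts + pd.offsets.Day() for ts in rng]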
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
index 59b1638920666..fdc8207021c0f 100644
--- a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -5,26 +5,42 @@
import pandas as pd
# Compatibility import for lib
-for imp in ['pandas._libs.lib', 'pandas.lib']:
+for imp in ["pandas._libs.lib", "pandas.lib"]:
try:
lib = import_module(imp)
break
except (ImportError, TypeError, ValueError):
pass
-numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
- np.float64, np.int16, np.int8, np.uint16, np.uint8]
+numeric_dtypes = [
+ np.int64,
+ np.int32,
+ np.uint32,
+ np.uint64,
+ np.float32,
+ np.float64,
+ np.int16,
+ np.int8,
+ np.uint16,
+ np.uint8,
+]
datetime_dtypes = [np.datetime64, np.timedelta64]
string_dtypes = [np.object]
try:
- extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
- pd.Int32Dtype, pd.Int64Dtype,
- pd.UInt8Dtype, pd.UInt16Dtype,
- pd.UInt32Dtype, pd.UInt64Dtype,
- pd.CategoricalDtype,
- pd.IntervalDtype,
- pd.DatetimeTZDtype('ns', 'UTC'),
- pd.PeriodDtype('D')]
+ extension_dtypes = [
+ pd.Int8Dtype,
+ pd.Int16Dtype,
+ pd.Int32Dtype,
+ pd.Int64Dtype,
+ pd.UInt8Dtype,
+ pd.UInt16Dtype,
+ pd.UInt32Dtype,
+ pd.UInt64Dtype,
+ pd.CategoricalDtype,
+ pd.IntervalDtype,
+ pd.DatetimeTZDtype("ns", "UTC"),
+ pd.PeriodDtype("D"),
+ ]
except AttributeError:
extension_dtypes = []
@@ -40,6 +56,7 @@ class BaseIO:
"""
Base class for IO benchmarks
"""
+
fname = None
def remove(self, f):
diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py
index c8ba6c382cb64..2f8ae0650ab75 100644
--- a/asv_bench/benchmarks/period.py
+++ b/asv_bench/benchmarks/period.py
@@ -1,18 +1,33 @@
-from pandas import (
- DataFrame, Period, PeriodIndex, Series, date_range, period_range)
+from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range
from pandas.tseries.frequencies import to_offset
class PeriodProperties:
- params = (['M', 'min'],
- ['year', 'month', 'day', 'hour', 'minute', 'second',
- 'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth',
- 'dayofweek', 'dayofyear', 'start_time', 'end_time'])
- param_names = ['freq', 'attr']
+ params = (
+ ["M", "min"],
+ [
+ "year",
+ "month",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "is_leap_year",
+ "quarter",
+ "qyear",
+ "week",
+ "daysinmonth",
+ "dayofweek",
+ "dayofyear",
+ "start_time",
+ "end_time",
+ ],
+ )
+ param_names = ["freq", "attr"]
def setup(self, freq, attr):
- self.per = Period('2012-06-01', freq=freq)
+ self.per = Period("2012-06-01", freq=freq)
def time_property(self, freq, attr):
getattr(self.per, attr)
@@ -20,11 +35,11 @@ def time_property(self, freq, attr):
class PeriodUnaryMethods:
- params = ['M', 'min']
- param_names = ['freq']
+ params = ["M", "min"]
+ param_names = ["freq"]
def setup(self, freq):
- self.per = Period('2012-06-01', freq=freq)
+ self.per = Period("2012-06-01", freq=freq)
def time_to_timestamp(self, freq):
self.per.to_timestamp()
@@ -33,12 +48,12 @@ def time_now(self, freq):
self.per.now(freq)
def time_asfreq(self, freq):
- self.per.asfreq('A')
+ self.per.asfreq("A")
class PeriodConstructor:
- params = [['D'], [True, False]]
- param_names = ['freq', 'is_offset']
+ params = [["D"], [True, False]]
+ param_names = ["freq", "is_offset"]
def setup(self, freq, is_offset):
if is_offset:
@@ -47,20 +62,21 @@ def setup(self, freq, is_offset):
self.freq = freq
def time_period_constructor(self, freq, is_offset):
- Period('2012-06-01', freq=freq)
+ Period("2012-06-01", freq=freq)
class PeriodIndexConstructor:
- params = [['D'], [True, False]]
- param_names = ['freq', 'is_offset']
+ params = [["D"], [True, False]]
+ param_names = ["freq", "is_offset"]
def setup(self, freq, is_offset):
- self.rng = date_range('1985', periods=1000)
- self.rng2 = date_range('1985', periods=1000).to_pydatetime()
+ self.rng = date_range("1985", periods=1000)
+ self.rng2 = date_range("1985", periods=1000).to_pydatetime()
self.ints = list(range(2000, 3000))
- self.daily_ints = date_range('1/1/2000', periods=1000,
- freq=freq).strftime('%Y%m%d').map(int)
+ self.daily_ints = (
+ date_range("1/1/2000", periods=1000, freq=freq).strftime("%Y%m%d").map(int)
+ )
if is_offset:
self.freq = to_offset(freq)
else:
@@ -80,32 +96,35 @@ def time_from_ints_daily(self, freq, is_offset):
class DataFramePeriodColumn:
-
def setup(self):
- self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
+ self.rng = period_range(start="1/1/1990", freq="S", periods=20000)
self.df = DataFrame(index=range(len(self.rng)))
def time_setitem_period_column(self):
- self.df['col'] = self.rng
+ self.df["col"] = self.rng
def time_set_index(self):
# GH#21582 limited by comparisons of Period objects
- self.df['col2'] = self.rng
- self.df.set_index('col2', append=True)
+ self.df["col2"] = self.rng
+ self.df.set_index("col2", append=True)
class Algorithms:
- params = ['index', 'series']
- param_names = ['typ']
+ params = ["index", "series"]
+ param_names = ["typ"]
def setup(self, typ):
- data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
- Period('2011-03', freq='M'), Period('2011-04', freq='M')]
-
- if typ == 'index':
- self.vector = PeriodIndex(data * 1000, freq='M')
- elif typ == 'series':
+ data = [
+ Period("2011-01", freq="M"),
+ Period("2011-02", freq="M"),
+ Period("2011-03", freq="M"),
+ Period("2011-04", freq="M"),
+ ]
+
+ if typ == "index":
+ self.vector = PeriodIndex(data * 1000, freq="M")
+ elif typ == "series":
self.vector = Series(data * 1000)
def time_drop_duplicates(self, typ):
@@ -116,9 +135,8 @@ def time_value_counts(self, typ):
class Indexing:
-
def setup(self):
- self.index = period_range(start='1985', periods=1000, freq='D')
+ self.index = period_range(start="1985", periods=1000, freq="D")
self.series = Series(range(1000), index=self.index)
self.period = self.index[500]
@@ -135,7 +153,7 @@ def time_series_loc(self):
self.series.loc[self.period]
def time_align(self):
- DataFrame({'a': self.series, 'b': self.series[:500]})
+ DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index[:750].intersection(self.index[250:])
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
index 9e3bc87c32987..4fb0876f05a0a 100644
--- a/asv_bench/benchmarks/plotting.py
+++ b/asv_bench/benchmarks/plotting.py
@@ -1,27 +1,29 @@
import numpy as np
from pandas import DataFrame, Series, DatetimeIndex, date_range
+
try:
from pandas.plotting import andrews_curves
except ImportError:
from pandas.tools.plotting import andrews_curves
import matplotlib
-matplotlib.use('Agg')
+
+matplotlib.use("Agg")
class SeriesPlotting:
- params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']]
- param_names = ['kind']
+ params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]]
+ param_names = ["kind"]
def setup(self, kind):
- if kind in ['bar', 'barh', 'pie']:
+ if kind in ["bar", "barh", "pie"]:
n = 100
- elif kind in ['kde']:
+ elif kind in ["kde"]:
n = 10000
else:
n = 1000000
self.s = Series(np.random.randn(n))
- if kind in ['area', 'pie']:
+ if kind in ["area", "pie"]:
self.s = self.s.abs()
def time_series_plot(self, kind):
@@ -29,41 +31,43 @@ def time_series_plot(self, kind):
class FramePlotting:
- params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter',
- 'hexbin']]
- param_names = ['kind']
+ params = [
+ ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"]
+ ]
+ param_names = ["kind"]
def setup(self, kind):
- if kind in ['bar', 'barh', 'pie']:
+ if kind in ["bar", "barh", "pie"]:
n = 100
- elif kind in ['kde', 'scatter', 'hexbin']:
+ elif kind in ["kde", "scatter", "hexbin"]:
n = 10000
else:
n = 1000000
self.x = Series(np.random.randn(n))
self.y = Series(np.random.randn(n))
- if kind in ['area', 'pie']:
+ if kind in ["area", "pie"]:
self.x = self.x.abs()
self.y = self.y.abs()
- self.df = DataFrame({'x': self.x, 'y': self.y})
+ self.df = DataFrame({"x": self.x, "y": self.y})
def time_frame_plot(self, kind):
- self.df.plot(x='x', y='y', kind=kind)
+ self.df.plot(x="x", y="y", kind=kind)
class TimeseriesPlotting:
-
def setup(self):
N = 2000
M = 5
- idx = date_range('1/1/1975', periods=N)
+ idx = date_range("1/1/1975", periods=N)
self.df = DataFrame(np.random.randn(N, M), index=idx)
- idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10],
- idx.values[12:])))
- self.df2 = DataFrame(np.random.randn(len(idx_irregular), M),
- index=idx_irregular)
+ idx_irregular = DatetimeIndex(
+ np.concatenate((idx.values[0:10], idx.values[12:]))
+ )
+ self.df2 = DataFrame(
+ np.random.randn(len(idx_irregular), M), index=idx_irregular
+ )
def time_plot_regular(self):
self.df.plot()
@@ -79,12 +83,11 @@ def time_plot_table(self):
class Misc:
-
def setup(self):
N = 500
M = 10
self.df = DataFrame(np.random.randn(N, M))
- self.df['Name'] = ["A"] * N
+ self.df["Name"] = ["A"] * N
def time_plot_andrews_curves(self):
andrews_curves(self.df, "Name")
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
index a6ceb0e93a089..8d4c9ebaf3e89 100644
--- a/asv_bench/benchmarks/reindex.py
+++ b/asv_bench/benchmarks/reindex.py
@@ -1,20 +1,18 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex, Index, date_range,
- period_range)
+from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range
from .pandas_vb_common import lib
class Reindex:
-
def setup(self):
- rng = date_range(start='1/1/1970', periods=10000, freq='1min')
- self.df = DataFrame(np.random.rand(10000, 10), index=rng,
- columns=range(10))
- self.df['foo'] = 'bar'
+ rng = date_range(start="1/1/1970", periods=10000, freq="1min")
+ self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10))
+ self.df["foo"] = "bar"
self.rng_subset = Index(rng[::2])
- self.df2 = DataFrame(index=range(10000),
- data=np.random.rand(10000, 30), columns=range(30))
+ self.df2 = DataFrame(
+ index=range(10000), data=np.random.rand(10000, 30), columns=range(30)
+ )
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
@@ -35,12 +33,12 @@ def time_reindex_multiindex(self):
class ReindexMethod:
- params = [['pad', 'backfill'], [date_range, period_range]]
- param_names = ['method', 'constructor']
+ params = [["pad", "backfill"], [date_range, period_range]]
+ param_names = ["method", "constructor"]
def setup(self, method, constructor):
N = 100000
- self.idx = constructor('1/1/2000', periods=N, freq='1min')
+ self.idx = constructor("1/1/2000", periods=N, freq="1min")
self.ts = Series(np.random.randn(N), index=self.idx)[::2]
def time_reindex_method(self, method, constructor):
@@ -49,15 +47,15 @@ def time_reindex_method(self, method, constructor):
class Fillna:
- params = ['pad', 'backfill']
- param_names = ['method']
+ params = ["pad", "backfill"]
+ param_names = ["method"]
def setup(self, method):
N = 100000
- self.idx = date_range('1/1/2000', periods=N, freq='1min')
+ self.idx = date_range("1/1/2000", periods=N, freq="1min")
ts = Series(np.random.randn(N), index=self.idx)[::2]
self.ts_reindexed = ts.reindex(self.idx)
- self.ts_float32 = self.ts_reindexed.astype('float32')
+ self.ts_float32 = self.ts_reindexed.astype("float32")
def time_reindexed(self, method):
self.ts_reindexed.fillna(method=method)
@@ -67,17 +65,17 @@ def time_float_32(self, method):
class LevelAlign:
-
def setup(self):
self.index = MultiIndex(
levels=[np.arange(10), np.arange(100), np.arange(100)],
- codes=[np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)])
- self.df = DataFrame(np.random.randn(len(self.index), 4),
- index=self.index)
- self.df_level = DataFrame(np.random.randn(100, 4),
- index=self.index.levels[1])
+ codes=[
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ],
+ )
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
def time_align_level(self):
self.df.align(self.df_level, level=1, copy=False)
@@ -89,15 +87,16 @@ def time_reindex_level(self):
class DropDuplicates:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
- self.df = DataFrame({'key1': key1, 'key2': key2,
- 'value': np.random.randn(N * K)})
+ self.df = DataFrame(
+ {"key1": key1, "key2": key2, "value": np.random.randn(N * K)}
+ )
self.df_nan = self.df.copy()
self.df_nan.iloc[:10000, :] = np.nan
@@ -107,15 +106,14 @@ def setup(self, inplace):
N = 1000000
K = 10000
key1 = np.random.randint(0, K, size=N)
- self.df_int = DataFrame({'key1': key1})
- self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
- dtype=bool))
+ self.df_int = DataFrame({"key1": key1})
+ self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), dtype=bool))
def time_frame_drop_dups(self, inplace):
- self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)
+ self.df.drop_duplicates(["key1", "key2"], inplace=inplace)
def time_frame_drop_dups_na(self, inplace):
- self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)
+ self.df_nan.drop_duplicates(["key1", "key2"], inplace=inplace)
def time_series_drop_dups_int(self, inplace):
self.s.drop_duplicates(inplace=inplace)
@@ -137,16 +135,16 @@ def setup(self):
indices = tm.makeStringIndex(n)
subsample_size = 40000
self.x = Series(np.random.randn(n), indices)
- self.y = Series(np.random.randn(subsample_size),
- index=np.random.choice(indices, subsample_size,
- replace=False))
+ self.y = Series(
+ np.random.randn(subsample_size),
+ index=np.random.choice(indices, subsample_size, replace=False),
+ )
def time_align_series_irregular_string(self):
self.x + self.y
class LibFastZip:
-
def setup(self):
N = 10000
K = 10
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 9dff1778f8e56..f69ae15028525 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -5,11 +5,11 @@
class FillNa:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
- N = 10**6
- rng = pd.date_range('1/1/2000', periods=N, freq='min')
+ N = 10 ** 6
+ rng = pd.date_range("1/1/2000", periods=N, freq="min")
data = np.random.randn(N)
data[::2] = np.nan
self.ts = pd.Series(data, index=rng)
@@ -24,28 +24,48 @@ def time_replace(self, inplace):
class ReplaceDict:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
- N = 10**5
- start_value = 10**5
+ N = 10 ** 5
+ start_value = 10 ** 5
self.to_rep = dict(enumerate(np.arange(N) + start_value))
- self.s = pd.Series(np.random.randint(N, size=10**3))
+ self.s = pd.Series(np.random.randint(N, size=10 ** 3))
def time_replace_series(self, inplace):
self.s.replace(self.to_rep, inplace=inplace)
+class ReplaceList:
+ # GH#28099
+
+ params = [(True, False)]
+ param_names = ["inplace"]
+
+ def setup(self, inplace):
+ self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7))
+
+ def time_replace_list(self, inplace):
+ self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace)
+
+ def time_replace_list_one_match(self, inplace):
+    # the 1 can be held in self._df.blocks[0], while the inf and -inf can't
+ self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace)
+
+
class Convert:
- params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
- param_names = ['constructor', 'replace_data']
+ params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"])
+ param_names = ["constructor", "replace_data"]
def setup(self, constructor, replace_data):
- N = 10**3
- data = {'Series': pd.Series(np.random.randint(N, size=N)),
- 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
- 'B': np.random.randint(N, size=N)})}
+ N = 10 ** 3
+ data = {
+ "Series": pd.Series(np.random.randint(N, size=N)),
+ "DataFrame": pd.DataFrame(
+ {"A": np.random.randint(N, size=N), "B": np.random.randint(N, size=N)}
+ ),
+ }
self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
self.data = data[constructor]
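The new ReplaceList benchmark (GH#28099) above times DataFrame.replace with a list of targets. A toy-sized sketch of the two cases it measures (the benchmark itself uses 4 * 10 ** 7 rows):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": 0, "B": 0}, index=range(1000))

    # neither target occurs in the all-zeros frame; this measures the scan cost
    df.replace([np.inf, -np.inf], np.nan)

    # 1 is representable by the integer blocks (inf and -inf are not), which, per
    # the comment in the benchmark, exercises a different code path
    df.replace([np.inf, -np.inf, 1], np.nan)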
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 678403d837805..cc373f413fb88 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -7,35 +7,33 @@
class Melt:
-
def setup(self):
- self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C'])
- self.df['id1'] = np.random.randint(0, 10, 10000)
- self.df['id2'] = np.random.randint(100, 1000, 10000)
+ self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"])
+ self.df["id1"] = np.random.randint(0, 10, 10000)
+ self.df["id2"] = np.random.randint(100, 1000, 10000)
def time_melt_dataframe(self):
- melt(self.df, id_vars=['id1', 'id2'])
+ melt(self.df, id_vars=["id1", "id2"])
class Pivot:
-
def setup(self):
N = 10000
- index = date_range('1/1/2000', periods=N, freq='h')
- data = {'value': np.random.randn(N * 50),
- 'variable': np.arange(50).repeat(N),
- 'date': np.tile(index.values, 50)}
+ index = date_range("1/1/2000", periods=N, freq="h")
+ data = {
+ "value": np.random.randn(N * 50),
+ "variable": np.arange(50).repeat(N),
+ "date": np.tile(index.values, 50),
+ }
self.df = DataFrame(data)
def time_reshape_pivot_time_series(self):
- self.df.pivot('date', 'variable', 'value')
+ self.df.pivot("date", "variable", "value")
class SimpleReshape:
-
def setup(self):
- arrays = [np.arange(100).repeat(100),
- np.roll(np.tile(np.arange(100), 100), 25)]
+ arrays = [np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]
index = MultiIndex.from_arrays(arrays)
self.df = DataFrame(np.random.randn(10000, 4), index=index)
self.udf = self.df.unstack(1)
@@ -49,7 +47,7 @@ def time_unstack(self):
class Unstack:
- params = ['int', 'category']
+ params = ["int", "category"]
def setup(self, dtype):
m = 100
@@ -58,7 +56,7 @@ def setup(self, dtype):
levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
columns = np.arange(n)
- if dtype == 'int':
+ if dtype == "int":
values = np.arange(m * m * n).reshape(m * m, n)
else:
# the category branch is ~20x slower than int. So we
@@ -80,84 +78,94 @@ def time_without_last_row(self, dtype):
class SparseIndex:
-
def setup(self):
NUM_ROWS = 1000
- self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS),
- 'B': np.random.randint(50, size=NUM_ROWS),
- 'C': np.random.randint(-10, 10, size=NUM_ROWS),
- 'D': np.random.randint(-10, 10, size=NUM_ROWS),
- 'E': np.random.randint(10, size=NUM_ROWS),
- 'F': np.random.randn(NUM_ROWS)})
- self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E'])
+ self.df = DataFrame(
+ {
+ "A": np.random.randint(50, size=NUM_ROWS),
+ "B": np.random.randint(50, size=NUM_ROWS),
+ "C": np.random.randint(-10, 10, size=NUM_ROWS),
+ "D": np.random.randint(-10, 10, size=NUM_ROWS),
+ "E": np.random.randint(10, size=NUM_ROWS),
+ "F": np.random.randn(NUM_ROWS),
+ }
+ )
+ self.df = self.df.set_index(["A", "B", "C", "D", "E"])
def time_unstack(self):
self.df.unstack()
class WideToLong:
-
def setup(self):
nyrs = 20
nidvars = 20
N = 5000
- self.letters = list('ABCD')
- yrvars = [l + str(num)
- for l, num in product(self.letters, range(1, nyrs + 1))]
+ self.letters = list("ABCD")
+ yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
columns = [str(i) for i in range(nidvars)] + yrvars
- self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)),
- columns=columns)
- self.df['id'] = self.df.index
+ self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
+ self.df["id"] = self.df.index
def time_wide_to_long_big(self):
- wide_to_long(self.df, self.letters, i='id', j='year')
+ wide_to_long(self.df, self.letters, i="id", j="year")
class PivotTable:
-
def setup(self):
N = 100000
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
ind1 = np.random.randint(0, 3, size=N)
ind2 = np.random.randint(0, 2, size=N)
- self.df = DataFrame({'key1': fac1.take(ind1),
- 'key2': fac2.take(ind2),
- 'key3': fac2.take(ind2),
- 'value1': np.random.randn(N),
- 'value2': np.random.randn(N),
- 'value3': np.random.randn(N)})
- self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'),
- 'col3': [1, 2, 3, 4, 5]})
- self.df2.col1 = self.df2.col1.astype('category')
- self.df2.col2 = self.df2.col2.astype('category')
+ self.df = DataFrame(
+ {
+ "key1": fac1.take(ind1),
+ "key2": fac2.take(ind2),
+ "key3": fac2.take(ind2),
+ "value1": np.random.randn(N),
+ "value2": np.random.randn(N),
+ "value3": np.random.randn(N),
+ }
+ )
+ self.df2 = DataFrame(
+ {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]}
+ )
+ self.df2.col1 = self.df2.col1.astype("category")
+ self.df2.col2 = self.df2.col2.astype("category")
def time_pivot_table(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'])
+ self.df.pivot_table(index="key1", columns=["key2", "key3"])
def time_pivot_table_agg(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'],
- aggfunc=['sum', 'mean'])
+ self.df.pivot_table(
+ index="key1", columns=["key2", "key3"], aggfunc=["sum", "mean"]
+ )
def time_pivot_table_margins(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'],
- margins=True)
+ self.df.pivot_table(index="key1", columns=["key2", "key3"], margins=True)
def time_pivot_table_categorical(self):
- self.df2.pivot_table(index='col1', values='col3', columns='col2',
- aggfunc=np.sum, fill_value=0)
+ self.df2.pivot_table(
+ index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0
+ )
def time_pivot_table_categorical_observed(self):
- self.df2.pivot_table(index='col1', values='col3', columns='col2',
- aggfunc=np.sum, fill_value=0, observed=True)
+ self.df2.pivot_table(
+ index="col1",
+ values="col3",
+ columns="col2",
+ aggfunc=np.sum,
+ fill_value=0,
+ observed=True,
+ )
class Crosstab:
-
def setup(self):
N = 100000
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
self.ind1 = np.random.randint(0, 3, size=N)
self.ind2 = np.random.randint(0, 2, size=N)
self.vec1 = fac1.take(self.ind1)
@@ -167,7 +175,7 @@ def time_crosstab(self):
pd.crosstab(self.vec1, self.vec2)
def time_crosstab_values(self):
- pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum')
+ pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc="sum")
def time_crosstab_normalize(self):
pd.crosstab(self.vec1, self.vec2, normalize=True)
@@ -179,8 +187,10 @@ def time_crosstab_normalize_margins(self):
class GetDummies:
def setup(self):
categories = list(string.ascii_letters[:12])
- s = pd.Series(np.random.choice(categories, size=1000000),
- dtype=pd.api.types.CategoricalDtype(categories))
+ s = pd.Series(
+ np.random.choice(categories, size=1000000),
+ dtype=pd.api.types.CategoricalDtype(categories),
+ )
self.s = s
def time_get_dummies_1d(self):
@@ -192,16 +202,19 @@ def time_get_dummies_1d_sparse(self):
class Cut:
params = [[4, 10, 1000]]
- param_names = ['bins']
+ param_names = ["bins"]
def setup(self, bins):
- N = 10**5
+ N = 10 ** 5
self.int_series = pd.Series(np.arange(N).repeat(5))
self.float_series = pd.Series(np.random.randn(N).repeat(5))
- self.timedelta_series = pd.Series(np.random.randint(N, size=N),
- dtype='timedelta64[ns]')
- self.datetime_series = pd.Series(np.random.randint(N, size=N),
- dtype='datetime64[ns]')
+ self.timedelta_series = pd.Series(
+ np.random.randint(N, size=N), dtype="timedelta64[ns]"
+ )
+ self.datetime_series = pd.Series(
+ np.random.randint(N, size=N), dtype="datetime64[ns]"
+ )
+ self.interval_bins = pd.IntervalIndex.from_breaks(np.linspace(0, N, bins))
def time_cut_int(self, bins):
pd.cut(self.int_series, bins)
@@ -227,5 +240,26 @@ def time_qcut_timedelta(self, bins):
def time_qcut_datetime(self, bins):
pd.qcut(self.datetime_series, bins)
+ def time_cut_interval(self, bins):
+ # GH 27668
+ pd.cut(self.int_series, self.interval_bins)
+
+ def peakmem_cut_interval(self, bins):
+ # GH 27668
+ pd.cut(self.int_series, self.interval_bins)
+
+
+class Explode:
+ param_names = ["n_rows", "max_list_length"]
+ params = [[100, 1000, 10000], [3, 5, 10]]
+
+ def setup(self, n_rows, max_list_length):
+
+ data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)]
+ self.series = pd.Series(data)
+
+ def time_explode(self, n_rows, max_list_length):
+ self.series.explode()
+
from .pandas_vb_common import setup # noqa: F401
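A toy-sized sketch of the two benchmarks newly added to reshape.py above: pd.cut with a precomputed IntervalIndex of bins (GH 27668) and Series.explode on a column of list-likes. Sizes are illustrative only.

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(100).repeat(5))
    interval_bins = pd.IntervalIndex.from_breaks(np.linspace(0, 100, 10))
    pd.cut(s, interval_bins)  # bin with explicit, pre-built intervals

    lists = pd.Series([np.arange(np.random.randint(3)) for _ in range(10)])
    lists.explode()  # one output row per list element (empty lists become NaN)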
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 033b466c8b9be..a70977fcf539f 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -4,15 +4,16 @@
class Methods:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)
@@ -22,14 +23,15 @@ def time_rolling(self, constructor, window, dtype, method):
class ExpandingMethods:
- params = (['DataFrame', 'Series'],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.expanding = getattr(pd, constructor)(arr).expanding()
@@ -39,14 +41,11 @@ def time_expanding(self, constructor, dtype, method):
class EWMMethods:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- ['mean', 'std'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"])
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window)
@@ -55,29 +54,28 @@ def time_ewm(self, constructor, window, dtype, method):
class VariableWindowMethods(Methods):
- params = (['DataFrame', 'Series'],
- ['50s', '1h', '1d'],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ ["50s", "1h", "1d"],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
- index = pd.date_range('2017-01-01', periods=N, freq='5s')
+ index = pd.date_range("2017-01-01", periods=N, freq="5s")
self.roll = getattr(pd, constructor)(arr, index=index).rolling(window)
class Pairwise:
- params = ([10, 1000, None],
- ['corr', 'cov'],
- [True, False])
- param_names = ['window', 'method', 'pairwise']
+ params = ([10, 1000, None], ["corr", "cov"], [True, False])
+ param_names = ["window", "method", "pairwise"]
def setup(self, window, method, pairwise):
- N = 10**4
+ N = 10 ** 4
arr = np.random.random(N)
self.df = pd.DataFrame(arr)
@@ -90,25 +88,25 @@ def time_pairwise(self, window, method, pairwise):
class Quantile:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- [0, 0.5, 1],
- ['linear', 'nearest', 'lower', 'higher', 'midpoint'])
- param_names = ['constructor', 'window', 'dtype', 'percentile']
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ [0, 0.5, 1],
+ ["linear", "nearest", "lower", "higher", "midpoint"],
+ )
+ param_names = ["constructor", "window", "dtype", "percentile"]
def setup(self, constructor, window, dtype, percentile, interpolation):
N = 10 ** 5
arr = np.random.random(N).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)
- def time_quantile(self, constructor, window, dtype, percentile,
- interpolation):
+ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
self.roll.quantile(percentile, interpolation=interpolation)
class PeakMemFixed:
-
def setup(self):
N = 10
arr = 100 * np.random.random(N)
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 4b1af2dc8c932..6038a2ab4bd9f 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -7,13 +7,13 @@
class SeriesConstructor:
- params = [None, 'dict']
- param_names = ['data']
+ params = [None, "dict"]
+ param_names = ["data"]
def setup(self, data):
- self.idx = date_range(start=datetime(2015, 10, 26),
- end=datetime(2016, 1, 1),
- freq='50s')
+ self.idx = date_range(
+ start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
+ )
dict_data = dict(zip(self.idx, range(len(self.idx))))
self.data = None if data is None else dict_data
@@ -23,8 +23,8 @@ def time_constructor(self, data):
class IsIn:
- params = ['int64', 'uint64', 'object']
- param_names = ['dtype']
+ params = ["int64", "uint64", "object"]
+ param_names = ["dtype"]
def setup(self, dtype):
self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype)
@@ -35,12 +35,11 @@ def time_isin(self, dtypes):
class IsInFloat64:
-
def setup(self):
self.small = Series([1, 2], dtype=np.float64)
- self.many_different_values = np.arange(10**6, dtype=np.float64)
- self.few_different_values = np.zeros(10**7, dtype=np.float64)
- self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)
+ self.many_different_values = np.arange(10 ** 6, dtype=np.float64)
+ self.few_different_values = np.zeros(10 ** 7, dtype=np.float64)
+ self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64)
def time_isin_many_different(self):
# runtime is dominated by creation of the lookup-table
@@ -56,19 +55,18 @@ def time_isin_nan_values(self):
class IsInForObjects:
-
def setup(self):
- self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object)
- self.vals_nans = np.full(10**4, np.nan).astype(np.object)
+ self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(np.object)
+ self.vals_nans = np.full(10 ** 4, np.nan).astype(np.object)
self.s_short = Series(np.arange(2)).astype(np.object)
- self.s_long = Series(np.arange(10**5)).astype(np.object)
+ self.s_long = Series(np.arange(10 ** 5)).astype(np.object)
self.vals_short = np.arange(2).astype(np.object)
- self.vals_long = np.arange(10**5).astype(np.object)
+ self.vals_long = np.arange(10 ** 5).astype(np.object)
# because of nans floats are special:
- self.s_long_floats = Series(np.arange(10**5,
- dtype=np.float)).astype(np.object)
- self.vals_long_floats = np.arange(10**5,
- dtype=np.float).astype(np.object)
+ self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(
+ np.object
+ )
+ self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(np.object)
def time_isin_nans(self):
# if nan-objects are different objects,
@@ -94,8 +92,8 @@ def time_isin_long_series_long_values_floats(self):
class NSort:
- params = ['first', 'last', 'all']
- param_names = ['keep']
+ params = ["first", "last", "all"]
+ param_names = ["keep"]
def setup(self, keep):
self.s = Series(np.random.randint(1, 10, 100000))
@@ -109,15 +107,17 @@ def time_nsmallest(self, keep):
class Dropna:
- params = ['int', 'datetime']
- param_names = ['dtype']
+ params = ["int", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- data = {'int': np.random.randint(1, 10, N),
- 'datetime': date_range('2000-01-01', freq='S', periods=N)}
+ N = 10 ** 6
+ data = {
+ "int": np.random.randint(1, 10, N),
+ "datetime": date_range("2000-01-01", freq="S", periods=N),
+ }
self.s = Series(data[dtype])
- if dtype == 'datetime':
+ if dtype == "datetime":
self.s[np.random.randint(1, N, 100)] = NaT
def time_dropna(self, dtype):
@@ -127,37 +127,47 @@ def time_dropna(self, dtype):
class SearchSorted:
goal_time = 0.2
- params = ['int8', 'int16', 'int32', 'int64',
- 'uint8', 'uint16', 'uint32', 'uint64',
- 'float16', 'float32', 'float64',
- 'str']
- param_names = ['dtype']
+ params = [
+ "int8",
+ "int16",
+ "int32",
+ "int64",
+ "uint8",
+ "uint16",
+ "uint32",
+ "uint64",
+ "float16",
+ "float32",
+ "float64",
+ "str",
+ ]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**5
+ N = 10 ** 5
data = np.array([1] * N + [2] * N + [3] * N).astype(dtype)
self.s = Series(data)
def time_searchsorted(self, dtype):
- key = '2' if dtype == 'str' else 2
+ key = "2" if dtype == "str" else 2
self.s.searchsorted(key)
class Map:
- params = (['dict', 'Series', 'lambda'], ['object', 'category', 'int'])
- param_names = 'mapper'
+ params = (["dict", "Series", "lambda"], ["object", "category", "int"])
+ param_names = "mapper"
def setup(self, mapper, dtype):
map_size = 1000
map_data = Series(map_size - np.arange(map_size), dtype=dtype)
# construct mapper
- if mapper == 'Series':
+ if mapper == "Series":
self.map_data = map_data
- elif mapper == 'dict':
+ elif mapper == "dict":
self.map_data = map_data.to_dict()
- elif mapper == 'lambda':
+ elif mapper == "lambda":
map_dict = map_data.to_dict()
self.map_data = lambda x: map_dict[x]
else:
@@ -170,8 +180,8 @@ def time_map(self, mapper, *args, **kwargs):
class Clip:
- params = [50, 1000, 10**5]
- param_names = ['n']
+ params = [50, 1000, 10 ** 5]
+ param_names = ["n"]
def setup(self, n):
self.s = Series(np.random.randn(n))
@@ -182,8 +192,8 @@ def time_clip(self, n):
class ValueCounts:
- params = ['int', 'uint', 'float', 'object']
- param_names = ['dtype']
+ params = ["int", "uint", "float", "object"]
+ param_names = ["dtype"]
def setup(self, dtype):
self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)
@@ -193,7 +203,6 @@ def time_value_counts(self, dtype):
class Dir:
-
def setup(self):
self.s = Series(index=tm.makeStringIndex(10000))
@@ -204,47 +213,59 @@ def time_dir_strings(self):
class SeriesGetattr:
# https://github.com/pandas-dev/pandas/issues/19764
def setup(self):
- self.s = Series(1,
- index=date_range("2012-01-01", freq='s',
- periods=int(1e6)))
+ self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6)))
def time_series_datetimeindex_repr(self):
- getattr(self.s, 'a', None)
+ getattr(self.s, "a", None)
-class All(object):
+class All:
- params = [[10**3, 10**6], ['fast', 'slow']]
- param_names = ['N', 'case']
+ params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
+ param_names = ["N", "case"]
def setup(self, N, case):
- val = case != 'fast'
+ val = case != "fast"
self.s = Series([val] * N)
def time_all(self, N, case):
self.s.all()
-class Any(object):
+class Any:
- params = [[10**3, 10**6], ['fast', 'slow']]
- param_names = ['N', 'case']
+ params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
+ param_names = ["N", "case"]
def setup(self, N, case):
- val = case == 'fast'
+ val = case == "fast"
self.s = Series([val] * N)
def time_any(self, N, case):
self.s.any()
-class NanOps(object):
-
- params = [['var', 'mean', 'median', 'max', 'min', 'sum', 'std', 'sem',
- 'argmax', 'skew', 'kurt', 'prod'],
- [10**3, 10**6],
- ['int8', 'int32', 'int64', 'float64']]
- param_names = ['func', 'N', 'dtype']
+class NanOps:
+
+ params = [
+ [
+ "var",
+ "mean",
+ "median",
+ "max",
+ "min",
+ "sum",
+ "std",
+ "sem",
+ "argmax",
+ "skew",
+ "kurt",
+ "prod",
+ ],
+ [10 ** 3, 10 ** 6],
+ ["int8", "int32", "int64", "float64"],
+ ]
+ param_names = ["func", "N", "dtype"]
def setup(self, func, N, dtype):
self.s = Series([1] * N, dtype=dtype)
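
The hunks above only re-wrap the asv benchmark classes into black's layout; the params/param_names contract they rely on is unchanged. For readers less familiar with asv, here is a minimal sketch of that contract — the class name, parameter values, and sizes are illustrative, not taken from the patch:

```python
# Minimal asv benchmark sketch (illustrative names and sizes, not from the patch).
import numpy as np
from pandas import Series


class ExampleBenchmark:
    # asv runs setup() and every time_* method once per combination of these
    # parameter values; param_names labels each position in the results table.
    params = [["int64", "float64"], [10 ** 3, 10 ** 6]]
    param_names = ["dtype", "N"]

    def setup(self, dtype, N):
        self.s = Series(np.arange(N)).astype(dtype)

    def time_sum(self, dtype, N):
        self.s.sum()
```
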
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index 281e81f21ba9c..19d08c086a508 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -14,11 +14,10 @@ def make_array(size, dense_proportion, fill_value, dtype):
class SparseSeriesToFrame:
-
def setup(self):
K = 50
N = 50001
- rng = date_range('1/1/2000', periods=N, freq='T')
+ rng = date_range("1/1/2000", periods=N, freq="T")
self.series = {}
for i in range(1, K):
data = np.random.randn(N)[:-i]
@@ -32,12 +31,11 @@ def time_series_to_frame(self):
class SparseArrayConstructor:
- params = ([0.1, 0.01], [0, np.nan],
- [np.int64, np.float64, np.object])
- param_names = ['dense_proportion', 'fill_value', 'dtype']
+ params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object])
+ param_names = ["dense_proportion", "fill_value", "dtype"]
def setup(self, dense_proportion, fill_value, dtype):
- N = 10**6
+ N = 10 ** 6
self.array = make_array(N, dense_proportion, fill_value, dtype)
def time_sparse_array(self, dense_proportion, fill_value, dtype):
@@ -45,7 +43,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
class SparseDataFrameConstructor:
-
def setup(self):
N = 1000
self.arr = np.arange(N)
@@ -56,18 +53,16 @@ def time_from_scipy(self):
class FromCoo:
-
def setup(self):
- self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0],
- ([1, 0, 0], [0, 2, 3])),
- shape=(100, 100))
+ self.matrix = scipy.sparse.coo_matrix(
+ ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)
+ )
def time_sparse_series_from_coo(self):
pd.Series.sparse.from_coo(self.matrix)
class ToCoo:
-
def setup(self):
s = Series([np.nan] * 10000)
s[0] = 3.0
@@ -77,18 +72,16 @@ def setup(self):
self.ss = s.astype("Sparse")
def time_sparse_series_to_coo(self):
- self.ss.sparse.to_coo(row_levels=[0, 1],
- column_levels=[2, 3],
- sort_labels=True)
+ self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
class Arithmetic:
params = ([0.1, 0.01], [0, np.nan])
- param_names = ['dense_proportion', 'fill_value']
+ param_names = ["dense_proportion", "fill_value"]
def setup(self, dense_proportion, fill_value):
- N = 10**6
+ N = 10 ** 6
arr1 = make_array(N, dense_proportion, fill_value, np.int64)
self.array1 = SparseArray(arr1, fill_value=fill_value)
arr2 = make_array(N, dense_proportion, fill_value, np.int64)
@@ -110,22 +103,24 @@ def time_divide(self, dense_proportion, fill_value):
class ArithmeticBlock:
params = [np.nan, 0]
- param_names = ['fill_value']
+ param_names = ["fill_value"]
def setup(self, fill_value):
- N = 10**6
- self.arr1 = self.make_block_array(length=N, num_blocks=1000,
- block_size=10, fill_value=fill_value)
- self.arr2 = self.make_block_array(length=N, num_blocks=1000,
- block_size=10, fill_value=fill_value)
+ N = 10 ** 6
+ self.arr1 = self.make_block_array(
+ length=N, num_blocks=1000, block_size=10, fill_value=fill_value
+ )
+ self.arr2 = self.make_block_array(
+ length=N, num_blocks=1000, block_size=10, fill_value=fill_value
+ )
def make_block_array(self, length, num_blocks, block_size, fill_value):
arr = np.full(length, fill_value)
- indicies = np.random.choice(np.arange(0, length, block_size),
- num_blocks,
- replace=False)
+ indicies = np.random.choice(
+ np.arange(0, length, block_size), num_blocks, replace=False
+ )
for ind in indicies:
- arr[ind:ind + block_size] = np.random.randint(0, 100, block_size)
+ arr[ind : ind + block_size] = np.random.randint(0, 100, block_size)
return SparseArray(arr, fill_value=fill_value)
def time_make_union(self, fill_value):
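
As a side note, the block-structured arrays built by make_block_array feed straight into SparseArray; a small standalone illustration of that pattern follows (sizes and values are made up, and it assumes the pandas.arrays.SparseArray entry point):

```python
# Sketch of the mostly-constant-array -> SparseArray pattern used above
# (sizes and the fill value are hypothetical, for illustration only).
import numpy as np
from pandas.arrays import SparseArray

fill_value = 0
arr = np.full(100, fill_value)
arr[10:20] = np.random.randint(1, 100, 10)  # one dense block in a constant array
sparse = SparseArray(arr, fill_value=fill_value)
print(sparse.density)  # fraction of elements actually stored (~0.1 here)
```
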
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 3514335f92e77..620a6de0f5f34 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -2,14 +2,13 @@
import pandas as pd
-ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem',
- 'var']
+ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"]
class FrameOps:
- params = [ops, ['float', 'int'], [0, 1], [True, False]]
- param_names = ['op', 'dtype', 'axis', 'use_bottleneck']
+ params = [ops, ["float", "int"], [0, 1], [True, False]]
+ param_names = ["op", "dtype", "axis", "use_bottleneck"]
def setup(self, op, dtype, axis, use_bottleneck):
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
@@ -17,6 +16,7 @@ def setup(self, op, dtype, axis, use_bottleneck):
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.df_func = getattr(df, op)
@@ -27,13 +27,15 @@ def time_op(self, op, dtype, axis, use_bottleneck):
class FrameMultiIndexOps:
params = ([0, 1, [0, 1]], ops)
- param_names = ['level', 'op']
+ param_names = ["level", "op"]
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
- codes = [np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)]
+ codes = [
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ]
index = pd.MultiIndex(levels=levels, codes=codes)
df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
self.df_func = getattr(df, op)
@@ -44,8 +46,8 @@ def time_op(self, level, op):
class SeriesOps:
- params = [ops, ['float', 'int'], [True, False]]
- param_names = ['op', 'dtype', 'use_bottleneck']
+ params = [ops, ["float", "int"], [True, False]]
+ param_names = ["op", "dtype", "use_bottleneck"]
def setup(self, op, dtype, use_bottleneck):
s = pd.Series(np.random.randn(100000)).astype(dtype)
@@ -53,6 +55,7 @@ def setup(self, op, dtype, use_bottleneck):
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.s_func = getattr(s, op)
@@ -63,13 +66,15 @@ def time_op(self, op, dtype, use_bottleneck):
class SeriesMultiIndexOps:
params = ([0, 1, [0, 1]], ops)
- param_names = ['level', 'op']
+ param_names = ["level", "op"]
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
- codes = [np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)]
+ codes = [
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ]
index = pd.MultiIndex(levels=levels, codes=codes)
s = pd.Series(np.random.randn(len(index)), index=index)
self.s_func = getattr(s, op)
@@ -80,11 +85,11 @@ def time_op(self, level, op):
class Rank:
- params = [['DataFrame', 'Series'], [True, False]]
- param_names = ['constructor', 'pct']
+ params = [["DataFrame", "Series"], [True, False]]
+ param_names = ["constructor", "pct"]
def setup(self, constructor, pct):
- values = np.random.randn(10**5)
+ values = np.random.randn(10 ** 5)
self.data = getattr(pd, constructor)(values)
def time_rank(self, constructor, pct):
@@ -96,14 +101,15 @@ def time_average_old(self, constructor, pct):
class Correlation:
- params = [['spearman', 'kendall', 'pearson'], [True, False]]
- param_names = ['method', 'use_bottleneck']
+ params = [["spearman", "kendall", "pearson"], [True, False]]
+ param_names = ["method", "use_bottleneck"]
def setup(self, method, use_bottleneck):
try:
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.df = pd.DataFrame(np.random.randn(1000, 30))
self.df2 = pd.DataFrame(np.random.randn(1000, 30))
@@ -126,13 +132,14 @@ def time_corrwith_rows(self, method, use_bottleneck):
class Covariance:
params = [[True, False]]
- param_names = ['use_bottleneck']
+ param_names = ["use_bottleneck"]
def setup(self, use_bottleneck):
try:
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.s = pd.Series(np.random.randn(100000))
self.s2 = pd.Series(np.random.randn(100000))
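
The try/except around the use_bottleneck option recurs in several setup() methods in this file; pulled out on its own it looks like the sketch below. The TypeError fallback is presumably there for pandas builds where the option cannot be set directly — that reading is my assumption, not something stated in the patch.

```python
# Helper equivalent to the option-toggling pattern repeated in the benchmarks;
# the fallback writing nanops._USE_BOTTLENECK is taken when setting the option
# raises TypeError (the reason it does so is an assumption, see above).
import pandas as pd


def set_use_bottleneck(flag):
    try:
        pd.options.compute.use_bottleneck = flag
    except TypeError:
        from pandas.core import nanops

        nanops._USE_BOTTLENECK = flag


set_use_bottleneck(True)
```
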
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 5dbcc71b7455e..6be2fa92d9eac 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -6,31 +6,30 @@
class Methods:
-
def setup(self):
- self.s = Series(tm.makeStringIndex(10**5))
+ self.s = Series(tm.makeStringIndex(10 ** 5))
def time_center(self):
self.s.str.center(100)
def time_count(self):
- self.s.str.count('A')
+ self.s.str.count("A")
def time_endswith(self):
- self.s.str.endswith('A')
+ self.s.str.endswith("A")
def time_extract(self):
with warnings.catch_warnings(record=True):
- self.s.str.extract('(\\w*)A(\\w*)')
+ self.s.str.extract("(\\w*)A(\\w*)")
def time_findall(self):
- self.s.str.findall('[A-Z]+')
+ self.s.str.findall("[A-Z]+")
def time_find(self):
- self.s.str.find('[A-Z]+')
+ self.s.str.find("[A-Z]+")
def time_rfind(self):
- self.s.str.rfind('[A-Z]+')
+ self.s.str.rfind("[A-Z]+")
def time_get(self):
self.s.str.get(0)
@@ -39,43 +38,43 @@ def time_len(self):
self.s.str.len()
def time_join(self):
- self.s.str.join(' ')
+ self.s.str.join(" ")
def time_match(self):
- self.s.str.match('A')
+ self.s.str.match("A")
def time_normalize(self):
- self.s.str.normalize('NFC')
+ self.s.str.normalize("NFC")
def time_pad(self):
- self.s.str.pad(100, side='both')
+ self.s.str.pad(100, side="both")
def time_partition(self):
- self.s.str.partition('A')
+ self.s.str.partition("A")
def time_rpartition(self):
- self.s.str.rpartition('A')
+ self.s.str.rpartition("A")
def time_replace(self):
- self.s.str.replace('A', '\x01\x01')
+ self.s.str.replace("A", "\x01\x01")
def time_translate(self):
- self.s.str.translate({'A': '\x01\x01'})
+ self.s.str.translate({"A": "\x01\x01"})
def time_slice(self):
self.s.str.slice(5, 15, 2)
def time_startswith(self):
- self.s.str.startswith('A')
+ self.s.str.startswith("A")
def time_strip(self):
- self.s.str.strip('A')
+ self.s.str.strip("A")
def time_rstrip(self):
- self.s.str.rstrip('A')
+ self.s.str.rstrip("A")
def time_lstrip(self):
- self.s.str.lstrip('A')
+ self.s.str.lstrip("A")
def time_title(self):
self.s.str.title()
@@ -95,13 +94,13 @@ def time_zfill(self):
class Repeat:
- params = ['int', 'array']
- param_names = ['repeats']
+ params = ["int", "array"]
+ param_names = ["repeats"]
def setup(self, repeats):
- N = 10**5
+ N = 10 ** 5
self.s = Series(tm.makeStringIndex(N))
- repeat = {'int': 1, 'array': np.random.randint(1, 3, N)}
+ repeat = {"int": 1, "array": np.random.randint(1, 3, N)}
self.values = repeat[repeats]
def time_repeat(self, repeats):
@@ -110,20 +109,20 @@ def time_repeat(self, repeats):
class Cat:
- params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15])
- param_names = ['other_cols', 'sep', 'na_rep', 'na_frac']
+ params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15])
+ param_names = ["other_cols", "sep", "na_rep", "na_frac"]
def setup(self, other_cols, sep, na_rep, na_frac):
N = 10 ** 5
- mask_gen = lambda: np.random.choice([True, False], N,
- p=[1 - na_frac, na_frac])
+ mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac])
self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
if other_cols == 0:
# str.cat self-concatenates only for others=None
self.others = None
else:
- self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
- for i in range(other_cols)})
+ self.others = DataFrame(
+ {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)}
+ )
def time_cat(self, other_cols, sep, na_rep, na_frac):
# before the concatenation (one caller + other_cols columns), the total
@@ -136,52 +135,49 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
class Contains:
params = [True, False]
- param_names = ['regex']
+ param_names = ["regex"]
def setup(self, regex):
- self.s = Series(tm.makeStringIndex(10**5))
+ self.s = Series(tm.makeStringIndex(10 ** 5))
def time_contains(self, regex):
- self.s.str.contains('A', regex=regex)
+ self.s.str.contains("A", regex=regex)
class Split:
params = [True, False]
- param_names = ['expand']
+ param_names = ["expand"]
def setup(self, expand):
- self.s = Series(tm.makeStringIndex(10**5)).str.join('--')
+ self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
def time_split(self, expand):
- self.s.str.split('--', expand=expand)
+ self.s.str.split("--", expand=expand)
def time_rsplit(self, expand):
- self.s.str.rsplit('--', expand=expand)
+ self.s.str.rsplit("--", expand=expand)
class Dummies:
-
def setup(self):
- self.s = Series(tm.makeStringIndex(10**5)).str.join('|')
+ self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|")
def time_get_dummies(self):
- self.s.str.get_dummies('|')
+ self.s.str.get_dummies("|")
class Encode:
-
def setup(self):
self.ser = Series(tm.makeUnicodeIndex())
def time_encode_decode(self):
- self.ser.str.encode('utf-8').str.decode('utf-8')
+ self.ser.str.encode("utf-8").str.decode("utf-8")
class Slice:
-
def setup(self):
- self.s = Series(['abcdefg', np.nan] * 500000)
+ self.s = Series(["abcdefg", np.nan] * 500000)
def time_vector_slice(self):
# GH 2602
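
The Cat benchmark above times Series.str.cat against a DataFrame of other columns with configurable sep and na_rep; a tiny worked example of that call, on data made up for illustration:

```python
# What str.cat(others, sep=..., na_rep=...) does on a small, made-up input.
import pandas as pd

s = pd.Series(["a", None, "c", "d"])
others = pd.DataFrame({0: ["w", "x", None, "z"], 1: ["1", "2", "3", "4"]})

# sep joins caller and other columns element-wise; na_rep substitutes missing
# values instead of turning the whole concatenated row into NaN.
result = s.str.cat(others, sep=",", na_rep="-")
print(result.tolist())  # ['a,w,1', '-,x,2', 'c,-,3', 'd,z,4']
```
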
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
index c4fe462944a2a..36a9db529f98f 100644
--- a/asv_bench/benchmarks/timedelta.py
+++ b/asv_bench/benchmarks/timedelta.py
@@ -3,49 +3,60 @@
import numpy as np
from pandas import (
- DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta)
+ DataFrame,
+ Series,
+ Timedelta,
+ Timestamp,
+ timedelta_range,
+ to_timedelta,
+)
class TimedeltaConstructor:
-
def time_from_int(self):
Timedelta(123456789)
def time_from_unit(self):
- Timedelta(1, unit='d')
+ Timedelta(1, unit="d")
def time_from_components(self):
- Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5,
- microseconds=6, nanoseconds=7)
+ Timedelta(
+ days=1,
+ hours=2,
+ minutes=3,
+ seconds=4,
+ milliseconds=5,
+ microseconds=6,
+ nanoseconds=7,
+ )
def time_from_datetime_timedelta(self):
Timedelta(datetime.timedelta(days=1, seconds=1))
def time_from_np_timedelta(self):
- Timedelta(np.timedelta64(1, 'ms'))
+ Timedelta(np.timedelta64(1, "ms"))
def time_from_string(self):
- Timedelta('1 days')
+ Timedelta("1 days")
def time_from_iso_format(self):
- Timedelta('P4DT12H30M5S')
+ Timedelta("P4DT12H30M5S")
def time_from_missing(self):
- Timedelta('nat')
+ Timedelta("nat")
class ToTimedelta:
-
def setup(self):
self.ints = np.random.randint(0, 60, size=10000)
self.str_days = []
self.str_seconds = []
for i in self.ints:
- self.str_days.append('{0} days'.format(i))
- self.str_seconds.append('00:00:{0:02d}'.format(i))
+ self.str_days.append("{0} days".format(i))
+ self.str_seconds.append("00:00:{0:02d}".format(i))
def time_convert_int(self):
- to_timedelta(self.ints, unit='s')
+ to_timedelta(self.ints, unit="s")
def time_convert_string_days(self):
to_timedelta(self.str_days)
@@ -56,30 +67,28 @@ def time_convert_string_seconds(self):
class ToTimedeltaErrors:
- params = ['coerce', 'ignore']
- param_names = ['errors']
+ params = ["coerce", "ignore"]
+ param_names = ["errors"]
def setup(self, errors):
ints = np.random.randint(0, 60, size=10000)
- self.arr = ['{0} days'.format(i) for i in ints]
- self.arr[-1] = 'apple'
+ self.arr = ["{0} days".format(i) for i in ints]
+ self.arr[-1] = "apple"
def time_convert(self, errors):
to_timedelta(self.arr, errors=errors)
class TimedeltaOps:
-
def setup(self):
self.td = to_timedelta(np.arange(1000000))
- self.ts = Timestamp('2000')
+ self.ts = Timestamp("2000")
def time_add_td_ts(self):
self.td + self.ts
class TimedeltaProperties:
-
def setup_cache(self):
td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35)
return td
@@ -98,10 +107,9 @@ def time_timedelta_nanoseconds(self, td):
class DatetimeAccessor:
-
def setup_cache(self):
N = 100000
- series = Series(timedelta_range('1 days', periods=N, freq='h'))
+ series = Series(timedelta_range("1 days", periods=N, freq="h"))
return series
def time_dt_accessor(self, series):
@@ -121,10 +129,9 @@ def time_timedelta_nanoseconds(self, series):
class TimedeltaIndexing:
-
def setup(self):
- self.index = timedelta_range(start='1985', periods=1000, freq='D')
- self.index2 = timedelta_range(start='1986', periods=1000, freq='D')
+ self.index = timedelta_range(start="1985", periods=1000, freq="D")
+ self.index2 = timedelta_range(start="1986", periods=1000, freq="D")
self.series = Series(range(1000), index=self.index)
self.timedelta = self.index[500]
@@ -141,7 +148,7 @@ def time_series_loc(self):
self.series.loc[self.timedelta]
def time_align(self):
- DataFrame({'a': self.series, 'b': self.series[:500]})
+ DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index.intersection(self.index2)
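
For reference, the constructors these timedelta benchmarks hit can be exercised directly; a few quick, self-contained examples with arbitrary values:

```python
# Standalone examples of the Timedelta/to_timedelta paths timed above.
import numpy as np
from pandas import Timedelta, to_timedelta

Timedelta(1, unit="d")                  # count plus unit
Timedelta("P4DT12H30M5S")               # ISO 8601 duration string
Timedelta(np.timedelta64(1, "ms"))      # numpy timedelta64
to_timedelta(["5 days", "00:00:30"])    # vectorized string parsing
to_timedelta(np.arange(10), unit="s")   # vectorized numeric input with a unit
```
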
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 7de1c42246ad5..1020b773f8acb 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -4,6 +4,7 @@
import numpy as np
from pandas import to_datetime, date_range, Series, DataFrame, period_range
from pandas.tseries.frequencies import infer_freq
+
try:
from pandas.plotting._matplotlib.converter import DatetimeConverter
except ImportError:
@@ -12,27 +13,22 @@
class DatetimeIndex:
- params = ['dst', 'repeated', 'tz_aware', 'tz_local', 'tz_naive']
- param_names = ['index_type']
+ params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"]
+ param_names = ["index_type"]
def setup(self, index_type):
N = 100000
- dtidxes = {'dst': date_range(start='10/29/2000 1:00:00',
- end='10/29/2000 1:59:59', freq='S'),
- 'repeated': date_range(start='2000',
- periods=N / 10,
- freq='s').repeat(10),
- 'tz_aware': date_range(start='2000',
- periods=N,
- freq='s',
- tz='US/Eastern'),
- 'tz_local': date_range(start='2000',
- periods=N,
- freq='s',
- tz=dateutil.tz.tzlocal()),
- 'tz_naive': date_range(start='2000',
- periods=N,
- freq='s')}
+ dtidxes = {
+ "dst": date_range(
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ ),
+ "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10),
+ "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"),
+ "tz_local": date_range(
+ start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal()
+ ),
+ "tz_naive": date_range(start="2000", periods=N, freq="s"),
+ }
self.index = dtidxes[index_type]
def time_add_timedelta(self, index_type):
@@ -62,31 +58,31 @@ def time_to_pydatetime(self, index_type):
class TzLocalize:
- params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
- param_names = 'tz'
+ params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]
+ param_names = "tz"
def setup(self, tz):
- dst_rng = date_range(start='10/29/2000 1:00:00',
- end='10/29/2000 1:59:59', freq='S')
- self.index = date_range(start='10/29/2000',
- end='10/29/2000 00:59:59', freq='S')
+ dst_rng = date_range(
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ )
+ self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S")
self.index = self.index.append(dst_rng)
self.index = self.index.append(dst_rng)
- self.index = self.index.append(date_range(start='10/29/2000 2:00:00',
- end='10/29/2000 3:00:00',
- freq='S'))
+ self.index = self.index.append(
+ date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S")
+ )
def time_infer_dst(self, tz):
- self.index.tz_localize(tz, ambiguous='infer')
+ self.index.tz_localize(tz, ambiguous="infer")
class ResetIndex:
- params = [None, 'US/Eastern']
- param_names = 'tz'
+ params = [None, "US/Eastern"]
+ param_names = "tz"
def setup(self, tz):
- idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz)
+ idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz)
self.df = DataFrame(np.random.randn(1000, 2), index=idx)
def time_reest_datetimeindex(self, tz):
@@ -95,12 +91,12 @@ def time_reest_datetimeindex(self, tz):
class Factorize:
- params = [None, 'Asia/Tokyo']
- param_names = 'tz'
+ params = [None, "Asia/Tokyo"]
+ param_names = "tz"
def setup(self, tz):
N = 100000
- self.dti = date_range('2011-01-01', freq='H', periods=N, tz=tz)
+ self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz)
self.dti = self.dti.repeat(5)
def time_factorize(self, tz):
@@ -109,25 +105,24 @@ def time_factorize(self, tz):
class InferFreq:
- params = [None, 'D', 'B']
- param_names = ['freq']
+ params = [None, "D", "B"]
+ param_names = ["freq"]
def setup(self, freq):
if freq is None:
- self.idx = date_range(start='1/1/1700', freq='D', periods=10000)
+ self.idx = date_range(start="1/1/1700", freq="D", periods=10000)
self.idx.freq = None
else:
- self.idx = date_range(start='1/1/1700', freq=freq, periods=10000)
+ self.idx = date_range(start="1/1/1700", freq=freq, periods=10000)
def time_infer_freq(self, freq):
infer_freq(self.idx)
class TimeDatetimeConverter:
-
def setup(self):
N = 100000
- self.rng = date_range(start='1/1/2000', periods=N, freq='T')
+ self.rng = date_range(start="1/1/2000", periods=N, freq="T")
def time_convert(self):
DatetimeConverter.convert(self.rng, None, None)
@@ -136,11 +131,11 @@ def time_convert(self):
class Iteration:
params = [date_range, period_range]
- param_names = ['time_index']
+ param_names = ["time_index"]
def setup(self, time_index):
- N = 10**6
- self.idx = time_index(start='20140101', freq='T', periods=N)
+ N = 10 ** 6
+ self.idx = time_index(start="20140101", freq="T", periods=N)
self.exit = 10000
def time_iter(self, time_index):
@@ -155,13 +150,13 @@ def time_iter_preexit(self, time_index):
class ResampleDataFrame:
- params = ['max', 'mean', 'min']
- param_names = ['method']
+ params = ["max", "mean", "min"]
+ param_names = ["method"]
def setup(self, method):
- rng = date_range(start='20130101', periods=100000, freq='50L')
+ rng = date_range(start="20130101", periods=100000, freq="50L")
df = DataFrame(np.random.randn(100000, 2), index=rng)
- self.resample = getattr(df.resample('1s'), method)
+ self.resample = getattr(df.resample("1s"), method)
def time_method(self, method):
self.resample()
@@ -169,16 +164,14 @@ def time_method(self, method):
class ResampleSeries:
- params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc'])
- param_names = ['index', 'freq', 'method']
+ params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"])
+ param_names = ["index", "freq", "method"]
def setup(self, index, freq, method):
- indexes = {'period': period_range(start='1/1/2000',
- end='1/1/2001',
- freq='T'),
- 'datetime': date_range(start='1/1/2000',
- end='1/1/2001',
- freq='T')}
+ indexes = {
+ "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ }
idx = indexes[index]
ts = Series(np.random.randn(len(idx)), index=idx)
self.resample = getattr(ts.resample(freq), method)
@@ -190,32 +183,35 @@ def time_resample(self, index, freq, method):
class ResampleDatetetime64:
# GH 7754
def setup(self):
- rng3 = date_range(start='2000-01-01 00:00:00',
- end='2000-01-01 10:00:00', freq='555000U')
- self.dt_ts = Series(5, rng3, dtype='datetime64[ns]')
+ rng3 = date_range(
+ start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U"
+ )
+ self.dt_ts = Series(5, rng3, dtype="datetime64[ns]")
def time_resample(self):
- self.dt_ts.resample('1S').last()
+ self.dt_ts.resample("1S").last()
class AsOf:
- params = ['DataFrame', 'Series']
- param_names = ['constructor']
+ params = ["DataFrame", "Series"]
+ param_names = ["constructor"]
def setup(self, constructor):
N = 10000
M = 10
- rng = date_range(start='1/1/1990', periods=N, freq='53s')
- data = {'DataFrame': DataFrame(np.random.randn(N, M)),
- 'Series': Series(np.random.randn(N))}
+ rng = date_range(start="1/1/1990", periods=N, freq="53s")
+ data = {
+ "DataFrame": DataFrame(np.random.randn(N, M)),
+ "Series": Series(np.random.randn(N)),
+ }
self.ts = data[constructor]
self.ts.index = rng
self.ts2 = self.ts.copy()
self.ts2.iloc[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3.iloc[-5000:] = np.nan
- self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s')
+ self.dates = date_range(start="1/1/1990", periods=N * 10, freq="5s")
self.date = self.dates[0]
self.date_last = self.dates[-1]
self.date_early = self.date - timedelta(10)
@@ -248,11 +244,11 @@ def time_asof_nan_single(self, constructor):
class SortIndex:
params = [True, False]
- param_names = ['monotonic']
+ param_names = ["monotonic"]
def setup(self, monotonic):
- N = 10**5
- idx = date_range(start='1/1/2000', periods=N, freq='s')
+ N = 10 ** 5
+ idx = date_range(start="1/1/2000", periods=N, freq="s")
self.s = Series(np.random.randn(N), index=idx)
if not monotonic:
self.s = self.s.sample(frac=1)
@@ -265,10 +261,9 @@ def time_get_slice(self, monotonic):
class IrregularOps:
-
def setup(self):
- N = 10**5
- idx = date_range(start='1/1/2000', periods=N, freq='s')
+ N = 10 ** 5
+ idx = date_range(start="1/1/2000", periods=N, freq="s")
s = Series(np.random.randn(N), index=idx)
self.left = s.sample(frac=1)
self.right = s.sample(frac=1)
@@ -278,10 +273,9 @@ def time_add(self):
class Lookup:
-
def setup(self):
N = 1500000
- rng = date_range(start='1/1/2000', periods=N, freq='S')
+ rng = date_range(start="1/1/2000", periods=N, freq="S")
self.ts = Series(1, index=rng)
self.lookup_val = rng[N // 2]
@@ -291,23 +285,35 @@ def time_lookup_and_cleanup(self):
class ToDatetimeYYYYMMDD:
-
def setup(self):
- rng = date_range(start='1/1/2000', periods=10000, freq='D')
- self.stringsD = Series(rng.strftime('%Y%m%d'))
+ rng = date_range(start="1/1/2000", periods=10000, freq="D")
+ self.stringsD = Series(rng.strftime("%Y%m%d"))
def time_format_YYYYMMDD(self):
- to_datetime(self.stringsD, format='%Y%m%d')
+ to_datetime(self.stringsD, format="%Y%m%d")
-class ToDatetimeISO8601:
+class ToDatetimeCacheSmallCount:
+ params = ([True, False], [50, 500, 5000, 100000])
+ param_names = ["cache", "count"]
+
+ def setup(self, cache, count):
+ rng = date_range(start="1/1/1971", periods=count)
+ self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()
+
+ def time_unique_date_strings(self, cache, count):
+ to_datetime(self.unique_date_strings, cache=cache)
+
+
+class ToDatetimeISO8601:
def setup(self):
- rng = date_range(start='1/1/2000', periods=20000, freq='H')
- self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
- self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist()
- self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
- for x in rng]
+ rng = date_range(start="1/1/2000", periods=20000, freq="H")
+ self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
+ self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
+ self.strings_tz_space = [
+ x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
+ ]
def time_iso8601(self):
to_datetime(self.strings)
@@ -316,22 +322,21 @@ def time_iso8601_nosep(self):
to_datetime(self.strings_nosep)
def time_iso8601_format(self):
- to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')
+ to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")
def time_iso8601_format_no_sep(self):
- to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S')
+ to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")
def time_iso8601_tz_spaceformat(self):
to_datetime(self.strings_tz_space)
class ToDatetimeNONISO8601:
-
def setup(self):
N = 10000
half = int(N / 2)
- ts_string_1 = 'March 1, 2018 12:00:00+0400'
- ts_string_2 = 'March 1, 2018 12:00:00+0500'
+ ts_string_1 = "March 1, 2018 12:00:00+0400"
+ ts_string_2 = "March 1, 2018 12:00:00+0500"
self.same_offset = [ts_string_1] * N
self.diff_offset = [ts_string_1] * half + [ts_string_2] * half
@@ -343,50 +348,48 @@ def time_different_offset(self):
class ToDatetimeFormatQuarters:
-
def setup(self):
- self.s = Series(['2Q2005', '2Q05', '2005Q1', '05Q1'] * 10000)
+ self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)
def time_infer_quarter(self):
to_datetime(self.s)
class ToDatetimeFormat:
-
def setup(self):
- self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000)
- self.s2 = self.s.str.replace(':\\S+$', '')
+ self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000)
+ self.s2 = self.s.str.replace(":\\S+$", "")
def time_exact(self):
- to_datetime(self.s2, format='%d%b%y')
+ to_datetime(self.s2, format="%d%b%y")
def time_no_exact(self):
- to_datetime(self.s, format='%d%b%y', exact=False)
+ to_datetime(self.s, format="%d%b%y", exact=False)
class ToDatetimeCache:
params = [True, False]
- param_names = ['cache']
+ param_names = ["cache"]
def setup(self, cache):
N = 10000
self.unique_numeric_seconds = list(range(N))
self.dup_numeric_seconds = [1000] * N
- self.dup_string_dates = ['2000-02-11'] * N
- self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N
+ self.dup_string_dates = ["2000-02-11"] * N
+ self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N
def time_unique_seconds_and_unit(self, cache):
- to_datetime(self.unique_numeric_seconds, unit='s', cache=cache)
+ to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)
def time_dup_seconds_and_unit(self, cache):
- to_datetime(self.dup_numeric_seconds, unit='s', cache=cache)
+ to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)
def time_dup_string_dates(self, cache):
to_datetime(self.dup_string_dates, cache=cache)
def time_dup_string_dates_and_format(self, cache):
- to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache)
+ to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)
def time_dup_string_tzoffset_dates(self, cache):
to_datetime(self.dup_string_with_tz, cache=cache)
@@ -394,14 +397,12 @@ def time_dup_string_tzoffset_dates(self, cache):
class DatetimeAccessor:
- params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
- param_names = 'tz'
+ params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]
+ param_names = "tz"
def setup(self, tz):
N = 100000
- self.series = Series(
- date_range(start='1/1/2000', periods=N, freq='T', tz=tz)
- )
+ self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz))
def time_dt_accessor(self, tz):
self.series.dt
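
The newly added ToDatetimeCacheSmallCount benchmark varies both the cache flag and the number of unique date strings; a minimal standalone version of what it measures (the period count here is illustrative):

```python
# Standalone sketch of the to_datetime cache comparison (sizes illustrative).
from pandas import date_range, to_datetime

strings = date_range(start="1/1/1971", periods=500).strftime("%Y-%m-%d").tolist()

# cache=True first builds a mapping of unique strings to timestamps, which
# only pays off when the input repeats values; with all-unique input it is
# pure overhead, which is exactly what the benchmark probes.
to_datetime(strings, cache=True)
to_datetime(strings, cache=False)
```
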
diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py
index c6e56804c7b21..8ebb2d8d2f35d 100644
--- a/asv_bench/benchmarks/timestamp.py
+++ b/asv_bench/benchmarks/timestamp.py
@@ -7,21 +7,20 @@
class TimestampConstruction:
-
def time_parse_iso8601_no_tz(self):
- Timestamp('2017-08-25 08:16:14')
+ Timestamp("2017-08-25 08:16:14")
def time_parse_iso8601_tz(self):
- Timestamp('2017-08-25 08:16:14-0500')
+ Timestamp("2017-08-25 08:16:14-0500")
def time_parse_dateutil(self):
- Timestamp('2017/08/25 08:16:14 AM')
+ Timestamp("2017/08/25 08:16:14 AM")
def time_parse_today(self):
- Timestamp('today')
+ Timestamp("today")
def time_parse_now(self):
- Timestamp('now')
+ Timestamp("now")
def time_fromordinal(self):
Timestamp.fromordinal(730120)
@@ -31,14 +30,13 @@ def time_fromtimestamp(self):
class TimestampProperties:
- _tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC,
- dateutil.tz.tzutc()]
- _freqs = [None, 'B']
+ _tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()]
+ _freqs = [None, "B"]
params = [_tzs, _freqs]
- param_names = ['tz', 'freq']
+ param_names = ["tz", "freq"]
def setup(self, tz, freq):
- self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq)
+ self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq)
def time_tz(self, tz, freq):
self.ts.tz
@@ -93,15 +91,14 @@ def time_month_name(self, tz, freq):
class TimestampOps:
- params = [None, 'US/Eastern', pytz.UTC,
- dateutil.tz.tzutc()]
- param_names = ['tz']
+ params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()]
+ param_names = ["tz"]
def setup(self, tz):
- self.ts = Timestamp('2017-08-25 08:16:14', tz=tz)
+ self.ts = Timestamp("2017-08-25 08:16:14", tz=tz)
def time_replace_tz(self, tz):
- self.ts.replace(tzinfo=pytz.timezone('US/Eastern'))
+ self.ts.replace(tzinfo=pytz.timezone("US/Eastern"))
def time_replace_None(self, tz):
self.ts.replace(tzinfo=None)
@@ -124,16 +121,16 @@ def time_to_julian_date(self, tz):
self.ts.to_julian_date()
def time_floor(self, tz):
- self.ts.floor('5T')
+ self.ts.floor("5T")
def time_ceil(self, tz):
- self.ts.ceil('5T')
+ self.ts.ceil("5T")
class TimestampAcrossDst:
def setup(self):
dt = datetime.datetime(2016, 3, 27, 1)
- self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo
+ self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo
self.ts2 = Timestamp(dt)
def time_replace_across_dst(self):
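
A few of the Timestamp operations covered by TimestampOps and TimestampAcrossDst, shown outside the benchmark harness (timezone choices arbitrary):

```python
# Standalone examples of the Timestamp operations timed above.
import pytz
from pandas import Timestamp

ts = Timestamp("2017-08-25 08:16:14", tz="US/Eastern")
ts.floor("5T")                                  # round down to a 5-minute boundary
ts.ceil("5T")                                   # round up
ts.replace(tzinfo=pytz.timezone("US/Eastern"))  # swap tzinfo without converting
ts.replace(tzinfo=None)                         # drop timezone information
```
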
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index b40d46bdebe02..263a87176a9c9 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -5,6 +5,7 @@ jobs:
parameters:
name: macOS
vmImage: xcode9-macos10.13
+
- template: ci/azure/posix.yml
parameters:
name: Linux
@@ -21,22 +22,17 @@ jobs:
timeoutInMinutes: 90
steps:
- script: |
- # XXX next command should avoid redefining the path in every step, but
- # made the process crash as it couldn't find deactivate
- #echo '##vso[task.prependpath]$HOME/miniconda3/bin'
+ echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
echo '##vso[task.setvariable variable=AZURE]true'
displayName: 'Setting environment variables'
# Do not require a conda environment
- - script: |
- export PATH=$HOME/miniconda3/bin:$PATH
- ci/code_checks.sh patterns
+ - script: ci/code_checks.sh patterns
displayName: 'Looking for unwanted patterns'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
sudo apt-get install -y libc6-dev-i386
ci/setup_env.sh
displayName: 'Setup environment and build pandas'
@@ -44,14 +40,12 @@ jobs:
# Do not require pandas
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh lint
displayName: 'Linting'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh dependencies
displayName: 'Dependencies consistency'
@@ -59,42 +53,36 @@ jobs:
# Require pandas
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh code
displayName: 'Checks on imported code'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh doctests
displayName: 'Running doctests'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh docstrings
displayName: 'Docstring validation'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/code_checks.sh typing
displayName: 'Typing validation'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
pytest --capture=no --strict scripts
- displayName: 'Testing docstring validaton script'
+ displayName: 'Testing docstring validation script'
condition: true
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
cd asv_bench
asv check -E existing
@@ -122,19 +110,21 @@ jobs:
timeoutInMinutes: 90
steps:
- script: |
- echo '##vso[task.setvariable variable=ENV_FILE]ci/deps/travis-36-doc.yaml'
+ echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
+ echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
displayName: 'Setting environment variables'
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
sudo apt-get install -y libc6-dev-i386
ci/setup_env.sh
displayName: 'Setup environment and build pandas'
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
- doc/make.py
+ # Next we should simply have `doc/make.py --warnings-are-errors`, everything else is required because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547)
+ doc/make.py --warnings-are-errors | tee sphinx.log ; SPHINX_RET=${PIPESTATUS[0]}
+ grep -B1 "^<<<-------------------------------------------------------------------------$" sphinx.log ; IPY_RET=$(( $? != 1 ))
+ exit $(( $SPHINX_RET + $IPY_RET ))
displayName: 'Build documentation'
- script: |
@@ -142,6 +132,7 @@ jobs:
git init
touch .nojekyll
echo "dev.pandas.io" > CNAME
+ printf "User-agent: *\nDisallow: /" > robots.txt
git add --all .
git config user.email "pandas-dev@python.org"
git config user.name "pandas-docs-bot"
@@ -172,7 +163,6 @@ jobs:
cd doc/build/html
git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git
git push -f origin master
- exit 0 # FIXME this will leave the build green even if the step fails. To be removed when we are confident with this.
displayName: 'Publish docs to GitHub pages'
condition : |
and(not(eq(variables['Build.Reason'], 'PullRequest')),
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
index c5676e0a2a6a0..6093df46ffb60 100644
--- a/ci/azure/posix.yml
+++ b/ci/azure/posix.yml
@@ -33,6 +33,12 @@ jobs:
PATTERN: "not slow and not network"
LOCALE_OVERRIDE: "it_IT.UTF-8"
+ py36_32bit:
+ ENV_FILE: ci/deps/azure-36-32bit.yaml
+ CONDA_PY: "36"
+ PATTERN: "not slow and not network"
+ BITS32: "yes"
+
py37_locale:
ENV_FILE: ci/deps/azure-37-locale.yaml
CONDA_PY: "37"
@@ -50,17 +56,15 @@ jobs:
steps:
- script: |
if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi
+ echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
echo "Creating Environment"
ci/setup_env.sh
displayName: 'Setup environment and build pandas'
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
ci/run_tests.sh
displayName: 'Test'
- - script: |
- export PATH=$HOME/miniconda3/bin:$PATH
- source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
+ - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
- task: PublishTestResults@2
inputs:
testResultsFiles: 'test-data-*.xml'
@@ -91,7 +95,6 @@ jobs:
}
displayName: 'Check for test failures'
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
python ci/print_skipped.py
displayName: 'Print skipped tests'
diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml
index 6d4afccb57865..dfa82819b9826 100644
--- a/ci/azure/windows.yml
+++ b/ci/azure/windows.yml
@@ -17,11 +17,15 @@ jobs:
CONDA_PY: "37"
steps:
- - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
+ - powershell: |
+ Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
+ Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin"
displayName: 'Add conda to PATH'
- script: conda update -q -n base conda
displayName: Update conda
- - script: conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml
+ - script: |
+ call activate
+ conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml
displayName: 'Create anaconda environment'
- script: |
call activate pandas-dev
@@ -50,7 +54,6 @@ jobs:
}
displayName: 'Check for test failures'
- script: |
- export PATH=$HOME/miniconda3/bin:$PATH
source activate pandas-dev
python ci/print_skipped.py
displayName: 'Print skipped tests'
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
deleted file mode 100755
index bf22f0764144c..0000000000000
--- a/ci/build_docs.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-
-set -e
-
-if [ "${TRAVIS_OS_NAME}" != "linux" ]; then
- echo "not doing build_docs on non-linux"
- exit 0
-fi
-
-cd "$TRAVIS_BUILD_DIR"/doc
-echo "inside $0"
-
-if [ "$DOC" ]; then
-
- echo "Will build docs"
-
- echo ###############################
- echo # Log file for the doc build #
- echo ###############################
-
- echo ./make.py
- ./make.py
-
- echo ########################
- echo # Create and send docs #
- echo ########################
-
- echo "Only uploading docs when TRAVIS_PULL_REQUEST is 'false'"
- echo "TRAVIS_PULL_REQUEST: ${TRAVIS_PULL_REQUEST}"
-
- if [ "${TRAVIS_PULL_REQUEST}" == "false" ]; then
- cd build/html
- git config --global user.email "pandas-docs-bot@localhost.foo"
- git config --global user.name "pandas-docs-bot"
-
- # create the repo
- git init
-
- touch README
- git add README
- git commit -m "Initial commit" --allow-empty
- git branch gh-pages
- git checkout gh-pages
- touch .nojekyll
- git add --all .
- git commit -m "Version" --allow-empty
-
- git remote add origin "https://${PANDAS_GH_TOKEN}@github.com/pandas-dev/pandas-docs-travis.git"
- git fetch origin
- git remote -v
-
- git push origin gh-pages -f
- fi
-fi
-
-exit 0
diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh
new file mode 100644
index 0000000000000..9dbcd4f98683e
--- /dev/null
+++ b/ci/check_git_tags.sh
@@ -0,0 +1,28 @@
+set -e
+
+if [[ ! $(git tag) ]]; then
+ echo "No git tags in clone, please sync your git tags with upstream using:"
+ echo " git fetch --tags upstream"
+ echo " git push --tags origin"
+ echo ""
+ echo "If the issue persists, the clone depth needs to be increased in .travis.yml"
+ exit 1
+fi
+
+# This will error if there are no tags and we omit --always
+DESCRIPTION=$(git describe --long --tags)
+echo "$DESCRIPTION"
+
+if [[ "$DESCRIPTION" == *"untagged"* ]]; then
+ echo "Unable to determine most recent tag, aborting build"
+ exit 1
+else
+ if [[ "$DESCRIPTION" != *"g"* ]]; then
+ # A good description will have the hash prefixed by g, a bad one will be
+ # just the hash
+ echo "Unable to determine most recent tag, aborting build"
+ exit 1
+ else
+ echo "$(git tag)"
+ fi
+fi
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index a16580679ff54..333136ddfddd9 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -52,6 +52,13 @@ fi
### LINTING ###
if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
+ echo "black --version"
+ black --version
+
+ MSG='Checking black formatting' ; echo $MSG
+ black . --check --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)'
+ RET=$(($RET + $?)) ; echo $MSG "DONE"
+
# `setup.cfg` contains the list of error codes that are being ignored in flake8
echo "flake8 --version"
@@ -149,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"
MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG
- invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts
+ invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts
RET=$(($RET + $?)) ; echo $MSG "DONE"
MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG
@@ -245,10 +252,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"
MSG='Doctests interval classes' ; echo $MSG
- pytest --doctest-modules -v \
+ pytest -q --doctest-modules \
pandas/core/indexes/interval.py \
pandas/core/arrays/interval.py \
- -k"-from_arrays -from_breaks -from_intervals -from_tuples -get_loc -set_closed -to_tuples -interval_range"
+ -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range"
RET=$(($RET + $?)) ; echo $MSG "DONE"
fi
@@ -256,8 +263,8 @@ fi
### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
- MSG='Validate docstrings (GL03, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG
- $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05
+ MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG
+ $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05
RET=$(($RET + $?)) ; echo $MSG "DONE"
fi
diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-35-compat.yaml
index c783670e78d52..97c45b2be27d7 100644
--- a/ci/deps/azure-35-compat.yaml
+++ b/ci/deps/azure-35-compat.yaml
@@ -11,7 +11,7 @@ dependencies:
- openpyxl=2.4.8
- pytables=3.4.2
- python-dateutil=2.6.1
- - python=3.5.*
+ - python=3.5.3
- pytz=2017.2
- scipy=0.19.0
- xlrd=1.1.0
@@ -22,6 +22,7 @@ dependencies:
- hypothesis>=3.58.0
- pytest-xdist
- pytest-mock
+ - pytest-azurepipelines
- pip
- pip:
# for python 3.5, pytest>=4.0.2 is not available in conda
diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml
new file mode 100644
index 0000000000000..43bf0ecdd6c3e
--- /dev/null
+++ b/ci/deps/azure-36-32bit.yaml
@@ -0,0 +1,20 @@
+name: pandas-dev
+channels:
+ - defaults
+ - conda-forge
+dependencies:
+ - gcc_linux-32
+ - gcc_linux-32
+ - gxx_linux-32
+ - cython=0.28.2
+ - numpy=1.14.*
+ - python-dateutil
+ - python=3.6.*
+ - pytz=2017.2
+ # universal
+ - pytest>=4.0.2,<5.0.0
+ - pytest-xdist
+ - pytest-mock
+ - pytest-azurepipelines
+ - hypothesis>=3.58.0
+ - pip
diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml
index fbb240734d45d..6a77b5dbedc61 100644
--- a/ci/deps/azure-36-locale.yaml
+++ b/ci/deps/azure-36-locale.yaml
@@ -20,9 +20,10 @@ dependencies:
- xlsxwriter=0.9.8
- xlwt=1.2.0
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
+ - pytest-azurepipelines
- hypothesis>=3.58.0
- pip
- pip:
diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml
index 9ddc782da930e..2bf2bd74795d2 100644
--- a/ci/deps/azure-36-locale_slow.yaml
+++ b/ci/deps/azure-36-locale_slow.yaml
@@ -29,6 +29,7 @@ dependencies:
- pytest>=4.0.2
- pytest-xdist
- pytest-mock
+ - pytest-azurepipelines
- moto
- pip
- pip:
diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
index 2ebb7dda86e36..26dcd213bbfa0 100644
--- a/ci/deps/azure-37-locale.yaml
+++ b/ci/deps/azure-37-locale.yaml
@@ -10,6 +10,7 @@ dependencies:
- jinja2
- lxml
- matplotlib
+ - moto
- nomkl
- numexpr
- numpy
@@ -25,10 +26,10 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
+ - pytest-azurepipelines
- pip
- pip:
- hypothesis>=3.58.0
- - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed
diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml
index 831f13fb421f0..65c92ec1dcf0d 100644
--- a/ci/deps/azure-37-numpydev.yaml
+++ b/ci/deps/azure-37-numpydev.yaml
@@ -6,7 +6,8 @@ dependencies:
- pytz
- Cython>=0.28.2
# universal
- - pytest>=4.0.2
+ # pytest < 5 until defaults has pytest-xdist>=1.29.0
+ - pytest>=4.0.2,<5.0
- pytest-xdist
- pytest-mock
- hypothesis>=3.58.0
@@ -17,3 +18,5 @@ dependencies:
- "--pre"
- "numpy"
- "scipy"
+ # https://github.com/pandas-dev/pandas/issues/27421
+ - pytest-azurepipelines<1.0.0
diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml
index 24c753e16d98d..39315b15a018b 100644
--- a/ci/deps/azure-macos-35.yaml
+++ b/ci/deps/azure-macos-35.yaml
@@ -22,10 +22,14 @@ dependencies:
- xlrd
- xlsxwriter
- xlwt
+ - pip
- pip:
- pyreadstat
# universal
- - pytest==4.5.0
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
- hypothesis>=3.58.0
+ # https://github.com/pandas-dev/pandas/issues/27421
+ - pytest-azurepipelines<1.0.0
+
diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml
index b1795059091b9..ff9264a36cb12 100644
--- a/ci/deps/azure-windows-36.yaml
+++ b/ci/deps/azure-windows-36.yaml
@@ -23,7 +23,8 @@ dependencies:
- xlwt
# universal
- cython>=0.28.2
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
+ - pytest-azurepipelines
- hypothesis>=3.58.0
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index 5bdc29e0eec80..075234a937035 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -10,6 +10,7 @@ dependencies:
- jinja2
- lxml
- matplotlib=2.2.*
+ - moto
- numexpr
- numpy=1.14.*
- openpyxl
@@ -25,9 +26,9 @@ dependencies:
- xlwt
# universal
- cython>=0.28.2
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
- - moto
+ - pytest-azurepipelines
- hypothesis>=3.58.0
- pyreadstat
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
index c497495553e8b..19002cbb8575e 100644
--- a/ci/deps/travis-36-cov.yaml
+++ b/ci/deps/travis-36-cov.yaml
@@ -12,9 +12,11 @@ dependencies:
- geopandas
- html5lib
- matplotlib
+ - moto
- nomkl
- numexpr
- numpy=1.15.*
+ - odfpy
- openpyxl
- pandas-gbq
# https://github.com/pydata/pandas-gbq/issues/271
@@ -37,8 +39,8 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-cov
- pytest-mock
- hypothesis>=3.58.0
@@ -46,6 +48,5 @@ dependencies:
- pip:
- brotlipy
- coverage
- - moto
- pandas-datareader
- python-dateutil
diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml
deleted file mode 100644
index 9419543e601e2..0000000000000
--- a/ci/deps/travis-36-doc.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-name: pandas-dev
-channels:
- - defaults
- - conda-forge
-dependencies:
- - beautifulsoup4
- - bottleneck
- - cython>=0.28.2
- - fastparquet>=0.2.1
- - gitpython
- - html5lib
- - hypothesis>=3.58.0
- - ipykernel
- - ipython
- - ipywidgets
- - lxml
- - matplotlib
- - nbconvert>=5.4.1
- - nbformat
- - nbsphinx
- - notebook>=5.7.5
- - numexpr
- - numpy
- - numpydoc
- - openpyxl
- - pandoc
- - pyarrow
- - pyqt
- - pytables
- - python-dateutil
- - python-snappy
- - python=3.6.*
- - pytz
- - scipy
- - seaborn
- # some styling is broken with sphinx >= 2 (https://github.com/pandas-dev/pandas/issues/26058)
- - sphinx=1.8.5
- - sqlalchemy
- - statsmodels
- - xarray
- - xlrd
- - xlsxwriter
- - xlwt
- # universal
- - pytest>=4.0.2
- - pytest-xdist
diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml
index badf4e6932da8..7da4abb9283df 100644
--- a/ci/deps/travis-36-locale.yaml
+++ b/ci/deps/travis-36-locale.yaml
@@ -8,37 +8,35 @@ dependencies:
- python-blosc
- cython>=0.28.2
- fastparquet=0.2.1
- - gcsfs=0.1.0
+ - gcsfs=0.2.2
- html5lib
- ipython
- jinja2
- - lxml=3.7.0
- - matplotlib=3.0.0
+ - lxml=3.8.0
+ - matplotlib=3.0.*
+ - moto
- nomkl
- numexpr
- numpy
- openpyxl
- pandas-gbq=0.8.0
- psycopg2=2.6.2
- - pymysql=0.7.9
+ - pymysql=0.7.11
- pytables
- python-dateutil
- # cannot go past python=3.6.6 for matplotlib=3.0.0 due to
- # https://github.com/matplotlib/matplotlib/issues/12626
- - python=3.6.6
+ - python=3.6.*
- pytz
- s3fs=0.0.8
- scipy
- sqlalchemy=1.1.4
- - xarray=0.8.2
+ - xarray=0.10
- xlrd
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.1
+ - pytest-xdist>=1.29.0
- pytest-mock
- - moto
- pip
- pip:
- hypothesis>=3.58.0
diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml
index 87021d5dae04e..9564bf5bb3a9f 100644
--- a/ci/deps/travis-36-slow.yaml
+++ b/ci/deps/travis-36-slow.yaml
@@ -25,8 +25,8 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
- moto
- hypothesis>=3.58.0
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
index c9a8c274fb144..9e08c41a3d9c0 100644
--- a/ci/deps/travis-37.yaml
+++ b/ci/deps/travis-37.yaml
@@ -13,8 +13,8 @@ dependencies:
- pyarrow
- pytz
# universal
- - pytest>=4.0.2
- - pytest-xdist
+ - pytest>=5.0.0
+ - pytest-xdist>=1.29.0
- pytest-mock
- hypothesis>=3.58.0
- s3fs
diff --git a/ci/print_skipped.py b/ci/print_skipped.py
index 859481c5d188d..a44281044e11d 100755
--- a/ci/print_skipped.py
+++ b/ci/print_skipped.py
@@ -11,45 +11,42 @@ def parse_results(filename):
root = tree.getroot()
skipped = []
- current_class = ''
+ current_class = ""
i = 1
assert i - 1 == len(skipped)
- for el in root.findall('testcase'):
- cn = el.attrib['classname']
- for sk in el.findall('skipped'):
+ for el in root.findall("testcase"):
+ cn = el.attrib["classname"]
+ for sk in el.findall("skipped"):
old_class = current_class
current_class = cn
- name = '{classname}.{name}'.format(classname=current_class,
- name=el.attrib['name'])
- msg = sk.attrib['message']
- out = ''
+ name = "{classname}.{name}".format(
+ classname=current_class, name=el.attrib["name"]
+ )
+ msg = sk.attrib["message"]
+ out = ""
if old_class != current_class:
ndigits = int(math.log(i, 10) + 1)
# 4 for : + space + # + space
- out += ('-' * (len(name + msg) + 4 + ndigits) + '\n')
- out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg)
+ out += "-" * (len(name + msg) + 4 + ndigits) + "\n"
+ out += "#{i} {name}: {msg}".format(i=i, name=name, msg=msg)
skipped.append(out)
i += 1
assert i - 1 == len(skipped)
assert i - 1 == len(skipped)
# assert len(skipped) == int(root.attrib['skip'])
- return '\n'.join(skipped)
+ return "\n".join(skipped)
def main():
- test_files = [
- 'test-data-single.xml',
- 'test-data-multiple.xml',
- 'test-data.xml',
- ]
+ test_files = ["test-data-single.xml", "test-data-multiple.xml", "test-data.xml"]
- print('SKIPPED TESTS:')
+ print("SKIPPED TESTS:")
for fn in test_files:
if os.path.isfile(fn):
print(parse_results(fn))
return 0
-if __name__ == '__main__':
+if __name__ == "__main__":
sys.exit(main())
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index ee46da9f52eab..27d3fcb4cf563 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -50,9 +50,10 @@ do
# if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code
sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret"
- if [[ "$COVERAGE" && $? == 0 ]]; then
- echo "uploading coverage for $TYPE tests"
- echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
- bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
- fi
+ # 2019-08-21 disabling because this is hitting HTTP 400 errors GH#27602
+ # if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
+ # echo "uploading coverage for $TYPE tests"
+ # echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
+ # bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
+ # fi
done
diff --git a/ci/setup_env.sh b/ci/setup_env.sh
index 8f73bb228e2bd..88742e0483c7e 100755
--- a/ci/setup_env.sh
+++ b/ci/setup_env.sh
@@ -94,6 +94,12 @@ echo
echo "conda env create -q --file=${ENV_FILE}"
time conda env create -q --file="${ENV_FILE}"
+
+if [[ "$BITS32" == "yes" ]]; then
+ # activate 32-bit compiler
+ export CONDA_BUILD=1
+fi
+
echo "activate pandas-dev"
source activate pandas-dev
diff --git a/codecov.yml b/codecov.yml
index 512bc2e82a736..1644bf315e0ac 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,13 +1,13 @@
codecov:
branch: master
+comment: off
+
coverage:
status:
project:
default:
- enabled: no
target: '82'
patch:
default:
- enabled: no
target: '50'
diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py
index c3647f0c7d2a8..5a07b094e6ad3 100644
--- a/doc/logo/pandas_logo.py
+++ b/doc/logo/pandas_logo.py
@@ -4,7 +4,7 @@
from matplotlib import rcParams
import numpy as np
-rcParams['mathtext.fontset'] = 'cm'
+rcParams["mathtext.fontset"] = "cm"
def fnx():
@@ -37,8 +37,12 @@ def fnx():
plt.figtext(0.05, 0.5, "pandas", size=40)
plt.figtext(
- 0.05, 0.2, r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
- size=16, color="#5a89a4")
-
-fig.savefig('pandas_logo.svg')
-fig.savefig('pandas_logo.png')
+ 0.05,
+ 0.2,
+ r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
+ size=16,
+ color="#5a89a4",
+)
+
+fig.savefig("pandas_logo.svg")
+fig.savefig("pandas_logo.png")
diff --git a/doc/make.py b/doc/make.py
index 496b3cfd4ee45..48febef20fbe6 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -24,9 +24,9 @@
DOC_PATH = os.path.dirname(os.path.abspath(__file__))
-SOURCE_PATH = os.path.join(DOC_PATH, 'source')
-BUILD_PATH = os.path.join(DOC_PATH, 'build')
-REDIRECTS_FILE = os.path.join(DOC_PATH, 'redirects.csv')
+SOURCE_PATH = os.path.join(DOC_PATH, "source")
+BUILD_PATH = os.path.join(DOC_PATH, "build")
+REDIRECTS_FILE = os.path.join(DOC_PATH, "redirects.csv")
class DocBuilder:
@@ -36,8 +36,15 @@ class DocBuilder:
All public methods of this class can be called as parameters of the
script.
"""
- def __init__(self, num_jobs=0, include_api=True, single_doc=None,
- verbosity=0, warnings_are_errors=False):
+
+ def __init__(
+ self,
+ num_jobs=0,
+ include_api=True,
+ single_doc=None,
+ verbosity=0,
+ warnings_are_errors=False,
+ ):
self.num_jobs = num_jobs
self.verbosity = verbosity
self.warnings_are_errors = warnings_are_errors
@@ -45,16 +52,15 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None,
if single_doc:
single_doc = self._process_single_doc(single_doc)
include_api = False
- os.environ['SPHINX_PATTERN'] = single_doc
+ os.environ["SPHINX_PATTERN"] = single_doc
elif not include_api:
- os.environ['SPHINX_PATTERN'] = '-api'
+ os.environ["SPHINX_PATTERN"] = "-api"
self.single_doc_html = None
- if single_doc and single_doc.endswith('.rst'):
- self.single_doc_html = os.path.splitext(single_doc)[0] + '.html'
+ if single_doc and single_doc.endswith(".rst"):
+ self.single_doc_html = os.path.splitext(single_doc)[0] + ".html"
elif single_doc:
- self.single_doc_html = 'reference/api/pandas.{}.html'.format(
- single_doc)
+ self.single_doc_html = "reference/api/pandas.{}.html".format(single_doc)
def _process_single_doc(self, single_doc):
"""
@@ -66,26 +72,30 @@ def _process_single_doc(self, single_doc):
(e.g. reference/api/pandas.DataFrame.head.rst).
"""
base_name, extension = os.path.splitext(single_doc)
- if extension in ('.rst', '.ipynb'):
+ if extension in (".rst", ".ipynb"):
if os.path.exists(os.path.join(SOURCE_PATH, single_doc)):
return single_doc
else:
- raise FileNotFoundError('File {} not found'.format(single_doc))
+ raise FileNotFoundError("File {} not found".format(single_doc))
- elif single_doc.startswith('pandas.'):
+ elif single_doc.startswith("pandas."):
try:
obj = pandas # noqa: F821
- for name in single_doc.split('.'):
+ for name in single_doc.split("."):
obj = getattr(obj, name)
except AttributeError:
- raise ImportError('Could not import {}'.format(single_doc))
+ raise ImportError("Could not import {}".format(single_doc))
else:
- return single_doc[len('pandas.'):]
+ return single_doc[len("pandas.") :]
else:
- raise ValueError(('--single={} not understood. Value should be a '
- 'valid path to a .rst or .ipynb file, or a '
- 'valid pandas object (e.g. categorical.rst or '
- 'pandas.DataFrame.head)').format(single_doc))
+ raise ValueError(
+ (
+ "--single={} not understood. Value should be a "
+ "valid path to a .rst or .ipynb file, or a "
+ "valid pandas object (e.g. categorical.rst or "
+ "pandas.DataFrame.head)"
+ ).format(single_doc)
+ )
@staticmethod
def _run_os(*args):
@@ -117,52 +127,55 @@ def _sphinx_build(self, kind):
--------
>>> DocBuilder(num_jobs=4)._sphinx_build('html')
"""
- if kind not in ('html', 'latex'):
- raise ValueError('kind must be html or latex, '
- 'not {}'.format(kind))
+ if kind not in ("html", "latex"):
+ raise ValueError("kind must be html or latex, " "not {}".format(kind))
- cmd = ['sphinx-build', '-b', kind]
+ cmd = ["sphinx-build", "-b", kind]
if self.num_jobs:
- cmd += ['-j', str(self.num_jobs)]
+ cmd += ["-j", str(self.num_jobs)]
if self.warnings_are_errors:
- cmd += ['-W', '--keep-going']
+ cmd += ["-W", "--keep-going"]
if self.verbosity:
- cmd.append('-{}'.format('v' * self.verbosity))
- cmd += ['-d', os.path.join(BUILD_PATH, 'doctrees'),
- SOURCE_PATH, os.path.join(BUILD_PATH, kind)]
+ cmd.append("-{}".format("v" * self.verbosity))
+ cmd += [
+ "-d",
+ os.path.join(BUILD_PATH, "doctrees"),
+ SOURCE_PATH,
+ os.path.join(BUILD_PATH, kind),
+ ]
return subprocess.call(cmd)
def _open_browser(self, single_doc_html):
"""
Open a browser tab showing single
"""
- url = os.path.join('file://', DOC_PATH, 'build', 'html',
- single_doc_html)
+ url = os.path.join("file://", DOC_PATH, "build", "html", single_doc_html)
webbrowser.open(url, new=2)
def _get_page_title(self, page):
"""
Open the rst file `page` and extract its title.
"""
- fname = os.path.join(SOURCE_PATH, '{}.rst'.format(page))
+ fname = os.path.join(SOURCE_PATH, "{}.rst".format(page))
option_parser = docutils.frontend.OptionParser(
- components=(docutils.parsers.rst.Parser,))
- doc = docutils.utils.new_document(
- '',
- option_parser.get_default_values())
+ components=(docutils.parsers.rst.Parser,)
+ )
+ doc = docutils.utils.new_document("", option_parser.get_default_values())
with open(fname) as f:
data = f.read()
parser = docutils.parsers.rst.Parser()
# do not generate any warning when parsing the rst
- with open(os.devnull, 'a') as f:
+ with open(os.devnull, "a") as f:
doc.reporter.stream = f
parser.parse(data, doc)
- section = next(node for node in doc.children
- if isinstance(node, docutils.nodes.section))
- title = next(node for node in section.children
- if isinstance(node, docutils.nodes.title))
+ section = next(
+ node for node in doc.children if isinstance(node, docutils.nodes.section)
+ )
+ title = next(
+ node for node in section.children if isinstance(node, docutils.nodes.title)
+ )
return title.astext()
@@ -171,7 +184,7 @@ def _add_redirects(self):
Create in the build directory an html file with a redirect,
for every row in REDIRECTS_FILE.
"""
- html = '''
+ html = """
@@ -182,16 +195,14 @@ def _add_redirects(self):
- '''
+ """
with open(REDIRECTS_FILE) as mapping_fd:
reader = csv.reader(mapping_fd)
for row in reader:
- if not row or row[0].strip().startswith('#'):
+ if not row or row[0].strip().startswith("#"):
continue
- path = os.path.join(BUILD_PATH,
- 'html',
- *row[0].split('/')) + '.html'
+ path = os.path.join(BUILD_PATH, "html", *row[0].split("/")) + ".html"
try:
title = self._get_page_title(row[1])
@@ -199,24 +210,26 @@ def _add_redirects(self):
# the file can be an ipynb and not an rst, or docutils
# may not be able to read the rst because it has some
# sphinx specific stuff
- title = 'this page'
+ title = "this page"
if os.path.exists(path):
- raise RuntimeError((
- 'Redirection would overwrite an existing file: '
- '{}').format(path))
+ raise RuntimeError(
+ ("Redirection would overwrite an existing file: " "{}").format(
+ path
+ )
+ )
- with open(path, 'w') as moved_page_fd:
+ with open(path, "w") as moved_page_fd:
moved_page_fd.write(
- html.format(url='{}.html'.format(row[1]),
- title=title))
+ html.format(url="{}.html".format(row[1]), title=title)
+ )
def html(self):
"""
Build HTML documentation.
"""
- ret_code = self._sphinx_build('html')
- zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+ ret_code = self._sphinx_build("html")
+ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
if os.path.exists(zip_fname):
os.remove(zip_fname)
@@ -231,20 +244,20 @@ def latex(self, force=False):
"""
Build PDF documentation.
"""
- if sys.platform == 'win32':
- sys.stderr.write('latex build has not been tested on windows\n')
+ if sys.platform == "win32":
+ sys.stderr.write("latex build has not been tested on windows\n")
else:
- ret_code = self._sphinx_build('latex')
- os.chdir(os.path.join(BUILD_PATH, 'latex'))
+ ret_code = self._sphinx_build("latex")
+ os.chdir(os.path.join(BUILD_PATH, "latex"))
if force:
for i in range(3):
- self._run_os('pdflatex',
- '-interaction=nonstopmode',
- 'pandas.tex')
- raise SystemExit('You should check the file '
- '"build/latex/pandas.pdf" for problems.')
+ self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex")
+ raise SystemExit(
+ "You should check the file "
+ '"build/latex/pandas.pdf" for problems.'
+ )
else:
- self._run_os('make')
+ self._run_os("make")
return ret_code
def latex_forced(self):
@@ -259,84 +272,101 @@ def clean():
Clean documentation generated files.
"""
shutil.rmtree(BUILD_PATH, ignore_errors=True)
- shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'),
- ignore_errors=True)
+ shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True)
def zip_html(self):
"""
Compress HTML documentation into a zip file.
"""
- zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
if os.path.exists(zip_fname):
os.remove(zip_fname)
- dirname = os.path.join(BUILD_PATH, 'html')
+ dirname = os.path.join(BUILD_PATH, "html")
fnames = os.listdir(dirname)
os.chdir(dirname)
- self._run_os('zip',
- zip_fname,
- '-r',
- '-q',
- *fnames)
+ self._run_os("zip", zip_fname, "-r", "-q", *fnames)
def main():
- cmds = [method for method in dir(DocBuilder) if not method.startswith('_')]
+ cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]
argparser = argparse.ArgumentParser(
- description='pandas documentation builder',
- epilog='Commands: {}'.format(','.join(cmds)))
- argparser.add_argument('command',
- nargs='?',
- default='html',
- help='command to run: {}'.format(', '.join(cmds)))
- argparser.add_argument('--num-jobs',
- type=int,
- default=0,
- help='number of jobs used by sphinx-build')
- argparser.add_argument('--no-api',
- default=False,
- help='omit api and autosummary',
- action='store_true')
- argparser.add_argument('--single',
- metavar='FILENAME',
- type=str,
- default=None,
- help=('filename (relative to the "source" folder)'
- ' of section or method name to compile, e.g. '
- '"development/contributing.rst",'
- ' "ecosystem.rst", "pandas.DataFrame.join"'))
- argparser.add_argument('--python-path',
- type=str,
- default=os.path.dirname(DOC_PATH),
- help='path')
- argparser.add_argument('-v', action='count', dest='verbosity', default=0,
- help=('increase verbosity (can be repeated), '
- 'passed to the sphinx build command'))
- argparser.add_argument('--warnings-are-errors', '-W',
- action='store_true',
- help='fail if warnings are raised')
+ description="pandas documentation builder",
+ epilog="Commands: {}".format(",".join(cmds)),
+ )
+ argparser.add_argument(
+ "command",
+ nargs="?",
+ default="html",
+ help="command to run: {}".format(", ".join(cmds)),
+ )
+ argparser.add_argument(
+ "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build"
+ )
+ argparser.add_argument(
+ "--no-api", default=False, help="omit api and autosummary", action="store_true"
+ )
+ argparser.add_argument(
+ "--single",
+ metavar="FILENAME",
+ type=str,
+ default=None,
+ help=(
+ 'filename (relative to the "source" folder)'
+ " of section or method name to compile, e.g. "
+ '"development/contributing.rst",'
+ ' "ecosystem.rst", "pandas.DataFrame.join"'
+ ),
+ )
+ argparser.add_argument(
+ "--python-path", type=str, default=os.path.dirname(DOC_PATH), help="path"
+ )
+ argparser.add_argument(
+ "-v",
+ action="count",
+ dest="verbosity",
+ default=0,
+ help=(
+ "increase verbosity (can be repeated), "
+ "passed to the sphinx build command"
+ ),
+ )
+ argparser.add_argument(
+ "--warnings-are-errors",
+ "-W",
+ action="store_true",
+ help="fail if warnings are raised",
+ )
args = argparser.parse_args()
if args.command not in cmds:
- raise ValueError('Unknown command {}. Available options: {}'.format(
- args.command, ', '.join(cmds)))
+ raise ValueError(
+ "Unknown command {}. Available options: {}".format(
+ args.command, ", ".join(cmds)
+ )
+ )
# Below we update both os.environ and sys.path. The former is used by
# external libraries (namely Sphinx) to compile this module and resolve
# the import of `python_path` correctly. The latter is used to resolve
# the import within the module, injecting it into the global namespace
- os.environ['PYTHONPATH'] = args.python_path
+ os.environ["PYTHONPATH"] = args.python_path
sys.path.insert(0, args.python_path)
- globals()['pandas'] = importlib.import_module('pandas')
+ globals()["pandas"] = importlib.import_module("pandas")
# Set the matplotlib backend to the non-interactive Agg backend for all
# child processes.
- os.environ['MPLBACKEND'] = 'module://matplotlib.backends.backend_agg'
-
- builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
- args.verbosity, args.warnings_are_errors)
+ os.environ["MPLBACKEND"] = "module://matplotlib.backends.backend_agg"
+
+ builder = DocBuilder(
+ args.num_jobs,
+ not args.no_api,
+ args.single,
+ args.verbosity,
+ args.warnings_are_errors,
+ )
return getattr(builder, args.command)()
-if __name__ == '__main__':
+if __name__ == "__main__":
sys.exit(main())
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 2484a9d592e09..a4b7d97c2cf5e 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -34,15 +34,13 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.append(os.path.abspath('.'))
-sys.path.insert(0, os.path.abspath('../sphinxext'))
-sys.path.extend([
-
- # numpy standard doc extensions
- os.path.join(os.path.dirname(__file__),
- '..', '../..',
- 'sphinxext')
-
-])
+sys.path.insert(0, os.path.abspath("../sphinxext"))
+sys.path.extend(
+ [
+ # numpy standard doc extensions
+ os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext")
+ ]
+)
# -- General configuration -----------------------------------------------
@@ -50,65 +48,66 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
# sphinxext.
-extensions = ['sphinx.ext.autodoc',
- 'sphinx.ext.autosummary',
- 'sphinx.ext.doctest',
- 'sphinx.ext.extlinks',
- 'sphinx.ext.todo',
- 'numpydoc', # handle NumPy documentation formatted docstrings
- 'IPython.sphinxext.ipython_directive',
- 'IPython.sphinxext.ipython_console_highlighting',
- 'matplotlib.sphinxext.plot_directive',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.coverage',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.linkcode',
- 'nbsphinx',
- 'contributors', # custom pandas extension
- ]
-
-exclude_patterns = ['**.ipynb_checkpoints']
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.doctest",
+ "sphinx.ext.extlinks",
+ "sphinx.ext.todo",
+ "numpydoc", # handle NumPy documentation formatted docstrings
+ "IPython.sphinxext.ipython_directive",
+ "IPython.sphinxext.ipython_console_highlighting",
+ "matplotlib.sphinxext.plot_directive",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.coverage",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.linkcode",
+ "nbsphinx",
+ "contributors", # custom pandas extension
+]
+
+exclude_patterns = ["**.ipynb_checkpoints"]
try:
import nbconvert
except ImportError:
- logger.warn('nbconvert not installed. Skipping notebooks.')
- exclude_patterns.append('**/*.ipynb')
+ logger.warn("nbconvert not installed. Skipping notebooks.")
+ exclude_patterns.append("**/*.ipynb")
else:
try:
nbconvert.utils.pandoc.get_pandoc_version()
except nbconvert.utils.pandoc.PandocMissing:
- logger.warn('Pandoc not installed. Skipping notebooks.')
- exclude_patterns.append('**/*.ipynb')
+ logger.warn("Pandoc not installed. Skipping notebooks.")
+ exclude_patterns.append("**/*.ipynb")
# sphinx_pattern can be '-api' to exclude the API pages,
# the path to a file, or a Python object
# (e.g. '10min.rst' or 'pandas.DataFrame.head')
source_path = os.path.dirname(os.path.abspath(__file__))
-pattern = os.environ.get('SPHINX_PATTERN')
+pattern = os.environ.get("SPHINX_PATTERN")
if pattern:
for dirname, dirs, fnames in os.walk(source_path):
for fname in fnames:
- if os.path.splitext(fname)[-1] in ('.rst', '.ipynb'):
- fname = os.path.relpath(os.path.join(dirname, fname),
- source_path)
+ if os.path.splitext(fname)[-1] in (".rst", ".ipynb"):
+ fname = os.path.relpath(os.path.join(dirname, fname), source_path)
- if (fname == 'index.rst'
- and os.path.abspath(dirname) == source_path):
+ if fname == "index.rst" and os.path.abspath(dirname) == source_path:
continue
- elif pattern == '-api' and dirname == 'reference':
+ elif pattern == "-api" and dirname == "reference":
exclude_patterns.append(fname)
- elif pattern != '-api' and fname != pattern:
+ elif pattern != "-api" and fname != pattern:
exclude_patterns.append(fname)
-with open(os.path.join(source_path, 'index.rst.template')) as f:
+with open(os.path.join(source_path, "index.rst.template")) as f:
t = jinja2.Template(f.read())
-with open(os.path.join(source_path, 'index.rst'), 'w') as f:
- f.write(t.render(include_api=pattern is None,
- single_doc=(pattern
- if pattern is not None and pattern != '-api'
- else None)))
-autosummary_generate = True if pattern is None else ['index']
+with open(os.path.join(source_path, "index.rst"), "w") as f:
+ f.write(
+ t.render(
+ include_api=pattern is None,
+ single_doc=(pattern if pattern is not None and pattern != "-api" else None),
+ )
+ )
+autosummary_generate = True if pattern is None else ["index"]
# numpydoc
numpydoc_attributes_as_param_list = False
@@ -122,22 +121,20 @@
import pandas as pd"""
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['../_templates']
+templates_path = ["../_templates"]
# The suffix of source filenames.
-source_suffix = [
- '.rst',
-]
+source_suffix = [".rst"]
# The encoding of source files.
-source_encoding = 'utf-8'
+source_encoding = "utf-8"
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = 'pandas'
-copyright = '2008-2014, the pandas development team'
+project = "pandas"
+copyright = "2008-2014, the pandas development team"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -184,7 +181,7 @@
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
@@ -194,7 +191,7 @@
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'nature_with_gtoc'
+html_theme = "nature_with_gtoc"
# The style sheet to use for HTML and HTML Help pages. A file of that name
# must exist either in Sphinx' static/ path, or in one of the custom paths
@@ -207,7 +204,7 @@
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = ['themes']
+html_theme_path = ["themes"]
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
@@ -223,12 +220,12 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-html_favicon = os.path.join(html_static_path[0], 'favicon.ico')
+html_favicon = os.path.join(html_static_path[0], "favicon.ico")
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
@@ -250,60 +247,62 @@
# https://github.com/pandas-dev/pandas/issues/16186
moved_api_pages = [
- ('pandas.core.common.isnull', 'pandas.isna'),
- ('pandas.core.common.notnull', 'pandas.notna'),
- ('pandas.core.reshape.get_dummies', 'pandas.get_dummies'),
- ('pandas.tools.merge.concat', 'pandas.concat'),
- ('pandas.tools.merge.merge', 'pandas.merge'),
- ('pandas.tools.pivot.pivot_table', 'pandas.pivot_table'),
- ('pandas.tseries.tools.to_datetime', 'pandas.to_datetime'),
- ('pandas.io.clipboard.read_clipboard', 'pandas.read_clipboard'),
- ('pandas.io.excel.ExcelFile.parse', 'pandas.ExcelFile.parse'),
- ('pandas.io.excel.read_excel', 'pandas.read_excel'),
- ('pandas.io.gbq.read_gbq', 'pandas.read_gbq'),
- ('pandas.io.html.read_html', 'pandas.read_html'),
- ('pandas.io.json.read_json', 'pandas.read_json'),
- ('pandas.io.parsers.read_csv', 'pandas.read_csv'),
- ('pandas.io.parsers.read_fwf', 'pandas.read_fwf'),
- ('pandas.io.parsers.read_table', 'pandas.read_table'),
- ('pandas.io.pickle.read_pickle', 'pandas.read_pickle'),
- ('pandas.io.pytables.HDFStore.append', 'pandas.HDFStore.append'),
- ('pandas.io.pytables.HDFStore.get', 'pandas.HDFStore.get'),
- ('pandas.io.pytables.HDFStore.put', 'pandas.HDFStore.put'),
- ('pandas.io.pytables.HDFStore.select', 'pandas.HDFStore.select'),
- ('pandas.io.pytables.read_hdf', 'pandas.read_hdf'),
- ('pandas.io.sql.read_sql', 'pandas.read_sql'),
- ('pandas.io.sql.read_frame', 'pandas.read_frame'),
- ('pandas.io.sql.write_frame', 'pandas.write_frame'),
- ('pandas.io.stata.read_stata', 'pandas.read_stata'),
+ ("pandas.core.common.isnull", "pandas.isna"),
+ ("pandas.core.common.notnull", "pandas.notna"),
+ ("pandas.core.reshape.get_dummies", "pandas.get_dummies"),
+ ("pandas.tools.merge.concat", "pandas.concat"),
+ ("pandas.tools.merge.merge", "pandas.merge"),
+ ("pandas.tools.pivot.pivot_table", "pandas.pivot_table"),
+ ("pandas.tseries.tools.to_datetime", "pandas.to_datetime"),
+ ("pandas.io.clipboard.read_clipboard", "pandas.read_clipboard"),
+ ("pandas.io.excel.ExcelFile.parse", "pandas.ExcelFile.parse"),
+ ("pandas.io.excel.read_excel", "pandas.read_excel"),
+ ("pandas.io.gbq.read_gbq", "pandas.read_gbq"),
+ ("pandas.io.html.read_html", "pandas.read_html"),
+ ("pandas.io.json.read_json", "pandas.read_json"),
+ ("pandas.io.parsers.read_csv", "pandas.read_csv"),
+ ("pandas.io.parsers.read_fwf", "pandas.read_fwf"),
+ ("pandas.io.parsers.read_table", "pandas.read_table"),
+ ("pandas.io.pickle.read_pickle", "pandas.read_pickle"),
+ ("pandas.io.pytables.HDFStore.append", "pandas.HDFStore.append"),
+ ("pandas.io.pytables.HDFStore.get", "pandas.HDFStore.get"),
+ ("pandas.io.pytables.HDFStore.put", "pandas.HDFStore.put"),
+ ("pandas.io.pytables.HDFStore.select", "pandas.HDFStore.select"),
+ ("pandas.io.pytables.read_hdf", "pandas.read_hdf"),
+ ("pandas.io.sql.read_sql", "pandas.read_sql"),
+ ("pandas.io.sql.read_frame", "pandas.read_frame"),
+ ("pandas.io.sql.write_frame", "pandas.write_frame"),
+ ("pandas.io.stata.read_stata", "pandas.read_stata"),
]
# Again, tuples of (from_old, to_new)
moved_classes = [
- ('pandas.tseries.resample.Resampler', 'pandas.core.resample.Resampler'),
- ('pandas.formats.style.Styler', 'pandas.io.formats.style.Styler'),
+ ("pandas.tseries.resample.Resampler", "pandas.core.resample.Resampler"),
+ ("pandas.formats.style.Styler", "pandas.io.formats.style.Styler"),
]
for old, new in moved_classes:
# the class itself...
moved_api_pages.append((old, new))
- mod, classname = new.rsplit('.', 1)
+ mod, classname = new.rsplit(".", 1)
klass = getattr(importlib.import_module(mod), classname)
- methods = [x for x in dir(klass)
- if not x.startswith('_') or x in ('__iter__', '__array__')]
+ methods = [
+ x for x in dir(klass) if not x.startswith("_") or x in ("__iter__", "__array__")
+ ]
for method in methods:
# ... and each of its public methods
moved_api_pages.append(
- ("{old}.{method}".format(old=old, method=method),
- "{new}.{method}".format(new=new, method=method))
+ (
+ "{old}.{method}".format(old=old, method=method),
+ "{new}.{method}".format(new=new, method=method),
+ )
)
if pattern is None:
html_additional_pages = {
- 'generated/' + page[0]: 'api_redirect.html'
- for page in moved_api_pages
+ "generated/" + page[0]: "api_redirect.html" for page in moved_api_pages
}
@@ -316,19 +315,20 @@
import numpy as np
import pandas as pd
- randn = np.random.randn
np.random.seed(123456)
np.set_printoptions(precision=4, suppress=True)
pd.options.display.max_rows = 15
import os
os.chdir(r'{}')
-""".format(os.path.dirname(os.path.dirname(__file__)))
+""".format(
+ os.path.dirname(os.path.dirname(__file__))
+)
html_context = {
- 'redirects': {old: new for old, new in moved_api_pages},
- 'header': header
+ "redirects": {old: new for old, new in moved_api_pages},
+ "header": header,
}
# If false, no module index is generated.
@@ -352,7 +352,7 @@
# html_file_suffix = ''
# Output file base name for HTML help builder.
-htmlhelp_basename = 'pandas'
+htmlhelp_basename = "pandas"
# -- Options for nbsphinx ------------------------------------------------
@@ -371,9 +371,13 @@
# Grouping the document tree into LaTeX files. List of tuples (source start
# file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'pandas.tex',
- 'pandas: powerful Python data analysis toolkit',
- r'Wes McKinney\n\& PyData Development Team', 'manual'),
+ (
+ "index",
+ "pandas.tex",
+ "pandas: powerful Python data analysis toolkit",
+ r"Wes McKinney\n\& PyData Development Team",
+ "manual",
+ )
]
# The name of an image file (relative to this directory) to place at the top of
@@ -396,32 +400,32 @@
if pattern is None:
intersphinx_mapping = {
- 'dateutil': ("https://dateutil.readthedocs.io/en/latest/", None),
- 'matplotlib': ('https://matplotlib.org/', None),
- 'numpy': ('https://docs.scipy.org/doc/numpy/', None),
- 'pandas-gbq': ('https://pandas-gbq.readthedocs.io/en/latest/', None),
- 'py': ('https://pylib.readthedocs.io/en/latest/', None),
- 'python': ('https://docs.python.org/3/', None),
- 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None),
- 'statsmodels': ('http://www.statsmodels.org/devel/', None),
+ "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
+ "matplotlib": ("https://matplotlib.org/", None),
+ "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+ "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None),
+ "py": ("https://pylib.readthedocs.io/en/latest/", None),
+ "python": ("https://docs.python.org/3/", None),
+ "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
+ "statsmodels": ("http://www.statsmodels.org/devel/", None),
}
# extlinks alias
-extlinks = {'issue': ('https://github.com/pandas-dev/pandas/issues/%s',
- 'GH'),
- 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s',
- 'wiki ')}
+extlinks = {
+ "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"),
+ "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "),
+}
ipython_warning_is_error = False
ipython_exec_lines = [
- 'import numpy as np',
- 'import pandas as pd',
+ "import numpy as np",
+ "import pandas as pd",
# This ensures correct rendering on system with console encoding != utf8
# (windows). It forces pandas to encode its output reprs using utf8
# wherever the docs are built. The docs' target is the browser, not
# the console, so this is fine.
- 'pd.options.display.encoding="utf8"'
+ 'pd.options.display.encoding="utf8"',
]
@@ -430,8 +434,7 @@
import sphinx
from sphinx.util import rpartition
-from sphinx.ext.autodoc import (
- Documenter, MethodDocumenter, AttributeDocumenter)
+from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter
from sphinx.ext.autosummary import Autosummary
@@ -439,8 +442,9 @@ class AccessorDocumenter(MethodDocumenter):
"""
Specialized Documenter subclass for accessors.
"""
- objtype = 'accessor'
- directivetype = 'method'
+
+ objtype = "accessor"
+ directivetype = "method"
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
@@ -448,7 +452,7 @@ class AccessorDocumenter(MethodDocumenter):
def format_signature(self):
# this method gives an error/warning for the accessors, therefore
# overriding it (accessor has no arguments)
- return ''
+ return ""
class AccessorLevelDocumenter(Documenter):
@@ -456,6 +460,7 @@ class AccessorLevelDocumenter(Documenter):
Specialized Documenter subclass for objects on accessor level (methods,
attributes).
"""
+
# This is the simple straightforward version
# modname is None, base the last elements (eg 'hour')
# and path the part before (eg 'Series.dt')
@@ -468,41 +473,40 @@ class AccessorLevelDocumenter(Documenter):
def resolve_name(self, modname, parents, path, base):
if modname is None:
if path:
- mod_cls = path.rstrip('.')
+ mod_cls = path.rstrip(".")
else:
mod_cls = None
# if documenting a class-level object without path,
# there must be a current class, either from a parent
# auto directive ...
- mod_cls = self.env.temp_data.get('autodoc:class')
+ mod_cls = self.env.temp_data.get("autodoc:class")
# ... or from a class directive
if mod_cls is None:
- mod_cls = self.env.temp_data.get('py:class')
+ mod_cls = self.env.temp_data.get("py:class")
# ... if still None, there's no way to know
if mod_cls is None:
return None, []
# HACK: this is added in comparison to ClassLevelDocumenter
# mod_cls still exists of class.accessor, so an extra
# rpartition is needed
- modname, accessor = rpartition(mod_cls, '.')
- modname, cls = rpartition(modname, '.')
+ modname, accessor = rpartition(mod_cls, ".")
+ modname, cls = rpartition(modname, ".")
parents = [cls, accessor]
# if the module name is still missing, get it like above
if not modname:
- modname = self.env.temp_data.get('autodoc:module')
+ modname = self.env.temp_data.get("autodoc:module")
if not modname:
- if sphinx.__version__ > '1.3':
- modname = self.env.ref_context.get('py:module')
+ if sphinx.__version__ > "1.3":
+ modname = self.env.ref_context.get("py:module")
else:
- modname = self.env.temp_data.get('py:module')
+ modname = self.env.temp_data.get("py:module")
# ... else, it stays None, which means invalid
return modname, parents + [base]
-class AccessorAttributeDocumenter(AccessorLevelDocumenter,
- AttributeDocumenter):
- objtype = 'accessorattribute'
- directivetype = 'attribute'
+class AccessorAttributeDocumenter(AccessorLevelDocumenter, AttributeDocumenter):
+ objtype = "accessorattribute"
+ directivetype = "attribute"
# lower than AttributeDocumenter so this is not chosen for normal
# attributes
@@ -510,8 +514,8 @@ class AccessorAttributeDocumenter(AccessorLevelDocumenter,
class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter):
- objtype = 'accessormethod'
- directivetype = 'method'
+ objtype = "accessormethod"
+ directivetype = "method"
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
@@ -522,14 +526,15 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter):
This documenter lets us removes .__call__ from the method signature for
callable accessors like Series.plot
"""
- objtype = 'accessorcallable'
- directivetype = 'method'
+
+ objtype = "accessorcallable"
+ directivetype = "method"
# lower than MethodDocumenter; otherwise the doc build prints warnings
priority = 0.5
def format_name(self):
- return MethodDocumenter.format_name(self).rstrip('.__call__')
+ return MethodDocumenter.format_name(self).rstrip(".__call__")
class PandasAutosummary(Autosummary):
@@ -537,15 +542,16 @@ class PandasAutosummary(Autosummary):
This alternative autosummary class lets us override the table summary for
Series.plot and DataFrame.plot in the API docs.
"""
+
def _replace_pandas_items(self, display_name, sig, summary, real_name):
# this a hack: ideally we should extract the signature from the
# .__call__ method instead of hard coding this
- if display_name == 'DataFrame.plot':
- sig = '([x, y, kind, ax, ....])'
- summary = 'DataFrame plotting accessor and method'
- elif display_name == 'Series.plot':
- sig = '([kind, ax, figsize, ....])'
- summary = 'Series plotting accessor and method'
+ if display_name == "DataFrame.plot":
+ sig = "([x, y, kind, ax, ....])"
+ summary = "DataFrame plotting accessor and method"
+ elif display_name == "Series.plot":
+ sig = "([kind, ax, figsize, ....])"
+ summary = "Series plotting accessor and method"
return (display_name, sig, summary, real_name)
@staticmethod
@@ -554,15 +560,15 @@ def _is_deprecated(real_name):
obj, parent, modname = _import_by_name(real_name)
except ImportError:
return False
- doc = NumpyDocString(obj.__doc__ or '')
- summary = ''.join(doc['Summary'] + doc['Extended Summary'])
- return '.. deprecated::' in summary
+ doc = NumpyDocString(obj.__doc__ or "")
+ summary = "".join(doc["Summary"] + doc["Extended Summary"])
+ return ".. deprecated::" in summary
def _add_deprecation_prefixes(self, items):
for item in items:
display_name, sig, summary, real_name = item
if self._is_deprecated(real_name):
- summary = '(DEPRECATED) %s' % summary
+ summary = "(DEPRECATED) %s" % summary
yield display_name, sig, summary, real_name
def get_items(self, names):
@@ -577,18 +583,18 @@ def linkcode_resolve(domain, info):
"""
Determine the URL corresponding to Python object
"""
- if domain != 'py':
+ if domain != "py":
return None
- modname = info['module']
- fullname = info['fullname']
+ modname = info["module"]
+ fullname = info["fullname"]
submod = sys.modules.get(modname)
if submod is None:
return None
obj = submod
- for part in fullname.split('.'):
+ for part in fullname.split("."):
try:
obj = getattr(obj, part)
except AttributeError:
@@ -617,12 +623,14 @@ def linkcode_resolve(domain, info):
fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__))
- if '+' in pandas.__version__:
- return ("http://github.com/pandas-dev/pandas/blob/master/pandas/"
- "{}{}".format(fn, linespec))
+ if "+" in pandas.__version__:
+ return "http://github.com/pandas-dev/pandas/blob/master/pandas/" "{}{}".format(
+ fn, linespec
+ )
else:
- return ("http://github.com/pandas-dev/pandas/blob/"
- "v{}/pandas/{}{}".format(pandas.__version__, fn, linespec))
+ return "http://github.com/pandas-dev/pandas/blob/" "v{}/pandas/{}{}".format(
+ pandas.__version__, fn, linespec
+ )
# remove the docstring of the flags attribute (inherited from numpy ndarray)
@@ -646,7 +654,7 @@ def process_class_docstrings(app, what, name, obj, options, lines):
"""
if what == "class":
- joined = '\n'.join(lines)
+ joined = "\n".join(lines)
templates = [
""".. rubric:: Attributes
@@ -662,25 +670,25 @@ def process_class_docstrings(app, what, name, obj, options, lines):
:toctree:
None
-"""
+""",
]
for template in templates:
if template in joined:
- joined = joined.replace(template, '')
- lines[:] = joined.split('\n')
+ joined = joined.replace(template, "")
+ lines[:] = joined.split("\n")
suppress_warnings = [
# We "overwrite" autosummary with our PandasAutosummary, but
# still want the regular autosummary setup to run. So we just
# suppress this warning.
- 'app.add_directive'
+ "app.add_directive"
]
if pattern:
# When building a single document we don't want to warn because references
# to other documents are unknown, as it's expected
- suppress_warnings.append('ref.ref')
+ suppress_warnings.append("ref.ref")
def rstjinja(app, docname, source):
@@ -689,12 +697,10 @@ def rstjinja(app, docname, source):
"""
# http://ericholscher.com/blog/2016/jul/25/integrating-jinja-rst-sphinx/
# Make sure we're outputting HTML
- if app.builder.format != 'html':
+ if app.builder.format != "html":
return
src = source[0]
- rendered = app.builder.templates.render_string(
- src, app.config.html_context
- )
+ rendered = app.builder.templates.render_string(src, app.config.html_context)
source[0] = rendered
@@ -706,4 +712,4 @@ def setup(app):
app.add_autodocumenter(AccessorAttributeDocumenter)
app.add_autodocumenter(AccessorMethodDocumenter)
app.add_autodocumenter(AccessorCallableDocumenter)
- app.add_directive('autosummary', PandasAutosummary)
+ app.add_directive("autosummary", PandasAutosummary)
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index b5c7ae7a213cb..be6555b2ab936 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -127,35 +127,24 @@ to build the documentation locally before pushing your changes.
.. _contributing.dev_c:
-Installing a C Compiler
+Installing a C compiler
~~~~~~~~~~~~~~~~~~~~~~~
Pandas uses C extensions (mostly written using Cython) to speed up certain
operations. To install pandas from source, you need to compile these C
extensions, which means you need a C compiler. This process depends on which
-platform you're using. Follow the `CPython contributing guide
-`_ for getting a
-compiler installed. You don't need to do any of the ``./configure`` or ``make``
-steps; you only need to install the compiler.
-
-For Windows developers, when using Python 3.5 and later, it is sufficient to
-install `Visual Studio 2017 `_ with the
-**Python development workload** and the **Python native development tools**
-option. Otherwise, the following links may be helpful.
-
-* https://blogs.msdn.microsoft.com/pythonengineering/2017/03/07/python-support-in-vs2017/
-* https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/
-* https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit
-* https://cowboyprogrammer.org/building-python-wheels-for-windows/
-* https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/
-* https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy
+platform you're using.
+
+* Windows: https://devguide.python.org/setup/#windows-compiling
+* Mac: https://devguide.python.org/setup/#macos
+* Unix: https://devguide.python.org/setup/#unix-compiling
Let us know if you have any difficulties by opening an issue or reaching out on
`Gitter`_.
.. _contributing.dev_python:
-Creating a Python Environment
+Creating a Python environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Now that you have a C compiler, create an isolated pandas development
@@ -178,7 +167,6 @@ We'll now kick off a three-step process:
# Create and activate the build environment
conda env create -f environment.yml
conda activate pandas-dev
- conda uninstall --force pandas
# or with older versions of Anaconda:
source activate pandas-dev
@@ -209,7 +197,7 @@ See the full conda docs `here `__.
.. _contributing.pip:
-Creating a Python Environment (pip)
+Creating a Python environment (pip)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If you aren't using conda for your development environment, follow these instructions.
@@ -289,7 +277,7 @@ complex changes to the documentation as well.
Some other important things to know about the docs:
* The *pandas* documentation consists of two parts: the docstrings in the code
- itself and the docs in this folder ``pandas/doc/``.
+ itself and the docs in this folder ``doc/``.
The docstrings provide a clear explanation of the usage of the individual
functions, while the documentation in this folder consists of tutorial-like
@@ -405,11 +393,11 @@ Building the documentation
~~~~~~~~~~~~~~~~~~~~~~~~~~
So how do you build the docs? Navigate to your local
-``pandas/doc/`` directory in the console and run::
+``doc/`` directory in the console and run::
python make.py html
-Then you can find the HTML output in the folder ``pandas/doc/build/html/``.
+Then you can find the HTML output in the folder ``doc/build/html/``.
The first time you build the docs, it will take quite a while because it has to run
all the code examples and build all the generated docstring pages. In subsequent
@@ -449,7 +437,7 @@ You can also specify to use multiple cores to speed up the documentation build::
Open the following file in a web browser to see the full documentation you
just built::
- pandas/docs/build/html/index.html
+ doc/build/html/index.html
And you'll have the satisfaction of seeing your new and improved documentation!
@@ -460,7 +448,7 @@ Building master branch documentation
When pull requests are merged into the *pandas* ``master`` branch, the main parts of
the documentation are also built by Travis-CI. These docs are then hosted `here
-`__, see also
+`__, see also
the :ref:`Continuous Integration ` section.
.. _contributing.code:
@@ -563,23 +551,38 @@ many errors as possible, but it may not correct *all* of them. Thus, it is
recommended that you run ``cpplint`` to double check and make any other style
fixes manually.
-Python (PEP8)
-~~~~~~~~~~~~~
-
-*pandas* uses the `PEP8 `_ standard.
-There are several tools to ensure you abide by this standard. Here are *some* of
-the more common ``PEP8`` issues:
+Python (PEP8 / black)
+~~~~~~~~~~~~~~~~~~~~~
-* we restrict line-length to 79 characters to promote readability
-* passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')``
+*pandas* follows the `PEP8 `_ standard
+and uses `Black `_ and
+`Flake8 `_ to ensure a consistent code
+format throughout the project.
-:ref:`Continuous Integration ` will run
-the `flake8 `_ tool
-and report any stylistic errors in your code. Therefore, it is helpful before
-submitting code to run the check yourself on the diff::
+:ref:`Continuous Integration ` will run those tools and
+report any stylistic errors in your code. Therefore, it is helpful before
+submitting code to run the check yourself::
+ black pandas
git diff upstream/master -u -- "*.py" | flake8 --diff
+to auto-format your code. Additionally, many editors have plugins that will
+apply ``black`` as you edit files.
+
+Optionally, you may wish to set up `pre-commit hooks `_
+to automatically run ``black`` and ``flake8`` when you make a git commit. This
+can be done by installing ``pre-commit``::
+
+ pip install pre-commit
+
+and then running::
+
+ pre-commit install
+
+from the root of the pandas repository. Now ``black`` and ``flake8`` will be run
+each time you commit changes. You can skip these checks with
+``git commit --no-verify``.
+
This command will catch any stylistic errors in your changes specifically, but
be beware it may not catch all of them. For example, if you delete the only
usage of an imported function, it is stylistically incorrect to import an
@@ -605,7 +608,7 @@ and run ``flake8`` on them, one after the other.
.. _contributing.import-formatting:
-Import Formatting
+Import formatting
~~~~~~~~~~~~~~~~~
*pandas* uses `isort `__ to standardise import
formatting across the codebase.
@@ -651,7 +654,7 @@ The `--recursive` flag can be passed to sort all files in a directory.
You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `.
-Backwards Compatibility
+Backwards compatibility
~~~~~~~~~~~~~~~~~~~~~~~
Please try to maintain backward compatibility. *pandas* has lots of users with lots of
@@ -696,10 +699,140 @@ You'll also need to
See :ref:`contributing.warnings` for more.
+.. _contributing.type_hints:
+
+Type Hints
+----------
+
+*pandas* strongly encourages the use of :pep:`484` style type hints. New development should contain type hints, and pull requests to annotate existing code are accepted as well!
+
+Style Guidelines
+~~~~~~~~~~~~~~~~
+
+Type imports should follow the ``from typing import ...`` convention. So rather than
+
+.. code-block:: python
+
+ import typing
+
+ primes = [] # type: typing.List[int]
+
+You should write
+
+.. code-block:: python
+
+ from typing import List, Optional, Union
+
+ primes = [] # type: List[int]
+
+``Optional`` should be used where applicable, so instead of
+
+.. code-block:: python
+
+ maybe_primes = [] # type: List[Union[int, None]]
+
+You should write
+
+.. code-block:: python
+
+ maybe_primes = [] # type: List[Optional[int]]
+
+In some cases, classes in the code base may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that within your annotation. For example, if you come across a definition like
+
+.. code-block:: python
+
+ class SomeClass1:
+ str = None
+
+The appropriate way to annotate this would be as follows
+
+.. code-block:: python
+
+ str_type = str
+
+ class SomeClass2:
+ str = None # type: str_type
+
+In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example
+
+.. code-block:: python
+
+ from typing import cast
+
+ from pandas.core.dtypes.common import is_number
+
+ def cannot_infer_bad(obj: Union[str, int, float]):
+
+ if is_number(obj):
+ ...
+ else: # Reasonably only str objects would reach this but...
+ obj = cast(str, obj) # Mypy complains without this!
+ return obj.upper()
+
+The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types, mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable, a refactor of the code to appease static analysis is preferable.
+
+.. code-block:: python
+
+ def cannot_infer_good(obj: Union[str, int, float]):
+
+ if isinstance(obj, str):
+ return obj.upper()
+ else:
+ ...
+
+With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths.
+
+Syntax Requirements
+~~~~~~~~~~~~~~~~~~~
+
+Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas:
+
+.. code-block:: python
+
+ primes = [] # type: List[int]
+
+Whereas this is **NOT** allowed:
+
+.. code-block:: python
+
+ primes: List[int] = [] # not supported in Python 3.5!
+
+Note that function signatures can always be annotated per :pep:`3107`:
+
+.. code-block:: python
+
+ def sum_of_primes(primes: List[int] = []) -> int:
+ ...
+
+
+Pandas-specific Types
+~~~~~~~~~~~~~~~~~~~~~
+
+Commonly used types specific to *pandas* will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas.
+
+For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64``, or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the ``pandas._typing`` module:
+
+.. code-block:: python
+
+ from pandas._typing import Dtype
+
+ def as_type(dtype: Dtype) -> ...:
+ ...
+
+This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc., and can also hold aliases for commonly appearing parameters like ``axis``. Development of this module is active, so be sure to refer to the source for the most up-to-date list of available types.
+
+Validating Type Hints
+~~~~~~~~~~~~~~~~~~~~~
+
+*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running
+
+.. code-block:: shell
+
+ mypy pandas
.. _contributing.ci:
-Testing With Continuous Integration
+Testing with continuous integration
-----------------------------------
The *pandas* test suite will run automatically on `Travis-CI `__ and
@@ -930,7 +1063,7 @@ options or subtle interactions to test (or think of!) all of them.
.. _contributing.warnings:
-Testing Warnings
+Testing warnings
~~~~~~~~~~~~~~~~
By default, one of pandas CI workers will fail if any unhandled warnings are emitted.
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index f7e2b42a1ccbd..34bc5f44eb0c0 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -522,7 +522,7 @@ examples:
* ``loc`` and ``iloc``, as they do the same, but in one case providing indices
and in the other positions
* ``max`` and ``min``, as they do the opposite
-* ``iterrows``, ``itertuples`` and ``iteritems``, as it is easy that a user
+* ``iterrows``, ``itertuples`` and ``items``, as it is easy that a user
looking for the method to iterate over columns ends up in the method to
iterate over rows, and vice-versa
* ``fillna`` and ``dropna``, as both methods are used to handle missing values
@@ -929,7 +929,7 @@ plot will be generated automatically when building the documentation.
.. _docstring.sharing:
-Sharing Docstrings
+Sharing docstrings
------------------
Pandas has a system for sharing docstrings, with slight variations, between
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index a283920ae4377..923ef005d5926 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -37,12 +37,19 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
.. code-block:: text
- {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
+ {'index_columns': [<descr0>, <descr1>, ...],
  'column_indexes': [<ci0>, <ci1>, ..., <ciN>],
  'columns': [<c0>, <c1>, ...],
- 'pandas_version': $VERSION}
+ 'pandas_version': $VERSION,
+ 'creator': {
+ 'library': $LIBRARY,
+ 'version': $LIBRARY_VERSION
+ }}
-Here, ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata
+The "descriptor" values ``<descr0>`` in the ``'index_columns'`` field are
+strings (referring to a column) or dictionaries with values as described below.
+
+The ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata
for each column, *including the index columns*. This has JSON form:
.. code-block:: text
@@ -53,26 +60,37 @@ for each column, *including the index columns*. This has JSON form:
'numpy_type': numpy_type,
'metadata': metadata}
-.. note::
+See below for the detailed specification for these.
+
+Index Metadata Descriptors
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``RangeIndex`` can be stored as metadata only, not requiring serialization. The
+descriptor format for these is as follows:
- Every index column is stored with a name matching the pattern
- ``__index_level_\d+__`` and its corresponding column information is can be
- found with the following code snippet.
+.. code-block:: python
- Following this naming convention isn't strictly necessary, but strongly
- suggested for compatibility with Arrow.
+ index = pd.RangeIndex(0, 10, 2)
+ {'kind': 'range',
+ 'name': index.name,
+ 'start': index.start,
+ 'stop': index.stop,
+ 'step': index.step}
- Here's an example of how the index metadata is structured in pyarrow:
+Other index types must be serialized as data columns along with the other
+DataFrame columns. The metadata for these is a string indicating the name of
+the field in the data columns, for example ``'__index_level_0__'``.
- .. code-block:: python
+If an index has a non-None ``name`` attribute, and there is no other column
+with a name matching that value, then the ``index.name`` value can be used as
+the descriptor. Otherwise (for unnamed indexes and ones with names colliding
+with other column names) a disambiguating name with pattern matching
+``__index_level_\d+__`` should be used. In cases of named indexes as data
+columns, the ``name`` attribute is always stored in the column descriptors as
+above.
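+
+As a purely illustrative example (not part of the specification itself), a frame
+whose index is named ``'date'`` and does not collide with any column name could
+use the index name directly as the descriptor, while an unnamed index falls back
+to the disambiguating pattern:
+
+.. code-block:: text
+
+   {'index_columns': ['date'], ...}                 # named, non-colliding index
+   {'index_columns': ['__index_level_0__'], ...}    # unnamed index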
- # assuming there's at least 3 levels in the index
- index_columns = metadata['index_columns'] # noqa: F821
- columns = metadata['columns'] # noqa: F821
- ith_index = 2
- assert index_columns[ith_index] == '__index_level_2__'
- ith_index_info = columns[-len(index_columns):][ith_index]
- ith_index_level_name = ith_index_info['name']
+Column Metadata
+~~~~~~~~~~~~~~~
``pandas_type`` is the logical type of the column, and is one of:
@@ -161,4 +179,8 @@ As an example of fully-formed metadata:
'numpy_type': 'int64',
'metadata': None}
],
- 'pandas_version': '0.20.0'}
+ 'pandas_version': '0.20.0',
+ 'creator': {
+ 'library': 'pyarrow',
+ 'version': '0.13.0'
+ }}
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 8bee0452c2207..e341dcb8318bc 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -3,7 +3,7 @@
{{ header }}
****************
-Extending Pandas
+Extending pandas
****************
While pandas provides a rich set of methods, containers, and data types, your
@@ -12,7 +12,7 @@ pandas.
.. _extending.register-accessors:
-Registering Custom Accessors
+Registering custom accessors
----------------------------
Libraries can use the decorators
@@ -70,7 +70,7 @@ applies only to certain dtypes.
.. _extending.extension-types:
-Extension Types
+Extension types
---------------
.. versionadded:: 0.23.0
@@ -208,9 +208,28 @@ will
2. call ``result = op(values, ExtensionArray)``
3. re-box the result in a ``Series``
+.. _extending.extension.ufunc:
+
+NumPy Universal Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Series` implements ``__array_ufunc__``. As part of the implementation,
+pandas unboxes the ``ExtensionArray`` from the :class:`Series`, applies the ufunc,
+and re-boxes it if necessary.
+
+If applicable, we highly recommend that you implement ``__array_ufunc__`` in your
+extension array to avoid coercion to an ndarray. See
+`the numpy documentation `__
+for an example.
+
+As part of your implementation, we require that you defer to pandas when a pandas
+container (:class:`Series`, :class:`DataFrame`, :class:`Index`) is detected in ``inputs``.
+If any of those is present, you should return ``NotImplemented``. Pandas will take care of
+unboxing the array from the container and re-calling the ufunc with the unwrapped input.
+
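+A minimal sketch of that deferral check (``MyArray`` is a hypothetical extension
+array; only the dispatch logic is shown, not a complete implementation):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+   from pandas.api.extensions import ExtensionArray
+
+   class MyArray(ExtensionArray):
+       def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+           # Defer to pandas containers: pandas will unbox the array and
+           # re-call the ufunc with the unwrapped inputs.
+           if any(isinstance(x, (pd.Series, pd.DataFrame, pd.Index)) for x in inputs):
+               return NotImplemented
+           # Otherwise operate on plain ndarrays (the details here are
+           # implementation-specific for a real extension array).
+           arrays = [np.asarray(x) if isinstance(x, MyArray) else x for x in inputs]
+           return getattr(ufunc, method)(*arrays, **kwargs)
+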
.. _extending.extension.testing:
-Testing Extension Arrays
+Testing extension arrays
^^^^^^^^^^^^^^^^^^^^^^^^
We provide a test suite for ensuring that your extension arrays satisfy the expected
@@ -238,7 +257,7 @@ for a list of all the tests available.
.. _extending.subclassing-pandas:
-Subclassing pandas Data Structures
+Subclassing pandas data structures
----------------------------------
.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures.
@@ -260,7 +279,7 @@ This section describes how to subclass ``pandas`` data structures to meet more s
You can find a nice example in `geopandas `_ project.
-Override Constructor Properties
+Override constructor properties
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Each data structure has several *constructor properties* for returning a new
@@ -348,7 +367,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
>>> type(sliced2)
-Define Original Properties
+Define original properties
^^^^^^^^^^^^^^^^^^^^^^^^^^
To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways:
@@ -397,3 +416,47 @@ Below is an example to define two original properties, "internal_cache" as a tem
# properties defined in _metadata are retained
>>> df[['A', 'B']].added_property
property
+
+.. _extending.plotting-backends:
+
+Plotting backends
+-----------------
+
+Starting in 0.25, pandas can be extended with third-party plotting backends. The
+main idea is letting users select a plotting backend other than the provided
+one, which is based on Matplotlib. For example:
+
+.. code-block:: python
+
+ >>> pd.set_option('plotting.backend', 'backend.module')
+ >>> pd.Series([1, 2, 3]).plot()
+
+This would be more or less equivalent to:
+
+.. code-block:: python
+
+ >>> import backend.module
+ >>> backend.module.plot(pd.Series([1, 2, 3]))
+
+The backend module can then use other visualization tools (Bokeh, Altair,...)
+to generate the plots.
+
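+For illustration only (the authoritative description of the required interface
+lives in the module linked at the end of this section), a hypothetical backend
+module might expose a single ``plot`` entry point roughly along these lines:
+
+.. code-block:: python
+
+   # backend/module.py -- hypothetical third-party backend (illustrative only)
+
+   def plot(data, kind=None, **kwargs):
+       # ``data`` is the Series or DataFrame being plotted; dispatch on
+       # ``kind`` ('line', 'bar', ...) and hand the values to the plotting
+       # library of your choice (Bokeh, Altair, ...).
+       ...
+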
+Libraries implementing the plotting backend should use `entry points `__
+to make their backend discoverable to pandas. The key is ``"pandas_plotting_backends"``. For example, pandas
+registers the default "matplotlib" backend as follows.
+
+.. code-block:: python
+
+ # in setup.py
+ setup( # noqa: F821
+ ...,
+ entry_points={
+ "pandas_plotting_backends": [
+ "matplotlib = pandas:plotting._matplotlib",
+ ],
+ },
+ )
+
+
+More information on how to implement a third-party plotting backend can be found at
+https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index a149f31118ed5..c7710ff19f078 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -16,3 +16,4 @@ Development
internals
extending
developer
+ roadmap
diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst
index 9c434928c214e..748caae295460 100644
--- a/doc/source/development/internals.rst
+++ b/doc/source/development/internals.rst
@@ -102,7 +102,7 @@ So, for example, ``Series[category]._values`` is a ``Categorical``, while
.. _ref-subclassing-pandas:
-Subclassing pandas Data Structures
+Subclassing pandas data structures
----------------------------------
This section has been moved to :ref:`extending.subclassing-pandas`.
diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst
new file mode 100644
index 0000000000000..00598830e2fe9
--- /dev/null
+++ b/doc/source/development/roadmap.rst
@@ -0,0 +1,193 @@
+.. _roadmap:
+
+=======
+Roadmap
+=======
+
+This page provides an overview of the major themes in pandas' development. Each of
+these items requires a relatively large amount of effort to implement. These may
+be achieved more quickly with dedicated funding or interest from contributors.
+
+An item being on the roadmap does not mean that it will *necessarily* happen, even
+with unlimited funding. During the implementation period we may discover issues
+preventing the adoption of the feature.
+
+Additionally, an item *not* being on the roadmap does not exclude it from inclusion
+in pandas. The roadmap is intended for larger, fundamental changes to the project that
+are likely to take months or years of developer time. Smaller-scoped items will continue
+to be tracked on our `issue tracker `__.
+
+See :ref:`roadmap.evolution` for proposing changes to this document.
+
+Extensibility
+-------------
+
+Pandas :ref:`extending.extension-types` allow for extending NumPy types with custom
+data types and array storage. Pandas uses extension types internally, and provides
+an interface for 3rd-party libraries to define their own custom data types.
+
+Many parts of pandas still unintentionally convert data to a NumPy array.
+These problems are especially pronounced for nested data.
+
+We'd like to improve the handling of extension arrays throughout the library,
+making their behavior more consistent with the handling of NumPy arrays. We'll do this
+by cleaning up pandas' internals and adding new methods to the extension array interface.
+
+String data type
+----------------
+
+Currently, pandas stores text data in an ``object`` -dtype NumPy array.
+The current implementation has two primary drawbacks. First, ``object`` -dtype
+is not specific to strings: any Python object can be stored in an ``object`` -dtype
+array, not just strings. Second, it is not efficient: the NumPy memory model
+isn't especially well-suited to variable-width text data.
+
+To solve the first issue, we propose a new extension type for string data. This
+will initially be opt-in, with users explicitly requesting ``dtype="string"``.
+The array backing this string dtype may initially be the current implementation:
+an ``object`` -dtype NumPy array of Python strings.
+
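+A sketch of the proposed opt-in usage (shown for illustration; not implemented
+at the time of writing):
+
+.. code-block:: python
+
+   >>> pd.Series(["pandas", "arrow", None], dtype="string")
+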
+To solve the second issue (performance), we'll explore alternative in-memory
+array libraries (for example, Apache Arrow). As part of the work, we may
+need to implement certain operations expected by pandas users (for example
+the algorithm used in, ``Series.str.upper``). That work may be done outside of
+pandas.
+
+Apache Arrow interoperability
+-----------------------------
+
+`Apache Arrow <https://arrow.apache.org/>`__ is a cross-language development
+platform for in-memory data. The Arrow logical types are closely aligned with
+typical pandas use cases.
+
+We'd like to provide better-integrated support for Arrow memory and data types
+within pandas. This will let us take advantage of its I/O capabilities and
+provide for better interoperability with other languages and libraries
+using Arrow.
+
+Block manager rewrite
+---------------------
+
+We'd like to replace pandas' current internal data structures (a collection of
+1- or 2-D arrays) with a simpler collection of 1-D arrays.
+
+Pandas' internal data model is quite complex. A DataFrame is made up of
+one or more 2-dimensional "blocks", with one or more blocks per dtype. This
+collection of 2-D arrays is managed by the BlockManager.
+
+The primary benefit of the BlockManager is improved performance on certain
+operations (construction from a 2D array, binary operations, reductions across the columns),
+especially for wide DataFrames. However, the BlockManager substantially increases the
+complexity and maintenance burden of pandas.
+
+By replacing the BlockManager we hope to achieve
+
+* Substantially simpler code
+* Easier extensibility with new logical types
+* Better user control over memory use and layout
+* Improved micro-performance
+* Option to provide a C / Cython API to pandas' internals
+
+See `these design documents `__
+for more.
+
+Decoupling of indexing and internals
+------------------------------------
+
+The code for getting and setting values in pandas' data structures needs refactoring.
+In particular, we must clearly separate code that converts keys (e.g., the argument
+to ``DataFrame.loc``) to positions from code that uses these positions to get
+or set values. This is related to the proposed BlockManager rewrite. Currently, the
+BlockManager sometimes uses label-based, rather than position-based, indexing.
+We propose that it should only work with positional indexing, and the translation of keys
+to positions should be entirely done at a higher level.
+
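+To make the distinction concrete, label-based access can be thought of as a
+key-to-position translation followed by purely positional access (a sketch
+using public APIs, for illustration only):
+
+.. code-block:: python
+
+   df = pd.DataFrame({"a": [10, 20, 30]}, index=["x", "y", "z"])
+
+   df.loc["y", "a"]              # label-based access ...
+
+   row = df.index.get_loc("y")   # ... is conceptually: translate keys to
+   col = df.columns.get_loc("a") # positions at a higher level, then do
+   df.iloc[row, col]             # positional access
+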
+Indexing is a complicated API with many subtleties. This refactor will require care
+and attention. More details are discussed at
+https://github.com/pandas-dev/pandas/wiki/(Tentative)-rules-for-restructuring-indexing-code
+
+Numba-accelerated operations
+----------------------------
+
+`Numba <https://numba.pydata.org/>`__ is a JIT compiler for Python code. We'd like to provide
+ways for users to apply their own Numba-jitted functions where pandas accepts user-defined functions
+(for example, :meth:`Series.apply`, :meth:`DataFrame.apply`, :meth:`DataFrame.applymap`,
+and in groupby and window contexts). This will improve the performance of
+user-defined functions in these operations by staying within compiled code.
+
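+As a sketch of the kind of user-defined function this theme targets (the
+function and data sizes here are made up):
+
+.. code-block:: python
+
+   import numba
+   import numpy as np
+   import pandas as pd
+
+   @numba.njit
+   def clipped_square(x):
+       # a scalar kernel compiled by Numba
+       return min(x * x, 100.0)
+
+   ser = pd.Series(np.random.randn(10_000))
+   # This works today, but each element still round-trips through Python.
+   ser.apply(clipped_square)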
+
+Documentation improvements
+--------------------------
+
+We'd like to improve the content, structure, and presentation of the pandas documentation.
+Some specific goals include
+
+* Overhaul the HTML theme with a modern, responsive design (:issue:`15556`)
+* Improve the "Getting Started" documentation, designing and writing learning paths
+ for users of different backgrounds (e.g. brand new to programming, familiar with
+ other languages like R, already familiar with Python).
+* Improve the overall organization of the documentation and specific subsections
+ of the documentation to make navigation and finding content easier.
+
+Package docstring validation
+----------------------------
+
+To improve the quality and consistency of pandas docstrings, we've developed
+tooling to check docstrings in a variety of ways.
+https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py
+contains the checks.
+
+Like many other projects, pandas uses the
+`numpydoc `__ style for writing
+docstrings. With the collaboration of the numpydoc maintainers, we'd like to
+move the checks to a package other than pandas so that other projects can easily
+use them as well.
+
+Performance monitoring
+----------------------
+
+Pandas uses `airspeed velocity `__ to
+monitor for performance regressions. ASV itself is a fabulous tool, but requires
+some additional work to be integrated into an open source project's workflow.
+
+The `asv-runner `__ organization, currently made up
+of pandas maintainers, provides tools built on top of ASV. We have a physical
+machine for running a number of projects' benchmarks, and tools for managing the
+benchmark runs and reporting on results.
+
+We'd like to fund improvements and maintenance of these tools to
+
+* Be more stable. Currently, they're maintained on the nights and weekends when
+ a maintainer has free time.
+* Tune the system for benchmarks to improve stability, following
+ https://pyperf.readthedocs.io/en/latest/system.html
+* Build a GitHub bot to request ASV runs *before* a PR is merged. Currently, the
+ benchmarks are only run nightly.
+
+.. _roadmap.evolution:
+
+Roadmap evolution
+-----------------
+
+Pandas continues to evolve. The direction is primarily determined by community
+interest. Everyone is welcome to review existing items on the roadmap and
+to propose a new item.
+
+Each item on the roadmap should be a short summary of a larger design proposal.
+The proposal should include
+
+1. Short summary of the changes, which would be appropriate for inclusion in
+ the roadmap if accepted.
+2. Motivation for the changes.
+3. An explanation of why the change is in scope for pandas.
+4. Detailed design: Preferably with example usage (even if not implemented yet)
+   and API documentation.
+5. API change: Any API changes that may result from the proposal.
+
+That proposal may then be submitted as a GitHub issue, where the pandas maintainers
+can review and comment on the design. The `pandas mailing list `__
+should be notified of the proposal.
+
+When there's agreement that an implementation
+would be welcome, the roadmap should be updated to include the summary and a
+link to the discussion issue.
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index b1a5430752558..b1e3d8dc8a1ad 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -3,7 +3,7 @@
{{ header }}
****************
-pandas Ecosystem
+Pandas ecosystem
****************
Increasingly, packages are being built on top of pandas to address specific needs
@@ -26,7 +26,7 @@ substantial projects that you feel should be on this list, please let us know.
.. _ecosystem.stats:
-Statistics and Machine Learning
+Statistics and machine learning
-------------------------------
`Statsmodels `__
@@ -72,6 +72,17 @@ the latest web technologies. Its goal is to provide elegant, concise constructio
graphics in the style of Protovis/D3, while delivering high-performance interactivity over
large data to thin clients.
+`Pandas-Bokeh `__ provides a high-level API
+for Bokeh that can be loaded as a native pandas plotting backend via
+
+.. code:: python
+
+ pd.set_option("plotting.backend", "pandas_bokeh")
+
+It is very similar to the matplotlib plotting backend, but provides interactive
+web-based charts and maps.
+
+
`seaborn `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -243,7 +254,7 @@ you can obtain for free on the FRED website.
.. _ecosystem.domain:
-Domain Specific
+Domain specific
---------------
`Geopandas `__
@@ -332,7 +343,7 @@ and check that they're *actually* true.
.. _ecosystem.extensions:
-Extension Data Types
+Extension data types
--------------------
Pandas provides an interface for defining
diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst
index fdf1f05b8e61f..41520795bde62 100644
--- a/doc/source/getting_started/10min.rst
+++ b/doc/source/getting_started/10min.rst
@@ -3,7 +3,7 @@
{{ header }}
********************
-10 Minutes to pandas
+10 minutes to pandas
********************
This is a short introduction to pandas, geared mainly for new users.
@@ -16,7 +16,7 @@ Customarily, we import as follows:
import numpy as np
import pandas as pd
-Object Creation
+Object creation
---------------
See the :ref:`Data Structure Intro section `.
@@ -83,7 +83,7 @@ As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
tab completed. ``E`` is there as well; the rest of the attributes have been
truncated for brevity.
-Viewing Data
+Viewing data
------------
See the :ref:`Basics section `.
@@ -183,7 +183,7 @@ Selecting via ``[]``, which slices the rows.
df[0:3]
df['20130102':'20130104']
-Selection by Label
+Selection by label
~~~~~~~~~~~~~~~~~~
See more in :ref:`Selection by Label `.
@@ -224,7 +224,7 @@ For getting fast access to a scalar (equivalent to the prior method):
df.at[dates[0], 'A']
-Selection by Position
+Selection by position
~~~~~~~~~~~~~~~~~~~~~
See more in :ref:`Selection by Position `.
@@ -271,14 +271,14 @@ For getting fast access to a scalar (equivalent to the prior method):
df.iat[1, 1]
-Boolean Indexing
+Boolean indexing
~~~~~~~~~~~~~~~~
Using a single column's values to select data.
.. ipython:: python
- df[df.A > 0]
+ df[df['A'] > 0]
Selecting values from a DataFrame where a boolean condition is met.
@@ -340,7 +340,7 @@ A ``where`` operation with setting.
df2
-Missing Data
+Missing data
------------
pandas primarily uses the value ``np.nan`` to represent missing data. It is by
@@ -468,6 +468,13 @@ Concatenating pandas objects together with :func:`concat`:
pd.concat(pieces)
+.. note::
+ Adding a column to a ``DataFrame`` is relatively fast. However, adding
+ a row requires a copy, and may be expensive. We recommend passing a
+ pre-built list of records to the ``DataFrame`` constructor instead
+ of building a ``DataFrame`` by iteratively appending records to it.
+ See :ref:`Appending to dataframe ` for more.
+
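+For example, constructing a ``DataFrame`` from a pre-built list of records in
+one step (a small, illustrative example):
+
+.. ipython:: python
+
+   records = [{'A': i, 'B': i ** 2} for i in range(3)]
+   pd.DataFrame(records)
+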
Join
~~~~
@@ -491,21 +498,6 @@ Another example that can be given is:
right
pd.merge(left, right, on='key')
-
-Append
-~~~~~~
-
-Append rows to a dataframe. See the :ref:`Appending `
-section.
-
-.. ipython:: python
-
- df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
- df
- s = df.iloc[3]
- df.append(s, ignore_index=True)
-
-
Grouping
--------
@@ -580,7 +572,7 @@ With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the
stacked.unstack(1)
stacked.unstack(0)
-Pivot Tables
+Pivot tables
~~~~~~~~~~~~
See the section on :ref:`Pivot Tables `.
@@ -600,7 +592,7 @@ We can produce pivot tables from this data very easily:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
-Time Series
+Time series
-----------
pandas has simple, powerful, and efficient functionality for performing
@@ -734,7 +726,7 @@ of the columns with labels:
@savefig frame_plot_basic.png
plt.legend(loc='best')
-Getting Data In/Out
+Getting data in/out
-------------------
CSV
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 5ec0094de0a91..802ffadf2a81e 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -3,7 +3,7 @@
{{ header }}
==============================
- Essential Basic Functionality
+ Essential basic functionality
==============================
Here we discuss a lot of the essential functionality common to the pandas data
@@ -19,7 +19,7 @@ the previous section:
.. _basics.head_tail:
-Head and Tail
+Head and tail
-------------
To view a small sample of a Series or DataFrame object, use the
@@ -34,7 +34,7 @@ of elements to display is five, but you may pass a custom number.
.. _basics.attrs:
-Attributes and Underlying Data
+Attributes and underlying data
------------------------------
pandas objects have a number of attributes enabling you to access the metadata
@@ -286,7 +286,7 @@ using ``fillna`` if you wish).
.. _basics.compare:
-Flexible Comparisons
+Flexible comparisons
~~~~~~~~~~~~~~~~~~~~
Series and DataFrame have the binary comparison methods ``eq``, ``ne``, ``lt``, ``gt``,
@@ -304,7 +304,7 @@ indexing operations, see the section on :ref:`Boolean indexing
.. _basics.reductions:
-Boolean Reductions
+Boolean reductions
~~~~~~~~~~~~~~~~~~
You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`,
@@ -468,7 +468,7 @@ which we illustrate:
df2
df1.combine_first(df2)
-General DataFrame Combine
+General DataFrame combine
~~~~~~~~~~~~~~~~~~~~~~~~~
The :meth:`~DataFrame.combine_first` method above calls the more general
@@ -643,7 +643,7 @@ there for details about accepted inputs.
.. _basics.idxmin:
-Index of Min/Max Values
+Index of min/max values
~~~~~~~~~~~~~~~~~~~~~~~
The :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` functions on Series
@@ -677,7 +677,7 @@ matching index:
.. _basics.discretization:
-Value counts (histogramming) / Mode
+Value counts (histogramming) / mode
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The :meth:`~Series.value_counts` Series method and top-level function computes a histogram
@@ -752,7 +752,7 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise.
.. _basics.pipe:
-Tablewise Function Application
+Tablewise function application
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``DataFrames`` and ``Series`` can of course just be passed into functions.
@@ -784,6 +784,7 @@ In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``.
For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.ols, 'data')`` to ``pipe``:
.. ipython:: python
+ :okwarning:
import statsmodels.formula.api as sm
@@ -806,7 +807,7 @@ We encourage you to view the source code of :meth:`~DataFrame.pipe`.
.. _R: https://www.r-project.org
-Row or Column-wise Function Application
+Row or column-wise function application
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Arbitrary functions can be applied along the axes of a DataFrame
@@ -925,7 +926,7 @@ Single aggregations on a ``Series`` this will return a scalar value:
.. ipython:: python
- tsdf.A.agg('sum')
+ tsdf['A'].agg('sum')
Aggregating with multiple functions
@@ -949,13 +950,13 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function
.. ipython:: python
- tsdf.A.agg(['sum', 'mean'])
+ tsdf['A'].agg(['sum', 'mean'])
Passing a ``lambda`` function will yield a ```` named row:
.. ipython:: python
- tsdf.A.agg(['sum', lambda x: x.mean()])
+ tsdf['A'].agg(['sum', lambda x: x.mean()])
Passing a named function will yield that name for the row:
@@ -964,7 +965,7 @@ Passing a named function will yield that name for the row:
def mymean(x):
return x.mean()
- tsdf.A.agg(['sum', mymean])
+ tsdf['A'].agg(['sum', mymean])
Aggregating with a dict
+++++++++++++++++++++++
@@ -987,7 +988,7 @@ not noted for a particular column will be ``NaN``:
.. _basics.aggregation.mixed_dtypes:
-Mixed Dtypes
+Mixed dtypes
++++++++++++
When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
@@ -1064,7 +1065,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin
.. ipython:: python
- tsdf.A.transform(np.abs)
+ tsdf['A'].transform(np.abs)
Transform with multiple functions
@@ -1083,7 +1084,7 @@ resulting column names will be the transforming functions.
.. ipython:: python
- tsdf.A.transform([np.abs, lambda x: x + 1])
+ tsdf['A'].transform([np.abs, lambda x: x + 1])
Transforming with a dict
@@ -1106,7 +1107,7 @@ selective transforms.
.. _basics.elementwise:
-Applying Elementwise Functions
+Applying elementwise functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since not all functions can be vectorized (accept NumPy arrays and return
@@ -1421,8 +1422,6 @@ The :meth:`~DataFrame.rename` method also provides an ``inplace`` named
parameter that is by default ``False`` and copies the underlying data. Pass
``inplace=True`` to rename the data in place.
-.. versionadded:: 0.18.0
-
Finally, :meth:`~Series.rename` also accepts a scalar or list-like
for altering the ``Series.name`` attribute.
@@ -1474,7 +1473,7 @@ Thus, for example, iterating over a DataFrame gives you the column names:
print(col)
-Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to
+Pandas objects also have the dict-like :meth:`~DataFrame.items` method to
iterate over the (key, value) pairs.
To iterate over the rows of a DataFrame, you can use the following methods:
@@ -1523,10 +1522,10 @@ To iterate over the rows of a DataFrame, you can use the following methods:
df
-iteritems
-~~~~~~~~~
+items
+~~~~~
-Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates
+Consistent with the dict-like interface, :meth:`~DataFrame.items` iterates
through key-value pairs:
* **Series**: (index, scalar value) pairs
@@ -1536,7 +1535,7 @@ For example:
.. ipython:: python
- for label, ser in df.iteritems():
+ for label, ser in df.items():
print(label)
print(ser)
@@ -1726,7 +1725,7 @@ sorting by column values, and sorting by a combination of both.
.. _basics.sort_index:
-By Index
+By index
~~~~~~~~
The :meth:`Series.sort_index` and :meth:`DataFrame.sort_index` methods are
@@ -1753,7 +1752,7 @@ used to sort a pandas object by its index levels.
.. _basics.sort_values:
-By Values
+By values
~~~~~~~~~
The :meth:`Series.sort_values` method is used to sort a `Series` by its values. The
@@ -1785,7 +1784,7 @@ argument:
.. _basics.sort_indexes_and_values:
-By Indexes and Values
+By indexes and values
~~~~~~~~~~~~~~~~~~~~~
.. versionadded:: 0.23.0
@@ -1968,11 +1967,11 @@ dtype of the column will be chosen to accommodate all of the data types
pd.Series([1, 2, 3, 6., 'foo'])
The number of columns of each type in a ``DataFrame`` can be found by calling
-:meth:`~DataFrame.get_dtype_counts`.
+``DataFrame.dtypes.value_counts()``.
.. ipython:: python
- dft.get_dtype_counts()
+ dft.dtypes.value_counts()
Numeric dtypes will propagate and can coexist in DataFrames.
If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``,
@@ -2062,8 +2061,6 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`.
dft
dft.dtypes
-.. versionadded:: 0.19.0
-
Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFrame.astype`.
.. ipython:: python
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index 2957430666b8a..f67f46fc2b29b 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -26,7 +26,7 @@ use HDF5 files, see :ref:`io.external_compatibility` for an
example.
-Quick Reference
+Quick reference
---------------
We'll start off with a quick reference guide pairing some common R
@@ -35,7 +35,7 @@ operations using `dplyr
pandas equivalents.
-Querying, Filtering, Sampling
+Querying, filtering, sampling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
=========================================== ===========================================
@@ -81,11 +81,11 @@ R pandas
=========================================== ===========================================
``select(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})['col_one']``
``rename(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})``
-``mutate(df, c=a-b)`` ``df.assign(c=df.a-df.b)``
+``mutate(df, c=a-b)`` ``df.assign(c=df['a']-df['b'])``
=========================================== ===========================================
-Grouping and Summarizing
+Grouping and summarizing
~~~~~~~~~~~~~~~~~~~~~~~~
============================================== ===========================================
@@ -258,8 +258,8 @@ index/slice as well as standard boolean indexing:
df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
df.query('a <= b')
- df[df.a <= df.b]
- df.loc[df.a <= df.b]
+ df[df['a'] <= df['b']]
+ df.loc[df['a'] <= df['b']]
For more details and examples see :ref:`the query documentation
`.
@@ -284,7 +284,7 @@ In ``pandas`` the equivalent expression, using the
df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
df.eval('a + b')
- df.a + df.b # same as the previous expression
+ df['a'] + df['b'] # same as the previous expression
In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than
evaluation in pure Python. For more details and examples see :ref:`the eval
diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst
index fc12c8524d3bf..69bb700c97b15 100644
--- a/doc/source/getting_started/comparison/comparison_with_sas.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sas.rst
@@ -31,10 +31,10 @@ As is customary, we import pandas and NumPy as follows:
proc print data=df(obs=5);
run;
-Data Structures
+Data structures
---------------
-General Terminology Translation
+General terminology translation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. csv-table::
@@ -78,10 +78,10 @@ see the :ref:`indexing documentation` for much more on how to use an
``Index`` effectively.
-Data Input / Output
+Data input / output
-------------------
-Constructing a DataFrame from Values
+Constructing a DataFrame from values
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A SAS data set can be built from specified values by
@@ -110,7 +110,7 @@ and the values are the data.
df
-Reading External Data
+Reading external data
~~~~~~~~~~~~~~~~~~~~~
Like SAS, pandas provides utilities for reading in data from
@@ -151,7 +151,7 @@ In addition to text/csv, pandas supports a variety of other data formats
such as Excel, HDF5, and SQL databases. These are all read via a ``pd.read_*``
function. See the :ref:`IO documentation` for more details.
-Exporting Data
+Exporting data
~~~~~~~~~~~~~~
The inverse of ``PROC IMPORT`` in SAS is ``PROC EXPORT``
@@ -169,10 +169,10 @@ and other data formats follow a similar api.
tips.to_csv('tips2.csv')
-Data Operations
+Data operations
---------------
-Operations on Columns
+Operations on columns
~~~~~~~~~~~~~~~~~~~~~
In the ``DATA`` step, arbitrary math expressions can
@@ -228,7 +228,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin
tips[tips['total_bill'] > 10].head()
-If/Then Logic
+If/then logic
~~~~~~~~~~~~~
In SAS, if/then logic can be used to create new columns.
@@ -256,7 +256,7 @@ the ``where`` method from ``numpy``.
tips = tips.drop('bucket', axis=1)
-Date Functionality
+Date functionality
~~~~~~~~~~~~~~~~~~
SAS provides a variety of functions to do operations on
@@ -301,7 +301,7 @@ see the :ref:`timeseries documentation` for more details.
tips = tips.drop(['date1', 'date2', 'date1_year',
'date2_month', 'date1_next', 'months_between'], axis=1)
-Selection of Columns
+Selection of columns
~~~~~~~~~~~~~~~~~~~~
SAS provides keywords in the ``DATA`` step to select,
@@ -338,7 +338,7 @@ The same operations are expressed in pandas below.
tips.rename(columns={'total_bill': 'total_bill_2'}).head()
-Sorting by Values
+Sorting by values
~~~~~~~~~~~~~~~~~
Sorting in SAS is accomplished via ``PROC SORT``
@@ -358,7 +358,7 @@ takes a list of columns to sort by.
tips.head()
-String Processing
+String processing
-----------------
Length
@@ -466,7 +466,7 @@ approaches, but this just shows a simple approach.
firstlast
-Upcase, Lowcase, and Propcase
+Upcase, lowcase, and propcase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The SAS `UPCASE `__
@@ -555,7 +555,7 @@ types are accomplished via the ``how`` keyword.
outer_join
-Missing Data
+Missing data
------------
Like SAS, pandas has a representation for missing data - which is the
@@ -660,7 +660,7 @@ example, to subtract the mean for each observation by smoker group.
run;
-pandas ``groubpy`` provides a ``transform`` mechanism that allows
+pandas ``groupby`` provides a ``transform`` mechanism that allows
these type of operations to be succinctly expressed in one
operation.
@@ -671,7 +671,7 @@ operation.
tips.head()
-By Group Processing
+By group processing
~~~~~~~~~~~~~~~~~~~
In addition to aggregation, pandas ``groupby`` can be used to
@@ -701,7 +701,7 @@ In pandas this would be written as:
Other Considerations
--------------------
-Disk vs Memory
+Disk vs memory
~~~~~~~~~~~~~~
pandas operates exclusively in memory, where a SAS data set exists on disk.
@@ -713,7 +713,7 @@ If out of core processing is needed, one possibility is the
library (currently in development) which
provides a subset of pandas functionality for an on-disk ``DataFrame``
-Data Interop
+Data interop
~~~~~~~~~~~~
pandas provides a :func:`read_sas` method that can read SAS data saved in
diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst
index bf2b03176ecd8..db687386329bb 100644
--- a/doc/source/getting_started/comparison/comparison_with_stata.rst
+++ b/doc/source/getting_started/comparison/comparison_with_stata.rst
@@ -31,10 +31,10 @@ libraries as ``pd`` and ``np``, respectively, for the rest of the document.
list in 1/5
-Data Structures
+Data structures
---------------
-General Terminology Translation
+General terminology translation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. csv-table::
@@ -78,10 +78,10 @@ see the :ref:`indexing documentation` for much more on how to use an
``Index`` effectively.
-Data Input / Output
+Data input / output
-------------------
-Constructing a DataFrame from Values
+Constructing a DataFrame from values
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A Stata data set can be built from specified values by
@@ -107,7 +107,7 @@ and the values are the data.
df
-Reading External Data
+Reading external data
~~~~~~~~~~~~~~~~~~~~~
Like Stata, pandas provides utilities for reading in data from
@@ -155,7 +155,7 @@ such as Excel, SAS, HDF5, Parquet, and SQL databases. These are all read via a
function. See the :ref:`IO documentation` for more details.
-Exporting Data
+Exporting data
~~~~~~~~~~~~~~
The inverse of ``import delimited`` in Stata is ``export delimited``
@@ -177,10 +177,10 @@ Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata`
tips.to_stata('tips2.dta')
-Data Operations
+Data operations
---------------
-Operations on Columns
+Operations on columns
~~~~~~~~~~~~~~~~~~~~~
In Stata, arbitrary math expressions can be used with the ``generate`` and
@@ -222,7 +222,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin
tips[tips['total_bill'] > 10].head()
-If/Then Logic
+If/then logic
~~~~~~~~~~~~~
In Stata, an ``if`` clause can also be used to create new columns.
@@ -245,7 +245,7 @@ the ``where`` method from ``numpy``.
tips = tips.drop('bucket', axis=1)
-Date Functionality
+Date functionality
~~~~~~~~~~~~~~~~~~
Stata provides a variety of functions to do operations on
@@ -290,7 +290,7 @@ see the :ref:`timeseries documentation` for more details.
tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month',
'date1_next', 'months_between'], axis=1)
-Selection of Columns
+Selection of columns
~~~~~~~~~~~~~~~~~~~~
Stata provides keywords to select, drop, and rename columns.
@@ -319,7 +319,7 @@ to a variable.
tips.rename(columns={'total_bill': 'total_bill_2'}).head()
-Sorting by Values
+Sorting by values
~~~~~~~~~~~~~~~~~
Sorting in Stata is accomplished via ``sort``
@@ -337,10 +337,10 @@ takes a list of columns to sort by.
tips.head()
-String Processing
+String processing
-----------------
-Finding Length of String
+Finding length of string
~~~~~~~~~~~~~~~~~~~~~~~~
Stata determines the length of a character string with the :func:`strlen` and
@@ -361,7 +361,7 @@ Use ``len`` and ``rstrip`` to exclude trailing blanks.
tips['time'].str.rstrip().str.len().head()
-Finding Position of Substring
+Finding position of substring
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Stata determines the position of a character in a string with the :func:`strpos` function.
@@ -383,7 +383,7 @@ the function will return -1 if it fails to find the substring.
tips['sex'].str.find("ale").head()
-Extracting Substring by Position
+Extracting substring by position
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Stata extracts a substring from a string based on its position with the :func:`substr` function.
@@ -401,7 +401,7 @@ indexes are zero-based.
tips['sex'].str[0:1].head()
-Extracting nth Word
+Extracting nth word
~~~~~~~~~~~~~~~~~~~
The Stata :func:`word` function returns the nth word from a string.
@@ -431,7 +431,7 @@ approaches, but this just shows a simple approach.
firstlast
-Changing Case
+Changing case
~~~~~~~~~~~~~
The Stata :func:`strupper`, :func:`strlower`, :func:`strproper`,
@@ -547,7 +547,7 @@ types are accomplished via the ``how`` keyword.
outer_join
-Missing Data
+Missing data
------------
Like Stata, pandas has a representation for missing data -- the
@@ -634,7 +634,7 @@ For example, to subtract the mean for each observation by smoker group.
generate adj_total_bill = total_bill - group_bill
-pandas ``groubpy`` provides a ``transform`` mechanism that allows
+pandas ``groupby`` provides a ``transform`` mechanism that allows
these type of operations to be succinctly expressed in one
operation.
@@ -645,7 +645,7 @@ operation.
tips.head()
-By Group Processing
+By group processing
~~~~~~~~~~~~~~~~~~~
In addition to aggregation, pandas ``groupby`` can be used to
@@ -664,10 +664,10 @@ In pandas this would be written as:
tips.groupby(['sex', 'smoker']).first()
-Other Considerations
+Other considerations
--------------------
-Disk vs Memory
+Disk vs memory
~~~~~~~~~~~~~~
Pandas and Stata both operate exclusively in memory. This means that the size of
diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst
index 1abca7ac393dd..9e18951fe3f4c 100644
--- a/doc/source/getting_started/dsintro.rst
+++ b/doc/source/getting_started/dsintro.rst
@@ -3,7 +3,7 @@
{{ header }}
************************
-Intro to Data Structures
+Intro to data structures
************************
We'll start with a quick, non-comprehensive overview of the fundamental data
@@ -251,8 +251,6 @@ Series can also have a ``name`` attribute:
The Series ``name`` will be assigned automatically in many cases, in particular
when taking 1D slices of DataFrame as you will see below.
-.. versionadded:: 0.18.0
-
You can rename a Series with the :meth:`pandas.Series.rename` method.
.. ipython:: python
@@ -399,7 +397,7 @@ The result will be a DataFrame with the same index as the input Series, and
with one column whose name is the original name of the Series (only if no other
column name provided).
-**Missing Data**
+**Missing data**
Much more will be said on this topic in the :ref:`Missing data `
section. To construct a DataFrame with missing data, we use ``np.nan`` to
@@ -407,7 +405,7 @@ represent missing values. Alternatively, you may pass a ``numpy.MaskedArray``
as the data argument to the DataFrame constructor, and its masked entries will
be considered missing.
-Alternate Constructors
+Alternate constructors
~~~~~~~~~~~~~~~~~~~~~~
.. _basics.dataframe.from_dict:
@@ -498,7 +496,7 @@ available to insert at a particular location in the columns:
.. _dsintro.chained_assignment:
-Assigning New Columns in Method Chains
+Assigning new columns in method chains
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Inspired by `dplyr's
@@ -614,7 +612,7 @@ To write code compatible with all versions of Python, split the assignment in tw
-Indexing / Selection
+Indexing / selection
~~~~~~~~~~~~~~~~~~~~
The basics of indexing are as follows:
@@ -731,28 +729,67 @@ DataFrame interoperability with NumPy functions
.. _dsintro.numpy_interop:
Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions
-can be used with no issues on DataFrame, assuming the data within are numeric:
+can be used with no issues on Series and DataFrame, assuming the data within
+are numeric:
.. ipython:: python
np.exp(df)
np.asarray(df)
-The dot method on DataFrame implements matrix multiplication:
+DataFrame is not intended to be a drop-in replacement for ndarray as its
+indexing semantics and data model are quite different in places from an n-dimensional
+array.
+
+:class:`Series` implements ``__array_ufunc__``, which allows it to work with NumPy's
+`universal functions `_.
+
+The ufunc is applied to the underlying array in a Series.
.. ipython:: python
- df.T.dot(df)
+ ser = pd.Series([1, 2, 3, 4])
+ np.exp(ser)
-Similarly, the dot method on Series implements dot product:
+.. versionchanged:: 0.25.0
+
+ When multiple ``Series`` are passed to a ufunc, they are aligned before
+ performing the operation.
+
+Like other parts of the library, pandas will automatically align labeled inputs
+as part of a ufunc with multiple inputs. For example, using :meth:`numpy.remainder`
+on two :class:`Series` with differently ordered labels will align before the operation.
.. ipython:: python
- s1 = pd.Series(np.arange(5, 10))
- s1.dot(s1)
+ ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
+ ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c'])
+ ser1
+ ser2
+ np.remainder(ser1, ser2)
-DataFrame is not intended to be a drop-in replacement for ndarray as its
-indexing semantics are quite different in places from a matrix.
+As usual, the union of the two indices is taken, and non-overlapping values are filled
+with missing values.
+
+.. ipython:: python
+
+ ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd'])
+ ser3
+ np.remainder(ser1, ser3)
+
+When a binary ufunc is applied to a :class:`Series` and :class:`Index`, the Series
+implementation takes precedence and a Series is returned.
+
+.. ipython:: python
+
+ ser = pd.Series([1, 2, 3])
+ idx = pd.Index([4, 5, 6])
+
+ np.maximum(ser, idx)
+
+NumPy ufuncs are safe to apply to :class:`Series` backed by non-ndarray arrays,
+for example :class:`SparseArray` (see :ref:`sparse.calculation`). If possible,
+the ufunc is applied without converting the underlying data to an ndarray.
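+
+For example, applying a ufunc to a :class:`Series` backed by a
+:class:`SparseArray` (the values here are illustrative):
+
+.. ipython:: python
+
+   ser = pd.Series(pd.SparseArray([-1.0, 0.0, 1.0]))
+   ser
+   np.abs(ser)
+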
Console display
~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst
index b531f686951fc..ec76c60f24257 100644
--- a/doc/source/getting_started/overview.rst
+++ b/doc/source/getting_started/overview.rst
@@ -81,7 +81,7 @@ Some other notes
- pandas has been used extensively in production in financial applications.
-Data Structures
+Data structures
---------------
.. csv-table::
@@ -131,7 +131,7 @@ changed, but, for example, columns can be inserted into a DataFrame. However,
the vast majority of methods produce new objects and leave the input data
untouched. In general we like to **favor immutability** where sensible.
-Getting Support
+Getting support
---------------
The first stop for pandas issues and ideas is the `Github Issue Tracker
@@ -152,7 +152,7 @@ pandas is a `NumFOCUS `__ sponso
This will help ensure the success of development of pandas as a world-class open-source
project, and makes it possible to `donate `__ to the project.
-Project Governance
+Project governance
------------------
The governance process that pandas project has used informally since its inception in 2008 is formalized in `Project Governance documents `__.
@@ -160,13 +160,13 @@ The documents clarify how decisions are made and how the various elements of our
Wes McKinney is the Benevolent Dictator for Life (BDFL).
-Development Team
+Development team
-----------------
The list of the Core Team members and more detailed information can be found on the `people’s page `__ of the governance repo.
-Institutional Partners
+Institutional partners
----------------------
The information about current institutional partners can be found on `pandas website page `__.
diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst
index 8e23c643280c1..212f3636d0a98 100644
--- a/doc/source/getting_started/tutorials.rst
+++ b/doc/source/getting_started/tutorials.rst
@@ -8,7 +8,7 @@ Tutorials
This is a guide to many pandas tutorials, geared mainly for new users.
-Internal Guides
+Internal guides
===============
pandas' own :ref:`10 Minutes to pandas<10min>`.
@@ -17,7 +17,7 @@ More complex recipes are in the :ref:`Cookbook`.
A handy pandas `cheat sheet `_.
-Community Guides
+Community guides
================
pandas Cookbook by Julia Evans
@@ -74,7 +74,7 @@ Excel charts with pandas, vincent and xlsxwriter
* `Using Pandas and XlsxWriter to create Excel charts `_
-Video Tutorials
+Video tutorials
---------------
* `Pandas From The Ground Up `_
@@ -96,7 +96,7 @@ Video Tutorials
`Jupyter Notebook `__
-Various Tutorials
+Various tutorials
-----------------
* `Wes McKinney's (pandas BDFL) blog `_
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index b57ce83cfc33c..f5669626aa2b3 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library.
:hidden:
{% endif %}
{% if not single_doc %}
- What's New in 0.25.0
+ What's New in 1.0.0
install
getting_started/index
user_guide/index
@@ -53,7 +53,7 @@ See the :ref:`overview` for more detail about what's in the library.
whatsnew/index
{% endif %}
-* :doc:`whatsnew/v0.25.0`
+* :doc:`whatsnew/v1.0.0`
* :doc:`install`
* :doc:`getting_started/index`
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 1c1f0c1d4cf8e..fc99b458fa0af 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -15,35 +15,10 @@ Instructions for installing from source,
`PyPI `__, `ActivePython `__, various Linux distributions, or a
`development version `__ are also provided.
-.. _install.dropping-27:
-
-Plan for dropping Python 2.7
-----------------------------
-
-The Python core team plans to stop supporting Python 2.7 on January 1st, 2020.
-In line with `NumPy's plans`_, all pandas releases through December 31, 2018
-will support Python 2.
-
-The 0.24.x feature release will be the last release to
-support Python 2. The released package will continue to be available on
-PyPI and through conda.
-
- Starting **January 1, 2019**, all new feature releases (> 0.24) will be Python 3 only.
-
-If there are people interested in continued support for Python 2.7 past December
-31, 2018 (either backporting bug fixes or funding) please reach out to the
-maintainers on the issue tracker.
-
-For more information, see the `Python 3 statement`_ and the `Porting to Python 3 guide`_.
-
-.. _NumPy's plans: https://github.com/numpy/numpy/blob/master/doc/neps/nep-0014-dropping-python2.7-proposal.rst#plan-for-dropping-python-27-support
-.. _Python 3 statement: http://python3statement.org/
-.. _Porting to Python 3 guide: https://docs.python.org/3/howto/pyporting.html
-
Python version support
----------------------
-Officially Python 2.7, 3.5, 3.6, and 3.7.
+Officially Python 3.5.3 and above, 3.6, and 3.7.
Installing pandas
-----------------
@@ -236,7 +211,7 @@ Package Minimum support
.. _install.recommended_dependencies:
-Recommended Dependencies
+Recommended dependencies
~~~~~~~~~~~~~~~~~~~~~~~~
* `numexpr `__: for accelerating certain numerical operations.
@@ -255,7 +230,7 @@ Recommended Dependencies
.. _install.optional_dependencies:
-Optional Dependencies
+Optional dependencies
~~~~~~~~~~~~~~~~~~~~~
Pandas has many optional dependencies that are only used for specific methods.
@@ -276,16 +251,17 @@ SciPy 0.19.0 Miscellaneous statistical functions
XLsxWriter 0.9.8 Excel writing
blosc Compression for msgpack
fastparquet 0.2.1 Parquet reading / writing
-gcsfs 0.1.0 Google Cloud Storage access
+gcsfs 0.2.2 Google Cloud Storage access
html5lib HTML parser for read_html (see :ref:`note `)
-lxml HTML parser for read_html (see :ref:`note `)
+lxml 3.8.0 HTML parser for read_html (see :ref:`note `)
matplotlib 2.2.2 Visualization
openpyxl 2.4.8 Reading / writing for xlsx files
pandas-gbq 0.8.0 Google Big Query access
psycopg2 PostgreSQL engine for sqlalchemy
pyarrow 0.9.0 Parquet and feather reading / writing
-pymysql MySQL engine for sqlalchemy
+pymysql 0.7.11 MySQL engine for sqlalchemy
pyreadstat SPSS files (.sav) reading
+pytables 3.4.2 HDF5 reading / writing
qtpy Clipboard I/O
s3fs 0.0.8 Amazon S3 access
xarray 0.8.2 pandas-like API for N-dimensional data
@@ -298,7 +274,7 @@ zlib Compression for msgpack
.. _optional_html:
-Optional Dependencies for Parsing HTML
+Optional dependencies for parsing HTML
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
One of the following combinations of libraries is needed to use the
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 38406bf5b2656..7f464bf952bfb 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -3,7 +3,7 @@
.. _api.arrays:
=============
-Pandas Arrays
+Pandas arrays
=============
.. currentmodule:: pandas
@@ -37,7 +37,7 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra
.. _api.arrays.datetime:
-Datetime Data
+Datetime data
-------------
NumPy cannot natively represent timezone-aware datetimes. Pandas supports this
@@ -156,7 +156,7 @@ If the data are tz-aware, then every value in the array must have the same timez
.. _api.arrays.timedelta:
-Timedelta Data
+Timedelta data
--------------
NumPy can natively represent timedeltas. Pandas provides :class:`Timedelta`
@@ -211,7 +211,7 @@ A collection of timedeltas may be stored in a :class:`TimedeltaArray`.
.. _api.arrays.period:
-Timespan Data
+Timespan data
-------------
Pandas represents spans of times as :class:`Period` objects.
@@ -277,7 +277,7 @@ Every period in a ``PeriodArray`` must have the same ``freq``.
.. _api.arrays.interval:
-Interval Data
+Interval data
-------------
Arbitrary intervals can be represented as :class:`Interval` objects.
@@ -295,6 +295,7 @@ Properties
Interval.closed
Interval.closed_left
Interval.closed_right
+ Interval.is_empty
Interval.left
Interval.length
Interval.mid
@@ -331,10 +332,12 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`.
arrays.IntervalArray.closed
arrays.IntervalArray.mid
arrays.IntervalArray.length
+ arrays.IntervalArray.is_empty
arrays.IntervalArray.is_non_overlapping_monotonic
arrays.IntervalArray.from_arrays
arrays.IntervalArray.from_tuples
arrays.IntervalArray.from_breaks
+ arrays.IntervalArray.contains
arrays.IntervalArray.overlaps
arrays.IntervalArray.set_closed
arrays.IntervalArray.to_tuples
@@ -342,7 +345,7 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`.
.. _api.arrays.integer_na:
-Nullable Integer
+Nullable integer
----------------
:class:`numpy.ndarray` cannot natively represent integer-data with missing values.
@@ -369,7 +372,7 @@ Pandas provides this through :class:`arrays.IntegerArray`.
.. _api.arrays.categorical:
-Categorical Data
+Categorical data
----------------
Pandas defines a custom data type for representing data that can take only a
@@ -434,7 +437,7 @@ data. See :ref:`api.series.cat` for more.
.. _api.arrays.sparse:
-Sparse Data
+Sparse data
-----------
Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index 34f76642119c8..4b1a99da7cd4c 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -18,10 +18,44 @@ objects.
api.extensions.register_series_accessor
api.extensions.register_index_accessor
api.extensions.ExtensionDtype
- api.extensions.ExtensionArray
.. autosummary::
:toctree: api/
:template: autosummary/class_without_autosummary.rst
+ api.extensions.ExtensionArray
arrays.PandasArray
+
+.. We need this autosummary so that methods and attributes are generated.
+.. Separate block, since they aren't classes.
+
+ .. autosummary::
+ :toctree: api/
+
+ api.extensions.ExtensionArray._concat_same_type
+ api.extensions.ExtensionArray._formatter
+ api.extensions.ExtensionArray._from_factorized
+ api.extensions.ExtensionArray._from_sequence
+ api.extensions.ExtensionArray._from_sequence_of_strings
+ api.extensions.ExtensionArray._ndarray_values
+ api.extensions.ExtensionArray._reduce
+ api.extensions.ExtensionArray._values_for_argsort
+ api.extensions.ExtensionArray._values_for_factorize
+ api.extensions.ExtensionArray.argsort
+ api.extensions.ExtensionArray.astype
+ api.extensions.ExtensionArray.copy
+ api.extensions.ExtensionArray.view
+ api.extensions.ExtensionArray.dropna
+ api.extensions.ExtensionArray.factorize
+ api.extensions.ExtensionArray.fillna
+ api.extensions.ExtensionArray.isna
+ api.extensions.ExtensionArray.ravel
+ api.extensions.ExtensionArray.repeat
+ api.extensions.ExtensionArray.searchsorted
+ api.extensions.ExtensionArray.shift
+ api.extensions.ExtensionArray.take
+ api.extensions.ExtensionArray.unique
+ api.extensions.ExtensionArray.dtype
+ api.extensions.ExtensionArray.nbytes
+ api.extensions.ExtensionArray.ndim
+ api.extensions.ExtensionArray.shape
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index 7d5cd5d245631..b1c6172fb1261 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -67,8 +67,8 @@ Indexing, iteration
DataFrame.insert
DataFrame.__iter__
DataFrame.items
- DataFrame.keys
DataFrame.iteritems
+ DataFrame.keys
DataFrame.iterrows
DataFrame.itertuples
DataFrame.lookup
@@ -115,7 +115,7 @@ Binary operator functions
DataFrame.combine
DataFrame.combine_first
-Function application, GroupBy & Window
+Function application, GroupBy & window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -133,7 +133,7 @@ Function application, GroupBy & Window
.. _api.dataframe.stats:
-Computations / Descriptive Stats
+Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -177,7 +177,7 @@ Computations / Descriptive Stats
DataFrame.var
DataFrame.nunique
-Reindexing / Selection / Label manipulation
+Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -198,7 +198,6 @@ Reindexing / Selection / Label manipulation
DataFrame.idxmin
DataFrame.last
DataFrame.reindex
- DataFrame.reindex_axis
DataFrame.reindex_like
DataFrame.rename
DataFrame.rename_axis
@@ -240,6 +239,7 @@ Reshaping, sorting, transposing
DataFrame.unstack
DataFrame.swapaxes
DataFrame.melt
+ DataFrame.explode
DataFrame.squeeze
DataFrame.to_xarray
DataFrame.T
@@ -312,7 +312,7 @@ specific plotting methods of the form ``DataFrame.plot.``.
.. _api.frame.sparse:
-Sparse Accessor
+Sparse accessor
~~~~~~~~~~~~~~~
Sparse-dtype specific methods and attributes are provided under the
@@ -332,12 +332,11 @@ Sparse-dtype specific methods and attributes are provided under the
DataFrame.sparse.to_dense
-Serialization / IO / Conversion
+Serialization / IO / conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
- DataFrame.from_csv
DataFrame.from_dict
DataFrame.from_items
DataFrame.from_records
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index 5c8a563a47d00..921eb737aef07 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -40,7 +40,7 @@ Function application
GroupBy.transform
GroupBy.pipe
-Computations / Descriptive Stats
+Computations / descriptive stats
--------------------------------
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 31b493e472099..12ca318c815d3 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -3,7 +3,7 @@
.. _api:
=============
-API Reference
+API reference
=============
This page gives an overview of all public pandas objects, functions and
diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
index 8931caf394388..576f734d517aa 100644
--- a/doc/source/reference/indexing.rst
+++ b/doc/source/reference/indexing.rst
@@ -3,7 +3,7 @@
.. _api.indexing:
=============
-Index Objects
+Index objects
=============
Index
@@ -48,7 +48,7 @@ Properties
Index.T
Index.memory_usage
-Modifying and Computations
+Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -96,7 +96,7 @@ Compatibility with MultiIndex
Index.is_lexsorted_for_tuple
Index.droplevel
-Missing Values
+Missing values
~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -205,7 +205,7 @@ CategoricalIndex
CategoricalIndex
-Categorical Components
+Categorical components
~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -222,7 +222,7 @@ Categorical Components
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
-Modifying and Computations
+Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -240,7 +240,7 @@ IntervalIndex
IntervalIndex
-IntervalIndex Components
+IntervalIndex components
~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -248,18 +248,19 @@ IntervalIndex Components
IntervalIndex.from_arrays
IntervalIndex.from_tuples
IntervalIndex.from_breaks
- IntervalIndex.contains
IntervalIndex.left
IntervalIndex.right
IntervalIndex.mid
IntervalIndex.closed
IntervalIndex.length
IntervalIndex.values
+ IntervalIndex.is_empty
IntervalIndex.is_non_overlapping_monotonic
IntervalIndex.is_overlapping
IntervalIndex.get_loc
IntervalIndex.get_indexer
IntervalIndex.set_closed
+ IntervalIndex.contains
IntervalIndex.overlaps
IntervalIndex.to_tuples
@@ -278,7 +279,7 @@ MultiIndex
IndexSlice
-MultiIndex Constructors
+MultiIndex constructors
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -288,7 +289,7 @@ MultiIndex Constructors
MultiIndex.from_product
MultiIndex.from_frame
-MultiIndex Properties
+MultiIndex properties
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -299,7 +300,7 @@ MultiIndex Properties
MultiIndex.nlevels
MultiIndex.levshape
-MultiIndex Components
+MultiIndex components
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -316,7 +317,7 @@ MultiIndex Components
MultiIndex.reorder_levels
MultiIndex.remove_unused_levels
-MultiIndex Selecting
+MultiIndex selecting
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -336,7 +337,7 @@ DatetimeIndex
DatetimeIndex
-Time/Date Components
+Time/Date components
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 9c776e3ff8a82..91f4942d03b0d 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -3,7 +3,7 @@
.. _api.io:
============
-Input/Output
+Input/output
============
.. currentmodule:: pandas
@@ -14,7 +14,7 @@ Pickling
read_pickle
-Flat File
+Flat file
~~~~~~~~~
.. autosummary::
:toctree: api/
@@ -105,6 +105,13 @@ SAS
read_sas
+SPSS
+~~~~
+.. autosummary::
+ :toctree: api/
+
+ read_spss
+
SQL
~~~
.. autosummary::
diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst
index ccc1c7e171d22..4a58055f1c955 100644
--- a/doc/source/reference/offset_frequency.rst
+++ b/doc/source/reference/offset_frequency.rst
@@ -3,7 +3,7 @@
.. _api.dateoffsets:
============
-Date Offsets
+Date offsets
============
.. currentmodule:: pandas.tseries.offsets
diff --git a/doc/source/reference/resampling.rst b/doc/source/reference/resampling.rst
index 2a52defa3c68f..57263139d9c18 100644
--- a/doc/source/reference/resampling.rst
+++ b/doc/source/reference/resampling.rst
@@ -43,7 +43,7 @@ Upsampling
Resampler.asfreq
Resampler.interpolate
-Computations / Descriptive Stats
+Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index a061f696f4b30..7ba625c141f24 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -76,8 +76,8 @@ Indexing, iteration
Series.loc
Series.iloc
Series.__iter__
- Series.iteritems
Series.items
+ Series.iteritems
Series.keys
Series.pop
Series.item
@@ -119,7 +119,7 @@ Binary operator functions
Series.product
Series.dot
-Function application, GroupBy & Window
+Function application, groupby & window
--------------------------------------
.. autosummary::
:toctree: api/
@@ -137,7 +137,7 @@ Function application, GroupBy & Window
.. _api.series.stats:
-Computations / Descriptive Stats
+Computations / descriptive stats
--------------------------------
.. autosummary::
:toctree: api/
@@ -188,7 +188,7 @@ Computations / Descriptive Stats
Series.value_counts
Series.compound
-Reindexing / Selection / Label manipulation
+Reindexing / selection / label manipulation
-------------------------------------------
.. autosummary::
:toctree: api/
@@ -245,6 +245,7 @@ Reshaping, sorting
Series.sort_index
Series.swaplevel
Series.unstack
+ Series.explode
Series.searchsorted
Series.ravel
Series.repeat
@@ -296,14 +297,14 @@ Sparse :ref:`sparse `
.. _api.series.dt:
-Datetimelike Properties
+Datetimelike properties
~~~~~~~~~~~~~~~~~~~~~~~
``Series.dt`` can be used to access the values of the series as
datetimelike and return several properties.
These can be accessed like ``Series.dt.<property>``.
-Datetime Properties
+Datetime properties
^^^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -339,7 +340,7 @@ Datetime Properties
Series.dt.tz
Series.dt.freq
-Datetime Methods
+Datetime methods
^^^^^^^^^^^^^^^^
.. autosummary::
@@ -358,7 +359,7 @@ Datetime Methods
Series.dt.month_name
Series.dt.day_name
-Period Properties
+Period properties
^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -369,7 +370,7 @@ Period Properties
Series.dt.start_time
Series.dt.end_time
-Timedelta Properties
+Timedelta properties
^^^^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -382,7 +383,7 @@ Timedelta Properties
Series.dt.nanoseconds
Series.dt.components
-Timedelta Methods
+Timedelta methods
^^^^^^^^^^^^^^^^^
.. autosummary::
@@ -478,7 +479,7 @@ strings and apply several methods to it. These can be accessed like
.. _api.series.cat:
-Categorical Accessor
+Categorical accessor
~~~~~~~~~~~~~~~~~~~~
Categorical-dtype specific methods and attributes are available under
@@ -508,7 +509,7 @@ the ``Series.cat`` accessor.
.. _api.series.sparse:
-Sparse Accessor
+Sparse accessor
~~~~~~~~~~~~~~~
Sparse-dtype specific methods and attributes are provided under the
@@ -560,7 +561,7 @@ specific plotting methods of the form ``Series.plot.``.
Series.hist
-Serialization / IO / Conversion
+Serialization / IO / conversion
-------------------------------
.. autosummary::
:toctree: api/
@@ -590,4 +591,3 @@ Sparse
SparseSeries.to_coo
SparseSeries.from_coo
-
diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst
index bd9635b41e343..3d155535e2585 100644
--- a/doc/source/reference/style.rst
+++ b/doc/source/reference/style.rst
@@ -9,7 +9,7 @@ Style
``Styler`` objects are returned by :attr:`pandas.DataFrame.style`.
-Styler Constructor
+Styler constructor
------------------
.. autosummary::
:toctree: api/
@@ -17,7 +17,7 @@ Styler Constructor
Styler
Styler.from_custom_template
-Styler Properties
+Styler properties
-----------------
.. autosummary::
:toctree: api/
@@ -26,7 +26,7 @@ Styler Properties
Styler.template
Styler.loader
-Style Application
+Style application
-----------------
.. autosummary::
:toctree: api/
@@ -44,7 +44,7 @@ Style Application
Styler.clear
Styler.pipe
-Builtin Styles
+Builtin styles
--------------
.. autosummary::
:toctree: api/
@@ -55,7 +55,7 @@ Builtin Styles
Styler.background_gradient
Styler.bar
-Style Export and Import
+Style export and import
-----------------------
.. autosummary::
:toctree: api/
diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
index 9e1374a3bd8e4..2f6addf607877 100644
--- a/doc/source/reference/window.rst
+++ b/doc/source/reference/window.rst
@@ -5,7 +5,6 @@
======
Window
======
-.. currentmodule:: pandas.core.window
Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc.
Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc.
@@ -13,6 +12,8 @@ EWM objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:
Standard moving window functions
--------------------------------
+.. currentmodule:: pandas.core.window.rolling
+
.. autosummary::
:toctree: api/
@@ -38,6 +39,8 @@ Standard moving window functions
Standard expanding window functions
-----------------------------------
+.. currentmodule:: pandas.core.window.expanding
+
.. autosummary::
:toctree: api/
@@ -59,6 +62,8 @@ Standard expanding window functions
Exponentially-weighted moving window functions
----------------------------------------------
+.. currentmodule:: pandas.core.window.ewm
+
.. autosummary::
:toctree: api/
diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html
index b3f13f99f44d4..6e7d8ece35133 100644
--- a/doc/source/themes/nature_with_gtoc/layout.html
+++ b/doc/source/themes/nature_with_gtoc/layout.html
@@ -94,15 +94,15 @@ {{ _('Search') }}
});
});
-
+
+
+
{% endblock %}
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index 3235e3c2a8b2e..62a9b6396404a 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -3,7 +3,7 @@
{{ header }}
******************************
-MultiIndex / Advanced Indexing
+MultiIndex / advanced indexing
******************************
This section covers :ref:`indexing with a MultiIndex `
@@ -179,18 +179,18 @@ on a deeper level.
.. _advanced.shown_levels:
-Defined Levels
+Defined levels
~~~~~~~~~~~~~~
-The repr of a ``MultiIndex`` shows all the defined levels of an index, even
+The :class:`MultiIndex` keeps all the defined levels of an index, even
if they are not actually used. When slicing an index, you may notice this.
For example:
.. ipython:: python
- df.columns # original MultiIndex
+ df.columns.levels # original MultiIndex
- df[['foo','qux']].columns # sliced
+ df[['foo','qux']].columns.levels # sliced
This is done to avoid a recomputation of the levels in order to make slicing
highly performant. If you want to see only the used levels, you can use the
@@ -210,7 +210,8 @@ To reconstruct the ``MultiIndex`` with only the used levels, the
.. ipython:: python
- df[['foo', 'qux']].columns.remove_unused_levels()
+ new_mi = df[['foo', 'qux']].columns.remove_unused_levels()
+ new_mi.levels
Data alignment and using ``reindex``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -641,7 +642,7 @@ And now selection works as expected.
dfm.loc[(0, 'y'):(1, 'z')]
-Take Methods
+Take methods
------------
.. _advanced.take:
@@ -711,7 +712,7 @@ faster than fancy indexing.
.. _indexing.index_types:
-Index Types
+Index types
-----------
We have discussed ``MultiIndex`` in the previous sections pretty extensively.
@@ -737,7 +738,7 @@ and allows efficient indexing and storage of an index with a large number of dup
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
- df.B.cat.categories
+ df['B'].cat.categories
Setting the index will create a ``CategoricalIndex``.
@@ -809,15 +810,10 @@ values **not** in the categories, similarly to how you can reindex **any** panda
Int64Index and RangeIndex
~~~~~~~~~~~~~~~~~~~~~~~~~
-.. warning::
-
- Indexing on an integer-based Index with floats has been clarified in 0.18.0, for a summary of the changes, see :ref:`here `.
+:class:`Int64Index` is a fundamental basic index in pandas. This is an immutable array
+implementing an ordered, sliceable set.
-:class:`Int64Index` is a fundamental basic index in pandas.
-This is an immutable array implementing an ordered, sliceable set.
-Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects.
-
-:class:`RangeIndex` is a sub-class of ``Int64Index`` added in version 0.18.0, now providing the default index for all ``NDFrame`` objects.
+:class:`RangeIndex` is a sub-class of ``Int64Index`` that provides the default index for all ``NDFrame`` objects.
``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analogous to Python `range types `__.
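+
+For instance, a quick sketch of how the two index types appear in practice:
+
+.. code-block:: python
+
+   pd.Series([1, 2, 3]).index   # the default index is a RangeIndex
+   pd.Index([1, 2, 3])          # an Int64Index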
.. _indexing.float64index:
@@ -879,16 +875,6 @@ In non-float indexes, slicing using floats will raise a ``TypeError``.
In [1]: pd.Series(range(5))[3.5:4.5]
TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index)
-.. warning::
-
- Using a scalar float indexer for ``.iloc`` has been removed in 0.18.0, so the following will raise a ``TypeError``:
-
- .. code-block:: ipython
-
- In [3]: pd.Series(range(5)).iloc[3.0]
- TypeError: cannot do positional indexing on with these indexers [3.0] of
-
-
Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat
irregular timedelta-like indexing scheme, but the data is recorded as floats. This could, for
example, be millisecond offsets.
@@ -937,9 +923,8 @@ for interval notation.
The ``IntervalIndex`` allows some unique indexing and is also used as a
return type for the categories in :func:`cut` and :func:`qcut`.
-.. warning::
-
- These indexing behaviors are provisional and may change in a future version of pandas.
+Indexing with an ``IntervalIndex``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index.
@@ -964,7 +949,34 @@ If you select a label *contained* within an interval, this will also select the
df.loc[2.5]
df.loc[[2.5, 3.5]]
-``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``:
+Selecting using an ``Interval`` will only return exact matches (starting from pandas 0.25.0).
+
+.. ipython:: python
+
+ df.loc[pd.Interval(1, 2)]
+
+Trying to select an ``Interval`` that is not exactly contained in the ``IntervalIndex`` will raise a ``KeyError``.
+
+.. code-block:: python
+
+ In [7]: df.loc[pd.Interval(0.5, 2.5)]
+ ---------------------------------------------------------------------------
+ KeyError: Interval(0.5, 2.5, closed='right')
+
+Selecting all ``Intervals`` that overlap a given ``Interval`` can be performed using the
+:meth:`~IntervalIndex.overlaps` method to create a boolean indexer.
+
+.. ipython:: python
+
+ idxr = df.index.overlaps(pd.Interval(0.5, 2.5))
+ idxr
+ df[idxr]
+
+Binning data with ``cut`` and ``qcut``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`cut` and :func:`qcut` both return a ``Categorical`` object, and the bins they
+create are stored as an ``IntervalIndex`` in its ``.categories`` attribute.
.. ipython:: python
@@ -972,15 +984,19 @@ If you select a label *contained* within an interval, this will also select the
c
c.categories
-Furthermore, ``IntervalIndex`` allows one to bin *other* data with these same
-bins, with ``NaN`` representing a missing value similar to other dtypes.
+:func:`cut` also accepts an ``IntervalIndex`` for its ``bins`` argument, which enables
+a useful pandas idiom. First, we call :func:`cut` with some data and ``bins`` set to a
+fixed number, to generate the bins. Then, we pass the values of ``.categories`` as the
+``bins`` argument in subsequent calls to :func:`cut`, supplying new data which will be
+binned into the same bins.
.. ipython:: python
pd.cut([0, 3, 5, 1], bins=c.categories)
+Any value which falls outside all bins will be assigned a ``NaN`` value.
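+
+As a self-contained sketch of this idiom (the input values below are made up for
+illustration):
+
+.. code-block:: python
+
+   trained = pd.cut([1, 7, 5, 4, 6, 3], bins=4)    # generate four equal-width bins
+   pd.cut([0, 3, 5, 9], bins=trained.categories)   # bin new data into the same bins;
+                                                   # 0 and 9 fall outside and become NaN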
-Generating Ranges of Intervals
+Generating ranges of intervals
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If we need intervals on a regular frequency, we can use the :func:`interval_range` function
@@ -1107,6 +1123,8 @@ the :meth:`~Index.is_unique` attribute.
weakly_monotonic.is_monotonic_increasing
weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique
+.. _advanced.endpoints_are_inclusive:
+
Endpoints are inclusive
~~~~~~~~~~~~~~~~~~~~~~~
@@ -1136,7 +1154,7 @@ index can be somewhat complicated. For example, the following does not work:
s.loc['c':'e' + 1]
A very common use case is to limit a time series to start and end at two
-specific dates. To enable this, we made the design to make label-based
+specific dates. To enable this, we made the design choice to make label-based
slicing include both endpoints:
.. ipython:: python
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
index a6315c548b382..8ca96ba0daa5e 100644
--- a/doc/source/user_guide/categorical.rst
+++ b/doc/source/user_guide/categorical.rst
@@ -3,7 +3,7 @@
{{ header }}
****************
-Categorical Data
+Categorical data
****************
This is an introduction to pandas categorical data type, including a short comparison
@@ -38,10 +38,10 @@ See also the :ref:`API docs on categoricals`.
.. _categorical.objectcreation:
-Object Creation
+Object creation
---------------
-Series Creation
+Series creation
~~~~~~~~~~~~~~~
Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways:
@@ -90,7 +90,7 @@ Categorical data has a specific ``category`` :ref:`dtype `:
df.dtypes
-DataFrame Creation
+DataFrame creation
~~~~~~~~~~~~~~~~~~
Similar to the previous section where a single column was converted to categorical, all columns in a
@@ -130,7 +130,7 @@ This conversion is likewise done column by column:
df_cat['B']
-Controlling Behavior
+Controlling behavior
~~~~~~~~~~~~~~~~~~~~
In the examples above where we passed ``dtype='category'``, we used the default
@@ -181,7 +181,7 @@ during normal constructor mode:
categories=["train", "test"]))
-Regaining Original Data
+Regaining original data
~~~~~~~~~~~~~~~~~~~~~~~
To get back to the original ``Series`` or NumPy array, use
@@ -243,7 +243,7 @@ expects a `dtype`. For example :func:`pandas.read_csv`,
array. In other words, ``dtype='category'`` is equivalent to
``dtype=CategoricalDtype()``.
-Equality Semantics
+Equality semantics
~~~~~~~~~~~~~~~~~~
Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal
@@ -438,7 +438,7 @@ use :meth:`~pandas.Categorical.set_categories`.
intentionally or because it is misspelled or (under Python3) due to a type difference (e.g.,
NumPy S1 dtype and Python strings). This can result in surprising behaviour!
-Sorting and Order
+Sorting and order
-----------------
.. _categorical.sort:
@@ -510,7 +510,7 @@ necessarily make the sort order the same as the categories order.
(e.g. :meth:`Series.median`, which would need to compute the mean between two values if the length
of an array is even) do not work and raise a ``TypeError``.
-Multi Column Sorting
+Multi column sorting
~~~~~~~~~~~~~~~~~~~~
A categorical dtyped column will participate in a multi-column sort in a similar manner to other columns.
@@ -834,8 +834,6 @@ See also the section on :ref:`merge dtypes` for notes about pres
Unioning
~~~~~~~~
-.. versionadded:: 0.19.0
-
If you want to combine categoricals that do not necessarily have the same
categories, the :func:`~pandas.api.types.union_categoricals` function will
combine a list-like of categoricals. The new categories will be the union of
@@ -963,7 +961,7 @@ Following table summarizes the results of ``Categoricals`` related concatenation
+----------+--------------------------------------------------------+----------------------------+
-Getting Data In/Out
+Getting data in/out
-------------------
You can write data that contains ``category`` dtypes to a ``HDFStore``.
@@ -1000,7 +998,7 @@ relevant columns back to `category` and assign the right categories and categori
The same holds for writing to a SQL database with ``to_sql``.
-Missing Data
+Missing data
------------
pandas primarily uses the value `np.nan` to represent missing data. It is by
@@ -1052,7 +1050,7 @@ Gotchas
.. _categorical.rfactor:
-Memory Usage
+Memory usage
~~~~~~~~~~~~
.. _categorical.memory:
@@ -1152,7 +1150,7 @@ You can use ``fillna`` to handle missing values before applying a function.
df.apply(lambda row: type(row["cats"]), axis=1)
df.apply(lambda col: col.dtype, axis=0)
-Categorical Index
+Categorical index
~~~~~~~~~~~~~~~~~
``CategoricalIndex`` is a type of index that is useful for supporting
@@ -1173,7 +1171,7 @@ Setting the index will create a ``CategoricalIndex``:
# This now sorts by the categories order
df.sort_index()
-Side Effects
+Side effects
~~~~~~~~~~~~
Constructing a ``Series`` from a ``Categorical`` will not copy the input
diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index 71cbf58dff871..cfce7c40c477f 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -5,12 +5,13 @@
Computational tools
===================
-Statistical Functions
+
+Statistical functions
---------------------
.. _computation.pct_change:
-Percent Change
+Percent change
~~~~~~~~~~~~~~
``Series`` and ``DataFrame`` have a method
@@ -294,7 +295,7 @@ sugar for applying the moving window operator to all of the DataFrame's columns:
.. _stats.summary:
-Method Summary
+Method summary
~~~~~~~~~~~~~~
We provide a number of common statistical functions:
@@ -335,7 +336,7 @@ compute the mean absolute deviation on a rolling basis:
.. _stats.rolling_window:
-Rolling Windows
+Rolling windows
~~~~~~~~~~~~~~~
Passing ``win_type`` to ``.rolling`` generates a generic rolling window computation, that is weighted according the ``win_type``.
@@ -404,12 +405,10 @@ For some windowing functions, additional parameters must be specified:
.. _stats.moments.ts:
-Time-aware Rolling
+Time-aware rolling
~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
-New in version 0.19.0 are the ability to pass an offset (or convertible) to a ``.rolling()`` method and have it produce
+It is possible to pass an offset (or convertible) to a ``.rolling()`` method and have it produce
variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring
within the indicated time delta.
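+
+A minimal sketch of such a variable-sized window, using a made-up one-column frame
+with a per-second ``DatetimeIndex``:
+
+.. code-block:: python
+
+   dft = pd.DataFrame({'B': [0, 1, 2, 3, 4]},
+                      index=pd.date_range('20130101 09:00:00',
+                                          periods=5, freq='s'))
+   dft.rolling('2s').sum()   # each window covers the preceding 2 seconds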
@@ -469,7 +468,7 @@ default of the index) in a DataFrame.
.. _stats.rolling_window.endpoints:
-Rolling Window Endpoints
+Rolling window endpoints
~~~~~~~~~~~~~~~~~~~~~~~~
.. versionadded:: 0.20.0
@@ -511,7 +510,7 @@ For fixed windows, the closed parameter cannot be set and the rolling window wil
.. _stats.moments.ts-versus-resampling:
-Time-aware Rolling vs. Resampling
+Time-aware rolling vs. resampling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They
@@ -529,7 +528,7 @@ will have the shape of a regular frequency between the min and the max of the or
To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation.
-Centering Windows
+Centering windows
~~~~~~~~~~~~~~~~~
By default the labels are set to the right edge of the window, but a
@@ -542,7 +541,7 @@ By default the labels are set to the right edge of the window, but a
.. _stats.moments.binary:
-Binary Window Functions
+Binary window functions
~~~~~~~~~~~~~~~~~~~~~~~
:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about
@@ -695,7 +694,7 @@ Furthermore you can pass a nested dict to indicate different aggregations on dif
.. _stats.moments.expanding:
-Expanding Windows
+Expanding windows
-----------------
A common alternative to rolling statistics is to use an *expanding* window,
@@ -716,7 +715,7 @@ they are implemented in pandas such that the following two calls are equivalent:
These have a similar set of methods to ``.rolling`` methods.
-Method Summary
+Method summary
~~~~~~~~~~~~~~
.. currentmodule:: pandas.core.window
@@ -798,7 +797,7 @@ relative impact of an individual data point. As an example, here is the
.. _stats.moments.exponentially_weighted:
-Exponentially Weighted Windows
+Exponentially weighted windows
------------------------------
.. currentmodule:: pandas.core.window
@@ -892,10 +891,9 @@ Therefore, there is an assumption that :math:`x_0` is not an ordinary value
but rather an exponentially weighted moment of the infinite series up to that
point.
-One must have :math:`0 < \alpha \leq 1`, and while since version 0.18.0
-it has been possible to pass :math:`\alpha` directly, it's often easier
-to think about either the **span**, **center of mass (com)** or **half-life**
-of an EW moment:
+One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass
+:math:`\alpha` directly, it's often easier to think about either the
+**span**, **center of mass (com)** or **half-life** of an EW moment:
.. math::
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 772362cab396c..c9d3bc3a28c70 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -99,7 +99,7 @@ Splitting
df[df.AAA <= 5]
df[df.AAA > 5]
-Building Criteria
+Building criteria
*****************
`Select with multi-column criteria
@@ -245,7 +245,7 @@ Ambiguity arises when an index consists of integers with a non-zero start or non
df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))]
-New Columns
+New columns
***********
`Efficiently and dynamically creating new columns using applymap
@@ -399,7 +399,7 @@ Sorting
df.sort_values(by=('Labs', 'II'), ascending=False)
-`Partial Selection, the need for sortedness;
+`Partial selection, the need for sortedness;
`__
Levels
@@ -413,7 +413,7 @@ Levels
.. _cookbook.missing_data:
-Missing Data
+Missing data
------------
The :ref:`missing data` docs.
@@ -485,7 +485,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
expected_df = gb.apply(GrowUp)
expected_df
-`Expanding Apply
+`Expanding apply
`__
.. ipython:: python
@@ -592,10 +592,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
.. ipython:: python
df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])
- df.A.groupby((df.A != df.A.shift()).cumsum()).groups
- df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum()
+ df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups
+ df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum()
-Expanding Data
+Expanding data
**************
`Alignment and to-date
@@ -690,7 +690,7 @@ To create year and month cross tabulation:
Apply
*****
-`Rolling Apply to Organize - Turning embedded lists into a MultiIndex frame
+`Rolling apply to organize - Turning embedded lists into a MultiIndex frame
`__
.. ipython:: python
@@ -706,7 +706,7 @@ Apply
for ind, row in df.iterrows()})
df_orgz
-`Rolling Apply with a DataFrame returning a Series
+`Rolling apply with a DataFrame returning a Series
`__
Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned
@@ -719,7 +719,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
df
def gm(df, const):
- v = ((((df.A + df.B) + 1).cumprod()) - 1) * const
+ v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const
return v.iloc[-1]
s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5)
@@ -1099,7 +1099,7 @@ HDFStore
The :ref:`HDFStores ` docs
-`Simple Queries with a Timestamp Index
+`Simple queries with a Timestamp Index
`__
`Managing heterogeneous data using a linked multiple table hierarchy
@@ -1169,7 +1169,7 @@ Storing Attributes to a group node
.. _cookbook.binary:
-Binary Files
+Binary files
************
pandas readily accepts NumPy record arrays, if you need to read in a binary
@@ -1334,7 +1334,7 @@ Values can be set to NaT using np.nan, similar to datetime
y[1] = np.nan
y
-Aliasing Axis Names
+Aliasing axis names
-------------------
To globally provide aliases for axis names, one can define these 2 functions:
@@ -1361,7 +1361,7 @@ To globally provide aliases for axis names, one can define these 2 functions:
df2.sum(axis='myaxis2')
clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2')
-Creating Example Data
+Creating example data
---------------------
To create a dataframe from every combination of some given values, like R's ``expand.grid()``
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index 525f9abb1d1ae..2df5b9d82dcc3 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -3,7 +3,7 @@
{{ header }}
*********************
-Enhancing Performance
+Enhancing performance
*********************
In this part of the tutorial, we will investigate how to speed up certain
@@ -15,7 +15,7 @@ when we use Cython and Numba on a test function operating row-wise on the
.. _enhancingperf.cython:
-Cython (Writing C extensions for pandas)
+Cython (writing C extensions for pandas)
----------------------------------------
For many use cases writing pandas in pure Python and NumPy is sufficient. In some
@@ -33,7 +33,7 @@ faster than the pure Python solution.
.. _enhancingperf.pure:
-Pure python
+Pure Python
~~~~~~~~~~~
We have a ``DataFrame`` to which we want to apply a function row-wise.
@@ -243,9 +243,9 @@ We've gotten another big improvement. Let's check again where the time is spent:
.. ipython:: python
- %prun -l 4 apply_integrate_f(df['a'].to_numpy(),
- df['b'].to_numpy(),
- df['N'].to_numpy())
+ %%prun -l 4 apply_integrate_f(df['a'].to_numpy(),
+ df['b'].to_numpy(),
+ df['N'].to_numpy())
As one might expect, the majority of the time is now spent in ``apply_integrate_f``,
so if we wanted to make any more efficiencies we must continue to concentrate our
@@ -393,15 +393,15 @@ Consider the following toy example of doubling each observation:
.. code-block:: ipython
# Custom function without numba
- In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba) # noqa E501
+ In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba) # noqa E501
1000 loops, best of 3: 797 us per loop
# Standard implementation (faster than a custom function)
- In [6]: %timeit df['col1_doubled'] = df.a * 2
+ In [6]: %timeit df['col1_doubled'] = df['a'] * 2
1000 loops, best of 3: 233 us per loop
# Custom function with numba
- In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy())
+ In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
1000 loops, best of 3: 145 us per loop
Caveats
@@ -429,7 +429,7 @@ Read more in the `Numba docs `__.
.. _enhancingperf.eval:
-Expression Evaluation via :func:`~pandas.eval`
+Expression evaluation via :func:`~pandas.eval`
-----------------------------------------------
The top-level function :func:`pandas.eval` implements expression evaluation of
@@ -465,7 +465,7 @@ engine in addition to some extensions available only in pandas.
The larger the frame and the larger the expression the more speedup you will
see from using :func:`~pandas.eval`.
-Supported Syntax
+Supported syntax
~~~~~~~~~~~~~~~~
These operations are supported by :func:`pandas.eval`:
@@ -505,7 +505,7 @@ This Python syntax is **not** allowed:
-:func:`~pandas.eval` Examples
+:func:`~pandas.eval` examples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`pandas.eval` works well with expressions containing large arrays.
@@ -601,8 +601,6 @@ This allows for *formulaic evaluation*. The assignment target can be a
new column name or an existing column name, and it must be a valid Python
identifier.
-.. versionadded:: 0.18.0
-
The ``inplace`` keyword determines whether this assignment will be performed
on the original ``DataFrame`` or return a copy with the new column.
@@ -630,8 +628,6 @@ new or modified columns is returned and the original frame is unchanged.
df.eval('e = a - c', inplace=False)
df
-.. versionadded:: 0.18.0
-
As a convenience, multiple assignments can be performed by using a
multi-line string.
@@ -647,14 +643,12 @@ The equivalent in standard Python would be
.. ipython:: python
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
- df['c'] = df.a + df.b
- df['d'] = df.a + df.b + df.c
+ df['c'] = df['a'] + df['b']
+ df['d'] = df['a'] + df['b'] + df['c']
df['a'] = 1
df
-.. versionadded:: 0.18.0
-
-The ``query`` method gained the ``inplace`` keyword which determines
+The ``query`` method has an ``inplace`` keyword which determines
whether the query modifies the original frame.
.. ipython:: python
@@ -669,7 +663,7 @@ whether the query modifies the original frame.
Unlike with ``eval``, the default value for ``inplace`` for ``query``
is ``False``. This is consistent with prior versions of pandas.
-Local Variables
+Local variables
~~~~~~~~~~~~~~~
You must *explicitly reference* any local variable that you want to use in an
@@ -694,7 +688,7 @@ name in an expression.
a = np.random.randn()
df.query('@a < a')
- df.loc[a < df.a] # same as the previous expression
+ df.loc[a < df['a']] # same as the previous expression
With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it
isn't defined in that context. ``pandas`` will let you know this if you try to
@@ -714,7 +708,7 @@ standard Python.
pd.eval('a + b')
-:func:`pandas.eval` Parsers
+:func:`pandas.eval` parsers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are two different parsers and two different engines you can use as
@@ -754,7 +748,7 @@ The ``and`` and ``or`` operators here have the same precedence that they would
in vanilla Python.
-:func:`pandas.eval` Backends
+:func:`pandas.eval` backends
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There's also the option to make :func:`~pandas.eval` operate identical to plain
@@ -779,7 +773,7 @@ is a bit slower (not by much) than evaluating the same expression in Python
%timeit pd.eval('df1 + df2 + df3 + df4', engine='python')
-:func:`pandas.eval` Performance
+:func:`pandas.eval` performance
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:func:`~pandas.eval` is intended to speed up certain kinds of operations. In
@@ -804,7 +798,7 @@ computation. The two lines are two different engines.
This plot was created using a ``DataFrame`` with 3 columns each containing
floating point values generated using ``numpy.random.randn()``.
-Technical Minutia Regarding Expression Evaluation
+Technical minutia regarding expression evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expressions that would result in an object dtype or involve datetime operations
diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index 3d89fe171a343..f9a72b87e58d8 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -75,7 +75,7 @@ See also :ref:`Categorical Memory Usage `.
.. _gotchas.truth:
-Using If/Truth Statements with pandas
+Using if/truth statements with pandas
-------------------------------------
pandas follows the NumPy convention of raising an error when you try to convert
@@ -317,7 +317,7 @@ See `this link ``
+ to each subsequent lambda.
+
+ .. ipython:: python
+
+ grouped['C'].agg([lambda x: x.max() - x.min(),
+ lambda x: x.median() - x.mean()])
+
+
+
.. _groupby.aggregate.named:
-Named Aggregation
+Named aggregation
~~~~~~~~~~~~~~~~~
.. versionadded:: 0.25.0
@@ -804,13 +827,10 @@ and that the transformed data contains no NAs.
.. _groupby.transform.window_resample:
-New syntax to window and resample operations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
+Window and resample operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Working with the resample, expanding or rolling operations on the groupby
-level used to require the application of helper functions. However,
-now it is possible to use ``resample()``, ``expanding()`` and
+It is possible to use ``resample()``, ``expanding()`` and
``rolling()`` as methods on groupbys.
The example below will apply the ``rolling()`` method on the samples of
@@ -1122,7 +1142,7 @@ can be used as group keys. If so, the order of the levels will be preserved:
.. _groupby.specify:
-Grouping with a Grouper specification
+Grouping with a grouper specification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You may need to specify a bit more data to properly group. You can
@@ -1404,7 +1424,7 @@ introduction ` and the
dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
-Groupby by Indexer to 'resample' data
+Groupby by indexer to 'resample' data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 4ea7c656fd197..cf55ce0c9a6d4 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -3,7 +3,7 @@
{{ header }}
***************************
-Indexing and Selecting Data
+Indexing and selecting data
***************************
The axis labeling information in pandas objects serves many purposes:
@@ -36,17 +36,13 @@ this area.
should be avoided. See :ref:`Returning a View versus Copy
`.
-.. warning::
-
- Indexing on an integer-based Index with floats has been clarified in 0.18.0, for a summary of the changes, see :ref:`here `.
-
See the :ref:`MultiIndex / Advanced Indexing ` for ``MultiIndex`` and more advanced indexing documentation.
See the :ref:`cookbook` for some advanced strategies.
.. _indexing.choice:
-Different Choices for Indexing
+Different choices for indexing
------------------------------
Object selection has had a number of user-requested additions in order to
@@ -61,14 +57,12 @@ of multi-axis indexing.
* A list or array of labels ``['a', 'b', 'c']``.
* A slice object with labels ``'a':'f'`` (Note that contrary to usual python
slices, **both** the start and the stop are included, when present in the
- index! See :ref:`Slicing with labels
- `.).
+ index! See :ref:`Slicing with labels `
+ and :ref:`Endpoints are inclusive `.)
* A boolean array
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).
- .. versionadded:: 0.18.1
-
See more at :ref:`Selection by Label `.
* ``.iloc`` is primarily integer position based (from ``0`` to
@@ -85,8 +79,6 @@ of multi-axis indexing.
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).
- .. versionadded:: 0.18.1
-
See more at :ref:`Selection by Position `,
:ref:`Advanced Indexing ` and :ref:`Advanced
Hierarchical `.
@@ -181,7 +173,7 @@ columns.
df[['A', 'B']]
-Attribute Access
+Attribute access
----------------
.. _indexing.columns.multiple:
@@ -218,7 +210,7 @@ as an attribute:
See `here for an explanation of valid identifiers
`__.
- - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed.
+ - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed, but ``s['min']`` is possible.
- Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``,
``major_axis``, ``minor_axis``, ``items``.
@@ -287,7 +279,7 @@ largely as a convenience since it is such a common operation.
.. _indexing.label:
-Selection By Label
+Selection by label
------------------
.. warning::
@@ -335,8 +327,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp
* A list or array of labels ``['a', 'b', 'c']``.
* A slice object with labels ``'a':'f'`` (Note that contrary to usual python
slices, **both** the start and the stop are included, when present in the
- index! See :ref:`Slicing with labels
- `.).
+ index! See :ref:`Slicing with labels `.
* A boolean array.
* A ``callable``, see :ref:`Selection By Callable `.
@@ -418,9 +409,12 @@ error will be raised (since doing otherwise would be computationally expensive,
as well as potentially ambiguous for mixed type indexes). For instance, in the
above example, ``s.loc[1:6]`` would raise ``KeyError``.
+For the rationale behind this behavior, see
+:ref:`Endpoints are inclusive `.
+
.. _indexing.integer:
-Selection By Position
+Selection by position
---------------------
.. warning::
@@ -533,11 +527,9 @@ A list of indexers where any element is out of bounds will raise an
.. _indexing.callable:
-Selection By Callable
+Selection by callable
---------------------
-.. versionadded:: 0.18.1
-
``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer.
The ``callable`` must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing.
@@ -548,7 +540,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat
columns=list('ABCD'))
df1
- df1.loc[lambda df: df.A > 0, :]
+ df1.loc[lambda df: df['A'] > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
@@ -560,7 +552,7 @@ You can use callable indexing in ``Series``.
.. ipython:: python
- df1.A.loc[lambda s: s > 0]
+ df1['A'].loc[lambda s: s > 0]
Using these methods / indexers, you can chain data selection operations
without using a temporary variable.
@@ -569,11 +561,11 @@ without using a temporary variable.
bb = pd.read_csv('data/baseball.csv', index_col='id')
(bb.groupby(['year', 'team']).sum()
- .loc[lambda df: df.r > 100])
+ .loc[lambda df: df['r'] > 100])
.. _indexing.deprecate_ix:
-IX Indexer is Deprecated
+IX indexer is deprecated
------------------------
.. warning::
@@ -631,7 +623,7 @@ For getting *multiple* indexers, using ``.get_indexer``:
.. _deprecate_loc_reindex_listlike:
.. _indexing.deprecate_loc_reindex_listlike:
-Indexing with list with missing labels is Deprecated
+Indexing with list with missing labels is deprecated
----------------------------------------------------
.. warning::
@@ -655,7 +647,7 @@ Selection with all keys found is unchanged.
s.loc[[1, 2]]
-Previous Behavior
+Previous behavior
.. code-block:: ipython
@@ -667,7 +659,7 @@ Previous Behavior
dtype: float64
-Current Behavior
+Current behavior
.. code-block:: ipython
@@ -732,7 +724,7 @@ However, this would *still* raise if your resulting index is duplicated.
.. _indexing.basics.partial_setting:
-Selecting Random Samples
+Selecting random samples
------------------------
A random selection of rows or columns from a Series or DataFrame can be obtained with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows.
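+
+A short sketch (the Series values here are arbitrary):
+
+.. code-block:: python
+
+   s = pd.Series([0, 1, 2, 3, 4])
+   s.sample(n=3)        # three randomly selected rows
+   s.sample(frac=0.5)   # roughly half of the rows, chosen at random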
@@ -807,7 +799,7 @@ Finally, one can also set a seed for ``sample``'s random number generator using
-Setting With Enlargement
+Setting with enlargement
------------------------
The ``.loc/[]`` operations can perform enlargement when setting a non-existent key for that axis.
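+
+A brief sketch of enlargement via ``.loc`` (the values are arbitrary):
+
+.. code-block:: python
+
+   se = pd.Series([1, 2, 3])
+   se.loc[5] = 5.   # key 5 does not exist yet, so the Series is enlarged
+   se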
@@ -879,9 +871,9 @@ Boolean indexing
Another common operation is the use of boolean vectors to filter the data.
The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``.
These **must** be grouped by using parentheses, since by default Python will
-evaluate an expression such as ``df.A > 2 & df.B < 3`` as
-``df.A > (2 & df.B) < 3``, while the desired evaluation order is
-``(df.A > 2) & (df.B < 3)``.
+evaluate an expression such as ``df['A'] > 2 & df['B'] < 3`` as
+``df['A'] > (2 & df['B']) < 3``, while the desired evaluation order is
+``(df['A'] > 2) & (df['B'] < 3)``.
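+
+For example, with a small throwaway frame:
+
+.. code-block:: python
+
+   df_bool = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 4, 3, 2]})
+   df_bool[(df_bool['A'] > 2) & (df_bool['B'] < 3)]   # parentheses are required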
Using a boolean vector to index a Series works exactly as in a NumPy ndarray:
@@ -1076,7 +1068,7 @@ without creating a copy:
df.where(df < 0, -df) == np.where(df < 0, df, -df)
-**alignment**
+**Alignment**
Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame),
such that partial selection with setting is possible. This is analogous to
@@ -1103,9 +1095,7 @@ This is equivalent to (but faster than) the following.
df2 = df.copy()
df.apply(lambda x, y: x.where(x > 0, y), y=df['A'])
-.. versionadded:: 0.18.1
-
-Where can accept a callable as condition and ``other`` arguments. The function must
+``where`` can accept a callable as condition and ``other`` arguments. The function must
take one argument (the calling Series or DataFrame) and return valid output
to be used as the condition or ``other`` argument.
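+
+A small sketch, using a hypothetical frame ``df3``:
+
+.. code-block:: python
+
+   df3 = pd.DataFrame({'A': [1, 2, 3],
+                       'B': [4, 5, 6],
+                       'C': [7, 8, 9]})
+   df3.where(lambda x: x > 4, lambda x: x + 10)   # both arguments passed as callables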
@@ -1144,7 +1134,7 @@ between the values of columns ``a`` and ``c``. For example:
df
# pure python
- df[(df.a < df.b) & (df.b < df.c)]
+ df[(df['a'] < df['b']) & (df['b'] < df['c'])]
# query
df.query('(a < b) & (b < c)')
@@ -1251,7 +1241,7 @@ Full numpy-like syntax:
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
df.query('(a < b) & (b < c)')
- df[(df.a < df.b) & (df.b < df.c)]
+ df[(df['a'] < df['b']) & (df['b'] < df['c'])]
Slightly nicer by removing the parentheses (by binding making comparison
operators bind tighter than ``&`` and ``|``).
@@ -1289,12 +1279,12 @@ The ``in`` and ``not in`` operators
df.query('a in b')
# How you'd do it in pure Python
- df[df.a.isin(df.b)]
+ df[df['a'].isin(df['b'])]
df.query('a not in b')
# pure Python
- df[~df.a.isin(df.b)]
+ df[~df['a'].isin(df['b'])]
You can combine this with other expressions for very succinct queries:
@@ -1307,7 +1297,7 @@ You can combine this with other expressions for very succinct queries:
df.query('a in b and c < d')
# pure Python
- df[df.b.isin(df.a) & (df.c < df.d)]
+ df[df['b'].isin(df['a']) & (df['c'] < df['d'])]
.. note::
@@ -1336,7 +1326,7 @@ to ``in``/``not in``.
df.query('b == ["a", "b", "c"]')
# pure Python
- df[df.b.isin(["a", "b", "c"])]
+ df[df['b'].isin(["a", "b", "c"])]
df.query('c == [1, 2]')
@@ -1348,10 +1338,10 @@ to ``in``/``not in``.
df.query('[1, 2] not in c')
# pure Python
- df[df.c.isin([1, 2])]
+ df[df['c'].isin([1, 2])]
-Boolean Operators
+Boolean operators
~~~~~~~~~~~~~~~~~
You can negate boolean expressions with the word ``not`` or the ``~`` operator.
@@ -1362,7 +1352,7 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator.
df['bools'] = np.random.rand(len(df)) > 0.5
df.query('~bools')
df.query('not bools')
- df.query('not bools') == df[~df.bools]
+ df.query('not bools') == df[~df['bools']]
Of course, expressions can be arbitrarily complex too:
@@ -1372,7 +1362,10 @@ Of course, expressions can be arbitrarily complex too:
shorter = df.query('a < b < c and (not bools) or bools > 2')
# equivalent in pure Python
- longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]
+ longer = df[(df['a'] < df['b'])
+ & (df['b'] < df['c'])
+ & (~df['bools'])
+ | (df['bools'] > 2)]
shorter
longer
@@ -1407,7 +1400,7 @@ floating point values generated using ``numpy.random.randn()``.
df2 = df.copy()
-Duplicate Data
+Duplicate data
--------------
.. _indexing.duplicate:
@@ -1474,7 +1467,7 @@ default value.
s.get('a') # equivalent to s['a']
s.get('x', default=-1)
-The :meth:`~pandas.DataFrame.lookup` Method
+The :meth:`~pandas.DataFrame.lookup` method
-------------------------------------------
Sometimes you want to extract a set of values given a sequence of row labels
@@ -1559,11 +1552,11 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes.
index.levels[1]
index.set_levels(["a", "b"], level=1)
+.. _indexing.set_ops:
+
Set operations on Index objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. _indexing.set_ops:
-
The two main operations are ``union (|)`` and ``intersection (&)``.
These can be directly called as instance methods or used via overloaded
operators. Difference is provided via the ``.difference()`` method.
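+
+A quick sketch with two throwaway indexes:
+
+.. code-block:: python
+
+   a = pd.Index(['c', 'b', 'a'])
+   b = pd.Index(['c', 'e', 'd'])
+   a | b             # union
+   a & b             # intersection
+   a.difference(b)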
@@ -1592,11 +1585,22 @@ with duplicates dropped.
The resulting index from a set operation will be sorted in ascending order.
-Missing values
-~~~~~~~~~~~~~~
+When performing :meth:`Index.union` between indexes with different dtypes, the indexes
+must be cast to a common dtype. Typically, though not always, this is object dtype. The
+exception is when performing a union between integer and float data. In this case, the
+integer values are converted to float.
+
+.. ipython:: python
+
+ idx1 = pd.Index([0, 1, 2])
+ idx2 = pd.Index([0.5, 1.5])
+ idx1 | idx2
.. _indexing.missing:
+Missing values
+~~~~~~~~~~~~~~
+
.. important::
Even though ``Index`` can hold missing values (``NaN``), it should be avoided
@@ -1617,18 +1621,18 @@ Missing values
idx2
idx2.fillna(pd.Timestamp('2011-01-02'))
-Set / Reset Index
+Set / reset index
-----------------
Occasionally you will load or create a data set into a DataFrame and want to
add an index after you've already done so. There are a couple of different
ways.
+.. _indexing.set_index:
+
Set an index
~~~~~~~~~~~~
-.. _indexing.set_index:
-
DataFrame has a :meth:`~DataFrame.set_index` method which takes a column name
(for a regular ``Index``) or a list of column names (for a ``MultiIndex``).
To create a new, re-indexed DataFrame:
@@ -1834,14 +1838,14 @@ chained indexing expression, you can set the :ref:`option `
# This will show the SettingWithCopyWarning
# but the frame values will be set
- dfb['c'][dfb.a.str.startswith('o')] = 42
+ dfb['c'][dfb['a'].str.startswith('o')] = 42
This however is operating on a copy and will not work.
::
>>> pd.set_option('mode.chained_assignment','warn')
- >>> dfb[dfb.a.str.startswith('o')]['c'] = 42
+ >>> dfb[dfb['a'].str.startswith('o')]['c'] = 42
Traceback (most recent call last)
...
SettingWithCopyWarning:
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
index c5667e9319ca6..97b9c2f95dc50 100644
--- a/doc/source/user_guide/integer_na.rst
+++ b/doc/source/user_guide/integer_na.rst
@@ -5,7 +5,7 @@
.. _integer_na:
**************************
-Nullable Integer Data Type
+Nullable integer data type
**************************
.. versionadded:: 0.24.0
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 725af8ef8769b..1d49dbdee9c03 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -13,7 +13,7 @@
===============================
-IO Tools (Text, CSV, HDF5, ...)
+IO tools (text, CSV, HDF5, ...)
===============================
The pandas I/O API is a set of top level ``reader`` functions accessed like
@@ -28,16 +28,19 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
:delim: ;
text;`CSV `__;:ref:`read_csv`;:ref:`to_csv`
+ text;Fixed-Width Text File;:ref:`read_fwf`
text;`JSON `__;:ref:`read_json`;:ref:`to_json`
text;`HTML `__;:ref:`read_html`;:ref:`to_html`
text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard`
binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel`
+ binary;`OpenDocument `__;:ref:`read_excel`;
binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf`
binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather`
binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet`
binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack`
binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata`
binary;`SAS `__;:ref:`read_sas`;
+ binary;`SPSS `__;:ref:`read_spss`;
binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle`
SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql`
SQL;`Google Big Query `__;:ref:`read_gbq`;:ref:`to_gbq`
@@ -51,7 +54,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
.. _io.read_csv_table:
-CSV & Text files
+CSV & text files
----------------
The workhorse function for reading text files (a.k.a. flat files) is
@@ -86,9 +89,7 @@ delim_whitespace : boolean, default False
If this option is set to ``True``, nothing should be passed in for the
``delimiter`` parameter.
- .. versionadded:: 0.18.1 support for the Python parser.
-
-Column and Index Locations and Names
+Column and index locations and names
++++++++++++++++++++++++++++++++++++
header : int or list of ints, default ``'infer'``
@@ -108,8 +109,7 @@ header : int or list of ints, default ``'infer'``
line of data rather than the first line of the file.
names : array-like, default ``None``
List of column names to use. If file contains no header row, then you should
- explicitly pass ``header=None``. Duplicates in this list will cause
- a ``UserWarning`` to be issued.
+ explicitly pass ``header=None``. Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
Column(s) to use as the row labels of the ``DataFrame``, either given as
string name or column index. If a sequence of int / str is given, a
@@ -155,7 +155,7 @@ mangle_dupe_cols : boolean, default ``True``
Passing in ``False`` will cause data to be overwritten if there are duplicate
names in the columns.
-General Parsing Configuration
+General parsing configuration
+++++++++++++++++++++++++++++
dtype : Type name or dict of column -> type, default ``None``
@@ -211,7 +211,7 @@ memory_map : boolean, default False
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.
-NA and Missing Data Handling
+NA and missing data handling
++++++++++++++++++++++++++++
na_values : scalar, str, list-like, or dict, default ``None``
@@ -243,7 +243,7 @@ verbose : boolean, default ``False``
skip_blank_lines : boolean, default ``True``
If ``True``, skip over blank lines rather than interpreting as NaN values.
-Datetime Handling
+Datetime handling
+++++++++++++++++
parse_dates : boolean or list of ints or names or list of lists or dict, default ``False``.
@@ -263,7 +263,7 @@ keep_date_col : boolean, default ``False``
date_parser : function, default ``None``
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
- conversion. Pandas will try to call date_parser in three different ways,
+ conversion. pandas will try to call date_parser in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays (as
defined by parse_dates) as arguments; 2) concatenate (row-wise) the string
values from the columns defined by parse_dates into a single array and pass
@@ -288,7 +288,7 @@ chunksize : int, default ``None``
Return `TextFileReader` object for iteration. See :ref:`iterating and chunking
` below.
-Quoting, Compression, and File Format
+Quoting, compression, and file format
+++++++++++++++++++++++++++++++++++++
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
@@ -298,7 +298,6 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``
the ZIP file must contain only one data file to be read in.
Set to ``None`` for no decompression.
- .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
.. versionchanged:: 0.24.0 'infer' option added and set to default.
thousands : str, default ``None``
Thousands separator.
@@ -340,15 +339,8 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None``
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
override values, a ParserWarning will be issued. See :class:`python:csv.Dialect`
documentation for more details.
-tupleize_cols : boolean, default ``False``
- .. deprecated:: 0.21.0
-
- This argument will be removed and will always convert to MultiIndex
- Leave a list of tuples on columns as is (default is to convert to a MultiIndex
- on the columns).
-
-Error Handling
+Error handling
++++++++++++++
error_bad_lines : boolean, default ``True``
@@ -460,11 +452,9 @@ worth trying.
.. _io.categorical:
-Specifying Categorical dtype
+Specifying categorical dtype
''''''''''''''''''''''''''''
-.. versionadded:: 0.19.0
-
``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
``dtype=CategoricalDtype(categories, ordered)``.
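+
+A minimal sketch, parsing an in-memory CSV string (the data is made up):
+
+.. code-block:: python
+
+   from io import StringIO
+
+   data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+   pd.read_csv(StringIO(data), dtype='category').dtypes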
@@ -488,7 +478,7 @@ specification:
.. versionadded:: 0.21.0
-Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical``
+Specifying ``dtype='category'`` will result in an unordered ``Categorical``
whose ``categories`` are the unique values observed in the data. For more
control on the categories and order, create a
:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for
@@ -529,7 +519,7 @@ This matches the behavior of :meth:`Categorical.set_categories`.
df['col3']
-Naming and Using Columns
+Naming and using columns
''''''''''''''''''''''''
.. _io.headers:
@@ -646,7 +636,7 @@ use in the final result:
In this case, the callable is specifying that we exclude the "a" and "c"
columns from the output.
-Comments and Empty Lines
+Comments and empty lines
''''''''''''''''''''''''
.. _io.skiplines:
@@ -759,7 +749,7 @@ We can suppress the comments using the ``comment`` keyword:
.. _io.unicode:
-Dealing with Unicode Data
+Dealing with Unicode data
'''''''''''''''''''''''''
The ``encoding`` argument should be used for encoded unicode data, which will
@@ -834,7 +824,7 @@ If a subset of data is being parsed using the ``usecols`` option, the
Date Handling
'''''''''''''
-Specifying Date Columns
+Specifying date columns
+++++++++++++++++++++++
To better facilitate working with datetime data, :func:`read_csv`
@@ -947,7 +937,7 @@ data columns:
specify `index_col` as a column label rather than as an index on the resulting frame.
-Date Parsing Functions
+Date parsing functions
++++++++++++++++++++++
Finally, the parser allows you to specify a custom ``date_parser`` function to
@@ -1001,7 +991,7 @@ a single date rather than the entire array.
.. _io.csv.mixed_timezones:
-Parsing a CSV with mixed Timezones
+Parsing a CSV with mixed timezones
++++++++++++++++++++++++++++++++++
Pandas cannot natively represent a column or index with mixed timezones. If your CSV
@@ -1031,7 +1021,7 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie
.. _io.dayfirst:
-Inferring Datetime Format
+Inferring datetime format
+++++++++++++++++++++++++
If you have ``parse_dates`` enabled for some or all of your columns, and your
@@ -1070,7 +1060,7 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With
os.remove('foo.csv')
-International Date Formats
+International date formats
++++++++++++++++++++++++++
While US date formats tend to be MM/DD/YYYY, many international formats use
@@ -1118,7 +1108,7 @@ writing to a file). For example:
.. _io.thousands:
-Thousand Separators
+Thousand separators
'''''''''''''''''''
For large numbers that have been written with a thousands separator, you can
@@ -1163,7 +1153,7 @@ The ``thousands`` keyword allows integers to be parsed correctly:
.. _io.na_values:
-NA Values
+NA values
'''''''''
To control which values are parsed as missing values (which are signified by
@@ -1383,9 +1373,10 @@ should pass the ``escapechar`` option:
print(data)
pd.read_csv(StringIO(data), escapechar='\\')
+.. _io.fwf_reader:
.. _io.fwf:
-Files with Fixed Width Columns
+Files with fixed width columns
''''''''''''''''''''''''''''''
While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works
@@ -1679,14 +1670,14 @@ S3 URLs are handled as well but require installing the `S3Fs
df = pd.read_csv('s3://pandas-test/tips.csv')
-If your S3 bucket requires cedentials you will need to set them as environment
+If your S3 bucket requires credentials you will need to set them as environment
variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs
documentation on credentials
`_.
-Writing out Data
+Writing out data
''''''''''''''''
.. _io.store_in_csv:
@@ -1718,8 +1709,6 @@ function takes a number of arguments. Only the first is required.
* ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when
appropriate (default None)
* ``chunksize``: Number of rows to write at a time
-* ``tupleize_cols``: If False (default), write as a list of tuples, otherwise
- write in an expanded line format suitable for ``read_csv``
* ``date_format``: Format string for datetime objects
Writing a formatted string
@@ -1805,7 +1794,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet
json = dfj.to_json()
json
-Orient Options
+Orient options
++++++++++++++
There are a number of different options for the format of the resulting JSON
@@ -1869,7 +1858,7 @@ preservation of metadata including but not limited to dtypes and index names.
index and column labels during round-trip serialization. If you wish to preserve
label ordering use the `split` option as it uses ordered containers.
-Date Handling
+Date handling
+++++++++++++
Writing in ISO date format:
@@ -1910,7 +1899,7 @@ Writing to a file, with a date index and a date column:
with open('test.json') as fh:
print(fh.read())
-Fallback Behavior
+Fallback behavior
+++++++++++++++++
If the JSON serializer cannot handle the container contents directly it will
@@ -2003,7 +1992,7 @@ If a non-default ``orient`` was used when encoding to JSON be sure to pass the s
option here so that decoding produces sensible results, see `Orient Options`_ for an
overview.
-Data Conversion
+Data conversion
+++++++++++++++
The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True``
@@ -2078,7 +2067,7 @@ Dates written in nanoseconds need to be read back in nanoseconds:
json = dfj2.to_json(date_unit='ns')
- # Try to parse timestamps as millseconds -> Won't Work
+ # Try to parse timestamps as milliseconds -> Won't Work
dfju = pd.read_json(json, date_unit='ms')
dfju
@@ -2090,7 +2079,7 @@ Dates written in nanoseconds need to be read back in nanoseconds:
dfju = pd.read_json(json, date_unit='ns')
dfju
-The Numpy Parameter
+The Numpy parameter
+++++++++++++++++++
.. note::
@@ -2186,13 +2175,24 @@ into a flat table.
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
+The ``max_level`` parameter provides more control over which level to stop normalization at.
+With ``max_level=1`` the following snippet normalizes only until the first nesting level of the provided dict.
+
+.. ipython:: python
+
+ data = [{'CreatedBy': {'Name': 'User001'},
+ 'Lookup': {'TextField': 'Some text',
+ 'UserField': {'Id': 'ID001',
+ 'Name': 'Name001'}},
+ 'Image': {'a': 'b'}
+ }]
+ json_normalize(data, max_level=1)
+
.. _io.jsonl:
Line delimited json
'''''''''''''''''''
-.. versionadded:: 0.19.0
-
pandas is able to read and write line-delimited json files that are common in data processing pipelines
using Hadoop or Spark.
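+
+A short sketch with an inline JSON-lines string:
+
+.. code-block:: python
+
+   jsonl = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}'
+   pd.read_json(jsonl, lines=True)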
@@ -2218,7 +2218,7 @@ For line-delimited json files, pandas can also return an iterator which reads in
.. _io.table_schema:
-Table Schema
+Table schema
''''''''''''
.. versionadded:: 0.20.0
@@ -2378,7 +2378,7 @@ HTML
.. _io.read_html:
-Reading HTML Content
+Reading HTML content
''''''''''''''''''''''
.. warning::
@@ -2490,16 +2490,12 @@ Specify values that should be converted to NaN:
dfs = pd.read_html(url, na_values=['No Acquirer'])
-.. versionadded:: 0.19
-
Specify whether to keep the default set of NaN values:
.. code-block:: python
dfs = pd.read_html(url, keep_default_na=False)
-.. versionadded:: 0.19
-
Specify converters for columns. This is useful for numerical text data that has
leading zeros. By default columns that are numerical are cast to numeric
types and the leading zeros are lost. To avoid this, we can convert these
@@ -2511,8 +2507,6 @@ columns to strings.
dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0,
converters={'MNC': str})
-.. versionadded:: 0.19
-
Use some combination of the above:
.. code-block:: python
@@ -2788,16 +2782,17 @@ parse HTML tables in the top-level pandas io function ``read_html``.
Excel files
-----------
-The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and
-Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python
-module. The :meth:`~DataFrame.to_excel` instance method is used for
+The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``)
+files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files
+can be read using either ``xlrd`` or ``openpyxl``.
+The :meth:`~DataFrame.to_excel` instance method is used for
saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv` data.
See the :ref:`cookbook` for some advanced strategies.
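+
+The engine used to read ``.xlsx`` files can be selected explicitly with the
+``engine`` keyword. A minimal sketch (the file name is a placeholder, as in the
+other examples in this document):
+
+.. code-block:: python
+
+   # Read an .xlsx file with the default engine
+   df = pd.read_excel('path_to_file.xlsx')
+
+   # Read the same file explicitly via openpyxl instead
+   df = pd.read_excel('path_to_file.xlsx', engine='openpyxl')
+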
.. _io.excel_reader:
-Reading Excel Files
+Reading Excel files
'''''''''''''''''''
In the most basic use-case, ``read_excel`` takes a path to an Excel
@@ -2879,7 +2874,7 @@ with ``on_demand=True``.
.. _io.excel.specifying_sheets:
-Specifying Sheets
+Specifying sheets
+++++++++++++++++
.. note :: The second argument is ``sheet_name``, not to be confused with ``ExcelFile.sheet_names``.
@@ -2980,7 +2975,7 @@ should be passed to ``index_col`` and ``header``:
os.remove('path_to_file.xlsx')
-Parsing Specific Columns
+Parsing specific columns
++++++++++++++++++++++++
It is often the case that users will insert columns to do temporary computations
@@ -3035,7 +3030,7 @@ the column names, returning names where the callable function evaluates to ``Tru
pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha())
-Parsing Dates
+Parsing dates
+++++++++++++
Datetime-like values are normally automatically converted to the appropriate
@@ -3048,7 +3043,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes:
pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings'])
-Cell Converters
+Cell converters
+++++++++++++++
It is possible to transform the contents of Excel cells via the ``converters``
@@ -3073,7 +3068,7 @@ missing data to recover integer dtype:
pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
-dtype Specifications
+Dtype specifications
++++++++++++++++++++
.. versionadded:: 0.20
@@ -3089,10 +3084,10 @@ no type inference, use the type ``str`` or ``object``.
.. _io.excel_writer:
-Writing Excel Files
+Writing Excel files
'''''''''''''''''''
-Writing Excel Files to Disk
+Writing Excel files to disk
+++++++++++++++++++++++++++
To write a ``DataFrame`` object to a sheet of an Excel file, you can use the
@@ -3138,7 +3133,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
.. _io.excel_writing_buffer:
-Writing Excel Files to Memory
+Writing Excel files to memory
+++++++++++++++++++++++++++++
Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or
@@ -3218,7 +3213,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
.. _io.excel.style:
-Style and Formatting
+Style and formatting
''''''''''''''''''''
The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method.
@@ -3226,7 +3221,31 @@ The look and feel of Excel worksheets created from pandas can be modified using
* ``float_format`` : Format string for floating point numbers (default ``None``).
* ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``).
+Using the `Xlsxwriter`_ engine provides many options for controlling the
+format of an Excel worksheet created with the ``to_excel`` method. Excellent
+examples can be found in the `Xlsxwriter`_ documentation here:
+https://xlsxwriter.readthedocs.io/working_with_pandas.html
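+
+As a minimal sketch of the pattern (file and sheet names are placeholders), the
+underlying ``xlsxwriter`` workbook and worksheet objects can be retrieved from
+an ``ExcelWriter`` and formatted directly:
+
+.. code-block:: python
+
+   df = pd.DataFrame({'A': [1.0, 2.0], 'B': [3.0, 4.0]})
+
+   with pd.ExcelWriter('styled.xlsx', engine='xlsxwriter') as writer:
+       df.to_excel(writer, sheet_name='Sheet1')
+       workbook = writer.book
+       worksheet = writer.sheets['Sheet1']
+       # xlsxwriter format objects control bold, number format, etc.
+       fmt = workbook.add_format({'bold': True, 'num_format': '0.00'})
+       worksheet.set_column('B:B', 18, fmt)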
+
+.. _io.ods:
+
+OpenDocument Spreadsheets
+-------------------------
+.. versionadded:: 0.25
+
+The :func:`~pandas.read_excel` method can also read OpenDocument spreadsheets
+using the ``odfpy`` module. The semantics and features for reading
+OpenDocument spreadsheets match what can be done for `Excel files`_ using
+``engine='odf'``.
+
+.. code-block:: python
+
+ # Returns a DataFrame
+ pd.read_excel('path_to_file.ods', engine='odf')
+
+.. note::
+
+ Currently pandas only supports *reading* OpenDocument spreadsheets. Writing
+ is not implemented.
.. _io.clipboard:
@@ -3249,24 +3268,35 @@ And then import the data directly to a ``DataFrame`` by calling:
.. code-block:: python
- clipdf = pd.read_clipboard()
-
-.. ipython:: python
-
- clipdf
-
+ >>> clipdf = pd.read_clipboard()
+ >>> clipdf
+ A B C
+ x 1 4 p
+ y 2 5 q
+ z 3 6 r
The ``to_clipboard`` method can be used to write the contents of a ``DataFrame`` to
the clipboard. Following which you can paste the clipboard contents into other
applications (CTRL-V on many operating systems). Here we illustrate writing a
``DataFrame`` into clipboard and reading it back.
-.. ipython:: python
+.. code-block:: python
- df = pd.DataFrame(np.random.randn(5, 3))
- df
- df.to_clipboard()
- pd.read_clipboard()
+ >>> df = pd.DataFrame({'A': [1, 2, 3],
+ ... 'B': [4, 5, 6],
+ ... 'C': ['p', 'q', 'r']},
+ ... index=['x', 'y', 'z'])
+ >>> df
+ A B C
+ x 1 4 p
+ y 2 5 q
+ z 3 6 r
+ >>> df.to_clipboard()
+ >>> pd.read_clipboard()
+ A B C
+ x 1 4 p
+ y 2 5 q
+ z 3 6 r
We can see that we got the same content back, which we had earlier written to the clipboard.
@@ -3308,16 +3338,7 @@ any pickled pandas object (or any other pickled object) from file:
.. warning::
- Several internal refactoring have been done while still preserving
- compatibility with pickles created with older versions of pandas. However,
- for such cases, pickled ``DataFrames``, ``Series`` etc, must be read with
- ``pd.read_pickle``, rather than ``pickle.load``.
-
- See `here `__
- and `here `__
- for some examples of compatibility-breaking changes. See
- `this question `__
- for a detailed explanation.
+   :func:`read_pickle` is only guaranteed to be backwards compatible to
+   pandas version 0.20.3.
.. _io.pickle.compression:
@@ -3391,11 +3412,15 @@ both on the writing (serialization), and reading (deserialization).
.. warning::
- This is a very new feature of pandas. We intend to provide certain
- optimizations in the io of the ``msgpack`` data. Since this is marked
- as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release.
+ The msgpack format is deprecated as of 0.25 and will be removed in a future version.
+ It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
+
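+A minimal sketch of such a round trip through pyarrow's IPC stream format
+(the pyarrow calls below are a suggestion and not part of pandas itself):
+
+.. code-block:: python
+
+   import pyarrow as pa
+
+   df = pd.DataFrame({'A': [1, 2, 3], 'B': list('xyz')})
+
+   # Serialize to an in-memory Arrow IPC stream
+   table = pa.Table.from_pandas(df)
+   sink = pa.BufferOutputStream()
+   writer = pa.RecordBatchStreamWriter(sink, table.schema)
+   writer.write_table(table)
+   writer.close()
+   buf = sink.getvalue()
+
+   # Deserialize back into a DataFrame
+   df_roundtrip = pa.ipc.open_stream(buf).read_pandas()
+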
+.. warning::
+
+   :func:`read_msgpack` is only guaranteed to be backwards compatible to
+   pandas version 0.20.3.
.. ipython:: python
+ :okwarning:
df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))
df.to_msgpack('foo.msg')
@@ -3405,6 +3430,7 @@ both on the writing (serialization), and reading (deserialization).
You can pass a list of objects and you will receive them back on deserialization.
.. ipython:: python
+ :okwarning:
pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s)
pd.read_msgpack('foo.msg')
@@ -3412,6 +3438,7 @@ You can pass a list of objects and you will receive them back on deserialization
You can pass ``iterator=True`` to iterate over the unpacked results:
.. ipython:: python
+ :okwarning:
for o in pd.read_msgpack('foo.msg', iterator=True):
print(o)
@@ -3419,6 +3446,7 @@ You can pass ``iterator=True`` to iterate over the unpacked results:
You can pass ``append=True`` to the writer to append to an existing pack:
.. ipython:: python
+ :okwarning:
df.to_msgpack('foo.msg', append=True)
pd.read_msgpack('foo.msg')
@@ -3429,6 +3457,7 @@ can pack arbitrary collections of Python lists, dicts, scalars, while intermixin
pandas objects.
.. ipython:: python
+ :okwarning:
pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'},
{'scalar': 1.}, {'s': s}]})
@@ -3441,20 +3470,22 @@ pandas objects.
os.remove('foo.msg')
os.remove('foo2.msg')
-Read/Write API
+Read/write API
''''''''''''''
Msgpacks can also be read from and written to strings.
.. ipython:: python
+ :okwarning:
df.to_msgpack()
Furthermore you can concatenate the strings to produce a list of the original objects.
.. ipython:: python
+ :okwarning:
- pd.read_msgpack(df.to_msgpack() + s.to_msgpack())
+ pd.read_msgpack(df.to_msgpack() + s.to_msgpack())
.. _io.hdf5:
@@ -3540,10 +3571,10 @@ Closing a Store and using a context manager:
-Read/Write API
+Read/write API
''''''''''''''
-``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing,
+``HDFStore`` supports a top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing,
similar to how ``read_csv`` and ``to_csv`` work.
.. ipython:: python
@@ -3586,7 +3617,7 @@ HDFStore will by default not drop rows that are all missing. This behavior can b
.. _io.hdf5-fixed:
-Fixed Format
+Fixed format
''''''''''''
The examples above show storing using ``put``, which write the HDF5 to ``PyTables`` in a fixed array format, called
@@ -3610,7 +3641,7 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for
.. _io.hdf5-table:
-Table Format
+Table format
''''''''''''
``HDFStore`` supports another ``PyTables`` format on disk, the ``table``
@@ -3652,13 +3683,13 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format.
.. _io.hdf5-keys:
-Hierarchical Keys
+Hierarchical keys
'''''''''''''''''
Keys to a store can be specified as a string. These can be in a
hierarchical path-name like format (e.g. ``foo/bar/bah``), which will
generate a hierarchy of sub-stores (or ``Groups`` in PyTables
-parlance). Keys can be specified with out the leading '/' and are **always**
+parlance). Keys can be specified without the leading '/' and are **always**
absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove
everything in the sub-store and **below**, so be *careful*.
@@ -3719,10 +3750,10 @@ will yield a tuple for each group key along with the relative keys of its conten
.. _io.hdf5-types:
-Storing Types
+Storing types
'''''''''''''
-Storing Mixed Types in a Table
+Storing mixed types in a table
++++++++++++++++++++++++++++++
Storing mixed-dtype data is supported. Strings are stored as a
@@ -3752,7 +3783,7 @@ defaults to `nan`.
store.append('df_mixed', df_mixed, min_itemsize={'values': 50})
df_mixed1 = store.select('df_mixed')
df_mixed1
- df_mixed1.get_dtype_counts()
+ df_mixed1.dtypes.value_counts()
# we have provided a minimum string column size
store.root.df_mixed.table
@@ -3786,7 +3817,7 @@ storing/selecting from homogeneous index ``DataFrames``.
Querying
''''''''
-Querying a Table
+Querying a table
++++++++++++++++
``select`` and ``delete`` operations have an optional criterion that can
@@ -3796,7 +3827,7 @@ data.
A query is specified using the ``Term`` class under the hood, as a boolean expression.
-* ``index`` and ``columns`` are supported indexers of a ``DataFrames``.
+* ``index`` and ``columns`` are supported indexers of ``DataFrames``.
* if ``data_columns`` are specified, these can be used as additional indexers.
Valid comparison operators are:
@@ -3888,7 +3919,7 @@ Use boolean expressions, with in-line function evaluation.
store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']")
-Use and inline column reference
+Use inline column reference.
.. ipython:: python
@@ -3992,7 +4023,7 @@ See `here `` to the first ``append``,
- to set the TOTAL number of expected rows that ``PyTables`` will
- expected. This will optimize read/write performance.
+ to set the TOTAL number of rows that ``PyTables`` will expect.
+ This will optimize read/write performance.
* Duplicate rows can be written to tables, but are filtered out in
selection (with the last items being selected; thus a table is
unique on major, minor pairs)
@@ -4737,7 +4768,7 @@ Read only certain columns of a parquet file.
os.remove('example_fp.parquet')
-Handling Indexes
+Handling indexes
''''''''''''''''
Serializing a ``DataFrame`` to parquet may include the implicit index as one or
@@ -4819,7 +4850,7 @@ The above example creates a partitioned dataset that may look like:
.. _io.sql:
-SQL Queries
+SQL queries
-----------
The :mod:`pandas.io.sql` module provides a collection of query wrappers to both
@@ -4988,7 +5019,7 @@ will convert the data to UTC.
.. _io.sql.method:
-Insertion Method
+Insertion method
++++++++++++++++
.. versionadded:: 0.24.0
@@ -5034,7 +5065,7 @@ Example of a callable using PostgreSQL `COPY clause
table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
-Reading Tables
+Reading tables
''''''''''''''
:func:`~pandas.read_sql_table` will read a database table given the
@@ -5242,12 +5273,12 @@ Full documentation can be found `here `__.
.. _io.stata:
-Stata Format
+Stata format
------------
.. _io.stata_writer:
-Writing to Stata format
+Writing to Stata format
'''''''''''''''''''''''
The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame
@@ -5361,7 +5392,7 @@ values will have ``object`` data type.
.. _io.stata-categorical:
-Categorical Data
+Categorical data
++++++++++++++++
``Categorical`` data can be exported to *Stata* data files as value labeled data.
@@ -5407,7 +5438,7 @@ whether imported ``Categorical`` variables are ordered.
.. _io.sas_reader:
-SAS Formats
+SAS formats
-----------
The top-level function :func:`read_sas` can read (but not write) SAS
@@ -5449,6 +5480,43 @@ web site.
No official documentation is available for the SAS7BDAT format.
+.. _io.spss:
+
+.. _io.spss_reader:
+
+SPSS formats
+------------
+
+.. versionadded:: 0.25.0
+
+The top-level function :func:`read_spss` can read (but not write) SPSS
+`sav` (.sav) and `zsav` (.zsav) format files.
+
+SPSS files contain column names. By default the
+whole file is read, categorical columns are converted into ``pd.Categorical``,
+and a ``DataFrame`` with all columns is returned.
+
+Specify the ``usecols`` parameter to obtain a subset of columns. Specify ``convert_categoricals=False``
+to avoid converting categorical columns into ``pd.Categorical``.
+
+Read an SPSS file:
+
+.. code-block:: python
+
+ df = pd.read_spss('spss_data.sav')
+
+Extract a subset of columns contained in ``usecols`` from an SPSS file and
+avoid converting categorical columns into ``pd.Categorical``:
+
+.. code-block:: python
+
+ df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'],
+ convert_categoricals=False)
+
+More information about the `sav` and `zsav` file formats is available here_.
+
+.. _here: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm
+
.. _io.other:
Other file formats
@@ -5469,7 +5537,7 @@ easy conversion to and from pandas.
.. _io.perf:
-Performance Considerations
+Performance considerations
--------------------------
This is an informal comparison of various IO methods, using pandas
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index 25c486c839b7f..4c0d3b75a4f79 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -70,9 +70,8 @@ some configurable handling of "what to do with the other axes":
::
- pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
- keys=None, levels=None, names=None, verify_integrity=False,
- copy=True)
+ pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
+ levels=None, names=None, verify_integrity=False, copy=True)
* ``objs`` : a sequence or mapping of Series or DataFrame objects. If a
dict is passed, the sorted keys will be used as the `keys` argument, unless
@@ -87,8 +86,6 @@ some configurable handling of "what to do with the other axes":
n - 1. This is useful if you are concatenating objects where the
concatenation axis does not have meaningful indexing information. Note
the index values on the other axes are still respected in the join.
-* ``join_axes`` : list of Index objects. Specific indexes to use for the other
- n - 1 axes instead of performing inner/outer set logic.
* ``keys`` : sequence, default None. Construct hierarchical index using the
passed keys as the outermost level. If multiple levels passed, should
contain tuples.
@@ -147,12 +144,11 @@ Set logic on the other axes
When gluing together multiple DataFrames, you have a choice of how to handle
the other axes (other than the one being concatenated). This can be done in
-the following three ways:
+the following two ways:
* Take the union of them all, ``join='outer'``. This is the default
option as it results in zero information loss.
* Take the intersection, ``join='inner'``.
-* Use a specific index, as passed to the ``join_axes`` argument.
Here is an example of each of these methods. First, the default ``join='outer'``
behavior:
@@ -202,7 +198,13 @@ DataFrame:
.. ipython:: python
- result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])
+ result = pd.concat([df1, df4], axis=1).reindex(df1.index)
+
+Similarly, we could reindex before the concatenation:
+
+.. ipython:: python
+
+ pd.concat([df1, df4.reindex(df1.index)], axis=1)
.. ipython:: python
:suppress:
@@ -814,11 +816,9 @@ The ``indicator`` argument will also accept string arguments, in which case the
.. _merging.dtypes:
-Merge Dtypes
+Merge dtypes
~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
Merging will preserve the dtype of the join keys.
.. ipython:: python
@@ -1361,7 +1361,7 @@ Timeseries friendly merging
.. _merging.merge_ordered:
-Merging Ordered Data
+Merging ordered data
~~~~~~~~~~~~~~~~~~~~
A :func:`merge_ordered` function allows combining time series and other
@@ -1381,11 +1381,9 @@ fill/interpolate missing data:
.. _merging.merge_asof:
-Merging AsOf
+Merging asof
~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
A :func:`merge_asof` is similar to an ordered left-join except that we match on
nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``,
we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index cd70a109b3c77..6c36a6470f841 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -74,7 +74,7 @@ Series and DataFrame objects:
df2['one'] == np.nan
-Integer Dtypes and Missing Data
+Integer dtypes and missing data
-------------------------------
Because ``NaN`` is a float, a column of integers with even one missing values
@@ -105,7 +105,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``.
df2
df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan
df2
- df2.get_dtype_counts()
+ df2.dtypes.value_counts()
.. _missing.inserting:
@@ -175,7 +175,7 @@ account for missing data. For example:
.. _missing_data.numeric_sum:
-Sum/Prod of Empties/Nans
+Sum/prod of empties/nans
~~~~~~~~~~~~~~~~~~~~~~~~
.. warning::
@@ -458,7 +458,6 @@ You can mix pandas' ``reindex`` and ``interpolate`` methods to interpolate
at the new values.
.. ipython:: python
- :okexcept:
ser = pd.Series(np.sort(np.random.uniform(size=100)))
@@ -473,7 +472,7 @@ at the new values.
.. _missing_data.interp_limits:
-Interpolation Limits
+Interpolation limits
--------------------
Like other pandas fill methods, :meth:`~DataFrame.interpolate` accepts a ``limit`` keyword
@@ -523,7 +522,7 @@ the ``limit_area`` parameter restricts filling to either inside or outside value
.. _missing_data.replace:
-Replacing Generic Values
+Replacing generic values
~~~~~~~~~~~~~~~~~~~~~~~~
Often times we want to replace arbitrary values with other values.
@@ -568,7 +567,7 @@ missing and interpolate over them:
.. _missing_data.replace_expression:
-String/Regular Expression Replacement
+String/regular expression replacement
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. note::
@@ -664,7 +663,7 @@ want to use a regular expression.
Anywhere in the above ``replace`` examples that you see a regular expression
a compiled regular expression is valid as well.
-Numeric Replacement
+Numeric replacement
~~~~~~~~~~~~~~~~~~~
:meth:`~DataFrame.replace` is similar to :meth:`~DataFrame.fillna`.
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst
index 4b466c2c44d49..f32a8adfd4d33 100644
--- a/doc/source/user_guide/options.rst
+++ b/doc/source/user_guide/options.rst
@@ -3,7 +3,7 @@
{{ header }}
********************
-Options and Settings
+Options and settings
********************
Overview
@@ -68,7 +68,7 @@ with no argument ``describe_option`` will print out the descriptions for all ava
pd.reset_option("all")
-Getting and Setting Options
+Getting and setting options
---------------------------
As described above, :func:`~pandas.get_option` and :func:`~pandas.set_option`
@@ -120,10 +120,10 @@ are restored automatically when you exit the `with` block:
print(pd.get_option("display.max_columns"))
-Setting Startup Options in python/ipython Environment
+Setting startup options in Python/IPython environment
-----------------------------------------------------
-Using startup scripts for the python/ipython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at:
+Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at:
.. code-block:: none
@@ -157,6 +157,22 @@ lines are replaced by an ellipsis.
df
pd.reset_option('max_rows')
+Once ``display.max_rows`` is exceeded, the ``display.min_rows`` option
+determines how many rows are shown in the truncated repr.
+
+.. ipython:: python
+
+ pd.set_option('max_rows', 8)
+    pd.set_option('min_rows', 4)
+ # below max_rows -> all rows shown
+ df = pd.DataFrame(np.random.randn(7, 2))
+ df
+ # above max_rows -> only min_rows (4) rows shown
+ df = pd.DataFrame(np.random.randn(9, 2))
+ df
+ pd.reset_option('max_rows')
+ pd.reset_option('min_rows')
+
``display.expand_frame_repr`` allows for the representation of
dataframes to stretch across pages, wrapped over the full column vs row-wise.
@@ -266,7 +282,7 @@ The options are 'right', and 'left'.
.. _options.available:
-Available Options
+Available options
-----------------
======================================= ============ ==================================
@@ -352,8 +368,12 @@ display.max_rows 60 This sets the maximum numbe
out various output. For example,
this value determines whether the
repr() for a dataframe prints out
- fully or just a summary repr.
+ fully or just a truncated or summary repr.
'None' value means unlimited.
+display.min_rows 10 The numbers of rows to show in a truncated
+ repr (when `max_rows` is exceeded). Ignored
+ when `max_rows` is set to None or 0. When set
+ to None, follows the value of `max_rows`.
display.max_seq_items 100 when pretty-printing a long sequence,
no more then `max_seq_items` will
be printed. If items are omitted,
@@ -431,6 +451,12 @@ compute.use_bottleneck True Use the bottleneck library
computation if it is installed.
compute.use_numexpr True Use the numexpr library to accelerate
computation if it is installed.
+plotting.backend matplotlib Change the plotting backend to a different
+ backend than the current matplotlib one.
+ Backends can be implemented as third-party
+ libraries implementing the pandas plotting
+ API. They can use other plotting libraries
+ like Bokeh, Altair, etc.
plotting.matplotlib.register_converters True Register custom converters with
matplotlib. Set to False to de-register.
======================================= ============ ==================================
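+
+The plotting backend can be switched through ``pd.set_option``. A minimal
+sketch (the third-party backend name is hypothetical and would need to be
+installed separately):
+
+.. code-block:: python
+
+   # Explicitly select the default matplotlib backend
+   pd.set_option('plotting.backend', 'matplotlib')
+
+   # A third-party backend implementing the pandas plotting API could be
+   # selected the same way, e.g.
+   # pd.set_option('plotting.backend', 'hvplot')
+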
@@ -438,7 +464,7 @@ plotting.matplotlib.register_converters True Register custom converters
.. _basics.console_output:
-Number Formatting
+Number formatting
------------------
pandas also allows you to set how numbers are displayed in the console.
@@ -469,7 +495,7 @@ To round floats on a case-by-case basis, you can also use :meth:`~pandas.Series.
.. _options.east_asian_width:
-Unicode Formatting
+Unicode formatting
------------------
.. warning::
@@ -532,7 +558,7 @@ However, setting this option incorrectly for your terminal will cause these char
.. _options.table_schema:
-Table Schema Display
+Table schema display
--------------------
.. versionadded:: 0.20.0
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index 8ad78a68977ad..dd6d3062a8f0a 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -3,7 +3,7 @@
{{ header }}
**************************
-Reshaping and Pivot Tables
+Reshaping and pivot tables
**************************
Reshaping by pivoting DataFrame objects
@@ -186,7 +186,7 @@ removed.
.. _reshaping.stack_multiple:
-Multiple Levels
+Multiple levels
~~~~~~~~~~~~~~~
You may also stack or unstack more than one level at a time by passing a list
@@ -214,7 +214,7 @@ not a mixture of the two).
# from above is equivalent to:
df.stack(level=[1, 2])
-Missing Data
+Missing data
~~~~~~~~~~~~
These functions are intelligent about handling missing data and do not expect
@@ -254,8 +254,6 @@ values will be set to ``NaN``.
df3
df3.unstack()
-.. versionadded:: 0.18.0
-
Alternatively, unstack takes an optional ``fill_value`` argument, for specifying
the value of missing data.
@@ -471,7 +469,7 @@ If ``crosstab`` receives only two Series, it will provide a frequency table.
'C': [1, 1, np.nan, 1, 1]})
df
- pd.crosstab(df.A, df.B)
+ pd.crosstab(df['A'], df['B'])
Any input passed containing ``Categorical`` data will have **all** of its
categories included in the cross-tabulation, even if the actual data does
@@ -486,20 +484,18 @@ not contain any instances of a particular category.
Normalization
~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
-
Frequency tables can also be normalized to show percentages rather than counts
using the ``normalize`` argument:
.. ipython:: python
- pd.crosstab(df.A, df.B, normalize=True)
+ pd.crosstab(df['A'], df['B'], normalize=True)
``normalize`` can also normalize values within each row or within each column:
.. ipython:: python
- pd.crosstab(df.A, df.B, normalize='columns')
+ pd.crosstab(df['A'], df['B'], normalize='columns')
``crosstab`` can also be passed a third ``Series`` and an aggregation function
(``aggfunc``) that will be applied to the values of the third ``Series`` within
@@ -507,16 +503,16 @@ each group defined by the first two ``Series``:
.. ipython:: python
- pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum)
+ pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum)
-Adding Margins
+Adding margins
~~~~~~~~~~~~~~
Finally, one can also add margins or normalize this output.
.. ipython:: python
- pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum, normalize=True,
+ pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True,
margins=True)
.. _reshaping.tile:
@@ -630,8 +626,6 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict
-.. versionadded:: 0.18.0
-
Sometimes it will be useful to only keep k-1 levels of a categorical
variable to avoid collinearity when feeding the result to statistical models.
You can switch to this mode by turn on ``drop_first``.
@@ -727,7 +721,7 @@ DataFrame will be pivoted in the answers below.
df
-Pivoting with Single Aggregations
+Pivoting with single aggregations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Suppose we wanted to pivot ``df`` such that the ``col`` values are columns,
@@ -775,7 +769,7 @@ and rows occur together a.k.a. "cross tabulation". To do this, we can pass
df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size')
-Pivoting with Multiple Aggregations
+Pivoting with multiple aggregations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We can also perform multiple aggregations. For example, to perform both a
@@ -801,3 +795,53 @@ Note to subdivide over multiple columns we can pass in a list to the
df.pivot_table(
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
+
+.. _reshaping.explode:
+
+Exploding a list-like column
+----------------------------
+
+.. versionadded:: 0.25.0
+
+Sometimes the values in a column are list-like.
+
+.. ipython:: python
+
+ keys = ['panda1', 'panda2', 'panda3']
+ values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
+ df = pd.DataFrame({'keys': keys, 'values': values})
+ df
+
+We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row:
+
+.. ipython:: python
+
+ df['values'].explode()
+
+You can also explode the column in the ``DataFrame``.
+
+.. ipython:: python
+
+ df.explode('values')
+
+:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``.
+
+.. ipython:: python
+
+ s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']])
+ s
+ s.explode()
+
+Here is a typical use case. You have comma-separated strings in a column and want to expand this.
+
+.. ipython:: python
+
+ df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
+ {'var1': 'd,e,f', 'var2': 2}])
+ df
+
+Creating a long-form ``DataFrame`` is now straightforward using ``explode`` and chained operations:
+
+.. ipython:: python
+
+ df.assign(var1=df.var1.str.split(',')).explode('var1')
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
index 6ee11bd78fee9..98fd30f67d05b 100644
--- a/doc/source/user_guide/sparse.rst
+++ b/doc/source/user_guide/sparse.rst
@@ -116,7 +116,7 @@ in many places
.. _sparse.accessor:
-Sparse Accessor
+Sparse accessor
---------------
.. versionadded:: 0.24.0
@@ -142,7 +142,7 @@ See :ref:`api.frame.sparse` for more.
.. _sparse.calculation:
-Sparse Calculation
+Sparse calculation
------------------
You can apply NumPy `ufuncs `_
@@ -239,7 +239,7 @@ Sparse-specific properties, like ``density``, are available on the ``.sparse`` a
df.sparse.density
-**General Differences**
+**General differences**
In a ``SparseDataFrame``, *all* columns were sparse. A :class:`DataFrame` can have a mixture of
sparse and dense columns. As a consequence, assigning new columns to a ``DataFrame`` with sparse
@@ -370,7 +370,7 @@ row and columns coordinates of the matrix. Note that this will consume a signifi
.. _sparse.subclasses:
-Sparse Subclasses
+Sparse subclasses
-----------------
The :class:`SparseSeries` and :class:`SparseDataFrame` classes are deprecated. Visit their
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
index 79a9848704eec..006f928c037bd 100644
--- a/doc/source/user_guide/style.ipynb
+++ b/doc/source/user_guide/style.ipynb
@@ -6,10 +6,6 @@
"source": [
"# Styling\n",
"\n",
- "*New in version 0.17.1*\n",
- "\n",
- "*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your feedback.*\n",
- "\n",
"This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/style.ipynb).\n",
"\n",
"You can apply **conditional formatting**, the visual styling of a DataFrame\n",
@@ -26,7 +22,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Building Styles\n",
+ "## Building styles\n",
"\n",
"Pass your style functions into one of the following methods:\n",
"\n",
@@ -297,7 +293,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Finer Control: Slicing"
+ "## Finer control: slicing"
]
},
{
@@ -410,7 +406,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Builtin Styles"
+ "## Builtin styles"
]
},
{
@@ -612,7 +608,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Sharing Styles"
+ "## Sharing styles"
]
},
{
@@ -754,7 +750,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Table Styles"
+ "### Table styles"
]
},
{
@@ -840,7 +836,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### CSS Classes\n",
+ "### CSS classes\n",
"\n",
"Certain CSS classes are attached to cells.\n",
"\n",
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 87c75e8bcd91f..acb5810e5252a 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -3,7 +3,7 @@
{{ header }}
======================
-Working with Text Data
+Working with text data
======================
.. _text.string_methods:
@@ -81,7 +81,7 @@ and replacing any remaining whitespaces with underscores:
exceptions, other uses are not supported, and may be disabled at a later point.
-Splitting and Replacing Strings
+Splitting and replacing strings
-------------------------------
.. _text.split:
@@ -356,7 +356,7 @@ of the string, the result will be a ``NaN``.
s.str[0]
s.str[1]
-Extracting Substrings
+Extracting substrings
---------------------
.. _text.extract:
@@ -366,13 +366,12 @@ Extract first match in each subject (extract)
.. warning::
- In version 0.18.0, ``extract`` gained the ``expand`` argument. When
- ``expand=False`` it returns a ``Series``, ``Index``, or
+ Before version 0.23, argument ``expand`` of the ``extract`` method defaulted to
+   ``False``. When ``expand=False``, ``extract`` returns a ``Series``, ``Index``, or
``DataFrame``, depending on the subject and regular expression
- pattern (same behavior as pre-0.18.0). When ``expand=True`` it
- always returns a ``DataFrame``, which is more consistent and less
- confusing from the perspective of a user. ``expand=True`` is the
- default since version 0.23.0.
+ pattern. When ``expand=True``, it always returns a ``DataFrame``,
+ which is more consistent and less confusing from the perspective of a user.
+ ``expand=True`` has been the default since version 0.23.0.
The ``extract`` method accepts a `regular expression
`__ with at least one
@@ -468,8 +467,6 @@ Extract all matches in each subject (extractall)
.. _text.extractall:
-.. versionadded:: 0.18.0
-
Unlike ``extract`` (which returns only the first match),
.. ipython:: python
@@ -509,8 +506,6 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as
``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the
same result as a ``Series.str.extractall`` with a default index (starts from 0).
-.. versionadded:: 0.19.0
-
.. ipython:: python
pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
@@ -518,7 +513,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0).
pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
-Testing for Strings that Match or Contain a Pattern
+Testing for strings that match or contain a pattern
---------------------------------------------------
You can check whether elements contain a pattern:
@@ -547,7 +542,7 @@ an extra ``na`` argument so missing values can be considered True or False:
.. _text.indicator:
-Creating Indicator Variables
+Creating indicator variables
----------------------------
You can extract dummy variables from string columns.
@@ -560,8 +555,6 @@ For example if they are separated by a ``'|'``:
String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
-.. versionadded:: 0.18.1
-
.. ipython:: python
idx = pd.Index(['a', 'a|b', np.nan, 'a|c'])
@@ -569,7 +562,7 @@ String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
See also :func:`~pandas.get_dummies`.
-Method Summary
+Method summary
--------------
.. _text.summary:
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 40a8fd3101409..3e46140d79b8e 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -5,7 +5,7 @@
.. _timedeltas.timedeltas:
***********
-Time Deltas
+Time deltas
***********
Timedeltas are differences in times, expressed in difference units, e.g. days, hours, minutes,
@@ -229,7 +229,7 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob
.. _timedeltas.timedeltas_convert:
-Frequency Conversion
+Frequency conversion
--------------------
Timedelta Series, ``TimedeltaIndex``, and ``Timedelta`` scalars can be converted to other 'frequencies' by dividing by another timedelta,
@@ -360,7 +360,7 @@ inferred frequency upon creation:
pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer')
-Generating Ranges of Time Deltas
+Generating ranges of time deltas
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Similar to :func:`date_range`, you can construct regular ranges of a ``TimedeltaIndex``
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index f27e9c677d925..0894edd69c2ae 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -3,7 +3,7 @@
{{ header }}
********************************
-Time Series / Date functionality
+Time series / date functionality
********************************
pandas contains extensive capabilities and features for working with time series data for all domains.
@@ -183,7 +183,7 @@ future releases.
.. _timeseries.converting:
-Converting to Timestamps
+Converting to timestamps
------------------------
To convert a :class:`Series` or list-like object of date-like objects e.g. strings,
@@ -235,7 +235,7 @@ inferred frequency upon creation:
pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer')
-Providing a Format Argument
+Providing a format argument
~~~~~~~~~~~~~~~~~~~~~~~~~~~
In addition to the required datetime string, a ``format`` argument can be passed to ensure specific parsing.
@@ -252,11 +252,9 @@ option, see the Python `datetime documentation`_.
.. _datetime documentation: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
-Assembling Datetime from Multiple DataFrame Columns
+Assembling datetime from multiple DataFrame columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
-
You can also pass a ``DataFrame`` of integer or string columns to assemble into a ``Series`` of ``Timestamps``.
.. ipython:: python
@@ -279,7 +277,7 @@ You can pass only the columns that you need to assemble.
* required: ``year``, ``month``, ``day``
* optional: ``hour``, ``minute``, ``second``, ``millisecond``, ``microsecond``, ``nanosecond``
-Invalid Data
+Invalid data
~~~~~~~~~~~~
The default behavior, ``errors='raise'``, is to raise when unparseable:
@@ -304,7 +302,7 @@ Pass ``errors='coerce'`` to convert unparseable data to ``NaT`` (not a time):
.. _timeseries.converting.epoch:
-Epoch Timestamps
+Epoch timestamps
~~~~~~~~~~~~~~~~
pandas supports converting integer or float epoch times to ``Timestamp`` and
@@ -356,7 +354,7 @@ as timezone-naive timestamps and then localize to the appropriate timezone:
.. _timeseries.converting.epoch_inverse:
-From Timestamps to Epoch
+From timestamps to epoch
~~~~~~~~~~~~~~~~~~~~~~~~
To invert the operation from above, namely, to convert from a ``Timestamp`` to a 'unix' epoch:
@@ -396,7 +394,7 @@ Commonly called 'unix epoch' or POSIX time.
.. _timeseries.daterange:
-Generating Ranges of Timestamps
+Generating ranges of timestamps
-------------------------------
To generate an index with timestamps, you can use either the ``DatetimeIndex`` or
@@ -471,19 +469,9 @@ resulting ``DatetimeIndex``:
.. _timeseries.custom-freq-ranges:
-Custom Frequency Ranges
+Custom frequency ranges
~~~~~~~~~~~~~~~~~~~~~~~
-.. warning::
-
- This functionality was originally exclusive to ``cdate_range``, which is
- deprecated as of version 0.21.0 in favor of ``bdate_range``. Note that
- ``cdate_range`` only utilizes the ``weekmask`` and ``holidays`` parameters
- when custom business day, 'C', is passed as the frequency string. Support has
- been expanded with ``bdate_range`` to work with any custom frequency string.
-
-.. versionadded:: 0.21.0
-
``bdate_range`` can also generate a range of custom frequency dates by using
the ``weekmask`` and ``holidays`` parameters. These parameters will only be
used if a custom frequency string is passed.
@@ -504,7 +492,7 @@ used if a custom frequency string is passed.
.. _timeseries.timestamp-limits:
-Timestamp Limitations
+Timestamp limitations
---------------------
Since pandas represents timestamps in nanosecond resolution, the time span that
@@ -561,7 +549,7 @@ intelligent functionality like selection, slicing, etc.
.. _timeseries.partialindexing:
-Partial String Indexing
+Partial string indexing
~~~~~~~~~~~~~~~~~~~~~~~
Dates and strings that parse to timestamps can be passed as indexing parameters:
@@ -619,8 +607,6 @@ We are stopping on the included end-point as it is part of the index:
dft['2013-1-15':'2013-1-15 12:30:00']
-.. versionadded:: 0.18.0
-
``DatetimeIndex`` partial string indexing also works on a ``DataFrame`` with a ``MultiIndex``:
.. ipython:: python
@@ -648,7 +634,7 @@ Slicing with string indexing also honors UTC offset.
.. _timeseries.slice_vs_exact_match:
-Slice vs. Exact Match
+Slice vs. exact match
~~~~~~~~~~~~~~~~~~~~~
.. versionchanged:: 0.20.0
@@ -719,7 +705,7 @@ Note also that ``DatetimeIndex`` resolution cannot be less precise than day.
series_monthly['2011-12'] # returns Series
-Exact Indexing
+Exact indexing
~~~~~~~~~~~~~~
As discussed in previous section, indexing a ``DatetimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. In contrast, indexing with ``Timestamp`` or ``datetime`` objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*.
@@ -738,7 +724,7 @@ With no defaults.
datetime.datetime(2013, 2, 28, 10, 12, 0)]
-Truncating & Fancy Indexing
+Truncating & fancy indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :meth:`~DataFrame.truncate` convenience function is provided that is similar
@@ -763,7 +749,7 @@ regularity will result in a ``DatetimeIndex``, although frequency is lost:
.. _timeseries.components:
-Time/Date Components
+Time/date components
--------------------
There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DatetimeIndex``.
@@ -805,7 +791,7 @@ on :ref:`.dt accessors`.
.. _timeseries.offsets:
-DateOffset Objects
+DateOffset objects
------------------
In the preceding examples, frequency strings (e.g. ``'D'``) were used to specify
@@ -922,7 +908,7 @@ in the operation).
.. _relativedelta documentation: https://dateutil.readthedocs.io/en/stable/relativedelta.html
-Parametric Offsets
+Parametric offsets
~~~~~~~~~~~~~~~~~~
Some of the offsets can be "parameterized" when created to result in different
@@ -958,7 +944,7 @@ Another example is parameterizing ``YearEnd`` with the specific ending month:
.. _timeseries.offsetseries:
-Using Offsets with ``Series`` / ``DatetimeIndex``
+Using offsets with ``Series`` / ``DatetimeIndex``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Offsets can be used with either a ``Series`` or ``DatetimeIndex`` to
@@ -997,7 +983,7 @@ calculate significantly slower and will show a ``PerformanceWarning``
.. _timeseries.custombusinessdays:
-Custom Business Days
+Custom business days
~~~~~~~~~~~~~~~~~~~~
The ``CDay`` or ``CustomBusinessDay`` class provides a parametric
@@ -1071,7 +1057,7 @@ in the usual way.
.. _timeseries.businesshour:
-Business Hour
+Business hour
~~~~~~~~~~~~~
The ``BusinessHour`` class provides a business hour representation on ``BusinessDay``,
@@ -1133,7 +1119,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine
pd.Timestamp('2014-08-01 17:00') + bh
pd.Timestamp('2014-08-01 23:00') + bh
- # Although 2014-08-02 is Satuaday,
+ # Although 2014-08-02 is Saturday,
# it is valid because it starts from 08-01 (Friday).
pd.Timestamp('2014-08-02 04:00') + bh
@@ -1172,11 +1158,9 @@ following subsection.
.. _timeseries.custombusinesshour:
-Custom Business Hour
+Custom business hour
~~~~~~~~~~~~~~~~~~~~
-.. versionadded:: 0.18.1
-
The ``CustomBusinessHour`` is a mixture of ``BusinessHour`` and ``CustomBusinessDay`` which
allows you to specify arbitrary holidays. ``CustomBusinessHour`` works as the same
as ``BusinessHour`` except that it skips specified custom holidays.
@@ -1205,7 +1189,7 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB
.. _timeseries.offset_aliases:
-Offset Aliases
+Offset aliases
~~~~~~~~~~~~~~
A number of string aliases are given to useful common time series
@@ -1243,7 +1227,7 @@ frequencies. We will refer to these aliases as *offset aliases*.
"U, us", "microseconds"
"N", "nanoseconds"
-Combining Aliases
+Combining aliases
~~~~~~~~~~~~~~~~~
As we have seen previously, the alias and the offset instance are fungible in
@@ -1263,7 +1247,7 @@ You can combine together day and intraday offsets:
pd.date_range(start, periods=10, freq='1D10U')
-Anchored Offsets
+Anchored offsets
~~~~~~~~~~~~~~~~
For some frequencies you can specify an anchoring suffix:
@@ -1308,7 +1292,7 @@ These can be used as arguments to ``date_range``, ``bdate_range``, constructors
for ``DatetimeIndex``, as well as various other timeseries-related functions
in pandas.
-Anchored Offset Semantics
+Anchored offset semantics
~~~~~~~~~~~~~~~~~~~~~~~~~
For those offsets that are anchored to the start or end of specific
@@ -1356,7 +1340,7 @@ it is rolled forward to the next anchor point.
.. _timeseries.holiday:
-Holidays / Holiday Calendars
+Holidays / holiday calendars
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Holidays and calendars provide a simple way to define holiday rules to be used
@@ -1456,7 +1440,7 @@ or calendars with additional rules.
Time Series-Related Instance Methods
------------------------------------
-Shifting / Lagging
+Shifting / lagging
~~~~~~~~~~~~~~~~~~
One may want to *shift* or *lag* the values in a time series back and forward in
@@ -1489,7 +1473,7 @@ changes all the dates in the index by a specified number of offsets:
Note that with ``tshift``, the leading entry is no longer NaN because the data
is not being realigned.
-Frequency Conversion
+Frequency conversion
~~~~~~~~~~~~~~~~~~~~
The primary function for changing frequencies is the :meth:`~Series.asfreq`
@@ -1511,13 +1495,13 @@ method for any gaps that may appear after the frequency conversion.
ts.asfreq(pd.offsets.BDay(), method='pad')
-Filling Forward / Backward
+Filling forward / backward
~~~~~~~~~~~~~~~~~~~~~~~~~~
Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is
documented in the :ref:`missing data section `.
-Converting to Python Datetimes
+Converting to Python datetimes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``DatetimeIndex`` can be converted to an array of Python native
@@ -1528,11 +1512,6 @@ Converting to Python Datetimes
Resampling
----------
-.. warning::
-
- The interface to ``.resample`` has changed in 0.18.0 to be more groupby-like and hence more flexible.
- See the :ref:`whatsnew docs ` for a comparison with prior versions.
-
Pandas has a simple, powerful, and efficient functionality for performing
resampling operations during frequency conversion (e.g., converting secondly
data into 5-minutely data). This is extremely common in, but not limited to,
@@ -1542,8 +1521,8 @@ financial applications.
on each of its groups. See some :ref:`cookbook examples ` for
some advanced strategies.
-Starting in version 0.18.1, the ``resample()`` function can be used directly from
-``DataFrameGroupBy`` objects, see the :ref:`groupby docs `.
+The ``resample()`` method can be used directly from ``DataFrameGroupBy`` objects;
+see the :ref:`groupby docs `.
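+
+A minimal sketch (the column names and dates are made up for illustration):
+
+.. code-block:: python
+
+   idx = pd.date_range('2019-01-01', periods=6, freq='12H')
+   df = pd.DataFrame({'group': list('aaabbb'), 'value': range(6)}, index=idx)
+
+   # Resample each group's time series to daily frequency
+   df.groupby('group')['value'].resample('D').sum()
+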
.. note::
@@ -1654,7 +1633,7 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to
ts[:2].resample('250L').ffill(limit=2)
-Sparse Resampling
+Sparse resampling
~~~~~~~~~~~~~~~~~
Sparse timeseries are the ones where you have a lot fewer points relative
@@ -1807,7 +1786,7 @@ See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more.
.. _timeseries.periods:
-Time Span Representation
+Time span representation
------------------------
Regular intervals of time are represented by ``Period`` objects in pandas while
@@ -1939,11 +1918,9 @@ objects:
.. _timeseries.period_dtype:
-Period Dtypes
+Period dtypes
~~~~~~~~~~~~~
-.. versionadded:: 0.19.0
-
``PeriodIndex`` has a custom ``period`` dtype. This is a pandas extension
dtype similar to the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``).
@@ -1974,7 +1951,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th
dti.astype('period[M]')
-PeriodIndex Partial String Indexing
+PeriodIndex partial string indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `.
@@ -2007,7 +1984,7 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. The exa
dfp['2013-01-01 10H':'2013-01-01 11H']
-Frequency Conversion and Resampling with PeriodIndex
+Frequency conversion and resampling with PeriodIndex
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq``
method. Let's start with the fiscal year 2011, ending in December:
@@ -2078,7 +2055,7 @@ frequencies ``Q-JAN`` through ``Q-DEC``.
.. _timeseries.interchange:
-Converting Between Representations
+Converting between representations
----------------------------------
Timestamped data can be converted to PeriodIndex-ed data using ``to_period``
@@ -2122,7 +2099,7 @@ the quarter end:
.. _timeseries.oob:
-Representing Out-of-Bounds Spans
+Representing out-of-bounds spans
--------------------------------
If you have data that is outside of the ``Timestamp`` bounds, see :ref:`Timestamp limitations `,
@@ -2156,7 +2133,7 @@ These can easily be converted to a ``PeriodIndex``:
.. _timeseries.timezone:
-Time Zone Handling
+Time zone handling
------------------
pandas provides rich support for working with timestamps in different time
@@ -2164,7 +2141,7 @@ zones using the ``pytz`` and ``dateutil`` libraries or class:`datetime.timezone`
objects from the standard library.
-Working with Time Zones
+Working with time zones
~~~~~~~~~~~~~~~~~~~~~~~
By default, pandas objects are time zone unaware:
@@ -2320,7 +2297,7 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None)
.. _timeseries.timezone_ambiguous:
-Ambiguous Times when Localizing
+Ambiguous times when localizing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``tz_localize`` may not be able to determine the UTC offset of a timestamp
@@ -2354,7 +2331,7 @@ Handle these ambiguous times by specifying the following.
.. _timeseries.timezone_nonexistent:
-Nonexistent Times when Localizing
+Nonexistent times when localizing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A DST transition may also shift the local time ahead by 1 hour creating nonexistent
@@ -2392,7 +2369,7 @@ Transform nonexistent times to ``NaT`` or shift the times.
.. _timeseries.timezone_series:
-Time Zone Series Operations
+Time zone series operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :class:`Series` with time zone **naive** values is
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 2448d0e5d9930..fa16b2f216610 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -23,7 +23,7 @@ libraries that go beyond the basics documented here.
.. _visualization.basic:
-Basic Plotting: ``plot``
+Basic plotting: ``plot``
------------------------
We will demonstrate the basics, see the :ref:`cookbook` for
@@ -97,7 +97,7 @@ You can plot one column versus another using the `x` and `y` keywords in
.. _visualization.other:
-Other Plots
+Other plots
-----------
Plotting methods allow for a handful of plot styles other than the
@@ -311,7 +311,7 @@ The ``by`` keyword can be specified to plot grouped histograms:
.. _visualization.box:
-Box Plots
+Box plots
~~~~~~~~~
Boxplot can be drawn calling :meth:`Series.plot.box` and :meth:`DataFrame.plot.box`,
@@ -438,10 +438,6 @@ columns:
.. _visualization.box.return:
-.. warning::
-
- The default changed from ``'dict'`` to ``'axes'`` in version 0.19.0.
-
In ``boxplot``, the return type can be controlled by the ``return_type``, keyword. The valid choices are ``{"axes", "dict", "both", None}``.
Faceting, created by ``DataFrame.boxplot`` with the ``by``
keyword, will affect the output type as well:
@@ -495,7 +491,7 @@ then by the numeric columns.
.. _visualization.area_plot:
-Area Plot
+Area plot
~~~~~~~~~
You can create area plots with :meth:`Series.plot.area` and :meth:`DataFrame.plot.area`.
@@ -531,7 +527,7 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5
.. _visualization.scatter:
-Scatter Plot
+Scatter plot
~~~~~~~~~~~~
Scatter plot can be drawn by using the :meth:`DataFrame.plot.scatter` method.
@@ -599,7 +595,7 @@ See the :meth:`scatter ` method and the
.. _visualization.hexbin:
-Hexagonal Bin Plot
+Hexagonal bin plot
~~~~~~~~~~~~~~~~~~
You can create hexagonal bin plots with :meth:`DataFrame.plot.hexbin`.
@@ -762,7 +758,7 @@ See the `matplotlib pie documentation `). These can be used
to control additional styling, beyond what pandas provides.
-Controlling the Legend
+Controlling the legend
~~~~~~~~~~~~~~~~~~~~~~
You may set the ``legend`` argument to ``False`` to hide the legend, which is
@@ -1140,7 +1136,7 @@ You may pass ``logy`` to get a log-scale Y axis.
See also the ``logx`` and ``loglog`` keyword arguments.
-Plotting on a Secondary Y-axis
+Plotting on a secondary y-axis
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To plot data on a secondary y-axis, use the ``secondary_y`` keyword:
@@ -1152,10 +1148,10 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword:
.. ipython:: python
- df.A.plot()
+ df['A'].plot()
@savefig series_plot_secondary_y.png
- df.B.plot(secondary_y=True, style='g')
+ df['B'].plot(secondary_y=True, style='g')
.. ipython:: python
:suppress:
@@ -1194,7 +1190,7 @@ with "(right)" in the legend. To turn off the automatic marking, use the
plt.close('all')
-Suppressing Tick Resolution Adjustment
+Suppressing tick resolution adjustment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pandas includes automatic tick resolution adjustment for regular frequency
@@ -1209,7 +1205,7 @@ Here is the default behavior, notice how the x-axis tick labeling is performed:
plt.figure()
@savefig ser_plot_suppress.png
- df.A.plot()
+ df['A'].plot()
.. ipython:: python
:suppress:
@@ -1223,7 +1219,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior:
plt.figure()
@savefig ser_plot_suppress_parm.png
- df.A.plot(x_compat=True)
+ df['A'].plot(x_compat=True)
.. ipython:: python
:suppress:
@@ -1239,16 +1235,16 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`:
@savefig ser_plot_suppress_context.png
with pd.plotting.plot_params.use('x_compat', True):
- df.A.plot(color='r')
- df.B.plot(color='g')
- df.C.plot(color='b')
+ df['A'].plot(color='r')
+ df['B'].plot(color='g')
+ df['C'].plot(color='b')
.. ipython:: python
:suppress:
plt.close('all')
-Automatic Date Tick Adjustment
+Automatic date tick adjustment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. versionadded:: 0.20.0
@@ -1276,7 +1272,7 @@ with the ``subplots`` keyword:
plt.close('all')
-Using Layout and Targeting Multiple Axes
+Using layout and targeting multiple axes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The layout of subplots can be specified by the ``layout`` keyword. It can accept
@@ -1377,7 +1373,7 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a
.. _visualization.errorbars:
-Plotting With Error Bars
+Plotting with error bars
~~~~~~~~~~~~~~~~~~~~~~~~
Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Series.plot`.
@@ -1423,7 +1419,7 @@ Here is an example of one way to easily plot group means with standard deviation
.. _visualization.table:
-Plotting Tables
+Plotting tables
~~~~~~~~~~~~~~~
Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and :meth:`Series.plot` with a ``table`` keyword. The ``table`` keyword can accept ``bool``, :class:`DataFrame` or :class:`Series`. The simple way to draw a table is to specify ``table=True``. Data will be transposed to meet matplotlib's default layout.
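A small sketch of the ``table`` keyword described above (figure size and data are arbitrary):

.. code-block:: python

   import matplotlib.pyplot as plt
   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c'])

   fig, ax = plt.subplots(figsize=(7, 6))
   ax.xaxis.set_visible(False)   # the table is drawn where the x labels would be
   df.plot(table=True, ax=ax)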
@@ -1632,18 +1628,3 @@ when plotting a large number of points.
:suppress:
plt.close('all')
-
-
-.. _rplot:
-
-
-Trellis plotting interface
---------------------------
-
-.. warning::
-
- The ``rplot`` trellis plotting interface has been **removed**. Please use
- external packages like `seaborn `_ for
- similar but more refined functionality and refer to our 0.18.1 documentation
- `here `__
- for how to convert to using it.
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 6c529d2e2e5f3..fe80cc8bb959a 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -10,6 +10,24 @@ This is the list of changes to pandas between each release. For full details,
see the commit logs at http://github.com/pandas-dev/pandas. For install and
upgrade instructions, see :ref:`install`.
+Version 1.0
+-----------
+
+.. toctree::
+ :maxdepth: 2
+
+ v1.0.0
+
+Version 0.25
+------------
+
+.. toctree::
+ :maxdepth: 2
+
+ v0.25.2
+ v0.25.1
+ v0.25.0
+
Version 0.24
------------
diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst
index 9d497f2fc658d..2e0442364b2f3 100644
--- a/doc/source/whatsnew/v0.10.0.rst
+++ b/doc/source/whatsnew/v0.10.0.rst
@@ -255,7 +255,7 @@ Convenience methods ``ffill`` and ``bfill`` have been added:
New features
~~~~~~~~~~~~
-Wide DataFrame Printing
+Wide DataFrame printing
~~~~~~~~~~~~~~~~~~~~~~~
Instead of printing the summary information, pandas now splits the string
@@ -290,7 +290,7 @@ The width of each line can be changed via 'line_width' (80 by default):
wide_frame
-Updated PyTables Support
+Updated PyTables support
~~~~~~~~~~~~~~~~~~~~~~~~
:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect.
@@ -490,7 +490,7 @@ Updated PyTables Support
however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire
file and write it out using the new format to take advantage of the updates.
-N Dimensional Panels (Experimental)
+N dimensional Panels (experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Adding experimental support for Panel4D and factory functions to create n-dimensional named panels.
@@ -498,7 +498,7 @@ Here is a taste of what to expect.
.. code-block:: ipython
- In [58]: p4d = Panel4D(randn(2, 2, 5, 4),
+ In [58]: p4d = Panel4D(np.random.randn(2, 2, 5, 4),
....: labels=['Label1','Label2'],
....: items=['Item1', 'Item2'],
....: major_axis=date_range('1/1/2000', periods=5),
diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst
index b5b2b889732cd..c4251f70d85b6 100644
--- a/doc/source/whatsnew/v0.10.1.rst
+++ b/doc/source/whatsnew/v0.10.1.rst
@@ -89,7 +89,7 @@ You can now store ``datetime64`` in data columns
store.append('df_mixed', df_mixed)
df_mixed1 = store.select('df_mixed')
df_mixed1
- df_mixed1.get_dtype_counts()
+ df_mixed1.dtypes.value_counts()
You can pass ``columns`` keyword to select to filter a list of the return
columns, this is equivalent to passing a
@@ -170,7 +170,7 @@ combined result, by using ``where`` on a selector table.
df_mt, selector='df1_mt')
store
- # indiviual tables were created
+ # individual tables were created
store.select('df1_mt')
store.select('df2_mt')
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
index 0dfcfca9a7464..148ee349b049c 100644
--- a/doc/source/whatsnew/v0.11.0.rst
+++ b/doc/source/whatsnew/v0.11.0.rst
@@ -20,7 +20,7 @@ of useful recipes in pandas (and that we want contributions!).
There are several libraries that are now :ref:`Recommended Dependencies `
-Selection Choices
+Selection choices
~~~~~~~~~~~~~~~~~
Starting in 0.11.0, object selection has had a number of user-requested additions in
@@ -56,7 +56,7 @@ three types of multi-axis indexing.
See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical `.
-Selection Deprecations
+Selection deprecations
~~~~~~~~~~~~~~~~~~~~~~
Starting in version 0.11.0, these methods *may* be deprecated in future versions.
@@ -88,7 +88,7 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe
df3
df3.dtypes
-Dtype Conversion
+Dtype conversion
~~~~~~~~~~~~~~~~
This is lower-common-denominator upcasting, meaning you get the dtype which can accommodate all of the types
@@ -103,7 +103,7 @@ Conversion
df3.astype('float32').dtypes
-Mixed Conversion
+Mixed conversion
.. code-block:: ipython
@@ -134,7 +134,7 @@ Mixed Conversion
E int32
dtype: object
-Forcing Date coercion (and setting ``NaT`` when not datelike)
+Forcing date coercion (and setting ``NaT`` when not datelike)
.. code-block:: ipython
@@ -154,10 +154,10 @@ Forcing Date coercion (and setting ``NaT`` when not datelike)
5 2001-01-05
dtype: datetime64[ns]
-Dtype Gotchas
+Dtype gotchas
~~~~~~~~~~~~~
-**Platform Gotchas**
+**Platform gotchas**
Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``,
*regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify
@@ -185,7 +185,7 @@ The following will all result in ``int64`` dtypes
Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms!
-**Upcasting Gotchas**
+**Upcasting gotchas**
Performing indexing operations on integer type data can easily upcast the data.
The dtype of the input data will be preserved in cases where ``nans`` are not introduced.
@@ -280,7 +280,7 @@ While float dtypes are unchanged.
E int32
dtype: object
-Datetimes Conversion
+Datetimes conversion
~~~~~~~~~~~~~~~~~~~~
Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value,
@@ -296,7 +296,7 @@ Furthermore ``datetime64[ns]`` columns are created by default, when passed datet
df
# datetime64[ns] out of the box
- df.get_dtype_counts()
+ df.dtypes.value_counts()
# use the traditional nan, which is mapped to NaT internally
df.loc[df.index[2:4], ['A', 'timestamp']] = np.nan
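The two hunks above swap the removed ``get_dtype_counts()`` for ``dtypes.value_counts()``; a hedged sketch of the replacement on invented data:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({'A': np.random.randn(3),
                      'B': [1, 2, 3],
                      'timestamp': pd.Timestamp('20010102')})

   # number of columns per dtype, replacing the removed get_dtype_counts()
   df.dtypes.value_counts()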
@@ -377,15 +377,31 @@ Enhancements
- ``Squeeze`` to possibly remove length 1 dimensions from an object.
- .. ipython:: python
- :okwarning:
-
- p = pd.Panel(np.random.randn(3, 4, 4), items=['ItemA', 'ItemB', 'ItemC'],
- major_axis=pd.date_range('20010102', periods=4),
- minor_axis=['A', 'B', 'C', 'D'])
- p
- p.reindex(items=['ItemA']).squeeze()
- p.reindex(items=['ItemA'], minor=['B']).squeeze()
+ .. code-block:: python
+
+ >>> p = pd.Panel(np.random.randn(3, 4, 4), items=['ItemA', 'ItemB', 'ItemC'],
+ ... major_axis=pd.date_range('20010102', periods=4),
+ ... minor_axis=['A', 'B', 'C', 'D'])
+ >>> p
+
+ Dimensions: 3 (items) x 4 (major_axis) x 4 (minor_axis)
+ Items axis: ItemA to ItemC
+ Major_axis axis: 2001-01-02 00:00:00 to 2001-01-05 00:00:00
+ Minor_axis axis: A to D
+
+ >>> p.reindex(items=['ItemA']).squeeze()
+ A B C D
+ 2001-01-02 0.926089 -2.026458 0.501277 -0.204683
+ 2001-01-03 -0.076524 1.081161 1.141361 0.479243
+ 2001-01-04 0.641817 -0.185352 1.824568 0.809152
+ 2001-01-05 0.575237 0.669934 1.398014 -0.399338
+
+ >>> p.reindex(items=['ItemA'], minor=['B']).squeeze()
+ 2001-01-02 -2.026458
+ 2001-01-03 1.081161
+ 2001-01-04 -0.185352
+ 2001-01-05 0.669934
+ Freq: D, Name: B, dtype: float64
- In ``pd.io.data.Options``,
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index ff549f10a97c3..0a74d67486715 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -177,7 +177,7 @@ API changes
``__repr__``). Plus string safety throughout. Now employed in many places
throughout the pandas library. (:issue:`4090`, :issue:`4092`)
-I/O Enhancements
+I/O enhancements
~~~~~~~~~~~~~~~~
- ``pd.read_html()`` can now parse HTML strings, files or urls and return
@@ -282,7 +282,7 @@ I/O Enhancements
- ``read_csv`` will now throw a more informative error message when a file
contains no columns, e.g., all newline characters
-Other Enhancements
+Other enhancements
~~~~~~~~~~~~~~~~~~
- ``DataFrame.replace()`` now allows regular expressions on contained
@@ -371,7 +371,7 @@ Other Enhancements
is detected (:issue:`4214`)
-Experimental Features
+Experimental features
~~~~~~~~~~~~~~~~~~~~~
- Added experimental ``CustomBusinessDay`` class to support ``DateOffsets``
@@ -398,7 +398,7 @@ Experimental Features
dts = pd.date_range(dt, periods=5, freq=bday_egypt)
print(pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())))
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Plotting functions now raise a ``TypeError`` before trying to plot anything
diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst
index 095d1807ca873..ab48594ddadab 100644
--- a/doc/source/whatsnew/v0.13.0.rst
+++ b/doc/source/whatsnew/v0.13.0.rst
@@ -203,7 +203,7 @@ API changes
- ``Series.argmin`` and ``Series.argmax`` are now aliased to ``Series.idxmin`` and ``Series.idxmax``. These return the *index* of the
min or max element respectively. Prior to 0.13.0 these would return the position of the min / max element. (:issue:`6214`)
-Prior Version Deprecations/Changes
+Prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These were announced changes in 0.12 or prior that are taking effect as of 0.13.0
@@ -234,7 +234,7 @@ Deprecated in 0.13.0
behavior is the default, but the new behavior is available through the
keyword argument ``as_indexer=True``.
-Indexing API Changes
+Indexing API changes
~~~~~~~~~~~~~~~~~~~~
Prior to 0.13, it was impossible to use a label indexer (``.loc/.ix``) to set a value that
@@ -305,7 +305,7 @@ A Panel setting operation on an arbitrary axis aligns the input to the Panel
2001-01-14 30.0 32.0
2001-01-15 30.0 32.0
-Float64Index API Change
+Float64Index API change
~~~~~~~~~~~~~~~~~~~~~~~
- Added a new index type, ``Float64Index``. This will be automatically created when passing floating values in index creation.
@@ -369,7 +369,7 @@ Float64Index API Change
In [3]: pd.Series(range(5))[3.0]
Out[3]: 3
-HDFStore API Changes
+HDFStore API changes
~~~~~~~~~~~~~~~~~~~~
- Query Format Changes. A much more string-like query format is now supported. See :ref:`the docs`.
@@ -468,7 +468,7 @@ HDFStore API Changes
via the option ``io.hdf.dropna_table`` (:issue:`4625`)
- pass through store creation arguments; can be used to support in-memory stores
-DataFrame repr Changes
+DataFrame repr changes
~~~~~~~~~~~~~~~~~~~~~~
The HTML and plain text representations of :class:`DataFrame` now show
@@ -829,6 +829,7 @@ Experimental
Since this is an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release.
.. ipython:: python
+ :okwarning:
df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))
df.to_msgpack('foo.msg')
@@ -841,6 +842,7 @@ Experimental
You can pass ``iterator=True`` to iterator over the unpacked results
.. ipython:: python
+ :okwarning:
for o in pd.read_msgpack('foo.msg', iterator=True):
print(o)
@@ -915,7 +917,7 @@ Experimental
.. _whatsnew_0130.refactoring:
-Internal Refactoring
+Internal refactoring
~~~~~~~~~~~~~~~~~~~~
In 0.13.0 there is a major refactor primarily to subclass ``Series`` from
@@ -1030,7 +1032,7 @@ to unify methods and behaviors. Series formerly subclassed directly from
.. _release.bug_fixes-0.13.0:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- ``HDFStore``
diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst
index 161b0ef395f05..6242c40d44bf8 100644
--- a/doc/source/whatsnew/v0.13.1.rst
+++ b/doc/source/whatsnew/v0.13.1.rst
@@ -43,7 +43,7 @@ Highlights include:
df.loc[0, 'A'] = np.nan
df
-Output Formatting Enhancements
+Output formatting enhancements
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- df.info() view now display dtype info per column (:issue:`5682`)
@@ -179,7 +179,7 @@ API changes
[0 rows x 2 columns]
-Prior Version Deprecations/Changes
+Prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are no announced changes in 0.13 or prior that are taking effect as of 0.13.1
@@ -394,7 +394,7 @@ There are no experimental changes in 0.13.1
.. _release.bug_fixes-0.13.1:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`)
diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index d61b9a40438f8..25a75492d78fb 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -245,7 +245,7 @@ API changes
.. _whatsnew_0140.display:
-Display Changes
+Display changes
~~~~~~~~~~~~~~~
- The default way of printing large DataFrames has changed. DataFrames
@@ -301,7 +301,7 @@ Display Changes
.. _whatsnew_0140.parsing:
-Text Parsing API Changes
+Text parsing API changes
~~~~~~~~~~~~~~~~~~~~~~~~
:func:`read_csv`/:func:`read_table` will now be noisier w.r.t invalid options rather than falling back to the ``PythonParser``.
@@ -321,10 +321,10 @@ Text Parsing API Changes
.. _whatsnew_0140.groupby:
-Groupby API Changes
+Groupby API changes
~~~~~~~~~~~~~~~~~~~
-More consistent behaviour for some groupby methods:
+More consistent behavior for some groupby methods:
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:
@@ -473,7 +473,7 @@ Some other enhancements to the sql functions include:
.. _whatsnew_0140.slicers:
-MultiIndexing Using Slicers
+MultiIndexing using slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
In 0.14.0 we added a new way to slice MultiIndexed objects.
@@ -625,7 +625,7 @@ Plotting
.. _whatsnew_0140.prior_deprecations:
-Prior Version Deprecations/Changes
+Prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are prior version deprecations that are taking effect as of 0.14.0.
@@ -731,7 +731,7 @@ Deprecations
.. _whatsnew_0140.knownissues:
-Known Issues
+Known issues
~~~~~~~~~~~~
- OpenPyXL 2.0.0 breaks backwards compatibility (:issue:`7169`)
@@ -816,7 +816,7 @@ Enhancements
- Implemented ``Panel.pct_change`` (:issue:`6904`)
- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max,
:func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`)
-- ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
+- ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
- :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of
quantiles.
- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`)
diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst
index 98ebbd6a52344..26018c5745a11 100644
--- a/doc/source/whatsnew/v0.14.1.rst
+++ b/doc/source/whatsnew/v0.14.1.rst
@@ -169,7 +169,7 @@ Experimental
.. _whatsnew_0141.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``DataFrame.where`` with a symmetric shaped frame and a passed other of a DataFrame (:issue:`7506`)
- Bug in Panel indexing with a MultiIndex axis (:issue:`7516`)
@@ -247,7 +247,7 @@ Bug Fixes
- Bug in ``DatetimeIndex`` comparison doesn't handle ``NaT`` properly (:issue:`7529`)
- Bug in passing input with ``tzinfo`` to some offsets ``apply``, ``rollforward`` or ``rollback`` resets ``tzinfo`` or raises ``ValueError`` (:issue:`7465`)
- Bug in ``DatetimeIndex.to_period``, ``PeriodIndex.asobject``, ``PeriodIndex.to_timestamp`` doesn't preserve ``name`` (:issue:`7485`)
-- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestanp`` handle ``NaT`` incorrectly (:issue:`7228`)
+- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestamp`` handle ``NaT`` incorrectly (:issue:`7228`)
- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may return normal ``datetime`` (:issue:`7502`)
- Bug in ``resample`` raises ``ValueError`` when target contains ``NaT`` (:issue:`7227`)
- Bug in ``Timestamp.tz_localize`` resets ``nanosecond`` info (:issue:`7534`)
diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst
index f9e47b45f498d..c27ada6ef3b58 100644
--- a/doc/source/whatsnew/v0.15.0.rst
+++ b/doc/source/whatsnew/v0.15.0.rst
@@ -220,7 +220,7 @@ Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow cert
.. _whatsnew_0150.memory:
-Memory Usage
+Memory usage
^^^^^^^^^^^^
Implemented methods to find memory usage of a DataFrame. See the :ref:`FAQ ` for more. (:issue:`6852`).
@@ -339,7 +339,7 @@ Timezone handling improvements
.. _whatsnew_0150.roll:
-Rolling/Expanding Moments improvements
+Rolling/expanding moments improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- :func:`rolling_min`, :func:`rolling_max`, :func:`rolling_cov`, and :func:`rolling_corr`
@@ -701,14 +701,19 @@ Other notable API changes:
This can also be seen in multi-axis indexing with a ``Panel``.
- .. ipython:: python
- :okwarning:
+ .. code-block:: python
+
+ >>> p = pd.Panel(np.arange(2 * 3 * 4).reshape(2, 3, 4),
+ ... items=['ItemA', 'ItemB'],
+ ... major_axis=[1, 2, 3],
+ ... minor_axis=['A', 'B', 'C', 'D'])
+ >>> p
+
+ Dimensions: 2 (items) x 3 (major_axis) x 4 (minor_axis)
+ Items axis: ItemA to ItemB
+ Major_axis axis: 1 to 3
+ Minor_axis axis: A to D
- p = pd.Panel(np.arange(2 * 3 * 4).reshape(2, 3, 4),
- items=['ItemA', 'ItemB'],
- major_axis=[1, 2, 3],
- minor_axis=['A', 'B', 'C', 'D'])
- p
The following would raise ``KeyError`` prior to 0.15.0:
@@ -879,7 +884,7 @@ Other notable API changes:
.. _whatsnew_0150.refactoring:
-Internal Refactoring
+Internal refactoring
^^^^^^^^^^^^^^^^^^^^
In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray``
@@ -1109,7 +1114,7 @@ Performance
.. _whatsnew_0150.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in pivot_table, when using margins and a dict aggfunc (:issue:`8349`)
diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst
index 1091944cb056f..2e036267b5804 100644
--- a/doc/source/whatsnew/v0.15.1.rst
+++ b/doc/source/whatsnew/v0.15.1.rst
@@ -275,7 +275,7 @@ Enhancements
.. _whatsnew_0151.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in unpickling of a ``CustomBusinessDay`` object (:issue:`8591`)
diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
index 9f0449d6a1754..b58eabaed6127 100644
--- a/doc/source/whatsnew/v0.15.2.rst
+++ b/doc/source/whatsnew/v0.15.2.rst
@@ -160,11 +160,16 @@ Other enhancements:
- ``Panel`` now supports the ``all`` and ``any`` aggregation functions. (:issue:`8302`):
- .. ipython:: python
- :okwarning:
+ .. code-block:: python
- p = pd.Panel(np.random.rand(2, 5, 4) > 0.1)
- p.all()
+ >>> p = pd.Panel(np.random.rand(2, 5, 4) > 0.1)
+ >>> p.all()
+ 0 1 2 3
+ 0 True True True True
+ 1 True False True True
+ 2 True True True True
+ 3 False True False True
+ 4 True True True True
- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__.
@@ -191,7 +196,7 @@ Performance
.. _whatsnew_0152.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`)
diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst
index 2cb09325c9466..42b3b9332ca98 100644
--- a/doc/source/whatsnew/v0.16.0.rst
+++ b/doc/source/whatsnew/v0.16.0.rst
@@ -39,7 +39,7 @@ New features
.. _whatsnew_0160.enhancements.assign:
-DataFrame Assign
+DataFrame assign
^^^^^^^^^^^^^^^^
Inspired by `dplyr's
@@ -135,7 +135,7 @@ from a ``scipy.sparse.coo_matrix``:
.. _whatsnew_0160.enhancements.string:
-String Methods Enhancements
+String methods enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Following new methods are accessible via ``.str`` accessor to apply the function to each values. This is intended to make it more consistent with standard methods on strings. (:issue:`9282`, :issue:`9352`, :issue:`9386`, :issue:`9387`, :issue:`9439`)
@@ -228,7 +228,7 @@ sub-class of ``datetime.timedelta``. Mentioned :ref:`here `_ for similar
but more refined functionality (:issue:`3445`).
The documentation includes some examples how to convert your existing code
- using ``rplot`` to seaborn: :ref:`rplot docs `.
+ from ``rplot`` to seaborn `here `__.
- The ``pandas.sandbox.qtpandas`` interface is deprecated and will be removed in a future version.
We refer users to the external package `pandas-qt `_. (:issue:`9615`)
@@ -555,7 +555,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0160.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`:).
@@ -573,7 +573,7 @@ Performance Improvements
.. _whatsnew_0160.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Changed ``.to_html`` to remove leading/trailing spaces in table body (:issue:`4987`)
@@ -638,7 +638,7 @@ Bug Fixes
- ``Series`` number formatting inconsistent when truncated (:issue:`8532`).
- Previous Behavior
+ Previous behavior
.. code-block:: python
@@ -655,7 +655,7 @@ Bug Fixes
129 1.0000
Length: 130, dtype: float64
- New Behavior
+ New behavior
.. code-block:: python
diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst
index cbcb23e356577..502c1287efdbe 100644
--- a/doc/source/whatsnew/v0.16.1.rst
+++ b/doc/source/whatsnew/v0.16.1.rst
@@ -216,7 +216,7 @@ when sampling from rows.
.. _whatsnew_0161.enhancements.string:
-String Methods Enhancements
+String methods enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^
:ref:`Continuing from v0.16.0 `, the following
@@ -279,7 +279,7 @@ enhancements make string operations easier and more consistent with standard pyt
.. _whatsnew_0161.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`)
@@ -351,12 +351,12 @@ Deprecations
.. _whatsnew_0161.index_repr:
-Index Representation
+Index representation
~~~~~~~~~~~~~~~~~~~~
The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`)
-Previous Behavior
+Previous behavior
.. code-block:: ipython
@@ -378,7 +378,7 @@ Previous Behavior
[2013-01-01 00:00:00-05:00, ..., 2013-04-14 00:00:00-04:00]
Length: 104, Freq: D, Timezone: US/Eastern
-New Behavior
+New behavior
.. ipython:: python
@@ -399,7 +399,7 @@ New Behavior
.. _whatsnew_0161.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`)
@@ -409,7 +409,7 @@ Performance Improvements
.. _whatsnew_0161.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug where labels did not appear properly in the legend of ``DataFrame.plot()``, passing ``label=`` arguments works, and Series indices are no longer mutated. (:issue:`9542`)
diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst
index ca0ad8d3ae7f9..543f9c6bbf300 100644
--- a/doc/source/whatsnew/v0.16.2.rst
+++ b/doc/source/whatsnew/v0.16.2.rst
@@ -86,7 +86,7 @@ See the :ref:`documentation ` for more. (:issue:`10129`)
.. _whatsnew_0162.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- Added `rsplit` to Index/Series StringMethods (:issue:`10303`)
@@ -105,7 +105,7 @@ Other Enhancements
.. _whatsnew_0162.api:
-API Changes
+API changes
~~~~~~~~~~~
- ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in the constructor instead of returning an incorrect result (:issue:`10217`).
@@ -113,7 +113,7 @@ API Changes
.. _whatsnew_0162.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved ``Series.resample`` performance with ``dtype=datetime64[ns]`` (:issue:`7754`)
@@ -121,7 +121,7 @@ Performance Improvements
.. _whatsnew_0162.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``Series.hist`` raises an error when a one row ``Series`` was given (:issue:`10214`)
diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst
index 8a3f87e8488ca..67abad659dc8d 100644
--- a/doc/source/whatsnew/v0.17.0.rst
+++ b/doc/source/whatsnew/v0.17.0.rst
@@ -103,7 +103,7 @@ This uses a new-dtype representation as well, that is very similar in look-and-f
There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but
functionally these are the same.
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -115,7 +115,7 @@ This uses a new-dtype representation as well, that is very similar in look-and-f
In [2]: pd.date_range('20130101', periods=3, tz='US/Eastern').dtype
Out[2]: dtype('<M8[ns]')
.. _whatsnew_0170.matheval:
-Support for Math Functions in .eval()
+Support for math functions in .eval()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:meth:`~pandas.eval` now supports calling math functions (:issue:`4893`)
@@ -329,7 +329,7 @@ has been changed to make this keyword unnecessary - the change is shown below.
.. _whatsnew_0170.gbq:
-Google BigQuery Enhancements
+Google BigQuery enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Added ability to automatically create a table/dataset using the :func:`pandas.io.gbq.to_gbq` function if the destination table/dataset does not exist. (:issue:`8325`, :issue:`11121`).
- Added ability to replace an existing table and schema when calling the :func:`pandas.io.gbq.to_gbq` function via the ``if_exists`` argument. See the `docs `__ for more details (:issue:`8325`).
@@ -339,7 +339,7 @@ Google BigQuery Enhancements
.. _whatsnew_0170.east_asian_width:
-Display Alignment with Unicode East Asian Width
+Display alignment with Unicode East Asian width
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
@@ -411,7 +411,7 @@ Other enhancements
bar = pd.Series([1, 2])
baz = pd.Series([4, 5])
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -421,7 +421,7 @@ Other enhancements
0 1 1 4
1 2 2 5
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -609,14 +609,14 @@ In prior versions it was ``errors='ignore'``. Furthermore, the ``coerce`` argume
has been deprecated in favor of ``errors='coerce'``. This means that invalid parsing
will raise rather than return the original input as in previous versions. (:issue:`10636`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
In [2]: pd.to_datetime(['2009-07-31', 'asd'])
Out[2]: array(['2009-07-31', 'asd'], dtype=object)
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -638,7 +638,7 @@ To keep the previous behavior, you can use ``errors='ignore'``:
Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.
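Illustrating the ``errors`` keyword described above (the invalid strings are placeholders):

.. code-block:: python

   import pandas as pd

   # invalid entries become NaT instead of raising
   pd.to_datetime(['2009-07-31', 'asd'], errors='coerce')
   pd.to_timedelta(['1 day', 'garbage'], errors='coerce')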
-Consistent Parsing
+Consistent parsing
""""""""""""""""""
The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has
@@ -648,7 +648,7 @@ Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse year-only datetime
uses the beginning of the year. ``Timestamp`` and ``to_datetime`` may raise ``ValueError`` in some types of datetime-string which ``DatetimeIndex``
can parse, such as a quarterly string.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -663,7 +663,7 @@ Previous Behavior:
v0.17.0 can parse them as below. It works on ``DatetimeIndex`` also.
-New Behavior:
+New behavior:
.. ipython:: python
@@ -681,7 +681,7 @@ New Behavior:
pd.Timestamp.now()
pd.Timestamp.now() + offsets.DateOffset(years=1)
-Changes to Index Comparisons
+Changes to Index comparisons
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Operator equal on ``Index`` should behavior similarly to ``Series`` (:issue:`9947`, :issue:`10637`)
@@ -689,7 +689,7 @@ Operator equal on ``Index`` should behavior similarly to ``Series`` (:issue:`994
Starting in v0.17.0, comparing ``Index`` objects of different lengths will raise
a ``ValueError``. This is to be consistent with the behavior of ``Series``.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -702,7 +702,7 @@ Previous Behavior:
In [4]: pd.Index([1, 2, 3]) == pd.Index([1, 2])
Out[4]: False
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -729,7 +729,7 @@ or it can return False if broadcasting can not be done:
np.array([1, 2, 3]) == np.array([1, 2])
-Changes to Boolean Comparisons vs. None
+Changes to boolean comparisons vs. None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to comparing with ``np.nan``, rather than raise ``TypeError``. (:issue:`1079`).
@@ -740,14 +740,14 @@ Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to compar
s.iloc[1] = None
s
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
In [5]: s == None
TypeError: Could not compare type with Series
-New Behavior:
+New behavior:
.. ipython:: python
@@ -776,7 +776,7 @@ HDFStore dropna behavior
The default behavior for HDFStore write functions with ``format='table'`` is now to keep rows that are all missing. Previously, the behavior was to drop rows that were all missing save the index. The previous behavior can be replicated using the ``dropna=True`` option. (:issue:`9382`)
-Previous Behavior:
+Previous behavior:
.. ipython:: python
@@ -802,7 +802,7 @@ Previous Behavior:
2 2 NaN
-New Behavior:
+New behavior:
.. ipython:: python
@@ -882,7 +882,7 @@ Changes to ``Categorical.unique``
cat
cat.unique()
-Changes to ``bool`` passed as ``header`` in Parsers
+Changes to ``bool`` passed as ``header`` in parsers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In earlier versions of pandas, if a bool was passed the ``header`` argument of
@@ -901,7 +901,7 @@ A ``bool`` input to ``header`` will now raise a ``TypeError``
.. _whatsnew_0170.api_breaking.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
@@ -1016,7 +1016,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0170.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Development support for benchmarking with the `Air Speed Velocity library `_ (:issue:`8361`)
@@ -1039,7 +1039,7 @@ Performance Improvements
.. _whatsnew_0170.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst
index 9de49699b9652..55080240f2a55 100644
--- a/doc/source/whatsnew/v0.17.1.rst
+++ b/doc/source/whatsnew/v0.17.1.rst
@@ -31,7 +31,7 @@ New features
.. _whatsnew_0171.style:
-Conditional HTML Formatting
+Conditional HTML formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
@@ -135,7 +135,7 @@ Deprecations
.. _whatsnew_0171.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Checking monotonic-ness before sorting on an index (:issue:`11080`)
@@ -152,7 +152,7 @@ Performance Improvements
.. _whatsnew_0171.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- ``SparseArray.__iter__()`` now does not cause ``PendingDeprecationWarning`` in Python 3.5 (:issue:`11622`)
diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst
index 9ff6ad7188f5a..a7174c6325f86 100644
--- a/doc/source/whatsnew/v0.18.0.rst
+++ b/doc/source/whatsnew/v0.18.0.rst
@@ -62,7 +62,7 @@ Window functions have been refactored to be methods on ``Series/DataFrame`` obje
df = pd.DataFrame({'A': range(10), 'B': np.random.randn(10)})
df
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -82,7 +82,7 @@ Previous Behavior:
8 7 0.079587
9 8 -0.954504
-New Behavior:
+New behavior:
.. ipython:: python
@@ -145,14 +145,14 @@ This continues to work as before for function or dict-like values.
.. _whatsnew_0180.enhancements.rangeindex:
-Range Index
+Range index
^^^^^^^^^^^
A ``RangeIndex`` has been added to the ``Int64Index`` sub-classes to support a memory saving alternative for common use cases. This has a similar implementation to the python ``range`` object (``xrange`` in python 2), in that it only stores the start, stop, and step values for the index. It will transparently interact with the user API, converting to ``Int64Index`` if needed.
This will now be the default constructed index for ``NDFrame`` objects, rather than previous an ``Int64Index``. (:issue:`939`, :issue:`12070`, :issue:`12071`, :issue:`12109`, :issue:`12888`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -168,7 +168,7 @@ Previous Behavior:
Out[6]: 8000
-New Behavior:
+New behavior:
.. ipython:: python
@@ -341,13 +341,13 @@ In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available throug
s
s.dt.round('D')
-Formatting of Integers in FloatIndex
+Formatting of integers in FloatIndex
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Integers in ``FloatIndex``, e.g. 1., are now formatted with a decimal point and a ``0`` digit, e.g. ``1.0`` (:issue:`11713`)
This change not only affects the display to the console, but also the output of IO methods like ``.to_csv`` or ``.to_html``.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -369,7 +369,7 @@ Previous Behavior:
2,3
-New Behavior:
+New behavior:
.. ipython:: python
@@ -383,7 +383,7 @@ Changes to dtype assignment behaviors
When a DataFrame's slice is updated with a new slice of the same dtype, the dtype of the DataFrame will now remain the same. (:issue:`10503`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -406,7 +406,7 @@ Previous Behavior:
b int64
dtype: object
-New Behavior:
+New behavior:
.. ipython:: python
@@ -419,7 +419,7 @@ New Behavior:
When a DataFrame's integer slice is partially updated with a new slice of floats that could potentially be down-casted to integer without losing precision, the dtype of the slice will be set to float instead of integer.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -443,7 +443,7 @@ Previous Behavior:
10 4 5 1
8 12 7 8 9
-New Behavior:
+New behavior:
.. ipython:: python
@@ -484,7 +484,7 @@ See the `xarray full-documentation here `__
* major_axis (major_axis) int64 0 1 2
* minor_axis (minor_axis) int64 0 1 2 3
-Latex Representation
+Latex representation
^^^^^^^^^^^^^^^^^^^^
``DataFrame`` has gained a ``._repr_latex_()`` method in order to allow for conversion to latex in a ipython/jupyter notebook using nbconvert. (:issue:`11778`)
@@ -981,7 +981,7 @@ assignments are valid for multi-line expressions.
.. _whatsnew_0180.api:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`)
@@ -1074,7 +1074,7 @@ In 0.18.0, this deprecation warning is removed and these will now raise a ``Type
s2 = pd.Series([1, 2, 3], index=list('abc'))
s2
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1104,7 +1104,7 @@ Previous Behavior:
c 3
dtype: int64
-New Behavior:
+New behavior:
For iloc, getting & setting via a float scalar will always raise.
@@ -1180,7 +1180,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0180.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of ``andrews_curves`` (:issue:`11534`)
diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst
index 069395c2e0f36..7e06e5050c5f0 100644
--- a/doc/source/whatsnew/v0.18.1.rst
+++ b/doc/source/whatsnew/v0.18.1.rst
@@ -31,7 +31,7 @@ New features
.. _whatsnew_0181.enhancements.custombusinesshour:
-Custom Business Hour
+Custom business hour
^^^^^^^^^^^^^^^^^^^^
The ``CustomBusinessHour`` is a mixture of ``BusinessHour`` and ``CustomBusinessDay`` which
@@ -199,7 +199,7 @@ On other levels
.. _whatsnew_0181.enhancements.assembling:
-Assembling Datetimes
+Assembling datetimes
^^^^^^^^^^^^^^^^^^^^
``pd.to_datetime()`` has gained the ability to assemble datetimes from a passed in ``DataFrame`` or a dict. (:issue:`8158`).
@@ -226,7 +226,7 @@ You can pass only the columns that you need to assemble.
.. _whatsnew_0181.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- ``pd.read_csv()`` now supports ``delim_whitespace=True`` for the Python engine (:issue:`12958`)
@@ -317,7 +317,7 @@ The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_
'B': [1, 2, 3]})
df
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -333,7 +333,7 @@ Previous Behavior:
1 2
Name: B, dtype: int64
-New Behavior:
+New behavior:
.. ipython:: python
@@ -348,7 +348,7 @@ Furthermore, previously, a ``.groupby`` would always sort, regardless if ``sort=
df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b'])
df['c'] = np.random.randint(0, 4, 100)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -370,7 +370,7 @@ Previous Behavior:
2 -0.720589 0.887163
3 0.859588 -0.636524
-New Behavior:
+New behavior:
.. ipython:: python
@@ -446,7 +446,7 @@ Previous behavior:
2000-11-30 value 13
dtype: int64
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -580,7 +580,7 @@ Deprecations
.. _whatsnew_0181.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved speed of SAS reader (:issue:`12656`, :issue:`12961`)
@@ -601,7 +601,7 @@ Performance Improvements
.. _whatsnew_0181.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file are not even (:issue:`12203`)
- Bug in ``groupby.transform(..)`` when ``axis=1`` is specified with a non-monotonic ordered index (:issue:`12713`)
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index de29a1eb93709..1dad8769a6b39 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -218,7 +218,7 @@ contained the values ``[0, 3]``.
**New behavior**:
.. ipython:: python
- :okwarning:
+ :okexcept:
pd.read_csv(StringIO(data), names=names)
@@ -264,7 +264,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
.. _whatsnew_0190.enhancements.union_categoricals:
-Categorical Concatenation
+Categorical concatenation
^^^^^^^^^^^^^^^^^^^^^^^^^
- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`13763`, :issue:`13846`, :issue:`14173`)
@@ -298,7 +298,7 @@ Categorical Concatenation
.. _whatsnew_0190.enhancements.semi_month_offsets:
-Semi-Month Offsets
+Semi-month offsets
^^^^^^^^^^^^^^^^^^
Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS').
@@ -596,7 +596,7 @@ Comparison operators
Comparison operators raise ``ValueError`` when ``.index`` are different.
-**Previous Behavior** (``Series``):
+**Previous behavior** (``Series``):
``Series`` compared values ignoring the ``.index`` as long as both had the same length:
@@ -631,7 +631,7 @@ Comparison operators raise ``ValueError`` when ``.index`` are different.
s1.eq(s2)
-**Current Behavior** (``DataFrame``, no change):
+**Current behavior** (``DataFrame``, no change):
.. code-block:: ipython
@@ -675,7 +675,7 @@ Logical operators align both ``.index`` of left and right hand side.
s1 & s2.reindex_like(s1)
-**Current Behavior** (``DataFrame``, no change):
+**Current behavior** (``DataFrame``, no change):
.. ipython:: python
@@ -1324,7 +1324,7 @@ operations on that platform.
.. _whatsnew_0190.api.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout (:issue:`14101`).
@@ -1406,7 +1406,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0190.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`)
@@ -1426,7 +1426,7 @@ Performance Improvements
.. _whatsnew_0190.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
@@ -1513,7 +1513,7 @@ Bug Fixes
- Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`)
- Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`)
- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
-- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`)
+- Bug in ``Period`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`)
- Bug in ``pd.set_eng_float_format()`` that would prevent NaN and Inf from formatting (:issue:`11981`)
- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`)
- Clean some compile time warnings in datetime parsing (:issue:`13607`)
diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst
index 12f3e985565e0..a89d1461073bd 100644
--- a/doc/source/whatsnew/v0.19.1.rst
+++ b/doc/source/whatsnew/v0.19.1.rst
@@ -22,7 +22,7 @@ We recommend that all users upgrade to this version.
.. _whatsnew_0191.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
@@ -34,7 +34,7 @@ Performance Improvements
.. _whatsnew_0191.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Source installs from PyPI will now again work without ``cython`` installed, as in previous versions (:issue:`14204`)
diff --git a/doc/source/whatsnew/v0.19.2.rst b/doc/source/whatsnew/v0.19.2.rst
index 14310ceb45b4a..023bc78081ec9 100644
--- a/doc/source/whatsnew/v0.19.2.rst
+++ b/doc/source/whatsnew/v0.19.2.rst
@@ -39,7 +39,7 @@ The ``pd.merge_asof()``, added in 0.19.0, gained some improvements:
.. _whatsnew_0192.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance regression with ``PeriodIndex`` (:issue:`14822`)
@@ -50,7 +50,7 @@ Performance Improvements
.. _whatsnew_0192.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Compat with python 3.6 for pickling of some offsets (:issue:`14685`)
- Compat with python 3.6 for some indexing exception types (:issue:`14684`, :issue:`14689`)
diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
index 51c8c488fb9d9..ef6108ae3ec90 100644
--- a/doc/source/whatsnew/v0.20.0.rst
+++ b/doc/source/whatsnew/v0.20.0.rst
@@ -151,7 +151,7 @@ commonly called 'unix epoch' or POSIX time. This was the previous default, so th
.. _whatsnew_0200.enhancements.groupby_access:
-Groupby Enhancements
+Groupby enhancements
^^^^^^^^^^^^^^^^^^^^
Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names. Previously, only column names could be referenced. This allows to easily group by a column and index level at the same time. (:issue:`5677`)
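A minimal sketch of grouping by a column and an index level together, as described above (names are invented):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                      'value': [1, 2, 3, 4]},
                     index=pd.Index(['x', 'y', 'x', 'y'], name='idx'))

   # 'key' is a column, 'idx' is an index level name
   df.groupby(['key', 'idx']).sum()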
@@ -240,7 +240,7 @@ The default is to infer the compression type from the extension (``compression='
.. _whatsnew_0200.enhancements.uint64_support:
-UInt64 Support Improved
+UInt64 support improved
^^^^^^^^^^^^^^^^^^^^^^^
Pandas has significantly improved support for operations involving unsigned,
@@ -263,7 +263,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
.. _whatsnew_0200.enhancements.groupy_categorical:
-GroupBy on Categoricals
+GroupBy on categoricals
^^^^^^^^^^^^^^^^^^^^^^^
In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
@@ -280,7 +280,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
ordered=True)})
df
-**Previous Behavior**:
+**Previous behavior**:
.. code-block:: ipython
@@ -288,7 +288,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
---------------------------------------------------------------------------
ValueError: items in new_categories are not the same as in old categories
-**New Behavior**:
+**New behavior**:
.. ipython:: python
@@ -296,7 +296,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
.. _whatsnew_0200.enhancements.table_schema:
-Table Schema Output
+Table schema output
^^^^^^^^^^^^^^^^^^^
The new orient ``'table'`` for :meth:`DataFrame.to_json`
@@ -457,7 +457,7 @@ Selecting via a scalar value that is contained *in* the intervals.
.. _whatsnew_0200.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`)
@@ -580,7 +580,7 @@ Map on Index types now return other Index types
mi = pd.MultiIndex.from_tuples([(1, 2), (2, 4)])
mi
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -596,7 +596,7 @@ Previous Behavior:
In [8]: mi.map(lambda x: x[0])
Out[8]: array([1, 2])
-New Behavior:
+New behavior:
.. ipython:: python
@@ -616,7 +616,7 @@ New Behavior:
.tz_localize('Asia/Tokyo'))
s
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -627,7 +627,7 @@ Previous Behavior:
2 2
dtype: int32
-New Behavior:
+New behavior:
.. ipython:: python
@@ -653,7 +653,7 @@ Previous behaviour:
In [2]: idx.hour
Out[2]: array([ 0, 10, 20, 6, 16], dtype=int32)
-New Behavior:
+New behavior:
.. ipython:: python
@@ -697,7 +697,7 @@ data-types would yield different return types. These are now made consistent. (:
...: pd.Timestamp('20160101', tz='US/Eastern')])
Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]')
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -727,7 +727,7 @@ data-types would yield different return types. These are now made consistent. (:
In [2]: pd.unique(pd.Series(list('baabc'), dtype='category'))
Out[2]: array(['b', 'a', 'c'], dtype=object)
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -737,7 +737,7 @@ data-types would yield different return types. These are now made consistent. (:
.. _whatsnew_0200.api_breaking.s3:
-S3 File Handling
+S3 file handling
^^^^^^^^^^^^^^^^
pandas now uses `s3fs `_ for handling S3 connections. This shouldn't break
@@ -746,7 +746,7 @@ in prior versions of pandas. (:issue:`11915`).
.. _whatsnew_0200.api_breaking.partial_string_indexing:
-Partial String Indexing Changes
+Partial string indexing changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:ref:`DatetimeIndex Partial String Indexing ` now works as an exact match, provided that string resolution coincides with index resolution, including a case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match ` for details.
@@ -756,7 +756,7 @@ Partial String Indexing Changes
df = pd.DataFrame({'a': [1, 2, 3]}, pd.DatetimeIndex(['2011-12-31 23:59:59',
'2012-01-01 00:00:00',
'2012-01-01 00:00:01']))
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -771,7 +771,7 @@ Previous Behavior:
Name: a, dtype: int64
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -797,7 +797,7 @@ Now the smallest acceptable dtype will be used (:issue:`13247`)
df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2))
df2.dtypes
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -806,7 +806,7 @@ Previous Behavior:
0 float64
dtype: object
-New Behavior:
+New behavior:
.. ipython:: python
@@ -823,12 +823,12 @@ currently released version of ``pandas-gbq=0.1.4``. Documentation is now hosted
.. _whatsnew_0200.api_breaking.memory_usage:
-Memory Usage for Index is more Accurate
+Memory usage for Index is more accurate
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In previous versions, showing ``.memory_usage()`` on a pandas structure that has an index, would only include actual index values and not include structures that facilitated fast indexing. This will generally be different for ``Index`` and ``MultiIndex`` and less-so for other index types. (:issue:`15237`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -843,7 +843,7 @@ Previous Behavior:
In [11]: index.memory_usage(deep=True)
Out[11]: 180
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -900,7 +900,7 @@ doesn't behave as desired.
[[0, 0, 1, 1], [0, 1, 0, 1]]))
df
-Previous Behavior:
+Previous behavior:
.. code-block:: python
@@ -918,7 +918,7 @@ Previous Behavior:
In [15]: df.sort_index().index.is_monotonic
Out[15]: False
-New Behavior:
+New behavior:
.. ipython:: python
@@ -929,13 +929,13 @@ New Behavior:
.. _whatsnew_0200.api_breaking.groupby_describe:
-Groupby Describe Formatting
+Groupby describe formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^
The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -970,7 +970,7 @@ Previous Behavior:
1 1.5 0.707107 1 2
2 3.5 0.707107 3 4
-New Behavior:
+New behavior:
.. ipython:: python
@@ -982,7 +982,7 @@ New Behavior:
.. _whatsnew_0200.api_breaking.rolling_pairwise:
-Window Binary Corr/Cov operations return a MultiIndex DataFrame
+Window binary corr/cov operations return a MultiIndex DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object,
@@ -1000,7 +1000,7 @@ See the section on :ref:`Windowed Binary Operations ` for
periods=100, freq='D', name='foo'))
df.tail()
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1012,7 +1012,7 @@ Previous Behavior:
Major_axis axis: A to B
Minor_axis axis: A to B
-New Behavior:
+New behavior:
.. ipython:: python
@@ -1040,7 +1040,7 @@ usually resulting in an invalid comparison, returning an empty result frame. The
df.to_hdf('store.h5', 'key', format='table', data_columns=True)
df.dtypes
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1050,7 +1050,7 @@ Previous Behavior:
^
SyntaxError: invalid token
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -1084,14 +1084,14 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
right = pd.Index([1, 2, 3])
right
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
In [4]: left.intersection(right)
Out[4]: Int64Index([1, 2], dtype='int64')
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -1106,7 +1106,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3])
right
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -1116,7 +1116,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
1 10 100
2 20 200
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -1124,7 +1124,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method.
.. _whatsnew_0200.api_breaking.pivot_table:
-Pivot Table always returns a DataFrame
+Pivot table always returns a DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The documentation for :meth:`pivot_table` states that a ``DataFrame`` is *always* returned. Here a bug
@@ -1137,7 +1137,7 @@ is fixed that allowed this to return a ``Series`` under certain circumstance. (:
'col3': [1, 3, 9]})
df
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -1149,7 +1149,7 @@ Previous Behavior:
9 E 5
Name: col1, dtype: int64
-New Behavior:
+New behavior:
.. ipython:: python
@@ -1157,7 +1157,7 @@ New Behavior:
.. _whatsnew_0200.api:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`).
@@ -1192,12 +1192,12 @@ Other API Changes
.. _whatsnew_0200.privacy:
-Reorganization of the library: Privacy Changes
+Reorganization of the library: privacy changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. _whatsnew_0200.privacy.extensions:
-Modules Privacy Has Changed
+Modules privacy has changed
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Some formerly public python/c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API.
@@ -1327,7 +1327,7 @@ Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some example
df
-Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column.
+Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column.
.. code-block:: ipython
@@ -1532,7 +1532,7 @@ Should be changed to:
.. _whatsnew_0200.deprecations.other:
-Other Deprecations
+Other deprecations
^^^^^^^^^^^^^^^^^^
- ``SparseArray.to_dense()`` has deprecated the ``fill`` parameter, as that parameter was not being respected (:issue:`14647`)
@@ -1584,7 +1584,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0200.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
@@ -1606,7 +1606,7 @@ Performance Improvements
.. _whatsnew_0200.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Conversion
@@ -1713,7 +1713,7 @@ Plotting
- Bug in the date and time converters pandas registers with matplotlib not handling multiple dimensions (:issue:`16026`)
- Bug in ``pd.scatter_matrix()`` could accept either ``color`` or ``c``, but not both (:issue:`14855`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst
index b2592579eb03f..232d1d283d9bd 100644
--- a/doc/source/whatsnew/v0.20.2.rst
+++ b/doc/source/whatsnew/v0.20.2.rst
@@ -35,7 +35,7 @@ Enhancements
.. _whatsnew_0202.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance regression fix when indexing with a list-like (:issue:`16285`)
@@ -46,7 +46,7 @@ Performance Improvements
.. _whatsnew_0202.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when
@@ -97,7 +97,7 @@ Plotting
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in creating a time-based rolling window on an empty ``DataFrame`` (:issue:`15819`)
diff --git a/doc/source/whatsnew/v0.20.3.rst b/doc/source/whatsnew/v0.20.3.rst
index 8dc6acc2074bd..72faabd95bf1f 100644
--- a/doc/source/whatsnew/v0.20.3.rst
+++ b/doc/source/whatsnew/v0.20.3.rst
@@ -20,7 +20,7 @@ and bug fixes. We recommend that all users upgrade to this version.
.. _whatsnew_0203.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
- Fixed a bug in failing to compute rolling computations of a column-MultiIndexed ``DataFrame`` (:issue:`16789`, :issue:`16825`)
diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst
index 5c6f1d1af6b54..34b610e8af0b3 100644
--- a/doc/source/whatsnew/v0.21.0.rst
+++ b/doc/source/whatsnew/v0.21.0.rst
@@ -263,7 +263,7 @@ Now, to find prices per store/product, we can simply do:
See the :ref:`documentation ` for more.
-.. _whatsnew_0210.enhancements.reanme_categories:
+.. _whatsnew_0210.enhancements.rename_categories:
``Categorical.rename_categories`` accepts a dict-like
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -300,7 +300,7 @@ as in :meth:`DataFrame.rename`.
.. _whatsnew_0210.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
New functions or methods
@@ -412,13 +412,13 @@ Previously WITH ``bottleneck``:
In [2]: s.sum()
Out[2]: 0.0
-New Behavior, without regard to the bottleneck installation:
+New behavior, without regard to the bottleneck installation:
.. ipython:: python
s.sum()
-Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottlenck`` installation:
+Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottleneck`` installation:
.. code-block:: ipython
@@ -434,7 +434,7 @@ but for consistency with the all-NaN case, this was changed to return NaN as wel
.. _whatsnew_0210.api_breaking.loc:
-Indexing with a list with missing labels is Deprecated
+Indexing with a list with missing labels is deprecated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Previously, selecting with a list of labels, where one or more labels were missing would always succeed, returning ``NaN`` for missing labels.
@@ -448,7 +448,7 @@ See the :ref:`deprecation docs `.
s = pd.Series([1, 2, 3])
s
-Previous Behavior
+Previous behavior
.. code-block:: ipython
@@ -460,7 +460,7 @@ Previous Behavior
dtype: float64
-Current Behavior
+Current behavior
.. code-block:: ipython
@@ -492,7 +492,7 @@ Selection with all keys found is unchanged.
.. _whatsnew_0210.api.na_changes:
-NA naming Changes
+NA naming changes
^^^^^^^^^^^^^^^^^
In order to promote more consistency among the pandas API, we have added additional top-level
@@ -524,7 +524,7 @@ Previously:
In [2]: type(list(s)[0])
Out[2]: numpy.int64
-New Behaviour:
+New behavior:
.. ipython:: python
@@ -544,7 +544,7 @@ Previously:
In [8]: type(df.to_dict()['a'][0])
Out[8]: numpy.int64
-New Behaviour:
+New behavior:
.. ipython:: python
@@ -561,7 +561,7 @@ you would get a label based selection, potentially duplicating result labels, ra
(where ``True`` selects elements), this was inconsistent with how a boolean numpy array indexed. The new behavior is to
act like a boolean numpy array indexer. (:issue:`17738`)
-Previous Behavior:
+Previous behavior:
.. ipython:: python
@@ -578,7 +578,7 @@ Previous Behavior:
True 2
dtype: int64
-Current Behavior
+Current behavior
.. ipython:: python
@@ -588,7 +588,7 @@ Current Behavior
Furthermore, previously if you had an index that was non-numeric (e.g. strings), then a boolean Index would raise a ``KeyError``.
This will now be treated as a boolean indexer.
-Previously Behavior:
+Previous behavior:
.. ipython:: python
@@ -600,7 +600,7 @@ Previously Behavior:
In [39]: s.loc[pd.Index([True, False, True])]
KeyError: "None of [Index([True, False, True], dtype='object')] are in the [index]"
-Current Behavior
+Current behavior
.. ipython:: python
@@ -614,7 +614,7 @@ Current Behavior
In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -634,7 +634,7 @@ Previous Behavior:
In [5]: resampled.index
Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC')
-New Behavior:
+New behavior:
.. ipython:: python
@@ -650,7 +650,7 @@ New Behavior:
Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -670,7 +670,7 @@ Previous Behavior:
open high low close
2000-01 0 9 0 9
-New Behavior:
+New behavior:
.. ipython:: python
@@ -732,7 +732,7 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
.. _whatsnew_0210.api_breaking.dtype_conversions:
-Dtype Conversions
+Dtype conversions
^^^^^^^^^^^^^^^^^
Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to same the type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`).
@@ -752,7 +752,7 @@ Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignmen
2 3
dtype: int64
-New Behavior
+New behavior
.. ipython:: python
@@ -789,7 +789,7 @@ These now coerce to ``object`` dtype.
.. _whatsnew_210.api.multiindex_single:
-MultiIndex Constructor with a Single Level
+MultiIndex constructor with a single level
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``MultiIndex`` constructors no longer squeeze a MultiIndex with all
@@ -818,7 +818,7 @@ UTC Localization with Series
Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`).
-Previous Behavior
+Previous behavior
.. ipython:: python
@@ -833,7 +833,7 @@ Previous Behavior
2 2013-01-01
dtype: datetime64[ns]
-New Behavior
+New behavior
.. ipython:: python
@@ -843,14 +843,14 @@ Additionally, DataFrames with datetime columns that were parsed by :func:`read_s
.. _whatsnew_0210.api.consistency_of_range_functions:
-Consistency of Range Functions
+Consistency of range functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`. (:issue:`17471`).
One of the inconsistent behaviors occurred when the ``start``, ``end`` and ``period`` parameters were all specified, potentially leading to ambiguous ranges. When all three parameters were passed, ``interval_range`` ignored the ``period`` parameter, ``period_range`` ignored the ``end`` parameter, and the other range functions raised. To promote consistency among the range functions, and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range`` will now raise when all three parameters are passed.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -863,7 +863,7 @@ Previous Behavior:
In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q')
Out[3]: PeriodIndex(['2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2'], dtype='period[Q-DEC]', freq='Q-DEC')
-New Behavior:
+New behavior:
.. code-block:: ipython
@@ -877,7 +877,7 @@ New Behavior:
Additionally, the endpoint parameter ``end`` was not included in the intervals produced by ``interval_range``. However, all other range functions include ``end`` in their output. To promote consistency among the range functions, ``interval_range`` will now include ``end`` as the right endpoint of the final interval, except if ``freq`` is specified in a way which skips ``end``.
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -888,7 +888,7 @@ Previous Behavior:
dtype='interval[int64]')
-New Behavior:
+New behavior:
.. ipython:: python
@@ -896,7 +896,7 @@ New Behavior:
.. _whatsnew_0210.api.mpl_converters:
-No Automatic Matplotlib Converters
+No automatic Matplotlib converters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas no longer registers our ``date``, ``time``, ``datetime``,
@@ -915,7 +915,7 @@ converters on first-use (:issue:`17710`).
.. _whatsnew_0210.api:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
@@ -1024,7 +1024,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0210.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
@@ -1036,7 +1036,7 @@ Performance Improvements
.. _whatsnew_0210.docs:
-Documentation Changes
+Documentation changes
~~~~~~~~~~~~~~~~~~~~~
- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`)
@@ -1044,7 +1044,7 @@ Documentation Changes
.. _whatsnew_0210.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Conversion
@@ -1114,7 +1114,7 @@ Plotting
- Bug causing ``plotting.parallel_coordinates`` to reset the random seed when using random colors (:issue:`17525`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`)
diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst
index c8897ca86e8cf..64f3339834b38 100644
--- a/doc/source/whatsnew/v0.21.1.rst
+++ b/doc/source/whatsnew/v0.21.1.rst
@@ -31,7 +31,7 @@ Highlights include:
.. _whatsnew_0211.converters:
-Restore Matplotlib datetime Converter Registration
+Restore Matplotlib datetime converter registration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Pandas implements some matplotlib converters for nicely formatting the axis
@@ -77,7 +77,7 @@ Improvements to the Parquet IO functionality
.. _whatsnew_0211.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`)
@@ -93,14 +93,14 @@ Deprecations
.. _whatsnew_0211.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of plotting large series/dataframes (:issue:`18236`).
.. _whatsnew_0211.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Conversion
@@ -143,7 +143,7 @@ Plotting
- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst
index b38fcd9d62af4..ea36b35d61740 100644
--- a/doc/source/whatsnew/v0.22.0.rst
+++ b/doc/source/whatsnew/v0.22.0.rst
@@ -37,7 +37,7 @@ time, we changed the sum and prod of an empty ``Series`` to also be ``NaN``.
Based on feedback, we've partially reverted those changes.
-Arithmetic Operations
+Arithmetic operations
^^^^^^^^^^^^^^^^^^^^^
The default sum for empty or all-*NA* ``Series`` is now ``0``.
@@ -93,7 +93,7 @@ returning ``1`` instead.
These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well.
Finally, a few less obvious places in pandas are affected by this change.
-Grouping by a Categorical
+Grouping by a categorical
^^^^^^^^^^^^^^^^^^^^^^^^^
Grouping by a ``Categorical`` and summing now returns ``0`` instead of
@@ -196,7 +196,7 @@ Once again, the ``min_count`` keyword is available to restore the 0.21 behavior.
pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)
-Rolling and Expanding
+Rolling and expanding
^^^^^^^^^^^^^^^^^^^^^
Rolling and expanding already have a ``min_periods`` keyword that behaves
diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst
index 98479fa30eb15..f4c283ea742f7 100644
--- a/doc/source/whatsnew/v0.23.0.rst
+++ b/doc/source/whatsnew/v0.23.0.rst
@@ -22,7 +22,7 @@ Highlights include:
- :ref:`Instantiation from dicts respects order for Python 3.6+ `.
- :ref:`Dependent column arguments for assign `.
- :ref:`Merging / sorting on a combination of columns and index levels `.
-- :ref:`Extending Pandas with custom types `.
+- :ref:`Extending pandas with custom types `.
- :ref:`Excluding unobserved categories from groupby `.
- :ref:`Changes to make output shape of DataFrame.apply consistent `.
@@ -31,7 +31,7 @@ Check the :ref:`API Changes ` and :ref:`deprecations
.. warning::
Starting January 1, 2019, pandas feature releases will support Python 3 only.
- See :ref:`install.dropping-27` for more.
+ See `Dropping Python 2.7 `_ for more.
.. contents:: What's new in v0.23.0
:local:
@@ -105,7 +105,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
using ``.assign()`` to update an existing column. Previously, callables
referring to other variables being updated would get the "old" values
- Previous Behavior:
+ Previous behavior:
.. code-block:: ipython
@@ -118,7 +118,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
1 3 -2
2 4 -3
- New Behavior:
+ New behavior:
.. ipython:: python
@@ -186,7 +186,7 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values
.. _whatsnew_023.enhancements.extension:
-Extending Pandas with Custom Types (Experimental)
+Extending pandas with custom types (experimental)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy
@@ -276,7 +276,7 @@ To show only observed values:
df.groupby(['A', 'B', 'C'], observed=True).count()
-For pivotting operations, this behavior is *already* controlled by the ``dropna`` keyword:
+For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword:
.. ipython:: python
@@ -398,7 +398,7 @@ In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their
s = pd.Series([-np.inf, 0, 1, np.nan, np.inf])
s
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -411,7 +411,7 @@ Previous Behavior:
4 NaN
dtype: float64
-Current Behavior:
+Current behavior:
.. ipython:: python
@@ -424,7 +424,7 @@ Furthermore, previously if you rank ``inf`` or ``-inf`` values together with ``N
s = pd.Series([np.nan, np.nan, -np.inf, -np.inf])
s
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -436,7 +436,7 @@ Previous Behavior:
3 2.5
dtype: float64
-Current Behavior:
+Current behavior:
.. ipython:: python
@@ -502,7 +502,7 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist
.. _whatsnew_0230.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- Unary ``+`` now permitted for ``Series`` and ``DataFrame`` as numeric operator (:issue:`16073`)
@@ -597,7 +597,7 @@ Pandas will use the dict's insertion order, when creating a ``Series`` or
``DataFrame`` from a dict and you're using Python version 3.6 or
higher. (:issue:`19884`)
-Previous Behavior (and current behavior if on Python < 3.6):
+Previous behavior (and current behavior if on Python < 3.6):
.. code-block:: ipython
@@ -614,7 +614,7 @@ Previous Behavior (and current behavior if on Python < 3.6):
Note the Series above is ordered alphabetically by the index values.
-New Behavior (for Python >= 3.6):
+New behavior (for Python >= 3.6):
.. ipython:: python
@@ -738,7 +738,7 @@ where a list-like (e.g. ``tuple`` or ``list`` is returned) (:issue:`16353`, :iss
columns=['A', 'B', 'C'])
df
-Previous Behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``.
+Previous behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``.
If the return shape did not match, a ``Series`` with lists was returned.
.. code-block:: python
@@ -764,7 +764,7 @@ If the return shape did not match, a ``Series`` with lists was returned.
dtype: object
-New Behavior: When the applied function returns a list-like, this will now *always* return a ``Series``.
+New behavior: When the applied function returns a list-like, this will now *always* return a ``Series``.
.. ipython:: python
@@ -824,7 +824,7 @@ Note that this change also applies to :meth:`DataFrame.append`, which has also r
.. _whatsnew_0230.api_breaking.build_changes:
-Build Changes
+Build changes
^^^^^^^^^^^^^
- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`)
@@ -833,12 +833,12 @@ Build Changes
.. _whatsnew_0230.api_breaking.index_division_by_zero:
-Index Division By Zero Fills Correctly
+Index division by zero fills correctly
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -859,7 +859,7 @@ Previous Behavior:
In [11]: pd.RangeIndex(1, 5) / 0
ZeroDivisionError: integer division or modulo by zero
-Current Behavior:
+Current behavior:
.. ipython:: python
@@ -888,7 +888,7 @@ extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame
``expand`` is set to ``False``. Finally, ``None`` was an accepted value for
the ``expand`` parameter (which was equivalent to ``False``), but now raises a ``ValueError``. (:issue:`11386`)
-Previous Behavior:
+Previous behavior:
.. code-block:: ipython
@@ -906,7 +906,7 @@ Previous Behavior:
Out [4]:
pandas.core.series.Series
-New Behavior:
+New behavior:
.. ipython:: python
@@ -933,15 +933,25 @@ The default value of the ``ordered`` parameter for :class:`~pandas.api.types.Cat
In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``.
-New Behavior:
+New behavior:
-.. ipython:: python
+.. code-block:: ipython
- from pandas.api.types import CategoricalDtype
- cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
- cat
- cdt = CategoricalDtype(categories=list('cbad'))
- cat.astype(cdt)
+ In [2]: from pandas.api.types import CategoricalDtype
+
+ In [3]: cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
+
+ In [4]: cat
+ Out[4]:
+ [a, b, c, a, b, a]
+ Categories (3, object): [c < b < a]
+
+ In [5]: cdt = CategoricalDtype(categories=list('cbad'))
+
+ In [6]: cat.astype(cdt)
+ Out[6]:
+ [a, b, c, a, b, a]
+ Categories (4, object): [c < b < a < d]
Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``.
@@ -978,7 +988,7 @@ yourself. To revert to the old setting, you can run this line:
.. _whatsnew_0230.api.datetimelike:
-Datetimelike API Changes
+Datetimelike API changes
^^^^^^^^^^^^^^^^^^^^^^^^
- The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`)
@@ -1007,7 +1017,7 @@ Datetimelike API Changes
.. _whatsnew_0230.api.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`)
@@ -1130,7 +1140,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0230.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
@@ -1162,7 +1172,7 @@ Performance Improvements
.. _whatsnew_0230.docs:
-Documentation Changes
+Documentation changes
~~~~~~~~~~~~~~~~~~~~~
Thanks to all of the contributors who participated in the Pandas Documentation
@@ -1190,7 +1200,7 @@ read the `NumFOCUS blogpost`_ recapping the sprint.
.. _whatsnew_0230.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Categorical
@@ -1393,7 +1403,7 @@ Plotting
- :func:`DataFrame.plot` now supports multiple columns to the ``y`` argument (:issue:`19699`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`)
diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst
index f6af2990c935b..03b7d9db6bc63 100644
--- a/doc/source/whatsnew/v0.23.1.rst
+++ b/doc/source/whatsnew/v0.23.1.rst
@@ -1,6 +1,6 @@
.. _whatsnew_0231:
-What's New in 0.23.1 (June 12, 2018)
+What's new in 0.23.1 (June 12, 2018)
------------------------------------
{{ header }}
@@ -12,7 +12,7 @@ and bug fixes. We recommend that all users upgrade to this version.
.. warning::
Starting January 1, 2019, pandas feature releases will support Python 3 only.
- See :ref:`install.dropping-27` for more.
+ See `Dropping Python 2.7 `_ for more.
.. contents:: What's new in v0.23.1
:local:
@@ -20,13 +20,13 @@ and bug fixes. We recommend that all users upgrade to this version.
.. _whatsnew_0231.fixed_regressions:
-Fixed Regressions
+Fixed regressions
~~~~~~~~~~~~~~~~~
**Comparing Series with datetime.date**
We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`).
-In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comapring.
+In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing.
This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal.
In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning.
@@ -64,7 +64,7 @@ To summarize, here's the behavior in 0.22.0, 0.23.0, 0.23.1:
In addition, ordering comparisons will raise a ``TypeError`` in the future.
-**Other Fixes**
+**Other fixes**
- Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue
inserts as this caused regression in certain cases (:issue:`21103`).
@@ -85,7 +85,7 @@ In addition, ordering comparisons will raise a ``TypeError`` in the future.
.. _whatsnew_0231.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
@@ -94,10 +94,10 @@ Performance Improvements
.. _whatsnew_0231.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
-**Groupby/Resample/Rolling**
+**Groupby/resample/rolling**
- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
diff --git a/doc/source/whatsnew/v0.23.2.rst b/doc/source/whatsnew/v0.23.2.rst
index cae2415e3374e..9f24092d1d4ae 100644
--- a/doc/source/whatsnew/v0.23.2.rst
+++ b/doc/source/whatsnew/v0.23.2.rst
@@ -1,6 +1,6 @@
.. _whatsnew_0232:
-What's New in 0.23.2 (July 5, 2018)
+What's new in 0.23.2 (July 5, 2018)
-----------------------------------
{{ header }}
@@ -17,7 +17,7 @@ and bug fixes. We recommend that all users upgrade to this version.
.. warning::
Starting January 1, 2019, pandas feature releases will support Python 3 only.
- See :ref:`install.dropping-27` for more.
+ See `Dropping Python 2.7 `_ for more.
.. contents:: What's new in v0.23.2
:local:
@@ -25,7 +25,7 @@ and bug fixes. We recommend that all users upgrade to this version.
.. _whatsnew_0232.enhancements:
-Logical Reductions over Entire DataFrame
+Logical reductions over entire DataFrame
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`)
@@ -56,7 +56,7 @@ With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.1
.. _whatsnew_0232.fixed_regressions:
-Fixed Regressions
+Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
@@ -70,14 +70,14 @@ Fixed Regressions
- Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`)
-Build Changes
+Build changes
~~~~~~~~~~~~~
- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`)
.. _whatsnew_0232.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
**Conversion**
diff --git a/doc/source/whatsnew/v0.23.3.rst b/doc/source/whatsnew/v0.23.3.rst
index 3b1a0cfa5f273..bb8862a89b003 100644
--- a/doc/source/whatsnew/v0.23.3.rst
+++ b/doc/source/whatsnew/v0.23.3.rst
@@ -1,6 +1,6 @@
.. _whatsnew_0233:
-What's New in 0.23.3 (July 7, 2018)
+What's new in 0.23.3 (July 7, 2018)
-----------------------------------
{{ header }}
diff --git a/doc/source/whatsnew/v0.23.4.rst b/doc/source/whatsnew/v0.23.4.rst
index 01f904e129f80..eadac6f569926 100644
--- a/doc/source/whatsnew/v0.23.4.rst
+++ b/doc/source/whatsnew/v0.23.4.rst
@@ -1,6 +1,6 @@
.. _whatsnew_0234:
-What's New in 0.23.4 (August 3, 2018)
+What's new in 0.23.4 (August 3, 2018)
-------------------------------------
{{ header }}
@@ -12,7 +12,7 @@ and bug fixes. We recommend that all users upgrade to this version.
.. warning::
Starting January 1, 2019, pandas feature releases will support Python 3 only.
- See :ref:`install.dropping-27` for more.
+ See `Dropping Python 2.7 `_ for more.
.. contents:: What's new in v0.23.4
:local:
@@ -20,17 +20,17 @@ and bug fixes. We recommend that all users upgrade to this version.
.. _whatsnew_0234.fixed_regressions:
-Fixed Regressions
+Fixed regressions
~~~~~~~~~~~~~~~~~
- Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`)
.. _whatsnew_0234.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
-**Groupby/Resample/Rolling**
+**Groupby/resample/rolling**
- Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`)
- Bug in ``roll_quantile`` caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`)
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 05d6a03639a2d..d9f41d2a75116 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1,12 +1,12 @@
.. _whatsnew_0240:
-What's New in 0.24.0 (January 25, 2019)
+What's new in 0.24.0 (January 25, 2019)
---------------------------------------
.. warning::
The 0.24.x series of releases will be the last to support Python 2. Future feature
- releases will support Python 3 only. See :ref:`install.dropping-27` for more
+ releases will support Python 3 only. See `Dropping Python 2.7 `_ for more
details.
{{ header }}
@@ -35,7 +35,7 @@ Enhancements
.. _whatsnew_0240.enhancements.intna:
-Optional Integer NA Support
+Optional integer NA support
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `.
@@ -194,7 +194,7 @@ is a float.
.. _whatsnew_0240.enhancements.interval:
-Storing Interval and Period Data in Series and DataFrame
+Storing Interval and Period data in Series and DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:class:`Interval` and :class:`Period` data may now be stored in a :class:`Series` or :class:`DataFrame`, in addition to an
@@ -300,7 +300,7 @@ value. (:issue:`17054`)
""")
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -309,7 +309,7 @@ value. (:issue:`17054`)
[ A B C
0 1 2 NaN]
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -361,7 +361,7 @@ See the :ref:`Advanced documentation on renaming` for more
.. _whatsnew_0240.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`)
@@ -482,7 +482,7 @@ for the default line terminator (:issue:`20353`).
This change only affects when running on Windows, where ``'\r\n'`` was used for line terminator
even when ``'\n'`` was passed in ``line_terminator``.
-*Previous Behavior* on Windows:
+*Previous behavior* on Windows:
.. code-block:: ipython
@@ -508,7 +508,7 @@ even when ``'\n'`` was passed in ``line_terminator``.
Out[5]: b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
-*New Behavior* on Windows:
+*New behavior* on Windows:
Passing ``line_terminator`` explicitly sets the line terminator to that character.
@@ -569,7 +569,7 @@ missing indicator, ``np.nan``. (:issue:`20377`)
from io import StringIO
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -579,7 +579,7 @@ missing indicator, ``np.nan``. (:issue:`20377`)
Out[7]:
'nan'
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -591,7 +591,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
.. _whatsnew_0240.api.timezone_offset_parsing:
-Parsing Datetime Strings with Timezone Offsets
+Parsing datetime strings with timezone offsets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Previously, parsing datetime strings with UTC offsets with :func:`to_datetime`
@@ -602,7 +602,7 @@ offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC
offset in the ``tz`` attribute when all the datetime strings have the same
UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`)
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -616,7 +616,7 @@ UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`)
In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"])
Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None)
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -656,7 +656,7 @@ Parsing mixed-timezones with :func:`read_csv`
:func:`read_csv` no longer silently converts mixed-timezone columns to UTC (:issue:`24987`).
-*Previous Behavior*
+*Previous behavior*
.. code-block:: python
@@ -671,7 +671,7 @@ Parsing mixed-timezones with :func:`read_csv`
1 1999-12-31 18:00:00
Name: a, dtype: datetime64[ns]
-*New Behavior*
+*New behavior*
.. ipython:: python
@@ -704,7 +704,7 @@ to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.e
:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``,
or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`)
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -717,7 +717,7 @@ or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`)
In [5]: p.end_time
Out[5]: Timestamp(2017-01-01 23:59:59.999999999)
-*New Behavior*:
+*New behavior*:
Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as
is the case with :attr:`Period.end_time`, for example
@@ -744,7 +744,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays
ser = pd.Series([pd.Timestamp('2000', tz='UTC'),
pd.Timestamp('2000', tz='UTC')])
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -752,7 +752,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays
Out[3]: array([Timestamp('2000-01-01 00:00:00+0000', tz='UTC')], dtype=object)
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -761,7 +761,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays
.. _whatsnew_0240.api_breaking.sparse_values:
-Sparse Data Structure Refactor
+Sparse data structure refactor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``,
@@ -807,7 +807,7 @@ Previously, when ``sparse=True`` was passed to :func:`get_dummies`, the return v
a :class:`DataFrame` or a :class:`SparseDataFrame`, depending on whether all or just a subset
of the columns were dummy-encoded. Now, a :class:`DataFrame` is always returned (:issue:`24284`).
-*Previous Behavior*
+*Previous behavior*
The first :func:`get_dummies` returns a :class:`DataFrame` because the column ``A``
is not dummy encoded. When just ``["B", "C"]`` are passed to ``get_dummies``,
@@ -828,7 +828,7 @@ then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was retur
df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})
-*New Behavior*
+*New behavior*
Now, the return type is consistently a :class:`DataFrame`.
@@ -861,7 +861,7 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
.. _whatsnew_0240.api.datetimelike.normalize:
-Tick DateOffset Normalize Restrictions
+Tick DateOffset normalize restrictions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`,
@@ -869,7 +869,7 @@ Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`,
``normalize=True`` is no longer supported. This prevents unexpected behavior
where addition could fail to be monotone or associative. (:issue:`21427`)
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -891,7 +891,7 @@ where addition could fail to be monotone or associative. (:issue:`21427`)
In [7]: ts + tic + tic + tic == ts + (tic + tic + tic)
Out[7]: False
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -905,13 +905,13 @@ where addition could fail to be monotone or associative. (:issue:`21427`)
.. _whatsnew_0240.api.period_subtraction:
-Period Subtraction
+Period subtraction
^^^^^^^^^^^^^^^^^^
Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``
instead of an integer (:issue:`21314`)
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -922,7 +922,7 @@ instead of an integer (:issue:`21314`)
In [4]: june - april
Out [4]: 2
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -933,7 +933,7 @@ instead of an integer (:issue:`21314`)
Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return
an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index``
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -942,7 +942,7 @@ an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index``
In [3]: pi - pi[0]
Out[3]: Int64Index([0, 1, 2], dtype='int64')
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -952,7 +952,7 @@ an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index``
.. _whatsnew_0240.api.timedelta64_subtract_nan:
-Addition/Subtraction of ``NaN`` from :class:`DataFrame`
+Addition/subtraction of ``NaN`` from :class:`DataFrame`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Adding or subtracting ``NaN`` from a :class:`DataFrame` column with
@@ -965,7 +965,7 @@ all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and
df = pd.DataFrame([pd.Timedelta(days=1)])
df
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -976,7 +976,7 @@ all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and
0
0 NaT
-*New Behavior*:
+*New behavior*:
.. code-block:: ipython
@@ -986,7 +986,7 @@ all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and
.. _whatsnew_0240.api.dataframe_cmp_broadcasting:
-DataFrame Comparison Operations Broadcasting Changes
+DataFrame comparison operations broadcasting changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Previously, the broadcasting behavior of :class:`DataFrame` comparison
operations (``==``, ``!=``, ...) was inconsistent with the behavior of
@@ -1006,7 +1006,7 @@ The affected cases are:
df = pd.DataFrame(arr)
df
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -1045,7 +1045,7 @@ The affected cases are:
...
ValueError: Unable to coerce to Series, length must be 2: given 3
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -1061,7 +1061,7 @@ The affected cases are:
.. code-block:: ipython
- # Comparison operations and arithmetic opeartions both raise ValueError.
+ # Comparison operations and arithmetic operations both raise ValueError.
In [6]: df == (1, 2, 3)
...
ValueError: Unable to coerce to Series, length must be 2: given 3
@@ -1072,7 +1072,7 @@ The affected cases are:
.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting:
-DataFrame Arithmetic Operations Broadcasting Changes
+DataFrame arithmetic operations broadcasting changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:class:`DataFrame` arithmetic operations when operating with 2-dimensional
@@ -1085,7 +1085,7 @@ broadcast. (:issue:`23000`)
df = pd.DataFrame(arr)
df
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -1096,7 +1096,7 @@ broadcast. (:issue:`23000`)
...
ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1)
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -1105,13 +1105,13 @@ broadcast. (:issue:`23000`)
.. _whatsnew_0240.api.incompatibilities:
-Series and Index Data-Dtype Incompatibilities
+Series and Index data-dtype incompatibilities
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``Series`` and ``Index`` constructors now raise when the
data is incompatible with a passed ``dtype=`` (:issue:`15832`)
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -1120,7 +1120,7 @@ data is incompatible with a passed ``dtype=`` (:issue:`15832`)
0 18446744073709551615
dtype: uint64
-*New Behavior*:
+*New behavior*:
.. code-block:: ipython
@@ -1143,7 +1143,7 @@ other than another ``Categorical`` of ints (:issue:`19214`)
s = pd.Series([0, 1, np.nan])
c = pd.Series([0, 1, np.nan], dtype="category")
-*Previous Behavior*
+*Previous behavior*
.. code-block:: ipython
@@ -1157,13 +1157,13 @@ other than another ``Categorical`` of ints (:issue:`19214`)
2 NaN
dtype: float64
-*New Behavior*
+*New behavior*
.. ipython:: python
pd.concat([s, c])
-Datetimelike API Changes
+Datetimelike API changes
^^^^^^^^^^^^^^^^^^^^^^^^
- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`)
@@ -1175,7 +1175,7 @@ Datetimelike API Changes
.. _whatsnew_0240.api.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`)
@@ -1212,10 +1212,10 @@ Other API Changes
.. _whatsnew_0240.api.extension:
-Extension Type Changes
+Extension type changes
~~~~~~~~~~~~~~~~~~~~~~
-**Equality and Hashability**
+**Equality and hashability**
Pandas now requires that extension dtypes be hashable (i.e. the respective
``ExtensionDtype`` objects; hashability is not a requirement for the values
@@ -1263,7 +1263,7 @@ ways of adding operator support.
- :meth:`ExtensionArray._formatting_values` is deprecated. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`)
- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`)
-**Bug Fixes**
+**Bug fixes**
- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`)
@@ -1298,7 +1298,7 @@ Deprecations
- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`)
- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`)
- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)
-- :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`)
+- :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary. This deprecation has been removed in 0.25.0. (:issue:`21948`)
- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain
many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`)
- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`)
@@ -1324,12 +1324,12 @@ Deprecations
- :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)
- :meth:`Series.nonzero` is deprecated and will be removed in a future version (:issue:`18262`)
- Passing an integer to :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtypes is deprecated, will raise ``TypeError`` in a future version. Use ``obj.fillna(pd.Timedelta(...))`` instead (:issue:`24694`)
-- ``Series.cat.categorical``, ``Series.cat.name`` and ``Sersies.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`).
+- ``Series.cat.categorical``, ``Series.cat.name`` and ``Series.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`).
- Passing a dtype without a precision like ``np.dtype('datetime64')`` or ``timedelta64`` to :class:`Index`, :class:`DatetimeIndex` and :class:`TimedeltaIndex` is now deprecated. Use the nanosecond-precision dtype instead (:issue:`24753`).
.. _whatsnew_0240.deprecations.datetimelike_int_ops:
-Integer Addition/Subtraction with Datetimes and Timedeltas is Deprecated
+Integer addition/subtraction with datetimes and timedeltas is deprecated
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In the past, users could—in some cases—add or subtract integers or integer-dtype
@@ -1338,7 +1338,7 @@ arrays from :class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaInde
This usage is now deprecated. Instead add or subtract integer multiples of
the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`).
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -1354,7 +1354,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`).
In [10]: dti + pd.Index([1, 2])
Out[10]: DatetimeIndex(['2001-01-08', '2001-01-22'], dtype='datetime64[ns]', freq=None)
-*New Behavior*:
+*New behavior*:
.. ipython:: python
:okwarning:
@@ -1371,7 +1371,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`).
.. _whatsnew_0240.deprecations.integer_tz:
-Passing Integer data and a timezone to DatetimeIndex
+Passing integer data and a timezone to DatetimeIndex
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The behavior of :class:`DatetimeIndex` when passed integer data and
@@ -1417,7 +1417,7 @@ The old behavior can be retained with by localizing directly to the final timezo
.. _whatsnew_0240.deprecations.tz_aware_array:
-Converting Timezone-Aware Series and Index to NumPy Arrays
+Converting timezone-aware Series and Index to NumPy arrays
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The conversion from a :class:`Series` or :class:`Index` with timezone-aware
@@ -1459,13 +1459,13 @@ The default behavior remains the same, but issues a warning
The previous or future behavior can be obtained, without any warnings, by specifying
the ``dtype``
-*Previous Behavior*
+*Previous behavior*
.. ipython:: python
np.asarray(ser, dtype='datetime64[ns]')
-*Future Behavior*
+*Future behavior*
.. ipython:: python
@@ -1512,7 +1512,7 @@ Removal of prior version deprecations/changes
.. _whatsnew_0240.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex`
@@ -1547,7 +1547,7 @@ Performance Improvements
.. _whatsnew_0240.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
Categorical
@@ -1604,7 +1604,7 @@ Datetimelike
- Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`)
- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`)
- Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`)
-- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`)
+- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`)
- Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`)
- Bug in :class:`DatetimeIndex` where constructing a :class:`DatetimeIndex` from a :class:`Categorical` or :class:`CategoricalIndex` would incorrectly drop timezone information (:issue:`18664`)
- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would incorrectly lose the index's ``freq`` attribute (:issue:`21282`)
@@ -1670,7 +1670,7 @@ Timezones
Offsets
^^^^^^^
-- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`)
+- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`)
- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`)
- Bug in adding :class:`DateOffset` with :class:`DataFrame` or :class:`PeriodIndex` incorrectly raising ``TypeError`` (:issue:`23215`)
- Bug in comparing :class:`DateOffset` objects with non-DateOffset objects, particularly strings, raising ``ValueError`` instead of returning ``False`` for equality checks and ``True`` for not-equal checks (:issue:`23524`)
@@ -1827,7 +1827,7 @@ Plotting
- Bug in :func:`DataFrame.plot.bar` caused bars to use multiple colors instead of a single one (:issue:`20585`)
- Bug in validating color parameter caused extra color to be appended to the given color array. This happened to multiple plotting functions using matplotlib. (:issue:`20726`)
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`)
@@ -1838,7 +1838,7 @@ Groupby/Resample/Rolling
``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`).
- Multiple bugs in :func:`pandas.core.window.Rolling.min` with ``closed='left'`` and a
datetime-like index leading to incorrect results and also segfault. (:issue:`21704`)
-- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`).
+- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`).
- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`).
- Bug in :meth:`pandas.core.resample.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`).
- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`)
@@ -1915,7 +1915,7 @@ Style
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly.
-Build Changes
+Build changes
^^^^^^^^^^^^^
- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`)
diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst
index 8f963f1285e1b..aead8c48eb9b7 100644
--- a/doc/source/whatsnew/v0.24.1.rst
+++ b/doc/source/whatsnew/v0.24.1.rst
@@ -1,14 +1,12 @@
-:orphan:
-
.. _whatsnew_0241:
-Whats New in 0.24.1 (February 3, 2019)
+Whats new in 0.24.1 (February 3, 2019)
--------------------------------------
.. warning::
The 0.24.x series of releases will be the last to support Python 2. Future feature
- releases will support Python 3 only. See :ref:`install.dropping-27` for more.
+ releases will support Python 3 only. See `Dropping Python 2.7 `_ for more.
{{ header }}
@@ -17,7 +15,7 @@ including other versions of pandas. See :ref:`whatsnew_0240` for the 0.24.0 chan
.. _whatsnew_0241.api:
-API Changes
+API changes
~~~~~~~~~~~
Changing the ``sort`` parameter for :class:`Index` set operations
@@ -47,7 +45,7 @@ The `sort` option for :meth:`Index.intersection` has changed in three ways.
.. _whatsnew_0241.regressions:
-Fixed Regressions
+Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.to_dict` with ``records`` orient raising an
@@ -62,7 +60,7 @@ Fixed Regressions
.. _whatsnew_0241.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
**Reshaping**
diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index c3b442e2352bb..d1a893f99cff4 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -1,14 +1,12 @@
-:orphan:
-
.. _whatsnew_0242:
-Whats New in 0.24.2 (March 12, 2019)
+Whats new in 0.24.2 (March 12, 2019)
------------------------------------
.. warning::
The 0.24.x series of releases will be the last to support Python 2. Future feature
- releases will support Python 3 only. See :ref:`install.dropping-27` for more.
+ releases will support Python 3 only. See `Dropping Python 2.7 `_ for more.
{{ header }}
@@ -17,7 +15,7 @@ including other versions of pandas.
.. _whatsnew_0242.regressions:
-Fixed Regressions
+Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)
@@ -39,7 +37,7 @@ Fixed Regressions
.. _whatsnew_0242.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
**I/O**
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index f7faeea7a646f..fe1e2d7826d62 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -1,18 +1,27 @@
.. _whatsnew_0250:
-What's New in 0.25.0 (April XX, 2019)
--------------------------------------
+What's new in 0.25.0 (July 18, 2019)
+------------------------------------
.. warning::
- Starting with the 0.25.x series of releases, pandas only supports Python 3.5 and higher.
- See :ref:`install.dropping-27` for more details.
+ Starting with the 0.25.x series of releases, pandas only supports Python 3.5.3 and higher.
+ See `Dropping Python 2.7 `_ for more details.
+
+.. warning::
+
+ The minimum supported Python version will be bumped to 3.6 in a future release.
.. warning::
`Panel` has been fully removed. For N-D labeled data structures, please
use `xarray `_
+.. warning::
+
+ :func:`read_pickle` and :func:`read_msgpack` are only guaranteed backwards compatible back to
+ pandas version 0.20.3 (:issue:`27082`)
+
{{ header }}
These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog
@@ -24,7 +33,7 @@ Enhancements
.. _whatsnew_0250.enhancements.agg_relabel:
-Groupby Aggregation with Relabeling
+Groupby aggregation with relabeling
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas has added special groupby behavior, known as "named aggregation", for naming the
@@ -74,20 +83,141 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca
See :ref:`groupby.aggregate.named` for more.
+.. _whatsnew_0250.enhancements.multiple_lambdas:
+
+Groupby aggregation with multiple lambdas
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can now provide multiple lambda functions to a list-like aggregation in
+:meth:`pandas.core.groupby.GroupBy.agg` (:issue:`26430`).
+
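+The examples below operate on the ``animals`` DataFrame introduced earlier in
+these notes; as a minimal, illustrative sketch (the column values here are made
+up for demonstration) it could be built as:
+
+.. code-block:: python
+
+    import pandas as pd
+
+    # a small frame with one grouping column and two numeric columns
+    animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
+                            'height': [9.1, 6.0, 9.5, 34.0],
+                            'weight': [7.9, 7.5, 9.9, 198.0]})
+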
+.. ipython:: python
+
+ animals.groupby('kind').height.agg([
+ lambda x: x.iloc[0], lambda x: x.iloc[-1]
+ ])
+
+ animals.groupby('kind').agg([
+ lambda x: x.iloc[0] - x.iloc[1],
+ lambda x: x.iloc[0] + x.iloc[1]
+ ])
+
+Previously, these raised a ``SpecificationError``.
+
+.. _whatsnew_0250.enhancements.multi_index_repr:
+
+Better repr for MultiIndex
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Printing of :class:`MultiIndex` instances now shows tuples of each row and ensures
+that the tuple items are vertically aligned, so it's now easier to understand
+the structure of the ``MultiIndex``. (:issue:`13480`):
+
+The repr now looks like this:
+
+.. ipython:: python
+
+ pd.MultiIndex.from_product([['a', 'abc'], range(500)])
+
+Previously, outputting a :class:`MultiIndex` printed all the ``levels`` and
+``codes`` of the ``MultiIndex``, which was visually unappealing and made
+the output more difficult to navigate. For example (limiting the range to 5):
+
+.. code-block:: ipython
+
+ In [1]: pd.MultiIndex.from_product([['a', 'abc'], range(5)])
+ Out[1]: MultiIndex(levels=[['a', 'abc'], [0, 1, 2, 3, 4]],
+ ...: codes=[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]])
+
+In the new repr, all values will be shown if the number of rows is smaller
+than :attr:`options.display.max_seq_items` (default: 100 items). Horizontally,
+the output will truncate if it's wider than :attr:`options.display.width`
+(default: 80 characters).
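+
+Both limits can be adjusted through the usual options machinery; a short
+sketch (the values below are arbitrary and only for illustration):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    pd.set_option("display.max_seq_items", 10)  # truncate after 10 tuples
+    pd.set_option("display.width", 60)          # wrap the repr at 60 characters
+    pd.MultiIndex.from_product([['a', 'abc'], range(500)])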
+
+.. _whatsnew_0250.enhancements.shorter_truncated_repr:
+
+Shorter truncated repr for Series and DataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Currently, the default display options of pandas ensure that when a Series
+or DataFrame has more than 60 rows, its repr gets truncated to this maximum
+of 60 rows (the ``display.max_rows`` option). However, this still gives
+a repr that takes up a large part of the vertical screen real estate. Therefore,
+a new option ``display.min_rows`` is introduced with a default of 10, which
+determines the number of rows shown in the truncated repr:
+
+- For small Series or DataFrames, up to ``max_rows`` number of rows is shown
+ (default: 60).
+- For larger Series or DataFrames with a length above ``max_rows``, only
+ ``min_rows`` number of rows is shown (default: 10, i.e. the first and last
+ 5 rows).
+
+This dual option makes it possible to still see the full content of relatively small
+objects (e.g. ``df.head(20)`` shows all 20 rows), while giving a brief repr
+for large objects.
+
+To restore the previous behavior of a single threshold, set
+``pd.options.display.min_rows = None``.
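+
+As a quick sketch of the option in action (the numbers below are arbitrary and
+only meant to illustrate the behavior described above):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    df = pd.DataFrame({"a": range(100)})      # longer than display.max_rows (60)
+    repr(df)                                  # truncated to display.min_rows (10) rows
+    pd.set_option("display.min_rows", 20)     # show 20 rows in truncated reprs instead
+    pd.set_option("display.min_rows", None)   # fall back to the single max_rows threshold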
+
+.. _whatsnew_0250.enhancements.json_normalize_with_max_level:
+
+Json normalize with max_level param support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`json_normalize` normalizes the provided input dict to all
+nested levels. The new ``max_level`` parameter provides more control over
+the level at which to end normalization (:issue:`23843`):
+
+For example:
+
+.. ipython:: python
+
+ from pandas.io.json import json_normalize
+ data = [{
+ 'CreatedBy': {'Name': 'User001'},
+ 'Lookup': {'TextField': 'Some text',
+ 'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
+ 'Image': {'a': 'b'}
+ }]
+ json_normalize(data, max_level=1)
+
+
+.. _whatsnew_0250.enhancements.explode:
+
+Series.explode to split list-like values to rows
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Series` and :class:`DataFrame` have gained the :meth:`Series.explode` and :meth:`DataFrame.explode` methods to transform list-likes into individual rows. See the :ref:`section on Exploding list-like column ` in the docs for more information (:issue:`16538`, :issue:`10511`).
+
+
+Here is a typical use case: you have a comma-separated string in a column.
+
+.. ipython:: python
+
+ df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
+ {'var1': 'd,e,f', 'var2': 2}])
+ df
+
+Creating a long-form ``DataFrame`` is now straightforward using chained operations:
+
+.. ipython:: python
+
+ df.assign(var1=df.var1.str.split(',')).explode('var1')
+
.. _whatsnew_0250.enhancements.other:
-Other Enhancements
+Other enhancements
^^^^^^^^^^^^^^^^^^
- :func:`DataFrame.plot` keywords ``logy``, ``logx`` and ``loglog`` can now accept the value ``'sym'`` for symlog scaling. (:issue:`24867`)
- Added support for ISO week year format ('%G-%V-%u') when parsing datetimes using :meth:`to_datetime` (:issue:`16607`)
- Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`)
- :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
-- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
+- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :class:`datetime.time` objects with timezones (:issue:`24043`)
- :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter which is passed to underlying calls to :meth:`DataFrame.groupby` to speed up grouping categorical data. (:issue:`24923`)
- ``Series.str`` has gained the :meth:`Series.str.casefold` method to remove all case distinctions present in a string (:issue:`25405`)
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behavior of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
-- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
+- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
- :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
@@ -100,6 +230,11 @@ Other Enhancements
- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
+- Added new option ``plotting.backend`` to be able to select a plotting backend different from the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', ...)`` with the name of the module implementing the desired backend to switch (see the sketch at the end of this list) (:issue:`9070`)
+- :class:`Interval`, :class:`IntervalIndex`, and :class:`~arrays.IntervalArray` have gained an :attr:`~Interval.is_empty` attribute denoting if the given interval(s) are empty (:issue:`27219`)
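+
+As a rough sketch of the new ``plotting.backend`` option mentioned above (the
+built-in ``matplotlib`` backend is used here as a placeholder value; any other
+value must name an installed module implementing the pandas plotting backend
+interface):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   pd.get_option("plotting.backend")                # 'matplotlib' by default
+   pd.set_option("plotting.backend", "matplotlib")  # switch backends by module name
+   pd.DataFrame({"a": [1, 2, 3]}).plot()            # dispatched to the active backend
+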
.. _whatsnew_0250.api_breaking:
@@ -121,7 +256,7 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
df
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -130,7 +265,7 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
0
2019-01-01 00:00:00-08:00 0
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -147,7 +282,7 @@ Constructing a :class:`MultiIndex` with ``NaN`` levels or codes value < -1 was a
Now, construction with codes value < -1 is not allowed and ``NaN`` levels' corresponding codes
would be reassigned as -1. (:issue:`19387`)
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
@@ -161,7 +296,7 @@ would be reassigned as -1. (:issue:`19387`)
Out[2]: MultiIndex(levels=[[1, 2]],
codes=[[0, -2]])
-*New Behavior*:
+*New behavior*:
.. ipython:: python
:okexcept:
@@ -173,7 +308,7 @@ would be reassigned as -1. (:issue:`19387`)
.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once:
-``GroupBy.apply`` on ``DataFrame`` evaluates first group only once
+``Groupby.apply`` on ``DataFrame`` evaluates first group only once
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The implementation of :meth:`DataFrameGroupBy.apply() `
@@ -192,7 +327,7 @@ Now every group is evaluated only a single time.
print(group.name)
return group
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: python
@@ -205,14 +340,14 @@ Now every group is evaluated only a single time.
0 x 1
1 y 2
-*New Behavior*:
+*New behavior*:
.. ipython:: python
df.groupby("a").apply(func)
-Concatenating Sparse Values
+Concatenating sparse values
^^^^^^^^^^^^^^^^^^^^^^^^^^^
When passed DataFrames whose values are sparse, :func:`concat` will now return a
@@ -222,14 +357,14 @@ When passed DataFrames whose values are sparse, :func:`concat` will now return a
df = pd.DataFrame({"A": pd.SparseArray([0, 1])})
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: ipython
In [2]: type(pd.concat([df, df]))
pandas.core.sparse.frame.SparseDataFrame
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -255,7 +390,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`,
:meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`.
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: python
@@ -275,7 +410,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
2 False
dtype: bool
-*New Behavior*:
+*New behavior*:
.. ipython:: python
:okexcept:
@@ -284,9 +419,37 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t
s
s.str.startswith(b'a')
+.. _whatsnew_0250.api_breaking.groupby_categorical:
+
+Categorical dtypes are preserved during groupby
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, columns that were categorical, but not the groupby key(s), would be converted to ``object`` dtype during groupby operations. pandas will now preserve these dtypes (:issue:`18502`).
+
+.. ipython:: python
+
+ cat = pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)
+ df = pd.DataFrame({'payload': [-1, -2, -1, -2], 'col': cat})
+ df
+ df.dtypes
+
+*Previous behavior*:
+
+.. code-block:: python
+
+ In [5]: df.groupby('payload').first().col.dtype
+ Out[5]: dtype('O')
+
+*New behavior*:
+
+.. ipython:: python
+
+ df.groupby('payload').first().col.dtype
+
+
.. _whatsnew_0250.api_breaking.incompatible_index_unions:
-Incompatible Index Type Unions
+Incompatible Index type unions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When performing :func:`Index.union` operations between objects of incompatible dtypes,
@@ -296,7 +459,7 @@ of empty :class:`Index` objects will now be evaluated before performing union op
rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be
considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: python
@@ -307,13 +470,18 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).
In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3]))
Out[2]: Int64Index([1, 2, 3], dtype='int64')
-*New Behavior*:
+*New behavior*:
.. ipython:: python
pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
pd.Index([], dtype=object).union(pd.Index([1, 2, 3]))
+Note that integer- and floating-dtype indexes are considered "compatible". The integer
+values are coerced to floating point, which may result in loss of precision. See
+:ref:`indexing.set_ops` for more.
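+
+For example (a minimal sketch; the result shown in the comment is illustrative):
+
+.. code-block:: python
+
+   # the integer values are coerced to float before the union is taken
+   pd.Index([1, 2, 3]).union(pd.Index([0.5, 1.5]))
+   # Float64Index([0.5, 1.0, 1.5, 2.0, 3.0], dtype='float64')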
+
+
``DataFrame`` groupby ffill/bfill no longer return group labels
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -328,7 +496,7 @@ are returned. (:issue:`21521`)
df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
df
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: python
@@ -338,7 +506,7 @@ are returned. (:issue:`21521`)
0 x 1
1 y 2
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -357,7 +525,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397
df = pd.DataFrame({"empty_col": pd.Categorical([])})
df
-*Previous Behavior*:
+*Previous behavior*:
.. code-block:: python
@@ -367,7 +535,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397
count 0
unique 0
-*New Behavior*:
+*New behavior*:
.. ipython:: python
@@ -386,6 +554,255 @@ This change is backward compatible for direct usage of Pandas, but if you subcla
Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods,
you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`).
+.. _whatsnew_0250.api_breaking.interval_indexing:
+
+
+Indexing an ``IntervalIndex`` with ``Interval`` objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Indexing methods for :class:`IntervalIndex` have been modified to require exact matches only for :class:`Interval` queries.
+``IntervalIndex`` methods previously matched on any overlapping ``Interval``. Behavior with scalar points, e.g. querying
+with an integer, is unchanged (:issue:`16316`).
+
+.. ipython:: python
+
+ ii = pd.IntervalIndex.from_tuples([(0, 4), (1, 5), (5, 8)])
+ ii
+
+The ``in`` operator (``__contains__``) now only returns ``True`` for exact matches to ``Intervals`` in the ``IntervalIndex``, whereas
+this would previously return ``True`` for any ``Interval`` overlapping an ``Interval`` in the ``IntervalIndex``.
+
+*Previous behavior*:
+
+.. code-block:: python
+
+ In [4]: pd.Interval(1, 2, closed='neither') in ii
+ Out[4]: True
+
+ In [5]: pd.Interval(-10, 10, closed='both') in ii
+ Out[5]: True
+
+*New behavior*:
+
+.. ipython:: python
+
+ pd.Interval(1, 2, closed='neither') in ii
+ pd.Interval(-10, 10, closed='both') in ii
+
+The :meth:`~IntervalIndex.get_loc` method now only returns locations for exact matches to ``Interval`` queries, as opposed to the previous behavior of
+returning locations for overlapping matches. A ``KeyError`` will be raised if an exact match is not found.
+
+*Previous behavior*:
+
+.. code-block:: python
+
+ In [6]: ii.get_loc(pd.Interval(1, 5))
+ Out[6]: array([0, 1])
+
+ In [7]: ii.get_loc(pd.Interval(2, 6))
+ Out[7]: array([0, 1, 2])
+
+*New behavior*:
+
+.. code-block:: python
+
+ In [6]: ii.get_loc(pd.Interval(1, 5))
+ Out[6]: 1
+
+ In [7]: ii.get_loc(pd.Interval(2, 6))
+ ---------------------------------------------------------------------------
+ KeyError: Interval(2, 6, closed='right')
+
+Likewise, :meth:`~IntervalIndex.get_indexer` and :meth:`~IntervalIndex.get_indexer_non_unique` will also only return locations for exact matches
+to ``Interval`` queries, with ``-1`` denoting that an exact match was not found.
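+
+For example (a minimal sketch using the ``ii`` index defined above; the result
+shown in the comment is illustrative):
+
+.. code-block:: python
+
+   # (1, 5] has an exact match at position 1; (2, 6] has no exact match
+   ii.get_indexer([pd.Interval(1, 5), pd.Interval(2, 6)])
+   # array([ 1, -1])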
+
+These indexing changes extend to querying a :class:`Series` or :class:`DataFrame` with an ``IntervalIndex`` index.
+
+.. ipython:: python
+
+ s = pd.Series(list('abc'), index=ii)
+ s
+
+Selecting from a ``Series`` or ``DataFrame`` using ``[]`` (``__getitem__``) or ``loc`` now only returns exact matches for ``Interval`` queries.
+
+*Previous behavior*:
+
+.. code-block:: python
+
+ In [8]: s[pd.Interval(1, 5)]
+ Out[8]:
+ (0, 4] a
+ (1, 5] b
+ dtype: object
+
+ In [9]: s.loc[pd.Interval(1, 5)]
+ Out[9]:
+ (0, 4] a
+ (1, 5] b
+ dtype: object
+
+*New behavior*:
+
+.. ipython:: python
+
+ s[pd.Interval(1, 5)]
+ s.loc[pd.Interval(1, 5)]
+
+Similarly, a ``KeyError`` will be raised for non-exact matches instead of returning overlapping matches.
+
+*Previous behavior*:
+
+.. code-block:: python
+
+ In [9]: s[pd.Interval(2, 3)]
+ Out[9]:
+ (0, 4] a
+ (1, 5] b
+ dtype: object
+
+ In [10]: s.loc[pd.Interval(2, 3)]
+ Out[10]:
+ (0, 4] a
+ (1, 5] b
+ dtype: object
+
+*New behavior*:
+
+.. code-block:: python
+
+ In [6]: s[pd.Interval(2, 3)]
+ ---------------------------------------------------------------------------
+ KeyError: Interval(2, 3, closed='right')
+
+ In [7]: s.loc[pd.Interval(2, 3)]
+ ---------------------------------------------------------------------------
+ KeyError: Interval(2, 3, closed='right')
+
+The :meth:`~IntervalIndex.overlaps` method can be used to create a boolean indexer that replicates the
+previous behavior of returning overlapping matches.
+
+*New behavior*:
+
+.. ipython:: python
+
+ idxr = s.index.overlaps(pd.Interval(2, 3))
+ idxr
+ s[idxr]
+ s.loc[idxr]
+
+
+.. _whatsnew_0250.api_breaking.ufunc:
+
+Binary ufuncs on Series now align
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Applying a binary ufunc like :func:`numpy.power` now aligns the inputs
+when both are :class:`Series` (:issue:`23293`).
+
+.. ipython:: python
+
+ s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
+ s2 = pd.Series([3, 4, 5], index=['d', 'c', 'b'])
+ s1
+ s2
+
+*Previous behavior*
+
+.. code-block:: ipython
+
+ In [5]: np.power(s1, s2)
+ Out[5]:
+ a 1
+ b 16
+ c 243
+ dtype: int64
+
+*New behavior*
+
+.. ipython:: python
+
+ np.power(s1, s2)
+
+This matches the behavior of other binary operations in pandas, like :meth:`Series.add`.
+To retain the previous behavior, convert the other ``Series`` to an array before
+applying the ufunc.
+
+.. ipython:: python
+
+ np.power(s1, s2.array)
+
+Categorical.argsort now places missing values at the end
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`Categorical.argsort` now places missing values at the end of the array, making it
+consistent with NumPy and the rest of pandas (:issue:`21801`).
+
+.. ipython:: python
+
+ cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)
+
+*Previous behavior*
+
+.. code-block:: ipython
+
+ In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)
+
+ In [3]: cat.argsort()
+ Out[3]: array([1, 2, 0])
+
+ In [4]: cat[cat.argsort()]
+ Out[4]:
+ [NaN, a, b]
+ categories (2, object): [a < b]
+
+*New behavior*
+
+.. ipython:: python
+
+ cat.argsort()
+ cat[cat.argsort()]
+
+.. _whatsnew_0250.api_breaking.list_of_dict:
+
+Column order is preserved when passing a list of dicts to DataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Starting with Python 3.7, the key order of a ``dict`` is `guaranteed `_. In practice, this has been true since
+Python 3.6. The :class:`DataFrame` constructor now treats a list of dicts in the same way as
+it does a list of ``OrderedDict``, i.e. preserving the order of the dicts.
+This change applies only when pandas is running on Python >= 3.6 (:issue:`27309`).
+
+.. ipython:: python
+
+ data = [
+ {'name': 'Joe', 'state': 'NY', 'age': 18},
+ {'name': 'Jane', 'state': 'KY', 'age': 19, 'hobby': 'Minecraft'},
+ {'name': 'Jean', 'state': 'OK', 'age': 20, 'finances': 'good'}
+ ]
+
+*Previous behavior*:
+
+Previously, the columns were sorted lexicographically:
+
+.. code-block:: python
+
+ In [1]: pd.DataFrame(data)
+ Out[1]:
+ age finances hobby name state
+ 0 18 NaN NaN Joe NY
+ 1 19 NaN Minecraft Jane KY
+ 2 20 good NaN Jean OK
+
+*New behavior*:
+
+The column order now matches the insertion order of the keys in the ``dict``,
+considering all the records from top to bottom. As a consequence, the column
+order of the resulting DataFrame has changed compared to previous pandas versions.
+
+.. ipython:: python
+
+ pd.DataFrame(data)
+
.. _whatsnew_0250.api_breaking.deps:
Increased minimum versions for dependencies
@@ -411,7 +828,7 @@ If installed, we now require:
| pytest (dev) | 4.0.2 | |
+-----------------+-----------------+----------+
-For `optional libraries `_ the general recommendation is to use the latest version.
+For `optional libraries `_ the general recommendation is to use the latest version.
The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
Optional libraries below the lowest tested version may still work, but are not considered supported.
@@ -422,12 +839,18 @@ Optional libraries below the lowest tested version may still work, but are not c
+-----------------+-----------------+
| fastparquet | 0.2.1 |
+-----------------+-----------------+
+| gcsfs | 0.2.2 |
++-----------------+-----------------+
+| lxml | 3.8.0 |
++-----------------+-----------------+
| matplotlib | 2.2.2 |
+-----------------+-----------------+
| openpyxl | 2.4.8 |
+-----------------+-----------------+
| pyarrow | 0.9.0 |
+-----------------+-----------------+
+| pymysql | 0.7.1 |
++-----------------+-----------------+
| pytables | 3.4.2 |
+-----------------+-----------------+
| scipy | 0.19.0 |
@@ -447,7 +870,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
.. _whatsnew_0250.api.other:
-Other API Changes
+Other API changes
^^^^^^^^^^^^^^^^^
- :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`)
@@ -460,19 +883,24 @@ Other API Changes
- Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`)
- The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`)
- Removed support of gtk package for clipboards (:issue:`26563`)
+- Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
+- :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
+- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array. (:issue:`21801`)
+- :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extension data types for a ``fixed`` format. (:issue:`7775`)
+- Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`)
.. _whatsnew_0250.deprecations:
Deprecations
~~~~~~~~~~~~
-Sparse Subclasses
+Sparse subclasses
^^^^^^^^^^^^^^^^^
The ``SparseSeries`` and ``SparseDataFrame`` subclasses are deprecated. Their functionality is better-provided
by a ``Series`` or ``DataFrame`` with sparse values.
-**Previous Way**
+**Previous way**
.. ipython:: python
:okwarning:
@@ -480,7 +908,7 @@ by a ``Series`` or ``DataFrame`` with sparse values.
df = pd.SparseDataFrame({"A": [0, 0, 1, 2]})
df.dtypes
-**New Way**
+**New way**
.. ipython:: python
@@ -489,11 +917,18 @@ by a ``Series`` or ``DataFrame`` with sparse values.
The memory usage of the two approaches is identical. See :ref:`sparse.migration` for more (:issue:`19239`).
-Other Deprecations
+msgpack format
+^^^^^^^^^^^^^^
+
+The msgpack format is deprecated as of 0.25 and will be removed in a future version. It is recommended to use pyarrow for on-the-wire transmission of pandas objects. (:issue:`27084`)
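+
+A minimal sketch of one possible replacement, converting through Arrow's
+in-memory format (this assumes pyarrow is installed; the transport mechanism
+itself is up to the application):
+
+.. code-block:: python
+
+   import pandas as pd
+   import pyarrow as pa
+
+   df = pd.DataFrame({"a": [1, 2, 3]})
+   table = pa.Table.from_pandas(df)   # convert to an Arrow table
+   # ... ship the table bytes using Arrow IPC, Parquet, etc. ...
+   df_roundtrip = table.to_pandas()   # convert back on the receiving side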
+
+
+Other deprecations
^^^^^^^^^^^^^^^^^^
- The deprecated ``.ix[]`` indexer now raises a more visible ``FutureWarning`` instead of ``DeprecationWarning`` (:issue:`26438`).
- Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`)
+- :func:`pandas.concat` has deprecated the ``join_axes`` keyword. Instead, use :meth:`DataFrame.reindex` or :meth:`DataFrame.reindex_like` on the result or on the inputs (:issue:`21951`)
- The :attr:`SparseArray.values` attribute is deprecated. You can use ``np.asarray(...)`` or
the :meth:`SparseArray.to_dense` method instead (:issue:`26421`).
- The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`)
@@ -502,6 +937,20 @@ Other Deprecations
Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`).
- The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version.
Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`).
+- The :meth:`Series.get_values`, :meth:`DataFrame.get_values`, :meth:`Index.get_values`,
+ :meth:`SparseArray.get_values` and :meth:`Categorical.get_values` methods are deprecated.
+ One of ``np.asarray(..)`` or :meth:`~Series.to_numpy` can be used instead (:issue:`19617`).
+- The 'outer' method on NumPy ufuncs, e.g. ``np.subtract.outer``, has been deprecated on :class:`Series` objects. Convert the input to an array with :attr:`Series.array` first (:issue:`27186`)
+- :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`datetime.timedelta.resolution` (:issue:`21344`)
+- :func:`read_table` has been undeprecated. (:issue:`25220`)
+- :attr:`Index.dtype_str` is deprecated. (:issue:`18262`)
+- :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`)
+- :meth:`Series.put` is deprecated. (:issue:`18262`)
+- :meth:`Index.item` and :meth:`Series.item` are deprecated. (:issue:`18262`)
+- The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`)
+- :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`).
+- :meth:`DataFrame.get_dtype_counts` is deprecated. (:issue:`18262`)
+- :meth:`Categorical.ravel` will return a :class:`Categorical` instead of a ``np.ndarray`` (:issue:`27199`)
.. _whatsnew_0250.prior_deprecations:
@@ -515,10 +964,21 @@ Removal of prior version deprecations/changes
- Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`)
- Removed the previously deprecated ``convert_objects`` (:issue:`11221`)
- Removed the previously deprecated ``select`` method of ``DataFrame`` and ``Series`` (:issue:`17633`)
+- Removed the previously deprecated behavior of :class:`Series` treated as list-like in :meth:`~Series.cat.rename_categories` (:issue:`17982`)
+- Removed the previously deprecated ``DataFrame.reindex_axis`` and ``Series.reindex_axis`` (:issue:`17842`)
+- Removed the previously deprecated behavior of altering column or index labels with :meth:`Series.rename_axis` or :meth:`DataFrame.rename_axis` (:issue:`17842`)
+- Removed the previously deprecated ``tupleize_cols`` keyword argument in :meth:`read_html`, :meth:`read_csv`, and :meth:`DataFrame.to_csv` (:issue:`17877`, :issue:`17820`)
+- Removed the previously deprecated ``DataFrame.from_csv`` and ``Series.from_csv`` (:issue:`17812`)
+- Removed the previously deprecated ``raise_on_error`` keyword argument in :meth:`DataFrame.where` and :meth:`DataFrame.mask` (:issue:`17744`)
+- Removed the previously deprecated ``ordered`` and ``categories`` keyword arguments in ``astype`` (:issue:`17742`)
+- Removed the previously deprecated ``cdate_range`` (:issue:`17691`)
+- Removed the previously deprecated ``True`` option for the ``dropna`` keyword argument in :func:`SeriesGroupBy.nth` (:issue:`17493`)
+- Removed the previously deprecated ``convert`` keyword argument in :meth:`Series.take` and :meth:`DataFrame.take` (:issue:`17352`)
+- Removed the previously deprecated behavior of arithmetic operations with ``datetime.date`` objects (:issue:`21152`)
.. _whatsnew_0250.performance:
-Performance Improvements
+Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
@@ -527,6 +987,7 @@ Performance Improvements
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
- Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`, :issue:`26722`)
+- :class:`RangeIndex` now performs standard lookup without instantiating an actual hashtable, hence saving memory (:issue:`16685`)
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
- Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
@@ -539,10 +1000,15 @@ Performance Improvements
- Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`)
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
+- Improved performance by removing the need for a garbage collection when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
+- For :func:`to_datetime`, changed the default value of the ``cache`` parameter to ``True`` (:issue:`26043`)
+- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`).
+- Improved performance of :meth:`pd.read_json` for index-oriented data. (:issue:`26773`)
+- Improved performance of :meth:`MultiIndex.shape` (:issue:`27384`).
.. _whatsnew_0250.bug_fixes:
-Bug Fixes
+Bug fixes
~~~~~~~~~
@@ -551,7 +1017,7 @@ Categorical
- Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`)
- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`)
--
+- Bug in :meth:`DataFrame.dropna` when the :class:`DataFrame` has a :class:`CategoricalIndex` containing :class:`Interval` objects incorrectly raised a ``TypeError`` (:issue:`25087`)
Datetimelike
^^^^^^^^^^^^
@@ -567,6 +1033,12 @@ Datetimelike
- Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. (:issue:`26675`)
- Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'``
- Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`)
+- Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data, which will now correctly raise an ``OutOfBoundsDatetime`` error (:issue:`26206`).
+- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`)
+- Bug where adding :class:`Timestamp` to a ``np.timedelta64`` object would raise instead of returning a :class:`Timestamp` (:issue:`24775`)
+- Bug where comparing a zero-dimensional numpy array containing a ``np.datetime64`` object to a :class:`Timestamp` would incorrectly raise ``TypeError`` (:issue:`26916`)
+- Bug in :func:`to_datetime` which would raise ``ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True`` when called with ``cache=True``, with ``arg`` including datetime strings with different offset (:issue:`26097`)
+-
Timedelta
^^^^^^^^^
@@ -587,6 +1059,9 @@ Timezones
- Bug in :func:`to_datetime` where an uninformative ``RuntimeError`` was raised when passing a naive :class:`Timestamp` with datetime strings with mixed UTC offsets (:issue:`25978`)
- Bug in :func:`to_datetime` with ``unit='ns'`` would drop timezone information from the parsed argument (:issue:`26168`)
- Bug in :func:`DataFrame.join` where joining a timezone aware index with a timezone aware column would result in a column of ``NaN`` (:issue:`26335`)
+- Bug in :func:`date_range` where ambiguous or nonexistent start or end times were not handled by the ``ambiguous`` or ``nonexistent`` keywords respectively (:issue:`27088`)
+- Bug in :meth:`DatetimeIndex.union` when combining a timezone aware and timezone unaware :class:`DatetimeIndex` (:issue:`21671`)
+- Bug when applying a numpy reduction function (e.g. :meth:`numpy.minimum`) to a timezone aware :class:`Series` (:issue:`15552`)
Numeric
^^^^^^^
@@ -600,7 +1075,8 @@ Numeric
- Raises a helpful exception when a non-numeric index is sent to :meth:`interpolate` with methods which require numeric index. (:issue:`21662`)
- Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`)
- Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`)
--
+- Bug in ``divmod`` with a :class:`Series` object containing zeros incorrectly raising ``AttributeError`` (:issue:`26987`)
+- Inconsistency in :class:`Series` floor-division (``//``) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`)
-
Conversion
@@ -623,24 +1099,37 @@ Interval
- Construction of :class:`Interval` is restricted to numeric, :class:`Timestamp` and :class:`Timedelta` endpoints (:issue:`23013`)
- Fixed bug in :class:`Series`/:class:`DataFrame` not displaying ``NaN`` in :class:`IntervalIndex` with missing values (:issue:`25984`)
--
+- Bug in :meth:`IntervalIndex.get_loc` where a ``KeyError`` would be incorrectly raised for a decreasing :class:`IntervalIndex` (:issue:`25860`)
+- Bug in :class:`Index` constructor where passing mixed closed :class:`Interval` objects would result in a ``ValueError`` instead of an ``object`` dtype ``Index`` (:issue:`27172`)
Indexing
^^^^^^^^
- Improved exception message when calling :meth:`DataFrame.iloc` with a list of non-numeric objects (:issue:`25753`).
+- Improved exception message when calling ``.iloc`` or ``.loc`` with a boolean indexer with different length (:issue:`26658`).
+- Bug in ``KeyError`` exception message when indexing a :class:`MultiIndex` with a non-existent key not displaying the original key (:issue:`27250`).
+- Bug in ``.iloc`` and ``.loc`` with a boolean indexer not raising an ``IndexError`` when too few items are passed (:issue:`26658`).
- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` where ``KeyError`` was not raised for a ``MultiIndex`` when the key was less than or equal to the number of levels in the :class:`MultiIndex` (:issue:`14885`).
- Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`).
- Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`).
- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`)
- Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`)
-
+- Fixed a ``KeyError`` when indexing a :class:`MultiIndex` level with a list containing exactly one label, which is missing (:issue:`27148`)
+- Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`)
+- Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains__``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`)
+- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly returning a scalar instead of a :class:`Series` (:issue:`27110`)
+- Bug in :class:`CategoricalIndex` and :class:`Categorical` incorrectly raising ``ValueError`` instead of ``TypeError`` when a list is passed using the ``in`` operator (``__contains__``) (:issue:`21729`)
+- Bug in setting a new value in a :class:`Series` with a :class:`Timedelta` object incorrectly casting the value to an integer (:issue:`22717`)
+- Bug in :class:`Series` setting a new key (``__setitem__``) with a timezone-aware datetime incorrectly raising ``ValueError`` (:issue:`12862`)
+- Bug in :meth:`DataFrame.iloc` when indexing with a read-only indexer (:issue:`17192`)
+- Bug in :class:`Series` setting an existing tuple key (``__setitem__``) with timezone-aware datetime values incorrectly raising ``TypeError`` (:issue:`20441`)
Missing
^^^^^^^
- Fixed misleading exception message in :meth:`Series.interpolate` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`).
- Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`)
+- A ``ValueError`` will now be thrown by :meth:`DataFrame.fillna` when ``limit`` is not a positive integer (:issue:`27042`)
-
MultiIndex
@@ -658,13 +1147,13 @@ I/O
- Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`)
- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to :class:`Timestamp`, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`)
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`)
-- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`)
+- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
- Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`)
+- Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`)
- Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
-- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`)
- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
- Improved the ``col_space`` parameter in :meth:`DataFrame.to_html` to accept a string so CSS length values can be set correctly (:issue:`25941`)
@@ -677,6 +1166,11 @@ I/O
- Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`)
- :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`)
- Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`).
+- Fixed bug in :meth:`DataFrame.to_excel` where custom objects (e.g. ``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`)
+- Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`)
+- Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`)
+- Fixed bug in :meth:`DataFrame.to_parquet` which would raise a ``ValueError`` when the dataframe had no columns (:issue:`27339`)
+- Allow parsing of :class:`PeriodDtype` columns when using :func:`read_csv` (:issue:`26934`)
Plotting
^^^^^^^^
@@ -685,11 +1179,11 @@ Plotting
- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`)
- Bug in incorrect ticklabel positions when plotting an index that are non-numeric / non-datetime (:issue:`7612`, :issue:`15912`, :issue:`22334`)
- Fixed bug causing plots of :class:`PeriodIndex` timeseries to fail if the frequency is a multiple of the frequency rule code (:issue:`14763`)
--
+- Fixed bug when plotting a :class:`DatetimeIndex` with ``datetime.timezone.utc`` timezone (:issue:`17173`)
-
-
-Groupby/Resample/Rolling
+Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`)
@@ -709,6 +1203,11 @@ Groupby/Resample/Rolling
- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`)
- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`)
- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`)
+- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`)
+- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`)
+- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors; a ``DataError`` is now raised only if all columns are nuisance (:issue:`12537`)
+- Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`)
+- Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`pandas.core.window.Window.aggregate` (:issue:`26597`)
Reshaping
^^^^^^^^^
@@ -728,6 +1227,10 @@ Reshaping
- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed ``DataFrame`` is sorted on all levels with the initial level sorted last (:issue:`26053`)
- Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`)
- Bug in :func:`DataFrame.pivot_table` with a :class:`IntervalIndex` as pivot index would raise ``TypeError`` (:issue:`25814`)
+- Bug in which :meth:`DataFrame.from_dict` ignored order of ``OrderedDict`` when ``orient='index'`` (:issue:`8425`).
+- Bug in :meth:`DataFrame.transpose` where transposing a DataFrame with a timezone-aware datetime column would incorrectly raise ``ValueError`` (:issue:`26825`)
+- Bug in :func:`pivot_table` when pivoting a timezone aware column as the ``values`` would remove timezone information (:issue:`14948`)
+- Bug in :func:`merge_asof` when specifying multiple ``by`` columns where one is ``datetime64[ns, tz]`` dtype (:issue:`26649`)
Sparse
^^^^^^
@@ -736,13 +1239,29 @@ Sparse
- Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`)
- Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`)
- Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`)
+- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`).
+
+
+Build changes
+^^^^^^^^^^^^^
+
+- Fix install error with PyPy on macOS (:issue:`26536`)
+
+ExtensionArray
+^^^^^^^^^^^^^^
+
+- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`).
+- :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`)
+- Added ``Series.__array_ufunc__`` to better handle NumPy ufuncs applied to Series backed by extension arrays (:issue:`23293`).
+- Keyword argument ``deep`` has been removed from :meth:`ExtensionArray.copy` (:issue:`27083`)
Other
^^^^^
- Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`)
-- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`).
- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`)
+- Use actual class name in repr of empty objects of a ``Series`` subclass (:issue:`27001`).
+- Bug in :class:`DataFrame` where passing an object array of timezone-aware `datetime` objects would incorrectly raise ``ValueError`` (:issue:`13287`)
.. _whatsnew_0.250.contributors:
diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
new file mode 100644
index 0000000000000..63dd56f4a3793
--- /dev/null
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -0,0 +1,119 @@
+.. _whatsnew_0251:
+
+What's new in 0.25.1 (August 21, 2019)
+--------------------------------------
+
+These are the changes in pandas 0.25.1. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+I/O and LZMA
+~~~~~~~~~~~~
+
+Some users may unknowingly have an incomplete Python installation lacking the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`).
+pandas will now warn, rather than raise an ``ImportError``, if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``.
+A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python.
+For example, on macOS, installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (such as ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python.
+
+.. _whatsnew_0251.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+Categorical
+^^^^^^^^^^^
+
+- Bug in :meth:`Categorical.fillna` that would replace all values, not just those that are ``NaN`` (:issue:`26215`)
+
+Datetimelike
+^^^^^^^^^^^^
+
+- Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`)
+- Bug in :meth:`Period.to_timestamp` where a :class:`Period` outside the :class:`Timestamp` implementation bounds (roughly 1677-09-21 to 2262-04-11) would return an incorrect :class:`Timestamp` instead of raising ``OutOfBoundsDatetime`` (:issue:`19643`)
+- Bug in iterating over :class:`DatetimeIndex` when the underlying data is read-only (:issue:`28055`)
+
+Timezones
+^^^^^^^^^
+
+- Bug in :class:`Index` where a numpy object array with a timezone aware :class:`Timestamp` and ``np.nan`` would not return a :class:`DatetimeIndex` (:issue:`27011`)
+
+Numeric
+^^^^^^^
+
+- Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`)
+- Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`)
+- Bug where :class:`DataFrame` arithmetic operators such as :meth:`DataFrame.mul` with a :class:`Series` with axis=1 would raise an ``AttributeError`` on :class:`DataFrame` larger than the minimum threshold to invoke numexpr (:issue:`27636`)
+- Bug in :class:`DataFrame` arithmetic where missing values in results were incorrectly masked with ``NaN`` instead of ``Inf`` (:issue:`27464`)
+
+Conversion
+^^^^^^^^^^
+
+- Improved the warnings for the deprecated methods :meth:`Series.real` and :meth:`Series.imag` (:issue:`27610`)
+
+Interval
+^^^^^^^^
+
+- Bug in :class:`IntervalIndex` where ``dir(obj)`` would raise ``ValueError`` (:issue:`27571`)
+
+Indexing
+^^^^^^^^
+
+- Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
+- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
+- Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
+- Fix regression in ``.ix`` fallback with an ``IntervalIndex`` (:issue:`27865`).
+
+Missing
+^^^^^^^
+
+- Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type e.g. ``type(pandas.Series())`` (:issue:`27482`)
+
+I/O
+^^^
+
+- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
+- Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
+- Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the HTML repr in the notebook (:issue:`27991`).
+
+Plotting
+^^^^^^^^
+
+- Added a ``pandas_plotting_backends`` entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`).
+- Fixed the reinstatement of Matplotlib datetime converters after calling
+ :meth:`pandas.plotting.deregister_matplotlib_converters` (:issue:`27481`).
+- Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`).
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Fixed regression in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`)
+- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
+- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`)
+- Bug in windowing over read-only arrays (:issue:`27766`)
+- Fixed segfault in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`)
+
+Reshaping
+^^^^^^^^^
+
+- A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`)
+- Bug where :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing the ``tolerance`` keyword argument (:issue:`27642`)
+- Bug in :meth:`DataFrame.crosstab` where an error was raised when ``margins`` was set to ``True`` and ``normalize`` was not ``False`` (:issue:`27500`)
+- :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`)
+- Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`)
+
+Sparse
+^^^^^^
+
+- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`)
+
+Other
+^^^^^
+
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`)
+- Bug in :meth:`Series.rename` when using a custom type indexer. Now any value that isn't callable or dict-like is treated as a scalar. (:issue:`27814`)
+
+.. _whatsnew_0.251.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v0.25.0..HEAD
diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst
new file mode 100644
index 0000000000000..6974c7521a237
--- /dev/null
+++ b/doc/source/whatsnew/v0.25.2.rst
@@ -0,0 +1,109 @@
+.. _whatsnew_0252:
+
+What's new in 0.25.2 (October XX, 2019)
+---------------------------------------
+
+These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+.. _whatsnew_0252.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+Categorical
+^^^^^^^^^^^
+
+-
+
+Datetimelike
+^^^^^^^^^^^^
+
+-
+-
+-
+
+Timezones
+^^^^^^^^^
+
+-
+
+Numeric
+^^^^^^^
+
+-
+-
+-
+-
+
+Conversion
+^^^^^^^^^^
+
+-
+
+Interval
+^^^^^^^^
+
+-
+
+Indexing
+^^^^^^^^
+
+-
+-
+-
+-
+
+Missing
+^^^^^^^
+
+-
+
+I/O
+^^^
+
+-
+-
+-
+
+Plotting
+^^^^^^^^
+
+-
+-
+-
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Bug where an ``IndexError`` was incorrectly raised when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
+-
+-
+-
+
+Reshaping
+^^^^^^^^^
+
+-
+-
+-
+-
+-
+
+Sparse
+^^^^^^
+
+-
+
+Other
+^^^^^
+
+- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`)
+-
+
+.. _whatsnew_0.252.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v0.25.1..HEAD
diff --git a/doc/source/whatsnew/v0.4.x.rst b/doc/source/whatsnew/v0.4.x.rst
index 0c2047ee69b81..8e41e528f5b75 100644
--- a/doc/source/whatsnew/v0.4.x.rst
+++ b/doc/source/whatsnew/v0.4.x.rst
@@ -5,7 +5,7 @@ v.0.4.1 through v0.4.3 (September 25 - October 9, 2011)
{{ header }}
-New Features
+New features
~~~~~~~~~~~~
- Added Python 3 support using 2to3 (:issue:`200`)
@@ -32,7 +32,7 @@ New Features
- :ref:`Enable ` unstacking by name (:issue:`142`)
- :ref:`Enable ` ``sortlevel`` to work by level (:issue:`141`)
-Performance Enhancements
+Performance enhancements
~~~~~~~~~~~~~~~~~~~~~~~~
- Altered binary operations on differently-indexed SparseSeries objects
diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst
index 4e635a5fe6859..37c52ac7bb34e 100644
--- a/doc/source/whatsnew/v0.5.0.rst
+++ b/doc/source/whatsnew/v0.5.0.rst
@@ -12,7 +12,7 @@ v.0.5.0 (October 24, 2011)
from pandas import * # noqa F401, F403
-New Features
+New features
~~~~~~~~~~~~
- :ref:`Added ` ``DataFrame.align`` method with standard join options
@@ -36,7 +36,7 @@ New Features
- :ref:`Added ` support for different delimiters in ``DataFrame.to_csv`` (:issue:`244`)
- TODO: DOCS ABOUT TAKE METHODS
-Performance Enhancements
+Performance enhancements
~~~~~~~~~~~~~~~~~~~~~~~~
- VBENCH Major performance improvements in file parsing functions ``read_csv`` and ``read_table``
diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst
index c0aba18d08b27..973ba897b3234 100644
--- a/doc/source/whatsnew/v0.6.0.rst
+++ b/doc/source/whatsnew/v0.6.0.rst
@@ -11,7 +11,7 @@ v.0.6.0 (November 25, 2011)
from pandas import * # noqa F401, F403
-New Features
+New features
~~~~~~~~~~~~
- :ref:`Added ` ``melt`` function to ``pandas.core.reshape``
- :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`)
@@ -49,7 +49,7 @@ New Features
- :ref:`Added ` ``raw`` option to ``DataFrame.apply`` for performance if only need ndarray (:issue:`309`)
- Added proper, tested weighted least squares to standard and panel OLS (:issue:`303`)
-Performance Enhancements
+Performance enhancements
~~~~~~~~~~~~~~~~~~~~~~~~
- VBENCH Cythonized ``cache_readonly``, resulting in substantial micro-performance enhancements throughout the code base (:issue:`361`)
- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than `np.apply_along_axis` (:issue:`309`)
diff --git a/doc/source/whatsnew/v0.7.0.rst b/doc/source/whatsnew/v0.7.0.rst
index deff214354e2b..a63cd37e47dc2 100644
--- a/doc/source/whatsnew/v0.7.0.rst
+++ b/doc/source/whatsnew/v0.7.0.rst
@@ -109,7 +109,7 @@ New features
- :ref:`Added ` ``level`` argument to ``xs`` method of DataFrame.
-API Changes to integer indexing
+API changes to integer indexing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
One of the potentially riskiest API changes in 0.7.0, but also one of the most
@@ -255,7 +255,7 @@ In the case of integer indexes, the behavior will be exactly as before
If you wish to do indexing with sequences and slicing on an integer index with
label semantics, use ``ix``.
-Other API Changes
+Other API changes
~~~~~~~~~~~~~~~~~
- The deprecated ``LongPanel`` class has been completely removed
diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst
index 24bb756d66d68..020cf3bdc2d59 100644
--- a/doc/source/whatsnew/v0.7.3.rst
+++ b/doc/source/whatsnew/v0.7.3.rst
@@ -25,8 +25,6 @@ New features
from pandas.tools.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2) # noqa F821
-.. image:: ../savefig/scatter_matrix_kde.png
- :width: 5in
- Add ``stacked`` argument to Series and DataFrame's ``plot`` method for
:ref:`stacked bar plots `.
@@ -35,22 +33,18 @@ New features
df.plot(kind='bar', stacked=True) # noqa F821
-.. image:: ../savefig/bar_plot_stacked_ex.png
- :width: 4in
.. code-block:: python
df.plot(kind='barh', stacked=True) # noqa F821
-.. image:: ../savefig/barh_plot_stacked_ex.png
- :width: 4in
- Add log x and y :ref:`scaling options ` to
``DataFrame.plot`` and ``Series.plot``
- Add ``kurt`` methods to Series and DataFrame for computing kurtosis
-NA Boolean Comparison API Change
+NA Boolean comparison API change
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Reverted some changes to how NA values (represented typically as ``NaN`` or
@@ -79,7 +73,7 @@ in numerical arrays, would cause a large amount of problems for users. Thus, a
"practicality beats purity" approach was taken. This issue may be revisited at
some point in the future.
-Other API Changes
+Other API changes
~~~~~~~~~~~~~~~~~
When calling ``apply`` on a grouped Series, the return value will also be a
diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst
index 575ec6b7d19f4..664325ac063c0 100644
--- a/doc/source/whatsnew/v0.8.0.rst
+++ b/doc/source/whatsnew/v0.8.0.rst
@@ -77,7 +77,7 @@ Time series changes and improvements
interface while enabling working with nanosecond-resolution data. Also
provides :ref:`easy time zone conversions `.
- Enhanced support for :ref:`time zones `. Add
- `tz_convert` and ``tz_lcoalize`` methods to TimeSeries and DataFrame. All
+ `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All
timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time
zone set will be localized to local time. Time zone conversions are therefore
essentially free. User needs to know very little about pytz library now; only
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
new file mode 100644
index 0000000000000..050a26cc86d42
--- /dev/null
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -0,0 +1,217 @@
+.. _whatsnew_1000:
+
+What's new in 1.0.0 (??)
+------------------------
+
+.. warning::
+
+ Starting with the 0.25.x series of releases, pandas only supports Python 3.5.3 and higher.
+ See `Dropping Python 2.7 `_ for more details.
+
+.. warning::
+
+ The minimum supported Python version will be bumped to 3.6 in a future release.
+
+{{ header }}
+
+These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+
+Enhancements
+~~~~~~~~~~~~
+
+-
+-
+
+.. _whatsnew_1000.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+-
+-
+
+.. _whatsnew_1000.api_breaking:
+
+Backwards incompatible API changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- :meth:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
+-
+
+.. _whatsnew_1000.api.other:
+
+Other API changes
+^^^^^^^^^^^^^^^^^
+
+- :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for a mix of integers and ``np.nan`` (:issue:`27283`)
+-
+-
+
+.. _whatsnew_1000.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+
+-
+-
+
+.. _whatsnew_1000.prior_deprecations:
+
+Removal of prior version deprecations/changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`)
+- Changed the default value of ``inplace`` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to ``False`` (:issue:`27600`)
+- :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`)
+- :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`)
+- Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`)
+- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`)
+
+.. _whatsnew_1000.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
+- Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
+- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
+- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
+
+
+.. _whatsnew_1000.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+
+Categorical
+^^^^^^^^^^^
+
+- Added test to assert that :func:`fillna` raises the correct ``ValueError`` message when the value is not one of the categories (:issue:`13628`)
+-
+-
+
+
+Datetimelike
+^^^^^^^^^^^^
+- Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`)
+- Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`)
+-
+
+
+Timedelta
+^^^^^^^^^
+
+-
+-
+
+Timezones
+^^^^^^^^^
+
+-
+-
+
+
+Numeric
+^^^^^^^
+- Bug in :meth:`DataFrame.quantile` with zero-column :class:`DataFrame` incorrectly raising (:issue:`23925`)
+-
+-
+
+Conversion
+^^^^^^^^^^
+
+-
+-
+
+Strings
+^^^^^^^
+
+-
+-
+
+
+Interval
+^^^^^^^^
+
+-
+-
+
+Indexing
+^^^^^^^^
+
+- Bug in assignment using a reverse slicer (:issue:`26939`)
+-
+
+Missing
+^^^^^^^
+
+-
+-
+
+MultiIndex
+^^^^^^^^^^
+
+-
+-
+
+I/O
+^^^
+
+- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`)
+- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`)
+-
+
+Plotting
+^^^^^^^^
+
+- Bug in :meth:`Series.plot` not being able to plot boolean values (:issue:`23719`)
+-
+- Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
+- Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
+- Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` producing a wrong ``xlim`` on the x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`)
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+-
+-
+- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
+
+Reshaping
+^^^^^^^^^
+
+-
+-
+
+Sparse
+^^^^^^
+
+-
+-
+
+
+Build changes
+^^^^^^^^^^^^^
+- Fixed the pyqt development dependency, which has a different package name on conda than on PyPI (:issue:`26838`)
+
+
+ExtensionArray
+^^^^^^^^^^^^^^
+
+-
+-
+
+
+Other
+^^^^^
+- Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`)
+- Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`)
+- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept a dict as the ``compression`` argument, where the ``'method'`` key gives the compression method and the remaining keys are passed as additional compression options when the method is ``'zip'``. (:issue:`26023`)
+
+
+.. _whatsnew_1000.contributors:
+
+Contributors
+~~~~~~~~~~~~
diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
index 950e3592abf6e..1a5ab99b5a94f 100755
--- a/doc/sphinxext/announce.py
+++ b/doc/sphinxext/announce.py
@@ -40,7 +40,7 @@
from git import Repo
-UTF8Writer = codecs.getwriter('utf8')
+UTF8Writer = codecs.getwriter("utf8")
this_repo = Repo(os.path.join(os.path.dirname(__file__), "..", ".."))
author_msg = """\
@@ -54,21 +54,19 @@
def get_authors(revision_range):
- pat = '^.*\\t(.*)$'
- lst_release, cur_release = [r.strip() for r in revision_range.split('..')]
+ pat = "^.*\\t(.*)$"
+ lst_release, cur_release = [r.strip() for r in revision_range.split("..")]
# authors, in current release and previous to current release.
- cur = set(re.findall(pat, this_repo.git.shortlog('-s', revision_range),
- re.M))
- pre = set(re.findall(pat, this_repo.git.shortlog('-s', lst_release),
- re.M))
+ cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M))
+ pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M))
# Homu is the author of auto merges, clean him out.
- cur.discard('Homu')
- pre.discard('Homu')
+ cur.discard("Homu")
+ pre.discard("Homu")
# Append '+' to new authors.
- authors = [s + ' +' for s in cur - pre] + [s for s in cur & pre]
+ authors = [s + " +" for s in cur - pre] + [s for s in cur & pre]
authors.sort()
return authors
@@ -77,19 +75,19 @@ def get_pull_requests(repo, revision_range):
prnums = []
# From regular merges
- merges = this_repo.git.log(
- '--oneline', '--merges', revision_range)
+ merges = this_repo.git.log("--oneline", "--merges", revision_range)
issues = re.findall("Merge pull request \\#(\\d*)", merges)
prnums.extend(int(s) for s in issues)
# From Homu merges (Auto merges)
- issues = re. findall("Auto merge of \\#(\\d*)", merges)
+ issues = re.findall("Auto merge of \\#(\\d*)", merges)
prnums.extend(int(s) for s in issues)
# From fast forward squash-merges
commits = this_repo.git.log(
- '--oneline', '--no-merges', '--first-parent', revision_range)
- issues = re.findall('^.*\\(\\#(\\d+)\\)$', commits, re.M)
+ "--oneline", "--no-merges", "--first-parent", revision_range
+ )
+ issues = re.findall("^.*\\(\\#(\\d+)\\)$", commits, re.M)
prnums.extend(int(s) for s in issues)
# get PR data from github repo
@@ -99,27 +97,29 @@ def get_pull_requests(repo, revision_range):
def build_components(revision_range, heading="Contributors"):
- lst_release, cur_release = [r.strip() for r in revision_range.split('..')]
+ lst_release, cur_release = [r.strip() for r in revision_range.split("..")]
authors = get_authors(revision_range)
return {
- 'heading': heading,
- 'author_message': author_msg % len(authors),
- 'authors': authors,
+ "heading": heading,
+ "author_message": author_msg % len(authors),
+ "authors": authors,
}
def build_string(revision_range, heading="Contributors"):
components = build_components(revision_range, heading=heading)
- components['uline'] = '=' * len(components['heading'])
- components['authors'] = "* " + "\n* ".join(components['authors'])
+ components["uline"] = "=" * len(components["heading"])
+ components["authors"] = "* " + "\n* ".join(components["authors"])
- tpl = textwrap.dedent("""\
+ tpl = textwrap.dedent(
+ """\
{heading}
{uline}
{author_message}
- {authors}""").format(**components)
+ {authors}"""
+ ).format(**components)
return tpl
@@ -133,6 +133,6 @@ def main(revision_range):
from argparse import ArgumentParser
parser = ArgumentParser(description="Generate author lists for release")
- parser.add_argument('revision_range', help='..')
+ parser.add_argument("revision_range", help="..")
args = parser.parse_args()
main(args.revision_range)
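The reformatted ``announce.py`` builds release contributor lists from a git revision range. A minimal usage sketch, assuming the script is importable (for example by running Python from ``doc/sphinxext`` inside a pandas clone with gitpython installed) and that the example tags exist locally; the revision range below is illustrative only:

.. code-block:: python

    # Equivalent to running: python announce.py v0.24.0..v0.25.0
    from announce import build_string

    # Prints the heading, an author-count message, and a bulleted author list.
    print(build_string("v0.24.0..v0.25.0", heading="Contributors"))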
diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py
index 179ba19a0908a..4256e4659715d 100644
--- a/doc/sphinxext/contributors.py
+++ b/doc/sphinxext/contributors.py
@@ -17,37 +17,36 @@
class ContributorsDirective(Directive):
required_arguments = 1
- name = 'contributors'
+ name = "contributors"
def run(self):
range_ = self.arguments[0]
+ if range_.endswith("x..HEAD"):
+ return [nodes.paragraph(), nodes.bullet_list()]
try:
components = build_components(range_)
- except git.GitCommandError:
+ except git.GitCommandError as exc:
return [
self.state.document.reporter.warning(
- "Cannot find contributors for range '{}'".format(range_),
- line=self.lineno)
+ "Cannot find contributors for range '{}': {}".format(range_, exc),
+ line=self.lineno,
+ )
]
else:
message = nodes.paragraph()
- message += nodes.Text(components['author_message'])
+ message += nodes.Text(components["author_message"])
listnode = nodes.bullet_list()
- for author in components['authors']:
+ for author in components["authors"]:
para = nodes.paragraph()
para += nodes.Text(author)
- listnode += nodes.list_item('', para)
+ listnode += nodes.list_item("", para)
return [message, listnode]
def setup(app):
- app.add_directive('contributors', ContributorsDirective)
+ app.add_directive("contributors", ContributorsDirective)
- return {
- 'version': '0.1',
- 'parallel_read_safe': True,
- 'parallel_write_safe': True,
- }
+ return {"version": "0.1", "parallel_read_safe": True, "parallel_write_safe": True}
diff --git a/environment.yml b/environment.yml
index de9bd67dd9f06..6d2cd701c3854 100644
--- a/environment.yml
+++ b/environment.yml
@@ -16,6 +16,7 @@ dependencies:
- cython>=0.28.2
# code checks
+ - black
- cpplint
- flake8
- flake8-comprehensions # used by flake8, linting of unnecessary comprehensions
@@ -70,7 +71,7 @@ dependencies:
- lxml # pandas.read_html
- openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- pyarrow>=0.9.0 # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
- - pyqt # pandas.read_clipbobard
+ - pyqt>=5.9.2 # pandas.read_clipboard
- pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf
- python-snappy # required by pyarrow
- s3fs # pandas.read_csv... when using 's3://...' path
@@ -79,5 +80,4 @@ dependencies:
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- - pip:
- - pyreadstat # pandas.read_spss
+ - pyreadstat # pandas.read_spss
diff --git a/mypy.ini b/mypy.ini
deleted file mode 100644
index eea6a3b551677..0000000000000
--- a/mypy.ini
+++ /dev/null
@@ -1,9 +0,0 @@
-[mypy]
-ignore_missing_imports=True
-follow_imports=silent
-
-[mypy-pandas.conftest,pandas.tests.*]
-ignore_errors=True
-
-[mypy-pandas.core.indexes.datetimelike]
-ignore_errors=True
diff --git a/pandas/__init__.py b/pandas/__init__.py
index b95c312f12eed..6351b508fb0e5 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -1,6 +1,6 @@
# flake8: noqa
-__docformat__ = 'restructuredtext'
+__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
hard_dependencies = ("numpy", "pytz", "dateutil")
@@ -13,63 +13,113 @@
missing_dependencies.append("{0}: {1}".format(dependency, str(e)))
if missing_dependencies:
- raise ImportError("Unable to import required dependencies:\n" + "\n".join(missing_dependencies))
+ raise ImportError(
+ "Unable to import required dependencies:\n" + "\n".join(missing_dependencies)
+ )
del hard_dependencies, dependency, missing_dependencies
# numpy compat
from pandas.compat.numpy import (
- _np_version_under1p14, _np_version_under1p15, _np_version_under1p16,
- _np_version_under1p17)
+ _np_version_under1p14,
+ _np_version_under1p15,
+ _np_version_under1p16,
+ _np_version_under1p17,
+)
try:
- from pandas._libs import (hashtable as _hashtable,
- lib as _lib,
- tslib as _tslib)
+ from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib
except ImportError as e: # pragma: no cover
# hack but overkill to use re
- module = str(e).replace('cannot import name ', '')
- raise ImportError("C extension: {0} not built. If you want to import "
- "pandas from the source directory, you may need to run "
- "'python setup.py build_ext --inplace --force' to build "
- "the C extensions first.".format(module))
+ module = str(e).replace("cannot import name ", "")
+ raise ImportError(
+ "C extension: {0} not built. If you want to import "
+ "pandas from the source directory, you may need to run "
+ "'python setup.py build_ext --inplace --force' to build "
+ "the C extensions first.".format(module)
+ )
from datetime import datetime
-from pandas._config import (get_option, set_option, reset_option,
- describe_option, option_context, options)
+from pandas._config import (
+ get_option,
+ set_option,
+ reset_option,
+ describe_option,
+ option_context,
+ options,
+)
# let init-time option registration happen
import pandas.core.config_init
from pandas.core.api import (
# dtype
- Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype,
- UInt16Dtype, UInt32Dtype, UInt64Dtype, CategoricalDtype,
- PeriodDtype, IntervalDtype, DatetimeTZDtype,
-
+ Int8Dtype,
+ Int16Dtype,
+ Int32Dtype,
+ Int64Dtype,
+ UInt8Dtype,
+ UInt16Dtype,
+ UInt32Dtype,
+ UInt64Dtype,
+ CategoricalDtype,
+ PeriodDtype,
+ IntervalDtype,
+ DatetimeTZDtype,
# missing
- isna, isnull, notna, notnull,
-
+ isna,
+ isnull,
+ notna,
+ notnull,
# indexes
- Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex,
- Float64Index, MultiIndex, IntervalIndex, TimedeltaIndex,
- DatetimeIndex, PeriodIndex, IndexSlice,
-
+ Index,
+ CategoricalIndex,
+ Int64Index,
+ UInt64Index,
+ RangeIndex,
+ Float64Index,
+ MultiIndex,
+ IntervalIndex,
+ TimedeltaIndex,
+ DatetimeIndex,
+ PeriodIndex,
+ IndexSlice,
# tseries
- NaT, Period, period_range, Timedelta, timedelta_range,
- Timestamp, date_range, bdate_range, Interval, interval_range,
+ NaT,
+ Period,
+ period_range,
+ Timedelta,
+ timedelta_range,
+ Timestamp,
+ date_range,
+ bdate_range,
+ Interval,
+ interval_range,
DateOffset,
-
# conversion
- to_numeric, to_datetime, to_timedelta,
-
+ to_numeric,
+ to_datetime,
+ to_timedelta,
# misc
- np, Grouper, factorize, unique, value_counts, NamedAgg,
- array, Categorical, set_eng_float_format, Series, DataFrame,
- Panel)
+ np,
+ Grouper,
+ factorize,
+ unique,
+ value_counts,
+ NamedAgg,
+ array,
+ Categorical,
+ set_eng_float_format,
+ Series,
+ DataFrame,
+)
from pandas.core.sparse.api import (
- SparseArray, SparseDataFrame, SparseSeries, SparseDtype)
+ SparseArray,
+ SparseDataFrame,
+ SparseSeries,
+ SparseDtype,
+)
from pandas.tseries.api import infer_freq
from pandas.tseries import offsets
@@ -77,35 +127,56 @@
from pandas.core.computation.api import eval
from pandas.core.reshape.api import (
- concat, lreshape, melt, wide_to_long, merge, merge_asof,
- merge_ordered, crosstab, pivot, pivot_table, get_dummies,
- cut, qcut)
+ concat,
+ lreshape,
+ melt,
+ wide_to_long,
+ merge,
+ merge_asof,
+ merge_ordered,
+ crosstab,
+ pivot,
+ pivot_table,
+ get_dummies,
+ cut,
+ qcut,
+)
from pandas.util._print_versions import show_versions
from pandas.io.api import (
# excel
- ExcelFile, ExcelWriter, read_excel,
-
+ ExcelFile,
+ ExcelWriter,
+ read_excel,
# packers
- read_msgpack, to_msgpack,
-
+ read_msgpack,
+ to_msgpack,
# parsers
- read_csv, read_fwf, read_table,
-
+ read_csv,
+ read_fwf,
+ read_table,
# pickle
- read_pickle, to_pickle,
-
+ read_pickle,
+ to_pickle,
# pytables
- HDFStore, read_hdf,
-
+ HDFStore,
+ read_hdf,
# sql
- read_sql, read_sql_query,
+ read_sql,
+ read_sql_query,
read_sql_table,
-
# misc
- read_clipboard, read_parquet, read_feather, read_gbq,
- read_html, read_json, read_stata, read_sas, read_spss)
+ read_clipboard,
+ read_parquet,
+ read_feather,
+ read_gbq,
+ read_html,
+ read_json,
+ read_stata,
+ read_sas,
+ read_spss,
+)
from pandas.util._tester import test
import pandas.testing
@@ -113,11 +184,42 @@
# use the closest tagged version if possible
from ._version import get_versions
+
v = get_versions()
-__version__ = v.get('closest-tag', v['version'])
-__git_version__ = v.get('full-revisionid')
+__version__ = v.get("closest-tag", v["version"])
+__git_version__ = v.get("full-revisionid")
del get_versions, v
+
+# GH 27101
+# TODO: remove Panel compat in 1.0
+if pandas.compat.PY37:
+
+ def __getattr__(name):
+ if name == "Panel":
+ import warnings
+
+ warnings.warn(
+ "The Panel class is removed from pandas. Accessing it "
+ "from the top-level namespace will also be removed in "
+ "the next version",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ class Panel:
+ pass
+
+ return Panel
+ raise AttributeError("module 'pandas' has no attribute '{}'".format(name))
+
+
+else:
+
+ class Panel:
+ pass
+
+
# module level doc-string
__doc__ = """
pandas - a powerful data analysis and manipulation library for Python
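The ``__getattr__`` added above uses the module-level attribute hook from PEP 562 (available from Python 3.7) so that ``pandas.Panel`` keeps resolving but emits a ``FutureWarning``; on older Pythons a plain placeholder class is defined instead. A standalone sketch of the same pattern, where ``mylib`` and ``OldThing`` are illustrative names and not part of this patch:

.. code-block:: python

    # mylib.py -- illustrative module demonstrating a PEP 562 deprecation shim
    import warnings


    def __getattr__(name):
        # Called only when normal attribute lookup on the module fails.
        if name == "OldThing":
            warnings.warn(
                "OldThing is deprecated and will be removed in a future version",
                FutureWarning,
                stacklevel=2,
            )

            class OldThing:
                pass

            return OldThing
        raise AttributeError("module 'mylib' has no attribute '{}'".format(name))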
diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
index bf221ea444288..65936a9fcdbf3 100644
--- a/pandas/_config/__init__.py
+++ b/pandas/_config/__init__.py
@@ -5,11 +5,24 @@
importing `dates` and `display` ensures that keys needed by _libs
are initialized.
"""
-__all__ = ["config", "detect_console_encoding", "get_option", "set_option",
- "reset_option", "describe_option", "option_context", "options"]
+__all__ = [
+ "config",
+ "detect_console_encoding",
+ "get_option",
+ "set_option",
+ "reset_option",
+ "describe_option",
+ "option_context",
+ "options",
+]
from pandas._config import config
from pandas._config import dates # noqa:F401
from pandas._config.config import (
- describe_option, get_option, option_context, options, reset_option,
- set_option)
+ describe_option,
+ get_option,
+ option_context,
+ options,
+ reset_option,
+ set_option,
+)
from pandas._config.display import detect_console_encoding
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 6b685a0ce962a..890db5b41907e 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -54,9 +54,8 @@
from typing import Dict, List
import warnings
-DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver')
-RegisteredOption = namedtuple('RegisteredOption',
- 'key defval doc validator cb')
+DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver")
+RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb")
# holds deprecated option metdata
_deprecated_options = {} # type: Dict[str, DeprecatedOption]
@@ -68,7 +67,7 @@
_global_config = {} # type: Dict[str, str]
# keys which have a special meaning
-_reserved_keys = ['all'] # type: List[str]
+_reserved_keys = ["all"] # type: List[str]
class OptionError(AttributeError, KeyError):
@@ -76,6 +75,7 @@ class OptionError(AttributeError, KeyError):
checks
"""
+
#
# User API
@@ -85,9 +85,9 @@ def _get_single_key(pat, silent):
if len(keys) == 0:
if not silent:
_warn_if_deprecated(pat)
- raise OptionError('No such keys(s): {pat!r}'.format(pat=pat))
+ raise OptionError("No such keys(s): {pat!r}".format(pat=pat))
if len(keys) > 1:
- raise OptionError('Pattern matched multiple keys')
+ raise OptionError("Pattern matched multiple keys")
key = keys[0]
if not silent:
@@ -110,11 +110,10 @@ def _set_option(*args, **kwargs):
# must at least 1 arg deal with constraints later
nargs = len(args)
if not nargs or nargs % 2 != 0:
- raise ValueError("Must provide an even number of non-keyword "
- "arguments")
+ raise ValueError("Must provide an even number of non-keyword arguments")
# default to false
- silent = kwargs.pop('silent', False)
+ silent = kwargs.pop("silent", False)
if kwargs:
msg = '_set_option() got an unexpected keyword argument "{kwarg}"'
@@ -139,13 +138,13 @@ def _set_option(*args, **kwargs):
o.cb(key)
-def _describe_option(pat='', _print_desc=True):
+def _describe_option(pat="", _print_desc=True):
keys = _select_options(pat)
if len(keys) == 0:
- raise OptionError('No such keys(s)')
+ raise OptionError("No such keys(s)")
- s = ''
+ s = ""
for k in keys: # filter by pat
s += _build_option_description(k)
@@ -160,13 +159,15 @@ def _reset_option(pat, silent=False):
keys = _select_options(pat)
if len(keys) == 0:
- raise OptionError('No such keys(s)')
+ raise OptionError("No such keys(s)")
- if len(keys) > 1 and len(pat) < 4 and pat != 'all':
- raise ValueError('You must specify at least 4 characters when '
- 'resetting multiple keys, use the special keyword '
- '"all" to reset all the options to their default '
- 'value')
+ if len(keys) > 1 and len(pat) < 4 and pat != "all":
+ raise ValueError(
+ "You must specify at least 4 characters when "
+ "resetting multiple keys, use the special keyword "
+ '"all" to reset all the options to their default '
+ "value"
+ )
for k in keys:
_set_option(k, _registered_options[k].defval, silent=silent)
@@ -213,6 +214,7 @@ def __getattr__(self, key):
def __dir__(self):
return list(self.d.keys())
+
# For user convenience, we'd like to have the available options described
# in the docstring. For dev convenience we'd like to generate the docstrings
# dynamically instead of maintaining them by hand. To this, we use the
@@ -223,7 +225,6 @@ def __dir__(self):
class CallableDynamicDoc:
-
def __init__(self, func, doc_tmpl):
self.__doc_tmpl__ = doc_tmpl
self.__func__ = func
@@ -233,10 +234,9 @@ def __call__(self, *args, **kwds):
@property
def __doc__(self):
- opts_desc = _describe_option('all', _print_desc=False)
+ opts_desc = _describe_option("all", _print_desc=False)
opts_list = pp_options_list(list(_registered_options.keys()))
- return self.__doc_tmpl__.format(opts_desc=opts_desc,
- opts_list=opts_list)
+ return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list)
_get_option_tmpl = """
@@ -394,14 +394,14 @@ class option_context:
def __init__(self, *args):
if not (len(args) % 2 == 0 and len(args) >= 2):
- raise ValueError('Need to invoke as'
- ' option_context(pat, val, [(pat, val), ...]).')
+ raise ValueError(
+ "Need to invoke as option_context(pat, val, [(pat, val), ...])."
+ )
self.ops = list(zip(args[::2], args[1::2]))
def __enter__(self):
- self.undo = [(pat, _get_option(pat, silent=True))
- for pat, val in self.ops]
+ self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops]
for pat, val in self.ops:
_set_option(pat, val, silent=True)
@@ -412,7 +412,7 @@ def __exit__(self, *args):
_set_option(pat, val, silent=True)
-def register_option(key, defval, doc='', validator=None, cb=None):
+def register_option(key, defval, doc="", validator=None, cb=None):
"""Register an option in the package-wide pandas config object
Parameters
@@ -437,6 +437,7 @@ def register_option(key, defval, doc='', validator=None, cb=None):
"""
import tokenize
import keyword
+
key = key.lower()
if key in _registered_options:
@@ -451,10 +452,10 @@ def register_option(key, defval, doc='', validator=None, cb=None):
validator(defval)
# walk the nested dict, creating dicts as needed along the path
- path = key.split('.')
+ path = key.split(".")
for k in path:
- if not bool(re.match('^' + tokenize.Name + '$', k)):
+ if not bool(re.match("^" + tokenize.Name + "$", k)):
raise ValueError("{k} is not a valid identifier".format(k=k))
if keyword.iskeyword(k):
raise ValueError("{k} is a python keyword".format(k=k))
@@ -463,20 +464,20 @@ def register_option(key, defval, doc='', validator=None, cb=None):
msg = "Path prefix to option '{option}' is already an option"
for i, p in enumerate(path[:-1]):
if not isinstance(cursor, dict):
- raise OptionError(msg.format(option='.'.join(path[:i])))
+ raise OptionError(msg.format(option=".".join(path[:i])))
if p not in cursor:
cursor[p] = {}
cursor = cursor[p]
if not isinstance(cursor, dict):
- raise OptionError(msg.format(option='.'.join(path[:-1])))
+ raise OptionError(msg.format(option=".".join(path[:-1])))
cursor[path[-1]] = defval # initialize
# save the option metadata
- _registered_options[key] = RegisteredOption(key=key, defval=defval,
- doc=doc, validator=validator,
- cb=cb)
+ _registered_options[key] = RegisteredOption(
+ key=key, defval=defval, doc=doc, validator=validator, cb=cb
+ )
def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
@@ -526,6 +527,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
_deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
+
#
# functions internal to the module
@@ -542,14 +544,14 @@ def _select_options(pat):
# else look through all of them
keys = sorted(_registered_options.keys())
- if pat == 'all': # reserved key
+ if pat == "all": # reserved key
return keys
return [k for k in keys if re.search(pat, k, re.I)]
def _get_root(key):
- path = key.split('.')
+ path = key.split(".")
cursor = _global_config
for p in path[:-1]:
cursor = cursor[p]
@@ -621,12 +623,11 @@ def _warn_if_deprecated(key):
else:
msg = "'{key}' is deprecated".format(key=key)
if d.removal_ver:
- msg += (' and will be removed in {version}'
- .format(version=d.removal_ver))
+ msg += " and will be removed in {version}".format(version=d.removal_ver)
if d.rkey:
msg += ", please use '{rkey}' instead.".format(rkey=d.rkey)
else:
- msg += ', please refrain from using it.'
+ msg += ", please refrain from using it."
warnings.warn(msg, FutureWarning)
return True
@@ -639,22 +640,22 @@ def _build_option_description(k):
o = _get_registered_option(k)
d = _get_deprecated_option(k)
- s = '{k} '.format(k=k)
+ s = "{k} ".format(k=k)
if o.doc:
- s += '\n'.join(o.doc.strip().split('\n'))
+ s += "\n".join(o.doc.strip().split("\n"))
else:
- s += 'No description available.'
+ s += "No description available."
if o:
- s += ('\n [default: {default}] [currently: {current}]'
- .format(default=o.defval, current=_get_option(k, True)))
+ s += "\n [default: {default}] [currently: {current}]".format(
+ default=o.defval, current=_get_option(k, True)
+ )
if d:
- s += '\n (Deprecated'
- s += (', use `{rkey}` instead.'
- .format(rkey=d.rkey if d.rkey else ''))
- s += ')'
+ s += "\n (Deprecated"
+ s += ", use `{rkey}` instead.".format(rkey=d.rkey if d.rkey else "")
+ s += ")"
return s
@@ -666,28 +667,34 @@ def pp_options_list(keys, width=80, _print=False):
from itertools import groupby
def pp(name, ks):
- pfx = ('- ' + name + '.[' if name else '')
- ls = wrap(', '.join(ks), width, initial_indent=pfx,
- subsequent_indent=' ', break_long_words=False)
+ pfx = "- " + name + ".[" if name else ""
+ ls = wrap(
+ ", ".join(ks),
+ width,
+ initial_indent=pfx,
+ subsequent_indent=" ",
+ break_long_words=False,
+ )
if ls and ls[-1] and name:
- ls[-1] = ls[-1] + ']'
+ ls[-1] = ls[-1] + "]"
return ls
ls = []
- singles = [x for x in sorted(keys) if x.find('.') < 0]
+ singles = [x for x in sorted(keys) if x.find(".") < 0]
if singles:
- ls += pp('', singles)
- keys = [x for x in keys if x.find('.') >= 0]
+ ls += pp("", singles)
+ keys = [x for x in keys if x.find(".") >= 0]
- for k, g in groupby(sorted(keys), lambda x: x[:x.rfind('.')]):
- ks = [x[len(k) + 1:] for x in list(g)]
+ for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]):
+ ks = [x[len(k) + 1 :] for x in list(g)]
ls += pp(k, ks)
- s = '\n'.join(ls)
+ s = "\n".join(ls)
if _print:
print(s)
else:
return s
+
#
# helpers
@@ -724,7 +731,7 @@ def config_prefix(prefix):
def wrap(func):
def inner(key, *args, **kwds):
- pkey = '{prefix}.{key}'.format(prefix=prefix, key=key)
+ pkey = "{prefix}.{key}".format(prefix=prefix, key=key)
return func(pkey, *args, **kwds)
return inner
@@ -740,6 +747,7 @@ def inner(key, *args, **kwds):
get_option = _get_option
register_option = _register_option
+
# These factories and methods are handy for use as the validator
# arg in register_option
@@ -779,6 +787,7 @@ def is_instance_factory(_type):
ValueError if x is not an instance of `_type`
"""
+
if isinstance(_type, (tuple, list)):
_type = tuple(_type)
type_repr = "|".join(map(str, _type))
@@ -812,6 +821,32 @@ def inner(x):
return inner
+def is_nonnegative_int(value):
+ """
+ Verify that value is None or a nonnegative int.
+
+ Parameters
+ ----------
+ value : None or int
+ The `value` to be checked.
+
+ Raises
+ ------
+ ValueError
+ When the value is neither None nor a nonnegative integer
+ """
+
+ if value is None:
+ return
+
+ elif isinstance(value, int):
+ if value >= 0:
+ return
+
+ msg = "Value must be a nonnegative integer or None"
+ raise ValueError(msg)
+
+
# common type validators, for convenience
# usage: register_option(... , validator = is_int)
is_int = is_type_factory(int)
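The new ``is_nonnegative_int`` validator is meant to be passed as the ``validator`` argument of ``register_option``, like ``is_int`` and friends above. A sketch of how it could be wired up; the option name ``display.example_rows`` is illustrative only and not registered by this patch:

.. code-block:: python

    import pandas._config.config as cf

    with cf.config_prefix("display"):
        cf.register_option(
            "example_rows",  # registered as "display.example_rows"
            60,
            "Illustrative option accepting None or a nonnegative integer.",
            validator=cf.is_nonnegative_int,
        )

    cf.set_option("display.example_rows", 0)     # accepted: nonnegative
    cf.set_option("display.example_rows", None)  # accepted: None
    # cf.set_option("display.example_rows", -1)  # raises ValueError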
diff --git a/pandas/_config/dates.py b/pandas/_config/dates.py
index 85300a308de62..5bf2b49ce5904 100644
--- a/pandas/_config/dates.py
+++ b/pandas/_config/dates.py
@@ -13,9 +13,11 @@
When True, prints and parses dates with the year first, eg 2005/01/20
"""
-with cf.config_prefix('display'):
+with cf.config_prefix("display"):
# Needed upstream of `_libs` because these are used in tslibs.parsing
- cf.register_option('date_dayfirst', False, pc_date_dayfirst_doc,
- validator=cf.is_bool)
- cf.register_option('date_yearfirst', False, pc_date_yearfirst_doc,
- validator=cf.is_bool)
+ cf.register_option(
+ "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool
+ )
+ cf.register_option(
+ "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool
+ )
diff --git a/pandas/_config/display.py b/pandas/_config/display.py
index 7997d12e06aa9..6e5fabe2706e5 100644
--- a/pandas/_config/display.py
+++ b/pandas/_config/display.py
@@ -25,14 +25,14 @@ def detect_console_encoding():
pass
# try again for something better
- if not encoding or 'ascii' in encoding.lower():
+ if not encoding or "ascii" in encoding.lower():
try:
encoding = locale.getpreferredencoding()
except Exception:
pass
# when all else fails. this will usually be "ascii"
- if not encoding or 'ascii' in encoding.lower():
+ if not encoding or "ascii" in encoding.lower():
encoding = sys.getdefaultencoding()
# GH#3360, save the reported defencoding at import time
@@ -50,6 +50,7 @@ def detect_console_encoding():
these are generally strings meant to be displayed on the console.
"""
-with cf.config_prefix('display'):
- cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc,
- validator=cf.is_text)
+with cf.config_prefix("display"):
+ cf.register_option(
+ "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text
+ )
diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py
index 1ca6d073f18c4..46802c6460959 100644
--- a/pandas/_config/localization.py
+++ b/pandas/_config/localization.py
@@ -37,7 +37,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL):
locale.setlocale(lc_var, new_locale)
normalized_locale = locale.getlocale()
if all(x is not None for x in normalized_locale):
- yield '.'.join(normalized_locale)
+ yield ".".join(normalized_locale)
else:
yield new_locale
finally:
@@ -99,15 +99,16 @@ def _valid_locales(locales, normalize):
def _default_locale_getter():
try:
- raw_locales = subprocess.check_output(['locale -a'], shell=True)
+ raw_locales = subprocess.check_output(["locale -a"], shell=True)
except subprocess.CalledProcessError as e:
- raise type(e)("{exception}, the 'locale -a' command cannot be found "
- "on your system".format(exception=e))
+ raise type(e)(
+ "{exception}, the 'locale -a' command cannot be found "
+ "on your system".format(exception=e)
+ )
return raw_locales
-def get_locales(prefix=None, normalize=True,
- locale_getter=_default_locale_getter):
+def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter):
"""
Get all the locales that are available on the system.
@@ -145,11 +146,10 @@ def get_locales(prefix=None, normalize=True,
# raw_locales is "\n" separated list of locales
# it may contain non-decodable parts, so split
# extract what we can and then rejoin.
- raw_locales = raw_locales.split(b'\n')
+ raw_locales = raw_locales.split(b"\n")
out_locales = []
for x in raw_locales:
- out_locales.append(str(
- x, encoding=options.display.encoding))
+ out_locales.append(str(x, encoding=options.display.encoding))
except TypeError:
pass
@@ -157,6 +157,6 @@ def get_locales(prefix=None, normalize=True,
if prefix is None:
return _valid_locales(out_locales, normalize)
- pattern = re.compile('{prefix}.*'.format(prefix=prefix))
- found = pattern.findall('\n'.join(out_locales))
+ pattern = re.compile("{prefix}.*".format(prefix=prefix))
+ found = pattern.findall("\n".join(out_locales))
return _valid_locales(found, normalize)
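``get_locales`` and ``set_locale`` are small helpers around the system locale list; which locales exist is platform dependent, so the values in the comments below are illustrative only:

.. code-block:: python

    from pandas._config.localization import get_locales, set_locale

    en_locales = get_locales(prefix="en")  # e.g. ["en_US.utf8", "en_GB.utf8", ...]

    if en_locales:
        # Temporarily switch to the first matching locale; the previous locale
        # is restored when the block exits. May raise if the locale is unusable.
        with set_locale(en_locales[0]):
            pass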
diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
index 1f6042389416e..af67cb3be7102 100644
--- a/pandas/_libs/__init__.py
+++ b/pandas/_libs/__init__.py
@@ -1,4 +1,11 @@
# flake8: noqa
from .tslibs import (
- iNaT, NaT, NaTType, Timestamp, Timedelta, OutOfBoundsDatetime, Period)
+ NaT,
+ NaTType,
+ OutOfBoundsDatetime,
+ Period,
+ Timedelta,
+ Timestamp,
+ iNaT,
+)
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 0dbe525f7506e..038447ad252fe 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -674,31 +674,6 @@ def backfill_2d_inplace(algos_t[:, :] values,
val = values[j, i]
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap(algos_t[:] index, object func):
- cdef:
- Py_ssize_t length = index.shape[0]
- Py_ssize_t i = 0
- ndarray[object] result = np.empty(length, dtype=np.object_)
-
- from pandas._libs.lib import maybe_convert_objects
-
- for i in range(length):
- result[i] = func(index[i])
-
- return maybe_convert_objects(result)
-
-
-arrmap_float64 = arrmap["float64_t"]
-arrmap_float32 = arrmap["float32_t"]
-arrmap_object = arrmap["object"]
-arrmap_int64 = arrmap["int64_t"]
-arrmap_int32 = arrmap["int32_t"]
-arrmap_uint64 = arrmap["uint64_t"]
-arrmap_bool = arrmap["uint8_t"]
-
-
@cython.boundscheck(False)
@cython.wraparound(False)
def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
index 2fea8b17fd9d7..3a3adc71875ed 100644
--- a/pandas/_libs/algos_take_helper.pxi.in
+++ b/pandas/_libs/algos_take_helper.pxi.in
@@ -148,7 +148,7 @@ def get_dispatch(dtypes):
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values,
- int64_t[:] indexer,
+ const int64_t[:] indexer,
{{c_type_out}}[:] out,
fill_value=np.nan):
@@ -159,7 +159,7 @@ cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values,
@cython.wraparound(False)
@cython.boundscheck(False)
def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
- int64_t[:] indexer,
+ const int64_t[:] indexer,
{{c_type_out}}[:] out,
fill_value=np.nan):
@@ -178,7 +178,7 @@ def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values,
- int64_t[:] indexer,
+ const int64_t[:] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
{{inner_take_2d_axis0}}
@@ -205,7 +205,7 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values,
- int64_t[:] indexer,
+ const int64_t[:] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
{{inner_take_2d_axis1}}
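The ``const int64_t[:]`` annotations above let these Cython routines accept read-only buffers as well as writable ones. A hedged illustration of the user-visible effect (compare the read-only ``Series.dt`` fix noted in the 1.0.0 whatsnew), assuming a NumPy array explicitly marked non-writable:

.. code-block:: python

    import numpy as np
    import pandas as pd

    arr = np.array(["2019-01-01", "2019-01-02"], dtype="datetime64[ns]")
    arr.setflags(write=False)  # simulate read-only data, e.g. a memory-mapped buffer

    s = pd.Series(arr)
    print(s.dt.year)           # works even though the underlying buffer is not writable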
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 8f662b57615f3..3069bbbf34bb7 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -260,7 +260,7 @@ def group_shift_indexer(int64_t[:] out, const int64_t[:] labels,
int ngroups, int periods):
cdef:
Py_ssize_t N, i, j, ii
- int offset, sign
+ int offset = 0, sign
int64_t lab, idxer, idxer_slot
int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
int64_t[:, :] label_indexer
@@ -719,6 +719,11 @@ def group_quantile(ndarray[float64_t] out,
ndarray[int64_t] counts, non_na_counts, sort_arr
assert values.shape[0] == N
+
+ if not (0 <= q <= 1):
+ raise ValueError("'q' must be between 0 and 1. Got"
+ " '{}' instead".format(q))
+
inter_methods = {
'linear': INTERPOLATION_LINEAR,
'lower': INTERPOLATION_LOWER,
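The bounds check added above surfaces as a ``ValueError`` from the user-facing groupby ``quantile``. A small sketch; the frame and the exact error text are illustrative:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, 3.0]})

    df.groupby("key")["val"].quantile(0.5)    # q inside [0, 1]: fine
    # df.groupby("key")["val"].quantile(1.5)  # raises ValueError: 'q' must be between 0 and 1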
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 8e351244b7f43..000689f634545 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -76,7 +76,11 @@ def group_last_{{name}}({{c_type}}[:, :] out,
val = values[i, j]
# not nan
- if val == val and val != {{nan_val}}:
+ if (
+ {{if not name.startswith("int")}}
+ val == val and
+ {{endif}}
+ val != {{nan_val}}):
nobs[lab, j] += 1
resx[lab, j] = val
@@ -133,7 +137,11 @@ def group_nth_{{name}}({{c_type}}[:, :] out,
val = values[i, j]
# not nan
- if val == val and val != {{nan_val}}:
+ if (
+ {{if not name.startswith("int")}}
+ val == val and
+ {{endif}}
+ val != {{nan_val}}):
nobs[lab, j] += 1
if nobs[lab, j] == rank:
resx[lab, j] = val
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
index 609420f429798..51ec4ba43159c 100644
--- a/pandas/_libs/hashtable.pxd
+++ b/pandas/_libs/hashtable.pxd
@@ -41,7 +41,7 @@ cdef class StringHashTable(HashTable):
cdef struct Int64VectorData:
int64_t *data
- size_t n, m
+ Py_ssize_t n, m
cdef class Int64Vector:
cdef Int64VectorData *data
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 544fb3d8a15c0..b8df78e600a46 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -44,7 +44,7 @@ cdef int64_t NPY_NAT = util.get_nat()
_SIZE_HINT_LIMIT = (1 << 20) + 7
-cdef size_t _INIT_VEC_CAP = 128
+cdef Py_ssize_t _INIT_VEC_CAP = 128
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"
@@ -108,7 +108,7 @@ cdef class Int64Factorizer:
def get_count(self):
return self.count
- def factorize(self, int64_t[:] values, sort=False,
+ def factorize(self, const int64_t[:] values, sort=False,
na_sentinel=-1, na_value=None):
"""
Factorize values with nans replaced by na_sentinel
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 8c2c560c062ac..17f1d011af01b 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -424,8 +424,12 @@ cdef class {{name}}HashTable(HashTable):
for i in range(n):
val = values[i]
- if ignore_na and (val != val
- or (use_na_value and val == na_value2)):
+ if ignore_na and (
+ {{if not name.lower().startswith(("uint", "int"))}}
+ val != val or
+ {{endif}}
+ (use_na_value and val == na_value2)
+ ):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them,
# and replace the corresponding label with na_sentinel
@@ -659,7 +663,7 @@ cdef class StringHashTable(HashTable):
int64_t[:] locs = np.empty(n, dtype=np.int64)
# these by-definition *must* be strings
- vecs = malloc(n * sizeof(char *))
+ vecs =