
MNT: Bump dev pin on NumPy #60987

Merged, 16 commits, Apr 3, 2025
16 changes: 16 additions & 0 deletions asv_bench/benchmarks/indexing_engines.py
@@ -67,6 +67,14 @@ class NumericEngineIndexing:
     def setup(self, engine_and_dtype, index_type, unique, N):
         engine, dtype = engine_and_dtype
 
+        if (
+            index_type == "non_monotonic"
+            and dtype in [np.int16, np.int8, np.uint8]
+            and unique
+        ):
+            # Values overflow
+            raise NotImplementedError
+
         if index_type == "monotonic_incr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
@@ -115,6 +123,14 @@ def setup(self, engine_and_dtype, index_type, unique, N):
         engine, dtype = engine_and_dtype
         dtype = dtype.lower()
 
+        if (
+            index_type == "non_monotonic"
+            and dtype in ["int16", "int8", "uint8"]
+            and unique
+        ):
+            # Values overflow
+            raise NotImplementedError
+
         if index_type == "monotonic_incr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
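Both guards above exist because the benchmark's value range does not fit in the small integer dtypes. A minimal sketch of the failure mode, with a hypothetical N (the real value is an asv parameter): casting N * 3 distinct values to int8 wraps, so the "unique" setup requirement can no longer hold.

import numpy as np

N = 200_000  # illustrative only; asv supplies the real parameter
values = np.arange(N * 3).astype(np.int8)  # silently wraps modulo 256
print(np.unique(values).size)  # 256, far fewer than N * 3 distinct values
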
4 changes: 2 additions & 2 deletions doc/source/getting_started/comparison/comparison_with_r.rst
@@ -383,7 +383,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension.
 
 .. ipython:: python
 
-   a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4)
+   a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
    pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)])
 
 meltlist
@@ -402,7 +402,7 @@ In Python, this list would be a list of tuples, so
 
 .. ipython:: python
 
-   a = list(enumerate(list(range(1, 5)) + [np.NAN]))
+   a = list(enumerate(list(range(1, 5)) + [np.nan]))
    pd.DataFrame(a)
 
 For more details and examples see :ref:`the Intro to Data Structures
4 changes: 2 additions & 2 deletions doc/source/user_guide/basics.rst
@@ -2064,12 +2064,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi
 
 .. ipython:: python
 
-   df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32")
+   df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float64")
    df1
    df1.dtypes
    df2 = pd.DataFrame(
        {
-           "A": pd.Series(np.random.randn(8), dtype="float16"),
+           "A": pd.Series(np.random.randn(8), dtype="float32"),
            "B": pd.Series(np.random.randn(8)),
            "C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"),  # [0, 255] (range of uint8)
        }
2 changes: 2 additions & 0 deletions doc/source/user_guide/enhancingperf.rst
@@ -171,6 +171,7 @@ can be improved by passing an ``np.ndarray``.
    In [4]: %%cython
       ...: cimport numpy as np
       ...: import numpy as np
+      ...: np.import_array()
       ...: cdef double f_typed(double x) except? -2:
       ...:     return x * (x - 1)
       ...: cpdef double integrate_f_typed(double a, double b, int N):
@@ -225,6 +226,7 @@ and ``wraparound`` checks can yield more performance.
       ...: cimport cython
       ...: cimport numpy as np
       ...: import numpy as np
+      ...: np.import_array()
       ...: cdef np.float64_t f_typed(np.float64_t x) except? -2:
       ...:     return x * (x - 1)
       ...: cpdef np.float64_t integrate_f_typed(np.float64_t a, np.float64_t b, np.int64_t N):
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.11.0.rst
@@ -74,10 +74,10 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe
 
 .. ipython:: python
 
-   df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')
+   df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float64')
    df1
    df1.dtypes
-   df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
+   df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float32'),
                        'B': pd.Series(np.random.randn(8)),
                        'C': pd.Series(range(8), dtype='uint8')})
    df2
2 changes: 1 addition & 1 deletion environment.yml
@@ -23,7 +23,7 @@ dependencies:
 
   # required dependencies
   - python-dateutil
-  - numpy<2
+  - numpy<3
 
   # optional dependencies
   - beautifulsoup4>=4.11.2
4 changes: 2 additions & 2 deletions pandas/compat/numpy/__init__.py
@@ -36,8 +36,8 @@
             r".*In the future `np\.long` will be defined as.*",
             FutureWarning,
         )
-        np_long = np.long  # type: ignore[attr-defined]
-        np_ulong = np.ulong  # type: ignore[attr-defined]
+        np_long = np.long
+        np_ulong = np.ulong
     except AttributeError:
         np_long = np.int_
         np_ulong = np.uint
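The type-ignore comments can be dropped because NumPy 2.0 reintroduced np.long and np.ulong (as the C long aliases), so the attributes exist in the NumPy 2 type stubs. A simplified sketch of the compat pattern this block implements; the real module also suppresses the FutureWarning that pre-2.0 dev builds emitted:

import numpy as np

try:
    np_long = np.long  # present again on NumPy >= 2.0
    np_ulong = np.ulong
except AttributeError:  # removed from the top level in NumPy 1.24
    np_long = np.int_
    np_ulong = np.uint

print(np_long, np_ulong)
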
2 changes: 1 addition & 1 deletion pandas/core/accessor.py
@@ -351,7 +351,7 @@ def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]:
     AttributeError: The series must contain integer data only.
     >>> df = pd.Series([1, 2, 3])
     >>> df.int_accessor.sum()
-    6"""
+    np.int64(6)"""


@doc(_register_accessor, klass="Series", examples=_register_series_examples)
18 changes: 9 additions & 9 deletions pandas/core/arrays/base.py
@@ -941,7 +941,7 @@ def argmin(self, skipna: bool = True) -> int:
         --------
         >>> arr = pd.array([3, 1, 2, 5, 4])
         >>> arr.argmin()
-        1
+        np.int64(1)
         """
         # Implementer note: You have two places to override the behavior of
         # argmin.
@@ -975,7 +975,7 @@ def argmax(self, skipna: bool = True) -> int:
         --------
         >>> arr = pd.array([3, 1, 2, 5, 4])
         >>> arr.argmax()
-        3
+        np.int64(3)
         """
         # Implementer note: You have two places to override the behavior of
         # argmax.
@@ -1959,10 +1959,10 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
         --------
         >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray):
         ...     def _formatter(self, boxed=False):
-        ...         return lambda x: "*" + str(x) + "*" if boxed else repr(x) + "*"
+        ...         return lambda x: "*" + str(x) + "*"
         >>> MyExtensionArray(np.array([1, 2, 3, 4]))
         <MyExtensionArray>
-        [1*, 2*, 3*, 4*]
+        [*1*, *2*, *3*, *4*]
         Length: 4, dtype: int64
         """
         if boxed:
@@ -2176,15 +2176,15 @@ def _reduce(
         Examples
         --------
         >>> pd.array([1, 2, 3])._reduce("min")
-        1
+        np.int64(1)
         >>> pd.array([1, 2, 3])._reduce("max")
-        3
+        np.int64(3)
         >>> pd.array([1, 2, 3])._reduce("sum")
-        6
+        np.int64(6)
         >>> pd.array([1, 2, 3])._reduce("mean")
-        2.0
+        np.float64(2.0)
         >>> pd.array([1, 2, 3])._reduce("median")
-        2.0
+        np.float64(2.0)
         """
         meth = getattr(self, name, None)
         if meth is None:
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimelike.py
@@ -275,7 +275,7 @@ def _unbox_scalar(
         --------
         >>> arr = pd.array(np.array(["1970-01-01"], "datetime64[ns]"))
         >>> arr._unbox_scalar(arr[0])
-        numpy.datetime64('1970-01-01T00:00:00.000000000')
+        np.datetime64('1970-01-01T00:00:00.000000000')
         """
         raise AbstractMethodError(self)
 
3 changes: 2 additions & 1 deletion pandas/core/arrays/interval.py
@@ -1775,7 +1775,8 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray:
         [(0, 1], (1, 2]]
         Length: 2, dtype: interval[int64, right]
         >>> idx.to_tuples()
-        array([(0, 1), (1, 2)], dtype=object)
+        array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))],
+              dtype=object)
Member: In https://github.com/pandas-dev/pandas/pull/60987/files#r2016346826 I commented because the dtype of the array was the same as the dtype of the scalars.

Thinking some more, we currently have...

pd.Series([123, "123"])
# 0    123
# 1    123
# dtype: object

so for consistency, even object arrays should probably not show the NEP 51 repr?

Member Author: I don't think that's an option here; this is what happens when you call str on a tuple of NumPy scalars.

print(str((np.int64(0), np.int64(1))))
# (np.int64(0), np.int64(1))

arr = pd.array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))], dtype=object)
print(type(arr[0]), str(arr[0]))
# <class 'tuple'> (np.int64(0), np.int64(1))

Member: Seems reasonable; however, extending the example I gave for the strings in an object array...

s = pd.Series([123, "123", ("123", "123")])
print(s)
print(s[2])
# 0           123
# 1           123
# 2    (123, 123)
# dtype: object
# ('123', '123')

However, we don't currently display the repr for strings in a collection either when the array is displayed?

Member: My bad, the NumPy array of values does show the repr for the strings in a tuple.

s.values
# array([123, '123', ('123', '123')], dtype=object)
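
As background for this thread: the repr changes throughout the PR come from NEP 51, under which NumPy >= 2.0 scalar reprs spell out the type while str() is unchanged, and a tuple's str renders its elements with repr. A small sketch, assuming NumPy >= 2.0:

import numpy as np

print(repr(np.int64(6)))                # np.int64(6)
print(str(np.int64(6)))                 # 6, str() still gives the bare value
print(str((np.int64(0), np.int64(1))))  # (np.int64(0), np.int64(1))

# The legacy print mode restores the old scalar repr, e.g. for doctests:
np.set_printoptions(legacy="1.25")
print(repr(np.int64(6)))                # 6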


For :class:`pandas.IntervalIndex`:

32 changes: 16 additions & 16 deletions pandas/core/arrays/masked.py
@@ -1378,25 +1378,25 @@ def any(
         skips NAs):
 
         >>> pd.array([True, False, True]).any()
-        True
+        np.True_
         >>> pd.array([True, False, pd.NA]).any()
-        True
+        np.True_
         >>> pd.array([False, False, pd.NA]).any()
-        False
+        np.False_
         >>> pd.array([], dtype="boolean").any()
-        False
+        np.False_
        >>> pd.array([pd.NA], dtype="boolean").any()
-        False
+        np.False_
        >>> pd.array([pd.NA], dtype="Float64").any()
-        False
+        np.False_
 
         With ``skipna=False``, the result can be NA if this is logically
         required (whether ``pd.NA`` is True or False influences the result):
 
         >>> pd.array([True, False, pd.NA]).any(skipna=False)
-        True
+        np.True_
         >>> pd.array([1, 0, pd.NA]).any(skipna=False)
-        True
+        np.True_
         >>> pd.array([False, False, pd.NA]).any(skipna=False)
         <NA>
         >>> pd.array([0, 0, pd.NA]).any(skipna=False)
@@ -1466,17 +1466,17 @@ def all(
         skips NAs):
 
         >>> pd.array([True, True, pd.NA]).all()
-        True
+        np.True_
         >>> pd.array([1, 1, pd.NA]).all()
-        True
+        np.True_
         >>> pd.array([True, False, pd.NA]).all()
-        False
+        np.False_
         >>> pd.array([], dtype="boolean").all()
-        True
+        np.True_
         >>> pd.array([pd.NA], dtype="boolean").all()
-        True
+        np.True_
         >>> pd.array([pd.NA], dtype="Float64").all()
-        True
+        np.True_
 
         With ``skipna=False``, the result can be NA if this is logically
         required (whether ``pd.NA`` is True or False influences the result):
@@ ... @@
         >>> pd.array([1, 1, pd.NA]).all(skipna=False)
         <NA>
         >>> pd.array([True, False, pd.NA]).all(skipna=False)
-        False
+        np.False_
         >>> pd.array([1, 0, pd.NA]).all(skipna=False)
-        False
+        np.False_
         """
         nv.validate_all((), kwargs)
 
4 changes: 2 additions & 2 deletions pandas/core/arrays/sparse/accessor.py
@@ -297,7 +297,7 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
     --------
     >>> df = pd.DataFrame({"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
     >>> df.sparse.density
-    0.5
+    np.float64(0.5)
     """
 
     def _validate(self, data) -> None:
@@ -459,7 +459,7 @@ def density(self) -> float:
         --------
         >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
         >>> df.sparse.density
-        0.5
+        np.float64(0.5)
         """
         tmp = np.mean([column.array.density for _, column in self._parent.items()])
         return tmp
10 changes: 5 additions & 5 deletions pandas/core/base.py
@@ -804,9 +804,9 @@ def argmax(
         dtype: float64
 
         >>> s.argmax()
-        2
+        np.int64(2)
         >>> s.argmin()
-        0
Comment on lines 806 to 809
Member Author: There are a few of these where I'm wondering if we should be returning Python scalars instead of NumPy. Should issues be opened for these?

cc @pandas-dev/pandas-core

Member: I think generally we always want to return Python scalars (IIRC we got a lot of issues about this in iteration and iteration-like APIs in the past).

Member Author: Even just wrapping the result of Series._reduce with maybe_box_native breaks 692 tests. From a cursory look, they're tests that expect a NumPy scalar back. A lot, however, are something like op(data).any().any() so that they work with both DataFrame and Series. I plan to bring this up in the next dev meeting.

Member: I agree we should always return Python scalars. I'm surprised at the number of failures that expect NumPy scalars.

Contributor: I'd think you need a deprecation on this, because people may have code that depends on the result being a NumPy scalar. I think the tests we have in pandas-stubs for typing may depend on this.

Member Author: We could put it behind a future option, maybe something like always_return_python_scalars.
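
For context, a minimal sketch (not part of the PR) of the unboxing being discussed: every NumPy scalar exposes .item(), which returns the equivalent Python scalar, so a reduction result could be wrapped as follows.

import numpy as np

result = np.int64(2)  # what a reduction currently returns
unboxed = result.item() if isinstance(result, np.generic) else result
print(type(unboxed), unboxed)  # <class 'int'> 2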

+        np.int64(0)
 
         The maximum cereal calories is the third element and
         the minimum cereal calories is the first element,
@@ -1360,7 +1360,7 @@ def factorize(
         dtype: int64
 
         >>> ser.searchsorted(4)
-        3
+        np.int64(3)
 
         >>> ser.searchsorted([0, 4])
         array([0, 3])
@@ ... @@
         dtype: datetime64[s]
 
         >>> ser.searchsorted('3/14/2000')
-        3
+        np.int64(3)
 
         >>> ser = pd.Categorical(
         ...     ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
         ... )
         Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
 
         >>> ser.searchsorted('bread')
-        1
+        np.int64(1)
 
         >>> ser.searchsorted(['bread'], side='right')
         array([3])
3 changes: 2 additions & 1 deletion pandas/core/common.py
@@ -246,7 +246,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi
         with warnings.catch_warnings():
             # Can remove warning filter once NumPy 1.24 is min version
             if not np_version_gte1p24:
-                warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
+                # np.VisibleDeprecationWarning only in np.exceptions in 2.0
+                warnings.simplefilter("ignore", np.VisibleDeprecationWarning)  # type: ignore[attr-defined]
             result = np.asarray(values, dtype=dtype)
     except ValueError:
         # Using try/except since it's more performant than checking is_list_like
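The new comment records why this line needs a type-ignore: NumPy 2.0 exposes VisibleDeprecationWarning only under np.exceptions, so type checkers running against NumPy 2 stubs flag the top-level attribute, even though the guarded branch only runs on NumPy < 1.24. A version-agnostic lookup would be (an illustrative sketch, not the PR's approach):

import numpy as np

try:
    vdw = np.exceptions.VisibleDeprecationWarning  # NumPy >= 1.25
except AttributeError:
    vdw = np.VisibleDeprecationWarning  # older NumPy, top-level alias
print(vdw)
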
6 changes: 3 additions & 3 deletions pandas/core/dtypes/missing.py
@@ -428,9 +428,9 @@ def array_equivalent(
     Examples
     --------
     >>> array_equivalent(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]))
-    True
+    np.True_
     >>> array_equivalent(np.array([1, np.nan, 2]), np.array([1, 2, np.nan]))
-    False
+    np.False_
     """
     left, right = np.asarray(left), np.asarray(right)
 
@@ -626,7 +626,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
     >>> na_value_for_dtype(np.dtype("bool"))
     False
     >>> na_value_for_dtype(np.dtype("datetime64[ns]"))
-    numpy.datetime64('NaT')
+    np.datetime64('NaT')
     """
 
     if isinstance(dtype, ExtensionDtype):
8 changes: 4 additions & 4 deletions pandas/core/generic.py
@@ -887,7 +887,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
         dtype: int64
 
         >>> even_primes.squeeze()
-        2
+        np.int64(2)
 
         Squeezing objects with more than one value in every axis does nothing:
 
@@ -945,7 +945,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
         Squeezing all axes will project directly into a scalar:
 
         >>> df_0a.squeeze()
-        1
+        np.int64(1)
         """
         axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
         result = self.iloc[
@@ -7954,7 +7954,7 @@ def asof(self, where, subset=None):
         dtype: float64
 
         >>> s.asof(20)
-        2.0
+        np.float64(2.0)
 
         For a sequence `where`, a Series is returned. The first value is
         NaN, because the first element of `where` is before the first
@@ ... @@
         NaN, even though NaN is at the index location for ``30``.
 
         >>> s.asof(30)
-        2.0
+        np.float64(2.0)
 
         Take all columns into consideration
 