From 658c4d5c55279093accec07ceed1fbe20d361c1b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Feb 2025 16:22:18 -0500 Subject: [PATCH 01/13] MNT: Bump dev pin on NumPy --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 69647a436e3ad..194611346d8de 100644 --- a/environment.yml +++ b/environment.yml @@ -23,7 +23,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy<3 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/requirements-dev.txt b/requirements-dev.txt index fb4d9cdb589ca..8e370c37a9c9b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,7 @@ pytest-localserver PyQt5>=5.15.9 coverage python-dateutil -numpy<2 +numpy<3 beautifulsoup4>=4.11.2 blosc bottleneck>=1.3.6 From e424a9681024566dc85a1e99b469e10dc2ce46ad Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Mar 2025 15:05:43 -0500 Subject: [PATCH 02/13] Fix type-hints --- pandas/compat/numpy/__init__.py | 4 ++-- pandas/core/common.py | 3 ++- pandas/core/internals/managers.py | 7 ++++++- pandas/tests/extension/date/array.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 3306b36d71806..e95b44c879940 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -36,8 +36,8 @@ r".*In the future `np\.long` will be defined as.*", FutureWarning, ) - np_long = np.long # type: ignore[attr-defined] - np_ulong = np.ulong # type: ignore[attr-defined] + np_long = np.long + np_ulong = np.ulong except AttributeError: np_long = np.int_ np_ulong = np.uint diff --git a/pandas/core/common.py b/pandas/core/common.py index 100ad312bd839..75f8a56aac5db 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -246,7 +246,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi with 
warnings.catch_warnings(): # Can remove warning filter once NumPy 1.24 is min version if not np_version_gte1p24: - warnings.simplefilter("ignore", np.VisibleDeprecationWarning) + # np.VisibleDeprecationWarning only in np.exceptions in 2.0 + warnings.simplefilter("ignore", np.VisibleDeprecationWarning) # type: ignore[attr-defined] result = np.asarray(values, dtype=dtype) except ValueError: # Using try/except since it's more performant than checking is_list_like diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a3738bb25f56c..2e6701916a8d4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -572,7 +572,12 @@ def setitem(self, indexer, value) -> Self: 0, blk_loc, values ) # first block equals values - self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) + col_indexer: slice | np.ndarray + if isinstance(indexer[1], slice) and indexer[1] == slice(None): + col_indexer = slice(None) + else: + col_indexer = np.arange(len(blk_loc)) + self.blocks[0].setitem((indexer[0], col_indexer), value) return self # No need to split if we either set all columns or on a single block # manager diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py index 2306f5974ba18..0c51570189a7c 100644 --- a/pandas/tests/extension/date/array.py +++ b/pandas/tests/extension/date/array.py @@ -113,7 +113,7 @@ def __init__( # error: "object_" object is not iterable obj = np.char.split(dates, sep="-") - for (i,), (y, m, d) in np.ndenumerate(obj): # type: ignore[misc] + for (i,), (y, m, d) in np.ndenumerate(obj): self._year[i] = int(y) self._month[i] = int(m) self._day[i] = int(d) From e75abfecd1f0b739a2c66ffeb0df81443f82e096 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Mar 2025 15:07:48 -0500 Subject: [PATCH 03/13] Docs fixup --- doc/source/getting_started/comparison/comparison_with_r.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index d9d7d916b0238..cc7add87b5935 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -383,7 +383,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python - a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4) + a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4) pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) meltlist @@ -402,7 +402,7 @@ In Python, this list would be a list of tuples, so .. ipython:: python - a = list(enumerate(list(range(1, 5)) + [np.NAN])) + a = list(enumerate(list(range(1, 5)) + [np.nan])) pd.DataFrame(a) For more details and examples see :ref:`the Intro to Data Structures From 856a52f22124795f74de3e4070e533d129c6a4b8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Mar 2025 17:20:09 -0500 Subject: [PATCH 04/13] fixups --- doc/source/user_guide/basics.rst | 1 + doc/source/user_guide/enhancingperf.rst | 2 ++ doc/source/whatsnew/v0.11.0.rst | 1 + 3 files changed, 4 insertions(+) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index ffd7a2ad7bb01..ce5d16c604136 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2063,6 +2063,7 @@ or a passed ``Series``), then it will be preserved in DataFrame operations. Furt different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. 
ipython:: python + :okwarning: df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") df1 diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index e55a6cda47ac2..647b0f760f4d4 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -171,6 +171,7 @@ can be improved by passing an ``np.ndarray``. In [4]: %%cython ...: cimport numpy as np ...: import numpy as np + ...: np.import_array() ...: cdef double f_typed(double x) except? -2: ...: return x * (x - 1) ...: cpdef double integrate_f_typed(double a, double b, int N): @@ -225,6 +226,7 @@ and ``wraparound`` checks can yield more performance. ...: cimport cython ...: cimport numpy as np ...: import numpy as np + ...: np.import_array() ...: cdef np.float64_t f_typed(np.float64_t x) except? -2: ...: return x * (x - 1) ...: cpdef np.float64_t integrate_f_typed(np.float64_t a, np.float64_t b, np.int64_t N): diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index dcb0d3229aa5d..73228513bfffc 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -73,6 +73,7 @@ Dtypes Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. 
ipython:: python + :okwarning: df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') df1 From a8304b6d01521e53a7dc9c0aaca3974f8def3379 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Mar 2025 17:31:24 -0500 Subject: [PATCH 05/13] Fixup --- asv_bench/benchmarks/indexing_engines.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 5e3c593e269cb..c65124063b2f2 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -67,6 +67,10 @@ class NumericEngineIndexing: def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype + if index_type == "non_monotonic" and dtype in ["int16", "int8", "uint8"]: + # Values overflow + raise NotImplementedError + if index_type == "monotonic_incr": if unique: arr = np.arange(N * 3, dtype=dtype) From 41265f3a54c7c77b024d8e42df50bf4f16c3bf51 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Mar 2025 17:33:31 -0500 Subject: [PATCH 06/13] Fixup --- pandas/core/internals/managers.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2e6701916a8d4..a3738bb25f56c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -572,12 +572,7 @@ def setitem(self, indexer, value) -> Self: 0, blk_loc, values ) # first block equals values - col_indexer: slice | np.ndarray - if isinstance(indexer[1], slice) and indexer[1] == slice(None): - col_indexer = slice(None) - else: - col_indexer = np.arange(len(blk_loc)) - self.blocks[0].setitem((indexer[0], col_indexer), value) + self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) return self # No need to split if we either set all columns or on a single block # manager From f6a2330a2a2c4e1d8e6df0be2588dec72632004c Mon Sep 17 00:00:00 2001 From: Richard 
Shadrach Date: Thu, 6 Mar 2025 17:33:49 -0500 Subject: [PATCH 07/13] doctest fixups --- asv_bench/benchmarks/indexing_engines.py | 14 +++++++++++++- pandas/core/accessor.py | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 10 +++++----- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/interval.py | 3 ++- pandas/core/arrays/masked.py | 12 ++++++------ pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/sparse/accessor.py | 4 ++-- pandas/core/base.py | 8 ++++---- pandas/core/construction.py | 2 +- pandas/core/dtypes/missing.py | 6 +++--- pandas/core/generic.py | 4 ++-- pandas/core/groupby/groupby.py | 5 +++-- pandas/core/indexing.py | 16 ++++++++-------- pandas/core/nanops.py | 24 ++++++++++++------------ pandas/plotting/_misc.py | 2 +- 17 files changed, 66 insertions(+), 52 deletions(-) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index c65124063b2f2..da0e7de585391 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -67,7 +67,11 @@ class NumericEngineIndexing: def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype - if index_type == "non_monotonic" and dtype in ["int16", "int8", "uint8"]: + if ( + index_type == "non_monotonic" + and dtype in [np.int16, np.int8, np.uint8] + and unique + ): # Values overflow raise NotImplementedError @@ -119,6 +123,14 @@ def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype dtype = dtype.lower() + if ( + index_type == "non_monotonic" + and dtype in ["int16", "int8", "uint8"] + and unique + ): + # Values overflow + raise NotImplementedError + if index_type == "monotonic_incr": if unique: arr = np.arange(N * 3, dtype=dtype) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 78684eacf2d66..0331c26c805b6 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -351,7 +351,7 @@ def 
register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]: AttributeError: The series must contain integer data only. >>> df = pd.Series([1, 2, 3]) >>> df.int_accessor.sum() -6""" +np.int64(6)""" @doc(_register_accessor, klass="Series", examples=_register_series_examples) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index aafd802b827a5..77ec4286f05ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -415,7 +415,7 @@ def unique(values): >>> pd.unique(pd.array([1 + 1j, 2, 3])) - [(1+1j), (2+0j), (3+0j)] + [np.complex128(1+1j), np.complex128(2+0j), np.complex128(3+0j)] Length: 3, dtype: complex128 """ return unique_with_mask(values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 33745438e2aea..037abb3ff2ee8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1072,7 +1072,7 @@ def interpolate( ... limit_area="inside", ... ) - [0.0, 1.0, 2.0, 3.0] + [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0)] Length: 4, dtype: float64 Interpolating values in a FloatingArray: @@ -1962,7 +1962,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: ... 
return lambda x: "*" + str(x) + "*" if boxed else repr(x) + "*" >>> MyExtensionArray(np.array([1, 2, 3, 4])) - [1*, 2*, 3*, 4*] + [np.int64(1)*, np.int64(2)*, np.int64(3)*, np.int64(4)*] Length: 4, dtype: int64 """ if boxed: @@ -2176,11 +2176,11 @@ def _reduce( Examples -------- >>> pd.array([1, 2, 3])._reduce("min") - 1 + np.int64(1) >>> pd.array([1, 2, 3])._reduce("max") - 3 + np.int64(3) >>> pd.array([1, 2, 3])._reduce("sum") - 6 + np.int64(6) >>> pd.array([1, 2, 3])._reduce("mean") 2.0 >>> pd.array([1, 2, 3])._reduce("median") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8a79ab53442c3..762d921fb5013 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -275,7 +275,7 @@ def _unbox_scalar( -------- >>> arr = pd.array(np.array(["1970-01-01"], "datetime64[ns]")) >>> arr._unbox_scalar(arr[0]) - numpy.datetime64('1970-01-01T00:00:00.000000000') + np.datetime64('1970-01-01T00:00:00.000000000') """ raise AbstractMethodError(self) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 0bf2089df5f85..ae7362a5babef 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1775,7 +1775,8 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray: [(0, 1], (1, 2]] Length: 2, dtype: interval[int64, right] >>> idx.to_tuples() - array([(0, 1), (1, 2)], dtype=object) + array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))], + dtype=object) For :class:`pandas.IntervalIndex`: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f3a0cc0dccdb3..e5872368049aa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1470,17 +1470,17 @@ def all( skips NAs): >>> pd.array([True, True, pd.NA]).all() - True + np.True_ >>> pd.array([1, 1, pd.NA]).all() - True + np.True_ >>> pd.array([True, False, pd.NA]).all() - False + np.False_ >>> pd.array([], dtype="boolean").all() - True + np.True_ >>> 
pd.array([pd.NA], dtype="boolean").all() - True + np.True_ >>> pd.array([pd.NA], dtype="Float64").all() - True + np.True_ With ``skipna=False``, the result can be NA if this is logically required (whether ``pd.NA`` is True or False influences the result): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index ac0823ed903b3..ab0029c3ae2ca 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -80,7 +80,7 @@ class NumpyExtensionArray( # type: ignore[misc] -------- >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3])) - [0, 1, 2, 3] + [np.int64(0), np.int64(1), np.int64(2), np.int64(3)] Length: 4, dtype: int64 """ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index eab8527eef526..7dde03b30cd6a 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -297,7 +297,7 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): -------- >>> df = pd.DataFrame({"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]}, dtype="Sparse[int]") >>> df.sparse.density - 0.5 + np.float64(0.5) """ def _validate(self, data) -> None: @@ -459,7 +459,7 @@ def density(self) -> float: -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.density - 0.5 + np.float64(0.5) """ tmp = np.mean([column.array.density for _, column in self._parent.items()]) return tmp diff --git a/pandas/core/base.py b/pandas/core/base.py index a64cd8633c1db..a7ae5e2f5301b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -558,7 +558,7 @@ def array(self) -> ExtensionArray: >>> pd.Series([1, 2, 3]).array - [1, 2, 3] + [np.int64(1), np.int64(2), np.int64(3)] Length: 3, dtype: int64 For extension types, like Categorical, the actual ExtensionArray @@ -804,9 +804,9 @@ def argmax( dtype: float64 >>> s.argmax() - 2 + np.int64(2) >>> s.argmin() - 0 + np.int64(0) The maximum cereal calories is the third element and the minimum cereal calories is the first 
element, @@ -1360,7 +1360,7 @@ def factorize( dtype: int64 >>> ser.searchsorted(4) - 3 + np.int64(3) >>> ser.searchsorted([0, 4]) array([0, 3]) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ada492787a179..a7f301fbb16af 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -177,7 +177,7 @@ def array( >>> pd.array(["a", "b"], dtype=str) - ['a', 'b'] + [np.str_('a'), np.str_('b')] Length: 2, dtype: str32 This would instead return the new ExtensionArray dedicated for string diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f20ca44728664..71fe0f6e4feb0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -428,9 +428,9 @@ def array_equivalent( Examples -------- >>> array_equivalent(np.array([1, 2, np.nan]), np.array([1, 2, np.nan])) - True + np.True_ >>> array_equivalent(np.array([1, np.nan, 2]), np.array([1, 2, np.nan])) - False + np.False_ """ left, right = np.asarray(left), np.asarray(right) @@ -626,7 +626,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) - numpy.datetime64('NaT') + np.datetime64('NaT') """ if isinstance(dtype, ExtensionDtype): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0c3f535df9ce2..84e4397151d09 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -887,7 +887,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: dtype: int64 >>> even_primes.squeeze() - 2 + np.int64(2) Squeezing objects with more than one value in every axis does nothing: @@ -7954,7 +7954,7 @@ def asof(self, where, subset=None): dtype: float64 >>> s.asof(20) - 2.0 + np.float64(2.0) For a sequence `where`, a Series is returned. 
The first value is NaN, because the first element of `where` is before the first diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d0c0ed29b6d44..f9438b348c140 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -546,7 +546,8 @@ def groups(self) -> dict[Hashable, Index]: 2023-02-15 4 dtype: int64 >>> ser.resample("MS").groups - {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} + {Timestamp('2023-01-01 00:00:00'): np.int64(2), + Timestamp('2023-02-01 00:00:00'): np.int64(4)} """ if isinstance(self.keys, list) and len(self.keys) == 1: warnings.warn( @@ -613,7 +614,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: toucan 1 5 6 eagle 7 8 9 >>> df.groupby(by=["a"]).indices - {1: array([0, 1]), 7: array([2])} + {np.int64(1): array([0, 1]), np.int64(7): array([2])} For Resampler: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bcb27d0320c91..2d872c72746bc 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -265,7 +265,7 @@ def iloc(self) -> _iLocIndexer: With scalar integers. >>> df.iloc[0, 1] - 2 + np.int64(2) With lists of integers. @@ -375,7 +375,7 @@ def loc(self) -> _LocIndexer: Single label for row and column >>> df.loc["cobra", "shield"] - 2 + np.int64(2) Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. 
@@ -666,18 +666,18 @@ def at(self) -> _AtIndexer: Get value at specified row/column pair >>> df.at[4, "B"] - 2 + np.int64(2) Set value at specified row/column pair >>> df.at[4, "B"] = 10 >>> df.at[4, "B"] - 10 + np.int64(10) Get value within a Series >>> df.loc[5].at["B"] - 4 + np.int64(5) """ return _AtIndexer("at", self) @@ -715,18 +715,18 @@ def iat(self) -> _iAtIndexer: Get value at specified row/column pair >>> df.iat[1, 2] - 1 + np.int64(1) Set value at specified row/column pair >>> df.iat[1, 2] = 10 >>> df.iat[1, 2] - 10 + np.int64(10) Get value within a series >>> df.loc[0].iat[1] - 2 + np.int64(2) """ return _iAtIndexer("iat", self) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d1dc0ff809497..25fb6e6181082 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -508,12 +508,12 @@ def nanany( >>> from pandas.core import nanops >>> s = pd.Series([1, 2]) >>> nanops.nanany(s.values) - True + np.True_ >>> from pandas.core import nanops >>> s = pd.Series([np.nan]) >>> nanops.nanany(s.values) - False + np.False_ """ if values.dtype.kind in "iub" and mask is None: # GH#26032 fastpath @@ -564,12 +564,12 @@ def nanall( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanall(s.values) - True + np.True_ >>> from pandas.core import nanops >>> s = pd.Series([1, 0]) >>> nanops.nanall(s.values) - False + np.False_ """ if values.dtype.kind in "iub" and mask is None: # GH#26032 fastpath @@ -625,7 +625,7 @@ def nansum( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nansum(s.values) - 3.0 + np.float64(3.0) """ dtype = values.dtype values, mask = _get_values(values, skipna, fill_value=0, mask=mask) @@ -691,7 +691,7 @@ def nanmean( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanmean(s.values) - 1.5 + np.float64(1.5) """ dtype = values.dtype values, mask = _get_values(values, skipna, fill_value=0, mask=mask) @@ -1061,7 +1061,7 @@ def nansem( >>> from 
pandas.core import nanops >>> s = pd.Series([1, np.nan, 2, 3]) >>> nanops.nansem(s.values) - 0.5773502691896258 + np.float64(0.5773502691896258) """ # This checks if non-numeric-like data is passed with numeric_only=False # and raises a TypeError otherwise @@ -1136,7 +1136,7 @@ def nanargmax( >>> from pandas.core import nanops >>> arr = np.array([1, 2, 3, np.nan, 4]) >>> nanops.nanargmax(arr) - 4 + np.int64(4) >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) >>> arr[2:, 2] = np.nan @@ -1182,7 +1182,7 @@ def nanargmin( >>> from pandas.core import nanops >>> arr = np.array([1, 2, 3, np.nan, 4]) >>> nanops.nanargmin(arr) - 0 + np.int64(0) >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3) >>> arr[2:, 0] = np.nan @@ -1237,7 +1237,7 @@ def nanskew( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 1, 2]) >>> nanops.nanskew(s.values) - 1.7320508075688787 + np.float64(1.7320508075688787) """ mask = _maybe_get_mask(values, skipna, mask) if values.dtype.kind != "f": @@ -1325,7 +1325,7 @@ def nankurt( >>> from pandas.core import nanops >>> s = pd.Series([1, np.nan, 1, 3, 2]) >>> nanops.nankurt(s.values) - -1.2892561983471076 + np.float64(-1.2892561983471076) """ mask = _maybe_get_mask(values, skipna, mask) if values.dtype.kind != "f": @@ -1417,7 +1417,7 @@ def nanprod( >>> from pandas.core import nanops >>> s = pd.Series([1, 2, 3, np.nan]) >>> nanops.nanprod(s.values) - 6.0 + np.float64(6.0) """ mask = _maybe_get_mask(values, skipna, mask) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 0e0fb23d924bc..75ff40f1eca90 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -68,7 +68,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> fig, ax = plt.subplots() >>> ax.axis("off") - (0.0, 1.0, 0.0, 1.0) + (np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(1.0)) >>> table = pd.plotting.table( ... 
ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2] ... ) From 83e26441c8260cb5d6a72134f741cbdf21994438 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 7 Mar 2025 18:03:39 -0500 Subject: [PATCH 08/13] More doc fixes --- pandas/core/arrays/base.py | 8 ++++---- pandas/core/arrays/masked.py | 20 ++++++++++---------- pandas/core/base.py | 2 +- pandas/core/construction.py | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/indexing.py | 4 ++-- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 037abb3ff2ee8..ca10bcfaf1ce6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -941,7 +941,7 @@ def argmin(self, skipna: bool = True) -> int: -------- >>> arr = pd.array([3, 1, 2, 5, 4]) >>> arr.argmin() - 1 + np.int64(1) """ # Implementer note: You have two places to override the behavior of # argmin. @@ -975,7 +975,7 @@ def argmax(self, skipna: bool = True) -> int: -------- >>> arr = pd.array([3, 1, 2, 5, 4]) >>> arr.argmax() - 3 + np.int64(3) """ # Implementer note: You have two places to override the behavior of # argmax. 
@@ -2182,9 +2182,9 @@ def _reduce( >>> pd.array([1, 2, 3])._reduce("sum") np.int64(6) >>> pd.array([1, 2, 3])._reduce("mean") - 2.0 + np.float64(2.0) >>> pd.array([1, 2, 3])._reduce("median") - 2.0 + np.float64(2.0) """ meth = getattr(self, name, None) if meth is None: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e5872368049aa..5e5ab2bb2d3fd 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1382,25 +1382,25 @@ def any( skips NAs): >>> pd.array([True, False, True]).any() - True + np.True_ >>> pd.array([True, False, pd.NA]).any() - True + np.True_ >>> pd.array([False, False, pd.NA]).any() - False + np.False_ >>> pd.array([], dtype="boolean").any() - False + np.False_ >>> pd.array([pd.NA], dtype="boolean").any() - False + np.False_ >>> pd.array([pd.NA], dtype="Float64").any() - False + np.False_ With ``skipna=False``, the result can be NA if this is logically required (whether ``pd.NA`` is True or False influences the result): >>> pd.array([True, False, pd.NA]).any(skipna=False) - True + np.True_ >>> pd.array([1, 0, pd.NA]).any(skipna=False) - True + np.True_ >>> pd.array([False, False, pd.NA]).any(skipna=False) >>> pd.array([0, 0, pd.NA]).any(skipna=False) @@ -1490,9 +1490,9 @@ def all( >>> pd.array([1, 1, pd.NA]).all(skipna=False) >>> pd.array([True, False, pd.NA]).all(skipna=False) - False + np.False_ >>> pd.array([1, 0, pd.NA]).all(skipna=False) - False + np.False_ """ nv.validate_all((), kwargs) diff --git a/pandas/core/base.py b/pandas/core/base.py index a7ae5e2f5301b..780761003458c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1379,7 +1379,7 @@ def factorize( dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') - 3 + np.int64(3) >>> ser = pd.Categorical( ... 
['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True diff --git a/pandas/core/construction.py b/pandas/core/construction.py index a7f301fbb16af..72456ac3e1abc 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -186,7 +186,7 @@ def array( >>> pd.array(["a", "b"], dtype=np.dtype("<U1")) <NumpyExtensionArray> - ['a', 'b'] + [np.str_('a'), np.str_('b')] Length: 2, dtype: str32 Finally, Pandas has arrays that mostly overlap with NumPy diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 84e4397151d09..027f78f985953 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -945,7 +945,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: Squeezing all axes will project directly into a scalar: >>> df_0a.squeeze() - 1 + np.int64(1) """ axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),) result = self.iloc[ @@ -7969,7 +7969,7 @@ def asof(self, where, subset=None): NaN, even though NaN is at the index location for ``30``. 
>>> s.asof(30) - 2.0 + np.float64(2.0) Take all columns into consideration diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 2d872c72746bc..bbbcc4da9fb39 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -585,7 +585,7 @@ def loc(self) -> _LocIndexer: Single tuple for the index with a single label for the column >>> df.loc[("cobra", "mark i"), "shield"] - 2 + np.int64(2) Slice from index tuple to single label @@ -677,7 +677,7 @@ def at(self) -> _AtIndexer: Get value within a Series >>> df.loc[5].at["B"] - np.int64(5) + np.int64(4) """ return _AtIndexer("at", self) From f7ef882505cd09a0314dae63ba90176e6042d0b1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 19 Mar 2025 15:04:44 -0400 Subject: [PATCH 09/13] Some reverts --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/interval.py | 2 +- pandas/core/arrays/numpy_.py | 2 +- pandas/core/base.py | 4 ++-- pandas/core/construction.py | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7a92b152b8886..76f2fdad591ff 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -415,7 +415,7 @@ def unique(values): >>> pd.unique(pd.array([1 + 1j, 2, 3])) - [np.complex128(1+1j), np.complex128(2+0j), np.complex128(3+0j)] + [(1+1j), (2+0j), (3+0j)] Length: 3, dtype: complex128 """ return unique_with_mask(values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e4926b32fe48d..bc5a5f6d68d95 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1072,7 +1072,7 @@ def interpolate( ... limit_area="inside", ... 
) - [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0)] + [0.0, 1.0, 2.0, 3.0] Length: 4, dtype: float64 Interpolating values in a FloatingArray: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index ae7362a5babef..6cb79e915c78b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1776,7 +1776,7 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray: Length: 2, dtype: interval[int64, right] >>> idx.to_tuples() array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))], - dtype=object) + dtype=object) For :class:`pandas.IntervalIndex`: diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index b00bea160fe69..fd2c8c9d63362 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -83,7 +83,7 @@ class NumpyExtensionArray( # type: ignore[misc] -------- >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3])) - [np.int64(0), np.int64(1), np.int64(2), np.int64(3)] + [0, 1, 2, 3] Length: 4, dtype: int64 """ diff --git a/pandas/core/base.py b/pandas/core/base.py index 780761003458c..ea7e99f6f1879 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -558,7 +558,7 @@ def array(self) -> ExtensionArray: >>> pd.Series([1, 2, 3]).array - [np.int64(1), np.int64(2), np.int64(3)] + [1, 2, 3] Length: 3, dtype: int64 For extension types, like Categorical, the actual ExtensionArray @@ -1389,7 +1389,7 @@ def factorize( Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk'] >>> ser.searchsorted('bread') - 1 + np.int64(1) >>> ser.searchsorted(['bread'], side='right') array([3]) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 72456ac3e1abc..ada492787a179 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -177,7 +177,7 @@ def array( >>> pd.array(["a", "b"], dtype=str) - [np.str_('a'), np.str_('b')] + ['a', 'b'] Length: 2, dtype: str32 This would instead return the new ExtensionArray 
dedicated for string @@ -186,7 +186,7 @@ def array( >>> pd.array(["a", "b"], dtype=np.dtype("<U1")) <NumpyExtensionArray> - [np.str_('a'), np.str_('b')] + ['a', 'b'] Length: 2, dtype: str32 Finally, Pandas has arrays that mostly overlap with NumPy From 347b865acbdeba1e4c28f5346661181282c66f2f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 19 Mar 2025 15:15:10 -0400 Subject: [PATCH 10/13] Avoid float16 --- doc/source/user_guide/basics.rst | 5 ++--- doc/source/whatsnew/v0.11.0.rst | 5 ++--- pandas/core/arrays/datetimelike.py | 4 ++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index ce5d16c604136..d8ef9899a07c9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2063,14 +2063,13 @@ or a passed ``Series``), then it will be preserved in DataFrame operations. Furt different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. ipython:: python - :okwarning: - df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") + df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float64") df1 df1.dtypes df2 = pd.DataFrame( { - "A": pd.Series(np.random.randn(8), dtype="float16"), + "A": pd.Series(np.random.randn(8), dtype="float32"), "B": pd.Series(np.random.randn(8)), "C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"), # [0,255] (range of uint8) } diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 73228513bfffc..28c9d46f21fd8 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -73,12 +73,11 @@ Dtypes Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. 
ipython:: python - :okwarning: - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float64') df1 df1.dtypes - df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), + df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float32'), 'B': pd.Series(np.random.randn(8)), 'C': pd.Series(range(8), dtype='uint8')}) df2 diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b27bf19f2f593..f109db9458834 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2520,6 +2520,10 @@ def _validate_inferred_freq( freq : DateOffset or None """ if inferred_freq is not None: + offset1 = to_offset(freq) + offset2 = to_offset(inferred_freq) + print(offset1, offset2) + print(type(offset1), type(offset2)) if freq is not None and freq != inferred_freq: raise ValueError( f"Inferred frequency {inferred_freq} from passed " From 17ec834b699a18dbf0441f48e345b479f8628020 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 20 Mar 2025 16:30:31 -0400 Subject: [PATCH 11/13] Cleanup --- pandas/core/arrays/datetimelike.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f109db9458834..b27bf19f2f593 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2520,10 +2520,6 @@ def _validate_inferred_freq( freq : DateOffset or None """ if inferred_freq is not None: - offset1 = to_offset(freq) - offset2 = to_offset(inferred_freq) - print(offset1, offset2) - print(type(offset1), type(offset2)) if freq is not None and freq != inferred_freq: raise ValueError( f"Inferred frequency {inferred_freq} from passed " From 65e1374d33f8037ddea2d161e5a5208465c9ef1b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 29 Mar 2025 08:36:37 -0400 Subject: [PATCH 12/13] Remove repr from MyExtensionArray --- 
pandas/core/arrays/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bc5a5f6d68d95..19f377a232d5a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1959,10 +1959,10 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: -------- >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray): ... def _formatter(self, boxed=False): - ... return lambda x: "*" + str(x) + "*" if boxed else repr(x) + "*" + ... return lambda x: "*" + str(x) >>> MyExtensionArray(np.array([1, 2, 3, 4])) - [np.int64(1)*, np.int64(2)*, np.int64(3)*, np.int64(4)*] + [1*, 2*, 3*, 4*] Length: 4, dtype: int64 """ if boxed: From 14d34a750c2b546a089e96a9acfcefbe78e8f9ea Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 29 Mar 2025 15:31:32 -0400 Subject: [PATCH 13/13] Fixup --- pandas/core/arrays/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 19f377a232d5a..42be07e03bad8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1959,10 +1959,10 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: -------- >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray): ... def _formatter(self, boxed=False): - ... return lambda x: "*" + str(x) + ... return lambda x: "*" + str(x) + "*" >>> MyExtensionArray(np.array([1, 2, 3, 4])) - [1*, 2*, 3*, 4*] + [*1*, *2*, *3*, *4*] Length: 4, dtype: int64 """ if boxed: