From d7fa43cd17a02c91485db7ee630081c1b164af83 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 22 Oct 2023 19:11:14 -0700 Subject: [PATCH 1/9] ENH: read_stata return non-nano --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/stata.py | 144 ++++++++++++--------------------- pandas/tests/io/test_stata.py | 85 ++++++++++++------- 3 files changed, 112 insertions(+), 118 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9ab0452c8334..fad58657d0310 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -350,6 +350,7 @@ Other enhancements - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) - Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) +- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`??`) .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 576e27f202524..e5b7eeacc757c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -62,7 +62,6 @@ Timestamp, isna, to_datetime, - to_timedelta, ) from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index @@ -232,6 +231,7 @@ stata_epoch: Final = datetime(1960, 1, 1) +unix_epoch: Final = datetime(1970, 1, 1) def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: @@ -280,64 +280,43 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: date - ty years since 0000 """ - MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year - MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days - MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days - MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 - MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 - def convert_year_month_safe(year, month) -> Series: - """ - Convert year and month to datetimes, using pandas vectorized versions - when the date range falls within the range supported by pandas. - Otherwise it falls back to a slower but more robust method - using datetime. - """ - if year.max() < MAX_YEAR and year.min() > MIN_YEAR: - return to_datetime(100 * year + month, format="%Y%m") - else: - index = getattr(year, "index", None) - return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index) - - def convert_year_days_safe(year, days) -> Series: - """ - Converts year (e.g. 1999) and days since the start of the year to a - datetime or datetime64 Series - """ - if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: - return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") - else: - index = getattr(year, "index", None) - value = [ - datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days) - ] - return Series(value, index=index) + if fmt.startswith(("%tc", "tc")): + # Delta ms relative to base + td = np.timedelta64(stata_epoch - unix_epoch, "ms") + conv_dates = np.array(dates._values, dtype="M8[ms]") + td + return Series(conv_dates, index=dates.index) - def convert_delta_safe(base, deltas, unit) -> Series: - """ - Convert base dates and deltas to datetimes, using pandas vectorized - versions if the deltas satisfy restrictions required to be expressed - as dates in pandas. - """ - index = getattr(deltas, "index", None) - if unit == "d": - if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: - values = [base + timedelta(days=int(d)) for d in deltas] - return Series(values, index=index) - elif unit == "ms": - if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: - values = [ - base + timedelta(microseconds=(int(d) * 1000)) for d in deltas - ] - return Series(values, index=index) - else: - raise ValueError("format not understood") - base = to_datetime(base) - deltas = to_timedelta(deltas, unit=unit) - return base + deltas + elif fmt.startswith(("%td", "td", "%d", "d")): + # Delta days relative to base + td = np.timedelta64(stata_epoch - unix_epoch, "D") + conv_dates = np.array(dates._values, dtype="M8[D]") + td + return Series(conv_dates, index=dates.index) + + elif fmt.startswith(("%tm", "tm")): + # Delta months relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12 + res = np.array(ordinals, dtype="M8[M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%tq", "tq")): + # Delta quarters relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4 + res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%th", "th")): + # Delta half-years relative to base + ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2 + res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]") + return Series(res, index=dates.index) + + elif fmt.startswith(("%ty", "ty")): + # Years -- not delta + ordinals = dates - 1970 + res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]") + return Series(res, index=dates.index) - # TODO(non-nano): If/when pandas supports more than datetime64[ns], this - # should be improved to use correct range, e.g. datetime[Y] for yearly bad_locs = np.isnan(dates) has_bad_values = False if bad_locs.any(): @@ -345,11 +324,7 @@ def convert_delta_safe(base, deltas, unit) -> Series: dates._values[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt.startswith(("%tc", "tc")): # Delta ms relative to base - base = stata_epoch - ms = dates - conv_dates = convert_delta_safe(base, ms, "ms") - elif fmt.startswith(("%tC", "tC")): + if fmt.startswith(("%tC", "tC")): warnings.warn( "Encountered %tC format. Leaving in Stata Internal Format.", stacklevel=find_stack_level(), @@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series: if has_bad_values: conv_dates[bad_locs] = NaT return conv_dates - # Delta days relative to base - elif fmt.startswith(("%td", "td", "%d", "d")): - base = stata_epoch - days = dates - conv_dates = convert_delta_safe(base, days, "d") # does not count leap days - 7 days is a week. # 52nd week may have more than 7 days elif fmt.startswith(("%tw", "tw")): year = stata_epoch.year + dates // 52 days = (dates % 52) * 7 - conv_dates = convert_year_days_safe(year, days) - elif fmt.startswith(("%tm", "tm")): # Delta months relative to base - year = stata_epoch.year + dates // 12 - month = (dates % 12) + 1 - conv_dates = convert_year_month_safe(year, month) - elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base - year = stata_epoch.year + dates // 4 - quarter_month = (dates % 4) * 3 + 1 - conv_dates = convert_year_month_safe(year, quarter_month) - elif fmt.startswith(("%th", "th")): # Delta half-years relative to base - year = stata_epoch.year + dates // 2 - month = (dates % 2) * 6 + 1 - conv_dates = convert_year_month_safe(year, month) - elif fmt.startswith(("%ty", "ty")): # Years -- not delta - year = dates - first_month = np.ones_like(dates) - conv_dates = convert_year_month_safe(year, first_month) + per_y = (year - 1970).array.view("Period[Y]") + per_d = per_y.asfreq("D", how="S") + per_d_shifted = per_d + days + per_s = per_d_shifted.dt.asfreq("s", how="S") + conv_dates_arr = per_s.array.view("M8[s]") + conv_dates = Series(conv_dates_arr, index=per_s.index, name=per_s.name) + else: raise ValueError(f"Date fmt {fmt} not understood") @@ -409,6 +369,7 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series: index = dates.index NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000 US_PER_DAY = NS_PER_DAY / 1000 + MS_PER_DAY = NS_PER_DAY / 1_000_000 def parse_dates_safe( dates: Series, delta: bool = False, year: bool = False, days: bool = False @@ -416,17 +377,18 @@ def parse_dates_safe( d = {} if lib.is_np_dtype(dates.dtype, "M"): if delta: - time_delta = dates - Timestamp(stata_epoch).as_unit("ns") - d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds + time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit( + "ms" + ) + d["delta"] = time_delta._values.view(np.int64) if days or year: date_index = DatetimeIndex(dates) d["year"] = date_index._data.year d["month"] = date_index._data.month if days: - days_in_ns = dates._values.view(np.int64) - to_datetime( - d["year"], format="%Y" - )._values.view(np.int64) - d["days"] = days_in_ns // NS_PER_DAY + year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype) + diff = dates - year_start + d["days"] = np.asarray(diff).astype("m8[D]").view("int64") elif infer_dtype(dates, skipna=False) == "datetime": if delta: @@ -466,7 +428,7 @@ def g(x: datetime) -> int: if fmt in ["%tc", "tc"]: d = parse_dates_safe(dates, delta=True) - conv_dates = d.delta / 1000 + conv_dates = d.delta elif fmt in ["%tC", "tC"]: warnings.warn( "Stata Internal Format tC not supported.", @@ -475,7 +437,7 @@ def g(x: datetime) -> int: conv_dates = dates elif fmt in ["%td", "td"]: d = parse_dates_safe(dates, delta=True) - conv_dates = d.delta // US_PER_DAY + conv_dates = d.delta // MS_PER_DAY elif fmt in ["%tw", "tw"]: d = parse_dates_safe(dates, year=True, days=True) conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 5c6377349304c..4d81cae6db2bf 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -174,7 +174,16 @@ def test_read_dta2(self, datapath): "yearly_date", ], ) - expected["yearly_date"] = expected["yearly_date"].astype("O") + # TODO(GH#55564): just pass M8[s] to the constructor + expected["datetime_c"] = expected["datetime_c"].astype("M8[ms]") + expected["date"] = expected["date"].astype("M8[s]") + expected["weekly_date"] = expected["weekly_date"].astype("M8[s]") + expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") + expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") + expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") + expected["yearly_date"] = ( + expected["yearly_date"].astype("Period[s]").array.view("M8[s]") + ) path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -360,12 +369,15 @@ def test_read_write_dta10(self, version): with tm.ensure_clean() as path: original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) + + expected = original[:] + # "tc" convert_dates means we store in ms + expected["datetime"] = expected["datetime"].astype("M8[ms]") + + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + expected, + ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: @@ -514,9 +526,10 @@ def test_read_write_reread_dta15(self, file, datapath): expected["long_"] = expected["long_"].astype(np.int32) expected["float_"] = expected["float_"].astype(np.float32) expected["double_"] = expected["double_"].astype(np.float64) - expected["date_td"] = expected["date_td"].apply( - datetime.strptime, args=("%Y-%m-%d",) - ) + + # TODO(GH#55564): directly cast to M8[s] + arr = expected["date_td"].astype("Period[D]")._values.asfreq("s", how="S") + expected["date_td"] = arr.view("M8[s]") file = datapath("io", "data", "stata", f"{file}.dta") parsed = self.read_dta(file) @@ -636,10 +649,11 @@ def test_dates_invalid_column(self): written_and_read_again = self.read_dta(path) - modified = original - modified.columns = ["_0"] - modified.index = original.index.astype(np.int32) - tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) + expected = original.copy() + expected.columns = ["_0"] + expected.index = original.index.astype(np.int32) + expected["_0"] = expected["_0"].astype("M8[ms]") + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) def test_105(self, datapath): # Data obtained from: @@ -684,7 +698,9 @@ def test_date_export_formats(self): [expected_values], index=pd.Index([0], dtype=np.int32, name="index"), columns=columns, + dtype="M8[s]", ) + expected["tc"] = expected["tc"].astype("M8[ms]") with tm.ensure_clean() as path: original.to_stata(path, convert_dates=conversions) @@ -881,6 +897,15 @@ def test_big_dates(self, datapath): expected[5][5] = expected[5][6] = datetime(1678, 1, 1) expected = DataFrame(expected, columns=columns, dtype=object) + # FIXME(GH#55564): can't astype directly to ms or s + expected["date_tc"] = expected["date_tc"].astype("Period[ms]")._values.view("M8[ms]") + expected["date_td"] = expected["date_td"].astype("Period[s]")._values.view("M8[s]") + expected["date_tm"] = expected["date_tm"].astype("Period[s]")._values.view("M8[s]") + expected["date_tw"] = expected["date_tw"].astype("Period[s]")._values.view("M8[s]") + expected["date_tq"] = expected["date_tq"].astype("Period[s]")._values.view("M8[s]") + expected["date_th"] = expected["date_th"].astype("Period[s]")._values.view("M8[s]") + expected["date_ty"] = expected["date_ty"].astype("Period[s]")._values.view("M8[s]") + parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) @@ -906,9 +931,9 @@ def test_dtype_conversion(self, datapath): expected["long_"] = expected["long_"].astype(np.int32) expected["float_"] = expected["float_"].astype(np.float32) expected["double_"] = expected["double_"].astype(np.float64) - expected["date_td"] = expected["date_td"].apply( - datetime.strptime, args=("%Y-%m-%d",) - ) + # FIXME(GH#55564): can't astype directly to M8[ms] without OutOfBoundsDatetime + parr = expected["date_td"].astype("Period[D]")._values + expected["date_td"] = parr.view("M8[D]").astype("M8[s]") no_conversion = read_stata( datapath("io", "data", "stata", "stata6_117.dta"), convert_dates=True @@ -922,12 +947,10 @@ def test_dtype_conversion(self, datapath): ) # read_csv types are the same - expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv")) - expected["date_td"] = expected["date_td"].apply( - datetime.strptime, args=("%Y-%m-%d",) - ) + expected2 = self.read_csv(datapath("io", "data", "stata", "stata6.csv")) + expected2["date_td"] = expected["date_td"] - tm.assert_frame_equal(expected, conversion) + tm.assert_frame_equal(expected2, conversion) def test_drop_column(self, datapath): expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv")) @@ -1392,10 +1415,14 @@ def test_default_date_conversion(self): } ) + expected = original[:] + # "tc" for convert_dates below stores with "ms" resolution + expected["dates"] = expected["dates"].astype("M8[ms]") + with tm.ensure_clean() as path: original.to_stata(path, write_index=False) reread = read_stata(path, convert_dates=True) - tm.assert_frame_equal(original, reread) + tm.assert_frame_equal(expected, reread) original.to_stata(path, write_index=False, convert_dates={"dates": "tc"}) direct = read_stata(path, convert_dates=True) @@ -1666,11 +1693,14 @@ def test_writer_117(self): version=117, ) written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 + + expected = original[:] + # "tc" for convert_dates means we store with "ms" resolution + expected["datetime"] = expected["datetime"].astype("M8[ms]") + tm.assert_frame_equal( written_and_read_again.set_index("index"), - original, - check_index_type=False, + expected, ) tm.assert_frame_equal(original, copy) @@ -1943,7 +1973,8 @@ def test_read_write_ea_dtypes(self, dtype_backend): "b": ["a", "b", "c"], "c": [1.0, 0, np.nan], "d": [1.5, 2.5, 3.5], - "e": pd.date_range("2020-12-31", periods=3, freq="D"), + # stata stores with ms unit, so unit does not round-trip exactly + "e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"), }, index=pd.Index([0, 1, 2], name="index", dtype=np.int32), ) From fec9cc2eb7445426b260a2ec67ba3ff3609851f2 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 22 Oct 2023 19:13:23 -0700 Subject: [PATCH 2/9] GH ref --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index fad58657d0310..aabb1718e60b2 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -339,6 +339,7 @@ Other enhancements - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) - :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) +- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) From f9598b9cfaeb647df80e627e4d7763e249ab88f3 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Oct 2023 09:01:09 -0700 Subject: [PATCH 3/9] mypy fixup --- pandas/io/stata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e5b7eeacc757c..d734610ed822f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -284,14 +284,14 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: if fmt.startswith(("%tc", "tc")): # Delta ms relative to base td = np.timedelta64(stata_epoch - unix_epoch, "ms") - conv_dates = np.array(dates._values, dtype="M8[ms]") + td - return Series(conv_dates, index=dates.index) + res = np.array(dates._values, dtype="M8[ms]") + td + return Series(res, index=dates.index) elif fmt.startswith(("%td", "td", "%d", "d")): # Delta days relative to base td = np.timedelta64(stata_epoch - unix_epoch, "D") - conv_dates = np.array(dates._values, dtype="M8[D]") + td - return Series(conv_dates, index=dates.index) + res = np.array(dates._values, dtype="M8[D]") + td + return Series(res, index=dates.index) elif fmt.startswith(("%tm", "tm")): # Delta months relative to base From dcf0c228732067aefeed3ace746ae38eb64b7b0a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Oct 2023 11:52:30 -0700 Subject: [PATCH 4/9] update doctest --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d734610ed822f..dd3b6ed41657e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -256,7 +256,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: >>> dates = pd.Series([52]) >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") 0 1961-01-01 - dtype: datetime64[ns] + dtype: datetime64[s] Notes ----- From fa3aad7dc6c671212ee2635063916a34ac4366ef Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 13 Nov 2023 13:57:23 -0800 Subject: [PATCH 5/9] simplify --- pandas/tests/io/test_stata.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 4d81cae6db2bf..a9001ab9a9fa9 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -897,14 +897,13 @@ def test_big_dates(self, datapath): expected[5][5] = expected[5][6] = datetime(1678, 1, 1) expected = DataFrame(expected, columns=columns, dtype=object) - # FIXME(GH#55564): can't astype directly to ms or s - expected["date_tc"] = expected["date_tc"].astype("Period[ms]")._values.view("M8[ms]") - expected["date_td"] = expected["date_td"].astype("Period[s]")._values.view("M8[s]") - expected["date_tm"] = expected["date_tm"].astype("Period[s]")._values.view("M8[s]") - expected["date_tw"] = expected["date_tw"].astype("Period[s]")._values.view("M8[s]") - expected["date_tq"] = expected["date_tq"].astype("Period[s]")._values.view("M8[s]") - expected["date_th"] = expected["date_th"].astype("Period[s]")._values.view("M8[s]") - expected["date_ty"] = expected["date_ty"].astype("Period[s]")._values.view("M8[s]") + expected["date_tc"] = expected["date_tc"].astype("M8[ms]") + expected["date_td"] = expected["date_td"].astype("M8[s]") + expected["date_tm"] = expected["date_tm"].astype("M8[s]") + expected["date_tw"] = expected["date_tw"].astype("M8[s]") + expected["date_tq"] = expected["date_tq"].astype("M8[s]") + expected["date_th"] = expected["date_th"].astype("M8[s]") + expected["date_ty"] = expected["date_ty"].astype("M8[s]") parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) @@ -931,9 +930,7 @@ def test_dtype_conversion(self, datapath): expected["long_"] = expected["long_"].astype(np.int32) expected["float_"] = expected["float_"].astype(np.float32) expected["double_"] = expected["double_"].astype(np.float64) - # FIXME(GH#55564): can't astype directly to M8[ms] without OutOfBoundsDatetime - parr = expected["date_td"].astype("Period[D]")._values - expected["date_td"] = parr.view("M8[D]").astype("M8[s]") + expected["date_td"] = expected["date_td"].astype("M8[s]") no_conversion = read_stata( datapath("io", "data", "stata", "stata6_117.dta"), convert_dates=True From 640aaf9c3fb9e3188b46bfa625025a168527a90f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Nov 2023 16:29:23 -0800 Subject: [PATCH 6/9] avoid Series.view --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index dd3b6ed41657e..8b72aad2eff9b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -343,7 +343,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: per_d_shifted = per_d + days per_s = per_d_shifted.dt.asfreq("s", how="S") conv_dates_arr = per_s.array.view("M8[s]") - conv_dates = Series(conv_dates_arr, index=per_s.index, name=per_s.name) + conv_dates = Series(conv_dates_arr, index=dates.index) else: raise ValueError(f"Date fmt {fmt} not understood") From 2f538c7d75623d5a9f55fcd9ecbbf05cb166ec38 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Dec 2023 08:32:45 -0800 Subject: [PATCH 7/9] dont go through Series --- pandas/io/stata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8b72aad2eff9b..abebb4e2d1663 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -340,9 +340,9 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: days = (dates % 52) * 7 per_y = (year - 1970).array.view("Period[Y]") per_d = per_y.asfreq("D", how="S") - per_d_shifted = per_d + days - per_s = per_d_shifted.dt.asfreq("s", how="S") - conv_dates_arr = per_s.array.view("M8[s]") + per_d_shifted = per_d + days._values + per_s = per_d_shifted.asfreq("s", how="S") + conv_dates_arr = per_s.view("M8[s]") conv_dates = Series(conv_dates_arr, index=dates.index) else: From 7b802c7c16d8b76dff17c9413c8aa5896afcd561 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Jan 2024 10:27:46 -0800 Subject: [PATCH 8/9] move whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index aabb1718e60b2..fad58657d0310 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -339,7 +339,6 @@ Other enhancements - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) - :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) -- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 950082f9281c5..1bf1e70597e38 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -28,7 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - .. --------------------------------------------------------------------------- From ea261aa8b1af8e20e5d8084c02fd3d8ca4bc9eb0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Jan 2024 18:55:38 -0800 Subject: [PATCH 9/9] remove outdated whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index fad58657d0310..d9ab0452c8334 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -350,7 +350,6 @@ Other enhancements - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) - Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) -- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`??`) .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: