Skip to content

Commit 8209203

Browse files
authored
fix: converting to dataframe with out-of-bounds timestamps (#209)
Fixes #168. This PR fixes the problem when converting query results to Pandas with `pyarrow` when the data contains timestamps that would fall outside `pyarrow`'s nanosecond precision. The fix requires `pyarrow>=1.0.0`, thus it only works on Python 3. ### PR checklist - [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary)
1 parent 478597a commit 8209203

File tree

3 files changed

+96
-2
lines changed

3 files changed

+96
-2
lines changed

google/cloud/bigquery/table.py

+30-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import functools
2222
import logging
2323
import operator
24+
import pytz
2425
import warnings
2526

2627
import six
@@ -1726,7 +1727,35 @@ def to_dataframe(
17261727
bqstorage_client=bqstorage_client,
17271728
create_bqstorage_client=create_bqstorage_client,
17281729
)
1729-
df = record_batch.to_pandas(date_as_object=date_as_object)
1730+
1731+
# When converting timestamp values to nanosecond precision, the result
1732+
# can be out of pyarrow bounds. To avoid the error when converting to
1733+
# Pandas, we set the timestamp_as_object parameter to True, if necessary.
1734+
#
1735+
# NOTE: Python 3+ only, as timestamp_as_object parameter is only supported
1736+
# in pyarrow>=1.0, but the latter is not compatible with Python 2.
1737+
if six.PY2:
1738+
extra_kwargs = {}
1739+
else:
1740+
types_to_check = {
1741+
pyarrow.timestamp("us"),
1742+
pyarrow.timestamp("us", tz=pytz.UTC),
1743+
}
1744+
1745+
for column in record_batch:
1746+
if column.type in types_to_check:
1747+
try:
1748+
column.cast("timestamp[ns]")
1749+
except pyarrow.lib.ArrowInvalid:
1750+
timestamp_as_object = True
1751+
break
1752+
else:
1753+
timestamp_as_object = False
1754+
1755+
extra_kwargs = {"timestamp_as_object": timestamp_as_object}
1756+
1757+
df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
1758+
17301759
for column in dtypes:
17311760
df[column] = pandas.Series(df[column], dtype=dtypes[column])
17321761
return df

setup.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@
4848
"pandas": ["pandas>=0.17.1"],
4949
# Exclude PyArrow dependency from Windows Python 2.7.
5050
'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
51-
"pyarrow>=0.17.0"
51+
"pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
52+
# Pyarrow >= 0.17.0 is not compatible with Python 2 anymore.
53+
"pyarrow < 0.17.0; python_version < '3.0'",
5254
],
5355
"tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
5456
"fastparquet": [

tests/unit/test_table.py

+63
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import datetime as dt
1516
import itertools
1617
import logging
1718
import time
@@ -2271,6 +2272,68 @@ def test_to_dataframe(self):
22712272
self.assertEqual(df.name.dtype.name, "object")
22722273
self.assertEqual(df.age.dtype.name, "int64")
22732274

2275+
@pytest.mark.xfail(
    six.PY2,
    reason=(
        # Fixed typo: the constraint is pyarrow>=1.0, not "pyarrow>-1.0".
        "Requires pyarrow>=1.0 to work, but the latter is not compatible "
        "with Python 2 anymore."
    ),
)
@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
    """to_dataframe() must not fail on TIMESTAMP values that fall outside
    pyarrow's nanosecond-precision range.

    The rows use epoch-seconds strings far in the future (years 4567 and
    9999); the resulting column must come back as Python ``datetime``
    objects rather than raising during the pyarrow -> pandas conversion.
    """
    from google.cloud.bigquery.schema import SchemaField

    schema = [SchemaField("some_timestamp", "TIMESTAMP")]
    rows = [
        {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
        {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 2)  # verify the number of rows
    self.assertEqual(list(df.columns), ["some_timestamp"])
    self.assertEqual(
        list(df["some_timestamp"]),
        [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
    )
2305+
2306+
@pytest.mark.xfail(
    six.PY2,
    reason=(
        # Fixed typo: the constraint is pyarrow>=1.0, not "pyarrow>-1.0".
        "Requires pyarrow>=1.0 to work, but the latter is not compatible "
        "with Python 2 anymore."
    ),
)
@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_to_dataframe_datetime_out_of_pyarrow_bounds(self):
    """to_dataframe() must not fail on DATETIME values that fall outside
    pyarrow's nanosecond-precision range.

    The rows use ISO-format datetimes far in the future (years 4567 and
    9999); the resulting column must come back as Python ``datetime``
    objects rather than raising during the pyarrow -> pandas conversion.
    """
    from google.cloud.bigquery.schema import SchemaField

    schema = [SchemaField("some_datetime", "DATETIME")]
    rows = [
        {"f": [{"v": "4567-01-01T00:00:00"}]},
        {"f": [{"v": "9999-12-31T00:00:00"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 2)  # verify the number of rows
    self.assertEqual(list(df.columns), ["some_datetime"])
    self.assertEqual(
        list(df["some_datetime"]),
        [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
    )
2336+
22742337
@unittest.skipIf(pandas is None, "Requires `pandas`")
22752338
def test_to_dataframe_warning_wo_pyarrow(self):
22762339
from google.cloud.bigquery.client import PyarrowMissingWarning

0 commit comments

Comments
 (0)