From 9b5787869fafe5f9abe1561d434fe2f11b0b9cfa Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 10 Nov 2021 17:05:15 -0600
Subject: [PATCH 01/18] test: upload DATE column with various dtypes

---
 setup.py                    |  1 +
 tests/system/test_to_gbq.py | 75 +++++++++++++++++++++++++++++++++----
 2 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index b66c0499..ce02f4e6 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@ release_status = "Development Status :: 4 - Beta"
 dependencies = [
     "setuptools",
+    "db-dtypes >=0.3.0, <2.0.0dev",
     "numpy>=1.16.6",
     "pandas>=0.23.2",
     "pyarrow >=3.0.0, <7.0dev",
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index d16997fd..3d097a98 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -13,6 +13,11 @@
 pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")
 
 
+@pytest.fixture(params=["default", "load_parquet", "load_csv"])
+def api_method(request):
+    return request.param
+
+
 @pytest.fixture
 def method_under_test(credentials, project_id):
     import pandas_gbq
@@ -47,11 +52,6 @@ def method_under_test(credentials, project_id):
         [
             "abc",
             "defg",
-            # Ensure that empty strings are written as empty string,
-            # not NULL. See:
-            # https://github.com/googleapis/python-bigquery-pandas/issues/366
-            "",
-            None,
             # Ensure that unicode characters are encoded. See:
             # https://github.com/googleapis/python-bigquery-pandas/issues/106
             "信用卡",
@@ -61,11 +61,27 @@
             name="test_col",
         ),
     ),
+    (
+        pandas.Series(
+            [
+                "abc",
+                "defg",
+                # Ensure that empty strings are written as empty string,
+                # not NULL. See:
+                # https://github.com/googleapis/python-bigquery-pandas/issues/366
+                "",
+                None,
+            ],
+            name="empty_strings",
+        ),
+    ),
 ],
 )
 def test_series_round_trip(
-    method_under_test, random_dataset_id, bigquery_client, input_series
+    method_under_test, random_dataset_id, bigquery_client, input_series, api_method,
 ):
+    if api_method == "load_csv" and input_series.name == "empty_strings":
+        pytest.skip("Loading empty string with CSV not supported.")
     table_id = f"{random_dataset_id}.round_trip_{random.randrange(1_000_000)}"
     input_series = input_series.sort_values().reset_index(drop=True)
     df = pandas.DataFrame(
@@ -73,10 +89,53 @@ def test_series_round_trip(
         # https://github.com/googleapis/python-bigquery-pandas/issues/366
         {"test_col": input_series, "test_col2": input_series}
     )
-    method_under_test(df, table_id)
+    method_under_test(df, table_id, api_method=api_method)
     round_trip = bigquery_client.list_rows(table_id).to_dataframe()
     round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
     pandas.testing.assert_series_equal(
-        round_trip_series, input_series, check_exact=True,
+        round_trip_series, input_series, check_exact=True, check_names=False,
     )
+
+
+@pytest.mark.parametrize(
+    ["input_df", "table_schema"],
+    [
+        # Ensure that a DATE column can be written with datetime64[ns] dtype
+        # data. See:
+        # https://github.com/googleapis/python-bigquery-pandas/issues/362
+        (
+            pandas.DataFrame(
+                {
+                    "date_col": pandas.Series(
+                        ["2021-04-17", "1999-12-31", "2038-01-19"],
+                        dtype="datetime64[ns]",
+                    ),
+                }
+            ),
+            [{"name": "date_col", "type": "DATE"}],
+        ),
+        # TODO: Test with dbdate dtype.
+    ],
+)
+def test_dataframe_round_trip_with_table_schema(
+    method_under_test,
+    random_dataset_id,
+    bigquery_client,
+    input_df,
+    table_schema,
+    api_method,
+):
+    table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
+    method_under_test(
+        input_df, table_id, table_schema=table_schema, api_method=api_method
+    )
+    round_trip = bigquery_client.list_rows(table_id).to_dataframe(
+        dtypes=dict(zip(input_df.columns, input_df.dtypes))
+    )
+    # TODO: Need to sort by row number before comparing.
+    pandas.testing.assert_frame_equal(input_df, round_trip)
+    # round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
+    # pandas.testing.assert_series_equal(
+    #     round_trip_series, input_series, check_exact=True,
+    # )
From 907a1d5d4194666dda856ccd1929c54719a94b30 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 10 Nov 2021 17:38:09 -0600
Subject: [PATCH 02/18] add dbdate tests

---
 tests/system/test_to_gbq.py | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index 3d097a98..754469c1 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -28,7 +28,7 @@ def method_under_test(credentials, project_id):
 
 
 @pytest.mark.parametrize(
-    ["input_series"],
+    ["input_series", "skip_csv"],
     [
         # Ensure that 64-bit floating point numbers are unchanged.
         # See: https://github.com/pydata/pandas-gbq/issues/326
@@ -46,6 +46,7 @@ def method_under_test(credentials, project_id):
             ],
             name="test_col",
         ),
+        False,
     ),
     (
         pandas.Series(
@@ -60,6 +61,7 @@ def method_under_test(credentials, project_id):
             ],
             name="test_col",
         ),
+        False,
     ),
     (
         pandas.Series(
@@ -74,14 +76,20 @@ def method_under_test(credentials, project_id):
             ],
             name="empty_strings",
         ),
+        True,
     ),
 ],
 )
 def test_series_round_trip(
-    method_under_test, random_dataset_id, bigquery_client, input_series, api_method,
+    method_under_test,
+    random_dataset_id,
+    bigquery_client,
+    input_series,
+    api_method,
+    skip_csv,
 ):
-    if api_method == "load_csv" and input_series.name == "empty_strings":
-        pytest.skip("Loading empty string with CSV not supported.")
+    if api_method == "load_csv" and skip_csv:
+        pytest.skip("Loading with CSV not supported.")
     table_id = f"{random_dataset_id}.round_trip_{random.randrange(1_000_000)}"
     input_series = input_series.sort_values().reset_index(drop=True)
     df = pandas.DataFrame(
@@ -107,7 +115,7 @@ def test_series_round_trip(
 
 
 @pytest.mark.parametrize(
-    ["input_df", "table_schema"],
+    ["input_df", "table_schema", "skip_csv"],
     [
         # Ensure that a DATE column can be written with datetime64[ns] dtype
         # data. See:
@@ -122,8 +130,19 @@ def test_series_round_trip(
             }
         ),
         [{"name": "date_col", "type": "DATE"}],
+        True,
+    ),
+    (
+        pandas.DataFrame(
+            {
+                "date_col": pandas.Series(
+                    ["2021-04-17", "1999-12-31", "2038-01-19"], dtype="dbdate",
+                ),
+            }
+        ),
+        [{"name": "date_col", "type": "DATE"}],
+        False,
     ),
-    # TODO: Test with dbdate dtype.
 ],
 )
 def test_dataframe_round_trip_with_table_schema(
@@ -125,7 +144,10 @@ def test_dataframe_round_trip_with_table_schema(
     input_df,
     table_schema,
     api_method,
+    skip_csv,
 ):
+    if api_method == "load_csv" and skip_csv:
+        pytest.skip("Loading with CSV not supported.")
     table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
     method_under_test(
         input_df, table_id, table_schema=table_schema, api_method=api_method

From cf2a5bf7b2ac5543dc02d0ba0258cc02c04b8d40 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 09:52:01 -0600
Subject: [PATCH 03/18] test with db-dtypes only with newer pandas

---
 noxfile.py                  |  4 ++--
 owlbot.py                   |  8 +++++--
 setup.py                    |  6 +++--
 tests/system/test_to_gbq.py | 47 ++++++++++++++++++++++---------------
 4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index ed88b094..9c0b8abf 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -28,8 +28,8 @@
 BLACK_PATHS = ["docs", "pandas_gbq", "tests", "noxfile.py", "setup.py"]
 
 DEFAULT_PYTHON_VERSION = "3.8"
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
-UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
+SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
+UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
 
 CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute()
 
diff --git a/owlbot.py b/owlbot.py
index 76a17e40..71679dd4 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -29,12 +29,16 @@
 # ----------------------------------------------------------------------------
 
 extras = ["tqdm"]
+extras_by_python = {
+    "3.9": ["tqdm", "db-dtypes"],
+}
 templated_files = common.py_library(
-    unit_test_python_versions=["3.7", "3.8", "3.9"],
-    system_test_python_versions=["3.7", "3.8", "3.9"],
+    unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
+    system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
     cov_level=86,
     unit_test_extras=extras,
     system_test_extras=extras,
+    system_test_extras_by_python=extras_by_python,
     intersphinx_dependencies={
         "pandas": "https://pandas.pydata.org/pandas-docs/stable/",
         "pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/",
diff --git a/setup.py b/setup.py
index ce02f4e6..876bd4c0 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,6 @@ release_status = "Development Status :: 4 - Beta"
 dependencies = [
     "setuptools",
-    "db-dtypes >=0.3.0, <2.0.0dev",
     "numpy>=1.16.6",
     "pandas>=0.23.2",
     "pyarrow >=3.0.0, <7.0dev",
@@ -34,7 +33,10 @@
     # https://github.com/pydata/pandas-gbq/issues/343
     "google-cloud-bigquery[bqstorage,pandas]>=1.11.1,<3.0.0dev,!=2.4.*",
 ]
-extras = {"tqdm": "tqdm>=4.23.0"}
+extras = {
+    "tqdm": "tqdm>=4.23.0",
+    "db-dtypes": "db-dtypes >=0.3.0,<2.0.0",
+}
 
 # Setup boilerplate below this line.
 
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index 754469c1..bedd7fe9 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -9,6 +9,11 @@
 import pandas.testing
 import pytest
 
+try:
+    import db_dtypes
+except ImportError:
+    db_dtypes = None
+
 pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")
 
 
@@ -106,24 +111,24 @@ def test_series_round_trip(
     )
 
 
-@pytest.mark.parametrize(
-    ["input_df", "table_schema", "skip_csv"],
-    [
-        # Ensure that a DATE column can be written with datetime64[ns] dtype
-        # data. See:
-        # https://github.com/googleapis/python-bigquery-pandas/issues/362
-        (
-            pandas.DataFrame(
-                {
-                    "date_col": pandas.Series(
-                        ["2021-04-17", "1999-12-31", "2038-01-19"],
-                        dtype="datetime64[ns]",
-                    ),
-                }
-            ),
-            [{"name": "date_col", "type": "DATE"}],
-            True,
+DATAFRAME_ROUND_TRIPS = [
+    # Ensure that a DATE column can be written with datetime64[ns] dtype
+    # data. See:
+    # https://github.com/googleapis/python-bigquery-pandas/issues/362
+    (
+        pandas.DataFrame(
+            {
+                "date_col": pandas.Series(
+                    ["2021-04-17", "1999-12-31", "2038-01-19"], dtype="datetime64[ns]",
+                ),
+            }
         ),
+        [{"name": "date_col", "type": "DATE"}],
+        True,
+    ),
+]
+if db_dtypes is not None:
+    DATAFRAME_ROUND_TRIPS.append(
         (
             pandas.DataFrame(
                 {
@@ -134,8 +139,12 @@ def test_series_round_trip(
                 ),
             [{"name": "date_col", "type": "DATE"}],
             False,
-        ),
-    ],
+        )
+    )
+
+
+@pytest.mark.parametrize(
+    ["input_df", "table_schema", "skip_csv"], DATAFRAME_ROUND_TRIPS
 )
 def test_dataframe_round_trip_with_table_schema(

From 1ea8fda43c78d915ab2d0367809e10cca33ff970 Mon Sep 17 00:00:00 2001
From: Owl Bot
Date: Thu, 11 Nov 2021 15:55:20 +0000
Subject: [PATCH 04/18] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 CONTRIBUTING.rst | 10 ++++++----
 noxfile.py       |  6 +++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index bc37b498..90bd84f2 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -22,7 +22,7 @@ In order to add a feature:
   documentation.
 
 - The feature must work fully on the following CPython versions:
-  3.7, 3.8 and 3.9 on both UNIX and Windows.
+  3.7, 3.8, 3.9 and 3.10 on both UNIX and Windows.
 
 - The feature must not add unnecessary dependencies (where
   "unnecessary" is of course subjective, but new dependencies should
@@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests.
 
 - To run a single unit test::
 
-    $ nox -s unit-3.9 -- -k
+    $ nox -s unit-3.10 -- -k
 
   .. note::
 
@@ -143,12 +143,12 @@ Running System Tests
   $ nox -s system
 
   # Run a single system test
-  $ nox -s system-3.9 -- -k
+  $ nox -s system-3.10 -- -k
 
   .. note::
 
-      System tests are only configured to run under Python 3.7, 3.8 and 3.9.
+      System tests are only configured to run under Python 3.7, 3.8, 3.9 and 3.10.
       For expediency, we do not run them in older versions of Python 3.
 
 This alone will not run the tests. You'll need to change some local
@@ -224,10 +224,12 @@ We support:
 - `Python 3.7`_
 - `Python 3.8`_
 - `Python 3.9`_
+- `Python 3.10`_
 
 .. _Python 3.7: https://docs.python.org/3.7/
 .. _Python 3.8: https://docs.python.org/3.8/
 .. _Python 3.9: https://docs.python.org/3.9/
+.. _Python 3.10: https://docs.python.org/3.10/
 
 Supported versions can be found in our ``noxfile.py`` `config`_.
 
diff --git a/noxfile.py b/noxfile.py
index 9c0b8abf..825daf18 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -146,7 +146,11 @@ def system(session):
     # Install all test dependencies, then install this package into the
     # virtualenv's dist-packages.
     session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path)
-    session.install("-e", ".[tqdm]", "-c", constraints_path)
+    if session.python == "3.9":
+        extras = "[tqdm,db-dtypes]"
+    else:
+        extras = "[tqdm]"
+    session.install("-e", f".{extras}", "-c", constraints_path)
 
     # Run py.test against the system tests.
     if system_test_exists:

From b869a9f895b9f980afb3396f25212dcf9cb7a41f Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 09:57:21 -0600
Subject: [PATCH 05/18] sort by row number

---
 tests/system/test_to_gbq.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index bedd7fe9..4f315a77 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -158,15 +158,13 @@ def test_dataframe_round_trip_with_table_schema(
     if api_method == "load_csv" and skip_csv:
         pytest.skip("Loading with CSV not supported.")
     table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
+    input_df["row_num"] = input_df.index
+    input_df.sort_values("row_num", inplace=True)
     method_under_test(
         input_df, table_id, table_schema=table_schema, api_method=api_method
     )
     round_trip = bigquery_client.list_rows(table_id).to_dataframe(
         dtypes=dict(zip(input_df.columns, input_df.dtypes))
     )
-    # TODO: Need to sort by row number before comparing.
+    round_trip.sort_values("row_num", inplace=True)
     pandas.testing.assert_frame_equal(input_df, round_trip)
-    # round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
-    # pandas.testing.assert_series_equal(
-    #     round_trip_series, input_series, check_exact=True,
-    # )
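
The row_num sort added here matters because BigQuery's list_rows gives no
ordering guarantee for rows loaded into a table. A small standalone sketch of
the comparison pattern, independent of BigQuery:

    import pandas

    local = pandas.DataFrame({"row_num": [0, 1, 2], "value": ["a", "b", "c"]})
    remote = local.sample(frac=1)  # simulate rows returned in arbitrary order

    # Pinning an explicit ordering column makes the frames comparable again.
    remote = remote.sort_values("row_num").reset_index(drop=True)
    pandas.testing.assert_frame_equal(local, remote)
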
From 625df5ae2d680282f3c3dfde49dfa602590a4b35 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 15:49:16 -0600
Subject: [PATCH 06/18] fix: allow strings when writing to DATE and floats
 when writing to NUMERIC

This improves `api_method="load_parquet"` compatibility with the
previous `api_method="load_csv"` behavior.
---
 owlbot.py                   |  4 ----
 pandas_gbq/load.py          | 44 ++++++++++++++++++++++++++++++++++
 setup.py                    |  4 ++--
 testing/constraints-3.7.txt |  3 ++-
 tests/system/test_to_gbq.py | 48 +++++++++++++++++++++++++++++++++----
 5 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/owlbot.py b/owlbot.py
index 71679dd4..7fa43dbd 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -29,16 +29,12 @@
 # ----------------------------------------------------------------------------
 
 extras = ["tqdm"]
-extras_by_python = {
-    "3.9": ["tqdm", "db-dtypes"],
-}
 templated_files = common.py_library(
     unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
     system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
     cov_level=86,
     unit_test_extras=extras,
     system_test_extras=extras,
-    system_test_extras_by_python=extras_by_python,
     intersphinx_dependencies={
         "pandas": "https://pandas.pydata.org/pandas-docs/stable/",
         "pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/",
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
index 69210e41..a886fce5 100644
--- a/pandas_gbq/load.py
+++ b/pandas_gbq/load.py
@@ -4,9 +4,11 @@
 
 """Helper methods for loading data into BigQuery"""
 
+import decimal
 import io
 from typing import Any, Callable, Dict, List, Optional
 
+import db_dtypes
 import pandas
 import pyarrow.lib
 from google.cloud import bigquery
@@ -56,6 +58,47 @@ def split_dataframe(dataframe, chunksize=None):
         yield remaining_rows, chunk
 
 
+def cast_dataframe_for_parquet(
+    dataframe: pandas.DataFrame, schema: Optional[Dict[str, Any]],
+) -> pandas.DataFrame:
+    """Cast columns to needed dtype when writing parquet files.
+
+    See: https://github.com/googleapis/python-bigquery-pandas/issues/421
+    """
+    columns = schema.get("fields", [])
+    for column in columns:
+        # Schema can be a superset of the columns in the dataframe, so ignore
+        # columns that aren't present.
+        column_name = column.get("name")
+        if column_name not in dataframe.columns:
+            continue
+
+        # Skip array columns.
+        if column.get("mode", "NULLABLE").upper() not in {"REQUIRED", "NULLABLE"}:
+            continue
+
+        column_type = column.get("type", "").upper()
+        if (
+            column_type == "DATE"
+            and dataframe[column_name].dtype != db_dtypes.DateDtype()
+        ):
+            # Construct converted column manually, because I can't use
+            # .astype() with DateDtype. With .astype(), I get the error:
+            #
+            # TypeError: Cannot interpret '' as a data type
+            cast_column = pandas.Series(
+                dataframe[column_name], dtype=db_dtypes.DateDtype()
+            )
+        elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
+            cast_column = dataframe[column_name].map(decimal.Decimal)
+        else:
+            cast_column = None
+
+        if cast_column is not None:
+            dataframe = dataframe.assign(**{column_name: cast_column})
+    return dataframe
+
+
 def load_parquet(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
@@ -70,6 +113,7 @@ def load_parquet(
     if schema is not None:
         schema = pandas_gbq.schema.remove_policy_tags(schema)
         job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
+        dataframe = cast_dataframe_for_parquet(dataframe, schema)
 
     try:
         client.load_table_from_dataframe(
diff --git a/setup.py b/setup.py
index 876bd4c0..28c81eee 100644
--- a/setup.py
+++ b/setup.py
@@ -23,8 +23,9 @@ release_status = "Development Status :: 4 - Beta"
 dependencies = [
     "setuptools",
+    "db-dtypes >=0.3.0,<2.0.0",
     "numpy>=1.16.6",
-    "pandas>=0.23.2",
+    "pandas>=0.24.2",
     "pyarrow >=3.0.0, <7.0dev",
     "pydata-google-auth",
     "google-auth",
@@ -35,7 +36,6 @@
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",
-    "db-dtypes": "db-dtypes >=0.3.0,<2.0.0",
 }
 
 # Setup boilerplate below this line.
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt
index 7c67d275..7920656a 100644
--- a/testing/constraints-3.7.txt
+++ b/testing/constraints-3.7.txt
@@ -5,12 +5,13 @@
 #
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
+db-dtypes==0.3.0
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
 google-cloud-bigquery==1.11.1
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
-pandas==0.23.2
+pandas==0.24.2
 pyarrow==3.0.0
 pydata-google-auth==0.1.2
 tqdm==4.23.0
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index 4f315a77..ee553500 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -2,6 +2,8 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
+import datetime
+import decimal
 import functools
 import random
 
@@ -118,25 +120,60 @@ def test_series_round_trip(
     (
         pandas.DataFrame(
             {
+                "row_num": [0, 1, 2],
                 "date_col": pandas.Series(
                     ["2021-04-17", "1999-12-31", "2038-01-19"], dtype="datetime64[ns]",
                 ),
             }
         ),
+        None,
         [{"name": "date_col", "type": "DATE"}],
         True,
     ),
+    # Loading a DATE column should work for string objects. See:
+    # https://github.com/googleapis/python-bigquery-pandas/issues/421
+    (
+        pandas.DataFrame(
+            {"row_num": [123], "date_col": ["2021-12-12"]},
+            columns=["row_num", "date_col"],
+        ),
+        pandas.DataFrame(
+            {"row_num": [123], "date_col": [datetime.date(2021, 12, 12)]},
+            columns=["row_num", "date_col"],
+        ),
+        [{"name": "row_num", "type": "INTEGER"}, {"name": "date_col", "type": "DATE"}],
+        False,
+    ),
+    # Loading a NUMERIC column should work for floating point objects. See:
+    # https://github.com/googleapis/python-bigquery-pandas/issues/421
+    (
+        pandas.DataFrame(
+            {"row_num": [123], "num_col": [1.25]}, columns=["row_num", "num_col"],
+        ),
+        pandas.DataFrame(
+            {"row_num": [123], "num_col": [decimal.Decimal("1.25")]},
+            columns=["row_num", "num_col"],
+        ),
+        [
+            {"name": "row_num", "type": "INTEGER"},
+            {"name": "num_col", "type": "NUMERIC"},
+        ],
+        False,
+    ),
 ]
+
 if db_dtypes is not None:
     DATAFRAME_ROUND_TRIPS.append(
         (
             pandas.DataFrame(
                 {
+                    "row_num": [0, 1, 2],
                     "date_col": pandas.Series(
                         ["2021-04-17", "1999-12-31", "2038-01-19"], dtype="dbdate",
                     ),
                 }
             ),
+            None,
             [{"name": "date_col", "type": "DATE"}],
             False,
         )
     )
 
 
 @pytest.mark.parametrize(
-    ["input_df", "table_schema", "skip_csv"], DATAFRAME_ROUND_TRIPS
+    ["input_df", "expected_df", "table_schema", "skip_csv"], DATAFRAME_ROUND_TRIPS
 )
 def test_dataframe_round_trip_with_table_schema(
     method_under_test,
     random_dataset_id,
     bigquery_client,
     input_df,
+    expected_df,
     table_schema,
     api_method,
     skip_csv,
 ):
     if api_method == "load_csv" and skip_csv:
         pytest.skip("Loading with CSV not supported.")
+    if expected_df is None:
+        expected_df = input_df
     table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
-    input_df["row_num"] = input_df.index
-    input_df.sort_values("row_num", inplace=True)
     method_under_test(
         input_df, table_id, table_schema=table_schema, api_method=api_method
     )
     round_trip = bigquery_client.list_rows(table_id).to_dataframe(
-        dtypes=dict(zip(input_df.columns, input_df.dtypes))
+        dtypes=dict(zip(expected_df.columns, expected_df.dtypes))
     )
     round_trip.sort_values("row_num", inplace=True)
-    pandas.testing.assert_frame_equal(input_df, round_trip)
+    pandas.testing.assert_frame_equal(expected_df, round_trip)
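
A short sketch of what the new cast_dataframe_for_parquet helper does to a
frame before the parquet write; the column names below are illustrative:

    import pandas

    from pandas_gbq import load

    df = pandas.DataFrame({"date_col": ["2021-12-12"], "num_col": [1.25]})
    schema = {
        "fields": [
            {"name": "date_col", "type": "DATE"},
            {"name": "num_col", "type": "NUMERIC"},
        ]
    }

    cast = load.cast_dataframe_for_parquet(df, schema)
    # date_col now uses db_dtypes.DateDtype() and num_col holds
    # decimal.Decimal values, matching what the parquet-based load path
    # expects for DATE and NUMERIC columns.
    print(cast.dtypes)
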
From bd3604e7fc0ff2b34c226c9b66e1902751ec2602 Mon Sep 17 00:00:00 2001
From: Owl Bot
Date: Thu, 11 Nov 2021 21:53:23 +0000
Subject: [PATCH 07/18] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 noxfile.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 825daf18..9c0b8abf 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -146,11 +146,7 @@ def system(session):
     # Install all test dependencies, then install this package into the
     # virtualenv's dist-packages.
     session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path)
-    if session.python == "3.9":
-        extras = "[tqdm,db-dtypes]"
-    else:
-        extras = "[tqdm]"
-    session.install("-e", f".{extras}", "-c", constraints_path)
+    session.install("-e", ".[tqdm]", "-c", constraints_path)
 
     # Run py.test against the system tests.
     if system_test_exists:

From 19df618d9728eef07a9d70bca6d9600dc440ac63 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 15:53:53 -0600
Subject: [PATCH 08/18] require db-dtypes

---
 tests/system/test_to_gbq.py | 39 ++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index ee553500..e7c93daa 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -7,15 +7,11 @@
 import functools
 import random
 
+import db_dtypes
 import pandas
 import pandas.testing
 import pytest
 
-try:
-    import db_dtypes
-except ImportError:
-    db_dtypes = None
-
 pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")
 
 
@@ -130,6 +126,22 @@ def test_series_round_trip(
         [{"name": "date_col", "type": "DATE"}],
         True,
     ),
+    (
+        (
+            pandas.DataFrame(
+                {
+                    "row_num": [0, 1, 2],
+                    "date_col": pandas.Series(
+                        ["2021-04-17", "1999-12-31", "2038-01-19"],
+                        dtype=db_dtypes.DateDtype(),
+                    ),
+                }
+            ),
+            None,
+            [{"name": "date_col", "type": "DATE"}],
+            False,
+        )
+    ),
     # Loading a DATE column should work for string objects. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/421
     (
@@ -162,23 +174,6 @@ def test_series_round_trip(
     ),
 ]
 
-if db_dtypes is not None:
-    DATAFRAME_ROUND_TRIPS.append(
-        (
-            pandas.DataFrame(
-                {
-                    "row_num": [0, 1, 2],
-                    "date_col": pandas.Series(
-                        ["2021-04-17", "1999-12-31", "2038-01-19"], dtype="dbdate",
-                    ),
-                }
-            ),
-            None,
-            [{"name": "date_col", "type": "DATE"}],
-            False,
-        )
-    )
 

From ab85d8a51ea6423d796348302a54f5b80b72fabf Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 16:33:27 -0600
Subject: [PATCH 09/18] add unit tests for dataframe conversion

---
 tests/unit/test_load.py | 112 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py
index a32d2d9e..924859e9 100644
--- a/tests/unit/test_load.py
+++ b/tests/unit/test_load.py
@@ -4,12 +4,16 @@
 
 # -*- coding: utf-8 -*-
 
-import textwrap
+import datetime
+import decimal
 from io import StringIO
+import textwrap
 from unittest import mock
 
+import db_dtypes
 import numpy
 import pandas
+import pandas.testing
 import pytest
 
 from pandas_gbq.features import FEATURES
@@ -137,3 +141,109 @@ def test_load_chunks_omits_policy_tags(
 def test_load_chunks_with_invalid_api_method():
     with pytest.raises(ValueError, match="Got unexpected api_method:"):
         load.load_chunks(None, None, None, api_method="not_a_thing")
+
+
+@pytest.mark.parametrize(
+    ("numeric_type",),
+    (
+        ("NUMERIC",),
+        ("DECIMAL",),
+        ("BIGNUMERIC",),
+        ("BIGDECIMAL",),
+        ("numeric",),
+        ("decimal",),
+        ("bignumeric",),
+        ("bigdecimal",),
+    ),
+)
+def test_cast_dataframe_for_parquet_w_float_numeric(numeric_type):
+    dataframe = pandas.DataFrame(
+        {
+            "row_num": [0, 1, 2],
+            "num_col": pandas.Series(
+                # Very much not recommended as the whole point of NUMERIC is to
+                # be more accurate than a floating point number, but tested to
+                # keep compatibility with CSV-based uploads. See:
+                # https://github.com/googleapis/python-bigquery-pandas/issues/421
+                [1.25, -1.25, 42.5],
+                dtype="float64",
+            ),
+            "row_num_2": [0, 1, 2],
+        },
+        # Use multiple columns to ensure column order is maintained.
+        columns=["row_num", "num_col", "row_num_2"],
+    )
+    schema = {
+        "fields": [
+            {"name": "num_col", "type": numeric_type},
+            {"name": "not_in_df", "type": "IGNORED"},
+        ]
+    }
+    result = load.cast_dataframe_for_parquet(dataframe, schema)
+    expected = pandas.DataFrame(
+        {
+            "row_num": [0, 1, 2],
+            "num_col": pandas.Series(
+                [decimal.Decimal(1.25), decimal.Decimal(-1.25), decimal.Decimal(42.5)],
+                dtype="object",
+            ),
+            "row_num_2": [0, 1, 2],
+        },
+        columns=["row_num", "num_col", "row_num_2"],
+    )
+    pandas.testing.assert_frame_equal(result, expected)
+
+
+def test_cast_dataframe_for_parquet_w_string_date():
+    dataframe = pandas.DataFrame(
+        {
+            "row_num": [0, 1, 2],
+            "date_col": pandas.Series(
+                ["2021-04-17", "1999-12-31", "2038-01-19"], dtype="object",
+            ),
+            "row_num_2": [0, 1, 2],
+        },
+        # Use multiple columns to ensure column order is maintained.
+        columns=["row_num", "date_col", "row_num_2"],
+    )
+    schema = {
+        "fields": [
+            {"name": "date_col", "type": "DATE"},
+            {"name": "not_in_df", "type": "IGNORED"},
+        ]
+    }
+    result = load.cast_dataframe_for_parquet(dataframe, schema)
+    expected = pandas.DataFrame(
+        {
+            "row_num": [0, 1, 2],
+            "date_col": pandas.Series(
+                ["2021-04-17", "1999-12-31", "2038-01-19"], dtype=db_dtypes.DateDtype(),
+            ),
+            "row_num_2": [0, 1, 2],
+        },
+        columns=["row_num", "date_col", "row_num_2"],
+    )
+    pandas.testing.assert_frame_equal(result, expected)
+
+
+def test_cast_dataframe_for_parquet_ignores_repeated_fields():
+    dataframe = pandas.DataFrame(
+        {
+            "row_num": [0, 1, 2],
+            "repeated_col": pandas.Series(
+                [
+                    [datetime.date(2021, 4, 17)],
+                    [datetime.date(1999, 12, 31)],
+                    [datetime.date(2038, 1, 19)],
+                ],
+                dtype="object",
+            ),
+            "row_num_2": [0, 1, 2],
+        },
+        # Use multiple columns to ensure column order is maintained.
+        columns=["row_num", "repeated_col", "row_num_2"],
+    )
+    expected = dataframe.copy()
+    schema = {"fields": [{"name": "date_col", "type": "DATE", "mode": "REPEATED"}]}
+    result = load.cast_dataframe_for_parquet(dataframe, schema)
+    pandas.testing.assert_frame_equal(result, expected)

From a08f90de976fdc01f9c0779470593345c0454b37 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 16:51:40 -0600
Subject: [PATCH 10/18] fix unit tests

---
 ci/requirements-3.7-0.23.2.conda  | 1 +
 ci/requirements-3.9-NIGHTLY.conda | 1 +
 owlbot.py                         | 2 +-
 pandas_gbq/load.py                | 3 ++-
 4 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/ci/requirements-3.7-0.23.2.conda b/ci/requirements-3.7-0.23.2.conda
index 1da6d226..82f4e7b9 100644
--- a/ci/requirements-3.7-0.23.2.conda
+++ b/ci/requirements-3.7-0.23.2.conda
@@ -1,5 +1,6 @@
 codecov
 coverage
+db-dtypes==0.3.0
 fastavro
 flake8
 numpy==1.16.6
diff --git a/ci/requirements-3.9-NIGHTLY.conda b/ci/requirements-3.9-NIGHTLY.conda
index ccaa87e5..5a3e9fb7 100644
--- a/ci/requirements-3.9-NIGHTLY.conda
+++ b/ci/requirements-3.9-NIGHTLY.conda
@@ -1,3 +1,4 @@
+db-dtypes
 pydata-google-auth
 google-cloud-bigquery
 google-cloud-bigquery-storage
diff --git a/owlbot.py b/owlbot.py
index 7fa43dbd..c69d54de 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -32,7 +32,7 @@
 templated_files = common.py_library(
     unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
     system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
-    cov_level=86,
+    cov_level=88,
     unit_test_extras=extras,
     system_test_extras=extras,
     intersphinx_dependencies={
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
index a886fce5..f6b88454 100644
--- a/pandas_gbq/load.py
+++ b/pandas_gbq/load.py
@@ -80,7 +80,8 @@ def cast_dataframe_for_parquet(
         column_type = column.get("type", "").upper()
         if (
             column_type == "DATE"
-            and dataframe[column_name].dtype != db_dtypes.DateDtype()
+            # Use extension dtype first so that it uses the correct equality operator.
+            and db_dtypes.DateDtype() != dataframe[column_name].dtype
         ):
From 442695774f4f2849095e9e9b794e311df2cc6c9a Mon Sep 17 00:00:00 2001
From: Owl Bot
Date: Thu, 11 Nov 2021 22:53:38 +0000
Subject: [PATCH 11/18] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 .coveragerc | 2 +-
 noxfile.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 61285af5..ba50bf32 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -22,7 +22,7 @@ omit =
   google/cloud/__init__.py
 
 [report]
-fail_under = 86
+fail_under = 88
 show_missing = True
 exclude_lines =
     # Re-enable the standard pragma
diff --git a/noxfile.py b/noxfile.py
index 9c0b8abf..2feeccdc 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -175,7 +175,7 @@ def cover(session):
     test runs (not system test runs), and then erases coverage data.
     """
     session.install("coverage", "pytest-cov")
-    session.run("coverage", "report", "--show-missing", "--fail-under=86")
+    session.run("coverage", "report", "--show-missing", "--fail-under=88")
     session.run("coverage", "erase")

From 15d7b516f4818149e674bbd09421218491ab316c Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Thu, 11 Nov 2021 17:11:03 -0600
Subject: [PATCH 12/18] remove 'default' from system tests. redundant with
 load_parquet

---
 tests/system/test_to_gbq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index e7c93daa..ed5d54de 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -16,7 +16,7 @@
 pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")
 
 
-@pytest.fixture(params=["default", "load_parquet", "load_csv"])
+@pytest.fixture(params=["load_parquet", "load_csv"])
 def api_method(request):
     return request.param

From 4ddcf9d45fe8878bd8ea5b42e2e68d61a2aac18d Mon Sep 17 00:00:00 2001
From: Owl Bot
Date: Tue, 16 Nov 2021 23:01:09 +0000
Subject: [PATCH 13/18] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 noxfile.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 278635df..2feeccdc 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -146,11 +146,7 @@ def system(session):
     # Install all test dependencies, then install this package into the
     # virtualenv's dist-packages.
     session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path)
-    if session.python == "3.9":
-        extras = "[tqdm,db-dtypes]"
-    else:
-        extras = "[tqdm]"
-    session.install("-e", f".{extras}", "-c", constraints_path)
+    session.install("-e", ".[tqdm]", "-c", constraints_path)
 
     # Run py.test against the system tests.
     if system_test_exists:

From 95f051fb8f121443274326c95fbc073e2675895a Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 17 Nov 2021 10:28:46 -0600
Subject: [PATCH 14/18] correct repeated_col name

---
 tests/unit/test_load.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py
index 924859e9..c50a58f4 100644
--- a/tests/unit/test_load.py
+++ b/tests/unit/test_load.py
@@ -244,6 +244,6 @@ def test_cast_dataframe_for_parquet_ignores_repeated_fields():
         columns=["row_num", "repeated_col", "row_num_2"],
     )
     expected = dataframe.copy()
-    schema = {"fields": [{"name": "date_col", "type": "DATE", "mode": "REPEATED"}]}
+    schema = {"fields": [{"name": "repeated_col", "type": "DATE", "mode": "REPEATED"}]}
     result = load.cast_dataframe_for_parquet(dataframe, schema)
     pandas.testing.assert_frame_equal(result, expected)

From f8dddb2803dbc6cbd87cee90b7d76ce1ba2f02eb Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 17 Nov 2021 11:09:03 -0600
Subject: [PATCH 15/18] used namedtuple in tests

---
 tests/system/test_to_gbq.py | 115 ++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 52 deletions(-)

diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index e876385d..4421f3be 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -4,6 +4,7 @@
 
 import datetime
 import decimal
+import collections
 import functools
 import random
 
 import db_dtypes
 import pandas
 import pandas.testing
 import pytest
 
 pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")
@@ -35,13 +31,20 @@ def method_under_test(credentials, project_id):
     )
 
 
+SeriesRoundTripTestCase = collections.namedtuple(
+    "SeriesRoundTripTestCase",
+    ["input_series", "api_methods"],
+    defaults=[None, {"load_csv", "load_parquet"}],
+)
+
+
 @pytest.mark.parametrize(
-    ["input_series", "skip_csv"],
+    ["input_series", "api_methods"],
     [
         # Ensure that 64-bit floating point numbers are unchanged.
         # See: https://github.com/pydata/pandas-gbq/issues/326
-        (
-            pandas.Series(
+        SeriesRoundTripTestCase(
+            input_series=pandas.Series(
                 [
                     0.14285714285714285,
                     0.4406779661016949,
@@ -54,10 +57,9 @@ def method_under_test(credentials, project_id):
                 ],
                 name="test_col",
             ),
-            False,
         ),
-        (
-            pandas.Series(
+        SeriesRoundTripTestCase(
+            input_series=pandas.Series(
                 [
                     "abc",
                     "defg",
@@ -69,10 +71,9 @@ def method_under_test(credentials, project_id):
                 ],
                 name="test_col",
             ),
-            False,
         ),
-        (
-            pandas.Series(
+        SeriesRoundTripTestCase(
+            input_series=pandas.Series(
                 [
                     "abc",
                     "defg",
@@ -84,7 +85,13 @@ def method_under_test(credentials, project_id):
                 ],
                 name="empty_strings",
             ),
-            True,
+            # BigQuery CSV loader uses empty string as the "null marker" by
+            # default. Potentially one could choose a rarely used character or
+            # string as the null marker to disambiguate null from empty string,
+            # but then that string couldn't be loaded.
+            # TODO: Revisit when custom load job configuration is supported.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/425
+            api_methods={"load_parquet"},
         ),
     ],
 )
@@ -94,10 +101,10 @@ def test_series_round_trip(
     bigquery_client,
     input_series,
     api_method,
-    skip_csv,
+    api_methods,
 ):
-    if api_method == "load_csv" and skip_csv:
-        pytest.skip("Loading with CSV not supported.")
+    if api_method not in api_methods:
+        pytest.skip(f"{api_method} not supported.")
     table_id = f"{random_dataset_id}.round_trip_{random.randrange(1_000_000)}"
     input_series = input_series.sort_values().reset_index(drop=True)
     df = pandas.DataFrame(
@@ -114,12 +121,18 @@ def test_series_round_trip(
     )
 
 
+DataFrameRoundTripTestCase = collections.namedtuple(
+    "DataFrameRoundTripTestCase",
+    ["input_df", "expected_df", "table_schema", "api_methods"],
+    defaults=[None, None, [], {"load_csv", "load_parquet"}],
+)
+
 DATAFRAME_ROUND_TRIPS = [
     # Ensure that a DATE column can be written with datetime64[ns] dtype
     # data. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/362
-    (
-        pandas.DataFrame(
+    DataFrameRoundTripTestCase(
+        input_df=pandas.DataFrame(
             {
                 "row_num": [0, 1, 2],
                 "date_col": pandas.Series(
@@ -127,61 +140,59 @@ def test_series_round_trip(
                 ),
             }
         ),
-        table_schema=[{"name": "date_col", "type": "DATE"}],
+        table_schema=[{"name": "date_col", "type": "DATE"}],
+        # Skip CSV because the pandas CSV writer includes time when writing
+        # datetime64 values.
+        api_methods={"load_parquet"},
     ),
-    (
-        pandas.DataFrame(
-            {
-                "row_num": [0, 1, 2],
-                "date_col": pandas.Series(
-                    ["2021-04-17", "1999-12-31", "2038-01-19"],
-                    dtype=db_dtypes.DateDtype(),
-                ),
-            }
-        ),
-        None,
-        [{"name": "date_col", "type": "DATE"}],
-        False,
+    DataFrameRoundTripTestCase(
+        input_df=pandas.DataFrame(
+            {
+                "row_num": [0, 1, 2],
+                "date_col": pandas.Series(
+                    ["2021-04-17", "1999-12-31", "2038-01-19"],
+                    dtype=db_dtypes.DateDtype(),
+                ),
+            }
+        ),
+        table_schema=[{"name": "date_col", "type": "DATE"}],
     ),
     # Loading a DATE column should work for string objects. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/421
-    (
-        pandas.DataFrame(
+    DataFrameRoundTripTestCase(
+        input_df=pandas.DataFrame(
             {"row_num": [123], "date_col": ["2021-12-12"]},
             columns=["row_num", "date_col"],
         ),
-        pandas.DataFrame(
+        expected_df=pandas.DataFrame(
             {"row_num": [123], "date_col": [datetime.date(2021, 12, 12)]},
             columns=["row_num", "date_col"],
         ),
-        [{"name": "row_num", "type": "INTEGER"}, {"name": "date_col", "type": "DATE"}],
-        False,
+        table_schema=[
+            {"name": "row_num", "type": "INTEGER"},
+            {"name": "date_col", "type": "DATE"},
+        ],
     ),
     # Loading a NUMERIC column should work for floating point objects. See:
     # https://github.com/googleapis/python-bigquery-pandas/issues/421
-    (
-        pandas.DataFrame(
+    DataFrameRoundTripTestCase(
+        input_df=pandas.DataFrame(
             {"row_num": [123], "num_col": [1.25]}, columns=["row_num", "num_col"],
         ),
-        pandas.DataFrame(
+        expected_df=pandas.DataFrame(
             {"row_num": [123], "num_col": [decimal.Decimal("1.25")]},
             columns=["row_num", "num_col"],
         ),
-        [
+        table_schema=[
             {"name": "row_num", "type": "INTEGER"},
             {"name": "num_col", "type": "NUMERIC"},
         ],
-        False,
     ),
 ]
 
 
 @pytest.mark.parametrize(
-    ["input_df", "expected_df", "table_schema", "skip_csv"], DATAFRAME_ROUND_TRIPS
+    ["input_df", "expected_df", "table_schema", "api_methods"], DATAFRAME_ROUND_TRIPS
 )
 def test_dataframe_round_trip_with_table_schema(
     method_under_test,
     random_dataset_id,
     bigquery_client,
     input_df,
     expected_df,
     table_schema,
     api_method,
-    skip_csv,
+    api_methods,
 ):
-    if api_method == "load_csv" and skip_csv:
-        pytest.skip("Loading with CSV not supported.")
+    if api_method not in api_methods:
+        pytest.skip(f"{api_method} not supported.")
     if expected_df is None:
         expected_df = input_df
     table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
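
The refactor leans on namedtuple's defaults argument (Python 3.7+), which
binds to the rightmost fields, so a test case only has to spell out what
differs from the common case. A sketch of that behavior:

    import collections

    Case = collections.namedtuple(
        "Case",
        ["input_series", "api_methods"],
        defaults=[None, {"load_csv", "load_parquet"}],
    )

    # Fields without an explicit value fall back to the defaults, so a case
    # that works under both loaders only has to name its series.
    case = Case(input_series="placeholder series")
    assert case.api_methods == {"load_csv", "load_parquet"}
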
+ if column.get("mode", "NULLABLE").upper() == "REPEATED": continue column_type = column.get("type", "").upper() diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index c50a58f4..8e18cfb9 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -247,3 +247,11 @@ def test_cast_dataframe_for_parquet_ignores_repeated_fields(): schema = {"fields": [{"name": "repeated_col", "type": "DATE", "mode": "REPEATED"}]} result = load.cast_dataframe_for_parquet(dataframe, schema) pandas.testing.assert_frame_equal(result, expected) + + +def test_cast_dataframe_for_parquet_w_null_fields(): + dataframe = pandas.DataFrame({"int_col": [0, 1, 2], "str_col": ["a", "b", "c"]}) + expected = dataframe.copy() + schema = {"fields": None} + result = load.cast_dataframe_for_parquet(dataframe, schema) + pandas.testing.assert_frame_equal(result, expected) From 92837e641a2231cb517280d7ce2691959f2e11c9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 17 Nov 2021 14:37:28 -0600 Subject: [PATCH 17/18] update tests since pandas 0.24+ is required --- tests/system/test_gbq.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index a8d6bd0d..f268a85d 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -26,8 +26,6 @@ TABLE_ID = "new_test" PANDAS_VERSION = pkg_resources.parse_version(pandas.__version__) -NULLABLE_INT_PANDAS_VERSION = pkg_resources.parse_version("0.24.0") -NULLABLE_INT_MESSAGE = "Require pandas 0.24+ in order to use nullable integer type." def test_imports(): @@ -173,9 +171,6 @@ def test_should_properly_handle_valid_integers(self, project_id): tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]})) def test_should_properly_handle_nullable_integers(self, project_id): - if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION: - pytest.skip(msg=NULLABLE_INT_MESSAGE) - query = """SELECT * FROM UNNEST([1, NULL]) AS nullable_integer """ @@ -188,9 +183,7 @@ def test_should_properly_handle_nullable_integers(self, project_id): ) tm.assert_frame_equal( df, - DataFrame( - {"nullable_integer": pandas.Series([1, pandas.NA], dtype="Int64")} - ), + DataFrame({"nullable_integer": pandas.Series([1, None], dtype="Int64")}), ) def test_should_properly_handle_valid_longs(self, project_id): @@ -204,9 +197,6 @@ def test_should_properly_handle_valid_longs(self, project_id): tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]})) def test_should_properly_handle_nullable_longs(self, project_id): - if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION: - pytest.skip(msg=NULLABLE_INT_MESSAGE) - query = """SELECT * FROM UNNEST([1 << 62, NULL]) AS nullable_long """ @@ -219,15 +209,10 @@ def test_should_properly_handle_nullable_longs(self, project_id): ) tm.assert_frame_equal( df, - DataFrame( - {"nullable_long": pandas.Series([1 << 62, pandas.NA], dtype="Int64")} - ), + DataFrame({"nullable_long": pandas.Series([1 << 62, None], dtype="Int64")}), ) def test_should_properly_handle_null_integers(self, project_id): - if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION: - pytest.skip(msg=NULLABLE_INT_MESSAGE) - query = "SELECT CAST(NULL AS INT64) AS null_integer" df = gbq.read_gbq( query, @@ -237,7 +222,7 @@ def test_should_properly_handle_null_integers(self, project_id): dtypes={"null_integer": "Int64"}, ) tm.assert_frame_equal( - df, DataFrame({"null_integer": pandas.Series([pandas.NA], dtype="Int64")}), + df, DataFrame({"null_integer": pandas.Series([None], dtype="Int64")}), ) def 
From 92837e641a2231cb517280d7ce2691959f2e11c9 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 17 Nov 2021 14:37:28 -0600
Subject: [PATCH 17/18] update tests since pandas 0.24+ is required

---
 tests/system/test_gbq.py | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index a8d6bd0d..f268a85d 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -26,8 +26,6 @@
 TABLE_ID = "new_test"
 PANDAS_VERSION = pkg_resources.parse_version(pandas.__version__)
-NULLABLE_INT_PANDAS_VERSION = pkg_resources.parse_version("0.24.0")
-NULLABLE_INT_MESSAGE = "Require pandas 0.24+ in order to use nullable integer type."
 
 
 def test_imports():
@@ -173,9 +171,6 @@ def test_should_properly_handle_valid_integers(self, project_id):
         tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}))
 
     def test_should_properly_handle_nullable_integers(self, project_id):
-        if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION:
-            pytest.skip(msg=NULLABLE_INT_MESSAGE)
-
         query = """SELECT * FROM
                     UNNEST([1, NULL]) AS nullable_integer
                 """
@@ -188,9 +183,7 @@
         )
         tm.assert_frame_equal(
             df,
-            DataFrame(
-                {"nullable_integer": pandas.Series([1, pandas.NA], dtype="Int64")}
-            ),
+            DataFrame({"nullable_integer": pandas.Series([1, None], dtype="Int64")}),
         )
 
     def test_should_properly_handle_valid_longs(self, project_id):
@@ -204,9 +197,6 @@
         tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}))
 
     def test_should_properly_handle_nullable_longs(self, project_id):
-        if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION:
-            pytest.skip(msg=NULLABLE_INT_MESSAGE)
-
         query = """SELECT * FROM
                     UNNEST([1 << 62, NULL]) AS nullable_long
                 """
@@ -219,15 +209,10 @@
         )
         tm.assert_frame_equal(
             df,
-            DataFrame(
-                {"nullable_long": pandas.Series([1 << 62, pandas.NA], dtype="Int64")}
-            ),
+            DataFrame({"nullable_long": pandas.Series([1 << 62, None], dtype="Int64")}),
         )
 
     def test_should_properly_handle_null_integers(self, project_id):
-        if PANDAS_VERSION < NULLABLE_INT_PANDAS_VERSION:
-            pytest.skip(msg=NULLABLE_INT_MESSAGE)
-
         query = "SELECT CAST(NULL AS INT64) AS null_integer"
         df = gbq.read_gbq(
             query,
@@ -237,7 +222,7 @@
             dtypes={"null_integer": "Int64"},
         )
         tm.assert_frame_equal(
-            df, DataFrame({"null_integer": pandas.Series([pandas.NA], dtype="Int64")}),
+            df, DataFrame({"null_integer": pandas.Series([None], dtype="Int64")}),
         )
 
     def test_should_properly_handle_valid_floats(self, project_id):

From f9817053c46b4152b56dd8204c0958d65b89e960 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Wed, 17 Nov 2021 16:08:47 -0600
Subject: [PATCH 18/18] update conda test pandas version

---
 .circleci/config.yml                                            | 2 +-
 ...uirements-3.7-0.23.2.conda => requirements-3.7-0.24.2.conda} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename ci/{requirements-3.7-0.23.2.conda => requirements-3.7-0.24.2.conda} (100%)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ec4d7448..4c378b3f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,7 +10,7 @@ jobs:
       - image: continuumio/miniconda3
         environment:
           PYTHON: "3.7"
-          PANDAS: "0.23.2"
+          PANDAS: "0.24.2"
     steps:
       - checkout
       - run: ci/config_auth.sh
diff --git a/ci/requirements-3.7-0.23.2.conda b/ci/requirements-3.7-0.24.2.conda
similarity index 100%
rename from ci/requirements-3.7-0.23.2.conda
rename to ci/requirements-3.7-0.24.2.conda
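
The last two commits both hinge on the pandas floor moving to 0.24, the first
release with the nullable Int64 extension dtype that the updated tests now
assume unconditionally. A closing sketch of that behavior:

    import pandas

    # With the nullable extension dtype, None survives as a missing value
    # inside an integer column instead of forcing a cast to float64.
    s = pandas.Series([1, None], dtype="Int64")
    assert str(s.dtype) == "Int64"
    assert s.isna().tolist() == [False, True]
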