
Commit 5fd840e

feat(bigquery): unit and system tests for dataframe with int column with NaN values (#39)
* feat(bigquery): add unit and system tests for int columns
* feat(bigquery): cosmetic changes
* feat(bigquery): use pkg_resources for comparison
* feat(bigquery): nit
1 parent 18eb9e8 commit 5fd840e
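
For context: the tests in this commit rely on pandas' nullable integer extension dtype, which keeps a column integer-typed when values are missing instead of coercing it to float64/NaN. A minimal sketch of that behavior (illustrative only, not part of the commit; assumes pandas >= 1.0.0):

import pandas

# With the default dtype, a missing value forces the column to float64 and NaN.
floaty = pandas.Series([1, 2, None, 4])

# With the nullable "Int64" extension dtype, the column stays integer-typed
# and the missing entry is represented as pandas.NA.
nullable = pandas.Series([1, 2, None, 4], dtype="Int64")

print(floaty.dtype, nullable.dtype)  # float64 Int64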

File tree

2 files changed: +160, -0 lines changed


tests/system.py

Lines changed: 64 additions & 0 deletions
@@ -31,6 +31,7 @@
 import psutil
 import pytest
 import pytz
+import pkg_resources
 
 try:
     from google.cloud import bigquery_storage_v1beta1
@@ -125,6 +126,9 @@
     (TooManyRequests, InternalServerError, ServiceUnavailable)
 )
 
+PANDAS_MINIMUM_VERSION = pkg_resources.parse_version("1.0.0")
+PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version
+
 
 def _has_rows(result):
     return len(result) > 0
@@ -742,6 +746,66 @@ def test_load_table_from_dataframe_w_automatic_schema(self):
         )
         self.assertEqual(table.num_rows, 3)
 
+    @unittest.skipIf(
+        pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION,
+        "Only `pandas version >=1.0.0` is supported",
+    )
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_load_table_from_dataframe_w_nullable_int64_datatype(self):
+        """Test that a DataFrame containing a column with None-type values and
+        int64 datatype can be uploaded if a BigQuery schema is specified.
+
+        https://github.com/googleapis/python-bigquery/issues/22
+        """
+
+        dataset_id = _make_dataset_id("bq_load_test")
+        self.temp_dataset(dataset_id)
+        table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format(
+            Config.CLIENT.project, dataset_id
+        )
+        table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),)
+        table = retry_403(Config.CLIENT.create_table)(
+            Table(table_id, schema=table_schema)
+        )
+        self.to_delete.insert(0, table)
+
+        df_data = collections.OrderedDict(
+            [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))]
+        )
+        dataframe = pandas.DataFrame(df_data, columns=df_data.keys())
+        load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id)
+        load_job.result()
+        table = Config.CLIENT.get_table(table_id)
+        self.assertEqual(tuple(table.schema), (bigquery.SchemaField("x", "INTEGER"),))
+        self.assertEqual(table.num_rows, 4)
+
+    @unittest.skipIf(
+        pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION,
+        "Only `pandas version >=1.0.0` is supported",
+    )
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(self):
+        """Test that a DataFrame containing a column with None-type values and
+        int64 datatype can be uploaded without specifying a schema.
+
+        https://github.com/googleapis/python-bigquery/issues/22
+        """
+
+        dataset_id = _make_dataset_id("bq_load_test")
+        self.temp_dataset(dataset_id)
+        table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format(
+            Config.CLIENT.project, dataset_id
+        )
+        df_data = collections.OrderedDict(
+            [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))]
+        )
+        dataframe = pandas.DataFrame(df_data, columns=df_data.keys())
+        load_job = Config.CLIENT.load_table_from_dataframe(dataframe, table_id)
+        load_job.result()
+        table = Config.CLIENT.get_table(table_id)
+        self.assertEqual(tuple(table.schema), (bigquery.SchemaField("x", "INTEGER"),))
+        self.assertEqual(table.num_rows, 4)
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
     def test_load_table_from_dataframe_w_nulls(self):
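
The system tests above boil down to the following usage pattern. A minimal sketch (not part of the commit), assuming default application credentials, an installed pyarrow, and a placeholder table ID:

import pandas
from google.cloud import bigquery

client = bigquery.Client()
table_id = "my-project.my_dataset.int_column_with_nulls"  # placeholder

# "Int64" (capital I) is pandas' nullable integer dtype; None becomes pandas.NA
# instead of forcing the column to float64/NaN.
dataframe = pandas.DataFrame({"x": pandas.Series([1, 2, None, 4], dtype="Int64")})

load_job = client.load_table_from_dataframe(dataframe, table_id)
load_job.result()  # wait for the load job to complete

table = client.get_table(table_id)
print(table.schema)    # expect a NULLABLE INTEGER field named "x"
print(table.num_rows)  # expect 4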

tests/unit/test_client.py

Lines changed: 96 additions & 0 deletions
@@ -30,6 +30,7 @@
 from six.moves import http_client
 import pytest
 import pytz
+import pkg_resources
 
 try:
     import fastparquet
@@ -56,6 +57,9 @@
     bigquery_storage_v1beta1 = None
 from tests.unit.helpers import make_connection
 
+PANDAS_MINIMUM_VERSION = pkg_resources.parse_version("1.0.0")
+PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version
+
 
 def _make_credentials():
     import google.auth.credentials
@@ -6973,6 +6977,98 @@ def test_load_table_from_dataframe_no_schema_warning_wo_pyarrow(self):
         ]
         assert matches, "A missing schema deprecation warning was not raised."
 
+    @unittest.skipIf(
+        pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION,
+        "Only `pandas version >=1.0.0` is supported",
+    )
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_load_table_from_dataframe_w_nullable_int64_datatype(self):
+        from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
+        from google.cloud.bigquery import job
+        from google.cloud.bigquery.schema import SchemaField
+
+        client = self._make_client()
+        dataframe = pandas.DataFrame({"x": [1, 2, None, 4]}, dtype="Int64")
+        load_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
+        )
+
+        get_table_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.get_table",
+            autospec=True,
+            return_value=mock.Mock(schema=[SchemaField("x", "INT64", "NULLABLE")]),
+        )
+
+        with load_patch as load_table_from_file, get_table_patch:
+            client.load_table_from_dataframe(
+                dataframe, self.TABLE_REF, location=self.LOCATION
+            )
+
+        load_table_from_file.assert_called_once_with(
+            client,
+            mock.ANY,
+            self.TABLE_REF,
+            num_retries=_DEFAULT_NUM_RETRIES,
+            rewind=True,
+            job_id=mock.ANY,
+            job_id_prefix=None,
+            location=self.LOCATION,
+            project=None,
+            job_config=mock.ANY,
+        )
+
+        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
+        assert sent_config.source_format == job.SourceFormat.PARQUET
+        assert tuple(sent_config.schema) == (
+            SchemaField("x", "INT64", "NULLABLE", None),
+        )
+
+    @unittest.skipIf(
+        pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION,
+        "Only `pandas version >=1.0.0` is supported",
+    )
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(self):
+        from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
+        from google.cloud.bigquery import job
+        from google.cloud.bigquery.schema import SchemaField
+
+        client = self._make_client()
+        dataframe = pandas.DataFrame({"x": [1, 2, None, 4]}, dtype="Int64")
+        load_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
+        )
+
+        get_table_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.get_table",
+            autospec=True,
+            side_effect=google.api_core.exceptions.NotFound("Table not found"),
+        )
+
+        with load_patch as load_table_from_file, get_table_patch:
+            client.load_table_from_dataframe(
+                dataframe, self.TABLE_REF, location=self.LOCATION
+            )
+
+        load_table_from_file.assert_called_once_with(
+            client,
+            mock.ANY,
+            self.TABLE_REF,
+            num_retries=_DEFAULT_NUM_RETRIES,
+            rewind=True,
+            job_id=mock.ANY,
+            job_id_prefix=None,
+            location=self.LOCATION,
+            project=None,
+            job_config=mock.ANY,
+        )
+
+        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
+        assert sent_config.source_format == job.SourceFormat.PARQUET
+        assert tuple(sent_config.schema) == (
+            SchemaField("x", "INT64", "NULLABLE", None),
+        )
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
     def test_load_table_from_dataframe_struct_fields_error(self):
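
Both new unit tests are gated on the installed pandas version via pkg_resources, mirroring the module-level constants added near the top of the file. A short sketch of that gating pattern (illustrative; the test class and method names below are hypothetical):

import unittest
import pkg_resources

PANDAS_MINIMUM_VERSION = pkg_resources.parse_version("1.0.0")
PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version


class TestNullableInt64Load(unittest.TestCase):  # hypothetical test case
    @unittest.skipIf(
        PANDAS_INSTALLED_VERSION < PANDAS_MINIMUM_VERSION,
        "Only `pandas version >=1.0.0` is supported",
    )
    def test_requires_modern_pandas(self):
        # Runs only when pandas >= 1.0.0 is installed; otherwise reported as skipped.
        self.assertTrue(True)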
