Skip to content

Commit 58752d5

Browse files
authored
fix: address bigquery/bqml test failures (#2920)
* fix: bigquery/bqml testing

  The BQML tests use a non-unique dataset ID for multiple examples, and testing currently triggers a lot of concurrent creations/deletions of said dataset. Switch to a dataset ID that leverages uuid to avoid invocations stomping on one another, which also necessitates parameterizing much of the SQL.

  There's also an issue with the pandas import currently, possibly due to recent changes in pandas. This change also pins pandas to 0.22 and doesn't rely on the dependency being expressed as an extra through google-cloud-bigquery.
* whitespace lint
* update dependencies in requirements.txt
1 parent 5ca361b commit 58752d5

File tree

5 files changed

+50
-40
lines changed

5 files changed

+50
-40
lines changed

bigquery/bqml/data_scientist_tutorial_test.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,34 @@
1616
from google.cloud import bigquery
1717
# [END bqml_data_scientist_tutorial_import_and_client]
1818
import pytest
19+
import uuid
1920

2021
# [START bqml_data_scientist_tutorial_import_and_client]
2122
client = bigquery.Client()
23+
# We use a unique dataset ID for this example to avoid collisions with
24+
# other invocations of this tutorial. In practice, you could leverage
25+
# a persistent dataset and not create/destroy it with each invocation.
26+
dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
2227
# [END bqml_data_scientist_tutorial_import_and_client]
2328

2429

2530
@pytest.fixture
2631
def delete_dataset():
2732
yield
2833
client.delete_dataset(
29-
client.dataset('bqml_tutorial'), delete_contents=True)
34+
client.dataset(dataset_id), delete_contents=True)
3035

3136

3237
def test_data_scientist_tutorial(delete_dataset):
3338
# [START bqml_data_scientist_tutorial_create_dataset]
34-
dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
39+
dataset = bigquery.Dataset(client.dataset(dataset_id))
3540
dataset.location = 'US'
3641
client.create_dataset(dataset)
3742
# [END bqml_data_scientist_tutorial_create_dataset]
3843

3944
# [START bqml_data_scientist_tutorial_create_model]
4045
sql = """
41-
CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
46+
CREATE OR REPLACE MODEL `{}.sample_model`
4247
OPTIONS(model_type='logistic_reg') AS
4348
SELECT
4449
IF(totals.transactions IS NULL, 0, 1) AS label,
@@ -50,7 +55,7 @@ def test_data_scientist_tutorial(delete_dataset):
5055
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
5156
WHERE
5257
_TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
53-
"""
58+
""".format(dataset_id)
5459
df = client.query(sql).to_dataframe()
5560
print(df)
5661
# [END bqml_data_scientist_tutorial_create_model]
@@ -60,8 +65,8 @@ def test_data_scientist_tutorial(delete_dataset):
6065
SELECT
6166
*
6267
FROM
63-
ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)
64-
"""
68+
ML.TRAINING_INFO(MODEL `{}.sample_model`)
69+
""".format(dataset_id)
6570
df = client.query(sql).to_dataframe()
6671
print(df)
6772
# [END bqml_data_scientist_tutorial_get_training_statistics]
@@ -70,7 +75,7 @@ def test_data_scientist_tutorial(delete_dataset):
7075
sql = """
7176
SELECT
7277
*
73-
FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
78+
FROM ML.EVALUATE(MODEL `{}.sample_model`, (
7479
SELECT
7580
IF(totals.transactions IS NULL, 0, 1) AS label,
7681
IFNULL(device.operatingSystem, "") AS os,
@@ -81,7 +86,7 @@ def test_data_scientist_tutorial(delete_dataset):
8186
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
8287
WHERE
8388
_TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
84-
"""
89+
""".format(dataset_id)
8590
df = client.query(sql).to_dataframe()
8691
print(df)
8792
# [END bqml_data_scientist_tutorial_evaluate_model]
@@ -91,7 +96,7 @@ def test_data_scientist_tutorial(delete_dataset):
9196
SELECT
9297
country,
9398
SUM(predicted_label) as total_predicted_purchases
94-
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
99+
FROM ML.PREDICT(MODEL `{}.sample_model`, (
95100
SELECT
96101
IFNULL(device.operatingSystem, "") AS os,
97102
device.isMobile AS is_mobile,
@@ -104,7 +109,7 @@ def test_data_scientist_tutorial(delete_dataset):
104109
GROUP BY country
105110
ORDER BY total_predicted_purchases DESC
106111
LIMIT 10
107-
"""
112+
""".format(dataset_id)
108113
df = client.query(sql).to_dataframe()
109114
print(df)
110115
# [END bqml_data_scientist_tutorial_predict_transactions]
@@ -114,7 +119,7 @@ def test_data_scientist_tutorial(delete_dataset):
114119
SELECT
115120
fullVisitorId,
116121
SUM(predicted_label) as total_predicted_purchases
117-
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
122+
FROM ML.PREDICT(MODEL `{}.sample_model`, (
118123
SELECT
119124
IFNULL(device.operatingSystem, "") AS os,
120125
device.isMobile AS is_mobile,
@@ -128,7 +133,7 @@ def test_data_scientist_tutorial(delete_dataset):
128133
GROUP BY fullVisitorId
129134
ORDER BY total_predicted_purchases DESC
130135
LIMIT 10
131-
"""
136+
""".format(dataset_id)
132137
df = client.query(sql).to_dataframe()
133138
print(df)
134139
# [END bqml_data_scientist_tutorial_predict_purchases]

bigquery/bqml/ncaa_tutorial_test.py

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import io
1616
import os
17+
import uuid
1718

1819
# [START bqml_ncaa_tutorial_import_and_client]
1920
from google.cloud import bigquery
@@ -22,43 +23,41 @@
2223

2324
# [START bqml_ncaa_tutorial_import_and_client]
2425
client = bigquery.Client()
26+
# We use a unique dataset ID for this example to avoid collisions with
27+
# other invocations of this tutorial. In practice, you could leverage
28+
# a persistent dataset and not create/destroy it with each invocation.
29+
dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
2530
# [END bqml_ncaa_tutorial_import_and_client]
2631

2732

2833
@pytest.fixture
2934
def delete_dataset():
3035
yield
3136
client.delete_dataset(
32-
client.dataset('bqml_tutorial'), delete_contents=True)
37+
client.dataset(dataset_id), delete_contents=True)
3338

3439

3540
def test_ncaa_tutorial(delete_dataset):
3641
# [START bqml_ncaa_tutorial_create_dataset]
37-
dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
42+
dataset = bigquery.Dataset(client.dataset(dataset_id))
3843
dataset.location = 'US'
3944
client.create_dataset(dataset)
4045
# [END bqml_ncaa_tutorial_create_dataset]
4146

4247
# Create the tables used by the tutorial
4348
# Note: the queries are saved to a file. This should be updated to use the
4449
# saved queries once the library supports running saved queries.
45-
query_filepath_to_table_name = {
46-
'feature_input_query.sql': 'cume_games',
47-
'training_data_query.sql': 'wide_games'
48-
}
50+
query_files = ['feature_input_query.sql', 'training_data_query.sql']
4951
resources_directory = os.path.join(os.path.dirname(__file__), 'resources')
50-
for query_filepath, table_name in query_filepath_to_table_name.items():
51-
table_ref = dataset.table(table_name)
52-
job_config = bigquery.QueryJobConfig()
53-
job_config.destination = table_ref
52+
for fname in query_files:
5453
query_filepath = os.path.join(
55-
resources_directory, query_filepath)
56-
sql = io.open(query_filepath, 'r', encoding='utf-8').read()
57-
client.query(sql, job_config=job_config).result()
54+
resources_directory, fname)
55+
sql = io.open(query_filepath, 'r', encoding='utf-8').read().format(dataset_id)
56+
client.query(sql).result()
5857

5958
# [START bqml_ncaa_tutorial_create_model]
6059
sql = """
61-
CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model`
60+
CREATE OR REPLACE MODEL `{0}.ncaa_model`
6261
OPTIONS (
6362
model_type='linear_reg',
6463
max_iteration=50 ) AS
@@ -69,11 +68,11 @@ def test_ncaa_tutorial(delete_dataset):
6968
total_three_points_att),
7069
total_three_points_att as label
7170
FROM
72-
`bqml_tutorial.wide_games`
71+
`{0}.wide_games`
7372
WHERE
7473
# remove the game to predict
7574
game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83'
76-
"""
75+
""".format(dataset_id)
7776
df = client.query(sql).to_dataframe()
7877
print(df)
7978
# [END bqml_ncaa_tutorial_create_model]
@@ -83,8 +82,8 @@ def test_ncaa_tutorial(delete_dataset):
8382
SELECT
8483
*
8584
FROM
86-
ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`)
87-
"""
85+
ML.TRAINING_INFO(MODEL `{}.ncaa_model`)
86+
""".format(dataset_id)
8887
df = client.query(sql).to_dataframe()
8988
print(df)
9089
# [END bqml_ncaa_tutorial_get_training_statistics]
@@ -96,13 +95,13 @@ def test_ncaa_tutorial(delete_dataset):
9695
*,
9796
total_three_points_att AS label
9897
FROM
99-
`bqml_tutorial.wide_games` )
98+
`{0}.wide_games` )
10099
SELECT
101100
*
102101
FROM
103-
ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`,
102+
ML.EVALUATE(MODEL `{0}.ncaa_model`,
104103
TABLE eval_table)
105-
"""
104+
""".format(dataset_id)
106105
df = client.query(sql).to_dataframe()
107106
print(df)
108107
# [END bqml_ncaa_tutorial_evaluate_model]
@@ -113,7 +112,7 @@ def test_ncaa_tutorial(delete_dataset):
113112
SELECT
114113
*
115114
FROM
116-
`bqml_tutorial.wide_games`
115+
`{0}.wide_games`
117116
WHERE
118117
game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' )
119118
SELECT
@@ -125,7 +124,7 @@ def test_ncaa_tutorial(delete_dataset):
125124
game_id,
126125
predicted_label AS predicted_total_three_points_att
127126
FROM
128-
ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`,
127+
ML.PREDICT(MODEL `{0}.ncaa_model`,
129128
table game_to_predict) ) AS predict
130129
JOIN (
131130
SELECT
@@ -135,7 +134,7 @@ def test_ncaa_tutorial(delete_dataset):
135134
game_to_predict) AS truth
136135
ON
137136
predict.game_id = truth.game_id
138-
"""
137+
""".format(dataset_id)
139138
df = client.query(sql).to_dataframe()
140139
print(df)
141140
# [END bqml_ncaa_tutorial_predict_outcomes]

bigquery/bqml/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
google-cloud-bigquery[pandas]==1.20.0
1+
pandas==0.24.2
2+
google-cloud-bigquery==1.23.1
23
flaky==3.6.1
34
mock==3.0.5

bigquery/bqml/resources/feature_input_query.sql

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
#standardSQL
1+
# This query creates a sample table using
2+
# the ncaa_basketball public dataset. It
3+
# uses a format string token for setting
4+
# the destination dataset.
5+
CREATE OR REPLACE TABLE `{0}.cume_games` AS
26
SELECT
37
game_id,
48
season,

bigquery/bqml/resources/training_data_query.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#standardSQL
2+
CREATE OR REPLACE TABLE `{0}.wide_games` AS
23
SELECT
34
team.game_id AS game_id,
45
team.season AS season,
@@ -768,9 +769,9 @@ SELECT
768769
opponent.opp_possessions_std_last_5 AS opponent_opp_possessions_std_last_5,
769770
opponent.opp_possessions_std_last_10 AS opponent_opp_possessions_std_last_10
770771
FROM
771-
`bqml_tutorial.cume_games` AS team
772+
`{0}.cume_games` AS team
772773
JOIN
773-
`bqml_tutorial.cume_games` AS opponent
774+
`{0}.cume_games` AS opponent
774775
ON
775776
team.game_id = opponent.game_id AND team.team_id != opponent.team_id
776777
WHERE

0 commit comments

Comments
 (0)