fix: address bigquery/bqml test failures #2920

Merged · 3 commits merged on Feb 15, 2020
Changes from 2 commits
29 changes: 17 additions & 12 deletions bigquery/bqml/data_scientist_tutorial_test.py
@@ -16,29 +16,34 @@
from google.cloud import bigquery
# [END bqml_data_scientist_tutorial_import_and_client]
import pytest
import uuid

# [START bqml_data_scientist_tutorial_import_and_client]
client = bigquery.Client()
# We use a unique dataset ID for this example to avoid collisions with
# other invocations of this tutorial. In practice, you could leverage
# a persistent dataset and not create/destroy it with each invocation.
dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
# [END bqml_data_scientist_tutorial_import_and_client]


@pytest.fixture
def delete_dataset():
yield
client.delete_dataset(
client.dataset('bqml_tutorial'), delete_contents=True)
client.dataset(dataset_id), delete_contents=True)


def test_data_scientist_tutorial(delete_dataset):
# [START bqml_data_scientist_tutorial_create_dataset]
dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
dataset = bigquery.Dataset(client.dataset(dataset_id))
dataset.location = 'US'
client.create_dataset(dataset)
# [END bqml_data_scientist_tutorial_create_dataset]

# [START bqml_data_scientist_tutorial_create_model]
sql = """
CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
CREATE OR REPLACE MODEL `{}.sample_model`
OPTIONS(model_type='logistic_reg') AS
SELECT
IF(totals.transactions IS NULL, 0, 1) AS label,
@@ -50,7 +55,7 @@ def test_data_scientist_tutorial(delete_dataset):
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_data_scientist_tutorial_create_model]
@@ -60,8 +65,8 @@ def test_data_scientist_tutorial(delete_dataset):
SELECT
*
FROM
ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)
"""
ML.TRAINING_INFO(MODEL `{}.sample_model`)
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_data_scientist_tutorial_get_training_statistics]
@@ -70,7 +75,7 @@ def test_data_scientist_tutorial(delete_dataset):
sql = """
SELECT
*
FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
FROM ML.EVALUATE(MODEL `{}.sample_model`, (
SELECT
IF(totals.transactions IS NULL, 0, 1) AS label,
IFNULL(device.operatingSystem, "") AS os,
@@ -81,7 +86,7 @@ def test_data_scientist_tutorial(delete_dataset):
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_data_scientist_tutorial_evaluate_model]
@@ -91,7 +96,7 @@ def test_data_scientist_tutorial(delete_dataset):
SELECT
country,
SUM(predicted_label) as total_predicted_purchases
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
FROM ML.PREDICT(MODEL `{}.sample_model`, (
SELECT
IFNULL(device.operatingSystem, "") AS os,
device.isMobile AS is_mobile,
@@ -104,7 +109,7 @@ def test_data_scientist_tutorial(delete_dataset):
GROUP BY country
ORDER BY total_predicted_purchases DESC
LIMIT 10
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_data_scientist_tutorial_predict_transactions]
@@ -114,7 +119,7 @@ def test_data_scientist_tutorial(delete_dataset):
SELECT
fullVisitorId,
SUM(predicted_label) as total_predicted_purchases
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
FROM ML.PREDICT(MODEL `{}.sample_model`, (
SELECT
IFNULL(device.operatingSystem, "") AS os,
device.isMobile AS is_mobile,
@@ -128,7 +133,7 @@ def test_data_scientist_tutorial(delete_dataset):
GROUP BY fullVisitorId
ORDER BY total_predicted_purchases DESC
LIMIT 10
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_data_scientist_tutorial_predict_purchases]
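
The core of the change above is the throwaway-dataset pattern. The following is a minimal sketch, not part of this diff, showing that pattern in isolation with the same google-cloud-bigquery calls the test uses; the fixture name temp_dataset_id is an illustrative name introduced here, not one from the PR.

# Sketch only: each invocation gets its own uuid-suffixed dataset, so
# parallel test runs cannot collide on a shared `bqml_tutorial` dataset,
# and teardown removes the trained model together with the dataset.
import uuid

import pytest
from google.cloud import bigquery

client = bigquery.Client()


@pytest.fixture
def temp_dataset_id():
    # e.g. "bqml_tutorial_9f2c4e..." -- unique per run.
    dataset_id = "bqml_tutorial_{}".format(uuid.uuid4().hex)
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    dataset.location = 'US'
    client.create_dataset(dataset)
    yield dataset_id
    # delete_contents=True also drops the sample_model created inside the dataset.
    client.delete_dataset(
        client.dataset(dataset_id), delete_contents=True)
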
47 changes: 23 additions & 24 deletions bigquery/bqml/ncaa_tutorial_test.py
@@ -14,6 +14,7 @@

import io
import os
import uuid

# [START bqml_ncaa_tutorial_import_and_client]
from google.cloud import bigquery
@@ -22,43 +23,41 @@

# [START bqml_ncaa_tutorial_import_and_client]
client = bigquery.Client()
# We use a unique dataset ID for this example to avoid collisions with
# other invocations of this tutorial. In practice, you could leverage
# a persistent dataset and not create/destroy it with each invocation.
dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
# [END bqml_ncaa_tutorial_import_and_client]


@pytest.fixture
def delete_dataset():
yield
client.delete_dataset(
client.dataset('bqml_tutorial'), delete_contents=True)
client.dataset(dataset_id), delete_contents=True)


def test_ncaa_tutorial(delete_dataset):
# [START bqml_ncaa_tutorial_create_dataset]
dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
dataset = bigquery.Dataset(client.dataset(dataset_id))
dataset.location = 'US'
client.create_dataset(dataset)
# [END bqml_ncaa_tutorial_create_dataset]

# Create the tables used by the tutorial
# Note: the queries are saved to a file. This should be updated to use the
# saved queries once the library supports running saved queries.
query_filepath_to_table_name = {
'feature_input_query.sql': 'cume_games',
'training_data_query.sql': 'wide_games'
}
query_files = ['feature_input_query.sql', 'training_data_query.sql']
resources_directory = os.path.join(os.path.dirname(__file__), 'resources')
for query_filepath, table_name in query_filepath_to_table_name.items():
table_ref = dataset.table(table_name)
job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref
for fname in query_files:
query_filepath = os.path.join(
resources_directory, query_filepath)
sql = io.open(query_filepath, 'r', encoding='utf-8').read()
client.query(sql, job_config=job_config).result()
resources_directory, fname)
sql = io.open(query_filepath, 'r', encoding='utf-8').read().format(dataset_id)
client.query(sql).result()

# [START bqml_ncaa_tutorial_create_model]
sql = """
CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model`
CREATE OR REPLACE MODEL `{0}.ncaa_model`
OPTIONS (
model_type='linear_reg',
max_iteration=50 ) AS
@@ -69,11 +68,11 @@ def test_ncaa_tutorial(delete_dataset):
total_three_points_att),
total_three_points_att as label
FROM
`bqml_tutorial.wide_games`
`{0}.wide_games`
WHERE
# remove the game to predict
game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83'
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_ncaa_tutorial_create_model]
@@ -83,8 +82,8 @@ def test_ncaa_tutorial(delete_dataset):
SELECT
*
FROM
ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`)
"""
ML.TRAINING_INFO(MODEL `{}.ncaa_model`)
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_ncaa_tutorial_get_training_statistics]
@@ -96,13 +95,13 @@ def test_ncaa_tutorial(delete_dataset):
*,
total_three_points_att AS label
FROM
`bqml_tutorial.wide_games` )
`{0}.wide_games` )
SELECT
*
FROM
ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`,
ML.EVALUATE(MODEL `{0}.ncaa_model`,
TABLE eval_table)
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_ncaa_tutorial_evaluate_model]
@@ -113,7 +112,7 @@ def test_ncaa_tutorial(delete_dataset):
SELECT
*
FROM
`bqml_tutorial.wide_games`
`{0}.wide_games`
WHERE
game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' )
SELECT
@@ -125,7 +124,7 @@ def test_ncaa_tutorial(delete_dataset):
game_id,
predicted_label AS predicted_total_three_points_att
FROM
ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`,
ML.PREDICT(MODEL `{0}.ncaa_model`,
table game_to_predict) ) AS predict
JOIN (
SELECT
@@ -135,7 +134,7 @@ def test_ncaa_tutorial(delete_dataset):
game_to_predict) AS truth
ON
predict.game_id = truth.game_id
"""
""".format(dataset_id)
df = client.query(sql).to_dataframe()
print(df)
# [END bqml_ncaa_tutorial_predict_outcomes]
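
After this change the ncaa test no longer wires a destination table through QueryJobConfig; each .sql resource file instead begins with CREATE OR REPLACE TABLE `{0}....` and the test substitutes the per-run dataset ID with str.format. A hedged sketch of that flow, assuming a resources/feature_input_query.sql shaped like the one in this PR:

import io
import os

from google.cloud import bigquery

client = bigquery.Client()
dataset_id = "bqml_tutorial_example"  # hypothetical; the tests build this from uuid4().hex

query_path = os.path.join(
    os.path.dirname(__file__), 'resources', 'feature_input_query.sql')
# The file starts with "CREATE OR REPLACE TABLE `{0}.cume_games` AS ...",
# so format() points the DDL at the per-run dataset.
sql = io.open(query_path, 'r', encoding='utf-8').read().format(dataset_id)
client.query(sql).result()  # block until the table exists

Using a positional {0} token rather than a bare {} lets the same dataset ID be substituted more than once in a file, which training_data_query.sql relies on for its self-join on cume_games.
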
3 changes: 2 additions & 1 deletion bigquery/bqml/requirements.txt
@@ -1,3 +1,4 @@
google-cloud-bigquery[pandas]==1.20.0
pandas==0.22
google-cloud-bigquery>=1.24.0
flaky==3.6.1
mock==3.0.5
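
The dependency pin is loosened to google-cloud-bigquery>=1.24.0, and pandas is now listed explicitly rather than pulled in through the [pandas] extra; the tests still need it because every query result is materialised with to_dataframe(). A trivial illustration, not part of the PR:

from google.cloud import bigquery

client = bigquery.Client()
# to_dataframe() requires pandas at runtime, hence the explicit pandas pin.
print(client.query("SELECT 1 AS x").to_dataframe())
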
6 changes: 5 additions & 1 deletion bigquery/bqml/resources/feature_input_query.sql
@@ -1,4 +1,8 @@
#standardSQL
# This query creates a sample table using
# the ncaa_basketball public dataset. It
# uses a format string token for setting
# the destination dataset.
CREATE OR REPLACE TABLE `{0}.cume_games` AS
SELECT
game_id,
season,
5 changes: 3 additions & 2 deletions bigquery/bqml/resources/training_data_query.sql
@@ -1,4 +1,5 @@
#standardSQL
CREATE OR REPLACE TABLE `{0}.wide_games` AS
SELECT
team.game_id AS game_id,
team.season AS season,
@@ -768,9 +769,9 @@ SELECT
opponent.opp_possessions_std_last_5 AS opponent_opp_possessions_std_last_5,
opponent.opp_possessions_std_last_10 AS opponent_opp_possessions_std_last_10
FROM
`bqml_tutorial.cume_games` AS team
`{0}.cume_games` AS team
JOIN
`bqml_tutorial.cume_games` AS opponent
`{0}.cume_games` AS opponent
ON
team.game_id = opponent.game_id AND team.team_id != opponent.team_id
WHERE