From 0f436e14691835a985e04639dfff07a8c44934c8 Mon Sep 17 00:00:00 2001 From: Takashi Matsuo Date: Sat, 30 May 2020 08:40:42 +0000 Subject: [PATCH 1/5] [dlp] fix: mitigate flakiness * make the Pub/Sub fixture function level * shorten the timeout for the tests from 300 secs to 30 secs * retrying all the tests in risk_test.py 3 times fixes #3897 fixes #3896 fixes #3895 fixes #3894 fixes #3893 fixes #3892 fixes #3890 fixes #3889 --- dlp/risk_test.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/dlp/risk_test.py b/dlp/risk_test.py index ef2b0357dc6..156c6ea7549 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -38,7 +38,7 @@ # Create new custom topic/subscription -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def topic_id(): # Creates a pubsub topic, and tears it down. publisher = google.cloud.pubsub.PublisherClient() @@ -53,7 +53,7 @@ def topic_id(): publisher.delete_topic(topic_path) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def subscription_id(topic_id): # Subscribes to a topic. 
subscriber = google.cloud.pubsub.SubscriberClient() @@ -160,7 +160,7 @@ def bigquery_project(): bigquery_client.delete_dataset(dataset_ref, delete_contents=True) -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_numerical_risk_analysis( topic_id, subscription_id, bigquery_project, capsys ): @@ -172,13 +172,14 @@ def test_numerical_risk_analysis( NUMERIC_FIELD, topic_id, subscription_id, + timeout=30, ) out, _ = capsys.readouterr() assert "Value Range:" in out -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_categorical_risk_analysis_on_string_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -190,14 +191,14 @@ def test_categorical_risk_analysis_on_string_field( UNIQUE_FIELD, topic_id, subscription_id, - timeout=180, + timeout=30, ) out, _ = capsys.readouterr() assert "Most common value occurs" in out -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_categorical_risk_analysis_on_number_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -209,13 +210,14 @@ def test_categorical_risk_analysis_on_number_field( NUMERIC_FIELD, topic_id, subscription_id, + timeout=30, ) out, _ = capsys.readouterr() assert "Most common value occurs" in out -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_k_anonymity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -227,6 +229,7 @@ def test_k_anonymity_analysis_single_field( topic_id, subscription_id, [NUMERIC_FIELD], + timeout=30, ) out, _ = capsys.readouterr() @@ -246,6 +249,7 @@ def test_k_anonymity_analysis_multiple_fields( topic_id, subscription_id, [NUMERIC_FIELD, REPEATED_FIELD], + timeout=30, ) out, _ = capsys.readouterr() @@ -253,7 +257,7 @@ def test_k_anonymity_analysis_multiple_fields( assert "Class size:" in out -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_l_diversity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys 
): @@ -266,6 +270,7 @@ def test_l_diversity_analysis_single_field( subscription_id, UNIQUE_FIELD, [NUMERIC_FIELD], + timeout=30, ) out, _ = capsys.readouterr() @@ -287,6 +292,7 @@ def test_l_diversity_analysis_multiple_field( subscription_id, UNIQUE_FIELD, [NUMERIC_FIELD, REPEATED_FIELD], + timeout=30, ) out, _ = capsys.readouterr() @@ -295,7 +301,7 @@ def test_l_diversity_analysis_multiple_field( assert "Sensitive value" in out -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_k_map_estimate_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -308,6 +314,7 @@ def test_k_map_estimate_analysis_single_field( subscription_id, [NUMERIC_FIELD], ["AGE"], + timeout=30, ) out, _ = capsys.readouterr() @@ -329,6 +336,7 @@ def test_k_map_estimate_analysis_multiple_field( subscription_id, [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], ["AGE", "GENDER"], + timeout=30, ) out, _ = capsys.readouterr() @@ -337,7 +345,7 @@ def test_k_map_estimate_analysis_multiple_field( assert "Values" in out -@pytest.mark.flaky +@pytest.mark.flaky(max_runs=3, min_passes=1) def test_k_map_estimate_analysis_quasi_ids_info_types_equal( topic_id, subscription_id, bigquery_project ): @@ -351,4 +359,5 @@ def test_k_map_estimate_analysis_quasi_ids_info_types_equal( subscription_id, [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], ["AGE"], + timeout=30, ) From dd542adde11b86a108f6429e9f70c38d196cd163 Mon Sep 17 00:00:00 2001 From: Takashi Matsuo Date: Sat, 30 May 2020 09:23:05 +0000 Subject: [PATCH 2/5] more retries, comment --- dlp/risk_test.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/dlp/risk_test.py b/dlp/risk_test.py index 156c6ea7549..b75dabe34a9 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -38,6 +38,9 @@ # Create new custom topic/subscription +# We observe sometimes all the tests in this file fail. 
In a +# hypothesis where DLP service somehow loses the connection to the +# topic, now we use function scope for Pub/Sub fixtures. @pytest.fixture(scope="function") def topic_id(): # Creates a pubsub topic, and tears it down. @@ -160,7 +163,7 @@ def bigquery_project(): bigquery_client.delete_dataset(dataset_ref, delete_contents=True) -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_numerical_risk_analysis( topic_id, subscription_id, bigquery_project, capsys ): @@ -179,7 +182,7 @@ def test_numerical_risk_analysis( assert "Value Range:" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_categorical_risk_analysis_on_string_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -198,7 +201,7 @@ def test_categorical_risk_analysis_on_string_field( assert "Most common value occurs" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_categorical_risk_analysis_on_number_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -217,7 +220,7 @@ def test_categorical_risk_analysis_on_number_field( assert "Most common value occurs" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_k_anonymity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -237,7 +240,7 @@ def test_k_anonymity_analysis_single_field( assert "Class size:" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_k_anonymity_analysis_multiple_fields( topic_id, subscription_id, bigquery_project, capsys ): @@ -257,7 +260,7 @@ def test_k_anonymity_analysis_multiple_fields( assert "Class size:" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_l_diversity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -279,7 
+282,7 @@ def test_l_diversity_analysis_single_field( assert "Sensitive value" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_l_diversity_analysis_multiple_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -301,7 +304,7 @@ def test_l_diversity_analysis_multiple_field( assert "Sensitive value" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_k_map_estimate_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -323,7 +326,7 @@ def test_k_map_estimate_analysis_single_field( assert "Values" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_k_map_estimate_analysis_multiple_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -345,7 +348,7 @@ def test_k_map_estimate_analysis_multiple_field( assert "Values" in out -@pytest.mark.flaky(max_runs=3, min_passes=1) +@pytest.mark.flaky(max_runs=5, min_passes=1) def test_k_map_estimate_analysis_quasi_ids_info_types_equal( topic_id, subscription_id, bigquery_project ): From 3d92e61c5df185e09b28e1c60202d8c2ba44058a Mon Sep 17 00:00:00 2001 From: Takashi Matsuo Date: Sat, 30 May 2020 18:57:08 +0000 Subject: [PATCH 3/5] 30 seconds operation wait and 20 minutes retry delay --- dlp/risk_test.py | 58 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/dlp/risk_test.py b/dlp/risk_test.py index b75dabe34a9..eba8cccbcfc 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -36,6 +36,7 @@ BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING +TIMEOUT=30 # Create new custom topic/subscription # We observe sometimes all the tests in this file fail. 
In a @@ -163,7 +164,24 @@ def bigquery_project(): bigquery_client.delete_dataset(dataset_ref, delete_contents=True) -@pytest.mark.flaky(max_runs=5, min_passes=1) +def delay(): + # 20 mins of delay. This sounds like too long a delay, but we + # occasionally observe consequtive time block where operations are + # slow which leads to the test failures. These situations tend to + # get self healed in 20 minutes or so, so I'm trying this strategy. + # + # The worst case execution time becomes longer, but I think it + # will give us higher success rate. + # + # There are ten tests, and each tests has 30 seconds wait time + # and retried 1 times, so the worst case latency for the waits are: + # 210 minutes (3 hours 30 minutes). + # This might cause time out on Kokoro. + time.sleep(60*20) + return True + + +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_numerical_risk_analysis( topic_id, subscription_id, bigquery_project, capsys ): @@ -175,14 +193,14 @@ def test_numerical_risk_analysis( NUMERIC_FIELD, topic_id, subscription_id, - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() assert "Value Range:" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_categorical_risk_analysis_on_string_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -194,14 +212,14 @@ def test_categorical_risk_analysis_on_string_field( UNIQUE_FIELD, topic_id, subscription_id, - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() assert "Most common value occurs" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_categorical_risk_analysis_on_number_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -213,14 +231,14 @@ def test_categorical_risk_analysis_on_number_field( NUMERIC_FIELD, topic_id, subscription_id, - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() 
assert "Most common value occurs" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_k_anonymity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -232,7 +250,7 @@ def test_k_anonymity_analysis_single_field( topic_id, subscription_id, [NUMERIC_FIELD], - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() @@ -240,7 +258,7 @@ def test_k_anonymity_analysis_single_field( assert "Class size:" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_k_anonymity_analysis_multiple_fields( topic_id, subscription_id, bigquery_project, capsys ): @@ -252,7 +270,7 @@ def test_k_anonymity_analysis_multiple_fields( topic_id, subscription_id, [NUMERIC_FIELD, REPEATED_FIELD], - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() @@ -260,7 +278,7 @@ def test_k_anonymity_analysis_multiple_fields( assert "Class size:" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_l_diversity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -273,7 +291,7 @@ def test_l_diversity_analysis_single_field( subscription_id, UNIQUE_FIELD, [NUMERIC_FIELD], - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() @@ -282,7 +300,7 @@ def test_l_diversity_analysis_single_field( assert "Sensitive value" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_l_diversity_analysis_multiple_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -295,7 +313,7 @@ def test_l_diversity_analysis_multiple_field( subscription_id, UNIQUE_FIELD, [NUMERIC_FIELD, REPEATED_FIELD], - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() @@ -304,7 +322,7 @@ def test_l_diversity_analysis_multiple_field( assert "Sensitive value" in 
out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_k_map_estimate_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -317,7 +335,7 @@ def test_k_map_estimate_analysis_single_field( subscription_id, [NUMERIC_FIELD], ["AGE"], - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() @@ -326,7 +344,7 @@ def test_k_map_estimate_analysis_single_field( assert "Values" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_k_map_estimate_analysis_multiple_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -339,7 +357,7 @@ def test_k_map_estimate_analysis_multiple_field( subscription_id, [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], ["AGE", "GENDER"], - timeout=30, + timeout=TIMEOUT, ) out, _ = capsys.readouterr() @@ -348,7 +366,7 @@ def test_k_map_estimate_analysis_multiple_field( assert "Values" in out -@pytest.mark.flaky(max_runs=5, min_passes=1) +@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) def test_k_map_estimate_analysis_quasi_ids_info_types_equal( topic_id, subscription_id, bigquery_project ): @@ -362,5 +380,5 @@ def test_k_map_estimate_analysis_quasi_ids_info_types_equal( subscription_id, [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], ["AGE"], - timeout=30, + timeout=TIMEOUT, ) From 906ad3e54ac2bdce5b111c39d18ee427575f6c6e Mon Sep 17 00:00:00 2001 From: Takashi Matsuo Date: Sat, 30 May 2020 19:18:57 +0000 Subject: [PATCH 4/5] lint fix etc --- dlp/risk_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dlp/risk_test.py b/dlp/risk_test.py index eba8cccbcfc..e36c6cfa547 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import time import uuid import google.cloud.bigquery @@ -36,7 +37,8 @@ BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING -TIMEOUT=30 +TIMEOUT = 30 + # Create new custom topic/subscription # We observe sometimes all the tests in this file fail. In a @@ -164,7 +166,7 @@ def bigquery_project(): bigquery_client.delete_dataset(dataset_ref, delete_contents=True) -def delay(): +def delay(err, *args): # 20 mins of delay. This sounds like too long a delay, but we # occasionally observe consequtive time block where operations are # slow which leads to the test failures. These situations tend to From 42625757645985e68149aa60582dc9c11a997fa6 Mon Sep 17 00:00:00 2001 From: Takashi Matsuo Date: Sun, 31 May 2020 19:22:51 +0000 Subject: [PATCH 5/5] limit the max retry wait time --- dlp/conftest.py | 20 ++++++++++++++++++++ dlp/risk_test.py | 14 ++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) create mode 100644 dlp/conftest.py diff --git a/dlp/conftest.py b/dlp/conftest.py new file mode 100644 index 00000000000..362e5a2c271 --- /dev/null +++ b/dlp/conftest.py @@ -0,0 +1,20 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +# Used in risk_test.py to limit the maximum wait time before the flaky retries. 
+def pytest_configure(config): + pytest.MAX_FLAKY_WAIT = 3600 # maximum of an hour diff --git a/dlp/risk_test.py b/dlp/risk_test.py index e36c6cfa547..0164cf3b8c0 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -172,14 +172,12 @@ def delay(err, *args): # slow which leads to the test failures. These situations tend to # get self healed in 20 minutes or so, so I'm trying this strategy. # - # The worst case execution time becomes longer, but I think it - # will give us higher success rate. - # - # There are ten tests, and each tests has 30 seconds wait time - # and retried 1 times, so the worst case latency for the waits are: - # 210 minutes (3 hours 30 minutes). - # This might cause time out on Kokoro. - time.sleep(60*20) + # There are 10 tests, so we don't want the retry delay happening + # for all the tests. When we exhaust the MAX_FLAKY_WAIT, we retry + # the test immediately. + wait_time = min(pytest.MAX_FLAKY_WAIT, 60*20) + pytest.MAX_FLAKY_WAIT -= wait_time + time.sleep(wait_time) return True