ref(discover2) Test out sampling on tag values (#16387)

markstory · web-flow · commit a816d8d539cb · 2020-01-10T12:37:30.000-05:00
The facets endpoint is still not fast. I want to see what kind of
results can be obtained if we aggressively sample tag values. If this
yields acceptable performance I want to try scaling the sampling rate
based on the data volume so that we can better balance accuracy and
performance.
diff --git a/src/sentry/snuba/discover.py b/src/sentry/snuba/discover.py
@@ -447,15 +447,14 @@ def get_facets(query, params, limit=10, referrer=None):
     # Resolve the public aliases into the discover dataset names.
     snuba_args, translated_columns = resolve_discover_aliases(snuba_args)
 
-    # Force sampling for multi-project results as we don't need accuracy
-    # with that much data.
-    sample = len(snuba_filter.filter_keys["project_id"]) > 2
-
     # Exclude tracing tags as they are noisy and generally not helpful.
     excluded_tags = ["tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span"]]
 
-    # Get the most frequent tag keys, enable sampling
-    # as we don't need accuracy here.
+    # Sampling keys for multi-project results as we don't need accuracy
+    # with that much data.
+    sample = len(snuba_filter.filter_keys["project_id"]) > 2
+
+    # Get the most frequent tag keys
     key_names = raw_query(
         aggregations=[["count", None, "count"]],
         start=snuba_args.get("start"),
@@ -474,6 +473,11 @@ def get_facets(query, params, limit=10, referrer=None):
     if not top_tags:
         return []
 
+    # TODO(mark) Make the sampling rate scale based on the result size and scaling factor in
+    # sentry.options.
+    # To test the lowest acceptable sampling rate, we use turbo mode.
+    turbo_values = key_names["data"][0]["count"] > 10000
+
     fetch_projects = False
     if len(params.get("project_id", [])) > 1:
         if len(top_tags) == limit:
@@ -492,6 +496,7 @@ def get_facets(query, params, limit=10, referrer=None):
             orderby="-count",
             dataset=Dataset.Discover,
             referrer=referrer,
+            turbo=turbo_values,
         )
         results.extend(
             [FacetResult("project", r["project_id"], r["count"]) for r in project_values["data"]]
@@ -513,6 +518,7 @@ def get_facets(query, params, limit=10, referrer=None):
             limit=TOP_VALUES_DEFAULT_LIMIT,
             dataset=Dataset.Discover,
             referrer=referrer,
+            turbo=turbo_values,
         )
         results.extend([FacetResult(tag_name, r[tag], int(r["count"])) for r in tag_values["data"]])