Skip to content

Commit 03e3b04

Browse files
feat(alerts): Add ratelimiting for metric alerts (#58032)
Ratelimiting is added (under an options-backed feature flag) to ensure metric alerts don't fire more than every 10 minutes. Action item from INC-454. Closes #54730
1 parent 025d51a commit 03e3b04

File tree

3 files changed

+109
-1
lines changed

3 files changed

+109
-1
lines changed

src/sentry/incidents/subscription_processor.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88

99
from django.conf import settings
1010
from django.db import router, transaction
11+
from django.utils import timezone
1112
from snuba_sdk import Column, Condition, Limit, Op
1213

13-
from sentry import features
14+
from sentry import features, options
1415
from sentry.constants import CRASH_RATE_ALERT_AGGREGATE_ALIAS, CRASH_RATE_ALERT_SESSION_COUNT_ALIAS
1516
from sentry.incidents.logic import (
1617
CRITICAL_TRIGGER_LABEL,
@@ -550,8 +551,40 @@ def trigger_alert_threshold(
550551
:return:
551552
"""
552553
self.trigger_alert_counts[trigger.id] += 1
554+
555+
if options.get("metric_alerts.rate_limit"):
556+
# If an incident was created for this rule, trigger type, and subscription
557+
# within the last 10 minutes, don't make another one
558+
last_it = (
559+
IncidentTrigger.objects.filter(alert_rule_trigger=trigger)
560+
.order_by("-incident_id")
561+
.select_related("incident")
562+
.first()
563+
)
564+
last_incident: Incident | None = last_it.incident if last_it else None
565+
last_incident_projects = (
566+
[project.id for project in last_incident.projects.all()] if last_incident else []
567+
)
568+
minutes_since_last_incident = (
569+
(timezone.now() - last_incident.date_added).seconds / 60 if last_incident else None
570+
)
571+
if (
572+
last_incident
573+
and self.subscription.project.id in last_incident_projects
574+
and minutes_since_last_incident <= 10
575+
):
576+
metrics.incr(
577+
"incidents.alert_rules.hit_rate_limit",
578+
tags={
579+
"last_incident_id": last_incident.id,
580+
"project_id": self.subscription.project.id,
581+
"trigger_id": trigger.id,
582+
},
583+
)
584+
return None
553585
if self.trigger_alert_counts[trigger.id] >= self.alert_rule.threshold_period:
554586
metrics.incr("incidents.alert_rules.trigger", tags={"type": "fire"})
587+
555588
# Only create a new incident if we don't already have an active one
556589
if not self.active_incident:
557590
detected_at = self.calculate_event_date_from_update_date(self.last_update)

src/sentry/options/defaults.py

+2
Original file line numberDiff line numberDiff line change
@@ -1669,3 +1669,5 @@
16691669
default=False,
16701670
flags=FLAG_AUTOMATOR_MODIFIABLE,
16711671
)
1672+
1673+
register("metric_alerts.rate_limit", default=False, flags=FLAG_AUTOMATOR_MODIFIABLE)

tests/sentry/incidents/test_subscription_processor.py

+73
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
from sentry.snuba.dataset import Dataset
4646
from sentry.snuba.models import QuerySubscription, SnubaQueryEventType
4747
from sentry.testutils.cases import BaseMetricsTestCase, SnubaTestCase, TestCase
48+
from sentry.testutils.helpers import override_options
4849
from sentry.testutils.helpers.datetime import freeze_time, iso_format
4950
from sentry.utils import json
5051
from sentry.utils.dates import to_timestamp
@@ -167,6 +168,12 @@ def assert_trigger_counts(self, processor, trigger, alert_triggers=0, resolve_tr
167168
def latest_activity(self, incident):
168169
return IncidentActivity.objects.filter(incident=incident).order_by("-id").first()
169170

171+
def assert_incident_is_latest_for_rule(self, incident):
172+
last_incident = (
173+
Incident.objects.filter(alert_rule=incident.alert_rule).order_by("-date_added").first()
174+
)
175+
assert last_incident == incident
176+
170177

171178
@freeze_time()
172179
class ProcessUpdateTest(ProcessUpdateBaseClass):
@@ -2106,6 +2113,72 @@ def test_comparison_alert_different_aggregate(self):
21062113
incident, [self.action], [(150.0, IncidentStatus.CLOSED, mock.ANY)]
21072114
)
21082115

2116+
@override_options({"metric_alerts.rate_limit": True})
2117+
def test_no_new_incidents_within_ten_minutes(self):
2118+
# Verify that a new incident is not made for the same rule, trigger, and
2119+
# subscription if an incident was already made within the last 10 minutes.
2120+
rule = self.rule
2121+
trigger = self.trigger
2122+
processor = self.send_update(
2123+
rule, trigger.alert_threshold + 1, timedelta(minutes=-2), self.sub
2124+
)
2125+
self.assert_trigger_counts(processor, self.trigger, 0, 0)
2126+
original_incident = self.assert_active_incident(rule)
2127+
original_incident.update(date_added=original_incident.date_added - timedelta(minutes=10))
2128+
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.ACTIVE)
2129+
2130+
# resolve the trigger
2131+
self.send_update(rule, 6, timedelta(minutes=-1), subscription=self.sub)
2132+
self.assert_no_active_incident(rule)
2133+
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.RESOLVED)
2134+
2135+
# fire trigger again within 10 minutes; no new incident should be made
2136+
processor = self.send_update(rule, trigger.alert_threshold + 1, subscription=self.sub)
2137+
self.assert_trigger_counts(processor, self.trigger, 1, 0)
2138+
self.assert_no_active_incident(rule)
2139+
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.RESOLVED)
2140+
self.assert_incident_is_latest_for_rule(original_incident)
2141+
self.metrics.incr.assert_has_calls(
2142+
[
2143+
call(
2144+
"incidents.alert_rules.hit_rate_limit",
2145+
tags={
2146+
"last_incident_id": original_incident.id,
2147+
"project_id": self.sub.project.id,
2148+
"trigger_id": trigger.id,
2149+
},
2150+
),
2151+
],
2152+
any_order=True,
2153+
)
2154+
2155+
@override_options({"metric_alerts.rate_limit": True})
2156+
def test_incident_made_after_ten_minutes(self):
2157+
# Verify that a new incident will be made for the same rule, trigger, and
2158+
# subscription if the last incident made for those was made more tha 10 minutes
2159+
# ago
2160+
rule = self.rule
2161+
trigger = self.trigger
2162+
processor = self.send_update(
2163+
rule, trigger.alert_threshold + 1, timedelta(minutes=-2), self.sub
2164+
)
2165+
self.assert_trigger_counts(processor, self.trigger, 0, 0)
2166+
original_incident = self.assert_active_incident(rule)
2167+
original_incident.update(date_added=original_incident.date_added - timedelta(minutes=11))
2168+
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.ACTIVE)
2169+
2170+
# resolve the trigger
2171+
self.send_update(rule, 6, timedelta(minutes=-1), self.sub)
2172+
self.assert_no_active_incident(rule)
2173+
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.RESOLVED)
2174+
2175+
# fire trigger again after more than 10 minutes have passed; a new incident should be made
2176+
processor = self.send_update(rule, trigger.alert_threshold + 1, subscription=self.sub)
2177+
self.assert_trigger_counts(processor, self.trigger, 0, 0)
2178+
new_incident = self.assert_active_incident(rule)
2179+
self.assert_trigger_exists_with_status(new_incident, trigger, TriggerStatus.ACTIVE)
2180+
self.assert_incident_is_latest_for_rule(new_incident)
2181+
21092182

21102183
class MetricsCrashRateAlertProcessUpdateTest(ProcessUpdateBaseClass, BaseMetricsTestCase):
21112184
@pytest.fixture(autouse=True)

0 commit comments

Comments
 (0)