Commit 37cef4e
fix(alerts): Don't group on time (#89447)
Issue alerts with "slow" conditions over a large time window (1 day, 1 week, 1 month) can hit our 10,000-row result limit when there are a lot of groups to process, because we return timestamped data per group (e.g. group id 1 has rows for 12:01, 12:02, 12:03, and so on, multiplied by the number of groups we're querying). Rows past the limit are dropped, which produces lower counts than expected, so alerts can fail to fire because the data we receive isn't accurate. To fix this we disable `group_on_time`: instead of receiving all the timestamped rows and summing them later, we receive the sums directly, without timestamps. I renamed `get_sums` to `get_timeseries_sums` and switched the callers that rely on timeseries data over to it. Callers of `get_sums` now receive the aggregate data as a simple dictionary `{group_id: count}`.
1 parent 4144c01 commit 37cef4e
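
To make the failure mode concrete, here is a minimal sketch of the two result shapes (simplified illustration only, not Sentry's actual tsdb API):

# With group_on_time=True the store returns one row per (group, time bucket),
# so a long window multiplied by many groups can exceed the 10,000-row limit
# and silently drop rows before we sum them.
timeseries = {
    1: [("12:01", 3), ("12:02", 5)],  # group 1: a count per minute
    2: [("12:01", 2), ("12:02", 7)],  # group 2: a count per minute
}
sums_from_timeseries = {
    group_id: sum(count for _, count in series)
    for group_id, series in timeseries.items()
}

# With group_on_time=False the store aggregates server-side and returns a
# single row per group, so the result stays small regardless of the window:
sums = {1: 8, 2: 9}

# The two only agree when no timeseries rows were truncated by the limit.
assert sums_from_timeseries == sums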

File tree

14 files changed: +314 -64 lines

src/sentry/digests/notifications.py

+6 -6

@@ -2,8 +2,8 @@
 import logging
 from collections import defaultdict
-from collections.abc import Sequence
-from typing import NamedTuple, TypeAlias
+from collections.abc import Mapping, Sequence
+from typing import Any, NamedTuple, TypeAlias

 from sentry import tsdb
 from sentry.digests.types import Notification, Record, RecordWithRuleObjects
@@ -22,7 +22,7 @@
 class DigestInfo(NamedTuple):
     digest: Digest
     event_counts: dict[int, int]
-    user_counts: dict[int, int]
+    user_counts: Mapping[int, int]


 def split_key(
@@ -113,7 +113,7 @@ def _group_records(


 def _sort_digest(
-    digest: Digest, event_counts: dict[int, int], user_counts: dict[int, int]
+    digest: Digest, event_counts: dict[int, int], user_counts: Mapping[Any, int]
 ) -> Digest:
     # sort inner groups dict by (event_count, user_count) descending
     for key, rule_groups in digest.items():
@@ -142,7 +142,7 @@ def _build_digest_impl(
     groups: dict[int, Group],
     rules: dict[int, Rule],
     event_counts: dict[int, int],
-    user_counts: dict[int, int],
+    user_counts: Mapping[Any, int],
 ) -> Digest:
     # sans-io implementation details
     bound_records = _bind_records(records, groups, rules)
@@ -171,7 +171,7 @@ def build_digest(project: Project, records: Sequence[Record]) -> DigestInfo:
     assert rule.project_id == project.id, "Rule must belong to Project"

     tenant_ids = {"organization_id": project.organization_id}
-    event_counts = tsdb.backend.get_sums(
+    event_counts = tsdb.backend.get_timeseries_sums(
         TSDBModel.group,
         group_ids,
         start,
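
The dict-to-Mapping widening above presumably lets these helpers accept any read-only mapping returned by the TSDB backend, not just plain dicts; a small illustration of the idea (the total_users helper is hypothetical, not from this codebase):

from collections.abc import Mapping
from types import MappingProxyType

def total_users(user_counts: Mapping[int, int]) -> int:
    # Accepts dicts and read-only mappings alike; a dict[int, int]
    # annotation would reject MappingProxyType at type-check time.
    return sum(user_counts.values())

assert total_users({1: 4, 2: 2}) == 6
assert total_users(MappingProxyType({1: 4, 2: 2})) == 6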

src/sentry/models/groupsnooze.py

+1 -1

@@ -118,7 +118,7 @@ def test_frequency_rates(self) -> bool:
         end = timezone.now()
         start = end - timedelta(minutes=self.window)

-        rate = tsdb.backend.get_sums(
+        rate = tsdb.backend.get_timeseries_sums(
             model=get_issue_tsdb_group_model(self.group.issue_category),
             keys=[self.group_id],
             start=start,

src/sentry/rules/conditions/event_frequency.py

+75 -11

@@ -230,7 +230,11 @@ def query(
         return self.query_hook(event, start, end, environment_id)

     def query_hook(
-        self, event: GroupEvent, start: datetime, end: datetime, environment_id: int
+        self,
+        event: GroupEvent,
+        start: datetime,
+        end: datetime,
+        environment_id: int,
     ) -> int | float:
         """
         Abstract method that specifies how to query Snuba for a single group
@@ -244,10 +248,15 @@ def batch_query(
         """
         Queries Snuba for a unique condition for multiple groups.
         """
-        return self.batch_query_hook(group_ids, start, end, environment_id)
+        return self.batch_query_hook(group_ids, start, end, environment_id, False)

     def batch_query_hook(
-        self, group_ids: set[int], start: datetime, end: datetime, environment_id: int
+        self,
+        group_ids: set[int],
+        start: datetime,
+        end: datetime,
+        environment_id: int,
+        group_on_time: bool,
     ) -> dict[int, int | float]:
         """
         Abstract method that specifies how to query Snuba for multiple groups
@@ -339,6 +348,7 @@ def get_snuba_query_result(
         end: datetime,
         environment_id: int,
         referrer_suffix: str,
+        group_on_time: bool = False,
     ) -> Mapping[int, int]:
         result: Mapping[int, int] = tsdb_function(
             model=model,
@@ -350,6 +360,7 @@ def get_snuba_query_result(
             jitter_value=group_id,
             tenant_ids={"organization_id": organization_id},
             referrer_suffix=referrer_suffix,
+            group_on_time=group_on_time,
         )
         return result

@@ -363,6 +374,7 @@ def get_chunked_result(
         end: datetime,
         environment_id: int,
         referrer_suffix: str,
+        group_on_time: bool = False,
     ) -> dict[int, int]:
         batch_totals: dict[int, int] = defaultdict(int)
         group_id = group_ids[0]
@@ -377,6 +389,7 @@ def get_chunked_result(
             end=end,
             environment_id=environment_id,
             referrer_suffix=referrer_suffix,
+            group_on_time=group_on_time,
         )
         batch_totals.update(result)
         return batch_totals
@@ -419,7 +432,11 @@ class EventFrequencyCondition(BaseEventFrequencyCondition):
     label = "The issue is seen more than {value} times in {interval}"

     def query_hook(
-        self, event: GroupEvent, start: datetime, end: datetime, environment_id: int
+        self,
+        event: GroupEvent,
+        start: datetime,
+        end: datetime,
+        environment_id: int,
     ) -> int:
         sums: Mapping[int, int] = self.get_snuba_query_result(
             tsdb_function=self.tsdb.get_sums,
@@ -431,11 +448,17 @@ def query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="alert_event_frequency",
+            group_on_time=False,
         )
         return sums[event.group_id]

     def batch_query_hook(
-        self, group_ids: set[int], start: datetime, end: datetime, environment_id: int
+        self,
+        group_ids: set[int],
+        start: datetime,
+        end: datetime,
+        environment_id: int,
+        group_on_time: bool = False,
     ) -> dict[int, int | float]:
         batch_sums: dict[int, int | float] = defaultdict(int)
         groups = Group.objects.filter(id__in=group_ids).values(
@@ -454,6 +477,7 @@ def batch_query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_frequency",
+            group_on_time=group_on_time,
         )
         batch_sums.update(error_sums)

@@ -468,6 +492,7 @@ def batch_query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_frequency",
+            group_on_time=group_on_time,
         )
         batch_sums.update(generic_sums)

@@ -482,7 +507,11 @@ class EventUniqueUserFrequencyCondition(BaseEventFrequencyCondition):
     label = "The issue is seen by more than {value} users in {interval}"

     def query_hook(
-        self, event: GroupEvent, start: datetime, end: datetime, environment_id: int
+        self,
+        event: GroupEvent,
+        start: datetime,
+        end: datetime,
+        environment_id: int,
     ) -> int:
         totals: Mapping[int, int] = self.get_snuba_query_result(
             tsdb_function=self.tsdb.get_distinct_counts_totals,
@@ -494,11 +523,17 @@ def query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="alert_event_uniq_user_frequency",
+            group_on_time=False,
         )
         return totals[event.group_id]

     def batch_query_hook(
-        self, group_ids: set[int], start: datetime, end: datetime, environment_id: int
+        self,
+        group_ids: set[int],
+        start: datetime,
+        end: datetime,
+        environment_id: int,
+        group_on_time: bool = False,
     ) -> dict[int, int | float]:
         batch_totals: dict[int, int | float] = defaultdict(int)
         groups = Group.objects.filter(id__in=group_ids).values(
@@ -517,6 +552,7 @@ def batch_query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_uniq_user_frequency",
+            group_on_time=group_on_time,
         )
         batch_totals.update(error_totals)

@@ -531,6 +567,7 @@ def batch_query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_uniq_user_frequency",
+            group_on_time=group_on_time,
         )
         batch_totals.update(generic_totals)

@@ -545,7 +582,11 @@ class EventUniqueUserFrequencyConditionWithConditions(EventUniqueUserFrequencyCo
     label = "The issue is seen by more than {value} users in {interval} with conditions"

     def query_hook(
-        self, event: GroupEvent, start: datetime, end: datetime, environment_id: int
+        self,
+        event: GroupEvent,
+        start: datetime,
+        end: datetime,
+        environment_id: int,
     ) -> int:
         assert self.rule
         if not features.has(
@@ -580,11 +621,17 @@ def query_hook(
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_uniq_user_frequency",
             conditions=conditions,
+            group_on_time=False,
         )
         return total[event.group.id]

     def batch_query_hook(
-        self, group_ids: set[int], start: datetime, end: datetime, environment_id: int
+        self,
+        group_ids: set[int],
+        start: datetime,
+        end: datetime,
+        environment_id: int,
+        group_on_time: bool = False,
     ) -> dict[int, int | float]:
         logger = logging.getLogger(
             "sentry.rules.event_frequency.EventUniqueUserFrequencyConditionWithConditions"
@@ -643,6 +690,7 @@ def batch_query_hook(
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_uniq_user_frequency",
             conditions=conditions,
+            group_on_time=group_on_time,
         )
         batch_totals.update(error_totals)

@@ -657,6 +705,7 @@ def batch_query_hook(
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_uniq_user_frequency",
             conditions=conditions,
+            group_on_time=group_on_time,
         )
         batch_totals.update(error_totals)

@@ -677,6 +726,7 @@ def get_snuba_query_result(
         end: datetime,
         environment_id: int,
         referrer_suffix: str,
+        group_on_time: bool = False,
         conditions: list[tuple[str, str, str | list[str]]] | None = None,
     ) -> Mapping[int, int]:
         result: Mapping[int, int] = tsdb_function(
@@ -690,6 +740,7 @@ def get_snuba_query_result(
             tenant_ids={"organization_id": organization_id},
             referrer_suffix=referrer_suffix,
             conditions=conditions,
+            group_on_time=group_on_time,
         )
         return result

@@ -703,6 +754,7 @@ def get_chunked_result(
         end: datetime,
         environment_id: int,
         referrer_suffix: str,
+        group_on_time: bool = False,
         conditions: list[tuple[str, str, str | list[str]]] | None = None,
     ) -> dict[int, int]:
         batch_totals: dict[int, int] = defaultdict(int)
@@ -719,6 +771,7 @@ def get_chunked_result(
             environment_id=environment_id,
             referrer_suffix=referrer_suffix,
             conditions=conditions,
+            group_on_time=group_on_time,
         )
         batch_totals.update(result)
         return batch_totals
@@ -864,7 +917,11 @@ def get_session_interval(self, session_count: int, interval: str) -> int | None:
         return None

     def query_hook(
-        self, event: GroupEvent, start: datetime, end: datetime, environment_id: int
+        self,
+        event: GroupEvent,
+        start: datetime,
+        end: datetime,
+        environment_id: int,
     ) -> float:
         project_id = event.project_id
         session_count_last_hour = self.get_session_count(project_id, environment_id, start, end)
@@ -882,6 +939,7 @@ def query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="alert_event_frequency_percent",
+            group_on_time=False,
         )[event.group_id]

         if issue_count > avg_sessions_in_interval:
@@ -900,7 +958,12 @@ def query_hook(
         return 0

     def batch_query_hook(
-        self, group_ids: set[int], start: datetime, end: datetime, environment_id: int
+        self,
+        group_ids: set[int],
+        start: datetime,
+        end: datetime,
+        environment_id: int,
+        group_on_time: bool = False,
     ) -> dict[int, int | float]:
         groups = Group.objects.filter(id__in=group_ids).values(
             "id", "type", "project_id", "project__organization_id"
@@ -933,6 +996,7 @@ def batch_query_hook(
             end=end,
             environment_id=environment_id,
             referrer_suffix="batch_alert_event_frequency_percent",
+            group_on_time=group_on_time,
         )

         batch_percents: dict[int, int | float] = {}
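
For context, the get_chunked_result helpers above follow a common accumulate-per-chunk pattern: query groups in fixed-size chunks and merge the per-group totals. A self-contained sketch of that shape (query_chunk is a hypothetical stand-in for the Snuba-backed call, and the chunk size is illustrative):

from collections import defaultdict

def query_chunk(chunk: list[int]) -> dict[int, int]:
    # Hypothetical stand-in for get_snuba_query_result; returns
    # {group_id: count} for one chunk of group ids.
    return {group_id: 1 for group_id in chunk}

def get_totals(group_ids: list[int], chunk_size: int = 1000) -> dict[int, int]:
    batch_totals: dict[int, int] = defaultdict(int)
    for i in range(0, len(group_ids), chunk_size):
        batch_totals.update(query_chunk(group_ids[i : i + chunk_size]))
    return dict(batch_totals)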

src/sentry/tasks/beacon.py

+1 -1

@@ -68,7 +68,7 @@ def get_events_24h() -> int:
     end = timezone.now()
     sum_events = 0
     for organization_id in organization_ids:
-        events_per_org_24h = tsdb.backend.get_sums(
+        events_per_org_24h = tsdb.backend.get_timeseries_sums(
             model=TSDBModel.organization_total_received,
             keys=[organization_id],
             start=end - timedelta(hours=24),
