
Commit 27a2f5e

fix(similarity): use get_primary_hash in backfill (#72022)
- Previously we were getting a random hash associated with the group; now we simply use `event.get_primary_hash()`.
- Removes passing these hashes down through the various functions, since they're no longer needed.
- Deletes a few tests that exercised edge cases where we weren't finding a group_id associated with the hash.
1 parent 96d41ce commit 27a2f5e
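
In effect, the backfill stops querying the `GroupHash` table for an arbitrary hash per group and instead reads the hash off each event. A minimal before/after sketch (names taken from the diff below; `project`, `group_id_batch`, and `data` are assumed to be in scope as they are in the task, so this is not runnable standalone):

```python
# Before: any GroupHash row tied to the group could be returned,
# so the hash sent along with the record was effectively random.
group_hashes = GroupHash.objects.filter(
    project_id=project.id, group_id__in=group_id_batch
).distinct("group_id")
group_hashes_dict = {gh.group_id: gh.hash for gh in group_hashes}

# After: each event contributes its own primary grouping hash during
# the stacktrace lookup, and the mapping is rebuilt from those results.
group_hashes_dict = {
    record["group_id"]: record["hash"] for record in data["data"]
}
```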

File tree: 2 files changed (+42 −138)

src/sentry/tasks/backfill_seer_grouping_records.py

+32 −20
```diff
@@ -19,7 +19,6 @@
 from sentry.issues.grouptype import ErrorGroupType
 from sentry.issues.occurrence_consumer import EventLookupError
 from sentry.models.group import Group, GroupStatus
-from sentry.models.grouphash import GroupHash
 from sentry.models.project import Project
 from sentry.seer.similarity.backfill import (
     CreateGroupingRecordData,
```
```diff
@@ -221,12 +220,8 @@ def backfill_seer_grouping_records(
             group_id_batch.remove(group_id)
             del group_id_message_batch_filtered[group_id]

-    group_hashes = GroupHash.objects.filter(
-        project_id=project.id, group_id__in=group_id_batch
-    ).distinct("group_id")
-    group_hashes_dict = {group_hash.group_id: group_hash.hash for group_hash in group_hashes}
     data = lookup_group_data_stacktrace_bulk_with_fallback(
-        project, rows, group_id_message_batch_filtered, group_hashes_dict
+        project, rows, group_id_message_batch_filtered
    )

     # If nodestore returns no data
```
```diff
@@ -245,6 +240,11 @@ def backfill_seer_grouping_records(
         )
         return

+    group_hashes_dict = {
+        group_stacktrace_data["group_id"]: group_stacktrace_data["hash"]
+        for group_stacktrace_data in data["data"]
+    }
+
     with metrics.timer(f"{BACKFILL_NAME}.post_bulk_grouping_records", sample_rate=1.0):
         response = post_bulk_grouping_records(
             CreateGroupingRecordsRequest(
```
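
For reference, the comprehension above assumes each entry in `data["data"]` is a `CreateGroupingRecordData` mapping carrying `group_id` and `hash`. A toy illustration with made-up values (real entries are built by `lookup_group_data_stacktrace_bulk` from nodestore events):

```python
# Toy values purely for illustration of the shapes involved.
data = {
    "data": [
        {"group_id": 1, "hash": "a" * 32, "project_id": 7, "message": "KeyError"},
        {"group_id": 2, "hash": "b" * 32, "project_id": 7, "message": "TypeError"},
    ],
    "stacktrace_list": ["stacktrace 1", "stacktrace 2"],
}
group_hashes_dict = {rec["group_id"]: rec["hash"] for rec in data["data"]}
assert group_hashes_dict == {1: "a" * 32, 2: "b" * 32}
```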
```diff
@@ -258,7 +258,7 @@ def backfill_seer_grouping_records(
     groups_with_neighbor = response["groups_with_neighbor"]
     groups = Group.objects.filter(project_id=project.id, id__in=group_id_batch)
     for group in groups:
-        seer_similarity = {
+        seer_similarity: dict[str, Any] = {
             "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
             "request_hash": group_hashes_dict[group.id],
         }
```
```diff
@@ -332,19 +332,19 @@ def backfill_seer_grouping_records(


 def lookup_group_data_stacktrace_bulk_with_fallback(
-    project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
+    project: Project, rows: list[GroupEventRow], messages: dict[int, str]
 ) -> GroupStacktraceData:
     (
         bulk_event_ids,
         invalid_event_ids,
         bulk_group_data_stacktraces,
-    ) = lookup_group_data_stacktrace_bulk(project, rows, messages, hashes)
+    ) = lookup_group_data_stacktrace_bulk(project, rows, messages)
     for row in rows:
         event_id, group_id = row["event_id"], row["group_id"]
         if event_id not in bulk_event_ids and event_id not in invalid_event_ids:
             try:
                 group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-                    project, event_id, int(group_id), messages[group_id], hashes[group_id]
+                    project, event_id, int(group_id), messages[group_id]
                 )
                 if group_data and stacktrace_string:
                     bulk_group_data_stacktraces["data"].append(group_data)
```
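
The fallback structure itself is unchanged; only the `hashes` parameter disappears. Schematically (a simplified sketch with hypothetical helper names `lookup_bulk`/`lookup_single` standing in for the real functions, not runnable on its own):

```python
def lookup_with_fallback(project, rows, messages):
    # First pass: one bulk nodestore read covering every row.
    bulk_ids, invalid_ids, result = lookup_bulk(project, rows, messages)
    for row in rows:
        event_id, group_id = row["event_id"], row["group_id"]
        # Second pass: retry individually anything the bulk call neither
        # returned nor explicitly rejected.
        if event_id not in bulk_ids and event_id not in invalid_ids:
            group_data, stacktrace = lookup_single(
                project, event_id, int(group_id), messages[group_id]
            )
            if group_data and stacktrace:
                result["data"].append(group_data)
                result["stacktrace_list"].append(stacktrace)
    return result
```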
```diff
@@ -374,7 +374,7 @@ def lookup_group_data_stacktrace_bulk_with_fallback(

 @metrics.wraps(f"{BACKFILL_NAME}.lookup_event_bulk", sample_rate=1.0)
 def lookup_group_data_stacktrace_bulk(
-    project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
+    project: Project, rows: list[GroupEventRow], messages: dict[int, str]
 ) -> tuple[set[str], set[str], GroupStacktraceData]:
     project_id = project.id
     node_id_to_group_data = {
```
```diff
@@ -421,18 +421,23 @@ def lookup_group_data_stacktrace_bulk(
        )
        event = Event(event_id=event_id, project_id=project_id, group_id=group_id)
        event.data = data
-       if event and event.data and event.data.get("exception") and hashes.get(group_id):
+       if event and event.data and event.data.get("exception"):
            grouping_info = get_grouping_info(None, project=project, event=event)
            stacktrace_string = get_stacktrace_string(grouping_info)
            if stacktrace_string == "":
                invalid_event_ids.add(event_id)
                continue
+           primary_hash = event.get_primary_hash()
+           if not primary_hash:
+               invalid_event_ids.add(event_id)
+               continue
+
            group_data.append(
                CreateGroupingRecordData(
                    group_id=group_id,
                    project_id=project_id,
                    message=messages[group_id],
-                   hash=hashes[group_id],
+                   hash=primary_hash,
                )
            )
            stacktrace_strings.append(stacktrace_string)
```
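
After this change an event has to clear three gates before it yields a record. A condensed restatement of the logic above (this helper does not exist in the file; it just names the checks):

```python
def event_yields_record(event, stacktrace_string):
    # 1. The payload must contain exception data.
    if not (event and event.data and event.data.get("exception")):
        return False
    # 2. The computed stacktrace string must be non-empty.
    if stacktrace_string == "":
        return False
    # 3. The event must have a primary grouping hash; previously this
    #    gate consulted the pre-fetched hashes dict instead.
    return bool(event.get_primary_hash())
```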
```diff
@@ -455,7 +460,7 @@ def lookup_group_data_stacktrace_bulk(

 @metrics.wraps(f"{BACKFILL_NAME}.lookup_event_single")
 def lookup_group_data_stacktrace_single(
-    project: Project, event_id: str, group_id: int, message: str, hash: str
+    project: Project, event_id: str, group_id: int, message: str
 ) -> tuple[CreateGroupingRecordData | None, str]:
     project_id = project.id
     try:
```
```diff
@@ -485,13 +490,20 @@ def lookup_group_data_stacktrace_single(
     with sentry_sdk.start_transaction(op="embeddings_grouping.get_latest_event"):
         grouping_info = get_grouping_info(None, project=project, event=event)
     stacktrace_string = get_stacktrace_string(grouping_info)
-    group_data = (
-        CreateGroupingRecordData(
-            group_id=group_id, hash=hash, project_id=project_id, message=message
+    primary_hash = event.get_primary_hash()
+    if not primary_hash:
+        group_data = None
+    else:
+        group_data = (
+            CreateGroupingRecordData(
+                group_id=group_id,
+                hash=primary_hash,
+                project_id=project_id,
+                message=message,
+            )
+            if stacktrace_string != ""
+            else None
         )
-        if stacktrace_string != ""
-        else None
-    )

     return (group_data, stacktrace_string)
```
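
Call sites simply drop the trailing hash argument; for example, mirroring the updated tests below (`project` and `event` assumed in scope):

```python
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
    project, event.event_id, event.group_id, event.group.message
)
# group_data is None when the event has no stacktrace string or no
# primary hash; stacktrace_string is "" in the former case.
```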

tests/sentry/tasks/test_backfill_seer_grouping_records.py

+10 −118
```diff
@@ -165,7 +165,7 @@ def test_lookup_group_data_stacktrace_single_success(self):
         event = self.event
         hash = self.group_hashes[event.group.id]
         group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-            self.project, event.event_id, event.group_id, event.group.message, hash
+            self.project, event.event_id, event.group_id, event.group.message
         )
         expected_group_data = CreateGroupingRecordData(
             group_id=event.group.id,
```
```diff
@@ -200,7 +200,6 @@ def test_lookup_group_data_stacktrace_single_exceptions(
             event.event_id,
             event.group_id,
             event.group.message,
-            self.group_hashes[event.group.id],
         )
         mock_logger.exception.assert_called_with(
             "tasks.backfill_seer_grouping_records.event_lookup_exception",
```
```diff
@@ -221,18 +220,16 @@ def test_lookup_group_data_stacktrace_single_not_stacktrace_grouping(self):
             project_id=self.project.id,
             assert_no_errors=False,
         )
-        hash = GroupHash.objects.get(group_id=event.group.id)
         group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-            self.project, event.event_id, event.group_id, event.group.message, hash
+            self.project, event.event_id, event.group_id, event.group.message
         )
         assert (group_data, stacktrace_string) == (None, "")

     def test_lookup_group_data_stacktrace_single_no_stacktrace(self):
         """Test that no data is returned if the event has no stacktrace"""
         event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
-        hash = GroupHash.objects.get(group_id=event.group.id)
         group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-            self.project, event.event_id, event.group_id, event.group.message, hash
+            self.project, event.event_id, event.group_id, event.group.message
         )
         assert (group_data, stacktrace_string) == (None, "")

```
```diff
@@ -244,7 +241,7 @@ def test_lookup_group_data_stacktrace_bulk_success(self, mock_metrics):
             bulk_event_ids,
             invalid_event_ids,
             bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
+        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)

         expected_event_ids = {event.event_id for event in events}
         expected_group_data = [
```
```diff
@@ -287,7 +284,7 @@ def test_lookup_group_data_stacktrace_bulk_exceptions(
         for exception in exceptions:
             mock_get_multi.side_effect = exception
             with pytest.raises(Exception):
-                lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
+                lookup_group_data_stacktrace_bulk(self.project, rows, messages)
             mock_logger.exception.assert_called_with(
                 "tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
                 extra={
```
323320
bulk_event_ids,
324321
invalid_event_ids,
325322
bulk_group_data_stacktraces,
326-
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
323+
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
327324
expected_group_data = [
328325
CreateGroupingRecordData(
329326
group_id=event.group.id,
```diff
@@ -363,46 +360,7 @@ def test_lookup_group_data_stacktrace_bulk_no_stacktrace_exception(self):
             bulk_event_ids,
             invalid_event_ids,
             bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
-        expected_group_data = [
-            CreateGroupingRecordData(
-                group_id=event.group.id,
-                hash=hashes[event.group.id],
-                project_id=self.project.id,
-                message=event.group.message,
-            )
-            for event in events
-        ]
-        expected_stacktraces = [
-            f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
-            for i in range(2)
-        ]
-        assert bulk_event_ids == {event.event_id for event in events}
-        assert invalid_event_ids == {event.event_id}
-        assert bulk_group_data_stacktraces["data"] == expected_group_data
-        assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
-
-    def test_lookup_group_data_stacktrace_bulk_no_hash(self):
-        """
-        Test that if a group does not have a hash (for whatever reason), its data is not included
-        in the bulk lookup result
-        """
-        # Use 2 events
-        rows, events, messages, hashes = self.bulk_rows[:2], self.bulk_events[:2], {}, {}
-        group_ids = [row["group_id"] for row in rows]
-        for group_id in group_ids:
-            messages.update({group_id: self.bulk_messages[group_id]})
-            hashes.update({group_id: self.group_hashes[group_id]})
-        # Create one event with no hash
-        event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
-        rows.append({"event_id": event.event_id, "group_id": event.group_id})
-        messages.update({event.group_id: event.group.message})
-
-        (
-            bulk_event_ids,
-            invalid_event_ids,
-            bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
+        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
         expected_group_data = [
             CreateGroupingRecordData(
                 group_id=event.group.id,
```
```diff
@@ -430,7 +388,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_success(self):
             self.group_hashes,
         )
         bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes
+            self.project, rows, messages
         )

         expected_group_data = [
```
```diff
@@ -480,7 +438,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(

         rows, messages, hashes = self.bulk_rows, self.bulk_messages, self.group_hashes
         bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes=hashes
+            self.project, rows, messages
         )

         events = self.bulk_events
```
```diff
@@ -500,72 +458,6 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(
         assert bulk_group_data_stacktraces["data"] == expected_group_data
         assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces

-    @patch("sentry.tasks.backfill_seer_grouping_records.logger")
-    @patch("sentry.tasks.backfill_seer_grouping_records.lookup_group_data_stacktrace_bulk")
-    def test_lookup_group_data_stacktrace_bulk_with_fallback_no_hash(
-        self, mock_lookup_group_data_stacktrace_bulk, mock_logger
-    ):
-        """
-        Test that if a group does not have a hash (for whatever reason), we do not attempt the
-        fallback and we log it
-        """
-        # Purposely exclude one event from being included in the bulk lookup response, so that the fallback is used
-        events_missing = self.bulk_events[:-1]
-        group_data, stacktrace_strings = [], []
-        for event in events_missing:
-            grouping_info = get_grouping_info(None, project=self.project, event=event)
-            stacktrace_string = get_stacktrace_string(grouping_info)
-            group_data.append(
-                CreateGroupingRecordData(
-                    group_id=event.group.id,
-                    hash=self.group_hashes[event.group.id],
-                    project_id=self.project.id,
-                    message=event.group.message,
-                )
-            )
-            stacktrace_strings.append(stacktrace_string)
-        mock_lookup_group_data_stacktrace_bulk.return_value = (
-            {event.event_id for event in events_missing},
-            set(),
-            GroupStacktraceData(data=group_data, stacktrace_list=stacktrace_strings),
-        )
-
-        # Purposely remove the hash for the missing event
-        hashes = copy.deepcopy(self.group_hashes)
-        del hashes[self.bulk_events[-1].group.id]
-
-        rows, messages = self.bulk_rows, self.bulk_messages
-        bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes=hashes
-        )
-
-        events = self.bulk_events[:-1]
-        expected_group_data = [
-            CreateGroupingRecordData(
-                group_id=event.group.id,
-                hash=hashes[event.group.id],
-                project_id=self.project.id,
-                message=event.group.message,
-            )
-            for event in events
-        ]
-        expected_stacktraces = [
-            f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
-            for i in range(4)
-        ]
-        assert bulk_group_data_stacktraces["data"] == expected_group_data
-        assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
-        assert bulk_group_data_stacktraces["data"] == expected_group_data
-        assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
-        mock_logger.exception.assert_called_with(
-            "tasks.backfill_seer_grouping_records.no_group_hash",
-            extra={
-                "organization_id": self.project.organization.id,
-                "project_id": self.project.id,
-                "group_id": self.bulk_events[-1].group_id,
-            },
-        )
-
     @patch("sentry.tasks.backfill_seer_grouping_records.logger")
     def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self, mock_logger):
         """
```
```diff
@@ -581,7 +473,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self
         rows[-1]["event_id"] = 10000

         bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes
+            self.project, rows, messages
         )

         events = self.bulk_events[:-1]
```
