Skip to content

Commit c1799d9

Browse files
authored
feat(grouping): Store grouping-method-specific metadata in GroupHashMetadata (#80534)
This is a follow-up to #80531 (which added a new `hashing_metaadata` field to the `GroupHashMetadata` table), adding code to actual store data in the new field.
1 parent f167f36 commit c1799d9

File tree

703 files changed

+4346
-712
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

703 files changed

+4346
-712
lines changed

src/sentry/grouping/ingest/grouphash_metadata.py

+279-13
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,43 @@
11
from __future__ import annotations
22

33
import logging
4+
from typing import Any, cast
45

56
from sentry.eventstore.models import Event
6-
from sentry.grouping.variants import BaseVariant
7+
from sentry.grouping.component import (
8+
ChainedExceptionGroupingComponent,
9+
CSPGroupingComponent,
10+
ExceptionGroupingComponent,
11+
ExpectCTGroupingComponent,
12+
ExpectStapleGroupingComponent,
13+
HPKPGroupingComponent,
14+
MessageGroupingComponent,
15+
StacktraceGroupingComponent,
16+
TemplateGroupingComponent,
17+
ThreadsGroupingComponent,
18+
)
19+
from sentry.grouping.variants import (
20+
BaseVariant,
21+
ChecksumVariant,
22+
ComponentVariant,
23+
CustomFingerprintVariant,
24+
HashedChecksumVariant,
25+
SaltedComponentVariant,
26+
VariantsByDescriptor,
27+
)
728
from sentry.models.grouphash import GroupHash
829
from sentry.models.grouphashmetadata import GroupHashMetadata, HashBasis
930
from sentry.models.project import Project
31+
from sentry.types.grouphash_metadata import (
32+
ChecksumHashingMetadata,
33+
FallbackHashingMetadata,
34+
FingerprintHashingMetadata,
35+
HashingMetadata,
36+
MessageHashingMetadata,
37+
SecurityHashingMetadata,
38+
StacktraceHashingMetadata,
39+
TemplateHashingMetadata,
40+
)
1041

1142
logger = logging.getLogger(__name__)
1243

@@ -57,12 +88,13 @@ def create_or_update_grouphash_metadata(
5788
# we'll have to override the metadata creation date for them.
5889

5990
if created:
60-
hash_basis = _get_hash_basis(event, project, variants)
91+
hash_basis, hashing_metadata = get_hash_basis_and_metadata(event, project, variants)
6192

6293
GroupHashMetadata.objects.create(
6394
grouphash=grouphash,
6495
latest_grouping_config=grouping_config,
6596
hash_basis=hash_basis,
97+
hashing_metadata=hashing_metadata,
6698
)
6799
elif grouphash.metadata and grouphash.metadata.latest_grouping_config != grouping_config:
68100
# Keep track of the most recent config which computed this hash, so that once a
@@ -71,8 +103,16 @@ def create_or_update_grouphash_metadata(
71103
grouphash.metadata.update(latest_grouping_config=grouping_config)
72104

73105

74-
def _get_hash_basis(event: Event, project: Project, variants: dict[str, BaseVariant]) -> HashBasis:
75-
main_variant = (
106+
def get_hash_basis_and_metadata(
107+
event: Event, project: Project, variants: dict[str, BaseVariant]
108+
) -> tuple[HashBasis, HashingMetadata]:
109+
hashing_metadata: HashingMetadata = {}
110+
111+
# TODO: This (and `contributing_variant` below) are typed as `Any` so that we don't have to cast
112+
# them to whatever specific subtypes of `BaseVariant` and `GroupingComponent` (respectively)
113+
# each of the helper calls below requires. Casting once, to a type retrieved from a look-up,
114+
# doesn't work, but maybe there's a better way?
115+
contributing_variant: Any = (
76116
variants["app"]
77117
# TODO: We won't need this 'if' once we stop returning both app and system contributing
78118
# variants
@@ -87,20 +127,246 @@ def _get_hash_basis(event: Event, project: Project, variants: dict[str, BaseVari
87127
else [variant for variant in variants.values() if variant.contributes][0]
88128
)
89129
)
130+
contributing_component: Any = (
131+
# There should only ever be a single contributing component here at the top level
132+
[value for value in contributing_variant.component.values if value.contributes][0]
133+
if hasattr(contributing_variant, "component")
134+
else None
135+
)
136+
137+
# Hybrid fingerprinting adds 'modified' to the beginning of the description of whatever method
138+
# was used before the extra fingerprint was added. We classify events with hybrid fingerprints
139+
# by the `{{ default }}` portion of their grouping, so strip the prefix before doing the
140+
# look-up.
141+
is_hybrid_fingerprint = contributing_variant.description.startswith("modified")
142+
method_description = contributing_variant.description.replace("modified ", "")
90143

91144
try:
92-
hash_basis = GROUPING_METHODS_BY_DESCRIPTION[
93-
# Hybrid fingerprinting adds 'modified' to the beginning of the description of whatever
94-
# method was used beore the extra fingerprint was added, so strip that off before
95-
# looking it up
96-
main_variant.description.replace("modified ", "")
97-
]
145+
hash_basis = GROUPING_METHODS_BY_DESCRIPTION[method_description]
98146
except KeyError:
99147
logger.exception(
100148
"Encountered unknown grouping method '%s'.",
101-
main_variant.description,
149+
contributing_variant.description,
102150
extra={"project": project.id, "event": event.event_id},
103151
)
104-
hash_basis = HashBasis.UNKNOWN
152+
return (HashBasis.UNKNOWN, {})
153+
154+
# Gather different metadata depending on the grouping method
155+
156+
if hash_basis == HashBasis.STACKTRACE:
157+
hashing_metadata = _get_stacktrace_hashing_metadata(
158+
contributing_variant, contributing_component
159+
)
160+
161+
elif hash_basis == HashBasis.MESSAGE:
162+
hashing_metadata = _get_message_hashing_metadata(contributing_component)
163+
164+
elif hash_basis == HashBasis.FINGERPRINT:
165+
hashing_metadata = _get_fingerprint_hashing_metadata(contributing_variant)
166+
167+
elif hash_basis == HashBasis.SECURITY_VIOLATION:
168+
hashing_metadata = _get_security_hashing_metadata(contributing_component)
169+
170+
elif hash_basis == HashBasis.TEMPLATE:
171+
hashing_metadata = _get_template_hashing_metadata(contributing_component)
172+
173+
elif hash_basis == HashBasis.CHECKSUM:
174+
hashing_metadata = _get_checksum_hashing_metadata(contributing_variant)
175+
176+
elif hash_basis == HashBasis.FALLBACK:
177+
hashing_metadata = _get_fallback_hashing_metadata(
178+
# TODO: Once https://peps.python.org/pep-0728 is a thing (still in draft but
179+
# theoretically on track for 3.14), we can mark `VariantsByDescriptor` as closed and
180+
# annotate `variants` as a `VariantsByDescriptor` instance in the spot where it's created
181+
# and in all of the spots where it gets passed function to function. (Without the
182+
# closed-ness, the return values of `.items()` and `.values()` don't get typed as
183+
# `BaseVariant`, so for now we need to keep `variants` typed as `dict[str, BaseVariant]`
184+
# until we get here.)
185+
cast(VariantsByDescriptor, variants)
186+
)
187+
188+
if is_hybrid_fingerprint:
189+
hashing_metadata.update(
190+
_get_fingerprint_hashing_metadata(contributing_variant, is_hybrid=True)
191+
)
192+
193+
return hash_basis, hashing_metadata
194+
195+
196+
def _get_stacktrace_hashing_metadata(
197+
contributing_variant: ComponentVariant,
198+
contributing_component: (
199+
StacktraceGroupingComponent
200+
| ExceptionGroupingComponent
201+
| ChainedExceptionGroupingComponent
202+
| ThreadsGroupingComponent
203+
),
204+
) -> StacktraceHashingMetadata:
205+
return {
206+
"stacktrace_type": "in_app" if "in-app" in contributing_variant.description else "system",
207+
"stacktrace_location": (
208+
"exception"
209+
if "exception" in contributing_variant.description
210+
else "thread" if "thread" in contributing_variant.description else "top-level"
211+
),
212+
"num_stacktraces": (
213+
len(contributing_component.values)
214+
if contributing_component.id == "chained-exception"
215+
else 1
216+
),
217+
}
218+
219+
220+
def _get_message_hashing_metadata(
221+
contributing_component: (
222+
MessageGroupingComponent | ExceptionGroupingComponent | ChainedExceptionGroupingComponent
223+
),
224+
) -> MessageHashingMetadata:
225+
# In the simplest case, we already have the component we need to check
226+
if isinstance(contributing_component, MessageGroupingComponent):
227+
return {
228+
"message_source": "message",
229+
"message_parameterized": (
230+
contributing_component.hint == "stripped event-specific values"
231+
),
232+
}
233+
234+
# Otherwise, we have to look in the nested structure to figure out if the message was
235+
# parameterized. If it's a single exception, we can just check its subcomponents directly, but
236+
# if it's a chained exception we have to dig in an extra level, and look at the subcomponents of
237+
# all of its children. (The subcomponents are things like stacktrace, error type, error value,
238+
# etc.)
239+
exceptions_to_check: list[ExceptionGroupingComponent] = []
240+
if isinstance(contributing_component, ChainedExceptionGroupingComponent):
241+
exceptions = contributing_component.values
242+
exceptions_to_check = [exception for exception in exceptions if exception.contributes]
243+
else:
244+
exception = contributing_component
245+
exceptions_to_check = [exception]
246+
247+
for exception in exceptions_to_check:
248+
for subcomponent in exception.values:
249+
if subcomponent.contributes and subcomponent.hint == "stripped event-specific values":
250+
return {"message_source": "exception", "message_parameterized": True}
251+
252+
return {"message_source": "exception", "message_parameterized": False}
253+
254+
255+
def _get_fingerprint_hashing_metadata(
256+
contributing_variant: CustomFingerprintVariant | SaltedComponentVariant, is_hybrid: bool = False
257+
) -> FingerprintHashingMetadata:
258+
client_fingerprint = contributing_variant.info.get("client_fingerprint")
259+
matched_rule = contributing_variant.info.get("matched_rule")
260+
261+
metadata: FingerprintHashingMetadata = {
262+
# For simplicity, we stringify fingerprint values (which are always lists) to keep
263+
# `hashing_metadata` a flat structure
264+
"fingerprint": str(contributing_variant.values),
265+
"fingerprint_source": (
266+
"client"
267+
if not matched_rule
268+
else (
269+
"server_builtin_rule"
270+
if contributing_variant.type == "built_in_fingerprint"
271+
else "server_custom_rule"
272+
)
273+
),
274+
"is_hybrid_fingerprint": is_hybrid,
275+
}
276+
277+
# Note that these two conditions are not mutually exclusive - you can set a fingerprint in the
278+
# SDK and have your event match a server-based rule (in which case the latter will take
279+
# precedence)
280+
if matched_rule:
281+
metadata["matched_fingerprinting_rule"] = matched_rule["text"]
282+
if client_fingerprint:
283+
metadata["client_fingerprint"] = str(client_fingerprint)
284+
285+
return metadata
286+
287+
288+
def _get_security_hashing_metadata(
289+
contributing_component: (
290+
CSPGroupingComponent
291+
| ExpectCTGroupingComponent
292+
| ExpectStapleGroupingComponent
293+
| HPKPGroupingComponent
294+
),
295+
) -> SecurityHashingMetadata:
296+
subcomponents_by_id = {
297+
subcomponent.id: subcomponent for subcomponent in contributing_component.values
298+
}
299+
blocked_host_key = "uri" if contributing_component.id == "csp" else "hostname"
300+
301+
metadata: SecurityHashingMetadata = {
302+
"security_report_type": contributing_component.id,
303+
# Having a string which includes the "this is a string" quotes is a *real* footgun in terms
304+
# of querying, so strip those off before storing the value
305+
"blocked_host": subcomponents_by_id[blocked_host_key].values[0].strip("'"),
306+
}
307+
308+
if contributing_component.id == "csp":
309+
metadata["csp_directive"] = subcomponents_by_id["salt"].values[0]
310+
if subcomponents_by_id["violation"].contributes:
311+
metadata["csp_script_violation"] = subcomponents_by_id["violation"].values[0].strip("'")
312+
313+
return metadata
314+
315+
316+
def _get_template_hashing_metadata(
317+
contributing_component: TemplateGroupingComponent,
318+
) -> TemplateHashingMetadata:
319+
metadata: TemplateHashingMetadata = {}
320+
321+
subcomponents_by_id = {
322+
subcomponent.id: subcomponent for subcomponent in contributing_component.values
323+
}
324+
325+
if subcomponents_by_id["filename"].values:
326+
metadata["template_name"] = subcomponents_by_id["filename"].values[0]
327+
if subcomponents_by_id["context-line"].values:
328+
metadata["template_context_line"] = subcomponents_by_id["context-line"].values[0]
329+
330+
return metadata
331+
332+
333+
def _get_checksum_hashing_metadata(
334+
contributing_variant: ChecksumVariant | HashedChecksumVariant,
335+
) -> ChecksumHashingMetadata:
336+
metadata: ChecksumHashingMetadata = {"checksum": contributing_variant.checksum}
337+
338+
if isinstance(contributing_variant, HashedChecksumVariant):
339+
metadata["raw_checksum"] = contributing_variant.raw_checksum
340+
341+
return metadata
342+
343+
344+
def _get_fallback_hashing_metadata(
345+
variants: VariantsByDescriptor,
346+
) -> FallbackHashingMetadata:
347+
# TODO: All of the specific cases handled below relate to stacktrace frames. Let's how often we
348+
# land in the `other` category and then we can decide how much further it's worthwhile to break
349+
# it down.
350+
351+
if (
352+
"app" in variants
353+
and variants["app"].component.values[0].hint == "ignored because it contains no frames"
354+
):
355+
reason = "no_frames"
356+
357+
elif (
358+
"system" in variants
359+
and variants["system"].component.values[0].hint
360+
== "ignored because it contains no contributing frames"
361+
):
362+
reason = "no_contributing_frames"
363+
364+
elif "system" in variants and "min-frames" in (
365+
variants["system"].component.values[0].hint or ""
366+
):
367+
reason = "insufficient_contributing_frames"
368+
369+
else:
370+
reason = "other"
105371

106-
return hash_basis
372+
return {"fallback_reason": reason}

src/sentry/grouping/variants.py

+13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
from typing import TypedDict
4+
35
from sentry.grouping.utils import hash_from_values, is_default_fingerprint_var
46
from sentry.types.misc import KeyedList
57

@@ -213,3 +215,14 @@ def _get_metadata_as_dict(self):
213215
rv = ComponentVariant._get_metadata_as_dict(self)
214216
rv.update(expose_fingerprint_dict(self.values, self.info))
215217
return rv
218+
219+
220+
class VariantsByDescriptor(TypedDict, total=False):
221+
system: ComponentVariant
222+
app: ComponentVariant
223+
custom_fingerprint: CustomFingerprintVariant
224+
built_in_fingerprint: BuiltInFingerprintVariant
225+
checksum: ChecksumVariant
226+
hashed_checksum: HashedChecksumVariant
227+
default: ComponentVariant
228+
fallback: FallbackVariant

tests/sentry/grouping/snapshots/test_grouphash_metadata/test_metadata_from_variants/legacy@2019_03_12/actix.pysnap

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
---
2-
created: '2024-10-28T14:02:48.633593+00:00'
2+
created: '2024-11-08T19:20:22.951550+00:00'
33
creator: sentry
44
source: tests/sentry/grouping/test_grouphash_metadata.py
55
---
66
hash_basis: stacktrace
7+
hashing_metadata: {
8+
"num_stacktraces": 1,
9+
"stacktrace_location": "exception",
10+
"stacktrace_type": "in_app"
11+
}
712
---
813
contributing variants:
914
app*

tests/sentry/grouping/snapshots/test_grouphash_metadata/test_metadata_from_variants/legacy@2019_03_12/android_anr.pysnap

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
---
2-
created: '2024-10-28T14:02:48.697072+00:00'
2+
created: '2024-11-08T19:20:23.312203+00:00'
33
creator: sentry
44
source: tests/sentry/grouping/test_grouphash_metadata.py
55
---
66
hash_basis: stacktrace
7+
hashing_metadata: {
8+
"num_stacktraces": 1,
9+
"stacktrace_location": "exception",
10+
"stacktrace_type": "system"
11+
}
712
---
813
contributing variants:
914
system*

0 commit comments

Comments
 (0)