1
1
from __future__ import annotations
2
2
3
3
import logging
4
+ from typing import Any , cast
4
5
5
6
from sentry .eventstore .models import Event
6
- from sentry .grouping .variants import BaseVariant
7
+ from sentry .grouping .component import (
8
+ ChainedExceptionGroupingComponent ,
9
+ CSPGroupingComponent ,
10
+ ExceptionGroupingComponent ,
11
+ ExpectCTGroupingComponent ,
12
+ ExpectStapleGroupingComponent ,
13
+ HPKPGroupingComponent ,
14
+ MessageGroupingComponent ,
15
+ StacktraceGroupingComponent ,
16
+ TemplateGroupingComponent ,
17
+ ThreadsGroupingComponent ,
18
+ )
19
+ from sentry .grouping .variants import (
20
+ BaseVariant ,
21
+ ChecksumVariant ,
22
+ ComponentVariant ,
23
+ CustomFingerprintVariant ,
24
+ HashedChecksumVariant ,
25
+ SaltedComponentVariant ,
26
+ VariantsByDescriptor ,
27
+ )
7
28
from sentry .models .grouphash import GroupHash
8
29
from sentry .models .grouphashmetadata import GroupHashMetadata , HashBasis
9
30
from sentry .models .project import Project
31
+ from sentry .types .grouphash_metadata import (
32
+ ChecksumHashingMetadata ,
33
+ FallbackHashingMetadata ,
34
+ FingerprintHashingMetadata ,
35
+ HashingMetadata ,
36
+ MessageHashingMetadata ,
37
+ SecurityHashingMetadata ,
38
+ StacktraceHashingMetadata ,
39
+ TemplateHashingMetadata ,
40
+ )
10
41
11
42
logger = logging .getLogger (__name__ )
12
43
@@ -57,12 +88,13 @@ def create_or_update_grouphash_metadata(
57
88
# we'll have to override the metadata creation date for them.
58
89
59
90
if created :
60
- hash_basis = _get_hash_basis (event , project , variants )
91
+ hash_basis , hashing_metadata = get_hash_basis_and_metadata (event , project , variants )
61
92
62
93
GroupHashMetadata .objects .create (
63
94
grouphash = grouphash ,
64
95
latest_grouping_config = grouping_config ,
65
96
hash_basis = hash_basis ,
97
+ hashing_metadata = hashing_metadata ,
66
98
)
67
99
elif grouphash .metadata and grouphash .metadata .latest_grouping_config != grouping_config :
68
100
# Keep track of the most recent config which computed this hash, so that once a
@@ -71,8 +103,16 @@ def create_or_update_grouphash_metadata(
71
103
grouphash .metadata .update (latest_grouping_config = grouping_config )
72
104
73
105
74
- def _get_hash_basis (event : Event , project : Project , variants : dict [str , BaseVariant ]) -> HashBasis :
75
- main_variant = (
106
+ def get_hash_basis_and_metadata (
107
+ event : Event , project : Project , variants : dict [str , BaseVariant ]
108
+ ) -> tuple [HashBasis , HashingMetadata ]:
109
+ hashing_metadata : HashingMetadata = {}
110
+
111
+ # TODO: This (and `contributing_variant` below) are typed as `Any` so that we don't have to cast
112
+ # them to whatever specific subtypes of `BaseVariant` and `GroupingComponent` (respectively)
113
+ # each of the helper calls below requires. Casting once, to a type retrieved from a look-up,
114
+ # doesn't work, but maybe there's a better way?
115
+ contributing_variant : Any = (
76
116
variants ["app" ]
77
117
# TODO: We won't need this 'if' once we stop returning both app and system contributing
78
118
# variants
@@ -87,20 +127,246 @@ def _get_hash_basis(event: Event, project: Project, variants: dict[str, BaseVari
87
127
else [variant for variant in variants .values () if variant .contributes ][0 ]
88
128
)
89
129
)
130
+ contributing_component : Any = (
131
+ # There should only ever be a single contributing component here at the top level
132
+ [value for value in contributing_variant .component .values if value .contributes ][0 ]
133
+ if hasattr (contributing_variant , "component" )
134
+ else None
135
+ )
136
+
137
+ # Hybrid fingerprinting adds 'modified' to the beginning of the description of whatever method
138
+ # was used before the extra fingerprint was added. We classify events with hybrid fingerprints
139
+ # by the `{{ default }}` portion of their grouping, so strip the prefix before doing the
140
+ # look-up.
141
+ is_hybrid_fingerprint = contributing_variant .description .startswith ("modified" )
142
+ method_description = contributing_variant .description .replace ("modified " , "" )
90
143
91
144
try :
92
- hash_basis = GROUPING_METHODS_BY_DESCRIPTION [
93
- # Hybrid fingerprinting adds 'modified' to the beginning of the description of whatever
94
- # method was used beore the extra fingerprint was added, so strip that off before
95
- # looking it up
96
- main_variant .description .replace ("modified " , "" )
97
- ]
145
+ hash_basis = GROUPING_METHODS_BY_DESCRIPTION [method_description ]
98
146
except KeyError :
99
147
logger .exception (
100
148
"Encountered unknown grouping method '%s'." ,
101
- main_variant .description ,
149
+ contributing_variant .description ,
102
150
extra = {"project" : project .id , "event" : event .event_id },
103
151
)
104
- hash_basis = HashBasis .UNKNOWN
152
+ return (HashBasis .UNKNOWN , {})
153
+
154
+ # Gather different metadata depending on the grouping method
155
+
156
+ if hash_basis == HashBasis .STACKTRACE :
157
+ hashing_metadata = _get_stacktrace_hashing_metadata (
158
+ contributing_variant , contributing_component
159
+ )
160
+
161
+ elif hash_basis == HashBasis .MESSAGE :
162
+ hashing_metadata = _get_message_hashing_metadata (contributing_component )
163
+
164
+ elif hash_basis == HashBasis .FINGERPRINT :
165
+ hashing_metadata = _get_fingerprint_hashing_metadata (contributing_variant )
166
+
167
+ elif hash_basis == HashBasis .SECURITY_VIOLATION :
168
+ hashing_metadata = _get_security_hashing_metadata (contributing_component )
169
+
170
+ elif hash_basis == HashBasis .TEMPLATE :
171
+ hashing_metadata = _get_template_hashing_metadata (contributing_component )
172
+
173
+ elif hash_basis == HashBasis .CHECKSUM :
174
+ hashing_metadata = _get_checksum_hashing_metadata (contributing_variant )
175
+
176
+ elif hash_basis == HashBasis .FALLBACK :
177
+ hashing_metadata = _get_fallback_hashing_metadata (
178
+ # TODO: Once https://peps.python.org/pep-0728 is a thing (still in draft but
179
+ # theoretically on track for 3.14), we can mark `VariantsByDescriptor` as closed and
180
+ # annotate `variants` as a `VariantsByDescriptor` instance in the spot where it's created
181
+ # and in all of the spots where it gets passed function to function. (Without the
182
+ # closed-ness, the return values of `.items()` and `.values()` don't get typed as
183
+ # `BaseVariant`, so for now we need to keep `variants` typed as `dict[str, BaseVariant]`
184
+ # until we get here.)
185
+ cast (VariantsByDescriptor , variants )
186
+ )
187
+
188
+ if is_hybrid_fingerprint :
189
+ hashing_metadata .update (
190
+ _get_fingerprint_hashing_metadata (contributing_variant , is_hybrid = True )
191
+ )
192
+
193
+ return hash_basis , hashing_metadata
194
+
195
+
196
+ def _get_stacktrace_hashing_metadata (
197
+ contributing_variant : ComponentVariant ,
198
+ contributing_component : (
199
+ StacktraceGroupingComponent
200
+ | ExceptionGroupingComponent
201
+ | ChainedExceptionGroupingComponent
202
+ | ThreadsGroupingComponent
203
+ ),
204
+ ) -> StacktraceHashingMetadata :
205
+ return {
206
+ "stacktrace_type" : "in_app" if "in-app" in contributing_variant .description else "system" ,
207
+ "stacktrace_location" : (
208
+ "exception"
209
+ if "exception" in contributing_variant .description
210
+ else "thread" if "thread" in contributing_variant .description else "top-level"
211
+ ),
212
+ "num_stacktraces" : (
213
+ len (contributing_component .values )
214
+ if contributing_component .id == "chained-exception"
215
+ else 1
216
+ ),
217
+ }
218
+
219
+
220
+ def _get_message_hashing_metadata (
221
+ contributing_component : (
222
+ MessageGroupingComponent | ExceptionGroupingComponent | ChainedExceptionGroupingComponent
223
+ ),
224
+ ) -> MessageHashingMetadata :
225
+ # In the simplest case, we already have the component we need to check
226
+ if isinstance (contributing_component , MessageGroupingComponent ):
227
+ return {
228
+ "message_source" : "message" ,
229
+ "message_parameterized" : (
230
+ contributing_component .hint == "stripped event-specific values"
231
+ ),
232
+ }
233
+
234
+ # Otherwise, we have to look in the nested structure to figure out if the message was
235
+ # parameterized. If it's a single exception, we can just check its subcomponents directly, but
236
+ # if it's a chained exception we have to dig in an extra level, and look at the subcomponents of
237
+ # all of its children. (The subcomponents are things like stacktrace, error type, error value,
238
+ # etc.)
239
+ exceptions_to_check : list [ExceptionGroupingComponent ] = []
240
+ if isinstance (contributing_component , ChainedExceptionGroupingComponent ):
241
+ exceptions = contributing_component .values
242
+ exceptions_to_check = [exception for exception in exceptions if exception .contributes ]
243
+ else :
244
+ exception = contributing_component
245
+ exceptions_to_check = [exception ]
246
+
247
+ for exception in exceptions_to_check :
248
+ for subcomponent in exception .values :
249
+ if subcomponent .contributes and subcomponent .hint == "stripped event-specific values" :
250
+ return {"message_source" : "exception" , "message_parameterized" : True }
251
+
252
+ return {"message_source" : "exception" , "message_parameterized" : False }
253
+
254
+
255
+ def _get_fingerprint_hashing_metadata (
256
+ contributing_variant : CustomFingerprintVariant | SaltedComponentVariant , is_hybrid : bool = False
257
+ ) -> FingerprintHashingMetadata :
258
+ client_fingerprint = contributing_variant .info .get ("client_fingerprint" )
259
+ matched_rule = contributing_variant .info .get ("matched_rule" )
260
+
261
+ metadata : FingerprintHashingMetadata = {
262
+ # For simplicity, we stringify fingerprint values (which are always lists) to keep
263
+ # `hashing_metadata` a flat structure
264
+ "fingerprint" : str (contributing_variant .values ),
265
+ "fingerprint_source" : (
266
+ "client"
267
+ if not matched_rule
268
+ else (
269
+ "server_builtin_rule"
270
+ if contributing_variant .type == "built_in_fingerprint"
271
+ else "server_custom_rule"
272
+ )
273
+ ),
274
+ "is_hybrid_fingerprint" : is_hybrid ,
275
+ }
276
+
277
+ # Note that these two conditions are not mutually exclusive - you can set a fingerprint in the
278
+ # SDK and have your event match a server-based rule (in which case the latter will take
279
+ # precedence)
280
+ if matched_rule :
281
+ metadata ["matched_fingerprinting_rule" ] = matched_rule ["text" ]
282
+ if client_fingerprint :
283
+ metadata ["client_fingerprint" ] = str (client_fingerprint )
284
+
285
+ return metadata
286
+
287
+
288
+ def _get_security_hashing_metadata (
289
+ contributing_component : (
290
+ CSPGroupingComponent
291
+ | ExpectCTGroupingComponent
292
+ | ExpectStapleGroupingComponent
293
+ | HPKPGroupingComponent
294
+ ),
295
+ ) -> SecurityHashingMetadata :
296
+ subcomponents_by_id = {
297
+ subcomponent .id : subcomponent for subcomponent in contributing_component .values
298
+ }
299
+ blocked_host_key = "uri" if contributing_component .id == "csp" else "hostname"
300
+
301
+ metadata : SecurityHashingMetadata = {
302
+ "security_report_type" : contributing_component .id ,
303
+ # Having a string which includes the "this is a string" quotes is a *real* footgun in terms
304
+ # of querying, so strip those off before storing the value
305
+ "blocked_host" : subcomponents_by_id [blocked_host_key ].values [0 ].strip ("'" ),
306
+ }
307
+
308
+ if contributing_component .id == "csp" :
309
+ metadata ["csp_directive" ] = subcomponents_by_id ["salt" ].values [0 ]
310
+ if subcomponents_by_id ["violation" ].contributes :
311
+ metadata ["csp_script_violation" ] = subcomponents_by_id ["violation" ].values [0 ].strip ("'" )
312
+
313
+ return metadata
314
+
315
+
316
+ def _get_template_hashing_metadata (
317
+ contributing_component : TemplateGroupingComponent ,
318
+ ) -> TemplateHashingMetadata :
319
+ metadata : TemplateHashingMetadata = {}
320
+
321
+ subcomponents_by_id = {
322
+ subcomponent .id : subcomponent for subcomponent in contributing_component .values
323
+ }
324
+
325
+ if subcomponents_by_id ["filename" ].values :
326
+ metadata ["template_name" ] = subcomponents_by_id ["filename" ].values [0 ]
327
+ if subcomponents_by_id ["context-line" ].values :
328
+ metadata ["template_context_line" ] = subcomponents_by_id ["context-line" ].values [0 ]
329
+
330
+ return metadata
331
+
332
+
333
+ def _get_checksum_hashing_metadata (
334
+ contributing_variant : ChecksumVariant | HashedChecksumVariant ,
335
+ ) -> ChecksumHashingMetadata :
336
+ metadata : ChecksumHashingMetadata = {"checksum" : contributing_variant .checksum }
337
+
338
+ if isinstance (contributing_variant , HashedChecksumVariant ):
339
+ metadata ["raw_checksum" ] = contributing_variant .raw_checksum
340
+
341
+ return metadata
342
+
343
+
344
+ def _get_fallback_hashing_metadata (
345
+ variants : VariantsByDescriptor ,
346
+ ) -> FallbackHashingMetadata :
347
+ # TODO: All of the specific cases handled below relate to stacktrace frames. Let's how often we
348
+ # land in the `other` category and then we can decide how much further it's worthwhile to break
349
+ # it down.
350
+
351
+ if (
352
+ "app" in variants
353
+ and variants ["app" ].component .values [0 ].hint == "ignored because it contains no frames"
354
+ ):
355
+ reason = "no_frames"
356
+
357
+ elif (
358
+ "system" in variants
359
+ and variants ["system" ].component .values [0 ].hint
360
+ == "ignored because it contains no contributing frames"
361
+ ):
362
+ reason = "no_contributing_frames"
363
+
364
+ elif "system" in variants and "min-frames" in (
365
+ variants ["system" ].component .values [0 ].hint or ""
366
+ ):
367
+ reason = "insufficient_contributing_frames"
368
+
369
+ else :
370
+ reason = "other"
105
371
106
- return hash_basis
372
+ return { "fallback_reason" : reason }
0 commit comments