Skip to content

Commit 46db7b3

Browse files
authored
feat(grouping): Add hashing_metadata field to GroupHashMetadata table (#80531)
This adds a new field, `hashing_metadata`, to the `GroupHashMetadata` table, to serve as a complement to the `hash_basis` field added in #79835. Whereas that field stores the overall grouping method (stacktrace, message, custom fingerprint, etc.), this new field will store more detailed, grouping-method specific data. For example, when grouping on message, it will store whether or not the message was parameterized; when grouping on fingerprint, it will store the source of the fingerprint; when grouping on stacktrace, it will note whether the stacktrace was found in an exception or in a thread. (The full scope of the data stored for each grouping method can be found in the `XYZHashingMetadata` types added in this PR.) Code to add data to this field is included in #80534.
1 parent 4441e5d commit 46db7b3

File tree

4 files changed

+202
-1
lines changed

4 files changed

+202
-1
lines changed

migrations_lockfile.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
1010
nodestore: 0002_nodestore_no_dictfield
1111
remote_subscriptions: 0003_drop_remote_subscription
1212
replays: 0004_index_together
13-
sentry: 0790_delete_dashboard_perms_col
13+
sentry: 0791_add_hashing_metadata_to_grouphash_metadata
1414
social_auth: 0002_default_auto_field
1515
uptime: 0018_add_trace_sampling_field_to_uptime
1616
workflow_engine: 0012_data_source_type_change
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Generated by Django 5.1.1 on 2024-11-14 22:09
2+
3+
from django.db import migrations
4+
5+
import sentry.db.models.fields.jsonfield
6+
from sentry.new_migrations.migrations import CheckedMigration
7+
8+
9+
class Migration(CheckedMigration):
    """Add the nullable `hashing_metadata` JSON field to the `grouphashmetadata` model."""

    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("sentry", "0790_delete_dashboard_perms_col"),
    ]

    operations = [
        migrations.AddField(
            model_name="grouphashmetadata",
            name="hashing_metadata",
            field=sentry.db.models.fields.jsonfield.JSONField(null=True),
        ),
    ]

src/sentry/models/grouphashmetadata.py

+8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from sentry.db.models import Model, region_silo_model
66
from sentry.db.models.base import sane_repr
77
from sentry.db.models.fields.foreignkey import FlexibleForeignKey
8+
from sentry.db.models.fields.jsonfield import JSONField
9+
from sentry.types.grouphash_metadata import HashingMetadata
810

911

1012
# The overall grouping method used
@@ -56,6 +58,12 @@ class GroupHashMetadata(Model):
5658
latest_grouping_config = models.CharField(null=True)
5759
# The primary grouping method (message, stacktrace, fingerprint, etc.)
5860
hash_basis = models.CharField(choices=HashBasis, null=True)
61+
# Metadata about the inputs to the hashing process and the hashing process itself (what
62+
# fingerprinting rules were matched? did we parameterize the message? etc.). For the specific
63+
# data stored, see the class definitions of the `HashingMetadata` subtypes.
64+
hashing_metadata: models.Field[HashingMetadata | None, HashingMetadata | None] = JSONField(
65+
null=True
66+
)
5967

6068
# SEER
6169

+159
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
from __future__ import annotations
2+
3+
from typing import NotRequired, TypedDict
4+
5+
# NOTE: The structure in these metadata types is intentionally flat, to make it easier to query in
6+
# Redash or BigQuery, and they are all merged into a single flat JSON blob (which is then stored in
7+
# `GroupHashMetadata.hashing_metadata`). Therefore, if entries are added, they should be namespaced
8+
# according to their corresponding hash basis (so, for example, `fingerprint_source` and
9+
# `message_source`, rather than just `source`), both for clarity and to avoid collisions.
10+
11+
12+
class FingerprintHashingMetadata(TypedDict):
13+
"""
14+
Fingerprint data, gathered both during stand-alone custom/built-in fingerprinting and hybrid
15+
fingerprinting involving message, stacktrace, security, or template hashing
16+
"""
17+
18+
# The fingerprint value
19+
fingerprint: str
20+
# Either "client", "server_builtin_rule", or "server_custom_rule". (We don't have a "none of the
21+
# above" option here because we only record fingerprint metadata in cases where there's some
22+
# sort of custom fingerprint.)
23+
fingerprint_source: str
24+
# The fingerprint value set in the SDK, if anything other than ["{{ default }}"]. Note that just
25+
# because this is set doesn't mean we necessarily used it for grouping, since server-side rules
26+
# take precedence over client fingerprints. See `fingerprint_source` above.
27+
client_fingerprint: NotRequired[str]
28+
# The server-side rule applied, if any
29+
matched_fingerprinting_rule: NotRequired[str]
30+
# Whether or not a hybrid fingerprint (one involving both the signal value `{{ default }}` and a
31+
# custom value) was used. In that case, we group as we normally would, but then split the events
32+
# into more granular groups based on the custom value.
33+
is_hybrid_fingerprint: bool
34+
35+
36+
class MessageHashingMetadata(TypedDict):
    """
    Data gathered when an event is grouped by log message or error type and value
    """

    # Either "message" (from "message" or "logentry") or "exception" (error type and value, in cases
    # where there's no stacktrace)
    message_source: str
    # Whether we've done any parameterization of the message, such as replacing a number with "<int>"
    message_parameterized: bool
46+
47+
48+
class SaltedMessageHashingMetadata(MessageHashingMetadata, FingerprintHashingMetadata):
    """
    Data from message-based hybrid fingerprinting
    """
54+
55+
56+
class StacktraceHashingMetadata(TypedDict):
    """
    Data gathered when an event is grouped based on a stacktrace found in an exception, a thread, or
    directly in the event
    """

    # Either "in-app" or "system"
    stacktrace_type: str
    # Where in the event data the stacktrace was found - either "exception", "thread", or
    # "top-level"
    stacktrace_location: str
    # The number of stacktraces used for grouping (will be more than 1 in cases of chained
    # exceptions)
    num_stacktraces: int
70+
71+
72+
class SaltedStacktraceHashingMetadata(StacktraceHashingMetadata, FingerprintHashingMetadata):
    """
    Data from stacktrace-based hybrid fingerprinting
    """
78+
79+
80+
class SecurityHashingMetadata(TypedDict):
81+
"""
82+
Data gathered when grouping browser-based security (Content Security Policy, Certifcate
83+
Transparency, Online Certificate Status Protocol Stapling, or HTTP Public Key Pinning) reports
84+
"""
85+
86+
# Either "csp", "expect-ct", "expect-staple", or "hpkp"
87+
security_report_type: str
88+
# Domain name of the blocked address
89+
blocked_host: str
90+
# The CSP directive which was violated
91+
csp_directive: NotRequired[str]
92+
# In the case of a local `script-src` violation, whether it's an `unsafe-inline` or an
93+
# `unsafe-eval` violation
94+
csp_script_violation: NotRequired[str]
95+
96+
97+
class SaltedSecurityHashingMetadata(SecurityHashingMetadata, FingerprintHashingMetadata):
    """
    Data from security-report-based hybrid fingerprinting
    """
103+
104+
105+
class TemplateHashingMetadata(TypedDict):
106+
"""
107+
Data gathered when grouping errors generated by Django templates
108+
"""
109+
110+
# The name of the template with the invalid template variable
111+
template_name: NotRequired[str]
112+
# The text of the line in the template containing the invalid variable
113+
template_context_line: NotRequired[str]
114+
115+
116+
class SaltedTemplateHashingMetadata(TemplateHashingMetadata, FingerprintHashingMetadata):
    """
    Data from template-based hybrid fingerprinting
    """
122+
123+
124+
class ChecksumHashingMetadata(TypedDict):
125+
"""
126+
Data gathered when legacy checksum grouping (wherein a hash is provided directly in the event)
127+
is used
128+
"""
129+
130+
# The checksum used for grouping
131+
checksum: str
132+
# The incoming checksum value, if it was something other than a 32-digit hex value and we
133+
# therefore had to hash it before using it
134+
raw_checksum: NotRequired[str]
135+
136+
137+
class FallbackHashingMetadata(TypedDict):
    """
    Data gathered when no other grouping method produces results
    """

    # Whether we landed in the fallback because of a lack of data, because we had a stacktrace but
    # all frames were ignored, or some other reason
    fallback_reason: str
145+
146+
147+
# Union of every per-grouping-method metadata shape defined in this module.
HashingMetadata = (
    FingerprintHashingMetadata
    | MessageHashingMetadata
    | SaltedMessageHashingMetadata
    | StacktraceHashingMetadata
    | SaltedStacktraceHashingMetadata
    | SecurityHashingMetadata
    | SaltedSecurityHashingMetadata
    | TemplateHashingMetadata
    | SaltedTemplateHashingMetadata
    | ChecksumHashingMetadata
    | FallbackHashingMetadata
)

0 commit comments

Comments
 (0)