Skip to content

feat(grouping): Add hashing_metadata field to GroupHashMetadata table #80531

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
nodestore: 0002_nodestore_no_dictfield
remote_subscriptions: 0003_drop_remote_subscription
replays: 0004_index_together
sentry: 0790_delete_dashboard_perms_col
sentry: 0791_add_hashing_metadata_to_grouphash_metadata
social_auth: 0002_default_auto_field
uptime: 0017_unique_on_timeout
workflow_engine: 0012_data_source_type_change
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Generated by Django 5.1.1 on 2024-11-14 22:09

from django.db import migrations

import sentry.db.models.fields.jsonfield
from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("sentry", "0790_delete_dashboard_perms_col"),
    ]

    # Adds the nullable `hashing_metadata` JSON column to the `grouphashmetadata` model.
    operations = [
        migrations.AddField(
            model_name="grouphashmetadata",
            name="hashing_metadata",
            field=sentry.db.models.fields.jsonfield.JSONField(null=True),
        ),
    ]
8 changes: 8 additions & 0 deletions src/sentry/models/grouphashmetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from sentry.db.models import Model, region_silo_model
from sentry.db.models.base import sane_repr
from sentry.db.models.fields.foreignkey import FlexibleForeignKey
from sentry.db.models.fields.jsonfield import JSONField
from sentry.types.grouphash_metadata import HashingMetadata


# The overall grouping method used
Expand Down Expand Up @@ -56,6 +58,12 @@ class GroupHashMetadata(Model):
latest_grouping_config = models.CharField(null=True)
# The primary grouping method (message, stacktrace, fingerprint, etc.)
hash_basis = models.CharField(choices=HashBasis, null=True)
# Metadata about the inputs to the hashing process and the hashing process itself (what
# fingerprinting rules were matched? did we parameterize the message? etc.). For the specific
# data stored, see the class definitions of the `HashingMetadata` subtypes.
hashing_metadata: models.Field[HashingMetadata | None, HashingMetadata | None] = JSONField(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does this need to be repeated HashingMetadata | None, HashingMetadata | None?

Copy link
Member Author

@lobsterkatie lobsterkatie Nov 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

null=True
)

# SEER

Expand Down
159 changes: 159 additions & 0 deletions src/sentry/types/grouphash_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from __future__ import annotations

from typing import NotRequired, TypedDict

# NOTE: The structure in these metadata types is intentionally flat, to make it easier to query in
# Redash or BigQuery, and they are all merged into a single flat JSON blob (which is then stored in
# `GroupHashMetadata.hashing_metadata`). Therefore, if entries are added, they should be namespaced
# according to their corresponding hash basis (so, for example, `fingerprint_source` and
# `message_source`, rather than just `source`), both for clarity and to avoid collisions.


class FingerprintHashingMetadata(TypedDict):
"""
Fingerprint data, gathered both during stand-alone custom/built-in fingerprinting and hybrid
fingerprinting involving message, stacktrace, security, or template hashing
"""

# The fingerprint value
fingerprint: str
# Either "client", "server_builtin_rule", or "server_custom_rule". (We don't have a "none of the
# above" option here because we only record fingerprint metadata in cases where there's some
# sort of custom fingerprint.)
fingerprint_source: str
# The fingerprint value set in the SDK, if anything other than ["{{ default }}"]. Note that just
# because this is set doesn't mean we necessarily used it for grouping, since server-side rules
# take precedence over client fingerprints. See `fingerprint_source` above.
client_fingerprint: NotRequired[str]
# The server-side rule applied, if any
matched_fingerprinting_rule: NotRequired[str]
# Whether or not a hybrid fingerprint (one involving both the signal value `{{ default }}` and a
# custom value) was used. In that case, we group as we normally would, but then split the events
# into more granular groups based on the custom value.
is_hybrid_fingerprint: bool


class MessageHashingMetadata(TypedDict):
"""
Data gathered when an event is grouped by log message or error type and value
"""

# Either "message" (from "message" or "logentry") or "exception" (error type and value, in cases
# where there's no stacktrace)
message_source: str
# Whether we've done any parameterization of the message, such as replacing a number with "<int>"
message_parameterized: bool


class SaltedMessageHashingMetadata(MessageHashingMetadata, FingerprintHashingMetadata):
    """
    Data from message-based hybrid fingerprinting
    """


class StacktraceHashingMetadata(TypedDict):
"""
Data gathered when an event is grouped based on a stacktrace found in an exception, a thread, or
diretly in the event
"""

# Either "in-app" or "system"
stacktrace_type: str
# Where in the event data the stacktrace was found - either "exception", "thread", or
# "top-level"
stacktrace_location: str
# The number of stacktraces used for grouping (will be more than 1 in cases of chained
# exceptions)
num_stacktraces: int


class SaltedStacktraceHashingMetadata(StacktraceHashingMetadata, FingerprintHashingMetadata):
    """
    Data from stacktrace-based hybrid fingerprinting
    """


class SecurityHashingMetadata(TypedDict):
"""
Data gathered when grouping browser-based security (Content Security Policy, Certifcate
Transparency, Online Certificate Status Protocol Stapling, or HTTP Public Key Pinning) reports
"""

# Either "csp", "expect-ct", "expect-staple", or "hpkp"
security_report_type: str
# Domain name of the blocked address
blocked_host: str
# The CSP directive which was violated
csp_directive: NotRequired[str]
# In the case of a local `script-src` violation, whether it's an `unsafe-inline` or an
# `unsafe-eval` violation
csp_script_violation: NotRequired[str]


class SaltedSecurityHashingMetadata(SecurityHashingMetadata, FingerprintHashingMetadata):
    """
    Data from security-report-based hybrid fingerprinting
    """


class TemplateHashingMetadata(TypedDict):
"""
Data gathered when grouping errors generated by Django templates
"""

# The name of the template with the invalid template variable
template_name: NotRequired[str]
# The text of the line in the template containing the invalid variable
template_context_line: NotRequired[str]


class SaltedTemplateHashingMetadata(TemplateHashingMetadata, FingerprintHashingMetadata):
    """
    Data from template-based hybrid fingerprinting
    """


class ChecksumHashingMetadata(TypedDict):
"""
Data gathered when legacy checksum grouping (wherein a hash is provided directly in the event)
is used
"""

# The checksum used for grouping
checksum: str
# The incoming checksum value, if it was something other than a 32-digit hex value and we
# therefore had to hash it before using it
raw_checksum: NotRequired[str]


class FallbackHashingMetadata(TypedDict):
"""
Data gathered when no other grouping method produces results
"""

# Whether we landed in the fallback because of a lack of data, because we had a stacktrace but
# all frames were ignored, or some other reason
fallback_reason: str


# Union of every metadata shape defined in this module (plain and fingerprint-salted variants).
# This is the value type of the `GroupHashMetadata.hashing_metadata` JSON field.
HashingMetadata = (
    FingerprintHashingMetadata
    | MessageHashingMetadata
    | SaltedMessageHashingMetadata
    | StacktraceHashingMetadata
    | SaltedStacktraceHashingMetadata
    | SecurityHashingMetadata
    | SaltedSecurityHashingMetadata
    | TemplateHashingMetadata
    | SaltedTemplateHashingMetadata
    | ChecksumHashingMetadata
    | FallbackHashingMetadata
)
Loading