4 | 4 | from typing import Dict, Mapping, Optional, TypedDict
5 | 5 |
6 | 6 | import msgpack
| 7 | +import sentry_sdk
7 | 8 | from arroyo.backends.kafka.consumer import KafkaPayload
8 | 9 | from arroyo.processing.strategies.abstract import ProcessingStrategy, ProcessingStrategyFactory
9 | 10 | from arroyo.processing.strategies.commit import CommitOffsets
10 | 11 | from arroyo.processing.strategies.run_task import RunTask
11 | | -from arroyo.types import Commit, Message, Partition
| 12 | +from arroyo.types import BrokerValue, Commit, Message, Partition
12 | 13 | from django.conf import settings
13 | 14 | from django.db import router, transaction
14 | 15 | from django.utils.text import slugify

37 | 38 |     valid_duration,
38 | 39 | )
39 | 40 | from sentry.monitors.validators import ConfigValidator, MonitorCheckInValidator
40 | | -from sentry.utils import json, metrics
| 41 | +from sentry.utils import json, metrics, redis
41 | 42 | from sentry.utils.dates import to_datetime
42 | 43 | from sentry.utils.locking import UnableToAcquireLock
43 | 44 | from sentry.utils.locking.manager import LockManager

50 | 51 | CHECKIN_QUOTA_LIMIT = 5
51 | 52 | CHECKIN_QUOTA_WINDOW = 60
52 | 53 |
| 54 | +# This key is used when SENTRY_MONITORS_HIGH_VOLUME_MODE is enabled and we
| 55 | +# trigger the monitor tasks as a side-effect of check-ins coming in. It is
| 56 | +# used to store the last timestamp that the tasks were triggered.
| 57 | +HIGH_VOLUME_LAST_TRIGGER_TS_KEY = "sentry.monitors.last_tasks_ts"
| 58 | +
53 | 59 |
54 | 60 | class CheckinMessage(TypedDict):
55 | 61 |     payload: str

@@ -140,7 +146,74 @@ def _ensure_monitor_with_config(
140 | 146 |     return monitor
141 | 147 |
142 | 148 |
143 | | -def _process_message(wrapper: CheckinMessage) -> None:
| 149 | +def _dispatch_tasks(ts: datetime):
| 150 | +    # For now this does nothing. We want to validate that we won't be
| 151 | +    # skipping any check-ins before we enable these tasks.
| 152 | +    return
| 153 | +
| 154 | +    # check_missing.delay(current_datetime=ts)
| 155 | +    # check_timeout.delay(current_datetime=ts)
| 156 | +
| 157 | +
| 158 | +def _handle_clock_pulse_task_trigger(ts: datetime):
| 159 | +    """
| 160 | +    Handles clock pulse messages. These pulses are generated by the
| 161 | +    `sentry.monitors.tasks.clock_pulse` task, which runs every minute. Clock
| 162 | +    pulses will NOT be generated when SENTRY_MONITORS_HIGH_VOLUME_MODE is
| 163 | +    enabled.
| 164 | +
| 165 | +    This function is responsible for dispatching the missed check-in and
| 166 | +    timed-out check-in detection tasks.
| 167 | +    """
| 168 | +    _dispatch_tasks(ts)
| 169 | +
| 170 | +
| 171 | +def _try_handle_high_volume_task_trigger(ts: datetime):
| 172 | +    """
| 173 | +    When SENTRY_MONITORS_HIGH_VOLUME_MODE is enabled we use each check-in
| 174 | +    message as a pseudo clock.
| 175 | +    """
| 176 | +    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
| 177 | +
| 178 | +    # Trim the seconds off the timestamp; these tasks run once per minute
| 179 | +    # and should have their timestamp clamped to the minute boundary.
| 180 | +    reference_datetime = ts.replace(second=0, microsecond=0)
| 181 | +    reference_ts = int(reference_datetime.timestamp())
| 182 | +
| 183 | +    last_ts = redis_client.get(HIGH_VOLUME_LAST_TRIGGER_TS_KEY)
| 184 | +    if last_ts is not None:
| 185 | +        last_ts = int(last_ts)
| 186 | +
| 187 | +    # Do nothing until the message we process crosses a minute boundary
| 188 | +    if last_ts == reference_ts:
| 189 | +        return
| 190 | +
| 191 | +    try:
| 192 | +        lock = locks.get("sentry.monitors.task_trigger", duration=5)
| 193 | +        with lock.acquire():
| 194 | +            # If more than exactly one minute has passed then we've skipped
| 195 | +            # a task run. Report that to Sentry; it is a problem.
| 196 | +            if last_ts is not None and last_ts + 60 != reference_ts:
| 197 | +                with sentry_sdk.push_scope() as scope:
| 198 | +                    scope.set_extra("last_ts", last_ts)
| 199 | +                    scope.set_extra("reference_ts", reference_ts)
| 200 | +                    sentry_sdk.capture_message("Monitor task dispatch minute skipped")
| 201 | +
| 202 | +            _dispatch_tasks(ts)
| 203 | +            metrics.incr("monitors.tasks.triggered_via_high_volume_clock")
| 204 | +            redis_client.set(HIGH_VOLUME_LAST_TRIGGER_TS_KEY, reference_ts)
| 205 | +    except UnableToAcquireLock:
| 206 | +        # Another message processor is handling this. Nothing to do.
| 207 | +        pass
| 208 | +
| 209 | +
| 210 | +def _process_message(ts: datetime, wrapper: CheckinMessage) -> None:
| 211 | +    # When running in high volume mode we will not consume clock pulses (the
| 212 | +    # clock_pulse task is not enabled). Instead we use each check-in message
| 213 | +    # as a means for triggering our tasks.
| 214 | +    if settings.SENTRY_MONITORS_HIGH_VOLUME_MODE:
| 215 | +        _try_handle_high_volume_task_trigger(ts)
| 216 | +
144 | 217 |     params: CheckinPayload = json.loads(wrapper["payload"])
145 | 218 |     start_time = to_datetime(float(wrapper["start_time"]))
146 | 219 |     project_id = int(wrapper["project_id"])
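
To make the skip detection above concrete: both `last_ts` and `reference_ts` are minute-aligned, so consecutive dispatches must differ by exactly 60 seconds. A short sketch with hypothetical values:

```python
# Hypothetical minute-aligned timestamps, as stored in Redis above.
last_ts = 1681735800       # minute of the previous dispatch
reference_ts = 1681735920  # two minutes later: one dispatch was skipped

# Mirrors the check in _try_handle_high_volume_task_trigger: any gap
# other than exactly +60 seconds means a minute went unhandled.
if last_ts + 60 != reference_ts:
    print("Monitor task dispatch minute skipped")
```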

@@ -441,9 +514,10 @@ def create_with_partitions(
441 | 514 |         partitions: Mapping[Partition, int],
442 | 515 |     ) -> ProcessingStrategy[KafkaPayload]:
443 | 516 |         def process_message(message: Message[KafkaPayload]) -> None:
| 517 | +            assert isinstance(message.value, BrokerValue)
444 | 518 |             try:
445 | 519 |                 wrapper = msgpack.unpackb(message.payload.value)
446 | | -                _process_message(wrapper)
| 520 | +                _process_message(message.value.timestamp, wrapper)
447 | 521 |             except Exception:
448 | 522 |                 logger.exception("Failed to process message payload")
449 | 523 |