@@ -1,7 +1,9 @@
+ from __future__ import annotations
+
import logging
import uuid
from datetime import datetime, timedelta
- from typing import Dict, Mapping, Optional, TypedDict
+ from typing import Dict, Literal, Mapping, Optional, TypedDict

import msgpack
import sentry_sdk
@@ -51,19 +53,24 @@
CHECKIN_QUOTA_LIMIT = 5
CHECKIN_QUOTA_WINDOW = 60

- # This key is used when SENTRY_MONITORS_HIGH_VOLUME_MODE is enabled and we
- # trigger the monitor tasks as a side-effect of check-ins coming in. It is used
- # to store he last timestamp that the tasks were triggered.
- HIGH_VOLUME_LAST_TRIGGER_TS_KEY = "sentry.monitors.last_tasks_ts"
+ # This key is used to store the last timestamp that the tasks were triggered.
+ MONITOR_TASKS_LAST_TRIGGERED_KEY = "sentry.monitors.last_tasks_ts"


class CheckinMessage(TypedDict):
+     # TODO(epurkhiser): We should make this required and ensure the message
+     # produced by relay includes this message type
+     message_type: NotRequired[Literal["check_in"]]
    payload: str
-     start_time: str
+     start_time: float
    project_id: str
    sdk: str


+ class ClockPulseMessage(TypedDict):
+     message_type: Literal["clock_pulse"]
+
+
class CheckinTrace(TypedDict):
    trace_id: str

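For orientation, the sketch below (not part of this diff) shows how the two wrapper types can be told apart on message_type. It assumes NotRequired is imported from typing_extensions on Python versions before 3.11, and the is_clock_pulse helper is purely illustrative.

from typing import Literal, TypedDict, Union

from typing_extensions import NotRequired


class CheckinMessage(TypedDict):
    # Older producers (relay) may omit message_type, hence NotRequired.
    message_type: NotRequired[Literal["check_in"]]
    payload: str
    start_time: float
    project_id: str
    sdk: str


class ClockPulseMessage(TypedDict):
    message_type: Literal["clock_pulse"]


def is_clock_pulse(wrapper: Union[CheckinMessage, ClockPulseMessage]) -> bool:
    # Treat a missing message_type as a check-in, mirroring the defaulting
    # the consumer does further down in this change.
    return wrapper.get("message_type", "check_in") == "clock_pulse"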
@@ -147,6 +154,23 @@ def _ensure_monitor_with_config(


def _dispatch_tasks(ts: datetime):
+     """
+     Dispatch monitor tasks triggered by the consumer clock. These will run
+     after the MONITOR_TASK_DELAY (in seconds). This is to give some breathing
+     room for check-ins to start and not be EXACTLY on the minute.
+
+     These tasks are triggered via the consumer processing check-ins. This
+     allows the monitor tasks to be synchronized to any backlog of check-ins
+     that are being processed.
+
+     To ensure these tasks are always triggered there is an additional celery
+     beat task that produces a clock pulse message into the topic that can be
+     used to trigger these tasks when there is a low volume of check-ins. It
+     is, however, preferred to have a high volume of check-ins, so we do not
+     need to rely on celery beat, which in some cases may fail to trigger (such
+     as on sentry.io, where deploys restart the celery beat worker and it will
+     skip any tasks it missed)
+     """
    # For now we're going to have this do nothing. We want to validate that
    # we're not going to be skipping any check-ins
    return
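As a rough illustration of the fallback path described in the docstring above: a clock pulse is just another message on the monitoring topic, so the beat task only needs to produce a msgpack-encoded ClockPulseMessage. The snippet below is a sketch under that assumption, not code from this change.

import msgpack

# The beat task would produce something shaped like ClockPulseMessage above.
pulse = {"message_type": "clock_pulse"}
encoded = msgpack.packb(pulse)

# The consumer decodes it and treats it purely as a tick of the clock; there
# is no check-in payload to process.
wrapper = msgpack.unpackb(encoded)
assert wrapper["message_type"] == "clock_pulse"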
@@ -155,23 +179,9 @@ def _dispatch_tasks(ts: datetime):
    # check_timeout.delay(current_datetime=ts)


- def _handle_clock_pulse_task_trigger(ts: datetime):
+ def _try_monitor_tasks_trigger(ts: datetime):
    """
-     Handles clock pulse messages. These pulses are generated by the
-     `sentry.monitors.tasks.clock_pulse` tasks which runs every minute. Clock
-     pulses will NOT be generated when SENTRY_MONITORS_HIGH_VOLUME_MODE is
-     enabled.
-
-     This function is responsible for dispatching the missed check-in and timed
-     out check-in detection tasks.
-     """
-     _dispatch_tasks(ts)
-
-
- def _try_handle_high_volume_task_trigger(ts: datetime):
-     """
-     When SENTRY_MONITORS_HIGH_VOLUME_MODE is enabled we use each check-in
-     message as a pseudo clock.
+     Handles triggering the monitor tasks when we've rolled over the minute.
    """
    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
@@ -182,7 +192,7 @@ def _try_handle_high_volume_task_trigger(ts: datetime):

    # Since GETSET is atomic this acts as a guard against another consumer
    # picking up the minute rollover
-     last_ts = redis_client.getset(HIGH_VOLUME_LAST_TRIGGER_TS_KEY, reference_ts)
+     last_ts = redis_client.getset(MONITOR_TASKS_LAST_TRIGGERED_KEY, reference_ts)
    if last_ts is not None:
        last_ts = int(last_ts)

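For readers unfamiliar with the pattern, the standalone sketch below illustrates why the GETSET guard works. It uses a plain redis-py client and a hypothetical crossed_minute_boundary helper for illustration; the consumer itself goes through Sentry's redis_clusters helper.

from datetime import datetime

import redis

MONITOR_TASKS_LAST_TRIGGERED_KEY = "sentry.monitors.last_tasks_ts"


def crossed_minute_boundary(client: redis.Redis, ts: datetime) -> bool:
    # Clamp the check-in timestamp to the minute so every message within the
    # same minute maps to the same reference value.
    reference_ts = int(ts.replace(second=0, microsecond=0).timestamp())

    # GETSET atomically writes the new value and returns the previous one, so
    # only one consumer can observe a given minute rollover.
    last_ts = client.getset(MONITOR_TASKS_LAST_TRIGGERED_KEY, reference_ts)
    if last_ts is not None and int(last_ts) >= reference_ts:
        # Another consumer (or an earlier message) already covered this minute.
        return False
    return True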
@@ -194,8 +204,8 @@ def _try_handle_high_volume_task_trigger(ts: datetime):
    # close, but in the case of a backlog, this will be much higher
    total_delay = reference_ts - datetime.now().timestamp()

-     metrics.incr("monitors.task.triggered_via_high_volume_clock")
-     metrics.gauge("monitors.task.high_volume_clock_delay", total_delay)
+     metrics.incr("monitors.task.triggered")
+     metrics.gauge("monitors.task.clock_delay", total_delay)

    # If more than exactly a minute has passed then we've skipped a
    # task run, report that to sentry, it is a problem.
@@ -208,15 +218,21 @@ def _try_handle_high_volume_task_trigger(ts: datetime):
    _dispatch_tasks(ts)


- def _process_message(ts: datetime, wrapper: CheckinMessage) -> None:
-     # When running in high volume mode we will not consume clock pulses (The
-     # clock_pulse task is not enabled). Instead we use each check-in message as
-     # a means for triggering our tasks.
-     if settings.SENTRY_MONITORS_HIGH_VOLUME_MODE:
-         try:
-             _try_handle_high_volume_task_trigger(ts)
-         except Exception:
-             logger.exception("Failed try high-volume task trigger", exc_info=True)
+ def _process_message(ts: datetime, wrapper: CheckinMessage | ClockPulseMessage) -> None:
+     # XXX: Relay does not attach a message type; to properly discriminate the
+     # type we add it by default here. This can be removed once the message_type
+     # is guaranteed
+     if "message_type" not in wrapper:
+         wrapper["message_type"] = "check_in"
+
+     try:
+         _try_monitor_tasks_trigger(ts)
+     except Exception:
+         logger.exception("Failed to trigger monitor tasks", exc_info=True)
+
+     # Nothing else to do with clock pulses
+     if wrapper["message_type"] == "clock_pulse":
+         return

    params: CheckinPayload = json.loads(wrapper["payload"])
    start_time = to_datetime(float(wrapper["start_time"]))