1
+ from __future__ import annotations
2
+
1
3
import logging
2
4
import uuid
3
5
from datetime import datetime , timedelta
4
- from typing import Dict , Mapping , Optional , TypedDict
6
+ from typing import Dict , Literal , Mapping , Optional , TypedDict
5
7
6
8
import msgpack
7
9
import sentry_sdk
51
53
CHECKIN_QUOTA_LIMIT = 5
52
54
CHECKIN_QUOTA_WINDOW = 60
53
55
54
- # This key is used when SENTRY_MONITORS_HIGH_VOLUME_MODE is enabled and we
55
- # trigger the monitor tasks as a side-effect of check-ins coming in. It is used
56
- # to store he last timestamp that the tasks were triggered.
57
- HIGH_VOLUME_LAST_TRIGGER_TS_KEY = "sentry.monitors.last_tasks_ts"
56
+ # This key is used to store the last timestamp that the tasks were triggered.
57
+ MONITOR_TASKS_LAST_TRIGGERED_KEY = "sentry.monitors.last_tasks_ts"
58
58
59
59
60
60
class CheckinMessage (TypedDict ):
61
+ # TODO(epurkhiser): We should make this required and ensure the message
62
+ # produced by relay includes this message type
63
+ message_type : NotRequired [Literal ["check_in" ]]
61
64
payload : str
62
- start_time : str
65
+ start_time : float
63
66
project_id : str
64
67
sdk : str
65
68
66
69
70
+ class ClockPulseMessage (TypedDict ):
71
+ message_type : Literal ["clock_pulse" ]
72
+
73
+
67
74
class CheckinTrace (TypedDict ):
68
75
trace_id : str
69
76
@@ -147,6 +154,23 @@ def _ensure_monitor_with_config(
147
154
148
155
149
156
def _dispatch_tasks (ts : datetime ):
157
+ """
158
+ Dispatch monitor tasks triggered by the consumer clock. These will run
159
+ after the MONITOR_TASK_DELAY (in seconds). This is to give some breathing
160
+ room for check-ins to start and not be EXACTLY on the minute
161
+
162
+ These tasks are triggered via the consumer processing check-ins. This
163
+ allows the monitor tasks to be synchronized to any backlog of check-ins
164
+ that are being processed.
165
+
166
+ To ensure these tasks are always triggered there is an additional celery
167
+ beat task that produces a clock pulse message into the topic that can be
168
+ used to trigger these tasks when there is a low volume of check-ins. It is,
169
+ however, preferred to have a high volume of check-ins, so we do not need to
170
+ rely on celery beat, which in some cases may fail to trigger (such as in
171
+ sentry.io, when we deploy we restart the celery beat worker and it will
172
+ skip any tasks it missed)
173
+ """
150
174
# For now we're going to have this do nothing. We want to validate that
151
175
# we're not going to be skipping any check-ins
152
176
return
@@ -155,23 +179,9 @@ def _dispatch_tasks(ts: datetime):
155
179
# check_timeout.delay(current_datetime=ts)
156
180
157
181
158
- def _handle_clock_pulse_task_trigger (ts : datetime ):
182
+ def _try_monitor_tasks_trigger (ts : datetime ):
159
183
"""
160
- Handles clock pulse messages. These pulses are generated by the
161
- `sentry.monitors.tasks.clock_pulse` tasks which runs every minute. Clock
162
- pulses will NOT be generated when SENTRY_MONITORS_HIGH_VOLUME_MODE is
163
- enabled.
164
-
165
- This function is responsible for dispatching the missed check-in and timed
166
- out check-in detection tasks.
167
- """
168
- _dispatch_tasks (ts )
169
-
170
-
171
- def _try_handle_high_volume_task_trigger (ts : datetime ):
172
- """
173
- When SENTRY_MONITORS_HIGH_VOLUME_MODE is enabled we use each check-in
174
- message as a pseudo clock.
184
+ Handles triggering the monitor tasks when we've rolled over the minute.
175
185
"""
176
186
redis_client = redis .redis_clusters .get (settings .SENTRY_MONITORS_REDIS_CLUSTER )
177
187
@@ -180,7 +190,7 @@ def _try_handle_high_volume_task_trigger(ts: datetime):
180
190
reference_datetime = ts .replace (second = 0 , microsecond = 0 )
181
191
reference_ts = int (reference_datetime .timestamp ())
182
192
183
- precheck_last_ts = redis_client .get (HIGH_VOLUME_LAST_TRIGGER_TS_KEY )
193
+ precheck_last_ts = redis_client .get (MONITOR_TASKS_LAST_TRIGGERED_KEY )
184
194
if precheck_last_ts is not None :
185
195
precheck_last_ts = int (precheck_last_ts )
186
196
@@ -194,7 +204,7 @@ def _try_handle_high_volume_task_trigger(ts: datetime):
194
204
195
205
# GETSET is atomic. This is critical to avoid another consumer also
196
206
# processing the same tick.
197
- last_ts = redis_client .getset (HIGH_VOLUME_LAST_TRIGGER_TS_KEY , reference_ts )
207
+ last_ts = redis_client .getset (MONITOR_TASKS_LAST_TRIGGERED_KEY , reference_ts )
198
208
if last_ts is not None :
199
209
last_ts = int (last_ts )
200
210
@@ -212,7 +222,7 @@ def _try_handle_high_volume_task_trigger(ts: datetime):
212
222
"monitors.consumer.clock_tick" ,
213
223
extra = {"reference_datetime" : str (reference_datetime )},
214
224
)
215
- metrics .gauge ("monitors.task.high_volume_clock_delay " , total_delay , sample_rate = 1.0 )
225
+ metrics .gauge ("monitors.task.clock_delay " , total_delay , sample_rate = 1.0 )
216
226
217
227
# If more than exactly a minute has passed then we've skipped a
218
228
# task run, report that to sentry, it is a problem.
@@ -225,15 +235,21 @@ def _try_handle_high_volume_task_trigger(ts: datetime):
225
235
_dispatch_tasks (ts )
226
236
227
237
228
- def _process_message (ts : datetime , wrapper : CheckinMessage ) -> None :
229
- # When running in high volume mode we will not consume clock pulses (The
230
- # clock_pulse task is not enabled). Instead we use each check-in message as
231
- # a means for triggering our tasks.
232
- if settings .SENTRY_MONITORS_HIGH_VOLUME_MODE :
233
- try :
234
- _try_handle_high_volume_task_trigger (ts )
235
- except Exception :
236
- logger .exception ("Failed try high-volume task trigger" , exc_info = True )
238
+ def _process_message (ts : datetime , wrapper : CheckinMessage | ClockPulseMessage ) -> None :
239
+ # XXX: Relay does not attach a message type, to properly discriminate the
240
+ # type we add it by default here. This can be removed once the message_type
241
+ # is guaranteed
242
+ if "message_type" not in wrapper :
243
+ wrapper ["message_type" ] = "check_in"
244
+
245
+ try :
246
+ _try_monitor_tasks_trigger (ts )
247
+ except Exception :
248
+ logger .exception ("Failed to trigger monitor tasks" , exc_info = True )
249
+
250
+ # Nothing else to do with clock pulses
251
+ if wrapper ["message_type" ] == "clock_pulse" :
252
+ return
237
253
238
254
params : CheckinPayload = json .loads (wrapper ["payload" ])
239
255
start_time = to_datetime (float (wrapper ["start_time" ]))
0 commit comments