
Commit 93e5ea0

fix(crons): Enable clock tick partition syncing
This is a follow-up to GH-58003. The clock which dispatches tasks will now only tick forward once all partitions have been read up to the synchronized time.
1 parent f33a3f7 commit 93e5ea0
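The mechanism is visible in the diff below: each partition records how far it has read, and the dispatch clock only advances to the slowest of those timestamps. As a rough standalone sketch of the idea (not the actual Sentry implementation, which keeps these timestamps in the MONITOR_TASKS_PARTITION_CLOCKS redis sorted set and the last tick in redis as well), it amounts to:

```python
from datetime import datetime, timezone

# In-memory stand-ins for the redis state the real task keeps: a per-partition
# clock (MONITOR_TASKS_PARTITION_CLOCKS) and the last dispatched tick.
partition_clocks = {}
last_triggered = None


def record_and_maybe_tick(ts, partition, dispatch):
    """Record that `partition` has been read up to `ts`, and tick the shared
    clock forward only once the slowest partition has moved past the last tick."""
    global last_triggered

    # Remember the furthest point this partition has been read to.
    current = partition_clocks.get(partition, 0)
    partition_clocks[partition] = max(current, int(ts.timestamp()))

    # The clock can only move as far as the slowest known partition.
    slowest_part_ts = min(partition_clocks.values())

    # Nothing to do if the slowest partition hasn't passed the last tick yet.
    if last_triggered is not None and slowest_part_ts <= last_triggered:
        return

    last_triggered = slowest_part_ts
    dispatch(datetime.fromtimestamp(slowest_part_ts, tz=timezone.utc))


# Example: the clock does not tick to the next minute until partition 1,
# the slowest one, catches up.
if __name__ == "__main__":
    base = datetime(2023, 10, 13, 12, 0, tzinfo=timezone.utc)
    record_and_maybe_tick(base, 0, print)                    # ticks at 12:00
    record_and_maybe_tick(base, 1, print)                    # same minute, no new tick
    record_and_maybe_tick(base.replace(minute=1), 0, print)  # partition 1 still behind, no tick
    record_and_maybe_tick(base.replace(minute=1), 1, print)  # slowest caught up, ticks at 12:01
```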

File tree

2 files changed: +103 −36 lines
src/sentry/monitors/tasks.py

+14 −27

@@ -129,26 +129,23 @@ def try_monitor_tasks_trigger(ts: datetime, partition: int):
     # timestamp. Use `int()` to keep the timestamp (score) as an int
     slowest_part_ts = int(slowest_partitions[0][1])
 
-    # TODO(epurkhiser): The `slowest_part_ts` is going to become the
-    # reference_ts for the rest of this function. But we don't want to flip
-    # this over quite yet since we want to make sure this is working as
-    # expected.
-
     precheck_last_ts = redis_client.get(MONITOR_TASKS_LAST_TRIGGERED_KEY)
     if precheck_last_ts is not None:
         precheck_last_ts = int(precheck_last_ts)
 
-    # If we have the same or an older reference timestamp from the most recent
-    # tick there is nothing to do, we've already handled this tick.
+    # If we have the same or an older timestamp from the most recent tick there
+    # is nothing to do, we've already handled this tick.
     #
-    # The scenario where the reference_ts is older is likely due to a partition
-    # being slightly behind another partition that we've already read from
-    if precheck_last_ts is not None and precheck_last_ts >= reference_ts:
+    # The scenario where the slowest_part_ts is older may happen when our
+    # MONITOR_TASKS_PARTITION_CLOCKS set did not know about every partition the
+    # topic is responsible for. Older check-ins may be processed after newer
+    # ones in different topics. This should only happen if redis loses state.
+    if precheck_last_ts is not None and precheck_last_ts >= slowest_part_ts:
         return
 
     # GETSET is atomic. This is critical to avoid another consumer also
     # processing the same tick.
-    last_ts = redis_client.getset(MONITOR_TASKS_LAST_TRIGGERED_KEY, reference_ts)
+    last_ts = redis_client.getset(MONITOR_TASKS_LAST_TRIGGERED_KEY, slowest_part_ts)
     if last_ts is not None:
         last_ts = int(last_ts)
 
@@ -160,32 +157,22 @@ def try_monitor_tasks_trigger(ts: datetime, partition: int):
 
     # Track the delay from the true time, ideally this should be pretty
    # close, but in the case of a backlog, this will be much higher
-    total_delay = datetime.now().timestamp() - reference_ts
+    total_delay = datetime.now().timestamp() - slowest_part_ts
 
-    # TODO(epurkhiser): For now we will just log the slowest partition
-    # timestamp and in production we can validate the value moves forward
-    # correctly. It's likely this will be a minute behind the actual
-    # reference_ts since there will always be a slowest partition
-    logger.info(
-        "monitors.consumer.clock_tick_slowest_partition",
-        extra={"slowest_part_ts": slowest_part_ts},
-    )
+    tick = datetime.fromtimestamp(slowest_part_ts)
 
-    logger.info(
-        "monitors.consumer.clock_tick",
-        extra={"reference_datetime": str(reference_datetime)},
-    )
+    logger.info("monitors.consumer.clock_tick", extra={"reference_datetime": str(tick)})
     metrics.gauge("monitors.task.clock_delay", total_delay, sample_rate=1.0)
 
     # If more than exactly a minute has passed then we've skipped a
     # task run, report that to sentry, it is a problem.
-    if last_ts is not None and reference_ts > last_ts + 60:
+    if last_ts is not None and slowest_part_ts > last_ts + 60:
         with sentry_sdk.push_scope() as scope:
             scope.set_extra("last_ts", last_ts)
-            scope.set_extra("reference_ts", reference_ts)
+            scope.set_extra("slowest_part_ts", slowest_part_ts)
             sentry_sdk.capture_message("Monitor task dispatch minute skipped")
 
-    _dispatch_tasks(ts)
+    _dispatch_tasks(tick)
 
 
 @instrumented_task(name="sentry.monitors.tasks.clock_pulse", silo_mode=SiloMode.REGION)
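The precheck plus GETSET pair in the hunk above is what keeps multiple consumers from dispatching the same tick. A minimal, self-contained illustration of that pattern with redis-py, using a placeholder key name rather than Sentry's MONITOR_TASKS_LAST_TRIGGERED_KEY:

```python
import redis

# Placeholder key; the real task uses MONITOR_TASKS_LAST_TRIGGERED_KEY.
LAST_TICK_KEY = "example:crons:last_tick"

client = redis.StrictRedis()


def try_tick(slowest_part_ts: int) -> bool:
    """Return True if this consumer won the race to dispatch this tick."""
    # Cheap precheck: if the clock has already moved to (or past) this
    # timestamp, the tick was handled elsewhere.
    precheck = client.get(LAST_TICK_KEY)
    if precheck is not None and int(precheck) >= slowest_part_ts:
        return False

    # GETSET atomically swaps in the new timestamp and returns the previous
    # value, so only one consumer observes the transition for a given minute.
    last_ts = client.getset(LAST_TICK_KEY, slowest_part_ts)
    if last_ts is not None and int(last_ts) >= slowest_part_ts:
        # Another consumer advanced the clock between our precheck and swap.
        return False

    return True
```

The precheck alone would not be safe, since two consumers can both read a stale value; the atomic swap is what guarantees a single winner per tick.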

tests/sentry/monitors/test_tasks.py

+89 −9

@@ -949,6 +949,9 @@ def test_clock_pulse(checkin_producer_mock):
 def test_monitor_task_trigger(dispatch_tasks):
     now = datetime.now().replace(second=0, microsecond=0)
 
+    # Assumes a single partition for simplicity. Multi-partition cases are
+    # covered in later test cases.
+
     # First checkin triggers tasks
     try_monitor_tasks_trigger(ts=now, partition=0)
     assert dispatch_tasks.call_count == 1
@@ -982,21 +985,98 @@ def test_monitor_task_trigger_partition_desync(dispatch_tasks):
     """
     now = datetime.now().replace(second=0, microsecond=0)
 
-    # First message with timestamp just after the minute boundary
-    # triggers the task
+    # First message in partition 0 with timestamp just after the minute
+    # boundary triggers the task
     try_monitor_tasks_trigger(ts=now + timedelta(seconds=1), partition=0)
     assert dispatch_tasks.call_count == 1
 
-    # Second message has a timestamp just before the minute boundary,
-    # should not trigger anything since we've already ticked ahead of this
-    try_monitor_tasks_trigger(ts=now - timedelta(seconds=1), partition=0)
+    # Second message in partition 1 has a timestamp just before the minute
+    # boundary, should not trigger anything since we've already ticked ahead of
+    # this
+    try_monitor_tasks_trigger(ts=now - timedelta(seconds=1), partition=1)
     assert dispatch_tasks.call_count == 1
 
-    # Third message again just after the minute boundary does NOT trigger
-    # the task, we've already ticked at that time.
-    try_monitor_tasks_trigger(ts=now + timedelta(seconds=1), partition=0)
+    # Third message in partition 1 again just after the minute boundary does
+    # NOT trigger the task, we've already ticked at that time.
+    try_monitor_tasks_trigger(ts=now + timedelta(seconds=1), partition=1)
     assert dispatch_tasks.call_count == 1
 
-    # Fourth message moves past a new minute boundary, tick
+    # Next two messages in both partitions move the clock forward
     try_monitor_tasks_trigger(ts=now + timedelta(minutes=1, seconds=1), partition=0)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1, seconds=1), partition=1)
+    assert dispatch_tasks.call_count == 2
+
+
+@mock.patch("sentry.monitors.tasks._dispatch_tasks")
+def test_monitor_task_trigger_partition_sync(dispatch_tasks):
+    """
+    When the kafka topic has multiple partitions we want to only tick our clock
+    forward once all partitions have caught up. This test simulates that.
+    """
+    now = datetime.now().replace(second=0, microsecond=0)
+
+    # Tick for 4 partitions
+    try_monitor_tasks_trigger(ts=now, partition=0)
+    try_monitor_tasks_trigger(ts=now, partition=1)
+    try_monitor_tasks_trigger(ts=now, partition=2)
+    try_monitor_tasks_trigger(ts=now, partition=3)
+    assert dispatch_tasks.call_count == 1
+    assert dispatch_tasks.mock_calls[0] == mock.call(now)
+
+    # Tick forward 3 of the partitions, global clock does not tick
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=0)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=1)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=2)
+    assert dispatch_tasks.call_count == 1
+
+    # Slowest partition ticks forward, global clock ticks
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=3)
+    assert dispatch_tasks.call_count == 2
+    assert dispatch_tasks.mock_calls[1] == mock.call(now + timedelta(minutes=1))
+
+
+@mock.patch("sentry.monitors.tasks._dispatch_tasks")
+def test_monitor_task_trigger_partition_tick_skip(dispatch_tasks):
+    """
+    In a scenario where all partitions move multiple ticks past the slowest
+    partition we may end up skipping a tick.
+    """
+    now = datetime.now().replace(second=0, microsecond=0)
+
+    # Tick for 4 partitions
+    try_monitor_tasks_trigger(ts=now, partition=0)
+    try_monitor_tasks_trigger(ts=now, partition=1)
+    try_monitor_tasks_trigger(ts=now, partition=2)
+    try_monitor_tasks_trigger(ts=now, partition=3)
+    assert dispatch_tasks.call_count == 1
+    assert dispatch_tasks.mock_calls[0] == mock.call(now)
+
+    # Tick forward twice for 3 partitions
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=0)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=1)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=2)
+
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=2), partition=0)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=3), partition=1)
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=3), partition=2)
+    assert dispatch_tasks.call_count == 1
+
+    # Slowest partition catches up, but has a timestamp gap, capture the fact
+    # that we skipped a minute
+    with mock.patch("sentry_sdk.capture_message") as capture_message:
+        assert capture_message.call_count == 0
+        try_monitor_tasks_trigger(ts=now + timedelta(minutes=2), partition=3)
+        capture_message.assert_called_with("Monitor task dispatch minute skipped")
+
+    # XXX(epurkhiser): Another approach we could take here is to detect the
+    # skipped minute and generate a tick for that minute, since we know we've
+    # processed past that minute.
+    #
+    # This still could be a problem though since it may mean we will not
+    # produce missed check-ins since the monitor may have already checked-in
+    # after and moved the `next_checkin_latest` forward.
+    #
+    # In practice this should almost never happen since we have a high volume
+    # of check-ins.
+
     assert dispatch_tasks.call_count == 2
+    assert dispatch_tasks.mock_calls[1] == mock.call(now + timedelta(minutes=2))
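The XXX note in the last test describes an alternative to merely reporting a skipped minute: synthesize a tick for every minute between the previous tick and the new slowest-partition timestamp. A hedged sketch of that backfill idea (the `dispatch` callback and function name are stand-ins; this is not what the commit actually implements):

```python
from datetime import datetime, timezone


def dispatch_with_backfill(last_ts, slowest_part_ts, dispatch):
    """Dispatch a tick for every whole minute between the previous tick and
    the new slowest-partition timestamp so no minute is silently skipped."""
    if last_ts is None:
        dispatch(datetime.fromtimestamp(slowest_part_ts, tz=timezone.utc))
        return

    # Walk forward one minute at a time until the new timestamp is reached.
    ts = last_ts + 60
    while ts <= slowest_part_ts:
        dispatch(datetime.fromtimestamp(ts, tz=timezone.utc))
        ts += 60
```

As the note itself cautions, backfilled ticks could still produce incorrect missed check-ins when a monitor has already checked in and moved `next_checkin_latest` forward, which is why the commit only captures a message for now.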
