getsentry · evanpurkhiser · Oct 12, 2023 · Oct 12, 2023 · wedamija · Oct 12, 2023
@@ -570,15 +570,20 @@ def update_existing_check_in(
         logger.exception("Failed to process check-in", exc_info=True)
 
 
-def _process_message(ts: datetime, wrapper: CheckinMessage | ClockPulseMessage) -> None:
-    # XXX: Relay does not attach a message type, to properly discriminate
-    # we add it by default here. This can be removed once the message_type
-    # is guaranteed
+def _process_message(
+    ts: datetime,
+    partition: int,
+    wrapper: CheckinMessage | ClockPulseMessage,
+) -> None:
+
+    # XXX: Relay does not attach a message type, to properly discriminate the
+    # message_type we add it by default here. This can be removed once the
+    # message_type is guaranteed
     if "message_type" not in wrapper:
         wrapper["message_type"] = "check_in"
 
     try:
-        try_monitor_tasks_trigger(ts)
+        try_monitor_tasks_trigger(ts, partition)
     except Exception:
         logger.exception("Failed to trigger monitor tasks", exc_info=True)
 
@@ -608,7 +613,11 @@ def process_message(message: Message[KafkaPayload]) -> None:
             assert isinstance(message.value, BrokerValue)
             try:
                 wrapper = msgpack.unpackb(message.payload.value)
-                _process_message(message.value.timestamp, wrapper)
+                _process_message(
+                    message.value.timestamp,
+                    message.value.partition.index,
+                    wrapper,
+                )
             except Exception:
                 logger.exception("Failed to process message payload")
 

@@ -44,6 +44,9 @@
 # This key is used to store the last timestamp that the tasks were triggered.
 MONITOR_TASKS_LAST_TRIGGERED_KEY = "sentry.monitors.last_tasks_ts"
 
+# This key stores the sorted set of partition clocks (member: part-<id>, score: timestamp)
+MONITOR_TASKS_PARTITION_CLOCKS = "sentry.monitors.partition_clocks"
+
 
 def _get_producer() -> KafkaProducer:
     cluster_name = get_topic_definition(settings.KAFKA_INGEST_MONITORS)["cluster"]
@@ -90,10 +93,14 @@ def _dispatch_tasks(ts: datetime):
     check_timeout.delay(current_datetime=ts)
 
 
-def try_monitor_tasks_trigger(ts: datetime):
+def try_monitor_tasks_trigger(ts: datetime, partition: int):
     """
     Handles triggering the monitor tasks when we've rolled over the minute.
 
+    We keep a reference to the most recent timestamp for each partition and use
+    the slowest partition as our reference time. This ensures all partitions
+    have been synchronized before ticking our clock.
+
     This function is called by our consumer processor
     """
     redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
@@ -103,6 +110,30 @@ def try_monitor_tasks_trigger(ts: datetime):
     reference_datetime = ts.replace(second=0, microsecond=0)
     reference_ts = int(reference_datetime.timestamp())
 
+    # Store the current clock value for this partition.
+    redis_client.zadd(
+        name=MONITOR_TASKS_PARTITION_CLOCKS,
+        mapping={f"part-{partition}": reference_ts},
+    )
+
+    # Find the slowest partition from our sorted set of partitions, where the
+    # clock is the score.
+    slowest_partitions = redis_client.zrange(
+        name=MONITOR_TASKS_PARTITION_CLOCKS,
+        withscores=True,
+        start=0,
+        end=0,
+    )
+
+    # the first tuple is the slowest (part-<id>, score), the score is the
+    # timestamp. Use `int()` to keep the timestamp (score) as an int
+    slowest_part_ts = int(slowest_partitions[0][1])
+
+    # TODO(epurkhiser): The `slowest_part_ts` is going to become the
+    # reference_ts for the rest of this function. But we don't want to flip
+    # this over quite yet since we want to make sure this is working as
+    # expected.
+
     precheck_last_ts = redis_client.get(MONITOR_TASKS_LAST_TRIGGERED_KEY)
     if precheck_last_ts is not None:
         precheck_last_ts = int(precheck_last_ts)
@@ -131,6 +162,15 @@ def try_monitor_tasks_trigger(ts: datetime):
     # close, but in the case of a backlog, this will be much higher
     total_delay = datetime.now().timestamp() - reference_ts
 
+    # TODO(epurkhiser): For now we will just log the slowest partition
+    # timestamp and in production we can validate the value moves forward
+    # correctly. It's likely this will be a minute behind the actual
+    # reference_ts since there will always be a slowest partition
+    logger.info(
+        "monitors.consumer.clock_tick_slowest_partition",
+        extra={"slowest_part_ts": slowest_part_ts},
+    )
+
     logger.info(
         "monitors.consumer.clock_tick",
         extra={"reference_datetime": str(reference_datetime)},
@@ -160,7 +200,8 @@ def clock_pulse(current_datetime=None):
 
     if settings.SENTRY_EVENTSTREAM != "sentry.eventstream.kafka.KafkaEventStream":
         # Directly trigger try_monitor_tasks_trigger in dev
-        try_monitor_tasks_trigger(current_datetime)
+        for partition in _get_partitions().values():
+            try_monitor_tasks_trigger(current_datetime, partition.id)
         return
 
     message: ClockPulseMessage = {

@@ -950,25 +950,25 @@ def test_monitor_task_trigger(dispatch_tasks):
     now = datetime.now().replace(second=0, microsecond=0)
 
     # First checkin triggers tasks
-    try_monitor_tasks_trigger(ts=now)
+    try_monitor_tasks_trigger(ts=now, partition=0)
     assert dispatch_tasks.call_count == 1
 
     # 5 seconds later does NOT trigger the task
-    try_monitor_tasks_trigger(ts=now + timedelta(seconds=5))
+    try_monitor_tasks_trigger(ts=now + timedelta(seconds=5), partition=0)
     assert dispatch_tasks.call_count == 1
 
     # a minute later DOES trigger the task
-    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1))
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=0)
     assert dispatch_tasks.call_count == 2
 
     # Same time does NOT trigger the task
-    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1))
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1), partition=0)
     assert dispatch_tasks.call_count == 2
 
     # A skipped minute triggers the task AND captures an error
     with mock.patch("sentry_sdk.capture_message") as capture_message:
         assert capture_message.call_count == 0
-        try_monitor_tasks_trigger(ts=now + timedelta(minutes=3, seconds=5))
+        try_monitor_tasks_trigger(ts=now + timedelta(minutes=3, seconds=5), partition=0)
         assert dispatch_tasks.call_count == 3
         capture_message.assert_called_with("Monitor task dispatch minute skipped")
 
@@ -984,19 +984,19 @@ def test_monitor_task_trigger_partition_desync(dispatch_tasks):
 
     # First message with timestamp just after the minute boundary
     # triggers the task
-    try_monitor_tasks_trigger(ts=now + timedelta(seconds=1))
+    try_monitor_tasks_trigger(ts=now + timedelta(seconds=1), partition=0)
     assert dispatch_tasks.call_count == 1
 
     # Second message has a timestamp just before the minute boundary,
     # should not trigger anything since we've already ticked ahead of this
-    try_monitor_tasks_trigger(ts=now - timedelta(seconds=1))
+    try_monitor_tasks_trigger(ts=now - timedelta(seconds=1), partition=0)
     assert dispatch_tasks.call_count == 1
 
     # Third message again just after the minute boundary does NOT trigger
     # the task, we've already ticked at that time.
-    try_monitor_tasks_trigger(ts=now + timedelta(seconds=1))
+    try_monitor_tasks_trigger(ts=now + timedelta(seconds=1), partition=0)
     assert dispatch_tasks.call_count == 1
 
     # Fourth message moves past a new minute boundary, tick
-    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1, seconds=1))
+    try_monitor_tasks_trigger(ts=now + timedelta(minutes=1, seconds=1), partition=0)
     assert dispatch_tasks.call_count == 2