@@ -949,6 +949,9 @@ def test_clock_pulse(checkin_producer_mock):
949
949
def test_monitor_task_trigger (dispatch_tasks ):
950
950
now = datetime .now ().replace (second = 0 , microsecond = 0 )
951
951
952
+ # Assumes a single partition for simplicity. Multi-partition cases are
953
+ # covered in further test cases.
954
+
952
955
# First checkin triggers tasks
953
956
try_monitor_tasks_trigger (ts = now , partition = 0 )
954
957
assert dispatch_tasks .call_count == 1
@@ -982,21 +985,98 @@ def test_monitor_task_trigger_partition_desync(dispatch_tasks):
982
985
"""
983
986
now = datetime .now ().replace (second = 0 , microsecond = 0 )
984
987
985
- # First message with timestamp just after the minute boundary
986
- # triggers the task
988
+ # First message in partition 0 with timestamp just after the minute
989
+ # boundary triggers the task
987
990
try_monitor_tasks_trigger (ts = now + timedelta (seconds = 1 ), partition = 0 )
988
991
assert dispatch_tasks .call_count == 1
989
992
990
- # Second message has a timestamp just before the minute boundary,
991
- # should not trigger anything since we've already ticked ahead of this
992
- try_monitor_tasks_trigger (ts = now - timedelta (seconds = 1 ), partition = 0 )
993
+ # Second message in partition 1 has a timestamp just before the minute
994
+ # boundary, should not trigger anything since we've already ticked ahead of
995
+ # this
996
+ try_monitor_tasks_trigger (ts = now - timedelta (seconds = 1 ), partition = 1 )
993
997
assert dispatch_tasks .call_count == 1
994
998
995
- # Third message again just after the minute boundary does NOT trigger
996
- # the task, we've already ticked at that time.
997
- try_monitor_tasks_trigger (ts = now + timedelta (seconds = 1 ), partition = 0 )
999
+ # Third message in partition 1 again just after the minute boundary does
1000
+ # NOT trigger the task, we've already ticked at that time.
1001
+ try_monitor_tasks_trigger (ts = now + timedelta (seconds = 1 ), partition = 1 )
998
1002
assert dispatch_tasks .call_count == 1
999
1003
1000
- # Fourth message moves past a new minute boundary, tick
1004
+ # Next two messages in both partitions move the clock forward
1001
1005
try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 , seconds = 1 ), partition = 0 )
1006
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 , seconds = 1 ), partition = 1 )
1007
+ assert dispatch_tasks .call_count == 2
1008
+
1009
+
1010
+ @mock .patch ("sentry.monitors.tasks._dispatch_tasks" )
1011
+ def test_monitor_task_trigger_partition_sync (dispatch_tasks ):
1012
+ """
1013
+ When the kafka topic has multiple partitions we want to only tick our clock
1014
+ forward once all partitions have caught up. This test simulates that.
1015
+ """
1016
+ now = datetime .now ().replace (second = 0 , microsecond = 0 )
1017
+
1018
+ # Tick for 4 partitions
1019
+ try_monitor_tasks_trigger (ts = now , partition = 0 )
1020
+ try_monitor_tasks_trigger (ts = now , partition = 1 )
1021
+ try_monitor_tasks_trigger (ts = now , partition = 2 )
1022
+ try_monitor_tasks_trigger (ts = now , partition = 3 )
1023
+ assert dispatch_tasks .call_count == 1
1024
+ assert dispatch_tasks .mock_calls [0 ] == mock .call (now )
1025
+
1026
+ # Tick forward 3 of the partitions, global clock does not tick
1027
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 0 )
1028
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 1 )
1029
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 2 )
1030
+ assert dispatch_tasks .call_count == 1
1031
+
1032
+ # Slowest partition ticks forward, global clock ticks
1033
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 3 )
1034
+ assert dispatch_tasks .call_count == 2
1035
+ assert dispatch_tasks .mock_calls [1 ] == mock .call (now + timedelta (minutes = 1 ))
1036
+
1037
+
1038
+ @mock .patch ("sentry.monitors.tasks._dispatch_tasks" )
1039
+ def test_monitor_task_trigger_partition_tick_skip (dispatch_tasks ):
1040
+ """
1041
+ In a scenario where all partitions move multiple ticks past the slowest
1042
+ partition we may end up skipping a tick.
1043
+ """
1044
+ now = datetime .now ().replace (second = 0 , microsecond = 0 )
1045
+
1046
+ # Tick for 4 partitions
1047
+ try_monitor_tasks_trigger (ts = now , partition = 0 )
1048
+ try_monitor_tasks_trigger (ts = now , partition = 1 )
1049
+ try_monitor_tasks_trigger (ts = now , partition = 2 )
1050
+ try_monitor_tasks_trigger (ts = now , partition = 3 )
1051
+ assert dispatch_tasks .call_count == 1
1052
+ assert dispatch_tasks .mock_calls [0 ] == mock .call (now )
1053
+
1054
+ # Tick forward twice for 3 partitions
1055
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 0 )
1056
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 1 )
1057
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 1 ), partition = 2 )
1058
+
1059
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 2 ), partition = 0 )
1060
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 3 ), partition = 1 )
1061
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 3 ), partition = 2 )
1062
+ assert dispatch_tasks .call_count == 1
1063
+
1064
+ # Slowest partition catches up, but has a timestamp gap, capture the fact
1065
+ # that we skipped a minute
1066
+ with mock .patch ("sentry_sdk.capture_message" ) as capture_message :
1067
+ assert capture_message .call_count == 0
1068
+ try_monitor_tasks_trigger (ts = now + timedelta (minutes = 2 ), partition = 3 )
1069
+ capture_message .assert_called_with ("Monitor task dispatch minute skipped" )
1070
+
1071
+ # XXX(epurkhiser): Another approach we could take here is to detect the
1072
+ # skipped minute and generate a tick for that minute, since we know we
1073
+ # processed past that minute.
1074
+ #
1075
+ # This still could be a problem though since it may mean we will not
1076
+ # produce missed check-ins since the monitor may have already
1077
+ # checked-in after and moved the `next_checkin_latest` forward.
1078
+ #
1079
+ # In practice this should almost never happen since we have a high volume of check-ins.
1080
+
1002
1081
assert dispatch_tasks .call_count == 2
1082
+ assert dispatch_tasks .mock_calls [1 ] == mock .call (now + timedelta (minutes = 2 ))
0 commit comments