Skip to content

feat(crons): Use timeout_at for timed out check-ins #52570

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions src/sentry/monitors/tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
from datetime import timedelta

from django.utils import timezone

Expand Down Expand Up @@ -92,21 +91,13 @@ def check_monitors(current_datetime=None):
except Exception:
logger.exception("Exception in check_monitors - mark missed")

qs = MonitorCheckIn.objects.filter(status=CheckInStatus.IN_PROGRESS).select_related(
"monitor", "monitor_environment"
)[:CHECKINS_LIMIT]
qs = MonitorCheckIn.objects.filter(
status=CheckInStatus.IN_PROGRESS, timeout_at__lte=current_datetime
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be worth changing the query to
Q(timeout_at__lte=current_datetime) | Q(timeout_at=None)
as a safety precaution? we don't have any null values currently but would be a good protection in case there's a regression so we don't ignore the null timeouts. could also put it in a separate query

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. Maybe we can add a metric that we put an alert on?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll run a query and raise an exception at the bottom as a safety check

).select_related("monitor", "monitor_environment")[:CHECKINS_LIMIT]
metrics.gauge("sentry.monitors.tasks.check_monitors.timeout_count", qs.count())
# check for any monitors which are still running and have exceeded their maximum runtime
for checkin in qs:
try:
timeout = timedelta(
minutes=(checkin.monitor.config or {}).get("max_runtime") or TIMEOUT
)
# Check against date_updated to allow monitors to run for longer as
# long as they continue to send heart beats updating the checkin
if checkin.date_updated > current_datetime - timeout:
continue

monitor_environment = checkin.monitor_environment
logger.info(
"monitor_environment.checkin-timeout",
Expand Down
8 changes: 8 additions & 0 deletions tests/sentry/monitors/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def test_timeout_with_no_future_complete_checkin(self):
status=CheckInStatus.IN_PROGRESS,
date_added=check_in_24hr_ago,
date_updated=check_in_24hr_ago,
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
)
# We started another checkin right now
checkin2 = MonitorCheckIn.objects.create(
Expand All @@ -215,6 +216,7 @@ def test_timeout_with_no_future_complete_checkin(self):
status=CheckInStatus.IN_PROGRESS,
date_added=next_checkin_ts,
date_updated=next_checkin_ts,
timeout_at=next_checkin_ts + timedelta(minutes=30),
)

assert checkin1.date_added == checkin1.date_updated == check_in_24hr_ago
Expand Down Expand Up @@ -268,6 +270,7 @@ def test_timeout_with_future_complete_checkin(self):
status=CheckInStatus.IN_PROGRESS,
date_added=check_in_24hr_ago,
date_updated=check_in_24hr_ago,
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
)
checkin2 = MonitorCheckIn.objects.create(
monitor=monitor,
Expand All @@ -276,6 +279,7 @@ def test_timeout_with_future_complete_checkin(self):
status=CheckInStatus.OK,
date_added=next_checkin_ts,
date_updated=next_checkin_ts,
timeout_at=next_checkin_ts + timedelta(minutes=30),
)

assert checkin1.date_added == checkin1.date_updated == check_in_24hr_ago
Expand Down Expand Up @@ -321,6 +325,7 @@ def test_timeout_via_max_runtime_configuration(self):
status=CheckInStatus.IN_PROGRESS,
date_added=next_checkin_ts,
date_updated=next_checkin_ts,
timeout_at=next_checkin_ts + timedelta(minutes=60),
)

assert checkin.date_added == checkin.date_updated == next_checkin_ts
Expand Down Expand Up @@ -424,6 +429,7 @@ def test_timeout_exception_handling(self, logger):
status=CheckInStatus.IN_PROGRESS,
date_added=check_in_24hr_ago,
date_updated=check_in_24hr_ago,
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
)

# This monitor will be fine
Expand All @@ -448,6 +454,7 @@ def test_timeout_exception_handling(self, logger):
status=CheckInStatus.IN_PROGRESS,
date_added=check_in_24hr_ago,
date_updated=check_in_24hr_ago,
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
)
checkin2 = MonitorCheckIn.objects.create(
monitor=monitor,
Expand All @@ -456,6 +463,7 @@ def test_timeout_exception_handling(self, logger):
status=CheckInStatus.IN_PROGRESS,
date_added=next_checkin_ts,
date_updated=next_checkin_ts,
timeout_at=next_checkin_ts + timedelta(minutes=30),
)

assert checkin1.date_added == checkin1.date_updated == check_in_24hr_ago
Expand Down