|
| 1 | +import logging |
| 2 | +import time |
| 3 | +from enum import Enum |
| 4 | +from math import ceil |
| 5 | +from typing import Any, Literal, NotRequired, TypedDict, cast, overload |
| 6 | + |
| 7 | +from django.core.cache import cache |
| 8 | + |
| 9 | +from sentry.ratelimits.sliding_windows import ( |
| 10 | + GrantedQuota, |
| 11 | + Quota, |
| 12 | + RedisSlidingWindowRateLimiter, |
| 13 | + RequestedQuota, |
| 14 | +) |
| 15 | + |
| 16 | +logger = logging.getLogger(__name__) |
| 17 | + |
| 18 | + |
| 19 | +class CircuitBreakerState(Enum): |
| 20 | + CLOSED = "circuit_closed" |
| 21 | + BROKEN = "circuit_broken" |
| 22 | + RECOVERY = "recovery" |
| 23 | + |
| 24 | + |
| 25 | +class CircuitBreakerConfig(TypedDict): |
| 26 | + # The number of errors within the given time period necessary to trip the breaker |
| 27 | + error_limit: int |
| 28 | + # The time period, in seconds, over which we're tracking errors |
| 29 | + error_limit_window: int |
| 30 | + # The length, in seconds, of each time bucket ("granule") used by the underlying rate limiter - |
| 31 | + # effectively the resolution of the time window. Will be set automatically based on |
| 32 | + # `error_limit_window` if not provided. |
| 33 | + error_limit_window_granularity: NotRequired[int] |
| 34 | + # How long, in seconds, to stay in the broken state (blocking all requests) before entering the |
| 35 | + # recovery phase |
| 36 | + broken_state_duration: int |
| 37 | + # The number of errors within the given time period necessary to trip the breaker while in recovery |
| 38 | + recovery_error_limit: int |
| 39 | + # The time period, in seconds, over which we're tracking errors in recovery |
| 40 | + recovery_error_limit_window: int |
| 41 | + # The length, in seconds, of each time bucket ("granule") used by the underlying rate limiter - |
| 42 | + # effectively the resolution of the time window. Will be set automatically based on |
| 43 | + # `recovery_error_limit_window` if not provided. |
| 44 | + recovery_error_limit_window_granularity: NotRequired[int] |
| 45 | + # How long, in seconds, to stay in the recovery state (allowing requests but with a stricter |
| 46 | + # error limit) before returning to normal operation. |
| 47 | + recovery_duration: int |
| 48 | + |
| 49 | + |
| 50 | +# TODO: These limits were estimated based on EA traffic. (In an average 10 min period, there are |
| 51 | +# roughly 35K events without matching hashes. About 2% of orgs are EA, so for simplicity, assume 2% |
| 52 | +# of those events are from EA orgs. If we're willing to tolerate up to a 95% failure rate, then we |
| 53 | +# need 35K * 0.02 * 0.95 events to fail to trip the breaker. Technically that's 665, not 666, but |
| 54 | +# we're talking about everything going to hell, so the bump to 666 seemed appropriate!) |
| 55 | +# |
| 56 | +# When we GA, we should multiply both the limits by 50 (to remove the 2% part of the current |
| 57 | +# calculation). |
| 58 | +CIRCUIT_BREAKER_DEFAULT_CONFIG: CircuitBreakerConfig = { |
| 59 | + "error_limit": 666, |
| 60 | + "error_limit_window": 600, # 10 min |
| 61 | + "broken_state_duration": 300, # 5 min |
| 62 | + "recovery_error_limit": 3, # In recovery, we're twice as strict as normal limit |
| 63 | + "recovery_error_limit_window": 60, # And we bail much more quickly |
| 64 | + "recovery_duration": 300, # 5 min |
| 65 | +} |
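|  | + |
|  | +# A sketch of overriding only some defaults (the key name and values below are hypothetical); |
|  | +# `CircuitBreaker.__init__` also merges any passed config over these defaults, so spreading them |
|  | +# here just keeps the example a complete `CircuitBreakerConfig`: |
|  | +# |
|  | +#     config: CircuitBreakerConfig = { |
|  | +#         **CIRCUIT_BREAKER_DEFAULT_CONFIG, |
|  | +#         "error_limit": 100, |
|  | +#         "error_limit_window": 300,  # 5 min |
|  | +#     } |
|  | +#     breaker = CircuitBreaker("my_flaky_service", config) |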
| 66 | + |
| 67 | + |
| 68 | +class CircuitBreaker: |
| 69 | + """ |
| 70 | +    A circuit breaker for temporarily blocking requests to (or calls of) a service or function |
| 71 | +    that is throwing too many errors. |
| 72 | +
|
| 73 | + The breaker has three states: circuit CLOSED (normal operation), circuit BROKEN (all requests |
| 74 | + blocked), and RECOVERY (requests allowed under a stricter error limit). |
| 75 | +
|
| 76 | + In a CLOSED state (normal operation), the breaker tracks errors but allows through all |
| 77 | + requests. If the frequency of errors passes a given threshold, it moves to BROKEN state. |
| 78 | +
|
| 79 | + In a BROKEN state, all requests are blocked. Once a set amount of time has passed, it moves |
| 80 | + to RECOVERY state. |
| 81 | +
|
| 82 | + RECOVERY state is identical to CLOSED state, except that the threshold for the circuit |
| 83 | + breaking (moving back into BROKEN state) is much stricter. Once a set amount of time has |
| 84 | + passed without the breaker being tripped, it moves back to CLOSED state. |
| 85 | +
|
| 86 | + The overall idea is to stop hitting a service which seems to have broken, but periodically make |
| 87 | + short attempts to use it in order to be able to resume requests once it comes back up. |
| 88 | +
|
| 89 | + Usage: |
| 90 | +
|
| 91 | + # See `CircuitBreakerConfig` class for config options |
| 92 | + breaker = CircuitBreaker("squirrel_chasing", config) |
| 93 | +
|
| 94 | + def get_top_dogs(payload): |
| 95 | + try: |
| 96 | + if breaker.should_allow_request(): |
| 97 | + response = call_chase_simulation_service("/hall-of-fame", payload) |
| 98 | + else: |
| 99 | + logger.warning("Request blocked by circuit breaker!") |
| 100 | + return None |
| 101 | + except TimeoutError: |
| 102 | + breaker.record_error() |
| 103 | + return None |
| 104 | +
|
| 105 | + if response.status == 500: |
| 106 | + breaker.record_error() |
| 107 | + return None |
| 108 | +
|
| 109 | + return format_hof_entries(response) |
| 110 | +
|
| 111 | + The `breaker.should_allow_request()` check can alternatively be used outside of |
| 112 | + `get_top_dogs`, to prevent calls to it. In that case, the original `breaker` object can be |
| 113 | + imported alongside `get_top_dogs` or reinstantiated with the same config - it has no state of |
| 114 | + its own, instead relying on redis and the cache to track error count and breaker status. |
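|  | + |
|  | +    A sketch of that call-site pattern (`handle_request` is a hypothetical caller; the rest |
|  | +    matches the example above): |
|  | + |
|  | +        breaker = CircuitBreaker("squirrel_chasing", config) |
|  | + |
|  | +        def handle_request(payload): |
|  | +            if not breaker.should_allow_request(): |
|  | +                logger.warning("Request blocked by circuit breaker!") |
|  | +                return None |
|  | +            return get_top_dogs(payload) |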
| 115 | + """ |
| 116 | + |
| 117 | + def __init__(self, key: str, config: CircuitBreakerConfig | None = None): |
| 118 | + self.key = key |
| 119 | + self.broken_state_key = f"{key}.circuit_breaker.broken" |
| 120 | + self.recovery_state_key = f"{key}.circuit_breaker.in_recovery" |
| 121 | + |
| 122 | + final_config: CircuitBreakerConfig = { |
| 123 | + **CIRCUIT_BREAKER_DEFAULT_CONFIG, |
| 124 | + **(config or cast(Any, {})), |
| 125 | + } |
| 126 | + default_window_granularity = self._get_default_window_granularity( |
| 127 | + final_config["error_limit_window"] |
| 128 | + ) |
| 129 | + default_recovery_window_granularity = self._get_default_window_granularity( |
| 130 | + final_config["recovery_error_limit_window"] |
| 131 | + ) |
| 132 | + |
| 133 | + self.limiter = RedisSlidingWindowRateLimiter() |
| 134 | + self.primary_quota = Quota( |
| 135 | + final_config["error_limit_window"], |
| 136 | + final_config.get("error_limit_window_granularity", default_window_granularity), |
| 137 | + final_config["error_limit"], |
| 138 | + f"{key}.circuit_breaker", |
| 139 | + ) |
| 140 | + self.recovery_quota = Quota( |
| 141 | + final_config["recovery_error_limit_window"], |
| 142 | + final_config.get( |
| 143 | + "recovery_error_limit_window_granularity", default_recovery_window_granularity |
| 144 | + ), |
| 145 | + final_config["recovery_error_limit"], |
| 146 | + f"{key}.circuit_breaker_recovery", |
| 147 | + ) |
| 148 | + |
| 149 | + self.broken_state_duration = final_config["broken_state_duration"] |
| 150 | + self.recovery_duration = final_config["recovery_duration"] |
| 151 | + |
| 152 | + if self.recovery_duration < final_config["recovery_error_limit_window"]: |
| 153 | + logger.warning( |
| 154 | + "Circuit breaker %s has `recovery_duration` < `recovery_error_limit_window`." |
| 155 | + + " Recovery duration has been reset to match the window.", |
| 156 | + key, |
| 157 | + ) |
| 158 | + self.recovery_duration = final_config["recovery_error_limit_window"] |
| 166 | + |
| 167 | + def record_error(self) -> None: |
| 168 | + state, seconds_left_in_state = self._get_state_and_remaining_time() |
| 169 | + |
| 170 | + if state == CircuitBreakerState.BROKEN: |
| 171 | + # If the circuit is broken, and `should_allow_request` is being used correctly, requests |
| 172 | + # should be blocked and we shouldn't even be here. That said, maybe there was a race |
| 173 | + # condition, so make sure the circuit hasn't just broken before crying foul. |
| 174 | + seconds_elapsed_in_state = self.broken_state_duration - seconds_left_in_state |
| 175 | + if seconds_elapsed_in_state > 5: |
| 176 | + logger.warning( |
| 177 | + "Attempt to record circuit breaker error while circuit is broken", |
| 178 | + extra={"key": self.key, "time_in_state": seconds_elapsed_in_state}, |
| 179 | + ) |
| 180 | + # We shouldn't have made the request, so don't record the error |
| 181 | + return |
| 182 | + |
| 183 | + # We track errors with the primary quota even during recovery (when we're not checking it), |
| 184 | + # because they still happened, and eventually switching back to the closed state doesn't |
| 185 | + # make that untrue |
| 186 | + quotas = ( |
| 187 | + [self.primary_quota, self.recovery_quota] |
| 188 | + if state == CircuitBreakerState.RECOVERY |
| 189 | + else [self.primary_quota] |
| 190 | + ) |
| 191 | + self.limiter.use_quotas( |
| 192 | + [RequestedQuota(self.key, 1, quotas)], [GrantedQuota(self.key, 1, [])], int(time.time()) |
| 193 | + ) |
| 194 | + |
| 195 | + # If incrementing has made us hit the current limit, switch to the broken state |
| 196 | + controlling_quota = self._get_controlling_quota(state) |
| 197 | + remaining_errors_allowed = self.get_remaining_error_quota(controlling_quota) |
| 198 | + if remaining_errors_allowed == 0: |
| 199 | + logger.warning( |
| 200 | + "Circuit breaker '%s' error limit hit", |
| 201 | + self.key, |
| 202 | + extra={ |
| 203 | + "current_state": state, |
| 204 | + "error_limit": controlling_quota.limit, |
| 205 | + "error_limit_window": controlling_quota.window_seconds, |
| 206 | + }, |
| 207 | + ) |
| 208 | + |
| 209 | + # Recovery will only start after the broken state has expired, so push out the recovery |
| 210 | + # expiry time. We'll store the expiries as our cache values so we can determine how long |
| 211 | + # we've been in a given state. |
| 212 | + now = int(time.time()) |
| 213 | + broken_state_timeout = self.broken_state_duration |
| 214 | + recovery_state_timeout = self.broken_state_duration + self.recovery_duration |
| 215 | + broken_state_expiry = now + broken_state_timeout |
| 216 | + recovery_state_expiry = now + recovery_state_timeout |
| 217 | + |
| 218 | + # Set cache keys for switching state. While they're both set (starting now) we'll be in |
| 219 | + # the broken state. Once the broken state key expires we'll switch to recovery, and then |
| 220 | + # once the recovery key expires we'll be back to normal. |
| 221 | + cache.set(self.broken_state_key, broken_state_expiry, broken_state_timeout) |
| 222 | + cache.set(self.recovery_state_key, recovery_state_expiry, recovery_state_timeout) |
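|  | +            # As a worked example with the default config (broken_state_duration=300, |
|  | +            # recovery_duration=300): the broken-state key expires at now + 300 and the |
|  | +            # recovery key at now + 600, so requests are blocked for the first 5 minutes and |
|  | +            # the stricter recovery limit applies for the next 5. |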
| 223 | + |
| 224 | + def should_allow_request(self) -> bool: |
| 225 | + state, _ = self._get_state_and_remaining_time() |
| 226 | + |
| 227 | + if state == CircuitBreakerState.BROKEN: |
| 228 | + return False |
| 229 | + |
| 230 | + controlling_quota = self._get_controlling_quota(state) |
| 231 | + |
| 232 | + return self.get_remaining_error_quota(controlling_quota) > 0 |
| 233 | + |
| 234 | + @overload |
| 235 | + def get_remaining_error_quota(self, quota: None, window_end: int | None) -> None: |
| 236 | + ... |
| 237 | + |
| 238 | + @overload |
| 239 | + def get_remaining_error_quota(self, quota: Quota, window_end: int | None) -> int: |
| 240 | + ... |
| 241 | + |
| 242 | + @overload |
| 243 | + def get_remaining_error_quota(self, quota: None) -> None: |
| 244 | + ... |
| 245 | + |
| 246 | + @overload |
| 247 | + def get_remaining_error_quota(self, quota: Quota) -> int: |
| 248 | + ... |
| 249 | + |
| 250 | + @overload |
| 251 | + def get_remaining_error_quota(self) -> int | None: |
| 252 | + ... |
| 253 | + |
| 254 | + def get_remaining_error_quota( |
| 255 | + self, quota: Quota | None = None, window_end: int | None = None |
| 256 | + ) -> int | None: |
| 257 | + """ |
| 258 | + # TODO: write me |
| 259 | + returns None when in broken state |
| 260 | + """ |
| 261 | + if not quota: |
| 262 | + quota = self._get_controlling_quota() |
| 263 | + if quota is None: # broken state |
| 264 | + return None |
| 265 | + |
| 266 | + now = int(time.time()) |
| 267 | + window_end = window_end or now |
| 268 | + |
| 269 | + _, result = self.limiter.check_within_quotas( |
| 270 | + [RequestedQuota(self.key, quota.limit, [quota])], |
| 271 | + window_end, |
| 272 | + ) |
| 273 | + |
| 274 | + return result[0].granted |
| 275 | + |
| 276 | + def _get_default_window_granularity(self, window_duration: int) -> int: |
| 277 | + # Never more than 10 buckets, and no bucket smaller than 5 seconds. If greater precision is |
| 278 | + # needed, the `error_limit_window_granularity` and `recovery_error_limit_window_granularity` |
| 279 | + # config options can be used. |
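|  | +        # For example, a 600-second window gets 60-second buckets (10 buckets), while a |
|  | +        # 30-second window gets 5-second buckets (6 buckets, since buckets never shrink below |
|  | +        # 5 seconds). |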
| 280 | + return max(ceil(window_duration / 10), 5) |
| 281 | + |
| 282 | + @overload |
| 283 | + def _get_controlling_quota( |
| 284 | + self, state: Literal[CircuitBreakerState.CLOSED, CircuitBreakerState.RECOVERY] |
| 285 | + ) -> Quota: |
| 286 | + ... |
| 287 | + |
| 288 | + @overload |
| 289 | + def _get_controlling_quota(self, state: Literal[CircuitBreakerState.BROKEN]) -> None: |
| 290 | + ... |
| 291 | + |
| 292 | + @overload |
| 293 | + def _get_controlling_quota(self) -> Quota | None: |
| 294 | + ... |
| 295 | + |
| 296 | + def _get_controlling_quota(self, state: CircuitBreakerState | None = None) -> Quota | None: |
| 297 | + """ |
| 298 | + # TODO: write me |
| 299 | + returns None when in broken state |
| 300 | + """ |
| 301 | + controlling_quota_by_state = { |
| 302 | + CircuitBreakerState.CLOSED: self.primary_quota, |
| 303 | + CircuitBreakerState.BROKEN: None, |
| 304 | + CircuitBreakerState.RECOVERY: self.recovery_quota, |
| 305 | + } |
| 306 | + |
| 307 | + _state = state or self._get_state_and_remaining_time()[0] |
| 308 | + |
| 309 | + return controlling_quota_by_state[_state] |
| 310 | + |
| 311 | + def _get_state_and_remaining_time(self) -> tuple[CircuitBreakerState, int]: |
| 312 | + """ |
| 313 | + Return the current state of the breaker (closed, broken, or in recovery), along with the |
| 314 | + number of seconds until that state expires. |
| 315 | + """ |
| 316 | + now = int(time.time()) |
| 317 | + |
| 318 | + # The broken state key should always expire before the recovery state one, so check it first |
| 319 | + if cache.has_key(self.broken_state_key): |
| 320 | + return (CircuitBreakerState.BROKEN, cache.get(self.broken_state_key) - now) |
| 321 | + |
| 322 | + if cache.has_key(self.recovery_state_key): |
| 323 | + return (CircuitBreakerState.RECOVERY, cache.get(self.recovery_state_key) - now) |
| 324 | + |
| 325 | + # TODO Fix this with overloads? |
| 326 | + # 0 here is just a placeholder, as "remaining seconds" doesn't really apply to a state we |
| 327 | + # hope to stay in indefinitely |
| 328 | + return (CircuitBreakerState.CLOSED, 0) |