diff --git a/README.md b/README.md index 9428afaf..854609f3 100644 --- a/README.md +++ b/README.md @@ -336,6 +336,8 @@ If processes have no chance of cleaning up before exiting (e.g. if someone pulls In a similar way, if a worker is terminated in any other way not initiated by the above signals (e.g. a worker is sent a `KILL` signal), jobs in progress will be marked as failed so that they can be inspected, with a `SolidQueue::Processes::Process::ProcessExitError`. Sometimes a job in particular is responsible for this, for example, if it has a memory leak and you have a mechanism to kill processes over a certain memory threshold, so this will help identifying this kind of situation. +In the unlikely event that the supervisor fails (e.g. the database goes offline), Solid Queue can attempt to recover itself. It uses an exponential backoff delay that maxes out at 60 seconds, and the user can set the number of restart attempts that should be tried. See `max_restart_attempts` below. + ### Database configuration @@ -362,6 +364,7 @@ There are several settings that control how Solid Queue works that you can set a - `process_heartbeat_interval`: the heartbeat interval that all processes will follow—defaults to 60 seconds. - `process_alive_threshold`: how long to wait until a process is considered dead after its last heartbeat—defaults to 5 minutes. - `shutdown_timeout`: time the supervisor will wait since it sent the `TERM` signal to its supervised processes before sending a `QUIT` version to them requesting immediate termination—defaults to 5 seconds. +- `max_restart_attempts`: the number of restart attempts Solid Queue should make if the supervisor fails. Set to any number, or `nil` if you want Solid Queue to keep trying forever. The default is 0, which means Solid Queue won't try to recover. - `silence_polling`: whether to silence Active Record logs emitted when polling for both workers and dispatchers—defaults to `true`. 
- `supervisor_pidfile`: path to a pidfile that the supervisor will create when booting to prevent running more than one supervisor in the same host, or in case you want to use it for a health check. It's `nil` by default. - `preserve_finished_jobs`: whether to keep finished jobs in the `solid_queue_jobs` table—defaults to `true`.
diff --git a/lib/solid_queue.rb b/lib/solid_queue.rb
index 02b88d05..c10262bb 100644
--- a/lib/solid_queue.rb
+++ b/lib/solid_queue.rb
@@ -36,6 +36,7 @@ module SolidQueue
 
   mattr_accessor :supervisor_pidfile
   mattr_accessor :supervisor, default: false
+  mattr_accessor :max_restart_attempts, default: 0
 
   mattr_accessor :preserve_finished_jobs, default: true
   mattr_accessor :clear_finished_jobs_after, default: 1.day
diff --git a/lib/solid_queue/log_subscriber.rb b/lib/solid_queue/log_subscriber.rb
index 96fb19bf..78b4bff5 100644
--- a/lib/solid_queue/log_subscriber.rb
+++ b/lib/solid_queue/log_subscriber.rb
@@ -144,6 +144,16 @@ def unhandled_signal_error(event)
     error formatted_event(event, action: "Received unhandled signal", **event.payload.slice(:signal))
   end
 
+  # Reports a restart attempt. Logged at warn level because the supervisor terminated unexpectedly.
+  def supervisor_restart(event)
+    warn formatted_event(event, action: "Supervisor terminated unexpectedly: attempting restart in #{event.payload[:delay]}s", **event.payload.slice(:attempt))
+  end
+
+  # Reports the error that is making the supervisor exit once no restart is attempted.
+  def supervisor_restart_failure(event)
+    error formatted_event(event, action: "Supervisor restart attempts failed - exiting", error: formatted_error(event.payload[:error]))
+  end
+
   def replace_fork(event)
     supervisor_pid = event.payload[:supervisor_pid]
     status = event.payload[:status]
diff --git a/lib/solid_queue/supervisor.rb b/lib/solid_queue/supervisor.rb
index f2207691..3679e530 100644
--- a/lib/solid_queue/supervisor.rb
+++ b/lib/solid_queue/supervisor.rb
@@ -13,7 +13,7 @@ def start(**options)
         configuration = Configuration.new(**options)
 
         if configuration.valid?
-          new(configuration).tap(&:start)
+          SolidQueue::Supervisor::Launcher.new(configuration).tap(&:start)
         else
           abort configuration.errors.full_messages.join("\n") + "\nExiting..."
         end
diff --git a/lib/solid_queue/supervisor/launcher.rb b/lib/solid_queue/supervisor/launcher.rb
new file mode 100644
index 00000000..dfe21179
--- /dev/null
+++ b/lib/solid_queue/supervisor/launcher.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+module SolidQueue
+  # Starts a Supervisor and, if it raises a StandardError, starts a new one after
+  # an exponentially growing delay. SolidQueue.max_restart_attempts limits the
+  # number of restarts; nil means there is no limit.
+  class Supervisor::Launcher
+    # Upper bound, in seconds, for the delay between restart attempts
+    MAX_RESTART_DELAY = 60
+
+    def initialize(configuration)
+      @configuration = configuration
+      @current_restart_attempt = 0
+    end
+
+    # Returns the started Supervisor. Once no restart attempts are left, the
+    # error raised by the last attempt is re-raised.
+    def start
+      SolidQueue.on_start { @current_restart_attempt = 0 } # reset after successful start
+
+      begin
+        SolidQueue::Supervisor.new(@configuration).tap(&:start)
+      rescue StandardError => error
+        if should_attempt_restart?
+          @current_restart_attempt += 1
+          delay = restart_delay
+
+          SolidQueue.instrument(:supervisor_restart, delay: delay, attempt: @current_restart_attempt)
+          sleep delay
+          retry
+        else
+          SolidQueue.instrument(:supervisor_restart_failure, error: error)
+          raise
+        end
+      end
+    end
+
+    private
+
+    def should_attempt_restart?
+      SolidQueue.max_restart_attempts.nil? || @current_restart_attempt < SolidQueue.max_restart_attempts
+    end
+
+    # 2, 4, 8... seconds, capped at MAX_RESTART_DELAY. The exponent is capped as
+    # well (2 ** n.bit_length > n for n >= 0), so the power stays small no
+    # matter how many attempts have been made when restarts are unlimited.
+    def restart_delay
+      exponent = [ @current_restart_attempt, MAX_RESTART_DELAY.bit_length ].min
+      [ 2 ** exponent, MAX_RESTART_DELAY ].min
+    end
+  end
+end
diff --git a/test/unit/supervisor_test.rb b/test/unit/supervisor_test.rb
index c430544a..19489564 100644
--- a/test/unit/supervisor_test.rb
+++ b/test/unit/supervisor_test.rb
@@ -155,6 +155,26 @@ class SupervisorTest < ActiveSupport::TestCase
     end
   end
 
+  test "attempt to restart supervisor if it fails unexpectedly" do
+    SolidQueue.stubs(:max_restart_attempts).returns(2)
+    SolidQueue::Supervisor.any_instance.expects(:start).raises(StandardError).times(SolidQueue.max_restart_attempts + 1)
+    # Replace the backoff sleeps (2s, then 4s) so the test checks the delays without waiting for them
+    SolidQueue::Supervisor::Launcher.any_instance.expects(:sleep).with(2).once
+    SolidQueue::Supervisor::Launcher.any_instance.expects(:sleep).with(4).once
+    assert_raises StandardError do
+      SolidQueue::Supervisor.start
+    end
+  end
+
+  test "skip restart attempt if configured not to" do
+    SolidQueue.stubs(:max_restart_attempts).returns(0)
+    SolidQueue::Supervisor.any_instance.expects(:start).raises(StandardError).times(1)
+    SolidQueue::Supervisor::Launcher.any_instance.expects(:sleep).never
+    assert_raises StandardError do
+      SolidQueue::Supervisor.start
+    end
+  end
+
   private
     def assert_registered_workers(supervisor_pid: nil, count: 1)
       assert_registered_processes(kind: "Worker", count: count, supervisor_pid: supervisor_pid)