From a4e3f35efb4f4181cc7b563e8813ae0b6250a39c Mon Sep 17 00:00:00 2001 From: Cody Herriges <193064+ody@users.noreply.github.com> Date: Fri, 1 Apr 2022 21:46:51 +0000 Subject: [PATCH] (SOLARCH-434) Procedure for recovering PSQL Provides documentation for the automated recovery of a failed PE-PostgreSQL node in a XL with replica architecture --- documentation/automated_recovery.md | 37 +++++++++++++++++++++++++++++ plans/util/update_classification.pp | 6 +++-- 2 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 documentation/automated_recovery.md diff --git a/documentation/automated_recovery.md b/documentation/automated_recovery.md new file mode 100644 index 00000000..64b3218e --- /dev/null +++ b/documentation/automated_recovery.md @@ -0,0 +1,37 @@ +# Recovery procedures + +These instructions provide automated procedures for recovering from select failures of PE components which are managed by PEADM. + +Additional manual procedures are documented in [recovery.md](recovery.md) + +## Replace failed PE-PostgreSQL server (A or B side) + +The procedure for replacing a failed PE-PostgreSQL server is the same regardless of which PE-PostgreSQL server is missing or if the name of the PE-PostgreSQL server is the same or different. This procedure uses the following placeholder references. + +* _\<replacement-postgres-server-fqdn\>_ - The FQDN and certname of the new server being brought in to replace the failed PE-PostgreSQL server +* _\<working-postgres-server-fqdn\>_ - The FQDN and certname of the still-working PE-PostgreSQL server +* _\<failed-postgres-server-fqdn\>_ - The FQDN and certname of the failed PE-PostgreSQL server +* _\<primary-server-fqdn\>_ - The FQDN and certname of the primary Puppet server +* _\<replica-server-fqdn\>_ - The FQDN and certname of the replica Puppet server + +Procedure: + +1. Stop `puppet.service` on Puppet server primary and replica + + bolt task run service name=puppet.service action=stop --targets <primary-server-fqdn>,<replica-server-fqdn> + +2. 
Temporarily set both primary and replica server nodes so that they use the remaining healthy PE-PostgreSQL server + + bolt plan run peadm::util::update_db_setting --targets <primary-server-fqdn>,<replica-server-fqdn> primary_postgresql_host=<working-postgres-server-fqdn> + +3. Restart `pe-puppetdb.service` on Puppet server primary and replica + + bolt task run service name=pe-puppetdb.service action=restart --targets <primary-server-fqdn>,<replica-server-fqdn> + +4. Purge failed PE-PostgreSQL node from PuppetDB + + bolt command run "/opt/puppetlabs/bin/puppet node purge <failed-postgres-server-fqdn>" --targets <primary-server-fqdn> + +5. Run `peadm::add_database` plan to deploy replacement PE-PostgreSQL server + + bolt plan run peadm::add_database -t <replacement-postgres-server-fqdn> primary_host=<primary-server-fqdn> \ No newline at end of file diff --git a/plans/util/update_classification.pp b/plans/util/update_classification.pp index dff731f9..9dc65d02 100644 --- a/plans/util/update_classification.pp +++ b/plans/util/update_classification.pp @@ -42,7 +42,7 @@ $overridden_replica_postgresql_target = $replica_postgresql_target } - $new = merge($current, { + $filtered = { 'primary_host' => $primary_target.peadm::certname(), 'replica_host' => $replica_target.peadm::certname(), 'primary_postgresql_host' => $primary_postgresql_target.peadm::certname(), @@ -50,7 +50,9 @@ 'compiler_pool_address' => $compiler_pool_address, 'internal_compiler_a_pool_address' => $internal_compiler_a_pool_address, 'internal_compiler_b_pool_address' => $internal_compiler_b_pool_address - }) + }.filter |$parameter| { $parameter[1] } + + $new = merge($current, $filtered) out::message('Classification to be updated using the following hash...') out::message($new)