feat: implement graceful shutdown of GitLab Runner (#1117)

tmeijn · kayman-mk · web-flow · commit d2e22249bb5e · 2024-05-29T18:35:14.000+02:00
## Description Based on the discussion #1067: 1. Moves the EventBridge rule that triggers the Lambda from `TERMINATING` to `TERMINATE`. The Lambda now functions as an "after-the-fact" cleanup instead of being responsible for cleanup _during_ termination. 2. Introduces a shell script managed by Systemd, that monitors the target lifecycle of the instance and initiates GitLab Runner graceful shutdown. 3. Makes the heartbeat timeout of the ASG terminating hook configurable, with a default of the maximum job timeout + 5 minutes, capped at `7200` (2 hours). 4. Introduces a launching lifecycle hook, allowing the new instance to provision itself and GitLab Runner to provision its set capacity before terminating the current instance. ## Migrations required No, except that if the previous default behavior of immediately terminating all Workers + Manager is desired, the `runner_terminate_ec2_lifecycle_timeout_duration` variable should be set to 30 (the minimum allowed). ## Verification ### Graceful terminate 1. Deploy this version of the module. 2. Start a long running GitLab job. 3. Manually trigger an instance refresh in the runner ASG. 4. Verify the job keeps running and has output. 5. Verify from the instance logs that GitLab Runner service is still running. 6. Once remaining jobs have been completed, observe that GitLab Runner service is terminated and instance is put into `Terminating:Proceed` status ### Zero Downtime deployment 1. Deploy this version of the module. 2. Start multiple, long running GitLab jobs, twice the capacity of the GitLab Runner. 3. Manually trigger an instance refresh in the runner ASG. 4. Verify the jobs keep running and have output. 5. Verify from the instance logs that GitLab Runner service is still running. 6. Verify new instance gets spun up, while the current instance stays `InService`. 7. Verify new instance is able to provision its set capacity. 8. Verify new instance starts picking up GitLab jobs from the queue before current instance gets terminated. 9. 
Observe that there is zero downtime. 10. Once remaining jobs have been completed, observe that GitLab Runner service is terminated and current instance is put into `Terminating:Proceed` status Closes #1029 --------- Co-authored-by: Matthias Kay <matthias.kay@hlag.com> Co-authored-by: Matthias Kay <github@matthiaskay.de>
diff --git a/.cspell.json b/.cspell.json
@@ -6,6 +6,7 @@
     "amazonec",
     "anytrue",
     "amannn",
+    "autonumber",
     "awscli",
     "boto",
     "botocore",
@@ -53,6 +54,7 @@
     "tftpl",
     "tfvars",
     "tmpfs",
+    "tonumber",
     "trivy",
     "userns",
     "xanzy",
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,11 +1,10 @@
 repos:
   - repo: https://github.com/antonbabenko/pre-commit-terraform
-    rev: v1.64.1
+    rev: v1.89.0
     hooks:
       - id: terraform_fmt
         args:
           - --args=-recursive
-      - id: terraform_tflint
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.2.0
     hooks:
diff --git a/.pylintrc b/.pylintrc
@@ -1,5 +1,5 @@
 [MASTER]
-init-hook="import sys; sys.path.insert(0, '/usr/local/lib/python3.11/site-packages/')"
+init-hook="import sys; sys.path.insert(0, '/usr/local/lib/python3.12/site-packages/')"
 
 [FORMAT]
 max-line-length=132
diff --git a/docs/usage.md b/docs/usage.md
@@ -54,14 +54,14 @@ module "runner" {
 
   vpc_id    = module.vpc.vpc_id
   subnet_id = element(module.vpc.private_subnets, 0)
-   
+
   runner_instance = {
-    name       = "docker-default"      
+    name       = "docker-default"
   }
-   
+
   runner_gitlab = {
     url = "https://gitlab.com"
-     
+
     preregistered_runner_token_ssm_parameter_name = "my-gitlab-runner-token-ssm-parameter-name"
   }
 }
@@ -77,23 +77,23 @@ map. A simple example for this would be to set _region-specific-prefix_ to the A
 module "runner" {
   # https://registry.terraform.io/modules/cattle-ops/gitlab-runner/aws/
   source  = "cattle-ops/gitlab-runner/aws"
-   
+
   environment = "multi-region-1"
   iam_object_prefix = "<region-specific-prefix>-gitlab-runner-iam"
-   
+
   vpc_id    = module.vpc.vpc_id
   subnet_id = element(module.vpc.private_subnets, 0)
-   
+
   runner_gitlab = {
     url = "https://gitlab.com"
 
     preregistered_runner_token_ssm_parameter_name = "my-gitlab-runner-token-ssm-parameter-name"
   }
-   
+
    runner_worker_cache = {
      bucket_prefix = "<region-specific-prefix>"
    }
-   
+
    runner_worker_docker_machine_instance = {
      subnet_ids = module.vpc.private_subnets
    }
@@ -208,14 +208,39 @@ module "runner" {
 }
 ```
 
-#### Instance Termination
-
-The Auto Scaling Group may be configured with a [lifecycle hook](https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html)
-that executes a provided Lambda function when the runner is terminated to terminate additional instances that were spawned.
-
-The use of the termination lifecycle can be toggled using the `runner_enable_asg_recreation` variable.
+#### Graceful termination / Zero Downtime deployment
+
+This module supports zero-downtime deployments by following a structured process:
+
+- The new instance is first set to the `pending` state, allowing it to provision both GitLab Runner and its configured capacity.
+This process is allocated a maximum of five minutes.
+- Once provisioning is complete, a signal is sent to the current instance, setting it to the `terminating:wait` state.
+- This signal triggers the monitor_runner.sh systemd service, which sends a SIGQUIT signal to the GitLab Runner process,
+initiating a graceful shutdown.
+- The maximum allowed time for the shutdown process is defined by the `runner_terminate_ec2_lifecycle_timeout_duration` variable.
+
+The diagram below illustrates this process.
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant ASG as Autoscaling Group
+    participant CI as Current Instance
+    participant NI as New Instance
+    ASG->>NI: Provision New Instance (status: Pending)
+    Note over NI: Install GitLab Runner <br/>and provision capacity<br/>(5m grace period)
+    ASG->>NI: Set status to InService
+    ASG->>CI: Set status to Terminating:Wait
+    CI->>CI: Graceful terminate:<br/>Stop picking up new jobs,<br/>Finish current jobs<br/>assigned to this Runner
+    CI->>ASG: Send complete-lifecycle-action
+    ASG->>CI: Set status to Terminating:Proceed
+    Note over CI: Instance is terminated:<br/>Cleanup Lambda is triggered
+```
 
-When using this feature, a `builds/` directory relative to the root module will persist that contains the packaged Lambda function.
+The Auto Scaling Group is configured with a [lifecycle hook](https://docs.aws.amazon.com/autoscaling/ec2/userguide/lifecycle-hooks.html)
+that executes a provided Lambda function when the runner is terminated to terminate additional instances that were
+provisioned by the Docker Machine executor. A `builds/` directory, relative to the root module, persists and
+contains the packaged Lambda function.
 
 ### Access the Runner instance
 
diff --git a/locals.tf b/locals.tf
@@ -91,6 +91,10 @@ locals {
   ]
 
   docker_machine_adds_name_tag = signum(sum(local.docker_machine_version_test)) <= 0
+
+  runner_worker_graceful_terminate_timeout_duration = (var.runner_terminate_ec2_lifecycle_timeout_duration == null
+    ? min(7200, tonumber(coalesce(var.runner_gitlab_registration_config.maximum_timeout, 0)) + 300)
+  : var.runner_terminate_ec2_lifecycle_timeout_duration)
 }
 
 resource "local_file" "config_toml" {
diff --git a/main.tf b/main.tf
@@ -80,6 +80,7 @@ locals {
       use_fleet                                                    = var.runner_worker_docker_machine_fleet.enable
       private_key                                                  = var.runner_worker_docker_machine_fleet.enable == true ? tls_private_key.fleet[0].private_key_pem : ""
       use_new_runner_authentication_gitlab_16                      = var.runner_gitlab_registration_config.type != ""
+      user_data_trace_log                                          = var.debug.trace_runner_user_data
   })
 
   template_runner_config = templatefile("${path.module}/template/runner-config.tftpl",
@@ -174,10 +175,15 @@ resource "aws_autoscaling_group" "gitlab_runner_instance" {
     version = aws_launch_template.gitlab_runner_instance.latest_version
   }
 
+  instance_maintenance_policy {
+    max_healthy_percentage = 110
+    min_healthy_percentage = 100
+  }
+
   instance_refresh {
     strategy = "Rolling"
     preferences {
-      min_healthy_percentage = 0
+      min_healthy_percentage = 100
     }
     triggers = ["tag"]
   }
@@ -656,21 +662,31 @@ resource "aws_iam_role_policy_attachment" "eip" {
   policy_arn = aws_iam_policy.eip[0].arn
 }
 
+# We wait for 5 minutes until we set an EC2 instance to status `InService` so it has time to provision itself and its configured capacity.
+resource "aws_autoscaling_lifecycle_hook" "wait_for_gitlab_runner" {
+  name                   = "${var.environment}-wait-for-gitlab-runner-up"
+  autoscaling_group_name = aws_autoscaling_group.gitlab_runner_instance.name
+  default_result         = "CONTINUE"
+  heartbeat_timeout      = 300
+  lifecycle_transition   = "autoscaling:EC2_INSTANCE_LAUNCHING"
+}
+
 ################################################################################
 ### Lambda function triggered as soon as an agent is terminated.
 ################################################################################
 module "terminate_agent_hook" {
   source = "./modules/terminate-agent-hook"
 
-  name                                 = var.runner_terminate_ec2_lifecycle_hook_name == null ? "terminate-instances" : var.runner_terminate_ec2_lifecycle_hook_name
-  environment                          = var.environment
-  asg_arn                              = aws_autoscaling_group.gitlab_runner_instance.arn
-  asg_name                             = aws_autoscaling_group.gitlab_runner_instance.name
-  cloudwatch_logging_retention_in_days = var.runner_cloudwatch.retention_days
-  name_iam_objects                     = local.name_iam_objects
-  name_docker_machine_runners          = local.runner_tags_merged["Name"]
-  role_permissions_boundary            = var.iam_permissions_boundary == "" ? null : "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/${var.iam_permissions_boundary}"
-  kms_key_id                           = local.kms_key
+  name                                   = var.runner_terminate_ec2_lifecycle_hook_name == null ? "terminate-instances" : var.runner_terminate_ec2_lifecycle_hook_name
+  environment                            = var.environment
+  asg_arn                                = aws_autoscaling_group.gitlab_runner_instance.arn
+  asg_name                               = aws_autoscaling_group.gitlab_runner_instance.name
+  cloudwatch_logging_retention_in_days   = var.runner_cloudwatch.retention_days
+  name_iam_objects                       = local.name_iam_objects
+  name_docker_machine_runners            = local.runner_tags_merged["Name"]
+  role_permissions_boundary              = var.iam_permissions_boundary == "" ? null : "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/${var.iam_permissions_boundary}"
+  kms_key_id                             = local.kms_key
+  asg_hook_terminating_heartbeat_timeout = local.runner_worker_graceful_terminate_timeout_duration
 
   tags = local.tags
 }
diff --git a/modules/terminate-agent-hook/README.md b/modules/terminate-agent-hook/README.md
@@ -162,6 +162,7 @@ No modules.
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
 | <a name="input_asg_arn"></a> [asg\_arn](#input\_asg\_arn) | The ARN of the Auto Scaling Group to attach to. | `string` | n/a | yes |
+| <a name="input_asg_hook_terminating_heartbeat_timeout"></a> [asg\_hook\_terminating\_heartbeat\_timeout](#input\_asg\_hook\_terminating\_heartbeat\_timeout) | Duration in seconds the ASG should stay in the Terminating:Wait state. | `number` | `30` | no |
 | <a name="input_asg_name"></a> [asg\_name](#input\_asg\_name) | The name of the Auto Scaling Group to attach to. The 'environment' will be prefixed to this. | `string` | n/a | yes |
 | <a name="input_cloudwatch_logging_retention_in_days"></a> [cloudwatch\_logging\_retention\_in\_days](#input\_cloudwatch\_logging\_retention\_in\_days) | The number of days to retain logs in CloudWatch. | `number` | `30` | no |
 | <a name="input_enable_xray_tracing"></a> [enable\_xray\_tracing](#input\_enable\_xray\_tracing) | Enables X-Ray for debugging and analysis | `bool` | `false` | no |
diff --git a/modules/terminate-agent-hook/cloudwatch.tf b/modules/terminate-agent-hook/cloudwatch.tf
@@ -12,7 +12,7 @@ resource "aws_cloudwatch_event_rule" "terminate_instances" {
   event_pattern = <<EOF
 {
   "source": ["aws.autoscaling"],
-  "detail-type": ["EC2 Instance-terminate Lifecycle Action"],
+  "detail-type": ["EC2 Instance Terminate Successful", "EC2 Instance Terminate Unsuccessful"],
   "detail": {
     "AutoScalingGroupName": ["${var.asg_name}"]
   }
diff --git a/modules/terminate-agent-hook/lambda/lambda_function.py b/modules/terminate-agent-hook/lambda/lambda_function.py
@@ -234,9 +234,6 @@ def handler(event, context):
     """
     event_detail = event['detail']
 
-    if event_detail['LifecycleTransition'] != "autoscaling:EC2_INSTANCE_TERMINATING":
-        sys.exit()
-
     client = boto3.client("ec2", region_name=event['region'])
 
     # make sure that no new instances are created
diff --git a/modules/terminate-agent-hook/locals.tf b/modules/terminate-agent-hook/locals.tf
diff --git a/modules/terminate-agent-hook/main.tf b/modules/terminate-agent-hook/main.tf
@@ -36,7 +36,7 @@ resource "aws_lambda_function" "terminate_runner_instances" {
   publish          = true
   role             = aws_iam_role.lambda.arn
   runtime          = "python3.11"
-  timeout          = local.lambda_timeout
+  timeout          = 30
   kms_key_arn      = var.kms_key_id
 
   tags = var.tags
@@ -77,6 +77,6 @@ resource "aws_autoscaling_lifecycle_hook" "terminate_instances" {
   name                   = "${var.environment}-${var.name}"
   autoscaling_group_name = var.asg_name
   default_result         = "CONTINUE"
-  heartbeat_timeout      = local.lambda_timeout + 20 # allow some extra time for cold starts
+  heartbeat_timeout      = var.asg_hook_terminating_heartbeat_timeout
   lifecycle_transition   = "autoscaling:EC2_INSTANCE_TERMINATING"
 }
diff --git a/modules/terminate-agent-hook/variables.tf b/modules/terminate-agent-hook/variables.tf
@@ -60,3 +60,14 @@ variable "enable_xray_tracing" {
   type        = bool
   default     = false
 }
+
+variable "asg_hook_terminating_heartbeat_timeout" {
+  description = "Duration in seconds the ASG should stay in the Terminating:Wait state."
+  type        = number
+  default     = 30
+
+  validation {
+    condition     = var.asg_hook_terminating_heartbeat_timeout >= 30 && var.asg_hook_terminating_heartbeat_timeout <= 7200
+    error_message = "AWS only supports heartbeat timeout in the range of 30 to 7200."
+  }
+}
diff --git a/policies/instance-docker-machine-policy.json b/policies/instance-docker-machine-policy.json
@@ -18,7 +18,9 @@
           "ec2:CancelSpotInstanceRequests",
           "ec2:DescribeSubnets",
           "ec2:AssociateIamInstanceProfile",
-          "ec2:CreateFleet"
+          "ec2:CreateFleet",
+          "autoscaling:CompleteLifecycleAction",
+          "autoscaling:DescribeLifecycleHooks"
         ],
         "Effect": "Allow",
         "Resource": "*"
diff --git a/template/gitlab-runner.tftpl b/template/gitlab-runner.tftpl
diff --git a/variables.tf b/variables.tf
diff --git a/versions.tf b/versions.tf

Original file line number	Diff line number	Diff line change
`@@ -91,6 +91,10 @@ locals {`
`91`	`91`	`]`
`92`	`92`
`93`	`93`	`docker_machine_adds_name_tag = signum(sum(local.docker_machine_version_test)) <= 0`
	`94`	`+`
	`95`	`+ runner_worker_graceful_terminate_timeout_duration = (var.runner_terminate_ec2_lifecycle_timeout_duration == null`
	`96`	`+ ? min(7200, tonumber(coalesce(var.runner_gitlab_registration_config.maximum_timeout, 0)) + 300)`
	`97`	`+ : var.runner_terminate_ec2_lifecycle_timeout_duration)`
`94`	`98`	`}`
`95`	`99`
`96`	`100`	`resource "local_file" "config_toml" {`
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ resource "aws_cloudwatch_event_rule" "terminate_instances" {`
`12`	`12`	`event_pattern = <<EOF`
`13`	`13`	`{`
`14`	`14`	`"source": ["aws.autoscaling"],`
`15`		`- "detail-type": ["EC2 Instance-terminate Lifecycle Action"],`
	`15`	`+ "detail-type": ["EC2 Instance Terminate Successful", "EC2 Instance Terminate Unsuccessful"],`
`16`	`16`	`"detail": {`
`17`	`17`	`"AutoScalingGroupName": ["${var.asg_name}"]`
`18`	`18`	`}`