diff --git a/.cspell.json b/.cspell.json index 50ed9faa1..95c5a43b8 100644 --- a/.cspell.json +++ b/.cspell.json @@ -70,6 +70,7 @@ "gitter", "Niek", "oxsecurity", + "redrive", "signoff", "typecheck", "userdata", diff --git a/.pylintrc b/.pylintrc index 0ae464248..eaacba061 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,5 +1,5 @@ [MASTER] -init-hook="import sys; sys.path.insert(0, '/usr/local/lib/python3.11/site-packages/')" +init-hook="import sys; sys.path.insert(0, '/usr/local/lib/python3.12/site-packages/')" [FORMAT] max-line-length=132 diff --git a/README.md b/README.md index 849573a11..224257962 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,9 @@ The runner supports 3 main scenarios: ![runners-docker](https://github.com/cattle-ops/terraform-aws-gitlab-runner/raw/main/assets/images/runner-docker.png) -For detailed concepts and usage please refer to [usage](docs/usage.md). +For detailed information on usage please refer to [usage](docs/usage.md). + +Key concepts for module developers are explained in [concepts](docs/concepts.md). ## Contributors ✨ @@ -205,6 +207,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file | [runner\_worker\_docker\_services\_volumes\_tmpfs](#input\_runner\_worker\_docker\_services\_volumes\_tmpfs) | Mount a tmpfs in gitlab service container. https://docs.gitlab.com/runner/executors/docker.html#mounting-a-directory-in-ram |
list(object({
volume = string
options = string
}))
| `[]` | no | | [runner\_worker\_docker\_volumes\_tmpfs](#input\_runner\_worker\_docker\_volumes\_tmpfs) | Mount a tmpfs in Executor container. https://docs.gitlab.com/runner/executors/docker.html#mounting-a-directory-in-ram |
list(object({
volume = string
options = string
}))
| `[]` | no | | [runner\_worker\_gitlab\_pipeline](#input\_runner\_worker\_gitlab\_pipeline) | post\_build\_script = Script to execute in the pipeline just after the build, but before executing after\_script.
pre\_build\_script = Script to execute in the pipeline just before the build.
pre\_clone\_script = Script to execute in the pipeline before cloning the Git repository. This can be used to adjust the Git client configuration first, for example. |
object({
post_build_script = optional(string, "\"\"")
pre_build_script = optional(string, "\"\"")
pre_clone_script = optional(string, "\"\"")
})
| `{}` | no | +| [runner\_worker\_graceful\_terminate](#input\_runner\_worker\_graceful\_terminate) | Enable to gracefully terminate runner instances, giving running jobs a chance to finish.

enabled = Boolean used to enable or disable graceful terminate.
timeout = Time in seconds to wait before aborting graceful termination and force terminating runner instances. This value should be the maximum duration of jobs using the runner; jobs running longer than this value won't finish.
retry_period = Time in seconds between retrying to stop the gitlab-runner service
job_timeout = Time in seconds to wait for gitlab jobs to stop running when stopping the gitlab-runner service |
object({
enabled = optional(bool, false)
timeout = optional(number, 1800)
retry_period = optional(number, 300)
job_timeout = optional(number, 3600)
})
| `{}` | no | | [security\_group\_prefix](#input\_security\_group\_prefix) | Set the name prefix and overwrite the `Name` tag for all security groups. | `string` | `""` | no | | [subnet\_id](#input\_subnet\_id) | Subnet id used for the Runner and Runner Workers. Must belong to the `vpc_id`. In case the fleet mode is used, multiple subnets for
the Runner Workers can be provided with runner\_worker\_docker\_machine\_instance.subnet\_ids. | `string` | n/a | yes | | [suppressed\_tags](#input\_suppressed\_tags) | List of tag keys which are automatically removed and never added as default tag by the module. | `list(string)` | `[]` | no | diff --git a/assets/images/graceful_shutdown.png b/assets/images/graceful_shutdown.png new file mode 100644 index 000000000..c7d6221bd Binary files /dev/null and b/assets/images/graceful_shutdown.png differ diff --git a/docs/concepts.md b/docs/concepts.md new file mode 100644 index 000000000..fd9efba21 --- /dev/null +++ b/docs/concepts.md @@ -0,0 +1,3 @@ +# Graceful Termination + +![Graceful Termination](../assets/images/graceful_shutdown.png) diff --git a/main.tf b/main.tf index 1d9895f9d..5825d6397 100644 --- a/main.tf +++ b/main.tf @@ -79,6 +79,7 @@ locals { use_fleet = var.runner_worker_docker_machine_fleet.enable private_key = var.runner_worker_docker_machine_fleet.enable == true ? tls_private_key.fleet[0].private_key_pem : "" use_new_runner_authentication_gitlab_16 = var.runner_gitlab_registration_config.type != "" + runner_service_stop_timeout = var.runner_worker_graceful_terminate.job_timeout }) template_runner_config = templatefile("${path.module}/template/runner-config.tftpl", @@ -643,6 +644,10 @@ module "terminate_agent_hook" { name_docker_machine_runners = local.runner_tags_merged["Name"] role_permissions_boundary = var.iam_permissions_boundary == "" ? 
null : "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/${var.iam_permissions_boundary}" kms_key_id = local.kms_key + graceful_terminate_enabled = var.runner_worker_graceful_terminate.enabled + graceful_terminate_timeout = var.runner_worker_graceful_terminate.timeout + sqs_max_receive_count = ceil(var.runner_worker_graceful_terminate.timeout / var.runner_worker_graceful_terminate.retry_period) + 1 + sqs_visibility_timeout = var.runner_worker_graceful_terminate.retry_period tags = local.tags } diff --git a/modules/terminate-agent-hook/README.md b/modules/terminate-agent-hook/README.md index 85295ec16..e68d149ba 100644 --- a/modules/terminate-agent-hook/README.md +++ b/modules/terminate-agent-hook/README.md @@ -23,6 +23,10 @@ is, that no tags are added to the spot request by the docker+machine driver and to our module. The rule is, that parts of the Executor's name become part of the related SSH key which is in turn part of the spot request. +Optionally, graceful terminate can be enabled for this module with the `graceful_terminate_enabled` variable. +When enabled, the lambda function will attempt to stop the `gitlab-runner` service on the runner before terminating +runner instances, which gives running jobs a chance to finish. + ## Usage ### Default Behavior - Package With the Module @@ -91,6 +95,13 @@ module "runner" { expiration_days = 90 } + # optional, if excluded then the default terminate instances behavior will be used + runner_worker_graceful_terminate = { + enabled = true # defaults to false + timeout = 600 + retry_period = 60 + } + runner_gitlab_registration_config = { type = "instance" # or "group" or "project" # group_id = 1234 # for "group" @@ -141,21 +152,34 @@ No modules. 
| [aws_cloudwatch_event_rule.terminate_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_target.terminate_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_log_group.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_iam_policy.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.spot_request_housekeeping](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.spot_request_housekeeping](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| 
[aws_lambda_event_source_mapping.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_event_source_mapping) | resource | | [aws_lambda_function.terminate_runner_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_function_event_invoke_config.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function_event_invoke_config) | resource | | [aws_lambda_permission.current_version_triggers](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_lambda_permission.unqualified_alias_triggers](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [archive_file.terminate_runner_instances_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | | [aws_caller_identity.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_iam_policy_document.asg_lifecycle_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | 
[aws_iam_policy_document.spot_request_housekeeping](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | | [aws_region.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | +| [aws_sqs_queue.graceful_terminate_dlq](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/sqs_queue) | resource | +| [aws_sqs_queue.graceful_terminate_queue](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/sqs_queue) | resource | +| [aws_ssm_document.stop_gitlab_runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_document) | resource | ## Inputs diff --git a/modules/terminate-agent-hook/cloudwatch.tf b/modules/terminate-agent-hook/cloudwatch.tf index 80e0727f7..fa86e7b11 100644 --- a/modules/terminate-agent-hook/cloudwatch.tf +++ b/modules/terminate-agent-hook/cloudwatch.tf @@ -6,6 +6,8 @@ # ---------------------------------------------------------------------------- resource "aws_cloudwatch_event_rule" "terminate_instances" { + count = var.graceful_terminate_enabled ? 0 : 1 + name = "${var.environment}-${var.name}" description = "Trigger GitLab runner instance lifecycle hook on termination." @@ -23,7 +25,9 @@ EOF } resource "aws_cloudwatch_event_target" "terminate_instances" { - rule = aws_cloudwatch_event_rule.terminate_instances.name + count = var.graceful_terminate_enabled ? 
0 : 1 + + rule = aws_cloudwatch_event_rule.terminate_instances[0].name target_id = "${var.environment}-TriggerTerminateLambda" arn = aws_lambda_function.terminate_runner_instances.arn } diff --git a/modules/terminate-agent-hook/iam.tf b/modules/terminate-agent-hook/iam.tf index 47cd87210..25857fc1c 100644 --- a/modules/terminate-agent-hook/iam.tf +++ b/modules/terminate-agent-hook/iam.tf @@ -8,6 +8,75 @@ data "aws_region" "this" {} # Terminate Instances - IAM Resources # ---------------------------------------------------------------------------- +################################################################################ +### ASG IAM +################################################################################ + +data "aws_iam_policy_document" "asg_lifecycle_assume_role" { + count = var.graceful_terminate_enabled ? 1 : 0 + + statement { + actions = [ + "sts:AssumeRole", + ] + effect = "Allow" + + principals { + identifiers = ["autoscaling.amazonaws.com"] + type = "Service" + } + } +} + +resource "aws_iam_role" "asg_lifecycle" { + count = var.graceful_terminate_enabled ? 1 : 0 + + name = "${var.name_iam_objects}-${var.name}-asg-lifecycle" + description = "Role for the graceful terminate ASG lifecycle hook" + path = "/" + permissions_boundary = var.role_permissions_boundary + assume_role_policy = data.aws_iam_policy_document.asg_lifecycle_assume_role[0].json + force_detach_policies = true + tags = var.tags +} + +# This IAM policy is used by the ASG lifecycle hook. +data "aws_iam_policy_document" "asg_lifecycle" { + count = var.graceful_terminate_enabled ? 1 : 0 + + # Permit the GitLab Runner ASG to send messages to SQS + statement { + sid = "ASGLifecycleSqs" + actions = [ + "sqs:SendMessage", + "sqs:GetQueueUrl" + ] + resources = ["${aws_sqs_queue.graceful_terminate_queue[0].arn}"] + effect = "Allow" + } +} + +resource "aws_iam_policy" "asg_lifecycle" { + count = var.graceful_terminate_enabled ? 
1 : 0 + + name = "${var.name_iam_objects}-${var.name}-asg-lifecycle" + path = "/" + policy = data.aws_iam_policy_document.asg_lifecycle[0].json + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "asg_lifecycle" { + count = var.graceful_terminate_enabled ? 1 : 0 + + role = aws_iam_role.asg_lifecycle[0].name + policy_arn = aws_iam_policy.asg_lifecycle[0].arn +} + +################################################################################ +### Lambda IAM +################################################################################ + data "aws_iam_policy_document" "assume_role" { statement { actions = [ @@ -134,6 +203,65 @@ data "aws_iam_policy_document" "spot_request_housekeeping" { } } +data "aws_iam_policy_document" "graceful_terminate" { + count = var.graceful_terminate_enabled ? 1 : 0 + + # Permit the function to process SQS messages + statement { + sid = "GitLabRunnerGracefulTerminateSQS" + actions = [ + "sqs:DeleteMessage", + "sqs:GetQueueAttributes", + "sqs:ReceiveMessage" + ] + effect = "Allow" + resources = [ + resource.aws_sqs_queue.graceful_terminate_queue[0].arn + ] + } + + # Permit the function to invoke the SSM document for stopping gitlab-runner + statement { + sid = "GitLabRunnerGracefulTerminateSSMSend" + actions = [ + "ssm:SendCommand" + ] + effect = "Allow" + resources = [ + resource.aws_ssm_document.stop_gitlab_runner[0].arn + ] + } + + # Permit the function to send SSM commands to the GitLab Runner instance + statement { + sid = "GitLabRunnerGracefulTerminateSSMSendEC2" + actions = [ + "ssm:SendCommand" + ] + effect = "Allow" + resources = [ + "arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.this.name}:${data.aws_caller_identity.this.account_id}:instance/*" + ] + condition { + test = "StringLike" + variable = "ssm:ResourceTag/Name" + values = ["${var.environment}*"] + } + } + + # Permit the function to get SSM command invocation details + statement { + sid = "GitLabRunnerGracefulTerminateSSMGet" + 
actions = [ + "ssm:GetCommandInvocation" + ] + effect = "Allow" + resources = [ + "*" + ] + } +} + resource "aws_iam_policy" "lambda" { name = "${var.name_iam_objects}-${var.name}-lambda" path = "/" @@ -159,3 +287,20 @@ resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" { role = aws_iam_role.lambda.name policy_arn = aws_iam_policy.spot_request_housekeeping.arn } + +resource "aws_iam_policy" "graceful_terminate" { + count = var.graceful_terminate_enabled ? 1 : 0 + + name = "${var.name_iam_objects}-${var.name}-graceful-terminate" + path = "/" + policy = data.aws_iam_policy_document.graceful_terminate[0].json + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "graceful_terminate" { + count = var.graceful_terminate_enabled ? 1 : 0 + + role = aws_iam_role.lambda.name + policy_arn = aws_iam_policy.graceful_terminate[0].arn +} diff --git a/modules/terminate-agent-hook/lambda/lambda_function.py b/modules/terminate-agent-hook/lambda/lambda_function.py index c2aa7e9b9..13cca9874 100644 --- a/modules/terminate-agent-hook/lambda/lambda_function.py +++ b/modules/terminate-agent-hook/lambda/lambda_function.py @@ -10,12 +10,105 @@ This is rudimentary and doesn't check if a build runner has a current job. """ import boto3 -from botocore.exceptions import ClientError +from botocore.exceptions import ClientError, WaiterError import json import os import sys +def check_runner_running(client, instance_id): + """ + Checks if the runner instance is running. + :param client: the boto3 ec2 client + :param instance_id: the ID of the runner instance + :return: true or false, whether the runner instance is running + """ + print(json.dumps({ + "Level": "info", + "Message": "Looking for running runner instance..." 
+ })) + try: + reservations = client.describe_instances(InstanceIds=[instance_id], Filters=[ + { + "Name": "instance-state-name", + "Values": ["running", "pending"], + } + ]).get("Reservations") + except ClientError as error: + print(json.dumps({ + "Level": "error", + "Message": "Failed to lookup runner instance" + })) + raise error + + if len(reservations) > 0: + print(json.dumps({ + "Level": "info", + "Message": "Runner instance still running" + })) + return True + + print(json.dumps({ + "Level": "info", + "Message": "Runner instance already terminated" + })) + return False + + +def stop_runner_service(client, instance_id): + """ + Stops the gitlab-runner service on the runner instance using SSM command. + The command may fail if the gitlab-runner service has jobs running, in + which case the function will error and be re-tried by SQS. + :param client: the boto3 SSM client + :param instance_id: the ID of the runner instance + """ + print(json.dumps({ + "Level": "info", + "Message": "Stopping gitlab-runner service..." 
+ })) + + try: + initial_response = client.send_command(DocumentName=os.environ['DOCUMENT_NAME'], + Comment="Stop gitlab-runner service, and check whether it's stopped.", + InstanceIds=[instance_id] + ) + except ClientError as error: + print(json.dumps({ + "Level": "error", + "Message": "Failed to send SSM command" + })) + raise error + + command_id = initial_response['Command']['CommandId'] + + try: + waiter = client.get_waiter('command_executed') + waiter.wait( + CommandId=command_id, + InstanceId=instance_id, + WaiterConfig={ + "Delay": 3, + "MaxAttempts": 10 + } + ) + command_response = client.get_command_invocation(CommandId=command_id, InstanceId=instance_id) + except WaiterError as error: + print(json.dumps({ + "Level": "error", + "Message": "Failure waiting for command to be successful" + })) + raise error + + if command_response['Status'] == "Success": + print(json.dumps({ + "Level": "info", + "Message": f"gitlab-runner service stopped, SSM command response: {command_response}" + })) + else: + raise RuntimeError(f"ERROR: gitlab-runner service not stopped, SSM command response: {command_response}") + + def ec2_list(client, **args): # to be refactored in #631 # pylint: disable=too-many-branches, too-many-nested-blocks @@ -232,18 +325,94 @@ def handler(event, context): :param event: see https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-concepts.html#gettingstarted-concepts-event :param context: see https://docs.aws.amazon.com/lambda/latest/dg/python-context.html """ - event_detail = event['detail'] - if event_detail['LifecycleTransition'] != "autoscaling:EC2_INSTANCE_TERMINATING": - sys.exit() + # if graceful terminate is enabled, then a SQS queue is created to + # accept messages from the ASG lifecycle hook and trigger this lambda, + # so the event received by this lambda will be in SQS message format + # + # if graceful terminate is disabled then a cloudwatch event rule for + # the ASG lifecycle hook is created to trigger this lambda, so the + # 
event received by this lambda will be in cloudwatch event format + if os.environ['GRACEFUL_TERMINATE_ENABLED'] == "true": + message = json.loads(event['Records'][0]['body']) + + region = event['Records'][0]['awsRegion'] + instance_id = message.get("EC2InstanceId") + + if instance_id is None: + no_instance_id_msg = "No instance ID, skipping" + print(json.dumps({ + "Level": "info", + "Message": no_instance_id_msg + })) + return no_instance_id_msg + + ec2_client = boto3.client("ec2", region_name=region) + ssm_client = boto3.client("ssm", region_name=region) + as_client = boto3.client("autoscaling", region_name=region) + + try: + if check_runner_running(ec2_client, instance_id): + stop_runner_service(ssm_client, instance_id) + + print(json.dumps({ + "Level": "info", + "Message": "Completing lifecycle action..." + })) + lifecycle_action_response = as_client.complete_lifecycle_action( + AutoScalingGroupName=message['AutoScalingGroupName'], + LifecycleHookName=message['LifecycleHookName'], + LifecycleActionToken=message['LifecycleActionToken'], + LifecycleActionResult="CONTINUE" + ) + + print(json.dumps({ + "Level": "info", + "Message": f"CompleteLifecycleAction Successful, response: {lifecycle_action_response}" + })) + # catch everything here and log it + # pylint: disable=broad-exception-caught + except Exception as ex: + print(json.dumps({ + "Level": "exception", + "Exception": str(ex) + })) - client = boto3.client("ec2", region_name=event['region']) + # if the gitlab-runner service fails to be stopped, the function can error out and the SQS + # message will go back to the queue to be retried, up to a set amount of times + message_receive_count = int(event['Records'][0]['attributes']['ApproximateReceiveCount']) + max_receive_count = int(os.environ['SQS_MAX_RECEIVE_COUNT']) + print(json.dumps({ + "Level": "info", + "Message": f"Graceful termination retry count: {message_receive_count}/{max_receive_count}" + })) + if message_receive_count < max_receive_count: + 
print(json.dumps({ + "Level": "info", + "Message": "Graceful termination will be retried in next function run" + })) + sys.exit(1) + else: + print(json.dumps({ + "Level": "info", + "Message": "Reached max received count, continuing with instance termination" + })) + else: + event_detail = event['detail'] + instance_id = event_detail['EC2InstanceId'] + region = event['region'] + + if event_detail['LifecycleTransition'] != "autoscaling:EC2_INSTANCE_TERMINATING": + sys.exit() + + ec2_client = boto3.client("ec2", region_name=region) + # make sure that no new instances are created - cancel_active_spot_requests(ec2_client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE']) + cancel_active_spot_requests(ec2_client=ec2_client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE']) # find the executors connected to this agent and terminate them as well - _terminate_list = ec2_list(client=client, parent=event_detail['EC2InstanceId']) + _terminate_list = ec2_list(client=ec2_client, parent=instance_id) if len(_terminate_list) > 0: print(json.dumps({ @@ -251,7 +420,7 @@ def handler(event, context): "Message": f"Terminating instances {', '.join(_terminate_list)}" })) try: - client.terminate_instances(InstanceIds=_terminate_list, DryRun=False) + ec2_client.terminate_instances(InstanceIds=_terminate_list, DryRun=False) print(json.dumps({ "Level": "info", @@ -270,7 +439,7 @@ def handler(event, context): "Message": "No instances to terminate." 
})) - remove_unused_ssh_key_pairs(client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE']) + remove_unused_ssh_key_pairs(client=ec2_client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE']) return "Housekeeping done" diff --git a/modules/terminate-agent-hook/locals.tf b/modules/terminate-agent-hook/locals.tf index d2f10528f..5b633770b 100644 --- a/modules/terminate-agent-hook/locals.tf +++ b/modules/terminate-agent-hook/locals.tf @@ -1,3 +1,4 @@ locals { - lambda_timeout = 30 + lambda_timeout = 30 + graceful_terminate_lambda_timeout = 60 } \ No newline at end of file diff --git a/modules/terminate-agent-hook/main.tf b/modules/terminate-agent-hook/main.tf index 33f315c4a..7f589dd59 100644 --- a/modules/terminate-agent-hook/main.tf +++ b/modules/terminate-agent-hook/main.tf @@ -36,14 +36,17 @@ resource "aws_lambda_function" "terminate_runner_instances" { publish = true role = aws_iam_role.lambda.arn runtime = "python3.11" - timeout = local.lambda_timeout + timeout = var.graceful_terminate_enabled ? local.graceful_terminate_lambda_timeout : local.lambda_timeout kms_key_arn = var.kms_key_id tags = var.tags environment { variables = { - NAME_EXECUTOR_INSTANCE = var.name_docker_machine_runners + NAME_EXECUTOR_INSTANCE = var.name_docker_machine_runners + GRACEFUL_TERMINATE_ENABLED = var.graceful_terminate_enabled + DOCUMENT_NAME = var.graceful_terminate_enabled ? aws_ssm_document.stop_gitlab_runner[0].name : null + SQS_MAX_RECEIVE_COUNT = var.sqs_max_receive_count } } @@ -56,27 +59,52 @@ resource "aws_lambda_function" "terminate_runner_instances" { } } +resource "aws_autoscaling_lifecycle_hook" "terminate_instances" { + + name = "${var.environment}-${var.name}" + autoscaling_group_name = var.asg_name + default_result = "CONTINUE" + heartbeat_timeout = var.graceful_terminate_enabled ? 
var.graceful_terminate_timeout : local.lambda_timeout + 20 # allow some extra time for cold starts + lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING" + notification_target_arn = var.graceful_terminate_enabled ? aws_sqs_queue.graceful_terminate_queue[0].arn : null + role_arn = var.graceful_terminate_enabled ? aws_iam_role.asg_lifecycle[0].arn : null +} + +# use cloudwatch event trigger when graceful terminate is disabled + resource "aws_lambda_permission" "current_version_triggers" { + count = var.graceful_terminate_enabled ? 0 : 1 + function_name = aws_lambda_function.terminate_runner_instances.function_name qualifier = aws_lambda_function.terminate_runner_instances.version statement_id = "TerminateInstanceEvent" action = "lambda:InvokeFunction" principal = "events.amazonaws.com" - source_arn = aws_cloudwatch_event_rule.terminate_instances.arn + source_arn = aws_cloudwatch_event_rule.terminate_instances[0].arn } resource "aws_lambda_permission" "unqualified_alias_triggers" { + count = var.graceful_terminate_enabled ? 0 : 1 + function_name = aws_lambda_function.terminate_runner_instances.function_name statement_id = "TerminateInstanceEventUnqualified" action = "lambda:InvokeFunction" principal = "events.amazonaws.com" - source_arn = aws_cloudwatch_event_rule.terminate_instances.arn + source_arn = aws_cloudwatch_event_rule.terminate_instances[0].arn } -resource "aws_autoscaling_lifecycle_hook" "terminate_instances" { - name = "${var.environment}-${var.name}" - autoscaling_group_name = var.asg_name - default_result = "CONTINUE" - heartbeat_timeout = local.lambda_timeout + 20 # allow some extra time for cold starts - lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING" +# use SQS trigger when graceful terminate is enabled + +resource "aws_lambda_function_event_invoke_config" "graceful_terminate" { + count = var.graceful_terminate_enabled ? 
1 : 0 + + function_name = aws_lambda_function.terminate_runner_instances.function_name + maximum_retry_attempts = 0 +} + +resource "aws_lambda_event_source_mapping" "graceful_terminate" { + count = var.graceful_terminate_enabled ? 1 : 0 + + event_source_arn = aws_sqs_queue.graceful_terminate_queue[0].arn + function_name = aws_lambda_function.terminate_runner_instances.arn } diff --git a/modules/terminate-agent-hook/sqs.tf b/modules/terminate-agent-hook/sqs.tf new file mode 100644 index 000000000..f8d90102a --- /dev/null +++ b/modules/terminate-agent-hook/sqs.tf @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------------- +# Graceful Terminate - SQS Resources +# ---------------------------------------------------------------------------- + +resource "aws_sqs_queue" "graceful_terminate_dlq" { + count = var.graceful_terminate_enabled ? 1 : 0 + + name = "${var.environment}-graceful-terminate-dlq" + sqs_managed_sse_enabled = true + + tags = var.tags +} + +resource "aws_sqs_queue" "graceful_terminate_queue" { + count = var.graceful_terminate_enabled ? 1 : 0 + + name = "${var.environment}-graceful-terminate-queue" + sqs_managed_sse_enabled = true + visibility_timeout_seconds = var.sqs_visibility_timeout + redrive_policy = jsonencode({ + deadLetterTargetArn = aws_sqs_queue.graceful_terminate_dlq[0].arn + maxReceiveCount = var.sqs_max_receive_count + }) + + tags = var.tags +} diff --git a/modules/terminate-agent-hook/ssm.tf b/modules/terminate-agent-hook/ssm.tf new file mode 100644 index 000000000..32940568e --- /dev/null +++ b/modules/terminate-agent-hook/ssm.tf @@ -0,0 +1,45 @@ +# ---------------------------------------------------------------------------- +# Graceful Terminate - SQS Resources +# ---------------------------------------------------------------------------- + +resource "aws_ssm_document" "stop_gitlab_runner" { + count = var.graceful_terminate_enabled ? 
1 : 0 + + name = "${var.environment}-stop-gitlab-runner" + document_format = "YAML" + document_type = "Command" + + content = <&2 + exit 1 + fi +DOC + + tags = var.tags +} diff --git a/modules/terminate-agent-hook/variables.tf b/modules/terminate-agent-hook/variables.tf index 3cfb49970..d019e6dac 100644 --- a/modules/terminate-agent-hook/variables.tf +++ b/modules/terminate-agent-hook/variables.tf @@ -60,3 +60,24 @@ variable "enable_xray_tracing" { type = bool default = false } + +variable "graceful_terminate_enabled" { + description = "Whether to graceful terminate is enabled." + type = bool + default = false +} + +variable "graceful_terminate_timeout" { + description = "Time in seconds to wait for lifecycle complete action before continuing termination." + type = number +} + +variable "sqs_visibility_timeout" { + description = "Time in seconds that a message will be invisible for after being consumed." + type = number +} + +variable "sqs_max_receive_count" { + description = "Number of times a message can be consumed before it's placed in the DLQ." 
+ type = number +} diff --git a/template/gitlab-runner.tftpl b/template/gitlab-runner.tftpl index 14d3dcccb..5830b6286 100644 --- a/template/gitlab-runner.tftpl +++ b/template/gitlab-runner.tftpl @@ -246,6 +246,16 @@ then yum install gitlab-runner-${gitlab_runner_version} -y fi +# set timeout for runner service to wait before stopping +mkdir /etc/systemd/system/gitlab-runner.service.d +cat < /etc/systemd/system/gitlab-runner.service.d/kill.conf +[Service] +# Time to wait before stopping the service in seconds +TimeoutStopSec=${runner_service_stop_timeout} +# according to the GitLab Runner docs, this initiates a graceful shutdown of the service +KillSignal=SIGQUIT +EOF + ${post_install} chkconfig gitlab-runner on diff --git a/variables.tf b/variables.tf index 68811217f..be4cd21f2 100644 --- a/variables.tf +++ b/variables.tf @@ -752,6 +752,17 @@ variable "runner_worker_docker_machine_autoscaling_options" { default = [] } +variable "runner_worker_graceful_terminate" { + description = "Gracefully terminate Runner Worker, by waiting a set amount of time for running jobs to finish before termination." + type = object({ + enabled = optional(bool, false) + timeout = optional(number, 1800) + retry_period = optional(number, 300) + job_timeout = optional(number, 3600) + }) + default = {} +} + variable "debug" { description = <<-EOT trace_runner_user_data: Enable bash trace for the user data script on the Agent. Be aware this could log sensitive data such as you GitLab runner token.