Skip to content

Commit 7499b0b

Browse files
committed
feat: add graceful terminate option to terminate-agent-hook
1 parent 51d5ea5 commit 7499b0b

15 files changed

+507
-22
lines changed

.cspell.json

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
"gitter",
7171
"Niek",
7272
"oxsecurity",
73+
"redrive",
7374
"signoff",
7475
"typecheck",
7576
"userdata",

.pylintrc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[MASTER]
2-
init-hook="import sys; sys.path.insert(0, '/usr/local/lib/python3.11/site-packages/')"
2+
init-hook="import sys; sys.path.insert(0, '/usr/local/lib/python3.12/site-packages/')"
33

44
[FORMAT]
55
max-line-length=132

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
205205
| <a name="input_runner_worker_docker_services_volumes_tmpfs"></a> [runner\_worker\_docker\_services\_volumes\_tmpfs](#input\_runner\_worker\_docker\_services\_volumes\_tmpfs) | Mount a tmpfs in gitlab service container. https://docs.gitlab.com/runner/executors/docker.html#mounting-a-directory-in-ram | <pre>list(object({<br> volume = string<br> options = string<br> }))</pre> | `[]` | no |
206206
| <a name="input_runner_worker_docker_volumes_tmpfs"></a> [runner\_worker\_docker\_volumes\_tmpfs](#input\_runner\_worker\_docker\_volumes\_tmpfs) | Mount a tmpfs in Executor container. https://docs.gitlab.com/runner/executors/docker.html#mounting-a-directory-in-ram | <pre>list(object({<br> volume = string<br> options = string<br> }))</pre> | `[]` | no |
207207
| <a name="input_runner_worker_gitlab_pipeline"></a> [runner\_worker\_gitlab\_pipeline](#input\_runner\_worker\_gitlab\_pipeline) | post\_build\_script = Script to execute in the pipeline just after the build, but before executing after\_script.<br>pre\_build\_script = Script to execute in the pipeline just before the build.<br>pre\_clone\_script = Script to execute in the pipeline before cloning the Git repository. this can be used to adjust the Git client configuration first, for example. | <pre>object({<br> post_build_script = optional(string, "\"\"")<br> pre_build_script = optional(string, "\"\"")<br> pre_clone_script = optional(string, "\"\"")<br> })</pre> | `{}` | no |
208+
| <a name="input_runner_worker_graceful_terminate"></a> [runner\_worker\_graceful\_terminate](#input\_runner\_worker\_graceful\_terminate) | Enable to gracefully terminate runner instances, giving running jobs a chance to finish.<br><br> enabled = Boolean used to enable or disable graceful terminate.<br>timeout = Time in seconds to wait before aborting graceful termination and force terminating runner instances<br>retry_period = Time in seconds between retrying to stop the gitlab-runner service<br>job_timeout = Time in seconds to wait for gitlab jobs to stop running when stopping the gitlab-runner service | <pre>object({<br> enabled = optional(bool, false)<br> timeout = optional(number, 1800)<br> retry_period = optional(number, 300)<br> job_timeout = optional(number, 3600)<br> })</pre> | `{}` | no |
208209
| <a name="input_security_group_prefix"></a> [security\_group\_prefix](#input\_security\_group\_prefix) | Set the name prefix and overwrite the `Name` tag for all security groups. | `string` | `""` | no |
209210
| <a name="input_subnet_id"></a> [subnet\_id](#input\_subnet\_id) | Subnet id used for the Runner and Runner Workers. Must belong to the `vpc_id`. In case the fleet mode is used, multiple subnets for<br>the Runner Workers can be provided with runner\_worker\_docker\_machine\_instance.subnet\_ids. | `string` | n/a | yes |
210211
| <a name="input_suppressed_tags"></a> [suppressed\_tags](#input\_suppressed\_tags) | List of tag keys which are automatically removed and never added as default tag by the module. | `list(string)` | `[]` | no |

main.tf

+5
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ locals {
7979
use_fleet = var.runner_worker_docker_machine_fleet.enable
8080
private_key = var.runner_worker_docker_machine_fleet.enable == true ? tls_private_key.fleet[0].private_key_pem : ""
8181
use_new_runner_authentication_gitlab_16 = var.runner_gitlab_registration_config.type != ""
82+
runner_service_stop_timeout = var.runner_worker_graceful_terminate.job_timeout
8283
})
8384

8485
template_runner_config = templatefile("${path.module}/template/runner-config.tftpl",
@@ -641,6 +642,10 @@ module "terminate_agent_hook" {
641642
name_docker_machine_runners = local.runner_tags_merged["Name"]
642643
role_permissions_boundary = var.iam_permissions_boundary == "" ? null : "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/${var.iam_permissions_boundary}"
643644
kms_key_id = local.kms_key
645+
graceful_terminate_enabled = var.runner_worker_graceful_terminate.enabled
646+
graceful_terminate_timeout = var.runner_worker_graceful_terminate.timeout
647+
sqs_max_receive_count = ceil(var.runner_worker_graceful_terminate.timeout / var.runner_worker_graceful_terminate.retry_period) + 1
648+
sqs_visibility_timeout = var.runner_worker_graceful_terminate.retry_period
644649

645650
tags = local.tags
646651
}

modules/terminate-agent-hook/README.md

+24
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ is, that no tags are added to the spot request by the docker+machine driver and
2323
to our module. The rule is, that parts of the Executor's name become part of the related SSH key which is in turn part
2424
of the spot request.
2525

26+
Optionally, graceful terminate can be enabled for this module with the `graceful_terminate_enabled` variable.
27+
When enabled, the lambda function will attempt to stop the `gitlab-runner` service on the runner before terminating
28+
runner instances, which gives running jobs a chance to finish.
29+
2630
## Usage
2731

2832
### Default Behavior - Package With the Module
@@ -91,6 +95,13 @@ module "runner" {
9195
expiration_days = 90
9296
}
9397
98+
# Optional. If omitted, the default behavior (terminating instances immediately) is used.
99+
runner_worker_graceful_terminate = {
100+
enabled = true # defaults to false
101+
timeout = 600
102+
retry_period = 60
103+
}
104+
94105
runner_gitlab_registration_config = {
95106
type = "instance" # or "group" or "project"
96107
# group_id = 1234 # for "group"
@@ -141,21 +152,34 @@ No modules.
141152
| [aws_cloudwatch_event_rule.terminate_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource |
142153
| [aws_cloudwatch_event_target.terminate_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource |
143154
| [aws_cloudwatch_log_group.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
155+
| [aws_iam_policy.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
156+
| [aws_iam_policy.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
144157
| [aws_iam_policy.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
145158
| [aws_iam_policy.spot_request_housekeeping](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
159+
| [aws_iam_role.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
146160
| [aws_iam_role.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
161+
| [aws_iam_role_policy_attachment.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
162+
| [aws_iam_role_policy_attachment.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
147163
| [aws_iam_role_policy_attachment.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
148164
| [aws_iam_role_policy_attachment.spot_request_housekeeping](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
165+
| [aws_lambda_event_source_mapping.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_event_source_mapping) | resource |
149166
| [aws_lambda_function.terminate_runner_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
167+
| [aws_lambda_function_event_invoke_config.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function_event_invoke_config) | resource |
150168
| [aws_lambda_permission.current_version_triggers](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
151169
| [aws_lambda_permission.unqualified_alias_triggers](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
152170
| [archive_file.terminate_runner_instances_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
153171
| [aws_caller_identity.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
172+
| [aws_iam_policy_document.asg_lifecycle_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
154173
| [aws_iam_policy_document.assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
174+
| [aws_iam_policy_document.asg_lifecycle](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
175+
| [aws_iam_policy_document.graceful_terminate](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
155176
| [aws_iam_policy_document.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
156177
| [aws_iam_policy_document.spot_request_housekeeping](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
157178
| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source |
158179
| [aws_region.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
180+
| [aws_sqs_queue.graceful_terminate_dlq](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource |
181+
| [aws_sqs_queue.graceful_terminate_queue](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource |
182+
| [aws_ssm_document.stop_gitlab_runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_document) | resource |
159183

160184
## Inputs
161185

modules/terminate-agent-hook/cloudwatch.tf

+5-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
# ----------------------------------------------------------------------------
77

88
resource "aws_cloudwatch_event_rule" "terminate_instances" {
9+
count = var.graceful_terminate_enabled ? 0 : 1
10+
911
name = "${var.environment}-${var.name}"
1012
description = "Trigger GitLab runner instance lifecycle hook on termination."
1113

@@ -23,7 +25,9 @@ EOF
2325
}
2426

2527
resource "aws_cloudwatch_event_target" "terminate_instances" {
26-
rule = aws_cloudwatch_event_rule.terminate_instances.name
28+
count = var.graceful_terminate_enabled ? 0 : 1
29+
30+
rule = aws_cloudwatch_event_rule.terminate_instances[0].name
2731
target_id = "${var.environment}-TriggerTerminateLambda"
2832
arn = aws_lambda_function.terminate_runner_instances.arn
2933
}

modules/terminate-agent-hook/iam.tf

+145
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,75 @@ data "aws_region" "this" {}
88
# Terminate Instances - IAM Resources
99
# ----------------------------------------------------------------------------
1010

11+
################################################################################
12+
### ASG IAM
13+
################################################################################
14+
15+
data "aws_iam_policy_document" "asg_lifecycle_assume_role" {
16+
count = var.graceful_terminate_enabled ? 1 : 0
17+
18+
statement {
19+
actions = [
20+
"sts:AssumeRole",
21+
]
22+
effect = "Allow"
23+
24+
principals {
25+
identifiers = ["autoscaling.amazonaws.com"]
26+
type = "Service"
27+
}
28+
}
29+
}
30+
31+
resource "aws_iam_role" "asg_lifecycle" {
32+
count = var.graceful_terminate_enabled ? 1 : 0
33+
34+
name = "${var.name_iam_objects}-${var.name}-asg-lifecycle"
35+
description = "Role for the graceful terminate ASG lifecycle hook"
36+
path = "/"
37+
permissions_boundary = var.role_permissions_boundary
38+
assume_role_policy = data.aws_iam_policy_document.asg_lifecycle_assume_role[0].json
39+
force_detach_policies = true
40+
tags = var.tags
41+
}
42+
43+
# This IAM policy is used by the ASG lifecycle hook.
44+
data "aws_iam_policy_document" "asg_lifecycle" {
45+
count = var.graceful_terminate_enabled ? 1 : 0
46+
47+
# Permit the GitLab Runner ASG to send messages to SQS
48+
statement {
49+
sid = "ASGLifecycleSqs"
50+
actions = [
51+
"sqs:SendMessage",
52+
"sqs:GetQueueUrl"
53+
]
54+
resources = ["${aws_sqs_queue.graceful_terminate_queue[0].arn}"]
55+
effect = "Allow"
56+
}
57+
}
58+
59+
resource "aws_iam_policy" "asg_lifecycle" {
60+
count = var.graceful_terminate_enabled ? 1 : 0
61+
62+
name = "${var.name_iam_objects}-${var.name}-asg-lifecycle"
63+
path = "/"
64+
policy = data.aws_iam_policy_document.asg_lifecycle[0].json
65+
66+
tags = var.tags
67+
}
68+
69+
resource "aws_iam_role_policy_attachment" "asg_lifecycle" {
70+
count = var.graceful_terminate_enabled ? 1 : 0
71+
72+
role = aws_iam_role.asg_lifecycle[0].name
73+
policy_arn = aws_iam_policy.asg_lifecycle[0].arn
74+
}
75+
76+
################################################################################
77+
### Lambda IAM
78+
################################################################################
79+
1180
data "aws_iam_policy_document" "assume_role" {
1281
statement {
1382
actions = [
@@ -134,6 +203,65 @@ data "aws_iam_policy_document" "spot_request_housekeeping" {
134203
}
135204
}
136205

206+
data "aws_iam_policy_document" "graceful_terminate" {
207+
count = var.graceful_terminate_enabled ? 1 : 0
208+
209+
# Permit the function to process SQS messages
210+
statement {
211+
sid = "GitLabRunnerGracefulTerminateSQS"
212+
actions = [
213+
"sqs:DeleteMessage",
214+
"sqs:GetQueueAttributes",
215+
"sqs:ReceiveMessage"
216+
]
217+
effect = "Allow"
218+
resources = [
219+
resource.aws_sqs_queue.graceful_terminate_queue[0].arn
220+
]
221+
}
222+
223+
# Permit the function to invoke the SSM document for stopping gitlab-runner
224+
statement {
225+
sid = "GitLabRunnerGracefulTerminateSSMSend"
226+
actions = [
227+
"ssm:SendCommand"
228+
]
229+
effect = "Allow"
230+
resources = [
231+
resource.aws_ssm_document.stop_gitlab_runner[0].arn
232+
]
233+
}
234+
235+
# Permit the function to send SSM commands to the GitLab Runner instance
236+
statement {
237+
sid = "GitLabRunnerGracefulTerminateSSMSendEC2"
238+
actions = [
239+
"ssm:SendCommand"
240+
]
241+
effect = "Allow"
242+
resources = [
243+
"arn:${data.aws_partition.current.partition}:ec2:${data.aws_region.this.name}:${data.aws_caller_identity.this.account_id}:instance/*"
244+
]
245+
condition {
246+
test = "StringLike"
247+
variable = "ssm:ResourceTag/Name"
248+
values = ["${var.environment}*"]
249+
}
250+
}
251+
252+
# Permit the function to get SSM command invocation details
253+
statement {
254+
sid = "GitLabRunnerGracefulTerminateSSMGet"
255+
actions = [
256+
"ssm:GetCommandInvocation"
257+
]
258+
effect = "Allow"
259+
resources = [
260+
"*"
261+
]
262+
}
263+
}
264+
137265
resource "aws_iam_policy" "lambda" {
138266
name = "${var.name_iam_objects}-${var.name}-lambda"
139267
path = "/"
@@ -159,3 +287,20 @@ resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" {
159287
role = aws_iam_role.lambda.name
160288
policy_arn = aws_iam_policy.spot_request_housekeeping.arn
161289
}
290+
291+
resource "aws_iam_policy" "graceful_terminate" {
292+
count = var.graceful_terminate_enabled ? 1 : 0
293+
294+
name = "${var.name_iam_objects}-${var.name}-graceful-terminate"
295+
path = "/"
296+
policy = data.aws_iam_policy_document.graceful_terminate[0].json
297+
298+
tags = var.tags
299+
}
300+
301+
resource "aws_iam_role_policy_attachment" "graceful_terminate" {
302+
count = var.graceful_terminate_enabled ? 1 : 0
303+
304+
role = aws_iam_role.lambda.name
305+
policy_arn = aws_iam_policy.graceful_terminate[0].arn
306+
}

0 commit comments

Comments
 (0)