Skip to content

Commit 3151807

Browse files
authored
feat: remove unused SSH keys (#652)
## Description Whenever an executor is created it needs an SSH key so the agent is able to contact it. In case the executor is killed this SSH key is not removed. Over time they accumulate and reach the AWS limit. From this point no further SSH keys can be created and thus the module is no longer able to create executors. You won't be able to run any pipeline. Some users have created their own housekeeping of SSH keys. But this should be done within the module as the keys are created here. Challenge: Make sure that keys are deleted only which belong to our module. Unfortunately the GitLab Runner does not tag the keys. We use the following logic to determine the keys to delete: - they are unused - their name starts with `runner` - they have the `var.environment`/`var.overrides['name_docker_machine']` somewhere in the name Solves part of #623 Closes #592
1 parent 8746f14 commit 3151807

File tree

7 files changed

+128
-24
lines changed

7 files changed

+128
-24
lines changed

.gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,7 @@ builds/
3131

3232
# exceptions for semantic release
3333
!.release/package*
34-
!.release/*.lock
34+
!.release/*.lock
35+
36+
# Python
37+
venv/

locals.tf

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ locals {
2525
name_runner_agent_instance = var.overrides["name_runner_agent_instance"] == "" ? local.tags["Name"] : var.overrides["name_runner_agent_instance"]
2626
name_sg = var.overrides["name_sg"] == "" ? local.tags["Name"] : var.overrides["name_sg"]
2727
name_iam_objects = lookup(var.overrides, "name_iam_objects", "") == "" ? local.tags["Name"] : var.overrides["name_iam_objects"]
28+
2829
runners_additional_volumes = <<-EOT
2930
%{~if var.runners_add_dind_volumes~},"/certs/client", "/builds", "/var/run/docker.sock:/var/run/docker.sock"%{endif~}%{~for volume in var.runners_additional_volumes~},"${volume}"%{endfor~}
3031
EOT

main.tf

+1
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,7 @@ module "terminate_agent_hook" {
545545
asg_name = aws_autoscaling_group.gitlab_runner_instance.name
546546
cloudwatch_logging_retention_in_days = var.cloudwatch_logging_retention_in_days
547547
name_iam_objects = local.name_iam_objects
548+
name_docker_machine_runners = local.runner_tags_merged["Name"]
548549
role_permissions_boundary = var.permissions_boundary == "" ? null : "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/${var.permissions_boundary}"
549550
kms_key_id = local.kms_key
550551
arn_format = var.arn_format

modules/terminate-agent-hook/iam.tf

+31-3
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ data "aws_iam_policy_document" "lambda" {
4040
"ec2:DescribeInstances",
4141
"ec2:DescribeTags",
4242
"ec2:DescribeRegions",
43-
"ec2:DescribeInstanceStatus"
43+
"ec2:DescribeInstanceStatus",
4444
]
4545
resources = ["*"]
4646
effect = "Allow"
@@ -71,6 +71,7 @@ data "aws_iam_policy_document" "lambda" {
7171
]
7272
resources = [var.asg_arn]
7373
}
74+
7475
statement {
7576
sid = "GitLabRunnerLifecycleTerminateLogs"
7677
actions = [
@@ -85,13 +86,40 @@ data "aws_iam_policy_document" "lambda" {
8586
"${aws_cloudwatch_log_group.lambda.arn}:log-stream:*"
8687
]
8788
}
89+
90+
statement {
91+
sid = "SSHKeyHousekeepingList"
92+
93+
effect = "Allow"
94+
actions = [
95+
"ec2:DescribeKeyPairs"
96+
]
97+
resources = ["*"]
98+
}
99+
100+
# separate statement due to the condition
101+
statement {
102+
sid = "SSHKeyHousekeepingDelete"
103+
104+
effect = "Allow"
105+
actions = [
106+
"ec2:DeleteKeyPair"
107+
]
108+
resources = ["*"]
109+
condition {
110+
test = "StringLike"
111+
variable = "ec2:KeyPairName"
112+
values = ["runner-*"]
113+
}
114+
}
88115
}
89116

90117
resource "aws_iam_policy" "lambda" {
91-
name = "${var.name_iam_objects}-${var.name}"
118+
name = "${var.name_iam_objects}-${var.name}-lambda"
92119
path = "/"
93120
policy = data.aws_iam_policy_document.lambda.json
94-
tags = var.tags
121+
122+
tags = var.tags
95123
}
96124

97125
resource "aws_iam_role_policy_attachment" "lambda" {

modules/terminate-agent-hook/lambda/lambda_function.py

+80-20
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,36 @@
11
"""
2-
AWS Lambda function to terminate orphaned GitLab runners.
2+
AWS Lambda function to terminate orphaned GitLab runners and remove unused resources.
33
4-
This checks for running GitLab runner instances and terminates them,
5-
intended to be triggered by an ASG life cycle hook at instance termination.
4+
- This checks for running GitLab runner instances and terminates them, intended to be triggered by an ASG life cycle hook at
5+
instance termination.
6+
- Removes all unused SSH keys
67
7-
https://github.com/npalm/terraform-aws-gitlab-runner/issues/317 has some
8-
discussion about this scenario.
8+
https://github.com/npalm/terraform-aws-gitlab-runner/issues/317 has some discussion about this scenario.
99
1010
This is rudimentary and doesn't check if a build runner has a current job.
1111
"""
1212
import boto3
13+
import botocore
1314
import json
15+
import os
1416

15-
def ec2_list(client, **args):
1617

18+
def ec2_list(client, **args):
1719
print(json.dumps({
1820
"Level": "info",
1921
"Message": f"Searching for children of GitLab runner instance {args['parent']}"
2022
}))
2123

2224
ec2_instances = client.describe_instances(Filters=[
23-
{
24-
"Name": "instance-state-name",
25-
"Values": ['running', 'pending'],
26-
},
27-
{
28-
"Name": "tag:gitlab-runner-parent-id",
29-
"Values": ["*"]
30-
}
31-
]).get("Reservations")
25+
{
26+
"Name": "instance-state-name",
27+
"Values": ['running', 'pending'],
28+
},
29+
{
30+
"Name": "tag:gitlab-runner-parent-id",
31+
"Values": ["*"]
32+
}
33+
]).get("Reservations")
3234

3335
_terminate_list = []
3436
for _instances in ec2_instances:
@@ -87,34 +89,92 @@ def ec2_list(client, **args):
8789

8890
return _terminate_list
8991

92+
93+
def remove_unused_ssh_key_pairs(client, executor_name_part):
94+
print(json.dumps({
95+
"Level": "info",
96+
"Message": f"Removing unused SSH key pairs for agent {executor_name_part}"
97+
}))
98+
99+
# build list of SSH keys to keep
100+
paginator = client.get_paginator('describe_instances')
101+
reservations = paginator.paginate(Filters=[
102+
{
103+
"Name": "key-name",
104+
"Values": ['runner-*'],
105+
},
106+
{
107+
"Name": "instance-state-name",
108+
"Values": ['pending', 'running'],
109+
},
110+
]).build_full_result().get("Reservations")
111+
112+
used_key_pairs = []
113+
114+
for reservation in reservations:
115+
for instance in reservation["Instances"]:
116+
used_key_pairs.append(instance['KeyName'])
117+
118+
all_key_pairs = client.describe_key_pairs(Filters=[
119+
{
120+
"Name": "key-name",
121+
"Values": ['runner-*'],
122+
},
123+
])
124+
125+
for key_pair in all_key_pairs['KeyPairs']:
126+
key_name = key_pair['KeyName']
127+
128+
if key_name not in used_key_pairs:
129+
# make sure to delete only those keys which belongs to our module
130+
# unfortunately there are no tags set on the keys and GitLab runner is not able to do that
131+
if executor_name_part in key_name:
132+
try:
133+
client.delete_key_pair(KeyName=key_name)
134+
135+
print(json.dumps({
136+
"Level": "info",
137+
"Message": f"Key pair deleted: {key_name}"
138+
}))
139+
except botocore.exceptions.ClientError as error:
140+
print(json.dumps({
141+
"Level": "error",
142+
"Message": f"Unable to delete key pair: {key_name}",
143+
"Exception": str(error)
144+
}))
145+
146+
90147
def handler(event, context):
91148
response = []
92149
event_detail = event['detail']
93150
client = boto3.client("ec2", region_name=event['region'])
94151
if event_detail['LifecycleTransition'] != "autoscaling:EC2_INSTANCE_TERMINATING":
95152
exit()
96153

97-
_terminate_list = ec2_list(client=client,parent=event_detail['EC2InstanceId'])
154+
# find the executors connected to this agent and terminate them as well
155+
_terminate_list = ec2_list(client=client, parent=event_detail['EC2InstanceId'])
98156
if len(_terminate_list) > 0:
99157
print(json.dumps({
100158
"Level": "info",
101159
"Message": f"Terminating instances {', '.join(_terminate_list)}"
102160
}))
103161
try:
104162
client.terminate_instances(InstanceIds=_terminate_list, DryRun=False)
105-
return f"Terminated instances {', '.join(_terminate_list)}"
106163
except Exception as e:
107164
print(json.dumps({
108165
"Level": "exception",
109166
"Exception": str(e)
110167
}))
111-
raise Exception(f"Encountered exception when terminating instances: {str(e)}")
112168
else:
113169
print(json.dumps({
114170
"Level": "info",
115171
"Message": "No instances to terminate."
116172
}))
117-
return "No instances to terminate."
173+
174+
remove_unused_ssh_key_pairs(client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE'])
175+
176+
return f"Housekeeping done"
177+
118178

119179
if __name__ == "__main__":
120-
handler(None, None)
180+
handler(None, None)

modules/terminate-agent-hook/main.tf

+6
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ resource "aws_lambda_function" "terminate_runner_instances" {
3131
timeout = local.lambda_timeout
3232
tags = var.tags
3333

34+
environment {
35+
variables = {
36+
NAME_EXECUTOR_INSTANCE = var.name_docker_machine_runners
37+
}
38+
}
39+
3440
dynamic "tracing_config" {
3541
for_each = var.enable_xray_tracing ? [1] : []
3642

modules/terminate-agent-hook/variables.tf

+5
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ variable "name_iam_objects" {
4545
default = ""
4646
}
4747

48+
variable "name_docker_machine_runners" {
49+
description = "The `Name` tag of EC2 instances created by the runner agent."
50+
type = string
51+
}
52+
4853
variable "kms_key_id" {
4954
description = "KMS key id to encrypted the CloudWatch logs. Ensure CloudWatch has access to the provided KMS key."
5055
type = string

0 commit comments

Comments
 (0)