Skip to content

Commit f1b4f4a

Browse files
authored
feat: cancel spot requests (#653)
## Description Whenever a spot executor is created, a spot request is created. Occasionally there might be open spot requests which have not been fulfilled. These spot requests have to be deleted as soon as the agent terminates, as no executors are needed anymore. Challenge: Make sure that only those spot requests are deleted which belong to our module. Unfortunately the GitLab Runner does not tag them. We use the following logic for the determination: - state is open or active - they are associated with an SSH key whose name starts with `runner` - they have the `var.environment`/`var.overrides['name_docker_machine']` somewhere in the name Solves part of #623 Closes #493
1 parent 924e810 commit f1b4f4a

File tree

3 files changed

+105
-2
lines changed

3 files changed

+105
-2
lines changed

modules/terminate-agent-hook/README.md

+7-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ instances with no running parent runner.
1717
See [issue #214](https://github.com/npalm/terraform-aws-gitlab-runner/issues/214) for
1818
discussion on the scenario this module addresses.
1919

20+
Furthermore, all spot requests which are still open are cancelled. Otherwise they might be fulfilled later but
21+
without the creating instance running, these spot requests are never terminated and costs are incurred. The problem here
22+
is that no tags are added to the spot request by the docker+machine driver and we can only guess which ones belong
23+
to our module. The rule is that parts of the Executor's name become part of the related SSH key, which is in turn part
24+
of the spot request.
25+
2026
## Usage
2127

2228
### Default Behavior - Package With the Module
@@ -157,4 +163,4 @@ No modules.
157163
| <a name="output_lambda_function_invoke_arn"></a> [lambda\_function\_invoke\_arn](#output\_lambda\_function\_invoke\_arn) | Lambda function invoke arn. |
158164
| <a name="output_lambda_function_name"></a> [lambda\_function\_name](#output\_lambda\_function\_name) | Lambda function name. |
159165
| <a name="output_lambda_function_source_code_hash"></a> [lambda\_function\_source\_code\_hash](#output\_lambda\_function\_source\_code\_hash) | Lambda function source code hash. |
160-
<!-- END_TF_DOCS -->
166+
<!-- END_TF_DOCS -->

modules/terminate-agent-hook/iam.tf

+27
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,20 @@ data "aws_iam_policy_document" "lambda" {
114114
}
115115
}
116116

117+
# Policy document allowing the Lambda to list and cancel spot instance requests.
data "aws_iam_policy_document" "spot_request_housekeeping" {
118+
statement {
119+
sid = "SpotRequestHousekeepingList"
120+
121+
effect = "Allow"
122+
actions = [
123+
"ec2:CancelSpotInstanceRequests",
124+
"ec2:DescribeSpotInstanceRequests"
125+
]
126+
# No condition to limit the access was found, so all resources are allowed.
127+
resources = ["*"]
128+
}
129+
}
130+
117131
resource "aws_iam_policy" "lambda" {
118132
name = "${var.name_iam_objects}-${var.name}-lambda"
119133
path = "/"
@@ -126,3 +140,16 @@ resource "aws_iam_role_policy_attachment" "lambda" {
126140
role = aws_iam_role.lambda.name
127141
policy_arn = aws_iam_policy.lambda.arn
128142
}
143+
144+
# IAM policy built from the spot request housekeeping policy document.
resource "aws_iam_policy" "spot_request_housekeeping" {
145+
name = "${var.name_iam_objects}-${var.name}-cancel-spot"
146+
path = "/"
147+
policy = data.aws_iam_policy_document.spot_request_housekeeping.json
148+
149+
tags = var.tags
150+
}
151+
152+
# Attaches the spot request housekeeping policy to the Lambda's IAM role.
resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" {
153+
role = aws_iam_role.lambda.name
154+
policy_arn = aws_iam_policy.spot_request_housekeeping.arn
155+
}

modules/terminate-agent-hook/lambda/lambda_function.py

+71-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import os
1616

1717

18+
1819
def ec2_list(client, **args):
1920
print(json.dumps({
2021
"Level": "info",
@@ -90,6 +91,63 @@ def ec2_list(client, **args):
9091
return _terminate_list
9192

9293

94+
def cancel_active_spot_requests(ec2_client, executor_name_part):
    """Cancel open/active spot requests whose SSH key name matches this runner.

    Spot requests are listed page by page, filtered on the API side to the
    states ``active``/``open`` and to key names matching ``runner-*``. Of
    those, every request whose launch key name contains
    ``executor_name_part`` is collected and cancelled in a single bulk call.

    Args:
        ec2_client: Object exposing ``describe_spot_instance_requests`` and
            ``cancel_spot_instance_requests`` (a boto3 EC2 client in the
            handler).
        executor_name_part: Substring that must occur in the key name of a
            spot request for it to be cancelled.

    Returns:
        None. Progress and failures are reported as JSON lines on stdout.
        A failure of the bulk cancel call is logged, not raised.
    """
    print(json.dumps({
        "Level": "info",
        "Message": f"Removing open spot requests for environment {executor_name_part}"
    }))

    # An empty name part is a substring of every key name and would therefore
    # cancel the spot requests of all runners, not only the ones of this module.
    if not executor_name_part:
        print(json.dumps({
            "Level": "warning",
            "Message": "No executor name given, not cancelling any spot requests"
        }))
        return

    request_args = {
        "Filters": [
            {
                "Name": "state",
                "Values": ['active', 'open']
            },
            {
                "Name": "launch.key-name",
                "Values": ["runner-*"]
            }
        ],
        "MaxResults": 1000,
    }

    spot_requests_to_cancel = []

    while True:
        # NextToken is only part of the request once a previous page returned one.
        response = ec2_client.describe_spot_instance_requests(**request_args)

        for spot_request in response.get("SpotInstanceRequests", []):
            key_name = spot_request.get("LaunchSpecification", {}).get("KeyName")

            # Entries without a key name cannot be attributed to this runner.
            if not key_name or executor_name_part not in key_name:
                continue

            spot_requests_to_cancel.append(spot_request["SpotInstanceRequestId"])

            print(json.dumps({
                "Level": "info",
                "Message": f"Identified spot request {spot_request['SpotInstanceRequestId']}"
            }))

        next_token = response.get("NextToken")
        if not next_token:
            break
        request_args["NextToken"] = next_token

    if not spot_requests_to_cancel:
        print(json.dumps({
            "Level": "info",
            "Message": "No spot requests to cancel"
        }))
        return

    # Best effort: a failed cancellation is logged and does not raise.
    try:
        ec2_client.cancel_spot_instance_requests(SpotInstanceRequestIds=spot_requests_to_cancel)

        print(json.dumps({
            "Level": "info",
            "Message": "Spot requests deleted"
        }))
    except Exception as e:
        print(json.dumps({
            "Level": "exception",
            "Message": "Bulk cancelling spot requests failed",
            "Exception": str(e)
        }))
150+
93151
def remove_unused_ssh_key_pairs(client, executor_name_part):
94152
print(json.dumps({
95153
"Level": "info",
@@ -147,19 +205,31 @@ def remove_unused_ssh_key_pairs(client, executor_name_part):
147205
def handler(event, context):
148206
response = []
149207
event_detail = event['detail']
150-
client = boto3.client("ec2", region_name=event['region'])
208+
151209
if event_detail['LifecycleTransition'] != "autoscaling:EC2_INSTANCE_TERMINATING":
152210
exit()
153211

212+
client = boto3.client("ec2", region_name=event['region'])
213+
214+
# make sure that no new instances are created
215+
cancel_active_spot_requests(ec2_client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE'])
216+
154217
# find the executors connected to this agent and terminate them as well
155218
_terminate_list = ec2_list(client=client, parent=event_detail['EC2InstanceId'])
219+
156220
if len(_terminate_list) > 0:
157221
print(json.dumps({
158222
"Level": "info",
159223
"Message": f"Terminating instances {', '.join(_terminate_list)}"
160224
}))
161225
try:
162226
client.terminate_instances(InstanceIds=_terminate_list, DryRun=False)
227+
228+
print(json.dumps({
229+
"Level": "info",
230+
"Message": "Instances terminated"
231+
}))
232+
163233
except Exception as e:
164234
print(json.dumps({
165235
"Level": "exception",

0 commit comments

Comments
 (0)