Skip to content

fix: Honnor booting instance in runner pool #2801

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 112 additions & 44 deletions modules/runners/lambdas/runners/src/pool/pool.test.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import { Octokit } from '@octokit/rest';
import { mocked } from 'jest-mock';
import moment from 'moment-timezone';
import nock from 'nock';

import { listEC2Runners } from '../aws/runners';
import * as ghAuth from '../gh-auth/gh-auth';
import * as scale from '../scale-runners/scale-up';
import { createRunners } from '../scale-runners/scale-up';
import { adjust } from './pool';

const mockOctokit = {
Expand All @@ -24,6 +25,7 @@ jest.mock('@octokit/rest', () => ({

jest.mock('./../aws/runners');
jest.mock('./../gh-auth/gh-auth');
jest.mock('./../scale-runners/scale-up');

const mocktokit = Octokit as jest.MockedClass<typeof Octokit>;
const mockedAppAuth = mocked(ghAuth.createGithubAppAuth, {
Expand All @@ -36,6 +38,36 @@ const mockListRunners = mocked(listEC2Runners);
const cleanEnv = process.env;

const ORG = 'my-org';
const MINIMUM_TIME_RUNNING = 15;

const ec2InstancesRegistered = [
{
instanceId: 'i-1-idle',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-2-busy',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-3-offline',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-4-idle-older-than-minimum-time-running',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
];

beforeEach(() => {
nock.disableNetConnect();
Expand All @@ -55,6 +87,7 @@ beforeEach(() => {
process.env.INSTANCE_TYPES = 'm5.large';
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
process.env.RUNNER_OWNER = ORG;
process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();

const mockTokenReturnValue = {
data: {
Expand All @@ -66,66 +99,39 @@ beforeEach(() => {
mockOctokit.paginate.mockImplementation(() => [
{
id: 1,
name: 'i-1',
name: 'i-1-idle',
os: 'linux',
status: 'online',
busy: false,
labels: [],
},
{
id: 2,
name: 'i-2',
name: 'i-2-busy',
os: 'linux',
status: 'online',
busy: true,
labels: [],
},
{
id: 3,
name: 'i-3',
name: 'i-3-offline',
os: 'linux',
status: 'offline',
busy: false,
labels: [],
},
{
id: 11,
name: 'j-1', // some runner of another env
id: 3,
name: 'i-4-idle-older-than-minimum-time-running',
os: 'linux',
status: 'online',
busy: false,
labels: [],
},
{
id: 12,
name: 'j-2', // some runner of another env
os: 'linux',
status: 'online',
busy: true,
labels: [],
},
]);

mockListRunners.mockImplementation(async () => [
{
instanceId: 'i-1',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-2',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-3',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
]);
mockListRunners.mockImplementation(async () => ec2InstancesRegistered);

const mockInstallationIdReturnValueOrgs = {
data: {
Expand Down Expand Up @@ -156,16 +162,74 @@ beforeEach(() => {

describe('Test simple pool.', () => {
describe('With GitHub Cloud', () => {
it('Top up pool with pool size 2.', async () => {
const spy = jest.spyOn(scale, 'createRunners');
await expect(adjust({ poolSize: 2 })).resolves;
expect(spy).toBeCalled;
it('Top up pool with pool size 2 registered.', async () => {
await expect(await adjust({ poolSize: 3 })).resolves;
expect(createRunners).toHaveBeenCalledTimes(1);
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ numberOfRunners: 1 }),
expect.anything(),
);
});

it('Should not top up if pool size is reached.', async () => {
const spy = jest.spyOn(scale, 'createRunners');
await expect(adjust({ poolSize: 1 })).resolves;
expect(spy).not.toHaveBeenCalled;
await expect(await adjust({ poolSize: 1 })).resolves;
expect(createRunners).not.toHaveBeenCalled();
});

it('Should top up if pool size is not reached including a booting instance.', async () => {
mockListRunners.mockImplementation(async () => [
...ec2InstancesRegistered,
{
instanceId: 'i-4-still-booting',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING - 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-5-orphan',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
]);

// 2 idle + 1 booting = 3, top up with 2 to match a pool of 5
await expect(await adjust({ poolSize: 5 })).resolves;
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ numberOfRunners: 2 }),
expect.anything(),
);
});

it('Should not top up if pool size is reached including a booting instance.', async () => {
mockListRunners.mockImplementation(async () => [
...ec2InstancesRegistered,
{
instanceId: 'i-4-still-booting',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING - 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-5-orphan',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
]);

await expect(await adjust({ poolSize: 2 })).resolves;
expect(createRunners).not.toHaveBeenCalled();
});
});

Expand All @@ -175,9 +239,13 @@ describe('Test simple pool.', () => {
});

it('Top up if the pool size is set to 5', async () => {
const spy = jest.spyOn(scale, 'createRunners');
await expect(adjust({ poolSize: 5 })).resolves;
expect(spy).toBeCalled;
await expect(await adjust({ poolSize: 5 })).resolves;
// 2 idle, top up with 3 to match a pool of 5
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ numberOfRunners: 3 }),
expect.anything(),
);
});
});
});
59 changes: 46 additions & 13 deletions modules/runners/lambdas/runners/src/pool/pool.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import moment from 'moment';
import yn from 'yn';

import { listEC2Runners } from '../aws/runners';
import { RunnerList, listEC2Runners } from '../aws/runners';
import { createGithubAppAuth, createGithubInstallationAuth, createOctoClient } from '../gh-auth/gh-auth';
import { logger as rootLogger } from '../logger';
import { createRunners } from '../scale-runners/scale-up';
Expand All @@ -11,6 +12,11 @@ export interface PoolEvent {
poolSize: number;
}

interface RunnerStatus {
busy: boolean;
status: string;
}

export async function adjust(event: PoolEvent): Promise<void> {
logger.info(`Checking current pool size against pool of size: ${event.poolSize}`);
const runnerExtraLabels = process.env.RUNNER_EXTRA_LABELS;
Expand Down Expand Up @@ -45,20 +51,41 @@ export async function adjust(event: PoolEvent): Promise<void> {
per_page: 100,
},
);
const idleRunners = runners.filter((r) => !r.busy && r.status === 'online').map((r) => r.name);
const runnerStatus = new Map<string, RunnerStatus>();
for (const runner of runners) {
runnerStatus.set(runner.name, { busy: runner.busy, status: runner.status });
}

// Look up the managed ec2 runners in AWS, but running does not mean idle
const ec2runners = (
await listEC2Runners({
environment,
runnerOwner,
runnerType: 'Org',
statuses: ['running'],
})
).map((r) => r.instanceId);
const ec2runners = await listEC2Runners({
environment,
runnerOwner,
runnerType: 'Org',
statuses: ['running'],
});

const managedIdleRunners = ec2runners.filter((r) => idleRunners.includes(r));
const topUp = event.poolSize - managedIdleRunners.length;
// Runner should be considered idle if it is still booting, or is idle in GitHub
let numberOfRunnersInPool = 0;
for (const ec2Instance of ec2runners) {
if (
runnerStatus.get(ec2Instance.instanceId)?.busy === false &&
runnerStatus.get(ec2Instance.instanceId)?.status === 'online'
) {
numberOfRunnersInPool++;
logger.debug(`Runner ${ec2Instance.instanceId} is idle in GitHub and counted as part of the pool`);
} else if (runnerStatus.get(ec2Instance.instanceId) != null) {
logger.debug(`Runner ${ec2Instance.instanceId} is not idle in GitHub and NOT counted as part of the pool`);
} else if (!bootTimeExceeded(ec2Instance)) {
numberOfRunnersInPool++;
logger.info(`Runner ${ec2Instance.instanceId} is still booting and counted as part of the pool`);
} else {
logger.debug(
`Runner ${ec2Instance.instanceId} is not idle in GitHub nor booting and not counted as part of the pool`,
);
}
}

const topUp = event.poolSize - numberOfRunnersInPool;
if (topUp > 0) {
logger.info(`The pool will be topped up with ${topUp} runners.`);
await createRunners(
Expand Down Expand Up @@ -87,7 +114,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
githubInstallationClient,
);
} else {
logger.info(`Pool will not be topped up. Find ${managedIdleRunners} managed idle runners.`);
logger.info(`Pool will not be topped up. Found ${numberOfRunnersInPool} managed idle runners.`);
}
}

Expand All @@ -101,3 +128,9 @@ async function getInstallationId(ghesApiUrl: string, org: string): Promise<numbe
})
).data.id;
}

function bootTimeExceeded(ec2Runner: RunnerList): boolean {
const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
return launchTimePlusBootTime < moment(new Date()).utc();
}
1 change: 1 addition & 0 deletions modules/runners/pool.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ module "pool" {
runner = {
disable_runner_autoupdate = var.disable_runner_autoupdate
ephemeral = var.enable_ephemeral_runners
boot_time_in_minutes = var.runner_boot_time_in_minutes
extra_labels = var.runner_extra_labels
launch_template = aws_launch_template.runner
group_name = var.runner_group_name
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ resource "aws_lambda_function" "pool" {
NODE_TLS_REJECT_UNAUTHORIZED = var.config.ghes.url != null && !var.config.ghes.ssl_verify ? 0 : 1
PARAMETER_GITHUB_APP_ID_NAME = var.config.github_app_parameters.id.name
PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.config.github_app_parameters.key_base64.name
RUNNER_BOOT_TIME_IN_MINUTES = var.config.runner.boot_time_in_minutes
RUNNER_EXTRA_LABELS = var.config.runner.extra_labels
RUNNER_GROUP_NAME = var.config.runner.group_name
RUNNER_OWNER = var.config.runner.pool_owner
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ variable "config" {
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
boot_time_in_minutes = number
extra_labels = string
launch_template = object({
name = string
Expand Down