From 4b708970a3784423d2e3db85cbbb26f8f725a7cc Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 8 Apr 2025 11:28:43 +0100 Subject: [PATCH 1/4] don't hardcode max old space size in deployment images --- packages/cli-v3/src/deploy/buildImage.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/cli-v3/src/deploy/buildImage.ts b/packages/cli-v3/src/deploy/buildImage.ts index e5e03c1fc5..42ff998af3 100644 --- a/packages/cli-v3/src/deploy/buildImage.ts +++ b/packages/cli-v3/src/deploy/buildImage.ts @@ -688,8 +688,7 @@ ENV TRIGGER_PROJECT_ID=\${TRIGGER_PROJECT_ID} \ TRIGGER_CONTENT_HASH=\${TRIGGER_CONTENT_HASH} \ TRIGGER_PROJECT_REF=\${TRIGGER_PROJECT_REF} \ NODE_EXTRA_CA_CERTS=\${NODE_EXTRA_CA_CERTS} \ - NODE_ENV=production \ - NODE_OPTIONS="--max_old_space_size=8192" + NODE_ENV=production # Copy the files from the install stage COPY --from=build --chown=node:node /app ./ From 07b369b01f99d5ea34d74055789799ea46f837b7 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 8 Apr 2025 11:52:18 +0100 Subject: [PATCH 2/4] flags: treat underscores as hyphens --- packages/core/src/v3/build/flags.test.ts | 5 +++++ packages/core/src/v3/build/flags.ts | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/packages/core/src/v3/build/flags.test.ts b/packages/core/src/v3/build/flags.test.ts index aaa3149bea..694934b7b0 100644 --- a/packages/core/src/v3/build/flags.test.ts +++ b/packages/core/src/v3/build/flags.test.ts @@ -18,6 +18,11 @@ describe("dedupFlags", () => { expect(dedupFlags("--log=info --log=warn --log=error")).toBe("--log=error"); }); + it("should treat underscores as hyphens", () => { + expect(dedupFlags("--debug_level=info")).toBe("--debug-level=info"); + expect(dedupFlags("--debug_level=info --debug-level=warn")).toBe("--debug-level=warn"); + }); + it("should handle mix of flags with and without values", () => { expect(dedupFlags("--debug=false -v --debug=true")).toBe("-v --debug=true"); expect(dedupFlags("-v --quiet -v")).toBe("--quiet -v"); diff --git a/packages/core/src/v3/build/flags.ts b/packages/core/src/v3/build/flags.ts index 88448fa730..8f6c00349d 100644 --- a/packages/core/src/v3/build/flags.ts +++ b/packages/core/src/v3/build/flags.ts @@ -27,11 +27,11 @@ export function dedupFlags(flags: string): string { .map((flag): [string, string | boolean] => { const equalIndex = flag.indexOf("="); if (equalIndex !== -1) { - const key = flag.slice(0, equalIndex); + const key = flag.slice(0, equalIndex).replace(/_/g, "-"); const value = flag.slice(equalIndex + 1); return [key, value]; } else { - return [flag, true]; + return [flag.replace(/_/g, "-"), true]; } }); From a05f8d50f0955833c036752577b86a6907b34408 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 8 Apr 2025 12:55:32 +0100 Subject: [PATCH 3/4] append attempt number to runner name if >1 --- apps/supervisor/src/util.ts | 10 ++++++++-- apps/supervisor/src/workloadManager/docker.ts | 2 +- apps/supervisor/src/workloadManager/kubernetes.ts | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/supervisor/src/util.ts b/apps/supervisor/src/util.ts index 6e666cb1b1..ba1bc1b2fd 100644 --- a/apps/supervisor/src/util.ts +++ b/apps/supervisor/src/util.ts @@ -5,6 +5,12 @@ export function getDockerHostDomain() { return isMacOs || isWindows ? "host.docker.internal" : "localhost"; } -export function getRunnerId(runId: string) { - return `runner-${runId.replace("run_", "")}`; +export function getRunnerId(runId: string, attemptNumber?: number) { + const parts = ["runner", runId.replace("run_", "")]; + + if (attemptNumber && attemptNumber > 1) { + parts.push(`attempt-${attemptNumber}`); + } + + return parts.join("-"); } diff --git a/apps/supervisor/src/workloadManager/docker.ts b/apps/supervisor/src/workloadManager/docker.ts index 09695bc897..9e4ba29594 100644 --- a/apps/supervisor/src/workloadManager/docker.ts +++ b/apps/supervisor/src/workloadManager/docker.ts @@ -22,7 +22,7 @@ export class DockerWorkloadManager implements WorkloadManager { async create(opts: WorkloadManagerCreateOptions) { this.logger.log("[DockerWorkloadProvider] Creating container", { opts }); - const runnerId = getRunnerId(opts.runFriendlyId); + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); const runArgs = [ "run", diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index 8b3c48ffed..54dd95a795 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -31,7 +31,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { async create(opts: WorkloadManagerCreateOptions) { this.logger.log("[KubernetesWorkloadManager] Creating container", { opts }); - const runnerId = getRunnerId(opts.runFriendlyId); + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); try { await this.k8s.core.createNamespacedPod({ From 98749b44afe65d5eebaa57562a1a3cb4b41e1824 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 8 Apr 2025 12:55:58 +0100 Subject: [PATCH 4/4] improve retry spans for oom errors --- apps/webapp/app/v3/runEngineHandlers.server.ts | 9 ++++++++- internal-packages/run-engine/src/engine/eventBus.ts | 1 + .../run-engine/src/engine/systems/runAttemptSystem.ts | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 0663e76537..6f236cf3ed 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -326,12 +326,19 @@ export function registerRunEngineEventBusHandlers() { engine.eventBus.on("runRetryScheduled", async ({ time, run, environment, retryAt }) => { try { - await eventRepository.recordEvent(`Retry #${run.attemptNumber} delay`, { + let retryMessage = `Retry #${run.attemptNumber} delay`; + + if (run.nextMachineAfterOOM) { + retryMessage += ` after OOM`; + } + + await eventRepository.recordEvent(retryMessage, { taskSlug: run.taskIdentifier, environment, attributes: { properties: { retryAt: retryAt.toISOString(), + nextMachine: run.nextMachineAfterOOM, }, runId: run.friendlyId, style: { diff --git a/internal-packages/run-engine/src/engine/eventBus.ts b/internal-packages/run-engine/src/engine/eventBus.ts index c64d0b2c11..5662cae00c 100644 --- a/internal-packages/run-engine/src/engine/eventBus.ts +++ b/internal-packages/run-engine/src/engine/eventBus.ts @@ -85,6 +85,7 @@ export type EventBusEvents = { traceContext: Record; taskIdentifier: string; baseCostInCents: number; + nextMachineAfterOOM?: string; }; organization: { id: string; diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 9827d7ec1d..4aaa395821 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -692,6 +692,7 @@ export class RunAttemptSystem { traceContext: run.traceContext as Record, baseCostInCents: run.baseCostInCents, spanId: run.spanId, + nextMachineAfterOOM: retryResult.machine, }, organization: { id: run.runtimeEnvironment.organizationId,