Skip to content

Run Engine 2: More robust attempt failing/retrying (inc. OOM retrying) #1773

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
d5cd1c7
Added describe to tests that were missing it
matt-aitken Mar 6, 2025
0ce5e0f
Added a function to get the maxOldSpaceSize
matt-aitken Mar 6, 2025
c560a18
Make it easy to take `NODE_OPTIONS` and set the old space flag
matt-aitken Mar 6, 2025
2b92cc0
Merge remote-tracking branch 'origin/main' into run-engine-2-retry-oom
matt-aitken Mar 6, 2025
8b66469
Added a zed task to rebuild the packages
matt-aitken Mar 6, 2025
87bd15a
Moved isOOMRunError and added SIBABRT condition
matt-aitken Mar 6, 2025
e60020a
Deduplication flags function with tests
matt-aitken Mar 6, 2025
408a42d
Export flags file
matt-aitken Mar 6, 2025
0a2c090
On TaskRunProcess, set max old space and deduplicate the flags with p…
matt-aitken Mar 6, 2025
a06b7d1
Move retrying logic to a separate function, it was getting very messy
matt-aitken Mar 6, 2025
6766720
Created new test file for attempt failures
matt-aitken Mar 6, 2025
8f1086c
Allow setting retry settings for tests
matt-aitken Mar 6, 2025
7aa4598
Some retrying tests, including OOM
matt-aitken Mar 6, 2025
c46d560
More failure condition tests
matt-aitken Mar 6, 2025
c8dec74
Fix for OOM retrying
matt-aitken Mar 6, 2025
206b9bc
Complete the attempt span if it was an OOM error
matt-aitken Mar 6, 2025
a5b6f06
Merge remote-tracking branch 'origin/main' into run-engine-2-retry-oom
matt-aitken Mar 6, 2025
a4fadc3
Remove old broken import
matt-aitken Mar 6, 2025
6f5cfac
Fixed order of exports
matt-aitken Mar 6, 2025
271eaa8
Fix for docker-provider checkpoints import
matt-aitken Mar 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .zed/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
[
{
"label": "Build packages",
"command": "pnpm run build --filter \"@trigger.dev/*\" --filter trigger.dev",
//"args": [],
// Env overrides for the command, will be appended to the terminal's environment from the settings.
"env": { "foo": "bar" },
// Current working directory to spawn the command into, defaults to current project root.
//"cwd": "/path/to/working/directory",
// Whether to use a new terminal tab or reuse the existing one to spawn the process, defaults to `false`.
"use_new_terminal": false,
// Whether to allow multiple instances of the same task to be run, or rather wait for the existing ones to finish, defaults to `false`.
"allow_concurrent_runs": false,
// What to do with the terminal pane and tab, after the command was started:
// * `always` — always show the task's pane, and focus the corresponding tab in it (default)
// * `no_focus` — always show the task's pane, add the task's tab in it, but don't focus it
// * `never` — do not alter focus, but still add/reuse the task's tab in its pane
"reveal": "always",
// What to do with the terminal pane and tab, after the command has finished:
// * `never` — Do nothing when the command finishes (default)
// * `always` — always hide the terminal tab, hide the pane also if it was the last tab in it
// * `on_success` — hide the terminal tab on task success only, otherwise behaves similar to `always`
"hide": "never",
// Which shell to use when running a task inside the terminal.
// May take 3 values:
// 1. (default) Use the system's default terminal configuration in /etc/passwd
// "shell": "system"
// 2. A program:
// "shell": {
// "program": "sh"
// }
// 3. A program with arguments:
// "shell": {
// "with_arguments": {
// "program": "/bin/bash",
// "args": ["--login"]
// }
// }
"shell": "system",
// Whether to show the task line in the output of the spawned task, defaults to `true`.
"show_summary": true,
// Whether to show the command line in the output of the spawned task, defaults to `true`.
"show_output": true
}
]
2 changes: 1 addition & 1 deletion apps/docker-provider/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
} from "@trigger.dev/core/v3/apps";
import { SimpleLogger } from "@trigger.dev/core/v3/apps";
import { isExecaChildProcess } from "@trigger.dev/core/v3/apps";
import { testDockerCheckpoint } from "@trigger.dev/core/v3/checkpoints";
import { testDockerCheckpoint } from "@trigger.dev/core/v3/serverOnly";
import { setTimeout } from "node:timers/promises";
import { PostStartCauses, PreStopCauses } from "@trigger.dev/core/v3";

Expand Down
4 changes: 2 additions & 2 deletions apps/webapp/app/v3/services/alerts/deliverAlert.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
RunFailedWebhook,
DeploymentFailedWebhook,
DeploymentSuccessWebhook,
isOOMRunError,
} from "@trigger.dev/core/v3";
import assertNever from "assert-never";
import { subtle } from "crypto";
Expand All @@ -39,7 +40,6 @@ import { generateFriendlyId } from "~/v3/friendlyIdentifiers";
import { ProjectAlertChannelType, ProjectAlertType } from "@trigger.dev/database";
import { alertsRateLimiter } from "~/v3/alertsRateLimiter.server";
import { v3RunPath } from "~/utils/pathBuilder";
import { isOOMError } from "../completeAttempt.server";
import { ApiRetrieveRunPresenter } from "~/presenters/v3/ApiRetrieveRunPresenter.server";

type FoundAlert = Prisma.Result<
Expand Down Expand Up @@ -375,7 +375,7 @@ export class DeliverAlertService extends BaseService {
idempotencyKey: alert.taskRun.idempotencyKey ?? undefined,
tags: alert.taskRun.runTags,
error,
isOutOfMemoryError: isOOMError(error),
isOutOfMemoryError: isOOMRunError(error),
machine: alert.taskRun.machinePreset ?? "Unknown",
dashboardUrl: `${env.APP_ORIGIN}${v3RunPath(
alert.project.organization,
Expand Down
42 changes: 2 additions & 40 deletions apps/webapp/app/v3/services/completeAttempt.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
TaskRunSuccessfulExecutionResult,
flattenAttributes,
isManualOutOfMemoryError,
isOOMRunError,
sanitizeError,
shouldRetryError,
taskRunErrorEnhancer,
Expand Down Expand Up @@ -255,7 +256,7 @@ export class CompleteAttemptService extends BaseService {

let retriableError = shouldRetryError(taskRunErrorEnhancer(completion.error));
let isOOMRetry = false;
let isOOMAttempt = isOOMError(completion.error);
let isOOMAttempt = isOOMRunError(completion.error);
let isOnMaxOOMMachine = false;
let oomMachine: MachinePresetName | undefined;

Expand Down Expand Up @@ -738,45 +739,6 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId:
});
}

export function isOOMError(error: TaskRunError) {
if (error.type === "INTERNAL_ERROR") {
if (
error.code === "TASK_PROCESS_OOM_KILLED" ||
error.code === "TASK_PROCESS_MAYBE_OOM_KILLED"
) {
return true;
}

// For the purposes of retrying on a larger machine, we're going to treat this is an OOM error.
// This is what they look like if we're executing using k8s. They then get corrected later, but it's too late.
// {"code": "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", "type": "INTERNAL_ERROR", "message": "Process exited with code -1 after signal SIGKILL."}
if (
error.code === "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" &&
error.message &&
error.message.includes("SIGKILL") &&
error.message.includes("-1")
) {
return true;
}
}

if (error.type === "BUILT_IN_ERROR") {
// ffmpeg also does weird stuff
// { "name": "Error", "type": "BUILT_IN_ERROR", "message": "ffmpeg was killed with signal SIGKILL" }
if (error.message && error.message.includes("ffmpeg was killed with signal SIGKILL")) {
return true;
}
}

// Special `OutOfMemoryError` for doing a manual OOM kill.
// Useful if a native library does an OOM but doesn't actually crash the run and you want to manually
if (isManualOutOfMemoryError(error)) {
return true;
}

return false;
}

function exitRun(runId: string) {
socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
version: "v1",
Expand Down
Loading
Loading