Skip to content

Commit f13312c

Browse files
committed
fix: i think this resolves remaining log aggregator instabilities
bump to @guidebooks/store 0.5.7
1 parent 85917d1 commit f13312c

File tree

6 files changed

+83
-43
lines changed

6 files changed

+83
-43
lines changed

Diff for: deploy/log-aggregator/wait-for.sh

+2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ D=$(dirname $F)
66
# wait for the file to exist and be non-empty
77
while [ ! -s "$F" ]; do
88
if [ ! -d "$D" ]; then mkdir -p "$D" || break; fi
9+
echo "Waiting for $F" 1>&2
910
inotifywait -qq -e create -e modify "$D"
11+
if [ ! -s "$F" ]; then sleep 2; fi
1012
done
1113

1214
if [ ! -s "$F" ]; then

Diff for: package-lock.json

+16-16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: plugins/plugin-codeflare/src/controller/attach.ts

+23-2
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,17 @@
1616

1717
import Debug from "debug"
1818
import { MadWizardOptions } from "madwizard"
19-
import { Arguments, ParsedOptions } from "@kui-shell/core"
19+
import { Arguments, Capabilities, ParsedOptions } from "@kui-shell/core"
2020

2121
import { DashboardOptions } from "./dashboard"
2222

2323
export type Options = ParsedOptions &
2424
DashboardOptions & {
2525
/** verbose output from madwizard */
2626
V?: boolean
27+
28+
/** wait for a SIGTERM signal? */
29+
wait?: boolean
2730
}
2831

2932
/**
@@ -85,12 +88,15 @@ export async function attach(
8588
undefined,
8689
Object.assign({}, options, { name: "log-aggregator-start", clean: false })
8790
)
91+
stderr("Attaching to job done...\n")
8892

8993
// the logdir that we captured
9094
const logdir = resp && resp.env ? resp.env.LOGDIR_STAGE : undefined
9195
if (logdir) {
9296
debug("successfully attached to", jobId, logdir)
9397
stderr("Successfully attached to job")
98+
} else {
99+
stderr("Error in attach to job (logdir not found)")
94100
}
95101

96102
return {
@@ -115,7 +121,22 @@ export default async function attachCommand(args: Arguments<Options>) {
115121

116122
if (logdir) {
117123
if (cleanExit) {
118-
window.addEventListener("beforeunload", () => cleanExit())
124+
if (Capabilities.isHeadless()) {
125+
process.once("exit", () => {
126+
console.error("attach exit")
127+
})
128+
if (args.parsedOptions.wait) {
129+
await new Promise<void>((resolve) => {
130+
process.once("SIGTERM", () => {
131+
console.error("attach sigterm")
132+
cleanExit("SIGTERM")
133+
resolve()
134+
})
135+
})
136+
}
137+
} else {
138+
window.addEventListener("beforeunload", () => cleanExit())
139+
}
119140
}
120141

121142
return logdir

Diff for: plugins/plugin-codeflare/src/controller/index.ts

+4-2
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ export default function registerCodeflareCommands(registrar: Registrar) {
4646
description(registrar)
4747
registrar.listen("/help", help)
4848

49-
registrar.listen<KResponse, AttachOptions>("/codeflare/attach", (args) =>
50-
import("./attach").then((_) => _.default(args))
49+
registrar.listen<KResponse, AttachOptions>(
50+
"/codeflare/attach",
51+
(args) => import("./attach").then((_) => _.default(args)),
52+
{ flags: { boolean: ["wait"] } }
5153
)
5254
registrar.listen("/codeflare/version", () => import("@kui-shell/client/package.json").then((_) => _.version))
5355
registrar.listen("/codeflare/gui/guide", (args) => import("./guide").then((_) => _.default(args)), {

Diff for: plugins/plugin-madwizard/package.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"access": "public"
2424
},
2525
"dependencies": {
26-
"madwizard": "^0.19.5",
27-
"@guidebooks/store": "^0.5.5"
26+
"madwizard": "^0.19.6",
27+
"@guidebooks/store": "^0.5.7"
2828
}
2929
}

Diff for: tests/kind/run.sh

+36-21
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ do
1919
case $opt in
2020
a) FORCE_ALL=true; continue;;
2121
f) FORCE=$OPTARG; continue;;
22-
s) export GUIDEBOOK_STORE=$OPTARG; echo "Using store=$GUIDEBOOK_STORE"; continue;;
22+
s) export GUIDEBOOK_STORE=$OPTARG; echo "[Test] Using store=$GUIDEBOOK_STORE"; continue;;
2323
b) GUIDEBOOK=$OPTARG; continue;;
2424
i) NO_KIND=true; continue;;
2525
*) continue;;
@@ -29,13 +29,20 @@ shift $((OPTIND-1))
2929

3030
if [ -z "$NO_KIND" ]; then
3131
export KUBECONFIG=$("$SCRIPTDIR"/setup.sh)
32-
echo "Using KUBECONFIG=$KUBECONFIG"
32+
echo "[Test] Using KUBECONFIG=$KUBECONFIG"
3333
fi
3434

3535
# We get this for free from github actions. Add it generally. This
3636
# informs the guidebooks to adjust their resource demands.
3737
export CI=true
3838

39+
# If you find that dangling processes (mostly kubectl) linger, this
40+
# may help with debugging; you may also set it to "*" to get
41+
# everything. Word of warning: kubectl seems to respond to DEBUG being
42+
# set to *anything*, so expect to get a bunch of low-level go logs, no
43+
# matter what you set this to.
44+
# export DEBUG=madwizard/cleanup
45+
3946
# build docker image of log aggregator just for this test and load it
4047
# into kind
4148
function build {
@@ -63,12 +70,12 @@ function run {
6370

6471
local PRE="$MWPROFILES_PATH_BASE"/../profiles.d/$profile/pre
6572
if [ -f "$PRE" ]; then
66-
echo "Running pre guidebooks for profile=$profile"
73+
echo "[Test] Running pre guidebooks for profile=$profile"
6774
cat "$PRE" | xargs -n1 "$ROOT"/bin/codeflare -p $profile $yes
6875
fi
6976

70-
echo "Running with variant=$variant profile=$profile yes=$yes"
71-
"$ROOT"/bin/codeflare -V -p $profile $yes $guidebook
77+
echo "[Test] Running with variant=$variant profile=$profile yes=$yes"
78+
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -V -p $profile $yes $guidebook
7279
}
7380

7481
# Undeploy any prior log aggregator
@@ -78,8 +85,8 @@ function cleanup {
7885
local profile=$(basename $profileFull)
7986
export MWPROFILES_PATH="$MWPROFILES_PATH_BASE"/$variant
8087

81-
echo "Undeploying any prior log aggregator"
82-
("$ROOT"/bin/codeflare -p $profile -y ml/ray/aggregator/in-cluster/client-side/undeploy \
88+
echo "[Test] Undeploying any prior log aggregator"
89+
(GUIDEBOOK_NAME="log-aggregator-undeploy" "$ROOT"/bin/codeflare -p $profile -y ml/ray/aggregator/in-cluster/client-side/undeploy \
8390
|| exit 0)
8491
}
8592

@@ -96,8 +103,10 @@ function attach {
96103

97104
local jobId=$2
98105

99-
echo "Attaching variant=$variant profile=$profile jobId=$jobId"
100-
"$ROOT"/bin/codeflare -V -p $profile attach -a $jobId
106+
echo "[Test] Attaching variant=$variant profile=$profile jobId=$jobId"
107+
GUIDEBOOK_NAME="log-aggregator-attach" "$ROOT"/bin/codeflare -V -p $profile attach -a $jobId --wait &
108+
ATTACH_PID=$!
109+
echo "[Test] Attach underway"
101110
}
102111

103112
# @return path to locally captured logs for the given jobId, run in the given profile
@@ -121,23 +130,23 @@ function validateAttach {
121130
RUNDIR=$(localpath $profile $jobId)
122131

123132
if [ ! -d "$RUNDIR" ]; then
124-
echo "❌ Logs were not captured locally: missing logdir"
133+
echo "[Test] ❌ Logs were not captured locally: missing logdir"
125134
exit 1
126135
elif [ ! -f "$RUNDIR/jobid.txt" ]; then
127-
echo "❌ Logs were not captured locally: missing jobid.txt"
136+
echo "[Test] ❌ Logs were not captured locally: missing jobid.txt"
128137
exit 1
129138
elif [ ! -f "$RUNDIR/logs/job.txt" ]; then
130-
echo "❌ Logs were not captured locally: missing logs/job.txt"
139+
echo "[Test] ❌ Logs were not captured locally: missing logs/job.txt"
131140
exit 1
132141
elif [ ! -s "$RUNDIR/logs/job.txt" ]; then
133-
echo "❌ Logs were not captured locally: empty logs/job.txt"
142+
echo "[Test] ❌ Logs were not captured locally: empty logs/job.txt"
134143
exit 1
135144
fi
136145

137146
# TODO the expected output is going to be profile-specific
138147
grep -q 'Final result' "$RUNDIR/logs/job.txt" \
139-
&& echo "✅ Logs seem good!" \
140-
|| (echo "❌ Logs were not captured locally: job logs incomplete" && exit 1)
148+
&& echo "[Test] ✅ Logs seem good!" \
149+
|| (echo "[Test] ❌ Logs were not captured locally: job logs incomplete" && exit 1)
141150
}
142151

143152
function logpoller {
@@ -152,18 +161,24 @@ function logpoller {
152161
# clean up after ourselves before we exit
153162
#
154163
function onexit {
164+
if [ -n "$ATTACH_PID" ]; then
165+
echo "!!!!!!!!!KILL ATTACH $ATTACH_PID"
166+
(pkill -P $ATTACH_PID || exit 0)
167+
fi
155168
if [ -n "$HEAD_POLLER_PID" ]; then
156-
(kill $HEAD_POLLER_PID || exit 0)
169+
(pkill -P $HEAD_POLLER_PID || exit 0)
157170
fi
158171
if [ -n "$WORKER_POLLER_PID" ]; then
159-
(kill $WORKER_POLLER_PID || exit 0)
172+
(pkill -P $WORKER_POLLER_PID || exit 0)
160173
fi
161174
if [ -n "$EVENTS_PID" ]; then
162-
(kill $EVENTS_PID || exit 0)
175+
(pkill -P $EVENTS_PID || exit 0)
163176
fi
164177
if [ -n "$AGGREGATOR_POLLER_PID" ]; then
165-
(kill $AGGREGATOR_POLLER_PID || exit 0)
178+
(pkill -P $AGGREGATOR_POLLER_PID || exit 0)
166179
fi
180+
181+
pkill -P $$
167182
}
168183

169184
#
@@ -199,7 +214,7 @@ function test {
199214
# allocate JOB_ID (requires node and `uuid` npm; but we should
200215
# have both for codeflare-cli dev)
201216
export JOB_ID=$(node -e 'console.log(require("uuid").v4())')
202-
echo "Using JOB_ID=$JOB_ID"
217+
echo "[Test] Using JOB_ID=$JOB_ID"
203218

204219
# 1. launch codeflare guidebook run
205220
run "$1" | tee $OUTPUT &
@@ -216,11 +231,11 @@ function test {
216231
# done
217232
sleep 10
218233

219-
echo "About to attach"
220234
attach "$1" "$JOB_ID"
221235
fi
222236

223237
wait $RUN_PID
238+
echo "[Test] Run has finished"
224239
# the job should be done now
225240

226241
# 3. if asked, now validate the log aggregator

0 commit comments

Comments
 (0)