Skip to content

Commit 4f1c237

Browse files
committed
fix: time out if we can't find gpu info
1 parent 188aae6 commit 4f1c237

File tree

7 files changed

+226
-169
lines changed

7 files changed

+226
-169
lines changed

Diff for: plugins/plugin-codeflare-dashboard/src/controller/dashboard/status/Live.ts

+67-63
Original file line numberDiff line numberDiff line change
@@ -69,79 +69,81 @@ export default class Live {
6969

7070
public constructor(
7171
historyConfig: HistoryConfig,
72-
private readonly tails: Promise<Tail>[],
72+
private readonly tails: Promise<null | Tail>[],
7373
cb: OnData,
7474
styleOf: Record<WorkerState, TextProps>,
7575
private readonly opts: Pick<Options, "events">
7676
) {
7777
tails.map((tailf) => {
78-
tailf.then(({ kind, stream }) => {
79-
stream.on("data", (data) => {
80-
if (data) {
81-
if (kind === "logs") {
82-
this.pushLineAndPublish(data, cb)
83-
}
78+
tailf.then((tail) => {
79+
if (tail) {
80+
tail.stream.on("data", (data) => {
81+
if (data) {
82+
if (tail.kind === "logs") {
83+
this.pushLineAndPublish(data, cb)
84+
}
8485

85-
const line = stripAnsi(data)
86-
const cols = line.split(/\s+/)
87-
88-
const provider = !cols[0] ? undefined : cols[0].replace(/^\[/, "")
89-
const key = !cols[1] ? undefined : cols[1].replace(/\]$/, "")
90-
const fullKey = (provider || "") + "_" + (key || "")
91-
const metric = !provider || !key ? undefined : stateFor[fullKey] || stateFor[key]
92-
const name = cols[2] ? cols[2].trim() : undefined
93-
const timestamp = this.asMillisSinceEpoch(cols[cols.length - 1])
94-
95-
if (!name || !timestamp) {
96-
// console.error("Bad status record", line)
97-
return
98-
} else if (!metric) {
99-
// ignoring this line
100-
return
101-
} else if (provider === "Workers" && (!/^pod\//.test(name) || /cleaner/.test(name))) {
102-
// only track pod events, and ignore our custodial pods
103-
return
104-
} else {
105-
const update = (name: string) => {
106-
if (!this.workers[name]) {
107-
// never seen this named worker before
108-
this.workers[name] = {
109-
name,
110-
metric,
111-
metricHistory: [],
112-
firstUpdate: timestamp,
113-
lastUpdate: timestamp,
114-
style: styleOf[metric],
115-
}
116-
} else if (this.workers[name].lastUpdate <= timestamp) {
117-
// we have seen it before, update the metric value and
118-
// timestamp; note that we only update the model if our
119-
// timestamp is after the lastUpdate for this worker
120-
this.workers[name].metric = metric
121-
this.workers[name].lastUpdate = timestamp
122-
this.workers[name].style = styleOf[metric]
123-
} else {
124-
// out of date event, drop it
125-
return
126-
}
86+
const line = stripAnsi(data)
87+
const cols = line.split(/\s+/)
12788

128-
// inform the UI that we have updates
129-
cb({
130-
events: this.pushEvent(data, metric, timestamp),
131-
workers: Object.values(this.workers),
132-
})
133-
}
89+
const provider = !cols[0] ? undefined : cols[0].replace(/^\[/, "")
90+
const key = !cols[1] ? undefined : cols[1].replace(/\]$/, "")
91+
const fullKey = (provider || "") + "_" + (key || "")
92+
const metric = !provider || !key ? undefined : stateFor[fullKey] || stateFor[key]
93+
const name = cols[2] ? cols[2].trim() : undefined
94+
const timestamp = this.asMillisSinceEpoch(cols[cols.length - 1])
13495

135-
if (name === "*") {
136-
// this event affects every worker
137-
Object.keys(this.workers).forEach(update)
96+
if (!name || !timestamp) {
97+
// console.error("Bad status record", line)
98+
return
99+
} else if (!metric) {
100+
// ignoring this line
101+
return
102+
} else if (provider === "Workers" && (!/^pod\//.test(name) || /cleaner/.test(name))) {
103+
// only track pod events, and ignore our custodial pods
104+
return
138105
} else {
139-
// this event affects a specific worker
140-
update(name)
106+
const update = (name: string) => {
107+
if (!this.workers[name]) {
108+
// never seen this named worker before
109+
this.workers[name] = {
110+
name,
111+
metric,
112+
metricHistory: [],
113+
firstUpdate: timestamp,
114+
lastUpdate: timestamp,
115+
style: styleOf[metric],
116+
}
117+
} else if (this.workers[name].lastUpdate <= timestamp) {
118+
// we have seen it before, update the metric value and
119+
// timestamp; note that we only update the model if our
120+
// timestamp is after the lastUpdate for this worker
121+
this.workers[name].metric = metric
122+
this.workers[name].lastUpdate = timestamp
123+
this.workers[name].style = styleOf[metric]
124+
} else {
125+
// out of date event, drop it
126+
return
127+
}
128+
129+
// inform the UI that we have updates
130+
cb({
131+
events: this.pushEvent(data, metric, timestamp),
132+
workers: Object.values(this.workers),
133+
})
134+
}
135+
136+
if (name === "*") {
137+
// this event affects every worker
138+
Object.keys(this.workers).forEach(update)
139+
} else {
140+
// this event affects a specific worker
141+
update(name)
142+
}
141143
}
142144
}
143-
}
144-
})
145+
})
146+
}
145147
})
146148
})
147149
}
@@ -243,7 +245,9 @@ export default class Live {
243245
this.tails.map(async (_) => {
244246
try {
245247
const tail = await _
246-
return tail.quit()
248+
if (tail) {
249+
return tail.quit()
250+
}
247251
} catch (err) {
248252
// error initializing tailf, probably doesn't matter now that
249253
// we're cleaning up

Diff for: plugins/plugin-codeflare-dashboard/src/controller/dashboard/status/index.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import { states } from "./states.js"
2828
import { isValidStatusTheme, statusThemes } from "./theme.js"
2929

3030
export default function statusDashboard(
31-
tails: Promise<Tail>[],
31+
tails: Promise<null | Tail>[],
3232
historyConfig: HistoryConfig,
3333
opts: Pick<Options, "demo" | "theme" | "events">
3434
): GridSpec {

Diff for: plugins/plugin-codeflare-dashboard/src/controller/dashboard/tailf.ts

+49-22
Original file line numberDiff line numberDiff line change
@@ -27,32 +27,57 @@ export type Tail = {
2727
quit: TailFile["quit"]
2828
}
2929

30-
export function waitTillExists(filepath: string) {
31-
const watcher = chokidar.watch(filepath)
32-
return new Promise<void>((resolve, reject) => {
33-
watcher.on("add", () => resolve())
34-
watcher.on("error", reject)
30+
export function waitTillExists(filepath: string, okIf404 = true) {
31+
return new Promise<boolean>((resolve, reject) => {
32+
const watcher = chokidar.watch(filepath)
33+
34+
const closeAndResolve = async (exists = true) => {
35+
await watcher.close()
36+
resolve(exists)
37+
}
38+
39+
const closeAndReject = async (err: unknown) => {
40+
await watcher.close()
41+
reject(err)
42+
}
43+
44+
watcher.on("add", closeAndResolve)
45+
watcher.on("error", closeAndReject)
46+
47+
// oof, we need to give up, at some point
48+
const timeoutSeconds = process.env.FILE_WAIT_TIMEOUT ? parseInt(process.env.FILE_WAIT_TIMEOUT, 10) : 5
49+
setTimeout(() => {
50+
if (okIf404) {
51+
closeAndResolve(false)
52+
} else {
53+
closeAndReject(new Error(`Could not find ${filepath} after ${timeoutSeconds} seconds`))
54+
}
55+
}, timeoutSeconds * 1000)
3556
})
3657
}
3758

38-
async function initTail({ kind, filepath }: KindedSource, split = true): Promise<Tail> {
39-
await waitTillExists(filepath)
59+
async function initTail({ kind, filepath }: KindedSource, split = true, okIf404 = true): Promise<null | Tail> {
60+
const exists = await waitTillExists(filepath, okIf404)
4061

41-
return new Promise<Tail>((resolve, reject) => {
42-
const tail = new TailFile(filepath, {
43-
startPos: 0,
44-
pollFileIntervalMs: 500,
45-
})
62+
if (!exists) {
63+
return null
64+
} else {
65+
return new Promise<Tail>((resolve, reject) => {
66+
const tail = new TailFile(filepath, {
67+
startPos: 0,
68+
pollFileIntervalMs: 500,
69+
})
4670

47-
tail.once("tail_error", reject)
48-
tail.start()
71+
tail.once("tail_error", reject)
72+
tail.start()
4973

50-
resolve({
51-
kind,
52-
stream: split ? tail.pipe(split2()) : tail,
53-
quit: tail.quit.bind(tail),
74+
resolve({
75+
kind,
76+
stream: split ? tail.pipe(split2()) : tail,
77+
quit: tail.quit.bind(tail),
78+
})
5479
})
55-
})
80+
}
5681
}
5782

5883
export async function pathsFor(mkind: Kind, profile: string, jobId: string) {
@@ -71,7 +96,9 @@ export default async function tailf(
7196
kind: Kind,
7297
profile: string,
7398
jobId: string,
74-
split = true
75-
): Promise<Promise<Tail>[]> {
76-
return pathsFor(kind, profile, jobId).then((_) => _.map((src) => initTail(src, split)))
99+
split = true,
100+
okIf404 = true
101+
): Promise<Promise<null | Tail>[]> {
102+
const paths = await pathsFor(kind, profile, jobId)
103+
return paths.map((src) => initTail(src, split, okIf404))
77104
}

0 commit comments

Comments
 (0)