Skip to content

Commit f41c652

Browse files
sagor999 authored and roboquat committed
[ws-manager] startWorkspace: properly handle edge cases when pod fails to start
1 parent a0454c3 commit f41c652

File tree

1 file changed

+49
-11
lines changed

1 file changed

+49
-11
lines changed

components/ws-manager/pkg/manager/manager.go

+49-11
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,15 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
204204
backoff := wait.Backoff{
205205
Steps: 10,
206206
Duration: 100 * time.Millisecond,
207-
Factor: 5.0,
207+
Factor: 2.5,
208208
Jitter: 0.1,
209+
Cap: 5 * time.Minute,
209210
}
210211

211212
var retryErr error
212213
err = wait.ExponentialBackoff(backoff, func() (bool, error) {
214+
// remove resource version so that we can attempt to re-create the pod
215+
pod.ResourceVersion = ""
213216
err = m.Clientset.Create(ctx, pod)
214217
if err != nil {
215218
m, _ := json.Marshal(pod)
@@ -224,26 +227,50 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
224227
return false, err
225228
}
226229

227-
err = wait.PollWithContext(ctx, 100*time.Millisecond, 5*time.Second, podRunning(m.Clientset, pod.Namespace, pod.Name))
230+
err = wait.PollWithContext(ctx, 100*time.Millisecond, 5*time.Second, podRunning(m.Clientset, pod.Name, pod.Namespace))
228231
if err != nil {
229-
m, _ := json.Marshal(pod)
230-
safePod, _ := log.RedactJSON(m)
232+
jsonPod, _ := json.Marshal(pod)
233+
safePod, _ := log.RedactJSON(jsonPod)
231234
clog.WithError(err).WithField("req", req).WithField("pod", safePod).Error("was unable to reach ready state")
232235
retryErr = err
233-
return true, nil
236+
237+
var tempPod corev1.Pod
238+
getErr := m.Clientset.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}, &tempPod)
239+
if getErr != nil {
240+
clog.WithError(getErr).WithField("pod.Namespace", pod.Namespace).WithField("pod.Name", pod.Name).Error("was unable to get pod")
241+
// pod doesn't exist, so we are safe to proceed with retry
242+
return false, nil
243+
}
244+
tempPod.Finalizers = []string{}
245+
updateErr := m.Clientset.Update(ctx, &tempPod)
246+
if updateErr != nil {
247+
clog.WithError(updateErr).WithField("pod.Namespace", pod.Namespace).WithField("pod.Name", pod.Name).Error("was unable to remove finalizer")
248+
// failed to remove finalizer, we are not going to be able to create a new pod, so bail out with retry error
249+
return false, retryErr
250+
}
251+
252+
deleteErr := m.Clientset.Delete(ctx, &tempPod)
253+
if deleteErr != nil {
254+
clog.WithError(deleteErr).WithField("pod.Namespace", pod.Namespace).WithField("pod.Name", pod.Name).Error("was unable to delete pod")
255+
// failed to delete pod, so we won't be able to create a new pod; bail out
256+
return false, retryErr
257+
}
258+
259+
// we deleted the original pod, so now we can try to create a new one and see if it can be scheduled/started
260+
return false, nil
234261
}
235262

236263
return true, nil
237264
})
238-
if err == wait.ErrWaitTimeout {
265+
if err == wait.ErrWaitTimeout && retryErr != nil {
239266
err = retryErr
240267
}
241268

242269
if err != nil {
243270
return nil, xerrors.Errorf("cannot create workspace pod: %w", err)
244271
}
245272

246-
span.LogKV("event", "pod created and scheduled")
273+
span.LogKV("event", "pod started successfully")
247274

248275
// all workspaces get a service now
249276
okResponse := &api.StartWorkspaceResponse{
@@ -268,14 +295,25 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
268295
case corev1.PodFailed, corev1.PodSucceeded:
269296
return false, fmt.Errorf("pod ran to completion")
270297
case corev1.PodPending:
271-
if pod.Status.Reason == "OutOfmemory" || pod.Status.Reason == "OutOfcpu" {
272-
return false, xerrors.Errorf("cannot schedule pod, reason: %s", pod.Status.Reason)
298+
if strings.HasPrefix(pod.Status.Reason, "OutOf") {
299+
return false, xerrors.Errorf("cannot schedule pod due to out of resources, reason: %s", pod.Status.Reason)
273300
}
274301

275-
return false, fmt.Errorf("pod ran to completion")
302+
for _, c := range pod.Status.Conditions {
303+
if c.Type == corev1.PodScheduled && c.Status == corev1.ConditionTrue {
304+
// even if pod is pending but was scheduled already, it means kubelet is pulling images and running init containers
305+
// we can consider this as pod running
306+
return true, nil
307+
}
308+
}
309+
310+
// if pod is pending, wait for it to get scheduled
311+
return false, nil
312+
case corev1.PodRunning:
313+
return true, nil
276314
}
277315

278-
return true, nil
316+
return false, xerrors.Errorf("pod in unknown state: %s", pod.Status.Phase)
279317
}
280318
}
281319

0 commit comments

Comments
 (0)