Skip to content

Commit d76877f

Browse files
committed
[ws-manager] Handle Pending pod due to Unschedulable reason
1 parent c89c80c commit d76877f

File tree

1 file changed

+83
-4
lines changed

1 file changed

+83
-4
lines changed

components/ws-manager/pkg/manager/manager.go

+83-4
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,27 @@ func (m *Manager) Close() {
163163
m.wsdaemonPool.Close()
164164
}
165165

166+
// ctxKeyRemainingTime is the context key under which StartWorkspace stores the
// time budget (a time.Duration) that remains when it re-invokes itself.
type ctxKeyRemainingTime struct{}
169+
166170
// StartWorkspace creates a new running workspace within the manager's cluster
167-
func (m *Manager) StartWorkspace(_ context.Context, req *api.StartWorkspaceRequest) (res *api.StartWorkspaceResponse, err error) {
171+
func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceRequest) (res *api.StartWorkspaceResponse, err error) {
172+
startWorkspaceTime := time.Now()
173+
168174
// We cannot use the passed context because we need to decouple the timeouts
169175
// Create a context with a high timeout value to be able to wait for scale-up events in the cluster (slow operation)
170176
// Important!!!: this timeout must be lower than https://github.com/gitpod-io/gitpod/blob/main/components/ws-manager-api/typescript/src/promisified-client.ts#L122
171-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
177+
startWorkspaceTimeout := 10 * time.Minute
178+
179+
// Edge case: a workspace pod that cannot be scheduled can stay in Pending state forever. In that case we
180+
// delete the pod and call StartWorkspace again, passing the processing time that remains until the timeout.
181+
// In case of timeout, the context is canceled and the error is propagated to the caller.
182+
if remainingTime, ok := ctx.Value(ctxKeyRemainingTime{}).(time.Duration); ok {
183+
startWorkspaceTimeout = remainingTime
184+
}
185+
186+
ctx, cancel := context.WithTimeout(context.Background(), startWorkspaceTimeout)
172187
defer cancel()
173188

174189
owi := log.LogContext(req.Metadata.Owner, req.Metadata.MetaId, req.Id, req.Metadata.GetProject(), req.Metadata.GetTeam())
@@ -321,11 +336,24 @@ func (m *Manager) StartWorkspace(_ context.Context, req *api.StartWorkspaceReque
321336
}
322337

323338
// if we reach this point the pod is created
324-
// in case the context is canceled or a timeout happens we should delete the pod?
325-
326339
err = wait.PollImmediateWithContext(ctx, 100*time.Millisecond, 7*time.Minute, podRunning(m.Clientset, pod.Name, pod.Namespace))
327340
if err != nil {
328341
clog.WithError(err).WithField("req", req).WithField("pod", pod.Name).Warn("was unable to start workspace")
342+
if err == wait.ErrWaitTimeout && isPodUnschedulable(m.Clientset, pod.Name, pod.Namespace) {
343+
// this could be an error due to a scale-up event
344+
delErr := deleteWorkspacePod(m.Clientset, pod.Name, pod.Namespace)
345+
if delErr != nil {
346+
clog.WithError(delErr).WithField("req", req).WithField("pod", pod.Name).Warn("was unable to delete workspace pod")
347+
return nil, xerrors.Errorf("workspace pod never reached Running state: %w", err)
348+
}
349+
350+
// invoke StartWorkspace passing the remaining execution time in the context
351+
ctx := context.Background()
352+
remainingTime := startWorkspaceTimeout - time.Since(startWorkspaceTime)
353+
ctx = context.WithValue(ctx, ctxKeyRemainingTime{}, remainingTime)
354+
return m.StartWorkspace(ctx, req)
355+
}
356+
329357
return nil, xerrors.Errorf("workspace pod never reached Running state: %w", err)
330358
}
331359

@@ -633,6 +661,57 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
633661
}
634662
}
635663

664+
func isPodUnschedulable(clientset client.Client, podName, namespace string) bool {
665+
var pod corev1.Pod
666+
err := clientset.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: podName}, &pod)
667+
if err != nil {
668+
return false
669+
}
670+
671+
if pod.Status.Phase != corev1.PodPending {
672+
return false
673+
}
674+
675+
for _, c := range pod.Status.Conditions {
676+
if c.Type != corev1.PodScheduled {
677+
continue
678+
}
679+
680+
if c.Status == corev1.ConditionFalse && c.Reason == corev1.PodReasonUnschedulable {
681+
return true
682+
}
683+
}
684+
685+
return false
686+
}
687+
688+
func deleteWorkspacePod(clientset client.Client, name, namespace string) error {
689+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
690+
defer cancel()
691+
692+
var pod corev1.Pod
693+
err := clientset.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &pod)
694+
if err != nil {
695+
if k8serr.IsNotFound(err) {
696+
return nil
697+
}
698+
699+
return err
700+
}
701+
702+
// we successfully got the pod, now we attempt to remove finalizer
703+
pod.Finalizers = []string{}
704+
err = clientset.Update(ctx, &pod)
705+
if err != nil {
706+
if k8serr.IsNotFound(err) {
707+
return nil
708+
}
709+
710+
return err
711+
}
712+
713+
return nil
714+
}
636715
func areValidFeatureFlags(value interface{}) error {
637716
s, ok := value.([]api.WorkspaceFeatureFlag)
638717
if !ok {

0 commit comments

Comments
 (0)