
Commit 496abdb

Handle Pending pod due to Unschedulable reason
1 parent 86cafcf commit 496abdb

1 file changed: +81 -4 lines

components/ws-manager/pkg/manager/manager.go

@@ -163,12 +163,27 @@ func (m *Manager) Close() {
 	m.wsdaemonPool.Close()
 }
 
+type (
+	ctxKeyRemainingTime struct{}
+)
+
 // StartWorkspace creates a new running workspace within the manager's cluster
-func (m *Manager) StartWorkspace(_ context.Context, req *api.StartWorkspaceRequest) (res *api.StartWorkspaceResponse, err error) {
+func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceRequest) (res *api.StartWorkspaceResponse, err error) {
+	startWorkspaceTime := time.Now()
+
 	// We cannot use the passed context because we need to decouple the timeouts
 	// Create a context with a high timeout value to be able to wait for scale-up events in the cluster (slow operation)
 	// Important!!!: this timeout must be lower than https://github.com/gitpod-io/gitpod/blob/main/components/ws-manager-api/typescript/src/promisified-client.ts#L122
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+	startWorkspaceTimeout := 10 * time.Minute
+
+	// Edge case: a workspace that cannot be scheduled can stay in Pending state forever, so we
+	// delete the pod and call StartWorkspace again, passing the remaining time until the timeout.
+	// In case of timeout, the context is canceled and the error is propagated to the caller.
+	if remainingTime, ok := ctx.Value(ctxKeyRemainingTime{}).(time.Duration); ok {
+		startWorkspaceTimeout = remainingTime
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), startWorkspaceTimeout)
 	defer cancel()
 
 	owi := log.LogContext(req.Metadata.Owner, req.Metadata.MetaId, req.Id, req.Metadata.GetProject(), req.Metadata.GetTeam())
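The new ctxKeyRemainingTime key lets a retry hand its remaining time budget to the next StartWorkspace call through the context. A minimal standalone sketch of that pattern, outside the commit (the helper name effectiveTimeout is hypothetical):

package main

import (
	"context"
	"fmt"
	"time"
)

// ctxKeyRemainingTime mirrors the unexported key type added by the commit;
// an empty struct type avoids collisions with context values set elsewhere.
type ctxKeyRemainingTime struct{}

// effectiveTimeout is a hypothetical helper showing the same lookup
// StartWorkspace performs before creating its own timeout context.
func effectiveTimeout(ctx context.Context, def time.Duration) time.Duration {
	if remaining, ok := ctx.Value(ctxKeyRemainingTime{}).(time.Duration); ok {
		return remaining
	}
	return def
}

func main() {
	// First attempt: no value in the context, fall back to the default timeout.
	fmt.Println(effectiveTimeout(context.Background(), 10*time.Minute)) // 10m0s

	// Retry: carry the remaining budget over to the next attempt.
	ctx := context.WithValue(context.Background(), ctxKeyRemainingTime{}, 3*time.Minute)
	fmt.Println(effectiveTimeout(ctx, 10*time.Minute)) // 3m0s
}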
@@ -317,11 +332,24 @@ func (m *Manager) StartWorkspace(_ context.Context, req *api.StartWorkspaceReque
 	}
 
 	// if we reach this point the pod is created
-	// in case the context is canceled or a timeout happens we should delete the pod?
-
 	err = wait.PollImmediateWithContext(ctx, 100*time.Millisecond, 7*time.Minute, podRunning(m.Clientset, pod.Name, pod.Namespace))
 	if err != nil {
 		clog.WithError(err).WithField("req", req).WithField("pod", pod.Name).Warn("was unable to start workspace")
+		if err == wait.ErrWaitTimeout && isPodUnschedulable(m.Clientset, pod.Name, pod.Namespace) {
+			// this could be an error due to a scale-up event
+			delErr := deleteWorkspacePodForce(m.Clientset, pod.Name, pod.Namespace)
+			if delErr != nil {
+				clog.WithError(delErr).WithField("req", req).WithField("pod", pod.Name).Warn("was unable to delete workspace pod")
+				return nil, xerrors.Errorf("workspace pod never reached Running state: %w", err)
+			}
+
+			// invoke StartWorkspace passing the remaining execution time in the context
+			ctx := context.Background()
+			remainingTime := startWorkspaceTimeout - time.Since(startWorkspaceTime)
+			ctx = context.WithValue(ctx, ctxKeyRemainingTime{}, remainingTime)
+			return m.StartWorkspace(ctx, req)
+		}
+
 		return nil, xerrors.Errorf("workspace pod never reached Running state: %w", err)
 	}
 
@@ -629,6 +657,55 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
 	}
 }
 
+func isPodUnschedulable(clientset client.Client, podName, namespace string) bool {
+	var pod corev1.Pod
+	err := clientset.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: podName}, &pod)
+	if err != nil {
+		return false
+	}
+
+	if pod.Status.Phase != corev1.PodPending {
+		return false
+	}
+
+	for _, c := range pod.Status.Conditions {
+		if c.Type == corev1.PodScheduled &&
+			c.Status == corev1.ConditionFalse &&
+			c.Reason == corev1.PodReasonUnschedulable {
+			return true
+		}
+	}
+
+	return false
+}
+
+func deleteWorkspacePodForce(clientset client.Client, name, namespace string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	var pod corev1.Pod
+	err := clientset.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &pod)
+	if err != nil {
+		if k8serr.IsNotFound(err) {
+			return nil
+		}
+
+		return err
+	}
+
+	// we successfully got the pod, now we attempt to remove finalizer
+	pod.Finalizers = []string{}
+	err = clientset.Update(ctx, &pod)
+	if err != nil {
+		if k8serr.IsNotFound(err) {
+			return nil
+		}
+
+		return err
+	}
+
+	return nil
+}
 func areValidFeatureFlags(value interface{}) error {
 	s, ok := value.([]api.WorkspaceFeatureFlag)
 	if !ok {
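isPodUnschedulable keys off the standard PodScheduled=False condition with reason Unschedulable that the scheduler sets on pods it cannot place, and only reports true while the pod is still Pending. A sketch of how the helper could be exercised with controller-runtime's fake client; the test file and object names are assumptions, not part of the commit:

package manager

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestIsPodUnschedulable(t *testing.T) {
	// A Pending pod carrying the condition the scheduler sets when it cannot place it.
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "ws-pod", Namespace: "default"},
		Status: corev1.PodStatus{
			Phase: corev1.PodPending,
			Conditions: []corev1.PodCondition{{
				Type:   corev1.PodScheduled,
				Status: corev1.ConditionFalse,
				Reason: corev1.PodReasonUnschedulable,
			}},
		},
	}

	clientset := fake.NewClientBuilder().WithObjects(pod).Build()
	if !isPodUnschedulable(clientset, "ws-pod", "default") {
		t.Errorf("expected pod to be reported as unschedulable")
	}
}

deleteWorkspacePodForce, in turn, clears the pod's finalizers (treating IsNotFound as success) so that nothing blocks the pod from going away once its deletion proceeds.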
