@@ -163,12 +163,27 @@ func (m *Manager) Close() {
163
163
m .wsdaemonPool .Close ()
164
164
}
165
165
166
// ctxKeyRemainingTime is the context key under which StartWorkspace passes the
// remaining time budget (a time.Duration) to a retried invocation of itself.
// An unexported empty struct type is used so the key cannot collide with keys
// defined by other packages.
type ctxKeyRemainingTime struct{}
169
+
166
170
// StartWorkspace creates a new running workspace within the manager's cluster
167
- func (m * Manager ) StartWorkspace (_ context.Context , req * api.StartWorkspaceRequest ) (res * api.StartWorkspaceResponse , err error ) {
171
+ func (m * Manager ) StartWorkspace (ctx context.Context , req * api.StartWorkspaceRequest ) (res * api.StartWorkspaceResponse , err error ) {
172
+ startWorkspaceTime := time .Now ()
173
+
168
174
// We cannot use the passed context because we need to decouple the timeouts
169
175
// Create a context with a high timeout value to be able to wait for scale-up events in the cluster (slow operation)
170
176
// Important!!!: this timeout must be lower than https://github.com/gitpod-io/gitpod/blob/main/components/ws-manager-api/typescript/src/promisified-client.ts#L122
171
- ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Minute )
177
+ startWorkspaceTimeout := 10 * time .Minute
178
+
179
+ // Edge case: a workspace that cannot be scheduled can stay in Pending state forever, so we
180
+ // delete the pod and call StartWorkspace passing the remaining process time until timeout.
181
+ // In case of timeout, the context is canceled and the error is propagated to the caller.
182
+ if remainingTime , ok := ctx .Value (ctxKeyRemainingTime {}).(time.Duration ); ok {
183
+ startWorkspaceTimeout = remainingTime
184
+ }
185
+
186
+ ctx , cancel := context .WithTimeout (context .Background (), startWorkspaceTimeout )
172
187
defer cancel ()
173
188
174
189
owi := log .LogContext (req .Metadata .Owner , req .Metadata .MetaId , req .Id , req .Metadata .GetProject (), req .Metadata .GetTeam ())
@@ -321,11 +336,24 @@ func (m *Manager) StartWorkspace(_ context.Context, req *api.StartWorkspaceReque
321
336
}
322
337
323
338
// if we reach this point the pod is created
324
- // in case the context is canceled or a timeout happens we should delete the pod?
325
-
326
339
err = wait .PollImmediateWithContext (ctx , 100 * time .Millisecond , 7 * time .Minute , podRunning (m .Clientset , pod .Name , pod .Namespace ))
327
340
if err != nil {
328
341
clog .WithError (err ).WithField ("req" , req ).WithField ("pod" , pod .Name ).Warn ("was unable to start workspace" )
342
+ if err == wait .ErrWaitTimeout && isPodUnschedulable (m .Clientset , pod .Name , pod .Namespace ) {
343
+ // this could be an error due to a scale-up event
344
+ delErr := deleteWorkspacePod (m .Clientset , pod .Name , pod .Namespace )
345
+ if delErr != nil {
346
+ clog .WithError (delErr ).WithField ("req" , req ).WithField ("pod" , pod .Name ).Warn ("was unable to delete workspace pod" )
347
+ return nil , xerrors .Errorf ("workspace pod never reached Running state: %w" , err )
348
+ }
349
+
350
+ // invoke StartWorkspace passing the remaining execution time in the context
351
+ ctx := context .Background ()
352
+ remainingTime := startWorkspaceTimeout - time .Since (startWorkspaceTime )
353
+ ctx = context .WithValue (ctx , ctxKeyRemainingTime {}, remainingTime )
354
+ return m .StartWorkspace (ctx , req )
355
+ }
356
+
329
357
return nil , xerrors .Errorf ("workspace pod never reached Running state: %w" , err )
330
358
}
331
359
@@ -633,6 +661,57 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
633
661
}
634
662
}
635
663
664
+ func isPodUnschedulable (clientset client.Client , podName , namespace string ) bool {
665
+ var pod corev1.Pod
666
+ err := clientset .Get (context .Background (), types.NamespacedName {Namespace : namespace , Name : podName }, & pod )
667
+ if err != nil {
668
+ return false
669
+ }
670
+
671
+ if pod .Status .Phase != corev1 .PodPending {
672
+ return false
673
+ }
674
+
675
+ for _ , c := range pod .Status .Conditions {
676
+ if c .Type != corev1 .PodScheduled {
677
+ continue
678
+ }
679
+
680
+ if c .Status == corev1 .ConditionFalse && c .Reason == corev1 .PodReasonUnschedulable {
681
+ return true
682
+ }
683
+ }
684
+
685
+ return false
686
+ }
687
+
688
+ func deleteWorkspacePod (clientset client.Client , name , namespace string ) error {
689
+ ctx , cancel := context .WithTimeout (context .Background (), 30 * time .Second )
690
+ defer cancel ()
691
+
692
+ var pod corev1.Pod
693
+ err := clientset .Get (ctx , types.NamespacedName {Namespace : namespace , Name : name }, & pod )
694
+ if err != nil {
695
+ if k8serr .IsNotFound (err ) {
696
+ return nil
697
+ }
698
+
699
+ return err
700
+ }
701
+
702
+ // we successfully got the pod, now we attempt to remove finalizer
703
+ pod .Finalizers = []string {}
704
+ err = clientset .Update (ctx , & pod )
705
+ if err != nil {
706
+ if k8serr .IsNotFound (err ) {
707
+ return nil
708
+ }
709
+
710
+ return err
711
+ }
712
+
713
+ return nil
714
+ }
636
715
func areValidFeatureFlags (value interface {}) error {
637
716
s , ok := value .([]api.WorkspaceFeatureFlag )
638
717
if ! ok {
0 commit comments