@@ -163,12 +163,27 @@ func (m *Manager) Close() {
163
163
m .wsdaemonPool .Close ()
164
164
}
165
165
166
// ctxKeyRemainingTime is the context key under which StartWorkspace looks up
// a time.Duration that replaces its default start timeout.
type ctxKeyRemainingTime struct{}
169
+
166
170
// StartWorkspace creates a new running workspace within the manager's cluster
167
- func (m * Manager ) StartWorkspace (_ context.Context , req * api.StartWorkspaceRequest ) (res * api.StartWorkspaceResponse , err error ) {
171
+ func (m * Manager ) StartWorkspace (ctx context.Context , req * api.StartWorkspaceRequest ) (res * api.StartWorkspaceResponse , err error ) {
172
+ startWorkspaceTime := time .Now ()
173
+
168
174
// We cannot use the passed context because we need to decouple the timeouts
169
175
// Create a context with a high timeout value to be able to wait for scale-up events in the cluster (slow operation)
170
176
// Important!!!: this timeout must be lower than https://github.com/gitpod-io/gitpod/blob/main/components/ws-manager-api/typescript/src/promisified-client.ts#L122
171
- ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Minute )
177
+ startWorkspaceTimeout := 10 * time .Minute
178
+
179
+ // Edge case: a workspace that cannot be scheduled can stay in Pending state forever, so we
180
+ // delete the pod and call StartWorkspace passing the remaining process time until timeout.
181
+ // In case of timeout, the context is canceled and the error is propagated to the caller.
182
+ if remainingTime , ok := ctx .Value (ctxKeyRemainingTime {}).(time.Duration ); ok {
183
+ startWorkspaceTimeout = remainingTime
184
+ }
185
+
186
+ ctx , cancel := context .WithTimeout (context .Background (), startWorkspaceTimeout )
172
187
defer cancel ()
173
188
174
189
owi := log .LogContext (req .Metadata .Owner , req .Metadata .MetaId , req .Id , req .Metadata .GetProject (), req .Metadata .GetTeam ())
@@ -317,11 +332,24 @@ func (m *Manager) StartWorkspace(_ context.Context, req *api.StartWorkspaceReque
317
332
}
318
333
319
334
// if we reach this point the pod is created
320
- // in case the context is canceled or a timeout happens we should delete the pod?
321
-
322
335
err = wait .PollImmediateWithContext (ctx , 100 * time .Millisecond , 7 * time .Minute , podRunning (m .Clientset , pod .Name , pod .Namespace ))
323
336
if err != nil {
324
337
clog .WithError (err ).WithField ("req" , req ).WithField ("pod" , pod .Name ).Warn ("was unable to start workspace" )
338
+ if err == wait .ErrWaitTimeout && isPodUnschedulable (m .Clientset , pod .Name , pod .Namespace ) {
339
+ // this could be an error due to a scale-up event
340
+ delErr := deleteWorkspacePodForce (m .Clientset , pod .Name , pod .Namespace )
341
+ if delErr != nil {
342
+ clog .WithError (delErr ).WithField ("req" , req ).WithField ("pod" , pod .Name ).Warn ("was unable to delete workspace pod" )
343
+ return nil , xerrors .Errorf ("workspace pod never reached Running state: %w" , err )
344
+ }
345
+
346
+ // invoke StartWorkspace passing the remaining execution time in the context
347
+ ctx := context .Background ()
348
+ remainingTime := startWorkspaceTimeout - time .Since (startWorkspaceTime )
349
+ ctx = context .WithValue (ctx , ctxKeyRemainingTime {}, remainingTime )
350
+ return m .StartWorkspace (ctx , req )
351
+ }
352
+
325
353
return nil , xerrors .Errorf ("workspace pod never reached Running state: %w" , err )
326
354
}
327
355
@@ -629,6 +657,55 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
629
657
}
630
658
}
631
659
660
+ func isPodUnschedulable (clientset client.Client , podName , namespace string ) bool {
661
+ var pod corev1.Pod
662
+ err := clientset .Get (context .Background (), types.NamespacedName {Namespace : namespace , Name : podName }, & pod )
663
+ if err != nil {
664
+ return false
665
+ }
666
+
667
+ if pod .Status .Phase != corev1 .PodPending {
668
+ return false
669
+ }
670
+
671
+ for _ , c := range pod .Status .Conditions {
672
+ if c .Type == corev1 .PodScheduled &&
673
+ c .Status == corev1 .ConditionFalse &&
674
+ c .Reason == corev1 .PodReasonUnschedulable {
675
+ return true
676
+ }
677
+ }
678
+
679
+ return false
680
+ }
681
+
682
+ func deleteWorkspacePodForce (clientset client.Client , name , namespace string ) error {
683
+ ctx , cancel := context .WithTimeout (context .Background (), 30 * time .Second )
684
+ defer cancel ()
685
+
686
+ var pod corev1.Pod
687
+ err := clientset .Get (ctx , types.NamespacedName {Namespace : namespace , Name : name }, & pod )
688
+ if err != nil {
689
+ if k8serr .IsNotFound (err ) {
690
+ return nil
691
+ }
692
+
693
+ return err
694
+ }
695
+
696
+ // we successfully got the pod, now we attempt to remove finalizer
697
+ pod .Finalizers = []string {}
698
+ err = clientset .Update (ctx , & pod )
699
+ if err != nil {
700
+ if k8serr .IsNotFound (err ) {
701
+ return nil
702
+ }
703
+
704
+ return err
705
+ }
706
+
707
+ return nil
708
+ }
632
709
func areValidFeatureFlags (value interface {}) error {
633
710
s , ok := value .([]api.WorkspaceFeatureFlag )
634
711
if ! ok {
0 commit comments