@@ -204,12 +204,15 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
204
204
backoff := wait.Backoff {
205
205
Steps : 10 ,
206
206
Duration : 100 * time .Millisecond ,
207
- Factor : 5.0 ,
207
+ Factor : 2.5 ,
208
208
Jitter : 0.1 ,
209
+ Cap : 5 * time .Minute ,
209
210
}
210
211
211
212
var retryErr error
212
213
err = wait .ExponentialBackoff (backoff , func () (bool , error ) {
214
+ // remove resource version so that we can attempt to re-create the pod
215
+ pod .ResourceVersion = ""
213
216
err = m .Clientset .Create (ctx , pod )
214
217
if err != nil {
215
218
m , _ := json .Marshal (pod )
@@ -224,26 +227,50 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq
224
227
return false , err
225
228
}
226
229
227
- err = wait .PollWithContext (ctx , 100 * time .Millisecond , 5 * time .Second , podRunning (m .Clientset , pod .Namespace , pod .Name ))
230
+ err = wait .PollWithContext (ctx , 100 * time .Millisecond , 5 * time .Second , podRunning (m .Clientset , pod .Name , pod .Namespace ))
228
231
if err != nil {
229
- m , _ := json .Marshal (pod )
230
- safePod , _ := log .RedactJSON (m )
232
+ jsonPod , _ := json .Marshal (pod )
233
+ safePod , _ := log .RedactJSON (jsonPod )
231
234
clog .WithError (err ).WithField ("req" , req ).WithField ("pod" , safePod ).Error ("was unable to reach ready state" )
232
235
retryErr = err
233
- return true , nil
236
+
237
+ var tempPod corev1.Pod
238
+ getErr := m .Clientset .Get (ctx , types.NamespacedName {Namespace : pod .Namespace , Name : pod .Name }, & tempPod )
239
+ if getErr != nil {
240
+ clog .WithError (getErr ).WithField ("pod.Namespace" , pod .Namespace ).WithField ("pod.Name" , pod .Name ).Error ("was unable to get pod" )
241
+ // pod doesn't exist, so we are safe to proceed with retry
242
+ return false , nil
243
+ }
244
+ tempPod .Finalizers = []string {}
245
+ updateErr := m .Clientset .Update (ctx , & tempPod )
246
+ if updateErr != nil {
247
+ clog .WithError (updateErr ).WithField ("pod.Namespace" , pod .Namespace ).WithField ("pod.Name" , pod .Name ).Error ("was unable to remove finalizer" )
248
+ // failed to remove finalizer, so we are not going to be able to create a new pod; bail out with the retry error
249
+ return false , retryErr
250
+ }
251
+
252
+ deleteErr := m .Clientset .Delete (ctx , & tempPod )
253
+ if deleteErr != nil {
254
+ clog .WithError (deleteErr ).WithField ("pod.Namespace" , pod .Namespace ).WithField ("pod.Name" , pod .Name ).Error ("was unable to delete pod" )
255
+ // failed to delete pod, so we are not going to be able to create a new pod; bail out
256
+ return false , retryErr
257
+ }
258
+
259
+ // we deleted the original pod, so now we can try to create a new one and see whether it can be scheduled/started
260
+ return false , nil
234
261
}
235
262
236
263
return true , nil
237
264
})
238
- if err == wait .ErrWaitTimeout {
265
+ if err == wait .ErrWaitTimeout && retryErr != nil {
239
266
err = retryErr
240
267
}
241
268
242
269
if err != nil {
243
270
return nil , xerrors .Errorf ("cannot create workspace pod: %w" , err )
244
271
}
245
272
246
- span .LogKV ("event" , "pod created and scheduled " )
273
+ span .LogKV ("event" , "pod started successfully " )
247
274
248
275
// all workspaces get a service now
249
276
okResponse := & api.StartWorkspaceResponse {
@@ -268,14 +295,25 @@ func podRunning(clientset client.Client, podName, namespace string) wait.Conditi
268
295
case corev1 .PodFailed , corev1 .PodSucceeded :
269
296
return false , fmt .Errorf ("pod ran to completion" )
270
297
case corev1 .PodPending :
271
- if pod . Status . Reason == "OutOfmemory" || pod .Status .Reason == "OutOfcpu" {
272
- return false , xerrors .Errorf ("cannot schedule pod, reason: %s" , pod .Status .Reason )
298
+ if strings . HasPrefix ( pod .Status .Reason , "OutOf" ) {
299
+ return false , xerrors .Errorf ("cannot schedule pod due to out of resources , reason: %s" , pod .Status .Reason )
273
300
}
274
301
275
- return false , fmt .Errorf ("pod ran to completion" )
302
+ for _ , c := range pod .Status .Conditions {
303
+ if c .Type == corev1 .PodScheduled && c .Status == corev1 .ConditionTrue {
304
+ // if the pod is still pending but has already been scheduled, the kubelet is pulling images and running init containers
305
+ // we can consider this as pod running
306
+ return true , nil
307
+ }
308
+ }
309
+
310
+ // if pod is pending, wait for it to get scheduled
311
+ return false , nil
312
+ case corev1 .PodRunning :
313
+ return true , nil
276
314
}
277
315
278
- return true , nil
316
+ return false , xerrors . Errorf ( "pod in unknown state: %s" , pod . Status . Phase )
279
317
}
280
318
}
281
319
0 commit comments