gitpod-io · roboquat · Mar 7, 2022 · Feb 28, 2022 · Feb 28, 2022 · Mar 1, 2022
@@ -123,4 +123,26 @@ const gitpodVersionInfo = new prometheusClient.Gauge({
 
 /**
  * Sets the `gitpodVersionInfo` gauge to 1 for the given version label.
  *
  * @param gitpod_version value used for the gauge's `gitpod_version` label
  */
 export function setGitpodVersion(gitpod_version: string){
     const versionLabel = { gitpod_version };
     gitpodVersionInfo.set(versionLabel, 1);
+}
+
+// Counter of successful instance starts, registered on the default prometheus-client registry.
+// The single `retries` label carries the value passed to increaseSuccessfulInstanceStartCounter.
+const instanceStartsSuccessTotal = new prometheusClient.Counter({
+    name: 'gitpod_server_instance_starts_success_total',
+    help: 'Total amount of successfully performed instance starts',
+    labelNames: ['retries'],
+    registers: [prometheusClient.register],
+});
+
+/**
+ * Increments the success counter by one, recording `retries` as the label value.
+ *
+ * @param retries value for the `retries` label; defaults to 0
+ */
+export function increaseSuccessfulInstanceStartCounter(retries: number = 0) {
+    const labels = { retries };
+    instanceStartsSuccessTotal.inc(labels);
+}
+
+// Counter of failed instance starts, registered on the default prometheus-client registry.
+// The single `reason` label carries the value passed to increaseFailedInstanceStartCounter.
+const instanceStartsFailedTotal = new prometheusClient.Counter({
+    name: 'gitpod_server_instance_starts_failed_total',
+    help: 'Total amount of failed performed instance starts',
+    labelNames: ['reason'],
+    registers: [prometheusClient.register],
+});
+
+/**
+ * Increments the failure counter by one, recording `reason` as the label value.
+ *
+ * @param reason one of the two failure categories accepted by the signature
+ */
+export function increaseFailedInstanceStartCounter(reason: "clusterSelectionFailed" | "startOnClusterFailed") {
+    const labels = { reason };
+    instanceStartsFailedTotal.inc(labels);
 }
@@ -36,13 +36,17 @@ import { WithReferrerContext } from "@gitpod/gitpod-protocol/lib/protocol";
 import { IDEOption } from "@gitpod/gitpod-protocol/lib/ide-protocol";
 import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
 import { ExtendedUser } from "@gitpod/ws-manager/lib/constraints";
+import { increaseFailedInstanceStartCounter, increaseSuccessfulInstanceStartCounter } from "../prometheus-metrics";
 
 /**
  * Optional switches for starting a workspace.
  * NOTE(review): the code consuming these fields is not visible here; the per-field notes
  * below are inferred from the names only and should be confirmed against the consumer.
  */
 export interface StartWorkspaceOptions {
     // presumably: propagate start errors to the caller instead of handling them internally - TODO confirm
     rethrow?: boolean;
     // presumably: start with the default image rather than a workspace-specific one - TODO confirm
     forceDefaultImage?: boolean;
     // presumably: feature flags that must not be applied to the started instance - TODO confirm
     excludeFeatureFlags?: NamedWorkspaceFeatureFlag[];
 }
 
+// Bound for the start loop's counter: the loop runs while `retries < MAX_INSTANCE_START_RETRIES`,
+// so this is the maximum number of start attempts in total (not the number of additional retries).
+const MAX_INSTANCE_START_RETRIES = 2;
+// Pause, in seconds, after a start attempt that returned no response (multiplied by 1000 for setTimeout).
+const INSTANCE_START_RETRY_INTERVAL_SECONDS = 2;
+
 @injectable()
 export class WorkspaceStarter {
     @inject(WorkspaceManagerClientProvider) protected readonly clientProvider: WorkspaceManagerClientProvider;
@@ -180,45 +184,29 @@ export class WorkspaceStarter {
             const euser: ExtendedUser = {
                 ...user,
                 getsMoreResources: await this.userService.userGetsMoreResources(user),
-            }
-
-            // tell the world we're starting this instance
-            let resp: StartWorkspaceResponse.AsObject | undefined;
-            let lastInstallation = "";
-            const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
-            for await (let cluster of clusters) {
-                try {
-                    // getStartManager will throw an exception if there's no cluster available and hence exit the loop
-                    const { manager, installation } = cluster;
-                    lastInstallation = installation;
-
-                    instance.status.phase = "pending";
-                    instance.region = installation;
-                    await this.workspaceDb.trace({ span }).storeInstance(instance);
-                    try {
-                        await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
-                    } catch (err) {
-                        // if sending the notification fails that's no reason to stop the workspace creation.
-                        // If the dashboard misses this event it will catch up at the next one.
-                        span.log({ "notifyOnInstanceUpdate.error": err });
-                        log.debug("cannot send instance update - this should be mostly inconsequential", err);
-                    }
+            };
 
-                    // start that thing
-                    log.info({ instanceId: instance.id }, 'starting instance');
-                    resp = (await manager.startWorkspace({ span }, startRequest)).toObject();
-                    break;
-                } catch (err: any) {
-                    if ('code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
-                        log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
-                    } else {
-                        throw err;
+            // choose a cluster and start the instance
+            let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
+            let retries = 0;
+            try {
+                for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
+                    resp = await this.tryStartOnCluster({ span }, startRequest, euser, workspace, instance);
+                    if (resp) {
+                        break;
                     }
+                    await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
                 }
+            } catch (err) {
+                increaseFailedInstanceStartCounter("startOnClusterFailed");
+                throw err;
             }
+
             if (!resp) {
+                increaseFailedInstanceStartCounter("clusterSelectionFailed");
                 throw new Error("cannot start a workspace because no workspace clusters are available");
             }
+            increaseSuccessfulInstanceStartCounter(retries);
 
             span.log({ "resp": resp });
 
@@ -259,6 +247,42 @@ export class WorkspaceStarter {
         }
     }
 
+    /**
+     * Walks the cluster candidates returned by `clientProvider.getStartClusterSets` and tries to start
+     * the instance on each one in turn, returning the first successful response.
+     *
+     * Before every attempt the instance is set to phase "pending", its `region` is set to the candidate's
+     * installation, it is stored via `workspaceDb`, and an instance update is published (best effort:
+     * a failing notification is only logged).
+     *
+     * @param ctx trace context handed to the DB and to the workspace manager client
+     * @param startRequest request sent to the candidate's workspace manager
+     * @param euser user passed to cluster selection
+     * @param workspace workspace passed to cluster selection; its `ownerId` addresses the update notification
+     * @param instance instance to start; `status.phase` and `region` are mutated on every attempt
+     * @returns the manager's response as a plain object, or `undefined` when every candidate failed
+     *          with a skippable error or there were no candidates at all
+     * @throws the original error when it carries no `code` property, when its `code` equals
+     *         `grpc.status.OK`, or when it occurred before an installation was selected
+     */
+    protected async tryStartOnCluster(ctx: TraceContext, startRequest: StartWorkspaceRequest, euser: ExtendedUser, workspace: Workspace, instance: WorkspaceInstance): Promise<StartWorkspaceResponse.AsObject | undefined> {
+        let lastInstallation = "";
+        const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
+        // The iteration itself sits outside the try block: errors raised while obtaining the next
+        // candidate are not caught here and propagate to the caller.
+        for await (const cluster of clusters) {
+            try {
+                const { manager, installation } = cluster;
+                lastInstallation = installation;
+
+                instance.status.phase = "pending";
+                instance.region = installation;
+                await this.workspaceDb.trace(ctx).storeInstance(instance);
+                try {
+                    await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
+                } catch (err) {
+                    // if sending the notification fails that's no reason to stop the workspace creation.
+                    // If the dashboard misses this event it will catch up at the next one.
+                    ctx.span?.log({ "notifyOnInstanceUpdate.error": err });
+                    log.debug("cannot send instance update - this should be mostly inconsequential", err);
+                }
+
+                // start that thing
+                log.info({ instanceId: instance.id }, 'starting instance');
+                return (await manager.startWorkspace(ctx, startRequest)).toObject();
+            } catch (err: any) {
+                // The `in` operator throws a TypeError when its right-hand side is a primitive or null.
+                // Guard it so that such a thrown value is rethrown as-is instead of being masked.
+                const canCarryCode = err !== null && (typeof err === "object" || typeof err === "function");
+                if (canCarryCode && 'code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
+                    // Error with a non-OK code on a selected cluster: log it and move on to the next candidate.
+                    log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
+                } else {
+                    throw err;
+                }
+            }
+        }
+
+        return undefined;
+    }
+
     protected async notifyOnPrebuildQueued(ctx: TraceContext, workspaceId: string) {
         const span = TraceContext.startSpan("notifyOnPrebuildQueued", ctx);
         const prebuild = await this.workspaceDb.trace({ span }).findPrebuildByWorkspaceID(workspaceId);