Skip to content

Commit 6fcc73b

Browse files
ci: add workaround for WSL hanging in CI (#993)
Issue #, if available: There is a known issue, microsoft/WSL#8529, where WSL commands can hang. This can cause Windows e2e tests to block until hitting the 2-hour timeout. *Description of changes:* This change adds a workaround to detect the bad state and attempt to mitigate it by killing the WSL service. If the issue cannot be resolved, the test will only hang for 300 seconds before failing. *Testing done:* CI run was successful with 8 WSL shutdown failures. https://github.com/runfinch/finch/actions/runs/9682445232/job/26715743040 <img width="2051" alt="image" src="https://github.com/runfinch/finch/assets/55906459/ee582249-8257-48eb-bab3-993150feec80"> - [x] I've reviewed the guidance in CONTRIBUTING.md *Trade-off analysis* The trade-off for this approach is that the test suite can take longer with multiple reset VM calls being made. Sample runs that previously took ~15 minutes now take up to ~37 minutes with the hanging mitigation; however, this is down from the 2-hour timeout failure that would occur without the mitigation. #### License Acceptance By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. Signed-off-by: Austin Vazquez <[email protected]>
1 parent 1f68260 commit 6fcc73b

File tree

3 files changed

+49
-3
lines changed

3 files changed

+49
-3
lines changed

e2e/vm/vm_darwin_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,8 @@ var resetDisks = func(_ *option.Option, installed bool) {
9696
}
9797
gomega.Expect(os.RemoveAll(dataDiskDir)).ShouldNot(gomega.HaveOccurred())
9898
}
99+
100+
// shutdownWSL is the darwin counterpart of the Windows WSL shutdown helper.
// It performs no work and always reports success, so callers that reference
// shutdownWSL do not need a platform-specific definition of their own.
var shutdownWSL = func() error { return nil }

e2e/vm/vm_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
package vm
55

66
import (
7-
"os/exec"
87
"runtime"
98
"time"
109

@@ -29,16 +28,17 @@ var resetVM = func(o *option.Option) {
2928
// clean up iptables
3029
//nolint:lll // link to explanation
3130
// https://docs.rancherdesktop.io/troubleshooting-tips/#q-how-do-i-fix-fata0005-subnet-1040024-overlaps-with-other-one-on-this-address-space-when-running-a-container-using-nerdctl-run
32-
gomega.Expect(exec.Command("wsl", "--shutdown").Run()).Should(gomega.BeNil())
31+
gomega.Expect(shutdownWSL()).Should(gomega.BeNil())
3332
}
3433

3534
ginkgo.DeferCleanup(func() {
3635
writeFile(finchConfigFilePath, origFinchCfg)
3736
command.New(o, virtualMachineRootCmd, "stop", "-f").WithoutCheckingExitCode().WithTimeoutInSeconds(20).Run()
3837
time.Sleep(1 * time.Second)
3938
command.New(o, virtualMachineRootCmd, "remove", "-f").WithoutCheckingExitCode().WithTimeoutInSeconds(10).Run()
39+
time.Sleep(1 * time.Second)
4040
if runtime.GOOS == "windows" {
41-
gomega.Expect(exec.Command("wsl", "--shutdown").Run()).Should(gomega.BeNil())
41+
gomega.Expect(shutdownWSL()).Should(gomega.BeNil())
4242
}
4343
time.Sleep(1 * time.Second)
4444
command.New(o, virtualMachineRootCmd, "init").WithoutCheckingExitCode().WithTimeoutInSeconds(160).Run()

e2e/vm/vm_windows_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,13 @@
77
package vm
88

99
import (
10+
"context"
11+
"fmt"
1012
"os"
13+
"os/exec"
1114
"path/filepath"
1215
"testing"
16+
"time"
1317

1418
"github.com/onsi/ginkgo/v2"
1519
"github.com/onsi/gomega"
@@ -60,3 +64,40 @@ var resetDisks = func(_ *option.Option, _ bool) {
6064
dataDiskDir := filepath.Join(finchRootDir, ".finch", ".disks")
6165
gomega.Expect(os.RemoveAll(dataDiskDir)).ShouldNot(gomega.HaveOccurred())
6266
}
67+
68+
// shutdownWSL is a wrapper function for "wsl --shutdown".
69+
//
70+
// This is a workaround for https://github.com/microsoft/WSL/issues/8529
71+
//
72+
// If WSL is suspected of hanging for longer than 180 seconds, then
73+
// kill the WSL service and retry the shutdown command.
74+
//
75+
// This function will at maximum run for 300 seconds before returning
76+
// context.DeadlineExceeded error.
77+
var shutdownWSL = func() error {
78+
ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
79+
defer cancel()
80+
81+
if err := exec.CommandContext(ctx, "wsl", "--shutdown").Run(); err != nil {
82+
ginkgo.GinkgoLogr.Error(err, "WSL shutdown failed", "time", time.Now().Format(time.RFC3339))
83+
84+
// wsl might be hung, kill the wsl service and try again.
85+
// https://github.com/microsoft/WSL/issues/8529
86+
killCtx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
87+
defer cancel()
88+
89+
if err := exec.CommandContext(killCtx, "taskkill", "/f", "/im", "wslservice.exe").Run(); err != nil {
90+
ginkgo.GinkgoLogr.Error(err, "WSL task kill failed", "time", time.Now().Format(time.RFC3339))
91+
return fmt.Errorf("unable to kill wsl service: %w", err)
92+
}
93+
94+
retryCtx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
95+
defer cancel()
96+
97+
if err := exec.CommandContext(retryCtx, "wsl", "--shutdown").Run(); err != nil {
98+
return fmt.Errorf("unable to shutdown wsl: %w", err)
99+
}
100+
}
101+
102+
return nil
103+
}

0 commit comments

Comments
 (0)