@@ -6,6 +6,7 @@ package tmpnet
6
6
import (
7
7
"context"
8
8
"encoding/json"
9
+ "errors"
9
10
"fmt"
10
11
"net"
11
12
"syscall"
@@ -20,6 +21,8 @@ const (
20
21
DefaultNodeTickerInterval = 50 * time .Millisecond
21
22
)
22
23
24
+ var ErrUnrecoverableNodeHealthCheck = errors .New ("failed to query node health" )
25
+
23
26
func CheckNodeHealth (ctx context.Context , uri string ) (* health.APIReply , error ) {
24
27
// Check that the node is reporting healthy
25
28
healthReply , err := health .NewClient (uri ).Health (ctx , nil )
@@ -31,16 +34,16 @@ func CheckNodeHealth(ctx context.Context, uri string) (*health.APIReply, error)
31
34
case * net.OpError :
32
35
if t .Op == "read" {
33
36
// Network read failed - potentially recoverable
34
- return nil , nil
37
+ return nil , err
35
38
}
36
39
case syscall.Errno :
37
40
if t == syscall .ECONNREFUSED {
38
41
// Connection refused - potentially recoverable
39
- return nil , nil
42
+ return nil , err
40
43
}
41
44
}
42
45
// Assume all other errors are not recoverable
43
- return nil , fmt .Errorf ("failed to query node health : %w" , err )
46
+ return nil , fmt .Errorf ("%w : %w" , ErrUnrecoverableNodeHealthCheck , err )
44
47
}
45
48
46
49
// WaitForHealthy blocks until Node.IsHealthy returns true or an unrecoverable error (including context timeout) is observed. Recoverable errors are retried.
@@ -53,10 +56,14 @@ func WaitForHealthy(ctx context.Context, node *Node) error {
53
56
54
57
for {
55
58
healthy , err := node .IsHealthy (ctx )
56
- if err != nil {
57
- return fmt .Errorf ("failed to wait for health of node %q: %w" , node .NodeID , err )
58
- }
59
- if healthy {
59
+ switch {
60
+ case errors .Is (err , ErrUnrecoverableNodeHealthCheck ):
61
+ return fmt .Errorf ("%w for node %q" , err , node .NodeID )
62
+ case err != nil :
63
+ // Error is recoverable
64
+ // TODO(marun) Log the error to aid in troubleshooting once a logger is available
65
+ continue
66
+ case healthy :
60
67
return nil
61
68
}
62
69
0 commit comments