Skip to content

Commit 2d0108b

Browse files
committed
Remove fatal logging in executable code
All affected functions should now return an error. ErrorS is now being used instead of Fatal.
1 parent 68354f0 commit 2d0108b

File tree

3 files changed

+180
-79
lines changed

3 files changed

+180
-79
lines changed

Diff for: pkg/ext-proc/main.go

+126-53
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@ import (
66
"fmt"
77
"net"
88
"net/http"
9+
"os"
910
"strconv"
1011

1112
"github.com/prometheus/client_golang/prometheus/promhttp"
13+
"golang.org/x/sync/errgroup"
1214
"google.golang.org/grpc"
1315
healthPb "google.golang.org/grpc/health/grpc_health_v1"
1416
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
@@ -79,17 +81,25 @@ func init() {
7981
}
8082

8183
func main() {
84+
if err := run(); err != nil {
85+
os.Exit(1)
86+
}
87+
}
88+
89+
func run() error {
8290
klog.InitFlags(nil)
8391
flag.Parse()
8492

8593
ctrl.SetLogger(klog.TODO())
8694
cfg, err := ctrl.GetConfig()
8795
if err != nil {
88-
klog.Fatalf("Failed to get rest config: %v", err)
96+
klog.ErrorS(err, "Failed to get rest config")
97+
return err
8998
}
9099
// Validate flags
91100
if err := validateFlags(); err != nil {
92-
klog.Fatalf("Failed to validate flags: %v", err)
101+
klog.ErrorS(err, "Failed to validate flags")
102+
return err
93103
}
94104

95105
// Print all flag values
@@ -114,101 +124,164 @@ func main() {
114124
Config: ctrl.GetConfigOrDie(),
115125
Datastore: datastore,
116126
}
117-
serverRunner.Setup()
127+
if err := serverRunner.Setup(); err != nil {
128+
klog.ErrorS(err, "Failed to setup server runner")
129+
return err
130+
}
118131

119-
// Start health and ext-proc servers in goroutines
120-
healthSvr := startHealthServer(datastore, *grpcHealthPort)
121-
extProcSvr := serverRunner.Start(
122-
datastore,
123-
&vllm.PodMetricsClientImpl{},
124-
)
125-
// Start metrics handler
126-
metricsSvr := startMetricsHandler(*metricsPort, cfg)
132+
// Start processing signals and init the group to manage goroutines.
133+
g, ctx := errgroup.WithContext(ctrl.SetupSignalHandler())
127134

128-
// Start manager, blocking
129-
serverRunner.StartManager()
135+
// Start health server.
136+
startHealthServer(g, ctx, datastore, *grpcHealthPort)
130137

131-
// Gracefully shutdown servers
132-
if healthSvr != nil {
133-
klog.Info("Health server shutting down")
134-
healthSvr.GracefulStop()
135-
}
136-
if extProcSvr != nil {
137-
klog.Info("Ext-proc server shutting down")
138-
extProcSvr.GracefulStop()
139-
}
140-
if metricsSvr != nil {
141-
klog.Info("Metrics server shutting down")
142-
if err := metricsSvr.Shutdown(context.Background()); err != nil {
143-
klog.Infof("Metrics server Shutdown: %v", err)
144-
}
145-
}
138+
// Start ext-proc server.
139+
startExtProcServer(g, ctx, serverRunner, datastore)
140+
141+
// Start metrics handler.
142+
startMetricsHandler(g, ctx, *metricsPort, cfg)
146143

147-
klog.Info("All components shutdown")
144+
// Start manager.
145+
g.Go(func() error {
146+
return serverRunner.StartManager(ctx)
147+
})
148+
149+
err = g.Wait()
150+
klog.InfoS("All components terminated")
151+
return err
148152
}
149153

150154
// startHealthServer starts the gRPC health probe server in a goroutine.
151-
func startHealthServer(ds *backend.K8sDatastore, port int) *grpc.Server {
155+
func startHealthServer(g *errgroup.Group, ctx context.Context, ds *backend.K8sDatastore, port int) {
156+
klog.InfoS("Health server starting...")
157+
152158
svr := grpc.NewServer()
153159
healthPb.RegisterHealthServer(svr, &healthServer{datastore: ds})
154160

155-
go func() {
161+
g.Go(func() error {
162+
// Init the listener.
156163
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
157164
if err != nil {
158-
klog.Fatalf("Health server failed to listen: %v", err)
165+
klog.ErrorS(err, "Health server failed to listen")
166+
return err
159167
}
160-
klog.Infof("Health server listening on port: %d", port)
161168

162-
// Blocking and will return when shutdown is complete.
163-
if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped {
164-
klog.Fatalf("Health server failed: %v", err)
165-
}
166-
klog.Info("Health server shutting down")
167-
}()
168-
return svr
169+
klog.InfoS("Health server listening", "port", port)
170+
171+
// Keep serving until interrupted.
172+
g.Go(func() error {
173+
if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped {
174+
klog.ErrorS(err, "Health server failed")
175+
return err
176+
}
177+
klog.InfoS("Health server terminated")
178+
return nil
179+
})
180+
181+
// Shutdown on interrupt.
182+
g.Go(func() error {
183+
<-ctx.Done()
184+
klog.InfoS("Health server shutting down...")
185+
svr.GracefulStop()
186+
return nil
187+
})
188+
return nil
189+
})
169190
}
170191

171-
func startMetricsHandler(port int, cfg *rest.Config) *http.Server {
192+
func startExtProcServer(g *errgroup.Group, ctx context.Context, runner *runserver.ExtProcServerRunner, ds *backend.K8sDatastore) {
193+
g.Go(func() error {
194+
errCh := make(chan error, 1)
195+
svr := runner.Start(ds, &vllm.PodMetricsClientImpl{}, func(err error) {
196+
errCh <- err
197+
})
198+
199+
g.Go(func() error {
200+
<-ctx.Done()
201+
klog.InfoS("Ext-proc server shutting down...")
202+
svr.GracefulStop()
203+
return nil
204+
})
205+
return <-errCh
206+
})
207+
}
208+
209+
func startMetricsHandler(g *errgroup.Group, ctx context.Context, port int, cfg *rest.Config) {
172210
metrics.Register()
211+
klog.InfoS("Metrics HTTP handler starting...")
173212

174213
var svr *http.Server
175214
go func() {
176-
klog.Info("Starting metrics HTTP handler ...")
215+
}()
216+
217+
g.Go(func() error {
218+
// Init the listener.
219+
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
220+
if err != nil {
221+
klog.ErrorS(err, "Metrics HTTP handler failed to listen")
222+
return err
223+
}
224+
225+
klog.InfoS("Metrics server listening", "port", port)
226+
227+
// Init HTTP server.
228+
h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg)
229+
if err != nil {
230+
return err
231+
}
177232

178233
mux := http.NewServeMux()
179-
mux.Handle(defaultMetricsEndpoint, metricsHandlerWithAuthenticationAndAuthorization(cfg))
234+
mux.Handle(defaultMetricsEndpoint, h)
180235

181236
svr = &http.Server{
182237
Addr: net.JoinHostPort("", strconv.Itoa(port)),
183238
Handler: mux,
184239
}
185-
if err := svr.ListenAndServe(); err != http.ErrServerClosed {
186-
klog.Fatalf("failed to start metrics HTTP handler: %v", err)
187-
}
188-
}()
189-
return svr
240+
241+
// Keep serving until interrupted.
242+
g.Go(func() error {
243+
if err := svr.Serve(lis); err != http.ErrServerClosed {
244+
klog.ErrorS(err, "Failed to start metrics HTTP handler")
245+
return err
246+
}
247+
klog.InfoS("Metrics HTTP handler terminated")
248+
return nil
249+
})
250+
251+
// Shutdown on interrupt.
252+
g.Go(func() error {
253+
<-ctx.Done()
254+
klog.InfoS("Metrics HTTP handler shutting down...")
255+
_ = svr.Shutdown(context.Background())
256+
return nil
257+
})
258+
return nil
259+
})
190260
}
191261

192-
func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) http.Handler {
262+
func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) {
193263
h := promhttp.HandlerFor(
194264
legacyregistry.DefaultGatherer,
195265
promhttp.HandlerOpts{},
196266
)
197267
httpClient, err := rest.HTTPClientFor(cfg)
198268
if err != nil {
199-
klog.Fatalf("failed to create http client for metrics auth: %v", err)
269+
klog.ErrorS(err, "Failed to create http client for metrics auth")
270+
return nil, err
200271
}
201272

202273
filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient)
203274
if err != nil {
204-
klog.Fatalf("failed to create metrics filter for auth: %v", err)
275+
klog.ErrorS(err, "Failed to create metrics filter for auth")
276+
return nil, err
205277
}
206278
metricsLogger := klog.LoggerWithValues(klog.NewKlogr(), "path", defaultMetricsEndpoint)
207279
metricsAuthHandler, err := filter(metricsLogger, h)
208280
if err != nil {
209-
klog.Fatalf("failed to create metrics auth handler: %v", err)
281+
klog.ErrorS(err, "Failed to create metrics auth handler")
282+
return nil, err
210283
}
211-
return metricsAuthHandler
284+
return metricsAuthHandler, nil
212285
}
213286

214287
func validateFlags() error {

Diff for: pkg/ext-proc/server/runserver.go

+42-18
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package server
22

33
import (
4+
"context"
5+
"errors"
46
"fmt"
57
"net"
68
"time"
@@ -60,11 +62,11 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
6062
}
6163

6264
// Setup creates the reconcilers for pools, models, and endpointSlices and starts the manager.
63-
func (r *ExtProcServerRunner) Setup() {
65+
func (r *ExtProcServerRunner) Setup() error {
6466
// Create a new manager to manage controllers
6567
mgr, err := ctrl.NewManager(r.Config, ctrl.Options{Scheme: r.Scheme})
6668
if err != nil {
67-
klog.Fatalf("Failed to create controller manager: %v", err)
69+
return fmt.Errorf("failed to create controller manager: %w", err)
6870
}
6971
r.manager = mgr
7072

@@ -79,7 +81,7 @@ func (r *ExtProcServerRunner) Setup() {
7981
},
8082
Record: mgr.GetEventRecorderFor("InferencePool"),
8183
}).SetupWithManager(mgr); err != nil {
82-
klog.Fatalf("Failed setting up InferencePoolReconciler: %v", err)
84+
return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err)
8385
}
8486

8587
if err := (&backend.InferenceModelReconciler{
@@ -92,7 +94,7 @@ func (r *ExtProcServerRunner) Setup() {
9294
},
9395
Record: mgr.GetEventRecorderFor("InferenceModel"),
9496
}).SetupWithManager(mgr); err != nil {
95-
klog.Fatalf("Failed setting up InferenceModelReconciler: %v", err)
97+
return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err)
9698
}
9799

98100
if err := (&backend.EndpointSliceReconciler{
@@ -103,28 +105,40 @@ func (r *ExtProcServerRunner) Setup() {
103105
ServiceName: r.ServiceName,
104106
Zone: r.Zone,
105107
}).SetupWithManager(mgr); err != nil {
106-
klog.Fatalf("Failed setting up EndpointSliceReconciler: %v", err)
108+
return fmt.Errorf("failed setting up EndpointSliceReconciler: %w", err)
107109
}
110+
return nil
108111
}
109112

110113
// Start starts the Envoy external processor server in a goroutine.
114+
// Any error can be handled using the done callback, which is called exactly once.
115+
// When no callback is provided, errors are ignored.
111116
func (r *ExtProcServerRunner) Start(
112117
podDatastore *backend.K8sDatastore,
113118
podMetricsClient backend.PodMetricsClient,
119+
done func(error),
114120
) *grpc.Server {
115-
svr := grpc.NewServer()
121+
if done == nil {
122+
done = func(_ error) {}
123+
}
116124

117-
go func() {
125+
klog.InfoS("Ext-proc server starting...")
126+
127+
svr := grpc.NewServer()
128+
run := func() error {
118129
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", r.GrpcPort))
119130
if err != nil {
120-
klog.Fatalf("Ext-proc server failed to listen: %v", err)
131+
klog.ErrorS(err, "Ext-proc server failed to listen")
132+
return err
121133
}
122-
klog.Infof("Ext-proc server listening on port: %d", r.GrpcPort)
134+
135+
klog.InfoS("Ext-proc server listening", "port", r.GrpcPort)
123136

124137
// Initialize backend provider
125138
pp := backend.NewProvider(podMetricsClient, podDatastore)
126139
if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval); err != nil {
127-
klog.Fatalf("Failed to initialize backend provider: %v", err)
140+
klog.ErrorS(err, "Failed to initialize backend provider")
141+
return err
128142
}
129143

130144
// Register ext_proc handlers
@@ -135,22 +149,32 @@ func (r *ExtProcServerRunner) Start(
135149

136150
// Blocking and will return when shutdown is complete.
137151
if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped {
138-
klog.Fatalf("Ext-proc server failed: %v", err)
152+
klog.ErrorS(err, "Ext-proc server failed")
153+
return err
139154
}
140-
klog.Info("Ext-proc server shutting down")
155+
klog.InfoS("Ext-proc server terminated")
156+
return nil
157+
}
158+
go func() {
159+
done(run())
141160
}()
142161
return svr
143162
}
144163

145-
func (r *ExtProcServerRunner) StartManager() {
164+
func (r *ExtProcServerRunner) StartManager(ctx context.Context) error {
146165
if r.manager == nil {
147-
klog.Fatalf("Runner has no manager setup to run: %v", r)
166+
err := errors.New("runner manager is not set")
167+
klog.ErrorS(err, "Runner has no manager setup to run", "runner", r)
168+
return err
148169
}
170+
149171
// Start the controller manager. Blocking and will return when shutdown is complete.
150-
klog.Infof("Starting controller manager")
172+
klog.InfoS("Controller manager starting...")
151173
mgr := r.manager
152-
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
153-
klog.Fatalf("Error starting controller manager: %v", err)
174+
if err := mgr.Start(ctx); err != nil {
175+
klog.ErrorS(err, "Error starting controller manager")
176+
return err
154177
}
155-
klog.Info("Controller manager shutting down")
178+
klog.InfoS("Controller manager terminated")
179+
return nil
156180
}

0 commit comments

Comments
 (0)