@@ -6,9 +6,11 @@ import (
6
6
"fmt"
7
7
"net"
8
8
"net/http"
9
+ "os"
9
10
"strconv"
10
11
11
12
"github.com/prometheus/client_golang/prometheus/promhttp"
13
+ "golang.org/x/sync/errgroup"
12
14
"google.golang.org/grpc"
13
15
healthPb "google.golang.org/grpc/health/grpc_health_v1"
14
16
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
@@ -79,17 +81,25 @@ func init() {
79
81
}
80
82
81
83
func main () {
84
+ if err := run (); err != nil {
85
+ os .Exit (1 )
86
+ }
87
+ }
88
+
89
+ func run () error {
82
90
klog .InitFlags (nil )
83
91
flag .Parse ()
84
92
85
93
ctrl .SetLogger (klog .TODO ())
86
94
cfg , err := ctrl .GetConfig ()
87
95
if err != nil {
88
- klog .Fatalf ("Failed to get rest config: %v" , err )
96
+ klog .ErrorS (err , "Failed to get rest config" )
97
+ return err
89
98
}
90
99
// Validate flags
91
100
if err := validateFlags (); err != nil {
92
- klog .Fatalf ("Failed to validate flags: %v" , err )
101
+ klog .ErrorS (err , "Failed to validate flags" )
102
+ return err
93
103
}
94
104
95
105
// Print all flag values
@@ -114,101 +124,140 @@ func main() {
114
124
Config : ctrl .GetConfigOrDie (),
115
125
Datastore : datastore ,
116
126
}
117
- serverRunner .Setup ()
127
+ if err := serverRunner .Setup (); err != nil {
128
+ klog .ErrorS (err , "Failed to setup server runner" )
129
+ return err
130
+ }
118
131
119
- // Start health and ext-proc servers in goroutines
120
- healthSvr := startHealthServer (datastore , * grpcHealthPort )
121
- extProcSvr := serverRunner .Start (
122
- datastore ,
123
- & vllm.PodMetricsClientImpl {},
124
- )
125
- // Start metrics handler
126
- metricsSvr := startMetricsHandler (* metricsPort , cfg )
132
+ // Start processing signals and init the group to manage goroutines.
133
+ g , ctx := errgroup .WithContext (ctrl .SetupSignalHandler ())
127
134
128
- // Start manager, blocking
129
- serverRunner . StartManager ( )
135
+ // Start health server.
136
+ startHealthServer ( ctx , g , datastore , * grpcHealthPort )
130
137
131
- // Gracefully shutdown servers
132
- if healthSvr != nil {
133
- klog .Info ("Health server shutting down" )
134
- healthSvr .GracefulStop ()
135
- }
136
- if extProcSvr != nil {
137
- klog .Info ("Ext-proc server shutting down" )
138
- extProcSvr .GracefulStop ()
139
- }
140
- if metricsSvr != nil {
141
- klog .Info ("Metrics server shutting down" )
142
- if err := metricsSvr .Shutdown (context .Background ()); err != nil {
143
- klog .Infof ("Metrics server Shutdown: %v" , err )
144
- }
145
- }
138
+ // Start ext-proc server.
139
+ g .Go (func () error {
140
+ return serverRunner .Start (ctx , datastore , & vllm.PodMetricsClientImpl {})
141
+ })
142
+
143
+ // Start metrics handler.
144
+ startMetricsHandler (ctx , g , * metricsPort , cfg )
146
145
147
- klog .Info ("All components shutdown" )
146
+ // Start manager.
147
+ g .Go (func () error {
148
+ return serverRunner .StartManager (ctx )
149
+ })
150
+
151
+ err = g .Wait ()
152
+ klog .InfoS ("All components terminated" )
153
+ return err
148
154
}
149
155
150
- // startHealthServer starts the gRPC health probe server in a goroutine .
151
- func startHealthServer (ds * backend.K8sDatastore , port int ) * grpc. Server {
152
- svr := grpc . NewServer ()
153
- healthPb . RegisterHealthServer ( svr , & healthServer { datastore : ds } )
156
+ // startHealthServer starts the gRPC health probe server using the given errgroup .
157
+ func startHealthServer (ctx context. Context , g * errgroup. Group , ds * backend.K8sDatastore , port int ) {
158
+ g . Go ( func () error {
159
+ klog . InfoS ( "Health server starting..." )
154
160
155
- go func () {
161
+ // Start listening.
156
162
lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
157
163
if err != nil {
158
- klog .Fatalf ("Health server failed to listen: %v" , err )
164
+ klog .ErrorS (err , "Health server failed to listen" )
165
+ return err
159
166
}
160
- klog .Infof ("Health server listening on port: %d" , port )
161
167
162
- // Blocking and will return when shutdown is complete.
168
+ klog .InfoS ("Health server listening" , "port" , port )
169
+
170
+ svr := grpc .NewServer ()
171
+ healthPb .RegisterHealthServer (svr , & healthServer {datastore : ds })
172
+
173
+ // Shutdown on context closed.
174
+ g .Go (func () error {
175
+ <- ctx .Done ()
176
+ klog .InfoS ("Health server shutting down..." )
177
+ svr .GracefulStop ()
178
+ return nil
179
+ })
180
+
181
+ // Keep serving until terminated.
163
182
if err := svr .Serve (lis ); err != nil && err != grpc .ErrServerStopped {
164
- klog .Fatalf ("Health server failed: %v" , err )
183
+ klog .ErrorS (err , "Health server failed" )
184
+ return err
165
185
}
166
- klog .Info ("Health server shutting down " )
167
- }()
168
- return svr
186
+ klog .InfoS ("Health server terminated " )
187
+ return nil
188
+ })
169
189
}
170
190
171
- func startMetricsHandler (port int , cfg * rest.Config ) * http.Server {
172
- metrics .Register ()
191
+ // startMetricsHandler starts the metrics HTTP handler using the given errgroup.
192
+ func startMetricsHandler (ctx context.Context , g * errgroup.Group , port int , cfg * rest.Config ) {
193
+ g .Go (func () error {
194
+ metrics .Register ()
195
+ klog .InfoS ("Metrics HTTP handler starting..." )
196
+
197
+ // Start listening.
198
+ lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
199
+ if err != nil {
200
+ klog .ErrorS (err , "Metrics HTTP handler failed to listen" )
201
+ return err
202
+ }
173
203
174
- var svr * http.Server
175
- go func () {
176
- klog .Info ("Starting metrics HTTP handler ..." )
204
+ klog .InfoS ("Metrics HTTP handler listening" , "port" , port )
205
+
206
+ // Init HTTP server.
207
+ h , err := metricsHandlerWithAuthenticationAndAuthorization (cfg )
208
+ if err != nil {
209
+ return err
210
+ }
177
211
178
212
mux := http .NewServeMux ()
179
- mux .Handle (defaultMetricsEndpoint , metricsHandlerWithAuthenticationAndAuthorization ( cfg ) )
213
+ mux .Handle (defaultMetricsEndpoint , h )
180
214
181
- svr = & http.Server {
215
+ svr : = & http.Server {
182
216
Addr : net .JoinHostPort ("" , strconv .Itoa (port )),
183
217
Handler : mux ,
184
218
}
185
- if err := svr .ListenAndServe (); err != http .ErrServerClosed {
186
- klog .Fatalf ("failed to start metrics HTTP handler: %v" , err )
219
+
220
+ // Shutdown on interrupt.
221
+ g .Go (func () error {
222
+ <- ctx .Done ()
223
+ klog .InfoS ("Metrics HTTP handler shutting down..." )
224
+ _ = svr .Shutdown (context .Background ())
225
+ return nil
226
+ })
227
+
228
+ // Keep serving until terminated.
229
+ if err := svr .Serve (lis ); err != http .ErrServerClosed {
230
+ klog .ErrorS (err , "Metrics HTTP handler failed" )
231
+ return err
187
232
}
188
- }()
189
- return svr
233
+ klog .InfoS ("Metrics HTTP handler terminated" )
234
+ return nil
235
+ })
190
236
}
191
237
192
- func metricsHandlerWithAuthenticationAndAuthorization (cfg * rest.Config ) http.Handler {
238
+ func metricsHandlerWithAuthenticationAndAuthorization (cfg * rest.Config ) ( http.Handler , error ) {
193
239
h := promhttp .HandlerFor (
194
240
legacyregistry .DefaultGatherer ,
195
241
promhttp.HandlerOpts {},
196
242
)
197
243
httpClient , err := rest .HTTPClientFor (cfg )
198
244
if err != nil {
199
- klog .Fatalf ("failed to create http client for metrics auth: %v" , err )
245
+ klog .ErrorS (err , "Failed to create http client for metrics auth" )
246
+ return nil , err
200
247
}
201
248
202
249
filter , err := filters .WithAuthenticationAndAuthorization (cfg , httpClient )
203
250
if err != nil {
204
- klog .Fatalf ("failed to create metrics filter for auth: %v" , err )
251
+ klog .ErrorS (err , "Failed to create metrics filter for auth" )
252
+ return nil , err
205
253
}
206
254
metricsLogger := klog .LoggerWithValues (klog .NewKlogr (), "path" , defaultMetricsEndpoint )
207
255
metricsAuthHandler , err := filter (metricsLogger , h )
208
256
if err != nil {
209
- klog .Fatalf ("failed to create metrics auth handler: %v" , err )
257
+ klog .ErrorS (err , "Failed to create metrics auth handler" )
258
+ return nil , err
210
259
}
211
- return metricsAuthHandler
260
+ return metricsAuthHandler , nil
212
261
}
213
262
214
263
func validateFlags () error {
0 commit comments