@@ -6,9 +6,11 @@ import (
6
6
"fmt"
7
7
"net"
8
8
"net/http"
9
+ "os"
9
10
"strconv"
10
11
11
12
"github.com/prometheus/client_golang/prometheus/promhttp"
13
+ "golang.org/x/sync/errgroup"
12
14
"google.golang.org/grpc"
13
15
healthPb "google.golang.org/grpc/health/grpc_health_v1"
14
16
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
@@ -72,17 +74,25 @@ func init() {
72
74
}
73
75
74
76
func main () {
77
+ if err := run (); err != nil {
78
+ os .Exit (1 )
79
+ }
80
+ }
81
+
82
+ func run () error {
75
83
klog .InitFlags (nil )
76
84
flag .Parse ()
77
85
78
86
ctrl .SetLogger (klog .TODO ())
79
87
cfg , err := ctrl .GetConfig ()
80
88
if err != nil {
81
- klog .Fatalf ("Failed to get rest config: %v" , err )
89
+ klog .ErrorS (err , "Failed to get rest config" )
90
+ return err
82
91
}
83
92
// Validate flags
84
93
if err := validateFlags (); err != nil {
85
- klog .Fatalf ("Failed to validate flags: %v" , err )
94
+ klog .ErrorS (err , "Failed to validate flags" )
95
+ return err
86
96
}
87
97
88
98
// Print all flag values
@@ -105,104 +115,152 @@ func main() {
105
115
Config : ctrl .GetConfigOrDie (),
106
116
Datastore : datastore ,
107
117
}
108
- serverRunner .Setup ()
118
+ if err := serverRunner .Setup (); err != nil {
119
+ klog .ErrorS (err , "Failed to setup ext-proc server" )
120
+ return err
121
+ }
109
122
110
123
k8sClient , err := kubernetes .NewForConfigAndClient (cfg , serverRunner .Manager .GetHTTPClient ())
111
124
if err != nil {
112
- klog .Fatalf ("Failed to create client: %v" , err )
125
+ klog .ErrorS (err , "Failed to create client" )
126
+ return err
113
127
}
114
128
datastore .SetClient (k8sClient )
115
129
116
- // Start health and ext-proc servers in goroutines
117
- healthSvr := startHealthServer (datastore , * grpcHealthPort )
118
- extProcSvr := serverRunner .Start (& vllm.PodMetricsClientImpl {})
119
- // Start metrics handler
120
- metricsSvr := startMetricsHandler (* metricsPort , cfg )
130
+ if err := serverRunner .Setup (); err != nil {
131
+ klog .ErrorS (err , "Failed to setup server runner" )
132
+ return err
133
+ }
121
134
122
- // Start manager, blocking
123
- serverRunner . StartManager ( )
135
+ // Start processing signals and init the group to manage goroutines.
136
+ g , ctx := errgroup . WithContext ( ctrl . SetupSignalHandler () )
124
137
125
- // Gracefully shutdown servers
126
- if healthSvr != nil {
127
- klog . Info ( "Health server shutting down" )
128
- healthSvr . GracefulStop ()
129
- }
130
- if extProcSvr != nil {
131
- klog . Info ( "Ext-proc server shutting down" )
132
- extProcSvr . GracefulStop ()
133
- }
134
- if metricsSvr != nil {
135
- klog . Info ( "Metrics server shutting down" )
136
- if err := metricsSvr . Shutdown ( context . Background ()); err != nil {
137
- klog . Infof ( "Metrics server Shutdown: %v" , err )
138
- }
139
- }
138
+ // Start health server.
139
+ startHealthServer ( ctx , g , datastore , * grpcHealthPort )
140
+
141
+ // Start ext-proc server.
142
+ g . Go ( func () error {
143
+ return serverRunner . Start ( ctx , & vllm. PodMetricsClientImpl {})
144
+ } )
145
+
146
+ // Start metrics handler.
147
+ startMetricsHandler ( ctx , g , * metricsPort , cfg )
148
+
149
+ // Start manager.
150
+ g . Go ( func () error {
151
+ return serverRunner . StartManager ( ctx )
152
+ })
140
153
141
- klog .Info ("All components shutdown" )
154
+ err = g .Wait ()
155
+ klog .InfoS ("All components terminated" )
156
+ return err
142
157
}
143
158
144
- // startHealthServer starts the gRPC health probe server in a goroutine .
145
- func startHealthServer (ds * backend.K8sDatastore , port int ) * grpc. Server {
146
- svr := grpc . NewServer ()
147
- healthPb . RegisterHealthServer ( svr , & healthServer { datastore : ds } )
159
+ // startHealthServer starts the gRPC health probe server using the given errgroup .
160
+ func startHealthServer (ctx context. Context , g * errgroup. Group , ds * backend.K8sDatastore , port int ) {
161
+ g . Go ( func () error {
162
+ klog . InfoS ( "Health server starting..." )
148
163
149
- go func () {
164
+ // Start listening.
150
165
lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
151
166
if err != nil {
152
- klog .Fatalf ("Health server failed to listen: %v" , err )
167
+ klog .ErrorS (err , "Health server failed to listen" )
168
+ return err
153
169
}
154
- klog .Infof ("Health server listening on port: %d" , port )
155
170
156
- // Blocking and will return when shutdown is complete.
171
+ klog .InfoS ("Health server listening" , "port" , port )
172
+
173
+ svr := grpc .NewServer ()
174
+ healthPb .RegisterHealthServer (svr , & healthServer {datastore : ds })
175
+
176
+ // Shutdown on context closed.
177
+ g .Go (func () error {
178
+ <- ctx .Done ()
179
+ klog .InfoS ("Health server shutting down..." )
180
+ svr .GracefulStop ()
181
+ return nil
182
+ })
183
+
184
+ // Keep serving until terminated.
157
185
if err := svr .Serve (lis ); err != nil && err != grpc .ErrServerStopped {
158
- klog .Fatalf ("Health server failed: %v" , err )
186
+ klog .ErrorS (err , "Health server failed" )
187
+ return err
159
188
}
160
- klog .Info ("Health server shutting down " )
161
- }()
162
- return svr
189
+ klog .InfoS ("Health server terminated " )
190
+ return nil
191
+ })
163
192
}
164
193
165
- func startMetricsHandler (port int , cfg * rest.Config ) * http.Server {
166
- metrics .Register ()
194
+ // startMetricsHandler starts the metrics HTTP handler using the given errgroup.
195
+ func startMetricsHandler (ctx context.Context , g * errgroup.Group , port int , cfg * rest.Config ) {
196
+ g .Go (func () error {
197
+ metrics .Register ()
198
+ klog .InfoS ("Metrics HTTP handler starting..." )
199
+
200
+ // Start listening.
201
+ lis , err := net .Listen ("tcp" , fmt .Sprintf (":%d" , port ))
202
+ if err != nil {
203
+ klog .ErrorS (err , "Metrics HTTP handler failed to listen" )
204
+ return err
205
+ }
206
+
207
+ klog .InfoS ("Metrics HTTP handler listening" , "port" , port )
167
208
168
- var svr * http.Server
169
- go func () {
170
- klog .Info ("Starting metrics HTTP handler ..." )
209
+ // Init HTTP server.
210
+ h , err := metricsHandlerWithAuthenticationAndAuthorization (cfg )
211
+ if err != nil {
212
+ return err
213
+ }
171
214
172
215
mux := http .NewServeMux ()
173
- mux .Handle (defaultMetricsEndpoint , metricsHandlerWithAuthenticationAndAuthorization ( cfg ) )
216
+ mux .Handle (defaultMetricsEndpoint , h )
174
217
175
- svr = & http.Server {
218
+ svr : = & http.Server {
176
219
Addr : net .JoinHostPort ("" , strconv .Itoa (port )),
177
220
Handler : mux ,
178
221
}
179
- if err := svr .ListenAndServe (); err != http .ErrServerClosed {
180
- klog .Fatalf ("failed to start metrics HTTP handler: %v" , err )
222
+
223
+ // Shutdown on interrupt.
224
+ g .Go (func () error {
225
+ <- ctx .Done ()
226
+ klog .InfoS ("Metrics HTTP handler shutting down..." )
227
+ _ = svr .Shutdown (context .Background ())
228
+ return nil
229
+ })
230
+
231
+ // Keep serving until terminated.
232
+ if err := svr .Serve (lis ); err != http .ErrServerClosed {
233
+ klog .ErrorS (err , "Metrics HTTP handler failed" )
234
+ return err
181
235
}
182
- }()
183
- return svr
236
+ klog .InfoS ("Metrics HTTP handler terminated" )
237
+ return nil
238
+ })
184
239
}
185
240
186
- func metricsHandlerWithAuthenticationAndAuthorization (cfg * rest.Config ) http.Handler {
241
+ func metricsHandlerWithAuthenticationAndAuthorization (cfg * rest.Config ) ( http.Handler , error ) {
187
242
h := promhttp .HandlerFor (
188
243
legacyregistry .DefaultGatherer ,
189
244
promhttp.HandlerOpts {},
190
245
)
191
246
httpClient , err := rest .HTTPClientFor (cfg )
192
247
if err != nil {
193
- klog .Fatalf ("failed to create http client for metrics auth: %v" , err )
248
+ klog .ErrorS (err , "Failed to create http client for metrics auth" )
249
+ return nil , err
194
250
}
195
251
196
252
filter , err := filters .WithAuthenticationAndAuthorization (cfg , httpClient )
197
253
if err != nil {
198
- klog .Fatalf ("failed to create metrics filter for auth: %v" , err )
254
+ klog .ErrorS (err , "Failed to create metrics filter for auth" )
255
+ return nil , err
199
256
}
200
257
metricsLogger := klog .LoggerWithValues (klog .NewKlogr (), "path" , defaultMetricsEndpoint )
201
258
metricsAuthHandler , err := filter (metricsLogger , h )
202
259
if err != nil {
203
- klog .Fatalf ("failed to create metrics auth handler: %v" , err )
260
+ klog .ErrorS (err , "Failed to create metrics auth handler" )
261
+ return nil , err
204
262
}
205
- return metricsAuthHandler
263
+ return metricsAuthHandler , nil
206
264
}
207
265
208
266
func validateFlags () error {
0 commit comments