Skip to content

Commit cb5f178

Browse files
committed
refactor: change Run to start the controller and the health + metrics servers concurrently
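This commit also replaces listenHealthProbe with startHealthAndMetricsServers, which registers the default Prometheus collectors (build info, Go runtime, process) and exposes a /metrics endpoint alongside /healthz. Channels are used to collect potential server errors and to allow for graceful shutdown of the servers.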
1 parent f11e9aa commit cb5f178

File tree

1 file changed

+94
-15
lines changed

1 file changed

+94
-15
lines changed

Diff for: cmd/kar-controllers/app/server.go

+94-15
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,20 @@ limitations under the License.
3131
package app
3232

3333
import (
34+
"context"
35+
"fmt"
36+
"net/http"
37+
"time"
38+
3439
"k8s.io/client-go/rest"
3540
"k8s.io/client-go/tools/clientcmd"
36-
"net/http"
3741

3842
"github.com/project-codeflare/multi-cluster-app-dispatcher/cmd/kar-controllers/app/options"
3943
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejob"
4044
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/health"
45+
"github.com/prometheus/client_golang/prometheus"
46+
"github.com/prometheus/client_golang/prometheus/collectors"
47+
"github.com/prometheus/client_golang/prometheus/promhttp"
4148

4249
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
4350
)
@@ -49,41 +56,113 @@ func buildConfig(master, kubeconfig string) (*rest.Config, error) {
4956
return rest.InClusterConfig()
5057
}
5158

52-
func Run(opt *options.ServerOption) error {
59+
func Run(ctx context.Context, opt *options.ServerOption) error {
5360
config, err := buildConfig(opt.Master, opt.Kubeconfig)
5461
if err != nil {
5562
return err
5663
}
5764

58-
neverStop := make(chan struct{})
59-
6065
config.QPS = 100.0
6166
config.Burst = 200.0
6267

6368
jobctrl := queuejob.NewJobController(config, opt)
6469
if jobctrl == nil {
65-
return nil
70+
return fmt.Errorf("failed to create a job controller")
6671
}
67-
jobctrl.Run(neverStop)
6872

69-
// This call is blocking (unless an error occurs) which equates to <-neverStop
70-
err = listenHealthProbe(opt)
73+
stopCh := make(chan struct{})
74+
75+
go func() {
76+
defer close(stopCh)
77+
<-ctx.Done()
78+
}()
79+
80+
go jobctrl.Run(stopCh)
81+
82+
err = startHealthAndMetricsServers(ctx, opt)
7183
if err != nil {
7284
return err
7385
}
7486

87+
<-ctx.Done()
7588
return nil
7689
}
7790

7891
// Starts the health probe and metrics listeners
79-
func listenHealthProbe(opt *options.ServerOption) error {
80-
handler := http.NewServeMux()
81-
handler.Handle("/healthz", &health.Handler{})
82-
err := http.ListenAndServe(opt.HealthProbeListenAddr, handler)
83-
if err != nil {
84-
return err
92+
func startHealthAndMetricsServers(ctx context.Context, opt *options.ServerOption) error {
93+
94+
// Create a new registry.
95+
reg := prometheus.NewRegistry()
96+
97+
// Add Go module build info.
98+
reg.MustRegister(collectors.NewBuildInfoCollector())
99+
reg.MustRegister(collectors.NewGoCollector())
100+
reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
101+
102+
metricsHandler := http.NewServeMux()
103+
104+
// Use the HTTPErrorOnError option for the Prometheus handler
105+
handlerOpts := promhttp.HandlerOpts{
106+
ErrorHandling: promhttp.HTTPErrorOnError,
85107
}
86108

87-
return nil
109+
metricsHandler.Handle("/metrics", promhttp.HandlerFor(prometheus.DefaultGatherer, handlerOpts))
110+
111+
healthHandler := http.NewServeMux()
112+
healthHandler.Handle("/healthz", &health.Handler{})
113+
114+
metricsServer := &http.Server{
115+
Addr: opt.MetricsListenAddr,
116+
Handler: metricsHandler,
117+
}
118+
119+
healthServer := &http.Server{
120+
Addr: opt.HealthProbeListenAddr,
121+
Handler: healthHandler,
122+
}
123+
124+
// make a channel for errors for each server
125+
metricsServerErrChan := make(chan error)
126+
healthServerErrChan := make(chan error)
127+
128+
// start servers in their own goroutines
129+
go func() {
130+
defer close(metricsServerErrChan)
131+
err := metricsServer.ListenAndServe()
132+
if err != nil && err != http.ErrServerClosed {
133+
metricsServerErrChan <- err
134+
}
135+
}()
136+
137+
go func() {
138+
defer close(healthServerErrChan)
139+
err := healthServer.ListenAndServe()
140+
if err != nil && err != http.ErrServerClosed {
141+
healthServerErrChan <- err
142+
}
143+
}()
144+
145+
// use select to wait for either a shutdown signal or an error
146+
select {
147+
case <-ctx.Done():
148+
// received an OS shutdown signal, shut down servers gracefully
149+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
150+
defer cancel()
151+
152+
errM := metricsServer.Shutdown(ctx)
153+
if errM != nil {
154+
return fmt.Errorf("metrics server shutdown error: %v", errM)
155+
}
156+
errH := healthServer.Shutdown(ctx)
157+
if errH != nil {
158+
return fmt.Errorf("health server shutdown error: %v", errH)
159+
}
160+
case err := <-metricsServerErrChan:
161+
return fmt.Errorf("metrics server error: %v", err)
162+
case err := <-healthServerErrChan:
163+
return fmt.Errorf("health server error: %v", err)
164+
}
165+
166+
return nil
88167
}
89168

0 commit comments

Comments
 (0)