@@ -5,8 +5,12 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
+	"os/signal"
 	"strconv"
 	"sync"
+	"sync/atomic"
+	"syscall"
 	"time"

 	"github.com/grafana/synthetic-monitoring-agent/internal/feature"
@@ -24,8 +28,9 @@ import (
 )

 var (
-	errNotAuthorized    = errors.New("probe not authorized")
-	errTransportClosing = errors.New("transport closing")
+	errNotAuthorized     = errors.New("probe not authorized")
+	errTransportClosing  = errors.New("transport closing")
+	errProbeUnregistered = errors.New("probe no longer registered")
 )

 // Updater represents a probe along with the collection of scrapers
@@ -219,16 +224,29 @@ func (c *Updater) Run(ctx context.Context) error {
 				Msg("context cancelled, closing updater")
 			return nil

+		case errors.Is(err, errProbeUnregistered):
+			// Probe unregistered itself from the API, wait
+			// until attempting to reconnect again.
+			c.logger.Warn().
+				Str("connection_state", c.api.conn.GetState().String()).
+				Msg("unregistered probe in API, sleeping for 1 minute...")
+
+			if err := sleepCtx(ctx, 1*time.Minute); err != nil {
+				return err
+			}
+
 		default:
 			c.logger.Warn().
 				Err(err).
 				Str("connection_state", c.api.conn.GetState().String()).
 				Msg("handling check changes")
+
 			// TODO(mem): this might be a transient error (e.g. bad connection). We probably need to
 			// fine-tune GRPPC's backoff parameters. We might also need to keep count of the reconnects, and
 			// give up if they hit some threshold?
-			time.Sleep(2 * time.Second)
-			continue
+			if err := sleepCtx(ctx, 2*time.Second); err != nil {
+				return err
+			}
 		}
 	}
 }
@@ -240,7 +258,7 @@ func (c *Updater) loop(ctx context.Context) error {

 	grpcErrorHandler := func(action string, err error) error {
 		status, ok := status.FromError(err)
-		c.logger.Error().Err(err).Uint32("code", uint32(status.Code())).Msg(status.Message())
+		c.logger.Error().Err(err).Str("action", action).Uint32("code", uint32(status.Code())).Msg(status.Message())

 		switch {
 		case !ok:
@@ -297,18 +315,79 @@ func (c *Updater) loop(ctx context.Context) error {
 		"buildstamp": version.Buildstamp(),
 	}).Set(1)

-	cc, err := client.GetChanges(ctx, &sm.Void{})
-	if err != nil {
-		return grpcErrorHandler("requesting changes from synthetic-monitoring-api", err)
+	// Create a child context so that we can communicate to the
+	// signal handler that we are done.
+	sigCtx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	// We get _another_ context from the signal handler that we can
+	// use tell the GRPC client that we need to break out. We have
+	// multiple ways of cancelling the context (another signal
+	// elsewhere in the system communicated through the parent
+	// context; cancelling the child context because we are
+	// returning from this function; cancelling the new context
+	// because the signal fired), so we need an additional way of
+	// telling them apart.
+	sigCtx, signalFired := installSignalHandler(sigCtx)
+
+	action := "requesting changes from synthetic-monitoring-api"
+	cc, err := client.GetChanges(sigCtx, &sm.Void{})
+	if err == nil {
+		action = "getting changes from synthetic-monitoring-api"
+		// processChanges uses the context in its first
+		// argument to create scrapers. This means that
+		// cancelling that context cancels all the running
+		// scrapers. That's why we are passing the _original_
+		// context, not sigCtx, so that scrapers are _not_
+		// stopped if the signal is trapped. We want scrapers to
+		// continue running in case the agent is _not_ killed.
+		err = c.processChanges(ctx, cc)
 	}

-	if err := c.processChanges(ctx, cc); err != nil {
-		return grpcErrorHandler("getting changes from synthetic-monitoring-api", err)
+	if err != nil {
+		if atomic.LoadInt32(signalFired) == 1 {
+			return errProbeUnregistered
+		}
+
+		return grpcErrorHandler(action, err)
 	}

 	return nil
 }

+// installSignalHandler installs a signal handler for SIGUSR1.
+//
+// The returned context's Done channel is closed if the signal is
+// delivered. To make it simpler to determine if the signal was
+// delivered, a value of 1 is written to the location pointed to by the
+// returned int32 pointer.
+//
+// If the provided context's Done channel is closed before the signal is
+// delivered, the signal handler is removed and the returned context's
+// Done channel is closed, too. It's the callers responsibility to
+// cancel the provided context if it's no longer interested in the
+// signal.
+func installSignalHandler(ctx context.Context) (context.Context, *int32) {
+	sigCtx, cancel := context.WithCancel(ctx)
+
+	fired := new(int32)
+
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGUSR1)
+
+	go func() {
+		select {
+		case <-sigCh:
+			atomic.StoreInt32(fired, 1)
+			cancel()
+		case <-ctx.Done():
+		}
+		signal.Stop(sigCh)
+	}()
+
+	return sigCtx, fired
+}
+
 func (c *Updater) processChanges(ctx context.Context, cc sm.Checks_GetChangesClient) error {
 	firstBatch := true

@@ -317,6 +396,9 @@ func (c *Updater) processChanges(ctx context.Context, cc sm.Checks_GetChangesCli
 		case <-cc.Context().Done():
 			return nil

+		case <-ctx.Done():
+			return ctx.Err()
+
 		default:
 			switch msg, err := cc.Recv(); err {
 			case nil:
@@ -472,6 +554,10 @@ func (c *Updater) handleFirstBatch(ctx context.Context, changes *sm.Changes) {
 			continue
 		}

+		c.logger.Debug().
+			Int64("check_id", id).
+			Msg("stopping scraper during first batch handling")
+
 		checkType := scraper.CheckType().String()
 		scraper.Stop()

@@ -599,3 +685,24 @@ func (c *Updater) addAndStartScraperWithLock(ctx context.Context, check sm.Check

 	return nil
 }
+
+// sleepCtx is like time.Sleep, but it pays attention to the
+// cancellation of the provided context.
+func sleepCtx(ctx context.Context, d time.Duration) error {
+	var err error
+
+	timer := time.NewTimer(d)
+
+	select {
+	case <-ctx.Done():
+		err = ctx.Err()
+
+		if !timer.Stop() {
+			<-timer.C
+		}
+
+	case <-timer.C:
+	}
+
+	return err
+}
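The diff above wires three pieces together: a SIGUSR1 handler that cancels a dedicated child context and records, through an atomic int32, that the signal (rather than an ordinary cancellation) fired; a loop() that maps that case to errProbeUnregistered; and a context-aware back-off via sleepCtx. The following standalone sketch reproduces that pattern outside the agent so it can be run and observed in isolation. installSignalHandler and sleepCtx are copied from the hunks above (doc comments omitted); doRequest, main, and the timings are illustrative stand-ins for client.GetChanges/processChanges and are not part of the commit.

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"sync/atomic"
	"syscall"
	"time"
)

// installSignalHandler and sleepCtx are copied from the diff so the
// sketch compiles on its own.
func installSignalHandler(ctx context.Context) (context.Context, *int32) {
	sigCtx, cancel := context.WithCancel(ctx)

	fired := new(int32)

	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGUSR1)

	go func() {
		select {
		case <-sigCh:
			atomic.StoreInt32(fired, 1)
			cancel()
		case <-ctx.Done():
		}
		signal.Stop(sigCh)
	}()

	return sigCtx, fired
}

func sleepCtx(ctx context.Context, d time.Duration) error {
	var err error

	timer := time.NewTimer(d)

	select {
	case <-ctx.Done():
		err = ctx.Err()

		if !timer.Stop() {
			<-timer.C
		}

	case <-timer.C:
	}

	return err
}

// doRequest is a hypothetical stand-in for the long-running
// GetChanges/processChanges call: it simply blocks until the given
// context is cancelled.
func doRequest(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err()
}

func main() {
	parent, cancel := context.WithCancel(context.Background())
	defer cancel()

	sigCtx, fired := installSignalHandler(parent)

	// Simulate an operator unregistering the probe by delivering
	// SIGUSR1 to this process after a short delay; against a running
	// agent this would be `kill -USR1 <pid>`.
	go func() {
		time.Sleep(200 * time.Millisecond)
		_ = syscall.Kill(syscall.Getpid(), syscall.SIGUSR1)
	}()

	err := doRequest(sigCtx)

	if atomic.LoadInt32(fired) == 1 {
		fmt.Println("probe no longer registered; backing off before reconnecting")
		// Back off, but respect cancellation of the parent context,
		// mirroring how Run handles errProbeUnregistered.
		if serr := sleepCtx(parent, 100*time.Millisecond); serr != nil {
			fmt.Println("back-off interrupted:", serr)
		}
		return
	}

	fmt.Println("request failed for another reason:", err)
}

Run as a normal Go program on a Unix system, this prints the "probe no longer registered" line and then backs off, which is the same sequence the agent follows when it receives SIGUSR1 while waiting on the GetChanges stream.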