Skip to content

Commit 92ede84

Browse files
committed
Prevent the router from deadlocking itself when calling Commit()
Backport of #13717 The router reload function (Commit()) was changed so that it could be rate limited. The rate limiter tracks changes in a kcache.Queue object. It will coalesce changes to the same call so that if three calls to Invoke() the rate limited function happen before the next time it is allowed to process, then only one will occur. Our problem was that we were doing: Thread 1 (the rate-limiter background process): - Wake up - Ratelimit sees there is work to be done and calls fifo.go's Pop() function - Fifo.go acquires a fifo lock and call the processing function - Router.go's commitAndReload() function acquires a lock on the router object Thread 2 (triggered by the event handler that commit's changes to the router): - Get the event and process it - Since there are changes to be made, call router.Commit() - Commit() grabs a lock on the router object - Then calls the rate-limiter wrapper around commitAndReload() using Invoke() to queue work - In order to queue the work... it acquires a lock on the fifo So thread 1 locks: fifo then router; thread 2 locks: router then fifo. If you get unlucky, those threads deadlock and you never process another event. The fix is to release the lock on the router object in our Commit() function before we call Invoke on the rate limited function. The lock is not actually protecting anything at that point since the rate limited function does its own locking, and is run in a separate thread anyway. Bug 1440977 (https://bugzilla.redhat.com/show_bug.cgi?id=1440977)
1 parent 54c5ba1 commit 92ede84

File tree

3 files changed

+24
-7
lines changed

3 files changed

+24
-7
lines changed

pkg/router/template/fake.go

+11
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,17 @@ func NewFakeTemplateRouter() *templateRouter {
1313
}
1414
}
1515

16+
// FakeReloadHandler implements the minimal changes needed to make the locking behavior work
17+
// This MUST match the behavior with the stateChanged of commitAndReload
18+
func (r *templateRouter) FakeReloadHandler() {
19+
r.lock.Lock()
20+
defer r.lock.Unlock()
21+
22+
r.stateChanged = false
23+
24+
return
25+
}
26+
1627
// fakeCertWriter is a certificate writer that records actions but is a no-op
1728
type fakeCertWriter struct {
1829
addedCerts []string

pkg/router/template/router.go

+7-3
Original file line numberDiff line numberDiff line change
@@ -366,19 +366,21 @@ func (r *templateRouter) readState() error {
366366
// Commit applies the changes made to the router configuration - persists
367367
// the state and refresh the backend. This is all done in the background
368368
// so that we can rate limit + coalesce multiple changes.
369+
// Note: If this is changed FakeCommit() in fake.go should also be updated
369370
func (r *templateRouter) Commit() {
370371
r.lock.Lock()
371-
defer r.lock.Unlock()
372372

373373
if !r.synced {
374374
glog.V(4).Infof("Router state synchronized for the first time")
375375
r.synced = true
376376
r.stateChanged = true
377377
}
378378

379-
if r.stateChanged {
379+
needsCommit := r.stateChanged
380+
r.lock.Unlock()
381+
382+
if needsCommit {
380383
r.rateLimitedCommitFunction.Invoke(r.rateLimitedCommitFunction)
381-
r.stateChanged = false
382384
}
383385
}
384386

@@ -387,6 +389,8 @@ func (r *templateRouter) commitAndReload() error {
387389
r.lock.Lock()
388390
defer r.lock.Unlock()
389391

392+
r.stateChanged = false
393+
390394
glog.V(4).Infof("Writing the router state")
391395
if err := r.writeState(); err != nil {
392396
return err

test/integration/router_stress_test.go

+6-4
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ func stressRouter(t *testing.T, namespaceCount, routesPerNamespace, routerCount,
112112
// Create the routers
113113
for i := int32(0); i < routerCount; i++ {
114114
routerName := fmt.Sprintf("router_%d", i)
115-
router := launchRouter(oc, kc, maxRouterDelay, routerName, reloadInterval, reloadedMap)
115+
router := launchRouter(t, oc, kc, maxRouterDelay, routerName, reloadInterval, reloadedMap)
116116
plugins = append(plugins, router)
117117
}
118118

@@ -148,12 +148,12 @@ func stressRouter(t *testing.T, namespaceCount, routesPerNamespace, routerCount,
148148
}
149149
}
150150

151-
for _, reloadCount := range reloadedMap {
151+
for routerName, reloadCount := range reloadedMap {
152152
if reloadCount > 1 {
153153
// If a router reloads more than once, post-sync watch
154154
// events resulting from route status updates are
155155
// incorrectly updating router state.
156-
t.Fatalf("One or more routers reloaded more than once")
156+
t.Fatalf("One or more routers reloaded more than once (%s reloaded %d times)", routerName, reloadCount)
157157
}
158158
}
159159

@@ -281,12 +281,14 @@ func (p *DelayPlugin) Commit() error {
281281

282282
// launchRouter launches a template router that communicates with the
283283
// api via the provided clients.
284-
func launchRouter(oc osclient.Interface, kc kclientset.Interface, maxDelay int32, name string, reloadInterval int, reloadedMap map[string]int) (templatePlugin *templateplugin.TemplatePlugin) {
284+
func launchRouter(t *testing.T, oc osclient.Interface, kc kclientset.Interface, maxDelay int32, name string, reloadInterval int, reloadedMap map[string]int) (templatePlugin *templateplugin.TemplatePlugin) {
285285
r := templateplugin.NewFakeTemplateRouter()
286286

287287
reloadedMap[name] = 0
288288
r.EnableRateLimiter(reloadInterval, func() error {
289289
reloadedMap[name] += 1
290+
t.Logf("Router %s reloaded (%d times)\n", name, reloadedMap[name])
291+
r.FakeReloadHandler()
290292
return nil
291293
})
292294

0 commit comments

Comments
 (0)