Skip to content

Commit 36ed4a5

Browse files
authored
Fix deadlock bug exposed by a test (#89934)
A new test exposed a very rare bug where the file settings service was in the middle of processing the file when the node closed. This terminated the cluster state update task, but nobody unlocked the latch await. The fix allows the stop operation to properly terminate the watcher thread.
1 parent 91611ca commit 36ed4a5

File tree

3 files changed

+109
-5
lines changed

3 files changed

+109
-5
lines changed

docs/changelog/89934.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 89934
2+
summary: Fix deadlock bug exposed by the test
3+
area: Infra/Core
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsService.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ public class FileSettingsService extends AbstractLifecycleComponent implements C
5959

6060
private WatchService watchService; // null;
6161
private CountDownLatch watcherThreadLatch;
62+
private volatile CountDownLatch processingLatch;
6263

6364
private volatile FileUpdateState fileUpdateState = null;
6465
private volatile WatchKey settingsDirWatchKey = null;
@@ -311,7 +312,15 @@ synchronized void startWatcher(ClusterState clusterState, boolean onStartup) {
311312
settingsDirWatchKey = enableSettingsWatcher(settingsDirWatchKey, settingsDir);
312313

313314
if (watchedFileChanged(path)) {
314-
processFileSettings(path, (e) -> logger.error("Error processing operator settings json file", e)).await();
315+
processingLatch = processFileSettings(
316+
path,
317+
(e) -> logger.error("Error processing operator settings json file", e)
318+
);
319+
// After we get and set the processing latch, we need to check if stop wasn't
320+
// invoked in the meantime. Stop will invalidate all watch keys.
321+
if (configDirWatchKey != null) {
322+
processingLatch.await();
323+
}
315324
}
316325
} catch (IOException e) {
317326
logger.warn("encountered I/O error while watching file settings", e);
@@ -338,6 +347,9 @@ synchronized void stopWatcher() {
338347
cleanupWatchKeys();
339348
fileUpdateState = null;
340349
watchService.close();
350+
if (processingLatch != null) {
351+
processingLatch.countDown();
352+
}
341353
if (watcherThreadLatch != null) {
342354
watcherThreadLatch.await();
343355
}

server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsServiceTests.java

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ public class FileSettingsServiceTests extends ESTestCase {
5656
private Environment env;
5757
private ClusterService clusterService;
5858
private FileSettingsService fileSettingsService;
59+
private ReservedClusterStateService controller;
5960
private ThreadPool threadpool;
6061

6162
@Before
@@ -86,10 +87,7 @@ public void setUp() throws Exception {
8687

8788
ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
8889

89-
ReservedClusterStateService controller = new ReservedClusterStateService(
90-
clusterService,
91-
List.of(new ReservedClusterSettingsAction(clusterSettings))
92-
);
90+
controller = new ReservedClusterStateService(clusterService, List.of(new ReservedClusterSettingsAction(clusterSettings)));
9391

9492
fileSettingsService = new FileSettingsService(clusterService, controller, env);
9593
}
@@ -217,4 +215,93 @@ public void testInitialFile() throws Exception {
217215
service.stop();
218216
service.close();
219217
}
218+
219+
@SuppressWarnings("unchecked")
220+
public void testStopWorksInMiddleOfProcessing() throws Exception {
221+
var spiedController = spy(controller);
222+
var fsService = new FileSettingsService(clusterService, spiedController, env);
223+
224+
FileSettingsService service = spy(fsService);
225+
CountDownLatch processFileLatch = new CountDownLatch(1);
226+
CountDownLatch deadThreadLatch = new CountDownLatch(1);
227+
228+
doAnswer((Answer<Void>) invocation -> {
229+
processFileLatch.countDown();
230+
new Thread(() -> {
231+
// Simulate a thread that never comes back and decrements the
232+
// countdown latch in FileSettingsService.processFileSettings
233+
try {
234+
deadThreadLatch.await();
235+
} catch (InterruptedException e) {
236+
throw new RuntimeException(e);
237+
}
238+
}).start();
239+
return null;
240+
}).when(spiedController).process(any(String.class), any(XContentParser.class), any(Consumer.class));
241+
242+
service.start();
243+
assertTrue(service.watching());
244+
245+
Files.createDirectories(service.operatorSettingsDir());
246+
247+
// Make some fake settings file to cause the file settings service to process it
248+
Files.write(service.operatorSettingsFile(), "{}".getBytes(StandardCharsets.UTF_8));
249+
250+
// we need to wait a bit, on MacOS it may take up to 10 seconds for the Java watcher service to notice the file,
251+
// on Linux is instantaneous. Windows is instantaneous too.
252+
processFileLatch.await(30, TimeUnit.SECONDS);
253+
254+
// Stopping the service should interrupt the watcher thread, we should be able to stop
255+
service.stop();
256+
assertFalse(service.watching());
257+
service.close();
258+
// let the deadlocked thread end, so we can cleanly exit the test
259+
deadThreadLatch.countDown();
260+
}
261+
262+
@SuppressWarnings("unchecked")
263+
public void testStopWorksIfProcessingDidntReturnYet() throws Exception {
264+
var spiedController = spy(controller);
265+
var fsService = new FileSettingsService(clusterService, spiedController, env);
266+
267+
FileSettingsService service = spy(fsService);
268+
CountDownLatch processFileLatch = new CountDownLatch(1);
269+
CountDownLatch deadThreadLatch = new CountDownLatch(1);
270+
271+
doAnswer((Answer<Void>) invocation -> {
272+
processFileLatch.countDown();
273+
// allow the other thread to continue, but hold on a bit to avoid
274+
// setting the count-down latch in the main watcher loop.
275+
Thread.sleep(1_000);
276+
new Thread(() -> {
277+
// Simulate a thread that never comes back and decrements the
278+
// countdown latch in FileSettingsService.processFileSettings
279+
try {
280+
deadThreadLatch.await();
281+
} catch (InterruptedException e) {
282+
throw new RuntimeException(e);
283+
}
284+
}).start();
285+
return null;
286+
}).when(spiedController).process(any(String.class), any(XContentParser.class), any(Consumer.class));
287+
288+
service.start();
289+
assertTrue(service.watching());
290+
291+
Files.createDirectories(service.operatorSettingsDir());
292+
293+
// Make some fake settings file to cause the file settings service to process it
294+
Files.write(service.operatorSettingsFile(), "{}".getBytes(StandardCharsets.UTF_8));
295+
296+
// we need to wait a bit, on MacOS it may take up to 10 seconds for the Java watcher service to notice the file,
297+
// on Linux is instantaneous. Windows is instantaneous too.
298+
processFileLatch.await(30, TimeUnit.SECONDS);
299+
300+
// Stopping the service should interrupt the watcher thread, we should be able to stop
301+
service.stop();
302+
assertFalse(service.watching());
303+
service.close();
304+
// let the deadlocked thread end, so we can cleanly exit the test
305+
deadThreadLatch.countDown();
306+
}
220307
}

0 commit comments

Comments
 (0)