Skip to content

Commit 4900c89

Browse files
committed
Fix Watcher deadlock that can cause in-abilty to index documents.
This commit removes the usage of the `BulkProcessor` to write history documents and delete triggered watches on a `EsRejectedExecutionException`. Since the exception could be handled on the write thread, the write thread can be blocked waiting on watcher threads (due to a synchronous method). This is problematic since those watcher threads can be blocked waiting on write threads. This commit also moves the handling of the exception to the generic threadpool to avoid submitting write requests from the write thread pool. fixes elastic#41390
1 parent 7d29e37 commit 4900c89

File tree

2 files changed

+142
-20
lines changed

2 files changed

+142
-20
lines changed

x-pack/plugin/watcher/src/main/java/org/elasticsearch/xpack/watcher/execution/ExecutionService.java

Lines changed: 75 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
import org.elasticsearch.action.ActionListener;
1616
import org.elasticsearch.action.bulk.BulkItemResponse;
1717
import org.elasticsearch.action.bulk.BulkResponse;
18+
import org.elasticsearch.action.delete.DeleteRequest;
1819
import org.elasticsearch.action.get.GetRequest;
1920
import org.elasticsearch.action.get.GetResponse;
21+
import org.elasticsearch.action.index.IndexRequest;
2022
import org.elasticsearch.action.support.PlainActionFuture;
2123
import org.elasticsearch.action.update.UpdateRequest;
2224
import org.elasticsearch.client.Client;
@@ -32,19 +34,25 @@
3234
import org.elasticsearch.common.util.concurrent.ThreadContext;
3335
import org.elasticsearch.common.xcontent.ToXContent;
3436
import org.elasticsearch.common.xcontent.XContentBuilder;
37+
import org.elasticsearch.common.xcontent.XContentFactory;
3538
import org.elasticsearch.common.xcontent.XContentType;
3639
import org.elasticsearch.common.xcontent.json.JsonXContent;
3740
import org.elasticsearch.index.engine.DocumentMissingException;
41+
import org.elasticsearch.index.engine.VersionConflictEngineException;
3842
import org.elasticsearch.xpack.core.watcher.actions.ActionWrapper;
3943
import org.elasticsearch.xpack.core.watcher.actions.ActionWrapperResult;
4044
import org.elasticsearch.xpack.core.watcher.common.stats.Counters;
4145
import org.elasticsearch.xpack.core.watcher.condition.Condition;
4246
import org.elasticsearch.xpack.core.watcher.execution.ExecutionState;
4347
import org.elasticsearch.xpack.core.watcher.execution.QueuedWatch;
48+
import org.elasticsearch.xpack.core.watcher.execution.TriggeredWatchStoreField;
4449
import org.elasticsearch.xpack.core.watcher.execution.WatchExecutionContext;
4550
import org.elasticsearch.xpack.core.watcher.execution.WatchExecutionSnapshot;
51+
import org.elasticsearch.xpack.core.watcher.execution.Wid;
52+
import org.elasticsearch.xpack.core.watcher.history.HistoryStoreField;
4653
import org.elasticsearch.xpack.core.watcher.history.WatchRecord;
4754
import org.elasticsearch.xpack.core.watcher.input.Input;
55+
import org.elasticsearch.xpack.core.watcher.support.xcontent.WatcherParams;
4856
import org.elasticsearch.xpack.core.watcher.transform.Transform;
4957
import org.elasticsearch.xpack.core.watcher.trigger.TriggerEvent;
5058
import org.elasticsearch.xpack.core.watcher.watch.Watch;
@@ -66,10 +74,15 @@
6674
import java.util.LinkedList;
6775
import java.util.List;
6876
import java.util.Map;
77+
import java.util.concurrent.ExecutionException;
6978
import java.util.concurrent.ExecutorService;
7079
import java.util.concurrent.TimeUnit;
80+
import java.util.concurrent.TimeoutException;
7181
import java.util.concurrent.atomic.AtomicBoolean;
7282
import java.util.concurrent.atomic.AtomicReference;
83+
import java.util.concurrent.locks.Lock;
84+
import java.util.concurrent.locks.ReadWriteLock;
85+
import java.util.concurrent.locks.ReentrantReadWriteLock;
7386

7487
import static org.elasticsearch.xpack.core.ClientHelper.WATCHER_ORIGIN;
7588
import static org.elasticsearch.xpack.core.ClientHelper.stashWithOrigin;
@@ -97,6 +110,8 @@ public class ExecutionService {
97110
private final Client client;
98111
private final WatchExecutor executor;
99112
private final ExecutorService genericExecutor;
113+
private final ReadWriteLock readWriteLock = new ReentrantReadWriteLock();
114+
private final Lock putUpdateLock = readWriteLock.readLock();
100115

101116
private AtomicReference<CurrentExecutions> currentExecutions = new AtomicReference<>();
102117
private final AtomicBoolean paused = new AtomicBoolean(false);
@@ -399,22 +414,70 @@ private void executeAsync(WatchExecutionContext ctx, final TriggeredWatch trigge
399414
try {
400415
executor.execute(new WatchExecutionTask(ctx, () -> execute(ctx)));
401416
} catch (EsRejectedExecutionException e) {
402-
String message = "failed to run triggered watch [" + triggeredWatch.id() + "] due to thread pool capacity";
403-
WatchRecord record = ctx.abortBeforeExecution(ExecutionState.THREADPOOL_REJECTION, message);
404-
try {
405-
if (ctx.overrideRecordOnConflict()) {
406-
historyStore.forcePut(record);
407-
} else {
408-
historyStore.put(record);
417+
//Using the generic pool here since this can happen from a write thread and we don't want to block a write
418+
//thread to kick off these additional write/delete requests.
419+
//Intentionally not using the HistoryStore or TriggerWatchStore to avoid re-using the same synchronous
420+
//BulkProcessor which can cause a deadlock see #41390
421+
genericExecutor.execute(new WatchExecutionTask(ctx, () -> {
422+
String message = "failed to run triggered watch [" + triggeredWatch.id() + "] due to thread pool capacity";
423+
WatchRecord record = ctx.abortBeforeExecution(ExecutionState.THREADPOOL_REJECTION, message);
424+
try {
425+
forcePutHistory(record);
426+
} catch (Exception exc) {
427+
logger.error((Supplier<?>) () ->
428+
new ParameterizedMessage(
429+
"Error storing watch history record for watch [{}] after thread pool rejection",
430+
triggeredWatch.id()), exc);
409431
}
410-
} catch (Exception exc) {
411-
logger.error((Supplier<?>) () ->
412-
new ParameterizedMessage("Error storing watch history record for watch [{}] after thread pool rejection",
413-
triggeredWatch.id()), exc);
432+
deleteTrigger(triggeredWatch.id());
433+
}));
434+
}
435+
}
436+
437+
/**
438+
* Stores the specified watchRecord.
439+
* Any existing watchRecord will be overwritten.
440+
*/
441+
private void forcePutHistory(WatchRecord watchRecord) {
442+
String index = HistoryStoreField.getHistoryIndexNameForTime(watchRecord.triggerEvent().triggeredTime());
443+
putUpdateLock.lock();
444+
try {
445+
try (XContentBuilder builder = XContentFactory.jsonBuilder();
446+
ThreadContext.StoredContext ignore = client.threadPool().getThreadContext().stashWithOrigin(WATCHER_ORIGIN)) {
447+
watchRecord.toXContent(builder, WatcherParams.HIDE_SECRETS);
448+
IndexRequest request = new IndexRequest(index)
449+
.id(watchRecord.id().value())
450+
.source(builder)
451+
.opType(IndexRequest.OpType.CREATE);
452+
client.index(request).get(30, TimeUnit.SECONDS);
453+
logger.debug("indexed watch history record [{}]", watchRecord.id().value());
454+
} catch (VersionConflictEngineException vcee) {
455+
watchRecord = new WatchRecord.MessageWatchRecord(watchRecord, ExecutionState.EXECUTED_MULTIPLE_TIMES,
456+
"watch record [{ " + watchRecord.id() + " }] has been stored before, previous state [" + watchRecord.state() + "]");
457+
try (XContentBuilder xContentBuilder = XContentFactory.jsonBuilder();
458+
ThreadContext.StoredContext ignore = stashWithOrigin(client.threadPool().getThreadContext(), WATCHER_ORIGIN)) {
459+
IndexRequest request = new IndexRequest(index)
460+
.id(watchRecord.id().value())
461+
.source(xContentBuilder.value(watchRecord));
462+
client.index(request).get(30, TimeUnit.SECONDS);
463+
}
464+
logger.debug("overwrote watch history record [{}]", watchRecord.id().value());
414465
}
466+
} catch (InterruptedException | ExecutionException | TimeoutException | IOException ioe) {
467+
final WatchRecord wr = watchRecord;
468+
logger.error((Supplier<?>) () -> new ParameterizedMessage("failed to persist watch record [{}]", wr), ioe);
469+
} finally {
470+
putUpdateLock.unlock();
471+
}
472+
}
415473

416-
triggeredWatchStore.delete(triggeredWatch.id());
474+
private void deleteTrigger(Wid watcherId) {
475+
DeleteRequest request = new DeleteRequest(TriggeredWatchStoreField.INDEX_NAME);
476+
request.id(watcherId.value());
477+
try (ThreadContext.StoredContext ignore = client.threadPool().getThreadContext().stashWithOrigin(WATCHER_ORIGIN)) {
478+
client.delete(request).actionGet(30, TimeUnit.SECONDS);
417479
}
480+
logger.trace("successfully deleted triggered watch with id [{}]", watcherId);
418481
}
419482

420483
WatchRecord executeInner(WatchExecutionContext ctx) {

x-pack/plugin/watcher/src/test/java/org/elasticsearch/xpack/watcher/execution/ExecutionServiceTests.java

Lines changed: 67 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
*/
66
package org.elasticsearch.xpack.watcher.execution;
77

8-
import org.elasticsearch.ElasticsearchException;
98
import org.elasticsearch.Version;
9+
import org.elasticsearch.action.ActionFuture;
1010
import org.elasticsearch.action.ActionListener;
11+
import org.elasticsearch.action.delete.DeleteRequest;
1112
import org.elasticsearch.action.get.GetRequest;
1213
import org.elasticsearch.action.get.GetResponse;
14+
import org.elasticsearch.action.index.IndexRequest;
1315
import org.elasticsearch.action.support.PlainActionFuture;
1416
import org.elasticsearch.action.update.UpdateRequest;
1517
import org.elasticsearch.action.update.UpdateResponse;
@@ -28,8 +30,11 @@
2830
import org.elasticsearch.common.xcontent.XContentFactory;
2931
import org.elasticsearch.common.xcontent.XContentParser;
3032
import org.elasticsearch.common.xcontent.XContentType;
33+
import org.elasticsearch.index.Index;
3134
import org.elasticsearch.index.IndexNotFoundException;
35+
import org.elasticsearch.index.engine.VersionConflictEngineException;
3236
import org.elasticsearch.index.get.GetResult;
37+
import org.elasticsearch.index.shard.ShardId;
3338
import org.elasticsearch.test.ESTestCase;
3439
import org.elasticsearch.threadpool.ThreadPool;
3540
import org.elasticsearch.xpack.core.security.authc.Authentication;
@@ -48,6 +53,7 @@
4853
import org.elasticsearch.xpack.core.watcher.execution.ExecutionPhase;
4954
import org.elasticsearch.xpack.core.watcher.execution.ExecutionState;
5055
import org.elasticsearch.xpack.core.watcher.execution.QueuedWatch;
56+
import org.elasticsearch.xpack.core.watcher.execution.TriggeredWatchStoreField;
5157
import org.elasticsearch.xpack.core.watcher.execution.WatchExecutionContext;
5258
import org.elasticsearch.xpack.core.watcher.execution.WatchExecutionSnapshot;
5359
import org.elasticsearch.xpack.core.watcher.execution.Wid;
@@ -91,6 +97,7 @@
9197
import static java.util.Collections.singletonMap;
9298
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
9399
import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO;
100+
import static org.hamcrest.Matchers.containsString;
94101
import static org.hamcrest.Matchers.equalTo;
95102
import static org.hamcrest.Matchers.greaterThan;
96103
import static org.hamcrest.Matchers.hasSize;
@@ -844,22 +851,74 @@ public void testThatTriggeredWatchDeletionWorksOnExecutionRejection() throws Exc
844851
when(getResponse.isExists()).thenReturn(true);
845852
when(getResponse.getId()).thenReturn("foo");
846853
mockGetWatchResponse(client, "foo", getResponse);
854+
ActionFuture actionFuture = mock(ActionFuture.class);
855+
when(actionFuture.get()).thenReturn("");
856+
when(client.index(any())).thenReturn(actionFuture);
857+
when(client.delete(any())).thenReturn(actionFuture);
858+
847859
when(parser.parseWithSecrets(eq("foo"), eq(true), any(), any(), any(), anyLong(), anyLong())).thenReturn(watch);
848860

849-
// execute needs to fail as well as storing the history
861+
// execute needs to fail
850862
doThrow(new EsRejectedExecutionException()).when(executor).execute(any());
851-
doThrow(new ElasticsearchException("whatever")).when(historyStore).forcePut(any());
852863

853864
Wid wid = new Wid(watch.id(), ZonedDateTime.now(ZoneOffset.UTC));
854865

855866
TriggeredWatch triggeredWatch = new TriggeredWatch(wid,
856867
new ScheduleTriggerEvent(ZonedDateTime.now(ZoneOffset.UTC) ,ZonedDateTime.now(ZoneOffset.UTC)));
857868
executionService.executeTriggeredWatches(Collections.singleton(triggeredWatch));
858869

859-
verify(triggeredWatchStore, times(1)).delete(wid);
860-
ArgumentCaptor<WatchRecord> captor = ArgumentCaptor.forClass(WatchRecord.class);
861-
verify(historyStore, times(1)).forcePut(captor.capture());
862-
assertThat(captor.getValue().state(), is(ExecutionState.THREADPOOL_REJECTION));
870+
ArgumentCaptor<DeleteRequest> deleteCaptor = ArgumentCaptor.forClass(DeleteRequest.class);
871+
verify(client).delete(deleteCaptor.capture());
872+
assertThat(deleteCaptor.getValue().index(), equalTo(TriggeredWatchStoreField.INDEX_NAME));
873+
assertThat(deleteCaptor.getValue().id(), equalTo(wid.value()));
874+
875+
ArgumentCaptor<IndexRequest> watchHistoryCaptor = ArgumentCaptor.forClass(IndexRequest.class);
876+
verify(client).index(watchHistoryCaptor.capture());
877+
878+
assertThat(watchHistoryCaptor.getValue().source().utf8ToString(), containsString(ExecutionState.THREADPOOL_REJECTION.toString()));
879+
assertThat(watchHistoryCaptor.getValue().index(), containsString(".watcher-history"));
880+
}
881+
882+
public void testForcePutHistoryOnExecutionRejection() throws Exception {
883+
Watch watch = mock(Watch.class);
884+
when(watch.id()).thenReturn("foo");
885+
WatchStatus status = new WatchStatus(ZonedDateTime.now(ZoneOffset.UTC), Collections.emptyMap());
886+
when(watch.status()).thenReturn(status);
887+
GetResponse getResponse = mock(GetResponse.class);
888+
when(getResponse.isExists()).thenReturn(true);
889+
when(getResponse.getId()).thenReturn("foo");
890+
mockGetWatchResponse(client, "foo", getResponse);
891+
ActionFuture actionFuture = mock(ActionFuture.class);
892+
when(actionFuture.get()).thenReturn("");
893+
when(client.index(any()))
894+
.thenThrow(new VersionConflictEngineException(
895+
new ShardId(new Index("mockindex", "mockuuid"), 0), "id", "explaination"))
896+
.thenReturn(actionFuture);
897+
when(client.delete(any())).thenReturn(actionFuture);
898+
899+
when(parser.parseWithSecrets(eq("foo"), eq(true), any(), any(), any(), anyLong(), anyLong())).thenReturn(watch);
900+
901+
// execute needs to fail
902+
doThrow(new EsRejectedExecutionException()).when(executor).execute(any());
903+
904+
Wid wid = new Wid(watch.id(), ZonedDateTime.now(ZoneOffset.UTC));
905+
906+
TriggeredWatch triggeredWatch = new TriggeredWatch(wid,
907+
new ScheduleTriggerEvent(ZonedDateTime.now(ZoneOffset.UTC), ZonedDateTime.now(ZoneOffset.UTC)));
908+
executionService.executeTriggeredWatches(Collections.singleton(triggeredWatch));
909+
910+
ArgumentCaptor<DeleteRequest> deleteCaptor = ArgumentCaptor.forClass(DeleteRequest.class);
911+
verify(client).delete(deleteCaptor.capture());
912+
assertThat(deleteCaptor.getValue().index(), equalTo(TriggeredWatchStoreField.INDEX_NAME));
913+
assertThat(deleteCaptor.getValue().id(), equalTo(wid.value()));
914+
915+
ArgumentCaptor<IndexRequest> watchHistoryCaptor = ArgumentCaptor.forClass(IndexRequest.class);
916+
verify(client, times(2)).index(watchHistoryCaptor.capture());
917+
List<IndexRequest> indexRequests = watchHistoryCaptor.getAllValues();
918+
919+
assertThat(indexRequests.get(0).id(), equalTo(indexRequests.get(1).id()));
920+
assertThat(indexRequests.get(0).source().utf8ToString(), containsString(ExecutionState.THREADPOOL_REJECTION.toString()));
921+
assertThat(indexRequests.get(1).source().utf8ToString(), containsString(ExecutionState.EXECUTED_MULTIPLE_TIMES.toString()));
863922
}
864923

865924
public void testThatTriggeredWatchDeletionHappensOnlyIfWatchExists() throws Exception {
@@ -898,7 +957,7 @@ public void testThatTriggeredWatchDeletionHappensOnlyIfWatchExists() throws Exce
898957
when(watch.status()).thenReturn(watchStatus);
899958

900959
executionService.execute(context);
901-
verify(triggeredWatchStore, never()).delete(any());
960+
verify(client, never()).delete(any());
902961
}
903962

904963
public void testThatSingleWatchCannotBeExecutedConcurrently() throws Exception {

0 commit comments

Comments
 (0)