Skip to content

Commit d49f2c4

Browse files
andrershov (Andrey Ershov) authored and Andrey Ershov committed
Fail shard if IndexShard#storeStats runs into an IOException (#32241)
Fail shard if IndexShard#storeStats runs into an IOException. Closes #29008 (cherry picked from commit 33f11e6)
1 parent 864711b commit d49f2c4

File tree

5 files changed

+113
-20
lines changed

5 files changed

+113
-20
lines changed

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -949,6 +949,7 @@ public StoreStats storeStats() {
949949
try {
950950
return store.stats();
951951
} catch (IOException e) {
952+
failShard("Failing shard because of exception during storeStats", e);
952953
throw new ElasticsearchException("io exception while building 'store stats'", e);
953954
}
954955
}

server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

Lines changed: 83 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@
2727
import org.apache.lucene.search.TermQuery;
2828
import org.apache.lucene.search.TopDocs;
2929
import org.apache.lucene.store.AlreadyClosedException;
30+
import org.apache.lucene.store.Directory;
31+
import org.apache.lucene.store.FilterDirectory;
3032
import org.apache.lucene.store.IOContext;
3133
import org.apache.lucene.util.Constants;
3234
import org.elasticsearch.Version;
@@ -111,6 +113,7 @@
111113
import org.elasticsearch.test.FieldMaskingReader;
112114
import org.elasticsearch.test.VersionUtils;
113115
import org.elasticsearch.threadpool.ThreadPool;
116+
import org.elasticsearch.ElasticsearchException;
114117

115118
import java.io.IOException;
116119
import java.nio.charset.Charset;
@@ -137,6 +140,7 @@
137140
import java.util.function.BiConsumer;
138141
import java.util.function.Consumer;
139142
import java.util.function.LongFunction;
143+
import java.util.function.Supplier;
140144
import java.util.stream.Collectors;
141145
import java.util.stream.IntStream;
142146

@@ -1161,6 +1165,81 @@ public void testShardStats() throws IOException {
11611165
closeShards(shard);
11621166
}
11631167

1168+
1169+
public void testShardStatsWithFailures() throws IOException {
1170+
allowShardFailures();
1171+
final ShardId shardId = new ShardId("index", "_na_", 0);
1172+
final ShardRouting shardRouting = newShardRouting(shardId, "node", true, RecoverySource.StoreRecoverySource.EMPTY_STORE_INSTANCE, ShardRoutingState.INITIALIZING);
1173+
final NodeEnvironment.NodePath nodePath = new NodeEnvironment.NodePath(createTempDir());
1174+
1175+
1176+
ShardPath shardPath = new ShardPath(false, nodePath.resolve(shardId), nodePath.resolve(shardId), shardId);
1177+
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
1178+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
1179+
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
1180+
.build();
1181+
IndexMetaData metaData = IndexMetaData.builder(shardRouting.getIndexName())
1182+
.settings(settings)
1183+
.primaryTerm(0, 1)
1184+
.build();
1185+
1186+
// Override two Directory methods to make them fail at our will
1187+
// We use AtomicReference here to inject failure in the middle of the test not immediately
1188+
// We use Supplier<IOException> instead of IOException to produce meaningful stacktrace
1189+
// (remember stack trace is filled when exception is instantiated)
1190+
AtomicReference<Supplier<IOException>> exceptionToThrow = new AtomicReference<>();
1191+
AtomicBoolean throwWhenMarkingStoreCorrupted = new AtomicBoolean(false);
1192+
Directory directory = new FilterDirectory(newFSDirectory(shardPath.resolveIndex())) {
1193+
//fileLength method is called during storeStats try block
1194+
//it's not called when store is marked as corrupted
1195+
@Override
1196+
public long fileLength(String name) throws IOException {
1197+
Supplier<IOException> ex = exceptionToThrow.get();
1198+
if (ex == null) {
1199+
return super.fileLength(name);
1200+
} else {
1201+
throw ex.get();
1202+
}
1203+
}
1204+
1205+
//listAll method is called when marking store as corrupted
1206+
@Override
1207+
public String[] listAll() throws IOException {
1208+
Supplier<IOException> ex = exceptionToThrow.get();
1209+
if (throwWhenMarkingStoreCorrupted.get() && ex != null) {
1210+
throw ex.get();
1211+
} else {
1212+
return super.listAll();
1213+
}
1214+
}
1215+
};
1216+
1217+
try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) {
1218+
IndexShard shard = newShard(shardRouting, shardPath, metaData, store,
1219+
null, new InternalEngineFactory(), () -> {
1220+
}, EMPTY_EVENT_LISTENER);
1221+
AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false);
1222+
shard.addShardFailureCallback((ig)->failureCallbackTriggered.set(true));
1223+
1224+
recoverShardFromStore(shard);
1225+
1226+
final boolean corruptIndexException = randomBoolean();
1227+
1228+
if (corruptIndexException) {
1229+
exceptionToThrow.set(() -> new CorruptIndexException("Test CorruptIndexException", "Test resource"));
1230+
throwWhenMarkingStoreCorrupted.set(randomBoolean());
1231+
} else {
1232+
exceptionToThrow.set(() -> new IOException("Test IOException"));
1233+
}
1234+
ElasticsearchException e = expectThrows(ElasticsearchException.class, shard::storeStats);
1235+
assertTrue(failureCallbackTriggered.get());
1236+
1237+
if (corruptIndexException && !throwWhenMarkingStoreCorrupted.get()) {
1238+
assertTrue(store.isMarkedCorrupted());
1239+
}
1240+
}
1241+
}
1242+
11641243
public void testRefreshMetric() throws IOException {
11651244
IndexShard shard = newStartedShard();
11661245
assertThat(shard.refreshStats().getTotal(), equalTo(2L)); // refresh on: finalize and end of recovery
@@ -1867,6 +1946,7 @@ public IndexSearcher wrap(IndexSearcher searcher) throws EngineException {
18671946
ShardRoutingHelper.initWithSameId(shard.routingEntry(), RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE),
18681947
shard.shardPath(),
18691948
shard.indexSettings().getIndexMetaData(),
1949+
null,
18701950
wrapper,
18711951
new InternalEngineFactory(),
18721952
() -> {},
@@ -2018,6 +2098,7 @@ public IndexSearcher wrap(IndexSearcher searcher) throws EngineException {
20182098
ShardRoutingHelper.initWithSameId(shard.routingEntry(), RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE),
20192099
shard.shardPath(),
20202100
shard.indexSettings().getIndexMetaData(),
2101+
null,
20212102
wrapper,
20222103
new InternalEngineFactory(),
20232104
() -> {},
@@ -2504,7 +2585,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
25042585
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix")))
25052586
.build();
25062587
final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData,
2507-
null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2588+
null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
25082589

25092590
Store.MetadataSnapshot storeFileMetaDatas = newShard.snapshotStoreMetadata();
25102591
assertTrue("at least 2 files, commit and data: " + storeFileMetaDatas.toString(), storeFileMetaDatas.size() > 1);
@@ -2861,7 +2942,7 @@ public void testFlushOnInactive() throws Exception {
28612942
ShardPath shardPath = new ShardPath(false, nodePath.resolve(shardId), nodePath.resolve(shardId), shardId);
28622943
AtomicBoolean markedInactive = new AtomicBoolean();
28632944
AtomicReference<IndexShard> primaryRef = new AtomicReference<>();
2864-
IndexShard primary = newShard(shardRouting, shardPath, metaData, null, new InternalEngineFactory(), () -> {
2945+
IndexShard primary = newShard(shardRouting, shardPath, metaData, null, null, new InternalEngineFactory(), () -> {
28652946
}, new IndexEventListener() {
28662947
@Override
28672948
public void onShardInactive(IndexShard indexShard) {

server/src/test/java/org/elasticsearch/repositories/blobstore/BlobStoreRepositoryRestoreTests.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@ public void testRestoreSnapshotWithExistingFiles() throws IOException {
105105
shard.shardPath(),
106106
shard.indexSettings().getIndexMetaData(),
107107
null,
108+
null,
108109
new InternalEngineFactory(),
109110
() -> {},
110111
EMPTY_EVENT_LISTENER);

test/framework/src/main/java/org/elasticsearch/index/replication/ESIndexLevelReplicationTestCase.java

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -259,13 +259,14 @@ assert shardRoutings().stream()
259259

260260
public synchronized IndexShard addReplicaWithExistingPath(final ShardPath shardPath, final String nodeId) throws IOException {
261261
final ShardRouting shardRouting = TestShardRouting.newShardRouting(
262-
shardId,
263-
nodeId,
264-
false, ShardRoutingState.INITIALIZING,
265-
RecoverySource.PeerRecoverySource.INSTANCE);
262+
shardId,
263+
nodeId,
264+
false, ShardRoutingState.INITIALIZING,
265+
RecoverySource.PeerRecoverySource.INSTANCE);
266266

267267
final IndexShard newReplica =
268-
newShard(shardRouting, shardPath, indexMetaData, null, getEngineFactory(shardRouting), () -> {}, EMPTY_EVENT_LISTENER);
268+
newShard(shardRouting, shardPath, indexMetaData, null, null, getEngineFactory(shardRouting),
269+
() -> {}, EMPTY_EVENT_LISTENER);
269270
replicas.add(newReplica);
270271
updateAllocationIDsOnPrimary();
271272
return newReplica;

test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java

Lines changed: 22 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -163,15 +163,20 @@ public Settings threadPoolSettings() {
163163
return Settings.EMPTY;
164164
}
165165

166-
private Store createStore(IndexSettings indexSettings, ShardPath shardPath) throws IOException {
167-
final ShardId shardId = shardPath.getShardId();
166+
167+
protected Store createStore(IndexSettings indexSettings, ShardPath shardPath) throws IOException {
168+
return createStore(shardPath.getShardId(), indexSettings, newFSDirectory(shardPath.resolveIndex()));
169+
}
170+
171+
protected Store createStore(ShardId shardId, IndexSettings indexSettings, Directory directory) throws IOException {
168172
final DirectoryService directoryService = new DirectoryService(shardId, indexSettings) {
169173
@Override
170174
public Directory newDirectory() throws IOException {
171-
return newFSDirectory(shardPath.resolveIndex());
175+
return directory;
172176
}
173177
};
174178
return new Store(shardId, indexSettings, directoryService, new DummyShardLock(shardId));
179+
175180
}
176181

177182
/**
@@ -284,29 +289,32 @@ protected IndexShard newShard(ShardRouting routing, IndexMetaData indexMetaData,
284289
final ShardId shardId = routing.shardId();
285290
final NodeEnvironment.NodePath nodePath = new NodeEnvironment.NodePath(createTempDir());
286291
ShardPath shardPath = new ShardPath(false, nodePath.resolve(shardId), nodePath.resolve(shardId), shardId);
287-
return newShard(routing, shardPath, indexMetaData, indexSearcherWrapper, engineFactory, globalCheckpointSyncer,
292+
return newShard(routing, shardPath, indexMetaData, null, indexSearcherWrapper, engineFactory, globalCheckpointSyncer,
288293
EMPTY_EVENT_LISTENER, listeners);
289294
}
290295

291296
/**
292297
* creates a new initializing shard.
293-
* @param routing shard routing to use
294-
* @param shardPath path to use for shard data
295-
* @param indexMetaData indexMetaData for the shard, including any mapping
296-
* @param indexSearcherWrapper an optional wrapper to be used during searchers
297-
* @param globalCheckpointSyncer callback for syncing global checkpoints
298-
* @param indexEventListener index even listener
299-
* @param listeners an optional set of listeners to add to the shard
298+
* @param routing shard routing to use
299+
* @param shardPath path to use for shard data
300+
* @param indexMetaData indexMetaData for the shard, including any mapping
301+
* @param store an optional custom store to use. If null a default file based store will be created
302+
* @param indexSearcherWrapper an optional wrapper to be used during searchers
303+
* @param globalCheckpointSyncer callback for syncing global checkpoints
304+
* @param indexEventListener index event listener
305+
* @param listeners an optional set of listeners to add to the shard
300306
*/
301307
protected IndexShard newShard(ShardRouting routing, ShardPath shardPath, IndexMetaData indexMetaData,
302-
@Nullable IndexSearcherWrapper indexSearcherWrapper,
308+
@Nullable Store store, @Nullable IndexSearcherWrapper indexSearcherWrapper,
303309
@Nullable EngineFactory engineFactory,
304310
Runnable globalCheckpointSyncer,
305311
IndexEventListener indexEventListener, IndexingOperationListener... listeners) throws IOException {
306312
final Settings nodeSettings = Settings.builder().put("node.name", routing.currentNodeId()).build();
307313
final IndexSettings indexSettings = new IndexSettings(indexMetaData, nodeSettings);
308314
final IndexShard indexShard;
309-
final Store store = createStore(indexSettings, shardPath);
315+
if (store == null) {
316+
store = createStore(indexSettings, shardPath);
317+
}
310318
boolean success = false;
311319
try {
312320
IndexCache indexCache = new IndexCache(indexSettings, new DisabledQueryCache(indexSettings), null);
@@ -357,6 +365,7 @@ protected IndexShard reinitShard(IndexShard current, ShardRouting routing, Index
357365
current.shardPath(),
358366
current.indexSettings().getIndexMetaData(),
359367
null,
368+
null,
360369
current.engineFactory,
361370
current.getGlobalCheckpointSyncer(),
362371
EMPTY_EVENT_LISTENER, listeners);

0 commit comments

Comments
 (0)