Skip to content

Commit 3d82a30

Browse files
drop index.shard.check_on_startup: fix (#32279)
drop `index.shard.check_on_startup: fix` Relates #31389
1 parent d4f2b5b commit 3d82a30

File tree

5 files changed

+197
-39
lines changed

5 files changed

+197
-39
lines changed

docs/reference/index-modules.asciidoc

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,7 @@ corruption is detected, it will prevent the shard from being opened. Accepts:
6565

6666
`fix`::
6767

68-
Check for both physical and logical corruption. Segments that were reported
69-
as corrupted will be automatically removed. This option *may result in data loss*.
70-
Use with extreme caution!
68+
The same as `false`. This option is deprecated and will be completely removed in 7.0.
7169

7270
WARNING: Expert only. Checking shards may take a lot of time on large indices.
7371
--

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,10 @@ public IndexShard(
301301
logger.debug("state: [CREATED]");
302302

303303
this.checkIndexOnStartup = indexSettings.getValue(IndexSettings.INDEX_CHECK_ON_STARTUP);
304+
if ("fix".equals(checkIndexOnStartup)) {
305+
deprecationLogger.deprecated("Setting [index.shard.check_on_startup] is set to deprecated value [fix], "
306+
+ "which has no effect and will not be accepted in future");
307+
}
304308
this.translogConfig = new TranslogConfig(shardId, shardPath().resolveTranslog(), indexSettings, bigArrays);
305309
final String aId = shardRouting.allocationId().getId();
306310
this.globalCheckpointListeners = new GlobalCheckpointListeners(shardId, threadPool.executor(ThreadPool.Names.LISTENER), logger);
@@ -1325,7 +1329,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
13251329
}
13261330
recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX);
13271331
// also check here, before we apply the translog
1328-
if (Booleans.isTrue(checkIndexOnStartup)) {
1332+
if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) {
13291333
try {
13301334
checkIndex();
13311335
} catch (IOException ex) {
@@ -1933,6 +1937,9 @@ void checkIndex() throws IOException {
19331937
if (store.tryIncRef()) {
19341938
try {
19351939
doCheckIndex();
1940+
} catch (IOException e) {
1941+
store.markStoreCorrupted(e);
1942+
throw e;
19361943
} finally {
19371944
store.decRef();
19381945
}
@@ -1976,18 +1983,7 @@ private void doCheckIndex() throws IOException {
19761983
return;
19771984
}
19781985
logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
1979-
if ("fix".equals(checkIndexOnStartup)) {
1980-
if (logger.isDebugEnabled()) {
1981-
logger.debug("fixing index, writing new segments file ...");
1982-
}
1983-
store.exorciseIndex(status);
1984-
if (logger.isDebugEnabled()) {
1985-
logger.debug("index fixed, wrote new segments file \"{}\"", status.segmentsFileName);
1986-
}
1987-
} else {
1988-
// only throw a failure if we are not going to fix the index
1989-
throw new IllegalStateException("index check failure but can't fix it");
1990-
}
1986+
throw new IOException("index check failure");
19911987
}
19921988
}
19931989

server/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
134134
static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0
135135
static final int VERSION_START = 0;
136136
static final int VERSION = VERSION_WRITE_THROWABLE;
137-
static final String CORRUPTED = "corrupted_";
137+
// public is for test purposes
138+
public static final String CORRUPTED = "corrupted_";
138139
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING =
139140
Setting.timeSetting("index.store.stats_refresh_interval", TimeValue.timeValueSeconds(10), Property.IndexScope);
140141

@@ -360,18 +361,6 @@ public CheckIndex.Status checkIndex(PrintStream out) throws IOException {
360361
}
361362
}
362363

363-
/**
364-
* Repairs the index using the previous returned status from {@link #checkIndex(PrintStream)}.
365-
*/
366-
public void exorciseIndex(CheckIndex.Status status) throws IOException {
367-
metadataLock.writeLock().lock();
368-
try (CheckIndex checkIndex = new CheckIndex(directory)) {
369-
checkIndex.exorciseIndex(status);
370-
} finally {
371-
metadataLock.writeLock().unlock();
372-
}
373-
}
374-
375364
public StoreStats stats() throws IOException {
376365
ensureOpen();
377366
return new StoreStats(directory.estimateSize());

server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

Lines changed: 155 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.lucene.index.DirectoryReader;
2424
import org.apache.lucene.index.IndexCommit;
2525
import org.apache.lucene.index.IndexableField;
26+
import org.apache.lucene.index.IndexWriter;
2627
import org.apache.lucene.index.Term;
2728
import org.apache.lucene.search.IndexSearcher;
2829
import org.apache.lucene.search.TermQuery;
@@ -118,6 +119,7 @@
118119
import org.elasticsearch.snapshots.SnapshotId;
119120
import org.elasticsearch.snapshots.SnapshotInfo;
120121
import org.elasticsearch.snapshots.SnapshotShardFailure;
122+
import org.elasticsearch.test.CorruptionUtils;
121123
import org.elasticsearch.test.DummyShardLock;
122124
import org.elasticsearch.test.FieldMaskingReader;
123125
import org.elasticsearch.test.VersionUtils;
@@ -126,7 +128,11 @@
126128

127129
import java.io.IOException;
128130
import java.nio.charset.Charset;
131+
import java.nio.file.FileVisitResult;
132+
import java.nio.file.Files;
129133
import java.nio.file.Path;
134+
import java.nio.file.SimpleFileVisitor;
135+
import java.nio.file.attribute.BasicFileAttributes;
130136
import java.util.ArrayList;
131137
import java.util.Arrays;
132138
import java.util.Collections;
@@ -1239,7 +1245,7 @@ public String[] listAll() throws IOException {
12391245
};
12401246

12411247
try (Store store = createStore(shardId, new IndexSettings(metaData, Settings.EMPTY), directory)) {
1242-
IndexShard shard = newShard(shardRouting, shardPath, metaData, store,
1248+
IndexShard shard = newShard(shardRouting, shardPath, metaData, i -> store,
12431249
null, new InternalEngineFactory(), () -> {
12441250
}, EMPTY_EVENT_LISTENER);
12451251
AtomicBoolean failureCallbackTriggered = new AtomicBoolean(false);
@@ -2590,6 +2596,143 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept
25902596
closeShards(newShard);
25912597
}
25922598

2599+
public void testIndexCheckOnStartup() throws Exception {
2600+
final IndexShard indexShard = newStartedShard(true);
2601+
2602+
final long numDocs = between(10, 100);
2603+
for (long i = 0; i < numDocs; i++) {
2604+
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
2605+
}
2606+
indexShard.flush(new FlushRequest());
2607+
closeShards(indexShard);
2608+
2609+
final ShardPath shardPath = indexShard.shardPath();
2610+
2611+
final Path indexPath = corruptIndexFile(shardPath);
2612+
2613+
final AtomicInteger corruptedMarkerCount = new AtomicInteger();
2614+
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
2615+
@Override
2616+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
2617+
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
2618+
corruptedMarkerCount.incrementAndGet();
2619+
}
2620+
return FileVisitResult.CONTINUE;
2621+
}
2622+
};
2623+
Files.walkFileTree(indexPath, corruptedVisitor);
2624+
2625+
assertThat("corruption marker should not be there", corruptedMarkerCount.get(), equalTo(0));
2626+
2627+
final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
2628+
RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE
2629+
);
2630+
// start shard and perform index check on startup. It enforce shard to fail due to corrupted index files
2631+
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
2632+
.settings(Settings.builder()
2633+
.put(indexShard.indexSettings.getSettings())
2634+
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("true", "checksum")))
2635+
.build();
2636+
2637+
IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
2638+
null, null, indexShard.engineFactory,
2639+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2640+
2641+
final IndexShardRecoveryException indexShardRecoveryException =
2642+
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true));
2643+
assertThat(indexShardRecoveryException.getMessage(), equalTo("failed recovery"));
2644+
2645+
// check that corrupt marker is there
2646+
Files.walkFileTree(indexPath, corruptedVisitor);
2647+
assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1));
2648+
2649+
try {
2650+
closeShards(corruptedShard);
2651+
} catch (RuntimeException e) {
2652+
assertThat(e.getMessage(), equalTo("CheckIndex failed"));
2653+
}
2654+
}
2655+
2656+
public void testShardDoesNotStartIfCorruptedMarkerIsPresent() throws Exception {
2657+
final IndexShard indexShard = newStartedShard(true);
2658+
2659+
final long numDocs = between(10, 100);
2660+
for (long i = 0; i < numDocs; i++) {
2661+
indexDoc(indexShard, "_doc", Long.toString(i), "{}");
2662+
}
2663+
indexShard.flush(new FlushRequest());
2664+
closeShards(indexShard);
2665+
2666+
final ShardPath shardPath = indexShard.shardPath();
2667+
2668+
final ShardRouting shardRouting = ShardRoutingHelper.initWithSameId(indexShard.routingEntry(),
2669+
RecoverySource.StoreRecoverySource.EXISTING_STORE_INSTANCE
2670+
);
2671+
final IndexMetaData indexMetaData = indexShard.indexSettings().getIndexMetaData();
2672+
2673+
final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME);
2674+
2675+
// create corrupted marker
2676+
final String corruptionMessage = "fake ioexception";
2677+
try(Store store = createStore(indexShard.indexSettings(), shardPath)) {
2678+
store.markStoreCorrupted(new IOException(corruptionMessage));
2679+
}
2680+
2681+
// try to start shard on corrupted files
2682+
final IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetaData,
2683+
null, null, indexShard.engineFactory,
2684+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2685+
2686+
final IndexShardRecoveryException exception1 = expectThrows(IndexShardRecoveryException.class,
2687+
() -> newStartedShard(p -> corruptedShard, true));
2688+
assertThat(exception1.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)"));
2689+
closeShards(corruptedShard);
2690+
2691+
final AtomicInteger corruptedMarkerCount = new AtomicInteger();
2692+
final SimpleFileVisitor<Path> corruptedVisitor = new SimpleFileVisitor<Path>() {
2693+
@Override
2694+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
2695+
if (Files.isRegularFile(file) && file.getFileName().toString().startsWith(Store.CORRUPTED)) {
2696+
corruptedMarkerCount.incrementAndGet();
2697+
}
2698+
return FileVisitResult.CONTINUE;
2699+
}
2700+
};
2701+
Files.walkFileTree(indexPath, corruptedVisitor);
2702+
assertThat("store has to be marked as corrupted", corruptedMarkerCount.get(), equalTo(1));
2703+
2704+
// try to start another time shard on corrupted files
2705+
final IndexShard corruptedShard2 = newShard(shardRouting, shardPath, indexMetaData,
2706+
null, null, indexShard.engineFactory,
2707+
indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
2708+
2709+
final IndexShardRecoveryException exception2 = expectThrows(IndexShardRecoveryException.class,
2710+
() -> newStartedShard(p -> corruptedShard2, true));
2711+
assertThat(exception2.getCause().getMessage(), equalTo(corruptionMessage + " (resource=preexisting_corruption)"));
2712+
closeShards(corruptedShard2);
2713+
2714+
// check that corrupt marker is there
2715+
corruptedMarkerCount.set(0);
2716+
Files.walkFileTree(indexPath, corruptedVisitor);
2717+
assertThat("store still has a single corrupt marker", corruptedMarkerCount.get(), equalTo(1));
2718+
}
2719+
2720+
private Path corruptIndexFile(ShardPath shardPath) throws IOException {
2721+
final Path indexPath = shardPath.getDataPath().resolve(ShardPath.INDEX_FOLDER_NAME);
2722+
final Path[] filesToCorrupt =
2723+
Files.walk(indexPath)
2724+
.filter(p -> {
2725+
final String name = p.getFileName().toString();
2726+
return Files.isRegularFile(p)
2727+
&& name.startsWith("extra") == false // Skip files added by Lucene's ExtrasFS
2728+
&& IndexWriter.WRITE_LOCK_NAME.equals(name) == false
2729+
&& name.startsWith("segments_") == false && name.endsWith(".si") == false;
2730+
})
2731+
.toArray(Path[]::new);
2732+
CorruptionUtils.corruptFile(random(), filesToCorrupt);
2733+
return indexPath;
2734+
}
2735+
25932736
/**
25942737
* Simulates a scenario that happens when we are async fetching snapshot metadata from GatewayService
25952738
* and checking index concurrently. This should always be possible without any exception.
@@ -2613,7 +2756,7 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
26132756
final IndexMetaData indexMetaData = IndexMetaData.builder(indexShard.indexSettings().getIndexMetaData())
26142757
.settings(Settings.builder()
26152758
.put(indexShard.indexSettings.getSettings())
2616-
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum", "fix")))
2759+
.put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), randomFrom("false", "true", "checksum")))
26172760
.build();
26182761
final IndexShard newShard = newShard(shardRouting, indexShard.shardPath(), indexMetaData,
26192762
null, null, indexShard.engineFactory, indexShard.getGlobalCheckpointSyncer(), EMPTY_EVENT_LISTENER);
@@ -2655,6 +2798,16 @@ public void testReadSnapshotAndCheckIndexConcurrently() throws Exception {
26552798
closeShards(newShard);
26562799
}
26572800

2801+
public void testCheckOnStartupDeprecatedValue() throws Exception {
2802+
final Settings settings = Settings.builder().put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "fix").build();
2803+
2804+
final IndexShard newShard = newShard(true, settings);
2805+
closeShards(newShard);
2806+
2807+
assertWarnings("Setting [index.shard.check_on_startup] is set to deprecated value [fix], "
2808+
+ "which has no effect and will not be accepted in future");
2809+
}
2810+
26582811
class Result {
26592812
private final int localCheckpoint;
26602813
private final int maxSeqNo;

test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import org.elasticsearch.cluster.routing.ShardRoutingHelper;
3333
import org.elasticsearch.cluster.routing.ShardRoutingState;
3434
import org.elasticsearch.cluster.routing.TestShardRouting;
35+
import org.elasticsearch.common.CheckedFunction;
3536
import org.elasticsearch.common.Nullable;
3637
import org.elasticsearch.common.bytes.BytesArray;
3738
import org.elasticsearch.common.lucene.uid.Versions;
@@ -156,7 +157,6 @@ public Settings threadPoolSettings() {
156157
return Settings.EMPTY;
157158
}
158159

159-
160160
protected Store createStore(IndexSettings indexSettings, ShardPath shardPath) throws IOException {
161161
return createStore(shardPath.getShardId(), indexSettings, newFSDirectory(shardPath.resolveIndex()));
162162
}
@@ -169,7 +169,6 @@ public Directory newDirectory() throws IOException {
169169
}
170170
};
171171
return new Store(shardId, indexSettings, directoryService, new DummyShardLock(shardId));
172-
173172
}
174173

175174
/**
@@ -179,7 +178,17 @@ public Directory newDirectory() throws IOException {
179178
* another shard)
180179
*/
181180
protected IndexShard newShard(boolean primary) throws IOException {
182-
return newShard(primary, Settings.EMPTY, new InternalEngineFactory());
181+
return newShard(primary, Settings.EMPTY);
182+
}
183+
184+
/**
185+
* Creates a new initializing shard. The shard will have its own unique data path.
186+
*
187+
* @param primary indicates whether to a primary shard (ready to recover from an empty store) or a replica (ready to recover from
188+
* another shard)
189+
*/
190+
protected IndexShard newShard(final boolean primary, final Settings settings) throws IOException {
191+
return newShard(primary, settings, new InternalEngineFactory());
183192
}
184193

185194
/**
@@ -318,23 +327,25 @@ protected IndexShard newShard(ShardRouting routing, IndexMetaData indexMetaData,
318327
* @param routing shard routing to use
319328
* @param shardPath path to use for shard data
320329
* @param indexMetaData indexMetaData for the shard, including any mapping
321-
* @param store an optional custom store to use. If null a default file based store will be created
330+
* @param storeProvider an optional custom store provider to use. If null a default file based store will be created
322331
* @param indexSearcherWrapper an optional wrapper to be used during searchers
323332
* @param globalCheckpointSyncer callback for syncing global checkpoints
324333
* @param indexEventListener index event listener
325334
* @param listeners an optional set of listeners to add to the shard
326335
*/
327336
protected IndexShard newShard(ShardRouting routing, ShardPath shardPath, IndexMetaData indexMetaData,
328-
@Nullable Store store, @Nullable IndexSearcherWrapper indexSearcherWrapper,
337+
@Nullable CheckedFunction<IndexSettings, Store, IOException> storeProvider,
338+
@Nullable IndexSearcherWrapper indexSearcherWrapper,
329339
@Nullable EngineFactory engineFactory,
330340
Runnable globalCheckpointSyncer,
331341
IndexEventListener indexEventListener, IndexingOperationListener... listeners) throws IOException {
332342
final Settings nodeSettings = Settings.builder().put("node.name", routing.currentNodeId()).build();
333343
final IndexSettings indexSettings = new IndexSettings(indexMetaData, nodeSettings);
334344
final IndexShard indexShard;
335-
if (store == null) {
336-
store = createStore(indexSettings, shardPath);
345+
if (storeProvider == null) {
346+
storeProvider = is -> createStore(is, shardPath);
337347
}
348+
final Store store = storeProvider.apply(indexSettings);
338349
boolean success = false;
339350
try {
340351
IndexCache indexCache = new IndexCache(indexSettings, new DisabledQueryCache(indexSettings), null);
@@ -424,7 +435,18 @@ protected IndexShard newStartedShard(final boolean primary) throws IOException {
424435
*/
425436
protected IndexShard newStartedShard(
426437
final boolean primary, final Settings settings, final EngineFactory engineFactory) throws IOException {
427-
IndexShard shard = newShard(primary, settings, engineFactory);
438+
return newStartedShard(p -> newShard(p, settings, engineFactory), primary);
439+
}
440+
441+
/**
442+
* creates a new empty shard and starts it.
443+
*
444+
* @param shardFunction shard factory function
445+
* @param primary controls whether the shard will be a primary or a replica.
446+
*/
447+
protected IndexShard newStartedShard(CheckedFunction<Boolean, IndexShard, IOException> shardFunction,
448+
boolean primary) throws IOException {
449+
IndexShard shard = shardFunction.apply(primary);
428450
if (primary) {
429451
recoverShardFromStore(shard);
430452
} else {

0 commit comments

Comments
 (0)