
Commit a6e6c4e

[CORE] Ensure shards are deleted under lock on close
Today there is a race condition between the actual deletion of a shard and the release of its lock in the store. This race can cause rare imports of dangling indices if the cluster state update loop tries to import the dangling index in that particular window. This commit adds more safety to the import of dangling indices and removes the race condition by holding on to the lock while the store is closing and the listener is notified.
1 parent abc0bc4 commit a6e6c4e
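
Put together, the change to Store.java below amounts to a single ordering guarantee in the shard store's cleanup path: the close listener is notified while the shard lock is still held, and the lock is released only in the outermost finally block. The following is a minimal sketch of that ordering, assuming an illustrative class with the same fields that Store.closeInternal uses (directory, listener, shardId, shardLock); it is not the actual implementation.

import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.env.ShardLock;
import org.elasticsearch.index.shard.ShardId;

// Illustrative sketch only - the real logic lives in org.elasticsearch.index.store.Store.
class ShardStoreSketch {

    interface OnCloseListener {
        void onClose(ShardId shardId); // e.g. delete the shard data from disk
    }

    private Directory directory;   // the shard's Lucene directory
    private ShardLock shardLock;   // lock guarding the shard's on-disk data
    private ShardId shardId;
    private OnCloseListener listener;

    void closeInternal() {
        try {
            directory.close();
        } catch (Exception e) {
            // closing the directory must not prevent the cleanup below
        } finally {
            try {
                if (listener != null) {
                    // the listener runs while the shard lock is still held,
                    // so it can safely operate on the shard's files
                    listener.onClose(shardId);
                }
            } catch (Exception ex) {
                // listener failures must not leak the shard lock
            } finally {
                // only after this point can another thread (e.g. the dangling
                // index import in LocalGatewayMetaState) acquire the lock
                IOUtils.closeWhileHandlingException(shardLock);
            }
        }
    }
}
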

4 files changed: +32 -16 lines changed

src/main/java/org/elasticsearch/gateway/local/state/meta/LocalGatewayMetaState.java (+20 -6)
@@ -45,6 +45,7 @@
 import org.elasticsearch.common.util.concurrent.FutureUtils;
 import org.elasticsearch.common.xcontent.*;
 import org.elasticsearch.env.NodeEnvironment;
+import org.elasticsearch.env.ShardLock;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.threadpool.ThreadPool;
 
@@ -287,6 +288,25 @@ public void clusterChanged(ClusterChangedEvent event) {
                 }
                 final IndexMetaData indexMetaData = loadIndexState(indexName);
                 if (indexMetaData != null) {
+                    final Index index = new Index(indexName);
+                    try {
+                        // the index deletion might not have worked due to shards still being locked
+                        // we have three cases here:
+                        //  - we acquired all shards locks here --> we can import the dangeling index
+                        //  - we failed to acquire the lock --> somebody else uses it - DON'T IMPORT
+                        //  - we acquired successfully but the lock list is empty --> no shards present - DON'T IMPORT
+                        // in the last case we should in-fact try to delete the directory since it might be a leftover...
+                        final List<ShardLock> shardLocks = nodeEnv.lockAllForIndex(index);
+                        if (shardLocks.isEmpty()) {
+                            // no shards - try to remove the directory
+                            nodeEnv.deleteIndexDirectorySafe(index);
+                            continue;
+                        }
+                        IOUtils.closeWhileHandlingException(shardLocks);
+                    } catch (IOException ex) {
+                        logger.warn("[{}] skipping locked dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state is set to [{}]", ex, indexName, autoImportDangled);
+                        continue;
+                    }
                     if(autoImportDangled.shouldImport()){
                         logger.info("[{}] dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state [{}]", indexName, autoImportDangled);
                         danglingIndices.put(indexName, new DanglingIndex(indexName, null));
@@ -300,12 +320,6 @@ public void clusterChanged(ClusterChangedEvent event) {
                             logger.warn("[{}] failed to delete dangling index", ex, indexName);
                         }
                     } else {
-                        try { // the index deletion might not have worked due to shards still being locked
-                            IOUtils.closeWhileHandlingException(nodeEnv.lockAllForIndex(new Index(indexName)));
-                        } catch (IOException ex) {
-                            logger.warn("[{}] skipping locked dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state is set to [{}]", ex, indexName, autoImportDangled);
-                            continue;
-                        }
                         logger.info("[{}] dangling index, exists on local file system, but not in cluster metadata, scheduling to delete in [{}], auto import to cluster state [{}]", indexName, danglingTimeout, autoImportDangled);
                         danglingIndices.put(indexName, new DanglingIndex(indexName, threadPool.schedule(danglingTimeout, ThreadPool.Names.SAME, new RemoveDanglingIndex(indexName))));
                     }
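
The new logic above probes the shard locks before deciding what to do with an index that exists on disk but not in the cluster metadata. The following is a minimal sketch of that three-case decision, using the same NodeEnvironment calls as the diff; the helper class, method name, and boolean result are illustrative, not part of the commit.

import java.io.IOException;
import java.util.List;

import org.apache.lucene.util.IOUtils;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.env.ShardLock;
import org.elasticsearch.index.Index;

// Illustrative helper - the real code runs inline in LocalGatewayMetaState.clusterChanged(...).
final class DanglingIndexCheckSketch {

    // Returns true if the dangling index may be imported; throws IOException if someone
    // else still holds one of the shard locks (case 2: don't import, skip the index).
    static boolean canImportDanglingIndex(NodeEnvironment nodeEnv, String indexName) throws IOException {
        final Index index = new Index(indexName);
        final List<ShardLock> shardLocks = nodeEnv.lockAllForIndex(index);
        try {
            if (shardLocks.isEmpty()) {
                // case 3: locks acquired but no shard data present - the directory is a
                // leftover, remove it and don't import
                nodeEnv.deleteIndexDirectorySafe(index);
                return false;
            }
            // case 1: all shard locks acquired and shards exist - safe to import
            return true;
        } finally {
            // the locks were only a probe; release them again before the import proceeds
            IOUtils.closeWhileHandlingException(shardLocks);
        }
    }
}
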

src/main/java/org/elasticsearch/index/store/Store.java (+6 -8)
@@ -364,15 +364,13 @@ private void closeInternal() {
             logger.debug("failed to close directory", e);
         } finally {
             try {
-                IOUtils.closeWhileHandlingException(shardLock);
-            } finally {
-                try {
-                    if (listener != null) {
-                        listener.onClose(shardId);
-                    }
-                } catch (Exception ex){
-                    logger.debug("OnCloseListener threw an exception", ex);
+                if (listener != null) {
+                    listener.onClose(shardId);
                 }
+            } catch (Exception ex){
+                logger.debug("OnCloseListener threw an exception", ex);
+            } finally {
+                IOUtils.closeWhileHandlingException(shardLock);
             }
 
 

src/main/java/org/elasticsearch/indices/IndicesService.java (+3 -1)
@@ -113,7 +113,9 @@ public static interface IndexCloseListener {
     public void onAllShardsClosed(Index index, List<Throwable> failures);
 
     /**
-     * Invoked once the last resource using the given shard ID is released
+     * Invoked once the last resource using the given shard ID is released.
+     * Yet, this method is called while still holding the shards lock such that
+     * operations on the shards data can safely be executed in this callback.
      */
     public void onShardClosed(ShardId shardId);
 

src/main/java/org/elasticsearch/indices/InternalIndicesService.java (+3 -1)
@@ -20,6 +20,7 @@
 package org.elasticsearch.indices;
 
 import com.google.common.collect.*;
+import org.apache.lucene.util.IOUtils;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchIllegalStateException;
 import org.elasticsearch.action.admin.indices.stats.CommonStats;
@@ -357,7 +358,8 @@ public void onAllShardsClosed(Index index, List<Throwable> failures) {
         @Override
         public void onShardClosed(ShardId shardId) {
             try {
-                nodeEnv.deleteShardDirectorySafe(shardId);
+                // this is called under the shard lock - we can safely delete it
+                IOUtils.rm(nodeEnv.shardPaths(shardId));
                 logger.debug("deleted shard [{}] from filesystem", shardId);
             } catch (IOException e) {
                 logger.warn("Can't delete shard {} ", e, shardId);
