-
Notifications
You must be signed in to change notification settings - Fork 25.2k
Refactor ShardFailure listener infrastructure #14206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,7 +41,6 @@ | |
import org.elasticsearch.common.Nullable; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.io.stream.BytesStreamOutput; | ||
import org.elasticsearch.common.lease.Releasable; | ||
import org.elasticsearch.common.lease.Releasables; | ||
import org.elasticsearch.common.logging.ESLogger; | ||
import org.elasticsearch.common.logging.support.LoggerMessageFormat; | ||
|
@@ -51,6 +50,7 @@ | |
import org.elasticsearch.common.unit.ByteSizeUnit; | ||
import org.elasticsearch.common.unit.ByteSizeValue; | ||
import org.elasticsearch.common.unit.TimeValue; | ||
import org.elasticsearch.common.util.Callback; | ||
import org.elasticsearch.common.util.concurrent.AbstractRefCounted; | ||
import org.elasticsearch.common.util.concurrent.AbstractRunnable; | ||
import org.elasticsearch.common.util.concurrent.FutureUtils; | ||
|
@@ -76,7 +76,6 @@ | |
import org.elasticsearch.index.merge.MergeStats; | ||
import org.elasticsearch.index.percolator.PercolateStats; | ||
import org.elasticsearch.index.percolator.PercolatorQueriesRegistry; | ||
import org.elasticsearch.index.query.IndexQueryParserService; | ||
import org.elasticsearch.index.recovery.RecoveryStats; | ||
import org.elasticsearch.index.refresh.RefreshStats; | ||
import org.elasticsearch.index.search.stats.SearchStats; | ||
|
@@ -167,7 +166,7 @@ public class IndexShard extends AbstractIndexShardComponent implements IndexSett | |
private final MeanMetric refreshMetric = new MeanMetric(); | ||
private final MeanMetric flushMetric = new MeanMetric(); | ||
|
||
private final ShardEngineFailListener failedEngineListener = new ShardEngineFailListener(); | ||
private final ShardEventListener shardEventListener = new ShardEventListener(); | ||
private volatile boolean flushOnClose = true; | ||
private volatile int flushThresholdOperations; | ||
private volatile ByteSizeValue flushThresholdSize; | ||
|
@@ -979,8 +978,8 @@ private void startScheduledTasksIfNeeded() { | |
|
||
public static final String INDEX_REFRESH_INTERVAL = "index.refresh_interval"; | ||
|
||
public void addFailedEngineListener(Engine.FailedEngineListener failedEngineListener) { | ||
this.failedEngineListener.delegates.add(failedEngineListener); | ||
public void addShardFailureCallback(Callback<ShardFailure> onShardFailure) { | ||
this.shardEventListener.delegates.add(onShardFailure); | ||
} | ||
|
||
/** Change the indexing and translog buffer sizes. If {@code IndexWriter} is currently using more than | ||
|
@@ -1369,15 +1368,16 @@ protected Engine getEngineOrNull() { | |
return this.currentEngineReference.get(); | ||
} | ||
|
||
class ShardEngineFailListener implements Engine.FailedEngineListener { | ||
private final CopyOnWriteArrayList<Engine.FailedEngineListener> delegates = new CopyOnWriteArrayList<>(); | ||
class ShardEventListener implements Engine.EventListener { | ||
private final CopyOnWriteArrayList<Callback<ShardFailure>> delegates = new CopyOnWriteArrayList<>(); | ||
|
||
// called by the current engine | ||
@Override | ||
public void onFailedEngine(ShardId shardId, String reason, @Nullable Throwable failure) { | ||
for (Engine.FailedEngineListener listener : delegates) { | ||
public void onFailedEngine(String reason, @Nullable Throwable failure) { | ||
final ShardFailure shardFailure = new ShardFailure(shardRouting, reason, failure, getIndexUUID()); | ||
for (Callback<ShardFailure> listener : delegates) { | ||
try { | ||
listener.onFailedEngine(shardId, reason, failure); | ||
listener.handle(shardFailure); | ||
} catch (Exception e) { | ||
logger.warn("exception while notifying engine failure", e); | ||
} | ||
|
@@ -1457,7 +1457,7 @@ protected void operationProcessed() { | |
}; | ||
return new EngineConfig(shardId, | ||
threadPool, indexingService, indexSettings, warmer, store, deletionPolicy, mergePolicyConfig.getMergePolicy(), mergeSchedulerConfig, | ||
mapperService.indexAnalyzer(), similarityService.similarity(mapperService), codecService, failedEngineListener, translogRecoveryPerformer, indexCache.query(), cachingPolicy, translogConfig); | ||
mapperService.indexAnalyzer(), similarityService.similarity(mapperService), codecService, shardEventListener, translogRecoveryPerformer, indexCache.query(), cachingPolicy, translogConfig); | ||
} | ||
|
||
private static class IndexShardOperationCounter extends AbstractRefCounted { | ||
|
@@ -1571,4 +1571,23 @@ public void onAfter() { | |
return false; | ||
} | ||
|
||
/** | ||
* Simple struct encapsulating a shard failure | ||
* @see IndexShard#addShardFailureCallback(Callback) | ||
*/ | ||
public static final class ShardFailure { | ||
public final ShardRouting routing; | ||
public final String reason; | ||
@Nullable | ||
public final Throwable cause; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we mark this as nullable? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is this for? It doesn't buy me anything except bloat. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We use @Nullable everywhere (including in […]). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we should have requirements like this when writing new code... I use getFoo everywhere when I add it; even if we are not consistent, at least I am not adding any new unneeded annotations. |
||
public final String indexUUID; | ||
|
||
public ShardFailure(ShardRouting routing, String reason, @Nullable Throwable cause, String indexUUID) { | ||
this.routing = routing; | ||
this.reason = reason; | ||
this.cause = cause; | ||
this.indexUUID = indexUUID; | ||
} | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,10 +41,10 @@ | |
import org.elasticsearch.common.lucene.Lucene; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.unit.TimeValue; | ||
import org.elasticsearch.common.util.Callback; | ||
import org.elasticsearch.common.util.concurrent.ConcurrentCollections; | ||
import org.elasticsearch.index.IndexService; | ||
import org.elasticsearch.index.IndexShardAlreadyExistsException; | ||
import org.elasticsearch.index.engine.Engine; | ||
import org.elasticsearch.index.mapper.DocumentMapper; | ||
import org.elasticsearch.index.mapper.MapperService; | ||
import org.elasticsearch.index.settings.IndexSettingsService; | ||
|
@@ -98,7 +98,7 @@ static class FailedShard { | |
} | ||
|
||
private final Object mutex = new Object(); | ||
private final FailedEngineHandler failedEngineHandler = new FailedEngineHandler(); | ||
private final FailedShardHandler failedShardHandler = new FailedShardHandler(); | ||
|
||
private final boolean sendRefreshMapping; | ||
|
||
|
@@ -381,7 +381,7 @@ private void applyMappings(ClusterChangedEvent event) { | |
// so this failure typically means wrong node level configuration or something similar | ||
for (IndexShard indexShard : indexService) { | ||
ShardRouting shardRouting = indexShard.routingEntry(); | ||
failAndRemoveShard(shardRouting, indexService, true, "failed to update mappings", t); | ||
failAndRemoveShard(shardRouting, indexService.indexUUID(), indexService, true, "failed to update mappings", t); | ||
} | ||
} | ||
} | ||
|
@@ -637,11 +637,11 @@ private void applyInitializingShard(final ClusterState state, final IndexMetaDat | |
} | ||
IndexShard indexShard = indexService.createShard(shardId, shardRouting); | ||
indexShard.updateRoutingEntry(shardRouting, state.blocks().disableStatePersistence() == false); | ||
indexShard.addFailedEngineListener(failedEngineHandler); | ||
indexShard.addShardFailureCallback(failedShardHandler); | ||
} catch (IndexShardAlreadyExistsException e) { | ||
// ignore this, the method call can happen several times | ||
} catch (Throwable e) { | ||
failAndRemoveShard(shardRouting, indexService, true, "failed to create shard", e); | ||
failAndRemoveShard(shardRouting, indexService.indexUUID(), indexService, true, "failed to create shard", e); | ||
return; | ||
} | ||
} | ||
|
@@ -768,7 +768,7 @@ public void onRecoveryFailure(RecoveryState state, RecoveryFailedException e, bo | |
|
||
private void handleRecoveryFailure(IndexService indexService, ShardRouting shardRouting, boolean sendShardFailure, Throwable failure) { | ||
synchronized (mutex) { | ||
failAndRemoveShard(shardRouting, indexService, sendShardFailure, "failed recovery", failure); | ||
failAndRemoveShard(shardRouting, indexService.indexUUID(), indexService, sendShardFailure, "failed recovery", failure); | ||
} | ||
} | ||
|
||
|
@@ -802,8 +802,10 @@ private void deleteIndex(String index, String reason) { | |
|
||
} | ||
|
||
private void failAndRemoveShard(ShardRouting shardRouting, IndexService indexService, boolean sendShardFailure, String message, @Nullable Throwable failure) { | ||
if (indexService.hasShard(shardRouting.getId())) { | ||
private void failAndRemoveShard(ShardRouting shardRouting, String indexUUID, @Nullable IndexService indexService, boolean sendShardFailure, String message, @Nullable Throwable failure) { | ||
if (indexService != null && indexService.hasShard(shardRouting.getId())) { | ||
// if the indexService is null we can't remove the shard, that's fine since we might have a failure | ||
// when the index is remove and then we already removed the index service for that shard... | ||
try { | ||
indexService.removeShard(shardRouting.getId(), message); | ||
} catch (ShardNotFoundException e) { | ||
|
@@ -813,7 +815,7 @@ private void failAndRemoveShard(ShardRouting shardRouting, IndexService indexSer | |
} | ||
} | ||
if (sendShardFailure) { | ||
sendFailShard(shardRouting, indexService.indexUUID(), message, failure); | ||
sendFailShard(shardRouting, indexUUID, message, failure); | ||
} | ||
} | ||
|
||
|
@@ -827,29 +829,14 @@ private void sendFailShard(ShardRouting shardRouting, String indexUUID, String m | |
} | ||
} | ||
|
||
private class FailedEngineHandler implements Engine.FailedEngineListener { | ||
private class FailedShardHandler implements Callback<IndexShard.ShardFailure> { | ||
@Override | ||
public void onFailedEngine(final ShardId shardId, final String reason, final @Nullable Throwable failure) { | ||
ShardRouting shardRouting = null; | ||
final IndexService indexService = indicesService.indexService(shardId.index().name()); | ||
if (indexService != null) { | ||
IndexShard indexShard = indexService.getShardOrNull(shardId.id()); | ||
if (indexShard != null) { | ||
shardRouting = indexShard.routingEntry(); | ||
} | ||
} | ||
if (shardRouting == null) { | ||
logger.warn("[{}][{}] engine failed, but can't find index shard. failure reason: [{}]", failure, | ||
shardId.index().name(), shardId.id(), reason); | ||
return; | ||
} | ||
final ShardRouting fShardRouting = shardRouting; | ||
threadPool.generic().execute(new Runnable() { | ||
@Override | ||
public void run() { | ||
synchronized (mutex) { | ||
failAndRemoveShard(fShardRouting, indexService, true, "engine failure, reason [" + reason + "]", failure); | ||
} | ||
public void handle(final IndexShard.ShardFailure shardFailure) { | ||
final IndexService indexService = indicesService.indexService(shardFailure.routing.shardId().index().name()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We used to protect against null here -> I think we should still do this? I'm thinking about failures that happen during index deletion. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Look again: we only checked for null to get the ShardRouting... not needed here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we'll run into an NPE in the failAndRemoveShard code:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't get why this should be null. I mean, if it's null it's an error condition and we just throw an NPE? How can we fail and have no IndexService for it? Something is messed up here and wrong. I don't think we should check for null here... the lenient code before seems wrong? |
||
final ShardRouting shardRouting = shardFailure.routing; | ||
threadPool.generic().execute(() -> { | ||
synchronized (mutex) { | ||
failAndRemoveShard(shardRouting, shardFailure.indexUUID, indexService, true, "shard failure, reason [" + shardFailure.reason + "]", shardFailure.cause); | ||
} | ||
}); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks.