|
21 | 21 | import org.apache.logging.log4j.Logger;
|
22 | 22 | import org.apache.logging.log4j.message.ParameterizedMessage;
|
23 | 23 | import org.apache.lucene.store.AlreadyClosedException;
|
| 24 | +import org.elasticsearch.Assertions; |
24 | 25 | import org.elasticsearch.ElasticsearchException;
|
25 | 26 | import org.elasticsearch.ExceptionsHelper;
|
26 | 27 | import org.elasticsearch.action.ActionListener;
|
27 | 28 | import org.elasticsearch.action.UnavailableShardsException;
|
28 | 29 | import org.elasticsearch.action.support.ActiveShardCount;
|
29 | 30 | import org.elasticsearch.action.support.TransportActions;
|
| 31 | +import org.elasticsearch.cluster.action.shard.ShardStateAction; |
30 | 32 | import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
|
31 | 33 | import org.elasticsearch.cluster.routing.ShardRouting;
|
32 | 34 | import org.elasticsearch.common.Nullable;
|
33 | 35 | import org.elasticsearch.common.io.stream.StreamInput;
|
34 | 36 | import org.elasticsearch.index.seqno.SequenceNumbers;
|
35 | 37 | import org.elasticsearch.index.shard.ReplicationGroup;
|
36 | 38 | import org.elasticsearch.index.shard.ShardId;
|
| 39 | +import org.elasticsearch.node.NodeClosedException; |
37 | 40 | import org.elasticsearch.rest.RestStatus;
|
| 41 | +import org.elasticsearch.transport.TransportException; |
38 | 42 |
|
39 | 43 | import java.io.IOException;
|
40 | 44 | import java.util.ArrayList;
|
|
43 | 47 | import java.util.Locale;
|
44 | 48 | import java.util.concurrent.atomic.AtomicBoolean;
|
45 | 49 | import java.util.concurrent.atomic.AtomicInteger;
|
46 |
| -import java.util.function.Consumer; |
47 | 50 |
|
48 | 51 | public class ReplicationOperation<
|
49 | 52 | Request extends ReplicationRequest<Request>,
|
@@ -133,10 +136,7 @@ private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Replica
|
133 | 136 | for (String allocationId : replicationGroup.getUnavailableInSyncShards()) {
|
134 | 137 | pendingActions.incrementAndGet();
|
135 | 138 | replicasProxy.markShardCopyAsStaleIfNeeded(replicaRequest.shardId(), allocationId,
|
136 |
| - ReplicationOperation.this::decPendingAndFinishIfNeeded, |
137 |
| - ReplicationOperation.this::onPrimaryDemoted, |
138 |
| - throwable -> decPendingAndFinishIfNeeded() |
139 |
| - ); |
| 139 | + ActionListener.wrap(r -> decPendingAndFinishIfNeeded(), ReplicationOperation.this::onNoLongerPrimary)); |
140 | 140 | }
|
141 | 141 | }
|
142 | 142 |
|
@@ -192,20 +192,32 @@ public void onFailure(Exception replicaException) {
|
192 | 192 | shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false));
|
193 | 193 | }
|
194 | 194 | String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard);
|
195 |
| - replicasProxy.failShardIfNeeded(shard, message, |
196 |
| - replicaException, ReplicationOperation.this::decPendingAndFinishIfNeeded, |
197 |
| - ReplicationOperation.this::onPrimaryDemoted, throwable -> decPendingAndFinishIfNeeded()); |
| 195 | + replicasProxy.failShardIfNeeded(shard, message, replicaException, |
| 196 | + ActionListener.wrap(r -> decPendingAndFinishIfNeeded(), ReplicationOperation.this::onNoLongerPrimary)); |
198 | 197 | }
|
199 | 198 | });
|
200 | 199 | }
|
201 | 200 |
|
202 |
| - private void onPrimaryDemoted(Exception demotionFailure) { |
203 |
| - String primaryFail = String.format(Locale.ROOT, |
204 |
| - "primary shard [%s] was demoted while failing replica shard", |
205 |
| - primary.routingEntry()); |
206 |
| - // we are no longer the primary, fail ourselves and start over |
207 |
| - primary.failShard(primaryFail, demotionFailure); |
208 |
| - finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), primaryFail, demotionFailure)); |
| 201 | + private void onNoLongerPrimary(Exception failure) { |
| 202 | + final boolean nodeIsClosing = failure instanceof NodeClosedException || |
| 203 | + (failure instanceof TransportException && "TransportService is closed stopped can't send request".equals(failure.getMessage())); |
| 204 | + final String message; |
| 205 | + if (nodeIsClosing) { |
| 206 | + message = String.format(Locale.ROOT, |
| 207 | + "node with primary [%s] is shutting down while failing replica shard", primary.routingEntry()); |
| 208 | + // We prefer not to fail the primary to avoid an unnecessary warning log |
| 209 | + // when the node with the primary shard is gracefully shutting down. |
| 210 | + } else { |
| 211 | + if (Assertions.ENABLED) { |
| 212 | + if (failure instanceof ShardStateAction.NoLongerPrimaryShardException == false) { |
| 213 | + throw new AssertionError("unexpected failure", failure); |
| 214 | + } |
| 215 | + } |
| 216 | + // we are no longer the primary, fail ourselves and start over |
| 217 | + message = String.format(Locale.ROOT, "primary shard [%s] was demoted while failing replica shard", primary.routingEntry()); |
| 218 | + primary.failShard(message, failure); |
| 219 | + } |
| 220 | + finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), message, failure)); |
209 | 221 | }
|
210 | 222 |
|
211 | 223 | /**
|
@@ -365,31 +377,23 @@ void performOn(ShardRouting replica, RequestT replicaRequest, long globalCheckpo
|
365 | 377 | * of active shards. Whether a failure is needed is left up to the
|
366 | 378 | * implementation.
|
367 | 379 | *
|
368 |
| - * @param replica shard to fail |
369 |
| - * @param message a (short) description of the reason |
370 |
| - * @param exception the original exception which caused the ReplicationOperation to request the shard to be failed |
371 |
| - * @param onSuccess a callback to call when the shard has been successfully removed from the active set. |
372 |
| - * @param onPrimaryDemoted a callback to call when the shard can not be failed because the current primary has been demoted |
373 |
| - * by the master. |
374 |
| - * @param onIgnoredFailure a callback to call when failing a shard has failed, but it that failure can be safely ignored and the |
| 380 | + * @param replica shard to fail |
| 381 | + * @param message a (short) description of the reason |
| 382 | + * @param exception the original exception which caused the ReplicationOperation to request the shard to be failed |
| 383 | + * @param listener a listener that will be notified when the failing shard has been removed from the in-sync set |
375 | 384 | */
|
376 |
| - void failShardIfNeeded(ShardRouting replica, String message, Exception exception, Runnable onSuccess, |
377 |
| - Consumer<Exception> onPrimaryDemoted, Consumer<Exception> onIgnoredFailure); |
| 385 | + void failShardIfNeeded(ShardRouting replica, String message, Exception exception, ActionListener<Void> listener); |
378 | 386 |
|
379 | 387 | /**
|
380 | 388 | * Marks shard copy as stale if needed, removing its allocation id from
|
381 | 389 | * the set of in-sync allocation ids. Whether marking as stale is needed
|
382 | 390 | * is left up to the implementation.
|
383 | 391 | *
|
384 |
| - * @param shardId shard id |
385 |
| - * @param allocationId allocation id to remove from the set of in-sync allocation ids |
386 |
| - * @param onSuccess a callback to call when the allocation id has been successfully removed from the in-sync set. |
387 |
| - * @param onPrimaryDemoted a callback to call when the request failed because the current primary was already demoted |
388 |
| - * by the master. |
389 |
| - * @param onIgnoredFailure a callback to call when the request failed, but the failure can be safely ignored. |
| 392 | + * @param shardId shard id |
| 393 | + * @param allocationId allocation id to remove from the set of in-sync allocation ids |
| 394 | + * @param listener a listener that will be notified when the allocation id has been removed from the in-sync set |
390 | 395 | */
|
391 |
| - void markShardCopyAsStaleIfNeeded(ShardId shardId, String allocationId, Runnable onSuccess, |
392 |
| - Consumer<Exception> onPrimaryDemoted, Consumer<Exception> onIgnoredFailure); |
| 396 | + void markShardCopyAsStaleIfNeeded(ShardId shardId, String allocationId, ActionListener<Void> listener); |
393 | 397 | }
|
394 | 398 |
|
395 | 399 | /**
|
|
0 commit comments