Skip to content

Commit 9b6d37a

Browse files
committed
Do not start snapshots that are deleted during initialization (#27931)
When a new snapshot is created it is added to the cluster state as a snapshot-in-progress in INIT state, and the initialization is kicked off in a new runnable task by SnapshotService.beginSnapshot(). The initialization writes multiple files before updating the cluster state to change the snapshot-in-progress to STARTED state. This leaves a short window in which the snapshot could be deleted (say, because the snapshot is stuck in INIT state or because it takes too much time to upload all the initialization files for all snapshotted indices). If the INIT snapshot is deleted, the snapshot-in-progress becomes ABORTED, but once the initialization in SnapshotService.beginSnapshot() finishes, it is changed back to STARTED state again. This commit prevents an ABORTED snapshot from being started if it has been deleted during initialization. It also adds a test that would have failed with the previous behavior, and changes a few method names here and there.
1 parent 9875fd2 commit 9b6d37a

File tree

5 files changed

+181
-71
lines changed

5 files changed

+181
-71
lines changed

core/src/main/java/org/elasticsearch/snapshots/SnapshotException.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ public SnapshotException(final String repositoryName, final String snapshotName,
6666
}
6767

6868
public SnapshotException(final String repositoryName, final String snapshotName, final String msg, final Throwable cause) {
69-
super("[" + repositoryName + ":" + snapshotName + "]" + msg, cause);
69+
super("[" + repositoryName + ":" + snapshotName + "] " + msg, cause);
7070
this.repositoryName = repositoryName;
7171
this.snapshotName = snapshotName;
7272
}

core/src/main/java/org/elasticsearch/snapshots/SnapshotMissingException.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@
3030
public class SnapshotMissingException extends SnapshotException {
3131

3232
public SnapshotMissingException(final String repositoryName, final SnapshotId snapshotId, final Throwable cause) {
33-
super(repositoryName, snapshotId, " is missing", cause);
33+
super(repositoryName, snapshotId, "is missing", cause);
3434
}
3535

3636
public SnapshotMissingException(final String repositoryName, final SnapshotId snapshotId) {
37-
super(repositoryName, snapshotId, " is missing");
37+
super(repositoryName, snapshotId, "is missing");
3838
}
3939

4040
public SnapshotMissingException(final String repositoryName, final String snapshotName) {
41-
super(repositoryName, snapshotName, " is missing");
41+
super(repositoryName, snapshotName, "is missing");
4242
}
4343

4444
public SnapshotMissingException(StreamInput in) throws IOException {

core/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 99 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
2323
import org.apache.logging.log4j.message.ParameterizedMessage;
2424
import org.apache.logging.log4j.util.Supplier;
25+
import org.apache.lucene.util.SetOnce;
2526
import org.elasticsearch.ExceptionsHelper;
2627
import org.elasticsearch.cluster.ClusterChangedEvent;
2728
import org.elasticsearch.cluster.ClusterState;
@@ -146,7 +147,6 @@ protected void doStop() {
146147
} finally {
147148
shutdownLock.unlock();
148149
}
149-
150150
}
151151

152152
@Override
@@ -157,14 +157,16 @@ protected void doClose() {
157157
@Override
158158
public void applyClusterState(ClusterChangedEvent event) {
159159
try {
160-
SnapshotsInProgress prev = event.previousState().custom(SnapshotsInProgress.TYPE);
161-
SnapshotsInProgress curr = event.state().custom(SnapshotsInProgress.TYPE);
162-
163-
if ((prev == null && curr != null) || (prev != null && prev.equals(curr) == false)) {
160+
SnapshotsInProgress previousSnapshots = event.previousState().custom(SnapshotsInProgress.TYPE);
161+
SnapshotsInProgress currentSnapshots = event.state().custom(SnapshotsInProgress.TYPE);
162+
if ((previousSnapshots == null && currentSnapshots != null)
163+
|| (previousSnapshots != null && previousSnapshots.equals(currentSnapshots) == false)) {
164164
processIndexShardSnapshots(event);
165165
}
166-
String masterNodeId = event.state().nodes().getMasterNodeId();
167-
if (masterNodeId != null && masterNodeId.equals(event.previousState().nodes().getMasterNodeId()) == false) {
166+
167+
String previousMasterNodeId = event.previousState().nodes().getMasterNodeId();
168+
String currentMasterNodeId = event.state().nodes().getMasterNodeId();
169+
if (currentMasterNodeId != null && currentMasterNodeId.equals(previousMasterNodeId) == false) {
168170
syncShardStatsOnNewMaster(event);
169171
}
170172

@@ -281,17 +283,18 @@ private void processIndexShardSnapshots(ClusterChangedEvent event) {
281283
snapshotStatus.abort();
282284
break;
283285
case FINALIZE:
284-
logger.debug("[{}] trying to cancel snapshot on shard [{}] that is finalizing, letting it finish", entry.snapshot(), shard.key);
286+
logger.debug("[{}] trying to cancel snapshot on shard [{}] that is finalizing, " +
287+
"letting it finish", entry.snapshot(), shard.key);
285288
break;
286289
case DONE:
287-
logger.debug("[{}] trying to cancel snapshot on the shard [{}] that is already done, updating status on the master", entry.snapshot(), shard.key);
288-
updateIndexShardSnapshotStatus(entry.snapshot(), shard.key,
289-
new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
290+
logger.debug("[{}] trying to cancel snapshot on the shard [{}] that is already done, " +
291+
"updating status on the master", entry.snapshot(), shard.key);
292+
notifySuccessfulSnapshotShard(entry.snapshot(), shard.key, localNodeId, masterNode);
290293
break;
291294
case FAILURE:
292-
logger.debug("[{}] trying to cancel snapshot on the shard [{}] that has already failed, updating status on the master", entry.snapshot(), shard.key);
293-
updateIndexShardSnapshotStatus(entry.snapshot(), shard.key,
294-
new ShardSnapshotStatus(localNodeId, State.FAILED, snapshotStatus.failure()), masterNode);
295+
logger.debug("[{}] trying to cancel snapshot on the shard [{}] that has already failed, " +
296+
"updating status on the master", entry.snapshot(), shard.key);
297+
notifyFailedSnapshotShard(entry.snapshot(), shard.key, localNodeId, snapshotStatus.failure(), masterNode);
295298
break;
296299
default:
297300
throw new IllegalStateException("Unknown snapshot shard stage " + snapshotStatus.stage());
@@ -320,34 +323,47 @@ private void processIndexShardSnapshots(ClusterChangedEvent event) {
320323
if (newSnapshots.isEmpty() == false) {
321324
Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
322325
for (final Map.Entry<Snapshot, Map<ShardId, IndexShardSnapshotStatus>> entry : newSnapshots.entrySet()) {
323-
Map<String, IndexId> indicesMap = snapshotIndices.get(entry.getKey());
326+
final Snapshot snapshot = entry.getKey();
327+
final Map<String, IndexId> indicesMap = snapshotIndices.get(snapshot);
324328
assert indicesMap != null;
329+
325330
for (final Map.Entry<ShardId, IndexShardSnapshotStatus> shardEntry : entry.getValue().entrySet()) {
326331
final ShardId shardId = shardEntry.getKey();
327-
try {
328-
final IndexShard indexShard = indicesService.indexServiceSafe(shardId.getIndex()).getShardOrNull(shardId.id());
329-
final IndexId indexId = indicesMap.get(shardId.getIndexName());
330-
assert indexId != null;
331-
executor.execute(new AbstractRunnable() {
332-
@Override
333-
public void doRun() {
334-
snapshot(indexShard, entry.getKey(), indexId, shardEntry.getValue());
335-
updateIndexShardSnapshotStatus(entry.getKey(), shardId,
336-
new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
337-
}
332+
final IndexShard indexShard = indicesService.indexServiceSafe(shardId.getIndex()).getShardOrNull(shardId.id());
333+
final IndexId indexId = indicesMap.get(shardId.getIndexName());
334+
assert indexId != null;
335+
executor.execute(new AbstractRunnable() {
338336

339-
@Override
340-
public void onFailure(Exception e) {
341-
logger.warn((Supplier<?>) () -> new ParameterizedMessage("[{}] [{}] failed to create snapshot", shardId, entry.getKey()), e);
342-
updateIndexShardSnapshotStatus(entry.getKey(), shardId,
343-
new ShardSnapshotStatus(localNodeId, State.FAILED, ExceptionsHelper.detailedMessage(e)), masterNode);
344-
}
337+
final SetOnce<Exception> failure = new SetOnce<>();
345338

346-
});
347-
} catch (Exception e) {
348-
updateIndexShardSnapshotStatus(entry.getKey(), shardId,
349-
new ShardSnapshotStatus(localNodeId, State.FAILED, ExceptionsHelper.detailedMessage(e)), masterNode);
350-
}
339+
@Override
340+
public void doRun() {
341+
snapshot(indexShard, snapshot, indexId, shardEntry.getValue());
342+
}
343+
344+
@Override
345+
public void onFailure(Exception e) {
346+
logger.warn((Supplier<?>) () ->
347+
new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
348+
failure.set(e);
349+
}
350+
351+
@Override
352+
public void onRejection(Exception e) {
353+
failure.set(e);
354+
}
355+
356+
@Override
357+
public void onAfter() {
358+
final Exception exception = failure.get();
359+
if (exception != null) {
360+
final String failure = ExceptionsHelper.detailedMessage(exception);
361+
notifyFailedSnapshotShard(snapshot, shardId, localNodeId, failure, masterNode);
362+
} else {
363+
notifySuccessfulSnapshotShard(snapshot, shardId, localNodeId, masterNode);
364+
}
365+
}
366+
});
351367
}
352368
}
353369
}
@@ -360,34 +376,36 @@ public void onFailure(Exception e) {
360376
* @param snapshotStatus snapshot status
361377
*/
362378
private void snapshot(final IndexShard indexShard, final Snapshot snapshot, final IndexId indexId, final IndexShardSnapshotStatus snapshotStatus) {
363-
Repository repository = snapshotsService.getRepositoriesService().repository(snapshot.getRepository());
364-
ShardId shardId = indexShard.shardId();
365-
if (!indexShard.routingEntry().primary()) {
379+
final ShardId shardId = indexShard.shardId();
380+
if (indexShard.routingEntry().primary() == false) {
366381
throw new IndexShardSnapshotFailedException(shardId, "snapshot should be performed only on primary");
367382
}
368383
if (indexShard.routingEntry().relocating()) {
369384
// do not snapshot when in the process of relocation of primaries so we won't get conflicts
370385
throw new IndexShardSnapshotFailedException(shardId, "cannot snapshot while relocating");
371386
}
372-
if (indexShard.state() == IndexShardState.CREATED || indexShard.state() == IndexShardState.RECOVERING) {
387+
388+
final IndexShardState indexShardState = indexShard.state();
389+
if (indexShardState == IndexShardState.CREATED || indexShardState == IndexShardState.RECOVERING) {
373390
// shard has just been created, or still recovering
374391
throw new IndexShardSnapshotFailedException(shardId, "shard didn't fully recover yet");
375392
}
376393

394+
final Repository repository = snapshotsService.getRepositoriesService().repository(snapshot.getRepository());
377395
try {
378396
// we flush first to make sure we get the latest writes snapshotted
379397
try (Engine.IndexCommitRef snapshotRef = indexShard.acquireIndexCommit(true)) {
380398
repository.snapshotShard(indexShard, snapshot.getSnapshotId(), indexId, snapshotRef.getIndexCommit(), snapshotStatus);
381399
if (logger.isDebugEnabled()) {
382-
StringBuilder sb = new StringBuilder();
383-
sb.append(" index : version [").append(snapshotStatus.indexVersion()).append("], number_of_files [").append(snapshotStatus.numberOfFiles()).append("] with total_size [").append(new ByteSizeValue(snapshotStatus.totalSize())).append("]\n");
400+
StringBuilder details = new StringBuilder();
401+
details.append(" index : version [").append(snapshotStatus.indexVersion());
402+
details.append("], number_of_files [").append(snapshotStatus.numberOfFiles());
403+
details.append("] with total_size [").append(new ByteSizeValue(snapshotStatus.totalSize())).append("]\n");
384404
logger.debug("snapshot ({}) completed to {}, took [{}]\n{}", snapshot, repository,
385-
TimeValue.timeValueMillis(snapshotStatus.time()), sb);
405+
TimeValue.timeValueMillis(snapshotStatus.time()), details);
386406
}
387407
}
388-
} catch (SnapshotFailedEngineException e) {
389-
throw e;
390-
} catch (IndexShardSnapshotFailedException e) {
408+
} catch (SnapshotFailedEngineException | IndexShardSnapshotFailedException e) {
391409
throw e;
392410
} catch (Exception e) {
393411
throw new IndexShardSnapshotFailedException(shardId, "Failed to snapshot", e);
@@ -402,6 +420,7 @@ private void syncShardStatsOnNewMaster(ClusterChangedEvent event) {
402420
if (snapshotsInProgress == null) {
403421
return;
404422
}
423+
405424
final String localNodeId = event.state().nodes().getLocalNodeId();
406425
final DiscoveryNode masterNode = event.state().nodes().getMasterNode();
407426
for (SnapshotsInProgress.Entry snapshot : snapshotsInProgress.entries()) {
@@ -417,15 +436,16 @@ private void syncShardStatsOnNewMaster(ClusterChangedEvent event) {
417436
// Master knows about the shard and thinks it has not completed
418437
if (localShardStatus.stage() == Stage.DONE) {
419438
// but we think the shard is done - we need to make new master know that the shard is done
420-
logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard is done locally, updating status on the master", snapshot.snapshot(), shardId);
421-
updateIndexShardSnapshotStatus(snapshot.snapshot(), shardId,
422-
new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
439+
logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard is done locally, " +
440+
"updating status on the master", snapshot.snapshot(), shardId);
441+
notifySuccessfulSnapshotShard(snapshot.snapshot(), shardId, localNodeId, masterNode);
442+
423443
} else if (localShard.getValue().stage() == Stage.FAILURE) {
424444
// but we think the shard failed - we need to make new master know that the shard failed
425-
logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard failed locally, updating status on master", snapshot.snapshot(), shardId);
426-
updateIndexShardSnapshotStatus(snapshot.snapshot(), shardId,
427-
new ShardSnapshotStatus(localNodeId, State.FAILED, localShardStatus.failure()), masterNode);
428-
445+
logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard failed locally, " +
446+
"updating status on master", snapshot.snapshot(), shardId);
447+
final String failure = localShardStatus.failure();
448+
notifyFailedSnapshotShard(snapshot.snapshot(), shardId, localNodeId, failure, masterNode);
429449
}
430450
}
431451
}
@@ -445,7 +465,6 @@ private SnapshotShards(Map<ShardId, IndexShardSnapshotStatus> shards) {
445465
}
446466
}
447467

448-
449468
/**
450469
* Internal request that is used to send changes in snapshot status to master
451470
*/
@@ -498,15 +517,33 @@ public String toString() {
498517
}
499518
}
500519

501-
/**
502-
* Updates the shard status
503-
*/
504-
public void updateIndexShardSnapshotStatus(Snapshot snapshot, ShardId shardId, ShardSnapshotStatus status, DiscoveryNode master) {
505-
UpdateIndexShardSnapshotStatusRequest request = new UpdateIndexShardSnapshotStatusRequest(snapshot, shardId, status);
520+
/** Notify the master node that the given shard has been successfully snapshotted **/
521+
void notifySuccessfulSnapshotShard(final Snapshot snapshot,
522+
final ShardId shardId,
523+
final String localNodeId,
524+
final DiscoveryNode masterNode) {
525+
sendSnapshotShardUpdate(snapshot, shardId, new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
526+
}
527+
528+
/** Notify the master node that the given shard failed to be snapshotted **/
529+
void notifyFailedSnapshotShard(final Snapshot snapshot,
530+
final ShardId shardId,
531+
final String localNodeId,
532+
final String failure,
533+
final DiscoveryNode masterNode) {
534+
sendSnapshotShardUpdate(snapshot, shardId, new ShardSnapshotStatus(localNodeId, State.FAILED, failure), masterNode);
535+
}
536+
537+
/** Updates the shard snapshot status by sending a {@link UpdateIndexShardSnapshotStatusRequest} to the master node */
538+
void sendSnapshotShardUpdate(final Snapshot snapshot,
539+
final ShardId shardId,
540+
final ShardSnapshotStatus status,
541+
final DiscoveryNode masterNode) {
506542
try {
507-
transportService.sendRequest(master, UPDATE_SNAPSHOT_ACTION_NAME, request, EmptyTransportResponseHandler.INSTANCE_SAME);
543+
final UpdateIndexShardSnapshotStatusRequest request = new UpdateIndexShardSnapshotStatusRequest(snapshot, shardId, status);
544+
transportService.sendRequest(masterNode, UPDATE_SNAPSHOT_ACTION_NAME, request, EmptyTransportResponseHandler.INSTANCE_SAME);
508545
} catch (Exception e) {
509-
logger.warn((Supplier<?>) () -> new ParameterizedMessage("[{}] [{}] failed to update snapshot state", request.snapshot(), request.status()), e);
546+
logger.warn((Supplier<?>) () -> new ParameterizedMessage("[{}] [{}] failed to update snapshot state", snapshot, status), e);
510547
}
511548
}
512549

0 commit comments

Comments
 (0)