Skip to content

Commit bf88da3

Browse files
Register SLM run before snapshotting to save stats (#110216)
The SLM health indicator relies on the policyMetadata.getInvocationsSinceLastSuccess to determine if the last several snapshots have failed. If a snapshot fails and the master is shutdown before setting invocationsSinceLastSuccess, the fact that failure occurred will be lost. To solve this, before snapshotting, we register that a snapshot is about to run, in the cluster state custom metadata. If the run fails, and invocationsSinceLastSuccess is not updated before a master shutdown, the fact that the failure occurred will not be lost. On completion of a subsequent snapshot run, SnapshotLifecycleTask will observe that there exists a registered snapshot which is no longer running. It will infer that the snapshot failed, and update invocationsSinceLastSuccess and other stats accordingly. A few parts of this change touch general snapshot code, and are worth noting: * Snapshots can only be uniquely identified with a uuid which had previously been generated in SnapshotService. This uuid is needed when there is a snapshot failure, but was not available to SLM as it was only returned in the SnapshotInfo after a success. To make this available, the uuid is now generated in the CreateSnapshotRequest constructor and passed to SnapshotService. In mixed version clusters, there exists a special case where the uuid is still generated in SnapshotService. * If a snapshot were registered before calling snapshot service, there would a small period of time when a snapshot is registered but not yet stored in SnapshotsInProgress. During this time, another snapshot from the same policy might incorrectly infer that the snapshot failed and update its stats accordingly. To avoid this race condition, we register snapshots within SnapshotService in the same cluster update which updates SnapshotsInProgress.
1 parent ec4444b commit bf88da3

File tree

16 files changed

+1623
-55
lines changed

16 files changed

+1623
-55
lines changed

docs/changelog/110216.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 110216
2+
summary: Register SLM run before snapshotting to save stats
3+
area: ILM+SLM
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ static TransportVersion def(int id) {
183183
public static final TransportVersion FIX_VECTOR_SIMILARITY_INNER_HITS = def(8_713_00_0);
184184
public static final TransportVersion INDEX_REQUEST_UPDATE_BY_DOC_ORIGIN = def(8_714_00_0);
185185
public static final TransportVersion ESQL_ATTRIBUTE_CACHED_SERIALIZATION = def(8_715_00_0);
186+
public static final TransportVersion REGISTER_SLM_STATS = def(8_716_00_0);
186187

187188
/*
188189
* STOP! READ THIS FIRST! No, really,

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/create/CreateSnapshotRequest.java

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.elasticsearch.action.support.master.MasterNodeRequest;
1818
import org.elasticsearch.cluster.metadata.DataStream;
1919
import org.elasticsearch.common.Strings;
20+
import org.elasticsearch.common.UUIDs;
2021
import org.elasticsearch.common.bytes.BytesReference;
2122
import org.elasticsearch.common.io.stream.StreamInput;
2223
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -82,6 +83,9 @@ public class CreateSnapshotRequest extends MasterNodeRequest<CreateSnapshotReque
8283
@Nullable
8384
private Map<String, Object> userMetadata;
8485

86+
@Nullable
87+
private String uuid = null;
88+
8589
public CreateSnapshotRequest(TimeValue masterNodeTimeout) {
8690
super(masterNodeTimeout);
8791
}
@@ -96,6 +100,7 @@ public CreateSnapshotRequest(TimeValue masterNodeTimeout, String repository, Str
96100
this(masterNodeTimeout);
97101
this.snapshot = snapshot;
98102
this.repository = repository;
103+
this.uuid = UUIDs.randomBase64UUID();
99104
}
100105

101106
public CreateSnapshotRequest(StreamInput in) throws IOException {
@@ -112,6 +117,7 @@ public CreateSnapshotRequest(StreamInput in) throws IOException {
112117
waitForCompletion = in.readBoolean();
113118
partial = in.readBoolean();
114119
userMetadata = in.readGenericMap();
120+
uuid = in.getTransportVersion().onOrAfter(TransportVersions.REGISTER_SLM_STATS) ? in.readOptionalString() : null;
115121
}
116122

117123
@Override
@@ -129,6 +135,9 @@ public void writeTo(StreamOutput out) throws IOException {
129135
out.writeBoolean(waitForCompletion);
130136
out.writeBoolean(partial);
131137
out.writeGenericMap(userMetadata);
138+
if (out.getTransportVersion().onOrAfter(TransportVersions.REGISTER_SLM_STATS)) {
139+
out.writeOptionalString(uuid);
140+
}
132141
}
133142

134143
@Override
@@ -364,6 +373,35 @@ public CreateSnapshotRequest userMetadata(@Nullable Map<String, Object> userMeta
364373
return this;
365374
}
366375

376+
/**
377+
* Set a uuid to identify snapshot.
378+
* If no uuid is specified, one will be created within SnapshotService
379+
*/
380+
public CreateSnapshotRequest uuid(String uuid) {
381+
this.uuid = uuid;
382+
return this;
383+
}
384+
385+
/**
386+
* Get the uuid, generating it if one does not yet exist.
387+
* Because the uuid can be set, this method is NOT thread-safe.
388+
* <p>
389+
* The uuid was previously generated in SnapshotService.createSnapshot
390+
* but was moved to the CreateSnapshotRequest constructor so that the caller could
391+
* uniquely identify the snapshot. Unfortunately, in a mixed-version cluster,
392+
* the CreateSnapshotRequest could be created on a node which does not yet
393+
* generate the uuid in the constructor. In this case, the uuid
394+
* must be generated when it is first accessed with this getter.
395+
*
396+
* @return the uuid that will be used for the snapshot
397+
*/
398+
public String uuid() {
399+
if (this.uuid == null) {
400+
this.uuid = UUIDs.randomBase64UUID();
401+
}
402+
return this.uuid;
403+
}
404+
367405
/**
368406
* @return Which plugin states should be included in the snapshot
369407
*/
@@ -469,12 +507,13 @@ public boolean equals(Object o) {
469507
&& Objects.equals(indicesOptions, that.indicesOptions)
470508
&& Arrays.equals(featureStates, that.featureStates)
471509
&& Objects.equals(masterNodeTimeout(), that.masterNodeTimeout())
472-
&& Objects.equals(userMetadata, that.userMetadata);
510+
&& Objects.equals(userMetadata, that.userMetadata)
511+
&& Objects.equals(uuid, that.uuid);
473512
}
474513

475514
@Override
476515
public int hashCode() {
477-
int result = Objects.hash(snapshot, repository, indicesOptions, partial, includeGlobalState, waitForCompletion, userMetadata);
516+
int result = Objects.hash(snapshot, repository, indicesOptions, partial, includeGlobalState, waitForCompletion, userMetadata, uuid);
478517
result = 31 * result + Arrays.hashCode(indices);
479518
result = 31 * result + Arrays.hashCode(featureStates);
480519
return result;
@@ -505,6 +544,8 @@ public String toString() {
505544
+ masterNodeTimeout()
506545
+ ", metadata="
507546
+ userMetadata
547+
+ ", uuid="
548+
+ uuid
508549
+ '}';
509550
}
510551
}

server/src/main/java/org/elasticsearch/cluster/ClusterModule.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
import org.elasticsearch.persistent.PersistentTasksNodeService;
7777
import org.elasticsearch.plugins.ClusterPlugin;
7878
import org.elasticsearch.script.ScriptMetadata;
79+
import org.elasticsearch.snapshots.RegisteredPolicySnapshots;
7980
import org.elasticsearch.snapshots.SnapshotsInfoService;
8081
import org.elasticsearch.tasks.Task;
8182
import org.elasticsearch.tasks.TaskResultsService;
@@ -234,6 +235,12 @@ public static List<Entry> getNamedWriteables() {
234235
registerMetadataCustom(entries, NodesShutdownMetadata.TYPE, NodesShutdownMetadata::new, NodesShutdownMetadata::readDiffFrom);
235236
registerMetadataCustom(entries, FeatureMigrationResults.TYPE, FeatureMigrationResults::new, FeatureMigrationResults::readDiffFrom);
236237
registerMetadataCustom(entries, DesiredNodesMetadata.TYPE, DesiredNodesMetadata::new, DesiredNodesMetadata::readDiffFrom);
238+
registerMetadataCustom(
239+
entries,
240+
RegisteredPolicySnapshots.TYPE,
241+
RegisteredPolicySnapshots::new,
242+
RegisteredPolicySnapshots.RegisteredSnapshotsDiff::new
243+
);
237244

238245
// Task Status (not Diffable)
239246
entries.add(new Entry(Task.Status.class, PersistentTasksNodeService.Status.NAME, PersistentTasksNodeService.Status::new));

0 commit comments

Comments
 (0)