Skip to content

Commit 1738b9b

Browse files
authored
Handle snapshot lifecycle policy updates and deletions (#40062)
(Note this is a PR against the `snapshot-lifecycle-management` feature branch) This adds logic to `SnapshotLifecycleService` to handle updates and deletes for snapshot policies. Policies with incremented versions have the old policy cancelled and the new one scheduled. Deleted policies have their schedules cancelled when they are no longer present in the cluster state metadata. Relates to #38461
1 parent 4ba731a commit 1738b9b

File tree

5 files changed

+316
-19
lines changed

5 files changed

+316
-19
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/indexlifecycle/IndexLifecycle.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
import org.elasticsearch.xpack.indexlifecycle.action.TransportStartILMAction;
8282
import org.elasticsearch.xpack.indexlifecycle.action.TransportStopILMAction;
8383
import org.elasticsearch.xpack.snapshotlifecycle.SnapshotLifecycleService;
84+
import org.elasticsearch.xpack.snapshotlifecycle.SnapshotLifecycleTask;
8485
import org.elasticsearch.xpack.snapshotlifecycle.action.DeleteSnapshotLifecycleAction;
8586
import org.elasticsearch.xpack.snapshotlifecycle.action.GetSnapshotLifecycleAction;
8687
import org.elasticsearch.xpack.snapshotlifecycle.action.PutSnapshotLifecycleAction;
@@ -151,7 +152,8 @@ public Collection<Object> createComponents(Client client, ClusterService cluster
151152
}
152153
indexLifecycleInitialisationService.set(new IndexLifecycleService(settings, client, clusterService, threadPool,
153154
getClock(), System::currentTimeMillis, xContentRegistry));
154-
snapshotLifecycleService.set(new SnapshotLifecycleService(settings, client, clusterService, getClock()));
155+
snapshotLifecycleService.set(new SnapshotLifecycleService(settings,
156+
() -> new SnapshotLifecycleTask(client), clusterService, getClock()));
155157
return Arrays.asList(indexLifecycleInitialisationService.get(), snapshotLifecycleService.get());
156158
}
157159

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/snapshotlifecycle/SnapshotLifecycleMetadata.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.io.IOException;
2222
import java.util.Collections;
2323
import java.util.EnumSet;
24+
import java.util.HashMap;
2425
import java.util.Map;
2526
import java.util.TreeMap;
2627

@@ -35,7 +36,7 @@ public class SnapshotLifecycleMetadata implements XPackMetaDataCustom {
3536
private final Map<String, SnapshotLifecyclePolicyMetadata> snapshotConfigurations;
3637

3738
public SnapshotLifecycleMetadata(Map<String, SnapshotLifecyclePolicyMetadata> snapshotConfigurations) {
38-
this.snapshotConfigurations = Collections.unmodifiableMap(snapshotConfigurations);
39+
this.snapshotConfigurations = new HashMap<>(snapshotConfigurations);
3940
// TODO: maybe add an operation mode here so it can be disabled/re-enabled separately, like ILM
4041
}
4142

@@ -44,7 +45,7 @@ public SnapshotLifecycleMetadata(StreamInput in) throws IOException {
4445
}
4546

4647
public Map<String, SnapshotLifecyclePolicyMetadata> getSnapshotConfigurations() {
47-
return this.snapshotConfigurations;
48+
return Collections.unmodifiableMap(this.snapshotConfigurations);
4849
}
4950

5051
@Override

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/snapshotlifecycle/SnapshotLifecycleService.java

+76-15
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
import org.apache.logging.log4j.LogManager;
1010
import org.apache.logging.log4j.Logger;
11-
import org.elasticsearch.client.Client;
1211
import org.elasticsearch.cluster.ClusterChangedEvent;
1312
import org.elasticsearch.cluster.ClusterState;
1413
import org.elasticsearch.cluster.ClusterStateListener;
@@ -23,6 +22,10 @@
2322
import java.io.Closeable;
2423
import java.time.Clock;
2524
import java.util.Map;
25+
import java.util.Set;
26+
import java.util.function.Supplier;
27+
import java.util.regex.Pattern;
28+
import java.util.stream.Collectors;
2629

2730
/**
2831
* {@code SnapshotLifecycleService} manages snapshot policy scheduling and triggering of the
@@ -32,28 +35,31 @@
3235
public class SnapshotLifecycleService implements LocalNodeMasterListener, Closeable, ClusterStateListener {
3336

3437
private static final Logger logger = LogManager.getLogger(SnapshotLifecycleMetadata.class);
38+
private static final String JOB_PATTERN_SUFFIX = "-\\d+$";
3539

3640
private final SchedulerEngine scheduler;
3741
private final ClusterService clusterService;
3842
private final SnapshotLifecycleTask snapshotTask;
3943
private final Map<String, SchedulerEngine.Job> scheduledTasks = ConcurrentCollections.newConcurrentMap();
4044
private volatile boolean isMaster = false;
4145

42-
public SnapshotLifecycleService(Settings settings, Client client, ClusterService clusterService,
46+
public SnapshotLifecycleService(Settings settings,
47+
Supplier<SnapshotLifecycleTask> taskSupplier,
48+
ClusterService clusterService,
4349
Clock clock) {
4450
this.scheduler = new SchedulerEngine(settings, clock);
4551
this.clusterService = clusterService;
46-
this.snapshotTask = new SnapshotLifecycleTask(client);
52+
this.snapshotTask = taskSupplier.get();
4753
clusterService.addLocalNodeMasterListener(this); // TODO: change this not to use 'this'
4854
clusterService.addListener(this);
4955
}
5056

5157
@Override
52-
public void clusterChanged(ClusterChangedEvent event) {
58+
public void clusterChanged(final ClusterChangedEvent event) {
5359
if (this.isMaster) {
54-
// TODO: handle modified policies (currently they are ignored)
55-
// TODO: handle deleted policies
56-
scheduleSnapshotJobs(event.state());
60+
final ClusterState state = event.state();
61+
scheduleSnapshotJobs(state);
62+
cleanupDeletedPolicies(state);
5763
}
5864
}
5965

@@ -71,6 +77,11 @@ public void offMaster() {
7177
cancelSnapshotJobs();
7278
}
7379

80+
// Only used for testing
81+
SchedulerEngine getScheduler() {
82+
return this.scheduler;
83+
}
84+
7485
/**
7586
* Schedule all non-scheduled snapshot jobs contained in the cluster state
7687
*/
@@ -81,35 +92,85 @@ public void scheduleSnapshotJobs(final ClusterState state) {
8192
}
8293
}
8394

95+
public void cleanupDeletedPolicies(final ClusterState state) {
96+
SnapshotLifecycleMetadata snapMeta = state.metaData().custom(SnapshotLifecycleMetadata.TYPE);
97+
if (snapMeta != null) {
98+
// Retrieve all of the expected policy job ids from the policies in the metadata
99+
final Set<String> policyJobIds = snapMeta.getSnapshotConfigurations().values().stream()
100+
.map(SnapshotLifecycleService::getJobId)
101+
.collect(Collectors.toSet());
102+
103+
// Cancel all scheduled jobs whose ids are *NOT* among the expected policy job ids
104+
scheduledTasks.keySet().stream()
105+
.filter(jobId -> policyJobIds.contains(jobId) == false)
106+
.forEach(this::cancelScheduledSnapshot);
107+
}
108+
}
109+
84110
/**
85-
* Schedule the {@link SnapshotLifecyclePolicy} job if it does not already exist. If the job already
86-
* exists it is not interfered with.
111+
* Schedule the {@link SnapshotLifecyclePolicy} job if it does not already exist. First checks
112+
* to see if any previous versions of the policy were scheduled, and if so, cancels those. If
113+
* the same version of a policy has already been scheduled it does not overwrite the job.
87114
*/
88115
public void maybeScheduleSnapshot(final SnapshotLifecyclePolicyMetadata snapshotLifecyclePolicy) {
89-
final String jobId = snapshotLifecyclePolicy.getPolicy().getId();
116+
final String jobId = getJobId(snapshotLifecyclePolicy);
117+
final Pattern existingJobPattern = Pattern.compile(snapshotLifecyclePolicy.getPolicy().getId() + JOB_PATTERN_SUFFIX);
118+
119+
// Find and cancel any existing jobs for this policy
120+
final boolean existingJobsFoundAndCancelled = scheduledTasks.keySet().stream()
121+
// Find all jobs matching the `<policyid>-\d+` pattern
122+
.filter(jId -> existingJobPattern.matcher(jId).matches())
123+
// Filter out a job that has not been changed (matches the id exactly meaning the version is the same)
124+
.filter(jId -> jId.equals(jobId) == false)
125+
.map(existingJobId -> {
126+
// Cancel existing job so the new one can be scheduled
127+
logger.debug("removing existing snapshot lifecycle job [{}] as it has been updated", existingJobId);
128+
scheduledTasks.remove(existingJobId);
129+
boolean existed = scheduler.remove(existingJobId);
130+
assert existed : "expected job for " + existingJobId + " to exist in scheduler";
131+
return existed;
132+
})
133+
.reduce(false, (a, b) -> a || b);
134+
135+
// Now atomically schedule the new job and add it to the scheduled tasks map. If the jobId
136+
// is identical to an existing job (meaning the version has not changed) then this does
137+
// not reschedule it.
90138
scheduledTasks.computeIfAbsent(jobId, id -> {
91139
final SchedulerEngine.Job job = new SchedulerEngine.Job(jobId,
92140
new CronSchedule(snapshotLifecyclePolicy.getPolicy().getSchedule()));
93-
logger.info("scheduling snapshot lifecycle job [{}]", jobId);
141+
if (existingJobsFoundAndCancelled) {
142+
logger.info("rescheduling updated snapshot lifecycle job [{}]", jobId);
143+
} else {
144+
logger.info("scheduling snapshot lifecycle job [{}]", jobId);
145+
}
94146
scheduler.add(job);
95147
return job;
96148
});
97149
}
98150

151+
/**
152+
* Generate the job id for a given policy metadata. The job id is {@code <policyid>-<version>}
153+
*/
154+
static String getJobId(SnapshotLifecyclePolicyMetadata policyMeta) {
155+
return policyMeta.getPolicy().getId() + "-" + policyMeta.getVersion();
156+
}
157+
99158
/**
100159
* Cancel all scheduled snapshot jobs
101160
*/
102161
public void cancelSnapshotJobs() {
162+
logger.trace("cancelling all snapshot lifecycle jobs");
103163
scheduler.scheduledJobIds().forEach(scheduler::remove);
104164
scheduledTasks.clear();
105165
}
106166

107167
/**
108-
* Cancel the given snapshot lifecycle id
168+
* Cancel the given policy job id (from {@link #getJobId(SnapshotLifecyclePolicyMetadata)})
109169
*/
110-
public void cancelScheduledSnapshot(final String snapshotLifecycleId) {
111-
scheduledTasks.remove(snapshotLifecycleId);
112-
scheduler.remove(snapshotLifecycleId);
170+
public void cancelScheduledSnapshot(final String lifecycleJobId) {
171+
logger.debug("cancelling snapshot lifecycle job [{}] as it no longer exists", lifecycleJobId);
172+
scheduledTasks.remove(lifecycleJobId);
173+
scheduler.remove(lifecycleJobId);
113174
}
114175

115176
@Override

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/snapshotlifecycle/SnapshotLifecycleTask.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public class SnapshotLifecycleTask implements SchedulerEngine.Listener {
1717

1818
private final Client client;
1919

20-
SnapshotLifecycleTask(final Client client) {
20+
public SnapshotLifecycleTask(final Client client) {
2121
this.client = client;
2222
}
2323

0 commit comments

Comments
 (0)