18
18
import org .elasticsearch .cluster .ClusterState ;
19
19
import org .elasticsearch .cluster .service .ClusterService ;
20
20
import org .elasticsearch .common .Strings ;
21
+ import org .elasticsearch .common .unit .TimeValue ;
22
+ import org .elasticsearch .snapshots .SnapshotId ;
21
23
import org .elasticsearch .snapshots .SnapshotInfo ;
24
+ import org .elasticsearch .snapshots .SnapshotState ;
22
25
import org .elasticsearch .xpack .core .ClientHelper ;
26
+ import org .elasticsearch .xpack .core .ilm .LifecycleSettings ;
23
27
import org .elasticsearch .xpack .core .scheduler .SchedulerEngine ;
24
28
import org .elasticsearch .xpack .core .slm .SnapshotLifecycleMetadata ;
25
29
import org .elasticsearch .xpack .core .slm .SnapshotLifecyclePolicy ;
35
39
import java .util .concurrent .CountDownLatch ;
36
40
import java .util .concurrent .atomic .AtomicBoolean ;
37
41
import java .util .function .Consumer ;
42
+ import java .util .function .LongSupplier ;
38
43
import java .util .stream .Collectors ;
39
44
40
45
/**
@@ -50,10 +55,12 @@ public class SnapshotRetentionTask implements SchedulerEngine.Listener {
50
55
51
56
private final Client client ;
52
57
private final ClusterService clusterService ;
58
+ private final LongSupplier nowNanoSupplier ;
53
59
54
/**
 * Creates the retention task.
 *
 * @param client          client used for cluster requests; it is wrapped in an
 *                        {@link OriginSettingClient} using {@link ClientHelper#INDEX_LIFECYCLE_ORIGIN}
 * @param clusterService  service from which the current cluster state is read
 * @param nowNanoSupplier supplies the current time in nanoseconds; used to measure how
 *                        long snapshot deletion has been running
 */
public SnapshotRetentionTask(Client client, ClusterService clusterService, LongSupplier nowNanoSupplier) {
    this.nowNanoSupplier = nowNanoSupplier;
    this.clusterService = clusterService;
    this.client = new OriginSettingClient(client, ClientHelper.INDEX_LIFECYCLE_ORIGIN);
}
58
65
59
66
@ Override
@@ -64,6 +71,7 @@ public void triggered(SchedulerEngine.Event event) {
64
71
try {
65
72
logger .info ("starting SLM retention snapshot cleanup task" );
66
73
final ClusterState state = clusterService .state ();
74
+ final TimeValue maxDeletionTime = LifecycleSettings .SLM_RETENTION_DURATION_SETTING .get (state .metaData ().settings ());
67
75
68
76
// Find all SLM policies that have retention enabled
69
77
final Map <String , SnapshotLifecyclePolicy > policiesWithRetention = getAllPoliciesWithRetentionEnabled (state );
@@ -74,7 +82,7 @@ public void triggered(SchedulerEngine.Event event) {
74
82
.map (SnapshotLifecyclePolicy ::getRepository )
75
83
.collect (Collectors .toSet ());
76
84
77
- getAllSnapshots (repositioriesToFetch , new ActionListener <>() {
85
+ getAllSuccessfulSnapshots (repositioriesToFetch , new ActionListener <>() {
78
86
@ Override
79
87
public void onResponse (Map <String , List <SnapshotInfo >> allSnapshots ) {
80
88
// Find all the snapshots that are past their retention date
@@ -85,7 +93,7 @@ public void onResponse(Map<String, List<SnapshotInfo>> allSnapshots) {
85
93
.collect (Collectors .toList ())));
86
94
87
95
// Finally, delete the snapshots that need to be deleted
88
- deleteSnapshots (snapshotsToBeDeleted );
96
+ deleteSnapshots (snapshotsToBeDeleted , maxDeletionTime );
89
97
}
90
98
91
99
@ Override
@@ -160,8 +168,8 @@ static boolean snapshotEligibleForDeletion(SnapshotInfo snapshot, Map<String, Li
160
168
return eligible ;
161
169
}
162
170
163
- void getAllSnapshots (Collection <String > repositories , ActionListener <Map <String , List <SnapshotInfo >>> listener ,
164
- Consumer <Exception > errorHandler ) {
171
+ void getAllSuccessfulSnapshots (Collection <String > repositories , ActionListener <Map <String , List <SnapshotInfo >>> listener ,
172
+ Consumer <Exception > errorHandler ) {
165
173
if (repositories .isEmpty ()) {
166
174
// Skip retrieving anything if there are no repositories to fetch
167
175
listener .onResponse (Collections .emptyMap ());
@@ -175,7 +183,11 @@ void getAllSnapshots(Collection<String> repositories, ActionListener<Map<String,
175
183
public void onResponse (final GetSnapshotsResponse resp ) {
176
184
Map <String , List <SnapshotInfo >> snapshots = new HashMap <>();
177
185
repositories .forEach (repo -> {
178
- snapshots .put (repo , resp .getSnapshots (repo ));
186
+ snapshots .put (repo ,
187
+ // Only return snapshots in the SUCCESS state
188
+ resp .getSnapshots (repo ).stream ()
189
+ .filter (info -> info .state () == SnapshotState .SUCCESS )
190
+ .collect (Collectors .toList ()));
179
191
});
180
192
listener .onResponse (snapshots );
181
193
}
@@ -188,42 +200,64 @@ public void onFailure(Exception e) {
188
200
});
189
201
}
190
202
191
- void deleteSnapshots (Map <String , List <SnapshotInfo >> snapshotsToDelete ) {
192
- // TODO: make this more resilient and possibly only delete for a certain amount of time
203
+ void deleteSnapshots (Map <String , List <SnapshotInfo >> snapshotsToDelete , TimeValue maximumTime ) {
193
204
int count = snapshotsToDelete .values ().stream ().mapToInt (List ::size ).sum ();
194
205
if (count == 0 ) {
195
206
logger .debug ("no snapshots are eligible for deletion" );
196
207
return ;
197
208
}
209
+
198
210
logger .info ("starting snapshot retention deletion for [{}] snapshots" , count );
199
- snapshotsToDelete .forEach ((repo , snapshots ) -> {
200
- snapshots .forEach (info -> {
201
- logger .info ("[{}] snapshot retention deleting snapshot [{}]" , repo , info .snapshotId ());
202
- CountDownLatch latch = new CountDownLatch (1 );
203
- client .admin ().cluster ().prepareDeleteSnapshot (repo , info .snapshotId ().getName ())
204
- .execute (new LatchedActionListener <>(new ActionListener <>() {
205
- @ Override
206
- public void onResponse (AcknowledgedResponse acknowledgedResponse ) {
207
- if (acknowledgedResponse .isAcknowledged ()) {
208
- logger .debug ("[{}] snapshot [{}] deleted successfully" , repo , info .snapshotId ());
209
- }
210
- }
211
-
212
- @ Override
213
- public void onFailure (Exception e ) {
214
- logger .warn (new ParameterizedMessage ("[{}] failed to delete snapshot [{}] for retention" ,
215
- repo , info .snapshotId ()), e );
216
- }
217
- }, latch ));
218
- try {
219
- // Deletes cannot occur simultaneously, so wait for this
220
- // deletion to complete before attempting the next one
221
- latch .await ();
222
- } catch (InterruptedException e ) {
223
- logger .error (new ParameterizedMessage ("[{}] deletion of snapshot [{}] interrupted" ,
224
- repo , info .snapshotId ()), e );
211
+ long startTime = nowNanoSupplier .getAsLong ();
212
+ int deleted = 0 ;
213
+ for (Map .Entry <String , List <SnapshotInfo >> entry : snapshotsToDelete .entrySet ()) {
214
+ String repo = entry .getKey ();
215
+ List <SnapshotInfo > snapshots = entry .getValue ();
216
+ for (SnapshotInfo info : snapshots ) {
217
+ deleteSnapshot (repo , info .snapshotId ());
218
+ deleted ++;
219
+ // Check whether we have exceeded the maximum time allowed to spend deleting
220
+ // snapshots, if we have, short-circuit the rest of the deletions
221
+ TimeValue elapsedDeletionTime = TimeValue .timeValueNanos (nowNanoSupplier .getAsLong () - startTime );
222
+ logger .trace ("elapsed time for deletion of [{}] snapshot: {}" , info .snapshotId (), elapsedDeletionTime );
223
+ if (elapsedDeletionTime .compareTo (maximumTime ) > 0 ) {
224
+ logger .info ("maximum snapshot retention deletion time reached, time spent: [{}]," +
225
+ " maximum allowed time: [{}], deleted {} out of {} snapshots scheduled for deletion" ,
226
+ elapsedDeletionTime , maximumTime , deleted , count );
227
+ return ;
225
228
}
226
- });
227
- });
229
+ }
230
+ }
231
+ }
232
+
233
+ /**
234
+ * Delete the given snapshot from the repository in blocking manner
235
+ */
236
+ void deleteSnapshot (String repo , SnapshotId snapshot ) {
237
+ logger .info ("[{}] snapshot retention deleting snapshot [{}]" , repo , snapshot );
238
+ CountDownLatch latch = new CountDownLatch (1 );
239
+ client .admin ().cluster ().prepareDeleteSnapshot (repo , snapshot .getName ())
240
+ .execute (new LatchedActionListener <>(new ActionListener <>() {
241
+ @ Override
242
+ public void onResponse (AcknowledgedResponse acknowledgedResponse ) {
243
+ if (acknowledgedResponse .isAcknowledged ()) {
244
+ logger .debug ("[{}] snapshot [{}] deleted successfully" , repo , snapshot );
245
+ }
246
+ }
247
+
248
+ @ Override
249
+ public void onFailure (Exception e ) {
250
+ logger .warn (new ParameterizedMessage ("[{}] failed to delete snapshot [{}] for retention" ,
251
+ repo , snapshot ), e );
252
+ }
253
+ }, latch ));
254
+ try {
255
+ // Deletes cannot occur simultaneously, so wait for this
256
+ // deletion to complete before attempting the next one
257
+ latch .await ();
258
+ } catch (InterruptedException e ) {
259
+ logger .error (new ParameterizedMessage ("[{}] deletion of snapshot [{}] interrupted" ,
260
+ repo , snapshot ), e );
261
+ }
228
262
}
229
263
}
0 commit comments