Skip to content

Commit 7f5c2f1

Browse files
authored
[CCR] Validate follower index historyUUIDs (#34078)
The follower index shard history UUID is fetched from the indices stats API when the shard follow task starts and is provided with each bulk shard operations request. The bulk shard operations API fails if the provided history UUID differs from the actual history UUID. The leader history UUID is no longer recorded in the shard follow task params; instead, the leader shard history UUIDs are read directly from the follower index's custom metadata. The resume follow API continues to fail if the leader index shard history UUIDs are missing. Closes #33956
1 parent 8539fb6 commit 7f5c2f1

14 files changed

+159
-84
lines changed

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/Ccr.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ public Collection<Object> createComponents(
148148
@Override
149149
public List<PersistentTasksExecutor<?>> getPersistentTasksExecutor(ClusterService clusterService,
150150
ThreadPool threadPool, Client client) {
151-
return Collections.singletonList(new ShardFollowTasksExecutor(settings, client, threadPool));
151+
return Collections.singletonList(new ShardFollowTasksExecutor(settings, client, threadPool, clusterService));
152152
}
153153

154154
public List<ActionHandler<? extends ActionRequest, ? extends ActionResponse>> getActions() {

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowNodeTask.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ public abstract class ShardFollowNodeTask extends AllocatedPersistentTask {
6262
private final BiConsumer<TimeValue, Runnable> scheduler;
6363
private final LongSupplier relativeTimeProvider;
6464

65+
private String followerHistoryUUID;
6566
private long leaderGlobalCheckpoint;
6667
private long leaderMaxSeqNo;
6768
private long leaderMaxSeqNoOfUpdatesOrDeletes = SequenceNumbers.UNASSIGNED_SEQ_NO;
@@ -110,15 +111,17 @@ protected boolean removeEldestEntry(final Map.Entry<Long, Tuple<AtomicInteger, E
110111
}
111112

112113
void start(
113-
final long leaderGlobalCheckpoint,
114-
final long leaderMaxSeqNo,
115-
final long followerGlobalCheckpoint,
116-
final long followerMaxSeqNo) {
114+
final String followerHistoryUUID,
115+
final long leaderGlobalCheckpoint,
116+
final long leaderMaxSeqNo,
117+
final long followerGlobalCheckpoint,
118+
final long followerMaxSeqNo) {
117119
/*
118120
* While this should only ever be called once and before any other threads can touch these fields, we use synchronization here to
119121
* avoid the need to declare these fields as volatile. That is, we are ensuring these fields are always accessed under the same lock.
120122
*/
121123
synchronized (this) {
124+
this.followerHistoryUUID = followerHistoryUUID;
122125
this.leaderGlobalCheckpoint = leaderGlobalCheckpoint;
123126
this.leaderMaxSeqNo = leaderMaxSeqNo;
124127
this.followerGlobalCheckpoint = followerGlobalCheckpoint;
@@ -305,7 +308,7 @@ private void sendBulkShardOperationsRequest(List<Translog.Operation> operations,
305308
AtomicInteger retryCounter) {
306309
assert leaderMaxSeqNoOfUpdatesOrDeletes != SequenceNumbers.UNASSIGNED_SEQ_NO : "mus is not replicated";
307310
final long startTime = relativeTimeProvider.getAsLong();
308-
innerSendBulkShardOperationsRequest(operations, leaderMaxSeqNoOfUpdatesOrDeletes,
311+
innerSendBulkShardOperationsRequest(followerHistoryUUID, operations, leaderMaxSeqNoOfUpdatesOrDeletes,
309312
response -> {
310313
synchronized (ShardFollowNodeTask.this) {
311314
totalIndexTimeMillis += TimeUnit.NANOSECONDS.toMillis(relativeTimeProvider.getAsLong() - startTime);
@@ -404,8 +407,11 @@ static boolean shouldRetry(Exception e) {
404407
// These methods are protected for testing purposes:
405408
protected abstract void innerUpdateMapping(LongConsumer handler, Consumer<Exception> errorHandler);
406409

407-
protected abstract void innerSendBulkShardOperationsRequest(List<Translog.Operation> operations, long leaderMaxSeqNoOfUpdatesOrDeletes,
408-
Consumer<BulkShardOperationsResponse> handler, Consumer<Exception> errorHandler);
410+
protected abstract void innerSendBulkShardOperationsRequest(String followerHistoryUUID,
411+
List<Translog.Operation> operations,
412+
long leaderMaxSeqNoOfUpdatesOrDeletes,
413+
Consumer<BulkShardOperationsResponse> handler,
414+
Consumer<Exception> errorHandler);
409415

410416
protected abstract void innerSendShardChangesRequest(long from, int maxOperationCount, Consumer<ShardChangesAction.Response> handler,
411417
Consumer<Exception> errorHandler);

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTask.java

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,12 @@ public class ShardFollowTask implements XPackPlugin.XPackPersistentTaskParams {
5151
public static final ParseField MAX_WRITE_BUFFER_SIZE = new ParseField("max_write_buffer_size");
5252
public static final ParseField MAX_RETRY_DELAY = new ParseField("max_retry_delay");
5353
public static final ParseField POLL_TIMEOUT = new ParseField("poll_timeout");
54-
public static final ParseField RECORDED_HISTORY_UUID = new ParseField("recorded_history_uuid");
5554

5655
@SuppressWarnings("unchecked")
5756
private static ConstructingObjectParser<ShardFollowTask, Void> PARSER = new ConstructingObjectParser<>(NAME,
5857
(a) -> new ShardFollowTask((String) a[0], new ShardId((String) a[1], (String) a[2], (int) a[3]),
5958
new ShardId((String) a[4], (String) a[5], (int) a[6]), (int) a[7], (int) a[8], (ByteSizeValue) a[9],
60-
(int) a[10], (int) a[11], (TimeValue) a[12], (TimeValue) a[13], (String) a[14], (Map<String, String>) a[15]));
59+
(int) a[10], (int) a[11], (TimeValue) a[12], (TimeValue) a[13], (Map<String, String>) a[14]));
6160

6261
static {
6362
PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), LEADER_CLUSTER_ALIAS_FIELD);
@@ -82,7 +81,6 @@ public class ShardFollowTask implements XPackPlugin.XPackPersistentTaskParams {
8281
PARSER.declareField(ConstructingObjectParser.constructorArg(),
8382
(p, c) -> TimeValue.parseTimeValue(p.text(), POLL_TIMEOUT.getPreferredName()),
8483
POLL_TIMEOUT, ObjectParser.ValueType.STRING);
85-
PARSER.declareString(ConstructingObjectParser.constructorArg(), RECORDED_HISTORY_UUID);
8684
PARSER.declareObject(ConstructingObjectParser.constructorArg(), (p, c) -> p.mapStrings(), HEADERS);
8785
}
8886

@@ -96,7 +94,6 @@ public class ShardFollowTask implements XPackPlugin.XPackPersistentTaskParams {
9694
private final int maxWriteBufferSize;
9795
private final TimeValue maxRetryDelay;
9896
private final TimeValue pollTimeout;
99-
private final String recordedLeaderIndexHistoryUUID;
10097
private final Map<String, String> headers;
10198

10299
ShardFollowTask(
@@ -110,7 +107,6 @@ public class ShardFollowTask implements XPackPlugin.XPackPersistentTaskParams {
110107
final int maxWriteBufferSize,
111108
final TimeValue maxRetryDelay,
112109
final TimeValue pollTimeout,
113-
final String recordedLeaderIndexHistoryUUID,
114110
final Map<String, String> headers) {
115111
this.leaderClusterAlias = leaderClusterAlias;
116112
this.followShardId = followShardId;
@@ -122,7 +118,6 @@ public class ShardFollowTask implements XPackPlugin.XPackPersistentTaskParams {
122118
this.maxWriteBufferSize = maxWriteBufferSize;
123119
this.maxRetryDelay = maxRetryDelay;
124120
this.pollTimeout = pollTimeout;
125-
this.recordedLeaderIndexHistoryUUID = recordedLeaderIndexHistoryUUID;
126121
this.headers = headers != null ? Collections.unmodifiableMap(headers) : Collections.emptyMap();
127122
}
128123

@@ -137,7 +132,6 @@ public ShardFollowTask(StreamInput in) throws IOException {
137132
this.maxWriteBufferSize = in.readVInt();
138133
this.maxRetryDelay = in.readTimeValue();
139134
this.pollTimeout = in.readTimeValue();
140-
this.recordedLeaderIndexHistoryUUID = in.readString();
141135
this.headers = Collections.unmodifiableMap(in.readMap(StreamInput::readString, StreamInput::readString));
142136
}
143137

@@ -185,10 +179,6 @@ public String getTaskId() {
185179
return followShardId.getIndex().getUUID() + "-" + followShardId.getId();
186180
}
187181

188-
public String getRecordedLeaderIndexHistoryUUID() {
189-
return recordedLeaderIndexHistoryUUID;
190-
}
191-
192182
public Map<String, String> getHeaders() {
193183
return headers;
194184
}
@@ -210,7 +200,6 @@ public void writeTo(StreamOutput out) throws IOException {
210200
out.writeVInt(maxWriteBufferSize);
211201
out.writeTimeValue(maxRetryDelay);
212202
out.writeTimeValue(pollTimeout);
213-
out.writeString(recordedLeaderIndexHistoryUUID);
214203
out.writeMap(headers, StreamOutput::writeString, StreamOutput::writeString);
215204
}
216205

@@ -237,7 +226,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
237226
builder.field(MAX_WRITE_BUFFER_SIZE.getPreferredName(), maxWriteBufferSize);
238227
builder.field(MAX_RETRY_DELAY.getPreferredName(), maxRetryDelay.getStringRep());
239228
builder.field(POLL_TIMEOUT.getPreferredName(), pollTimeout.getStringRep());
240-
builder.field(RECORDED_HISTORY_UUID.getPreferredName(), recordedLeaderIndexHistoryUUID);
241229
builder.field(HEADERS.getPreferredName(), headers);
242230
return builder.endObject();
243231
}
@@ -257,7 +245,6 @@ public boolean equals(Object o) {
257245
maxWriteBufferSize == that.maxWriteBufferSize &&
258246
Objects.equals(maxRetryDelay, that.maxRetryDelay) &&
259247
Objects.equals(pollTimeout, that.pollTimeout) &&
260-
Objects.equals(recordedLeaderIndexHistoryUUID, that.recordedLeaderIndexHistoryUUID) &&
261248
Objects.equals(headers, that.headers);
262249
}
263250

@@ -274,8 +261,8 @@ public int hashCode() {
274261
maxWriteBufferSize,
275262
maxRetryDelay,
276263
pollTimeout,
277-
recordedLeaderIndexHistoryUUID,
278-
headers);
264+
headers
265+
);
279266
}
280267

281268
public String toString() {

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTasksExecutor.java

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@
1717
import org.elasticsearch.cluster.metadata.IndexMetaData;
1818
import org.elasticsearch.cluster.metadata.MappingMetaData;
1919
import org.elasticsearch.cluster.routing.IndexRoutingTable;
20+
import org.elasticsearch.cluster.service.ClusterService;
2021
import org.elasticsearch.common.settings.Settings;
2122
import org.elasticsearch.common.unit.TimeValue;
2223
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
2324
import org.elasticsearch.common.xcontent.XContentType;
2425
import org.elasticsearch.index.Index;
2526
import org.elasticsearch.index.IndexNotFoundException;
27+
import org.elasticsearch.index.engine.CommitStats;
28+
import org.elasticsearch.index.engine.Engine;
2629
import org.elasticsearch.index.seqno.SeqNoStats;
2730
import org.elasticsearch.index.shard.ShardId;
2831
import org.elasticsearch.index.shard.ShardNotFoundException;
@@ -47,16 +50,19 @@
4750
import java.util.function.LongConsumer;
4851

4952
import static org.elasticsearch.xpack.ccr.CcrLicenseChecker.wrapClient;
53+
import static org.elasticsearch.xpack.ccr.action.TransportResumeFollowAction.extractLeaderShardHistoryUUIDs;
5054

5155
public class ShardFollowTasksExecutor extends PersistentTasksExecutor<ShardFollowTask> {
5256

5357
private final Client client;
5458
private final ThreadPool threadPool;
59+
private final ClusterService clusterService;
5560

56-
public ShardFollowTasksExecutor(Settings settings, Client client, ThreadPool threadPool) {
61+
public ShardFollowTasksExecutor(Settings settings, Client client, ThreadPool threadPool, ClusterService clusterService) {
5762
super(settings, ShardFollowTask.NAME, Ccr.CCR_THREAD_POOL_NAME);
5863
this.client = client;
5964
this.threadPool = threadPool;
65+
this.clusterService = clusterService;
6066
}
6167

6268
@Override
@@ -99,8 +105,10 @@ protected AllocatedPersistentTask createTask(long id, String type, String action
99105
}
100106
}
101107
};
102-
return new ShardFollowNodeTask(
103-
id, type, action, getDescription(taskInProgress), parentTaskId, headers, params, scheduler, System::nanoTime) {
108+
109+
final String recordedLeaderShardHistoryUUID = getLeaderShardHistoryUUID(params);
110+
return new ShardFollowNodeTask(id, type, action, getDescription(taskInProgress), parentTaskId, headers, params,
111+
scheduler, System::nanoTime) {
104112

105113
@Override
106114
protected void innerUpdateMapping(LongConsumer handler, Consumer<Exception> errorHandler) {
@@ -135,12 +143,14 @@ protected void innerUpdateMapping(LongConsumer handler, Consumer<Exception> erro
135143

136144
@Override
137145
protected void innerSendBulkShardOperationsRequest(
138-
final List<Translog.Operation> operations,
139-
final long maxSeqNoOfUpdatesOrDeletes,
140-
final Consumer<BulkShardOperationsResponse> handler,
141-
final Consumer<Exception> errorHandler) {
142-
final BulkShardOperationsRequest request = new BulkShardOperationsRequest(
143-
params.getFollowShardId(), operations, maxSeqNoOfUpdatesOrDeletes);
146+
final String followerHistoryUUID,
147+
final List<Translog.Operation> operations,
148+
final long maxSeqNoOfUpdatesOrDeletes,
149+
final Consumer<BulkShardOperationsResponse> handler,
150+
final Consumer<Exception> errorHandler) {
151+
152+
final BulkShardOperationsRequest request = new BulkShardOperationsRequest(params.getFollowShardId(),
153+
followerHistoryUUID, operations, maxSeqNoOfUpdatesOrDeletes);
144154
followerClient.execute(BulkShardOperationsAction.INSTANCE, request,
145155
ActionListener.wrap(response -> handler.accept(response), errorHandler));
146156
}
@@ -149,7 +159,7 @@ protected void innerSendBulkShardOperationsRequest(
149159
protected void innerSendShardChangesRequest(long from, int maxOperationCount, Consumer<ShardChangesAction.Response> handler,
150160
Consumer<Exception> errorHandler) {
151161
ShardChangesAction.Request request =
152-
new ShardChangesAction.Request(params.getLeaderShardId(), params.getRecordedLeaderIndexHistoryUUID());
162+
new ShardChangesAction.Request(params.getLeaderShardId(), recordedLeaderShardHistoryUUID);
153163
request.setFromSeqNo(from);
154164
request.setMaxOperationCount(maxOperationCount);
155165
request.setMaxBatchSize(params.getMaxBatchSize());
@@ -159,8 +169,15 @@ protected void innerSendShardChangesRequest(long from, int maxOperationCount, Co
159169
};
160170
}
161171

162-
interface BiLongConsumer {
163-
void accept(long x, long y);
172+
private String getLeaderShardHistoryUUID(ShardFollowTask params) {
173+
IndexMetaData followIndexMetaData = clusterService.state().metaData().index(params.getFollowShardId().getIndex());
174+
Map<String, String> ccrIndexMetadata = followIndexMetaData.getCustomData(Ccr.CCR_CUSTOM_METADATA_KEY);
175+
String[] recordedLeaderShardHistoryUUIDs = extractLeaderShardHistoryUUIDs(ccrIndexMetadata);
176+
return recordedLeaderShardHistoryUUIDs[params.getLeaderShardId().id()];
177+
}
178+
179+
interface FollowerStatsInfoHandler {
180+
void accept(String followerHistoryUUID, long globalCheckpoint, long maxSeqNo);
164181
}
165182

166183
@Override
@@ -169,7 +186,9 @@ protected void nodeOperation(final AllocatedPersistentTask task, final ShardFoll
169186
ShardFollowNodeTask shardFollowNodeTask = (ShardFollowNodeTask) task;
170187
logger.info("{} Starting to track leader shard {}", params.getFollowShardId(), params.getLeaderShardId());
171188

172-
BiLongConsumer handler = (followerGCP, maxSeqNo) -> shardFollowNodeTask.start(followerGCP, maxSeqNo, followerGCP, maxSeqNo);
189+
FollowerStatsInfoHandler handler = (followerHistoryUUID, followerGCP, maxSeqNo) -> {
190+
shardFollowNodeTask.start(followerHistoryUUID, followerGCP, maxSeqNo, followerGCP, maxSeqNo);
191+
};
173192
Consumer<Exception> errorHandler = e -> {
174193
if (shardFollowNodeTask.isStopped()) {
175194
return;
@@ -184,13 +203,13 @@ protected void nodeOperation(final AllocatedPersistentTask task, final ShardFoll
184203
}
185204
};
186205

187-
fetchGlobalCheckpoint(followerClient, params.getFollowShardId(), handler, errorHandler);
206+
fetchFollowerShardInfo(followerClient, params.getFollowShardId(), handler, errorHandler);
188207
}
189208

190-
private void fetchGlobalCheckpoint(
209+
private void fetchFollowerShardInfo(
191210
final Client client,
192211
final ShardId shardId,
193-
final BiLongConsumer handler,
212+
final FollowerStatsInfoHandler handler,
194213
final Consumer<Exception> errorHandler) {
195214
client.admin().indices().stats(new IndicesStatsRequest().indices(shardId.getIndexName()), ActionListener.wrap(r -> {
196215
IndexStats indexStats = r.getIndex(shardId.getIndexName());
@@ -204,10 +223,14 @@ private void fetchGlobalCheckpoint(
204223
.filter(shardStats -> shardStats.getShardRouting().primary())
205224
.findAny();
206225
if (filteredShardStats.isPresent()) {
207-
final SeqNoStats seqNoStats = filteredShardStats.get().getSeqNoStats();
226+
final ShardStats shardStats = filteredShardStats.get();
227+
final CommitStats commitStats = shardStats.getCommitStats();
228+
final String historyUUID = commitStats.getUserData().get(Engine.HISTORY_UUID_KEY);
229+
230+
final SeqNoStats seqNoStats = shardStats.getSeqNoStats();
208231
final long globalCheckpoint = seqNoStats.getGlobalCheckpoint();
209232
final long maxSeqNo = seqNoStats.getMaxSeqNo();
210-
handler.accept(globalCheckpoint, maxSeqNo);
233+
handler.accept(historyUUID, globalCheckpoint, maxSeqNo);
211234
} else {
212235
errorHandler.accept(new ShardNotFoundException(shardId));
213236
}

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/TransportPutFollowAction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ private void createFollowerIndex(
174174
listener::onFailure);
175175
// Can't use create index api here, because then index templates can alter the mappings / settings.
176176
// And index templates could introduce settings / mappings that are incompatible with the leader index.
177-
clusterService.submitStateUpdateTask("follow_index_action", new AckedClusterStateUpdateTask<Boolean>(request, handler) {
177+
clusterService.submitStateUpdateTask("create_following_index", new AckedClusterStateUpdateTask<Boolean>(request, handler) {
178178

179179
@Override
180180
protected Boolean newResponse(final boolean acknowledged) {

0 commit comments

Comments
 (0)