Skip to content

Commit 0096f1b

Browse files
authored
Ensure changes requests return the latest mapping version (#37633)
Today we keep the mapping on the follower in sync with the leader's using the mapping version from changes requests. There are two rare cases where the mapping on the follower is not synced properly: 1. The returned mapping version (from ClusterService) is outdated than the actual mapping. This happens because we expose the latest cluster state in ClusterService after applying it to IndexService. 2. It's possible for the FollowTask to receive an outdated mapping than the min_required_mapping. In that case, it should fetch the mapping again; otherwise, the follower won't have the right mapping. Relates to #31140
1 parent d7fe4e5 commit 0096f1b

13 files changed

+159
-56
lines changed

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/Ccr.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,7 @@ public List<PersistentTasksExecutor<?>> getPersistentTasksExecutor(ClusterServic
179179
ThreadPool threadPool,
180180
Client client,
181181
SettingsModule settingsModule) {
182-
IndexScopedSettings indexScopedSettings = settingsModule.getIndexScopedSettings();
183-
return Collections.singletonList(new ShardFollowTasksExecutor(client, threadPool, clusterService, indexScopedSettings));
182+
return Collections.singletonList(new ShardFollowTasksExecutor(client, threadPool, clusterService, settingsModule));
184183
}
185184

186185
public List<ActionHandler<? extends ActionRequest, ? extends ActionResponse>> getActions() {

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/CcrSettings.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,17 @@ public final class CcrSettings {
3030
Setting.boolSetting("index.xpack.ccr.following_index", false, Property.IndexScope, Property.InternalIndex);
3131

3232
/**
33-
* Dynamic node setting for specifying the wait_for_timeout that the auto follow coordinator should be using.
33+
* Dynamic node setting for specifying the wait_for_timeout that the auto follow coordinator and shard follow task should be using.
3434
*/
35-
public static final Setting<TimeValue> CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT = Setting.timeSetting(
36-
"ccr.auto_follow.wait_for_metadata_timeout", TimeValue.timeValueSeconds(60), Property.NodeScope, Property.Dynamic);
35+
public static final Setting<TimeValue> CCR_WAIT_FOR_METADATA_TIMEOUT = Setting.timeSetting(
36+
"ccr.wait_for_metadata_timeout", TimeValue.timeValueSeconds(60), Property.NodeScope, Property.Dynamic);
3737

38+
/**
39+
* Dynamic node setting for specifying the wait_for_timeout that the auto follow coordinator should be using.
40+
* TODO: Deprecate and remove this setting
41+
*/
42+
private static final Setting<TimeValue> CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT = Setting.timeSetting(
43+
"ccr.auto_follow.wait_for_metadata_timeout", CCR_WAIT_FOR_METADATA_TIMEOUT, Property.NodeScope, Property.Dynamic);
3844

3945
/**
4046
* Max bytes a node can recover per second.
@@ -62,7 +68,8 @@ static List<Setting<?>> getSettings() {
6268
CCR_FOLLOWING_INDEX_SETTING,
6369
RECOVERY_MAX_BYTES_PER_SECOND,
6470
INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING,
65-
CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT);
71+
CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT,
72+
CCR_WAIT_FOR_METADATA_TIMEOUT);
6673
}
6774

6875
private final CombinedRateLimiter ccrRateLimiter;

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/AutoFollowCoordinator.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ protected boolean removeEldestEntry(final Map.Entry<String, Tuple<Long, Elastics
113113
waitForMetadataTimeOut = newWaitForTimeOut;
114114
}
115115
};
116-
clusterService.getClusterSettings().addSettingsUpdateConsumer(CcrSettings.CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT, updater);
117-
waitForMetadataTimeOut = CcrSettings.CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT.get(settings);
116+
clusterService.getClusterSettings().addSettingsUpdateConsumer(CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT, updater);
117+
waitForMetadataTimeOut = CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT.get(settings);
118118
}
119119

120120
public synchronized AutoFollowStats getStats() {

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardChangesAction.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -335,19 +335,21 @@ protected Response shardOperation(Request request, ShardId shardId) throws IOExc
335335
final IndexService indexService = indicesService.indexServiceSafe(request.getShard().getIndex());
336336
final IndexShard indexShard = indexService.getShard(request.getShard().id());
337337
final SeqNoStats seqNoStats = indexShard.seqNoStats();
338-
final IndexMetaData indexMetaData = clusterService.state().metaData().index(shardId.getIndex());
339-
final long mappingVersion = indexMetaData.getMappingVersion();
340-
final long settingsVersion = indexMetaData.getSettingsVersion();
341-
342338
final Translog.Operation[] operations = getOperations(
343339
indexShard,
344340
seqNoStats.getGlobalCheckpoint(),
345341
request.getFromSeqNo(),
346342
request.getMaxOperationCount(),
347343
request.getExpectedHistoryUUID(),
348344
request.getMaxBatchSize());
349-
// must capture after after snapshotting operations to ensure this MUS is at least the highest MUS of any of these operations.
345+
// must capture after snapshotting operations to ensure this MUS is at least the highest MUS of any of these operations.
350346
final long maxSeqNoOfUpdatesOrDeletes = indexShard.getMaxSeqNoOfUpdatesOrDeletes();
347+
// must capture IndexMetaData after snapshotting operations to ensure the returned mapping version is at least as up-to-date
348+
// as the mapping version that these operations used. Here we must not use IndexMetaData from ClusterService for we expose
349+
// a new cluster state to ClusterApplier(s) before exposing it in the ClusterService.
350+
final IndexMetaData indexMetaData = indexService.getMetaData();
351+
final long mappingVersion = indexMetaData.getMappingVersion();
352+
final long settingsVersion = indexMetaData.getSettingsVersion();
351353
return getResponse(
352354
mappingVersion,
353355
settingsVersion,

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowNodeTask.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ void start(
130130
}
131131

132132
// updates follower mapping, this gets us the leader mapping version and makes sure that leader and follower mapping are identical
133-
updateMapping(followerMappingVersion -> {
133+
updateMapping(0L, followerMappingVersion -> {
134134
synchronized (ShardFollowNodeTask.this) {
135135
currentMappingVersion = followerMappingVersion;
136136
}
@@ -370,15 +370,15 @@ private synchronized void handleWriteResponse(final BulkShardOperationsResponse
370370
coordinateReads();
371371
}
372372

373-
private synchronized void maybeUpdateMapping(Long minimumRequiredMappingVersion, Runnable task) {
373+
private synchronized void maybeUpdateMapping(long minimumRequiredMappingVersion, Runnable task) {
374374
if (currentMappingVersion >= minimumRequiredMappingVersion) {
375375
LOGGER.trace("{} mapping version [{}] is higher or equal than minimum required mapping version [{}]",
376376
params.getFollowShardId(), currentMappingVersion, minimumRequiredMappingVersion);
377377
task.run();
378378
} else {
379379
LOGGER.trace("{} updating mapping, mapping version [{}] is lower than minimum required mapping version [{}]",
380380
params.getFollowShardId(), currentMappingVersion, minimumRequiredMappingVersion);
381-
updateMapping(mappingVersion -> {
381+
updateMapping(minimumRequiredMappingVersion, mappingVersion -> {
382382
currentMappingVersion = mappingVersion;
383383
task.run();
384384
});
@@ -400,12 +400,13 @@ private synchronized void maybeUpdateSettings(final Long minimumRequiredSettings
400400
}
401401
}
402402

403-
private void updateMapping(LongConsumer handler) {
404-
updateMapping(handler, new AtomicInteger(0));
403+
private void updateMapping(long minRequiredMappingVersion, LongConsumer handler) {
404+
updateMapping(minRequiredMappingVersion, handler, new AtomicInteger(0));
405405
}
406406

407-
private void updateMapping(LongConsumer handler, AtomicInteger retryCounter) {
408-
innerUpdateMapping(handler, e -> handleFailure(e, retryCounter, () -> updateMapping(handler, retryCounter)));
407+
private void updateMapping(long minRequiredMappingVersion, LongConsumer handler, AtomicInteger retryCounter) {
408+
innerUpdateMapping(minRequiredMappingVersion, handler,
409+
e -> handleFailure(e, retryCounter, () -> updateMapping(minRequiredMappingVersion, handler, retryCounter)));
409410
}
410411

411412
private void updateSettings(final LongConsumer handler) {
@@ -471,7 +472,7 @@ static boolean shouldRetry(String remoteCluster, Exception e) {
471472
}
472473

473474
// These methods are protected for testing purposes:
474-
protected abstract void innerUpdateMapping(LongConsumer handler, Consumer<Exception> errorHandler);
475+
protected abstract void innerUpdateMapping(long minRequiredMappingVersion, LongConsumer handler, Consumer<Exception> errorHandler);
475476

476477
protected abstract void innerUpdateSettings(LongConsumer handler, Consumer<Exception> errorHandler);
477478

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTasksExecutor.java

Lines changed: 61 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,13 @@
2424
import org.elasticsearch.cluster.ClusterState;
2525
import org.elasticsearch.cluster.metadata.IndexMetaData;
2626
import org.elasticsearch.cluster.metadata.MappingMetaData;
27+
import org.elasticsearch.cluster.metadata.MetaData;
2728
import org.elasticsearch.cluster.routing.IndexRoutingTable;
2829
import org.elasticsearch.cluster.service.ClusterService;
2930
import org.elasticsearch.common.CheckedConsumer;
3031
import org.elasticsearch.common.settings.IndexScopedSettings;
3132
import org.elasticsearch.common.settings.Settings;
33+
import org.elasticsearch.common.settings.SettingsModule;
3234
import org.elasticsearch.common.unit.TimeValue;
3335
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
3436
import org.elasticsearch.index.Index;
@@ -46,6 +48,7 @@
4648
import org.elasticsearch.tasks.TaskId;
4749
import org.elasticsearch.threadpool.ThreadPool;
4850
import org.elasticsearch.xpack.ccr.Ccr;
51+
import org.elasticsearch.xpack.ccr.CcrSettings;
4952
import org.elasticsearch.xpack.ccr.action.bulk.BulkShardOperationsAction;
5053
import org.elasticsearch.xpack.ccr.action.bulk.BulkShardOperationsRequest;
5154
import org.elasticsearch.xpack.ccr.action.bulk.BulkShardOperationsResponse;
@@ -69,16 +72,20 @@ public class ShardFollowTasksExecutor extends PersistentTasksExecutor<ShardFollo
6972
private final ThreadPool threadPool;
7073
private final ClusterService clusterService;
7174
private final IndexScopedSettings indexScopedSettings;
75+
private volatile TimeValue waitForMetadataTimeOut;
7276

7377
public ShardFollowTasksExecutor(Client client,
7478
ThreadPool threadPool,
7579
ClusterService clusterService,
76-
IndexScopedSettings indexScopedSettings) {
80+
SettingsModule settingsModule) {
7781
super(ShardFollowTask.NAME, Ccr.CCR_THREAD_POOL_NAME);
7882
this.client = client;
7983
this.threadPool = threadPool;
8084
this.clusterService = clusterService;
81-
this.indexScopedSettings = indexScopedSettings;
85+
this.indexScopedSettings = settingsModule.getIndexScopedSettings();
86+
this.waitForMetadataTimeOut = CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT.get(settingsModule.getSettings());
87+
clusterService.getClusterSettings().addSettingsUpdateConsumer(CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT,
88+
newVal -> this.waitForMetadataTimeOut = newVal);
8289
}
8390

8491
@Override
@@ -112,33 +119,25 @@ protected AllocatedPersistentTask createTask(long id, String type, String action
112119
scheduler, System::nanoTime) {
113120

114121
@Override
115-
protected void innerUpdateMapping(LongConsumer handler, Consumer<Exception> errorHandler) {
116-
Index leaderIndex = params.getLeaderShardId().getIndex();
117-
Index followIndex = params.getFollowShardId().getIndex();
118-
119-
ClusterStateRequest clusterStateRequest = CcrRequests.metaDataRequest(leaderIndex.getName());
120-
CheckedConsumer<ClusterStateResponse, Exception> onResponse = clusterStateResponse -> {
121-
IndexMetaData indexMetaData = clusterStateResponse.getState().metaData().getIndexSafe(leaderIndex);
122-
if (indexMetaData.getMappings().isEmpty()) {
123-
assert indexMetaData.getMappingVersion() == 1;
124-
handler.accept(indexMetaData.getMappingVersion());
125-
return;
126-
}
127-
128-
assert indexMetaData.getMappings().size() == 1 : "expected exactly one mapping, but got [" +
129-
indexMetaData.getMappings().size() + "]";
130-
MappingMetaData mappingMetaData = indexMetaData.getMappings().iterator().next().value;
131-
132-
PutMappingRequest putMappingRequest = CcrRequests.putMappingRequest(followIndex.getName(), mappingMetaData);
133-
followerClient.admin().indices().putMapping(putMappingRequest, ActionListener.wrap(
134-
putMappingResponse -> handler.accept(indexMetaData.getMappingVersion()),
135-
errorHandler));
136-
};
137-
try {
138-
remoteClient(params).admin().cluster().state(clusterStateRequest, ActionListener.wrap(onResponse, errorHandler));
139-
} catch (Exception e) {
140-
errorHandler.accept(e);
141-
}
122+
protected void innerUpdateMapping(long minRequiredMappingVersion, LongConsumer handler, Consumer<Exception> errorHandler) {
123+
final Index followerIndex = params.getFollowShardId().getIndex();
124+
getIndexMetadata(minRequiredMappingVersion, 0L, params, ActionListener.wrap(
125+
indexMetaData -> {
126+
if (indexMetaData.getMappings().isEmpty()) {
127+
assert indexMetaData.getMappingVersion() == 1;
128+
handler.accept(indexMetaData.getMappingVersion());
129+
return;
130+
}
131+
assert indexMetaData.getMappings().size() == 1 : "expected exactly one mapping, but got [" +
132+
indexMetaData.getMappings().size() + "]";
133+
MappingMetaData mappingMetaData = indexMetaData.getMappings().iterator().next().value;
134+
PutMappingRequest putMappingRequest = CcrRequests.putMappingRequest(followerIndex.getName(), mappingMetaData);
135+
followerClient.admin().indices().putMapping(putMappingRequest, ActionListener.wrap(
136+
putMappingResponse -> handler.accept(indexMetaData.getMappingVersion()),
137+
errorHandler));
138+
},
139+
errorHandler
140+
));
142141
}
143142

144143
@Override
@@ -257,6 +256,39 @@ private Client remoteClient(ShardFollowTask params) {
257256
return wrapClient(client.getRemoteClusterClient(params.getRemoteCluster()), params.getHeaders());
258257
}
259258

259+
private void getIndexMetadata(long minRequiredMappingVersion, long minRequiredMetadataVersion,
260+
ShardFollowTask params, ActionListener<IndexMetaData> listener) {
261+
final Index leaderIndex = params.getLeaderShardId().getIndex();
262+
final ClusterStateRequest clusterStateRequest = CcrRequests.metaDataRequest(leaderIndex.getName());
263+
if (minRequiredMetadataVersion > 0) {
264+
clusterStateRequest.waitForMetaDataVersion(minRequiredMetadataVersion).waitForTimeout(waitForMetadataTimeOut);
265+
}
266+
try {
267+
remoteClient(params).admin().cluster().state(clusterStateRequest, ActionListener.wrap(
268+
r -> {
269+
// if wait_for_metadata_version timeout, the response is empty
270+
if (r.getState() == null) {
271+
assert minRequiredMetadataVersion > 0;
272+
getIndexMetadata(minRequiredMappingVersion, minRequiredMetadataVersion, params, listener);
273+
return;
274+
}
275+
final MetaData metaData = r.getState().metaData();
276+
final IndexMetaData indexMetaData = metaData.getIndexSafe(leaderIndex);
277+
if (indexMetaData.getMappingVersion() < minRequiredMappingVersion) {
278+
// ask for the next version.
279+
getIndexMetadata(minRequiredMappingVersion, metaData.version() + 1, params, listener);
280+
} else {
281+
assert metaData.version() >= minRequiredMetadataVersion : metaData.version() + " < " + minRequiredMetadataVersion;
282+
listener.onResponse(indexMetaData);
283+
}
284+
},
285+
listener::onFailure
286+
));
287+
} catch (Exception e) {
288+
listener.onFailure(e);
289+
}
290+
}
291+
260292
interface FollowerStatsInfoHandler {
261293
void accept(String followerHistoryUUID, long globalCheckpoint, long maxSeqNo);
262294
}

x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/CcrIntegTestCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ private NodeConfigurationSource createNodeConfigurationSource(String leaderSeedA
201201
builder.put(XPackSettings.LOGSTASH_ENABLED.getKey(), false);
202202
builder.put(LicenseService.SELF_GENERATED_LICENSE_TYPE.getKey(), "trial");
203203
// Let cluster state api return quickly in order to speed up auto follow tests:
204-
builder.put(CcrSettings.CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT.getKey(), TimeValue.timeValueMillis(100));
204+
builder.put(CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT.getKey(), TimeValue.timeValueMillis(100));
205205
if (configureRemoteClusterViaNodeSettings() && leaderSeedAddress != null) {
206206
builder.put("cluster.remote.leader_cluster.seeds", leaderSeedAddress);
207207
}

x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/CcrSingleNodeTestCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ protected Settings nodeSettings() {
4646
builder.put(XPackSettings.LOGSTASH_ENABLED.getKey(), false);
4747
builder.put(LicenseService.SELF_GENERATED_LICENSE_TYPE.getKey(), "trial");
4848
// Let cluster state api return quickly in order to speed up auto follow tests:
49-
builder.put(CcrSettings.CCR_AUTO_FOLLOW_WAIT_FOR_METADATA_TIMEOUT.getKey(), TimeValue.timeValueMillis(100));
49+
builder.put(CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT.getKey(), TimeValue.timeValueMillis(100));
5050
return builder.build();
5151
}
5252

0 commit comments

Comments
 (0)