You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Write shard state metadata as soon as shard is created / initializing
As we rely on active allocation ids persisted in the cluster state to select
the primary shard copy, we can write shard state metadata on the allocated node
as soon as the node knows about receiving this shard. This also ensures that
in case of primary relocation, when the relocation target is marked as started
by the master node, the shard state metadata with the correct allocation id has
already been written on the relocation target. Before this change, shard state
metadata was only written once the node knows it is marked as started. In case
of failures between master marking the node as started and the node
receiving and processing this event, the relation between the shard copy on disk
and the cluster state could get lost. This means that manual allocation of
the shard using the reroute command allocate_stale_primary was necessary.
Closes#16625
Copy file name to clipboardExpand all lines: core/src/main/java/org/elasticsearch/gateway/PrimaryShardAllocator.java
+20-7Lines changed: 20 additions & 7 deletions
Original file line number
Diff line number
Diff line change
@@ -187,12 +187,14 @@ protected NodeShardsResult buildAllocationIdBasedNodeShardsResult(ShardRouting s
187
187
}
188
188
189
189
if (nodeShardState.storeException() == null) {
190
-
if (allocationId == null && nodeShardState.legacyVersion() != ShardStateMetaData.NO_VERSION) {
191
-
// old shard with no allocation id, assign dummy value so that it gets added below in case of matchAnyShard
192
-
allocationId = "_n/a_";
190
+
if (allocationId == null && nodeShardState.legacyVersion() == ShardStateMetaData.NO_VERSION) {
191
+
logger.trace("[{}] on node [{}] has no shard state information", shard, nodeShardState.getNode());
192
+
} elseif (allocationId != null) {
193
+
assertnodeShardState.legacyVersion() == ShardStateMetaData.NO_VERSION : "Allocation id and legacy version cannot be both present";
194
+
logger.trace("[{}] on node [{}] has allocation id [{}]", shard, nodeShardState.getNode(), allocationId);
195
+
} else {
196
+
logger.trace("[{}] on node [{}] has no allocation id, out-dated shard (shard state version: [{}])", shard, nodeShardState.getNode(), nodeShardState.legacyVersion());
193
197
}
194
-
195
-
logger.trace("[{}] on node [{}] has allocation id [{}] of shard", shard, nodeShardState.getNode(), allocationId);
196
198
} else {
197
199
logger.trace("[{}] on node [{}] has allocation id [{}] but the store can not be opened, treating as no allocation id", nodeShardState.storeException(), shard, nodeShardState.getNode(), allocationId);
198
200
allocationId = null;
@@ -299,9 +301,20 @@ NodeShardsResult buildVersionBasedNodeShardsResult(ShardRouting shard, boolean m
299
301
continue;
300
302
}
301
303
302
-
// no version means it does not exists, which is what the API returns, and what we expect to
303
304
if (nodeShardState.storeException() == null) {
304
-
logger.trace("[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version);
305
+
if (version == ShardStateMetaData.NO_VERSION && nodeShardState.allocationId() == null) {
306
+
logger.trace("[{}] on node [{}] has no shard state information", shard, nodeShardState.getNode());
assertnodeShardState.allocationId() == null : "Allocation id and legacy version cannot be both present";
309
+
logger.trace("[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version);
310
+
} else {
311
+
// shard was already selected in a 5.x cluster as primary for recovery, was initialized (and wrote a new state file) but
312
+
// did not make it to STARTED state before the cluster crashed (otherwise list of active allocation ids would be
313
+
// non-empty and allocation id - based allocation mode would be chosen).
314
+
// Prefer this shard copy again.
315
+
version = Long.MAX_VALUE;
316
+
logger.trace("[{}] on node [{}] has allocation id [{}]", shard, nodeShardState.getNode(), nodeShardState.allocationId());
317
+
}
305
318
} else {
306
319
// when there is an store exception, we disregard the reported version and assign it as no version (same as shard does not exist)
307
320
logger.trace("[{}] on node [{}] has version [{}] but the store can not be opened, treating no version", nodeShardState.storeException(), shard, nodeShardState.getNode(), version);
thrownewIllegalArgumentException("Trying to set a routing entry with shardId [" + newRouting.shardId() + "] on a shard with shardId [" + shardId() + "]");
327
322
}
328
323
if ((currentRouting == null || newRouting.isSameAllocation(currentRouting)) == false) {
329
324
thrownewIllegalArgumentException("Trying to set a routing entry with a different allocation. Current " + currentRouting + ", new " + newRouting);
330
325
}
331
-
try {
332
-
if (currentRouting != null) {
333
-
if (!newRouting.primary() && currentRouting.primary()) {
334
-
logger.warn("suspect illegal state: trying to move shard from primary mode to replica mode");
335
-
}
336
-
// if its the same routing, return
337
-
if (currentRouting.equals(newRouting)) {
338
-
return;
339
-
}
326
+
if (currentRouting != null) {
327
+
if (!newRouting.primary() && currentRouting.primary()) {
328
+
logger.warn("suspect illegal state: trying to move shard from primary mode to replica mode");
329
+
}
330
+
// if its the same routing, return
331
+
if (currentRouting.equals(newRouting)) {
332
+
return;
340
333
}
334
+
}
341
335
342
-
if (state == IndexShardState.POST_RECOVERY) {
343
-
// if the state is started or relocating (cause it might move right away from started to relocating)
344
-
// then move to STARTED
345
-
if (newRouting.state() == ShardRoutingState.STARTED || newRouting.state() == ShardRoutingState.RELOCATING) {
346
-
// we want to refresh *before* we move to internal STARTED state
347
-
try {
348
-
getEngine().refresh("cluster_state_started");
349
-
} catch (Throwablet) {
350
-
logger.debug("failed to refresh due to move to cluster wide started", t);
351
-
}
336
+
if (state == IndexShardState.POST_RECOVERY) {
337
+
// if the state is started or relocating (cause it might move right away from started to relocating)
338
+
// then move to STARTED
339
+
if (newRouting.state() == ShardRoutingState.STARTED || newRouting.state() == ShardRoutingState.RELOCATING) {
340
+
// we want to refresh *before* we move to internal STARTED state
341
+
try {
342
+
getEngine().refresh("cluster_state_started");
343
+
} catch (Throwablet) {
344
+
logger.debug("failed to refresh due to move to cluster wide started", t);
345
+
}
352
346
353
-
booleanmovedToStarted = false;
354
-
synchronized (mutex) {
355
-
// do the check under a mutex, so we make sure to only change to STARTED if in POST_RECOVERY
356
-
if (state == IndexShardState.POST_RECOVERY) {
357
-
changeState(IndexShardState.STARTED, "global state is [" + newRouting.state() + "]");
358
-
movedToStarted = true;
359
-
} else {
360
-
logger.debug("state [{}] not changed, not in POST_RECOVERY, global state is [{}]", state, newRouting.state());
361
-
}
362
-
}
363
-
if (movedToStarted) {
364
-
indexEventListener.afterIndexShardStarted(this);
347
+
booleanmovedToStarted = false;
348
+
synchronized (mutex) {
349
+
// do the check under a mutex, so we make sure to only change to STARTED if in POST_RECOVERY
350
+
if (state == IndexShardState.POST_RECOVERY) {
351
+
changeState(IndexShardState.STARTED, "global state is [" + newRouting.state() + "]");
352
+
movedToStarted = true;
353
+
} else {
354
+
logger.debug("state [{}] not changed, not in POST_RECOVERY, global state is [{}]", state, newRouting.state());
0 commit comments