Skip to content

Commit 1bb0fa9

Browse files
authored
Fix autoexpand during node replace (#96281)
Prior to this change NodeReplacementAllocationDecider was unconditionally skipping both replacement source and target nodes when calculation auto-expand replicas. This is fixed by autoexpanding to the replacement node if source node already had shards of the index
1 parent 7a194c1 commit 1bb0fa9

File tree

7 files changed

+534
-177
lines changed

7 files changed

+534
-177
lines changed

docs/changelog/96281.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 96281
2+
summary: Fix autoexpand during node replace
3+
area: Allocation
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/cluster/node/DiscoveryNodes.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,20 @@ public DiscoveryNode findByAddress(TransportAddress address) {
298298
return null;
299299
}
300300

301+
/**
302+
* Check if a node with provided name exists
303+
*
304+
* @return {@code true} node identified with provided name exists or {@code false} otherwise
305+
*/
306+
public boolean hasByName(String name) {
307+
for (DiscoveryNode node : nodes.values()) {
308+
if (node.getName().equals(name)) {
309+
return true;
310+
}
311+
}
312+
return false;
313+
}
314+
301315
/**
302316
* Returns the version of the node with the oldest version in the cluster that is not a client node
303317
*

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDecider.java

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
import org.elasticsearch.cluster.routing.ShardRouting;
1616
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
1717

18+
/**
19+
* An allocation decider that ensures that all the shards allocated to the node scheduled for removal are relocated to the replacement node.
20+
* It also ensures that auto-expands replicas are expanded to only the replacement source or target (not both at the same time)
21+
* and only of the shards that were already present on the source node.
22+
*/
1823
public class NodeReplacementAllocationDecider extends AllocationDecider {
1924

2025
public static final String NAME = "node_replacement";
@@ -38,8 +43,8 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
3843
Decision.YES,
3944
NAME,
4045
"node [%s] is replacing node [%s], and may receive shards from it",
41-
shardRouting.currentNodeId(),
42-
node.nodeId()
46+
node.nodeId(),
47+
shardRouting.currentNodeId()
4348
);
4449
} else if (isReplacementSource(allocation, shardRouting.currentNodeId())) {
4550
if (allocation.isReconciling()) {
@@ -110,27 +115,64 @@ public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNod
110115
return YES__NO_REPLACEMENTS;
111116
} else if (isReplacementTargetName(allocation, node.getName())) {
112117
final SingleNodeShutdownMetadata shutdown = allocation.replacementTargetShutdowns().get(node.getName());
113-
return allocation.decision(
114-
Decision.NO,
115-
NAME,
116-
"node [%s] is a node replacement target for node [%s], "
117-
+ "shards cannot auto expand to be on it until the replacement is complete",
118-
node.getId(),
119-
shutdown == null ? null : shutdown.getNodeId()
120-
);
118+
final String sourceNodeId = shutdown != null ? shutdown.getNodeId() : null;
119+
final boolean hasShardsAllocatedOnSourceOrTarget = hasShardOnNode(indexMetadata, node.getId(), allocation)
120+
|| (sourceNodeId != null && hasShardOnNode(indexMetadata, sourceNodeId, allocation));
121+
122+
if (hasShardsAllocatedOnSourceOrTarget) {
123+
return allocation.decision(
124+
Decision.YES,
125+
NAME,
126+
"node [%s] is a node replacement target for node [%s], "
127+
+ "shard can auto expand to it as it was already present on the source node",
128+
node.getId(),
129+
sourceNodeId
130+
);
131+
} else {
132+
return allocation.decision(
133+
Decision.NO,
134+
NAME,
135+
"node [%s] is a node replacement target for node [%s], "
136+
+ "shards cannot auto expand to be on it until the replacement is complete",
137+
node.getId(),
138+
sourceNodeId
139+
);
140+
}
121141
} else if (isReplacementSource(allocation, node.getId())) {
122-
return allocation.decision(
123-
Decision.NO,
124-
NAME,
125-
"node [%s] is being replaced by [%s], shards cannot auto expand to be on it",
126-
node.getId(),
127-
getReplacementName(allocation, node.getId())
128-
);
142+
final SingleNodeShutdownMetadata shutdown = allocation.getClusterState().metadata().nodeShutdowns().get(node.getId());
143+
final String replacementNodeName = shutdown != null ? shutdown.getTargetNodeName() : null;
144+
final boolean hasShardOnSource = hasShardOnNode(indexMetadata, node.getId(), allocation)
145+
&& shutdown != null
146+
&& allocation.getClusterState().getNodes().hasByName(replacementNodeName) == false;
147+
148+
if (hasShardOnSource) {
149+
return allocation.decision(
150+
Decision.YES,
151+
NAME,
152+
"node [%s] is being replaced by [%s], shards can auto expand to be on it "
153+
+ "while replacement node has not joined the cluster",
154+
node.getId(),
155+
replacementNodeName
156+
);
157+
} else {
158+
return allocation.decision(
159+
Decision.NO,
160+
NAME,
161+
"node [%s] is being replaced by [%s], shards cannot auto expand to be on it",
162+
node.getId(),
163+
replacementNodeName
164+
);
165+
}
129166
} else {
130167
return YES__NO_APPLICABLE_REPLACEMENTS;
131168
}
132169
}
133170

171+
private static boolean hasShardOnNode(IndexMetadata indexMetadata, String nodeId, RoutingAllocation allocation) {
172+
RoutingNode node = allocation.routingNodes().node(nodeId);
173+
return node != null && node.numberOfOwningShardsForIndex(indexMetadata.getIndex()) >= 1;
174+
}
175+
134176
@Override
135177
public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
136178
if (replacementFromSourceToTarget(allocation, shardRouting.currentNodeId(), node.node().getName())) {

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDecider.java

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ public class NodeShutdownAllocationDecider extends AllocationDecider {
3333
*/
3434
@Override
3535
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
36-
return getDecision(allocation, node.nodeId());
36+
return getDecision(allocation, node.nodeId(), false);
3737
}
3838

3939
/**
@@ -51,10 +51,10 @@ public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting
5151
*/
5252
@Override
5353
public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNode node, RoutingAllocation allocation) {
54-
return getDecision(allocation, node.getId());
54+
return getDecision(allocation, node.getId(), true);
5555
}
5656

57-
private static Decision getDecision(RoutingAllocation allocation, String nodeId) {
57+
private static Decision getDecision(RoutingAllocation allocation, String nodeId, boolean canAllocateBeforeReplacementIsReady) {
5858
final var nodeShutdowns = allocation.metadata().nodeShutdowns().getAll();
5959
if (nodeShutdowns.isEmpty()) {
6060
return YES_EMPTY_SHUTDOWN_METADATA;
@@ -66,12 +66,16 @@ private static Decision getDecision(RoutingAllocation allocation, String nodeId)
6666
}
6767

6868
return switch (thisNodeShutdownMetadata.getType()) {
69-
case REPLACE, REMOVE, SIGTERM -> allocation.decision(
70-
Decision.NO,
71-
NAME,
72-
"node [%s] is preparing to be removed from the cluster",
73-
nodeId
74-
);
69+
case REMOVE, SIGTERM -> allocation.decision(Decision.NO, NAME, "node [%s] is preparing to be removed from the cluster", nodeId);
70+
case REPLACE -> canAllocateBeforeReplacementIsReady
71+
&& allocation.getClusterState().getNodes().hasByName(thisNodeShutdownMetadata.getTargetNodeName()) == false
72+
? allocation.decision(
73+
Decision.YES,
74+
NAME,
75+
"node [%s] is preparing to be removed from the cluster, but replacement is not yet present",
76+
nodeId
77+
)
78+
: allocation.decision(Decision.NO, NAME, "node [%s] is preparing to be removed from the cluster", nodeId);
7579
case RESTART -> allocation.decision(
7680
Decision.YES,
7781
NAME,

0 commit comments

Comments
 (0)