Skip to content

Commit 2603391

Browse files
authored
Add node id to shard failure message (#28024)
Including the node id in the shard failure message lets the allocation explain API show which node a shard was last allocated to before it failed. Closes #28018
1 parent 100a7b1 commit 2603391

File tree

3 files changed: 14 additions & 12 deletions

core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
181181
shardToFail.shardId(), shardToFail, failedShard);
182182
}
183183
int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
184-
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShardEntry.getMessage(),
184+
String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
185+
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message,
185186
failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
186187
AllocationStatus.NO_ATTEMPT);
187188
routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetaData, allocation.changes());

core/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ public void testFailedShard() {
260260
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(1));
261261
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo(), notNullValue());
262262
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
263-
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getMessage(), equalTo("test fail"));
264-
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getDetails(), equalTo("test fail"));
263+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getMessage(), equalTo("failed shard on node [" + shardToFail.currentNodeId() + "]: test fail"));
264+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getDetails(), equalTo("failed shard on node [" + shardToFail.currentNodeId() + "]: test fail"));
265265
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getUnassignedTimeInMillis(), greaterThan(0L));
266266
}
267267

core/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
4444
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
4545
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
46+
import static org.hamcrest.Matchers.containsString;
4647
import static org.hamcrest.Matchers.equalTo;
4748
import static org.hamcrest.Matchers.not;
4849

@@ -98,7 +99,7 @@ public void testSingleRetryOnIgnore() {
9899
assertEquals(routingTable.index("idx").shards().size(), 1);
99100
assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), INITIALIZING);
100101
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), i+1);
101-
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom" + i);
102+
assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom" + i));
102103
}
103104
// now check that the shard actually sticks to unassigned on the next failure
104105
List<FailedShard> failedShards = Collections.singletonList(
@@ -111,7 +112,7 @@ public void testSingleRetryOnIgnore() {
111112
assertEquals(routingTable.index("idx").shards().size(), 1);
112113
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), retries);
113114
assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), UNASSIGNED);
114-
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom");
115+
assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
115116

116117
// manual resetting of retry count
117118
newState = strategy.reroute(clusterState, new AllocationCommands(), false, true).getClusterState();
@@ -123,7 +124,7 @@ public void testSingleRetryOnIgnore() {
123124
assertEquals(routingTable.index("idx").shards().size(), 1);
124125
assertEquals(0, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
125126
assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
126-
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom");
127+
assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
127128

128129
// again fail it N-1 times
129130
for (int i = 0; i < retries-1; i++) {
@@ -138,7 +139,7 @@ public void testSingleRetryOnIgnore() {
138139
assertEquals(routingTable.index("idx").shards().size(), 1);
139140
assertEquals(i + 1, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
140141
assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
141-
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom");
142+
assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
142143
}
143144

144145
// now check that the shard actually sticks to unassigned on the next failure
@@ -152,7 +153,7 @@ public void testSingleRetryOnIgnore() {
152153
assertEquals(routingTable.index("idx").shards().size(), 1);
153154
assertEquals(retries, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
154155
assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
155-
assertEquals("boom", routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage());
156+
assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
156157
}
157158

158159
public void testFailedAllocation() {
@@ -172,7 +173,7 @@ public void testFailedAllocation() {
172173
ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
173174
assertEquals(unassignedPrimary.state(), INITIALIZING);
174175
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), i+1);
175-
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom" + i);
176+
assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom" + i));
176177
// MaxRetryAllocationDecider#canForceAllocatePrimary should return YES decisions because canAllocate returns YES here
177178
assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
178179
unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));
@@ -190,7 +191,7 @@ public void testFailedAllocation() {
190191
ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
191192
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries);
192193
assertEquals(unassignedPrimary.state(), UNASSIGNED);
193-
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom");
194+
assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom"));
194195
// MaxRetryAllocationDecider#canForceAllocatePrimary should return a NO decision because canAllocate returns NO here
195196
assertEquals(Decision.NO, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
196197
unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));
@@ -212,7 +213,7 @@ public void testFailedAllocation() {
212213
ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
213214
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries);
214215
assertEquals(unassignedPrimary.state(), INITIALIZING);
215-
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom");
216+
assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom"));
216217
// bumped up the max retry count, so canForceAllocatePrimary should return a YES decision
217218
assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
218219
routingTable.index("idx").shard(0).shards().get(0), null, new RoutingAllocation(null, null, clusterState, null, 0)));
@@ -239,7 +240,7 @@ public void testFailedAllocation() {
239240
unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
240241
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), 1);
241242
assertEquals(unassignedPrimary.state(), UNASSIGNED);
242-
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "ZOOOMG");
243+
assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("ZOOOMG"));
243244
// Counter reset, so MaxRetryAllocationDecider#canForceAllocatePrimary should return a YES decision
244245
assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
245246
unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));

0 commit comments

Comments
 (0)