27
27
import org .elasticsearch .cluster .routing .RoutingNode ;
28
28
import org .elasticsearch .cluster .routing .RoutingNodes ;
29
29
import org .elasticsearch .cluster .routing .ShardRouting ;
30
+ import org .elasticsearch .cluster .routing .UnassignedInfo ;
30
31
import org .elasticsearch .cluster .routing .UnassignedInfo .AllocationStatus ;
31
32
import org .elasticsearch .cluster .routing .allocation .RoutingAllocation ;
32
33
import org .elasticsearch .cluster .routing .allocation .decider .Decision ;
33
34
import org .elasticsearch .common .component .AbstractComponent ;
35
+ import org .elasticsearch .common .logging .ESLogger ;
34
36
import org .elasticsearch .common .settings .Setting ;
35
37
import org .elasticsearch .common .settings .Setting .Property ;
36
38
import org .elasticsearch .common .settings .Settings ;
47
49
import java .util .stream .Collectors ;
48
50
49
51
/**
50
- * The primary shard allocator allocates primary shard that were not created as
51
- * a result of an API to a node that held them last to be recovered.
52
+ * The primary shard allocator allocates unassigned primary shards to nodes that hold
53
+ * valid copies of the unassigned primaries. It does this by iterating over all unassigned
54
+ * primary shards in the routing table and fetching shard metadata from each node in the cluster
55
+ * that holds a copy of the shard. The shard metadata from each node is compared against the
56
+ * set of valid allocation IDs and for all valid shard copies (if any), the primary shard allocator
57
+ * executes the allocation deciders to choose a copy to assign the primary shard to.
58
+ *
59
+ * Note that the PrimaryShardAllocator does *not* allocate primaries on index creation
60
+ * (see {@link org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator}),
61
+ * nor does it allocate primaries when a primary shard failed and there is a valid replica
62
+ * copy that can immediately be promoted to primary, as this takes place in
63
+ * {@link RoutingNodes#failShard(ESLogger, ShardRouting, UnassignedInfo, IndexMetaData)}.
52
64
*/
53
65
public abstract class PrimaryShardAllocator extends AbstractComponent {
54
66
@@ -154,17 +166,35 @@ public boolean allocateUnassigned(RoutingAllocation allocation) {
154
166
continue ;
155
167
}
156
168
157
- final NodesToAllocate nodesToAllocate = buildNodesToAllocate (shard , allocation , nodeShardsResult .orderedAllocationCandidates );
169
+ final NodesToAllocate nodesToAllocate = buildNodesToAllocate (
170
+ allocation , nodeShardsResult .orderedAllocationCandidates , shard , false
171
+ );
158
172
if (nodesToAllocate .yesNodeShards .isEmpty () == false ) {
159
173
NodeGatewayStartedShards nodeShardState = nodesToAllocate .yesNodeShards .get (0 );
160
174
logger .debug ("[{}][{}]: allocating [{}] to [{}] on primary allocation" , shard .index (), shard .id (), shard , nodeShardState .getNode ());
161
175
changed = true ;
162
176
unassignedIterator .initialize (nodeShardState .getNode ().getId (), nodeShardState .allocationId (), ShardRouting .UNAVAILABLE_EXPECTED_SHARD_SIZE );
163
177
} else if (nodesToAllocate .throttleNodeShards .isEmpty () == true && nodesToAllocate .noNodeShards .isEmpty () == false ) {
164
- NodeGatewayStartedShards nodeShardState = nodesToAllocate .noNodeShards .get (0 );
165
- logger .debug ("[{}][{}]: forcing allocating [{}] to [{}] on primary allocation" , shard .index (), shard .id (), shard , nodeShardState .getNode ());
166
- changed = true ;
167
- unassignedIterator .initialize (nodeShardState .getNode ().getId (), nodeShardState .allocationId (), ShardRouting .UNAVAILABLE_EXPECTED_SHARD_SIZE );
178
+ // The deciders returned a NO decision for all nodes with shard copies, so we check if the primary shard
179
+ // can be force-allocated to one of the nodes.
180
+ final NodesToAllocate nodesToForceAllocate = buildNodesToAllocate (
181
+ allocation , nodeShardsResult .orderedAllocationCandidates , shard , true
182
+ );
183
+ if (nodesToForceAllocate .yesNodeShards .isEmpty () == false ) {
184
+ NodeGatewayStartedShards nodeShardState = nodesToForceAllocate .yesNodeShards .get (0 );
185
+ logger .debug ("[{}][{}]: allocating [{}] to [{}] on forced primary allocation" ,
186
+ shard .index (), shard .id (), shard , nodeShardState .getNode ());
187
+ changed = true ;
188
+ unassignedIterator .initialize (nodeShardState .getNode ().getId (), nodeShardState .allocationId (),
189
+ ShardRouting .UNAVAILABLE_EXPECTED_SHARD_SIZE );
190
+ } else if (nodesToForceAllocate .throttleNodeShards .isEmpty () == false ) {
191
+ logger .debug ("[{}][{}]: throttling allocation [{}] to [{}] on forced primary allocation" ,
192
+ shard .index (), shard .id (), shard , nodesToForceAllocate .throttleNodeShards );
193
+ changed |= unassignedIterator .removeAndIgnore (AllocationStatus .DECIDERS_THROTTLED );
194
+ } else {
195
+ logger .debug ("[{}][{}]: forced primary allocation denied [{}]" , shard .index (), shard .id (), shard );
196
+ changed |= unassignedIterator .removeAndIgnore (AllocationStatus .DECIDERS_NO );
197
+ }
168
198
} else {
169
199
// we are throttling this, but we have enough to allocate to this node, ignore it for now
170
200
logger .debug ("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation" , shard .index (), shard .id (), shard , nodesToAllocate .throttleNodeShards );
@@ -268,7 +298,10 @@ private boolean isEnoughVersionBasedAllocationsFound(IndexMetaData indexMetaData
268
298
/**
269
299
* Split the list of node shard states into groups yes/no/throttle based on allocation deciders
270
300
*/
271
- private NodesToAllocate buildNodesToAllocate (ShardRouting shard , RoutingAllocation allocation , List <NodeGatewayStartedShards > nodeShardStates ) {
301
+ private NodesToAllocate buildNodesToAllocate (RoutingAllocation allocation ,
302
+ List <NodeGatewayStartedShards > nodeShardStates ,
303
+ ShardRouting shardRouting ,
304
+ boolean forceAllocate ) {
272
305
List <NodeGatewayStartedShards > yesNodeShards = new ArrayList <>();
273
306
List <NodeGatewayStartedShards > throttledNodeShards = new ArrayList <>();
274
307
List <NodeGatewayStartedShards > noNodeShards = new ArrayList <>();
@@ -278,7 +311,8 @@ private NodesToAllocate buildNodesToAllocate(ShardRouting shard, RoutingAllocati
278
311
continue ;
279
312
}
280
313
281
- Decision decision = allocation .deciders ().canAllocate (shard , node , allocation );
314
+ Decision decision = forceAllocate ? allocation .deciders ().canForceAllocatePrimary (shardRouting , node , allocation ) :
315
+ allocation .deciders ().canAllocate (shardRouting , node , allocation );
282
316
if (decision .type () == Decision .Type .THROTTLE ) {
283
317
throttledNodeShards .add (nodeShardState );
284
318
} else if (decision .type () == Decision .Type .NO ) {
0 commit comments