19
19
20
20
package org .elasticsearch .cluster .routing .allocation .decider ;
21
21
22
- import java .util .Set ;
23
-
24
22
import com .carrotsearch .hppc .cursors .ObjectCursor ;
25
23
import org .apache .logging .log4j .LogManager ;
26
24
import org .apache .logging .log4j .Logger ;
42
40
import org .elasticsearch .index .Index ;
43
41
import org .elasticsearch .index .shard .ShardId ;
44
42
43
+ import java .util .Set ;
44
+
45
45
import static org .elasticsearch .cluster .routing .allocation .DiskThresholdSettings .CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING ;
46
46
import static org .elasticsearch .cluster .routing .allocation .DiskThresholdSettings .CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING ;
47
47
@@ -138,12 +138,24 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
138
138
139
139
// subtractLeavingShards is passed as false here, because they still use disk space, and therefore should we should be extra careful
140
140
// and take the size into account
141
- DiskUsage usage = getDiskUsage (node , allocation , usages , false );
141
+ final DiskUsageWithRelocations usage = getDiskUsage (node , allocation , usages , false );
142
142
// First, check that the node currently over the low watermark
143
143
double freeDiskPercentage = usage .getFreeDiskAsPercentage ();
144
144
// Cache the used disk percentage for displaying disk percentages consistent with documentation
145
145
double usedDiskPercentage = usage .getUsedDiskAsPercentage ();
146
146
long freeBytes = usage .getFreeBytes ();
147
+ if (freeBytes < 0L ) {
148
+ final long sizeOfRelocatingShards = sizeOfRelocatingShards (node , allocation , false , usage .getPath ());
149
+ logger .debug ("fewer free bytes remaining than the size of all incoming shards: " +
150
+ "usage {} on node {} including {} bytes of relocations, preventing allocation" ,
151
+ usage , node .nodeId (), sizeOfRelocatingShards );
152
+
153
+ return allocation .decision (Decision .NO , NAME ,
154
+ "the node has fewer free bytes remaining than the total size of all incoming shards: " +
155
+ "free space [%sB], relocating shards [%sB]" ,
156
+ freeBytes + sizeOfRelocatingShards , sizeOfRelocatingShards );
157
+ }
158
+
147
159
ByteSizeValue freeBytesValue = new ByteSizeValue (freeBytes );
148
160
if (logger .isTraceEnabled ()) {
149
161
logger .trace ("node [{}] has {}% used disk" , node .nodeId (), usedDiskPercentage );
@@ -240,6 +252,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
240
252
241
253
// Secondly, check that allocating the shard to this node doesn't put it above the high watermark
242
254
final long shardSize = getExpectedShardSize (shardRouting , allocation , 0 );
255
+ assert shardSize >= 0 : shardSize ;
243
256
double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned (usage , shardSize );
244
257
long freeBytesAfterShard = freeBytes - shardSize ;
245
258
if (freeBytesAfterShard < diskThresholdSettings .getFreeBytesThresholdHigh ().getBytes ()) {
@@ -266,6 +279,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
266
279
diskThresholdSettings .getHighWatermarkRaw (), usedDiskThresholdHigh , freeSpaceAfterShard );
267
280
}
268
281
282
+ assert freeBytesAfterShard >= 0 : freeBytesAfterShard ;
269
283
return allocation .decision (Decision .YES , NAME ,
270
284
"enough disk for shard on node, free: [%s], shard size: [%s], free after allocating shard: [%s]" ,
271
285
freeBytesValue ,
@@ -287,7 +301,7 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl
287
301
288
302
// subtractLeavingShards is passed as true here, since this is only for shards remaining, we will *eventually* have enough disk
289
303
// since shards are moving away. No new shards will be incoming since in canAllocate we pass false for this check.
290
- final DiskUsage usage = getDiskUsage (node , allocation , usages , true );
304
+ final DiskUsageWithRelocations usage = getDiskUsage (node , allocation , usages , true );
291
305
final String dataPath = clusterInfo .getDataPath (shardRouting );
292
306
// If this node is already above the high threshold, the shard cannot remain (get it off!)
293
307
final double freeDiskPercentage = usage .getFreeDiskAsPercentage ();
@@ -299,6 +313,16 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl
299
313
return allocation .decision (Decision .YES , NAME ,
300
314
"this shard is not allocated on the most utilized disk and can remain" );
301
315
}
316
+ if (freeBytes < 0L ) {
317
+ final long sizeOfRelocatingShards = sizeOfRelocatingShards (node , allocation , true , usage .getPath ());
318
+ logger .debug ("fewer free bytes remaining than the size of all incoming shards: " +
319
+ "usage {} on node {} including {} bytes of relocations, shard cannot remain" ,
320
+ usage , node .nodeId (), sizeOfRelocatingShards );
321
+ return allocation .decision (Decision .NO , NAME ,
322
+ "the shard cannot remain on this node because the node has fewer free bytes remaining than the total size of all " +
323
+ "incoming shards: free space [%s], relocating shards [%s]" ,
324
+ freeBytes + sizeOfRelocatingShards , sizeOfRelocatingShards );
325
+ }
302
326
if (freeBytes < diskThresholdSettings .getFreeBytesThresholdHigh ().getBytes ()) {
303
327
if (logger .isDebugEnabled ()) {
304
328
logger .debug ("less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain" ,
@@ -328,8 +352,8 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl
328
352
"there is enough disk on this node for the shard to remain, free: [%s]" , new ByteSizeValue (freeBytes ));
329
353
}
330
354
331
- private DiskUsage getDiskUsage (RoutingNode node , RoutingAllocation allocation ,
332
- ImmutableOpenMap <String , DiskUsage > usages , boolean subtractLeavingShards ) {
355
+ private DiskUsageWithRelocations getDiskUsage (RoutingNode node , RoutingAllocation allocation ,
356
+ ImmutableOpenMap <String , DiskUsage > usages , boolean subtractLeavingShards ) {
333
357
DiskUsage usage = usages .get (node .nodeId ());
334
358
if (usage == null ) {
335
359
// If there is no usage, and we have other nodes in the cluster,
@@ -340,18 +364,14 @@ private DiskUsage getDiskUsage(RoutingNode node, RoutingAllocation allocation,
340
364
node .nodeId (), usage .getTotalBytes (), usage .getFreeBytes (), usage .getFreeDiskAsPercentage ());
341
365
}
342
366
}
343
-
344
- if (diskThresholdSettings .includeRelocations ()) {
345
- long relocatingShardsSize = sizeOfRelocatingShards (node , allocation , subtractLeavingShards , usage .getPath ());
346
- DiskUsage usageIncludingRelocations = new DiskUsage (node .nodeId (), node .node ().getName (), usage .getPath (),
347
- usage .getTotalBytes (), usage .getFreeBytes () - relocatingShardsSize );
348
- if (logger .isTraceEnabled ()) {
349
- logger .trace ("usage without relocations: {}" , usage );
350
- logger .trace ("usage with relocations: [{} bytes] {}" , relocatingShardsSize , usageIncludingRelocations );
351
- }
352
- usage = usageIncludingRelocations ;
367
+ final DiskUsageWithRelocations diskUsageWithRelocations = new DiskUsageWithRelocations (usage ,
368
+ diskThresholdSettings .includeRelocations ()
369
+ ? sizeOfRelocatingShards (node , allocation , subtractLeavingShards , usage .getPath ()) : 0 );
370
+ if (logger .isTraceEnabled ()) {
371
+ logger .trace ("getDiskUsage(subtractLeavingShards={}) returning {}" , subtractLeavingShards , diskUsageWithRelocations );
353
372
}
354
- return usage ;
373
+
374
+ return diskUsageWithRelocations ;
355
375
}
356
376
357
377
/**
@@ -381,7 +401,7 @@ DiskUsage averageUsage(RoutingNode node, ImmutableOpenMap<String, DiskUsage> usa
381
401
* @param shardSize Size in bytes of the shard
382
402
* @return Percentage of free space after the shard is assigned to the node
383
403
*/
384
- double freeDiskPercentageAfterShardAssigned (DiskUsage usage , Long shardSize ) {
404
+ double freeDiskPercentageAfterShardAssigned (DiskUsageWithRelocations usage , Long shardSize ) {
385
405
shardSize = (shardSize == null ) ? 0 : shardSize ;
386
406
DiskUsage newUsage = new DiskUsage (usage .getNodeId (), usage .getNodeName (), usage .getPath (),
387
407
usage .getTotalBytes (), usage .getFreeBytes () - shardSize );
@@ -450,4 +470,59 @@ public static long getExpectedShardSize(ShardRouting shard, RoutingAllocation al
450
470
}
451
471
452
472
}
473
+
474
+ static class DiskUsageWithRelocations {
475
+
476
+ private final DiskUsage diskUsage ;
477
+ private final long relocatingShardSize ;
478
+
479
+ DiskUsageWithRelocations (DiskUsage diskUsage , long relocatingShardSize ) {
480
+ this .diskUsage = diskUsage ;
481
+ this .relocatingShardSize = relocatingShardSize ;
482
+ }
483
+
484
+ @ Override
485
+ public String toString () {
486
+ return "DiskUsageWithRelocations{" +
487
+ "diskUsage=" + diskUsage +
488
+ ", relocatingShardSize=" + relocatingShardSize +
489
+ '}' ;
490
+ }
491
+
492
+ double getFreeDiskAsPercentage () {
493
+ if (getTotalBytes () == 0L ) {
494
+ return 100.0 ;
495
+ }
496
+ return 100.0 * ((double )getFreeBytes () / getTotalBytes ());
497
+ }
498
+
499
+ double getUsedDiskAsPercentage () {
500
+ return 100.0 - getFreeDiskAsPercentage ();
501
+ }
502
+
503
+ long getFreeBytes () {
504
+ try {
505
+ return Math .subtractExact (diskUsage .getFreeBytes (), relocatingShardSize );
506
+ } catch (ArithmeticException e ) {
507
+ return Long .MAX_VALUE ;
508
+ }
509
+ }
510
+
511
+ String getPath () {
512
+ return diskUsage .getPath ();
513
+ }
514
+
515
+ String getNodeId () {
516
+ return diskUsage .getNodeId ();
517
+ }
518
+
519
+ String getNodeName () {
520
+ return diskUsage .getNodeName ();
521
+ }
522
+
523
+ long getTotalBytes () {
524
+ return diskUsage .getTotalBytes ();
525
+ }
526
+ }
527
+
453
528
}
0 commit comments