Skip to content

Commit 82f5b44

Browse files
committed
Return 0 for negative "free" and "total" memory reported by the OS (#42725)
* Return 0 for negative "free" and "total" memory reported by the OS We've had a situation where the MX bean reported negative values for the free memory of the OS, in those rare cases we want to return a value of 0 rather than blowing up later down the pipeline. In the event that there is a serialization or creation error with regard to memory use, this adds asserts so the failure will occur as soon as possible and give us a better location for investigation. Resolves #42157 * Fix test passing in invalid memory value * Fix another test passing in invalid memory value * Also change mem check in MachineLearning.machineMemoryFromStats * Add background documentation for why we prevent negative return values * Clarify comment a bit more
1 parent d37334d commit 82f5b44

File tree

5 files changed

+47
-11
lines changed

5 files changed

+47
-11
lines changed

server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

+38-6
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,24 @@
4242
import java.util.regex.Pattern;
4343
import java.util.stream.Collectors;
4444

45+
/**
46+
* The {@link OsProbe} class retrieves information about the physical and swap size of the machine
47+
* memory, as well as the system load average and cpu load.
48+
*
49+
* In some exceptional cases, it's possible the underlying native method used by
50+
* {@link #getFreePhysicalMemorySize()} and {@link #getTotalPhysicalMemorySize()} can return a
51+
* negative value. Because of this, we prevent those methods from returning negative values,
52+
* returning 0 instead.
53+
*
54+
* The OS can report a negative number in a number of cases:
55+
* - Non-supported OSes (HP-UX, or AIX)
56+
* - A failure of macOS to initialize host statistics
57+
* - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
58+
* - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
59+
* - An error case retrieving these values from a linux kernel
60+
* - A non-standard libc implementation not implementing the required values
61+
* For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725
62+
*/
4563
public class OsProbe {
4664

4765
private static final OperatingSystemMXBean osMxBean = ManagementFactory.getOperatingSystemMXBean();
@@ -67,12 +85,19 @@ public class OsProbe {
6785
*/
6886
public long getFreePhysicalMemorySize() {
6987
if (getFreePhysicalMemorySize == null) {
70-
return -1;
88+
logger.warn("getFreePhysicalMemorySize is not available");
89+
return 0;
7190
}
7291
try {
73-
return (long) getFreePhysicalMemorySize.invoke(osMxBean);
92+
final long freeMem = (long) getFreePhysicalMemorySize.invoke(osMxBean);
93+
if (freeMem < 0) {
94+
logger.warn("OS reported a negative free memory value [{}]", freeMem);
95+
return 0;
96+
}
97+
return freeMem;
7498
} catch (Exception e) {
75-
return -1;
99+
logger.warn("exception retrieving free physical memory", e);
100+
return 0;
76101
}
77102
}
78103

@@ -81,12 +106,19 @@ public long getFreePhysicalMemorySize() {
81106
*/
82107
public long getTotalPhysicalMemorySize() {
83108
if (getTotalPhysicalMemorySize == null) {
84-
return -1;
109+
logger.warn("getTotalPhysicalMemorySize is not available");
110+
return 0;
85111
}
86112
try {
87-
return (long) getTotalPhysicalMemorySize.invoke(osMxBean);
113+
final long totalMem = (long) getTotalPhysicalMemorySize.invoke(osMxBean);
114+
if (totalMem < 0) {
115+
logger.warn("OS reported a negative total memory value [{}]", totalMem);
116+
return 0;
117+
}
118+
return totalMem;
88119
} catch (Exception e) {
89-
return -1;
120+
logger.warn("exception retrieving total physical memory", e);
121+
return 0;
90122
}
91123
}
92124

server/src/main/java/org/elasticsearch/monitor/os/OsStats.java

+4
Original file line numberDiff line numberDiff line change
@@ -229,13 +229,17 @@ public static class Mem implements Writeable, ToXContentFragment {
229229
private final long free;
230230

231231
public Mem(long total, long free) {
232+
assert total >= 0 : "expected total memory to be positive, got: " + total;
233+
assert free >= 0 : "expected free memory to be positive, got: " + total;
232234
this.total = total;
233235
this.free = free;
234236
}
235237

236238
public Mem(StreamInput in) throws IOException {
237239
this.total = in.readLong();
240+
assert total >= 0 : "expected total memory to be positive, got: " + total;
238241
this.free = in.readLong();
242+
assert free >= 0 : "expected free memory to be positive, got: " + total;
239243
}
240244

241245
@Override

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -805,8 +805,8 @@ static long machineMemoryFromStats(OsStats stats) {
805805
if (containerLimitStr != null) {
806806
BigInteger containerLimit = new BigInteger(containerLimitStr);
807807
if ((containerLimit.compareTo(BigInteger.valueOf(mem)) < 0 && containerLimit.compareTo(BigInteger.ZERO) > 0)
808-
// mem < 0 means the value couldn't be obtained for some reason
809-
|| (mem < 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
808+
// mem <= 0 means the value couldn't be obtained for some reason
809+
|| (mem <= 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
810810
mem = containerLimit.longValue();
811811
}
812812
}

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MachineLearningTests.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,8 @@ public void testNoAttributes_givenClash() {
9797

9898
public void testMachineMemory_givenStatsFailure() throws IOException {
9999
OsStats stats = mock(OsStats.class);
100-
when(stats.getMem()).thenReturn(new OsStats.Mem(-1, -1));
101-
assertEquals(-1L, MachineLearning.machineMemoryFromStats(stats));
100+
when(stats.getMem()).thenReturn(new OsStats.Mem(0, 0));
101+
assertEquals(0L, MachineLearning.machineMemoryFromStats(stats));
102102
}
103103

104104
public void testMachineMemory_givenNoCgroup() throws IOException {

x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ private static NodeStats mockNodeStats() {
329329
final OsStats.Cgroup osCgroup = new OsStats.Cgroup("_cpu_acct_ctrl_group", ++iota, "_cpu_ctrl_group", ++iota, ++iota, osCpuStat,
330330
"_memory_ctrl_group", "2000000000", "1000000000");
331331

332-
final OsStats.Mem osMem = new OsStats.Mem(no, no);
332+
final OsStats.Mem osMem = new OsStats.Mem(0, 0);
333333
final OsStats.Swap osSwap = new OsStats.Swap(no, no);
334334
final OsStats os = new OsStats(no, osCpu, osMem, osSwap, osCgroup);
335335

0 commit comments

Comments
 (0)