Skip to content

Commit d8f549c

Browse files
authored
[ML] Add total ML memory to ML info (#65214)
This change adds an extra piece of information, limits.total_ml_memory, to the ML info response. It reports the total amount of memory that ML is permitted to use for native processes across all ML nodes in the cluster. Some of this memory may already be in use: the value returned is the total ML memory, not the currently available ML memory. Backport of #65195.
1 parent 7df9873 commit d8f549c

File tree

4 files changed

+47
-11
lines changed

4 files changed

+47
-11
lines changed

docs/reference/ml/anomaly-detection/apis/get-ml-info.asciidoc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ privileges. See <<security-privileges>>, <<built-in-roles>> and
3030
This endpoint is designed to be used by a user interface that needs to fully
3131
understand machine learning configurations where some options are not specified,
3232
meaning that the defaults should be used. This endpoint may be used to find out
33-
what those defaults are.
33+
what those defaults are. It also provides information about the maximum size
34+
of {ml} jobs that could run in the current cluster configuration.
3435

3536
[[get-ml-info-example]]
3637
== {api-examples-title}
@@ -115,11 +116,13 @@ This is a possible response:
115116
"build_hash": "99a07c016d5a73"
116117
},
117118
"limits" : {
118-
"effective_max_model_memory_limit": "28961mb"
119+
"effective_max_model_memory_limit": "28961mb",
120+
"total_ml_memory": "86883mb"
119121
}
120122
}
121123
----
122124
// TESTRESPONSE[s/"upgrade_mode": false/"upgrade_mode": $body.upgrade_mode/]
123125
// TESTRESPONSE[s/"version": "7.0.0",/"version": "$body.native_code.version",/]
124126
// TESTRESPONSE[s/"build_hash": "99a07c016d5a73"/"build_hash": "$body.native_code.build_hash"/]
125127
// TESTRESPONSE[s/"effective_max_model_memory_limit": "28961mb"/"effective_max_model_memory_limit": "$body.limits.effective_max_model_memory_limit"/]
128+
// TESTRESPONSE[s/"total_ml_memory": "86883mb"/"total_ml_memory": "$body.limits.total_ml_memory"/]

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlInfoAction.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.cluster.service.ClusterService;
1616
import org.elasticsearch.common.inject.Inject;
1717
import org.elasticsearch.common.settings.ClusterSettings;
18+
import org.elasticsearch.common.unit.ByteSizeUnit;
1819
import org.elasticsearch.common.unit.ByteSizeValue;
1920
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
2021
import org.elasticsearch.env.Environment;
@@ -128,6 +129,23 @@ private Map<String, Object> datafeedsDefaults() {
128129
return anomalyDetectorsDefaults;
129130
}
130131

132+
/**
 * Calculates the total amount of memory that ML is permitted to use across the supplied nodes.
 * <p>
 * For every node, {@link NativeMemoryCalculator#allowedBytesForMl} is consulted; nodes for which
 * it returns an empty result contribute nothing. The per-node values are summed and the sum is
 * rounded down to a whole number of megabytes.
 *
 * @param clusterSettings the cluster settings passed through to {@code NativeMemoryCalculator}
 * @param nodes           the nodes to sum over
 * @return the summed ML memory allowance, in whole megabytes; zero megabytes if no node
 *         reports an allowance. This is a total, not the amount currently free.
 */
static ByteSizeValue calculateTotalMlMemory(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
133+
134+
long totalMlMemory = 0;
135+
136+
for (DiscoveryNode node : nodes) {
137+
OptionalLong limit = NativeMemoryCalculator.allowedBytesForMl(node, clusterSettings);
138+
// No allowance for this node - presumably it is not an ML node or its machine memory is
// unknown (TODO confirm against NativeMemoryCalculator); it adds nothing to the total
if (limit.isPresent() == false) {
139+
continue;
140+
}
141+
totalMlMemory += limit.getAsLong();
142+
}
143+
144+
// Round down to a whole number of megabytes, since we generally deal with model
145+
// memory limits in whole megabytes
146+
return ByteSizeValue.ofMb(ByteSizeUnit.BYTES.toMB(totalMlMemory));
147+
}
148+
131149
static ByteSizeValue calculateEffectiveMaxModelMemoryLimit(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
132150

133151
long maxMlMemory = -1;
@@ -148,7 +166,7 @@ static ByteSizeValue calculateEffectiveMaxModelMemoryLimit(ClusterSettings clust
148166

149167
maxMlMemory -= Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes());
150168
maxMlMemory -= MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes();
151-
return ByteSizeValue.ofMb(Math.max(0L, maxMlMemory) / 1024 / 1024);
169+
return ByteSizeValue.ofMb(ByteSizeUnit.BYTES.toMB(Math.max(0L, maxMlMemory)));
152170
}
153171

154172
private Map<String, Object> limits() {
@@ -166,6 +184,8 @@ private Map<String, Object> limits() {
166184
if (effectiveMaxModelMemoryLimit != null) {
167185
limits.put("effective_max_model_memory_limit", effectiveMaxModelMemoryLimit.getStringRep());
168186
}
187+
limits.put("total_ml_memory",
188+
calculateTotalMlMemory(clusterService.getClusterSettings(), clusterService.state().getNodes()).getStringRep());
169189
return limits;
170190
}
171191
}

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportMlInfoActionTests.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import static org.elasticsearch.xpack.ml.MachineLearning.MAX_MACHINE_MEMORY_PERCENT;
2626
import static org.elasticsearch.xpack.ml.MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT;
27+
import static org.hamcrest.Matchers.is;
2728
import static org.hamcrest.Matchers.lessThanOrEqualTo;
2829
import static org.hamcrest.Matchers.notNullValue;
2930
import static org.hamcrest.Matchers.nullValue;
@@ -36,7 +37,8 @@ public void testCalculateEffectiveMaxModelMemoryLimit() {
3637
ClusterSettings clusterSettings = new ClusterSettings(
3738
Settings.builder().put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
3839
Sets.newHashSet(MAX_MACHINE_MEMORY_PERCENT, USE_AUTO_MACHINE_MEMORY_PERCENT));
39-
long highestMlMachineMemory = -1;
40+
long highestMlMachineMemoryBytes = -1;
41+
long totalMlMemoryBytes = 0;
4042

4143
DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
4244
for (int i = randomIntBetween(1, 10); i > 0; --i) {
@@ -49,7 +51,8 @@ public void testCalculateEffectiveMaxModelMemoryLimit() {
4951
} else {
5052
// ML node
5153
long machineMemory = randomLongBetween(2000000000L, 100000000000L);
52-
highestMlMachineMemory = Math.max(machineMemory, highestMlMachineMemory);
54+
highestMlMachineMemoryBytes = Math.max(machineMemory, highestMlMachineMemoryBytes);
55+
totalMlMemoryBytes += machineMemory * mlMemoryPercent / 100;
5356
builder.add(new DiscoveryNode(nodeName, nodeId, ta,
5457
Collections.singletonMap(MachineLearning.MACHINE_MEMORY_NODE_ATTR, String.valueOf(machineMemory)),
5558
Collections.emptySet(), Version.CURRENT));
@@ -59,14 +62,19 @@ public void testCalculateEffectiveMaxModelMemoryLimit() {
5962

6063
ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
6164

62-
if (highestMlMachineMemory < 0) {
65+
if (highestMlMachineMemoryBytes < 0) {
6366
assertThat(effectiveMaxModelMemoryLimit, nullValue());
6467
} else {
6568
assertThat(effectiveMaxModelMemoryLimit, notNullValue());
6669
assertThat(effectiveMaxModelMemoryLimit.getBytes()
6770
+ Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
6871
+ MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
69-
lessThanOrEqualTo(highestMlMachineMemory * mlMemoryPercent / 100));
72+
lessThanOrEqualTo(highestMlMachineMemoryBytes * mlMemoryPercent / 100));
7073
}
74+
75+
ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
76+
77+
assertThat(totalMlMemory, notNullValue());
78+
assertThat(totalMlMemory, is(ByteSizeValue.ofMb(totalMlMemoryBytes / (1024 * 1024))));
7179
}
7280
}

x-pack/plugin/src/test/resources/rest-api-spec/test/ml/ml_info.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ teardown:
1717
- match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
1818
- match: { defaults.datafeeds.scroll_size: 1000 }
1919
- is_false: limits.max_model_memory_limit
20-
# We cannot assert an exact value for the next one as it will vary depending on the test machine
20+
# We cannot assert an exact value for the next two as they will vary depending on the test machine
2121
- match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
22+
- match: { limits.total_ml_memory: "/\\d+mb/" }
2223
- match: { upgrade_mode: false }
2324

2425
- do:
@@ -36,8 +37,9 @@ teardown:
3637
- match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
3738
- match: { defaults.datafeeds.scroll_size: 1000 }
3839
- match: { limits.max_model_memory_limit: "512mb" }
39-
# We cannot assert an exact value for the next one as it will vary depending on the test machine
40+
# We cannot assert an exact value for the next two as they will vary depending on the test machine
4041
- match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
42+
- match: { limits.total_ml_memory: "/\\d+mb/" }
4143
- match: { upgrade_mode: false }
4244

4345
- do:
@@ -55,8 +57,9 @@ teardown:
5557
- match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
5658
- match: { defaults.datafeeds.scroll_size: 1000 }
5759
- match: { limits.max_model_memory_limit: "6gb" }
58-
# We cannot assert an exact value for the next one as it will vary depending on the test machine
60+
# We cannot assert an exact value for the next two as they will vary depending on the test machine
5961
- match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
62+
- match: { limits.total_ml_memory: "/\\d+mb/" }
6063
- match: { upgrade_mode: false }
6164

6265
- do:
@@ -74,8 +77,9 @@ teardown:
7477
- match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
7578
- match: { defaults.datafeeds.scroll_size: 1000 }
7679
- match: { limits.max_model_memory_limit: "6gb" }
77-
# We cannot assert an exact value for the next one as it will vary depending on the test machine
80+
# We cannot assert an exact value for the next two as they will vary depending on the test machine
7881
- match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
82+
- match: { limits.total_ml_memory: "/\\d+mb/" }
7983
- match: { upgrade_mode: false }
8084

8185
- do:
@@ -95,4 +99,5 @@ teardown:
9599
- match: { limits.max_model_memory_limit: "1mb" }
96100
# This time we can assert an exact value for the next one because the hard limit is so low
97101
- match: { limits.effective_max_model_memory_limit: "1mb" }
102+
- match: { limits.total_ml_memory: "/\\d+mb/" }
98103
- match: { upgrade_mode: false }

0 commit comments

Comments
 (0)