Skip to content

Commit 5cc1d26

Browse files
authored
[ML] Round up data frame analytics memory estimates to next MB (elastic#1126)
Previously, data frame analytics memory estimates were rounded up to the next kilobyte, which gave excessive precision for large analyses. This change makes the estimates always be reported in whole megabytes, rounded up from the low-level estimate. Closes elastic#1110. Closes elastic/elasticsearch#54506.
1 parent 40595de commit 5cc1d26

4 files changed

+21
-17
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,14 @@
4343
* Reduce CPU scheduling priority of native analysis processes to favor the ES JVM
4444
when CPU is constrained. (See {ml-pull}1109[#1109].)
4545
* Take `training_percent` into account when estimating memory usage for classification and regression.
46-
(See {ml-pull}1111[1111].)
46+
(See {ml-pull}1111[#1111].)
4747
* Support maximize minimum recall when assigning class labels for multiclass classification.
4848
(See {ml-pull}1113[#1113].)
4949
* Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].)
5050
* Adds new `num_matches` and `preferred_to_categories` fields to category output.
51-
(See {ml-pull}1062[#1062])
51+
(See {ml-pull}1062[#1062].)
52+
* Report data frame analytics model memory estimates in whole megabytes, rounded up, instead of kilobytes.
53+
(See {ml-pull}1126[#1126], issue: {issue}54506[#54506].)
5254

5355
== {es} version 7.7.0
5456

lib/api/CDataFrameAnalysisRunner.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ std::size_t maximumNumberPartitions(const CDataFrameAnalysisSpecification& spec)
3232
// user to allocate more resources for the job in this case.
3333
return static_cast<std::size_t>(std::sqrt(static_cast<double>(spec.numberRows())) + 0.5);
3434
}
35+
36+
const std::size_t BYTES_IN_MB{1024 * 1024};
3537
}
3638

3739
CDataFrameAnalysisRunner::CDataFrameAnalysisRunner(const CDataFrameAnalysisSpecification& spec)
@@ -54,11 +56,11 @@ void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJ
5456
this->estimateMemoryUsage(numberRows, numberRows, numberColumns)};
5557
std::size_t expectedMemoryWithDisk{this->estimateMemoryUsage(
5658
numberRows, numberRows / maxNumberPartitions, numberColumns)};
57-
auto roundUpToNearestKilobyte = [](std::size_t bytes) {
58-
return std::to_string((bytes + 1024 - 1) / 1024) + "kB";
59+
auto roundUpToNearestMb = [](std::size_t bytes) {
60+
return std::to_string((bytes + BYTES_IN_MB - 1) / BYTES_IN_MB) + "mb";
5961
};
60-
writer.write(roundUpToNearestKilobyte(expectedMemoryWithoutDisk),
61-
roundUpToNearestKilobyte(expectedMemoryWithDisk));
62+
writer.write(roundUpToNearestMb(expectedMemoryWithoutDisk),
63+
roundUpToNearestMb(expectedMemoryWithDisk));
6264
}
6365

6466
void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() {
@@ -95,7 +97,7 @@ void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() {
9597

9698
if (memoryUsage > memoryLimit) {
9799
auto roundMb = [](std::size_t memory) {
98-
return 0.01 * static_cast<double>((100 * memory) / (1024 * 1024));
100+
return 0.01 * static_cast<double>((100 * memory) / BYTES_IN_MB);
99101
};
100102

101103
// Report rounded up to the nearest MB.

lib/api/unittest/CDataFrameAnalysisRunnerTest.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -189,19 +189,19 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor0Rows) {
189189
}
190190

191191
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1Row) {
192-
testEstimateMemoryUsage(1, "4kB", "4kB", 0);
192+
testEstimateMemoryUsage(1, "1mb", "1mb", 0);
193193
}
194194

195-
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10Rows) {
196-
testEstimateMemoryUsage(10, "12kB", "10kB", 0);
195+
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) {
196+
testEstimateMemoryUsage(10000, "5mb", "2mb", 0);
197197
}
198198

199-
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100Rows) {
200-
testEstimateMemoryUsage(100, "57kB", "35kB", 0);
199+
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) {
200+
testEstimateMemoryUsage(100000, "40mb", "9mb", 0);
201201
}
202202

203-
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000Rows) {
204-
testEstimateMemoryUsage(1000, "403kB", "142kB", 0);
203+
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) {
204+
testEstimateMemoryUsage(10000000, "4511mb", "88mb", 0);
205205
}
206206

207207
BOOST_AUTO_TEST_SUITE_END()

lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE(testWrite) {
2929
{
3030
core::CJsonOutputStreamWrapper wrappedOutStream(sstream);
3131
CMemoryUsageEstimationResultJsonWriter writer(wrappedOutStream);
32-
writer.write("16kB", "8kB");
32+
writer.write("16mb", "8mb");
3333
}
3434

3535
rapidjson::Document arrayDoc;
@@ -42,10 +42,10 @@ BOOST_AUTO_TEST_CASE(testWrite) {
4242
BOOST_TEST_REQUIRE(object.IsObject());
4343

4444
BOOST_TEST_REQUIRE(object.HasMember("expected_memory_without_disk"));
45-
BOOST_REQUIRE_EQUAL(std::string("16kB"),
45+
BOOST_REQUIRE_EQUAL(std::string("16mb"),
4646
std::string(object["expected_memory_without_disk"].GetString()));
4747
BOOST_TEST_REQUIRE(object.HasMember("expected_memory_with_disk"));
48-
BOOST_REQUIRE_EQUAL(std::string("8kB"),
48+
BOOST_REQUIRE_EQUAL(std::string("8mb"),
4949
std::string(object["expected_memory_with_disk"].GetString()));
5050
}
5151

0 commit comments

Comments
 (0)