From 59cd459b0725e5c1406e8bb07fb2c9f4b4f71a49 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 6 Apr 2020 18:29:15 +0100 Subject: [PATCH 1/3] [ML] Round up data frame analytics memory estimates to next MB Previously data frame analytics memory estimates were rounded to the nearest kilobyte, but this results in excessive precision for large analyses. This changes the estimates to always be reported in whole megabytes, rounded up from the low level estimate. Closes #1110 Closes elastic/elasticsearch#54506 --- lib/api/CDataFrameAnalysisRunner.cc | 12 +++++++----- lib/api/unittest/CDataFrameAnalysisRunnerTest.cc | 14 +++++++------- .../CMemoryUsageEstimationResultJsonWriterTest.cc | 6 +++--- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index b81ff58e7f..c1ba307ca7 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -32,6 +32,8 @@ std::size_t maximumNumberPartitions(const CDataFrameAnalysisSpecification& spec) // user to allocate more resources for the job in this case. return static_cast(std::sqrt(static_cast(spec.numberRows())) + 0.5); } + +const std::size_t BYTES_IN_MB{1024 * 1024}; } CDataFrameAnalysisRunner::CDataFrameAnalysisRunner(const CDataFrameAnalysisSpecification& spec) @@ -54,11 +56,11 @@ void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJ this->estimateMemoryUsage(numberRows, numberRows, numberColumns)}; std::size_t expectedMemoryWithDisk{this->estimateMemoryUsage( numberRows, numberRows / maxNumberPartitions, numberColumns)}; - auto roundUpToNearestKilobyte = [](std::size_t bytes) { - return std::to_string((bytes + 1024 - 1) / 1024) + "kB"; + auto roundUpToNearestMb = [](std::size_t bytes) { + return std::to_string((bytes + BYTES_IN_MB - 1) / BYTES_IN_MB) + "mb"; }; - writer.write(roundUpToNearestKilobyte(expectedMemoryWithoutDisk), - roundUpToNearestKilobyte(expectedMemoryWithDisk)); + writer.write(roundUpToNearestMb(expectedMemoryWithoutDisk), + roundUpToNearestMb(expectedMemoryWithDisk)); } void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() { @@ -95,7 +97,7 @@ void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() { if (memoryUsage > memoryLimit) { auto roundMb = [](std::size_t memory) { - return 0.01 * static_cast((100 * memory) / (1024 * 1024)); + return 0.01 * static_cast((100 * memory) / BYTES_IN_MB); }; // Report rounded up to the nearest MB. diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 1e221a1d58..561671eaa7 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -189,19 +189,19 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor0Rows) { } BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1Row) { - testEstimateMemoryUsage(1, "4kB", "4kB", 0); + testEstimateMemoryUsage(1, "1mb", "1mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10Rows) { - testEstimateMemoryUsage(10, "12kB", "10kB", 0); +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) { + testEstimateMemoryUsage(10000, "5mb", "2mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100Rows) { - testEstimateMemoryUsage(100, "57kB", "35kB", 0); +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) { + testEstimateMemoryUsage(100000, "40mb", "9mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000Rows) { - testEstimateMemoryUsage(1000, "403kB", "142kB", 0); +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000000Rows) { + testEstimateMemoryUsage(10000000, "4511mb", "88mb", 0); } BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc b/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc index 1ddae95a66..006b3a7a6e 100644 --- a/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc +++ b/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc @@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE(testWrite) { { core::CJsonOutputStreamWrapper wrappedOutStream(sstream); CMemoryUsageEstimationResultJsonWriter writer(wrappedOutStream); - writer.write("16kB", "8kB"); + writer.write("16mb", "8mb"); } rapidjson::Document arrayDoc; @@ -42,10 +42,10 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_TEST_REQUIRE(object.IsObject()); BOOST_TEST_REQUIRE(object.HasMember("expected_memory_without_disk")); - BOOST_REQUIRE_EQUAL(std::string("16kB"), + BOOST_REQUIRE_EQUAL(std::string("16mb"), std::string(object["expected_memory_without_disk"].GetString())); BOOST_TEST_REQUIRE(object.HasMember("expected_memory_with_disk")); - BOOST_REQUIRE_EQUAL(std::string("8kB"), + BOOST_REQUIRE_EQUAL(std::string("8mb"), std::string(object["expected_memory_with_disk"].GetString())); } From 08f02cc672e06df8014f7b092ed338d9bd95a4ff Mon Sep 17 00:00:00 2001 From: David Roberts Date: Tue, 7 Apr 2020 11:26:27 +0100 Subject: [PATCH 2/3] Add changelog entry --- docs/CHANGELOG.asciidoc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 743c4bdcf5..e4910160ce 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -43,10 +43,12 @@ * Reduce CPU scheduling priority of native analysis processes to favor the ES JVM when CPU is constrained. (See {ml-pull}1109[#1109].) * Take `training_percent` into account when estimating memory usage for classification and regression. - (See {ml-pull}1111[1111].) + (See {ml-pull}1111[#1111].) * Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].) * Adds new `num_matches` and `preferred_to_categories` fields to category output. - (See {ml-pull}1062[#1062]) + (See {ml-pull}1062[#1062].) +* Switched data frame analytics model memory estimates from kilobytes to megabytes. + (See {ml-pull}1126[#1126], issue: {issue}54506[#54506].) == {es} version 7.7.0 From 939ff41714821eab53ce7d26cee9a40af006cc07 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Tue, 7 Apr 2020 16:24:21 +0100 Subject: [PATCH 3/3] Fix test name --- lib/api/unittest/CDataFrameAnalysisRunnerTest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 561671eaa7..7cfb5ababa 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -200,7 +200,7 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) { testEstimateMemoryUsage(100000, "40mb", "9mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000000Rows) { +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) { testEstimateMemoryUsage(10000000, "4511mb", "88mb", 0); }