diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 5743b3ff44..55934f2437 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -43,12 +43,14 @@ * Reduce CPU scheduling priority of native analysis processes to favor the ES JVM when CPU is constrained. (See {ml-pull}1109[#1109].) * Take `training_percent` into account when estimating memory usage for classification and regression. - (See {ml-pull}1111[1111].) + (See {ml-pull}1111[#1111].) * Support maximize minimum recall when assigning class labels for multiclass classification. (See {ml-pull}1113[#1113].) * Improve robustness of anomaly detection to bad input data. (See {ml-pull}1114[#1114].) * Adds new `num_matches` and `preferred_to_categories` fields to category output. - (See {ml-pull}1062[#1062]) + (See {ml-pull}1062[#1062].) +* Switched data frame analytics model memory estimates from kilobytes to megabytes. + (See {ml-pull}1126[#1126], issue: {issue}54506[#54506].) == {es} version 7.7.0 diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index b81ff58e7f..c1ba307ca7 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -32,6 +32,8 @@ std::size_t maximumNumberPartitions(const CDataFrameAnalysisSpecification& spec) // user to allocate more resources for the job in this case. return static_cast(std::sqrt(static_cast(spec.numberRows())) + 0.5); } + +const std::size_t BYTES_IN_MB{1024 * 1024}; } CDataFrameAnalysisRunner::CDataFrameAnalysisRunner(const CDataFrameAnalysisSpecification& spec) @@ -54,11 +56,11 @@ void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJ this->estimateMemoryUsage(numberRows, numberRows, numberColumns)}; std::size_t expectedMemoryWithDisk{this->estimateMemoryUsage( numberRows, numberRows / maxNumberPartitions, numberColumns)}; - auto roundUpToNearestKilobyte = [](std::size_t bytes) { - return std::to_string((bytes + 1024 - 1) / 1024) + "kB"; + auto roundUpToNearestMb = [](std::size_t bytes) { + return std::to_string((bytes + BYTES_IN_MB - 1) / BYTES_IN_MB) + "mb"; }; - writer.write(roundUpToNearestKilobyte(expectedMemoryWithoutDisk), - roundUpToNearestKilobyte(expectedMemoryWithDisk)); + writer.write(roundUpToNearestMb(expectedMemoryWithoutDisk), + roundUpToNearestMb(expectedMemoryWithDisk)); } void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() { @@ -95,7 +97,7 @@ void CDataFrameAnalysisRunner::computeAndSaveExecutionStrategy() { if (memoryUsage > memoryLimit) { auto roundMb = [](std::size_t memory) { - return 0.01 * static_cast((100 * memory) / (1024 * 1024)); + return 0.01 * static_cast((100 * memory) / BYTES_IN_MB); }; // Report rounded up to the nearest MB. diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 1e221a1d58..7cfb5ababa 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -189,19 +189,19 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor0Rows) { } BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1Row) { - testEstimateMemoryUsage(1, "4kB", "4kB", 0); + testEstimateMemoryUsage(1, "1mb", "1mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10Rows) { - testEstimateMemoryUsage(10, "12kB", "10kB", 0); +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) { + testEstimateMemoryUsage(10000, "5mb", "2mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100Rows) { - testEstimateMemoryUsage(100, "57kB", "35kB", 0); +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) { + testEstimateMemoryUsage(100000, "40mb", "9mb", 0); } -BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1000Rows) { - testEstimateMemoryUsage(1000, "403kB", "142kB", 0); +BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) { + testEstimateMemoryUsage(10000000, "4511mb", "88mb", 0); } BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc b/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc index 1ddae95a66..006b3a7a6e 100644 --- a/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc +++ b/lib/api/unittest/CMemoryUsageEstimationResultJsonWriterTest.cc @@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE(testWrite) { { core::CJsonOutputStreamWrapper wrappedOutStream(sstream); CMemoryUsageEstimationResultJsonWriter writer(wrappedOutStream); - writer.write("16kB", "8kB"); + writer.write("16mb", "8mb"); } rapidjson::Document arrayDoc; @@ -42,10 +42,10 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_TEST_REQUIRE(object.IsObject()); BOOST_TEST_REQUIRE(object.HasMember("expected_memory_without_disk")); - BOOST_REQUIRE_EQUAL(std::string("16kB"), + BOOST_REQUIRE_EQUAL(std::string("16mb"), std::string(object["expected_memory_without_disk"].GetString())); BOOST_TEST_REQUIRE(object.HasMember("expected_memory_with_disk")); - BOOST_REQUIRE_EQUAL(std::string("8kB"), + BOOST_REQUIRE_EQUAL(std::string("8mb"), std::string(object["expected_memory_with_disk"].GetString())); }