Skip to content

Commit 7e1b5b0

Browse files
committed
[ML] Improve hard_limit audit message (elastic#486)
Add the current model memory limit and the number of bytes in excess of that at the point of the last allocation failure to the model size stats. These will be used to construct a (hopefully) more informative hard_limit audit message. The reported memory usage is also scaled to take into account the byte limit margin, which is in play in the initial period of a jobs' lifetime and is used to scale down the high memory limit. This should give a more accurate representation of how close the memory usage is to the high limit. relates elastic/elasticsearch#42086 closes elastic/elasticsearch#38034
1 parent 610c114 commit 7e1b5b0

File tree

6 files changed

+78
-35
lines changed

6 files changed

+78
-35
lines changed

docs/CHANGELOG.asciidoc

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)
4444
4545
* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)
4646
47+
* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)
48+
4749
=== Bug Fixes
4850
4951
* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]

include/model/CResourceMonitor.h

+11-4
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
3434
public:
3535
struct MODEL_EXPORT SResults {
3636
std::size_t s_Usage;
37+
std::size_t s_AdjustedUsage;
3738
std::size_t s_ByFields;
3839
std::size_t s_PartitionFields;
3940
std::size_t s_OverFields;
4041
std::size_t s_AllocationFailures;
4142
model_t::EMemoryStatus s_MemoryStatus;
4243
core_t::TTime s_BucketStartTime;
44+
std::size_t s_BytesExceeded;
45+
std::size_t s_BytesMemoryLimit;
4346
};
4447

4548
public:
@@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
6568
//! taking up too much memory and further allocations should be banned
6669
bool areAllocationsAllowed() const;
6770

68-
//! Query the resource monitor to found out if it's Ok to
69-
//! create structures of a certain size
70-
bool areAllocationsAllowed(std::size_t size) const;
71-
7271
//! Return the amount of remaining space for allocations
7372
std::size_t allocationLimit() const;
7473

@@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
164163
//! Returns the sum of used memory plus any extra memory
165164
std::size_t totalMemory() const;
166165

166+
//! Adjusts the amount of memory reported to take into
167+
//! account the current value of the byte limit margin and the effects
168+
//! of background persistence.
169+
std::size_t adjustedUsage(std::size_t usage) const;
170+
167171
private:
168172
//! The registered collection of components
169173
TDetectorPtrSizeUMap m_Detectors;
@@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
226230
//! Don't do any sort of memory checking if this is set
227231
bool m_NoLimit;
228232

233+
//! The number of bytes over the high limit for memory usage at the last allocation failure
234+
std::size_t m_CurrentBytesExceeded;
235+
229236
//! Test friends
230237
friend class ::CResourceMonitorTest;
231238
friend class ::CResourceLimitTest;

lib/api/CModelSizeStatsJsonWriter.cc

+10-10
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ namespace {
1616
const std::string JOB_ID("job_id");
1717
const std::string MODEL_SIZE_STATS("model_size_stats");
1818
const std::string MODEL_BYTES("model_bytes");
19+
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
20+
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
1921
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
2022
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
2123
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
@@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,
3335

3436
writer.String(JOB_ID);
3537
writer.String(jobId);
38+
3639
writer.String(MODEL_BYTES);
37-
// Background persist causes the memory size to double due to copying
38-
// the models. On top of that, after the persist is done we may not
39-
// be able to retrieve that memory back. Thus, we report twice the
40-
// memory usage in order to allow for that.
41-
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
42-
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
43-
// discusses adding an option to perform only foreground persist.
44-
// If that gets implemented, we should only double when background
45-
// persist is configured.
46-
writer.Uint64(results.s_Usage * 2);
40+
writer.Uint64(results.s_AdjustedUsage);
41+
42+
writer.String(MODEL_BYTES_EXCEEDED);
43+
writer.Uint64(results.s_BytesExceeded);
44+
45+
writer.String(MODEL_BYTES_MEMORY_LIMIT);
46+
writer.Uint64(results.s_BytesMemoryLimit);
4747

4848
writer.String(TOTAL_BY_FIELD_COUNT);
4949
writer.Uint64(results.s_ByFields);

lib/api/unittest/CJsonOutputWriterTest.cc

+17-10
Original file line numberDiff line numberDiff line change
@@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
15551555

15561556
ml::model::CResourceMonitor::SResults resourceUsage;
15571557
resourceUsage.s_Usage = 1;
1558-
resourceUsage.s_ByFields = 2;
1559-
resourceUsage.s_PartitionFields = 3;
1560-
resourceUsage.s_OverFields = 4;
1561-
resourceUsage.s_AllocationFailures = 5;
1558+
resourceUsage.s_AdjustedUsage = 2;
1559+
resourceUsage.s_ByFields = 3;
1560+
resourceUsage.s_PartitionFields = 4;
1561+
resourceUsage.s_OverFields = 5;
1562+
resourceUsage.s_AllocationFailures = 6;
15621563
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
1563-
resourceUsage.s_BucketStartTime = 6;
1564+
resourceUsage.s_BucketStartTime = 7;
1565+
resourceUsage.s_BytesExceeded = 8;
1566+
resourceUsage.s_BytesMemoryLimit = 9;
15641567

15651568
writer.reportMemoryUsage(resourceUsage);
15661569
writer.endOutputBatch(false, 1ul);
@@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
15801583
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
15811584
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
15821585
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
1583-
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
1586+
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
15841587
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
1585-
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
1588+
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
15861589
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
1587-
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
1590+
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
15881591
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
1589-
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
1592+
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
15901593
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
1591-
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
1594+
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
15921595
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
15931596
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
15941597
std::string(sizeStats["memory_status"].GetString()));
15951598
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
15961599
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
15971600
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
15981601
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
1602+
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
1603+
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
1604+
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
1605+
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
15991606
}
16001607

16011608
void CJsonOutputWriterTest::testWriteScheduledEvent() {

lib/api/unittest/CModelSnapshotJsonWriterTest.cc

+9-1
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
3333
{
3434
model::CResourceMonitor::SResults modelSizeStats{
3535
10000, // bytes used
36+
20000, // bytes used (adjusted)
3637
3, // # by fields
3738
1, // # partition fields
3839
150, // # over fields
3940
4, // # allocation failures
4041
model_t::E_MemoryStatusOk, // memory status
41-
core_t::TTime(1521046309) // bucket start time
42+
core_t::TTime(1521046309), // bucket start time
43+
0, // model bytes exceeded
44+
50000 // model bytes memory limit
4245
};
4346

4447
CModelSnapshotJsonWriter::SModelSnapshotReport report{
@@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
114117
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
115118
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
116119
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
120+
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
121+
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
122+
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
123+
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
124+
modelSizeStats["model_bytes_memory_limit"].GetInt64());
117125

118126
CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
119127
const rapidjson::Value& quantiles = snapshot["quantiles"];

lib/model/CResourceMonitor.cc

+29-10
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
3535
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
3636
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
3737
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
38-
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
38+
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
39+
m_NoLimit(false), m_CurrentBytesExceeded(0) {
3940
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
4041
}
4142

@@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {
108109

109110
void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
110111
this->memUsage(&detector);
111-
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
112-
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());
112+
113113
this->updateAllowAllocations();
114114
}
115115

116116
void CResourceMonitor::updateAllowAllocations() {
117117
std::size_t total{this->totalMemory()};
118+
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
119+
LOG_TRACE(<< "Checking allocations: currently at " << total);
118120
if (m_AllowAllocations) {
119121
if (total > this->highLimit()) {
120122
LOG_INFO(<< "Over current allocation high limit. " << total
121123
<< " bytes used, the limit is " << this->highLimit());
122124
m_AllowAllocations = false;
125+
std::size_t bytesExceeded{total - this->highLimit()};
126+
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
123127
}
124128
} else if (total < this->lowLimit()) {
125129
LOG_INFO(<< "Below current allocation low limit. " << total
@@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
204208
return m_AllowAllocations;
205209
}
206210

207-
bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
208-
if (m_AllowAllocations) {
209-
return this->totalMemory() + size < this->highLimit();
210-
}
211-
return false;
212-
}
213-
214211
std::size_t CResourceMonitor::allocationLimit() const {
215212
return this->highLimit() - std::min(this->highLimit(), this->totalMemory());
216213
}
@@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
268265
res.s_OverFields = 0;
269266
res.s_PartitionFields = 0;
270267
res.s_Usage = this->totalMemory();
268+
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
269+
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
270+
res.s_BytesExceeded = m_CurrentBytesExceeded;
271271
res.s_AllocationFailures = 0;
272272
res.s_MemoryStatus = m_MemoryStatus;
273273
res.s_BucketStartTime = bucketStartTime;
@@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
281281
return res;
282282
}
283283

284+
std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
285+
// Background persist causes the memory size to double due to copying
286+
// the models. On top of that, after the persist is done we may not
287+
// be able to retrieve that memory back. Thus, we report twice the
288+
// memory usage in order to allow for that.
289+
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
290+
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
291+
// discusses adding an option to perform only foreground persist.
292+
// If that gets implemented, we should only double when background
293+
// persist is configured.
294+
295+
// We also scale the reported memory usage by the inverse of the byte limit margin.
296+
// This gives the user a fairer indication of how close the job is to hitting
297+
// the model memory limit in a concise manner (as the limit is scaled down by
298+
// the margin during the beginning period of the job's existence).
299+
size_t adjustedUsage = static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
300+
return adjustedUsage;
301+
}
302+
284303
void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
285304
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
286305
++m_AllocationFailures[time];

0 commit comments

Comments
 (0)