Skip to content

Commit 72346b9

Browse files
authored
[ML] Add new categorization stats to model_size_stats (#51879)
This change adds support for the following new model_size_stats fields: categorized_doc_count, total_category_count, frequent_category_count, rare_category_count, dead_category_count, and categorization_status. Relates #50749
1 parent de4cf2b commit 72346b9

File tree

11 files changed

+497
-20
lines changed

11 files changed

+497
-20
lines changed

client/rest-high-level/src/main/java/org/elasticsearch/client/ml/job/process/ModelSizeStats.java

+133-6
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
import java.util.Objects;
3434

3535
/**
36-
* Provide access to the C++ model memory usage numbers for the Java process.
36+
* Provide access to the C++ model size stats for the Java process.
3737
*/
3838
public class ModelSizeStats implements ToXContentObject {
3939

@@ -54,6 +54,12 @@ public class ModelSizeStats implements ToXContentObject {
5454
public static final ParseField TOTAL_PARTITION_FIELD_COUNT_FIELD = new ParseField("total_partition_field_count");
5555
public static final ParseField BUCKET_ALLOCATION_FAILURES_COUNT_FIELD = new ParseField("bucket_allocation_failures_count");
5656
public static final ParseField MEMORY_STATUS_FIELD = new ParseField("memory_status");
57+
public static final ParseField CATEGORIZED_DOC_COUNT_FIELD = new ParseField("categorized_doc_count");
58+
public static final ParseField TOTAL_CATEGORY_COUNT_FIELD = new ParseField("total_category_count");
59+
public static final ParseField FREQUENT_CATEGORY_COUNT_FIELD = new ParseField("frequent_category_count");
60+
public static final ParseField RARE_CATEGORY_COUNT_FIELD = new ParseField("rare_category_count");
61+
public static final ParseField DEAD_CATEGORY_COUNT_FIELD = new ParseField("dead_category_count");
62+
public static final ParseField CATEGORIZATION_STATUS_FIELD = new ParseField("categorization_status");
5763
public static final ParseField LOG_TIME_FIELD = new ParseField("log_time");
5864
public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp");
5965

@@ -69,6 +75,14 @@ public class ModelSizeStats implements ToXContentObject {
6975
PARSER.declareLong(Builder::setTotalByFieldCount, TOTAL_BY_FIELD_COUNT_FIELD);
7076
PARSER.declareLong(Builder::setTotalOverFieldCount, TOTAL_OVER_FIELD_COUNT_FIELD);
7177
PARSER.declareLong(Builder::setTotalPartitionFieldCount, TOTAL_PARTITION_FIELD_COUNT_FIELD);
78+
PARSER.declareField(Builder::setMemoryStatus, p -> MemoryStatus.fromString(p.text()), MEMORY_STATUS_FIELD, ValueType.STRING);
79+
PARSER.declareLong(Builder::setCategorizedDocCount, CATEGORIZED_DOC_COUNT_FIELD);
80+
PARSER.declareLong(Builder::setTotalCategoryCount, TOTAL_CATEGORY_COUNT_FIELD);
81+
PARSER.declareLong(Builder::setFrequentCategoryCount, FREQUENT_CATEGORY_COUNT_FIELD);
82+
PARSER.declareLong(Builder::setRareCategoryCount, RARE_CATEGORY_COUNT_FIELD);
83+
PARSER.declareLong(Builder::setDeadCategoryCount, DEAD_CATEGORY_COUNT_FIELD);
84+
PARSER.declareField(Builder::setCategorizationStatus,
85+
p -> CategorizationStatus.fromString(p.text()), CATEGORIZATION_STATUS_FIELD, ValueType.STRING);
7286
PARSER.declareField(Builder::setLogTime,
7387
(p) -> TimeUtil.parseTimeField(p, LOG_TIME_FIELD.getPreferredName()),
7488
LOG_TIME_FIELD,
@@ -77,7 +91,6 @@ public class ModelSizeStats implements ToXContentObject {
7791
(p) -> TimeUtil.parseTimeField(p, TIMESTAMP_FIELD.getPreferredName()),
7892
TIMESTAMP_FIELD,
7993
ValueType.VALUE);
80-
PARSER.declareField(Builder::setMemoryStatus, p -> MemoryStatus.fromString(p.text()), MEMORY_STATUS_FIELD, ValueType.STRING);
8194
}
8295

8396
/**
@@ -99,6 +112,23 @@ public String toString() {
99112
}
100113
}
101114

115+
/**
116+
* The status of categorization for a job. OK is default, WARN
117+
* means that inappropriate numbers of categories are being found
118+
*/
119+
public enum CategorizationStatus {
120+
OK, WARN;
121+
122+
public static CategorizationStatus fromString(String statusName) {
123+
return valueOf(statusName.trim().toUpperCase(Locale.ROOT));
124+
}
125+
126+
@Override
127+
public String toString() {
128+
return name().toLowerCase(Locale.ROOT);
129+
}
130+
}
131+
102132
private final String jobId;
103133
private final long modelBytes;
104134
private final Long modelBytesExceeded;
@@ -108,12 +138,20 @@ public String toString() {
108138
private final long totalPartitionFieldCount;
109139
private final long bucketAllocationFailuresCount;
110140
private final MemoryStatus memoryStatus;
141+
private final long categorizedDocCount;
142+
private final long totalCategoryCount;
143+
private final long frequentCategoryCount;
144+
private final long rareCategoryCount;
145+
private final long deadCategoryCount;
146+
private final CategorizationStatus categorizationStatus;
111147
private final Date timestamp;
112148
private final Date logTime;
113149

114150
private ModelSizeStats(String jobId, long modelBytes, Long modelBytesExceeded, Long modelBytesMemoryLimit, long totalByFieldCount,
115151
long totalOverFieldCount, long totalPartitionFieldCount, long bucketAllocationFailuresCount,
116-
MemoryStatus memoryStatus, Date timestamp, Date logTime) {
152+
MemoryStatus memoryStatus, long categorizedDocCount, long totalCategoryCount, long frequentCategoryCount,
153+
long rareCategoryCount, long deadCategoryCount, CategorizationStatus categorizationStatus,
154+
Date timestamp, Date logTime) {
117155
this.jobId = jobId;
118156
this.modelBytes = modelBytes;
119157
this.modelBytesExceeded = modelBytesExceeded;
@@ -123,6 +161,12 @@ private ModelSizeStats(String jobId, long modelBytes, Long modelBytesExceeded, L
123161
this.totalPartitionFieldCount = totalPartitionFieldCount;
124162
this.bucketAllocationFailuresCount = bucketAllocationFailuresCount;
125163
this.memoryStatus = memoryStatus;
164+
this.categorizedDocCount = categorizedDocCount;
165+
this.totalCategoryCount = totalCategoryCount;
166+
this.frequentCategoryCount = frequentCategoryCount;
167+
this.rareCategoryCount = rareCategoryCount;
168+
this.deadCategoryCount = deadCategoryCount;
169+
this.categorizationStatus = categorizationStatus;
126170
this.timestamp = timestamp;
127171
this.logTime = logTime;
128172
}
@@ -145,6 +189,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
145189
builder.field(TOTAL_PARTITION_FIELD_COUNT_FIELD.getPreferredName(), totalPartitionFieldCount);
146190
builder.field(BUCKET_ALLOCATION_FAILURES_COUNT_FIELD.getPreferredName(), bucketAllocationFailuresCount);
147191
builder.field(MEMORY_STATUS_FIELD.getPreferredName(), memoryStatus);
192+
builder.field(CATEGORIZED_DOC_COUNT_FIELD.getPreferredName(), categorizedDocCount);
193+
builder.field(TOTAL_CATEGORY_COUNT_FIELD.getPreferredName(), totalCategoryCount);
194+
builder.field(FREQUENT_CATEGORY_COUNT_FIELD.getPreferredName(), frequentCategoryCount);
195+
builder.field(RARE_CATEGORY_COUNT_FIELD.getPreferredName(), rareCategoryCount);
196+
builder.field(DEAD_CATEGORY_COUNT_FIELD.getPreferredName(), deadCategoryCount);
197+
builder.field(CATEGORIZATION_STATUS_FIELD.getPreferredName(), categorizationStatus);
148198
builder.timeField(LOG_TIME_FIELD.getPreferredName(), LOG_TIME_FIELD.getPreferredName() + "_string", logTime.getTime());
149199
if (timestamp != null) {
150200
builder.timeField(TIMESTAMP_FIELD.getPreferredName(), TIMESTAMP_FIELD.getPreferredName() + "_string", timestamp.getTime());
@@ -190,6 +240,30 @@ public MemoryStatus getMemoryStatus() {
190240
return memoryStatus;
191241
}
192242

243+
public long getCategorizedDocCount() {
244+
return categorizedDocCount;
245+
}
246+
247+
public long getTotalCategoryCount() {
248+
return totalCategoryCount;
249+
}
250+
251+
public long getFrequentCategoryCount() {
252+
return frequentCategoryCount;
253+
}
254+
255+
public long getRareCategoryCount() {
256+
return rareCategoryCount;
257+
}
258+
259+
public long getDeadCategoryCount() {
260+
return deadCategoryCount;
261+
}
262+
263+
public CategorizationStatus getCategorizationStatus() {
264+
return categorizationStatus;
265+
}
266+
193267
/**
194268
* The timestamp of the last processed record when this instance was created.
195269
*
@@ -211,7 +285,8 @@ public Date getLogTime() {
211285
@Override
212286
public int hashCode() {
213287
return Objects.hash(jobId, modelBytes, modelBytesExceeded, modelBytesMemoryLimit, totalByFieldCount, totalOverFieldCount,
214-
totalPartitionFieldCount, this.bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
288+
totalPartitionFieldCount, this.bucketAllocationFailuresCount, memoryStatus, categorizedDocCount, totalCategoryCount,
289+
frequentCategoryCount, rareCategoryCount, deadCategoryCount, categorizationStatus, timestamp, logTime);
215290
}
216291

217292
/**
@@ -233,7 +308,14 @@ public boolean equals(Object other) {
233308
&& Objects.equals(this.modelBytesMemoryLimit, that.modelBytesMemoryLimit) && this.totalByFieldCount == that.totalByFieldCount
234309
&& this.totalOverFieldCount == that.totalOverFieldCount && this.totalPartitionFieldCount == that.totalPartitionFieldCount
235310
&& this.bucketAllocationFailuresCount == that.bucketAllocationFailuresCount
236-
&& Objects.equals(this.memoryStatus, that.memoryStatus) && Objects.equals(this.timestamp, that.timestamp)
311+
&& Objects.equals(this.memoryStatus, that.memoryStatus)
312+
&& this.categorizedDocCount == that.categorizedDocCount
313+
&& this.totalCategoryCount == that.totalCategoryCount
314+
&& this.frequentCategoryCount == that.frequentCategoryCount
315+
&& this.rareCategoryCount == that.rareCategoryCount
316+
&& this.deadCategoryCount == that.deadCategoryCount
317+
&& Objects.equals(this.categorizationStatus, that.categorizationStatus)
318+
&& Objects.equals(this.timestamp, that.timestamp)
237319
&& Objects.equals(this.logTime, that.logTime)
238320
&& Objects.equals(this.jobId, that.jobId);
239321
}
@@ -249,12 +331,19 @@ public static class Builder {
249331
private long totalPartitionFieldCount;
250332
private long bucketAllocationFailuresCount;
251333
private MemoryStatus memoryStatus;
334+
private long categorizedDocCount;
335+
private long totalCategoryCount;
336+
private long frequentCategoryCount;
337+
private long rareCategoryCount;
338+
private long deadCategoryCount;
339+
private CategorizationStatus categorizationStatus;
252340
private Date timestamp;
253341
private Date logTime;
254342

255343
public Builder(String jobId) {
256344
this.jobId = jobId;
257345
memoryStatus = MemoryStatus.OK;
346+
categorizationStatus = CategorizationStatus.OK;
258347
logTime = new Date();
259348
}
260349

@@ -268,6 +357,12 @@ public Builder(ModelSizeStats modelSizeStats) {
268357
this.totalPartitionFieldCount = modelSizeStats.totalPartitionFieldCount;
269358
this.bucketAllocationFailuresCount = modelSizeStats.bucketAllocationFailuresCount;
270359
this.memoryStatus = modelSizeStats.memoryStatus;
360+
this.categorizedDocCount = modelSizeStats.categorizedDocCount;
361+
this.totalCategoryCount = modelSizeStats.totalCategoryCount;
362+
this.frequentCategoryCount = modelSizeStats.frequentCategoryCount;
363+
this.rareCategoryCount = modelSizeStats.rareCategoryCount;
364+
this.deadCategoryCount = modelSizeStats.deadCategoryCount;
365+
this.categorizationStatus = modelSizeStats.categorizationStatus;
271366
this.timestamp = modelSizeStats.timestamp;
272367
this.logTime = modelSizeStats.logTime;
273368
}
@@ -313,6 +408,37 @@ public Builder setMemoryStatus(MemoryStatus memoryStatus) {
313408
return this;
314409
}
315410

411+
public Builder setCategorizedDocCount(long categorizedDocCount) {
412+
this.categorizedDocCount = categorizedDocCount;
413+
return this;
414+
}
415+
416+
public Builder setTotalCategoryCount(long totalCategoryCount) {
417+
this.totalCategoryCount = totalCategoryCount;
418+
return this;
419+
}
420+
421+
public Builder setFrequentCategoryCount(long frequentCategoryCount) {
422+
this.frequentCategoryCount = frequentCategoryCount;
423+
return this;
424+
}
425+
426+
public Builder setRareCategoryCount(long rareCategoryCount) {
427+
this.rareCategoryCount = rareCategoryCount;
428+
return this;
429+
}
430+
431+
public Builder setDeadCategoryCount(long deadCategoryCount) {
432+
this.deadCategoryCount = deadCategoryCount;
433+
return this;
434+
}
435+
436+
public Builder setCategorizationStatus(CategorizationStatus categorizationStatus) {
437+
Objects.requireNonNull(categorizationStatus, "[" + CATEGORIZATION_STATUS_FIELD.getPreferredName() + "] must not be null");
438+
this.categorizationStatus = categorizationStatus;
439+
return this;
440+
}
441+
316442
public Builder setTimestamp(Date timestamp) {
317443
this.timestamp = timestamp;
318444
return this;
@@ -325,7 +451,8 @@ public Builder setLogTime(Date logTime) {
325451

326452
public ModelSizeStats build() {
327453
return new ModelSizeStats(jobId, modelBytes, modelBytesExceeded, modelBytesMemoryLimit, totalByFieldCount, totalOverFieldCount,
328-
totalPartitionFieldCount, bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
454+
totalPartitionFieldCount, bucketAllocationFailuresCount, memoryStatus, categorizedDocCount, totalCategoryCount,
455+
frequentCategoryCount, rareCategoryCount, deadCategoryCount, categorizationStatus, timestamp, logTime);
329456
}
330457
}
331458
}

client/rest-high-level/src/test/java/org/elasticsearch/client/ml/job/process/ModelSizeStatsTests.java

+28-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import java.util.Date;
2626

27+
import static org.elasticsearch.client.ml.job.process.ModelSizeStats.CategorizationStatus;
2728
import static org.elasticsearch.client.ml.job.process.ModelSizeStats.MemoryStatus;
2829

2930
public class ModelSizeStatsTests extends AbstractXContentTestCase<ModelSizeStats> {
@@ -38,6 +39,12 @@ public void testDefaultConstructor() {
3839
assertEquals(0, stats.getTotalPartitionFieldCount());
3940
assertEquals(0, stats.getBucketAllocationFailuresCount());
4041
assertEquals(MemoryStatus.OK, stats.getMemoryStatus());
42+
assertEquals(0, stats.getCategorizedDocCount());
43+
assertEquals(0, stats.getTotalCategoryCount());
44+
assertEquals(0, stats.getFrequentCategoryCount());
45+
assertEquals(0, stats.getRareCategoryCount());
46+
assertEquals(0, stats.getDeadCategoryCount());
47+
assertEquals(CategorizationStatus.OK, stats.getCategorizationStatus());
4148
}
4249

4350
public void testSetMemoryStatus_GivenNull() {
@@ -85,13 +92,31 @@ public static ModelSizeStats createRandomized() {
8592
stats.setTotalPartitionFieldCount(randomNonNegativeLong());
8693
}
8794
if (randomBoolean()) {
88-
stats.setLogTime(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
95+
stats.setMemoryStatus(randomFrom(MemoryStatus.values()));
8996
}
9097
if (randomBoolean()) {
91-
stats.setTimestamp(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
98+
stats.setCategorizedDocCount(randomNonNegativeLong());
9299
}
93100
if (randomBoolean()) {
94-
stats.setMemoryStatus(randomFrom(MemoryStatus.values()));
101+
stats.setTotalCategoryCount(randomNonNegativeLong());
102+
}
103+
if (randomBoolean()) {
104+
stats.setFrequentCategoryCount(randomNonNegativeLong());
105+
}
106+
if (randomBoolean()) {
107+
stats.setRareCategoryCount(randomNonNegativeLong());
108+
}
109+
if (randomBoolean()) {
110+
stats.setDeadCategoryCount(randomNonNegativeLong());
111+
}
112+
if (randomBoolean()) {
113+
stats.setCategorizationStatus(randomFrom(CategorizationStatus.values()));
114+
}
115+
if (randomBoolean()) {
116+
stats.setLogTime(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
117+
}
118+
if (randomBoolean()) {
119+
stats.setTimestamp(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
95120
}
96121
return stats.build();
97122
}

docs/reference/ml/anomaly-detection/apis/get-job-stats.asciidoc

+41
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,35 @@ model. It has the following properties:
195195
processed due to insufficient model memory. This situation is also signified
196196
by a `memory_status` property value of `hard_limit`.
197197

198+
`model_size_stats`.`categorized_doc_count`:::
199+
(long) The number of documents that have had a field categorized.
200+
201+
`model_size_stats`.`categorization_status`:::
202+
(string) The status of categorization for this job.
203+
Contains one of the following values.
204+
+
205+
--
206+
* `ok`: Categorization is performing acceptably well (or not being
207+
used at all).
208+
* `warn`: Categorization is detecting a distribution of categories
209+
that suggests the input data is inappropriate for categorization.
210+
Problems could be that there is only one category, more than 90% of
211+
categories are rare, the number of categories is greater than 50% of
212+
the number of categorized documents, there are no frequently
213+
matched categories, or more than 50% of categories are dead.
214+
215+
--
216+
217+
`model_size_stats`.`dead_category_count`:::
218+
(long) The number of categories created by categorization that will
219+
never be assigned again because another category's definition
220+
makes that category a superset of the dead category. (Dead categories are a
221+
side effect of the way categorization has no prior training.)
222+
223+
`model_size_stats`.`frequent_category_count`:::
224+
(long) The number of categories that match more than 1% of categorized
225+
documents.
226+
198227
`model_size_stats`.`job_id`:::
199228
(string)
200229
include::{docdir}/ml/ml-shared.asciidoc[tag=job-id-anomaly-detection]
@@ -226,13 +255,19 @@ this value indicates the latest size.
226255
`model_size_stats`.`model_bytes_memory_limit`:::
227256
(long) The upper limit for memory usage, checked on increasing values.
228257

258+
`model_size_stats`.`rare_category_count`:::
259+
(long) The number of categories that match just one categorized document.
260+
229261
`model_size_stats`.`result_type`:::
230262
(string) For internal use. The type of result.
231263

232264
`model_size_stats`.`total_by_field_count`:::
233265
(long) The number of `by` field values that were analyzed by the models. This
234266
value is cumulative for all detectors.
235267

268+
`model_size_stats`.`total_category_count`:::
269+
(long) The number of categories created by categorization.
270+
236271
`model_size_stats`.`total_over_field_count`:::
237272
(long) The number of `over` field values that were analyzed by the models. This
238273
value is cumulative for all detectors.
@@ -371,6 +406,12 @@ The API returns the following results:
371406
"total_partition_field_count" : 2,
372407
"bucket_allocation_failures_count" : 0,
373408
"memory_status" : "ok",
409+
"categorized_doc_count" : 0,
410+
"total_category_count" : 0,
411+
"frequent_category_count" : 0,
412+
"rare_category_count" : 0,
413+
"dead_category_count" : 0,
414+
"categorization_status" : "ok",
374415
"log_time" : 1576017596000,
375416
"timestamp" : 1580410800000
376417
},

0 commit comments

Comments
 (0)