Skip to content

Commit 50c34b2

Browse files
authored
[ML] Reverse engineer Grok patterns from categorization results (#30125)
This change adds a grok_pattern field to the GET categories API output in ML. It is calculated from the regex and examples in the categorization result by applying a list of candidate Grok patterns to the text between the tokens that are considered to define the category. This should currently be considered a prototype, as the Grok patterns it produces are not optimal. However, enough people have said it would be useful that it is worthwhile exposing it as experimental functionality for interested parties to try out.
1 parent 7dd816e commit 50c34b2

File tree

11 files changed, with 561 additions and 43 deletions

x-pack/docs/en/rest-api/ml/get-category.asciidoc

+14-10
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ roles provide these privileges. For more information, see
6262
==== Examples
6363

6464
The following example gets information about one category for the
65-
`it_ops_new_logs` job:
65+
`esxi_log` job:
6666

6767
[source,js]
6868
--------------------------------------------------
69-
GET _xpack/ml/anomaly_detectors/it_ops_new_logs/results/categories
69+
GET _xpack/ml/anomaly_detectors/esxi_log/results/categories
7070
{
7171
"page":{
7272
"size": 1
@@ -83,14 +83,18 @@ In this example, the API returns the following information:
8383
"count": 11,
8484
"categories": [
8585
{
86-
"job_id": "it_ops_new_logs",
87-
"category_id": 1,
88-
"terms": "Actual Transaction Already Voided Reversed hostname dbserver.acme.com physicalhost esxserver1.acme.com vmhost app1.acme.com",
89-
"regex": ".*?Actual.+?Transaction.+?Already.+?Voided.+?Reversed.+?hostname.+?dbserver.acme.com.+?physicalhost.+?esxserver1.acme.com.+?vmhost.+?app1.acme.com.*",
90-
"max_matching_length": 137,
91-
"examples": [
92-
"Actual Transaction Already Voided / Reversed;hostname=dbserver.acme.com;physicalhost=esxserver1.acme.com;vmhost=app1.acme.com"
93-
]
86+
"job_id" : "esxi_log",
87+
"category_id" : 1,
88+
"terms" : "Vpxa verbose vpxavpxaInvtVm opID VpxaInvtVmChangeListener Guest DiskInfo Changed",
89+
"regex" : ".*?Vpxa.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*",
90+
"max_matching_length": 154,
91+
"examples" : [
92+
"Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
93+
"Oct 19 17:04:45 esxi2.acme.com Vpxa: [3CA66B90 verbose 'vpxavpxaInvtVm' opID=WFU-33927856] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
94+
"Oct 19 17:04:51 esxi1.acme.com Vpxa: [FFDBAB90 verbose 'vpxavpxaInvtVm' opID=WFU-25e0d447] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
95+
"Oct 19 17:04:58 esxi2.acme.com Vpxa: [FFDDBB90 verbose 'vpxavpxaInvtVm' opID=WFU-bbff0134] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"
96+
],
97+
"grok_pattern" : ".*?%{SYSLOGTIMESTAMP:timestamp}.+?Vpxa.+?%{BASE16NUM:field}.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*"
9498
}
9599
]
96100
}

x-pack/docs/en/rest-api/ml/resultsresource.asciidoc

+7
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,13 @@ A category resource has the following properties:
405405
`examples`::
406406
(array) A list of examples of actual values that matched the category.
407407

408+
`grok_pattern`::
409+
experimental[] (string) A Grok pattern that could be used in Logstash or an
410+
Ingest Pipeline to extract fields from messages that match the category. This
411+
field is experimental and may be changed or removed in a future release. The
412+
Grok patterns that are found are not optimal, but are often a good starting
413+
point for manual tweaking.
414+
408415
`job_id`::
409416
(string) The unique identifier for the job that these results belong to.
410417

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java

+24-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66
package org.elasticsearch.xpack.core.ml.job.results;
77

8+
import org.elasticsearch.Version;
89
import org.elasticsearch.common.ParseField;
910
import org.elasticsearch.common.io.stream.StreamInput;
1011
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -34,6 +35,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
3435
public static final ParseField REGEX = new ParseField("regex");
3536
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
3637
public static final ParseField EXAMPLES = new ParseField("examples");
38+
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
3739

3840
// Used for QueryPage
3941
public static final ParseField RESULTS_FIELD = new ParseField("categories");
@@ -51,6 +53,7 @@ private static ConstructingObjectParser<CategoryDefinition, Void> createParser(b
5153
parser.declareString(CategoryDefinition::setRegex, REGEX);
5254
parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
5355
parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
56+
parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
5457

5558
return parser;
5659
}
@@ -61,6 +64,7 @@ private static ConstructingObjectParser<CategoryDefinition, Void> createParser(b
6164
private String regex = "";
6265
private long maxMatchingLength = 0L;
6366
private final Set<String> examples;
67+
private String grokPattern;
6468

6569
public CategoryDefinition(String jobId) {
6670
this.jobId = jobId;
@@ -74,6 +78,9 @@ public CategoryDefinition(StreamInput in) throws IOException {
7478
regex = in.readString();
7579
maxMatchingLength = in.readLong();
7680
examples = new TreeSet<>(in.readList(StreamInput::readString));
81+
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
82+
grokPattern = in.readOptionalString();
83+
}
7784
}
7885

7986
@Override
@@ -84,6 +91,9 @@ public void writeTo(StreamOutput out) throws IOException {
8491
out.writeString(regex);
8592
out.writeLong(maxMatchingLength);
8693
out.writeStringList(new ArrayList<>(examples));
94+
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
95+
out.writeOptionalString(grokPattern);
96+
}
8797
}
8898

8999
public String getJobId() {
@@ -139,6 +149,14 @@ public void addExample(String example) {
139149
examples.add(example);
140150
}
141151

152+
public String getGrokPattern() {
153+
return grokPattern;
154+
}
155+
156+
public void setGrokPattern(String grokPattern) {
157+
this.grokPattern = grokPattern;
158+
}
159+
142160
@Override
143161
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
144162
builder.startObject();
@@ -148,6 +166,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
148166
builder.field(REGEX.getPreferredName(), regex);
149167
builder.field(MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength);
150168
builder.field(EXAMPLES.getPreferredName(), examples);
169+
if (grokPattern != null) {
170+
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
171+
}
151172
builder.endObject();
152173
return builder;
153174
}
@@ -166,11 +187,12 @@ public boolean equals(Object other) {
166187
&& Objects.equals(this.terms, that.terms)
167188
&& Objects.equals(this.regex, that.regex)
168189
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
169-
&& Objects.equals(this.examples, that.examples);
190+
&& Objects.equals(this.examples, that.examples)
191+
&& Objects.equals(this.grokPattern, that.grokPattern);
170192
}
171193

172194
@Override
173195
public int hashCode() {
174-
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples);
196+
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
175197
}
176198
}

x-pack/plugin/ml/build.gradle

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ dependencies {
4646
testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')
4747

4848
// ml deps
49+
compile project(':libs:grok')
4950
compile 'net.sf.supercsv:super-csv:2.4.0'
5051
nativeBundle "org.elasticsearch.ml:ml-cpp:${project.version}@zip"
5152
testCompile 'org.ini4j:ini4j:0.5.2'

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ protected void doExecute(GetCategoriesAction.Request request, ActionListener<Get
4141

4242
Integer from = request.getPageParams() != null ? request.getPageParams().getFrom() : null;
4343
Integer size = request.getPageParams() != null ? request.getPageParams().getSize() : null;
44-
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), from, size,
44+
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), true, from, size,
4545
r -> listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client);
4646
}
4747
}

0 commit comments

Comments
 (0)