Skip to content

Commit 9e2a2df

Browse files
committed
[ML] Reverse engineer Grok patterns from categorization results
It has been noted that the regexes we produce in our categorization results are not that far away from Grok patterns that could be used in Logstash to categorize messages at ingest time and do better field extraction for log formats that do not have out-of-the-box patterns.

This change adds a `grok_pattern` field to our GET categories API output. It's calculated using the regex and examples in the categorization result, and applying a list of candidate Grok patterns to the bits in between the tokens that are considered to define the category.

This can currently be considered a prototype, as there is an outstanding question on how the functionality should work:

* Is calculating the Grok patterns on the fly the best thing to do? It might be better to calculate them when categorization results are created/updated, and store the patterns in a new type of ML results document. Then we could let users manually improve the patterns and remember their edits. But the decision here needs to tie in with the end-to-end story for this functionality. If the intended flow is `ML -> User edits in the Grok debugger -> Logstash config` then maybe there's no need for ML to remember the user edits.
1 parent 2c3e71f commit 9e2a2df

File tree

9 files changed

+540
-34
lines changed

9 files changed

+540
-34
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66
package org.elasticsearch.xpack.core.ml.job.results;
77

8+
import org.elasticsearch.Version;
89
import org.elasticsearch.common.ParseField;
910
import org.elasticsearch.common.io.stream.StreamInput;
1011
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -34,6 +35,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
3435
public static final ParseField REGEX = new ParseField("regex");
3536
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
3637
public static final ParseField EXAMPLES = new ParseField("examples");
38+
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
3739

3840
// Used for QueryPage
3941
public static final ParseField RESULTS_FIELD = new ParseField("categories");
@@ -51,6 +53,7 @@ private static ConstructingObjectParser<CategoryDefinition, Void> createParser(b
5153
parser.declareString(CategoryDefinition::setRegex, REGEX);
5254
parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
5355
parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
56+
parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
5457

5558
return parser;
5659
}
@@ -61,6 +64,7 @@ private static ConstructingObjectParser<CategoryDefinition, Void> createParser(b
6164
private String regex = "";
6265
private long maxMatchingLength = 0L;
6366
private final Set<String> examples;
67+
private String grokPattern;
6468

6569
public CategoryDefinition(String jobId) {
6670
this.jobId = jobId;
@@ -74,6 +78,9 @@ public CategoryDefinition(StreamInput in) throws IOException {
7478
regex = in.readString();
7579
maxMatchingLength = in.readLong();
7680
examples = new TreeSet<>(in.readList(StreamInput::readString));
81+
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
82+
grokPattern = in.readOptionalString();
83+
}
7784
}
7885

7986
@Override
@@ -84,6 +91,9 @@ public void writeTo(StreamOutput out) throws IOException {
8491
out.writeString(regex);
8592
out.writeLong(maxMatchingLength);
8693
out.writeStringList(new ArrayList<>(examples));
94+
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
95+
out.writeOptionalString(grokPattern);
96+
}
8797
}
8898

8999
public String getJobId() {
@@ -139,6 +149,14 @@ public void addExample(String example) {
139149
examples.add(example);
140150
}
141151

152+
public String getGrokPattern() {
153+
return grokPattern;
154+
}
155+
156+
public void setGrokPattern(String grokPattern) {
157+
this.grokPattern = grokPattern;
158+
}
159+
142160
@Override
143161
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
144162
builder.startObject();
@@ -148,6 +166,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
148166
builder.field(REGEX.getPreferredName(), regex);
149167
builder.field(MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength);
150168
builder.field(EXAMPLES.getPreferredName(), examples);
169+
if (grokPattern != null) {
170+
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
171+
}
151172
builder.endObject();
152173
return builder;
153174
}
@@ -166,11 +187,12 @@ public boolean equals(Object other) {
166187
&& Objects.equals(this.terms, that.terms)
167188
&& Objects.equals(this.regex, that.regex)
168189
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
169-
&& Objects.equals(this.examples, that.examples);
190+
&& Objects.equals(this.examples, that.examples)
191+
&& Objects.equals(this.grokPattern, that.grokPattern);
170192
}
171193

172194
@Override
173195
public int hashCode() {
174-
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples);
196+
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
175197
}
176198
}

x-pack/plugin/ml/build.gradle

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ dependencies {
4646
testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')
4747

4848
// ml deps
49+
compile project(':libs:grok')
4950
compile 'net.sf.supercsv:super-csv:2.4.0'
5051
nativeBundle "org.elasticsearch.ml:ml-cpp:${project.version}@zip"
5152
testCompile 'org.ini4j:ini4j:0.5.2'
@@ -85,7 +86,7 @@ task internalClusterTest(type: RandomizedTestingTask,
8586
include '**/*IT.class'
8687
systemProperty 'es.set.netty.runtime.available.processors', 'false'
8788
}
88-
check.dependsOn internalClusterTest
89+
check.dependsOn internalClusterTest
8990
internalClusterTest.mustRunAfter test
9091

9192
// also add an "alias" task to make typing on the command line easier

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ protected void doExecute(GetCategoriesAction.Request request, ActionListener<Get
4141

4242
Integer from = request.getPageParams() != null ? request.getPageParams().getFrom() : null;
4343
Integer size = request.getPageParams() != null ? request.getPageParams().getSize() : null;
44-
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), from, size,
44+
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), true, from, size,
4545
r -> listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client);
4646
}
4747
}
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.ml.job.categorization;
7+
8+
import org.elasticsearch.grok.Grok;
9+
10+
import java.io.BufferedReader;
11+
import java.io.IOException;
12+
import java.io.InputStreamReader;
13+
import java.io.UncheckedIOException;
14+
import java.nio.charset.StandardCharsets;
15+
import java.util.ArrayList;
16+
import java.util.Arrays;
17+
import java.util.Collection;
18+
import java.util.Collections;
19+
import java.util.HashMap;
20+
import java.util.List;
21+
import java.util.Map;
22+
import java.util.regex.Matcher;
23+
import java.util.regex.Pattern;
24+
25+
26+
/**
27+
* Creates Grok patterns that will match all the examples in a given category_definition.
28+
*
29+
* The choice of field names is quite primitive. The intention is that a human will edit these.
30+
*/
31+
public final class GrokPatternCreator {
32+
33+
private static String PREFACE = "preface";
34+
private static String EPILOGUE = "epilogue";
35+
36+
/**
37+
* The first match in this list will be chosen, so it needs to be ordered
38+
* such that more generic patterns come after more specific patterns.
39+
*/
40+
private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
41+
new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
42+
new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
43+
new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
44+
new GrokPatternCandidate("DATESTAMP_OTHER", "timestamp"),
45+
new GrokPatternCandidate("DATESTAMP_EVENTLOG", "timestamp"),
46+
new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
47+
new GrokPatternCandidate("HTTPDATE", "timestamp"),
48+
new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
49+
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
50+
new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
51+
new GrokPatternCandidate("DATE", "date"),
52+
new GrokPatternCandidate("TIME", "time"),
53+
new GrokPatternCandidate("LOGLEVEL", "loglevel"),
54+
new GrokPatternCandidate("URI", "uri"),
55+
new GrokPatternCandidate("UUID", "uuid"),
56+
new GrokPatternCandidate("MAC", "macaddress"),
57+
// Can't use \b as the breaks, because slashes are not "word" characters
58+
new GrokPatternCandidate("PATH", "path", "(?<!\\w)", "(?!\\w)"),
59+
new GrokPatternCandidate("EMAILADDRESS", "email"),
60+
// TODO: would be nice to have IPORHOST here, but HOST matches almost all words
61+
new GrokPatternCandidate("IP", "ipaddress"),
62+
// This already includes pre/post break conditions
63+
new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
64+
// Can't use \b as the break before, because it doesn't work for negative numbers (the
65+
// minus sign is not a "word" character)
66+
new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
67+
// Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
68+
// numbers that NUMBER rejected due to preceeding characters
69+
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
70+
// TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
71+
// Fixing these problems with overly broad matches would require some extra intelligence
72+
// to be added to remove inappropriate matches. One idea would be to use a dictionary,
73+
// but that doesn't necessarily help as "jay" could be a username but is also a dictionary
74+
// word (plus there's the international headache with relying on dictionaries). Similarly,
75+
// hostnames could also be dictionary words - I've worked on machines called "hippo" and
76+
// "scarf" in the past. Another idea would be to look at the adjacent characters and
77+
// apply some heuristic based on those.
78+
);
79+
80+
private GrokPatternCreator() {
81+
}
82+
83+
/**
84+
* Given a category definition regex and a collection of examples from the category, return
85+
* a grok pattern that will match the category and pull out any likely fields. The extracted
86+
* fields are given pretty generic names, but unique within the grok pattern provided. The
87+
* expectation is that a user will adjust the extracted field names based on their domain
88+
* knowledge.
89+
*/
90+
public static String findBestGrokMatchFromExamples(String regex, Collection<String> examples) {
91+
92+
// The first string in this array will end up being the empty string, and it doesn't correspond
93+
// to an "in between" bit. Although it could be removed for "neatness", it actually makes the
94+
// loops below slightly neater if it's left in.
95+
//
96+
// E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
97+
String[] fixedRegexBits = regex.split("\\.[*+]\\??");
98+
99+
// Create a pattern that will capture the bits in between the fixed parts of the regex
100+
//
101+
// E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
102+
Pattern exampleProcessor = Pattern.compile(regex.replaceAll("(\\.[*+]\\??)", "($1)"), Pattern.DOTALL);
103+
104+
List<Collection<String>> inBetweenBits = new ArrayList<>(fixedRegexBits.length);
105+
for (String example : examples) {
106+
Matcher matcher = exampleProcessor.matcher(example);
107+
if (matcher.matches()) {
108+
assert matcher.groupCount() == fixedRegexBits.length;
109+
// E.g., if the input regex was ".*?cat.+?sat.+?mat.*" then the example
110+
// "the cat sat on the mat" will result in "the ", " ", " on the ", and ""
111+
// being added to the 4 "in between" collections in that order
112+
for (int groupNum = 1; groupNum <= matcher.groupCount(); ++groupNum) {
113+
if (inBetweenBits.size() < groupNum) {
114+
inBetweenBits.add(new ArrayList<>(examples.size()));
115+
}
116+
inBetweenBits.get(groupNum - 1).add(matcher.group(groupNum));
117+
}
118+
} else {
119+
// We should never get here. If we do it implies a bug in the original categorization,
120+
// as it's produced a regex that doesn't match the examples.
121+
assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
122+
}
123+
}
124+
125+
Map<String, Integer> fieldNameCountStore = new HashMap<>();
126+
StringBuilder overallGrokPatternBuilder = new StringBuilder();
127+
// Finally, for each collection of "in between" bits we look for the best Grok pattern and incorporate
128+
// it into the overall Grok pattern that will match the each example in its entirety
129+
for (int inBetweenBitNum = 0; inBetweenBitNum < inBetweenBits.size(); ++inBetweenBitNum) {
130+
// Remember (from the first comment in this method) that the first element in this array is
131+
// always the empty string
132+
overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
133+
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
134+
inBetweenBitNum == fixedRegexBits.length - 1, inBetweenBits.get(inBetweenBitNum));
135+
}
136+
return overallGrokPatternBuilder.toString();
137+
}
138+
139+
/**
140+
* Given a collection of strings, work out which (if any) of the grok patterns we're allowed
141+
* to use matches it best. Then append the appropriate grok language to represent that finding
142+
* onto the supplied string builder.
143+
*/
144+
static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
145+
boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {
146+
147+
GrokPatternCandidate bestCandidate = null;
148+
for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) {
149+
if (mustMatchStrings.stream().allMatch(candidate.grok::match)) {
150+
bestCandidate = candidate;
151+
break;
152+
}
153+
}
154+
155+
if (bestCandidate == null) {
156+
if (isLast) {
157+
overallGrokPatternBuilder.append(".*");
158+
} else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
159+
overallGrokPatternBuilder.append(".*?");
160+
} else {
161+
overallGrokPatternBuilder.append(".+?");
162+
}
163+
} else {
164+
Collection<String> prefaces = new ArrayList<>();
165+
Collection<String> epilogues = new ArrayList<>();
166+
populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
167+
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
168+
overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
169+
.append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
170+
appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
171+
}
172+
}
173+
174+
/**
175+
* Given a collection of strings, and a grok pattern that matches some part of them all,
176+
* return collections of the bits that come before (prefaces) and after (epilogues) the
177+
* bit that matches.
178+
*/
179+
static void populatePrefacesAndEpilogues(Collection<String> matchingStrings, Grok grok, Collection<String> prefaces,
180+
Collection<String> epilogues) {
181+
for (String s : matchingStrings) {
182+
Map<String, Object> captures = grok.captures(s);
183+
// If the pattern doesn't match then captures will be null. But we expect this
184+
// method to only be called after validating that the pattern does match.
185+
assert captures != null;
186+
prefaces.add(captures.getOrDefault(PREFACE, "").toString());
187+
epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
188+
}
189+
}
190+
191+
/**
192+
* The first time a particular field name is passed, simply return it.
193+
* The second time return it with "2" appended.
194+
* The third time return it with "3" appended.
195+
* Etc.
196+
*/
197+
static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
198+
Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
199+
if (numberSeen > 1) {
200+
return fieldName + numberSeen;
201+
} else {
202+
return fieldName;
203+
}
204+
}
205+
206+
static class GrokPatternCandidate {
207+
208+
final String grokPatternName;
209+
final String fieldName;
210+
final Grok grok;
211+
212+
/**
213+
* Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
214+
* end with a non "word" character (i.e. letter, number or underscore). For such patterns use one
215+
* of the other constructors.
216+
*
217+
* In cases where the Grok pattern defined by Logstash already includes conditions on what must
218+
* come before and after the match, use one of the other constructors and specify an empty string
219+
* for the pre and/or post breaks.
220+
* @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
221+
* @param fieldName Name of the field to extract from the match.
222+
*/
223+
GrokPatternCandidate(String grokPatternName, String fieldName) {
224+
this(grokPatternName, fieldName, "\\b", "\\b");
225+
}
226+
227+
GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak) {
228+
this(grokPatternName, fieldName, preBreak, "\\b");
229+
}
230+
231+
/**
232+
* @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
233+
* @param fieldName Name of the field to extract from the match.
234+
* @param preBreak Only consider the match if it's broken from the previous text by this.
235+
* @param postBreak Only consider the match if it's broken from the following text by this.
236+
*/
237+
GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak, String postBreak) {
238+
this.grokPatternName = grokPatternName;
239+
this.fieldName = fieldName;
240+
this.grok = new Grok(Grok.getBuiltinPatterns(), "%{DATA:" + PREFACE + "}" + preBreak + "%{" + grokPatternName + ":this}" +
241+
postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
242+
}
243+
}
244+
}

0 commit comments

Comments
 (0)