Skip to content

Commit 84eaac7

Browse files
authored
[ML] Minor improvements to categorization Grok pattern creation (#33353)
1. The TOMCAT_DATESTAMP format needs to be checked before TIMESTAMP_ISO8601, otherwise TIMESTAMP_ISO8601 will match the start of the Tomcat datestamp. 2. Exclude more characters before and after numbers. For example, in 1.2.3 we don't want to match 1.2 as a float.
1 parent d9f394b commit 84eaac7

File tree

2 files changed

+41
-9
lines changed

2 files changed

+41
-9
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,15 @@
2525
*/
2626
public final class GrokPatternCreator {
2727

28-
private static String PREFACE = "preface";
29-
private static String EPILOGUE = "epilogue";
28+
private static final String PREFACE = "preface";
29+
private static final String EPILOGUE = "epilogue";
3030

3131
/**
3232
* The first match in this list will be chosen, so it needs to be ordered
3333
* such that more generic patterns come after more specific patterns.
3434
*/
3535
private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
36+
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
3637
new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
3738
new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
3839
new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
@@ -41,7 +42,6 @@ public final class GrokPatternCreator {
4142
new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
4243
new GrokPatternCandidate("HTTPDATE", "timestamp"),
4344
new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
44-
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
4545
new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
4646
new GrokPatternCandidate("DATE", "date"),
4747
new GrokPatternCandidate("TIME", "time"),
@@ -56,12 +56,10 @@ public final class GrokPatternCreator {
5656
new GrokPatternCandidate("IP", "ipaddress"),
5757
// This already includes pre/post break conditions
5858
new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
59-
// Can't use \b as the break before, because it doesn't work for negative numbers (the
60-
// minus sign is not a "word" character)
61-
new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
62-
// Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
63-
// numbers that NUMBER rejected due to preceeding characters
64-
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
59+
// Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
60+
// up numeric suffices too eagerly
61+
new GrokPatternCandidate("NUMBER", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
62+
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
6563
// TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
6664
// Fixing these problems with overly broad matches would require some extra intelligence
6765
// to be added to remove inappropriate matches. One idea would be to use a dictionary,

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,40 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
7676
assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
7777
}
7878

79+
public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() {
80+
81+
// The first part of the Tomcat datestamp can match as an ISO8601
82+
// timestamp if the ordering of candidate patterns is wrong
83+
Collection<String> mustMatchStrings = Arrays.asList("2018-09-03 17:03:28,269 +0100 | ERROR | ",
84+
"2018-09-03 17:04:27,279 +0100 | DEBUG | ",
85+
"2018-09-03 17:05:26,289 +0100 | ERROR | ");
86+
87+
Map<String, Integer> fieldNameCountStore = new HashMap<>();
88+
StringBuilder overallGrokPatternBuilder = new StringBuilder();
89+
90+
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
91+
92+
assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
93+
}
94+
95+
public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() {
96+
97+
// If we're not careful then we might detect the first part of these strings as a
98+
// number, e.g. 1.2 in the first example, but this is inappropriate given the
99+
// trailing dot and digit
100+
Collection<String> mustMatchStrings = Arrays.asList("1.2.3",
101+
"-2.3.4",
102+
"4.5.6.7",
103+
"-9.8.7.6.5");
104+
105+
Map<String, Integer> fieldNameCountStore = new HashMap<>();
106+
StringBuilder overallGrokPatternBuilder = new StringBuilder();
107+
108+
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
109+
110+
assertEquals(".+?", overallGrokPatternBuilder.toString());
111+
}
112+
79113
public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
80114

81115
Collection<String> mustMatchStrings = Arrays.asList("(-2)",

0 commit comments

Comments
 (0)