elastic
diff --git a/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java
Lines changed: 24 additions & 2 deletions b/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java
Lines changed: 24 additions & 2 deletions
diff --git a/‎x-pack/plugin/ml/build.gradle
Lines changed: 2 additions & 1 deletion b/‎x-pack/plugin/ml/build.gradle
Lines changed: 2 additions & 1 deletion
diff --git a/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java
Lines changed: 1 addition & 1 deletion b/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java
Lines changed: 1 addition & 1 deletion
diff --git a/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java
Lines changed: 244 additions & 0 deletions b/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java
Lines changed: 244 additions & 0 deletions
@@ -5,6 +5,7 @@
  */
 package org.elasticsearch.xpack.core.ml.job.results;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -34,6 +35,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
     public static final ParseField REGEX = new ParseField("regex");
     public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
     public static final ParseField EXAMPLES = new ParseField("examples");
+    public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
 
     // Used for QueryPage
     public static final ParseField RESULTS_FIELD = new ParseField("categories");
@@ -51,6 +53,7 @@ private static ConstructingObjectParser<CategoryDefinition, Void> createParser(b
         parser.declareString(CategoryDefinition::setRegex, REGEX);
         parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
         parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
+        parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
 
         return parser;
     }
@@ -61,6 +64,7 @@ private static ConstructingObjectParser<CategoryDefinition, Void> createParser(b
     private String regex = "";
     private long maxMatchingLength = 0L;
     private final Set<String> examples;
+    private String grokPattern;
 
     public CategoryDefinition(String jobId) {
         this.jobId = jobId;
@@ -74,6 +78,9 @@ public CategoryDefinition(StreamInput in) throws IOException {
         regex = in.readString();
         maxMatchingLength = in.readLong();
         examples = new TreeSet<>(in.readList(StreamInput::readString));
+        if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
+            grokPattern = in.readOptionalString();
+        }
     }
 
     @Override
@@ -84,6 +91,9 @@ public void writeTo(StreamOutput out) throws IOException {
         out.writeString(regex);
         out.writeLong(maxMatchingLength);
         out.writeStringList(new ArrayList<>(examples));
+        if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
+            out.writeOptionalString(grokPattern);
+        }
     }
 
     public String getJobId() {
@@ -139,6 +149,14 @@ public void addExample(String example) {
         examples.add(example);
     }
 
+    /**
+     * @return the Grok pattern currently held by this category definition, or
+     *         <code>null</code> if none has been set
+     */
+    public String getGrokPattern() {
+        return grokPattern;
+    }
+
+    /**
+     * @param grokPattern the Grok pattern to hold for this category definition
+     */
+    public void setGrokPattern(String grokPattern) {
+        this.grokPattern = grokPattern;
+    }
+
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
         builder.startObject();
@@ -148,6 +166,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         builder.field(REGEX.getPreferredName(), regex);
         builder.field(MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength);
         builder.field(EXAMPLES.getPreferredName(), examples);
+        if (grokPattern != null) {
+            builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
+        }
         builder.endObject();
         return builder;
     }
@@ -166,11 +187,12 @@ public boolean equals(Object other) {
                 && Objects.equals(this.terms, that.terms)
                 && Objects.equals(this.regex, that.regex)
                 && Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
-                && Objects.equals(this.examples, that.examples);
+                && Objects.equals(this.examples, that.examples)
+                && Objects.equals(this.grokPattern, that.grokPattern);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples);
+        return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
     }
 }
@@ -46,6 +46,7 @@ dependencies {
     testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')
 
     // ml deps
+    compile project(':libs:grok')
     compile 'net.sf.supercsv:super-csv:2.4.0'
     nativeBundle "org.elasticsearch.ml:ml-cpp:${project.version}@zip"
     testCompile 'org.ini4j:ini4j:0.5.2'
@@ -85,7 +86,7 @@ task internalClusterTest(type: RandomizedTestingTask,
   include '**/*IT.class'
   systemProperty 'es.set.netty.runtime.available.processors', 'false'
 }
-check.dependsOn internalClusterTest 
+check.dependsOn internalClusterTest
 internalClusterTest.mustRunAfter test
 
 // also add an "alias" task to make typing on the command line easier
 
@@ -41,7 +41,7 @@ protected void doExecute(GetCategoriesAction.Request request, ActionListener<Get
 
         Integer from = request.getPageParams() != null ? request.getPageParams().getFrom() : null;
         Integer size = request.getPageParams() != null ? request.getPageParams().getSize() : null;
-        jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), from, size,
+        jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), true, from, size,
                 r -> listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client);
     }
 }
@@ -0,0 +1,244 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.job.categorization;
+
+import org.elasticsearch.grok.Grok;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * Creates Grok patterns that will match all the examples in a given category_definition.
+ *
+ * The choice of field names is quite primitive.  The intention is that a human will edit these.
+ */
+public final class GrokPatternCreator {
+
+    /** Name of the Grok capture that receives the text before a candidate pattern's match. */
+    private static final String PREFACE = "preface";
+    /** Name of the Grok capture that receives the text after a candidate pattern's match. */
+    private static final String EPILOGUE = "epilogue";
+
+    /**
+     * The candidate Grok patterns, in the order in which they are tried.
+     *
+     * The first match in this list will be chosen, so it needs to be ordered
+     * such that more generic patterns come after more specific patterns.
+     */
+    private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
+            new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
+            new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
+            new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
+            new GrokPatternCandidate("DATESTAMP_OTHER", "timestamp"),
+            new GrokPatternCandidate("DATESTAMP_EVENTLOG", "timestamp"),
+            new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
+            new GrokPatternCandidate("HTTPDATE", "timestamp"),
+            new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
+            new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
+            new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
+            new GrokPatternCandidate("DATE", "date"),
+            new GrokPatternCandidate("TIME", "time"),
+            new GrokPatternCandidate("LOGLEVEL", "loglevel"),
+            new GrokPatternCandidate("URI", "uri"),
+            new GrokPatternCandidate("UUID", "uuid"),
+            new GrokPatternCandidate("MAC", "macaddress"),
+            // Can't use \b as the breaks, because slashes are not "word" characters
+            new GrokPatternCandidate("PATH", "path", "(?<!\\w)", "(?!\\w)"),
+            new GrokPatternCandidate("EMAILADDRESS", "email"),
+            // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
+            new GrokPatternCandidate("IP", "ipaddress"),
+            // This already includes pre/post break conditions
+            new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
+            // Can't use \b as the break before, because it doesn't work for negative numbers (the
+            // minus sign is not a "word" character)
+            new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
+            // Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
+            // numbers that NUMBER rejected due to preceding characters
+            new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
+            // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
+            // Fixing these problems with overly broad matches would require some extra intelligence
+            // to be added to remove inappropriate matches.  One idea would be to use a dictionary,
+            // but that doesn't necessarily help as "jay" could be a username but is also a dictionary
+            // word (plus there's the international headache with relying on dictionaries).  Similarly,
+            // hostnames could also be dictionary words - I've worked on machines called "hippo" and
+            // "scarf" in the past.  Another idea would be to look at the adjacent characters and
+            // apply some heuristic based on those.
+    );
+
+    // This class only exposes static methods; the private constructor prevents instantiation
+    private GrokPatternCreator() {
+    }
+
+    /**
+     * Given a category definition regex and a collection of examples from the category, return
+     * a grok pattern that will match the category and pull out any likely fields.  The extracted
+     * fields are given pretty generic names, but unique within the grok pattern provided.  The
+     * expectation is that a user will adjust the extracted field names based on their domain
+     * knowledge.
+     *
+     * @param regex    The category definition regex: fixed text separated by wildcards of the
+     *                 form dot-star or dot-plus, each optionally followed by a question mark.
+     * @param examples Example strings from the category.  Each is expected to match the regex.
+     * @return The grok pattern.  If the regex consists solely of wildcards, or if no example
+     *         could be used to refine it, the regex itself is returned unchanged.
+     */
+    public static String findBestGrokMatchFromExamples(String regex, Collection<String> examples) {
+
+        // The first string in this array will end up being the empty string, and it doesn't correspond
+        // to an "in between" bit.  Although it could be removed for "neatness", it actually makes the
+        // loops below slightly neater if it's left in.
+        //
+        // E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
+        String[] fixedRegexBits = regex.split("\\.[*+]\\??");
+
+        // A regex that consists solely of wildcards (e.g. ".*") splits into an empty array, because
+        // split() discards trailing empty strings.  There is then no fixed text to build a grok
+        // pattern around, and indexing into the array below would fail, so return the regex as is.
+        if (fixedRegexBits.length == 0) {
+            return regex;
+        }
+
+        // Create a pattern that will capture the bits in between the fixed parts of the regex
+        //
+        // E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
+        Pattern exampleProcessor = Pattern.compile(regex.replaceAll("(\\.[*+]\\??)", "($1)"), Pattern.DOTALL);
+
+        List<Collection<String>> inBetweenBits = new ArrayList<>(fixedRegexBits.length);
+        for (String example : examples) {
+            Matcher matcher = exampleProcessor.matcher(example);
+            if (matcher.matches()) {
+                assert matcher.groupCount() == fixedRegexBits.length;
+                // E.g., if the input regex was ".*?cat.+?sat.+?mat.*" then the example
+                // "the cat sat on the mat" will result in "the ", " ", " on the ", and ""
+                // being added to the 4 "in between" collections in that order
+                for (int groupNum = 1; groupNum <= matcher.groupCount(); ++groupNum) {
+                    if (inBetweenBits.size() < groupNum) {
+                        inBetweenBits.add(new ArrayList<>(examples.size()));
+                    }
+                    inBetweenBits.get(groupNum - 1).add(matcher.group(groupNum));
+                }
+            } else {
+                // We should never get here.  If we do it implies a bug in the original categorization,
+                // as it's produced a regex that doesn't match the examples.
+                assert false : exampleProcessor.pattern() + " did not match " + example;
+            }
+        }
+
+        // If no "in between" bits were collected (no examples were supplied, or none of them
+        // matched) there is nothing to refine the wildcards with, so fall back to the regex
+        // itself rather than returning an empty pattern
+        if (inBetweenBits.isEmpty()) {
+            return regex;
+        }
+
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        StringBuilder overallGrokPatternBuilder = new StringBuilder();
+        // Finally, for each collection of "in between" bits we look for the best Grok pattern and incorporate
+        // it into the overall Grok pattern that will match each example in its entirety
+        for (int inBetweenBitNum = 0; inBetweenBitNum < inBetweenBits.size(); ++inBetweenBitNum) {
+            // Remember (from the first comment in this method) that the first element in this array is
+            // always the empty string
+            overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
+            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
+                    inBetweenBitNum == fixedRegexBits.length - 1, inBetweenBits.get(inBetweenBitNum));
+        }
+        return overallGrokPatternBuilder.toString();
+    }
+
+    /**
+     * Given a collection of strings, work out which (if any) of the grok patterns we're allowed
+     * to use matches it best.  Then append the appropriate grok language to represent that finding
+     * onto the supplied string builder.
+     *
+     * When a candidate pattern matches every string, this method recurses on the text before and
+     * after the match, so several fields may be appended by a single call.  When no candidate
+     * matches (or no strings are supplied) a plain wildcard is appended instead.
+     *
+     * @param fieldNameCountStore       Tracks how many times each field name has been used, so that
+     *                                  repeated names can be made unique.  Updated by this method.
+     * @param overallGrokPatternBuilder The builder that the grok language is appended to.
+     * @param isFirst                   Is this the first section of the overall pattern?
+     * @param isLast                    Is this the last section of the overall pattern?
+     * @param mustMatchStrings          The strings that the appended grok language must all match.
+     */
+    static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
+                                              boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {
+
+        GrokPatternCandidate bestCandidate = null;
+        // allMatch() is vacuously true for an empty collection, so without this check the first
+        // candidate would be "chosen" and the recursion below, on equally empty prefaces and
+        // epilogues, would never terminate
+        if (mustMatchStrings.isEmpty() == false) {
+            for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) {
+                if (mustMatchStrings.stream().allMatch(candidate.grok::match)) {
+                    bestCandidate = candidate;
+                    break;
+                }
+            }
+        }
+
+        if (bestCandidate == null) {
+            // No pattern fits, so fall back to a wildcard: greedy at the very end, otherwise
+            // reluctant, and only requiring at least one character when it's known that every
+            // string is non-empty
+            if (isLast) {
+                overallGrokPatternBuilder.append(".*");
+            } else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
+                overallGrokPatternBuilder.append(".*?");
+            } else {
+                overallGrokPatternBuilder.append(".+?");
+            }
+        } else {
+            Collection<String> prefaces = new ArrayList<>();
+            Collection<String> epilogues = new ArrayList<>();
+            populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
+            // Whatever precedes the match can never be the last section of the overall pattern...
+            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
+            overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
+                    .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
+            // ...and whatever follows the match can never be the first
+            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
+        }
+    }
+
+    /**
+     * Splits each supplied string around the part of it that the given grok pattern matches.
+     * For every string, the text captured before the match is added to <code>prefaces</code>
+     * and the text captured after it is added to <code>epilogues</code>; an empty string is
+     * added when the corresponding capture is absent.
+     *
+     * @param matchingStrings Strings that the grok pattern is expected to match.
+     * @param grok            The grok pattern used to obtain the captures.
+     * @param prefaces        Receives one entry per string: the text before the match.
+     * @param epilogues       Receives one entry per string: the text after the match.
+     */
+    static void populatePrefacesAndEpilogues(Collection<String> matchingStrings, Grok grok, Collection<String> prefaces,
+                                             Collection<String> epilogues) {
+        for (String matchingString : matchingStrings) {
+            Map<String, Object> captured = grok.captures(matchingString);
+            // A null result means the pattern didn't match, but callers are expected to have
+            // checked that it does before getting here
+            assert captured != null;
+            Object textBefore = captured.getOrDefault(PREFACE, "");
+            prefaces.add(textBefore.toString());
+            Object textAfter = captured.getOrDefault(EPILOGUE, "");
+            epilogues.add(textAfter.toString());
+        }
+    }
+
+    /**
+     * Makes a field name unique within a grok pattern by counting how often it has been requested.
+     * The first request for a name returns it unchanged; the second returns it with "2" appended,
+     * the third with "3" appended, and so on.
+     *
+     * @param fieldNameCountStore Per-name usage counts; the count for the name is incremented.
+     * @param fieldName           The base field name.
+     * @return The base name, with the usage count appended if it has been requested before.
+     */
+    static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
+        int timesRequested = fieldNameCountStore.merge(fieldName, 1, Integer::sum);
+        return (timesRequested > 1) ? fieldName + timesRequested : fieldName;
+    }
+
+    /**
+     * A named Grok pattern that may be used in the generated output, together with the field name
+     * to extract its match into and a compiled Grok that locates the match within a longer string.
+     */
+    static class GrokPatternCandidate {
+
+        final String grokPatternName;
+        final String fieldName;
+        final Grok grok;
+
+        /**
+         * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
+         * end with a non "word" character (i.e. letter, number or underscore).  For such patterns use one
+         * of the other constructors.
+         *
+         * In cases where the Grok pattern defined by Logstash already includes conditions on what must
+         * come before and after the match, use one of the other constructors and specify an empty string
+         * for the pre and/or post breaks.
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param fieldName       Name of the field to extract from the match.
+         */
+        GrokPatternCandidate(String grokPatternName, String fieldName) {
+            this(grokPatternName, fieldName, "\\b");
+        }
+
+        /**
+         * As the four argument constructor, with the post break defaulting to \b.
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param fieldName       Name of the field to extract from the match.
+         * @param preBreak        Only consider the match if it's broken from the previous text by this.
+         */
+        GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak) {
+            this(grokPatternName, fieldName, preBreak, "\\b");
+        }
+
+        /**
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param fieldName       Name of the field to extract from the match.
+         * @param preBreak        Only consider the match if it's broken from the previous text by this.
+         * @param postBreak       Only consider the match if it's broken from the following text by this.
+         */
+        GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak, String postBreak) {
+            this.grokPatternName = grokPatternName;
+            this.fieldName = fieldName;
+            // Surround the pattern of interest with captures for whatever comes before and after it
+            String beforeMatch = "%{DATA:" + PREFACE + "}" + preBreak;
+            String theMatch = "%{" + grokPatternName + ":this}";
+            String afterMatch = postBreak + "%{GREEDYDATA:" + EPILOGUE + "}";
+            this.grok = new Grok(Grok.getBuiltinPatterns(), beforeMatch + theMatch + afterMatch);
+        }
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ protected void doExecute(GetCategoriesAction.Request request, ActionListener<Get`
`41`	`41`
`42`	`42`	`Integer from = request.getPageParams() != null ? request.getPageParams().getFrom() : null;`
`43`	`43`	`Integer size = request.getPageParams() != null ? request.getPageParams().getSize() : null;`
`44`		`- jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), from, size,`
	`44`	`+ jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), true, from, size,`
`45`	`45`	`r -> listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client);`
`46`	`46`	`}`
`47`	`47`	`}`