[ML][Inference] fixing pattern compilation + unnecessary string copy (#51483) (#51487)

benwtrent · web-flow · commit 8559ff7cee63 · 2020-01-27T12:12:34.000-05:00
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java
@@ -12,12 +12,17 @@
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 /**
  * A collection of messy feature extractors
  */
 public final class FeatureUtils {
 
+    private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");
+    private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");
+    private static final Pattern TURKISH_I = Pattern.compile("\\u0130");
+
     private FeatureUtils() {}
 
     /**
@@ -56,20 +61,18 @@ public static String cleanAndLowerText(String text) {
         String newText = text.startsWith(" ") ? "" : " ";
 
         // 2. Replace punctuation and whitespace with ' '
-        // NOTE: we capture unicode letters AND marks as Nepalese and other languages
-        newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " ");
-
         // 2.1. Replace spacing modifier characters
-        newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " ");
+        // NOTE: we keep unicode letters AND marks (\p{L} and \p{M}), as Nepalese and other languages use marks within words
+        newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" ");
 
         // 3. Add space at end
         newText += text.endsWith(" ") ? "" : " ";
 
         // 4. Remove multiple spaces (2 or more) with a single space
-        newText = newText.replaceAll("\\p{IsWhite_Space}+", " ");
+        newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" ");
 
         // 5. Replace Turkish İ with I (TODO - check this out better...)
-        newText = newText.replaceAll("\\u0130", "I");
+        newText = TURKISH_I.matcher(newText).replaceAll("I");
 
         return newText.toLowerCase(Locale.ROOT);
     }
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java
@@ -55,13 +55,14 @@ public FeatureValue[] extractFeatures(String text) {
         Map<String, Counter> charNGrams = new TreeMap<>();
 
         int countSum = 0;
-        int end = newText.toString().length() - nGrams;
+        String textWithTerminators = newText.toString();
+        int end = textWithTerminators.length() - nGrams;
         for (int start = 0; start <= end; ++start) {
             StringBuilder charNGram = new StringBuilder();
 
             int index;
             for (index = 0; index < nGrams; ++index) {
-                char currentChar = newText.toString().charAt(start + index);
+                char currentChar = textWithTerminators.charAt(start + index);
                 if (currentChar == ' ') {
                     break;
                 }