Skip to content

Commit 8559ff7

Browse files
authored
[ML][Inference] fixing pattern compilation + unnecessary string copy (#51483) (#51487)
1 parent 40bd271 commit 8559ff7

File tree

2 files changed

+12
-8
lines changed

2 files changed

+12
-8
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.java

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,17 @@
1212
import java.nio.charset.CodingErrorAction;
1313
import java.nio.charset.StandardCharsets;
1414
import java.util.Locale;
15+
import java.util.regex.Pattern;
1516

1617
/**
1718
* A collection of messy feature extractors
1819
*/
1920
public final class FeatureUtils {
2021

22+
private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");
23+
private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");
24+
private static final Pattern TURKISH_I = Pattern.compile("\\u0130");
25+
2126
private FeatureUtils() {}
2227

2328
/**
@@ -56,20 +61,18 @@ public static String cleanAndLowerText(String text) {
5661
String newText = text.startsWith(" ") ? "" : " ";
5762

5863
// 2. Replace punctuation and whitespace with ' '
59-
// NOTE: we capture unicode letters AND marks as Nepalese and other languages
60-
newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " ");
61-
6264
// 2.1. Replace spacing modifier characters
63-
newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " ");
65+
// NOTE: we keep unicode letters AND marks, as Nepalese and other languages rely on marks
66+
newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" ");
6467

6568
// 3. Add space at end
6669
newText += text.endsWith(" ") ? "" : " ";
6770

6871
// 4. Replace each run of whitespace (one or more characters) with a single space
69-
newText = newText.replaceAll("\\p{IsWhite_Space}+", " ");
72+
newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" ");
7073

7174
// 5. Replace Turkish İ with I (TODO - check this out better...)
72-
newText = newText.replaceAll("\\u0130", "I");
75+
newText = TURKISH_I.matcher(newText).replaceAll("I");
7376

7477
return newText.toLowerCase(Locale.ROOT);
7578
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/NGramFeatureExtractor.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,14 @@ public FeatureValue[] extractFeatures(String text) {
5555
Map<String, Counter> charNGrams = new TreeMap<>();
5656

5757
int countSum = 0;
58-
int end = newText.toString().length() - nGrams;
58+
String textWithTerminators = newText.toString();
59+
int end = textWithTerminators.length() - nGrams;
5960
for (int start = 0; start <= end; ++start) {
6061
StringBuilder charNGram = new StringBuilder();
6162

6263
int index;
6364
for (index = 0; index < nGrams; ++index) {
64-
char currentChar = newText.toString().charAt(start + index);
65+
char currentChar = textWithTerminators.charAt(start + index);
6566
if (currentChar == ' ') {
6667
break;
6768
}

0 commit comments

Comments
 (0)