|
12 | 12 | import java.nio.charset.CodingErrorAction;
|
13 | 13 | import java.nio.charset.StandardCharsets;
|
14 | 14 | import java.util.Locale;
|
| 15 | +import java.util.regex.Pattern; |
15 | 16 |
|
16 | 17 | /**
|
17 | 18 | * A collection of messy feature extractors
|
18 | 19 | */
|
19 | 20 | public final class FeatureUtils {
|
20 | 21 |
|
| 22 | + private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}"); |
| 23 | + private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+"); |
| 24 | + private static final Pattern TURKISH_I = Pattern.compile("\\u0130"); |
| 25 | + |
21 | 26 | private FeatureUtils() {}
|
22 | 27 |
|
23 | 28 | /**
|
@@ -56,20 +61,18 @@ public static String cleanAndLowerText(String text) {
|
56 | 61 | String newText = text.startsWith(" ") ? "" : " ";
|
57 | 62 |
|
58 | 63 | // 2. Replace punctuation and any other character that is not a letter, mark, or whitespace with ' '
|
59 |
| - // NOTE: we capture unicode letters AND marks as Nepalese and other languages |
60 |
| - newText += text.replaceAll("[^\\p{L}|\\p{M}|\\s]|\\|", " "); |
61 |
| - |
62 | 64 | // 2.1. Replace spacing modifier characters
|
63 |
| - newText = newText.replaceAll("\\p{InSpacing_Modifier_Letters}", " "); |
| 65 | + // NOTE: we keep unicode letters AND marks, as Nepalese and other languages use combining marks within words |
| 66 | + newText += NOT_UNICODE_OR_IS_SPECIAL.matcher(text).replaceAll(" "); |
64 | 67 |
|
65 | 68 | // 3. Add space at end
|
66 | 69 | newText += text.endsWith(" ") ? "" : " ";
|
67 | 70 |
|
68 | 71 | // 4. Replace each run of one or more whitespace characters with a single space
|
69 |
| - newText = newText.replaceAll("\\p{IsWhite_Space}+", " "); |
| 72 | + newText = ONE_OR_MORE_WHITESPACE.matcher(newText).replaceAll(" "); |
70 | 73 |
|
71 | 74 | // 5. Replace Turkish İ with I (TODO - check this out better...)
|
72 |
| - newText = newText.replaceAll("\\u0130", "I"); |
| 75 | + newText = TURKISH_I.matcher(newText).replaceAll("I"); |
73 | 76 |
|
74 | 77 | return newText.toLowerCase(Locale.ROOT);
|
75 | 78 | }
|
|
0 commit comments