Skip to content

Commit f00dfb2

Browse files
authored
[ML] adds WKT support in filestructurefinder (#57014) (#57032)
Field mapping detection is done via grok patterns. This commit adds detection of geometries formatted as well-known text (WKT). If every value of a field is a WKT `POINT`, a `geo_point` mapping is preferred. Otherwise, if every value of the field is a WKT geometry, a `geo_shape` mapping is preferred. This does **NOT** detect other geometry formats (geohash, comma-delimited points, etc.). Closes #56967
1 parent 9af3110 commit f00dfb2

File tree

2 files changed

+94
-14
lines changed

2 files changed

+94
-14
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import java.util.Arrays;
1616
import java.util.Collection;
1717
import java.util.Collections;
18+
import java.util.HashMap;
1819
import java.util.LinkedHashMap;
1920
import java.util.List;
2021
import java.util.Map;
@@ -35,11 +36,35 @@ public final class FileStructureUtils {
3536
public static final Set<String> CONVERTIBLE_TYPES =
3637
Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));
3738

39+
/**
 * Grok pattern bank used for geometry detection: the custom well-known text (WKT)
 * patterns defined below together with every built-in Grok pattern.
 * The map is unmodifiable once the static initializer has run.
 */
private static final Map<String, String> EXTENDED_PATTERNS;
static {
    Map<String, String> patterns = new HashMap<>();

    // Building blocks.
    // A coordinate pair: two NUMBER tokens separated by a single space.
    patterns.put("GEO_POINT", "%{NUMBER} %{NUMBER}");
    // A parenthesised, ", "-separated list of at least two coordinate pairs.
    patterns.put("GEO_POINT_GROUP", "\\(%{GEO_POINT}, (?:%{GEO_POINT}, )*%{GEO_POINT}\\)");
    // A parenthesised, ", "-separated list of one or more point groups.
    patterns.put("GEO_POINT_GROUP_GROUP", "\\(%{GEO_POINT_GROUP}(?:, %{GEO_POINT_GROUP})*\\)");

    // Individual WKT geometry types.
    patterns.put("WKT_POINT", "POINT \\(%{GEO_POINT}\\)");
    patterns.put("WKT_LINESTRING", "LINESTRING %{GEO_POINT_GROUP}");
    patterns.put("WKT_MULTIPOINT", "MULTIPOINT %{GEO_POINT_GROUP}");
    patterns.put("WKT_POLYGON", "POLYGON %{GEO_POINT_GROUP_GROUP}");
    patterns.put("WKT_MULTILINESTRING", "MULTILINESTRING %{GEO_POINT_GROUP_GROUP}");
    patterns.put("WKT_MULTIPOLYGON", "MULTIPOLYGON \\(%{GEO_POINT_GROUP_GROUP}(?:, %{GEO_POINT_GROUP_GROUP})*\\)");
    patterns.put("WKT_BBOX", "BBOX \\(%{NUMBER}, %{NUMBER}, %{NUMBER}, %{NUMBER}\\)");

    // Any single (non-collection) geometry from the list above.
    patterns.put(
        "WKT_ANY",
        "(?:%{WKT_POINT}|%{WKT_LINESTRING}|%{WKT_MULTIPOINT}|%{WKT_POLYGON}|%{WKT_MULTILINESTRING}|%{WKT_MULTIPOLYGON}|%{WKT_BBOX})"
    );

    // A collection holds one or more geometries. The trailing group must be repeatable ("*"):
    // without the quantifier only collections of exactly two members would match.
    patterns.put("WKT_GEOMETRYCOLLECTION", "GEOMETRYCOLLECTION \\(%{WKT_ANY}(?:, %{WKT_ANY})*\\)");

    // Added last, so a built-in pattern sharing a name with one of the entries above would replace it.
    patterns.putAll(Grok.getBuiltinPatterns());
    EXTENDED_PATTERNS = Collections.unmodifiableMap(patterns);
}
60+
3861
private static final int NUM_TOP_HITS = 10;
3962
// NUMBER Grok pattern doesn't support scientific notation, so we extend it
4063
private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$",
4164
TimeoutChecker.watchdog);
4265
private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$", TimeoutChecker.watchdog);
66+
// Matches a value that is, in its entirety, a single WKT POINT (the expression is anchored with ^ and $).
private static final Grok GEO_POINT_WKT = new Grok(EXTENDED_PATTERNS, "^%{WKT_POINT}$", TimeoutChecker.watchdog);
67+
// Matches a value that is, in its entirety, either one WKT_ANY geometry or a WKT GEOMETRYCOLLECTION (anchored with ^ and $).
private static final Grok GEO_WKT = new Grok(EXTENDED_PATTERNS, "^(?:%{WKT_ANY}|%{WKT_GEOMETRYCOLLECTION})$", TimeoutChecker.watchdog);
4368
private static final int KEYWORD_MAX_LEN = 256;
4469
private static final int KEYWORD_MAX_SPACES = 5;
4570

@@ -317,6 +342,11 @@ static Map<String, String> guessScalarMapping(List<String> explanation, String f
317342
}
318343
else if (fieldValues.stream().allMatch(IP_GROK::match)) {
319344
return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
345+
// geo_point mapping MUST be checked before geo_shape as geo_shape also contains a matcher for geo_point
346+
} else if (fieldValues.stream().allMatch(GEO_POINT_WKT::match)) {
347+
return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_point");
348+
} else if (fieldValues.stream().allMatch(GEO_WKT::match)) {
349+
return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_shape");
320350
}
321351

322352
if (fieldValues.stream().anyMatch(FileStructureUtils::isMoreLikelyTextThanKeyword)) {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java

Lines changed: 64 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import java.util.SortedMap;
1818

1919
import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES;
20+
import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureUtils.MAPPING_TYPE_SETTING;
2021
import static org.hamcrest.Matchers.contains;
2122
import static org.hamcrest.Matchers.equalTo;
2223
import static org.hamcrest.Matchers.instanceOf;
@@ -238,26 +239,26 @@ public void testGuessMappingGivenNothing() {
238239
}
239240

240241
public void testGuessMappingGivenKeyword() {
241-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
242+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
242243

243244
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
244245
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
245246
}
246247

247248
public void testGuessMappingGivenText() {
248-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
249+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "text");
249250

250251
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
251252
}
252253

253254
public void testGuessMappingGivenIp() {
254-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip");
255+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
255256

256257
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
257258
}
258259

259260
public void testGuessMappingGivenDouble() {
260-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "double");
261+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "double");
261262

262263
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
263264
// 12345678901234567890 is too long for long
@@ -267,39 +268,39 @@ public void testGuessMappingGivenDouble() {
267268
}
268269

269270
public void testGuessMappingGivenLong() {
270-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long");
271+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
271272

272273
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
273274
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
274275
}
275276

276277
public void testGuessMappingGivenDate() {
277278
Map<String, String> expected = new HashMap<>();
278-
expected.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
279+
expected.put(MAPPING_TYPE_SETTING, "date");
279280
expected.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601");
280281

281282
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
282283
}
283284

284285
public void testGuessMappingGivenBoolean() {
285-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "boolean");
286+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean");
286287

287288
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("false", "true")));
288289
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(true, false)));
289290
}
290291

291292
public void testGuessMappingGivenArray() {
292-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long");
293+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
293294

294295
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
295296

296-
expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
297+
expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
297298

298299
assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
299300
}
300301

301302
public void testGuessMappingGivenObject() {
302-
Map<String, String> expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
303+
Map<String, String> expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
303304

304305
assertEquals(expected, guessMapping(explanation, "foo",
305306
Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
@@ -330,12 +331,12 @@ public void testGuessMappingsAndCalculateFieldStats() {
330331

331332
Map<String, Object> mappings = mappingsAndFieldStats.v1();
332333
assertNotNull(mappings);
333-
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
334+
assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
334335
Map<String, String> expectedTimeMapping = new HashMap<>();
335-
expectedTimeMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
336+
expectedTimeMapping.put(MAPPING_TYPE_SETTING, "date");
336337
expectedTimeMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "yyyy-MM-dd HH:mm:ss,SSS");
337338
assertEquals(expectedTimeMapping, mappings.get("time"));
338-
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
339+
assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
339340
assertNull(mappings.get("nothing"));
340341

341342
Map<String, FieldStats> fieldStats = mappingsAndFieldStats.v2();
@@ -446,7 +447,7 @@ public void testMakeIngestPipelineDefinitionGivenDelimitedWithConversion() {
446447
String mappingType = expectConversion ? randomFrom("long", "double", "boolean") : randomFrom("keyword", "text", "date");
447448
String firstTargetField = ((List<String>) csvProcessorSettings.get("target_fields")).get(0);
448449
Map<String, Object> mappingsForConversions =
449-
Collections.singletonMap(firstTargetField, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType));
450+
Collections.singletonMap(firstTargetField, Collections.singletonMap(MAPPING_TYPE_SETTING, mappingType));
450451

451452
Map<String, Object> pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
452453
mappingsForConversions, null, null, false);
@@ -558,6 +559,55 @@ public void testMakeIngestPipelineDefinitionGivenSemiStructured() {
558559
assertEquals(Collections.emptyMap(), pipeline);
559560
}
560561

562+
public void testGuessGeoPoint() {
563+
Map<String, String> mapping = FileStructureUtils.guessScalarMapping(
564+
explanation,
565+
"foo",
566+
Arrays.asList("POINT (-77.03653 38.897676)", "POINT (-50.03653 28.8973)"),
567+
NOOP_TIMEOUT_CHECKER
568+
);
569+
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_point"));
570+
571+
mapping = FileStructureUtils.guessScalarMapping(
572+
explanation,
573+
"foo",
574+
Arrays.asList("POINT (-77.03653 38.897676)", "bar"),
575+
NOOP_TIMEOUT_CHECKER
576+
);
577+
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword"));
578+
}
579+
580+
public void testGuessGeoShape() {
581+
Map<String, String> mapping = FileStructureUtils.guessScalarMapping(
582+
explanation,
583+
"foo",
584+
Arrays.asList(
585+
"POINT (-77.03653 38.897676)",
586+
"LINESTRING (-77.03653 38.897676, -77.009051 38.889939)",
587+
"POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0))",
588+
"POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0), " +
589+
"(100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2))",
590+
"MULTIPOINT (102.0 2.0, 103.0 2.0)",
591+
"MULTILINESTRING ((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0), (100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0)," +
592+
" (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8))",
593+
"MULTIPOLYGON (((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0, 102.0 2.0)), ((100.0 0.0, 101.0 0.0, 101.0 1.0, " +
594+
"100.0 1.0, 100.0 0.0), (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2)))",
595+
"GEOMETRYCOLLECTION (POINT (100.0 0.0), LINESTRING (101.0 0.0, 102.0 1.0))",
596+
"BBOX (100.0, 102.0, 2.0, 0.0)"
597+
),
598+
NOOP_TIMEOUT_CHECKER
599+
);
600+
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_shape"));
601+
602+
mapping = FileStructureUtils.guessScalarMapping(
603+
explanation,
604+
"foo",
605+
Arrays.asList("POINT (-77.03653 38.897676)", "LINESTRING (-77.03653 38.897676, -77.009051 38.889939)", "bar"),
606+
NOOP_TIMEOUT_CHECKER
607+
);
608+
assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword"));
609+
}
610+
561611
private Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
562612
Tuple<Map<String, String>, FieldStats> mappingAndFieldStats = FileStructureUtils.guessMappingAndCalculateFieldStats(explanation,
563613
fieldName, fieldValues, NOOP_TIMEOUT_CHECKER);

0 commit comments

Comments
 (0)