Skip to content

Commit 80d1da5

Browse files
committed
[ML] Add support for date_nanos fields in find_file_structure
Now that elastic#61324 is merged, the find_file_structure endpoint can suggest date_nanos fields for timestamps whose format provides greater than millisecond precision.
1 parent b08f121 commit 80d1da5

File tree

9 files changed

+145
-22
lines changed

9 files changed

+145
-22
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java

+4-3
Original file line numberDiff line numberDiff line change
@@ -149,14 +149,15 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
149149
.setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats())
150150
.setNeedClientTimezone(needClientTimeZone)
151151
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
152-
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone))
152+
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
153+
timeField.v2().needNanosecondPrecision()))
153154
.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage, delimiterPattern,
154155
quotePattern, mappings, timeField.v1(), timeField.v2()));
155156

156-
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
157+
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
157158
} else {
158159
structureBuilder.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(),
159-
csvProcessorSettings, mappings, null, null, false));
160+
csvProcessorSettings, mappings, null, null, false, false));
160161
structureBuilder.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage,
161162
delimiterPattern, quotePattern, mappings, null, null));
162163
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ public final class FileStructureUtils {
3636
public static final String MAPPING_PROPERTIES_SETTING = "properties";
3737
public static final Map<String, String> DATE_MAPPING_WITHOUT_FORMAT =
3838
Collections.singletonMap(MAPPING_TYPE_SETTING, "date");
39+
public static final String NANOSECOND_DATE_OUTPUT_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX";
3940
public static final Set<String> CONVERTIBLE_TYPES =
4041
Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));
4142

@@ -397,13 +398,14 @@ static boolean isMoreLikelyTextThanKeyword(String str) {
397398
* @param timestampFormats Timestamp formats to be used for parsing {@code timestampField}.
398399
* May be <code>null</code> if {@code timestampField} is also <code>null</code>.
399400
* @param needClientTimezone Is the timezone of the client supplying data to ingest required to uniquely parse the timestamp?
401+
* @param needNanosecondPrecision Does the timestamp have more than millisecond accuracy?
400402
* @return The ingest pipeline definition, or <code>null</code> if none is required.
401403
*/
402404
public static Map<String, Object> makeIngestPipelineDefinition(String grokPattern, Map<String, String> customGrokPatternDefinitions,
403405
Map<String, Object> csvProcessorSettings,
404406
Map<String, Object> mappingsForConversions,
405407
String timestampField, List<String> timestampFormats,
406-
boolean needClientTimezone) {
408+
boolean needClientTimezone, boolean needNanosecondPrecision) {
407409

408410
if (grokPattern == null && csvProcessorSettings == null && timestampField == null) {
409411
return null;
@@ -437,6 +439,9 @@ public static Map<String, Object> makeIngestPipelineDefinition(String grokPatter
437439
dateProcessorSettings.put("timezone", "{{ " + BEAT_TIMEZONE_FIELD + " }}");
438440
}
439441
dateProcessorSettings.put("formats", timestampFormats);
442+
if (needNanosecondPrecision) {
443+
dateProcessorSettings.put("output_format", NANOSECOND_DATE_OUTPUT_FORMAT);
444+
}
440445
processors.add(Collections.singletonMap("date", dateProcessorSettings));
441446
}
442447

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java

+4-3
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,16 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> expl
6464
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
6565
// Note: no convert processors are added based on mappings for NDJSON input
6666
// because it's reasonable that _source matches the supplied JSON precisely
67-
Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone));
67+
Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
68+
timeField.v2().needNanosecondPrecision()));
6869
}
6970

7071
Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
7172
FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker);
7273

73-
SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
74+
Map<String, Object> mappings = mappingsAndFieldStats.v1();
7475
if (timeField != null) {
75-
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
76+
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
7677
}
7778

7879
if (mappingsAndFieldStats.v2() != null) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
111111
Map<String, String> messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
112112
SortedMap<String, Object> mappings = new TreeMap<>();
113113
mappings.put("message", messageMapping);
114-
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
114+
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timestampFormatFinder.getEsDateMappingTypeWithoutFormat());
115115

116116
SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
117117
fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker));
@@ -151,7 +151,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
151151
.setNeedClientTimezone(needClientTimeZone)
152152
.setGrokPattern(grokPattern)
153153
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, customGrokPatternDefinitions, null, mappings,
154-
interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone))
154+
interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone,
155+
timestampFormatFinder.needNanosecondPrecision()))
155156
.setMappings(mappings)
156157
.setFieldStats(fieldStats)
157158
.setExplanation(explanation)

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java

+61-2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ public final class TimestampFormatFinder {
5353
private static final Logger logger = LogManager.getLogger(TimestampFormatFinder.class);
5454
private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?";
5555
private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,";
56+
private static final Pattern FRACTIONAL_SECOND_INTERPRETER =
57+
Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])");
5658
private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?';
5759
// The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER
5860
// above, but they're literals in this regex to aid readability
@@ -702,6 +704,20 @@ public List<String> getJavaTimestampFormats() {
702704
(matchedFormats.size() > 1) ? matchedFormats.get(0) : null);
703705
}
704706

707+
/**
708+
* This is needed to decide between "date" and "date_nanos" as the index mapping type.
709+
* @return Do the observed timestamps require nanosecond precision to store accurately?
710+
*/
711+
public boolean needNanosecondPrecision() {
712+
if (matchedFormats.isEmpty()) {
713+
// If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake
714+
assert errorOnNoTimestamp == false;
715+
return false;
716+
}
717+
return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat))
718+
.anyMatch(match -> match.hasNanosecondPrecision);
719+
}
720+
705721
/**
706722
* Given a list of timestamp formats that might contain indeterminate day/month parts,
707723
* return the corresponding pattern with the placeholders replaced with concrete
@@ -947,6 +963,14 @@ public boolean hasTimezoneDependentParsing() {
947963
.anyMatch(match -> match.hasTimezoneDependentParsing);
948964
}
949965

966+
/**
967+
* The @timestamp field will always have been parsed into epoch format,
968+
* so we just need to know if it has nanosecond resolution or not.
969+
*/
970+
public Map<String, String> getEsDateMappingTypeWithoutFormat() {
971+
return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
972+
}
973+
950974
/**
951975
* Sometimes Elasticsearch mappings for dates need to include the format.
952976
* This method returns appropriate mappings settings: at minimum "type" : "date",
@@ -959,7 +983,7 @@ public Map<String, String> getEsDateMappingTypeWithFormat() {
959983
return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
960984
}
961985
Map<String, String> mapping = new LinkedHashMap<>();
962-
mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
986+
mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
963987
String formats = javaTimestampFormats.stream().map(format -> {
964988
switch (format) {
965989
case "ISO8601":
@@ -1233,6 +1257,7 @@ static final class TimestampMatch {
12331257
final int secondIndeterminateDateNumber;
12341258

12351259
final boolean hasTimezoneDependentParsing;
1260+
final boolean hasNanosecondPrecision;
12361261

12371262
/**
12381263
* Text that came after the timestamp in the matched field/message.
@@ -1250,6 +1275,8 @@ static final class TimestampMatch {
12501275
this.secondIndeterminateDateNumber = indeterminateDateNumbers[1];
12511276
this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0),
12521277
matchedDate);
1278+
this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0),
1279+
matchedDate);
12531280
this.epilogue = Objects.requireNonNull(epilogue);
12541281
}
12551282

@@ -1259,6 +1286,7 @@ static final class TimestampMatch {
12591286
this.firstIndeterminateDateNumber = toCopyExceptFormat.firstIndeterminateDateNumber;
12601287
this.secondIndeterminateDateNumber = toCopyExceptFormat.secondIndeterminateDateNumber;
12611288
this.hasTimezoneDependentParsing = toCopyExceptFormat.hasTimezoneDependentParsing;
1289+
this.hasNanosecondPrecision = toCopyExceptFormat.hasNanosecondPrecision;
12621290
this.epilogue = toCopyExceptFormat.epilogue;
12631291
}
12641292

@@ -1285,6 +1313,38 @@ static boolean requiresTimezoneDependentParsing(String format, String matchedDat
12851313
}
12861314
}
12871315

1316+
static boolean matchHasNanosecondPrecision(String format, String matchedDate) {
1317+
switch (format) {
1318+
case "ISO8601":
1319+
Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(matchedDate);
1320+
return matcher.find() && matcher.group(2).length() > 3;
1321+
case "UNIX_MS":
1322+
case "UNIX":
1323+
return false;
1324+
case "TAI64N":
1325+
return true;
1326+
default:
1327+
boolean notQuoted = true;
1328+
int consecutiveSs = 0;
1329+
for (int pos = 0; pos < format.length(); ++pos) {
1330+
char curChar = format.charAt(pos);
1331+
if (curChar == '\'') {
1332+
notQuoted = !notQuoted;
1333+
consecutiveSs = 0;
1334+
} else if (notQuoted) {
1335+
if (curChar == 'S') {
1336+
if (++consecutiveSs > 3) {
1337+
return true;
1338+
}
1339+
} else {
1340+
consecutiveSs = 0;
1341+
}
1342+
}
1343+
}
1344+
return false;
1345+
}
1346+
}
1347+
12881348
static int[] parseIndeterminateDateNumbers(String matchedDate, List<String> rawJavaTimestampFormats) {
12891349
int[] indeterminateDateNumbers = { -1, -1 };
12901350

@@ -1368,7 +1428,6 @@ public String toString() {
13681428
*/
13691429
static final class CandidateTimestampFormat {
13701430

1371-
private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})$");
13721431
// This means that in the case of a literal Z, XXX is preferred
13731432
private static final Pattern TRAILING_OFFSET_WITHOUT_COLON_FINDER = Pattern.compile("[+-]\\d{4}$");
13741433

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
104104
.setNeedClientTimezone(needClientTimeZone)
105105
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
106106
Collections.emptyMap(), topLevelTag + "." + timeField.v1(), timeField.v2().getJavaTimestampFormats(),
107-
needClientTimeZone));
107+
needClientTimeZone, timeField.v2().needNanosecondPrecision()));
108108
}
109109

110110
Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
@@ -114,14 +114,14 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
114114
structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
115115
}
116116

117-
SortedMap<String, Object> innerMappings = mappingsAndFieldStats.v1();
117+
Map<String, Object> innerMappings = mappingsAndFieldStats.v1();
118118
Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
119119
secondLevelProperties.put(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
120120
secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
121121
SortedMap<String, Object> outerMappings = new TreeMap<>();
122122
outerMappings.put(topLevelTag, secondLevelProperties);
123123
if (timeField != null) {
124-
outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
124+
outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
125125
}
126126

127127
FileStructure structure = structureBuilder

0 commit comments

Comments
 (0)