Skip to content

Commit 2a1be7c

Browse files
authored
[ML] Fix detection of syslog-like timestamp in find_file_structure (#47970)
Usually syslog timestamps have two spaces before a single-digit day-of-month. However, in some non-syslog cases where syslog-like timestamps are used, there is only one space. The grok pattern supports this, so the timestamp parser should too. This change makes the find_file_structure endpoint do this. It also fixes another problem in the find_file_structure endpoint that the same test case exposed: the exclude_lines_pattern for delimited files was always created on the assumption that the delimiter was a comma. Now it is based on the actual delimiter.
1 parent 7a0edc2 commit 2a1be7c

File tree

5 files changed

+57
-12
lines changed

5 files changed

+57
-12
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,11 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
139139
String quote = String.valueOf(csvPreference.getQuoteChar());
140140
String twoQuotes = quote + quote;
141141
String optQuote = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + "?";
142+
String delimiterMatcher =
143+
(delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
142144
structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
143145
.map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote)
144-
.collect(Collectors.joining(",")));
146+
.collect(Collectors.joining(delimiterMatcher)));
145147
}
146148

147149
boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing();

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ public final class TimestampFormatFinder {
145145
example -> CandidateTimestampFormat.expandDayAndAdjustFractionalSecondsFromExample(example, "MMM dd HH:mm:ss"),
146146
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
147147
"%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", "SYSLOGTIMESTAMP",
148-
Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 4, 10),
148+
Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 6, 10),
149149
new CandidateTimestampFormat(example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"),
150150
"\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
151151
"\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE",
@@ -154,10 +154,10 @@ public final class TimestampFormatFinder {
154154
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
155155
"%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP",
156156
Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), 0, 3),
157-
new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss"),
157+
new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"),
158158
"\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
159159
"%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP",
160-
Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 0),
160+
Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 1, 0),
161161
new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample,
162162
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DATESTAMP}\\b", "DATESTAMP",
163163
// In DATESTAMP the month may be 1 or 2 digits, but the day must be 2
@@ -1467,7 +1467,7 @@ private static String adjustFractionalSecondsFromEndOfExample(String example, St
14671467
static List<String> expandDayAndAdjustFractionalSecondsFromExample(String example, String formatWithddAndNoFraction) {
14681468

14691469
String formatWithdd = adjustFractionalSecondsFromEndOfExample(example, formatWithddAndNoFraction);
1470-
return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", "  d"));
1470+
return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", "  d"), formatWithdd.replace(" dd", " d"));
14711471
}
14721472

14731473
static List<String> indeterminateDayMonthFormatFromExample(String example) {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
2525

2626
private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false);
27+
private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 3, false);
2728

2829
public void testCreateConfigsGivenCompleteCsv() throws Exception {
2930
String sample = "time,message\n" +
@@ -368,6 +369,47 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
368369
assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getJodaTimestampFormats());
369370
}
370371

372+
public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception {
373+
String sample = "Latitude\tLongitude\tloc\tTimestamp\n" +
374+
"25.78042\t18.441196\t\"25.7804200000,18.4411960000\"\tJun 30 2019 13:21:24\n" +
375+
"25.743484\t18.443047\t\"25.7434840000,18.4430470000\"\tJun 30 2019 06:02:35\n" +
376+
"25.744583\t18.442783\t\"25.7445830000,18.4427830000\"\tJun 30 2019 06:02:35\n" +
377+
"25.754593\t18.431637\t\"25.7545930000,18.4316370000\"\tJul 1 2019 06:02:43\n" +
378+
"25.768574\t18.433483\t\"25.7685740000,18.4334830000\"\tJul 1 2019 06:21:28\n" +
379+
"25.757736\t18.438683\t\"25.7577360000,18.4386830000\"\tJul 1 2019 12:06:08\n" +
380+
"25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" +
381+
"25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" +
382+
"25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n";
383+
assertTrue(tsvFactory.canCreateFromSample(explanation, sample));
384+
385+
String charset = randomFrom(POSSIBLE_CHARSETS);
386+
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
387+
FileStructureFinder structureFinder = tsvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
388+
FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
389+
390+
FileStructure structure = structureFinder.getStructure();
391+
392+
assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
393+
assertEquals(charset, structure.getCharset());
394+
if (hasByteOrderMarker == null) {
395+
assertNull(structure.getHasByteOrderMarker());
396+
} else {
397+
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
398+
}
399+
assertEquals("^\"?Latitude\"?\\t\"?Longitude\"?\\t\"?loc\"?\\t\"?Timestamp\"?",
400+
structure.getExcludeLinesPattern());
401+
assertNull(structure.getMultilineStartPattern());
402+
assertEquals(Character.valueOf('\t'), structure.getDelimiter());
403+
assertEquals(Character.valueOf('"'), structure.getQuote());
404+
assertTrue(structure.getHasHeaderRow());
405+
assertNull(structure.getShouldTrimFields());
406+
assertEquals(Arrays.asList("Latitude", "Longitude", "loc", "Timestamp"), structure.getColumnNames());
407+
assertNull(structure.getGrokPattern());
408+
assertEquals("Timestamp", structure.getTimestampField());
409+
assertEquals(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"),
410+
structure.getJodaTimestampFormats());
411+
}
412+
371413
public void testCreateConfigsGivenDotInFieldName() throws Exception {
372414
String sample = "time.iso8601,message\n" +
373415
"2018-05-17T13:41:23,hello\n" +

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist
194194
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
195195
assertNotNull(match);
196196
assertEquals("time", match.v1());
197-
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss"));
197+
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
198198
assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName());
199199
}
200200

@@ -227,7 +227,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsisten
227227
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
228228
assertNotNull(match);
229229
assertEquals("time2", match.v1());
230-
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss"));
230+
assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"));
231231
assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName());
232232
}
233233

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -661,9 +661,9 @@ public void testFindFormatGivenOnlyKnownTimestampFormat() {
661661
"\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "EEE MMM dd HH:mm:ss yyyy", 1526400896000L);
662662

663663
validateTimestampMatch("May 15 17:14:56.725", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
664-
Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM  d HH:mm:ss.SSS"), 1526400896725L);
664+
Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM  d HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L);
665665
validateTimestampMatch("May 15 17:14:56", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
666-
Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"), 1526400896000L);
666+
Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L);
667667

668668
validateTimestampMatch("15/May/2018:17:14:56 +0100", "HTTPDATE", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
669669
"dd/MMM/yyyy:HH:mm:ss XX", 1526400896000L);
@@ -672,7 +672,7 @@ public void testFindFormatGivenOnlyKnownTimestampFormat() {
672672
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a", 1526400896000L);
673673

674674
validateTimestampMatch("May 15 2018 17:14:56", "CISCOTIMESTAMP", "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
675-
Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss"), 1526400896000L);
675+
Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM  d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L);
676676

677677
validateTimestampMatch("05/15/2018 17:14:56,374", "DATESTAMP",
678678
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss,SSS", 1526400896374L);
@@ -799,15 +799,16 @@ public void testFindFormatGivenRealLogMessages() {
799799

800800
validateFindInFullMessage("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
801801
"opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", "", "SYSLOGTIMESTAMP",
802-
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"));
802+
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
803+
Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss", "MMM d HH:mm:ss"));
803804

804805
validateFindInFullMessage("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
805806
"192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", "559550912540598297\t", "TIMESTAMP_ISO8601",
806807
"\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "ISO8601");
807808

808809
validateFindInFullMessage("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
809810
"'www.elastic.co/A/IN': 95.110.68.206#53", "", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b",
810-
Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"));
811+
Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss", "MMM d HH:mm:ss"));
811812

812813
validateFindInFullMessage("10-28-2016 16:22:47.636 +0200 ERROR Network - " +
813814
"Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", "", "DATESTAMP",

0 commit comments

Comments
 (0)