From 5ff984660c87dd89afaf403d47ac101545dca364 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 1 Aug 2019 16:07:01 +0100 Subject: [PATCH] [ML] Improve CSV header row detection in find_file_structure When doing a fieldwise Levenshtein distance comparison between CSV rows, this change ignores all fields that have long values, not just the longest field. This approach works better for CSV formats that have multiple freeform text fields rather than just a single "message" field. Fixes #45047 --- .../DelimitedFileStructureFinder.java | 64 ++++++++++++++----- .../DelimitedFileStructureFinderTests.java | 52 +++++++++++++-- 2 files changed, 95 insertions(+), 21 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index 0eb4bfd4b90d1..7e5b660c8da74 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -17,6 +17,7 @@ import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; import java.util.DoubleSummaryStatistics; import java.util.HashSet; @@ -27,12 +28,12 @@ import java.util.Random; import java.util.SortedMap; import java.util.stream.Collectors; -import java.util.stream.IntStream; public class DelimitedFileStructureFinder implements FileStructureFinder { private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])"; private static final int MAX_LEVENSHTEIN_COMPARISONS = 100; + private static final int LONG_FIELD_THRESHOLD = 100; private final List sampleMessages; private final FileStructure structure; @@ -322,10 +323,15 @@ private static boolean isFirstRowUnusual(List explanation, 
List (double) levenshteinFieldwiseCompareRows(firstRow, otherRow)) + .mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow, shortFieldMask)) .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine); otherRowStats = new DoubleSummaryStatistics(); @@ -336,7 +342,7 @@ private static boolean isFirstRowUnusual(List explanation, List> rows, int longFieldThreshold) { + + assert rows.isEmpty() == false; + + BitSet shortFieldMask = new BitSet(); + + int maxLength = rows.stream().map(List::size).max(Integer::compareTo).get(); + for (int index = 0; index < maxLength; ++index) { + final int i = index; + shortFieldMask.set(i, + rows.stream().allMatch(row -> i >= row.size() || row.get(i) == null || row.get(i).length() < longFieldThreshold)); + } + + return shortFieldMask; + } + /** * Sum of the Levenshtein distances between corresponding elements - * in the two supplied lists _excluding_ the biggest difference. - * The reason the biggest difference is excluded is that sometimes - * there's a "message" field that is much longer than any of the other - * fields, varies enormously between rows, and skews the comparison. + * in the two supplied lists. */ static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow) { int largestSize = Math.max(firstRow.size(), secondRow.size()); - if (largestSize <= 1) { + if (largestSize < 1) { return 0; } - int[] distances = new int[largestSize]; + BitSet allFields = new BitSet(); + allFields.set(0, largestSize); + + return levenshteinFieldwiseCompareRows(firstRow, secondRow, allFields); + } - for (int index = 0; index < largestSize; ++index) { - distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "", + /** + * Sum of the Levenshtein distances between corresponding elements + * in the two supplied lists where the corresponding bit in the + * supplied bit mask is set. 
+ */ + static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow, BitSet fieldMask) { + + int result = 0; + + for (int index = fieldMask.nextSetBit(0); index >= 0; index = fieldMask.nextSetBit(index + 1)) { + result += levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "", (index < secondRow.size()) ? secondRow.get(index) : ""); } - Arrays.sort(distances); - - return IntStream.of(distances).limit(distances.length - 1).sum(); + return result; } /** diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 4a7c5b87d2186..01a45b67e8784 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -10,12 +10,16 @@ import org.supercsv.prefs.CsvPreference; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; +import java.util.List; import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows; import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinDistance; import static org.hamcrest.Matchers.arrayContaining; +import static org.hamcrest.Matchers.equalTo; public class DelimitedFileStructureFinderTests extends FileStructureTestCase { @@ -449,15 +453,51 @@ public void testLevenshteinDistance() { assertEquals(0, levenshteinDistance("", "")); } + public void testMakeShortFieldMask() { + + List<List<String>> rows = new ArrayList<>(); + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(20), randomAlphaOfLength(5))); + rows.add(Arrays.asList(randomAlphaOfLength(50), 
randomAlphaOfLength(5), randomAlphaOfLength(5))); + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(5))); + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(80))); + + BitSet shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 110); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("111"))); + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 80); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("11 "))); + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 50); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" 1 "))); + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 20); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" "))); + } + public void testLevenshteinCompareRows() { assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"))); - assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"))); - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"))); - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"))); - assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"))); - assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"))); - assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"))); + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"))); + assertEquals(6, levenshteinFieldwiseCompareRows(Arrays.asList("cat", 
"dog"), Arrays.asList("dog", "cat"))); + assertEquals(8, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"))); + assertEquals(10, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"))); + assertEquals(9, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"))); + assertEquals(12, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"))); + } + + public void testLevenshteinCompareRowsWithMask() { + + assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"), + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ", " 1", "11")))); + assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ")))); + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" 1", "1 ")))); + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(" 1"))); + assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); + assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"), + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); + assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"), + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); } public void testLineHasUnescapedQuote() {