diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index 0eb4bfd4b90d1..7e5b660c8da74 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -17,6 +17,7 @@ import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; import java.util.DoubleSummaryStatistics; import java.util.HashSet; @@ -27,12 +28,12 @@ import java.util.Random; import java.util.SortedMap; import java.util.stream.Collectors; -import java.util.stream.IntStream; public class DelimitedFileStructureFinder implements FileStructureFinder { private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])"; private static final int MAX_LEVENSHTEIN_COMPARISONS = 100; + private static final int LONG_FIELD_THRESHOLD = 100; private final List sampleMessages; private final FileStructure structure; @@ -322,10 +323,15 @@ private static boolean isFirstRowUnusual(List explanation, List (double) levenshteinFieldwiseCompareRows(firstRow, otherRow)) + .mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow, shortFieldMask)) .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine); otherRowStats = new DoubleSummaryStatistics(); @@ -336,7 +342,7 @@ private static boolean isFirstRowUnusual(List explanation, List> rows, int longFieldThreshold) { + + assert rows.isEmpty() == false; + + BitSet shortFieldMask = new BitSet(); + + int maxLength = rows.stream().map(List::size).max(Integer::compareTo).get(); + for (int index = 0; index < maxLength; ++index) { + final int i = index; + shortFieldMask.set(i, + rows.stream().allMatch(row -> i >= row.size() || row.get(i) == null || row.get(i).length() < longFieldThreshold)); + } + + return shortFieldMask; + } + /** * Sum of the Levenshtein distances between corresponding elements - * in the two supplied lists _excluding_ the biggest difference. - * The reason the biggest difference is excluded is that sometimes - * there's a "message" field that is much longer than any of the other - * fields, varies enormously between rows, and skews the comparison. + * in the two supplied lists. */ static int levenshteinFieldwiseCompareRows(List firstRow, List secondRow) { int largestSize = Math.max(firstRow.size(), secondRow.size()); - if (largestSize <= 1) { + if (largestSize < 1) { return 0; } - int[] distances = new int[largestSize]; + BitSet allFields = new BitSet(); + allFields.set(0, largestSize); + + return levenshteinFieldwiseCompareRows(firstRow, secondRow, allFields); + } - for (int index = 0; index < largestSize; ++index) { - distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "", + /** + * Sum of the Levenshtein distances between corresponding elements + * in the two supplied lists where the corresponding bit in the + * supplied bit mask is set. + */ + static int levenshteinFieldwiseCompareRows(List firstRow, List secondRow, BitSet fieldMask) { + + int result = 0; + + for (int index = fieldMask.nextSetBit(0); index >= 0; index = fieldMask.nextSetBit(index + 1)) { + result += levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "", (index < secondRow.size()) ? secondRow.get(index) : ""); } - Arrays.sort(distances); - - return IntStream.of(distances).limit(distances.length - 1).sum(); + return result; } /** diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 4a7c5b87d2186..01a45b67e8784 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -10,12 +10,16 @@ import org.supercsv.prefs.CsvPreference; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; +import java.util.List; import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows; import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinDistance; import static org.hamcrest.Matchers.arrayContaining; +import static org.hamcrest.Matchers.equalTo; public class DelimitedFileStructureFinderTests extends FileStructureTestCase { @@ -449,15 +453,51 @@ public void testLevenshteinDistance() { assertEquals(0, levenshteinDistance("", "")); } + public void testMakeShortFieldMask() { + + List> rows = new ArrayList<>(); + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(20), randomAlphaOfLength(5))); + rows.add(Arrays.asList(randomAlphaOfLength(50), randomAlphaOfLength(5), randomAlphaOfLength(5))); + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(5))); + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(80))); + + BitSet shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 110); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("111"))); + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 80); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("11 "))); + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 50); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" 1 "))); + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 20); + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" "))); + } + public void testLevenshteinCompareRows() { assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"))); - assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"))); - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"))); - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"))); - assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"))); - assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"))); - assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"))); + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"))); + assertEquals(6, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"))); + assertEquals(8, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"))); + assertEquals(10, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"))); + assertEquals(9, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"))); + assertEquals(12, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"))); + } + + public void testLevenshteinCompareRowsWithMask() { + + assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"), + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ", " 1", "11")))); + assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ")))); + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" 1", "1 ")))); + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(" 1"))); + assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"), + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); + assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"), + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); + assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"), + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); } public void testLineHasUnescapedQuote() {