|
10 | 10 | import org.supercsv.prefs.CsvPreference;
|
11 | 11 |
|
12 | 12 | import java.io.IOException;
|
| 13 | +import java.util.ArrayList; |
13 | 14 | import java.util.Arrays;
|
| 15 | +import java.util.BitSet; |
14 | 16 | import java.util.Collections;
|
| 17 | +import java.util.List; |
15 | 18 |
|
16 | 19 | import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows;
|
17 | 20 | import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinDistance;
|
18 | 21 | import static org.hamcrest.Matchers.arrayContaining;
|
| 22 | +import static org.hamcrest.Matchers.equalTo; |
19 | 23 |
|
20 | 24 | public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
21 | 25 |
|
@@ -449,15 +453,51 @@ public void testLevenshteinDistance() {
|
449 | 453 | assertEquals(0, levenshteinDistance("", ""));
|
450 | 454 | }
|
451 | 455 |
|
// Exercises DelimitedFileStructureFinder.makeShortFieldMask on four rows of three random alphabetic fields.
// The requested lengths per column are (5, 50, 5, 5), (20, 5, 5, 5) and (5, 5, 5, 80), so the longest
// value in each column is 50, 20 and 80 characters respectively.
| 456 | + public void testMakeShortFieldMask() { |
| 457 | + |
| 458 | + List<List<String>> rows = new ArrayList<>(); |
| 459 | + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(20), randomAlphaOfLength(5))); |
| 460 | + rows.add(Arrays.asList(randomAlphaOfLength(50), randomAlphaOfLength(5), randomAlphaOfLength(5))); |
| 461 | + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(5))); |
| 462 | + rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(80))); |
| 463 | + |
// The second argument steps down through 110, 80, 50 and 20. The expected masks are consistent with a
// column being flagged only while its longest value is strictly shorter than that argument
// (50/20/80 < 110 -> all three; 80 drops column 3; 50 drops column 1; 20 drops column 2).
// NOTE(review): makeShortFieldMask itself is not visible here -- confirm the comparison against it.
// Presumably stringToNumberPosBitSet sets one bit per '1' character position -- TODO confirm.
// NOTE(review): the final literal is a single space while the other masks are three characters wide;
// this may be whitespace collapsed by the diff rendering -- verify against the repository before copying.
| 464 | + BitSet shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 110); |
| 465 | + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("111"))); |
| 466 | + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 80); |
| 467 | + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("11 "))); |
| 468 | + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 50); |
| 469 | + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" 1 "))); |
| 470 | + shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 20); |
| 471 | + assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" "))); |
| 472 | + } |
| 473 | + |
// Checks the two-argument levenshteinFieldwiseCompareRows overload on small fixed rows.
// In this diff the lines marked '-' are the expectations being removed and the lines marked '+' replace
// them; the identical-rows case (expected 0) is unchanged context.
452 | 474 | public void testLevenshteinCompareRows() {
|
453 | 475 |
|
454 | 476 | assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog")));
|
// Removed expectations (0, 3, 3, 5, 4, 7): each equals the sum of the per-field edit distances minus
// the largest single-field distance (e.g. cat/dog vs mouse/cat: 5 + 3 - 5 = 3).
// NOTE(review): that is an arithmetic observation on the values only; the implementation is not visible here.
455
| - assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"))); |
456
| - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"))); |
457
| - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"))); |
458
| - assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"))); |
459
| - assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"))); |
460
| - assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"))); |
// Replacement expectations (3, 6, 8, 10, 9, 12): each equals the plain sum of the edit distances between
// corresponding fields, assuming standard Levenshtein distance (cat->dog = 3, cat->mouse = 5, dog->mouse = 4).
| 477 | + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"))); |
| 478 | + assertEquals(6, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"))); |
| 479 | + assertEquals(8, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"))); |
| 480 | + assertEquals(10, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"))); |
| 481 | + assertEquals(9, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"))); |
| 482 | + assertEquals(12, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"))); |
| 483 | + } |
| 484 | + |
// Checks the three-argument levenshteinFieldwiseCompareRows overload, which additionally takes a BitSet
// built by TimestampFormatFinder.stringToNumberPosBitSet.
// The expected values are consistent with only those fields whose position is marked '1' in the mask
// string contributing their edit distance to the total -- NOTE(review): confirm against the implementation,
// which is not visible here.
// NOTE(review): some mask literals are shorter than the rows they are applied to (e.g. " " for two-field
// rows); whitespace may have been collapsed by the diff rendering -- verify against the repository.
| 485 | + public void testLevenshteinCompareRowsWithMask() { |
| 486 | + |
// Identical rows: 0 is expected whichever of the randomly chosen masks is used.
| 487 | + assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"), |
| 488 | + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ", " 1", "11")))); |
// Rows differ only in the second field; neither candidate mask marks that position, and 0 is expected.
| 489 | + assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"), |
| 490 | + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ")))); |
// Swapped fields: either single-position mask selects one cat<->dog pair, and 3 is expected.
| 491 | + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"), |
| 492 | + TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" 1", "1 ")))); |
// Mask marks only the second position (dog vs cat); the cat vs mouse difference in the first field is not counted.
| 493 | + assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"), |
| 494 | + TimestampFormatFinder.stringToNumberPosBitSet(" 1"))); |
// Three-field rows with the mask marking the second and third positions only.
| 495 | + assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"), |
| 496 | + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); |
| 497 | + assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"), |
| 498 | + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); |
| 499 | + assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"), |
| 500 | + TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); |
461 | 501 | }
|
462 | 502 |
|
463 | 503 | public void testLineHasUnescapedQuote() {
|
|
0 commit comments