@@ -48,7 +48,7 @@ number of short patterns is far more likely.
48
48
Faro and Kulekci published another paper [4b] that is conceptually very similar
49
49
to [4a]. The key difference is that it uses the CRC32 instruction (introduced
50
50
as part of SSE 4.2) to compute fingerprint values. This also enables the
51
- algorithm to work effectively on substrings as short at 7 bytes with 4 byte
51
+ algorithm to work effectively on substrings as short as 7 bytes with 4 byte
52
52
windows. 7 bytes is unfortunately still too long. The window could be
53
53
technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the
54
54
small window size ends up negating most performance benefits—and it's likely
@@ -66,6 +66,7 @@ because it is behind a paywall.
66
66
67
67
Teddy
68
68
-----
69
+
69
70
Finally, we get to Teddy. If the above literature review is complete, then it
70
71
appears that Teddy is a novel algorithm. More than that, in my experience, it
71
72
completely blows away the competition for short substrings, which is exactly
@@ -242,20 +243,21 @@ haystack.
242
243
243
244
Implementation notes
244
245
--------------------
246
+
245
247
The problem with the algorithm as described above is that it uses a single byte
246
- for a fingerprint. This will work will if the fingerprints are rare in the
247
- haystack (e.g., capital letters or special characters in normal English text),
248
- but if the fingerprints are common, you'll wind up spending too much time in
249
- the verification step, which effectively gives the performance benefits of
250
- scanning 16 bytes at a time. Remember, the key to the performance of this
251
- algorithm is to do as little work as possible per 16 bytes.
248
+ for a fingerprint. This will work well if the fingerprints are rare in the haystack (e.g.,
249
+ capital letters or special characters in normal English text), but if the
250
+ fingerprints are common, you'll wind up spending too much time in the
251
+ verification step, which effectively negates the performance benefits of scanning
252
+ 16 bytes at a time. Remember, the key to the performance of this algorithm is to
253
+ do as little work as possible per 16 bytes.
252
254
253
255
This algorithm can be extrapolated in a relatively straight-forward way to use
254
256
larger fingerprints. That is, instead of a single byte prefix, we might use a
255
- three byte prefix. The implementation below implements N = {1, 2, 3} and always
256
- picks the largest N possible. The rationale is that the bigger the fingerprint,
257
- the fewer verification steps we'll do. Of course, if N is too large, then we'll
258
- end up doing too much on each step.
257
+ three byte prefix. The implementation below implements `N = {1, 2, 3}` and
258
+ always picks the largest N possible. The rationale is that the bigger the
259
+ fingerprint, the fewer verification steps we'll do. Of course, if N is too
260
+ large, then we'll end up doing too much on each step.
259
261
260
262
The way to extend it is:
261
263
@@ -344,7 +346,7 @@ pub struct Match {
344
346
pub pat : usize ,
345
347
/// The start byte offset of the match.
346
348
pub start : usize ,
347
- /// The end byte offset of the match. This is always start + pat.len().
349
+ /// The end byte offset of the match. This is always `start + pat.len()`.
348
350
pub end : usize ,
349
351
}
350
352
@@ -363,7 +365,7 @@ pub struct Teddy {
363
365
}
364
366
365
367
/// A list of masks. This has length equal to the length of the fingerprint.
366
- /// The length of the fingerprint is always `max(3, len(smallest substring ))`.
368
+ /// The length of the fingerprint is always `min(3, len(smallest_substring))`.
367
369
#[ derive( Debug , Clone ) ]
368
370
struct Masks ( Vec < Mask > ) ;
369
371
@@ -377,9 +379,9 @@ struct Mask {
377
379
}
378
380
379
381
impl Teddy {
380
- /// Create a new Teddy multi substring matcher.
382
+ /// Create a new `Teddy` multi substring matcher.
381
383
///
382
- /// If a Teddy matcher could not be created (e.g., `pats` is empty or has
384
+ /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has
383
385
/// an empty substring), then `None` is returned.
384
386
pub fn new ( pats : & syntax:: Literals ) -> Option < Teddy > {
385
387
let pats: Vec < _ > = pats. literals ( ) . iter ( ) . map ( |p|p. to_vec ( ) ) . collect ( ) ;
@@ -407,7 +409,7 @@ impl Teddy {
407
409
} )
408
410
}
409
411
410
- /// Returns all of the substrings matched by this Teddy.
412
+ /// Returns all of the substrings matched by this `Teddy`.
411
413
pub fn patterns ( & self ) -> & [ Vec < u8 > ] {
412
414
& self . pats
413
415
}
@@ -422,7 +424,7 @@ impl Teddy {
422
424
self . pats . iter ( ) . fold ( 0 , |a, b| a + b. len ( ) )
423
425
}
424
426
425
- /// Searches `haystack` for the substrings in this Teddy. If a match was
427
+ /// Searches `haystack` for the substrings in this `Teddy`. If a match was
426
428
/// found, then it is returned. Otherwise, `None` is returned.
427
429
pub fn find ( & self , haystack : & [ u8 ] ) -> Option < Match > {
428
430
// If our haystack is smaller than the block size, then fall back to
@@ -441,7 +443,7 @@ impl Teddy {
441
443
}
442
444
}
443
445
444
- /// find1 is used when there is only 1 mask. This is the easy case and is
446
+ /// `find1` is used when there is only 1 mask. This is the easy case and is
445
447
/// pretty much as described in the module documentation.
446
448
#[ inline( always) ]
447
449
fn find1 ( & self , haystack : & [ u8 ] ) -> Option < Match > {
@@ -451,7 +453,7 @@ impl Teddy {
451
453
debug_assert ! ( len >= BLOCK_SIZE ) ;
452
454
while pos <= len - BLOCK_SIZE {
453
455
let h = unsafe { u8x16:: load_unchecked ( haystack, pos) } ;
454
- // N.B. res0 is our `C` in the module documentation.
456
+ // N.B. `res0` is our `C` in the module documentation.
455
457
let res0 = self . masks . members1 ( h) ;
456
458
// Only do expensive verification if there are any non-zero bits.
457
459
if res0. ne ( zero) . any ( ) {
@@ -464,7 +466,7 @@ impl Teddy {
464
466
self . slow ( haystack, pos)
465
467
}
466
468
467
- /// find2 is used when there are 2 masks, e.g., the fingerprint is 2 bytes
469
+ /// `find2` is used when there are 2 masks, i.e., the fingerprint is 2 bytes
468
470
/// long.
469
471
#[ inline( always) ]
470
472
fn find2 ( & self , haystack : & [ u8 ] ) -> Option < Match > {
@@ -478,12 +480,12 @@ impl Teddy {
478
480
) ;
479
481
let zero = u8x16:: splat ( 0 ) ;
480
482
let len = haystack. len ( ) ;
481
- // The previous value of C (from the module documentation) for the
483
+ // The previous value of `C` (from the module documentation) for the
482
484
// *first* byte in the fingerprint. On subsequent iterations, we take
483
- // the last bitset from the previous C and insert it into the first
485
+ // the last bitset from the previous `C` and insert it into the first
484
486
// position of the current C, shifting all other bitsets to the right
485
- // one lane. This causes C for the first byte to line up with C for the
486
- // second byte, so that they can be AND'd together.
487
+ // one lane. This causes `C` for the first byte to line up with `C` for
488
+ // the second byte, so that they can be `AND`ed together.
487
489
let mut prev0 = u8x16:: splat ( 0 ) ;
488
490
let mut pos = 0 ;
489
491
debug_assert ! ( len >= BLOCK_SIZE ) ;
@@ -493,17 +495,19 @@ impl Teddy {
493
495
494
496
// The next three lines are essentially equivalent to
495
497
//
496
- // (prev0 << 15) | (res0 >> 1)
498
+ // ```rust,ignore
499
+ // (prev0 << 15) | (res0 >> 1)
500
+ // ```
497
501
//
498
502
// ... if SIMD vectors could shift across lanes. There is the
499
- // PALIGNR instruction, but apparently LLVM doesn't expose it as
503
+ // `PALIGNR` instruction, but apparently LLVM doesn't expose it as
500
504
// a proper intrinsic. Thankfully, it appears the following
501
- // sequence does indeed compile down to a PALIGNR.
505
+ // sequence does indeed compile down to a `PALIGNR`.
502
506
let prev0byte0 = prev0. extract ( 15 ) ;
503
507
let res0shiftr8 = res0. shuffle_bytes ( res0shuffle) ;
504
508
let res0prev0 = res0shiftr8. replace ( 0 , prev0byte0) ;
505
509
506
- // AND's our C values together.
510
+ // `AND`s our `C` values together.
507
511
let res = res0prev0 & res1;
508
512
prev0 = res0;
509
513
if res. ne ( zero) . any ( ) {
@@ -519,12 +523,12 @@ impl Teddy {
519
523
self . slow ( haystack, pos - 1 )
520
524
}
521
525
522
- /// find3 is used when there are 3 masks, e.g., the fingerprint is 3 bytes
526
+ /// `find3` is used when there are 3 masks, i.e., the fingerprint is 3 bytes
523
527
/// long.
524
528
///
525
- /// N.B. This is a straight-forward extrapolation of find2. The only
526
- /// difference is that we need to keep track of two previous values of
527
- /// C, since we now need to align for three bytes.
529
+ /// N.B. This is a straight-forward extrapolation of `find2`. The only
530
+ /// difference is that we need to keep track of two previous values of `C`,
531
+ /// since we now need to align for three bytes.
528
532
#[ inline( always) ]
529
533
fn find3 ( & self , haystack : & [ u8 ] ) -> Option < Match > {
530
534
let zero = u8x16:: splat ( 0 ) ;
@@ -606,7 +610,7 @@ impl Teddy {
606
610
///
607
611
/// If a match exists, it returns the first one.
608
612
///
609
- /// offset is an additional byte offset to add to the position before
613
+ /// `offset` is an additional byte offset to add to the position before
610
614
/// substring match verification.
611
615
#[ inline( always) ]
612
616
fn verify_64 (
@@ -708,17 +712,17 @@ impl Masks {
708
712
}
709
713
710
714
/// Adds the given pattern to the given bucket. The bucket should be a
711
- /// power of 2 <= 2^7.
715
+ /// power of 2 that is at most `2^7`.
712
716
fn add ( & mut self , bucket : u8 , pat : & [ u8 ] ) {
713
717
for ( i, mask) in self . 0 . iter_mut ( ) . enumerate ( ) {
714
718
mask. add ( bucket, pat[ i] ) ;
715
719
}
716
720
}
717
721
718
722
/// Finds the fingerprints that are in the given haystack block. i.e., this
719
- /// returns C as described in the module documentation.
723
+ /// returns `C` as described in the module documentation.
720
724
///
721
- /// More specifically, for i in 0..16 and j in 0..8, C[i][j] == 1 if and
725
+ /// More specifically, for `i` in `0..16` and `j` in `0..8`, `C[i][j] == 1` if and
722
726
/// only if `haystack_block[i]` corresponds to a fingerprint that is part
723
727
/// of a pattern in bucket `j`.
724
728
#[ inline( always) ]
@@ -745,8 +749,8 @@ impl Masks {
745
749
( res0, res1)
746
750
}
747
751
748
- /// Like members1, but computes C for the first, second and third bytes in
749
- /// the fingerprint.
752
+ /// Like `members1`, but computes `C` for the first, second and third bytes
753
+ /// in the fingerprint.
750
754
#[ inline( always) ]
751
755
fn members3 ( & self , haystack_block : u8x16 ) -> ( u8x16 , u8x16 , u8x16 ) {
752
756
let masklo = u8x16:: splat ( 0xF ) ;
0 commit comments