@@ -211,31 +211,30 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
211
211
if ( findOptimizations . IsUseful &&
212
212
findOptimizations . LeadingAnchor is not RegexNodeKind . Beginning )
213
213
{
214
- // this makes some assumptions about the frequency of occurrences
215
- // some large sets like \p{Sm} are faster with infrequent matches but slower with frequent matches
216
- // the easiest thing to do here is to leave it as-is, but this means some inputs can have large performance losses of 10x or more
217
-
218
- var setIsTooCommon = new Func < RegexFindOptimizations . FixedDistanceSet , bool > ( ( fds ) =>
219
- {
220
- return fds switch
221
- {
222
- { Chars : not null } =>
223
- // anything above 4 uint16 chars is generally slower than DFA
224
- fds . Negated ||
225
- ( fds . Chars . Length > 4 &&
226
- Array . Exists ( fds . Chars , char . IsAsciiLetterLower ) ) ,
227
- { Range : not null } => false ,
228
- // for fixed length strings just trust the optimizations
229
- _ => _optimizedReversalState . Kind != MatchReversalKind . FixedLength ,
230
- } ;
231
- } ) ;
232
-
233
214
// In some cases where the findOptimizations are useful, just using the DFA can still be faster.
234
215
_findOpts = findOptimizations switch
235
216
{
236
- { FindMode : FindNextStartingPositionMode . FixedDistanceSets_LeftToRight } when findOptimizations . FixedDistanceSets ! . TrueForAll ( setIsTooCommon . Invoke ) => null ,
237
- { FindMode : FindNextStartingPositionMode . LeadingSet_LeftToRight } when setIsTooCommon ( findOptimizations . FixedDistanceSets ! [ 0 ] ) => null ,
238
- _ => findOptimizations
217
+ // for sets in fixed length patterns just trust the optimizations,
218
+ // the performance can be either better or worse depending on frequency
219
+ {
220
+ FindMode :
221
+ FindNextStartingPositionMode . FixedDistanceSets_LeftToRight or
222
+ FindNextStartingPositionMode . LeadingSet_LeftToRight } when
223
+ _optimizedReversalState . Kind != MatchReversalKind . FixedLength => findOptimizations ,
224
+ // string literals are the best case
225
+ {
226
+ FindMode :
227
+ FindNextStartingPositionMode . LeadingString_LeftToRight or
228
+ FindNextStartingPositionMode . FixedDistanceString_LeftToRight or
229
+ FindNextStartingPositionMode . LeadingString_OrdinalIgnoreCase_LeftToRight
230
+ } => findOptimizations ,
231
+ // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null
232
+ { FindMode : FindNextStartingPositionMode . LeadingStrings_LeftToRight } => findOptimizations ,
233
+ { FindMode : FindNextStartingPositionMode . LeadingStrings_OrdinalIgnoreCase_LeftToRight } => findOptimizations ,
234
+ // for singular character sets it depends if there's any reasonably small set to be accelerated
235
+ { FindMode : FindNextStartingPositionMode . FixedDistanceSets_LeftToRight } when findOptimizations . FixedDistanceSets ! . TrueForAll ( CharSetIsTooCommon ) => null ,
236
+ { FindMode : FindNextStartingPositionMode . LeadingSet_LeftToRight } when CharSetIsTooCommon ( findOptimizations . FixedDistanceSets ! [ 0 ] ) => null ,
237
+ _ => null
239
238
} ;
240
239
}
241
240
@@ -291,6 +290,36 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
291
290
_reverseInitialStates = reverseInitialStates ;
292
291
293
292
293
+ // TODO: this is still work in progress
294
+ // Heuristic: returns true when a set is likely too common to be worth accelerating. The frequency
295
+ // of occurrences makes a big difference: anything above 4 uint16 chars is generally slower than DFA, but
296
+ // if the characters are very rare, then SearchValues can be up to ~2x faster
297
+ // SearchValues<char> implementations to avoid:
298
+ // - ProbabilisticCharSearchValues
299
+ // - ProbabilisticWithAsciiCharSearchValues`1
300
+ // - AsciiCharSearchValues`1
301
+ // - Any5SearchValues`2
302
+ // SearchValues<string> implementations to avoid:
303
+ // - StringSearchValuesAhoCorasick`2
304
+ bool CharSetIsTooCommon ( RegexFindOptimizations . FixedDistanceSet fixedDistanceSet )
305
+ {
306
+ return fixedDistanceSet switch
307
+ {
308
+ // explicit char list: too common when negated, or when it has more than 4 chars and any of them is an ASCII lowercase letter
309
+ { Chars : not null } =>
310
+ // negated sets are usually large
311
+ fixedDistanceSet . Negated ||
312
+ ( fixedDistanceSet . Chars . Length > 4
313
+ // TODO: this extra condition is currently kept so there are no regressions;
314
+ // if a ~500 MB/s worst case is acceptable then this could be removed,
315
+ // but being able to guess which character sets are not too frequent can
316
+ // often reach over 1 GB/s with AVX
317
+ && Array . Exists ( fixedDistanceSet . Chars , char . IsAsciiLetterLower ) ) ,
318
+ { Range : not null } => false , // sets described by a range are never reported as too common
319
+ _ => false , // sets with neither Chars nor Range are never reported as too common
320
+ } ;
321
+ }
322
+
294
323
// Maps a minterm ID to a character kind
295
324
uint CalculateMintermIdKind ( int mintermId )
296
325
{
@@ -561,9 +590,9 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
561
590
done =
562
591
FindEndPositionDeltasDFAOptimized < TOptimizedInputReader ,
563
592
TAcceleratedStateHandler ,
564
- TOptimizedNullabilityHandler > ( input , innerLoopLength - 1 , mode , ref pos ,
593
+ TOptimizedNullabilityHandler > ( input , innerLoopLength - 1 , mode , timeoutOccursAt , ref pos ,
565
594
currentState . DfaStateId , ref endPos , ref initialStatePosCandidate ,
566
- ref initialStatePosCandidate , timeoutOccursAt ) ;
595
+ ref initialStatePosCandidate ) ;
567
596
}
568
597
else
569
598
{
@@ -648,8 +677,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
648
677
: input . Length ;
649
678
done =
650
679
FindEndPositionDeltasDFA < DfaStateHandler , TInputReader , TFindOptimizationsHandler ,
651
- TNullabilityHandler > ( input , innerLoopLength , mode , ref pos , ref currentState , ref endPos ,
652
- ref endStateId , ref initialStatePosCandidate , timeoutOccursAt ) ;
680
+ TNullabilityHandler > ( input , innerLoopLength , mode , timeoutOccursAt , ref pos , ref currentState , ref endPos ,
681
+ ref endStateId , ref initialStatePosCandidate ) ;
653
682
}
654
683
else
655
684
{
@@ -699,11 +728,14 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
699
728
700
729
701
730
/// <summary>
702
- /// tbd
731
+ /// This version of <see cref="FindEndPositionDeltasDFA"/> uses a different set of interfaces,
732
+ /// which don't check for many inner loop edge cases e.g. input end or '\n'.
733
+ /// All edge cases are handled before entering the loop.
703
734
/// </summary>
704
735
private bool FindEndPositionDeltasDFAOptimized < TOptimizedInputReader , TAcceleratedStateHandler ,
705
736
TOptimizedNullabilityHandler > ( ReadOnlySpan < char > input , int lengthMinus1 , RegexRunnerMode mode ,
706
- ref int posRef , int startStateId , ref int endPosRef , ref int initialStatePosRef , ref int initialStatePosCandidateRef , long timeoutOccursAt )
737
+ long timeoutOccursAt , ref int posRef , int startStateId , ref int endPosRef , ref int initialStatePosRef ,
738
+ ref int initialStatePosCandidateRef )
707
739
where TOptimizedInputReader : struct , IOptimizedInputReader
708
740
where TAcceleratedStateHandler : struct , IAcceleratedStateHandler
709
741
where TOptimizedNullabilityHandler : struct , IOptimizedNullabilityHandler
@@ -742,8 +774,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
742
774
if ( TAcceleratedStateHandler . TryFindNextStartingPosition < TOptimizedInputReader > (
743
775
this , mtlookup , input , ref currStateId , ref pos , initialStateId ) )
744
776
{
745
- // future work could combine this with an immediate state transition
746
- // but this requires changing too much for now
777
+ // a good potential future optimization here would
778
+ // be to combine this with an immediate state transition
747
779
if ( pos == input . Length )
748
780
{
749
781
// patterns such as ^$ can be nullable right away
@@ -769,7 +801,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
769
801
}
770
802
771
803
// If there is more input available try to transition with the next character.
772
- // Note: the order here is important so the transition gets taken
804
+ // Note: the order here is important so the transition itself gets taken
773
805
if ( ! DfaStateHandler . TryTakeDFATransition (
774
806
this , ref currStateId , TOptimizedInputReader . GetPositionId ( mtlookup , maxChar , input , pos ) ,
775
807
timeoutOccursAt )
@@ -824,8 +856,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
824
856
/// A negative value if iteration completed because we ran out of input or we failed to transition.
825
857
/// </returns>
826
858
private bool FindEndPositionDeltasDFA < TStateHandler , TInputReader , TFindOptimizationsHandler , TNullabilityHandler > ( ReadOnlySpan < char > input , int length , RegexRunnerMode mode ,
827
- ref int posRef , ref CurrentState state , ref int endPosRef , ref int initialStatePosRef , ref int initialStatePosCandidateRef ,
828
- long timeoutOccursAt )
859
+ long timeoutOccursAt , ref int posRef , ref CurrentState state , ref int endPosRef , ref int initialStatePosRef , ref int initialStatePosCandidateRef
860
+ )
829
861
where TStateHandler : struct , IStateHandler
830
862
where TInputReader : struct , IInputReader
831
863
where TFindOptimizationsHandler : struct , IInitialStateHandler
@@ -1666,7 +1698,7 @@ public static void UndoTransition(ref CurrentState state)
1666
1698
/// This input reader attempts to minimize overhead
1667
1699
/// by handling constraints outside of the loop:
1668
1700
/// 1. the position must be already valid for the input.
1669
- /// 2. the pattern must not to contain \Z.
1701
+ /// 2. the pattern must not contain \Z.
1670
1702
/// 3. to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm
1671
1703
/// </summary>
1672
1704
private interface IOptimizedInputReader
@@ -1690,7 +1722,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
1690
1722
}
1691
1723
1692
1724
/// <summary>
1693
- /// This reader is effectively an array lookup for the full 64k utf16 code unit mapping
1725
+ /// This reader is effectively an array lookup covering all utf16 code units
1694
1726
/// </summary>
1695
1727
private readonly struct OptimizedFullInputReader : IOptimizedInputReader
1696
1728
{
@@ -1703,6 +1735,10 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
1703
1735
}
1704
1736
}
1705
1737
1738
+ /// <summary>
1739
+ /// This nullability handler interface can be used in DFAs
1740
+ /// for patterns that do not contain \Z
1741
+ /// </summary>
1706
1742
private interface IOptimizedNullabilityHandler
1707
1743
{
1708
1744
public static abstract bool IsNullable < TOptimizedInputReader > ( SymbolicRegexMatcher < TSet > matcher ,
@@ -1728,7 +1764,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
1728
1764
byte [ ] nullabilityArray , int currStateId , byte [ ] lookup , ReadOnlySpan < char > input , int pos )
1729
1765
where TOptimizedInputReader : struct , IOptimizedInputReader
1730
1766
{
1731
- Debug . Assert ( pos < input . Length , $ "input end should not be handled here { input } , pat: { matcher . _dotstarredInitialStates [ CharKind . General ] . Node } ") ;
1767
+ Debug . Assert ( pos < input . Length , $ "input end should not be handled here") ;
1732
1768
return nullabilityArray [ currStateId ] > 0 && matcher . IsNullableWithContext ( currStateId , TOptimizedInputReader . GetPositionId ( lookup , lookup . Length + 1 , input , pos ) ) ;
1733
1769
}
1734
1770
}
0 commit comments