Skip to content

Commit 1077145

Browse files
committed
comments and cleanup
1 parent 3cccc4a commit 1077145

File tree

3 files changed

+73
-47
lines changed

3 files changed

+73
-47
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -100,24 +100,13 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
100100

101101
/// <summary>
102102
/// Cached nullability check with encoded bits
103-
/// whereever possible
104103
/// </summary>
105104
[MethodImpl(MethodImplOptions.AggressiveInlining)]
106105
internal bool IsNullableFor(uint nextCharKind)
107106
{
108107
return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
109108
}
110109

111-
/// <summary>
112-
/// Full nullability check for initialization
113-
/// </summary>
114-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
115-
internal bool IsNullableForInit(uint nextCharKind)
116-
{
117-
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
118-
return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind));
119-
}
120-
121110
/// <summary>
122111
/// Builds a <see cref="StateFlags"/> with the relevant flags set.
123112
/// </summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ private static SymbolicRegexInfo Create(
5555
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
5656

5757
public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
58+
5859
public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
5960

6061
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs

Lines changed: 72 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -211,31 +211,30 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
211211
if (findOptimizations.IsUseful &&
212212
findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning)
213213
{
214-
// this makes some assumptions about the frequency of occurrences
215-
// some large sets like \p{Sm} are faster with infrequent matches but slower with frequent matches
216-
// the easiest thing to do here is to leave it as-is, but this means some inputs can have large performance losses of 10x or more
217-
218-
var setIsTooCommon = new Func<RegexFindOptimizations.FixedDistanceSet, bool>((fds) =>
219-
{
220-
return fds switch
221-
{
222-
{ Chars: not null } =>
223-
// anything above 4 uint16 chars is generally slower than DFA
224-
fds.Negated ||
225-
(fds.Chars.Length > 4 &&
226-
Array.Exists(fds.Chars, char.IsAsciiLetterLower)),
227-
{ Range: not null } => false,
228-
// for fixed length strings just trust the optimizations
229-
_ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength,
230-
};
231-
});
232-
233214
// In some cases where the findOptimizations are useful, just using the DFA can still be faster.
234215
_findOpts = findOptimizations switch
235216
{
236-
{ FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke) => null,
237-
{ FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when setIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null,
238-
_ => findOptimizations
217+
// for sets in fixed length patterns just trust the optimizations,
218+
// the performance can be either better or worse depending on frequency
219+
{
220+
FindMode:
221+
FindNextStartingPositionMode.FixedDistanceSets_LeftToRight or
222+
FindNextStartingPositionMode.LeadingSet_LeftToRight} when
223+
_optimizedReversalState.Kind != MatchReversalKind.FixedLength => findOptimizations,
224+
// string literals are the best case
225+
{
226+
FindMode:
227+
FindNextStartingPositionMode.LeadingString_LeftToRight or
228+
FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
229+
FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight
230+
} => findOptimizations,
231+
// note: only the Teddy implementation is faster than DFA here, Aho-Corasick should map to null
232+
{ FindMode: FindNextStartingPositionMode.LeadingStrings_LeftToRight } => findOptimizations,
233+
{ FindMode: FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight } => findOptimizations,
234+
// for singular character sets it depends if there's any reasonably small set to be accelerated
235+
{ FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => null,
236+
{ FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null,
237+
_ => null
239238
};
240239
}
241240

@@ -291,6 +290,36 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
291290
_reverseInitialStates = reverseInitialStates;
292291

293292

293+
// TODO: this is still work in progress
294+
// The frequency of occurrences makes a big difference here,
295+
// anything above 4 uint16 chars is generally slower than DFA, but
296+
// if the characters are very rare, then SearchValues can be up to ~2x faster
297+
// SearchValues<char> implementations to avoid:
298+
// - ProbabilisticCharSearchValues
299+
// - ProbabilisticWithAsciiCharSearchValues`1
300+
// - AsciiCharSearchValues`1
301+
// - Any5SearchValues`2
302+
// SearchValues<string> implementations to avoid:
303+
// - StringSearchValuesAhoCorasick`2
304+
bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet)
305+
{
306+
return fixedDistanceSet switch
307+
{
308+
// anything above 4 uint16 chars is generally slower than DFA
309+
{ Chars: not null } =>
310+
// negated sets are usually large
311+
fixedDistanceSet.Negated ||
312+
(fixedDistanceSet.Chars.Length > 4
313+
// TODO: this extra condition is currently kept so there's no regressions
314+
// if ~500mb/s worst case is acceptable then this could be removed
315+
// but being able to guess which character sets are not too frequent can
316+
// often reach over 1gb/s with AVX
317+
&& Array.Exists(fixedDistanceSet.Chars, char.IsAsciiLetterLower)),
318+
{ Range: not null } => false,
319+
_ => false,
320+
};
321+
}
322+
294323
// Maps a minterm ID to a character kind
295324
uint CalculateMintermIdKind(int mintermId)
296325
{
@@ -561,9 +590,9 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
561590
done =
562591
FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
563592
TAcceleratedStateHandler,
564-
TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
593+
TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, timeoutOccursAt, ref pos,
565594
currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
566-
ref initialStatePosCandidate, timeoutOccursAt);
595+
ref initialStatePosCandidate);
567596
}
568597
else
569598
{
@@ -648,8 +677,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
648677
: input.Length;
649678
done =
650679
FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
651-
TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
652-
ref endStateId, ref initialStatePosCandidate, timeoutOccursAt);
680+
TNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
681+
ref endStateId, ref initialStatePosCandidate);
653682
}
654683
else
655684
{
@@ -699,11 +728,14 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
699728

700729

701730
/// <summary>
702-
/// tbd
731+
/// This version of <see cref="FindEndPositionDeltasDFA"/> uses a different set of interfaces,
732+
/// which don't check for many inner loop edge cases e.g. input end or '\n'.
733+
/// All edge cases are handled before entering the loop.
703734
/// </summary>
704735
private bool FindEndPositionDeltasDFAOptimized<TOptimizedInputReader, TAcceleratedStateHandler,
705736
TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
706-
ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, long timeoutOccursAt)
737+
long timeoutOccursAt, ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef,
738+
ref int initialStatePosCandidateRef)
707739
where TOptimizedInputReader : struct, IOptimizedInputReader
708740
where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
709741
where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
@@ -742,8 +774,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
742774
if (TAcceleratedStateHandler.TryFindNextStartingPosition<TOptimizedInputReader>(
743775
this, mtlookup, input, ref currStateId, ref pos, initialStateId))
744776
{
745-
// future work could combine this with an immediate state transition
746-
// but this requires changing too much for now
777+
// a good potential future optimization here would
778+
// be to combine this with an immediate state transition
747779
if (pos == input.Length)
748780
{
749781
// patterns such as ^$ can be nullable right away
@@ -769,7 +801,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
769801
}
770802

771803
// If there is more input available try to transition with the next character.
772-
// Note: the order here is important so the transition gets taken
804+
// Note: the order here is important so the transition itself gets taken
773805
if (!DfaStateHandler.TryTakeDFATransition(
774806
this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos),
775807
timeoutOccursAt)
@@ -824,8 +856,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
824856
/// A negative value if iteration completed because we ran out of input or we failed to transition.
825857
/// </returns>
826858
private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
827-
ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef,
828-
long timeoutOccursAt)
859+
long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef
860+
)
829861
where TStateHandler : struct, IStateHandler
830862
where TInputReader : struct, IInputReader
831863
where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -1666,7 +1698,7 @@ public static void UndoTransition(ref CurrentState state)
16661698
/// This input reader attempts to minimize overhead
16671699
/// by handling constraints outside of the loop:
16681700
/// 1. the position must be already valid for the input.
1669-
/// 2. the pattern must not to contain \Z.
1701+
/// 2. the pattern must not contain \Z.
16701702
/// 3. to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm
16711703
/// </summary>
16721704
private interface IOptimizedInputReader
@@ -1690,7 +1722,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
16901722
}
16911723

16921724
/// <summary>
1693-
/// This reader is effectively an array lookup for the full 64k utf16 code unit mapping
1725+
/// This reader is effectively an array lookup for all UTF-16 code units
16941726
/// </summary>
16951727
private readonly struct OptimizedFullInputReader : IOptimizedInputReader
16961728
{
@@ -1703,6 +1735,10 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
17031735
}
17041736
}
17051737

1738+
/// <summary>
1739+
/// This nullability handler interface can be used in DFAs
1740+
/// for patterns that do not contain \Z
1741+
/// </summary>
17061742
private interface IOptimizedNullabilityHandler
17071743
{
17081744
public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
@@ -1728,7 +1764,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
17281764
byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
17291765
where TOptimizedInputReader : struct, IOptimizedInputReader
17301766
{
1731-
Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}");
1767+
Debug.Assert(pos < input.Length, $"input end should not be handled here");
17321768
return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos));
17331769
}
17341770
}

0 commit comments

Comments
 (0)