Skip to content

Commit fcd0b59

Browse files
authored
Share the same processor in firstMatch (#497)
Gives a 7x improvement to firstMatch-style benchmarks like "FirstMatch", 2-3x to CSS and basic backtracking benchmarks. Thanks to @rctcwyvrn for the original code.
1 parent 1356e8c commit fcd0b59

File tree

4 files changed

+141
-100
lines changed

4 files changed

+141
-100
lines changed

Diff for: Sources/_StringProcessing/Engine/Processor.swift

+39-7
Original file line numberDiff line numberDiff line change
@@ -32,30 +32,38 @@ struct Processor<
3232
typealias Element = Input.Element
3333

3434
let input: Input
35-
let bounds: Range<Position>
3635
let matchMode: MatchMode
36+
let instructions: InstructionList<Instruction>
37+
38+
// MARK: Resettable state
39+
40+
// The subject bounds.
41+
//
42+
// FIXME: This conflates the subject bounds with the search bounds!
43+
var bounds: Range<Position>
44+
45+
// The current position in the subject
3746
var currentPosition: Position
3847

39-
let instructions: InstructionList<Instruction>
4048
var controller: Controller
4149

42-
var cycleCount = 0
43-
44-
/// Our register file
4550
var registers: Registers
4651

47-
// Used for back tracking
4852
var savePoints: [SavePoint] = []
4953

5054
var callStack: [InstructionAddress] = []
5155

56+
var storedCaptures: Array<_StoredCapture>
57+
5258
var state: State = .inProgress
5359

5460
var failureReason: Error? = nil
5561

62+
63+
// MARK: Metrics, debugging, etc.
64+
var cycleCount = 0
5665
var isTracingEnabled: Bool
5766

58-
var storedCaptures: Array<_StoredCapture>
5967
}
6068

6169
extension Processor {
@@ -88,6 +96,30 @@ extension Processor {
8896
_checkInvariants()
8997
}
9098

99+
100+
/// Rewinds this processor so the same instance can run another match
/// attempt over `searchBounds`, instead of constructing a new processor.
///
/// Only per-attempt state is touched: the bounds, current position,
/// controller, registers, save points, call stack, stored captures, state,
/// and failure reason. `cycleCount` and `isTracingEnabled` are left as-is.
///
/// - Parameter searchBounds: The range the next attempt runs over; the
///   attempt starts at its lower bound.
mutating func reset(searchBounds: Range<Position>) {
  // FIXME: We currently conflate both subject bounds and search bounds
  // This should just reset search bounds
  bounds = searchBounds
  currentPosition = searchBounds.lowerBound

  // Point the controller back at pc 0.
  controller = Controller(pc: 0)

  registers.reset(sentinel: searchBounds.upperBound)

  // Empty both stacks but keep their storage for the next attempt.
  savePoints.removeAll(keepingCapacity: true)
  callStack.removeAll(keepingCapacity: true)

  // Overwrite each capture slot in place with a fresh value.
  for slot in storedCaptures.indices {
    storedCaptures[slot] = .init()
  }

  state = .inProgress
  failureReason = nil

  _checkInvariants()
}
122+
91123
func _checkInvariants() {
92124
assert(end <= input.endIndex)
93125
assert(start >= input.startIndex)

Diff for: Sources/_StringProcessing/Engine/Registers.swift

+65-74
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,19 @@ struct SentinelValue: Hashable, CustomStringConvertible {
1818
extension Processor {
1919
/// Our register file
2020
struct Registers {
21-
// currently, these are static readonly
21+
22+
// MARK: static / read-only, non-resettable
23+
24+
// Verbatim elements to compare against
2225
var elements: [Element]
2326

24-
// currently, these are static readonly
27+
// Verbatim sequences to compare against
2528
//
26-
// TODO: We want to be `String` instead of `[Character]`...
29+
// TODO: Degenericize Processor and store Strings
2730
var sequences: [[Element]] = []
2831

29-
// currently, hold output of assertions
30-
var bools: [Bool] // TODO: bitset
31-
32-
// currently, these are static readonly
3332
var consumeFunctions: [MEProgram<Input>.ConsumeFunction]
3433

35-
// currently, these are static readonly
3634
var assertionFunctions: [MEProgram<Input>.AssertionFunction]
3735

3836
// Captured-value constructors
@@ -44,69 +42,61 @@ extension Processor {
4442
// currently, these are for comments and abort messages
4543
var strings: [String]
4644

45+
// MARK: writeable, resettable
46+
47+
// currently, hold output of assertions
48+
var bools: [Bool] // TODO: bitset
49+
4750
// currently, useful for range-based quantification
4851
var ints: [Int]
4952

50-
// unused
51-
var floats: [Double] = []
52-
5353
// Currently, used for `movePosition` and `matchSlice`
5454
var positions: [Position] = []
5555

5656
var values: [Any]
57+
}
58+
}
5759

58-
// unused
59-
var instructionAddresses: [InstructionAddress] = []
60-
61-
// unused, any application?
62-
var classStackAddresses: [CallStackAddress] = []
63-
64-
// unused, any application?
65-
var positionStackAddresses: [PositionStackAddress] = []
66-
67-
// unused, any application?
68-
var savePointAddresses: [SavePointStackAddress] = []
69-
70-
subscript(_ i: StringRegister) -> String {
71-
strings[i.rawValue]
72-
}
73-
subscript(_ i: SequenceRegister) -> [Element] {
74-
sequences[i.rawValue]
75-
}
76-
subscript(_ i: IntRegister) -> Int {
77-
get { ints[i.rawValue] }
78-
set { ints[i.rawValue] = newValue }
79-
}
80-
subscript(_ i: BoolRegister) -> Bool {
81-
get { bools[i.rawValue] }
82-
set { bools[i.rawValue] = newValue }
83-
}
84-
subscript(_ i: PositionRegister) -> Position {
85-
get { positions[i.rawValue] }
86-
set { positions[i.rawValue] = newValue }
87-
}
88-
subscript(_ i: ValueRegister) -> Any {
89-
get { values[i.rawValue] }
90-
set {
91-
values[i.rawValue] = newValue
92-
}
93-
}
94-
subscript(_ i: ElementRegister) -> Element {
95-
elements[i.rawValue]
96-
}
97-
subscript(_ i: ConsumeFunctionRegister) -> MEProgram<Input>.ConsumeFunction {
98-
consumeFunctions[i.rawValue]
99-
}
100-
subscript(_ i: AssertionFunctionRegister) -> MEProgram<Input>.AssertionFunction {
101-
assertionFunctions[i.rawValue]
102-
}
103-
subscript(_ i: TransformRegister) -> MEProgram<Input>.TransformFunction {
104-
transformFunctions[i.rawValue]
105-
}
106-
subscript(_ i: MatcherRegister) -> MEProgram<Input>.MatcherFunction {
107-
matcherFunctions[i.rawValue]
60+
// Typed access to each register bank, indexed by the register's raw value.
extension Processor.Registers {
  /// The string held in string register `i`.
  subscript(_ i: StringRegister) -> String {
    return strings[i.rawValue]
  }

  /// The element sequence held in sequence register `i`.
  subscript(_ i: SequenceRegister) -> [Input.Element] {
    return sequences[i.rawValue]
  }

  /// The integer held in int register `i`.
  subscript(_ i: IntRegister) -> Int {
    get {
      return ints[i.rawValue]
    }
    set {
      ints[i.rawValue] = newValue
    }
  }

  /// The flag held in bool register `i`.
  subscript(_ i: BoolRegister) -> Bool {
    get {
      return bools[i.rawValue]
    }
    set {
      bools[i.rawValue] = newValue
    }
  }

  /// The input index held in position register `i`.
  subscript(_ i: PositionRegister) -> Input.Index {
    get {
      return positions[i.rawValue]
    }
    set {
      positions[i.rawValue] = newValue
    }
  }

  /// The type-erased value held in value register `i`.
  subscript(_ i: ValueRegister) -> Any {
    get {
      return values[i.rawValue]
    }
    set {
      values[i.rawValue] = newValue
    }
  }

  /// The element held in element register `i`.
  subscript(_ i: ElementRegister) -> Input.Element {
    return elements[i.rawValue]
  }

  /// The consume function held in register `i`.
  subscript(
    _ i: ConsumeFunctionRegister
  ) -> MEProgram<Input>.ConsumeFunction {
    return consumeFunctions[i.rawValue]
  }

  /// The assertion function held in register `i`.
  subscript(
    _ i: AssertionFunctionRegister
  ) -> MEProgram<Input>.AssertionFunction {
    return assertionFunctions[i.rawValue]
  }

  /// The transform function held in register `i`.
  subscript(_ i: TransformRegister) -> MEProgram<Input>.TransformFunction {
    return transformFunctions[i.rawValue]
  }

  /// The matcher function held in register `i`.
  subscript(_ i: MatcherRegister) -> MEProgram<Input>.MatcherFunction {
    return matcherFunctions[i.rawValue]
  }
}
111101

112102
extension Processor.Registers {
@@ -141,20 +131,26 @@ extension Processor.Registers {
141131

142132
self.ints = Array(repeating: 0, count: info.ints)
143133

144-
self.floats = Array(repeating: 0, count: info.floats)
145-
146134
self.positions = Array(repeating: sentinel, count: info.positions)
147135

148136
self.values = Array(
149137
repeating: SentinelValue(), count: info.values)
138+
}
150139

151-
self.instructionAddresses = Array(repeating: 0, count: info.instructionAddresses)
152-
153-
self.classStackAddresses = Array(repeating: 0, count: info.classStackAddresses)
154-
155-
self.positionStackAddresses = Array(repeating: 0, count: info.positionStackAddresses)
140+
/// Overwrites every bool, int, position, and value register in place.
///
/// Bools become `false`, ints become `0`, positions become `sentinel`, and
/// values become a `SentinelValue()`. The other register banks are not
/// touched.
///
/// - Parameter sentinel: The index written to every position register.
mutating func reset(sentinel: Input.Index) {
  bools._setAll(to: false)
  ints._setAll(to: 0)
  positions._setAll(to: sentinel)
  values._setAll(to: SentinelValue())
}
146+
}
156147

157-
self.savePointAddresses = Array(repeating: 0, count: info.savePointAddresses)
148+
// TODO: Productize into general algorithm
149+
extension MutableCollection {
  /// Overwrites every element of the collection with `e`, in place.
  ///
  /// The collection's count and indices are unchanged; an empty collection
  /// is left untouched.
  ///
  /// - Parameter e: The value assigned to each position.
  mutating func _setAll(to e: Element) {
    // Walk with `formIndex(after:)` rather than iterating `indices`: for
    // some collections `indices` holds a reference to the collection, and
    // mutating while iterating it can trigger an unexpected copy.
    var idx = startIndex
    while idx != endIndex {
      self[idx] = e
      formIndex(after: &idx)
    }
  }
}
160156

@@ -196,12 +192,7 @@ extension Processor.Registers: CustomStringConvertible {
196192
\(formatRegisters("bools", bools))\
197193
\(formatRegisters("strings", strings))\
198194
\(formatRegisters("ints", ints))\
199-
\(formatRegisters("floats", floats))\
200195
\(formatRegisters("positions", positions))\
201-
\(formatRegisters("instructionAddresses", instructionAddresses))\
202-
\(formatRegisters("classStackAddresses", classStackAddresses))\
203-
\(formatRegisters("positionStackAddresses", positionStackAddresses))\
204-
\(formatRegisters("savePointAddresses", savePointAddresses))\
205196
206197
"""
207198
}

Diff for: Sources/_StringProcessing/Executor.swift

+35
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,33 @@ struct Executor {
1919
self.engine = Engine(program, enableTracing: enablesTracing)
2020
}
2121

22+
/// Finds the first match of the program in `input` within `inputRange`,
/// trying each candidate start position from the lower bound upward.
///
/// A single processor is created up front and reset between attempts
/// rather than building a new one for every start position.
///
/// - Parameters:
///   - input: The string to search.
///   - inputRange: The range of `input` to search within.
///   - graphemeSemantic: If `true`, candidate start positions advance by
///     `Character`; otherwise they advance by Unicode scalar.
/// - Returns: The first match found, or `nil` if no start position in
///   `inputRange` produces one.
/// - Throws: Any error thrown by `_match` for an attempt.
@available(SwiftStdlib 5.7, *)
func firstMatch<Output>(
  _ input: String,
  in inputRange: Range<String.Index>,
  graphemeSemantic: Bool
) throws -> Regex<Output>.Match? {
  var cpu = engine.makeProcessor(
    input: input, bounds: inputRange, matchMode: .partialFromFront)

  var low = inputRange.lowerBound
  let high = inputRange.upperBound
  while true {
    if let m: Regex<Output>.Match = try _match(
      input, in: low..<high, using: &cpu
    ) {
      return m
    }
    if low >= high { return nil }
    if graphemeSemantic {
      input.formIndex(after: &low)
    } else {
      input.unicodeScalars.formIndex(after: &low)
    }
    // Stepping by a whole character (or scalar) can overshoot `high` when
    // `high` is not aligned to that kind of boundary. Forming `low..<high`
    // would then trap, and no start position remains inside the range.
    guard low <= high else { return nil }
    cpu.reset(searchBounds: low..<high)
  }
}
48+
2249
@available(SwiftStdlib 5.7, *)
2350
func match<Output>(
2451
_ input: String,
@@ -27,7 +54,15 @@ struct Executor {
2754
) throws -> Regex<Output>.Match? {
2855
var cpu = engine.makeProcessor(
2956
input: input, bounds: inputRange, matchMode: mode)
57+
return try _match(input, in: inputRange, using: &cpu)
58+
}
3059

60+
@available(SwiftStdlib 5.7, *)
61+
func _match<Output>(
62+
_ input: String,
63+
in inputRange: Range<String.Index>,
64+
using cpu: inout Processor<String>
65+
) throws -> Regex<Output>.Match? {
3166
guard let endIdx = cpu.consume() else {
3267
if let e = cpu.failureReason {
3368
throw e

Diff for: Sources/_StringProcessing/Regex/Match.swift

+2-19
Original file line numberDiff line numberDiff line change
@@ -137,27 +137,10 @@ extension Regex {
137137
_ input: String,
138138
in inputRange: Range<String.Index>
139139
) throws -> Regex<Output>.Match? {
140-
// FIXME: Something more efficient, likely an engine interface, and we
141-
// should scrap the RegexConsumer crap and call this
142-
143140
let executor = Executor(program: regex.program.loweredProgram)
144141
let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster
145-
146-
var low = inputRange.lowerBound
147-
let high = inputRange.upperBound
148-
while true {
149-
if let m: Regex<Output>.Match = try executor.match(
150-
input, in: low..<high, .partialFromFront
151-
) {
152-
return m
153-
}
154-
if low >= high { return nil }
155-
if graphemeSemantic {
156-
input.formIndex(after: &low)
157-
} else {
158-
input.unicodeScalars.formIndex(after: &low)
159-
}
160-
}
142+
return try executor.firstMatch(
143+
input, in: inputRange, graphemeSemantic: graphemeSemantic)
161144
}
162145
}
163146

0 commit comments

Comments
 (0)