From 0b040289bc61aedf627b0b7dc35d168733f43211 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 17 Jun 2022 15:00:27 -0600 Subject: [PATCH] Share the same processor in firstMatch Gives a 7x improvement to firstMatch-style benchmarks like "FirstMatch", 2-3x to CSS and basic backtracking benchmarks. Thanks to @rctcwyvrn for the original code. --- .../_StringProcessing/Engine/Processor.swift | 46 +++++- .../_StringProcessing/Engine/Registers.swift | 139 ++++++++---------- Sources/_StringProcessing/Executor.swift | 35 +++++ Sources/_StringProcessing/Regex/Match.swift | 21 +-- 4 files changed, 141 insertions(+), 100 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 5f58394d3..a81d2ce06 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -32,30 +32,38 @@ struct Processor< typealias Element = Input.Element let input: Input - let bounds: Range<Position> let matchMode: MatchMode + let instructions: InstructionList + + // MARK: Resettable state + + // The subject bounds. + // + // FIXME: This also conflates search bounds too! + var bounds: Range<Position> + + // The current position in the subject var currentPosition: Position - let instructions: InstructionList var controller: Controller - var cycleCount = 0 - - /// Our register file var registers: Registers - // Used for back tracking var savePoints: [SavePoint] = [] var callStack: [InstructionAddress] = [] + var storedCaptures: Array<_StoredCapture> + var state: State = .inProgress var failureReason: Error? = nil + + // MARK: Metrics, debugging, etc. 
+ var cycleCount = 0 var isTracingEnabled: Bool - var storedCaptures: Array<_StoredCapture> } extension Processor { @@ -88,6 +96,30 @@ extension Processor { _checkInvariants() } + + mutating func reset(searchBounds: Range<Position>) { + // FIXME: We currently conflate both subject bounds and search bounds + // This should just reset search bounds + self.bounds = searchBounds + self.currentPosition = self.bounds.lowerBound + + self.controller = Controller(pc: 0) + + self.registers.reset(sentinel: bounds.upperBound) + + self.savePoints.removeAll(keepingCapacity: true) + self.callStack.removeAll(keepingCapacity: true) + + for idx in storedCaptures.indices { + storedCaptures[idx] = .init() + } + + self.state = .inProgress + self.failureReason = nil + + _checkInvariants() + } + func _checkInvariants() { assert(end <= input.endIndex) assert(start >= input.startIndex) diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index 812866ee6..e6f823341 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -18,21 +18,19 @@ struct SentinelValue: Hashable, CustomStringConvertible { extension Processor { /// Our register file struct Registers { - // currently, these are static readonly + + // MARK: static / read-only, non-resettable + + // Verbatim elements to compare against var elements: [Element] - // currently, these are static readonly + // Verbatim sequences to compare against // - // TODO: We want to be `String` instead of `[Character]`... 
+ // TODO: Degenericize Processor and store Strings var sequences: [[Element]] = [] - // currently, hold output of assertions - var bools: [Bool] // TODO: bitset - - // currently, these are static readonly var consumeFunctions: [MEProgram.ConsumeFunction] - // currently, these are static readonly var assertionFunctions: [MEProgram.AssertionFunction] // Captured-value constructors @@ -44,69 +42,61 @@ extension Processor { // currently, these are for comments and abort messages var strings: [String] + // MARK: writeable, resettable + + // currently, hold output of assertions + var bools: [Bool] // TODO: bitset + // currently, useful for range-based quantification var ints: [Int] - // unused - var floats: [Double] = [] - // Currently, used for `movePosition` and `matchSlice` var positions: [Position] = [] var values: [Any] + } +} - // unused - var instructionAddresses: [InstructionAddress] = [] - - // unused, any application? - var classStackAddresses: [CallStackAddress] = [] - - // unused, any application? - var positionStackAddresses: [PositionStackAddress] = [] - - // unused, any application? 
- var savePointAddresses: [SavePointStackAddress] = [] - - subscript(_ i: StringRegister) -> String { - strings[i.rawValue] - } - subscript(_ i: SequenceRegister) -> [Element] { - sequences[i.rawValue] - } - subscript(_ i: IntRegister) -> Int { - get { ints[i.rawValue] } - set { ints[i.rawValue] = newValue } - } - subscript(_ i: BoolRegister) -> Bool { - get { bools[i.rawValue] } - set { bools[i.rawValue] = newValue } - } - subscript(_ i: PositionRegister) -> Position { - get { positions[i.rawValue] } - set { positions[i.rawValue] = newValue } - } - subscript(_ i: ValueRegister) -> Any { - get { values[i.rawValue] } - set { - values[i.rawValue] = newValue - } - } - subscript(_ i: ElementRegister) -> Element { - elements[i.rawValue] - } - subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { - consumeFunctions[i.rawValue] - } - subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { - assertionFunctions[i.rawValue] - } - subscript(_ i: TransformRegister) -> MEProgram.TransformFunction { - transformFunctions[i.rawValue] - } - subscript(_ i: MatcherRegister) -> MEProgram.MatcherFunction { - matcherFunctions[i.rawValue] +extension Processor.Registers { + subscript(_ i: StringRegister) -> String { + strings[i.rawValue] + } + subscript(_ i: SequenceRegister) -> [Input.Element] { + sequences[i.rawValue] + } + subscript(_ i: IntRegister) -> Int { + get { ints[i.rawValue] } + set { ints[i.rawValue] = newValue } + } + subscript(_ i: BoolRegister) -> Bool { + get { bools[i.rawValue] } + set { bools[i.rawValue] = newValue } + } + subscript(_ i: PositionRegister) -> Input.Index { + get { positions[i.rawValue] } + set { positions[i.rawValue] = newValue } + } + subscript(_ i: ValueRegister) -> Any { + get { values[i.rawValue] } + set { + values[i.rawValue] = newValue } } + subscript(_ i: ElementRegister) -> Input.Element { + elements[i.rawValue] + } + subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { + 
consumeFunctions[i.rawValue] + } + subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { + assertionFunctions[i.rawValue] + } + subscript(_ i: TransformRegister) -> MEProgram.TransformFunction { + transformFunctions[i.rawValue] + } + subscript(_ i: MatcherRegister) -> MEProgram.MatcherFunction { + matcherFunctions[i.rawValue] + } } extension Processor.Registers { @@ -141,20 +131,26 @@ extension Processor.Registers { self.ints = Array(repeating: 0, count: info.ints) - self.floats = Array(repeating: 0, count: info.floats) - self.positions = Array(repeating: sentinel, count: info.positions) self.values = Array( repeating: SentinelValue(), count: info.values) + } - self.instructionAddresses = Array(repeating: 0, count: info.instructionAddresses) - - self.classStackAddresses = Array(repeating: 0, count: info.classStackAddresses) - - self.positionStackAddresses = Array(repeating: 0, count: info.positionStackAddresses) + mutating func reset(sentinel: Input.Index) { + self.bools._setAll(to: false) + self.ints._setAll(to: 0) + self.positions._setAll(to: sentinel) + self.values._setAll(to: SentinelValue()) + } +} - self.savePointAddresses = Array(repeating: 0, count: info.savePointAddresses) +// TODO: Productize into general algorithm +extension MutableCollection { + mutating func _setAll(to e: Element) { + for idx in self.indices { + self[idx] = e + } } } @@ -196,12 +192,7 @@ extension Processor.Registers: CustomStringConvertible { \(formatRegisters("bools", bools))\ \(formatRegisters("strings", strings))\ \(formatRegisters("ints", ints))\ - \(formatRegisters("floats", floats))\ \(formatRegisters("positions", positions))\ - \(formatRegisters("instructionAddresses", instructionAddresses))\ - \(formatRegisters("classStackAddresses", classStackAddresses))\ - \(formatRegisters("positionStackAddresses", positionStackAddresses))\ - \(formatRegisters("savePointAddresses", savePointAddresses))\ """ } diff --git a/Sources/_StringProcessing/Executor.swift 
b/Sources/_StringProcessing/Executor.swift index 941d5943a..295a732de 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -19,6 +19,33 @@ struct Executor { self.engine = Engine(program, enableTracing: enablesTracing) } + @available(SwiftStdlib 5.7, *) + func firstMatch<Output>( + _ input: String, + in inputRange: Range<String.Index>, + graphemeSemantic: Bool + ) throws -> Regex<Output>.Match? { + var cpu = engine.makeProcessor( + input: input, bounds: inputRange, matchMode: .partialFromFront) + + var low = inputRange.lowerBound + let high = inputRange.upperBound + while true { + if let m: Regex<Output>.Match = try _match( + input, in: low..<high, using: &cpu + ) { + return m + } + if low >= high { return nil } + if graphemeSemantic { + input.formIndex(after: &low) + } else { + input.unicodeScalars.formIndex(after: &low) + } + cpu.reset(searchBounds: low..<high) + } + } + @available(SwiftStdlib 5.7, *) func match<Output>( _ input: String, @@ -27,7 +54,15 @@ struct Executor { ) throws -> Regex<Output>.Match? { var cpu = engine.makeProcessor( input: input, bounds: inputRange, matchMode: mode) + return try _match(input, in: inputRange, using: &cpu) + } + @available(SwiftStdlib 5.7, *) + func _match<Output>( + _ input: String, + in inputRange: Range<String.Index>, + using cpu: inout Processor<String> + ) throws -> Regex<Output>.Match? { guard let endIdx = cpu.consume() else { if let e = cpu.failureReason { throw e diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 8d9ff8f0b..7e4be5652 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -137,27 +137,10 @@ extension Regex { _ input: String, in inputRange: Range<String.Index> ) throws -> Regex<Output>.Match? 
{ - // FIXME: Something more efficient, likely an engine interface, and we - // should scrap the RegexConsumer crap and call this - let executor = Executor(program: regex.program.loweredProgram) let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster - - var low = inputRange.lowerBound - let high = inputRange.upperBound - while true { - if let m: Regex<Output>.Match = try executor.match( - input, in: low..<high, .partialFromFront - ) { - return m - } - if low >= high { return nil } - if graphemeSemantic { - input.formIndex(after: &low) - } else { - input.unicodeScalars.formIndex(after: &low) - } - } + return try executor.firstMatch( + input, in: inputRange, graphemeSemantic: graphemeSemantic) } }