Skip to content

Commit 33acdeb

Browse files
authored
Break out of quantification loop if there is no forward progress (#560)
This fixes infinite loops when we loop over an internal node that does not have any forward progress. Also included is an optimization to only emit the check/break instructions if we have a case that might result in an infinite loop (possibly non-progressing inner node + unlimited quantification)
1 parent 33856e7 commit 33acdeb

File tree

9 files changed

+205
-11
lines changed

9 files changed

+205
-11
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

+65
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,12 @@ fileprivate extension Compiler.ByteCodeGen {
543543
decrement %minTrips and fallthrough
544544
545545
loop-body:
546+
<if can't guarantee forward progress && extraTrips = nil>:
547+
mov currentPosition %pos
546548
evaluate the subexpression
549+
<if can't guarantee forward progress && extraTrips = nil>:
550+
if %pos is currentPosition:
551+
goto exit
547552
goto min-trip-count control block
548553
549554
exit-policy control block:
@@ -646,7 +651,28 @@ fileprivate extension Compiler.ByteCodeGen {
646651
// <subexpression>
647652
// branch min-trip-count
648653
builder.label(loopBody)
654+
655+
// if we aren't sure if the child node will have forward progress and
656+
// we have an unbounded quantification
657+
let startPosition: PositionRegister?
658+
let emitPositionChecking =
659+
(!optimizationsEnabled || !child.guaranteesForwardProgress) &&
660+
extraTrips == nil
661+
662+
if emitPositionChecking {
663+
startPosition = builder.makePositionRegister()
664+
builder.buildMoveCurrentPosition(into: startPosition!)
665+
} else {
666+
startPosition = nil
667+
}
649668
try emitNode(child)
669+
if emitPositionChecking {
670+
// in all quantifier cases, no matter what minTrips or extraTrips is,
671+
// if we have a successful non-advancing match, branch to exit because it
672+
// can match an arbitrary number of times
673+
builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!)
674+
}
675+
650676
if minTrips <= 1 {
651677
// fallthrough
652678
} else {
@@ -832,3 +858,42 @@ fileprivate extension Compiler.ByteCodeGen {
832858
return nil
833859
}
834860
}
861+
862+
extension DSLTree.Node {
863+
var guaranteesForwardProgress: Bool {
864+
switch self {
865+
case .orderedChoice(let children):
866+
return children.allSatisfy { $0.guaranteesForwardProgress }
867+
case .concatenation(let children):
868+
return children.contains(where: { $0.guaranteesForwardProgress })
869+
case .capture(_, _, let node, _):
870+
return node.guaranteesForwardProgress
871+
case .nonCapturingGroup(let kind, let child):
872+
switch kind.ast {
873+
case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind:
874+
return false
875+
default: return child.guaranteesForwardProgress
876+
}
877+
case .atom(let atom):
878+
switch atom {
879+
case .changeMatchingOptions, .assertion: return false
880+
default: return true
881+
}
882+
case .trivia, .empty:
883+
return false
884+
case .quotedLiteral(let string):
885+
return !string.isEmpty
886+
case .convertedRegexLiteral(let node, _):
887+
return node.guaranteesForwardProgress
888+
case .consumer, .matcher:
889+
// Allow zero width consumers and matchers
890+
return false
891+
case .customCharacterClass:
892+
return true
893+
case .quantification(let amount, _, let child):
894+
let (atLeast, _) = amount.ast.bounds
895+
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
896+
default: return false
897+
}
898+
}
899+
}

Sources/_StringProcessing/Engine/Backtracking.swift

+7-3
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,18 @@ extension Processor {
3232
// The int registers store values that can be relevant to
3333
// backtracking, such as the number of trips in a quantification.
3434
var intRegisters: [Int]
35+
// Same with position registers
36+
var posRegisters: [Input.Index]
3537

3638
var destructure: (
3739
pc: InstructionAddress,
3840
pos: Position?,
3941
stackEnd: CallStackAddress,
4042
captureEnds: [_StoredCapture],
41-
intRegisters: [Int]
43+
intRegisters: [Int],
44+
PositionRegister: [Input.Index]
4245
) {
43-
(pc, pos, stackEnd, captureEnds, intRegisters)
46+
(pc, pos, stackEnd, captureEnds, intRegisters, posRegisters)
4447
}
4548
}
4649

@@ -53,7 +56,8 @@ extension Processor {
5356
pos: addressOnly ? nil : currentPosition,
5457
stackEnd: .init(callStack.count),
5558
captureEnds: storedCaptures,
56-
intRegisters: registers.ints)
59+
intRegisters: registers.ints,
60+
posRegisters: registers.positions)
5761
}
5862
}
5963

Sources/_StringProcessing/Engine/InstPayload.swift

+3-3
Original file line numberDiff line numberDiff line change
@@ -284,10 +284,10 @@ extension Instruction.Payload {
284284
interpretPair()
285285
}
286286

287-
init(pos: PositionRegister, pos2: PositionRegister) {
288-
self.init(pos, pos2)
287+
init(addr: InstructionAddress, position: PositionRegister) {
288+
self.init(addr, position)
289289
}
290-
var pairedPosPos: (PositionRegister, PositionRegister) {
290+
var pairedAddrPos: (InstructionAddress, PositionRegister) {
291291
interpretPair()
292292
}
293293

Sources/_StringProcessing/Engine/Instruction.swift

+18
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ extension Instruction {
3737
///
3838
case moveImmediate
3939

40+
/// Move the current position into a register
41+
///
42+
/// moveCurrentPosition(into: PositionRegister)
43+
///
44+
/// Operands:
45+
/// - Position register to move into
46+
case moveCurrentPosition
47+
4048
// MARK: General Purpose: Control flow
4149

4250
/// Branch to a new instruction
@@ -57,6 +65,16 @@ extension Instruction {
5765
///
5866
case condBranchZeroElseDecrement
5967

68+
/// Conditionally branch if the current position is the same as the register
69+
///
70+
/// condBranch(
71+
/// to: InstAddr, ifSamePositionAs: PositionRegister)
72+
///
73+
/// Operands:
74+
/// - Instruction address to branch to, if the position in the register is the same as currentPosition
75+
/// - Position register to check against
76+
case condBranchSamePosition
77+
6078
// TODO: Function calls
6179

6280
// MARK: - Matching

Sources/_StringProcessing/Engine/MEBuilder.swift

+22-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ extension MEProgram {
3232
var nextIntRegister = IntRegister(0)
3333
var nextCaptureRegister = CaptureRegister(0)
3434
var nextValueRegister = ValueRegister(0)
35+
var nextPositionRegister = PositionRegister(0)
3536

3637
// Special addresses or instructions
3738
var failAddressToken: AddressToken? = nil
@@ -105,6 +106,14 @@ extension MEProgram.Builder {
105106
fixup(to: t)
106107
}
107108

109+
mutating func buildCondBranch(
110+
to t: AddressToken,
111+
ifSamePositionAs r: PositionRegister
112+
) {
113+
instructions.append(.init(.condBranchSamePosition, .init(position: r)))
114+
fixup(to: t)
115+
}
116+
108117
mutating func buildSave(_ t: AddressToken) {
109118
instructions.append(.init(.save))
110119
fixup(to: t)
@@ -211,6 +220,10 @@ extension MEProgram.Builder {
211220
.init(value: value, capture: capture)))
212221
}
213222

223+
mutating func buildMoveCurrentPosition(into r: PositionRegister) {
224+
instructions.append(.init(.moveCurrentPosition, .init(position: r)))
225+
}
226+
214227
mutating func buildBackreference(
215228
_ cap: CaptureRegister
216229
) {
@@ -257,7 +270,8 @@ extension MEProgram.Builder {
257270
switch inst.opcode {
258271
case .condBranchZeroElseDecrement:
259272
payload = .init(addr: addr, int: inst.payload.int)
260-
273+
case .condBranchSamePosition:
274+
payload = .init(addr: addr, position: inst.payload.position)
261275
case .branch, .save, .saveAddress, .clearThrough:
262276
payload = .init(addr: addr)
263277

@@ -281,6 +295,7 @@ extension MEProgram.Builder {
281295
regInfo.sequences = sequences.count
282296
regInfo.ints = nextIntRegister.rawValue
283297
regInfo.values = nextValueRegister.rawValue
298+
regInfo.positions = nextPositionRegister.rawValue
284299
regInfo.bitsets = asciiBitsets.count
285300
regInfo.consumeFunctions = consumeFunctions.count
286301
regInfo.assertionFunctions = assertionFunctions.count
@@ -421,6 +436,12 @@ extension MEProgram.Builder {
421436
return r
422437
}
423438

439+
mutating func makePositionRegister() -> PositionRegister {
440+
let r = nextPositionRegister
441+
defer { nextPositionRegister.rawValue += 1 }
442+
return r
443+
}
444+
424445
// TODO: A register-mapping helper struct, which could release
425446
// registers without monotonicity required
426447

Sources/_StringProcessing/Engine/Processor.swift

+13-3
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ extension Processor {
245245
}
246246

247247
mutating func signalFailure() {
248-
guard let (pc, pos, stackEnd, capEnds, intRegisters) =
248+
guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) =
249249
savePoints.popLast()?.destructure
250250
else {
251251
state = .fail
@@ -259,6 +259,7 @@ extension Processor {
259259
callStack.removeLast(callStack.count - stackEnd.rawValue)
260260
storedCaptures = capEnds
261261
registers.ints = intRegisters
262+
registers.positions = posRegisters
262263
}
263264

264265
mutating func abort(_ e: Error? = nil) {
@@ -315,7 +316,10 @@ extension Processor {
315316

316317
registers[reg] = int
317318
controller.step()
318-
319+
case .moveCurrentPosition:
320+
let reg = payload.position
321+
registers[reg] = currentPosition
322+
controller.step()
319323
case .branch:
320324
controller.pc = payload.addr
321325

@@ -327,7 +331,13 @@ extension Processor {
327331
registers[int] -= 1
328332
controller.step()
329333
}
330-
334+
case .condBranchSamePosition:
335+
let (addr, pos) = payload.pairedAddrPos
336+
if registers[pos] == currentPosition {
337+
controller.pc = addr
338+
} else {
339+
controller.step()
340+
}
331341
case .save:
332342
let resumeAddr = payload.addr
333343
let sp = makeSavePoint(resumeAddr)

Sources/_StringProcessing/Engine/Registers.swift

+14
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ extension Processor {
4747
var ints: [Int]
4848

4949
var values: [Any]
50+
51+
var positions: [Input.Index]
5052
}
5153
}
5254

@@ -66,6 +68,12 @@ extension Processor.Registers {
6668
values[i.rawValue] = newValue
6769
}
6870
}
71+
subscript(_ i: PositionRegister) -> Input.Index {
72+
get { positions[i.rawValue] }
73+
set {
74+
positions[i.rawValue] = newValue
75+
}
76+
}
6977
subscript(_ i: ElementRegister) -> Input.Element {
7078
elements[i.rawValue]
7179
}
@@ -89,6 +97,8 @@ extension Processor.Registers {
8997
}
9098

9199
extension Processor.Registers {
100+
static let sentinelIndex = "".startIndex
101+
92102
init(
93103
_ program: MEProgram,
94104
_ sentinel: String.Index
@@ -120,11 +130,15 @@ extension Processor.Registers {
120130

121131
self.values = Array(
122132
repeating: SentinelValue(), count: info.values)
133+
self.positions = Array(
134+
repeating: Processor.Registers.sentinelIndex,
135+
count: info.positions)
123136
}
124137

125138
mutating func reset(sentinel: Input.Index) {
126139
self.ints._setAll(to: 0)
127140
self.values._setAll(to: SentinelValue())
141+
self.positions._setAll(to: Processor.Registers.sentinelIndex)
128142
}
129143
}
130144

Tests/RegexTests/CompileTests.swift

+36
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,40 @@ extension RegexTests {
208208
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset])
209209
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy])
210210
}
211+
212+
func testQuantificationForwardProgressCompile() {
213+
// Unbounded quantification + non forward progressing inner nodes
214+
// Expect to emit the position checking instructions
215+
expectProgram(for: #"(?:(?=a)){1,}"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
216+
expectProgram(for: #"(?:\b)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
217+
expectProgram(for: #"(?:(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
218+
expectProgram(for: #"(?:|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
219+
expectProgram(for: #"(?:\w|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
220+
expectProgram(for: #"(?:\w|(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
221+
expectProgram(for: #"(?:\w|(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
222+
expectProgram(for: #"(?:\w|(?#comment)(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
223+
expectProgram(for: #"(?:\w|(?i))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
224+
225+
// Bounded quantification, don't emit position checking
226+
expectProgram(for: #"(?:(?=a)){1,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
227+
expectProgram(for: #"(?:\b)?"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
228+
expectProgram(for: #"(?:(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
229+
expectProgram(for: #"(?:|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
230+
expectProgram(for: #"(?:\w|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
231+
expectProgram(for: #"(?:\w|(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
232+
expectProgram(for: #"(?:\w|(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
233+
expectProgram(for: #"(?:\w|(?#comment)(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
234+
expectProgram(for: #"(?:\w|(?i)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
235+
236+
// Inner node is a quantification that does not guarantee forward progress
237+
expectProgram(for: #"(a*)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
238+
expectProgram(for: #"(a?)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
239+
expectProgram(for: #"(a{,5})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
240+
expectProgram(for: #"((\b){,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
241+
expectProgram(for: #"((\b){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
242+
expectProgram(for: #"((|){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
243+
// Inner node is a quantification that guarantees forward progress
244+
expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
245+
expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
246+
}
211247
}

Tests/RegexTests/MatchTests.swift

+27-1
Original file line numberDiff line numberDiff line change
@@ -1895,5 +1895,31 @@ extension RegexTests {
18951895
XCTAssertEqual(matches.count, 3)
18961896
}
18971897
}
1898-
}
18991898

1899+
func expectCompletion(regex: String, in target: String) {
1900+
let expectation = XCTestExpectation(description: "Run the given regex to completion")
1901+
Task.init {
1902+
let r = try! Regex(regex)
1903+
let val = target.matches(of: r).isEmpty
1904+
expectation.fulfill()
1905+
return val
1906+
}
1907+
wait(for: [expectation], timeout: 3.0)
1908+
}
1909+
1910+
func testQuantificationForwardProgress() {
1911+
expectCompletion(regex: #"(?:(?=a)){1,}"#, in: "aa")
1912+
expectCompletion(regex: #"(?:\b)+"#, in: "aa")
1913+
expectCompletion(regex: #"(?:(?#comment))+"#, in: "aa")
1914+
expectCompletion(regex: #"(?:|)+"#, in: "aa")
1915+
expectCompletion(regex: #"(?:\w|)+"#, in: "aa")
1916+
expectCompletion(regex: #"(?:\w|(?i-i:))+"#, in: "aa")
1917+
expectCompletion(regex: #"(?:\w|(?#comment))+"#, in: "aa")
1918+
expectCompletion(regex: #"(?:\w|(?#comment)(?i-i:))+"#, in: "aa")
1919+
expectCompletion(regex: #"(?:\w|(?i))+"#, in: "aa")
1920+
expectCompletion(regex: #"(a*)*"#, in: "aa")
1921+
expectCompletion(regex: #"(a?)*"#, in: "aa")
1922+
expectCompletion(regex: #"(a{,4})*"#, in: "aa")
1923+
expectCompletion(regex: #"((|)+)*"#, in: "aa")
1924+
}
1925+
}

0 commit comments

Comments
 (0)