Skip to content

Commit 7752047

Browse files
authored
Optimize matching to match on scalar values when possible (#525)
- Adds new instructions for matching characters and scalars case-insensitively
- Compiles ASCII character matches into the faster scalar match instructions, even in grapheme semantic mode
- Optimizes out unnecessary runtime grapheme boundary checks for all-ASCII strings
- Also includes fixes to scalar matching in grapheme semantic mode (#565)
1 parent 33acdeb commit 7752047

14 files changed

+785
-288
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

+68-67
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,14 @@ fileprivate extension Compiler.ByteCodeGen {
5959
emitAny()
6060

6161
case let .char(c):
62-
try emitCharacter(c)
62+
emitCharacter(c)
6363

6464
case let .scalar(s):
65-
try emitScalar(s)
65+
if options.semanticLevel == .graphemeCluster {
66+
emitCharacter(Character(s))
67+
} else {
68+
emitMatchScalar(s)
69+
}
6670

6771
case let .assertion(kind):
6872
try emitAssertion(kind.ast)
@@ -88,6 +92,34 @@ fileprivate extension Compiler.ByteCodeGen {
8892
}
8993
}
9094

95+
mutating func emitQuotedLiteral(_ s: String) {
96+
guard options.semanticLevel == .graphemeCluster else {
97+
for char in s {
98+
for scalar in char.unicodeScalars {
99+
emitMatchScalar(scalar)
100+
}
101+
}
102+
return
103+
}
104+
105+
// Fast path for eliding boundary checks for an all ascii quoted literal
106+
if optimizationsEnabled && s.allSatisfy(\.isASCII) {
107+
let lastIdx = s.unicodeScalars.indices.last!
108+
for idx in s.unicodeScalars.indices {
109+
let boundaryCheck = idx == lastIdx
110+
let scalar = s.unicodeScalars[idx]
111+
if options.isCaseInsensitive && scalar.properties.isCased {
112+
builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
113+
} else {
114+
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
115+
}
116+
}
117+
return
118+
}
119+
120+
for c in s { emitCharacter(c) }
121+
}
122+
91123
mutating func emitBackreference(
92124
_ ref: AST.Reference
93125
) throws {
@@ -245,41 +277,47 @@ fileprivate extension Compiler.ByteCodeGen {
245277
}
246278
}
247279

248-
mutating func emitScalar(_ s: UnicodeScalar) throws {
249-
// TODO: Native instruction buildMatchScalar(s)
250-
if options.isCaseInsensitive {
251-
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
252-
builder.buildConsume(by: consumeScalar {
253-
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
254-
})
280+
mutating func emitMatchScalar(_ s: UnicodeScalar) {
281+
assert(options.semanticLevel == .unicodeScalar)
282+
if options.isCaseInsensitive && s.properties.isCased {
283+
builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
255284
} else {
256-
builder.buildConsume(by: consumeScalar {
257-
$0 == s
258-
})
285+
builder.buildMatchScalar(s, boundaryCheck: false)
259286
}
260287
}
261288

262-
mutating func emitCharacter(_ c: Character) throws {
263-
// Unicode scalar matches the specific scalars that comprise a character
289+
mutating func emitCharacter(_ c: Character) {
290+
// Unicode scalar mode matches the specific scalars that comprise a character
264291
if options.semanticLevel == .unicodeScalar {
265292
for scalar in c.unicodeScalars {
266-
try emitScalar(scalar)
293+
emitMatchScalar(scalar)
267294
}
268295
return
269296
}
270297

271298
if options.isCaseInsensitive && c.isCased {
272-
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
273-
builder.buildConsume { input, bounds in
274-
let inputChar = input[bounds.lowerBound].lowercased()
275-
let matchChar = c.lowercased()
276-
return inputChar == matchChar
277-
? input.index(after: bounds.lowerBound)
278-
: nil
299+
if optimizationsEnabled && c.isASCII {
300+
// c.isCased ensures that c is not CR-LF,
301+
// so we know that c is a single scalar
302+
assert(c.unicodeScalars.count == 1)
303+
builder.buildMatchScalarCaseInsensitive(
304+
c.unicodeScalars.last!,
305+
boundaryCheck: true)
306+
} else {
307+
builder.buildMatch(c, isCaseInsensitive: true)
279308
}
280-
} else {
281-
builder.buildMatch(c)
309+
return
282310
}
311+
312+
if optimizationsEnabled && c.isASCII {
313+
let lastIdx = c.unicodeScalars.indices.last!
314+
for idx in c.unicodeScalars.indices {
315+
builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx)
316+
}
317+
return
318+
}
319+
320+
builder.buildMatch(c, isCaseInsensitive: false)
283321
}
284322

285323
mutating func emitAny() {
@@ -717,11 +755,12 @@ fileprivate extension Compiler.ByteCodeGen {
717755
_ ccc: DSLTree.CustomCharacterClass
718756
) throws {
719757
if let asciiBitset = ccc.asAsciiBitset(options),
720-
options.semanticLevel == .graphemeCluster,
721758
optimizationsEnabled {
722-
// future work: add a bit to .matchBitset to consume either a character
723-
// or a scalar so we can have this optimization in scalar mode
724-
builder.buildMatchAsciiBitset(asciiBitset)
759+
if options.semanticLevel == .unicodeScalar {
760+
builder.buildScalarMatchAsciiBitset(asciiBitset)
761+
} else {
762+
builder.buildMatchAsciiBitset(asciiBitset)
763+
}
725764
} else {
726765
let consumer = try ccc.generateConsumer(options)
727766
builder.buildConsume(by: consumer)
@@ -798,45 +837,7 @@ fileprivate extension Compiler.ByteCodeGen {
798837
try emitAtom(a)
799838

800839
case let .quotedLiteral(s):
801-
if options.semanticLevel == .graphemeCluster {
802-
if options.isCaseInsensitive {
803-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
804-
builder.buildConsume { input, bounds in
805-
var iterator = s.makeIterator()
806-
var currentIndex = bounds.lowerBound
807-
while let ch = iterator.next() {
808-
guard currentIndex < bounds.upperBound,
809-
ch.lowercased() == input[currentIndex].lowercased()
810-
else { return nil }
811-
input.formIndex(after: &currentIndex)
812-
}
813-
return currentIndex
814-
}
815-
} else {
816-
builder.buildMatchSequence(s)
817-
}
818-
} else {
819-
builder.buildConsume {
820-
[caseInsensitive = options.isCaseInsensitive] input, bounds in
821-
// TODO: Case folding
822-
var iterator = s.unicodeScalars.makeIterator()
823-
var currentIndex = bounds.lowerBound
824-
while let scalar = iterator.next() {
825-
guard currentIndex < bounds.upperBound else { return nil }
826-
if caseInsensitive {
827-
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
828-
return nil
829-
}
830-
} else {
831-
if scalar != input.unicodeScalars[currentIndex] {
832-
return nil
833-
}
834-
}
835-
input.unicodeScalars.formIndex(after: &currentIndex)
836-
}
837-
return currentIndex
838-
}
839-
}
840+
emitQuotedLiteral(s)
840841

841842
case let .convertedRegexLiteral(n, _):
842843
return try emitNode(n)

Sources/_StringProcessing/ConsumerInterface.swift

+62-41
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111

1212
@_implementationOnly import _RegexParser
1313

14+
extension Character {
15+
var _singleScalarAsciiValue: UInt8? {
16+
guard self != "\r\n" else { return nil }
17+
return asciiValue
18+
}
19+
}
20+
1421
extension DSLTree.Node {
1522
/// Attempt to generate a consumer from this AST node
1623
///
@@ -53,11 +60,50 @@ extension DSLTree._AST.Atom {
5360
}
5461
}
5562

63+
extension Character {
64+
func generateConsumer(
65+
_ opts: MatchingOptions
66+
) throws -> MEProgram.ConsumeFunction? {
67+
let isCaseInsensitive = opts.isCaseInsensitive
68+
switch opts.semanticLevel {
69+
case .graphemeCluster:
70+
return { input, bounds in
71+
let low = bounds.lowerBound
72+
if isCaseInsensitive && isCased {
73+
return input[low].lowercased() == lowercased()
74+
? input.index(after: low)
75+
: nil
76+
} else {
77+
return input[low] == self
78+
? input.index(after: low)
79+
: nil
80+
}
81+
}
82+
case .unicodeScalar:
83+
// TODO: This should only be reachable from character class emission, can
84+
// we guarantee that? Otherwise we'd want a different matching behavior.
85+
let consumers = unicodeScalars.map { s in consumeScalar {
86+
isCaseInsensitive
87+
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
88+
: $0 == s
89+
}}
90+
return { input, bounds in
91+
for fn in consumers {
92+
if let idx = fn(input, bounds) {
93+
return idx
94+
}
95+
}
96+
return nil
97+
}
98+
}
99+
}
100+
}
101+
56102
extension DSLTree.Atom {
57103
var singleScalarASCIIValue: UInt8? {
58104
switch self {
59-
case let .char(c) where c != "\r\n":
60-
return c.asciiValue
105+
case let .char(c):
106+
return c._singleScalarAsciiValue
61107
case let .scalar(s) where s.isASCII:
62108
return UInt8(ascii: s)
63109
case let .unconverted(atom):
@@ -72,44 +118,15 @@ extension DSLTree.Atom {
72118
func generateConsumer(
73119
_ opts: MatchingOptions
74120
) throws -> MEProgram.ConsumeFunction? {
75-
let isCaseInsensitive = opts.isCaseInsensitive
76-
77121
switch self {
78122
case let .char(c):
79-
if opts.semanticLevel == .graphemeCluster {
80-
return { input, bounds in
81-
let low = bounds.lowerBound
82-
if isCaseInsensitive && c.isCased {
83-
return input[low].lowercased() == c.lowercased()
84-
? input.index(after: low)
85-
: nil
86-
} else {
87-
return input[low] == c
88-
? input.index(after: low)
89-
: nil
90-
}
91-
}
92-
} else {
93-
let consumers = c.unicodeScalars.map { s in consumeScalar {
94-
isCaseInsensitive
95-
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
96-
: $0 == s
97-
}}
98-
return { input, bounds in
99-
for fn in consumers {
100-
if let idx = fn(input, bounds) {
101-
return idx
102-
}
103-
}
104-
return nil
105-
}
106-
}
123+
return try c.generateConsumer(opts)
124+
107125
case let .scalar(s):
108-
return consumeScalar {
109-
isCaseInsensitive
110-
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
111-
: $0 == s
112-
}
126+
// A scalar always matches the same as a single scalar character. This
127+
// means it must match a whole grapheme in grapheme semantic mode, but
128+
// can match a single scalar in scalar semantic mode.
129+
return try Character(s).generateConsumer(opts)
113130

114131
case .any:
115132
// FIXME: Should this be a total ordering?
@@ -211,16 +228,20 @@ extension AST.Atom {
211228
var singleScalar: UnicodeScalar? {
212229
switch kind {
213230
case .scalar(let s): return s.value
231+
case .escaped(let e):
232+
guard let s = e.scalarValue else { return nil }
233+
return s
214234
default: return nil
215235
}
216236
}
217237

218238
var singleScalarASCIIValue: UInt8? {
239+
if let s = singleScalar, s.isASCII {
240+
return UInt8(ascii: s)
241+
}
219242
switch kind {
220-
case let .char(c) where c != "\r\n":
221-
return c.asciiValue
222-
case let .scalar(s) where s.value.isASCII:
223-
return UInt8(ascii: s.value)
243+
case let .char(c):
244+
return c._singleScalarAsciiValue
224245
default:
225246
return nil
226247
}

Sources/_StringProcessing/Engine/InstPayload.swift

+30-8
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,26 @@ extension Instruction.Payload {
147147
var string: StringRegister {
148148
interpret()
149149
}
150+
151+
init(scalar: Unicode.Scalar) {
152+
self.init(UInt64(scalar.value))
153+
}
154+
var scalar: Unicode.Scalar {
155+
return Unicode.Scalar(_value: UInt32(self.rawValue))
156+
}
157+
158+
init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) {
159+
let raw = UInt64(scalar.value)
160+
+ (caseInsensitive ? 1 << 55: 0)
161+
+ (boundaryCheck ? 1 << 54 : 0)
162+
self.init(raw)
163+
}
164+
var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) {
165+
let caseInsensitive = (self.rawValue >> 55) & 1 == 1
166+
let boundaryCheck = (self.rawValue >> 54) & 1 == 1
167+
let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF))
168+
return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck)
169+
}
150170

151171
init(sequence: SequenceRegister) {
152172
self.init(sequence)
@@ -190,18 +210,20 @@ extension Instruction.Payload {
190210
interpret()
191211
}
192212

193-
init(element: ElementRegister) {
194-
self.init(element)
213+
init(element: ElementRegister, isCaseInsensitive: Bool) {
214+
self.init(isCaseInsensitive ? 1 : 0, element)
195215
}
196-
var element: ElementRegister {
197-
interpret()
216+
var elementPayload: (isCaseInsensitive: Bool, ElementRegister) {
217+
let pair: (UInt64, ElementRegister) = interpretPair()
218+
return (isCaseInsensitive: pair.0 == 1, pair.1)
198219
}
199220

200-
init(bitset: AsciiBitsetRegister) {
201-
self.init(bitset)
221+
init(bitset: AsciiBitsetRegister, isScalar: Bool) {
222+
self.init(isScalar ? 1 : 0, bitset)
202223
}
203-
var bitset: AsciiBitsetRegister {
204-
interpret()
224+
var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) {
225+
let pair: (UInt64, AsciiBitsetRegister) = interpretPair()
226+
return (isScalar: pair.0 == 1, pair.1)
205227
}
206228

207229
init(consumer: ConsumeFunctionRegister) {

0 commit comments

Comments
 (0)