Skip to content

Commit b6fd6f5

Browse files
weissi authored and milseman committed
integrating utf8 validation
1 parent d46ebf8 commit b6fd6f5

File tree

5 files changed

+184
-31
lines changed

5 files changed

+184
-31
lines changed

stdlib/public/core/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ set(SWIFTLIB_ESSENTIAL
148148
StringUnicodeScalarView.swift
149149
StringUTF16View.swift
150150
StringUTF8View.swift
151+
StringUTF8Validation.swift
151152
StringVariant.swift
152153
Substring.swift
153154
SwiftNativeNSArray.swift

stdlib/public/core/GroupInfo.json

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"StringTesting.swift",
3737
"StringUTF16View.swift",
3838
"StringUTF8View.swift",
39+
"StringUTF8Validation.swift",
3940
"StringUnicodeScalarView.swift",
4041
"StringVariant.swift",
4142
"Substring.swift",

stdlib/public/core/StringCreate.swift

+17-30
Original file line numberDiff line numberDiff line change
@@ -46,54 +46,41 @@ extension String {
4646
internal static func _tryFromUTF8(
4747
_ input: UnsafeBufferPointer<UInt8>
4848
) -> String? {
49-
// TODO(UTF8 perf): More efficient validation
50-
51-
// TODO(UTF8 perf): Skip intermediary array
52-
var contents: [UInt8] = []
53-
contents.reserveCapacity(input.count)
54-
let repaired = transcode(
55-
input.makeIterator(),
56-
from: UTF8.self,
57-
to: UTF8.self,
58-
stoppingOnError: true,
59-
into: { contents.append($0) })
60-
guard !repaired else { return nil }
49+
guard case .success(let extraInfo) = validateUTF8(input) else {
50+
return nil
51+
}
6152

62-
return contents.withUnsafeBufferPointer { String._uncheckedFromUTF8($0) }
53+
return String._uncheckedFromUTF8(input, isASCII: extraInfo.isASCII)
6354
}
6455

6556
@usableFromInline
6657
internal static func _fromUTF8Repairing(
6758
_ input: UnsafeBufferPointer<UInt8>
6859
) -> (result: String, repairsMade: Bool) {
69-
if _allASCII(input) {
70-
return (String._uncheckedFromUTF8(input, asciiPreScanResult: true), false)
60+
switch validateUTF8(input) {
61+
case .success(let extraInfo):
62+
return (String._uncheckedFromUTF8(input, asciiPreScanResult: extraInfo.isASCII), false)
63+
case .error(let initialRange):
64+
return (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
7165
}
72-
73-
// TODO(UTF8 perf): More efficient validation
74-
75-
// TODO(UTF8 perf): Skip intermediary array
76-
var contents: [UInt8] = []
77-
contents.reserveCapacity(input.count)
78-
let repaired = transcode(
79-
input.makeIterator(),
80-
from: UTF8.self,
81-
to: UTF8.self,
82-
stoppingOnError: false,
83-
into: { contents.append($0) })
84-
let str = contents.withUnsafeBufferPointer { String._uncheckedFromUTF8($0) }
85-
return (str, repaired)
8666
}
8767

8868
@usableFromInline
8969
internal static func _uncheckedFromUTF8(
9070
_ input: UnsafeBufferPointer<UInt8>
71+
) -> String {
72+
return _uncheckedFromUTF8(input, isASCII: _allASCII(input))
73+
}
74+
75+
@usableFromInline
76+
internal static func _uncheckedFromUTF8(
77+
_ input: UnsafeBufferPointer<UInt8>,
78+
isASCII: Bool
9179
) -> String {
9280
if let smol = _SmallString(input) {
9381
return String(_StringGuts(smol))
9482
}
9583

96-
let isASCII = _allASCII(input)
9784
let storage = _StringStorage.create(
9885
initializingFrom: input, isASCII: isASCII)
9986
return storage.asString

stdlib/public/core/StringGutsRangeReplaceable.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ extension _StringGuts {
167167
_foreignAppendInPlace(slicedOther)
168168
}
169169

170-
private mutating func appendInPlace(
170+
internal mutating func appendInPlace(
171171
_ other: UnsafeBufferPointer<UInt8>, isASCII: Bool
172172
) {
173173
self._object.nativeStorage.appendInPlace(other, isASCII: isASCII)
stdlib/public/core/StringUTF8Validation.swift

+164
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
/// Reports whether `x` lies in 0xC2...0xF4, the set of bytes that `validateUTF8`
/// accepts as the first byte of a multi-byte sequence.
private func _isUTF8MultiByteLeading(_ x: UInt8) -> Bool {
2+
return (0xC2...0xF4).contains(x)
3+
}
4+
5+
/// Reports whether `x` lies in 0x90...0xBF. `validateUTF8` applies this to the
/// byte that directly follows a 0xF0 leading byte.
private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
6+
return (0x90...0xBF).contains(x)
7+
}
8+
9+
/// Reports whether `x` satisfies `_isContinuation` and is at most 0x8F.
/// `validateUTF8` applies this to the byte that directly follows a 0xF4 leading byte.
// NOTE(review): despite the name, this upper bound presumably rejects values above
// U+10FFFF rather than overlong forms — confirm against the Unicode well-formed
// UTF-8 byte sequence table.
private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
10+
return _isContinuation(x) && x <= 0x8F
11+
}
12+
13+
/// Reports whether `x` lies in 0xA0...0xBF. `validateUTF8` applies this to the
/// byte that directly follows a 0xE0 leading byte.
private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
14+
return (0xA0...0xBF).contains(x)
15+
}
16+
17+
/// Reports whether `x` satisfies `_isContinuation` and is at most 0x9F.
/// `validateUTF8` applies this to the byte that directly follows a 0xED leading byte.
// NOTE(review): despite the name, this upper bound presumably rejects encoded
// surrogate code points rather than overlong forms — confirm against the Unicode
// well-formed UTF-8 byte sequence table.
private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
18+
return _isContinuation(x) && x <= 0x9F
19+
}
20+
21+
/// Reports whether `x` is at most 0x7F.
// NOTE(review): nothing in the visible part of this file calls this helper;
// `validateUTF8` uses `_isASCII` instead — confirm whether it is still needed.
private func _isASCII_cmp(_ x: UInt8) -> Bool {
22+
return x <= 0x7F
23+
}
24+
25+
/// Additional facts that `validateUTF8` reports alongside a successful validation.
internal struct UTF8ExtraInfo: Equatable {
26+
// As filled in by `validateUTF8`: true only if every byte it examined passed `_isASCII`.
// NOTE(review): `public` on a member of an `internal` struct — confirm the wider
// modifier is intentional.
public var isASCII: Bool
27+
}
28+
29+
/// Outcome of `validateUTF8`.
internal enum UTF8ValidationResult {
30+
// The whole buffer validated; the payload carries the `isASCII` flag gathered on the way.
case success(UTF8ExtraInfo)
31+
// Validation stopped at an invalid sequence; the payload is the index range that
// `repairUTF8` skips and replaces with a single replacement character.
case error(toBeReplaced: Range<Int>)
32+
}
33+
34+
// Declares `Equatable` conformance for the validation result; no `==` is written by hand here.
extension UTF8ValidationResult: Equatable {}
35+
36+
// Payload-free error that `validateUTF8` throws from its local helpers and catches
// itself to unwind out of the scanning loop.
private struct UTF8ValidationError: Error {}
37+
38+
/// Scans `buf` front to back and checks that it consists only of byte sequences this
/// function accepts as UTF-8.
///
/// - Parameter buf: The bytes to validate.
/// - Returns: `.success` carrying whether every byte passed `_isASCII`, or `.error`
///   carrying the index range (in `buf`'s indices) of the first rejected sequence.
internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
39+
var iter = buf.makeIterator()
40+
// Index of the first byte not yet accepted. It only advances after a complete
// sequence has passed all checks, so on failure it marks where the bad sequence starts.
var lastValidIndex = buf.startIndex
41+
42+
// Consumes the next byte from `iter`; throws if the input is exhausted or the byte fails `f`.
@inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws {
43+
guard let cu = iter.next() else { throw UTF8ValidationError() }
44+
guard f(cu) else { throw UTF8ValidationError() }
45+
}
46+
// Same as `guaranteeIn`, with `_isContinuation` as the predicate.
@inline(__always) func guaranteeContinuation() throws {
47+
try guaranteeIn(_isContinuation)
48+
}
49+
50+
// Given the slice that starts at the rejected sequence, returns the range to replace:
// the first byte plus every directly following byte that is neither ASCII nor a
// multi-byte leading byte.
func findInvalidRange(_ buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
51+
var endIndex = buf.startIndex
52+
var iter = buf.makeIterator()
53+
// The first byte is always part of the range, so skip it without inspecting it.
_ = iter.next()
54+
while let cu = iter.next(), !_isASCII(cu) && !_isUTF8MultiByteLeading(cu) {
55+
endIndex += 1
56+
}
57+
// `endIndex` is the last index included, hence the closed range converted to `Range`.
let illegalRange = Range(buf.startIndex...endIndex)
58+
_sanityCheck(illegalRange.clamped(to: (buf.startIndex..<buf.endIndex)) == illegalRange,
59+
"illegal range out of full range")
60+
return illegalRange
61+
}
62+
63+
do {
64+
var isASCII = true
65+
while let cu = iter.next() {
66+
if _isASCII(cu) { lastValidIndex &+= 1; continue }
67+
isASCII = false
68+
// A non-ASCII byte outside 0xC2...0xF4 cannot start a sequence.
if _slowPath(!_isUTF8MultiByteLeading(cu)) {
69+
throw UTF8ValidationError()
70+
}
71+
// Each case checks the trailing bytes required for its leading byte, then
// advances `lastValidIndex` by the full sequence length.
switch cu {
72+
case 0xC2...0xDF:
73+
try guaranteeContinuation()
74+
lastValidIndex &+= 2
75+
case 0xE0:
76+
try guaranteeIn(_isNotOverlong_E0)
77+
try guaranteeContinuation()
78+
lastValidIndex &+= 3
79+
case 0xE1...0xEC:
80+
try guaranteeContinuation()
81+
try guaranteeContinuation()
82+
lastValidIndex &+= 3
83+
case 0xED:
84+
try guaranteeIn(_isNotOverlong_ED)
85+
try guaranteeContinuation()
86+
lastValidIndex &+= 3
87+
case 0xEE...0xEF:
88+
try guaranteeContinuation()
89+
try guaranteeContinuation()
90+
lastValidIndex &+= 3
91+
case 0xF0:
92+
try guaranteeIn(_isNotOverlong_F0)
93+
try guaranteeContinuation()
94+
try guaranteeContinuation()
95+
lastValidIndex &+= 4
96+
case 0xF1...0xF3:
97+
try guaranteeContinuation()
98+
try guaranteeContinuation()
99+
try guaranteeContinuation()
100+
lastValidIndex &+= 4
101+
case 0xF4:
102+
try guaranteeIn(_isNotOverlong_F4)
103+
try guaranteeContinuation()
104+
try guaranteeContinuation()
105+
lastValidIndex &+= 4
106+
default:
107+
// The leading-byte check above limits `cu` to 0xC2...0xF4, which the cases cover.
Builtin.unreachable()
108+
}
109+
}
110+
return .success(UTF8ExtraInfo(isASCII: isASCII))
111+
} catch {
112+
// Any thrown error means the sequence starting at `lastValidIndex` was rejected.
return .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
113+
}
114+
}
115+
116+
/// Builds a `String` from `input`, appending the UTF-8 code units of
/// `Unicode.Scalar._replacementCharacter` in place of each byte range that
/// `validateUTF8` reports as invalid.
///
/// - Parameters:
///   - input: The bytes to repair; sanity-checked to be non-empty.
///   - firstKnownBrokenRange: The first invalid range; sanity-checked to lie within
///     `input.indices`. `_fromUTF8Repairing` passes the range from `validateUTF8(input)`.
/// - Returns: The repaired string.
internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRange: Range<Int>) -> String {
117+
_sanityCheck(input.count > 0, "empty input doesn't need to be repaired")
118+
_sanityCheck(firstKnownBrokenRange.clamped(to: input.indices) == firstKnownBrokenRange)
119+
// During this process, `remainingInput` contains the remaining bytes to process. It's split into three
120+
// non-overlapping sub-regions:
121+
//
122+
// 1. `goodChunk` (may be empty) containing bytes that are known good UTF-8 and can be copied into the output String
123+
// 2. `brokenRange` (never empty) the next range of broken bytes,
124+
// 3. the remainder (implicit, will become the next `remainingInput`)
125+
//
126+
// At the beginning of the process, the `goodChunk` starts at the beginning and extends to just before the first
127+
// known broken byte. The known broken bytes are covered in the `brokenRange` and everything following that is
128+
// the remainder.
129+
// We then copy the `goodChunk` into the target buffer and append a UTF8 replacement character. `brokenRange` is
130+
// skipped (replaced by the replacement character) and we restart the same process. This time, `goodChunk` extends
131+
// from the byte after the previous `brokenRange` to the next `brokenRange`.
132+
var result = _StringGuts()
133+
let replacementCharacterCount = Unicode.Scalar._replacementCharacter.withUTF8CodeUnits { $0.count }
134+
result.reserveCapacity(input.count + 5 * replacementCharacterCount) // extra space for some replacement characters
135+
136+
var brokenRange: Range<Int> = firstKnownBrokenRange
137+
var remainingInput = input
138+
repeat {
139+
_sanityCheck(brokenRange.count > 0, "broken range empty")
140+
_sanityCheck(remainingInput.count > 0, "empty remaining input doesn't need to be repaired")
141+
let goodChunk = remainingInput[..<brokenRange.startIndex]
142+
143+
// very likely this capacity reservation does not actually do anything because we reserved space for the entire
144+
// input plus up to five replacement characters up front
145+
result.reserveCapacity(result.count + remainingInput.count + replacementCharacterCount)
146+
147+
// we can now safely append the next known good bytes and a replacement character
148+
result.appendInPlace(UnsafeBufferPointer(rebasing: goodChunk),
149+
isASCII: false /* appending replacement character anyway, so let's not bother */)
150+
Unicode.Scalar._replacementCharacter.withUTF8CodeUnits {
151+
result.appendInPlace($0, isASCII: false)
152+
}
153+
154+
// Drop everything up to and including the broken range. `validateUTF8` then runs on
// the rebased buffer, so any range it reports is expressed in that buffer's indices.
remainingInput = UnsafeBufferPointer(rebasing: remainingInput[brokenRange.endIndex...])
155+
switch validateUTF8(remainingInput) {
156+
case .success:
157+
// Nothing else is broken: append the tail as-is and finish.
result.appendInPlace(remainingInput, isASCII: false)
158+
return String(result)
159+
case .error(let newBrokenRange):
160+
brokenRange = newBrokenRange
161+
}
162+
} while remainingInput.count > 0
163+
// Exit taken when the loop condition ends the loop; the `.success` branch above is the other exit.
return String(result)
164+
}

0 commit comments

Comments
 (0)