Skip to content

Commit 6bca986

Browse files
committed
Handle unprocessable whitespace-related unicode characters
1 parent 2cb34ec commit 6bca986

File tree

3 files changed

+110
-12
lines changed

3 files changed

+110
-12
lines changed

Sources/SwiftFormat/PrettyPrint/WhitespaceFindingCategory.swift

+4
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,9 @@ enum WhitespaceFindingCategory: FindingCategorizing {
3333
/// Findings related to the length of a line.
3434
case lineLength
3535

36+
/// Findings related to the presence of disallowed or unexpected Unicode whitespace characters.
37+
case unexpectedUnicode
38+
3639
var description: String {
3740
switch self {
3841
case .trailingWhitespace: return "TrailingWhitespace"
@@ -42,6 +45,7 @@ enum WhitespaceFindingCategory: FindingCategorizing {
4245
case .removeLine: return "RemoveLine"
4346
case .addLines: return "AddLines"
4447
case .lineLength: return "LineLength"
48+
case .unexpectedUnicode: return "UnexpectedUnicode"
4549
}
4650
}
4751
}

Sources/SwiftFormat/PrettyPrint/WhitespaceLinter.swift

+81-12
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,15 @@ public class WhitespaceLinter {
113113
let userRun = userRunsIterator.next()!
114114
let formattedRun = formattedRunsIterator.next()!
115115

116+
// Print a diagnostic for unexpected Unicode characters with the highest priority.
117+
let unicodeExceptionErrors = checkForUnicodeExceptionErrors(userIndex: userIndex, userRun: userRun)
118+
guard unicodeExceptionErrors.isEmpty else {
119+
unicodeExceptionErrors.forEach { exception, offset in
120+
diagnose(.removeUnexpectedUnicode(exception), category: .unexpectedUnicode, utf8Offset: offset)
121+
}
122+
return
123+
}
124+
116125
// If there was only a single whitespace run in each input, then that means there weren't any
117126
// newlines. Therefore, we're looking at inter-token spacing, unless the whitespace runs
118127
// preceded the first token in the file (i.e., offset == 0), in which case we ignore it here
@@ -126,7 +135,13 @@ public class WhitespaceLinter {
126135

127136
while let userRun = userRunsIterator.next() {
128137
let possibleFormattedRun = formattedRunsIterator.next()
129-
138+
let unicodeExceptionErrors = checkForUnicodeExceptionErrors(userIndex: userIndex, userRun: userRun)
139+
guard unicodeExceptionErrors.isEmpty else {
140+
unicodeExceptionErrors.forEach { exception, offset in
141+
diagnose(.removeUnexpectedUnicode(exception), category: .unexpectedUnicode, utf8Offset: offset)
142+
}
143+
continue
144+
}
130145
if runIndex < excessUserLines {
131146
// If there were excess newlines in the user input, tell the user to remove them. This
132147
// short-circuits the trailing whitespace check below; we don't bother telling the user
@@ -323,6 +338,34 @@ public class WhitespaceLinter {
323338
}
324339
}
325340

341+
/// Scans a whitespace run for byte sequences that encode a `UnicodeException` and reports every
/// occurrence together with its position.
///
/// The run is walked from front to back. Whenever the bytes at the current position begin with
/// the UTF-8 encoding of one of the `UnicodeException` cases, that case is recorded and the scan
/// jumps past the whole encoded sequence; otherwise it advances by a single code unit.
///
/// - Parameters:
///   - userIndex: The current character offset within the user text. Each reported offset is
///     this value plus the position of the match inside `userRun`.
///   - userRun: A run of whitespace from the user text.
/// - Returns: One `(exception, offset)` pair per match, in the order the matches appear in the
///   run. The array is empty when the run contains none of the exception sequences.
private func checkForUnicodeExceptionErrors(
  userIndex: Int,
  userRun: ArraySlice<UTF8.CodeUnit>
) -> [(exception: UnicodeException, offset: Int)] {
  var found: [(exception: UnicodeException, offset: Int)] = []
  // `userRun` is a slice, so its indices do not necessarily start at zero; walk real indices and
  // convert back to a run-relative distance only when recording a match.
  var cursor = userRun.startIndex
  while cursor < userRun.endIndex {
    let remaining = userRun[cursor...]
    guard
      let hit = UnicodeException.allCases.first(where: { remaining.starts(with: $0.utf8Bytes) })
    else {
      cursor += 1
      continue
    }
    found.append((exception: hit, offset: userIndex + (cursor - userRun.startIndex)))
    // Skip the entire encoded character so its continuation bytes are not re-examined.
    cursor += hit.utf8Bytes.count
  }
  return found
}
368+
326369
/// Find the next non-whitespace character in a given string, and any leading whitespace before
327370
/// the character.
328371
///
@@ -339,20 +382,26 @@ public class WhitespaceLinter {
339382
startingAt offset: Int,
340383
in data: [UTF8.CodeUnit]
341384
) -> ArraySlice<UTF8.CodeUnit> {
342-
func isWhitespace(_ char: UTF8.CodeUnit) -> Bool {
343-
switch char {
344-
case UInt8(ascii: " "), UInt8(ascii: "\n"), UInt8(ascii: "\t"), UInt8(ascii: "\r"), /*VT*/ 0x0B, /*FF*/ 0x0C:
345-
return true
385+
var currentIndex = offset
386+
while currentIndex < data.count {
387+
if let unicodeException = UnicodeException.allCases.first(where: { exception in
388+
let bytes = exception.utf8Bytes
389+
return currentIndex + bytes.count <= data.count
390+
&& data[currentIndex..<currentIndex + bytes.count].elementsEqual(bytes)
391+
}) {
392+
currentIndex += unicodeException.utf8Bytes.count
393+
continue
394+
}
395+
396+
switch data[currentIndex] {
397+
case UInt8(ascii: " "), UInt8(ascii: "\n"), UInt8(ascii: "\t"), UInt8(ascii: "\r"),
398+
/*VT*/ 0x0B, /*FF*/ 0x0C:
399+
currentIndex += 1
346400
default:
347-
return false
401+
return data[offset..<currentIndex]
348402
}
349403
}
350-
guard
351-
let whitespaceEnd = data[offset...].firstIndex(where: { !isWhitespace($0) })
352-
else {
353-
return data[offset..<data.endIndex]
354-
}
355-
return data[offset..<whitespaceEnd]
404+
return data[offset..<currentIndex]
356405
}
357406

358407
/// Returns the code unit at the given index, or nil if the index is the end of the data.
@@ -412,6 +461,22 @@ public class WhitespaceLinter {
412461
}
413462
}
414463

464+
/// Unexpected Unicode characters that cannot be processed normally.
///
/// The case names double as the textual form of the character: interpolating a case into a
/// string yields its name (for example `u2028`). The declaration order is the order in which
/// `allCases` enumerates them.
private enum UnicodeException: CaseIterable {
  /// U+2028 LINE SEPARATOR
  case u2028
  /// U+2029 PARAGRAPH SEPARATOR
  case u2029

  /// The UTF-8 byte sequence that encodes this character.
  var utf8Bytes: [UTF8.CodeUnit] {
    // Both characters encode to three bytes that share the prefix 0xE2 0x80 and differ only in
    // the final byte.
    let finalByte: UTF8.CodeUnit
    switch self {
    case .u2028: finalByte = 0xA8
    case .u2029: finalByte = 0xA9
    }
    return [0xE2, 0x80, finalByte]
  }
}
479+
415480
/// Describes the composition of the whitespace that creates an indentation for a line of code.
416481
public enum WhitespaceIndentation: Equatable {
417482
/// The line has no preceding whitespace, meaning there's no indentation.
@@ -513,4 +578,8 @@ extension Finding.Message {
513578
}
514579

515580
fileprivate static let lineLengthError: Finding.Message = "line is too long"
581+
582+
/// Builds the finding message asking the user to remove an unexpected Unicode character.
///
/// - Parameter unicode: The offending character. It is interpolated into the message directly
///   after a literal backslash, so `.u2028` produces text ending in `\u2028`.
/// - Returns: The message to attach to the diagnostic.
fileprivate static func removeUnexpectedUnicode(_ unicode: UnicodeException) -> Finding.Message {
  // `\\` is an escaped backslash that appears verbatim in the output; the following `\(unicode)`
  // is the interpolation of the enum case.
  let message: Finding.Message = "remove unexpected unicode character \\\(unicode)"
  return message
}
516585
}

Tests/SwiftFormatTests/PrettyPrint/WhitespaceLintTests.swift

+25
Original file line number | Diff line number | Diff line change
@@ -255,4 +255,29 @@ final class WhitespaceLintTests: WhitespaceTestCase {
255255
]
256256
)
257257
}
258+
259+
/// Lints comment lines whose text contains `\u{2028}` and `\u{2029}` escapes and expects one
/// "remove unexpected unicode character" finding at each numbered marker (1️⃣ through 6️⃣).
///
/// The input places the escapes at the end of a line, between two words, several in a row, and
/// after a trailing space. The message for each finding names the character at that marker as
/// `\u2028` or `\u2029`.
func testUnexpectedUnicodeCharacters() {
260+
assertWhitespaceLint(
261+
input: """
262+
// Hello World1️⃣\u{2028}
263+
// Hello2️⃣\u{2028}World
264+
// Hello World3️⃣\u{2028}4️⃣\u{2029}5️⃣\u{2029}
265+
// Hello World 6️⃣\u{2028}
266+
""",
267+
expected: """
268+
// Hello World
269+
// Hello World
270+
// Hello World
271+
// Hello World
272+
""",
273+
findings: [
274+
FindingSpec("1️⃣", message: "remove unexpected unicode character \\u2028"),
275+
FindingSpec("2️⃣", message: "remove unexpected unicode character \\u2028"),
276+
FindingSpec("3️⃣", message: "remove unexpected unicode character \\u2028"),
277+
FindingSpec("4️⃣", message: "remove unexpected unicode character \\u2029"),
278+
FindingSpec("5️⃣", message: "remove unexpected unicode character \\u2029"),
279+
FindingSpec("6️⃣", message: "remove unexpected unicode character \\u2028"),
280+
]
281+
)
282+
}
258283
}

0 commit comments

Comments
 (0)