@@ -113,6 +113,15 @@ public class WhitespaceLinter {
113
113
let userRun = userRunsIterator. next ( ) !
114
114
let formattedRun = formattedRunsIterator. next ( ) !
115
115
116
+ // Print a diagnostic for unexpected Unicode characters with the highest priority.
117
+ let unicodeExceptionErrors = checkForUnicodeExceptionErrors ( userIndex: userIndex, userRun: userRun)
118
+ guard unicodeExceptionErrors. isEmpty else {
119
+ unicodeExceptionErrors. forEach { exception, offset in
120
+ diagnose ( . removeUnexpectedUnicode( exception) , category: . unexpectedUnicode, utf8Offset: offset)
121
+ }
122
+ return
123
+ }
124
+
116
125
// If there was only a single whitespace run in each input, then that means there weren't any
117
126
// newlines. Therefore, we're looking at inter-token spacing, unless the whitespace runs
118
127
// preceded the first token in the file (i.e., offset == 0), in which case we ignore it here
@@ -126,7 +135,13 @@ public class WhitespaceLinter {
126
135
127
136
while let userRun = userRunsIterator. next ( ) {
128
137
let possibleFormattedRun = formattedRunsIterator. next ( )
129
-
138
+ let unicodeExceptionErrors = checkForUnicodeExceptionErrors ( userIndex: userIndex, userRun: userRun)
139
+ guard unicodeExceptionErrors. isEmpty else {
140
+ unicodeExceptionErrors. forEach { exception, offset in
141
+ diagnose ( . removeUnexpectedUnicode( exception) , category: . unexpectedUnicode, utf8Offset: offset)
142
+ }
143
+ continue
144
+ }
130
145
if runIndex < excessUserLines {
131
146
// If there were excess newlines in the user input, tell the user to remove them. This
132
147
// short-circuits the trailing whitespace check below; we don't bother telling the user
@@ -323,6 +338,34 @@ public class WhitespaceLinter {
323
338
}
324
339
}
325
340
341
/// Scans a run of user whitespace for unexpected Unicode characters and reports every occurrence.
///
/// Each `UnicodeException` is matched by its UTF-8 byte sequence. When a match is found, the scan
/// resumes after the matched bytes, so reported occurrences never overlap.
///
/// - Parameters:
///   - userIndex: The UTF-8 offset within the user text at which `userRun` begins.
///   - userRun: A run of whitespace from the user text.
/// - Returns: Every matched `UnicodeException` paired with its UTF-8 offset in the user text
///   (`userIndex` plus the match's position within `userRun`), in order of appearance.
private func checkForUnicodeExceptionErrors(
  userIndex: Int,
  userRun: ArraySlice<UTF8.CodeUnit>
) -> [(exception: UnicodeException, offset: Int)] {
  // The byte patterns do not depend on the scan position, so build them once up front instead of
  // re-creating them for every byte of the run.
  let patterns = UnicodeException.allCases.map { (exception: $0, bytes: $0.utf8Bytes) }

  // Use the same labeled tuple type as the return type so no implicit conversion is required.
  var matches: [(exception: UnicodeException, offset: Int)] = []

  // `ArraySlice` shares indices with its base collection, so walk real indices and translate
  // back to a run-relative distance only when reporting.
  var position = userRun.startIndex
  while position < userRun.endIndex {
    let remaining = userRun[position...]
    // `starts(with:)` is false when fewer bytes remain than the pattern needs, which covers the
    // bounds check.
    if let hit = patterns.first(where: { remaining.starts(with: $0.bytes) }) {
      let distanceIntoRun = position - userRun.startIndex
      matches.append((exception: hit.exception, offset: userIndex + distanceIntoRun))
      position += hit.bytes.count
    } else {
      position += 1
    }
  }
  return matches
}
368
+
326
369
/// Find the next non-whitespace character in a given string, and any leading whitespace before
327
370
/// the character.
328
371
///
@@ -339,20 +382,26 @@ public class WhitespaceLinter {
339
382
startingAt offset: Int ,
340
383
in data: [ UTF8 . CodeUnit ]
341
384
) -> ArraySlice < UTF8 . CodeUnit > {
342
- func isWhitespace( _ char: UTF8 . CodeUnit ) -> Bool {
343
- switch char {
344
- case UInt8 ( ascii: " " ) , UInt8 ( ascii: " \n " ) , UInt8 ( ascii: " \t " ) , UInt8 ( ascii: " \r " ) , /*VT*/ 0x0B , /*FF*/ 0x0C :
345
- return true
385
+ var currentIndex = offset
386
+ while currentIndex < data. count {
387
+ if let unicodeException = UnicodeException . allCases. first ( where: { exception in
388
+ let bytes = exception. utf8Bytes
389
+ return currentIndex + bytes. count <= data. count
390
+ && data [ currentIndex..< currentIndex + bytes. count] . elementsEqual ( bytes)
391
+ } ) {
392
+ currentIndex += unicodeException. utf8Bytes. count
393
+ continue
394
+ }
395
+
396
+ switch data [ currentIndex] {
397
+ case UInt8 ( ascii: " " ) , UInt8 ( ascii: " \n " ) , UInt8 ( ascii: " \t " ) , UInt8 ( ascii: " \r " ) ,
398
+ /*VT*/ 0x0B , /*FF*/ 0x0C :
399
+ currentIndex += 1
346
400
default :
347
- return false
401
+ return data [ offset ..< currentIndex ]
348
402
}
349
403
}
350
- guard
351
- let whitespaceEnd = data [ offset... ] . firstIndex ( where: { !isWhitespace( $0) } )
352
- else {
353
- return data [ offset..< data. endIndex]
354
- }
355
- return data [ offset..< whitespaceEnd]
404
+ return data [ offset..< currentIndex]
356
405
}
357
406
358
407
/// Returns the code unit at the given index, or nil if the index is the end of the data.
@@ -412,6 +461,22 @@ public class WhitespaceLinter {
412
461
}
413
462
}
414
463
464
/// Unicode characters that are not expected in source whitespace and need dedicated handling.
///
/// Each case is identified by the UTF-8 bytes that encode it.
private enum UnicodeException: CaseIterable {
  /// U+2028 LINE SEPARATOR
  case u2028

  /// U+2029 PARAGRAPH SEPARATOR
  case u2029

  /// The UTF-8 encoding of this character, as a sequence of code units.
  var utf8Bytes: [UTF8.CodeUnit] {
    switch self {
    case .u2028: return [0xE2, 0x80, 0xA8]
    case .u2029: return [0xE2, 0x80, 0xA9]
    }
  }
}
479
+
415
480
/// Describes the composition of the whitespace that creates an indentation for a line of code.
416
481
public enum WhitespaceIndentation : Equatable {
417
482
/// The line has no preceding whitespace, meaning there's no indentation.
@@ -513,4 +578,8 @@ extension Finding.Message {
513
578
}
514
579
515
580
fileprivate static let lineLengthError : Finding . Message = " line is too long "
581
+
582
/// Builds the finding message that asks the user to delete an unexpected Unicode character.
///
/// The message ends with a literal backslash followed by the interpolated `unicode` value.
///
/// - Parameter unicode: The unexpected character that was found.
/// - Returns: The message to attach to the finding.
fileprivate static func removeUnexpectedUnicode(_ unicode: UnicodeException) -> Finding.Message {
  "remove unexpected unicode character \\\(unicode)"
}
516
585
}
0 commit comments