Skip to content

Commit 8d888b5

Browse files
author
Dave Abrahams
authored
Merge pull request #9375 from apple/direct-transcoding
Direct transcoding
2 parents 4bfa3b2 + 776e0a6 commit 8d888b5

File tree

5 files changed

+253
-14
lines changed

5 files changed

+253
-14
lines changed

Diff for: stdlib/public/core/UTF16.swift

+43
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,49 @@ extension _Unicode.UTF16 : UnicodeEncoding {
4848
r |= (0xd800 + (x1 &>> 10 & 0x3ff))
4949
return EncodedScalar(_storage: r, _bitCount: 32)
5050
}
51+
52+
/// Converts `content`, a scalar encoded in `FromEncoding`, into this
/// encoding's `EncodedScalar`.
///
/// UTF-8 input is rewritten directly with shifts and masks, and UTF-16
/// input is returned bit-for-bit; any other source encoding is decoded to a
/// `UnicodeScalar` and re-encoded with `encode`.
///
/// - Parameters:
///   - content: The scalar in `FromEncoding`'s representation.
///   - from: The source encoding type (used only to fix `FromEncoding`).
/// - Returns: The scalar in this encoding; optional because `encode`
///   returns an optional.
@inline(__always)
53+
public static func transcode<FromEncoding : UnicodeEncoding>(
54+
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
55+
) -> EncodedScalar? {
56+
if _fastPath(FromEncoding.self == UTF8.self) {
57+
// The generic type is known to be UTF8 here, so reinterpret the bits.
let c = unsafeBitCast(content, to: UTF8.EncodedScalar.self)
58+
// `b` counts the bits of UTF-8 input not yet consumed; it drops by 8 per
// byte handled.
var b = c._bitCount
59+
b = b &- 8
60+
// One byte of input: keep its low 7 bits as a single 16-bit code unit.
if _fastPath(b == 0) {
61+
return EncodedScalar(
62+
_storage: c._storage & 0b0__111_1111, _bitCount: 16)
63+
}
64+
// `s` is shifted right one byte at a time so its low byte is the next
// input byte; `r` accumulates the scalar value, 6 payload bits per step.
var s = c._storage
65+
var r = s
66+
r &<<= 6
67+
s &>>= 8
68+
r |= s & 0b0__11_1111
69+
b = b &- 8
70+
71+
// Two bytes of input: the value is the low 11 bits of `r`.
if _fastPath(b == 0) {
72+
return EncodedScalar(_storage: r & 0b0__111_1111_1111, _bitCount: 16)
73+
}
74+
r &<<= 6
75+
s &>>= 8
76+
r |= s & 0b0__11_1111
77+
b = b &- 8
78+
79+
// Three bytes of input: the value is the low 16 bits of `r`.
if _fastPath(b == 0) {
80+
return EncodedScalar(_storage: r & 0xFFFF, _bitCount: 16)
81+
}
82+
83+
// Remaining case (presumably a four-byte sequence — confirm): fold in one
// more byte, keep the low 21 bits, and let `encode` build the result.
r &<<= 6
84+
s &>>= 8
85+
r |= s & 0b0__11_1111
86+
r &= (1 &<< 21) - 1
87+
return encode(UnicodeScalar(_unchecked: r))
88+
}
89+
else if _fastPath(FromEncoding.self == UTF16.self) {
90+
// Same encoding on both sides: hand the bits back unchanged.
return unsafeBitCast(content, to: UTF16.EncodedScalar.self)
91+
}
92+
// Generic route for every other source encoding.
return encode(FromEncoding.decode(content))
93+
}
5194

5295
public struct ForwardParser {
5396
public typealias _Buffer = _UIntBuffer<UInt32, UInt16>

Diff for: stdlib/public/core/UTF8.swift

+35-1
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,41 @@ extension _Unicode.UTF8 : UnicodeEncoding {
8484
_storage: o | c | 0b0__1000_0000__1000_0000__1000_0000__1111_0000,
8585
_bitCount: 32)
8686
}
87-
87+
88+
/// Converts `content`, a scalar encoded in `FromEncoding`, into this
/// encoding's `EncodedScalar`.
///
/// A UTF-16 scalar held in a single non-surrogate code unit is rewritten
/// directly with shifts and masks, and UTF-8 input is returned bit-for-bit;
/// everything else (including surrogate code units) is decoded to a
/// `UnicodeScalar` and re-encoded with `encode`.
///
/// - Parameters:
///   - content: The scalar in `FromEncoding`'s representation.
///   - from: The source encoding type (used only to fix `FromEncoding`).
/// - Returns: The scalar in this encoding; optional because `encode`
///   returns an optional.
@inline(__always)
89+
public static func transcode<FromEncoding : UnicodeEncoding>(
90+
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
91+
) -> EncodedScalar? {
92+
if _fastPath(FromEncoding.self == UTF16.self) {
93+
// The generic type is known to be UTF16 here, so reinterpret the bits.
let c = unsafeBitCast(content, to: UTF16.EncodedScalar.self)
94+
// Only the low 16 bits of storage (one code unit) are examined.
var u0 = UInt16(extendingOrTruncating: c._storage)
95+
// Values below 0x80 become a single byte.
if _fastPath(u0 < 0x80) {
96+
return EncodedScalar(containing: UInt8(extendingOrTruncating: u0))
97+
}
98+
// `r` collects 6-bit continuation payloads, moving up one byte per step
// so that the lead byte ends up in the low byte of storage.
var r = UInt32(u0 & 0b0__11_1111)
99+
r &<<= 8
100+
u0 &>>= 6
101+
// Fewer than 5 bits remain: two-byte form (0b110xxxxx, 0b10xxxxxx).
if _fastPath(u0 < (1&<<5)) {
102+
return EncodedScalar(
103+
_storage: UInt32(u0) | r | 0b0__1000_0000__1100_0000,
104+
_bitCount: 16)
105+
}
106+
r |= UInt32(u0 & 0b0__11_1111)
107+
r &<<= 8
108+
// `u0` has been shifted right by 6, so both constants are shifted too:
// this checks that the top five bits of the code unit are not 0b11011,
// i.e. that it lies outside 0xD800...0xDFFF.
if _fastPath(u0 & (0xF800 &>> 6) != (0xD800 &>> 6)) {
109+
u0 &>>= 6
110+
// Three-byte form (0b1110xxxx, 0b10xxxxxx, 0b10xxxxxx). Exactly three
// marker bytes are set so no bits of storage above `_bitCount` are dirty.
return EncodedScalar(
111+
_storage: UInt32(u0)
112+
| r | 0b0__1000_0000__1000_0000__1110_0000,
113+
_bitCount: 24)
114+
}
115+
// Code units in 0xD800...0xDFFF leave this branch and take the generic
// route at the bottom.
}
116+
else if _fastPath(FromEncoding.self == UTF8.self) {
117+
// Same encoding on both sides: hand the bits back unchanged.
return unsafeBitCast(content, to: UTF8.EncodedScalar.self)
118+
}
119+
// Generic route for every other case.
return encode(FromEncoding.decode(content))
120+
}
121+
88122
@_fixed_layout
89123
public struct ForwardParser {
90124
public typealias _Buffer = _UIntBuffer<UInt32, UInt8>

Diff for: stdlib/public/core/Unicode.swift

+3-11
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public enum UnicodeDecodingResult : Equatable {
6161
/// decoded Unicode scalar values.
6262
///
6363
/// - SeeAlso: `UTF8`, `UTF16`, `UTF32`, `UnicodeScalar`
64-
public protocol UnicodeCodec {
64+
public protocol UnicodeCodec : UnicodeEncoding {
6565

6666
/// A type that can hold code unit values for this encoding.
6767
associatedtype CodeUnit
@@ -591,12 +591,10 @@ public func transcode<Input, InputEncoding, OutputEncoding>(
591591
///
592592
/// Returns the index of the first unhandled code unit and the UTF-8 data
593593
/// that was encoded.
594-
internal func _transcodeSomeUTF16AsUTF8<Input>(
594+
internal func _transcodeSomeUTF16AsUTF8<Input : Collection>(
595595
_ input: Input, _ startIndex: Input.Index
596596
) -> (Input.Index, _StringCore._UTF8Chunk)
597-
where
598-
Input : Collection,
599-
Input.Iterator.Element == UInt16 {
597+
where Input.Iterator.Element == UInt16 {
600598

601599
typealias _UTF8Chunk = _StringCore._UTF8Chunk
602600

@@ -968,12 +966,6 @@ extension UnicodeCodec where CodeUnit : UnsignedInteger {
968966
}
969967
}
970968

971-
extension UnicodeCodec {
972-
public static func _nullCodeUnitOffset(in input: UnsafePointer<CodeUnit>) -> Int {
973-
fatalError("_nullCodeUnitOffset(in:) implementation should be provided")
974-
}
975-
}
976-
977969
@available(*, unavailable, renamed: "UnicodeCodec")
978970
public typealias UnicodeCodecType = UnicodeCodec
979971

Diff for: stdlib/public/core/UnicodeEncoding.swift

+13
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ public protocol _UnicodeEncoding {
2828
/// `nil` if the scalar can't be represented in this encoding.
2929
static func encode(_ content: UnicodeScalar) -> EncodedScalar?
3030

31+
/// Converts a scalar from another encoding's representation, returning
32+
/// `nil` if the scalar can't be represented in this encoding.
33+
static func transcode<FromEncoding : UnicodeEncoding>(
34+
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
35+
) -> EncodedScalar?
36+
3137
associatedtype ForwardParser : UnicodeParser
3238
associatedtype ReverseParser : UnicodeParser
3339

@@ -48,3 +54,10 @@ extension _UnicodeEncoding {
4854
public protocol UnicodeEncoding : _UnicodeEncoding
4955
where ForwardParser.Encoding == Self, ReverseParser.Encoding == Self {}
5056

57+
extension _UnicodeEncoding {
58+
/// Default implementation: decodes `content` with `FromEncoding.decode`
/// and passes the result to this encoding's `encode`.
public static func transcode<FromEncoding : UnicodeEncoding>(
59+
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
60+
) -> EncodedScalar? {
61+
return encode(FromEncoding.decode(content))
62+
}
63+
}

Diff for: test/Prototypes/UnicodeDecoders.swift

+159-2
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,23 @@ func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
304304
}
305305
check(expected.reversed(), "reverse, repairing: true")
306306

307+
//===--- Transcoded Scalars ---------------------------------------------===//
308+
for x in decoded.lazy.map({ UnicodeScalar($0)! }) {
309+
expectEqualSequence(
310+
UTF8.encode(x)!,
311+
UTF8.transcode(Codec.encode(x)!, from: Codec.self)!
312+
)
313+
expectEqualSequence(
314+
UTF16.encode(x)!,
315+
UTF16.transcode(Codec.encode(x)!, from: Codec.self)!
316+
)
317+
expectEqualSequence(
318+
UTF32.encode(x)!,
319+
UTF32.transcode(Codec.encode(x)!, from: Codec.self)!
320+
)
321+
}
322+
323+
//===--- Scalar View ----------------------------------------------------===//
307324
let scalars = _Unicode.DefaultScalarView(utfStr, fromEncoding: Codec.self)
308325
expectEqualSequence(expected, scalars.map { $0.value })
309326
expectEqualSequence(
@@ -319,6 +336,7 @@ func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
319336
}
320337
expectNil(x.next())
321338
}
339+
322340
return result
323341
}
324342

@@ -337,15 +355,13 @@ func checkDecodeUTF16(
337355
utf16Str)
338356
}
339357

340-
/*
341358
func checkDecodeUTF32(
342359
_ expectedHead: [UInt32],
343360
_ expectedRepairedTail: [UInt32], _ utf32Str: [UInt32]
344361
) -> AssertionResult {
345362
return checkDecodeUTF(UTF32.self, expectedHead, expectedRepairedTail,
346363
utf32Str)
347364
}
348-
*/
349365

350366
func checkEncodeUTF8(_ expected: [UInt8],
351367
_ scalars: [UInt32]) -> AssertionResult {
@@ -369,6 +385,147 @@ func checkEncodeUTF8(_ expected: [UInt8],
369385
return assertionSuccess()
370386
}
371387

388+
//===----------------------------------------------------------------------===//
389+
390+
var UTF32Decoder = TestSuite("UTF32Decoder")
391+
392+
// Empty UTF-32 input: expected head, expected repaired tail and input are
// all empty.
UTF32Decoder.test("Empty") {
393+
expectTrue(checkDecodeUTF32([], [], []))
394+
}
395+
396+
// Well-formed UTF-32 inputs: each call passes the expected scalars as the
// head, an empty repaired tail, and the UTF-32 code units to decode. Every
// case goes through checkDecodeUTF32 so the whole suite exercises UTF-32.
UTF32Decoder.test("SmokeTest") {
397+
// U+0041 LATIN CAPITAL LETTER A
398+
expectTrue(checkDecodeUTF32([ 0x0041 ], [], [ 0x0000_0041 ]))
399+
400+
// U+0041 LATIN CAPITAL LETTER A
401+
// U+0042 LATIN CAPITAL LETTER B
402+
expectTrue(checkDecodeUTF32(
403+
[ 0x0041, 0x0042 ], [],
404+
[ 0x0000_0041, 0x0000_0042 ]))
405+
406+
// U+0000 NULL
407+
// U+0041 LATIN CAPITAL LETTER A
408+
// U+0042 LATIN CAPITAL LETTER B
409+
// U+0000 NULL
410+
expectTrue(checkDecodeUTF32(
411+
[ 0x0000, 0x0041, 0x0042, 0x0000 ], [],
412+
[ 0x0000_0000, 0x0000_0041, 0x0000_0042, 0x0000_0000 ]))
413+
414+
// U+0283 LATIN SMALL LETTER ESH
415+
expectTrue(checkDecodeUTF32([ 0x0283 ], [], [ 0x0000_0283 ]))
416+
417+
// U+03BA GREEK SMALL LETTER KAPPA
418+
// U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
419+
// U+03C3 GREEK SMALL LETTER SIGMA
420+
// U+03BC GREEK SMALL LETTER MU
421+
// U+03B5 GREEK SMALL LETTER EPSILON
422+
expectTrue(checkDecodeUTF32(
423+
[ 0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5 ], [],
424+
[ 0x0000_03ba, 0x0000_1f79, 0x0000_03c3, 0x0000_03bc, 0x0000_03b5 ]))
425+
426+
// U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
427+
// U+6587 CJK UNIFIED IDEOGRAPH-6587
428+
expectTrue(checkDecodeUTF32(
429+
[ 0x4f8b, 0x6587 ], [],
430+
[ 0x0000_4f8b, 0x0000_6587 ]))
431+
432+
// U+D55C HANGUL SYLLABLE HAN
433+
// U+AE00 HANGUL SYLLABLE GEUL
434+
expectTrue(checkDecodeUTF32(
435+
[ 0xd55c, 0xae00 ], [],
436+
[ 0x0000_d55c, 0x0000_ae00 ]))
437+
438+
// U+1112 HANGUL CHOSEONG HIEUH
439+
// U+1161 HANGUL JUNGSEONG A
440+
// U+11AB HANGUL JONGSEONG NIEUN
441+
// U+1100 HANGUL CHOSEONG KIYEOK
442+
// U+1173 HANGUL JUNGSEONG EU
443+
// U+11AF HANGUL JONGSEONG RIEUL
444+
expectTrue(checkDecodeUTF32(
445+
[ 0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af ], [],
446+
[ 0x0000_1112, 0x0000_1161, 0x0000_11ab, 0x0000_1100, 0x0000_1173,
447+
0x0000_11af ]))
448+
449+
// U+D7FF (unassigned)
450+
expectTrue(checkDecodeUTF32([ 0xd7ff ], [], [ 0x0000_d7ff ]))
451+
452+
// U+E000 (private use)
453+
expectTrue(checkDecodeUTF32([ 0xe000 ], [], [ 0x0000_e000 ]))
454+
455+
// U+FFFD REPLACEMENT CHARACTER
456+
expectTrue(checkDecodeUTF32([ 0xfffd ], [], [ 0x0000_fffd ]))
457+
458+
// U+FFFF (noncharacter)
459+
expectTrue(checkDecodeUTF32([ 0xffff ], [], [ 0x0000_ffff ]))
460+
461+
// U+10000 LINEAR B SYLLABLE B008 A
462+
expectTrue(checkDecodeUTF32([ 0x00010000 ], [], [ 0x0001_0000 ]))
463+
464+
// U+10100 AEGEAN WORD SEPARATOR LINE
465+
expectTrue(checkDecodeUTF32([ 0x00010100 ], [], [ 0x0001_0100 ]))
466+
467+
// U+103FF (unassigned)
468+
expectTrue(checkDecodeUTF32([ 0x000103ff ], [], [ 0x0001_03ff ]))
469+
470+
// U+1D800 (unassigned)
471+
expectTrue(checkDecodeUTF32([ 0x0001d800 ], [], [ 0x0001_d800 ]))
472+
473+
474+
// U+E0000 (unassigned)
475+
expectTrue(checkDecodeUTF32([ 0x000e0000 ], [], [ 0x000e_0000 ]))
476+
477+
// U+E0100 VARIATION SELECTOR-17
478+
expectTrue(checkDecodeUTF32([ 0x000e0100 ], [], [ 0x000e_0100 ]))
479+
480+
// U+E03FF (unassigned)
481+
expectTrue(checkDecodeUTF32([ 0x000e03ff ], [], [ 0x000e_03ff ]))
482+
483+
484+
// U+10FC00 (private use)
485+
expectTrue(checkDecodeUTF32([ 0x0010fc00 ], [], [ 0x0010_fc00 ]))
486+
487+
// U+10FD00 (private use)
488+
expectTrue(checkDecodeUTF32([ 0x0010fd00 ], [], [ 0x0010_fd00 ]))
489+
490+
// U+10FFFF (private use, noncharacter)
491+
expectTrue(checkDecodeUTF32([ 0x0010ffff ], [], [ 0x0010_ffff ]))
492+
}
493+
494+
// Ill-formed UTF-32 inputs: each input is a single code unit that is either
// in the surrogate range or above U+10FFFF. Every call passes an empty
// expected head and an expected repaired tail of one U+FFFD.
UTF32Decoder.test("IllFormed") {
495+
// U+D800 (high-surrogate)
496+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_d800 ]))
497+
498+
// U+DB40 (high-surrogate)
499+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_db40 ]))
500+
501+
// U+DBFF (high-surrogate)
502+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dbff ]))
503+
504+
// U+DC00 (low-surrogate)
505+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dc00 ]))
506+
507+
// U+DD00 (low-surrogate)
508+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dd00 ]))
509+
510+
// U+DFFF (low-surrogate)
511+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dfff ]))
512+
513+
// U+110000 (invalid)
514+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0011_0000 ]))
515+
516+
// U+1000000 (invalid)
517+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0100_0000 ]))
518+
519+
// U+80000000 (invalid)
520+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x8000_0000 ]))
521+
522+
// U+FFFF0000 (invalid)
523+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0xffff_0000 ]))
524+
525+
// U+FFFFFFFF (invalid)
526+
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0xffff_ffff ]))
527+
}
528+
372529
var UTF8Decoder = TestSuite("UTF8Decoder")
373530

374531
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)