|
1 |
| -# ===--- GYBUnicodeDataUtils.py -----------------------*- coding: utf-8 -*-===// |
| 1 | +# ===--- GYBUnicodeDataUtils.py ----------------------*- coding: utf-8 -*-===// |
2 | 2 | #
|
3 | 3 | # This source file is part of the Swift.org open source project
|
4 | 4 | #
|
@@ -105,8 +105,8 @@ def __init__(self, grapheme_break_property_file_name):
|
105 | 105 | for cp in range(0, 0x110000):
|
106 | 106 | self.property_values[cp] = self.get_default_value()
|
107 | 107 |
|
108 |
| - for start_code_point, end_code_point, val in self.property_value_ranges: |
109 |
| - for cp in range(start_code_point, end_code_point + 1): |
| 108 | + for start_code_pt, end_code_pt, val in self.property_value_ranges: |
| 109 | + for cp in range(start_code_pt, end_code_pt + 1): |
110 | 110 | self.property_values[cp] = val
|
111 | 111 |
|
112 | 112 | def get_default_value(self):
|
@@ -320,7 +320,8 @@ def get_value(self, cp):
|
320 | 320 | if cp <= 0xffff:
|
321 | 321 | data_block_index = self.bmp_lookup[
|
322 | 322 | self.get_bmp_first_level_index(cp)]
|
323 |
| - return self.bmp_data[data_block_index][self.get_bmp_data_offset(cp)] |
| 323 | + return self.bmp_data[data_block_index][ |
| 324 | + self.get_bmp_data_offset(cp)] |
324 | 325 | else:
|
325 | 326 | second_lookup_index = self.supp_lookup1[
|
326 | 327 | self.get_supp_first_level_index(cp)]
|
@@ -552,14 +553,14 @@ def _convert_line(line):
|
552 | 553 | pass
|
553 | 554 | else:
|
554 | 555 | code_point = int(token, 16)
|
555 |
| - # Tests from Unicode spec have isolated surrogates in them. Our |
556 |
| - # segmentation algorithm works on UTF-8 sequences, so encoding a |
557 |
| - # surrogate would produce an invalid code unit sequence. |
558 |
| - # Instead of trying to emulate the maximal subpart algorithm for |
559 |
| - # inserting U+FFFD in Python, we just replace every isolated |
560 |
| - # surrogate with U+200B, which also has Grapheme_Cluster_Break |
561 |
| - # equal to 'Control' and test separately that we handle |
562 |
| - # ill-formed UTF-8 sequences. |
| 556 | + # Tests from Unicode spec have isolated surrogates in them. |
| 557 | + # Our segmentation algorithm works on UTF-8 sequences, so |
| 558 | + # encoding a surrogate would produce an invalid code unit |
| 559 | + # sequence. Instead of trying to emulate the maximal subpart |
| 560 | + # algorithm for inserting U+FFFD in Python, we just replace |
| 561 | + # every isolated surrogate with U+200B, which also has |
| 562 | + # Grapheme_Cluster_Break equal to 'Control' and test |
| 563 | + # separately that we handle ill-formed UTF-8 sequences. |
563 | 564 | if code_point >= 0xd800 and code_point <= 0xdfff:
|
564 | 565 | code_point = 0x200b
|
565 | 566 | code_point = (b'\U%(cp)08x' % {b'cp': code_point}).decode(
|
@@ -613,14 +614,14 @@ def _convert_line(line):
|
613 | 614 | pass
|
614 | 615 | else:
|
615 | 616 | code_point = int(token, 16)
|
616 |
| - # Tests from Unicode spec have isolated surrogates in them. Our |
| 617 | + # Tests from Unicode spec have isolated surrogates in them. Our |
617 | 618 | # segmentation algorithm works on UTF-16 sequences, so encoding
|
618 | 619 | # a surrogate would produce an invalid code unit sequence.
|
619 |
| - # Instead of trying to emulate the maximal subpart algorithm for |
620 |
| - # inserting U+FFFD in Python, we just replace every isolated |
621 |
| - # surrogate with U+200B, which also has Grapheme_Cluster_Break |
622 |
| - # equal to 'Control' and test separately that we handle |
623 |
| - # ill-formed UTF-8 sequences. |
| 620 | + # Instead of trying to emulate the maximal subpart algorithm |
| 621 | + # for inserting U+FFFD in Python, we just replace every |
| 622 | + # isolated surrogate with U+200B, which also has |
| 623 | + # Grapheme_Cluster_Break equal to 'Control' and test separately |
| 624 | + # that we handle ill-formed UTF-16 sequences. |
624 | 625 | if code_point >= 0xd800 and code_point <= 0xdfff:
|
625 | 626 | code_point = 0x200b
|
626 | 627 | test += [code_point]
|
|
0 commit comments