19
19
# Since this should not require frequent updates, we just store this
20
20
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
21
21
import collections
22
+ import re
22
23
import urllib .request
23
24
24
25
UNICODE_VERSION = "15.1.0"
66
67
class UnicodeData (object ):
67
68
def __init__ (self ):
68
69
self ._load_unicode_data ()
70
+ self ._load_default_ignorable_marks ()
71
+
69
72
self .norm_props = self ._load_norm_props ()
70
73
self .norm_tests = self ._load_norm_tests ()
71
74
@@ -100,6 +103,11 @@ def _load_unicode_data(self):
100
103
self .general_category_mark = []
101
104
self .general_category_public_assigned = []
102
105
106
+ # Characters that cannot be part of a combining character sequence:
107
+ # control characters, format characters other than ZWJ and ZWNJ,
108
+ # the line and paragraph separators, and noncharacters.
109
+ self .not_in_ccs = []
110
+
103
111
assigned_start = 0 ;
104
112
prev_char_int = - 1 ;
105
113
prev_name = "" ;
@@ -125,6 +133,9 @@ def _load_unicode_data(self):
125
133
if category == 'M' or 'M' in expanded_categories .get (category , []):
126
134
self .general_category_mark .append (char_int )
127
135
136
+ if category in ['Cc' , 'Cf' , 'Zl' , 'Zp' ] and char_int not in [0x200C , 0x200D ]:
137
+ self .not_in_ccs .append (char_int )
138
+
128
139
assert category != 'Cn' , "Unexpected: Unassigned codepoint in UnicodeData.txt"
129
140
if category not in ['Co' , 'Cs' ]:
130
141
if char_int != prev_char_int + 1 and not is_first_and_last (prev_name , name ):
@@ -135,6 +146,44 @@ def _load_unicode_data(self):
135
146
136
147
self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
137
148
149
+ # Mark noncharacters as nongraphic
150
+ for i in range (0xFDD0 , 0xFDF0 ):
151
+ self .not_in_ccs .append (i )
152
+ for prefix in range (0 , 0x11 ):
153
+ shifted = prefix << 16
154
+ self .not_in_ccs .append (shifted | 0xFFFE )
155
+ self .not_in_ccs .append (shifted | 0xFFFF )
156
+
157
+ self .not_in_ccs .sort ()
158
+
159
+ def _load_default_ignorable_marks (self ):
160
+ default_ignorable_cps = set ()
161
+
162
+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
163
+ multiple = re .compile (
164
+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
165
+ )
166
+
167
+ for line in self ._fetch ("DerivedCoreProperties.txt" ).splitlines ():
168
+ raw_data = None # (low, high)
169
+ if match := single .match (line ):
170
+ raw_data = (match .group (1 ), match .group (1 ))
171
+ elif match := multiple .match (line ):
172
+ raw_data = (match .group (1 ), match .group (2 ))
173
+ else :
174
+ continue
175
+ low = int (raw_data [0 ], 16 )
176
+ high = int (raw_data [1 ], 16 )
177
+ for cp in range (low , high + 1 ):
178
+ default_ignorable_cps .add (cp )
179
+
180
+ self .default_ignorable_marks = []
181
+ for cp in self .general_category_mark :
182
+ if cp in default_ignorable_cps :
183
+ self .default_ignorable_marks .append (cp )
184
+
185
+ self .default_ignorable_marks .sort ()
186
+
138
187
def _load_cjk_compat_ideograph_variants (self ):
139
188
for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
140
189
strip_comments = line .split ('#' , 1 )[0 ].strip ()
@@ -454,7 +503,7 @@ def gen_combining_mark(general_category_mark, out):
454
503
455
504
def gen_public_assigned (general_category_public_assigned , out ):
456
505
# This could be done as a hash but the table is somewhat small.
457
- out .write ("#[inline]\n " )
506
+ out .write ("\n #[inline]\n " )
458
507
out .write ("pub fn is_public_assigned(c: char) -> bool {\n " )
459
508
out .write (" match c {\n " )
460
509
@@ -476,6 +525,66 @@ def gen_public_assigned(general_category_public_assigned, out):
476
525
out .write ("}\n " )
477
526
out .write ("\n " )
478
527
528
def gen_not_in_ccs(not_in_ccs, out):
    """Write the Rust predicate ``not_in_ccs(c: char) -> bool`` to *out*.

    Args:
        not_in_ccs: Iterable of integer code points. Neighbouring values
            are merged into ranges only when they arrive in ascending
            order, so the input is expected to be sorted.
        out: Object with a ``write(str)`` method that receives the
            generated Rust source.

    The generated function is a single ``match`` with one ``=> true`` arm
    listing every code point or inclusive range, followed by a catch-all
    ``false`` arm. When there are no code points the ``=> true`` arm is
    left out entirely, because an arm without a pattern is not valid Rust.
    """
    # Collapse runs of consecutive code points into inclusive (first, last) pairs.
    range_list = []
    for cp in not_in_ccs:
        if range_list and range_list[-1][1] == cp - 1:
            range_list[-1] = (range_list[-1][0], cp)
        else:
            range_list.append((cp, cp))

    out.write("\n #[inline]\n ")
    out.write("pub fn not_in_ccs(c: char) -> bool {\n ")
    out.write(" match c {\n ")

    # Rust char escapes are written as a backslash, 'u', then braces, with
    # nothing in between; a single code point is emitted as a bare literal.
    patterns = [
        "'\\u{%s}'" % hexify(first)
        if first == last
        else "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
        for first, last in range_list
    ]
    if patterns:
        out.write(" " + "\n | ".join(patterns) + " => true,\n ")

    out.write(" _ => false,\n ")
    out.write(" }\n ")
    out.write("}\n ")
557
+
558
def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Write the Rust predicate ``is_default_ignorable_mark(c: char) -> bool``.

    Args:
        default_ignorable_marks: Iterable of integer code points.
            Neighbouring values are merged into ranges only when they
            arrive in ascending order, so the input is expected to be
            sorted.
        out: Object with a ``write(str)`` method that receives the
            generated Rust source.

    The generated function is a single ``match`` with one ``=> true`` arm
    listing every code point or inclusive range, followed by a catch-all
    ``false`` arm. When there are no code points the ``=> true`` arm is
    left out entirely, because an arm without a pattern is not valid Rust.
    """
    # Collapse runs of consecutive code points into inclusive (first, last) pairs.
    range_list = []
    for cp in default_ignorable_marks:
        if range_list and range_list[-1][1] == cp - 1:
            range_list[-1] = (range_list[-1][0], cp)
        else:
            range_list.append((cp, cp))

    out.write("\n #[inline]\n ")
    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n ")
    out.write(" match c {\n ")

    # Rust char escapes are written as a backslash, 'u', then braces, with
    # nothing in between; a single code point is emitted as a bare literal.
    patterns = [
        "'\\u{%s}'" % hexify(first)
        if first == last
        else "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
        for first, last in range_list
    ]
    if patterns:
        out.write(" " + "\n | ".join(patterns) + " => true,\n ")

    out.write(" _ => false,\n ")
    out.write(" }\n ")
    out.write("}\n ")
587
+
479
588
def gen_stream_safe (leading , trailing , out ):
480
589
# This could be done as a hash but the table is very small.
481
590
out .write ("#[inline]\n " )
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602
711
gen_public_assigned (data .general_category_public_assigned , out )
603
712
out .write ("\n " )
604
713
714
+ gen_not_in_ccs (data .not_in_ccs , out )
715
+
716
+ gen_default_ignorable_mark (data .default_ignorable_marks , out )
717
+
605
718
gen_nfc_qc (data .norm_props , out )
606
719
out .write ("\n " )
607
720
0 commit comments