Skip to content

Commit 113feb3

Browse files
authored
bpo-40328: Add tool for generating cjk mapping headers (GH-19602)
1 parent 2d87577 commit 113feb3

15 files changed

+51015
-3
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add tools for generating mapping headers for CJKCodecs.

Modules/cjkcodecs/README

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
To generate or modify mapping headers
22
-------------------------------------
3-
Mapping headers are imported from CJKCodecs as pre-generated form.
4-
If you need to tweak or add something on it, please look at tools/
5-
subdirectory of CJKCodecs' distribution.
3+
Mapping headers are generated from Tools/unicode/genmap_*.py
64

75

86

Modules/cjkcodecs/mappings_cn.h

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// AUTO-GENERATED FILE FROM genmap_schinese.py: DO NOT EDIT
12
static const ucs2_t __gb2312_decmap[7482] = {
23
12288,12289,12290,12539,713,711,168,12291,12293,8213,65374,8214,8230,8216,
34
8217,8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,

Modules/cjkcodecs/mappings_jisx0213_pair.h

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
12
#define JISX0213_ENCPAIRS 46
23
#ifdef EXTERN_JISX0213_PAIR
34
static const struct widedbcs_index *jisx0213_pair_decmap;

Modules/cjkcodecs/mappings_jp.h

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
12
static const ucs2_t __jisx0208_decmap[6956] = {
23
12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,
34
65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,

Modules/cjkcodecs/mappings_kr.h

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// AUTO-GENERATED FILE FROM genmap_korean.py: DO NOT EDIT
12
static const ucs2_t __ksx1001_decmap[8264] = {
23
12288,12289,12290,183,8229,8230,168,12291,173,8213,8741,65340,8764,8216,8217,
34
8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,12304,
@@ -3249,3 +3250,4 @@ __cp949_encmap+31959,0,255},{__cp949_encmap+32215,0,255},{__cp949_encmap+32471
32493250
__cp949_encmap+32891,0,11},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{__cp949_encmap+
32503251
32903,1,230},
32513252
};
3253+

Tools/unicode/genmap_japanese.py

+251
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
#
2+
# genmap_japanese.py: Japanese Codecs Map Generator
3+
#
4+
# Original Author: Hye-Shik Chang <[email protected]>
5+
# Modified Author: Dong-hee Na <[email protected]>
6+
#
7+
import os
8+
9+
from genmap_support import *
10+
11+
# (low, high) pairs handed to DecodeMapWriter.update_decode_map() in main():
# the *_C1 tuple is passed as the first argument and the matching *_C2 tuple
# as the second.
# NOTE(review): presumably inclusive ranges for the first (C1) and second
# (C2) code byte -- confirm against genmap_support.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)
JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)
JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)
CP932P0_C1 = (0x81, 0x81)  # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87)  # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc)  # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# Upstream locations of the mapping tables.  main() passes each one to
# open_mapping_file() together with the local path under python-mappings/.
# NOTE(review): nothing visible in this file downloads from these URLs;
# presumably they are only reported to the user -- confirm in genmap_support.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
28+
29+
30+
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into nested decode maps.

    *fo* is any iterable of text lines.  Everything after a ``#`` is
    discarded, and lines with fewer than two whitespace-separated columns
    are skipped.  In the first column the first character is the level
    digit and the characters from index 2 onwards are the hexadecimal
    location (e.g. ``3-2121``).  The second column is either ``U+XXXX``
    (a single code point) or ``U+XXXX+YYYY`` (a body followed by a
    modifier).

    Returns a 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2,
    decmap3_pair)``.  The first four have the shape
    ``{loc >> 8: {loc & 0xff: value}}``; for the ``*_2`` maps the value is
    the code point minus 0x20000.  ``decmap3_pair`` has the shape
    ``{body: {loc >> 8: {loc & 0xff: modifier}}}``.

    Raises ValueError for a level other than 3 or 4, for a single code
    point outside U+0000..U+FFFF and U+20000..U+2FFFF, for a pair on a
    level other than 3, and for fields that are not valid numbers.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() instead of eval(): the table is external data, and a
        # malformed field should be a ValueError rather than executed code.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        m = None
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before touching the map, so an unsupported level or code
        # point is reported as ValueError instead of failing on None.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
73+
74+
75+
def main():
    """Generate mappings_jp.h and mappings_jisx0213_pair.h.

    Loads JIS0208.TXT, JIS0212.TXT, CP932.TXT and jisx0213-2004-std.txt
    from the python-mappings/ directory, derives the encode maps from the
    decode maps, and writes both headers into the current working
    directory.

    Raises SystemExit if the JIS X 0213 table does not map 0x2124 to
    U+FF0C, and ValueError if a JIS X 0213 plane 1 cell that overlaps
    JIS X 0208 maps to a different code point.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is read twice with different native columns (0 and 1).
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    (jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap,
     jis3_pairdecmap) = loadmap_jisx0213(jisx0213file)

    # Sanity check on the loaded table before generating anything.
    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # First mapping wins when several CP932 codes share a code point.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Keep only the CP932 encodings that differ from the Shift-JIS ones.
    # Iterate over copies because entries are deleted along the way.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            # Report a code point present in both tables; the JIS X 0212
            # entry still overwrites the JIS X 0208 one.
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies because cells shared with JIS X 0208 are removed
    # from jis3decmap along the way.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Create the row first, as the non-overlapping branch
                    # below does; without it this store could KeyError.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Level 4 entries are stored with the MSB set.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Entries are emitted in sorted (body, modifier, jis) order.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")
249+
250+
if __name__ == '__main__':
251+
main()

Tools/unicode/genmap_korean.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#
2+
# genmap_korean.py: Korean Codecs Map Generator
3+
#
4+
# Original Author: Hye-Shik Chang <[email protected]>
5+
# Modified Author: Dong-hee Na <[email protected]>
6+
#
7+
import os
8+
9+
from genmap_support import *
10+
11+
12+
# (low, high) pairs handed to DecodeMapWriter.update_decode_map() in main():
# the *_C1 tuple is passed as the first argument and the matching *_C2 tuple
# as the second.
# NOTE(review): presumably inclusive ranges for the first (C1) and second
# (C2) code byte -- confirm against genmap_support.
KSX1001_C1 = (0x21, 0x7e)
KSX1001_C2 = (0x21, 0x7e)
UHCL1_C1 = (0x81, 0xa0)
UHCL1_C2 = (0x41, 0xfe)
UHCL2_C1 = (0xa1, 0xfe)
UHCL2_C2 = (0x41, 0xa0)
# Upstream location of the CP949 table; main() passes it to
# open_mapping_file() together with the local path under python-mappings/.
MAPPINGS_CP949 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT'
19+
20+
21+
def main():
    """Generate mappings_kr.h in the current working directory.

    Loads python-mappings/CP949.TXT and splits its entries in two: cells
    whose two bytes are both >= 0xa1 go to the KS X 1001 decode map with
    the high bit of each byte cleared, and every other cell goes to the UHC
    decode map unchanged.  A single CP949 encode map is filled from both
    groups, and all three tables are written to the header.
    """
    mapfile = open_mapping_file('python-mappings/CP949.TXT', MAPPINGS_CP949)
    print("Loading Mapping File...")
    decmap = loadmap(mapfile)

    uhcdecmap, ksx1001decmap, cp949encmap = {}, {}, {}
    for lead, trail_map in decmap.items():
        for trail, ucs in trail_map.items():
            enc_row = cp949encmap.setdefault(ucs >> 8, {})
            if lead < 0xa1 or trail < 0xa1:
                # uhc: stored as-is, so the MSB stays set in the encode map
                uhcdecmap.setdefault(lead, {})[trail] = ucs
                enc_row[ucs & 0xFF] = lead << 8 | trail
            else:
                ksx1001decmap.setdefault(lead & 0x7f, {})[trail & 0x7f] = ucs
                enc_row[ucs & 0xFF] = (lead << 8 | trail) & 0x7f7f

    # (progress message, table name, decode map, C1/C2 range pairs)
    decode_jobs = (
        ("Generating KS X 1001 decode map...", "ksx1001", ksx1001decmap,
         ((KSX1001_C1, KSX1001_C2),)),
        ("Generating UHC decode map...", "cp949ext", uhcdecmap,
         ((UHCL1_C1, UHCL1_C2), (UHCL2_C1, UHCL2_C2))),
    )

    with open('mappings_kr.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        for banner, table_name, table, ranges in decode_jobs:
            print(banner)
            writer = DecodeMapWriter(fp, table_name, table)
            for c1_range, c2_range in ranges:
                writer.update_decode_map(c1_range, c2_range)
            writer.generate()

        print("Generating CP949 (includes KS X 1001) encode map...")
        writer = EncodeMapWriter(fp, "cp949", cp949encmap)
        writer.generate()

    print("Done!")
59+
60+
61+
if __name__ == '__main__':
62+
main()

0 commit comments

Comments
 (0)