|
| 1 | +# |
| 2 | +# genmap_ja_codecs.py: Japanese Codecs Map Generator |
| 3 | +# |
| 4 | +# Original Author: Hye-Shik Chang <[email protected]> |
| 5 | +# Modified Author: Dong-hee Na <[email protected]> |
| 6 | +# |
| 7 | +import os |
| 8 | + |
| 9 | +from genmap_support import * |
| 10 | + |
# Byte-range pairs handed to DecodeMapWriter.update_decode_map() in main():
# the *_C1 tuple is passed as the first argument and the *_C2 tuple as the
# second.
# NOTE(review): presumably each tuple is an inclusive (low, high) range for
# the first/second byte of a code -- confirm against genmap_support.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)
JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)
JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)
CP932P0_C1 = (0x81, 0x81)  # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87)  # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc)  # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# URLs passed to open_mapping_file() in main() together with a local
# 'python-mappings/...' path for each table.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
| 28 | + |
| 29 | + |
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into nested decode maps.

    Each non-comment line is expected to hold at least two whitespace
    separated fields: a location such as ``3-2121`` (level digit, one
    separator character, hexadecimal code) and a Unicode value written
    either as ``U+XXXX`` (single code point) or ``U+XXXX+YYYY`` (a body
    plus a modifier).  Text after ``#`` is ignored, and lines with fewer
    than two fields are skipped.

    Args:
        fo: An iterable of text lines (for example an open text file).

    Returns:
        A 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four are ``{high_byte: {low_byte: value}}`` dicts for
        level 3 / level 4 entries mapping below U+10000 (value is the code
        point) and into U+20000..U+2FFFF (value is the code point minus
        0x20000).  ``decmap3_pair`` is
        ``{body: {high_byte: {low_byte: modifier}}}``.

    Raises:
        ValueError: If a field cannot be parsed, the level is not 3 or 4,
            a single code point falls outside the supported ranges, or a
            pair entry is not on level 3.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    # level -> (BMP map, U+2xxxx map)
    single_maps = {3: (decmap3, decmap3_2), 4: (decmap4, decmap4_2)}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() instead of eval(): the fields come from a data file and
        # must never be executed as Python expressions.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        m = None
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            bmp_map, plane2_map = single_maps.get(level, (None, None))
            if uni < 0x10000:
                m = bmp_map
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_map
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before touching the map so that an unsupported entry
        # is reported as ValueError rather than failing on None.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
| 73 | + |
| 74 | + |
def main():
    """Generate the Japanese codec mapping headers.

    Opens the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 mapping tables
    via ``open_mapping_file``, builds the decode/encode maps from them and
    writes two C headers into the current working directory:
    ``mappings_jp.h`` and ``mappings_jisx0213_pair.h``.

    Raises:
        SystemExit: If the loaded JIS X 0213 table does not map position
            0x2124 to U+FF0C.
        ValueError: If a JIS X 0213 plane 1 entry that overlaps JIS X 0208
            maps to a different code point (and is not a pair body).
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is read twice with different native-code columns.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    # Sanity check on the loaded table before generating anything.
    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    # Invert the decode maps: {unicode_hi: {unicode_lo: native_code}}.
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # The first native code seen for a code point wins.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Keep only the CP932 entries that differ from the Shift-JIS encode map.
    # Copies are iterated because entries are deleted during the walk.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            # A code point already present from JIS X 0208 is reported
            # and then overwritten by the JIS X 0212 entry.
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Copies are iterated because jis3decmap is pruned during the walk.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Make sure the high-byte bucket exists before storing;
                    # nothing guarantees an earlier entry created it.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    # Identical to JIS X 0208: drop it from the plane 1 map.
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Level 4 entries are stored with the MSB (0x8000) set.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Emit the pair table in sorted (body, modifier, code) order.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")
| 249 | + |
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()
0 commit comments