Skip to content

Commit 973c537

Browse files
cfbolz, ambv, eendebakpt, sweeneyde
authored and committed
pythongh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (python#97906)
Co-authored-by: Łukasz Langa <[email protected]> Co-authored-by: Pieter Eendebak <[email protected]> Co-authored-by: Dennis Sweeney <[email protected]>
1 parent a9d9778 commit 973c537

File tree

8 files changed

+18134
-30444
lines changed

8 files changed

+18134
-30444
lines changed
+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import unittest
2+
from test.test_tools import toolsdir, imports_under_tool
3+
from test import support
4+
from test.support.hypothesis_helper import hypothesis
5+
6+
st = hypothesis.strategies
7+
given = hypothesis.given
8+
example = hypothesis.example
9+
10+
11+
with imports_under_tool("unicode"):
12+
from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup
13+
14+
15+
@st.composite
def char_name_db(draw, min_length=1, max_length=30):
    """Hypothesis strategy that builds a small random name database.

    A size is drawn between *min_length* and *max_length* (inclusive); then
    that many distinct names (1 to 10 characters over the alphabet "abcd")
    and the same number of distinct characters are drawn.  The result is a
    list of ``(name, character)`` pairs.
    """
    size = draw(st.integers(min_value=min_length, max_value=max_length))
    # Sets guarantee that neither names nor characters repeat.
    name_strategy = st.text("abcd", min_size=1, max_size=10)
    names = draw(st.sets(name_strategy, min_size=size, max_size=size))
    characters = draw(st.sets(st.characters(), min_size=size, max_size=size))
    return list(zip(names, characters))
23+
24+
25+
class TestDawg(unittest.TestCase):
    """Tests for the directed acyclic word graph (DAWG) data structure that
    stores the unicode character names in unicodedata.  Tests ported from
    PyPy.
    """

    def test_dawg_direct_simple(self):
        # (name, value) pairs, inserted in exactly this order.
        entries = [
            ("a", -4),
            ("c", -2),
            ("cat", -1),
            ("catarr", 0),
            ("catnip", 1),
            ("zcatnip", 5),
        ]
        dawg = Dawg()
        for word, value in entries:
            dawg.insert(word, value)
        packed, data, inverse = dawg.finish()

        # name -> value; lookups take the name as bytes
        for word, value in entries:
            key = word.encode("ascii")
            self.assertEqual(lookup(packed, data, key), value)
        # unknown name, proper prefix of a name, and name with extra suffix
        for missing in (b"b", b"catni", b"catnipp"):
            with self.assertRaises(KeyError):
                lookup(packed, data, missing)

        # value -> name
        for word, value in entries:
            expected = word.encode("ascii")
            self.assertEqual(inverse_lookup(packed, inverse, value), expected)
        with self.assertRaises(KeyError):
            inverse_lookup(packed, inverse, 12)

    def test_forbid_empty_dawg(self):
        # finishing a DAWG that never received an entry must be rejected
        empty = Dawg()
        with self.assertRaises(ValueError):
            empty.finish()

    @given(char_name_db())
    @example([("abc", "a"), ("abd", "b")])
    @example(
        [
            ("bab", "1"),
            ("a", ":"),
            ("ad", "@"),
            ("b", "<"),
            ("aacc", "?"),
            ("dab", "D"),
            ("aa", "0"),
            ("ab", "F"),
            ("aaa", "7"),
            ("cbd", "="),
            ("abad", ";"),
            ("ac", "B"),
            ("abb", "4"),
            ("bb", "2"),
            ("aab", "9"),
            ("caaaaba", "E"),
            ("ca", ">"),
            ("bbaaa", "5"),
            ("d", "3"),
            ("baac", "8"),
            ("c", "6"),
            ("ba", "A"),
        ]
    )
    @example(
        [
            ("bcdac", "9"),
            ("acc", "g"),
            ("d", "d"),
            ("daabdda", "0"),
            ("aba", ";"),
            ("c", "6"),
            ("aa", "7"),
            ("abbd", "c"),
            ("badbd", "?"),
            ("bbd", "f"),
            ("cc", "@"),
            ("bb", "8"),
            ("daca", ">"),
            ("ba", ":"),
            ("baac", "3"),
            ("dbdddac", "a"),
            ("a", "2"),
            ("cabd", "b"),
            ("b", "="),
            ("abd", "4"),
            ("adcbd", "5"),
            ("abc", "e"),
            ("ab", "1"),
        ]
    )
    def test_dawg(self, data):
        # Building is the whole test: build_compression_dawg also checks its
        # own result.  stdout is captured only to suppress debug prints.
        with support.captured_stdout():
            build_compression_dawg(data)

Lib/test/test_unicodedata.py

+20
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,26 @@ def test_name_inverse_lookup(self):
104104
if looked_name := self.db.name(char, None):
105105
self.assertEqual(self.db.lookup(looked_name), char)
106106

107+
def test_no_names_in_pua(self):
108+
puas = [*range(0xe000, 0xf8ff),
109+
*range(0xf0000, 0xfffff),
110+
*range(0x100000, 0x10ffff)]
111+
for i in puas:
112+
char = chr(i)
113+
self.assertRaises(ValueError, self.db.name, char)
114+
115+
def test_lookup_nonexistant(self):
116+
# just make sure that lookup can fail
117+
for nonexistant in [
118+
"LATIN SMLL LETR A",
119+
"OPEN HANDS SIGHS",
120+
"DREGS",
121+
"HANDBUG",
122+
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
123+
"???",
124+
]:
125+
self.assertRaises(KeyError, self.db.lookup, nonexistant)
126+
107127
def test_digit(self):
108128
self.assertEqual(self.db.digit('A', None), None)
109129
self.assertEqual(self.db.digit('9'), 9)

Makefile.pre.in

+9-1
Original file line numberDiff line numberDiff line change
@@ -1342,6 +1342,14 @@ check-abidump: all
13421342
regen-limited-abi: all
13431343
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/build/stable_abi.py --generate-all $(srcdir)/Misc/stable_abi.toml
13441344

1345+
############################################################################
1346+
# Regenerate Unicode Data
1347+
1348+
.PHONY: regen-unicodedata
1349+
regen-unicodedata:
1350+
$(PYTHON_FOR_REGEN) Tools/unicode/makeunicodedata.py
1351+
1352+
13451353
############################################################################
13461354
# Regenerate all generated files
13471355

@@ -1350,7 +1358,7 @@ regen-limited-abi: all
13501358
regen-all: regen-cases regen-typeslots \
13511359
regen-token regen-ast regen-keyword regen-sre regen-frozen \
13521360
regen-pegen-metaparser regen-pegen regen-test-frozenmain \
1353-
regen-test-levenshtein regen-global-objects
1361+
regen-test-levenshtein regen-global-objects regen-unicodedata
13541362
@echo
13551363
@echo "Note: make regen-stdlib-module-names, make regen-limited-abi"
13561364
@echo "and make regen-configure should be run manually"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Switch the storage of the unicode codepoint names to use a different
2+
data-structure, a `directed acyclic word graph
3+
<https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton>`_.
4+
This makes the unicodedata shared library about 440 KiB smaller. Contributed by
5+
Carl Friedrich Bolz-Tereick using code from the PyPy project.

0 commit comments

Comments
 (0)