-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcologne_phonetics.py
175 lines (147 loc) · 5.28 KB
/
cologne_phonetics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python3
"""
Cologne_phonetics is a Python implementation of the cologne-phonetics, a phonetic
algorithm similar to soundex but optimized for the german language
Documentation can be found at https://github.com/provinzkraut/cologne_phonetics
A detailed explanation of the cologne phonetics can be found at:
https://en.wikipedia.org/wiki/Cologne_phonetics
"""
from __future__ import annotations
__author__ = "Janek Nouvertné"
__version__ = "2.0.0"
__license__ = "MIT"
import re
import sys
import unicodedata
from argparse import ArgumentParser
from typing import Iterable, Pattern
# Matches any single one of the German special characters ä, ü, ö, ß.
RGX_SPECIAL_CHARS = re.compile(r"[äüöß]")
# Ordered (pattern, replacement) pairs that transliterate each special
# character to plain ASCII letters.
RGX_SPECIAL_CHAR_REPLACEMENTS = [
    (re.compile(special_char), ascii_form)
    for special_char, ascii_form in (
        (r"ä", "ae"),
        (r"ö", "oe"),
        (r"ü", "ue"),
        (r"ß", "s"),
    )
]
# Ordered (compiled pattern, replacement) pairs implementing the encoding.
# The pairs are applied one after another, top to bottom, to the whole word.
# The order is significant: the context-sensitive rules (d/t, x, c, p) use
# lookarounds on neighbouring letters, so they have to run before the plain
# letter -> digit rules further down replace those neighbours.
RGX_RULES = [
# ignore special characters that have not been replaced at this point
# (everything that is not a lowercase ascii letter is dropped)
(re.compile(r"[^a-z]"), ""),
# d,t replacements
# not before c,s,z
(re.compile(r"[dt](?![csz])"), "2"),
# before c,s,z
(re.compile(r"[dt](?=[csz])"), "8"),
# x replacements
# not after c,k,q
(re.compile(r"(?<![ckq])x"), "48"),
# after c,k,q. the x is kept in the replacement so that the c rules below
# can still see it in their lookahead. it is removed by the h,x rule later
(re.compile(r"(?<=[ckq])x"), "x8"),
# c replacements
# at the start before a,h,k,l,o,q,r,u,x
# | not after s,z before a,h,k,o,q,u,x
(re.compile(r"^c(?=[ahkloqrux])|(?<![sz])c(?=[ahkoqux])"), "4"),
# not before a,h,k,o,q,u,x
# | after s,z
# | at the start, not before a,h,k,l,o,q,r,u,x
(re.compile(r"c(?![ahkoqux])|(?<=[sz])c|^c(?![ahkloqrux])"), "8"),
# p not before h, and b
(re.compile(r"p(?!h)|b"), "1"),
# p before h and f,v,w
(re.compile(r"p(?=h)|[fvw]"), "3"),
# h has no code. this also removes the x kept by the "after c,k,q" rule
(re.compile(r"[hx]"), ""),
# context-free letters: vowels (including j and y) are encoded as 0
(re.compile(r"[aeijouy]"), "0"),
(re.compile(r"[gkq]"), "4"),
(re.compile(r"l"), "5"),
(re.compile(r"[mn]"), "6"),
(re.compile(r"r"), "7"),
(re.compile(r"[sz]"), "8"),
# repeating digits: drop a digit that is directly followed by the same digit
(re.compile(r"(\d)(?=\1)"), ""),
# remove every 0 that is preceded by another character, i.e. only a leading
# 0 is kept. this runs after the repeating digits rule, so equal digits that
# were separated by a 0 are both kept
(re.compile(r"\B0"), ""),
]
def _remove_diacritics(s: str) -> str:
# https://stackoverflow.com/a/518232
return "".join(
c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn"
)
def _replace_by_rules(rules: list[tuple[Pattern[str], str]], s: str) -> str:
for rule in rules:
s = rule[0].sub(rule[1], s)
return s
def encode(data: str, concat: bool = False) -> list[tuple[str, str]]:
    """
    Encode every word of ``data`` with the cologne phonetics.

    :param data: Input to be encoded. Whitespace characters will be
        interpreted as a wordbreak
    :param concat: The intended behaviour of the cologne-phonetics
        is to ignore special characters. This leads to concatenation for strings
        with hyphens. If ``concat`` is ``False`` (the default), hyphens are
        treated as a wordbreak, so hyphenated strings are encoded as separate
        words. If set to ``True``, the hyphen is ignored and the hyphenated
        string is encoded as a single word

    :return: Return a list of tuples containing sanitised input / encoded substring
        pairs

    :note: Contrary to many other implementations, in the final pass only
        repeated **digits** are removed, not repeated **numbers**. Resulting e.g.
        in ``xx`` being encoded as ``4848`` and not ``48``
    """
    if not concat:
        data = data.replace("-", " ")
    # Compose to NFC before transliterating: a decomposed umlaut ("a" followed
    # by U+0308) would otherwise not match the precomposed characters in
    # RGX_SPECIAL_CHAR_REPLACEMENTS and end up as "a" instead of "ae".
    data = unicodedata.normalize("NFC", data.lower())

    words_encoded = []
    # Split on every single whitespace character (not only on " "), so that
    # tabs and newlines separate words as documented. Splitting per character
    # keeps the handling of plain spaces exactly as before, including the
    # empty words produced by consecutive, leading or trailing whitespace.
    for word in re.split(r"\s", data):
        word_clean = _remove_diacritics(
            _replace_by_rules(RGX_SPECIAL_CHAR_REPLACEMENTS, word)
        )
        word_encoded = _replace_by_rules(RGX_RULES, word_clean)
        words_encoded.append((word_clean, word_encoded))
    return words_encoded
def compare(*data: str, concat: bool = False) -> bool:
    """
    Encode and compare strings.

    :param data: Data to compare. Either at least 2 positional arguments or a
        single iterable of strings
    :param concat: Passed to ``encode()``
    :returns: A boolean, indicating whether all passed data is equal after encoding
    :raises: ValueError if no value or only one input string is given
    """
    if not data:
        # Without this guard the ``data[0]`` accesses below raise an IndexError
        raise ValueError("Compare called without any values")

    if (
        len(data) == 1
        and not isinstance(data[0], str)
        and isinstance(data[0], Iterable)
    ):
        # A single non-string iterable was passed: compare its items instead.
        # Materialise it so that iterables without ``len()`` (e.g. generators)
        # are supported as well.
        data = tuple(data[0])

    if len(data) == 1:
        # Wrap the value in a tuple so that formatting cannot fail if the
        # value itself happens to be a tuple
        raise ValueError('Compare called with only one value: "%s"' % (data[0],))

    reference = None
    for s in data:
        codes = [code for _, code in encode(s, concat=concat)]
        if reference is None:
            # The first input is the reference all others are compared against
            reference = codes
        elif codes != reference:
            return False
    return True
def cli(args: list[str] | None = None) -> None:
    """
    Command line interface: encode a string and print the result.

    :param args: Command line arguments **without** the program name. If
        ``None``, ``argparse`` reads them from ``sys.argv[1:]``
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("data", help="string to be encoded")
    parser.add_argument(
        "-c",
        "--concat",
        action="store_true",
        # By default ``encode()`` splits at hyphens; this flag turns that off
        help="treat words connected by hyphens as one word instead of "
        "separate words",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="show detailed information"
    )
    parser.add_argument(
        "-p",
        "--pretty",
        action="store_true",
        help="use in combination with --verbose to format output nicely",
    )
    parsed_args = parser.parse_args(args)

    res = encode(parsed_args.data, concat=parsed_args.concat)
    if parsed_args.verbose:
        # verbose: "<sanitised word>: <code>" pairs, one per line if --pretty
        sep = "\n" if parsed_args.pretty else ", "
        out = sep.join(f"{word}: {code}" for word, code in res)
    else:
        out = ", ".join(code for _, code in res)
    print(out)
if __name__ == "__main__":  # pragma: no cover
    # ``parse_args`` expects the arguments without the program name. Passing
    # the full ``sys.argv`` makes the script path get parsed as ``data`` and
    # the actual input gets rejected as an unrecognized argument.
    cli(sys.argv[1:])