Skip to content

Commit 3c55db7

Browse files
authored
Use Erlang's implementation of jaro distance, fix bugs (#13369)
* Use Erlang's implementation of jaro distance, fix bugs
* Remove conditional compilation
1 parent 80af632 commit 3c55db7

File tree

3 files changed

+89
-83
lines changed

3 files changed

+89
-83
lines changed

lib/elixir/lib/string.ex

+3-81
Original file line numberDiff line numberDiff line change
@@ -3066,74 +3066,13 @@ defmodule String do
30663066
@spec jaro_distance(t, t) :: float
30673067
def jaro_distance(string1, string2)
30683068

3069-
def jaro_distance(string, string), do: 1.0
3069+
def jaro_distance(string, string) when is_binary(string), do: 1.0
30703070
def jaro_distance(_string, ""), do: 0.0
30713071
def jaro_distance("", _string), do: 0.0
30723072

30733073
def jaro_distance(string1, string2) when is_binary(string1) and is_binary(string2) do
3074-
{chars1, len1} = graphemes_and_length(string1)
3075-
{chars2, len2} = graphemes_and_length(string2)
3076-
3077-
case match(chars1, len1, chars2, len2) do
3078-
{0, _trans} ->
3079-
0.0
3080-
3081-
{comm, trans} ->
3082-
(comm / len1 + comm / len2 + (comm - trans) / comm) / 3
3083-
end
3084-
end
3085-
3086-
defp match(chars1, len1, chars2, len2) do
3087-
if len1 < len2 do
3088-
match(chars1, chars2, div(len2, 2) - 1)
3089-
else
3090-
match(chars2, chars1, div(len1, 2) - 1)
3091-
end
3092-
end
3093-
3094-
defp match(chars1, chars2, lim) do
3095-
match(chars1, chars2, {0, lim}, {0, 0, -1}, 0)
3096-
end
3097-
3098-
defp match([char | rest], chars, range, state, idx) do
3099-
{chars, state} = submatch(char, chars, range, state, idx)
3100-
3101-
case range do
3102-
{lim, lim} -> match(rest, tl(chars), range, state, idx + 1)
3103-
{pre, lim} -> match(rest, chars, {pre + 1, lim}, state, idx + 1)
3104-
end
3105-
end
3106-
3107-
defp match([], _, _, {comm, trans, _}, _), do: {comm, trans}
3108-
3109-
defp submatch(char, chars, {pre, _} = range, state, idx) do
3110-
case detect(char, chars, range) do
3111-
nil ->
3112-
{chars, state}
3113-
3114-
{subidx, chars} ->
3115-
{chars, proceed(state, idx - pre + subidx)}
3116-
end
3117-
end
3118-
3119-
defp detect(char, chars, {pre, lim}) do
3120-
detect(char, chars, pre + 1 + lim, 0, [])
3121-
end
3122-
3123-
defp detect(_char, _chars, 0, _idx, _acc), do: nil
3124-
defp detect(_char, [], _lim, _idx, _acc), do: nil
3125-
3126-
defp detect(char, [char | rest], _lim, idx, acc), do: {idx, Enum.reverse(acc, [nil | rest])}
3127-
3128-
defp detect(char, [other | rest], lim, idx, acc),
3129-
do: detect(char, rest, lim - 1, idx + 1, [other | acc])
3130-
3131-
defp proceed({comm, trans, former}, current) do
3132-
if current < former do
3133-
{comm + 1, trans + 1, current}
3134-
else
3135-
{comm + 1, trans, current}
3136-
end
3074+
# TODO: Replace by :string.jaro_similarity/2 when we require Erlang/OTP 27+
3075+
:elixir_utils.jaro_similarity(string1, string2)
31373076
end
31383077

31393078
@doc """
@@ -3168,7 +3107,6 @@ defmodule String do
31683107
codepoint_byte_size: 1,
31693108
grapheme_byte_size: 1,
31703109
grapheme_to_binary: 1,
3171-
graphemes_and_length: 1,
31723110
reverse_characters_to_binary: 1}
31733111

31743112
defp byte_size_unicode(binary) when is_binary(binary), do: byte_size(binary)
@@ -3205,22 +3143,6 @@ defmodule String do
32053143
defp grapheme_byte_size([], acc),
32063144
do: acc
32073145

3208-
defp graphemes_and_length(string),
3209-
do: graphemes_and_length(string, [], 0)
3210-
3211-
defp graphemes_and_length(string, acc, length) do
3212-
case :unicode_util.gc(string) do
3213-
[gc | rest] ->
3214-
graphemes_and_length(rest, [gc | acc], length + 1)
3215-
3216-
[] ->
3217-
{:lists.reverse(acc), length}
3218-
3219-
{:error, <<byte, rest::bits>>} ->
3220-
graphemes_and_length(rest, [<<byte>> | acc], length + 1)
3221-
end
3222-
end
3223-
32243146
defp reverse_characters_to_binary(acc),
32253147
do: acc |> :lists.reverse() |> :unicode.characters_to_binary()
32263148
end

lib/elixir/src/elixir_utils.erl

+84-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
read_file_type/1, read_file_type/2, read_link_type/1, read_posix_mtime_and_size/1,
99
change_posix_time/2, change_universal_time/2,
1010
guard_op/2, extract_splat_guards/1, extract_guards/1,
11-
erlang_comparison_op_to_elixir/1, erl_fa_to_elixir_fa/2]).
11+
erlang_comparison_op_to_elixir/1, erl_fa_to_elixir_fa/2, jaro_similarity/2]).
1212
-include("elixir.hrl").
1313
-include_lib("kernel/include/file.hrl").
1414

@@ -223,3 +223,86 @@ returns_boolean({'__block__', _, Exprs}) ->
223223
returns_boolean(lists:last(Exprs));
224224

225225
returns_boolean(_) -> false.
226+
227+
228+
% TODO: Remove me when we require Erlang/OTP 27+
229+
% This is a polyfill for older versions, copying the code from
230+
% https://github.com/erlang/otp/pull/7879
231+
%% Jaro similarity of two chardata strings, compared grapheme cluster by
%% grapheme cluster.
%%
%% Returns 1.0 when both inputs are empty, 0.0 when exactly one is empty or
%% when no grapheme of String1 finds a partner in String2, and otherwise
%% (M/Len1 + M/Len2 + (M - T/2)/M) / 3, where M and T are the two counters
%% produced by jaro_calc_mt/4.
-spec jaro_similarity(String1, String2) -> Similarity when
      String1 :: unicode:chardata(),
      String2 :: unicode:chardata(),
      Similarity :: float(). %% Between +0.0 and 1.0
jaro_similarity(A0, B0) ->
    %% Both inputs are always decoded first, so invalid data raises
    %% regardless of which branch below would have been taken.
    {A, ALen} = str_to_gcl_and_length(A0),
    {B, BLen} = str_to_indexmap(B0),
    case {ALen, BLen} of
        {0, 0} ->
            1.0;
        {0, _} ->
            0.0;
        {_, 0} ->
            0.0;
        _ ->
            %% Search window: candidate index J is accepted for position I
            %% when I - Dist < J < I + Dist (see jaro_detect/3).
            Dist = max(ALen, BLen) div 2,
            case jaro_match(A, B, -Dist, Dist, [], []) of
                {[], _} ->
                    0.0;
                {AM, BM} ->
                    {M, T} = jaro_calc_mt(AM, BM, 0, 0),
                    (M/ALen + M/BLen + (M-T/2)/M) / 3
            end
    end.
251+
252+
%% Walks the graphemes of the first string, pairing each one with an unused
%% index of the same grapheme in the index map of the second string.
%%
%% Lo and Hi are the exclusive bounds of the accepted index window; both are
%% advanced by one for every grapheme consumed. Returns {MatchedA, MatchedB}:
%% MatchedA holds the matched graphemes, most recently matched first, and
%% MatchedB holds {Index, Grapheme} pairs kept in decreasing order by
%% add_rsorted/2.
jaro_match([Gc | Tail], Index, Lo, Hi, MatchedA, MatchedB) ->
    Candidates = maps:get(Gc, Index, []),
    case jaro_detect(Candidates, Lo, Hi) of
        {Pos, Unused} ->
            %% Keep only the indices that are still available for Gc so the
            %% same position cannot be matched twice.
            jaro_match(Tail, maps:put(Gc, Unused, Index), Lo + 1, Hi + 1,
                       [Gc | MatchedA], add_rsorted({Pos, Gc}, MatchedB));
        false ->
            jaro_match(Tail, Index, Lo + 1, Hi + 1, MatchedA, MatchedB)
    end;
jaro_match(_Done, _Index, _Lo, _Hi, MatchedA, MatchedB) ->
    {MatchedA, MatchedB}.
262+
263+
%% Picks the first candidate index strictly between Min and Max.
%%
%% Candidates that are below Max but not above Min are skipped. The scan
%% stops with 'false' at the first candidate that is not below Max, or when
%% the list is exhausted. On success returns {Idx, Rest}, where Rest is the
%% list of candidates following the accepted one.
jaro_detect([Idx | Rest], Min, Max) when Idx < Max ->
    case Min < Idx of
        true ->
            {Idx, Rest};
        false ->
            jaro_detect(Rest, Min, Max)
    end;
jaro_detect(_, _, _) ->
    false.
269+
270+
%% Walks the two match lists in lockstep and returns {M, T}.
%%
%% M is incremented for every pair of elements. T is incremented only when
%% the element from the second list is not an {_, Grapheme} tuple carrying
%% exactly the grapheme found at the same position of the first list.
%% Both lists must have the same length.
jaro_calc_mt([GcA | RestA], [Pair | RestB], M, T) ->
    NextT =
        case Pair of
            {_, GcA} -> T;
            _ -> T + 1
        end,
    jaro_calc_mt(RestA, RestB, M + 1, NextT);
jaro_calc_mt([], [], M, T) ->
    {M, T}.
276+
277+
278+
%% Returns GC list and length
279+
%% Splits chardata into its grapheme clusters with unicode_util:gc/1 and
%% returns {Clusters, Count}, with Clusters in input order.
%% Raises error({badarg, Rest}) when unicode_util:gc/1 reports an error.
str_to_gcl_and_length(Chardata) ->
    gcl_and_length(unicode_util:gc(Chardata), [], 0).

%% Worker: the first argument is always the latest result of
%% unicode_util:gc/1; clusters are accumulated in reverse.
gcl_and_length([], Seen, Count) ->
    {lists:reverse(Seen), Count};
gcl_and_length({error, Err}, _, _) ->
    error({badarg, Err});
gcl_and_length([Cluster | Remaining], Seen, Count) ->
    gcl_and_length(unicode_util:gc(Remaining), [Cluster | Seen], Count + 1).
288+
289+
%% Returns GC map with index and length
290+
%% Builds {IndexMap, Length} for chardata: IndexMap maps every grapheme
%% cluster to the ascending list of zero-based positions where it occurs,
%% and Length is the number of clusters.
%% Raises error({badarg, Rest}) when unicode_util:gc/1 reports an error.
str_to_indexmap(Chardata) ->
    [IndexMap | Length] = str_to_map(unicode_util:gc(Chardata), 0),
    {IndexMap, Length}.

%% Worker: returns the improper list [Map | Length]. The map is filled in
%% while the recursion unwinds, so each position is prepended in front of
%% the later positions of the same cluster.
str_to_map([Cluster | Remaining], Pos) ->
    [Later | Length] = str_to_map(unicode_util:gc(Remaining), Pos + 1),
    Positions = [Pos | maps:get(Cluster, Later, [])],
    [Later#{Cluster => Positions} | Length];
str_to_map([], Length) ->
    [#{} | Length];
str_to_map({error, Error}, _) ->
    error({badarg, Error}).
300+
301+
%% Add in decreasing order
302+
%% Inserts Item into a list kept in decreasing order. Item is placed in
%% front of the first element it is strictly greater than, so it lands
%% after any elements that compare equal to it.
add_rsorted(Item, []) ->
    [Item];
add_rsorted(Item, [Top | Rest] = Sorted) ->
    case Item > Top of
        true ->
            [Item | Sorted];
        false ->
            [Top | add_rsorted(Item, Rest)]
    end.
308+

lib/elixir/test/elixir/string_test.exs

+2-1
Original file line numberDiff line numberDiff line change
@@ -982,7 +982,7 @@ defmodule StringTest do
982982
assert String.jaro_distance("marhha", "martha") == 0.888888888888889
983983
assert String.jaro_distance("dwayne", "duane") == 0.8222222222222223
984984
assert String.jaro_distance("dixon", "dicksonx") == 0.7666666666666666
985-
assert String.jaro_distance("xdicksonx", "dixon") == 0.7851851851851852
985+
assert String.jaro_distance("xdicksonx", "dixon") == 0.7518518518518519
986986
assert String.jaro_distance("shackleford", "shackelford") == 0.9696969696969697
987987
assert String.jaro_distance("dunningham", "cunnigham") == 0.8962962962962964
988988
assert String.jaro_distance("nichleson", "nichulson") == 0.9259259259259259
@@ -999,6 +999,7 @@ defmodule StringTest do
999999
assert String.jaro_distance("jon", "john") == 0.9166666666666666
10001000
assert String.jaro_distance("jon", "jan") == 0.7777777777777777
10011001
assert String.jaro_distance("семена", "стремя") == 0.6666666666666666
1002+
assert String.jaro_distance("Sunday", "Saturday") == 0.7194444444444444
10021003
end
10031004

10041005
test "myers_difference/2" do

0 commit comments

Comments
 (0)