Skip to content

Use Erlang's implementation of jaro distance, fix bugs #13369

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 5 additions & 81 deletions lib/elixir/lib/string.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3066,74 +3066,15 @@ defmodule String do
@spec jaro_distance(t, t) :: float
def jaro_distance(string1, string2)

def jaro_distance(string, string), do: 1.0
def jaro_distance(string, string) when is_binary(string), do: 1.0
def jaro_distance(_string, ""), do: 0.0
def jaro_distance("", _string), do: 0.0
Comment on lines +3069 to 3071
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I felt we could keep these optimizations since they weren't in the Erlang version, WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine to keep them, yeah. :)


# General clause: both strings are split into grapheme clusters, the matching
# pass yields the number of common characters and of transpositions, and the
# three ratios are averaged. No common characters means a distance of 0.0.
def jaro_distance(string1, string2) when is_binary(string1) and is_binary(string2) do
  {graphemes1, size1} = graphemes_and_length(string1)
  {graphemes2, size2} = graphemes_and_length(string2)

  {common, transpositions} = match(graphemes1, size1, graphemes2, size2)

  if common === 0 do
    0.0
  else
    (common / size1 + common / size2 + (common - transpositions) / common) / 3
  end
end
# TODO: Remove me when we require Erlang/OTP 27+
# Selects, at compile time, the module providing jaro_similarity/2.
# The release is compared as an integer: :erlang.system_info(:otp_release)
# returns a charlist, and a lexicographic comparison would misorder releases
# with a different number of digits (for example ~c"100" sorts before ~c"27").
@jaro_module if :erlang.list_to_integer(:erlang.system_info(:otp_release)) >= 27,
               do: :string,
               else: :elixir_utils
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't sure: is it enough to check the OTP version at compile time, or is there another preferred way?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can just always call the one in elixir_utils for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense!


# Entry point of the matching pass. The list with fewer elements is the one
# that gets walked; the limit handed down is derived from the length of the
# other list (half of it, minus one).
defp match(chars1, len1, chars2, len2) do
  {walked, searched, searched_len} =
    if len1 < len2 do
      {chars1, chars2, len2}
    else
      {chars2, chars1, len1}
    end

  match(walked, searched, div(searched_len, 2) - 1)
end

# Seeds the recursive matcher: the range starts at {0, lim}, the state at
# {0 common, 0 transpositions, -1 as the previously matched position}, and
# the walk begins at index 0.
defp match(chars1, chars2, lim) do
  initial_range = {0, lim}
  initial_state = {0, 0, -1}

  match(chars1, chars2, initial_range, initial_state, 0)
end

# Once the walked list is exhausted, only the counters of the state matter.
defp match([], _chars, _range, {comm, trans, _former}, _idx), do: {comm, trans}

# Tries to match the current character, then advances. While the first range
# component is below the limit it is incremented; once both components are
# equal the head of the searched list is dropped instead.
defp match([char | rest], chars, {pre, lim} = range, state, idx) do
  {chars, state} = submatch(char, chars, range, state, idx)

  if pre === lim do
    match(rest, tl(chars), range, state, idx + 1)
  else
    match(rest, chars, {pre + 1, lim}, state, idx + 1)
  end
end

# Looks for `char` inside the current window of `chars`. On a hit, returns the
# list with the hit blanked out and the state advanced with the position
# `idx - pre + subidx`; on a miss, list and state are returned untouched.
defp submatch(char, chars, {pre, _lim} = range, state, idx) do
  with {subidx, remaining} <- detect(char, chars, range) do
    {remaining, proceed(state, idx - pre + subidx)}
  else
    nil -> {chars, state}
  end
end

# Searches the first `pre + 1 + lim` elements of `chars` for `char`.
# Returns `{offset, chars_with_the_hit_replaced_by_nil}` or `nil`.
defp detect(char, chars, {pre, lim}), do: detect(char, chars, pre + 1 + lim, 0, [])

# Window exhausted, or nothing left to inspect: no match.
defp detect(_char, _chars, 0, _offset, _seen), do: nil
defp detect(_char, [], _remaining, _offset, _seen), do: nil

defp detect(char, [candidate | rest], remaining, offset, seen) do
  if candidate === char do
    # Rebuild the list in its original order with the hit blanked out.
    {offset, Enum.reverse(seen, [nil | rest])}
  else
    detect(char, rest, remaining - 1, offset + 1, [candidate | seen])
  end
end

# Advances the matching state after a hit at position `current`.
#
# The state is `{comm, trans, former}`: `comm` is incremented on every call,
# `trans` is incremented only when `current` lies before the previously
# recorded position `former`, and `current` becomes the new `former`.
defp proceed({comm, trans, former}, current) do
  if current < former do
    {comm + 1, trans + 1, current}
  else
    {comm + 1, trans, current}
  end
end
# General clause: hands both binaries to jaro_similarity/2 of the module
# selected in @jaro_module.
def jaro_distance(string1, string2) when is_binary(string1) and is_binary(string2),
  do: @jaro_module.jaro_similarity(string1, string2)

@doc """
Expand Down Expand Up @@ -3168,7 +3109,6 @@ defmodule String do
codepoint_byte_size: 1,
grapheme_byte_size: 1,
grapheme_to_binary: 1,
graphemes_and_length: 1,
reverse_characters_to_binary: 1}

defp byte_size_unicode(binary) when is_binary(binary), do: byte_size(binary)
Expand Down Expand Up @@ -3205,22 +3145,6 @@ defmodule String do
defp grapheme_byte_size([], acc),
do: acc

# Splits `string` with :unicode_util.gc/1 and returns `{clusters, count}`.
# When :unicode_util.gc/1 reports an error, the offending leading byte is kept
# as a one-byte binary and counted as one element.
defp graphemes_and_length(string), do: graphemes_and_length(string, [], 0)

defp graphemes_and_length(remaining, clusters, count) do
  case :unicode_util.gc(remaining) do
    [] ->
      {:lists.reverse(clusters), count}

    {:error, <<invalid, tail::bits>>} ->
      graphemes_and_length(tail, [<<invalid>> | clusters], count + 1)

    [cluster | tail] ->
      graphemes_and_length(tail, [cluster | clusters], count + 1)
  end
end

defp reverse_characters_to_binary(acc),
do: acc |> :lists.reverse() |> :unicode.characters_to_binary()
end
85 changes: 84 additions & 1 deletion lib/elixir/src/elixir_utils.erl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
read_file_type/1, read_file_type/2, read_link_type/1, read_posix_mtime_and_size/1,
change_posix_time/2, change_universal_time/2,
guard_op/2, extract_splat_guards/1, extract_guards/1,
erlang_comparison_op_to_elixir/1, erl_fa_to_elixir_fa/2]).
erlang_comparison_op_to_elixir/1, erl_fa_to_elixir_fa/2, jaro_similarity/2]).
-include("elixir.hrl").
-include_lib("kernel/include/file.hrl").

Expand Down Expand Up @@ -223,3 +223,86 @@ returns_boolean({'__block__', _, Exprs}) ->
returns_boolean(lists:last(Exprs));

returns_boolean(_) -> false.


% TODO: Remove me when we require Erlang/OTP 27+
% This is a polyfill for older versions, copying the code from
% https://github.com/erlang/otp/pull/7879
%
% Two empty inputs are fully similar (1.0); one empty input, or no matching
% grapheme cluster at all, gives 0.0. Otherwise the result is the mean of
% M/ALen, M/BLen and (M - T/2)/M, with M and T as returned by jaro_calc_mt/4.
-spec jaro_similarity(String1, String2) -> Similarity when
      String1 :: unicode:chardata(),
      String2 :: unicode:chardata(),
      Similarity :: float(). %% Between +0.0 and 1.0
jaro_similarity(A0, B0) ->
    {A, ALen} = str_to_gcl_and_length(A0),
    {B, BLen} = str_to_indexmap(B0),
    case {ALen, BLen} of
        {0, 0} ->
            1.0;
        {0, _} ->
            0.0;
        {_, 0} ->
            0.0;
        _ ->
            %% Search window: half the length of the longer input.
            Window = max(ALen, BLen) div 2,
            case jaro_match(A, B, -Window, Window, [], []) of
                {[], _} ->
                    0.0;
                {AM, BM} ->
                    {M, T} = jaro_calc_mt(AM, BM, 0, 0),
                    (M/ALen + M/BLen + (M-T/2)/M) / 3
            end
    end.

%% Walks the grapheme clusters of the first string. For each one, the positions
%% still available in the index map of the second string are checked against
%% the window (Lo, Hi), which moves forward by one on every step. A hit
%% consumes the position and is recorded in both accumulators.
jaro_match([Char|Rest], Index0, Lo, Hi, MatchedA, MatchedB) ->
    Candidates = maps:get(Char, Index0, []),
    case jaro_detect(Candidates, Lo, Hi) of
        {Pos, Unused} ->
            Index = maps:put(Char, Unused, Index0),
            jaro_match(Rest, Index, Lo+1, Hi+1,
                       [Char|MatchedA], add_rsorted({Pos,Char}, MatchedB));
        false ->
            jaro_match(Rest, Index0, Lo+1, Hi+1, MatchedA, MatchedB)
    end;
jaro_match(_Done, _Index, _Lo, _Hi, MatchedA, MatchedB) ->
    {MatchedA, MatchedB}.

%% Returns {Idx, Rest} for the first position strictly between Min and Max.
%% Positions at or below Min are skipped; the search stops with `false` at the
%% first position that is not below Max, or when the list is exhausted.
jaro_detect([Idx|Rest], Min, Max) when Idx < Max ->
    case Min < Idx of
        true -> {Idx, Rest};
        false -> jaro_detect(Rest, Min, Max)
    end;
jaro_detect(_, _, _) ->
    false.

%% Walks both match lists in lockstep. M counts every pair; T counts the pairs
%% whose entry in the second list does not carry the same character as the
%% entry in the first list.
jaro_calc_mt([CharA|AM], [Matched|BM], M, T) ->
    case Matched of
        {_, CharA} -> jaro_calc_mt(AM, BM, M+1, T);
        _ -> jaro_calc_mt(AM, BM, M+1, T+1)
    end;
jaro_calc_mt([], [], M, T) ->
    {M, T}.


%% Returns {GraphemeClusters, Count} for the given string.
%% An error reported by unicode_util:gc/1 is raised as {badarg, Rest}.
str_to_gcl_and_length(S0) ->
    gcl_and_length(unicode_util:gc(S0), [], 0).

gcl_and_length(Next, Acc, N) ->
    case Next of
        [] ->
            {lists:reverse(Acc), N};
        [C|Str] ->
            gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1);
        {error, Err} ->
            error({badarg, Err})
    end.

%% Returns {Map, Length}: Map associates every grapheme cluster of the string
%% with the list of 0-based positions where it occurs, Length is the number of
%% clusters.
str_to_indexmap(S) ->
    [Map|Length] = str_to_map(unicode_util:gc(S), 0),
    {Map, Length}.

%% The result is the cons cell [Map|Length]. The map is filled while the
%% recursion unwinds, so each position is prepended after the later ones.
str_to_map([], Length) ->
    [#{}|Length];
str_to_map({error,Error}, _) ->
    error({badarg, Error});
str_to_map([G | Gs], I) ->
    [Map0|Length] = str_to_map(unicode_util:gc(Gs), I+1),
    Later = maps:get(G, Map0, []),
    [Map0#{G => [I | Later]} | Length].

%% Inserts A into a list kept in decreasing order. A is placed in front of the
%% first element it is strictly greater than, otherwise at the end.
add_rsorted(A, []) ->
    [A];
add_rsorted(A, [H|T]=BM) ->
    if
        A > H -> [A|BM];
        true -> [H|add_rsorted(A, T)]
    end.

3 changes: 2 additions & 1 deletion lib/elixir/test/elixir/string_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -982,7 +982,7 @@ defmodule StringTest do
assert String.jaro_distance("marhha", "martha") == 0.888888888888889
assert String.jaro_distance("dwayne", "duane") == 0.8222222222222223
assert String.jaro_distance("dixon", "dicksonx") == 0.7666666666666666
assert String.jaro_distance("xdicksonx", "dixon") == 0.7851851851851852
assert String.jaro_distance("xdicksonx", "dixon") == 0.7518518518518519
assert String.jaro_distance("shackleford", "shackelford") == 0.9696969696969697
assert String.jaro_distance("dunningham", "cunnigham") == 0.8962962962962964
assert String.jaro_distance("nichleson", "nichulson") == 0.9259259259259259
Expand All @@ -999,6 +999,7 @@ defmodule StringTest do
assert String.jaro_distance("jon", "john") == 0.9166666666666666
assert String.jaro_distance("jon", "jan") == 0.7777777777777777
assert String.jaro_distance("семена", "стремя") == 0.6666666666666666
assert String.jaro_distance("Sunday", "Saturday") == 0.7194444444444444
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding the example mentioned here

end

test "myers_difference/2" do
Expand Down