-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Use Erlang's implementation of jaro distance, fix bugs #13369
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3066,74 +3066,15 @@ defmodule String do | |
@spec jaro_distance(t, t) :: float | ||
def jaro_distance(string1, string2) | ||
|
||
def jaro_distance(string, string), do: 1.0 | ||
def jaro_distance(string, string) when is_binary(string), do: 1.0 | ||
def jaro_distance(_string, ""), do: 0.0 | ||
def jaro_distance("", _string), do: 0.0 | ||
|
||
def jaro_distance(string1, string2) when is_binary(string1) and is_binary(string2) do | ||
{chars1, len1} = graphemes_and_length(string1) | ||
{chars2, len2} = graphemes_and_length(string2) | ||
|
||
case match(chars1, len1, chars2, len2) do | ||
{0, _trans} -> | ||
0.0 | ||
|
||
{comm, trans} -> | ||
(comm / len1 + comm / len2 + (comm - trans) / comm) / 3 | ||
end | ||
end | ||
# TODO: Remove me when we require Erlang/OTP 27+ | ||
@jaro_module if :erlang.system_info(:otp_release) >= [?2, ?7], do: :string, else: :elixir_utils | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wasn't sure: is it enough to check the OTP version at compile time, or is there another preferred way? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can just always call the one in elixir_utils for now. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense! |
||
|
||
defp match(chars1, len1, chars2, len2) do | ||
if len1 < len2 do | ||
match(chars1, chars2, div(len2, 2) - 1) | ||
else | ||
match(chars2, chars1, div(len1, 2) - 1) | ||
end | ||
end | ||
|
||
defp match(chars1, chars2, lim) do | ||
match(chars1, chars2, {0, lim}, {0, 0, -1}, 0) | ||
end | ||
|
||
defp match([char | rest], chars, range, state, idx) do | ||
{chars, state} = submatch(char, chars, range, state, idx) | ||
|
||
case range do | ||
{lim, lim} -> match(rest, tl(chars), range, state, idx + 1) | ||
{pre, lim} -> match(rest, chars, {pre + 1, lim}, state, idx + 1) | ||
end | ||
end | ||
|
||
defp match([], _, _, {comm, trans, _}, _), do: {comm, trans} | ||
|
||
defp submatch(char, chars, {pre, _} = range, state, idx) do | ||
case detect(char, chars, range) do | ||
nil -> | ||
{chars, state} | ||
|
||
{subidx, chars} -> | ||
{chars, proceed(state, idx - pre + subidx)} | ||
end | ||
end | ||
|
||
defp detect(char, chars, {pre, lim}) do | ||
detect(char, chars, pre + 1 + lim, 0, []) | ||
end | ||
|
||
defp detect(_char, _chars, 0, _idx, _acc), do: nil | ||
defp detect(_char, [], _lim, _idx, _acc), do: nil | ||
|
||
defp detect(char, [char | rest], _lim, idx, acc), do: {idx, Enum.reverse(acc, [nil | rest])} | ||
|
||
defp detect(char, [other | rest], lim, idx, acc), | ||
do: detect(char, rest, lim - 1, idx + 1, [other | acc]) | ||
|
||
defp proceed({comm, trans, former}, current) do | ||
if current < former do | ||
{comm + 1, trans + 1, current} | ||
else | ||
{comm + 1, trans, current} | ||
end | ||
def jaro_distance(string1, string2) when is_binary(string1) and is_binary(string2) do | ||
@jaro_module.jaro_similarity(string1, string2) | ||
end | ||
|
||
@doc """ | ||
|
@@ -3168,7 +3109,6 @@ defmodule String do | |
codepoint_byte_size: 1, | ||
grapheme_byte_size: 1, | ||
grapheme_to_binary: 1, | ||
graphemes_and_length: 1, | ||
reverse_characters_to_binary: 1} | ||
|
||
defp byte_size_unicode(binary) when is_binary(binary), do: byte_size(binary) | ||
|
@@ -3205,22 +3145,6 @@ defmodule String do | |
defp grapheme_byte_size([], acc), | ||
do: acc | ||
|
||
defp graphemes_and_length(string), | ||
do: graphemes_and_length(string, [], 0) | ||
|
||
defp graphemes_and_length(string, acc, length) do | ||
case :unicode_util.gc(string) do | ||
[gc | rest] -> | ||
graphemes_and_length(rest, [gc | acc], length + 1) | ||
|
||
[] -> | ||
{:lists.reverse(acc), length} | ||
|
||
{:error, <<byte, rest::bits>>} -> | ||
graphemes_and_length(rest, [<<byte>> | acc], length + 1) | ||
end | ||
end | ||
|
||
defp reverse_characters_to_binary(acc), | ||
do: acc |> :lists.reverse() |> :unicode.characters_to_binary() | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -982,7 +982,7 @@ defmodule StringTest do | |
assert String.jaro_distance("marhha", "martha") == 0.888888888888889 | ||
assert String.jaro_distance("dwayne", "duane") == 0.8222222222222223 | ||
assert String.jaro_distance("dixon", "dicksonx") == 0.7666666666666666 | ||
assert String.jaro_distance("xdicksonx", "dixon") == 0.7851851851851852 | ||
assert String.jaro_distance("xdicksonx", "dixon") == 0.7518518518518519 | ||
assert String.jaro_distance("shackleford", "shackelford") == 0.9696969696969697 | ||
assert String.jaro_distance("dunningham", "cunnigham") == 0.8962962962962964 | ||
assert String.jaro_distance("nichleson", "nichulson") == 0.9259259259259259 | ||
|
@@ -999,6 +999,7 @@ defmodule StringTest do | |
assert String.jaro_distance("jon", "john") == 0.9166666666666666 | ||
assert String.jaro_distance("jon", "jan") == 0.7777777777777777 | ||
assert String.jaro_distance("семена", "стремя") == 0.6666666666666666 | ||
assert String.jaro_distance("Sunday", "Saturday") == 0.7194444444444444 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Adding the example mentioned here |
||
end | ||
|
||
test "myers_difference/2" do | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I felt we could keep these optimizations since they weren't in the Erlang version, WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fine to keep them, yeah. :)