Skip to content

Commit cb08cfc

Browse files
authored
ReadOnlySpan<T> Support (#36)
Resolves #35: adds support for comparisons using byte[], or any other scenario where, e.g., ReadOnlySpan might be preferred.
1 parent 1fa3892 commit cb08cfc

22 files changed

+313
-142
lines changed

Diff for: .gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -243,3 +243,4 @@ ModelManifest.xml
243243
# FAKE - F# Make
244244
.fake/
245245
*.DS_Store
246+
.idea/

Diff for: src/F23.StringSimilarity/Cosine.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ namespace F23.StringSimilarity
3232
public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStringDistance
3333
{
3434
/// <summary>
35-
/// Implements Cosine Similarity between strings.The strings are first
35+
/// Implements Cosine Similarity between strings. The strings are first
3636
/// transformed into vectors of occurrences of k-shingles (sequences of k
3737
/// characters). In this n-dimensional space, the similarity between the two
3838
/// strings is the cosine of their respective vectors.
@@ -41,7 +41,7 @@ public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStri
4141
public Cosine(int k) : base(k) { }
4242

4343
/// <summary>
44-
/// Implements Cosine Similarity between strings.The strings are first
44+
/// Implements Cosine Similarity between strings. The strings are first
4545
/// transformed into vectors of occurrences of k-shingles (sequences of k
4646
/// characters). In this n-dimensional space, the similarity between the two
4747
/// strings is the cosine of their respective vectors.

Diff for: src/F23.StringSimilarity/Damerau.cs

+8-4
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ namespace F23.StringSimilarity
4141
/// This is not to be confused with the optimal string alignment distance, which
4242
/// is an extension where no substring can be edited more than once.
4343
/// </summary>
44-
public class Damerau : IMetricStringDistance
44+
public class Damerau : IMetricStringDistance, IMetricSpanDistance
4545
{
4646
/// <summary>
4747
/// Compute the distance between strings: the minimum number of operations
@@ -54,6 +54,10 @@ public class Damerau : IMetricStringDistance
5454
/// <returns>The computed distance.</returns>
5555
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
5656
public double Distance(string s1, string s2)
57+
=> Distance(s1.AsSpan(), s2.AsSpan());
58+
59+
public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
60+
where T : IEquatable<T>
5761
{
5862
if (s1 == null)
5963
{
@@ -65,7 +69,7 @@ public double Distance(string s1, string s2)
6569
throw new ArgumentNullException(nameof(s2));
6670
}
6771

68-
if (s1.Equals(s2))
72+
if (s1.SequenceEqual(s2))
6973
{
7074
return 0;
7175
}
@@ -74,7 +78,7 @@ public double Distance(string s1, string s2)
7478
int inf = s1.Length + s2.Length;
7579

7680
// Create and initialize the character array indices
77-
var da = new Dictionary<char, int>();
81+
var da = new Dictionary<T, int>();
7882

7983
for (int d = 0; d < s1.Length; d++)
8084
{
@@ -115,7 +119,7 @@ public double Distance(string s1, string s2)
115119
int j1 = db;
116120

117121
int cost = 1;
118-
if (s1[i - 1] == s2[j - 1])
122+
if (s1[i - 1].Equals(s2[j - 1]))
119123
{
120124
cost = 0;
121125
db = j;

Diff for: src/F23.StringSimilarity/F23.StringSimilarity.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0</TargetFrameworks>
3+
<TargetFramework>netstandard2.0</TargetFramework>
44
<PackageId>F23.StringSimilarity</PackageId>
55
<PackageTags>string;similarity;distance;levenshtein;jaro-winkler;lcs;cosine</PackageTags>
66
<Title>StringSimilarity.NET</Title>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
using System;
2+
3+
namespace F23.StringSimilarity.Interfaces
4+
{
5+
/// <summary>
6+
/// Span distances that implement this interface are metrics, which means:
7+
/// - d(x, y) ≥ 0 (non-negativity, or separation axiom)
8+
/// - d(x, y) = 0 if and only if x = y (identity, or coincidence axiom)
9+
/// - d(x, y) = d(y, x) (symmetry)
10+
/// - d(x, z) ≤ d(x, y) + d(y, z) (triangle inequality).
11+
/// </summary>
12+
public interface IMetricSpanDistance : ISpanDistance
13+
{
14+
/// <summary>
15+
/// Compute and return the metric distance.
16+
/// </summary>
17+
/// <param name="b1">The first span.</param>
18+
/// <param name="b2">The second span.</param>
19+
/// <returns>The metric distance.</returns>
20+
new double Distance<T>(ReadOnlySpan<T> b1, ReadOnlySpan<T> b2)
21+
where T : IEquatable<T>;
22+
}
23+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace F23.StringSimilarity.Interfaces
2+
{
3+
public interface INormalizedSpanDistance : ISpanDistance
4+
{
5+
}
6+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace F23.StringSimilarity.Interfaces
2+
{
3+
public interface INormalizedSpanSimilarity : ISpanSimilarity
4+
{
5+
}
6+
}

Diff for: src/F23.StringSimilarity/Interfaces/ISpanDistance.cs

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
using System;
2+
3+
namespace F23.StringSimilarity.Interfaces
4+
{
5+
public interface ISpanDistance
6+
{
7+
/// <summary>
8+
/// Compute and return a measure of distance.
9+
/// Must be >= 0.
10+
///
11+
/// This method operates on spans such as byte arrays.
12+
/// Note that, when used on bytes, string encodings that
13+
/// use more than one byte per codepoint (such as UTF-8)
14+
/// are not supported and will most likely return
15+
/// incorrect results.
16+
/// </summary>
17+
/// <param name="b1">The first span.</param>
18+
/// <param name="b2">The second span.</param>
19+
/// <returns>The measure of distance between the spans.</returns>
20+
double Distance<T>(ReadOnlySpan<T> b1, ReadOnlySpan<T> b2)
21+
where T : IEquatable<T>;
22+
}
23+
}
+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
using System;
2+
3+
namespace F23.StringSimilarity.Interfaces
4+
{
5+
public interface ISpanSimilarity
6+
{
7+
/// <summary>
8+
/// Compute and return a measure of similarity between 2 spans.
9+
/// </summary>
10+
/// <param name="s1">The first span</param>
11+
/// <param name="s2">The second span</param>
12+
/// <returns>Similarity (0 means both spans are completely different)</returns>
13+
double Similarity<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
14+
where T : IEquatable<T>;
15+
}
16+
}

Diff for: src/F23.StringSimilarity/JaroWinkler.cs

+20-10
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
*/
2424

2525
using System;
26+
using System.Data.SqlTypes;
2627
using System.Linq;
2728
using F23.StringSimilarity.Interfaces;
2829
// ReSharper disable SuggestVarOrType_Elsewhere
@@ -38,7 +39,7 @@ namespace F23.StringSimilarity
3839
/// Jaro-Winkler was developed in the area of record linkage (duplicate
3940
/// detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0].
4041
/// The distance is computed as 1 - Jaro-Winkler similarity.
41-
public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistance
42+
public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistance, INormalizedSpanSimilarity, INormalizedSpanDistance
4243
{
4344
private const double DEFAULT_THRESHOLD = 0.7;
4445
private const int THREE = 3;
@@ -75,6 +76,10 @@ public JaroWinkler(double threshold)
7576
/// <returns>The Jaro-Winkler similarity in the range [0, 1]</returns>
7677
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
7778
public double Similarity(string s1, string s2)
79+
=> Similarity(s1.AsSpan(), s2.AsSpan());
80+
81+
public double Similarity<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
82+
where T : IEquatable<T>
7883
{
7984
if (s1 == null)
8085
{
@@ -86,7 +91,7 @@ public double Similarity(string s1, string s2)
8691
throw new ArgumentNullException(nameof(s2));
8792
}
8893

89-
if (s1.Equals(s2))
94+
if (s1.SequenceEqual(s2))
9095
{
9196
return 1f;
9297
}
@@ -117,10 +122,15 @@ public double Similarity(string s1, string s2)
117122
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
118123
public double Distance(string s1, string s2)
119124
=> 1.0 - Similarity(s1, s2);
125+
126+
public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
127+
where T : IEquatable<T>
128+
=> 1.0 - Similarity(s1, s2);
120129

121-
private static int[] Matches(string s1, string s2)
130+
private static int[] Matches<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
131+
where T : IEquatable<T>
122132
{
123-
string max, min;
133+
ReadOnlySpan<T> max, min;
124134
if (s1.Length > s2.Length)
125135
{
126136
max = s1;
@@ -141,11 +151,11 @@ private static int[] Matches(string s1, string s2)
141151
int matches = 0;
142152
for (int mi = 0; mi < min.Length; mi++)
143153
{
144-
char c1 = min[mi];
154+
var c1 = min[mi];
145155
for (int xi = Math.Max(mi - range, 0),
146156
xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++)
147157
{
148-
if (!match_flags[xi] && c1 == max[xi])
158+
if (!match_flags[xi] && c1.Equals(max[xi]))
149159
{
150160
match_indexes[mi] = xi;
151161
match_flags[xi] = true;
@@ -154,8 +164,8 @@ private static int[] Matches(string s1, string s2)
154164
}
155165
}
156166
}
157-
char[] ms1 = new char[matches];
158-
char[] ms2 = new char[matches];
167+
T[] ms1 = new T[matches];
168+
T[] ms2 = new T[matches];
159169
for (int i = 0, si = 0; i < min.Length; i++)
160170
{
161171
if (match_indexes[i] != -1)
@@ -175,15 +185,15 @@ private static int[] Matches(string s1, string s2)
175185
int transpositions = 0;
176186
for (int mi = 0; mi < ms1.Length; mi++)
177187
{
178-
if (ms1[mi] != ms2[mi])
188+
if (!ms1[mi].Equals(ms2[mi]))
179189
{
180190
transpositions++;
181191
}
182192
}
183193
int prefix = 0;
184194
for (int mi = 0; mi < min.Length; mi++)
185195
{
186-
if (s1[mi] == s2[mi])
196+
if (s1[mi].Equals(s2[mi]))
187197
{
188198
prefix++;
189199
}

Diff for: src/F23.StringSimilarity/Levenshtein.cs

+17-11
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,15 @@ namespace F23.StringSimilarity
3232
/// The Levenshtein distance between two words is the minimum number of
3333
/// single-character edits (insertions, deletions or substitutions) required to
3434
/// change one string into the other.
35-
public class Levenshtein : IMetricStringDistance
35+
public class Levenshtein : IMetricStringDistance, IMetricSpanDistance
3636
{
3737
/// <summary>
3838
/// Equivalent to Distance(s1, s2, Int32.MaxValue).
3939
/// </summary>
4040
/// <param name="s1">The first string to compare.</param>
4141
/// <param name="s2">The second string to compare.</param>
4242
/// <returns>The Levenshtein distance between strings</returns>
43-
public double Distance(string s1, string s2)
44-
{
45-
return Distance(s1, s2, int.MaxValue);
46-
}
43+
public double Distance(string s1, string s2) => Distance(s1, s2, int.MaxValue);
4744

4845
/// <summary>
4946
/// The Levenshtein distance, or edit distance, between two words is the
@@ -75,6 +72,14 @@ public double Distance(string s1, string s2)
7572
/// <returns>The Levenshtein distance between strings</returns>
7673
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
7774
public double Distance(string s1, string s2, int limit)
75+
=> Distance(s1.AsSpan(), s2.AsSpan(), limit);
76+
77+
public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
78+
where T : IEquatable<T>
79+
=> Distance(s1, s2, int.MaxValue);
80+
81+
public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
82+
where T : IEquatable<T>
7883
{
7984
if (s1 == null)
8085
{
@@ -86,7 +91,7 @@ public double Distance(string s1, string s2, int limit)
8691
throw new ArgumentNullException(nameof(s2));
8792
}
8893

89-
if (s1.Equals(s2))
94+
if (s1.SequenceEqual(s2))
9095
{
9196
return 0;
9297
}
@@ -127,15 +132,16 @@ public double Distance(string s1, string s2, int limit)
127132
for (int j = 0; j < s2.Length; j++)
128133
{
129134
int cost = 1;
130-
if (s1[i] == s2[j])
135+
if (s1[i].Equals(s2[j]))
131136
{
132137
cost = 0;
133138
}
139+
134140
v1[j + 1] = Math.Min(
135-
v1[j] + 1, // Cost of insertion
136-
Math.Min(
137-
v0[j + 1] + 1, // Cost of remove
138-
v0[j] + cost)); // Cost of substitution
141+
v1[j] + 1, // Cost of insertion
142+
Math.Min(
143+
v0[j + 1] + 1, // Cost of remove
144+
v0[j] + cost)); // Cost of substitution
139145

140146
minv1 = Math.Min(minv1, v1[j + 1]);
141147
}

Diff for: src/F23.StringSimilarity/LongestCommonSubsequence.cs

+11-6
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ namespace F23.StringSimilarity
4444
///
4545
/// ! This class currently implements the dynamic programming approach, which has
4646
/// a space requirement O(m * n)!
47-
public class LongestCommonSubsequence : IStringDistance
47+
public class LongestCommonSubsequence : IStringDistance, ISpanDistance
4848
{
4949
/// <summary>
5050
/// Return the LCS distance between strings s1 and s2, computed as |s1| +
@@ -58,6 +58,10 @@ public class LongestCommonSubsequence : IStringDistance
5858
/// </returns>
5959
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
6060
public double Distance(string s1, string s2)
61+
=> Distance(s1.AsSpan(), s2.AsSpan());
62+
63+
public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
64+
where T : IEquatable<T>
6165
{
6266
if (s1 == null)
6367
{
@@ -69,7 +73,7 @@ public double Distance(string s1, string s2)
6973
throw new ArgumentNullException(nameof(s2));
7074
}
7175

72-
if (s1.Equals(s2))
76+
if (s1.SequenceEqual(s2))
7377
{
7478
return 0;
7579
}
@@ -86,6 +90,10 @@ public double Distance(string s1, string s2)
8690
/// <returns>The length of LCS(s1, s2)</returns>
8791
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
8892
public int Length(string s1, string s2)
93+
=> Length(s1.AsSpan(), s2.AsSpan());
94+
95+
internal static int Length<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2)
96+
where T : IEquatable<T>
8997
{
9098
if (s1 == null)
9199
{
@@ -113,8 +121,6 @@ public int Length(string s1, string s2)
113121
*/
114122
int s1_length = s1.Length;
115123
int s2_length = s2.Length;
116-
char[] x = s1.ToCharArray();
117-
char[] y = s2.ToCharArray();
118124

119125
int[,] c = new int[s1_length + 1, s2_length + 1];
120126

@@ -132,10 +138,9 @@ public int Length(string s1, string s2)
132138
{
133139
for (int j = 1; j <= s2_length; j++)
134140
{
135-
if (x[i - 1] == y[j - 1])
141+
if (s1[i - 1].Equals(s2[j - 1]))
136142
{
137143
c[i, j] = c[i - 1, j - 1] + 1;
138-
139144
}
140145
else
141146
{

0 commit comments

Comments
 (0)