Skip to content

Commit 3d9ddb8

Browse files
committed
2 parents b7c19ec + 488b075 commit 3d9ddb8

23 files changed

+171
-855
lines changed

Diff for: .travis.yml

+5-37
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,7 @@
11
language: csharp
2-
solution: F23.StringSimilarity.sln
3-
install:
4-
- nuget restore F23.StringSimilarity.sln
5-
- nuget install xunit.runner.console -Version 2.1.0 -OutputDirectory testrunner
6-
# - nuget install coveralls.io -Version 1.3.4 -OutputDirectory coveralls
7-
8-
# - CURRENT_WORKING_DIR=$(pwd)
9-
10-
# - sudo apt-get install gtk-sharp2
11-
# - curl -sS https://api.nuget.org/packages/mono.cecil.0.9.5.4.nupkg > /tmp/mono.cecil.0.9.5.4.nupkg.zip
12-
# - unzip /tmp/mono.cecil.0.9.5.4.nupkg.zip -d /tmp/cecil
13-
# - cp /tmp/cecil/lib/net40/Mono.Cecil.dll .
14-
# - cp /tmp/cecil/lib/net40/Mono.Cecil.dll /tmp/cecil/
15-
# - git clone --depth=50 git://github.com/csMACnz/monocov.git ../../csMACnz/monocov
16-
# - cd ../../csMACnz/monocov
17-
# - cp /tmp/cecil/Mono.Cecil.dll .
18-
# - ./configure
19-
# - make
20-
# - sudo make install
21-
# - cd $CURRENT_WORKING_DIR
22-
2+
mono: none
3+
dotnet: 2.0.0
4+
dist: trusty
235
script:
24-
- xbuild /p:Configuration=Release F23.StringSimilarity.sln
25-
- mono ./testrunner/xunit.runner.console.2.1.0/tools/xunit.console.exe ./test/F23.StringSimilarity.Tests/bin/Release/F23.StringSimilarity.Tests.dll
26-
27-
# - export LD_LIBRARY_PATH=/usr/local/lib
28-
# - mono --debug --profile=monocov:outfile=monocovCoverage.cov,+[F23.StringSimilarity],+[F23.StringSimilarity.Tests] ./testrunner/xunit.runner.console.2.1.0/tools/xunit.console.exe ./test/F23.StringSimilarity.Tests/bin/Release/F23.StringSimilarity.Tests.dll
29-
# - monocov --export-xml=monocovCoverage monocovCoverage.cov
30-
# - REPO_COMMIT_AUTHOR=$(git show -s --pretty=format:"%cn")
31-
# - REPO_COMMIT_AUTHOR_EMAIL=$(git show -s --pretty=format:"%ce")
32-
# - REPO_COMMIT_MESSAGE=$(git show -s --pretty=format:"%s")
33-
# - echo $TRAVIS_COMMIT
34-
# - echo $TRAVIS_BRANCH
35-
# - echo $REPO_COMMIT_AUTHOR
36-
# - echo $REPO_COMMIT_AUTHOR_EMAIL
37-
# - echo $REPO_COMMIT_MESSAGE
38-
# - echo $TRAVIS_JOB_ID
39-
# - mono ./coveralls/coveralls.io.1.3.4/tools/coveralls.net.exe --monocov -i ./monocovCoverage --commitId $TRAVIS_COMMIT --commitBranch $TRAVIS_BRANCH --commitAuthor "$REPO_COMMIT_AUTHOR" --commitEmail "$REPO_COMMIT_AUTHOR_EMAIL" --commitMessage "$REPO_COMMIT_MESSAGE" --jobId $TRAVIS_JOB_ID --serviceName "travis-ci" --useRelativePaths
6+
- dotnet build -c Release src/F23.StringSimilarity/F23.StringSimilarity.csproj
7+
- dotnet test test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj

Diff for: LICENSE

+22
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,25 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
SOFTWARE.
2222

23+
Portions of this code are licensed and copyright as follows:
24+
25+
Copyright 2015 Thibault Debatty.
26+
27+
Permission is hereby granted, free of charge, to any person obtaining
28+
a copy of this software and associated documentation files (the
29+
"Software"), to deal in the Software without restriction, including
30+
without limitation the rights to use, copy, modify, merge, publish,
31+
distribute, sublicense, and/or sell copies of the Software, and to
32+
permit persons to whom the Software is furnished to do so, subject to
33+
the following conditions:
34+
35+
The above copyright notice and this permission notice shall be
36+
included in all copies or substantial portions of the Software.
37+
38+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
40+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
42+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
43+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
44+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Diff for: build/F23.StringSimilarity.nuspec

-22
This file was deleted.

Diff for: src/F23.StringSimilarity/Cosine.cs

+10-6
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,14 @@ public Cosine(int k) : base(k) { }
4949
/// Default k is 3.
5050
/// </summary>
5151
public Cosine() { }
52-
52+
5353
/// <summary>
5454
/// Compute the cosine similarity between strings.
5555
/// </summary>
5656
/// <param name="s1">The first string to compare.</param>
5757
/// <param name="s2">The second string to compare.</param>
5858
/// <returns>The cosine similarity in the range [0, 1]</returns>
59-
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
59+
/// <exception cref="T:System.ArgumentNullException">If s1 or s2 is null.</exception>
6060
public double Similarity(string s1, string s2)
6161
{
6262
if (s1 == null)
@@ -118,9 +118,7 @@ private static double DotProduct(IDictionary<string, int> profile1,
118118
double agg = 0;
119119
foreach (var entry in small_profile)
120120
{
121-
int i;
122-
123-
if (!large_profile.TryGetValue(entry.Key, out i)) continue;
121+
if (!large_profile.TryGetValue(entry.Key, out var i)) continue;
124122

125123
agg += 1.0 * entry.Value * i;
126124
}
@@ -137,7 +135,13 @@ private static double DotProduct(IDictionary<string, int> profile1,
137135
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
138136
public double Distance(string s1, string s2)
139137
=> 1.0 - Similarity(s1, s2);
140-
138+
139+
/// <summary>
140+
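/// Compute the cosine similarity between two pre-computed profiles: their dot product divided by the product of their norms.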
141+
/// </summary>
142+
/// <param name="profile1"></param>
143+
/// <param name="profile2"></param>
144+
/// <returns></returns>
141145
public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
142146
=> DotProduct(profile1, profile2)
143147
/ (Norm(profile1) * Norm(profile2));

Diff for: src/F23.StringSimilarity/Damerau.cs

-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ namespace F23.StringSimilarity
3838
/// substitution of a single character, or a transposition of two adjacent
3939
/// characters.
4040
/// It does respect triangle inequality, and is thus a metric distance.
41-
///
4241
/// This is not to be confused with the optimal string alignment distance, which
4342
/// is an extension where no substring can be edited more than once.
4443
/// </summary>

Diff for: src/F23.StringSimilarity/F23.StringSimilarity.csproj

+20-74
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,23 @@
1-
<?xml version="1.0" encoding="utf-8"?>
2-
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3-
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
43
<PropertyGroup>
5-
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
6-
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
7-
<ProjectGuid>{FA27327B-BCCC-46C7-8EED-BBD1ECF4BF53}</ProjectGuid>
8-
<OutputType>Library</OutputType>
9-
<AppDesignerFolder>Properties</AppDesignerFolder>
10-
<RootNamespace>F23.StringSimilarity</RootNamespace>
11-
<AssemblyName>F23.StringSimilarity</AssemblyName>
12-
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
13-
<FileAlignment>512</FileAlignment>
4+
<TargetFramework>netstandard1.0</TargetFramework>
5+
<PackageId>F23.StringSimilarity</PackageId>
6+
<PackageVersion>3.0.0</PackageVersion>
7+
<Title>StringSimilarity.NET</Title>
8+
<Authors>James Blair, Paul Irwin</Authors>
9+
<Copyright>Copyright 2016 feature[23]</Copyright>
10+
<Description>A .NET port of java-string-similarity.</Description>
11+
<Summary>A .NET port of java-string-similarity (https://github.com/tdebatty/java-string-similarity). A library implementing different string similarity and distance measures. Several algorithms (including Levenshtein edit distance and siblings, Jaro-Winkler, Longest Common Subsequence, cosine similarity etc.) are currently implemented.</Summary>
12+
<PackageProjectUrl>https://github.com/feature23/StringSimilarity.NET</PackageProjectUrl>
13+
<PackageLicenseUrl>https://raw.githubusercontent.com/feature23/StringSimilarity.NET/master/LICENSE</PackageLicenseUrl>
14+
<PackageIconUrl>https://raw.githubusercontent.com/feature23/StringSimilarity.NET/master/logo.png</PackageIconUrl>
15+
<PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
16+
<PackageTags>string similarity distance cosine damerau jaccard jaro-winkler levenshtein ngram qgram shingle sift4</PackageTags>
1417
</PropertyGroup>
15-
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
16-
<DebugSymbols>true</DebugSymbols>
17-
<DebugType>full</DebugType>
18-
<Optimize>false</Optimize>
19-
<OutputPath>bin\Debug\</OutputPath>
20-
<DefineConstants>DEBUG;TRACE</DefineConstants>
21-
<ErrorReport>prompt</ErrorReport>
22-
<WarningLevel>4</WarningLevel>
18+
19+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
20+
<DocumentationFile>bin\Release\netstandard1.0\F23.StringSimilarity.xml</DocumentationFile>
2321
</PropertyGroup>
24-
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
25-
<DebugType>pdbonly</DebugType>
26-
<Optimize>true</Optimize>
27-
<OutputPath>bin\Release\</OutputPath>
28-
<DefineConstants>TRACE</DefineConstants>
29-
<ErrorReport>prompt</ErrorReport>
30-
<WarningLevel>4</WarningLevel>
31-
</PropertyGroup>
32-
<ItemGroup>
33-
<Reference Include="System" />
34-
<Reference Include="System.Core" />
35-
<Reference Include="System.Xml.Linq" />
36-
<Reference Include="System.Data.DataSetExtensions" />
37-
<Reference Include="Microsoft.CSharp" />
38-
<Reference Include="System.Data" />
39-
<Reference Include="System.Net.Http" />
40-
<Reference Include="System.Xml" />
41-
</ItemGroup>
42-
<ItemGroup>
43-
<Compile Include="Cosine.cs" />
44-
<Compile Include="Damerau.cs" />
45-
<Compile Include="Experimental\Sift4.cs" />
46-
<Compile Include="ICharacterSubstitution.cs" />
47-
<Compile Include="Interfaces\IMetricStringDistance.cs" />
48-
<Compile Include="Interfaces\INormalizedStringDistance.cs" />
49-
<Compile Include="Interfaces\INormalizedStringSimilarity.cs" />
50-
<Compile Include="Interfaces\IStringDistance.cs" />
51-
<Compile Include="Interfaces\IStringSimilarity.cs" />
52-
<Compile Include="Jaccard.cs" />
53-
<Compile Include="JaroWinkler.cs" />
54-
<Compile Include="Levenshtein.cs" />
55-
<Compile Include="LongestCommonSubsequence.cs" />
56-
<Compile Include="MetricLCS.cs" />
57-
<Compile Include="NGram.cs" />
58-
<Compile Include="NormalizedLevenshtein.cs" />
59-
<Compile Include="OptimalStringAlignment.cs" />
60-
<Compile Include="Properties\AssemblyInfo.cs" />
61-
<Compile Include="QGram.cs" />
62-
<Compile Include="ShingleBased.cs" />
63-
<Compile Include="SorensenDice.cs" />
64-
<Compile Include="Support\ArrayExtensions.cs" />
65-
<Compile Include="Utils\SparseBooleanVector.cs" />
66-
<Compile Include="Utils\SparseIntegerVector.cs" />
67-
<Compile Include="WeightedLevenshtein.cs" />
68-
</ItemGroup>
69-
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
70-
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
71-
Other similar extension points exist, see Microsoft.Common.targets.
72-
<Target Name="BeforeBuild">
73-
</Target>
74-
<Target Name="AfterBuild">
75-
</Target>
76-
-->
77-
</Project>
22+
23+
</Project>

Diff for: src/F23.StringSimilarity/Interfaces/IMetricStringDistance.cs

+6
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ namespace F23.StringSimilarity.Interfaces
3333
/// </summary>
3434
public interface IMetricStringDistance : IStringDistance
3535
{
36+
/// <summary>
37+
/// Compute and return the metric distance.
38+
/// </summary>
39+
/// <param name="s1"></param>
40+
/// <param name="s2"></param>
41+
/// <returns></returns>
3642
new double Distance(string s1, string s2);
3743
}
3844
}

Diff for: src/F23.StringSimilarity/Interfaces/IStringDistance.cs

+7
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ namespace F23.StringSimilarity.Interfaces
2626
{
2727
public interface IStringDistance
2828
{
29+
/// <summary>
30+
/// Compute and return a measure of distance.
31+
/// Must be >= 0.
32+
/// </summary>
33+
/// <param name="s1"></param>
34+
/// <param name="s2"></param>
35+
/// <returns></returns>
2936
double Distance(string s1, string s2);
3037
}
3138
}

Diff for: src/F23.StringSimilarity/Jaccard.cs

+22-7
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,30 @@
3030

3131
namespace F23.StringSimilarity
3232
{
33+
/// <summary>
34+
/// Each input string is converted into a set of n-grams, the Jaccard index is
35+
/// then computed as |V1 inter V2| / |V1 union V2|.
36+
/// Like Q-Gram distance, the input strings are first converted into sets of
37+
/// n-grams (sequences of n characters, also called k-shingles), but this time
38+
/// the cardinality of each n-gram is not taken into account.
39+
/// Distance is computed as 1 - Jaccard index.
40+
/// Jaccard distance is a metric distance.
41+
/// </summary>
3342
public class Jaccard : ShingleBased, IMetricStringDistance, INormalizedStringDistance, INormalizedStringSimilarity
3443
{
44+
/// <summary>
45+
/// The strings are first transformed into sets of k-shingles (sequences of k
46+
/// characters), then Jaccard index is computed as |A inter B| / |A union B|.
47+
/// The default value of k is 3.
48+
/// </summary>
49+
/// <param name="k">The shingle length, i.e. the number of characters in each k-shingle.</param>
3550
public Jaccard(int k) : base(k) { }
3651

52+
/// <summary>
53+
/// The strings are first transformed into sets of k-shingles (sequences of k
54+
/// characters), then Jaccard index is computed as |A inter B| / |A union B|.
55+
/// The default value of k is 3.
56+
/// </summary>
3757
public Jaccard() { }
3858

3959
/// <summary>
@@ -67,13 +87,8 @@ public double Similarity(string s1, string s2)
6787
union.UnionWith(profile1.Keys);
6888
union.UnionWith(profile2.Keys);
6989

70-
int inter = 0;
71-
72-
foreach (var key in union)
73-
{
74-
if (profile1.ContainsKey(key) && profile2.ContainsKey(key))
75-
inter++;
76-
}
90+
int inter = profile1.Keys.Count + profile2.Keys.Count
91+
- union.Count;
7792

7893
return 1.0 * inter / union.Count;
7994
}

Diff for: src/F23.StringSimilarity/JaroWinkler.cs

+9-9
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ public double Similarity(string s1, string s2)
118118
public double Distance(string s1, string s2)
119119
=> 1.0 - Similarity(s1, s2);
120120

121-
private int[] Matches(string s1, string s2)
121+
private static int[] Matches(string s1, string s2)
122122
{
123123
string max, min;
124124
if (s1.Length > s2.Length)
@@ -135,20 +135,20 @@ private int[] Matches(string s1, string s2)
135135

136136
//int[] matchIndexes = new int[min.Length];
137137
//Arrays.fill(matchIndexes, -1);
138-
int[] matchIndexes = Enumerable.Repeat(-1, min.Length).ToArray();
138+
int[] match_indexes = Enumerable.Repeat(-1, min.Length).ToArray();
139139

140-
bool[] matchFlags = new bool[max.Length];
140+
bool[] match_flags = new bool[max.Length];
141141
int matches = 0;
142142
for (int mi = 0; mi < min.Length; mi++)
143143
{
144144
char c1 = min[mi];
145145
for (int xi = Math.Max(mi - range, 0),
146146
xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++)
147147
{
148-
if (!matchFlags[xi] && c1 == max[xi])
148+
if (!match_flags[xi] && c1 == max[xi])
149149
{
150-
matchIndexes[mi] = xi;
151-
matchFlags[xi] = true;
150+
match_indexes[mi] = xi;
151+
match_flags[xi] = true;
152152
matches++;
153153
break;
154154
}
@@ -158,15 +158,15 @@ private int[] Matches(string s1, string s2)
158158
char[] ms2 = new char[matches];
159159
for (int i = 0, si = 0; i < min.Length; i++)
160160
{
161-
if (matchIndexes[i] != -1)
161+
if (match_indexes[i] != -1)
162162
{
163163
ms1[si] = min[i];
164164
si++;
165165
}
166166
}
167167
for (int i = 0, si = 0; i < max.Length; i++)
168168
{
169-
if (matchFlags[i])
169+
if (match_flags[i])
170170
{
171171
ms2[si] = max[i];
172172
si++;
@@ -192,7 +192,7 @@ private int[] Matches(string s1, string s2)
192192
break;
193193
}
194194
}
195-
return new int[] { matches, transpositions / 2, prefix, max.Length };
195+
return new[] { matches, transpositions / 2, prefix, max.Length };
196196
}
197197
}
198198
}

0 commit comments

Comments
 (0)