diff --git a/FuzzySharp.Benchmarks/BenchmarkAll.cs b/FuzzySharp.Benchmarks/BenchmarkAll.cs index f7aea5c..3d89ca3 100644 --- a/FuzzySharp.Benchmarks/BenchmarkAll.cs +++ b/FuzzySharp.Benchmarks/BenchmarkAll.cs @@ -8,6 +8,12 @@ namespace Raffinert.FuzzySharp.Benchmarks; [MemoryDiagnoser] public class BenchmarkAll { + [GlobalSetup] + public void GlobalSetup() + { + GlobalConfig.PartialRatioAccuracy = PartialRatioAccuracy.Strict; + } + [Benchmark] public int Ratio() { @@ -18,7 +24,7 @@ public int Ratio() [Benchmark] public int PartialRatio() { - return Fuzz.PartialRatio("similar", "somewhresimlrbetweenthisstring"); + return Fuzz.PartialRatio("Supplier: ACME Corp. International, Address: 221B Baker St., London NW1 6XE", "Order: PO-100923, Supplier: Acme Corporation International, Address: 221B Baker Street, London NW1 6XE, VAT: GB123456789, Contact: accounting@acme.example"); } [Benchmark] @@ -96,7 +102,7 @@ public int RatioClassic() [Benchmark] public int PartialRatioClassic() { - return Classic.Fuzz.PartialRatio("similar", "somewhresimlrbetweenthisstring"); + return Classic.Fuzz.PartialRatio("Supplier: ACME Corp. International, Address: 221B Baker St., London NW1 6XE", "Order: PO-100923, Supplier: Acme Corporation International, Address: 221B Baker Street, London NW1 6XE, VAT: GB123456789, Contact: accounting@acme.example"); } [Benchmark] diff --git a/FuzzySharp.Benchmarks/BenchmarkFastPartial.cs b/FuzzySharp.Benchmarks/BenchmarkFastPartial.cs new file mode 100644 index 0000000..d219453 --- /dev/null +++ b/FuzzySharp.Benchmarks/BenchmarkFastPartial.cs @@ -0,0 +1,50 @@ +using BenchmarkDotNet.Attributes; +using Raffinert.FuzzySharp.PreProcess; + +namespace Raffinert.FuzzySharp.Benchmarks; + +[MemoryDiagnoser] +public class BenchmarkFastPartial +{ + [GlobalSetup] + public void GlobalSetup() + { + GlobalConfig.PartialRatioAccuracy = PartialRatioAccuracy.Fast; + } + + [Benchmark] + public int PartialRatio() + { + return Fuzz.PartialRatio("Supplier: ACME Corp. International, Address: 221B Baker St., London NW1 6XE", "Order: PO-100923, Supplier: Acme Corporation International, Address: 221B Baker Street, London NW1 6XE, VAT: GB123456789, Contact: accounting@acme.example"); + } + + [Benchmark] + public int PartialTokenSortRatio() + { + return Fuzz.PartialTokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int PartialTokenSetRatio() + { + return Fuzz.PartialTokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int WeightedRatio() + { + return Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog"); + } + + [Benchmark] + public int PartialTokenInitialismRatio() + { + return Fuzz.PartialTokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); + } + + [Benchmark] + public int PartialTokenAbbreviationRatio() + { + return Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); + } +} \ No newline at end of file diff --git a/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj index bf9a3a3..0fcb333 100644 --- a/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj +++ b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj @@ -2,7 +2,7 @@ Exe - net9.0 + NET10.0 enable enable $(MSBuildProjectName) @@ -13,6 +13,7 @@ + diff --git a/FuzzySharp.Benchmarks/Program.cs b/FuzzySharp.Benchmarks/Program.cs index 651fb45..d5749f6 100644 --- a/FuzzySharp.Benchmarks/Program.cs +++ b/FuzzySharp.Benchmarks/Program.cs @@ -1,43 +1,9 @@ using BenchmarkDotNet.Configs; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Running; - -//using Raffinert.FuzzySharp; -//using Raffinert.FuzzySharp.SimilarityRatio; -//using Raffinert.FuzzySharp.SimilarityRatio.Scorer.Composite; -//using Classic = FuzzySharp; +using Raffinert.FuzzySharp.Benchmarks; var config = ManualConfig.Create(DefaultConfig.Instance) - .AddJob(Job.ShortRun); // ← built-in short run - -BenchmarkRunner.Run(typeof(Program).Assembly, config); - -//var input1 = "+30.0% Damage to Close Enemies [30.01%"; -//var input2Collection = new[] -//{ -// "+#% Damage", -// "+#% Damage to Crowd Controlled Enemies", -// "+#% Damage to Close Enemies", -// "+#% Damage to Chilled Enemies", -// "+#% Damage to Poisoned Enemies", -// "#% Block Chance#% Blocked Damage Reduction", -// "#% Damage Reduction from Bleeding Enemies", -// "#% Damage Reduction", -// "+#% Cold Damage" -//}; - -//var classicScorer = Classic.SimilarityRatio.ScorerCache.Get(); - -//Func classicScorerFunc = input2 => classicScorer.Score(input1, input2); - -//var classicResult = input2Collection.Select(classicScorerFunc).ToList(); - -//var scorer = ScorerCache.Get(); - -//Func scorerFunc = input2 => scorer.Score(input1, input2); - -//var result = input2Collection.Select(scorerFunc).ToList(); - -//Console.WriteLine(); + .AddJob(Job.ShortRun); -//Console.WriteLine(Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog")); \ No newline at end of file +BenchmarkSwitcher.FromAssembly(typeof(BenchmarkFastPartial).Assembly).Run(args, config); \ No newline at end of file diff --git a/FuzzySharp.Test/FuzzySharp.Test.csproj b/FuzzySharp.Test/FuzzySharp.Test.csproj index 588c006..ebff19a 100644 --- a/FuzzySharp.Test/FuzzySharp.Test.csproj +++ b/FuzzySharp.Test/FuzzySharp.Test.csproj @@ -1,7 +1,7 @@  - netframework4.6.2;netframework4.7.2;NET8.0;NET9.0 + netframework4.6.2;netframework4.7.2;NET8.0;NET10.0 false 12.0 Raffinert.$(MSBuildProjectName) diff --git a/FuzzySharp.Test/FuzzyTests/FastPartialRatioTests.cs b/FuzzySharp.Test/FuzzyTests/FastPartialRatioTests.cs new file mode 100644 index 0000000..2fea5f6 --- /dev/null +++ b/FuzzySharp.Test/FuzzyTests/FastPartialRatioTests.cs @@ -0,0 +1,237 @@ +using System; +using NUnit.Framework; +using Raffinert.FuzzySharp.PreProcess; + +namespace Raffinert.FuzzySharp.Test.FuzzyTests; + +[TestFixture] +public class FastPartialRatioTests : IDisposable +{ + #region Private Fields + private string _s1, + _s1A, + _s2, + _s3, + _s4, + _s5, + _s6, + _s7, + _s8, + _s8A, + _s9, + _s9A, + _s10, + _s10A; + + private string[] _cirqueStrings, _baseballStrings; + #endregion + + [SetUp] + public void Setup() + { + GlobalConfig.PartialRatioAccuracy = PartialRatioAccuracy.Fast; + _s1 = "new york mets"; + _s1A = "new york mets"; + _s2 = "new YORK mets"; + _s3 = "the wonderful new york mets"; + _s4 = "new york mets vs atlanta braves"; + _s5 = "atlanta braves vs new york mets"; + _s6 = "new york mets - atlanta braves"; + _s7 = "new york city mets - atlanta braves"; + // Edge cases + _s8 = "{"; + _s8A = "{"; + _s9 = "{a"; + _s9A = "{a"; + _s10 = "a{"; + _s10A = "{b"; + } + + public void Dispose() + { + GlobalConfig.PartialRatioAccuracy = PartialRatioAccuracy.Strict; + } + + [Test] + public void Test_Equal() + { + Assert.AreEqual(Fuzz.Ratio(_s1, _s1A), 100); + Assert.AreEqual(Fuzz.Ratio(_s8, _s8A), 100); + Assert.AreEqual(Fuzz.Ratio(_s9, _s9A), 100); + } + + [Test] + public void Test_Case_Insensitive() + { + Assert.AreNotEqual(Fuzz.Ratio(_s1, _s2), 100); + Assert.AreEqual(Fuzz.Ratio(_s1, _s2, PreprocessMode.Full), 100); + } + + [Test] + public void Test_Partial() + { + Assert.AreEqual(Fuzz.PartialRatio(_s1, _s3), 100); + } + + [Test] + public void TestTokenSortRatio() + { + Assert.AreEqual(Fuzz.TokenSortRatio(_s1, _s1A), 100); + } + + [Test] + public void TestPartialTokenSortRatio() + { + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s1, _s1A, PreprocessMode.Full), 100); + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s4, _s5, PreprocessMode.Full), 100); + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s8, _s8A), 100); + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s9, _s9A, PreprocessMode.Full), 100); + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s9, _s9A), 100); + + //var al = Fuzz1.PartialRatioAlignment("a certain string".AsSpan(), "cetain".AsSpan()); + + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s10, _s10A), 50); // 67 in strict mode + Assert.AreEqual(Fuzz.PartialTokenSortRatio(_s10, _s10A, PreprocessMode.Full), 0); + } + + [Test] + public void TestTokenSetRatio() + { + Assert.AreEqual(Fuzz.TokenSetRatio(_s4, _s5, PreprocessMode.Full), 100); + Assert.AreEqual(Fuzz.TokenSetRatio(_s8, _s8A), 100); + Assert.AreEqual(Fuzz.TokenSetRatio(_s9, _s9A, PreprocessMode.Full), 100); + Assert.AreEqual(Fuzz.TokenSetRatio(_s9, _s9A), 100); + Assert.AreEqual(Fuzz.TokenSetRatio(_s10, _s10A), 50); + } + + [Test] + public void TestTokenAbbreviationRatio() + { + Assert.AreEqual(Fuzz.TokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full), 40); + Assert.AreEqual(Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full), 50); // 67 in strict mode + } + + [Test] + public void TestPartialTokenSetRatio() + { + Assert.AreEqual(Fuzz.PartialTokenSetRatio(_s4, _s7), 100); + } + + [Test] + public void TestWeightedRatioEqual() + { + Assert.AreEqual(Fuzz.WeightedRatio(_s1, _s1A), 100); + } + + [Test] + public void TestWeightedRatioCaseInsensitive() + { + Assert.AreEqual(Fuzz.WeightedRatio(_s1, _s2, PreprocessMode.Full), 100); + } + + [Test] + public void TestWeightedRatioPartialMatch() + { + Assert.AreEqual(Fuzz.WeightedRatio(_s1, _s3), 90); + } + + [Test] + public void TestWeightedRatioMisorderedMatch() + { + Assert.AreEqual(Fuzz.WeightedRatio(_s4, _s5), 95); + } + + [Test] + public void TestEmptyStringsScore0() + { + Assert.That(Fuzz.Ratio("test_string", ""), Is.EqualTo(0)); + Assert.That(Fuzz.PartialRatio("test_string", ""), Is.EqualTo(0)); + Assert.That(Fuzz.Ratio("", ""), Is.EqualTo(0)); + Assert.That(Fuzz.PartialRatio("", ""), Is.EqualTo(0)); + } + + [Test] + public void TestIssueSeven() + { + _s1 = "HSINCHUANG"; + _s2 = "SINJHUAN"; + _s3 = "LSINJHUANG DISTRIC"; + _s4 = "SINJHUANG DISTRICT"; + + Assert.IsTrue(Fuzz.PartialRatio(_s1, _s2) > 75); + Assert.IsTrue(Fuzz.PartialRatio(_s1, _s3) > 75); + Assert.IsTrue(Fuzz.PartialRatio(_s1, _s4) > 75); + } + + [Test] + public void TestIssueEight() + { + // https://github.com/JakeBayer/FuzzySharp/issues/8 + Assert.AreEqual(85, Fuzz.PartialRatio("Partnernummer", "Partne\nrnum\nmerASDFPartnernummerASDF")); // 100 in strict mode + Assert.AreEqual(77, Fuzz.PartialRatio("Partnernummer", "PartnerrrrnummerASDFPartnernummerASDF")); // 100 in strict mode + + // https://github.com/xdrop/fuzzywuzzy/issues/39 + Assert.AreEqual(57, Fuzz.PartialRatio("kaution", "kdeffxxxiban:de1110010060046666666datum:16.11.17zeit:01:12uft0000899999tan076601testd.-20-maisonette-z4-jobas-hagkautionauszug")); // 100 in strict mode + + // https://github.com/seatgeek/fuzzywuzzy/issues/79 + Assert.AreEqual(93, Fuzz.PartialRatio("this is a test", "is this is a not really thing this is a test!")); // 100 in strict mode + + // https://github.com/Raffinert/FuzzySharp/issues/2 + Assert.AreEqual(100, Fuzz.PartialRatio("sh", "Growing eshops without a popular platform", PreprocessMode.Full)); + Assert.AreEqual(100, Fuzz.PartialRatio("shop", "Growing eshops without a popular platform", PreprocessMode.Full)); + } + + [Test] + public void MorePartialRatio() + { + Assert.AreEqual(100, Fuzz.PartialRatio("geeks for geeks", "geeks for geeks!")); + Assert.AreEqual(64, Fuzz.PartialRatio("geeks for geeks", "geeks geeks")); // 71 in strict mode + Assert.AreEqual(100, Fuzz.TokenSortRatio("geeks for geeks", "for geeks geeks")); + } + + [Test] + public void TestPartialRatioUnicodeString() + { + _s1 = "\u00C1"; + _s2 = "ABCD"; + var score = Fuzz.PartialRatio(_s1, _s2); + Assert.AreEqual(0, score); + } + + [Test] + public void TestZeroRatio() + { + var ratio = Fuzz.PartialTokenSortRatio("abc", "def"); + + Assert.True(ratio == 0); + } + + [Test] + public void Test03() + { + var ratio = Fuzz.PartialTokenSortRatio("new york mets", "atlanta braves vs new york mets"); + + Assert.True(ratio == 77); + } + + [Test] + public void TestRatioUnicodeString() + { + _s1 = "\u00C1"; + _s2 = "ABCD"; + var score = Fuzz.WeightedRatio(_s1, _s2); + Assert.AreEqual(0, score); + + // Cyrillic. + _s1 = "\u043f\u0441\u0438\u0445\u043e\u043b\u043e\u0433"; + _s2 = "\u043f\u0441\u0438\u0445\u043e\u0442\u0435\u0440\u0430\u043f\u0435\u0432\u0442"; + score = Fuzz.WeightedRatio(_s1, _s2); + Assert.AreNotEqual(0, score); + + // Chinese. + _s1 = "\u6211\u4e86\u89e3\u6570\u5b66"; + _s2 = "\u6211\u5b66\u6570\u5b66"; + score = Fuzz.WeightedRatio(_s1, _s2); + Assert.AreNotEqual(0, score); + } +} \ No newline at end of file diff --git a/FuzzySharp.Test/FuzzyTests/RatioTests.cs b/FuzzySharp.Test/FuzzyTests/StrictPartialRatioTests.cs similarity index 99% rename from FuzzySharp.Test/FuzzyTests/RatioTests.cs rename to FuzzySharp.Test/FuzzyTests/StrictPartialRatioTests.cs index 2b6c216..a05b82d 100644 --- a/FuzzySharp.Test/FuzzyTests/RatioTests.cs +++ b/FuzzySharp.Test/FuzzyTests/StrictPartialRatioTests.cs @@ -4,7 +4,7 @@ namespace Raffinert.FuzzySharp.Test.FuzzyTests; [TestFixture] -public class RatioTests +public class StrictPartialRatioTests { #region Private Fields private string _s1, diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index 03a4ad4..2bb3407 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,10 +1,10 @@  - 3.0.6.0 - 3.0.6.0 - 3.0.6 - 3.0.6 + 3.0.7.0 + 3.0.7.0 + 3.0.7 + 3.0.7 Jacob Bayer;Yevhen Cherkes @@ -12,7 +12,7 @@ true true - 12.0 + Latest MIT https://github.com/Raffinert/FuzzySharp false @@ -21,7 +21,7 @@ git https://github.com/Raffinert/FuzzySharp snupkg - netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net462;net472;net48;NET60;NET80;NET90 + netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net462;net472;net48;NET60;NET80;NET90;NET10.0 Raffinert.$(MSBuildProjectName) Raffinert.$(MSBuildProjectName.Replace(" ", "_")) README.md diff --git a/FuzzySharp/GlobalConfig.cs b/FuzzySharp/GlobalConfig.cs new file mode 100644 index 0000000..42591f5 --- /dev/null +++ b/FuzzySharp/GlobalConfig.cs @@ -0,0 +1,16 @@ +namespace Raffinert.FuzzySharp; + +public static class GlobalConfig +{ + public static PartialRatioAccuracy PartialRatioAccuracy + { + get => SimilarityRatio.Strategy.PartialRatioStrategy.Accuracy; + set => SimilarityRatio.Strategy.PartialRatioStrategy.Accuracy = value; + } +} + +public enum PartialRatioAccuracy +{ + Strict, + Fast +} \ No newline at end of file diff --git a/FuzzySharp/Indel.Static.cs b/FuzzySharp/Indel.Static.cs index 5e9fc7d..1931fb7 100644 --- a/FuzzySharp/Indel.Static.cs +++ b/FuzzySharp/Indel.Static.cs @@ -26,7 +26,7 @@ public static int BlockDistance( int? scoreCutoff = null) where T : IEquatable { var maximum = s1.Length + s2.Length; - var lcsSim = LongestCommonSequence.BlockSimilarityMultipleULongs(block, s1, s2); + var lcsSim = LongestCommonSequence.BlockSimilarity(block, s1, s2); var dist = maximum - 2 * lcsSim; var result = scoreCutoff == null || dist <= scoreCutoff.Value ? dist @@ -43,6 +43,7 @@ public static int BlockDistance( /// Second sequence. /// Optional maximum normalized distance threshold. If the distance exceeds this value, returns 1. /// The normalized Indel distance between the two sequences. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static double BlockNormalizedDistance( CharMaskBuffer block, ReadOnlySpan s1, @@ -68,6 +69,7 @@ public static double BlockNormalizedDistance( /// Second sequence. /// Optional minimum similarity threshold. If the similarity is below this value, returns 0. /// The normalized Indel similarity between the two sequences. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static double BlockNormalizedSimilarity( CharMaskBuffer block, ReadOnlySpan s1, diff --git a/FuzzySharp/Levenshtein.Static.cs b/FuzzySharp/Levenshtein.Static.cs index 6b12aac..dcf48bb 100644 --- a/FuzzySharp/Levenshtein.Static.cs +++ b/FuzzySharp/Levenshtein.Static.cs @@ -110,9 +110,9 @@ public static List GetMatchingBlocks(ReadOnlySpan s1, ReadO /// First sequence. /// Second sequence. /// List of matching blocks. - public static List GetMatchingBlocks(T[] s1, T[] s2) where T : IEquatable + public static List GetMatchingBlocks(ReadOnlySpan s1, ReadOnlySpan s2) where T : IEquatable { - var editOps = GetEditOps(new ReadOnlySpan(s1), new ReadOnlySpan(s2)); + var editOps = GetEditOps(s1, s2); var matchingBlocks = editOps.AsMatchingBlocks(s1.Length, s2.Length); return matchingBlocks; } diff --git a/FuzzySharp/LongestCommonSequence.Static.cs b/FuzzySharp/LongestCommonSequence.Static.cs index 8609414..3a34d21 100644 --- a/FuzzySharp/LongestCommonSequence.Static.cs +++ b/FuzzySharp/LongestCommonSequence.Static.cs @@ -2,6 +2,7 @@ using Raffinert.FuzzySharp.Extensions; using Raffinert.FuzzySharp.Utils; using System; +using System.Buffers; using System.Collections.Generic; using System.Runtime.CompilerServices; @@ -348,7 +349,21 @@ internal static int SimilarityImpl( /// Second sequence (text). /// Optional minimum similarity threshold. /// The length of the longest common subsequence, or 0 if below cutoff. - internal static int BlockSimilarityMultipleULongs( + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int BlockSimilarity( + CharMaskBuffer block, + ReadOnlySpan s1, + ReadOnlySpan s2, + int? scoreCutoff = null + ) where T : IEquatable + { + return s1.Length <= 64 + ? BlockSimilaritySingleULong(block, s1, s2, scoreCutoff) + : BlockSimilarityMultipleULongs(block, s1, s2, scoreCutoff); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int BlockSimilaritySingleULong( CharMaskBuffer block, ReadOnlySpan s1, ReadOnlySpan s2, @@ -359,62 +374,95 @@ internal static int BlockSimilarityMultipleULongs( return 0; int len1 = s1.Length; - int segCount = (len1 + 63) / 64; - - // --- 2) prepare the \"all-ones up to len1\" mask and state S --- - ulong[] S = new ulong[segCount]; - for (int i = 0; i < segCount; i++) - S[i] = ulong.MaxValue; - // clear high bits in the final segment if len1 % 64 != 0 - int rem = len1 & 63; - if (rem != 0) - S[segCount - 1] = (1UL << rem) - 1; + ulong mask = len1 == 64 ? ulong.MaxValue : (1UL << len1) - 1UL; - // --- 3) main bit-parallel loop: S = (S + u) | (S - u) --- + ulong S = mask; foreach (T ch in s2) { - var M = block.GetOrZero(ch); - - // u = S & M - var u = new ulong[segCount]; - for (int i = 0; i < segCount; i++) - u[i] = S[i] & M[i]; - - // add = S + u (multi-precision) - var add = new ulong[segCount]; - ulong carry = 0; - for (int i = 0; i < segCount; i++) - { - ulong sum = S[i] + u[i] + carry; - // carry if sum < S[i] or (carry==1 && sum==S[i]) - carry = sum < S[i] || (carry == 1 && sum == S[i]) ? 1UL : 0UL; - add[i] = sum; - } - - // sub = S - u (multi-precision) - var sub = new ulong[segCount]; - ulong borrow = 0; - for (int i = 0; i < segCount; i++) + ulong M = block.GetOrZero(ch)[0]; + ulong u = S & M; + unchecked { - ulong diff = S[i] - u[i] - borrow; - // borrow if original S[i] < u[i] + borrow - borrow = S[i] < u[i] + borrow ? 1UL : 0UL; - sub[i] = diff; + S = (S + u) | (S - u); } - - // new S = add | sub - for (int i = 0; i < segCount; i++) - S[i] = add[i] | sub[i]; } - // --- 4) count zero bits in the lower len1 positions of S --- int lcs = CountZeroBits(S, len1); - - var result = scoreCutoff == null || lcs >= scoreCutoff.Value + return scoreCutoff == null || lcs >= scoreCutoff.Value ? lcs : 0; + } - return result; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int BlockSimilarityMultipleULongs( + CharMaskBuffer block, + ReadOnlySpan s1, + ReadOnlySpan s2, + int? scoreCutoff = null + ) where T : IEquatable + { + if (s1.IsEmpty) + return 0; + + int len1 = s1.Length; + int segCount = (len1 + 63) / 64; + + var scratch = ArrayPool.Shared.Rent(segCount * 4); + try + { + var S = scratch.AsSpan(0, segCount); + var u = scratch.AsSpan(segCount, segCount); + var add = scratch.AsSpan(segCount * 2, segCount); + var sub = scratch.AsSpan(segCount * 3, segCount); + + // --- 2) prepare the \"all-ones up to len1\" mask and state S --- + S.Fill(ulong.MaxValue); + int rem = len1 & 63; + if (rem != 0) + S[segCount - 1] = (1UL << rem) - 1; + + // --- 3) main bit-parallel loop: S = (S + u) | (S - u) --- + foreach (T ch in s2) + { + var M = block.GetOrZero(ch); + + // u = S & M + for (int i = 0; i < segCount; i++) + u[i] = S[i] & M[i]; + + // add = S + u (multi-precision) + ulong carry = 0; + for (int i = 0; i < segCount; i++) + { + ulong sum = S[i] + u[i] + carry; + carry = sum < S[i] || (carry == 1 && sum == S[i]) ? 1UL : 0UL; + add[i] = sum; + } + + // sub = S - u (multi-precision) + ulong borrow = 0; + for (int i = 0; i < segCount; i++) + { + ulong diff = S[i] - u[i] - borrow; + borrow = S[i] < u[i] + borrow ? 1UL : 0UL; + sub[i] = diff; + } + + // new S = add | sub + for (int i = 0; i < segCount; i++) + S[i] = add[i] | sub[i]; + } + + // --- 4) count zero bits in the lower len1 positions of S --- + int lcs = CountZeroBits(S, len1); + return scoreCutoff == null || lcs >= scoreCutoff.Value + ? lcs + : 0; + } + finally + { + ArrayPool.Shared.Return(scratch); + } } private static int CountZeroBits(ulong x, int length) @@ -425,6 +473,11 @@ private static int CountZeroBits(ulong x, int length) } private static int CountZeroBits(ulong[] S, int length) + { + return CountZeroBits((ReadOnlySpan)S, length); + } + + private static int CountZeroBits(ReadOnlySpan S, int length) { int fullBlocks = length / 64; int remBits = length % 64; @@ -625,4 +678,4 @@ private static int SimilaritySingleULong(ReadOnlySpan s1, ReadOnlySpan int res = CountZeroBits(S, len1); return res; } -} \ No newline at end of file +} diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs index 62d418f..45ba63d 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs @@ -5,5 +5,5 @@ namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; public sealed class PartialTokenDifferenceScorer : TokenDifferenceScorerBase { - protected override Func Scorer => PartialRatioStrategy.Calculate; + protected override Func Scorer => static (strings1, strings2) => PartialRatioStrategy.Calculate(strings1.AsSpan(), strings2.AsSpan()); } \ No newline at end of file diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/FastPartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/FastPartialRatioStrategyT.cs new file mode 100644 index 0000000..7584cd6 --- /dev/null +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/FastPartialRatioStrategyT.cs @@ -0,0 +1,67 @@ +using System; +using System.Runtime.CompilerServices; +using Raffinert.FuzzySharp; +using Raffinert.FuzzySharp.Utils; + +namespace Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic; + +internal static class FastPartialRatioStrategyT where T : IEquatable +{ + public static int Calculate(ReadOnlySpan input1, ReadOnlySpan input2) + { + if (input1.Length == 0 || input2.Length == 0) + { + return 0; + } + + var shorter = input1; + var longer = input2; + + SequenceUtils.SwapIfSourceIsLonger(ref shorter, ref longer); + + using var charMask = CharMask.Create(shorter); + + var maxScore = ComputeMaxScore(shorter, longer, charMask); + + return (int)Math.Round(100 * maxScore); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static double ComputeMaxScore( + ReadOnlySpan shorter, + ReadOnlySpan longer, + CharMaskBuffer charMask) + { + double maxScore = 0; + var len1 = shorter.Length; + var len2 = longer.Length; + + // Reuse the matching-block candidate generation to avoid scanning every window. + var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); + foreach (var block in matchingBlocks) + { + // Offset between source and destination tells us where the shorter string could align. + var dist = block.DestPos - block.SourcePos; + var windowStart = dist > 0 ? dist : 0; + var windowEnd = windowStart + len1; + if (windowEnd > len2) + { + windowEnd = len2; + } + + var window = longer.Slice(windowStart, windowEnd - windowStart); + var ratio = Indel.BlockNormalizedSimilarity(charMask, shorter, window); + + if (ratio > maxScore) + { + maxScore = ratio; + if (ratio >= 0.995) + { + return 1.0; + } + } + } + + return maxScore; + } +} diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index 7820ed1..c1f9644 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -11,15 +11,15 @@ internal static class PartialRatioStrategy where T : IEquatable /// Searches for the optimal alignment of the shorter span in the longer span /// and returns the partial fuzz.ratio for that alignment, as a value in [0…100]. /// - public static int Calculate(T[] input1, T[] input2) + public static int Calculate(ReadOnlySpan input1, ReadOnlySpan input2) { if (input1.Length == 0 || input2.Length == 0) { return 0; } - var shorter = (ReadOnlySpan)input1; - var longer = (ReadOnlySpan)input2; + var shorter = input1; + var longer = input2; SequenceUtils.SwapIfSourceIsLonger(ref shorter, ref longer); @@ -143,17 +143,14 @@ private static ScoreAlignment PartialRatioImpl( if (len1 == 0 || len2 == 0) return res; - // Precompute s1’s character set for fast Contains - var charSet = new HashSet(s1.ToArray()); - - double? cutoff = scoreCutoff; + double cutoff = scoreCutoff ?? 0.0; // 1) Prefixes shorter than len1 for (int i = 1; i < len1; i++) { - if (!charSet.Contains(s2[i - 1])) continue; + if (!charMask.ContainsKey(s2[i - 1])) continue; var slice = s2[..i]; double sim = Indel.BlockNormalizedSimilarity(charMask, s1, slice); - if (sim > res.Score && (!cutoff.HasValue || sim >= cutoff.Value)) + if (sim > res.Score && sim >= cutoff) { res.Score = sim; cutoff = sim; @@ -166,10 +163,10 @@ private static ScoreAlignment PartialRatioImpl( // 2) Full-width windows of length len1 for (int i = 0; i <= len2 - len1; i++) { - if (!charSet.Contains(s2[i + len1 - 1])) continue; + if (!charMask.ContainsKey(s2[i + len1 - 1])) continue; var window = s2[i..(i + len1)]; double sim = Indel.BlockNormalizedSimilarity(charMask, s1, window); - if (sim > res.Score && (!cutoff.HasValue || sim >= cutoff.Value)) + if (sim > res.Score && sim >= cutoff) { res.Score = sim; cutoff = sim; @@ -182,10 +179,10 @@ private static ScoreAlignment PartialRatioImpl( // 3) Suffixes shorter than len1 for (int i = len2 - len1 + 1; i < len2; i++) { - if (!charSet.Contains(s2[i])) continue; + if (!charMask.ContainsKey(s2[i])) continue; var tail = s2[i..]; double sim = Indel.BlockNormalizedSimilarity(charMask, s1, tail); - if (sim > res.Score && (!cutoff.HasValue || sim >= cutoff.Value)) + if (sim > res.Score && sim >= cutoff) { res.Score = sim; cutoff = sim; @@ -201,4 +198,4 @@ private static ScoreAlignment PartialRatioImpl( } internal record struct ScoreAlignment(double Score, int SrcStart, int SrcEnd, int DestStart, int DestEnd); -} \ No newline at end of file +} diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index 7db9525..5d290b7 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -1,11 +1,37 @@ -using Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic; -using Raffinert.FuzzySharp.Utils; -using System; +using System; +using System.Threading; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic; namespace Raffinert.FuzzySharp.SimilarityRatio.Strategy; internal static class PartialRatioStrategy { + private static PartialRatioAccuracy _accuracy = PartialRatioAccuracy.Strict; + + internal delegate int PartialRatio(ReadOnlySpan shorter, ReadOnlySpan longer); + + private static PartialRatio _partialRatioImpl = PartialRatioStrategy.Calculate; + + public static PartialRatioAccuracy Accuracy + { + get => _accuracy; + set + { + if (_accuracy != value) + { + PartialRatio partialRatioImpl = value switch + { + PartialRatioAccuracy.Strict => PartialRatioStrategy.Calculate, + PartialRatioAccuracy.Fast => FastPartialRatioStrategyT.Calculate, + _ => throw new ArgumentOutOfRangeException(nameof(value), "Unsupported accuracy mode.") + }; + + Interlocked.Exchange(ref _partialRatioImpl, partialRatioImpl); + _accuracy = value; + } + } + } + /// /// Searches for the optimal alignment of the shorter span in the longer span /// and returns the partial fuzz.ratio for that alignment, as a value in [0…100]. @@ -17,13 +43,8 @@ public static int Calculate(string input1, string input2) return 0; } - var shorter = input1.AsSpan(); - var longer = input2.AsSpan(); - - SequenceUtils.SwapIfSourceIsLonger(ref shorter, ref longer); - - var alignment = PartialRatioStrategy.PartialRatioAlignment(shorter, longer); + var score = _partialRatioImpl(input1.AsSpan(), input2.AsSpan()); - return (int)Math.Round(alignment.Score); + return score; } } \ No newline at end of file diff --git a/FuzzySharp/Utils/CharMaskBuffer.cs b/FuzzySharp/Utils/CharMaskBuffer.cs index cecafce..a4bc92c 100644 --- a/FuzzySharp/Utils/CharMaskBuffer.cs +++ b/FuzzySharp/Utils/CharMaskBuffer.cs @@ -67,6 +67,13 @@ private void GrowBuffer() _capacity = newCapacity; } + public bool ContainsKey(T key) + { + if (_disposed) throw new ObjectDisposedException(nameof(CharMaskBuffer)); + + return _indexMap.ContainsKey(key); + } + public bool TryGetMask(T key, out ReadOnlySpan mask) { if (_disposed) throw new ObjectDisposedException(nameof(CharMaskBuffer));