=?utf-8?q?=5Bfolks=5D_core=3A_Fix_indexing_for_unichars_in_PotentialMatch?= =?utf-8?q?=E2=80=99s_Jaro_distance_function?=



commit 697b611adfbfffdb37a817feb5482f43bb715e1e
Author: Philip Withnall <philip tecnocode co uk>
Date:   Sat Jun 23 11:24:53 2012 +0100

    core: Fix indexing for unichars in PotentialMatchâs Jaro distance function
    
    The indexing was getting out of sync between the two strings if one contained
    a different number of non-ASCII characters to the other (as the indexing was
    done in terms of bytes, rather than characters). This re-works the Jaro
    distance function to operate on unichar arrays, and index in terms of
    characters. This also means that exact matches now work over stripped non-ASCII
    characters, which they didnât before.
    
    This adds a new test case. Yay.
    
    Closes: https://bugzilla.gnome.org/show_bug.cgi?id=678474

 NEWS                          |    1 +
 folks/potential-match.vala    |   89 +++++++++++++++++++++++++----------------
 tests/tracker/match-name.vala |    9 ++++
 3 files changed, 64 insertions(+), 35 deletions(-)
---
diff --git a/NEWS b/NEWS
index 6107593..8d33d19 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,7 @@ Dependencies:
 
 Bugs fixed:
 â Bug 677166 â Salut personas survive disconnection
+â Bug 678474 â potential-match should be smarter with accents
 
 API changes:
 
diff --git a/folks/potential-match.vala b/folks/potential-match.vala
index f10a70a..ee8e3fb 100644
--- a/folks/potential-match.vala
+++ b/folks/potential-match.vala
@@ -446,14 +446,23 @@ public class Folks.PotentialMatch : Object
           return false;
         }
 
-      if (a == b)
+      assert (a.validate ());
+      assert (b.validate ());
+
+      var a_stripped = this._strip_string ((!) a);
+      var b_stripped = this._strip_string ((!) b);
+
+      var jaro_dist = this._jaro_dist (a_stripped, b_stripped);
+
+      // a and b match exactly iff their Jaro distance is 1.
+      if (jaro_dist == 1.0)
         {
           exact = true;
           return true;
         }
 
       // a and b look alike if their Jaro distance is over the threshold.
-      return (this.jaro_dist ((!) a, (!) b) >= this._DIST_THRESHOLD);
+      return (jaro_dist >= this._DIST_THRESHOLD);
     }
 
   private bool _look_alike (string? a, string? b)
@@ -463,8 +472,14 @@ public class Folks.PotentialMatch : Object
           return false;
         }
 
+      assert (a.validate ());
+      assert (b.validate ());
+
+      var a_stripped = this._strip_string ((!) a);
+      var b_stripped = this._strip_string ((!) b);
+
       // a and b look alike if their Jaro distance is over the threshold.
-      return (this.jaro_dist ((!) a, (!) b) >= this._DIST_THRESHOLD);
+      return (this._jaro_dist (a_stripped, b_stripped) >= this._DIST_THRESHOLD);
     }
 
   /* Based on:
@@ -477,7 +492,7 @@ public class Folks.PotentialMatch : Object
    * m = matching characters
    * t = number of transpositions
    */
-  private double jaro_dist (string s1, string s2)
+  private double _jaro_dist (unichar[] s1, unichar[] s2)
     {
       double distance;
       int max = s1.length > s2.length ? s1.length : s2.length;
@@ -495,7 +510,7 @@ public class Folks.PotentialMatch : Object
 
       distance = (1.0/3.0) * (a + b + c);
 
-      debug ("[jaro_dist] Distance for %s and %s: %f\n", s1, s2, distance);
+      debug ("Jaro distance: %f (a = %f, b = %f, c = %f)", distance, a, b, c);
 
       return distance;
     }
@@ -563,28 +578,37 @@ public class Folks.PotentialMatch : Object
       return retval[0];
     }
 
+  private unichar[] _strip_string (string s)
+    {
+      int next_idx = 0;
+      uint write_idx = 0;
+      unichar ch = 0;
+      unichar[] output = new unichar[s.length]; // this is a safe overestimate
+
+      while (s.get_next_char (ref next_idx, out ch))
+        {
+          ch = this._stripped_char (ch);
+          if (ch != 0)
+            {
+              output[write_idx++] = ch;
+            }
+        }
+
+      output.length = (int) write_idx;
+      return output;
+    }
+
   /* Calculate matches and transpositions as defined by the Jaro distance.
    */
-  private int _matches (string s1, string s2, int max_dist, out double t)
+  private int _matches (unichar[] s1, unichar[] s2, int max_dist, out double t)
     {
       int matches = 0;
       t = 0.0;
 
-      assert (s1.validate ());
-      assert (s2.validate ());
-
-      int idx = 0;
       unichar look_for = 0;
 
-      while (s1.get_next_char (ref idx, out look_for))
+      for (uint idx = 0; (look_for = s1[idx]) != 0; idx++)
         {
-          /* Skip uninteresting characters. */
-          look_for = this._stripped_char (look_for);
-          if (look_for == 0)
-            {
-              continue;
-            }
-
           int contains = this._contains (s2, look_for, idx, max_dist);
           if (contains >= 0)
             {
@@ -594,8 +618,7 @@ public class Folks.PotentialMatch : Object
             }
         }
 
-      debug ("%s and %s have %d matches and %f / 2 transpositions\n",
-          s1, s2, matches, t);
+      debug ("%d matches and %f / 2 transpositions", matches, t);
 
       t = t / 2.0;
       return matches;
@@ -605,31 +628,27 @@ public class Folks.PotentialMatch : Object
    * it withing the bounds of max_dist return abs(pos-pos_found).
    * If its not found, return -1.
    *
-   * pos and max_dist are both in bytes.
+   * pos and max_dist are both in unichars.
    *
    * Note: haystack must have been validated using haystack.validate() before
    * being passed to this method. */
-  private int _contains (string haystack, unichar c, int pos, int max_dist)
+  private int _contains (unichar[] haystack, unichar c, uint pos, uint max_dist)
     {
-      var haystack_len = haystack.length; /* in bytes */
+      var haystack_len = haystack.length; /* in unichars */
 
-      if (pos < haystack_len && haystack.get_char (pos) == c)
+      unichar ch = haystack[pos];
+      if (pos < haystack_len && ch == c)
         return 0;
 
-      int idx = (pos - max_dist).clamp (0, haystack_len);
-      unichar ch = 0;
+      uint idx = ((int) pos - (int) max_dist).clamp (0, haystack_len);
+      ch = 0;
 
-      while (idx < pos + max_dist && haystack.get_next_char (ref idx, out ch))
+      while (idx < pos + max_dist && (ch = haystack[idx]) != 0)
         {
-          /* Skip uninteresting characters. */
-          ch = this._stripped_char (ch);
-          if (ch == 0)
-            {
-              continue;
-            }
-
           if (ch == c)
-            return (pos - idx).abs ();
+            return ((int) pos - (int) idx).abs ();
+
+          idx++;
         }
 
       return -1;
diff --git a/tests/tracker/match-name.vala b/tests/tracker/match-name.vala
index cf518ea..ec96eff 100644
--- a/tests/tracker/match-name.vala
+++ b/tests/tracker/match-name.vala
@@ -50,6 +50,8 @@ public class MatchNameTests : Folks.TestCase
           this.test_match_name_3);
       this.add_test ("test potential match by name #4 ",
           this.test_match_name_4);
+      this.add_test ("test potential match by name #5 ",
+          this.test_match_name_5);
     }
 
   public override void set_up ()
@@ -110,6 +112,13 @@ public class MatchNameTests : Folks.TestCase
       assert (this._match >= Folks.MatchResult.MEDIUM);
     }
 
+  public void test_match_name_5 ()
+    {
+      /* bgo#678474 */
+      this._test_match_name ("FrÃdÃric Peters", "Frederic Peters");
+      assert (this._match >= Folks.MatchResult.HIGH);
+    }
+
   private async void _test_match_name_async ()
     {
       var store = BackendStore.dup ();



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]