[folks] Bug 670872 — Should be less sensitive to accentuated chars



commit f8e59f0d0a051ed9dcfeab2467b78ca1aabec741
Author: Philip Withnall <philip tecnocode co uk>
Date:   Sat Mar 24 22:18:21 2012 +0000

    Bug 670872 — Should be less sensitive to accentuated chars
    
    Strip accented characters from names and make them all lower case when
    searching for potential matches between individuals.
    
    This uses a modified version of Empathy's live search code to allow for
    string matches regardless of case or accentuation.
    
    The commit also fixes the potential match code to handle UTF-8 properly,
    which it wasn't doing before.
    
    This includes a test case.
    
    Closes: https://bugzilla.gnome.org/show_bug.cgi?id=670872

 NEWS                          |    1 +
 folks/potential-match.vala    |  112 ++++++++++++++++++++++++++++++++++++----
 tests/tracker/match-name.vala |   11 ++++
 3 files changed, 112 insertions(+), 12 deletions(-)
---
diff --git a/NEWS b/NEWS
index 9f5e7bd..c0cc840 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,7 @@ Bugs fixed:
 * Bug 672373 — folks-import segfaults (Archlinux x86_64)
 * Bug 670348 — Handle Telepathy CMs crashing/being invalidated
 * Bug 671900 — Folks should not suggest linking contacts from telepathy-salut
+* Bug 670872 — Should be less sensitive to accentuated chars
 
 Overview of changes from libfolks 0.6.6 to libfolks 0.6.7
 =============================================================
diff --git a/folks/potential-match.vala b/folks/potential-match.vala
index 83fb40b..d4f44a4 100644
--- a/folks/potential-match.vala
+++ b/folks/potential-match.vala
@@ -495,6 +495,69 @@ public class Folks.PotentialMatch : Object
       return distance;
     }
 
+  /**
+   * Returns a stripped version of @ch: lower-cased and reduced to the first
+   * character of its canonical decomposition, which drops accents and other
+   * combining marks. Control, format, unassigned and mark characters are
+   * ignored, in which case 0 is returned so that callers can skip them.
+   *
+   * Copied from Empathy's libempathy-gtk/empathy-live-search.c.
+   *
+   * Copyright (C) 2010 Collabora Ltd.
+   * Copyright (C) 2007-2010 Nokia Corporation.
+   *
+   * Authors: Felix Kaser <felix kaser collabora co uk>
+   *          Xavier Claessens <xavier claessens collabora co uk>
+   *          Claudio Saavedra <csaavedra igalia com>
+   */
+  private unichar _stripped_char (unichar ch)
+    {
+      unichar retval[1] = { 0 };
+      var utype = ch.type ();
+
+      switch (utype)
+        {
+          case UnicodeType.CONTROL:
+          case UnicodeType.FORMAT:
+          case UnicodeType.UNASSIGNED:
+          case UnicodeType.NON_SPACING_MARK:
+          case UnicodeType.COMBINING_MARK:
+          case UnicodeType.ENCLOSING_MARK:
+            /* Ignore those: retval[0] is left at 0, the "skip" marker */
+            break;
+          case UnicodeType.PRIVATE_USE:
+          case UnicodeType.SURROGATE:
+          case UnicodeType.LOWERCASE_LETTER:
+          case UnicodeType.MODIFIER_LETTER:
+          case UnicodeType.OTHER_LETTER:
+          case UnicodeType.TITLECASE_LETTER:
+          case UnicodeType.UPPERCASE_LETTER:
+          case UnicodeType.DECIMAL_NUMBER:
+          case UnicodeType.LETTER_NUMBER:
+          case UnicodeType.OTHER_NUMBER:
+          case UnicodeType.CONNECT_PUNCTUATION:
+          case UnicodeType.DASH_PUNCTUATION:
+          case UnicodeType.CLOSE_PUNCTUATION:
+          case UnicodeType.FINAL_PUNCTUATION:
+          case UnicodeType.INITIAL_PUNCTUATION:
+          case UnicodeType.OTHER_PUNCTUATION:
+          case UnicodeType.OPEN_PUNCTUATION:
+          case UnicodeType.CURRENCY_SYMBOL:
+          case UnicodeType.MODIFIER_SYMBOL:
+          case UnicodeType.MATH_SYMBOL:
+          case UnicodeType.OTHER_SYMBOL:
+          case UnicodeType.LINE_SEPARATOR:
+          case UnicodeType.PARAGRAPH_SEPARATOR:
+          case UnicodeType.SPACE_SEPARATOR:
+          default:
+            ch = ch.tolower ();
+            ch.fully_decompose (false, retval); /* canonical; 1-slot buffer */
+            break;
+        }
+
+      return retval[0];
+    }
+
+
   /* Calculate matches and transpositions as defined by the Jaro distance.
    */
   private int _matches (string s1, string s2, int max_dist, out double t)
@@ -502,10 +565,22 @@ public class Folks.PotentialMatch : Object
       int matches = 0;
       t = 0.0;
 
-      for (int i=0; i < s1.length; i++)
+      assert (s1.validate ());
+      assert (s2.validate ());
+
+      int idx = 0;
+      unichar look_for = 0;
+
+      while (s1.get_next_char (ref idx, out look_for))
         {
-          var look_for = s1.slice (i, i + 1);
-          int contains = this._contains (s2, look_for, i, max_dist);
+          /* Skip uninteresting characters. */
+          look_for = this._stripped_char (look_for);
+          if (look_for == 0)
+            {
+              continue;
+            }
+
+          int contains = this._contains (s2, look_for, idx, max_dist);
           if (contains >= 0)
             {
               matches++;
@@ -523,20 +598,33 @@ public class Folks.PotentialMatch : Object
 
   /* If haystack contains c in pos return 0, if it contains
    * it withing the bounds of max_dist return abs(pos-pos_found).
-   * If its not found, return -1. */
-  private int _contains (string haystack, string c, int pos, int max_dist)
+   * If its not found, return -1.
+   *
+   * pos and max_dist are both in bytes.
+   *
+   * Note: haystack must have been validated using haystack.validate() before
+   * being passed to this method. */
+  private int _contains (string haystack, unichar c, int pos, int max_dist)
     {
-      if (pos < haystack.length && haystack.slice (pos, pos + 1) == c)
+      var haystack_len = haystack.length; /* in bytes */
+
+      if (pos < haystack_len && haystack.get_char (pos) == c)
         return 0;
 
-      for (int i=pos-max_dist; i <= pos + max_dist; i++)
+      int idx = (pos - max_dist).clamp (0, haystack_len);
+      unichar ch = 0;
+
+      while (idx < pos + max_dist && haystack.get_next_char (ref idx, out ch))
         {
-          if (i < 0 || i >= haystack.length)
-            continue;
+          /* Skip uninteresting characters. */
+          ch = this._stripped_char (ch);
+          if (ch == 0)
+            {
+              continue;
+            }
 
-          var str = haystack.slice (i, i + 1);
-          if (str == c)
-            return (pos - i).abs ();
+          if (ch == c)
+            return (pos - idx).abs ();
         }
 
       return -1;
diff --git a/tests/tracker/match-name.vala b/tests/tracker/match-name.vala
index db1bb29..cf518ea 100644
--- a/tests/tracker/match-name.vala
+++ b/tests/tracker/match-name.vala
@@ -48,6 +48,8 @@ public class MatchNameTests : Folks.TestCase
           this.test_match_name_2);
       this.add_test ("test potential match by name #3 ",
           this.test_match_name_3);
+      this.add_test ("test potential match by name #4 ",
+          this.test_match_name_4);
     }
 
   public override void set_up ()
@@ -99,6 +101,15 @@ public class MatchNameTests : Folks.TestCase
       assert (this._match >= Folks.MatchResult.MEDIUM);
     }
 
+  public void test_match_name_4 ()
+    {
+      /* Chosen to test the accent- and case-invariance of the matching
+       * algorithm. The string's repeated so the string lengths get us up to
+       * a MEDIUM result. */
+      this._test_match_name ("PâtéPâtéPâté", "patepatepate");
+      assert (this._match >= Folks.MatchResult.MEDIUM);
+    }
+
   private async void _test_match_name_async ()
     {
       var store = BackendStore.dup ();



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]