=?utf-8?q?=5Bfolks=5D_core=3A_Fix_indexing_for_unichars_in_PotentialMatch?= =?utf-8?q?=E2=80=99s_Jaro_distance_function?=
- From: Philip Withnall <pwithnall src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [folks] core: Fix indexing for unichars in PotentialMatchâs Jaro distance function
- Date: Mon, 25 Jun 2012 18:47:19 +0000 (UTC)
commit 697b611adfbfffdb37a817feb5482f43bb715e1e
Author: Philip Withnall <philip tecnocode co uk>
Date: Sat Jun 23 11:24:53 2012 +0100
core: Fix indexing for unichars in PotentialMatchâs Jaro distance function
The indexing was getting out of sync between the two strings if one contained
a different number of non-ASCII characters to the other (as the indexing was
done in terms of bytes, rather than characters). This re-works the Jaro
distance function to operate on unichar arrays, and index in terms of
characters. This also means that exact matches now work over stripped non-ASCII
characters, which they didnât before.
This adds a new test case. Yay.
Closes: https://bugzilla.gnome.org/show_bug.cgi?id=678474
NEWS | 1 +
folks/potential-match.vala | 89 +++++++++++++++++++++++++----------------
tests/tracker/match-name.vala | 9 ++++
3 files changed, 64 insertions(+), 35 deletions(-)
---
diff --git a/NEWS b/NEWS
index 6107593..8d33d19 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,7 @@ Dependencies:
Bugs fixed:
â Bug 677166 â Salut personas survive disconnection
+â Bug 678474 â potential-match should be smarter with accents
API changes:
diff --git a/folks/potential-match.vala b/folks/potential-match.vala
index f10a70a..ee8e3fb 100644
--- a/folks/potential-match.vala
+++ b/folks/potential-match.vala
@@ -446,14 +446,23 @@ public class Folks.PotentialMatch : Object
return false;
}
- if (a == b)
+ assert (a.validate ());
+ assert (b.validate ());
+
+ var a_stripped = this._strip_string ((!) a);
+ var b_stripped = this._strip_string ((!) b);
+
+ var jaro_dist = this._jaro_dist (a_stripped, b_stripped);
+
+ // a and b match exactly iff their Jaro distance is 1.
+ if (jaro_dist == 1.0)
{
exact = true;
return true;
}
// a and b look alike if their Jaro distance is over the threshold.
- return (this.jaro_dist ((!) a, (!) b) >= this._DIST_THRESHOLD);
+ return (jaro_dist >= this._DIST_THRESHOLD);
}
private bool _look_alike (string? a, string? b)
@@ -463,8 +472,14 @@ public class Folks.PotentialMatch : Object
return false;
}
+ assert (a.validate ());
+ assert (b.validate ());
+
+ var a_stripped = this._strip_string ((!) a);
+ var b_stripped = this._strip_string ((!) b);
+
// a and b look alike if their Jaro distance is over the threshold.
- return (this.jaro_dist ((!) a, (!) b) >= this._DIST_THRESHOLD);
+ return (this._jaro_dist (a_stripped, b_stripped) >= this._DIST_THRESHOLD);
}
/* Based on:
@@ -477,7 +492,7 @@ public class Folks.PotentialMatch : Object
* m = matching characters
* t = number of transpositions
*/
- private double jaro_dist (string s1, string s2)
+ private double _jaro_dist (unichar[] s1, unichar[] s2)
{
double distance;
int max = s1.length > s2.length ? s1.length : s2.length;
@@ -495,7 +510,7 @@ public class Folks.PotentialMatch : Object
distance = (1.0/3.0) * (a + b + c);
- debug ("[jaro_dist] Distance for %s and %s: %f\n", s1, s2, distance);
+ debug ("Jaro distance: %f (a = %f, b = %f, c = %f)", distance, a, b, c);
return distance;
}
@@ -563,28 +578,37 @@ public class Folks.PotentialMatch : Object
return retval[0];
}
+ private unichar[] _strip_string (string s)
+ {
+ int next_idx = 0;
+ uint write_idx = 0;
+ unichar ch = 0;
+ unichar[] output = new unichar[s.length]; // this is a safe overestimate
+
+ while (s.get_next_char (ref next_idx, out ch))
+ {
+ ch = this._stripped_char (ch);
+ if (ch != 0)
+ {
+ output[write_idx++] = ch;
+ }
+ }
+
+ output.length = (int) write_idx;
+ return output;
+ }
+
/* Calculate matches and transpositions as defined by the Jaro distance.
*/
- private int _matches (string s1, string s2, int max_dist, out double t)
+ private int _matches (unichar[] s1, unichar[] s2, int max_dist, out double t)
{
int matches = 0;
t = 0.0;
- assert (s1.validate ());
- assert (s2.validate ());
-
- int idx = 0;
unichar look_for = 0;
- while (s1.get_next_char (ref idx, out look_for))
+ for (uint idx = 0; (look_for = s1[idx]) != 0; idx++)
{
- /* Skip uninteresting characters. */
- look_for = this._stripped_char (look_for);
- if (look_for == 0)
- {
- continue;
- }
-
int contains = this._contains (s2, look_for, idx, max_dist);
if (contains >= 0)
{
@@ -594,8 +618,7 @@ public class Folks.PotentialMatch : Object
}
}
- debug ("%s and %s have %d matches and %f / 2 transpositions\n",
- s1, s2, matches, t);
+ debug ("%d matches and %f / 2 transpositions", matches, t);
t = t / 2.0;
return matches;
@@ -605,31 +628,27 @@ public class Folks.PotentialMatch : Object
* it withing the bounds of max_dist return abs(pos-pos_found).
* If its not found, return -1.
*
- * pos and max_dist are both in bytes.
+ * pos and max_dist are both in unichars.
*
* Note: haystack must have been validated using haystack.validate() before
* being passed to this method. */
- private int _contains (string haystack, unichar c, int pos, int max_dist)
+ private int _contains (unichar[] haystack, unichar c, uint pos, uint max_dist)
{
- var haystack_len = haystack.length; /* in bytes */
+ var haystack_len = haystack.length; /* in unichars */
- if (pos < haystack_len && haystack.get_char (pos) == c)
+ unichar ch = haystack[pos];
+ if (pos < haystack_len && ch == c)
return 0;
- int idx = (pos - max_dist).clamp (0, haystack_len);
- unichar ch = 0;
+ uint idx = ((int) pos - (int) max_dist).clamp (0, haystack_len);
+ ch = 0;
- while (idx < pos + max_dist && haystack.get_next_char (ref idx, out ch))
+ while (idx < pos + max_dist && (ch = haystack[idx]) != 0)
{
- /* Skip uninteresting characters. */
- ch = this._stripped_char (ch);
- if (ch == 0)
- {
- continue;
- }
-
if (ch == c)
- return (pos - idx).abs ();
+ return ((int) pos - (int) idx).abs ();
+
+ idx++;
}
return -1;
diff --git a/tests/tracker/match-name.vala b/tests/tracker/match-name.vala
index cf518ea..ec96eff 100644
--- a/tests/tracker/match-name.vala
+++ b/tests/tracker/match-name.vala
@@ -50,6 +50,8 @@ public class MatchNameTests : Folks.TestCase
this.test_match_name_3);
this.add_test ("test potential match by name #4 ",
this.test_match_name_4);
+ this.add_test ("test potential match by name #5 ",
+ this.test_match_name_5);
}
public override void set_up ()
@@ -110,6 +112,13 @@ public class MatchNameTests : Folks.TestCase
assert (this._match >= Folks.MatchResult.MEDIUM);
}
+ public void test_match_name_5 ()
+ {
+ /* bgo#678474 */
+ this._test_match_name ("FrÃdÃric Peters", "Frederic Peters");
+ assert (this._match >= Folks.MatchResult.HIGH);
+ }
+
private async void _test_match_name_async ()
{
var store = BackendStore.dup ();
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]