[folks] Add support to perform potential matches among Individuals
- From: Raul Gutierrez Segales <raulgs src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [folks] Add support to perform potential matches among Individuals
- Date: Sat, 9 Apr 2011 00:43:55 +0000 (UTC)
commit 62695d90ce0b63656e1adb454966495cbba86648
Author: Raul Gutierrez Segales <raul gutierrez segales collabora co uk>
Date: Fri Apr 8 23:52:54 2011 +0100
Add support to perform potential matches among Individuals
folks/Makefile.am | 1 +
folks/individual-aggregator.vala | 75 ++++++
folks/potential-match.vala | 462 ++++++++++++++++++++++++++++++++++++++
3 files changed, 538 insertions(+), 0 deletions(-)
---
diff --git a/folks/Makefile.am b/folks/Makefile.am
index c01cdf2..51eefc4 100644
--- a/folks/Makefile.am
+++ b/folks/Makefile.am
@@ -40,6 +40,7 @@ libfolks_la_SOURCES = \
types.vala \
debug.vala \
utils.vala \
+ potential-match.vala \
$(NULL)
libfolks_la_VALAFLAGS = \
diff --git a/folks/individual-aggregator.vala b/folks/individual-aggregator.vala
index 39bafe5..96fb424 100644
--- a/folks/individual-aggregator.vala
+++ b/folks/individual-aggregator.vala
@@ -225,6 +225,81 @@ public class Folks.IndividualAggregator : Object
}
}
+ /**
+ * Get all matches for a given { link Individual}.
+ *
+ * @since UNRELEASED
+ */
+ public HashMap<Individual, MatchResult> get_potential_matches
+ (Individual matchee, MatchResult min_threshold = MatchResult.VERY_HIGH)
+ {
+ HashMap<Individual, MatchResult> matches =
+ new HashMap<Individual, MatchResult> ();
+ Folks.PotentialMatch matchObj = new Folks.PotentialMatch ();
+
+ foreach (var i in this.individuals.get_values ())
+ {
+ if (i.id == matchee.id)
+ continue;
+
+ var result = matchObj.potential_match (i, matchee);
+ if (result >= min_threshold)
+ {
+ matches.set (i, result);
+ }
+ }
+
+ return matches;
+ }
+
+ /**
+ * Get all combinations between all { link Individual}s.
+ *
+ * @since UNRELEASED
+ */
+ public HashMap<Individual, HashMap<Individual, MatchResult>>
+ get_all_potential_matches
+ (MatchResult min_threshold = MatchResult.VERY_HIGH)
+ {
+ HashMap<Individual, HashMap<Individual, MatchResult>> matches =
+ new HashMap<Individual, HashMap<Individual, MatchResult>> ();
+ GLib.List<Individual> individuals = this.individuals.get_values ();
+ Folks.PotentialMatch matchObj = new Folks.PotentialMatch ();
+
+ for (unowned GLib.List<Individual> l = individuals;
+ l != null && l.next != null; l = l.next)
+ {
+ var a = l.data;
+ var matches_a = matches.get (a);
+ if (matches_a == null)
+ {
+ matches_a = new HashMap<Individual, MatchResult> ();
+ matches.set (a, matches_a);
+ }
+
+ for (unowned GLib.List<Individual> j=l.next; j != null; j = j.next)
+ {
+ var b = j.data;
+ var matches_b = matches.get (b);
+ if (matches_b == null)
+ {
+ matches_b = new HashMap<Individual, MatchResult> ();
+ matches.set (b, matches_b);
+ }
+
+ var result = matchObj.potential_match (a, b);
+
+ if (result >= min_threshold)
+ {
+ matches_a.set (b, result);
+ matches_b.set (b, result);
+ }
+ }
+ }
+
+ return matches;
+ }
+
private async void _add_backend (Backend backend)
{
if (!this._backends.contains (backend))
diff --git a/folks/potential-match.vala b/folks/potential-match.vala
new file mode 100644
index 0000000..96c1410
--- /dev/null
+++ b/folks/potential-match.vala
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2011 Collabora Ltd.
+ *
+ * This library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Authors:
+ * Raul Gutierrez Segales <raul gutierrez segales collabora co uk>
+ */
+
+
+/**
+ * Likely-ness of a potential match.
+ *
+ * Note that the order should be maintained.
+ */
+public enum Folks.MatchResult
+{
+ VERY_LOW,
+ LOW,
+ MEDIUM,
+ HIGH,
+ VERY_HIGH,
+ MIN = VERY_LOW,
+ MAX = VERY_HIGH
+}
+
+/**
+ * This class provides functionality to explore a potential match between
+ * two individuals.
+ *
+ * @since UNRELEASED
+ */
+public class Folks.PotentialMatch : Object
+{
+ MatchResult _result;
+ private Folks.Individual _individual_a;
+ private Folks.Individual _individual_b;
+ public static Gee.HashSet<string> known_email_aliases = null;
+ private static double _DIST_THRESHOLD = 0.70;
+ private const string _SEPARATORS = "._-+";
+
+ public PotentialMatch ()
+ {
+ if (this.known_email_aliases == null)
+ {
+ this.known_email_aliases = new Gee.HashSet<string> (str_hash,
+ str_equal);
+ this.known_email_aliases.add ("admin");
+ this.known_email_aliases.add ("abuse");
+ this.known_email_aliases.add ("webmaster");
+ }
+ }
+
+ /**
+ * Whether two individuals are likely to be the same person.
+ *
+ * @since UNRELEASED
+ */
+ public MatchResult potential_match (Individual a, Individual b)
+ {
+ this._individual_a = a;
+ this._individual_b = b;
+ this._result = MatchResult.MIN;
+
+ if (this._individual_a.gender != Gender.UNSPECIFIED &&
+ this._individual_b.gender != Gender.UNSPECIFIED &&
+ this._individual_a.gender != this._individual_b.gender)
+ {
+ return this._result;
+ }
+
+ /* If individuals share common im-addresses */
+ this._inspect_im_addresses ();
+ if (this._result == MatchResult.MAX)
+ return this._result;
+
+ /* If individuals share common e-mails */
+ this._inspect_emails ();
+ if (this._result == MatchResult.MAX)
+ return this._result;
+
+ /* If individuals share common phone numbers */
+ this._inspect_phone_numbers ();
+ if (this._result == MatchResult.MAX)
+ return this._result;
+
+ /* they have the same (normalised) name? */
+ this._name_similarity ();
+ if (this._result == MatchResult.MAX)
+ return this._result;
+
+ return this._result;
+ }
+
+ private void _inspect_phone_numbers ()
+ {
+ unowned GLib.List<FieldDetails> list_a =
+ this._individual_a.phone_numbers;
+ unowned GLib.List<FieldDetails> list_b =
+ this._individual_b.phone_numbers;
+
+ foreach (var fd_a in list_a)
+ {
+ foreach (var fd_b in list_b)
+ {
+ if (PhoneDetails.numbers_equal (fd_a.value, fd_b.value))
+ {
+ this._result = MatchResult.HIGH;
+ return;
+ }
+ }
+ }
+ }
+
+ /**
+ * Keep in sync with Folks.MatchResult.
+ *
+ * @since UNRELEASED
+ */
+ public string result_to_string (MatchResult result)
+ {
+ string match_level = "";
+ switch (result)
+ {
+ case Folks.MatchResult.VERY_LOW:
+ match_level = "very unlikely";
+ break;
+ case Folks.MatchResult.LOW:
+ match_level = "low";
+ break;
+ case Folks.MatchResult.MEDIUM:
+ match_level = "medium";
+ break;
+ case Folks.MatchResult.HIGH:
+ match_level = "high";
+ break;
+ case Folks.MatchResult.VERY_HIGH:
+ match_level = "very high";
+ break;
+ }
+ return match_level;
+ }
+
+ /* Approach:
+ * - taking in account family, given, prefix, suffix and additional names
+ * we give some points for each non-empty match
+ *
+ * @since UNRELEASED
+ */
+ private void _name_similarity ()
+ {
+ Folks.StructuredName a = this._individual_a.structured_name;
+ Folks.StructuredName b = this._individual_b.structured_name;
+ double similarity = 0.0;
+
+ if (this._look_alike (this._individual_a.nickname,
+ this._individual_b.nickname))
+ {
+ similarity += 0.20;
+ }
+
+ if (this._look_alike (this._individual_a.full_name,
+ this._individual_b.full_name))
+ {
+ similarity += 0.70;
+ }
+
+ if (a != null && b != null)
+ {
+ if (a.is_empty () == false && a.equal (b))
+ {
+ this._result = MatchResult.HIGH;
+ return;
+ }
+
+ if (Folks.Utils._str_equal_safe (a.given_name, b.given_name))
+ similarity += 0.20;
+
+ if (this._look_alike (a.family_name, b.family_name) &&
+ this._look_alike (a.given_name, b.given_name))
+ {
+ similarity += 0.40;
+ }
+
+ if (Folks.Utils._str_equal_safe (a.additional_names,
+ b.additional_names))
+ similarity += 0.5;
+
+ if (Folks.Utils._str_equal_safe (a.prefixes, b.prefixes))
+ similarity += 0.5;
+
+ if (Folks.Utils._str_equal_safe (a.suffixes, b.suffixes))
+ similarity += 0.5;
+ }
+
+ debug ("[name_similarity] Got %f\n", similarity);
+
+ if (similarity >= this._DIST_THRESHOLD)
+ this._result = this._inc_match_level (this._result, 2);
+ }
+
+ /**
+ * Number of equal IM addresses between two individuals.
+ *
+ * @since UNRELEASED
+ */
+ public void _inspect_im_addresses ()
+ {
+ foreach (var proto in this._individual_a.im_addresses.get_keys ())
+ {
+ var addrs_b = this._individual_b.im_addresses.lookup (proto);
+ if (addrs_b == null)
+ continue;
+
+ var addrs_a = this._individual_a.im_addresses.lookup (proto);
+
+ foreach (var im_a in addrs_a)
+ {
+ foreach (var im_b in addrs_b)
+ {
+ if (im_a == im_b)
+ {
+ this._result = MatchResult.HIGH;
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Inspect email addresses.
+ *
+ * @since UNRELEASED
+ */
+ private void _inspect_emails ()
+ {
+ unowned GLib.List<FieldDetails> list_a =
+ this._individual_a.email_addresses;
+ unowned GLib.List<FieldDetails> list_b =
+ this._individual_b.email_addresses;
+
+ foreach (var fd_a in list_a)
+ {
+ foreach (var fd_b in list_b)
+ {
+ string[] email_split_a = fd_a.value.split ("@");
+ string[] email_split_b = fd_b.value.split ("@");
+
+ if (fd_a.value == fd_b.value)
+ {
+ if (this.known_email_aliases.contains
+ (email_split_a[0]) == true)
+ {
+ if (this._result < MatchResult.HIGH)
+ {
+ this._result = MatchResult.LOW;
+ }
+ }
+ else
+ {
+ this._result = MatchResult.HIGH;
+ return;
+ }
+ }
+ else
+ {
+ string[] tokens_a =
+ email_split_a[0].split_set (this._SEPARATORS);
+ string[] tokens_b =
+ email_split_b[0].split_set (this._SEPARATORS);
+
+ /* Do we have: first.middle.last@ ~= fml@ ? */
+ if (this._check_initials_expansion (tokens_a, tokens_b))
+ {
+ this._result = MatchResult.MEDIUM;
+ }
+ /* So we have splitted the user part of the e-mail
+ * address into tokens. Lets see if there is some
+ * matches between tokens.
+ * As in: first.middle.last@ ~= [first,middle,..]@ */
+ else if (this._match_tokens (tokens_a, tokens_b))
+ {
+ this._result = MatchResult.MEDIUM;
+ }
+ }
+ }
+ }
+ }
+
+ /* We are after:
+ * you.are.someone@ =~ yas@
+ */
+ private bool _check_initials_expansion (string[] tokens_a, string[] tokens_b)
+ {
+ if (tokens_a.length > tokens_b.length &&
+ tokens_b.length == 1)
+ {
+ return this._do_check_initials_expansion (tokens_a, tokens_b[0]);
+ }
+ else if (tokens_b.length > tokens_a.length &&
+ tokens_a.length == 1)
+ {
+ return this._do_check_initials_expansion (tokens_b, tokens_a[0]);
+ }
+ return false;
+ }
+
+ private bool _do_check_initials_expansion (string[] expanded_name,
+ string initials)
+ {
+ if (expanded_name.length != initials.length)
+ return false;
+
+ for (int i=0; i<expanded_name.length; i++)
+ {
+ if (expanded_name[i][0] != initials[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ /*
+ * We should probably count how many tokens matched?
+ */
+ private bool _match_tokens (string[] tokens_a, string[] tokens_b)
+ {
+ /* To find matching items from 2 sets its more efficient
+ * to make the outer loop go with the smaller set. */
+ if (tokens_a.length > tokens_b.length)
+ return this._do_match_tokens (tokens_a, tokens_b);
+ else
+ return this._do_match_tokens (tokens_b, tokens_a);
+ }
+
+ private bool _do_match_tokens (string[] bigger_set, string[] smaller_set)
+ {
+ for (var i=0; i < smaller_set.length; i++)
+ {
+ for (var j=0; j < bigger_set.length; j++)
+ {
+ if (smaller_set[i] == bigger_set[j])
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ private MatchResult _inc_match_level (
+ MatchResult current_level, int times = 1)
+ {
+ MatchResult ret = current_level + times;
+ if (ret > MatchResult.MAX)
+ ret = MatchResult.MAX;
+
+ return ret;
+ }
+
+ private bool _look_alike (string? a, string? b)
+ {
+ if (a == null || b == null)
+ {
+ return false;
+ }
+
+ bool alike = this.jaro_dist (a, b) >= this._DIST_THRESHOLD ? true : false;
+ return alike;
+ }
+
+ /* Based on:
+ * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
+ *
+ * d = 1/3 * ( m/|s1| + m/|s2| + (m - t)/m )
+ *
+ * where
+ *
+ * m = matching characters
+ * t = number of transpositions
+ */
+ public double jaro_dist (string s1, string s2)
+ {
+ double distance;
+ int max = s1.length > s2.length ? s1.length : s2.length;
+ int max_dist = (max / 2) - 1;
+ double t;
+ double m = (double) this._matches (s1, s2, max_dist, out t);
+ double len_s1 = (double) s1.length;
+ double len_s2 = (double) s2.length;
+ double a = m / len_s1;
+ double b = m / len_s2;
+ double c = 0;
+
+ if ((int) m > 0)
+ c = (m - t) / m;
+
+ distance = (1.0/3.0) * (a + b + c);
+
+ debug ("[jaro_dist] Distance for %s and %s: %f\n", s1, s2, distance);
+
+ return distance;
+ }
+
+ /* Calculate matches and transpositions as defined by the Jaro distance.
+ */
+ private int _matches (string s1, string s2, int max_dist, out double t)
+ {
+ int matches = 0;
+ t = 0.0;
+
+ for (int i=0; i < s1.length; i++)
+ {
+ var look_for = s1.slice (i, i + 1);
+ int contains = this._contains (s2, look_for, i, max_dist);
+ if (contains >= 0)
+ {
+ matches++;
+ if (contains > 0)
+ t += 1.0;
+ }
+ }
+
+ debug ("%s and %s have %d matches and %f / 2 transpositions\n",
+ s1, s2, matches, t);
+
+ t = t / 2.0;
+ return matches;
+ }
+
+ /* If haystack contains c in pos return 0, if it contains
+ * it withing the bounds of max_dist return abs(pos-pos_found).
+ * If its not found, return -1. */
+ private int _contains (string haystack, string c, int pos, int max_dist)
+ {
+ if (pos < haystack.length && haystack.slice (pos, pos + 1) == c)
+ return 0;
+
+ for (int i=pos-max_dist; i <= pos + max_dist; i++)
+ {
+ if (i < 0 || i >= haystack.length)
+ continue;
+
+ var str = haystack.slice (i, i + 1);
+ if (str == c)
+ return (pos - i).abs ();
+ }
+
+ return -1;
+ }
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]