[libdazzle] search: find best match within corpus word



commit b0eff9ab75a0837039304fb42bafd8c51078a5c6
Author: Christian Hergert <chergert redhat com>
Date:   Tue Jul 16 15:49:31 2019 -0700

    search: find best match within corpus word
    
    We might find a word that is a better match by starting with a character
    later on in the word. However, if the previous scan moved us past the
    character/pos tuple that would be necessary to match, we won't find it.
    
    This allows us to rollback the position to a location where we can once
    again match upcoming character for the current key-id. It's a small hit
    (walking backwards) to greatly increase the likelyhood we match a
    contiguous string.
    
    This tweaks the scoring a bit so that we can detect a perfect match (all
    characters one after another) without perturbing the score with the
    length. Such is necessary so that longer words with a perfect match
    do not score lower than shorter works with a partial match. However, the
    in-between states there are going to fall into the previous trap (which
    is probably fine for now).
    
    Fixes #41

 src/search/dzl-fuzzy-mutable-index.c | 79 ++++++++++++++++++++++++++++++++----
 1 file changed, 70 insertions(+), 9 deletions(-)
---
diff --git a/src/search/dzl-fuzzy-mutable-index.c b/src/search/dzl-fuzzy-mutable-index.c
index 5c16e74..a85a56b 100644
--- a/src/search/dzl-fuzzy-mutable-index.c
+++ b/src/search/dzl-fuzzy-mutable-index.c
@@ -332,23 +332,50 @@ dzl_fuzzy_mutable_index_unref (DzlFuzzyMutableIndex *fuzzy)
     }
 }
 
+static void
+rollback_state_to_pos (GArray *table,
+                       gint   *state,
+                       guint   id,
+                       guint   pos)
+{
+  g_assert (table != NULL);
+  g_assert (state != NULL);
+  g_assert (pos > 0);
+
+  while (*state > 0 && *state <= table->len)
+    {
+      DzlFuzzyMutableIndexItem *iter;
+
+      (*state)--;
+
+      iter = &g_array_index (table, DzlFuzzyMutableIndexItem, *state);
+
+      if (iter->id > id || (iter->id == id && *state >= pos))
+        continue;
+
+      break;
+    }
+}
+
 static gboolean
 dzl_fuzzy_mutable_index_do_match (DzlFuzzyMutableIndexLookup *lookup,
                                   DzlFuzzyMutableIndexItem   *item,
                                   guint                       table_index,
                                   gint                        score)
 {
-  DzlFuzzyMutableIndexItem *iter;
-  gpointer key;
+  gboolean ret = FALSE;
   GArray *table;
   gint *state;
-  gint iter_score;
 
   table = lookup->tables [table_index];
   state = &lookup->state [table_index];
 
   for (; state [0] < (gint)table->len; state [0]++)
     {
+      DzlFuzzyMutableIndexItem *iter;
+      gpointer key;
+      gint iter_score;
+
       iter = &g_array_index (table, DzlFuzzyMutableIndexItem, state[0]);
 
       if ((iter->id < item->id) || ((iter->id == item->id) && (iter->pos <= item->pos)))
@@ -356,12 +383,26 @@ dzl_fuzzy_mutable_index_do_match (DzlFuzzyMutableIndexLookup *lookup,
       else if (iter->id > item->id)
         break;
 
-      iter_score = score + (iter->pos - item->pos);
+      iter_score = score + (iter->pos - item->pos - 1);
 
       if ((table_index + 1) < lookup->n_tables)
         {
           if (dzl_fuzzy_mutable_index_do_match (lookup, iter, table_index + 1, iter_score))
-            return TRUE;
+            {
+              ret = TRUE;
+
+              /* We already found a match, but we could have a better match
+               * further in the word. We need to rollback all of our additional
+               * table state to the current position so that we can possibly
+               * advance again.
+               */
+              if ((state[0] + 1) < table->len &&
+                  g_array_index (table, DzlFuzzyMutableIndexItem, state[0] + 1).id == item->id)
+                {
+                  for (guint i = table_index + 1; i < lookup->n_tables; i++)
+                    rollback_state_to_pos (lookup->tables[i], &lookup->state[i], iter->id, iter->pos + 1);
+                }
+            }
           continue;
         }
 
@@ -371,10 +412,10 @@ dzl_fuzzy_mutable_index_do_match (DzlFuzzyMutableIndexLookup *lookup,
           (iter_score < GPOINTER_TO_INT (g_hash_table_lookup (lookup->matches, key))))
         g_hash_table_insert (lookup->matches, key, GINT_TO_POINTER (iter_score));
 
-      return TRUE;
+      ret = TRUE;
     }
 
-  return FALSE;
+  return ret;
 }
 
 static inline const gchar *
@@ -466,7 +507,19 @@ dzl_fuzzy_mutable_index_match (DzlFuzzyMutableIndex *fuzzy,
       for (i = 0; i < root->len; i++)
         {
           item = &g_array_index (root, DzlFuzzyMutableIndexItem, i);
-          dzl_fuzzy_mutable_index_do_match (&lookup, item, 1, MIN (16, item->pos * 4));
+
+          if (dzl_fuzzy_mutable_index_do_match (&lookup, item, 1, 0) &&
+              i + 1 < root->len &&
+              (item + 1)->id == item->id)
+            {
+              /* We found a match, but we might find another one with a higher
+               * score later on for the same item of the corpus.  We need to
+               * roll state back to the position we're starting at so that we
+               * can match all the same characters again.
+               */
+              for (guint j = 1; j < lookup.n_tables; j++)
+                rollback_state_to_pos (lookup.tables[j], &lookup.state[j], item->id, item->pos + 1);
+            }
         }
     }
   else
@@ -500,9 +553,17 @@ dzl_fuzzy_mutable_index_match (DzlFuzzyMutableIndex *fuzzy,
 
       match.id = GPOINTER_TO_INT (key);
       match.key = dzl_fuzzy_mutable_index_get_string (fuzzy, match.id);
-      match.score = 1.0 / (strlen (match.key) + GPOINTER_TO_INT (value));
       match.value = g_ptr_array_index (fuzzy->id_to_value, match.id);
 
+      /* If we got a perfect substring match, then this is 1.0, and avoid
+       * perturbing further or we risk non-contiguous (but shorter strings)
+       * matching at higher value.
+       */
+      if (value == NULL)
+        match.score = 1.0;
+      else
+        match.score = 1.0 / (strlen (match.key) + GPOINTER_TO_INT (value));
+
       g_array_append_val (matches, match);
     }
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]