[gnome-builder] fuzzy: support utf8 and larger data-set sizes



commit c886a9e39826a7f47ce88ddb40db71871f23e11f
Author: Christian Hergert <christian hergert me>
Date:   Fri Jun 12 14:42:46 2015 -0700

    fuzzy: support utf8 and larger data-set sizes
    
    We had to bump the index entry to 48-bit, which definitely hurts our
    compactness a bit. But this is needed to support larger data set sizes
    which I hit when testing a 1.8M entry test-case.
    
    Also, we switch to utf8 casefolding for case insensitive indexing.
    We still store relative string offsets. We could probably compress
    this to character index, but it doesn't really buy us anything yet.
    
    I also switched to using GByteArray rather than manually calling
    malloc and realloc. We could probably use GStringChunk, but since
    these are mostly "build and re-use" indexes, the smaller space
    consumption is probably better.
    
    We still need to add support for removing data from the index, that
    way we can incrementally update the index, and just leak the removed
    strings in the GByteArray.

 contrib/search/fuzzy.c |  190 ++++++++++++++++++++++++------------------------
 1 files changed, 95 insertions(+), 95 deletions(-)
---
diff --git a/contrib/search/fuzzy.c b/contrib/search/fuzzy.c
index 4d04448..87219c7 100644
--- a/contrib/search/fuzzy.c
+++ b/contrib/search/fuzzy.c
@@ -48,25 +48,25 @@ typedef struct _FuzzyLookup FuzzyLookup;
 struct _Fuzzy
 {
    volatile gint   ref_count;
-   gchar          *heap;
-   gsize           heap_length;
-   gsize           heap_offset;
+   GByteArray     *heap;
    GArray         *id_to_text_offset;
    GPtrArray      *id_to_value;
-   GPtrArray      *char_tables;
+   GHashTable     *char_tables;
    gboolean        in_bulk_insert;
    gboolean        case_sensitive;
 };
 
 
+#pragma pack(push, 1)
 struct _FuzzyItem
 {
-   guint id  : 20;
-   guint pos : 12;
+   guint64 id  : 36;
+   guint   pos : 12;
 };
+#pragma pack(pop)
 
 
-G_STATIC_ASSERT(sizeof(FuzzyItem) == 4);
+G_STATIC_ASSERT(sizeof(FuzzyItem) == 6);
 
 
 struct _FuzzyLookup
@@ -138,26 +138,15 @@ fuzzy_ref (Fuzzy *fuzzy)
 Fuzzy *
 fuzzy_new (gboolean case_sensitive)
 {
-   GArray *table;
    Fuzzy *fuzzy;
-   gint i;
 
-   fuzzy = g_new0(Fuzzy, 1);
+   fuzzy = g_slice_new0 (Fuzzy);
    fuzzy->ref_count = 1;
-   fuzzy->heap_length = FUZZY_GROW_HEAP_BY;
-   fuzzy->heap = g_malloc(fuzzy->heap_length);
-   fuzzy->heap_offset = 0;
+   fuzzy->heap = g_byte_array_new ();
    fuzzy->id_to_value = g_ptr_array_new();
    fuzzy->id_to_text_offset = g_array_new(FALSE, FALSE, sizeof(gsize));
-   fuzzy->char_tables = g_ptr_array_new();
+   fuzzy->char_tables = g_hash_table_new_full (g_direct_hash, NULL, NULL, (GDestroyNotify)g_array_unref);
    fuzzy->case_sensitive = case_sensitive;
-   g_ptr_array_set_free_func(fuzzy->char_tables,
-                             (GDestroyNotify)g_array_unref);
-
-   for (i = 0; i < 256; i++) {
-      table = g_array_new(FALSE, FALSE, sizeof(FuzzyItem));
-      g_ptr_array_add(fuzzy->char_tables, table);
-   }
 
    return fuzzy;
 }
@@ -190,27 +179,16 @@ static gsize
 fuzzy_heap_insert (Fuzzy       *fuzzy,
                    const gchar *text)
 {
-   gsize offset;
-   gsize req_bytes;
-   gsize len;
+   gsize ret;
 
    g_assert(fuzzy);
    g_assert(text);
 
-   len = strlen(text) + 1;
-   req_bytes = fuzzy->heap_offset + len;
+   ret = fuzzy->heap->len;
 
-   if (req_bytes > fuzzy->heap_length) {
-      fuzzy->heap_length = (((req_bytes / FUZZY_GROW_HEAP_BY) + 1) *
-                            FUZZY_GROW_HEAP_BY);
-      fuzzy->heap = g_realloc(fuzzy->heap, fuzzy->heap_length);
-   }
-
-   offset = fuzzy->heap_offset;
-   memcpy(fuzzy->heap + offset, text, len);
-   fuzzy->heap_offset += len;
+   g_byte_array_append (fuzzy->heap, (guint8 *)text, strlen (text) + 1);
 
-   return offset;
+   return ret;
 }
 
 
@@ -227,8 +205,8 @@ fuzzy_heap_insert (Fuzzy       *fuzzy,
 void
 fuzzy_begin_bulk_insert (Fuzzy *fuzzy)
 {
-   g_return_if_fail(fuzzy);
-   g_return_if_fail(!fuzzy->in_bulk_insert);
+   g_return_if_fail (fuzzy);
+   g_return_if_fail (!fuzzy->in_bulk_insert);
 
    fuzzy->in_bulk_insert = TRUE;
 }
@@ -243,17 +221,21 @@ fuzzy_begin_bulk_insert (Fuzzy *fuzzy)
 void
 fuzzy_end_bulk_insert (Fuzzy *fuzzy)
 {
-   GArray *table;
-   gint i;
+   GHashTableIter iter;
+   gpointer key;
+   gpointer value;
 
    g_return_if_fail(fuzzy);
    g_return_if_fail(fuzzy->in_bulk_insert);
 
    fuzzy->in_bulk_insert = FALSE;
 
-   for (i = 0; i < fuzzy->char_tables->len; i++) {
-      table = g_ptr_array_index(fuzzy->char_tables, i);
-      g_array_sort(table, fuzzy_item_compare);
+   g_hash_table_iter_init (&iter, fuzzy->char_tables);
+
+   while (g_hash_table_iter_next (&iter, &key, &value)) {
+      GArray *table = value;
+
+      g_array_sort (table, fuzzy_item_compare);
    }
 }
 
@@ -273,57 +255,63 @@ fuzzy_insert (Fuzzy       *fuzzy,
               const gchar *key,
               gpointer     value)
 {
-   FuzzyItem item;
-   GArray *table;
+   const gchar *tmp;
    gchar *downcase = NULL;
    gsize offset;
-   guint idx;
-   gint id;
-   gint i;
+   guint id;
 
-   g_return_if_fail(fuzzy);
-   g_return_if_fail(key);
-   g_return_if_fail(fuzzy->id_to_text_offset->len < ((1 << 20) - 1));
+   g_assert (fuzzy != NULL);
+   g_assert (key != NULL);
+
+   if (fuzzy->id_to_text_offset->len == G_MAXUINT) {
+      return;
+   }
 
    if (!*key) {
       return;
    }
 
    if (!fuzzy->case_sensitive) {
-      downcase = g_ascii_strdown(key, -1);
+      downcase = g_utf8_casefold (key, -1);
    }
 
    /*
     * Insert the string into our heap.
     * Track the offset within the heap since the heap could realloc.
     */
-   offset = fuzzy_heap_insert(fuzzy, key);
-   g_array_append_val(fuzzy->id_to_text_offset, offset);
-   g_ptr_array_add(fuzzy->id_to_value, value);
-   g_assert(fuzzy->id_to_value->len == fuzzy->id_to_text_offset->len);
-
-   id = fuzzy->id_to_text_offset->len - 1;
+   offset = fuzzy_heap_insert (fuzzy, key);
+   id = fuzzy->id_to_text_offset->len;
+   g_array_append_val (fuzzy->id_to_text_offset, offset);
+   g_ptr_array_add (fuzzy->id_to_value, value);
+   g_assert (fuzzy->id_to_value->len == fuzzy->id_to_text_offset->len);
 
    if (!fuzzy->case_sensitive) {
       key = downcase;
    }
 
-   for (i = 0; key[i]; i++) {
-      idx = key[i];
-      table = g_ptr_array_index(fuzzy->char_tables, idx);
+   for (tmp = key; *tmp; tmp = g_utf8_next_char (tmp)) {
+      gunichar ch = g_utf8_get_char (tmp);
+      GArray *table;
+      FuzzyItem item;
+
+      table = g_hash_table_lookup (fuzzy->char_tables, GINT_TO_POINTER (ch));
+
+      if (G_UNLIKELY (table == NULL)) {
+         table = g_array_new (FALSE, FALSE, sizeof (FuzzyItem));
+         g_hash_table_insert (fuzzy->char_tables, GINT_TO_POINTER (ch), table);
+      }
 
       item.id = id;
-      item.pos = i;
-      g_array_append_val(table, item);
+      item.pos = tmp - key;
+
+      g_array_append_val (table, item);
 
       if (!fuzzy->in_bulk_insert) {
-         g_array_sort(table, fuzzy_item_compare);
+         g_array_sort (table, fuzzy_item_compare);
       }
    }
 
-   if (!fuzzy->case_sensitive) {
-      g_free(downcase);
-   }
+   g_free (downcase);
 }
 
 
@@ -341,21 +329,19 @@ fuzzy_unref (Fuzzy *fuzzy)
    g_return_if_fail (fuzzy->ref_count > 0);
 
    if (g_atomic_int_dec_and_test (&fuzzy->ref_count)) {
-      g_free(fuzzy->heap);
-      fuzzy->heap = 0;
-      fuzzy->heap_offset = 0;
-      fuzzy->heap_length = 0;
+      g_byte_array_unref (fuzzy->heap);
+      fuzzy->heap = NULL;
 
-      g_array_unref(fuzzy->id_to_text_offset);
+      g_array_unref (fuzzy->id_to_text_offset);
       fuzzy->id_to_text_offset = NULL;
 
-      g_ptr_array_unref(fuzzy->id_to_value);
+      g_ptr_array_unref (fuzzy->id_to_value);
       fuzzy->id_to_value = NULL;
 
-      g_ptr_array_unref(fuzzy->char_tables);
+      g_hash_table_unref (fuzzy->char_tables);
       fuzzy->char_tables = NULL;
 
-      g_free(fuzzy);
+      g_slice_free (Fuzzy, fuzzy);
    }
 }
 
@@ -426,11 +412,12 @@ fuzzy_get_string (Fuzzy *fuzzy,
 {
    gsize offset;
 
-   g_assert(fuzzy);
-   g_assert(id >= 0);
+   g_assert (fuzzy);
+   g_assert (id >= 0);
 
-   offset = g_array_index(fuzzy->id_to_text_offset, gsize, id);
-   return fuzzy->heap + offset;
+   offset = g_array_index (fuzzy->id_to_text_offset, gsize, id);
+
+   return (const gchar *)&fuzzy->heap->data [offset];
 }
 
 
@@ -477,39 +464,51 @@ fuzzy_match (Fuzzy       *fuzzy,
    }
 
    if (!fuzzy->case_sensitive) {
-      downcase = g_ascii_strdown(needle, -1);
+      downcase = g_utf8_casefold (needle, -1);
       needle = downcase;
    }
 
    lookup.fuzzy = fuzzy;
-   lookup.n_tables = strlen(needle);
-   lookup.state = g_new0(gint, lookup.n_tables);
-   lookup.tables = g_new0(GArray*, lookup.n_tables);
+   lookup.n_tables = g_utf8_strlen (needle, -1);
+   lookup.state = g_new0 (gint, lookup.n_tables);
+   lookup.tables = g_new0 (GArray*, lookup.n_tables);
    lookup.needle = needle;
    lookup.max_matches = max_matches;
    lookup.matches = g_hash_table_new(NULL, NULL);
 
-   for (i = 0; needle[i]; i++) {
-      lookup.tables[i] = g_ptr_array_index(fuzzy->char_tables,
-                                           (guint)needle[i]);
+   for (i = 0; *needle; needle = g_utf8_next_char (needle)) {
+      gunichar ch;
+      GArray *table;
+
+      ch = g_utf8_get_char (needle);
+      table = g_hash_table_lookup (fuzzy->char_tables, GINT_TO_POINTER (ch));
+
+      if (table != NULL)
+         lookup.tables [i++] = table;
    }
 
-   root = g_ptr_array_index(fuzzy->char_tables, (guint)needle[0]);
+   lookup.n_tables = i;
 
-   if (G_LIKELY(lookup.n_tables > 1)) {
+   root = g_hash_table_lookup (fuzzy->char_tables,
+                               GINT_TO_POINTER (g_utf8_get_char (needle)));
+
+   if (root == NULL)
+      return matches;
+
+   if (G_LIKELY (lookup.n_tables > 1)) {
       for (i = 0; i < root->len; i++) {
          item = &g_array_index(root, FuzzyItem, i);
          fuzzy_do_match(&lookup, item, 1, 0);
       }
    } else {
       for (i = 0; i < root->len; i++) {
-         item = &g_array_index(root, FuzzyItem, i);
-         match.key = fuzzy_get_string(fuzzy, item->id);
-         match.value = g_ptr_array_index(fuzzy->id_to_value, item->id);
+         item = &g_array_index (root, FuzzyItem, i);
+         match.key = fuzzy_get_string (fuzzy, item->id);
+         match.value = g_ptr_array_index (fuzzy->id_to_value, item->id);
          match.score = 0;
-         g_array_append_val(matches, match);
+         g_array_append_val (matches, match);
       }
-      g_free(downcase);
+      g_free (downcase);
       return matches;
    }
 
@@ -519,8 +518,9 @@ fuzzy_match (Fuzzy       *fuzzy,
       gpointer value;
 
       g_hash_table_iter_init(&iter, lookup.matches);
-      while (g_hash_table_iter_next(&iter, &key, &value)) {
-         match.key = fuzzy_get_string(fuzzy, GPOINTER_TO_INT(key));
+
+      while (g_hash_table_iter_next (&iter, &key, &value)) {
+         match.key = fuzzy_get_string (fuzzy, GPOINTER_TO_INT(key));
          match.score = 1.0 / (strlen(match.key) + GPOINTER_TO_INT(value));
          match.value = g_ptr_array_index(fuzzy->id_to_value,
                                          GPOINTER_TO_INT(key));


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]