[gnote] Port the new trie implementation from Tomboy



commit 207c45f6ad0502d108ec78689f56aeb43474f886
Author: Debarshi Ray <debarshir src gnome org>
Date:   Sat Oct 1 22:46:22 2011 +0300

    Port the new trie implementation from Tomboy
    
    Tomboy bug: https://bugzilla.gnome.org/584362
    Fixes:      https://bugzilla.gnome.org/660653

 src/notemanager.cpp   |    1 +
 src/test/trietest.cpp |   25 ++--
 src/trie.hpp          |  457 ++++++++++++++++++++++---------------------------
 src/triehit.hpp       |   70 +++++----
 src/watchers.cpp      |   22 ++-
 5 files changed, 272 insertions(+), 303 deletions(-)
---
diff --git a/src/notemanager.cpp b/src/notemanager.cpp
index 25bc980..a354ccd 100644
--- a/src/notemanager.cpp
+++ b/src/notemanager.cpp
@@ -774,6 +774,7 @@ namespace gnote {
       const Note::Ptr & note(*iter);
       m_title_trie->add_keyword (note->get_title(), note);
     }
+    m_title_trie->compute_failure_graph();
   }
 
 }
diff --git a/src/test/trietest.cpp b/src/test/trietest.cpp
index 195e30b..e994770 100644
--- a/src/test/trietest.cpp
+++ b/src/test/trietest.cpp
@@ -18,29 +18,30 @@ int test_main(int /*argc*/, char ** /*argv*/)
   trie.add_keyword ("baz", "baz");
   trie.add_keyword ("bazar", "bazar");
   trie.add_keyword ("ÄÄÄÄÄÅÅÅÅ", "ÄÄÄÄÄÅÅÅÅ");
+  trie.compute_failure_graph();
 
   printf ("Starting search...\n");
-  gnote::TrieTree<std::string>::HitListPtr matches(trie.find_matches (src));
+  gnote::TrieHit<std::string>::ListPtr matches(trie.find_matches (src));
   BOOST_CHECK( matches.get() );
-
+  printf ("Matches size\n");
   BOOST_CHECK( matches->size() == 16 );
-  gnote::TrieTree<std::string>::HitList::const_iterator iter = matches->begin();
+  gnote::TrieHit<std::string>::List::const_iterator iter = matches->begin();
 
   BOOST_CHECK( *iter );
-  BOOST_CHECK( (*iter)->key == "baz" );
-  BOOST_CHECK( (*iter)->start == 0 );
-  BOOST_CHECK( (*iter)->end == 3 );
+  BOOST_CHECK( (*iter)->key() == "baz" );
+  BOOST_CHECK( (*iter)->start() == 0 );
+  BOOST_CHECK( (*iter)->end() == 3 );
   ++iter;
   BOOST_CHECK( *iter );
-  BOOST_CHECK( (*iter)->key == "bazar" );
-  BOOST_CHECK( (*iter)->start == 0 );
-  BOOST_CHECK( (*iter)->end == 5 );
+  BOOST_CHECK( (*iter)->key() == "bazar" );
+  BOOST_CHECK( (*iter)->start() == 0 );
+  BOOST_CHECK( (*iter)->end() == 5 );
 
-  for(gnote::TrieTree<std::string>::HitList::const_iterator hit_iter = matches->begin();
+  for(gnote::TrieHit<std::string>::List::const_iterator hit_iter = matches->begin();
       hit_iter != matches->end(); ++hit_iter) {
-    gnote::TrieHit<std::string> *hit(*hit_iter);
+    gnote::TrieHit<std::string>::Ptr hit(*hit_iter);
     printf ("*** Match: '%s' at %d-%d\n",
-            hit->key.c_str(), hit->start, hit->end);
+            hit->key().c_str(), hit->start(), hit->end());
   }
   printf ("Search finished!\n");
   return 0;
diff --git a/src/trie.hpp b/src/trie.hpp
index f457ce4..c50b7c2 100644
--- a/src/trie.hpp
+++ b/src/trie.hpp
@@ -1,6 +1,7 @@
 /*
  * gnote
  *
+ * Copyright (C) 2011 Debarshi Ray
  * Copyright (C) 2009 Hubert Figuiere
  *
  * This program is free software: you can redistribute it and/or modify
@@ -17,292 +18,246 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-
-
-
 #ifndef __TRIE_HPP_
 #define __TRIE_HPP_
 
-#include <ctype.h>
-
-#include <string>
+#include <list>
+#include <queue>
 #include <tr1/memory>
 
-#include "sharp/string.hpp"
+#include <glibmm.h>
 
-#include "note.hpp"
 #include "triehit.hpp"
 
 namespace gnote {
 
+template<class value_t>
+class TrieTree
+{
 
-  template<class value_t>
-  class TrieTree 
+private:
+
+  class TrieState;
+  typedef std::tr1::shared_ptr<TrieState> TrieStatePtr;
+  typedef std::list<TrieStatePtr> TrieStateList;
+  typedef std::queue<TrieStatePtr> TrieStateQueue;
+
+  class TrieState
   {
-    class TrieMatch;
-    typedef std::tr1::shared_ptr<TrieMatch> TrieMatchPtr;
-    class TrieState;
-    typedef std::tr1::shared_ptr<TrieState> TrieStatePtr;
+  public:
 
-    class TrieState
+    TrieState(gunichar v, int d, const TrieStatePtr & s)
+      : m_value(v)
+      , m_depth(d)
+      , m_fail_state(s)
+      , m_transitions()
+      , m_payload()
+      , m_payload_present(false)
     {
-    public:
-      TrieState()
-        : final(0)
-        {
-        }
-      TrieStatePtr next;
-      TrieStatePtr fail;
-      TrieMatchPtr first_match;
-      int         final;
-      value_t     id;
-    };
-
-    class TrieMatch
+    }
+
+    gunichar value() const
     {
-    public:
-      TrieMatch()
-        : value(0)
-        {
-        }
-      TrieMatchPtr next;
-      TrieStatePtr state;
-      gunichar     value;
-    };
+      return m_value;
+    }
 
-  public:
-    typedef TrieHit<value_t>  triehit_t;
-    
-    typedef typename triehit_t::List     HitList;
-    typedef typename triehit_t::ListPtr  HitListPtr;
-
-    TrieTree(bool case_sensitive)
-      : m_root(new TrieState())
-      , m_fail_states(8)
-      , m_case_sensitive(case_sensitive)
-      , m_max_length(0)
-      {
-      }
+    int depth() const
+    {
+      return m_depth;
+    }
 
-    ~TrieTree()
-      {
+    TrieStatePtr fail_state()
+    {
+      return m_fail_state;
+    }
 
-      }
+    void fail_state(const TrieStatePtr & s)
+    {
+      m_fail_state = s;
+    }
+
+    TrieStateList & transitions()
+    {
+      return m_transitions;
+    }
+
+    value_t payload() const
+    {
+      return m_payload;
+    }
+
+    void payload(const value_t & p)
+    {
+      m_payload = p;
+    }
+
+    bool payload_present() const
+    {
+      return m_payload_present;
+    }
+
+    void payload_present(bool pp)
+    {
+      m_payload_present = pp;
+    }
 
   private:
-    TrieStatePtr insert_match_at_state(size_t depth, const TrieStatePtr & q, gunichar c)
-      {
-        // Create a new state with a failure at %root
-        TrieStatePtr new_q(new TrieState ());
-        new_q->fail = m_root;
-
-        // Insert/Replace into fail_states at %depth
-        if (depth < m_fail_states.size()) {
-          new_q->next = m_fail_states[depth];
-          m_fail_states[depth] = new_q;
-        } 
-        else {
-          m_fail_states.resize(depth + 1);
-          m_fail_states[depth] = new_q;
-        }
 
-        // New match points to the newly created state for value %c
-        TrieMatchPtr m(new TrieMatch ());
-        m->next = q->first_match;
-        m->state = new_q;
-        m->value = c;
+    gunichar m_value;
+    int m_depth;
+    TrieStatePtr m_fail_state;
+    TrieStateList m_transitions;
+    value_t m_payload;
+    bool m_payload_present;
+  };
 
-        // Insert the new match into existin state's match list
-        q->first_match = m;
+  const bool m_case_sensitive;
+  const TrieStatePtr m_root;
+  size_t m_max_length;
 
-        return new_q;
-      }
+public:
 
-    // Iterate the matches at state %s looking for the first match
-    // containing %c.
-    TrieMatchPtr find_match_at_state (const TrieStatePtr & s, gunichar c) const
-      {
-        TrieMatchPtr m = s->first_match;
+  TrieTree(bool case_sensitive)
+    : m_case_sensitive(case_sensitive)
+    , m_root(new TrieState('\0', -1, TrieStatePtr()))
+    , m_max_length(0)
+  {
+  }
 
-        while (m && (m->value != c)) {
-          m = m->next;
-        }
+  void add_keyword(const Glib::ustring & keyword, const value_t & pattern_id)
+  {
+    TrieStatePtr current_state = m_root;
 
-        return m;
-      }
-  public:
-    /*
-     * final = empty set
-     * FOR p = 1 TO #pat
-     *   q = root
-     *   FOR j = 1 TO m[p]
-     *     IF g(q, pat[p][j]) == null
-     *       insert(q, pat[p][j])
-     *     ENDIF
-     *     q = g(q, pat[p][j])
-     *   ENDFOR
-     *   final = union(final, q)
-     * ENDFOR
-     */
-    void add_keyword(const Glib::ustring & needle, const value_t & pattern_id)
-      {
-        TrieStatePtr q = m_root;
-        int depth = 0;
-
-        // Step 1: add the pattern to the trie...
-
-        for (size_t idx = 0; idx < needle.size(); idx++) {
-          gunichar c = needle [idx];
-          if (!m_case_sensitive)
-            c = g_unichar_tolower(c);
-
-          TrieMatchPtr m = find_match_at_state (q, c);
-          if (m == NULL) {
-            q = insert_match_at_state (depth, q, c);
-          }
-          else {
-            q = m->state;
-          }
-
-          depth++;
-        }
-        q->final = depth;
-        q->id = pattern_id;
-
-        // Step 2: compute failure graph...
-
-        for (size_t idx = 0; idx < m_fail_states.size(); idx++) {
-          q = m_fail_states[idx];
-
-          while (q) {
-            TrieMatchPtr m = q->first_match;
-
-            while (m) {
-              TrieStatePtr q1 = m->state;
-              TrieStatePtr r = q->fail;
-              TrieMatchPtr n;
-
-              while (r) {
-                n = find_match_at_state (r, m->value);
-                if (n == NULL) {
-                  r = r->fail;
-                }
-                else {
-                  break;
-                }
-              }
-
-              if (r && n) {
-                q1->fail = n->state;
-
-                if (q1->fail->final > q1->final)
-                  q1->final = q1->fail->final;
-              } 
-              else {
-                n = find_match_at_state (m_root, m->value);
-                if (n == NULL)
-                  q1->fail = m_root;
-                else
-                  q1->fail = n->state;
-              }
-
-              m = m->next;
-            }
-
-            q = q->next;
-          }
-        }
-        // Update max_length
-        m_max_length = std::max(m_max_length, needle.size());
+    for (Glib::ustring::size_type i = 0; i < keyword.size(); i++) {
+      gunichar c = keyword[i];
+      if (!m_case_sensitive)
+        c = Glib::Unicode::tolower(c);
+
+      TrieStatePtr target_state = find_state_transition(current_state, c);
+      if (0 == target_state) {
+        target_state = TrieStatePtr(new TrieState(c, i, m_root));
+        current_state->transitions().push_front(target_state);
       }
 
-    /*
-     * Aho-Corasick
-     *
-     * q = root
-     * FOR i = 1 TO n
-     *   WHILE q != fail AND g(q, text[i]) == fail
-     *     q = h(q)
-     *   ENDWHILE
-     *   IF q == fail
-     *     q = root
-     *   ELSE
-     *     q = g(q, text[i])
-     *   ENDIF
-     *   IF isElement(q, final)
-     *     RETURN TRUE
-     *   ENDIF
-     * ENDFOR
-     * RETURN FALSE
-     */
-    HitListPtr find_matches(const Glib::ustring & haystack)
-      {
-        HitListPtr matches(new HitList());
-        TrieStatePtr q = m_root;
-        TrieMatchPtr m;
-        size_t idx = 0, start_idx = 0, last_idx = 0;
-        while (idx < haystack.size()) {
-          gunichar c = haystack [idx++];
-          if (!m_case_sensitive)
-            c = g_unichar_tolower(c);
-
-          while (q) {
-            m = find_match_at_state (q, c);
-            if (m == NULL) {
-              q = q->fail;
-            }
-            else {
-              break;
-            }
-          }
-
-          if (q == m_root) {
-            start_idx = last_idx;
-          }
-
-          if (q == NULL) {
-            q = m_root;
-            start_idx = idx;
-          } 
-          else if (m) {
-            q = m->state;
-
-            // Got a match!
-            if (q->final != 0) {
-              std::string key = sharp::string_substring (haystack, start_idx,
-                                                         idx - start_idx);
-              matches->push_back(new triehit_t (start_idx, idx, key, q->id));
-            }
-          }
-
-          last_idx = idx;
+      current_state = target_state;
+    }
+
+    current_state->payload(pattern_id);
+    current_state->payload_present(true);
+    m_max_length = std::max(m_max_length, keyword.size());
+  }
+
+  void compute_failure_graph()
+  {
+    // Failure state is computed breadth-first (-> Queue)
+    TrieStateQueue state_queue;
+
+    // For each direct child of the root state
+    // * Set the fail state to the root state
+    // * Enqueue the state for failure graph computing
+    for (typename TrieStateList::iterator iter = m_root->transitions().begin();
+         m_root->transitions().end() != iter; iter++) {
+      TrieStatePtr & transition = *iter;
+      transition->fail_state(m_root);
+      state_queue.push(transition);
+    }
+
+    while (false == state_queue.empty()) {
+      // Current state already has a valid fail state at this point
+      TrieStatePtr current_state = state_queue.front();
+      state_queue.pop();
+
+      for (typename TrieStateList::iterator iter
+             = current_state->transitions().begin();
+           current_state->transitions().end() != iter; iter++) {
+        TrieStatePtr & transition = *iter;
+        state_queue.push(transition);
+
+        TrieStatePtr fail_state = current_state->fail_state();
+        while ((0 != fail_state)
+               && 0 == find_state_transition(fail_state, transition->value())) {
+          fail_state = fail_state->fail_state();
         }
-        return matches;
+
+        if (0 == fail_state)
+          transition->fail_state(m_root);
+        else
+          transition->fail_state(find_state_transition(fail_state, transition->value()));
       }
+    }
+  }
 
-    value_t lookup (const std::string & key)
-      {
-        HitListPtr matches(find_matches (key));
-        typename HitList::const_iterator iter;
-        for (iter = matches->begin(); iter != matches->end(); ++iter) {
-          if ((*iter)->key.size() == key.size()) {
-            return (*iter)->value;
-          }
-        }
-        return NULL;
+  static TrieStatePtr find_state_transition(const TrieStatePtr & state,
+                                            gunichar value)
+  {
+    if (true == state->transitions().empty())
+      return TrieStatePtr();
+
+    for (typename TrieStateList::const_iterator iter
+           = state->transitions().begin();
+         state->transitions().end() != iter; iter++) {
+      const TrieStatePtr & transition = *iter;
+      if (transition->value() == value)
+        return transition;
+
+    }
+
+    return TrieStatePtr();
+  }
+
+  typename TrieHit<value_t>::ListPtr find_matches (const Glib::ustring & haystack)
+  {
+    TrieStatePtr current_state = m_root;
+    typename TrieHit<value_t>::ListPtr matches(
+      new typename TrieHit<value_t>::List());
+    int start_index = 0;
+
+    for (Glib::ustring::size_type i = 0; i < haystack.size(); i++) {
+      gunichar c = haystack[i];
+      if (!m_case_sensitive)
+        c = Glib::Unicode::tolower(c);
+
+      if (current_state == m_root)
+        start_index = i;
+
+      // While there's no matching transition, follow the fail states
+      // Because we're potentially changing the depths (aka length of
+      // matched characters) in the tree we're updating the start_index
+      // accordingly
+      while ((current_state != m_root)
+             && 0 == find_state_transition(current_state, c)) {
+        TrieStatePtr old_state = current_state;
+        current_state = current_state->fail_state();
+        start_index += old_state->depth() - current_state->depth();
       }
 
-    size_t max_length() const
-      {
-        return m_max_length;
+      current_state = find_state_transition (current_state, c);
+      if (0 == current_state)
+        current_state = m_root;
+
+      // If the state contains a payload: We've got a hit
+      // Return a TrieHit with the start and end index, the matched
+      // string and the payload object
+      if (current_state->payload_present()) {
+        int hit_length = i - start_index + 1;
+        typename TrieHit<value_t>::Ptr hit(
+          new TrieHit<value_t>(start_index,
+                               start_index + hit_length,
+                               haystack.substr(start_index, hit_length),
+                               current_state->payload()));
+        matches->push_back(hit);
       }
-  private:
-    TrieStatePtr            m_root;
-    std::vector<TrieStatePtr> m_fail_states;
-    bool                    m_case_sensitive;
-    size_t                  m_max_length;
+    }
 
-  };
+    return matches;
+  }
+
+  size_t max_length() const
+  {
+    return m_max_length;
+  }
+
+};
 
 }
 
diff --git a/src/triehit.hpp b/src/triehit.hpp
index a7a6eff..fd63462 100644
--- a/src/triehit.hpp
+++ b/src/triehit.hpp
@@ -1,6 +1,7 @@
 /*
  * gnote
  *
+ * Copyright (C) 2011 Debarshi Ray
  * Copyright (C) 2009 Hubert Figuiere
  *
  * This program is free software: you can redistribute it and/or modify
@@ -17,50 +18,59 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-
-
 #ifndef __TRIEHIT_HPP_
 #define __TRIEHIT_HPP_
 
 #include <list>
-#include <string>
 #include <tr1/memory>
 
+#include <glibmm.h>
+
 namespace gnote {
 
-  template<class value_t>
-  class TrieHitList
-    : public std::list<value_t*>
+template<class value_t>
+class TrieHit
+{
+public:
+  typedef std::tr1::shared_ptr<TrieHit> Ptr;
+  typedef std::list<Ptr> List;
+  typedef std::tr1::shared_ptr<List> ListPtr;
+
+  TrieHit(int s, int e, const Glib::ustring & k, const value_t & v)
+    : m_start(s)
+    , m_end(e)
+    , m_key(k)
+    , m_value(v)
+    {
+    }
+
+  int start() const
+  {
+    return m_start;
+  }
+
+  int end() const
   {
-  public:
-    ~TrieHitList()
-      {
-        typename TrieHitList::iterator iter;
-        for(iter = this->begin(); iter != this->end(); ++iter) {
-          delete *iter;
-        }
-      }
-  };
+    return m_end;
+  }
 
+  Glib::ustring key() const
+  {
+    return m_key;
+  }
 
-  template<class value_t>
-  class TrieHit
+  value_t value() const
   {
-  public:
-    typedef TrieHitList<TrieHit<value_t> > List;
-    typedef std::tr1::shared_ptr<List>      ListPtr;
-    
-    int         start;
-    int         end;
-    std::string key;
-    value_t     value;
+    return m_value;
+  }
 
-    TrieHit(int s, int e, const std::string & k, const value_t & v)
-      : start(s), end(e), key(k), value(v)
-      {
-      }
-  };
+private:
 
+  int           m_start;
+  int           m_end;
+  Glib::ustring m_key;
+  value_t       m_value;
+};
 
 }
 
diff --git a/src/watchers.cpp b/src/watchers.cpp
index ef95779..f807912 100644
--- a/src/watchers.cpp
+++ b/src/watchers.cpp
@@ -762,20 +762,22 @@ namespace gnote {
   {
     // Some of these checks should be replaced with fixes to
     // TitleTrie.FindMatches, probably.
-    if (hit.value.expired()) {
-      DBG_OUT("DoHighlight: null pointer error for '%s'." , hit.key.c_str());
+    if (hit.value().expired()) {
+      DBG_OUT("DoHighlight: null pointer error for '%s'." , hit.key().c_str());
       return;
     }
       
-    if (!manager().find(hit.key)) {
-      DBG_OUT ("DoHighlight: '%s' links to non-existing note." , hit.key.c_str());
+    if (!manager().find(hit.key())) {
+      DBG_OUT ("DoHighlight: '%s' links to non-existing note." ,
+               hit.key().c_str());
       return;
     }
       
-    Note::Ptr hit_note(hit.value);
+    Note::Ptr hit_note(hit.value());
 
-    if (sharp::string_to_lower(hit.key) != sharp::string_to_lower(hit_note->get_title())) { // == 0 if same string  
-      DBG_OUT ("DoHighlight: '%s' links wrongly to note '%s'." , hit.key.c_str(), 
+    if (sharp::string_to_lower(hit.key()) != sharp::string_to_lower(hit_note->get_title())) { // == 0 if same string
+      DBG_OUT ("DoHighlight: '%s' links wrongly to note '%s'." ,
+               hit.key().c_str(),
                hit_note->get_title().c_str());
       return;
     }
@@ -784,10 +786,10 @@ namespace gnote {
       return;
 
     Gtk::TextIter title_start = start;
-    title_start.forward_chars (hit.start);
+    title_start.forward_chars (hit.start());
 
     Gtk::TextIter title_end = start;
-    title_end.forward_chars (hit.end);
+    title_end.forward_chars (hit.end());
 
     // Only link against whole words/phrases
     if ((!title_start.starts_word () && !title_start.starts_sentence ()) ||
@@ -801,7 +803,7 @@ namespace gnote {
     }
 
     DBG_OUT ("Matching Note title '%s' at %d-%d...",
-             hit.key.c_str(), hit.start, hit.end);
+             hit.key().c_str(), hit.start(), hit.end());
 
     get_buffer()->remove_tag (m_broken_link_tag, title_start, title_end);
     get_buffer()->apply_tag (m_link_tag, title_start, title_end);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]