[gnote/stable-0.7] Port the new trie implementation from Tomboy
- From: Aurimas Äernius <aurimasc src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnote/stable-0.7] Port the new trie implementation from Tomboy
- Date: Sat, 15 Oct 2011 19:21:57 +0000 (UTC)
commit 548b0832e1850e4473608b4afbf14e12d3c488c3
Author: Aurimas Äernius <aurisc4 gmail com>
Date: Sun Oct 2 22:32:38 2011 +0300
Port the new trie implementation from Tomboy
Tomboy bug: https://bugzilla.gnome.org/584362
Fixes: https://bugzilla.gnome.org/660653
Thanks Debarshi Ray.
src/notemanager.cpp | 1 +
src/test/trietest.cpp | 25 ++--
src/trie.hpp | 457 ++++++++++++++++++++++---------------------------
src/triehit.hpp | 70 +++++----
src/watchers.cpp | 22 ++-
5 files changed, 272 insertions(+), 303 deletions(-)
---
diff --git a/src/notemanager.cpp b/src/notemanager.cpp
index d8c6ec9..a5d3a7e 100644
--- a/src/notemanager.cpp
+++ b/src/notemanager.cpp
@@ -776,6 +776,7 @@ namespace gnote {
const Note::Ptr & note(*iter);
m_title_trie->add_keyword (note->get_title(), note);
}
+ m_title_trie->compute_failure_graph();
}
}
diff --git a/src/test/trietest.cpp b/src/test/trietest.cpp
index 195e30b..e994770 100644
--- a/src/test/trietest.cpp
+++ b/src/test/trietest.cpp
@@ -18,29 +18,30 @@ int test_main(int /*argc*/, char ** /*argv*/)
trie.add_keyword ("baz", "baz");
trie.add_keyword ("bazar", "bazar");
trie.add_keyword ("ÄÄÄÄÄÅÅÅÅ", "ÄÄÄÄÄÅÅÅÅ");
+ trie.compute_failure_graph();
printf ("Starting search...\n");
- gnote::TrieTree<std::string>::HitListPtr matches(trie.find_matches (src));
+ gnote::TrieHit<std::string>::ListPtr matches(trie.find_matches (src));
BOOST_CHECK( matches.get() );
-
+ printf ("Matches size\n");
BOOST_CHECK( matches->size() == 16 );
- gnote::TrieTree<std::string>::HitList::const_iterator iter = matches->begin();
+ gnote::TrieHit<std::string>::List::const_iterator iter = matches->begin();
BOOST_CHECK( *iter );
- BOOST_CHECK( (*iter)->key == "baz" );
- BOOST_CHECK( (*iter)->start == 0 );
- BOOST_CHECK( (*iter)->end == 3 );
+ BOOST_CHECK( (*iter)->key() == "baz" );
+ BOOST_CHECK( (*iter)->start() == 0 );
+ BOOST_CHECK( (*iter)->end() == 3 );
++iter;
BOOST_CHECK( *iter );
- BOOST_CHECK( (*iter)->key == "bazar" );
- BOOST_CHECK( (*iter)->start == 0 );
- BOOST_CHECK( (*iter)->end == 5 );
+ BOOST_CHECK( (*iter)->key() == "bazar" );
+ BOOST_CHECK( (*iter)->start() == 0 );
+ BOOST_CHECK( (*iter)->end() == 5 );
- for(gnote::TrieTree<std::string>::HitList::const_iterator hit_iter = matches->begin();
+ for(gnote::TrieHit<std::string>::List::const_iterator hit_iter = matches->begin();
hit_iter != matches->end(); ++hit_iter) {
- gnote::TrieHit<std::string> *hit(*hit_iter);
+ gnote::TrieHit<std::string>::Ptr hit(*hit_iter);
printf ("*** Match: '%s' at %d-%d\n",
- hit->key.c_str(), hit->start, hit->end);
+ hit->key().c_str(), hit->start(), hit->end());
}
printf ("Search finished!\n");
return 0;
diff --git a/src/trie.hpp b/src/trie.hpp
index f457ce4..c50b7c2 100644
--- a/src/trie.hpp
+++ b/src/trie.hpp
@@ -1,6 +1,7 @@
/*
* gnote
*
+ * Copyright (C) 2011 Debarshi Ray
* Copyright (C) 2009 Hubert Figuiere
*
* This program is free software: you can redistribute it and/or modify
@@ -17,292 +18,246 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-
-
-
#ifndef __TRIE_HPP_
#define __TRIE_HPP_
-#include <ctype.h>
-
-#include <string>
+#include <list>
+#include <queue>
#include <tr1/memory>
-#include "sharp/string.hpp"
+#include <glibmm.h>
-#include "note.hpp"
#include "triehit.hpp"
namespace gnote {
+template<class value_t>
+class TrieTree
+{
- template<class value_t>
- class TrieTree
+private:
+
+ class TrieState;
+ typedef std::tr1::shared_ptr<TrieState> TrieStatePtr;
+ typedef std::list<TrieStatePtr> TrieStateList;
+ typedef std::queue<TrieStatePtr> TrieStateQueue;
+
+ class TrieState
{
- class TrieMatch;
- typedef std::tr1::shared_ptr<TrieMatch> TrieMatchPtr;
- class TrieState;
- typedef std::tr1::shared_ptr<TrieState> TrieStatePtr;
+ public:
- class TrieState
+ TrieState(gunichar v, int d, const TrieStatePtr & s)
+ : m_value(v)
+ , m_depth(d)
+ , m_fail_state(s)
+ , m_transitions()
+ , m_payload()
+ , m_payload_present(false)
{
- public:
- TrieState()
- : final(0)
- {
- }
- TrieStatePtr next;
- TrieStatePtr fail;
- TrieMatchPtr first_match;
- int final;
- value_t id;
- };
-
- class TrieMatch
+ }
+
+ gunichar value() const
{
- public:
- TrieMatch()
- : value(0)
- {
- }
- TrieMatchPtr next;
- TrieStatePtr state;
- gunichar value;
- };
+ return m_value;
+ }
- public:
- typedef TrieHit<value_t> triehit_t;
-
- typedef typename triehit_t::List HitList;
- typedef typename triehit_t::ListPtr HitListPtr;
-
- TrieTree(bool case_sensitive)
- : m_root(new TrieState())
- , m_fail_states(8)
- , m_case_sensitive(case_sensitive)
- , m_max_length(0)
- {
- }
+ int depth() const
+ {
+ return m_depth;
+ }
- ~TrieTree()
- {
+ TrieStatePtr fail_state()
+ {
+ return m_fail_state;
+ }
- }
+ void fail_state(const TrieStatePtr & s)
+ {
+ m_fail_state = s;
+ }
+
+ TrieStateList & transitions()
+ {
+ return m_transitions;
+ }
+
+ value_t payload() const
+ {
+ return m_payload;
+ }
+
+ void payload(const value_t & p)
+ {
+ m_payload = p;
+ }
+
+ bool payload_present() const
+ {
+ return m_payload_present;
+ }
+
+ void payload_present(bool pp)
+ {
+ m_payload_present = pp;
+ }
private:
- TrieStatePtr insert_match_at_state(size_t depth, const TrieStatePtr & q, gunichar c)
- {
- // Create a new state with a failure at %root
- TrieStatePtr new_q(new TrieState ());
- new_q->fail = m_root;
-
- // Insert/Replace into fail_states at %depth
- if (depth < m_fail_states.size()) {
- new_q->next = m_fail_states[depth];
- m_fail_states[depth] = new_q;
- }
- else {
- m_fail_states.resize(depth + 1);
- m_fail_states[depth] = new_q;
- }
- // New match points to the newly created state for value %c
- TrieMatchPtr m(new TrieMatch ());
- m->next = q->first_match;
- m->state = new_q;
- m->value = c;
+ gunichar m_value;
+ int m_depth;
+ TrieStatePtr m_fail_state;
+ TrieStateList m_transitions;
+ value_t m_payload;
+ bool m_payload_present;
+ };
- // Insert the new match into existin state's match list
- q->first_match = m;
+ const bool m_case_sensitive;
+ const TrieStatePtr m_root;
+ size_t m_max_length;
- return new_q;
- }
+public:
- // Iterate the matches at state %s looking for the first match
- // containing %c.
- TrieMatchPtr find_match_at_state (const TrieStatePtr & s, gunichar c) const
- {
- TrieMatchPtr m = s->first_match;
+ TrieTree(bool case_sensitive)
+ : m_case_sensitive(case_sensitive)
+ , m_root(new TrieState('\0', -1, TrieStatePtr()))
+ , m_max_length(0)
+ {
+ }
- while (m && (m->value != c)) {
- m = m->next;
- }
+ void add_keyword(const Glib::ustring & keyword, const value_t & pattern_id)
+ {
+ TrieStatePtr current_state = m_root;
- return m;
- }
- public:
- /*
- * final = empty set
- * FOR p = 1 TO #pat
- * q = root
- * FOR j = 1 TO m[p]
- * IF g(q, pat[p][j]) == null
- * insert(q, pat[p][j])
- * ENDIF
- * q = g(q, pat[p][j])
- * ENDFOR
- * final = union(final, q)
- * ENDFOR
- */
- void add_keyword(const Glib::ustring & needle, const value_t & pattern_id)
- {
- TrieStatePtr q = m_root;
- int depth = 0;
-
- // Step 1: add the pattern to the trie...
-
- for (size_t idx = 0; idx < needle.size(); idx++) {
- gunichar c = needle [idx];
- if (!m_case_sensitive)
- c = g_unichar_tolower(c);
-
- TrieMatchPtr m = find_match_at_state (q, c);
- if (m == NULL) {
- q = insert_match_at_state (depth, q, c);
- }
- else {
- q = m->state;
- }
-
- depth++;
- }
- q->final = depth;
- q->id = pattern_id;
-
- // Step 2: compute failure graph...
-
- for (size_t idx = 0; idx < m_fail_states.size(); idx++) {
- q = m_fail_states[idx];
-
- while (q) {
- TrieMatchPtr m = q->first_match;
-
- while (m) {
- TrieStatePtr q1 = m->state;
- TrieStatePtr r = q->fail;
- TrieMatchPtr n;
-
- while (r) {
- n = find_match_at_state (r, m->value);
- if (n == NULL) {
- r = r->fail;
- }
- else {
- break;
- }
- }
-
- if (r && n) {
- q1->fail = n->state;
-
- if (q1->fail->final > q1->final)
- q1->final = q1->fail->final;
- }
- else {
- n = find_match_at_state (m_root, m->value);
- if (n == NULL)
- q1->fail = m_root;
- else
- q1->fail = n->state;
- }
-
- m = m->next;
- }
-
- q = q->next;
- }
- }
- // Update max_length
- m_max_length = std::max(m_max_length, needle.size());
+ for (Glib::ustring::size_type i = 0; i < keyword.size(); i++) {
+ gunichar c = keyword[i];
+ if (!m_case_sensitive)
+ c = Glib::Unicode::tolower(c);
+
+ TrieStatePtr target_state = find_state_transition(current_state, c);
+ if (0 == target_state) {
+ target_state = TrieStatePtr(new TrieState(c, i, m_root));
+ current_state->transitions().push_front(target_state);
}
- /*
- * Aho-Corasick
- *
- * q = root
- * FOR i = 1 TO n
- * WHILE q != fail AND g(q, text[i]) == fail
- * q = h(q)
- * ENDWHILE
- * IF q == fail
- * q = root
- * ELSE
- * q = g(q, text[i])
- * ENDIF
- * IF isElement(q, final)
- * RETURN TRUE
- * ENDIF
- * ENDFOR
- * RETURN FALSE
- */
- HitListPtr find_matches(const Glib::ustring & haystack)
- {
- HitListPtr matches(new HitList());
- TrieStatePtr q = m_root;
- TrieMatchPtr m;
- size_t idx = 0, start_idx = 0, last_idx = 0;
- while (idx < haystack.size()) {
- gunichar c = haystack [idx++];
- if (!m_case_sensitive)
- c = g_unichar_tolower(c);
-
- while (q) {
- m = find_match_at_state (q, c);
- if (m == NULL) {
- q = q->fail;
- }
- else {
- break;
- }
- }
-
- if (q == m_root) {
- start_idx = last_idx;
- }
-
- if (q == NULL) {
- q = m_root;
- start_idx = idx;
- }
- else if (m) {
- q = m->state;
-
- // Got a match!
- if (q->final != 0) {
- std::string key = sharp::string_substring (haystack, start_idx,
- idx - start_idx);
- matches->push_back(new triehit_t (start_idx, idx, key, q->id));
- }
- }
-
- last_idx = idx;
+ current_state = target_state;
+ }
+
+ current_state->payload(pattern_id);
+ current_state->payload_present(true);
+ m_max_length = std::max(m_max_length, keyword.size());
+ }
+
+ void compute_failure_graph()
+ {
+ // Failure state is computed breadth-first (-> Queue)
+ TrieStateQueue state_queue;
+
+ // For each direct child of the root state
+ // * Set the fail state to the root state
+ // * Enqueue the state for failure graph computing
+ for (typename TrieStateList::iterator iter = m_root->transitions().begin();
+ m_root->transitions().end() != iter; iter++) {
+ TrieStatePtr & transition = *iter;
+ transition->fail_state(m_root);
+ state_queue.push(transition);
+ }
+
+ while (false == state_queue.empty()) {
+ // Current state already has a valid fail state at this point
+ TrieStatePtr current_state = state_queue.front();
+ state_queue.pop();
+
+ for (typename TrieStateList::iterator iter
+ = current_state->transitions().begin();
+ current_state->transitions().end() != iter; iter++) {
+ TrieStatePtr & transition = *iter;
+ state_queue.push(transition);
+
+ TrieStatePtr fail_state = current_state->fail_state();
+ while ((0 != fail_state)
+ && 0 == find_state_transition(fail_state, transition->value())) {
+ fail_state = fail_state->fail_state();
}
- return matches;
+
+ if (0 == fail_state)
+ transition->fail_state(m_root);
+ else
+ transition->fail_state(find_state_transition(fail_state, transition->value()));
}
+ }
+ }
- value_t lookup (const std::string & key)
- {
- HitListPtr matches(find_matches (key));
- typename HitList::const_iterator iter;
- for (iter = matches->begin(); iter != matches->end(); ++iter) {
- if ((*iter)->key.size() == key.size()) {
- return (*iter)->value;
- }
- }
- return NULL;
+ static TrieStatePtr find_state_transition(const TrieStatePtr & state,
+ gunichar value)
+ {
+ if (true == state->transitions().empty())
+ return TrieStatePtr();
+
+ for (typename TrieStateList::const_iterator iter
+ = state->transitions().begin();
+ state->transitions().end() != iter; iter++) {
+ const TrieStatePtr & transition = *iter;
+ if (transition->value() == value)
+ return transition;
+
+ }
+
+ return TrieStatePtr();
+ }
+
+ typename TrieHit<value_t>::ListPtr find_matches (const Glib::ustring & haystack)
+ {
+ TrieStatePtr current_state = m_root;
+ typename TrieHit<value_t>::ListPtr matches(
+ new typename TrieHit<value_t>::List());
+ int start_index = 0;
+
+ for (Glib::ustring::size_type i = 0; i < haystack.size(); i++) {
+ gunichar c = haystack[i];
+ if (!m_case_sensitive)
+ c = Glib::Unicode::tolower(c);
+
+ if (current_state == m_root)
+ start_index = i;
+
+ // While there's no matching transition, follow the fail states
+ // Because we're potentially changing the depths (aka length of
+ // matched characters) in the tree we're updating the start_index
+ // accordingly
+ while ((current_state != m_root)
+ && 0 == find_state_transition(current_state, c)) {
+ TrieStatePtr old_state = current_state;
+ current_state = current_state->fail_state();
+ start_index += old_state->depth() - current_state->depth();
}
- size_t max_length() const
- {
- return m_max_length;
+ current_state = find_state_transition (current_state, c);
+ if (0 == current_state)
+ current_state = m_root;
+
+ // If the state contains a payload: We've got a hit
+ // Return a TrieHit with the start and end index, the matched
+ // string and the payload object
+ if (current_state->payload_present()) {
+ int hit_length = i - start_index + 1;
+ typename TrieHit<value_t>::Ptr hit(
+ new TrieHit<value_t>(start_index,
+ start_index + hit_length,
+ haystack.substr(start_index, hit_length),
+ current_state->payload()));
+ matches->push_back(hit);
}
- private:
- TrieStatePtr m_root;
- std::vector<TrieStatePtr> m_fail_states;
- bool m_case_sensitive;
- size_t m_max_length;
+ }
- };
+ return matches;
+ }
+
+ size_t max_length() const
+ {
+ return m_max_length;
+ }
+
+};
}
diff --git a/src/triehit.hpp b/src/triehit.hpp
index a7a6eff..fd63462 100644
--- a/src/triehit.hpp
+++ b/src/triehit.hpp
@@ -1,6 +1,7 @@
/*
* gnote
*
+ * Copyright (C) 2011 Debarshi Ray
* Copyright (C) 2009 Hubert Figuiere
*
* This program is free software: you can redistribute it and/or modify
@@ -17,50 +18,59 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-
-
#ifndef __TRIEHIT_HPP_
#define __TRIEHIT_HPP_
#include <list>
-#include <string>
#include <tr1/memory>
+#include <glibmm.h>
+
namespace gnote {
- template<class value_t>
- class TrieHitList
- : public std::list<value_t*>
+template<class value_t>
+class TrieHit
+{
+public:
+ typedef std::tr1::shared_ptr<TrieHit> Ptr;
+ typedef std::list<Ptr> List;
+ typedef std::tr1::shared_ptr<List> ListPtr;
+
+ TrieHit(int s, int e, const Glib::ustring & k, const value_t & v)
+ : m_start(s)
+ , m_end(e)
+ , m_key(k)
+ , m_value(v)
+ {
+ }
+
+ int start() const
+ {
+ return m_start;
+ }
+
+ int end() const
{
- public:
- ~TrieHitList()
- {
- typename TrieHitList::iterator iter;
- for(iter = this->begin(); iter != this->end(); ++iter) {
- delete *iter;
- }
- }
- };
+ return m_end;
+ }
+ Glib::ustring key() const
+ {
+ return m_key;
+ }
- template<class value_t>
- class TrieHit
+ value_t value() const
{
- public:
- typedef TrieHitList<TrieHit<value_t> > List;
- typedef std::tr1::shared_ptr<List> ListPtr;
-
- int start;
- int end;
- std::string key;
- value_t value;
+ return m_value;
+ }
- TrieHit(int s, int e, const std::string & k, const value_t & v)
- : start(s), end(e), key(k), value(v)
- {
- }
- };
+private:
+ int m_start;
+ int m_end;
+ Glib::ustring m_key;
+ value_t m_value;
+};
}
diff --git a/src/watchers.cpp b/src/watchers.cpp
index 1b5076f..dc4f009 100644
--- a/src/watchers.cpp
+++ b/src/watchers.cpp
@@ -763,20 +763,22 @@ namespace gnote {
{
// Some of these checks should be replaced with fixes to
// TitleTrie.FindMatches, probably.
- if (hit.value.expired()) {
- DBG_OUT("DoHighlight: null pointer error for '%s'." , hit.key.c_str());
+ if (hit.value().expired()) {
+ DBG_OUT("DoHighlight: null pointer error for '%s'." , hit.key().c_str());
return;
}
- if (!manager().find(hit.key)) {
- DBG_OUT ("DoHighlight: '%s' links to non-existing note." , hit.key.c_str());
+ if (!manager().find(hit.key())) {
+ DBG_OUT ("DoHighlight: '%s' links to non-existing note." ,
+ hit.key().c_str());
return;
}
- Note::Ptr hit_note(hit.value);
+ Note::Ptr hit_note(hit.value());
- if (sharp::string_to_lower(hit.key) != sharp::string_to_lower(hit_note->get_title())) { // == 0 if same string
- DBG_OUT ("DoHighlight: '%s' links wrongly to note '%s'." , hit.key.c_str(),
+ if (sharp::string_to_lower(hit.key()) != sharp::string_to_lower(hit_note->get_title())) { // == 0 if same string
+ DBG_OUT ("DoHighlight: '%s' links wrongly to note '%s'." ,
+ hit.key().c_str(),
hit_note->get_title().c_str());
return;
}
@@ -785,10 +787,10 @@ namespace gnote {
return;
Gtk::TextIter title_start = start;
- title_start.forward_chars (hit.start);
+ title_start.forward_chars (hit.start());
Gtk::TextIter title_end = start;
- title_end.forward_chars (hit.end);
+ title_end.forward_chars (hit.end());
// Only link against whole words/phrases
if ((!title_start.starts_word () && !title_start.starts_sentence ()) ||
@@ -802,7 +804,7 @@ namespace gnote {
}
DBG_OUT ("Matching Note title '%s' at %d-%d...",
- hit.key.c_str(), hit.start, hit.end);
+ hit.key().c_str(), hit.start(), hit.end());
get_buffer()->remove_tag (m_broken_link_tag, title_start, title_end);
get_buffer()->apply_tag (m_link_tag, title_start, title_end);
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]