[tomboy] Rewrite Trie.cs to fix bug 584362 and move to a cleaner code base



commit 924543acef3a495f1326de02a95d83fe11a3e4d2
Author: Benjamin Podszun <benjamin podszun gmail com>
Date:   Mon Jul 4 22:47:00 2011 +0300

    Rewrite Trie.cs to fix bug 584362 and move to a cleaner code base
    
    The previous implementation failed to track the start of a match correctly if
    fail states lead to a match. This resulted in a wrong substring being returned
    as "Key" and wrong indices to select the relevant substring in the haystack.
    
    In addition the previous implementation created the failure graph on every
    keyword insert. This patch does this once, after all note titles are inserted.
    On my old, dusty laptop this improves startup performance by ~2.5 seconds..
    
    Signed-off-by: Benjamin Podszun <benjamin podszun gmail com>

 Tomboy/NoteManager.cs |    2 +
 Tomboy/Trie.cs        |  333 +++++++++++++++---------------------------------
 2 files changed, 106 insertions(+), 229 deletions(-)
---
diff --git a/Tomboy/NoteManager.cs b/Tomboy/NoteManager.cs
index 33d5ab5..000841e 100644
--- a/Tomboy/NoteManager.cs
+++ b/Tomboy/NoteManager.cs
@@ -764,6 +764,8 @@ Ciao!");
 			foreach (Note note in manager.Notes) {
 				title_trie.AddKeyword (note.Title, note);
 			}
+
+			title_trie.ComputeFailureGraph ();
 		}
 
 		public TrieTree TitleTrie
diff --git a/Tomboy/Trie.cs b/Tomboy/Trie.cs
index b3a9247..eca3118 100644
--- a/Tomboy/Trie.cs
+++ b/Tomboy/Trie.cs
@@ -1,23 +1,14 @@
-
-//
-//  Trie.cs: An efficient exact string set matcher, using Aho-Corasick.  Used to
-//  identify substrings that match Tomboy note titles as the user types.
-//
-//  To test, compile with:
-//     mcs -g -o testtrie.exe -define:TEST Trie.cs
-//
-
-using System;
+ïusing System;
 using System.Collections.Generic;
 
 namespace Tomboy
 {
 	public class TrieHit
 	{
-		public int    Start;
-		public int    End;
-		public string Key;
-		public object Value;
+		public int Start { get; private set; }
+		public int End { get; private set; }
+		public string Key { get; private set; }
+		public object Value { get; private set; }
 
 		public TrieHit (int start, int end, string key, object value)
 		{
@@ -32,257 +23,141 @@ namespace Tomboy
 	{
 		class TrieState
 		{
-			public TrieState Next;
-			public TrieState Fail;
-			public TrieMatch FirstMatch;
-			public int       Final;
-			public object    Id;
-		}
+			public TrieState (char value, int depth, TrieState fail_state)
+			{
+				Value = value;
+				Depth = depth;
+				FailState = fail_state;
+				Transitions = new LinkedList<TrieState> ();
+			}
 
-		class TrieMatch
-		{
-			public TrieMatch Next;
-			public TrieState State;
-			public char      Value;
+			public char Value { get; private set; }
+			public int Depth { get; private set; }
+			public TrieState FailState { get; set; }
+			public LinkedList<TrieState> Transitions { get; private set; }
+
+			public object Payload { get; set; }
 		}
 
-		TrieState root;
-		List<TrieState> fail_states;
-		bool      case_sensitive;
-		int       max_length;
+		readonly bool case_sensitive;
+		readonly TrieState root;
 
 		public TrieTree (bool case_sensitive)
 		{
 			this.case_sensitive = case_sensitive;
-
-			root = new TrieState ();
-			fail_states = new List<TrieState> (8);
+			root = new TrieState ('\0', -1, null);
 		}
 
-		TrieState InsertMatchAtState (int depth, TrieState q, char c)
-		{
-			// Create a new state with a failure at %root
-			TrieState new_q = new TrieState ();
-			new_q.Fail = root;
+		public int MaxLength { get; private set; }
 
-			// Insert/Replace into fail_states at %depth
-			if (depth < fail_states.Count) {
-				new_q.Next = fail_states [depth];
-				fail_states [depth] = new_q;
-			} else {
-				fail_states.Insert (depth, new_q);
+		public void AddKeyword (string keyword, object pattern_id)
+		{
+			if (!case_sensitive) {
+				keyword = keyword.ToLower ();
 			}
 
-			// New match points to the newly created state for value %c
-			TrieMatch m = new TrieMatch ();
-			m.Next = q.FirstMatch;
-			m.State = new_q;
-			m.Value = c;
-
-			// Insert the new match into existin state's match list
-			q.FirstMatch = m;
-
-			return new_q;
-		}
-
-		// Iterate the matches at state %s looking for the first match
-		// containing %c.
-		TrieMatch FindMatchAtState (TrieState s, char c)
-		{
-			TrieMatch m = s.FirstMatch;
+			var currentState = root;
+			for (int i = 0; i < keyword.Length; i++) {
+				char c = keyword[i];
+				var targetState = FindStateTransition (currentState, c);
+				if (targetState == null) {
+					targetState = new TrieState (c, i, root);
+					currentState.Transitions.AddFirst (targetState);
+				}
 
-			while (m != null && m.Value != c)
-				m = m.Next;
+				currentState = targetState;
+			}
+			currentState.Payload = pattern_id;
 
-			return m;
+			MaxLength = Math.Max (MaxLength, keyword.Length);
 		}
 
-		/*
-		 * final = empty set
-		 * FOR p = 1 TO #pat
-		 *   q = root
-		 *   FOR j = 1 TO m[p]
-		 *     IF g(q, pat[p][j]) == null
-		 *       insert(q, pat[p][j])
-		 *     ENDIF
-		 *     q = g(q, pat[p][j])
-		 *   ENDFOR
-		 *   final = union(final, q)
-		 * ENDFOR
-		 */
-		public void AddKeyword (string needle, object pattern_id)
+		public void ComputeFailureGraph ()
 		{
-			TrieState q = root;
-			int depth = 0;
-
-			// Step 1: add the pattern to the trie...
-
-			for (int idx = 0; idx < needle.Length; idx++) {
-				char c = needle [idx];
-				if (!case_sensitive)
-					c = Char.ToLower (c);
-
-				TrieMatch m = FindMatchAtState (q, c);
-				if (m == null)
-					q = InsertMatchAtState (depth, q, c);
-				else
-					q = m.State;
-
-				depth++;
+			// Failure state is computed breadth-first (-> Queue)
+			var state_queue = new Queue<TrieState> ();
+
+			// For each direct child of the root state
+			// * Set the fail state to the root state
+			// * Enqueue the state for failure graph computing
+			foreach (var transition in root.Transitions) {
+				transition.FailState = root;
+				state_queue.Enqueue (transition);
 			}
-
-			q.Final = depth;
-			q.Id = pattern_id;
-
-			// Step 2: compute failure graph...
-
-			for (int idx = 0; idx < fail_states.Count; idx++) {
-				q = fail_states [idx];
-
-				while (q != null) {
-					TrieMatch m = q.FirstMatch;
-
-					while (m != null) {
-						TrieState q1 = m.State;
-						TrieState r = q.Fail;
-						TrieMatch n = null;
-
-						while (r != null) {
-							n = FindMatchAtState (r, m.Value);
-							if (n == null)
-								r = r.Fail;
-							else
-								break;
-						}
-
-						if (r != null && n != null) {
-							q1.Fail = n.State;
-
-							if (q1.Fail.Final > q1.Final)
-								q1.Final = q1.Fail.Final;
-						} else {
-							n = FindMatchAtState (root, m.Value);
-							if (n == null)
-								q1.Fail = root;
-							else
-								q1.Fail = n.State;
-						}
-
-						m = m.Next;
+			
+			while (state_queue.Count > 0) {
+				// Current state already has a valid fail state at this point
+				var current_state = state_queue.Dequeue ();
+				foreach (var transition in current_state.Transitions) {
+					state_queue.Enqueue (transition);
+
+					var failState = current_state.FailState;
+					while ((failState != null) && FindStateTransition (failState, transition.Value) == null) {
+						failState = failState.FailState;
 					}
 
-					q = q.Next;
+					if (failState == null) {
+						transition.FailState = root;
+					} else {
+						transition.FailState = FindStateTransition (failState, transition.Value);   
+					}                    
 				}
 			}
-
-			// Update max_length
-			max_length = Math.Max (max_length, needle.Length);
 		}
 
-		/*
-		 * Aho-Corasick
-		 *
-		 * q = root
-		 * FOR i = 1 TO n
-		 *   WHILE q != fail AND g(q, text[i]) == fail
-		 *     q = h(q)
-		 *   ENDWHILE
-		 *   IF q == fail
-		 *     q = root
-		 *   ELSE
-		 *     q = g(q, text[i])
-		 *   ENDIF
-		 *   IF isElement(q, final)
-		 *     RETURN TRUE
-		 *   ENDIF
-		 * ENDFOR
-		 * RETURN FALSE
-		 */
-		public IList<TrieHit> FindMatches (string haystack)
+		static TrieState FindStateTransition (TrieState state, char value)
 		{
-			List<TrieHit> matches = new List<TrieHit> ();
-			TrieState q = root;
-			TrieMatch m = null;
-			int idx = 0, start_idx = 0, last_idx = 0;
-
-			while (idx < haystack.Length) {
-				char c = haystack [idx++];
-				if (!case_sensitive)
-					c = Char.ToLower (c);
-
-				while (q != null) {
-					m = FindMatchAtState (q, c);
-					if (m == null)
-						q = q.Fail;
-					else
-						break;
-				}
-
-				if (q == root)
-					start_idx = last_idx;
-
-				if (q == null) {
-					q = root;
-					start_idx = idx;
-				} else if (m != null) {
-					q = m.State;
-
-					// Got a match!
-					if (q.Final != 0) {
-						string key = haystack.Substring (start_idx,
-						                                 idx - start_idx);
-						TrieHit hit =
-						        new TrieHit (start_idx, idx, key, q.Id);
-						matches.Add (hit);
-					}
-				}
-
-				last_idx = idx;
+			if (state.Transitions == null)
+			{
+				return null;
 			}
-
-			return matches;
-		}
-
-		public object Lookup (string key)
-		{
-			foreach (TrieHit hit in FindMatches (key)) {
-				if (hit.Key.Length == key.Length)
-					return hit.Value;
+			foreach (var transition in state.Transitions) {
+				if (transition.Value == value) {
+					return transition;
+				}
 			}
 			return null;
 		}
 
-		public int MaxLength
+		public IList<TrieHit> FindMatches (string haystack)
 		{
-			get {
-				return max_length;
+			if (!case_sensitive) {
+				haystack = haystack.ToLower ();
 			}
-		}
-	}
 
-	#if TEST
-	public class Tester
-	{
-		public static void Main (string [] args)
-		{
-			string src = "bazar this is some foo, bar, and baz bazbarfoofoo bazbazarbaz end bazar";
-			Logger.Log ("Searching in '{0}':", src);
+			var current_state = root;
 
-			TrieTree trie = new TrieTree (false);
-			trie.AddKeyword ("foo", "foo");
-			trie.AddKeyword ("bar", "bar");
-			trie.AddKeyword ("baz", "baz");
-			trie.AddKeyword ("bazar", "bazar");
+			var matches = new List<TrieHit> ();
+			int start_index = 0;
+			for (int i = 0; i < haystack.Length; i++) {
+				var c = haystack [i];
+
+				if (current_state == root) {
+					start_index = i;
+				}
 
-			Logger.Log ("Starting search...");
-			foreach (TrieHit hit in trie.FindMatches (src)) {
-				Logger.Log ("*** Match: '{0}' at {1}-{2}",
-				            hit.Key,
-				            hit.Start,
-				            hit.End);
+				// While there's no matching transition, follow the fail states
+				// Because we're potentially changing the depths (aka length of 
+				// matched characters) in the tree we're updating the start_index
+				// accordingly
+				while ((current_state != root) && FindStateTransition (current_state, c) == null) {
+					var old_state = current_state;                    
+					current_state = current_state.FailState;
+					start_index += old_state.Depth - current_state.Depth;
+				}
+				current_state = FindStateTransition (current_state, c) ?? root;
+
+				// If the state contains a payload: We've got a hit
+				// Return a TrieHit with the start and end index, the matched 
+				// string and the payload object
+				if (current_state.Payload != null) {
+					var hit_length = i - start_index + 1;
+					matches.Add(
+						new TrieHit (start_index, start_index + hit_length, 
+							haystack.Substring (start_index, hit_length), current_state.Payload));
+				}
 			}
-			Logger.Log ("Search finished!");
+			return matches;
 		}
 	}
-	#endif
 }



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]