beagle r4574 - in branches/beagle-lucene2_1/beagled: . Snowball.Net/Lucene.Net/Analysis/Snowball Snowball.Net/upstream-changes



Author: dbera
Date: Sun Mar  2 18:26:36 2008
New Revision: 4574
URL: http://svn.gnome.org/viewvc/beagle?rev=4574&view=rev

Log:
Snowball stemmers are also a bit expensive to create, and by default a new stemmer is created for each field of each document. This patch reuses the stemmers. It will also let us easily hook in language-based stemmers later.


Added:
   branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/
   branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/01_reuse-stemmer.patch
Modified:
   branches/beagle-lucene2_1/beagled/LuceneCommon.cs
   branches/beagle-lucene2_1/beagled/NoiseFilter.cs
   branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs

Modified: branches/beagle-lucene2_1/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/LuceneCommon.cs	(original)
+++ branches/beagle-lucene2_1/beagled/LuceneCommon.cs	Sun Mar  2 18:26:36 2008
@@ -26,6 +26,7 @@
 
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using System.Diagnostics;
 using System.Globalization;
 using System.IO;
@@ -42,6 +43,9 @@
 using Lucene.Net.QueryParsers;
 using LNS = Lucene.Net.Search;
 
+using SF.Snowball.Ext;
+using SnowballProgram = SF.Snowball.SnowballProgram;
+
 using Beagle.Util;
 
 namespace Beagle.Daemon {
@@ -477,12 +481,12 @@
 		}
 
 		// FIXME: This assumes everything being indexed is in English!
-		internal class BeagleAnalyzer : StandardAnalyzer {
+		public class BeagleAnalyzer : StandardAnalyzer {
 
+			const string DEFAULT_STEMMER_LANGUAGE = "English";
 			private char [] buffer = new char [2];
 			private bool strip_extra_property_info = false;
 			private bool tokenize_email_hostname = false;
-			const string DEFAULT_STEMMER = "English";
 
 			public BeagleAnalyzer (bool is_indexing_analyzer)
 			{
@@ -539,7 +543,10 @@
 				    || fieldName == "PropertyText"
 				    || is_text_prop) {
 					outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
-					outstream = new SnowballFilter (outstream, DEFAULT_STEMMER);
+					// Sharing Stemmer is not thread safe.
+					// Currently our underlying lucene indexing is not done in multiple threads.
+					StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
+					outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
 				}
 
 				return outstream;
@@ -1039,17 +1046,42 @@
 		// Access to the stemmer and list of stop words
 		//
 
-		static SF.Snowball.Ext.EnglishStemmer stemmer = new SF.Snowball.Ext.EnglishStemmer ();
+		private static Dictionary<string, StemmerInfo> stemmer_table = new Dictionary<string, StemmerInfo> ();
+
+		class StemmerInfo {
+			internal SnowballProgram Stemmer;
+			internal System.Reflection.MethodInfo StemMethod;
+		}
+
+		private static StemmerInfo GetStemmer (System.String name)
+		{
+			if (! stemmer_table.ContainsKey (name)) {
+				StemmerInfo stemmer_info = new StemmerInfo ();
+
+				// Taken from Snowball/SnowballFilter.cs
+				System.Type stemClass = System.Type.GetType ("SF.Snowball.Ext." + name + "Stemmer", true);
+				SnowballProgram stemmer = (SnowballProgram) System.Activator.CreateInstance (stemClass);
+				// why doesn't the SnowballProgram class have an (abstract?) stem method?
+				System.Reflection.MethodInfo stemMethod = stemClass.GetMethod ("Stem", (new System.Type [0] == null) ? new System.Type [0] : (System.Type []) new System.Type [0]);
+
+				stemmer_info.Stemmer = stemmer;
+				stemmer_info.StemMethod = stemMethod;
+				stemmer_table [name] = stemmer_info;
+			}
+
+			return stemmer_table [name];
+		}
+
+		private static SF.Snowball.Ext.EnglishStemmer default_stemmer = new SF.Snowball.Ext.EnglishStemmer ();
 
 		static public string Stem (string str)
 		{
 			string stemmed_str;
 
-			lock (stemmer) {
-				stemmer.SetCurrent (str);
-				stemmer.Stem ();
-				stemmed_str = stemmer.GetCurrent ();
-				stemmer.SetCurrent (String.Empty);
+			lock (default_stemmer) {
+				default_stemmer.SetCurrent (str);
+				default_stemmer.Stem ();
+				stemmed_str = default_stemmer.GetCurrent ();
 			}
 
 			return stemmed_str;

Modified: branches/beagle-lucene2_1/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/NoiseFilter.cs	(original)
+++ branches/beagle-lucene2_1/beagled/NoiseFilter.cs	Sun Mar  2 18:26:36 2008
@@ -38,7 +38,7 @@
 	// 1. Removes words which are potential noise like dhyhy8ju7q9
 	// 2. Splits email addresses into meaningful tokens
 	// 3. Splits hostnames into subparts
-	class NoiseEmailHostFilter : TokenFilter {
+	public class NoiseEmailHostFilter : TokenFilter {
 			
 		private bool tokenize_email_hostname;
 

Modified: branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs	(original)
+++ branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs	Sun Mar  2 18:26:36 2008
@@ -60,7 +60,13 @@
 				throw new System.SystemException(e.ToString());
 			}
 		}
-		
+
+		public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
+		{
+			this.stemmer = stemmer;
+			this.stemMethod = stemMethod;
+		}
+
 		/// <summary>Returns the next input Token, after being stemmed </summary>
         public override Token Next()
 		{
@@ -81,5 +87,12 @@
 			newToken.SetPositionIncrement(token.GetPositionIncrement());
 			return newToken;
 		}
+
+		public override void Close()
+		{
+			// In case stemmer was shared
+			stemmer.SetCurrent(String.Empty);
+			base.Close();
+		}
 	}
-}
\ No newline at end of file
+}

Added: branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/01_reuse-stemmer.patch
==============================================================================
--- (empty file)
+++ branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/01_reuse-stemmer.patch	Sun Mar  2 18:26:36 2008
@@ -0,0 +1,38 @@
+Reuse stemmers as much as possible.
+
+From: D Bera <dbera web gmail com>
+
+Index: Lucene.Net/Analysis/Snowball/SnowballFilter.cs
+===================================================================
+--- Lucene.Net/Analysis/Snowball/SnowballFilter.cs	(revision 4503)
++++ Lucene.Net/Analysis/Snowball/SnowballFilter.cs	(working copy)
+@@ -60,7 +60,13 @@
+ 				throw new System.SystemException(e.ToString());
+ 			}
+ 		}
+-		
++
++		public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
++		{
++			this.stemmer = stemmer;
++			this.stemMethod = stemMethod;
++		}
++
+ 		/// <summary>Returns the next input Token, after being stemmed </summary>
+         public override Token Next()
+ 		{
+@@ -81,5 +87,12 @@
+ 			newToken.SetPositionIncrement(token.GetPositionIncrement());
+ 			return newToken;
+ 		}
++
++		public override void Close()
++		{
++			// In case stemmer was shared
++			stemmer.SetCurrent(String.Empty);
++			base.Close();
++		}
+ 	}
+-}
+\ No newline at end of file
++}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]