beagle r4579 - in branches/beagle-rdf: . Util beagled beagled/Lucene.Net/Search beagled/Lucene.Net/upstream-changes beagled/Snowball.Net/Lucene.Net/Analysis/Snowball beagled/Snowball.Net/upstream-changes



Author: dbera
Date: Mon Mar  3 23:22:43 2008
New Revision: 4579
URL: http://svn.gnome.org/viewvc/beagle?rev=4579&view=rev

Log:
Merge from trunk (Lucene-2.1 changes) 4575-4577.

Added:
   branches/beagle-rdf/beagled/Lucene.Net/upstream-changes/17_more-fieldselector.patch
   branches/beagle-rdf/beagled/Snowball.Net/upstream-changes/
      - copied from r4577, /trunk/beagle/beagled/Snowball.Net/upstream-changes/
Modified:
   branches/beagle-rdf/   (props changed)
   branches/beagle-rdf/Util/PullingReader.cs
   branches/beagle-rdf/Util/StringFu.cs
   branches/beagle-rdf/beagled/BuildIndex.cs
   branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs
   branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs
   branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs
   branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs
   branches/beagle-rdf/beagled/LuceneBitArray.cs
   branches/beagle-rdf/beagled/LuceneCommon.cs
   branches/beagle-rdf/beagled/LuceneIndexingDriver.cs
   branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
   branches/beagle-rdf/beagled/Makefile.am
   branches/beagle-rdf/beagled/NoiseFilter.cs
   branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs

Modified: branches/beagle-rdf/Util/PullingReader.cs
==============================================================================
--- branches/beagle-rdf/Util/PullingReader.cs	(original)
+++ branches/beagle-rdf/Util/PullingReader.cs	Mon Mar  3 23:22:43 2008
@@ -56,6 +56,7 @@
 					done = ! pull (pullBuffer, neededSize - pullBuffer.Length);
 				} catch (Exception e) {
 					Logger.Log.Debug (e, "Caught exception pulling text from {0}", pull);
+					done = true;
 				}
 			}
 		}
@@ -88,8 +89,7 @@
 			if (done && pullBuffer.Length < count)
 				count = pullBuffer.Length;
 
-			for (int i = 0; i < count; ++i)
-				buffer [index + i] = pullBuffer [i];
+			pullBuffer.CopyTo (0, buffer, index, count);
 			pullBuffer.Remove (0, count);
 
 			return count;
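
A note on the first hunk: the surrounding fill logic keeps calling the pull
delegate until enough text is buffered, so a delegate that throws on every
call would previously leave "done" false and retry forever. A minimal sketch
of the loop (the exact loop condition is an assumption; names are from the
diff):

    while (! done && pullBuffer.Length < neededSize) {
        try {
            done = ! pull (pullBuffer, neededSize - pullBuffer.Length);
        } catch (Exception e) {
            Logger.Log.Debug (e, "Caught exception pulling text from {0}", pull);
            done = true;  // stop instead of re-invoking the failing delegate
        }
    }

The second hunk replaces the char-by-char copy with StringBuilder.CopyTo,
the standard bulk copy into a char[].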

Modified: branches/beagle-rdf/Util/StringFu.cs
==============================================================================
--- branches/beagle-rdf/Util/StringFu.cs	(original)
+++ branches/beagle-rdf/Util/StringFu.cs	Mon Mar  3 23:22:43 2008
@@ -26,6 +26,7 @@
 
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using System.Globalization;
 using System.IO;
 using System.Text;
@@ -459,9 +460,14 @@
 
 		static public string HexEscape (string str)
 		{
-			StringBuilder builder = new StringBuilder ();
+			int index = -1;
+			if ((index = str.IndexOfAny (CharsToQuote)) == -1)
+				return str;
+
+			StringBuilder builder = new StringBuilder (str, 0, index, str.Length << 1);
 
-			foreach (char c in str) {
+			for (; index < str.Length; ++ index) {
+				char c = str [index];
 
 				if (ArrayFu.IndexOfChar (CharsToQuote, c) != -1)
 					builder.Append (Uri.HexEscape (c));
@@ -491,23 +497,26 @@
 		/// </returns>
 		static public string HexUnescape (string str)
 		{
-			ArrayList bytes = new ArrayList ();
-                        byte[] sub_bytes;
                         int i, pos = 0;
+			if ((i = str.IndexOf ('%')) == -1)
+				return str;
 
-                        while ((i = str.IndexOf ('%', pos)) != -1) {
+			List<byte> bytes = new List<byte> (str.Length);
+                        byte[] sub_bytes;
+
+			do {
                                 sub_bytes = Encoding.UTF8.GetBytes (str.Substring (pos, i - pos));
                                 bytes.AddRange (sub_bytes);
 				
 				pos = i;
                                 char unescaped = Uri.HexUnescape (str, ref pos);
-				bytes.Add ((byte) unescaped);
-                        }
+				bytes.Add (Convert.ToByte (unescaped));
+                        } while ((i = str.IndexOf ('%', pos)) != -1);
 
                         sub_bytes = Encoding.UTF8.GetBytes (str.Substring (pos, str.Length - pos));
                         bytes.AddRange (sub_bytes);
 
-                        return Encoding.UTF8.GetString ((byte[]) bytes.ToArray (typeof (byte)));
+                        return Encoding.UTF8.GetString (bytes.ToArray ());
 		}
 
 		// These strings should never be exposed to the user.
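
Both helpers above gain a fast path: HexEscape returns the input string
untouched when it contains none of CharsToQuote (and otherwise pre-sizes the
StringBuilder to twice the input length), and HexUnescape returns early when
there is no '%', replacing the untyped ArrayList with a List<byte>. A
behavior sketch (hypothetical inputs, and it assumes ' ' is among
CharsToQuote):

    StringFu.HexEscape ("plain");     // nothing to quote: returns "plain" itself, no allocation
    StringFu.HexEscape ("a b");       // returns "a%20b" via Uri.HexEscape
    StringFu.HexUnescape ("a%20b");   // returns "a b"
    StringFu.HexUnescape ("plain");   // no '%': returns "plain" itself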

Modified: branches/beagle-rdf/beagled/BuildIndex.cs
==============================================================================
--- branches/beagle-rdf/beagled/BuildIndex.cs	(original)
+++ branches/beagle-rdf/beagled/BuildIndex.cs	Mon Mar  3 23:22:43 2008
@@ -102,7 +102,7 @@
 		static Queue pending_directories = new Queue ();
 		static IndexerRequest pending_request;
 		
-		const int BATCH_SIZE = 30;
+		const int BATCH_SIZE = Lucene.Net.Index.IndexWriter.DEFAULT_MAX_BUFFERED_DOCS;
 		
 		/////////////////////////////////////////////////////////
 		

Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs	(original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs	Mon Mar  3 23:22:43 2008
@@ -18,6 +18,7 @@
 using System;
 
 using Document = Lucene.Net.Documents.Document;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
 
 namespace Lucene.Net.Search
 {
@@ -90,11 +91,17 @@
 			return length;
 		}
 		
+		public Document Doc(int n)
+		{
+			return Doc(n, null);
+		}
+		
 		/// <summary>Returns the stored fields of the n<sup>th</sup> document in this set.
 		/// <p>Documents are cached, so that repeated requests for the same element may
-		/// return the same Document object. 
+		/// return the same Document object. If the fieldselector is changed, then the new
+		/// fields will not be loaded.
 		/// </summary>
-		public Document Doc(int n)
+		public Document Doc(int n, FieldSelector fieldSelector)
 		{
 			HitDoc hitDoc = HitDoc(n);
 			
@@ -111,12 +118,15 @@
 			
 			if (hitDoc.doc == null)
 			{
-				hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
+				if (fieldSelector == null)
+					hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
+				else
+					hitDoc.doc = searcher.Doc(hitDoc.id, fieldSelector); // cache miss: read document
 			}
 			
 			return hitDoc.doc;
 		}
-		
+
 		/// <summary>Returns the score for the nth document in this set. </summary>
 		public float Score(int n)
 		{
@@ -222,4 +232,4 @@
 			id = i;
 		}
 	}
-}
\ No newline at end of file
+}
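
The new overload threads a Lucene 2.1 FieldSelector through the Hits cache;
per the doc comment, the first Doc() call for a given index fixes which
fields get loaded. A usage sketch (MapFieldSelector and the field names are
taken from later hunks in this commit):

    FieldSelector selector = new MapFieldSelector (new string[] { "Uri", "Timestamp" });
    Document doc  = hits.Doc (0, selector);  // reads only the two selected stored fields
    Document same = hits.Doc (0);            // cache hit: same partial Document,
                                             // the remaining fields are still not loaded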

Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs	(original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs	Mon Mar  3 23:22:43 2008
@@ -21,6 +21,7 @@
 using Document = Lucene.Net.Documents.Document;
 using IndexReader = Lucene.Net.Index.IndexReader;
 using Term = Lucene.Net.Index.Term;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
 
 namespace Lucene.Net.Search
 {
@@ -126,6 +127,11 @@
 			return reader.Document(i);
 		}
 		
+		public override Document Doc(int i, FieldSelector fieldSelector)
+		{
+			return reader.Document(i, fieldSelector);
+		}
+		
 		// inherit javadoc
 		public override int MaxDoc()
 		{
@@ -185,4 +191,4 @@
 			return weight.Explain(reader, doc);
 		}
 	}
-}
\ No newline at end of file
+}

Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs	(original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs	Mon Mar  3 23:22:43 2008
@@ -19,6 +19,7 @@
 
 using Document = Lucene.Net.Documents.Document;
 using Term = Lucene.Net.Index.Term;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
 
 namespace Lucene.Net.Search
 {
@@ -121,6 +122,11 @@
 				throw new System.NotSupportedException();
 			}
 			
+			public override Document Doc(int i, FieldSelector fieldSelector)
+			{
+				throw new System.NotSupportedException();
+			}
+			
 			public override Explanation Explain(Weight weight, int doc)
 			{
 				throw new System.NotSupportedException();
@@ -195,6 +201,11 @@
 			return searchables[i].Doc(n - starts[i]); // dispatch to searcher
 		}
 		
+		public override Document Doc(int n, FieldSelector fieldSelector)
+		{
+			throw new System.NotSupportedException();
+		}
+		
 		
 		/// <summary>Returns index of the searcher for document <code>n</code> in the array
 		/// used to construct this searcher. 
@@ -389,4 +400,4 @@
 			return rewrittenQuery.Weight(cacheSim);
 		}
 	}
-}
\ No newline at end of file
+}

Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs	(original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs	Mon Mar  3 23:22:43 2008
@@ -19,6 +19,7 @@
 
 using Term = Lucene.Net.Index.Term;
 using Document = Lucene.Net.Documents.Document;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
 
 namespace Lucene.Net.Search
 {
@@ -208,9 +209,10 @@
 		abstract public int MaxDoc();
 		abstract public TopDocs Search(Weight weight, Filter filter, int n);
 		abstract public Document Doc(int i);
+		abstract public Document Doc(int i, FieldSelector fieldSelector);
 		abstract public Query Rewrite(Query query);
 		abstract public Explanation Explain(Weight weight, int doc);
 		abstract public TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort);
 		/* End patch for GCJ bug #15411. */
 	}
-}
\ No newline at end of file
+}

Added: branches/beagle-rdf/beagled/Lucene.Net/upstream-changes/17_more-fieldselector.patch
==============================================================================
--- (empty file)
+++ branches/beagle-rdf/beagled/Lucene.Net/upstream-changes/17_more-fieldselector.patch	Mon Mar  3 23:22:43 2008
@@ -0,0 +1,157 @@
+Index: Search/IndexSearcher.cs
+===================================================================
+--- Search/IndexSearcher.cs	(revision 4576)
++++ Search/IndexSearcher.cs	(working copy)
+@@ -21,6 +21,7 @@
+ using Document = Lucene.Net.Documents.Document;
+ using IndexReader = Lucene.Net.Index.IndexReader;
+ using Term = Lucene.Net.Index.Term;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+ 
+ namespace Lucene.Net.Search
+ {
+@@ -126,6 +127,11 @@
+ 			return reader.Document(i);
+ 		}
+ 		
++		public override Document Doc(int i, FieldSelector fieldSelector)
++		{
++			return reader.Document(i, fieldSelector);
++		}
++		
+ 		// inherit javadoc
+ 		public override int MaxDoc()
+ 		{
+@@ -185,4 +191,4 @@
+ 			return weight.Explain(reader, doc);
+ 		}
+ 	}
+-}
+\ No newline at end of file
++}
+Index: Search/Searcher.cs
+===================================================================
+--- Search/Searcher.cs	(revision 4576)
++++ Search/Searcher.cs	(working copy)
+@@ -19,6 +19,7 @@
+ 
+ using Term = Lucene.Net.Index.Term;
+ using Document = Lucene.Net.Documents.Document;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+ 
+ namespace Lucene.Net.Search
+ {
+@@ -208,9 +209,10 @@
+ 		abstract public int MaxDoc();
+ 		abstract public TopDocs Search(Weight weight, Filter filter, int n);
+ 		abstract public Document Doc(int i);
++		abstract public Document Doc(int i, FieldSelector fieldSelector);
+ 		abstract public Query Rewrite(Query query);
+ 		abstract public Explanation Explain(Weight weight, int doc);
+ 		abstract public TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort);
+ 		/* End patch for GCJ bug #15411. */
+ 	}
+-}
+\ No newline at end of file
++}
+Index: Search/Hits.cs
+===================================================================
+--- Search/Hits.cs	(revision 4576)
++++ Search/Hits.cs	(working copy)
+@@ -18,6 +18,7 @@
+ using System;
+ 
+ using Document = Lucene.Net.Documents.Document;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+ 
+ namespace Lucene.Net.Search
+ {
+@@ -90,11 +91,17 @@
+ 			return length;
+ 		}
+ 		
++		public Document Doc(int n)
++		{
++			return Doc(n, null);
++		}
++		
+ 		/// <summary>Returns the stored fields of the n<sup>th</sup> document in this set.
+ 		/// <p>Documents are cached, so that repeated requests for the same element may
+-		/// return the same Document object. 
++		/// return the same Document object. If the fieldselector is changed, then the new
++		/// fields will not be loaded.
+ 		/// </summary>
+-		public Document Doc(int n)
++		public Document Doc(int n, FieldSelector fieldSelector)
+ 		{
+ 			HitDoc hitDoc = HitDoc(n);
+ 			
+@@ -111,12 +118,15 @@
+ 			
+ 			if (hitDoc.doc == null)
+ 			{
+-				hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
++				if (fieldSelector == null)
++					hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
++				else
++					hitDoc.doc = searcher.Doc(hitDoc.id, fieldSelector); // cache miss: read document
+ 			}
+ 			
+ 			return hitDoc.doc;
+ 		}
+-		
++
+ 		/// <summary>Returns the score for the nth document in this set. </summary>
+ 		public float Score(int n)
+ 		{
+@@ -222,4 +232,4 @@
+ 			id = i;
+ 		}
+ 	}
+-}
+\ No newline at end of file
++}
+Index: Search/MultiSearcher.cs
+===================================================================
+--- Search/MultiSearcher.cs	(revision 4576)
++++ Search/MultiSearcher.cs	(working copy)
+@@ -19,6 +19,7 @@
+ 
+ using Document = Lucene.Net.Documents.Document;
+ using Term = Lucene.Net.Index.Term;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+ 
+ namespace Lucene.Net.Search
+ {
+@@ -121,6 +122,11 @@
+ 				throw new System.NotSupportedException();
+ 			}
+ 			
++			public override Document Doc(int i, FieldSelector fieldSelector)
++			{
++				throw new System.NotSupportedException();
++			}
++			
+ 			public override Explanation Explain(Weight weight, int doc)
+ 			{
+ 				throw new System.NotSupportedException();
+@@ -195,7 +201,12 @@
+ 			return searchables[i].Doc(n - starts[i]); // dispatch to searcher
+ 		}
+ 		
++		public override Document Doc(int n, FieldSelector fieldSelector)
++		{
++			throw new System.NotSupportedException();
++		}
+ 		
++		
+ 		/// <summary>Returns index of the searcher for document <code>n</code> in the array
+ 		/// used to construct this searcher. 
+ 		/// </summary>
+@@ -389,4 +400,4 @@
+ 			return rewrittenQuery.Weight(cacheSim);
+ 		}
+ 	}
+-}
+\ No newline at end of file
++}

Modified: branches/beagle-rdf/beagled/LuceneBitArray.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneBitArray.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneBitArray.cs	Mon Mar  3 23:22:43 2008
@@ -198,8 +198,6 @@
 
 		////////////////////////////////////////////////////////////
 
-		static string[] fields_uri = { "Timestamp", "Uri" };
-
 		public void ProjectOnto (LuceneBitArray other)
 		{
 			int j = 0;
@@ -211,7 +209,7 @@
 				j = i+1;
 
 				Document doc;
-				doc = searcher.Doc (i, fields_uri);
+				doc = searcher.Doc (i, LuceneQueryingDriver.fields_uri);
 
 				other.AddUri (doc.Get ("Uri"));
 			}
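
The fields_uri selector referenced here is now shared state, defined in
LuceneQueryingDriver.cs (below) as:

    static internal FieldSelector fields_uri = new MapFieldSelector (new string[] {"Uri"});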

Modified: branches/beagle-rdf/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneCommon.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneCommon.cs	Mon Mar  3 23:22:43 2008
@@ -26,6 +26,7 @@
 
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using System.Diagnostics;
 using System.Globalization;
 using System.IO;
@@ -42,6 +43,9 @@
 using Lucene.Net.QueryParsers;
 using LNS = Lucene.Net.Search;
 
+using SF.Snowball.Ext;
+using SnowballProgram = SF.Snowball.SnowballProgram;
+
 using Beagle.Util;
 
 namespace Beagle.Daemon {
@@ -102,7 +106,7 @@
 		private Lucene.Net.Store.Directory secondary_store = null;
 
 		// Flush if more than this number of requests
-		public const int RequestFlushThreshold = 37; // a total arbitrary magic number
+		public const int RequestFlushThreshold = Lucene.Net.Index.IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; // Use same value as Lucene's flush threshold
 
 		//////////////////////////////////////////////////////////////////////////////
 
@@ -383,7 +387,7 @@
 
 			// Create a new store.
 			Lucene.Net.Store.Directory store;
-			store = Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);
+			store = Lucene.Net.Store.FSDirectory.GetDirectory (path, new Lucene.Net.Store.SimpleFSLockFactory (LockDirectory));
 
 			// Create an empty index in that store.
 			IndexWriter writer;
@@ -441,8 +445,14 @@
 			reader.Close ();
 
 			// Create stores for our indexes.
-			primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
-			secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
+			// Use separate lock factories since each lock factory is tied to the index directory
+			if (read_only_mode) {
+				primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, Lucene.Net.Store.NoLockFactory.GetNoLockFactory ());
+				secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, Lucene.Net.Store.NoLockFactory.GetNoLockFactory ());
+			} else {
+				primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, new Lucene.Net.Store.SimpleFSLockFactory (LockDirectory));
+				secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, new Lucene.Net.Store.SimpleFSLockFactory (LockDirectory));
+			}
 		}
 
 		////////////////////////////////////////////////////////////////
@@ -475,12 +485,12 @@
 		}
 
 		// FIXME: This assumes everything being indexed is in English!
-		internal class BeagleAnalyzer : StandardAnalyzer {
+		public class BeagleAnalyzer : StandardAnalyzer {
 
+			const string DEFAULT_STEMMER_LANGUAGE = "English";
 			private char [] buffer = new char [2];
 			private bool strip_extra_property_info = false;
 			private bool tokenize_email_hostname = false;
-			const string DEFAULT_STEMMER = "English";
 
 			public BeagleAnalyzer (bool is_indexing_analyzer)
 			{
@@ -540,7 +550,10 @@
 				    || fieldName == "PropertyText"
 				    || is_text_prop) {
 					outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
-					outstream = new SnowballFilter (outstream, DEFAULT_STEMMER);
+					// Sharing Stemmer is not thread safe.
+					// Currently our underlying lucene indexing is not done in multiple threads.
+					StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
+					outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
 				}
 
 				return outstream;
@@ -1077,17 +1090,42 @@
 		// Access to the stemmer and list of stop words
 		//
 
-		static SF.Snowball.Ext.EnglishStemmer stemmer = new SF.Snowball.Ext.EnglishStemmer ();
+		private static Dictionary<string, StemmerInfo> stemmer_table = new Dictionary<string, StemmerInfo> ();
+
+		class StemmerInfo {
+			internal SnowballProgram Stemmer;
+			internal System.Reflection.MethodInfo StemMethod;
+		}
+
+		private static StemmerInfo GetStemmer (System.String name)
+		{
+			if (! stemmer_table.ContainsKey (name)) {
+				StemmerInfo stemmer_info = new StemmerInfo ();
+
+				// Taken from Snowball/SnowballFilter.cs
+				System.Type stemClass = System.Type.GetType ("SF.Snowball.Ext." + name + "Stemmer", true);
+				SnowballProgram stemmer = (SnowballProgram) System.Activator.CreateInstance (stemClass);
+				// why doesn't the SnowballProgram class have an (abstract?) stem method?
+				System.Reflection.MethodInfo stemMethod = stemClass.GetMethod ("Stem", (new System.Type [0] == null) ? new System.Type [0] : (System.Type []) new System.Type [0]);
+
+				stemmer_info.Stemmer = stemmer;
+				stemmer_info.StemMethod = stemMethod;
+				stemmer_table [name] = stemmer_info;
+			}
+
+			return stemmer_table [name];
+		}
+
+		private static SF.Snowball.Ext.EnglishStemmer default_stemmer = new SF.Snowball.Ext.EnglishStemmer ();
 
 		static public string Stem (string str)
 		{
 			string stemmed_str;
 
-			lock (stemmer) {
-				stemmer.SetCurrent (str);
-				stemmer.Stem ();
-				stemmed_str = stemmer.GetCurrent ();
-				stemmer.SetCurrent (String.Empty);
+			lock (default_stemmer) {
+				default_stemmer.SetCurrent (str);
+				default_stemmer.Stem ();
+				stemmed_str = default_stemmer.GetCurrent ();
 			}
 
 			return stemmed_str;
@@ -1376,11 +1414,11 @@
 				if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
 					LNS.BooleanQuery sub_query;
 					sub_query = new LNS.BooleanQuery ();
-					sub_query.Add (ym_query, true, false);
-					sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
-					top_level_query.Add (sub_query, false, false);
+					sub_query.Add (ym_query, LNS.BooleanClause.Occur.MUST);
+					sub_query.Add (NewDayQuery (field_name, d1, d2), LNS.BooleanClause.Occur.MUST);
+					top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
 				} else {
-					top_level_query.Add (ym_query, false, false);
+					top_level_query.Add (ym_query, LNS.BooleanClause.Occur.SHOULD);
 				}
 
 			} else {
@@ -1389,9 +1427,9 @@
 				if (d1 > 1) {
 					LNS.BooleanQuery sub_query;
 					sub_query = new LNS.BooleanQuery ();
-					sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
-					sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
-					top_level_query.Add (sub_query, false, false);
+					sub_query.Add (NewYearMonthQuery (field_name, y1, m1), LNS.BooleanClause.Occur.MUST);
+					sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), LNS.BooleanClause.Occur.MUST);
+					top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
 					
 					++m1;
 					if (m1 == 13) {
@@ -1404,9 +1442,9 @@
 				if (d2 < DateTime.DaysInMonth (y2, m2)) {
 					LNS.BooleanQuery sub_query;
 					sub_query = new LNS.BooleanQuery ();
-					sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
-					sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
-					top_level_query.Add (sub_query, false, false);
+					sub_query.Add (NewYearMonthQuery (field_name, y2, m2), LNS.BooleanClause.Occur.MUST);
+					sub_query.Add (NewDayQuery (field_name, 1, d2), LNS.BooleanClause.Occur.MUST);
+					top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
 
 					--m2;
 					if (m2 == 0) {
@@ -1418,7 +1456,7 @@
 				// Generate the query for the "middle" of our period, if it is non-empty
 				if (y1 < y2 || ((y1 == y2) && m1 <= m2))
 					top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
-							     false, false);
+							     LNS.BooleanClause.Occur.SHOULD);
 			}
 				
 			return top_level_query;
@@ -1478,14 +1516,14 @@
 					LNS.Query subquery;
 					subquery = StringToQuery ("Text", part.Text, term_list);
 					if (subquery != null) {
-						p_query.Add (subquery, false, false);
+						p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 						added_subquery = true;
 					}
 
 					// FIXME: HotText is ignored for now!
 					// subquery = StringToQuery ("HotText", part.Text);
 					// if (subquery != null) {
-					//    p_query.Add (subquery, false, false);
+					//    p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 					//    added_subquery = true;
 					// }
 				}
@@ -1494,10 +1532,10 @@
 					LNS.Query subquery;
 					subquery = StringToQuery ("PropertyText", part.Text, term_list);
 					if (subquery != null) {
-						p_query.Add (subquery, false, false);
+						p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 						// Properties can live in either index
 						if (! only_build_primary_query)
-							s_query.Add (subquery.Clone () as LNS.Query, false, false);
+							s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
 						added_subquery = true;
 					}
 
@@ -1528,10 +1566,10 @@
 						if (term_list != null)
 							term_list.Add (term);
 						subquery = new LNS.TermQuery (term);
-						p_query.Add (subquery, false, false);
+						p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 						// Properties can live in either index
 						if (! only_build_primary_query)
-							s_query.Add (subquery.Clone () as LNS.Query, false, false);
+							s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
 					} else {
 						// Reset these so we return a null query
 						p_query = null;
@@ -1561,26 +1599,26 @@
 				// Search text content
 				term = new Term ("Text", query_string_lower);
 				subquery = new LNS.WildcardQuery (term);
-				p_query.Add (subquery, false, false);
+				p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 				term_list.Add (term);
 
 				// Search text properties
 				term = new Term ("PropertyText", query_string_lower);
 				subquery = new LNS.WildcardQuery (term);
-				p_query.Add (subquery, false, false);
+				p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 				// Properties can live in either index
 				if (! only_build_primary_query)
-					s_query.Add (subquery.Clone () as LNS.Query, false, false);
+					s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
 				term_list.Add (term);
 
 				// Search property keywords
 				term = new Term ("PropertyKeyword", query_string_lower);
 				term_list.Add (term);
 				subquery = new LNS.WildcardQuery (term);
-				p_query.Add (subquery, false, false);
+				p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 				// Properties can live in either index
 				if (! only_build_primary_query)
-					s_query.Add (subquery.Clone () as LNS.Query, false, false);
+					s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
 
 				primary_query = p_query;
 				if (! only_build_primary_query)
@@ -1633,9 +1671,9 @@
 							  term_list, query_part_hook,
 							  out p_subq, out s_subq, out sub_hit_filter);
 					if (p_subq != null)
-						p_query.Add (p_subq, false, false);
+						p_query.Add (p_subq, LNS.BooleanClause.Occur.SHOULD);
 					if (s_subq != null)
-						s_query.Add (s_subq, false, false);
+						s_query.Add (s_subq, LNS.BooleanClause.Occur.SHOULD);
 					if (sub_hit_filter != null) {
 						if (or_hit_filter == null)
 							or_hit_filter = new OrHitFilter ();
@@ -1726,7 +1764,7 @@
 
 			int cursor = 0;
 			if (extra_requirement != null) {
-				top_query.Add (extra_requirement, true, false);
+				top_query.Add (extra_requirement, LNS.BooleanClause.Occur.MUST);
 				++cursor;
 			}
 
@@ -1738,7 +1776,7 @@
 					LNS.BooleanQuery bq;
 					bq = new LNS.BooleanQuery ();
 					bottom_queries.Add (bq);
-					top_query.Add (bq, false, false);
+					top_query.Add (bq, LNS.BooleanClause.Occur.SHOULD);
 				}
 			}
 
@@ -1756,7 +1794,7 @@
 						cursor = 0;
 				}
 				
-				target.Add (subquery, false, false);
+				target.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
 			}
 
 			return top_query;
@@ -2051,7 +2089,7 @@
 			return GetHitsForUris (uris, null);
 		}
 
-		public ICollection GetHitsForUris (ICollection uris, string[] fields)
+		public ICollection GetHitsForUris (ICollection uris, FieldSelector fields)
 		{
 			Hashtable hits_by_uri = UriFu.NewHashtable ();
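
Two API migrations in this file are worth calling out. First,
FSDirectory.GetDirectory now takes a LockFactory instead of a lock directory
plus booleans: read-only indexes get a NoLockFactory, writable ones a
SimpleFSLockFactory rooted at LockDirectory. Second, every BooleanQuery.Add
call moves from Lucene 1.9's (required, prohibited) flag pair to Lucene
2.1's BooleanClause.Occur. The mapping, as a sketch (the MUST_NOT form does
not occur in this diff):

    q.Add (sub, true,  false);   -->   q.Add (sub, LNS.BooleanClause.Occur.MUST);
    q.Add (sub, false, false);   -->   q.Add (sub, LNS.BooleanClause.Occur.SHOULD);
    q.Add (sub, false, true);    -->   q.Add (sub, LNS.BooleanClause.Occur.MUST_NOT);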
 

Modified: branches/beagle-rdf/beagled/LuceneIndexingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneIndexingDriver.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneIndexingDriver.cs	Mon Mar  3 23:22:43 2008
@@ -215,19 +215,19 @@
 
 				term = new Term ("Uri", uri_str);
 				// For property changes, only secondary index is modified
-				secondary_reader.Delete (term);
+				secondary_reader.DeleteDocuments (term);
 
 				// Now remove from everywhere else (if asked to remove or if asked to add, in which case
 				// we first remove and then add)
 				// So we also need to remove child documents
 				if (indexable.Type != IndexableType.PropertyChange) {
-					num_delete = primary_reader.Delete (term);
+					num_delete = primary_reader.DeleteDocuments (term);
 
 					// When we delete an indexable, also delete any children.
 					// FIXME: Shouldn't we also delete any children of children, etc.?
 					term = new Term ("ParentUri", uri_str);
-					num_delete += primary_reader.Delete (term);
-					secondary_reader.Delete (term);
+					num_delete += primary_reader.DeleteDocuments (term);
+					secondary_reader.DeleteDocuments (term);
 				}
 
 				// If this is a strict removal (and not a deletion that
@@ -270,6 +270,10 @@
 				text_cache.BeginTransaction ();
 				
 			IndexWriter primary_writer, secondary_writer;
+			// FIXME: Lock obtain time-out can happen here; if that happens,
+			// an exception will be thrown and this method will break in the middle
+			// leaving IndexWriters unclosed! Same for any Lucene.Net-index modification
+			// methods.
 			primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
 			secondary_writer = null;
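
IndexReader.Delete(Term) becomes DeleteDocuments(Term) here (the old name
was deprecated upstream); it still returns the number of deleted documents.
For the FIXME about lock time-outs leaving writers unclosed, a try/finally
guard is one possible fix (a sketch only, not what the merged code does):

    IndexWriter primary_writer = null, secondary_writer = null;
    try {
        primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
        // ... add/delete documents ...
    } finally {
        if (primary_writer != null)
            primary_writer.Close ();
        if (secondary_writer != null)
            secondary_writer.Close ();
    }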
 

Modified: branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneQueryingDriver.cs	(original)
+++ branches/beagle-rdf/beagled/LuceneQueryingDriver.cs	Mon Mar  3 23:22:43 2008
@@ -193,12 +193,12 @@
 				case QueryPartLogic.Prohibited:
 					if (primary_prohibited_part_query == null)
 						primary_prohibited_part_query = new LNS.BooleanQuery ();
-					primary_prohibited_part_query.Add (primary_part_query, false, false);
+					primary_prohibited_part_query.Add (primary_part_query, LNS.BooleanClause.Occur.SHOULD);
 
 					if (secondary_part_query != null) {
 						if (secondary_prohibited_part_query == null)
 							secondary_prohibited_part_query = new LNS.BooleanQuery ();
-						secondary_prohibited_part_query.Add (secondary_part_query, false, false);
+						secondary_prohibited_part_query.Add (secondary_part_query, LNS.BooleanClause.Occur.SHOULD);
 					}
 
 					if (part_hit_filter != null) {
@@ -408,7 +408,7 @@
 				uri_list.Add (new Uri (subject));
 
 				string field_name = PropertyToFieldName (pred_type, predicate);
-				string[] fields = { "Uri", "Timestamp", field_name };
+				FieldSelector fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", field_name });
 				ICollection hits = GetHitsForUris (uri_list, fields);
 
 				return hits;
@@ -554,9 +554,9 @@
 			if (secondary_searcher != null)
 				secondary_term_docs = secondary_searcher.Reader.TermDocs ();
 		
-			string[] fields = (field_name != null) ?
-					new string[] { "Uri", "Timestamp", field_name } :
-					null;
+			FieldSelector fields = null;
+			if (field_name != null)
+				fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", field_name });
 
 			for (int match_index = primary_matches.GetNextTrueIndex (0);
 			     match_index < primary_matches.Count; 
@@ -594,7 +594,17 @@
 						found_matching_predicate = true;
 					}
 
-					if (secondary_searcher != null) {
+					// Now get the matching predicate from the secondary index
+					if (secondary_searcher == null) {
+						doc = null;
+					} else {
+						Term term = new Term ("Uri", doc.Get ("Uri"));
+						secondary_term_docs.Seek (term);
+						if (secondary_term_docs.Next ())
+							doc = secondary_searcher.Doc (secondary_term_docs.Doc ());
+					}
+
+					if (doc != null) {
 						foreach (Field field in doc.Fields ()) {
 							if (! FieldIsPredicate (field, field_value))
 								continue;
@@ -617,7 +627,7 @@
 					hits.Add (hit);
 				} else {
 					doc = primary_searcher.Doc (match_index, fields);
-					hits.Add (CreateHit (doc, secondary_searcher, secondary_term_docs, fields));
+					hits.Add (CreateHit (doc, secondary_reader, secondary_term_docs, fields));
 				}
 			}
 
@@ -906,8 +916,7 @@
 			// Only generate results if we got some matches
 			if (primary_matches != null && primary_matches.ContainsTrue ()) {
 				GenerateQueryResults (primary_reader,
-						      primary_searcher,
-						      secondary_searcher,
+						      secondary_reader,
 						      primary_matches,
 						      result,
 						      term_list,
@@ -964,7 +973,7 @@
 			LNS.BooleanQuery combined_query;
 			combined_query = new LNS.BooleanQuery ();
 			foreach (LNS.Query query in primary_queries)
-				combined_query.Add (query, true, false);
+				combined_query.Add (query, LNS.BooleanClause.Occur.MUST);
 
 			LuceneBitArray matches;
 			matches = new LuceneBitArray (primary_searcher, combined_query);
@@ -1100,7 +1109,7 @@
 			foreach (Term term in term_list) {
 
 				double idf;
-				idf = similarity.Ldf (reader.DocFreq (term), reader.MaxDoc ());
+				idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());
 
 				int hit_count;
 				hit_count = hits_by_id.Count;
@@ -1136,11 +1145,11 @@
 		//
 
 		// Two arrays we need for quickly creating lucene documents and check if they are valid
-		static string[] fields_timestamp_uri = { "Timestamp", "Uri" };
+		static FieldSelector fields_timestamp_uri = new MapFieldSelector (new string[] {"Uri", "Timestamp"});
+		static internal FieldSelector fields_uri = new MapFieldSelector (new string[] {"Uri"});
 
 		private static void GenerateQueryResults (IndexReader       primary_reader,
-							  LNS.IndexSearcher primary_searcher,
-							  LNS.IndexSearcher secondary_searcher,
+							  IndexReader       secondary_reader,
 							  BetterBitArray    primary_matches,
 							  IQueryResult      result,
 							  ICollection       query_term_list,
@@ -1178,8 +1187,7 @@
 
 			if (primary_matches.TrueCount > max_results)
 				final_list_of_hits = ScanRecentDocs (primary_reader,
-					primary_searcher,
-					secondary_searcher,
+					secondary_reader,
 					primary_matches,
 					hits_by_id,
 					max_results,
@@ -1188,8 +1196,7 @@
 
 			if (final_list_of_hits == null)
 				final_list_of_hits = FindRecentResults (primary_reader,
-					primary_searcher,
-					secondary_searcher,
+					secondary_reader,
 					primary_matches,
 					hits_by_id,
 					max_results,
@@ -1280,8 +1287,7 @@
 		// for all of them.
 
 		private static ArrayList ScanRecentDocs (IndexReader	    primary_reader,
-						    LNS.IndexSearcher	    primary_searcher,
-						    LNS.IndexSearcher	    secondary_searcher,
+						    IndexReader		    secondary_reader,
 						    BetterBitArray	    primary_matches,
 						    Dictionary<int, Hit>    hits_by_id,
 						    int			    max_results,
@@ -1300,8 +1306,8 @@
 
 			Term term;
 			TermDocs secondary_term_docs = null;
-			if (secondary_searcher != null)
-				secondary_term_docs = secondary_searcher.Reader.TermDocs ();
+			if (secondary_reader != null)
+				secondary_term_docs = secondary_reader.TermDocs ();
 
 			do {
 				term = enumerator.Term ();
@@ -1317,13 +1323,13 @@
 					int doc_id = docs.Doc ();
 
 					if (primary_matches.Get (doc_id)) {
-						Document doc = primary_searcher.Doc (doc_id);
+						Document doc = primary_reader.Document (doc_id);
 						// If we have a UriFilter, apply it.
 						if (uri_filter != null) {
 							Uri uri;
 							uri = GetUriFromDocument (doc);
 							if (uri_filter (uri)) {
-								Hit hit = CreateHit (doc, secondary_searcher, secondary_term_docs);
+								Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);
 								hits_by_id [doc_id] = hit;
 								// Add the result, last modified first
 								results.Add (hit);
@@ -1362,8 +1368,7 @@
 		}
 
 		private static ArrayList   FindRecentResults (IndexReader	    primary_reader,
-							      LNS.IndexSearcher primary_searcher,
-							      LNS.IndexSearcher	    secondary_searcher,
+							      IndexReader	    secondary_reader,
 							      BetterBitArray	    primary_matches,
 							      Dictionary<int, Hit>  hits_by_id,
 							      int		    max_results,
@@ -1385,8 +1390,8 @@
 			else
 				all_docs = new ArrayList (primary_matches.TrueCount);
 
-			if (secondary_searcher != null)
-				term_docs = secondary_searcher.Reader.TermDocs ();
+			if (secondary_reader != null)
+				term_docs = secondary_reader.TermDocs ();
 
 			for (int match_index = primary_matches.Count; ; match_index --) {
 				// Walk across the matches backwards, since newer
@@ -1398,7 +1403,7 @@
 
 				count++;
 
-				doc = primary_searcher.Doc (match_index, fields_timestamp_uri);
+				doc = primary_reader.Document (match_index, fields_timestamp_uri);
 
 				// Check the timestamp --- if we have already reached our
 				// limit, we might be able to reject it immediately.
@@ -1424,7 +1429,7 @@
 
 				// Get the actual hit now
 				// doc was created with only 2 fields, so first get the complete lucene document for primary document
-				Hit hit = CreateHit (primary_searcher.Doc (match_index), secondary_searcher, term_docs);
+				Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
 				hits_by_id [match_index] = hit;
 
 				// Add the document to the appropriate data structure.
@@ -1454,23 +1459,23 @@
 		}
 
 		private static Hit CreateHit ( Document primary_doc,
-					LNS.IndexSearcher secondary_searcher,
+					IndexReader secondary_reader,
 					TermDocs term_docs)
 		{
 			return CreateHit ( primary_doc,
-					secondary_searcher,
+					secondary_reader,
 					term_docs,
 					null);
 		}
 
 		private static Hit CreateHit ( Document primary_doc,
-					LNS.IndexSearcher secondary_searcher,
+					IndexReader secondary_reader,
 					TermDocs term_docs,
-					string[] fields)
+					FieldSelector fields)
 		{
 			Hit hit = DocumentToHit (primary_doc);
 
-			if (secondary_searcher == null)
+			if (secondary_reader == null)
 				return hit;
 
 			// Get the stringified version of the URI
@@ -1482,8 +1487,8 @@
 			term_docs.Next ();
 			Document secondary_doc =
 				(fields == null) ?
-				secondary_searcher.Doc (term_docs.Doc ()) :
-				secondary_searcher.Doc (term_docs.Doc (), fields);
+				secondary_reader.Document (term_docs.Doc ()) :
+				secondary_reader.Document (term_docs.Doc (), fields);
 
 			// If we are using the secondary index, now we need to
 			// merge the properties from the secondary index
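
The querying driver now reads stored documents straight from IndexReader
(optionally with a FieldSelector) instead of going through IndexSearcher,
which is why ScanRecentDocs, FindRecentResults and CreateHit all take a
secondary IndexReader. A sketch of the partial-load pattern used throughout
(field names from the diff; "PropertyText" stands in for any unselected
field):

    FieldSelector fields = new MapFieldSelector (new string[] { "Uri", "Timestamp" });
    Document doc = primary_reader.Document (doc_id, fields);
    doc.Get ("Uri");           // loaded
    doc.Get ("PropertyText");  // null: not in the selector, never read from disk

Also note the typo fix in the scoring loop: similarity.Ldf becomes
similarity.Idf (inverse document frequency).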

Modified: branches/beagle-rdf/beagled/Makefile.am
==============================================================================
--- branches/beagle-rdf/beagled/Makefile.am	(original)
+++ branches/beagle-rdf/beagled/Makefile.am	Mon Mar  3 23:22:43 2008
@@ -87,208 +87,9 @@
 
 ############################################################
 
-lucenedir = $(srcdir)/Lucene.Net
+include $(srcdir)/Lucene.Net/Makefile.include
 
-LUCENE_1_9_CSFILES =							\
-	$(lucenedir)/Analysis/Standard/CharStream.cs			\
-	$(lucenedir)/Analysis/Standard/FastCharStream.cs		\
-	$(lucenedir)/Analysis/Standard/ParseException.cs		\
-	$(lucenedir)/Analysis/Standard/StandardAnalyzer.cs		\
-	$(lucenedir)/Analysis/Standard/StandardFilter.cs		\
-	$(lucenedir)/Analysis/Standard/StandardTokenizer.cs		\
-	$(lucenedir)/Analysis/Standard/StandardTokenizerConstants.cs	\
-	$(lucenedir)/Analysis/Standard/StandardTokenizerTokenManager.cs	\
-	$(lucenedir)/Analysis/Standard/Token.cs				\
-	$(lucenedir)/Analysis/Standard/TokenMgrError.cs			\
-	$(lucenedir)/Analysis/Analyzer.cs				\
-	$(lucenedir)/Analysis/CharTokenizer.cs				\
-	$(lucenedir)/Analysis/ISOLatin1AccentFilter.cs			\
-	$(lucenedir)/Analysis/KeywordAnalyzer.cs			\
-	$(lucenedir)/Analysis/KeywordTokenizer.cs			\
-	$(lucenedir)/Analysis/LengthFilter.cs				\
-	$(lucenedir)/Analysis/LetterTokenizer.cs			\
-	$(lucenedir)/Analysis/LowerCaseFilter.cs			\
-	$(lucenedir)/Analysis/LowerCaseTokenizer.cs			\
-	$(lucenedir)/Analysis/PerFieldAnalyzerWrapper.cs		\
-	$(lucenedir)/Analysis/PorterStemFilter.cs			\
-	$(lucenedir)/Analysis/PorterStemmer.cs				\
-	$(lucenedir)/Analysis/SimpleAnalyzer.cs				\
-	$(lucenedir)/Analysis/StopAnalyzer.cs				\
-	$(lucenedir)/Analysis/StopFilter.cs				\
-	$(lucenedir)/Analysis/Token.cs					\
-	$(lucenedir)/Analysis/TokenFilter.cs				\
-	$(lucenedir)/Analysis/Tokenizer.cs				\
-	$(lucenedir)/Analysis/TokenStream.cs				\
-	$(lucenedir)/Analysis/WhitespaceAnalyzer.cs			\
-	$(lucenedir)/Analysis/WhitespaceTokenizer.cs			\
-	$(lucenedir)/Analysis/WordlistLoader.cs				\
-	$(lucenedir)/Document/DateField.cs				\
-	$(lucenedir)/Document/DateTools.cs				\
-	$(lucenedir)/Document/Document.cs				\
-	$(lucenedir)/Document/Field.cs					\
-	$(lucenedir)/Document/NumberTools.cs				\
-	$(lucenedir)/Index/CompoundFileReader.cs			\
-	$(lucenedir)/Index/CompoundFileWriter.cs			\
-	$(lucenedir)/Index/DocumentWriter.cs				\
-	$(lucenedir)/Index/FieldInfo.cs					\
-	$(lucenedir)/Index/FieldInfos.cs				\
-	$(lucenedir)/Index/FieldsReader.cs				\
-	$(lucenedir)/Index/FieldsWriter.cs				\
-	$(lucenedir)/Index/FilterIndexReader.cs				\
-	$(lucenedir)/Index/IndexFileNameFilter.cs			\
-	$(lucenedir)/Index/IndexFileNames.cs				\
-	$(lucenedir)/Index/IndexModifier.cs				\
-	$(lucenedir)/Index/IndexReader.cs				\
-	$(lucenedir)/Index/IndexWriter.cs				\
-	$(lucenedir)/Index/MultipleTermPositions.cs			\
-	$(lucenedir)/Index/MultiReader.cs				\
-	$(lucenedir)/Index/ParallelReader.cs				\
-	$(lucenedir)/Index/SegmentInfo.cs				\
-	$(lucenedir)/Index/SegmentInfos.cs				\
-	$(lucenedir)/Index/SegmentMergeInfo.cs				\
-	$(lucenedir)/Index/SegmentMergeQueue.cs				\
-	$(lucenedir)/Index/SegmentMerger.cs				\
-	$(lucenedir)/Index/SegmentReader.cs				\
-	$(lucenedir)/Index/SegmentTermDocs.cs				\
-	$(lucenedir)/Index/SegmentTermEnum.cs				\
-	$(lucenedir)/Index/SegmentTermPositions.cs			\
-	$(lucenedir)/Index/SegmentTermPositionVector.cs			\
-	$(lucenedir)/Index/SegmentTermVector.cs				\
-	$(lucenedir)/Index/Term.cs					\
-	$(lucenedir)/Index/TermBuffer.cs				\
-	$(lucenedir)/Index/TermDocs.cs					\
-	$(lucenedir)/Index/TermEnum.cs					\
-	$(lucenedir)/Index/TermFreqVector.cs				\
-	$(lucenedir)/Index/TermInfo.cs					\
-	$(lucenedir)/Index/TermInfosReader.cs				\
-	$(lucenedir)/Index/TermInfosWriter.cs				\
-	$(lucenedir)/Index/TermPositions.cs				\
-	$(lucenedir)/Index/TermPositionVector.cs			\
-	$(lucenedir)/Index/TermVectorOffsetInfo.cs			\
-	$(lucenedir)/Index/TermVectorsReader.cs				\
-	$(lucenedir)/Index/TermVectorsWriter.cs				\
-	$(lucenedir)/QueryParser/CharStream.cs				\
-	$(lucenedir)/QueryParser/FastCharStream.cs			\
-	$(lucenedir)/QueryParser/MultiFieldQueryParser.cs		\
-	$(lucenedir)/QueryParser/ParseException.cs			\
-	$(lucenedir)/QueryParser/QueryParser.cs				\
-	$(lucenedir)/QueryParser/QueryParserConstants.cs		\
-	$(lucenedir)/QueryParser/QueryParserTokenManager.cs		\
-	$(lucenedir)/QueryParser/Token.cs				\
-	$(lucenedir)/QueryParser/TokenMgrError.cs			\
-	$(lucenedir)/Search/Regex/RegexQuery.cs				\
-	$(lucenedir)/Search/Regex/RegexTermEnum.cs			\
-	$(lucenedir)/Search/Regex/SpanRegexQuery.cs			\
-	$(lucenedir)/Search/Spans/NearSpans.cs				\
-	$(lucenedir)/Search/Spans/SpanFirstQuery.cs			\
-	$(lucenedir)/Search/Spans/SpanNearQuery.cs			\
-	$(lucenedir)/Search/Spans/SpanNotQuery.cs			\
-	$(lucenedir)/Search/Spans/SpanOrQuery.cs			\
-	$(lucenedir)/Search/Spans/SpanQuery.cs				\
-	$(lucenedir)/Search/Spans/Spans.cs				\
-	$(lucenedir)/Search/Spans/SpanScorer.cs				\
-	$(lucenedir)/Search/Spans/SpanTermQuery.cs			\
-	$(lucenedir)/Search/Spans/SpanWeight.cs				\
-	$(lucenedir)/Search/BooleanClause.cs				\
-	$(lucenedir)/Search/BooleanQuery.cs				\
-	$(lucenedir)/Search/BooleanScorer.cs				\
-	$(lucenedir)/Search/BooleanScorer2.cs				\
-	$(lucenedir)/Search/CachingWrapperFilter.cs			\
-	$(lucenedir)/Search/ConjunctionScorer.cs			\
-	$(lucenedir)/Search/ConstantScoreQuery.cs			\
-	$(lucenedir)/Search/ConstantScoreRangeQuery.cs			\
-	$(lucenedir)/Search/DateFilter.cs				\
-	$(lucenedir)/Search/DefaultSimilarity.cs			\
-	$(lucenedir)/Search/DisjunctionMaxQuery.cs			\
-	$(lucenedir)/Search/DisjunctionMaxScorer.cs			\
-	$(lucenedir)/Search/DisjunctionSumScorer.cs			\
-	$(lucenedir)/Search/ExactPhraseScorer.cs			\
-	$(lucenedir)/Search/Explanation.cs				\
-	$(lucenedir)/Search/FieldCache.cs				\
-	$(lucenedir)/Search/FieldCacheImpl.cs				\
-	$(lucenedir)/Search/FieldDoc.cs					\
-	$(lucenedir)/Search/FieldDocSortedHitQueue.cs			\
-	$(lucenedir)/Search/FieldSortedHitQueue.cs			\
-	$(lucenedir)/Search/Filter.cs					\
-	$(lucenedir)/Search/FilteredQuery.cs				\
-	$(lucenedir)/Search/FilteredTermEnum.cs				\
-	$(lucenedir)/Search/FuzzyQuery.cs				\
-	$(lucenedir)/Search/FuzzyTermEnum.cs				\
-	$(lucenedir)/Search/Hit.cs					\
-	$(lucenedir)/Search/HitCollector.cs				\
-	$(lucenedir)/Search/HitIterator.cs				\
-	$(lucenedir)/Search/HitQueue.cs					\
-	$(lucenedir)/Search/Hits.cs					\
-	$(lucenedir)/Search/IndexSearcher.cs				\
-	$(lucenedir)/Search/MatchAllDocsQuery.cs			\
-	$(lucenedir)/Search/MultiPhraseQuery.cs				\
-	$(lucenedir)/Search/MultiSearcher.cs				\
-	$(lucenedir)/Search/MultiTermQuery.cs				\
-	$(lucenedir)/Search/NonMatchingScorer.cs			\
-	$(lucenedir)/Search/ParallelMultiSearcher.cs			\
-	$(lucenedir)/Search/PhrasePositions.cs				\
-	$(lucenedir)/Search/PhrasePrefixQuery.cs			\
-	$(lucenedir)/Search/PhraseQuery.cs				\
-	$(lucenedir)/Search/PhraseQueue.cs				\
-	$(lucenedir)/Search/PhraseScorer.cs				\
-	$(lucenedir)/Search/PrefixQuery.cs				\
-	$(lucenedir)/Search/Query.cs					\
-	$(lucenedir)/Search/QueryFilter.cs				\
-	$(lucenedir)/Search/QueryTermVector.cs				\
-	$(lucenedir)/Search/RangeFilter.cs				\
-	$(lucenedir)/Search/RangeQuery.cs				\
-	$(lucenedir)/Search/ReqExclScorer.cs				\
-	$(lucenedir)/Search/ReqOptSumScorer.cs				\
-	$(lucenedir)/Search/ScoreDoc.cs					\
-	$(lucenedir)/Search/ScoreDocComparator.cs			\
-	$(lucenedir)/Search/Scorer.cs					\
-	$(lucenedir)/Search/Searchable.cs				\
-	$(lucenedir)/Search/Searcher.cs					\
-	$(lucenedir)/Search/Similarity.cs				\
-	$(lucenedir)/Search/SimilarityDelegator.cs			\
-	$(lucenedir)/Search/SloppyPhraseScorer.cs			\
-	$(lucenedir)/Search/Sort.cs					\
-	$(lucenedir)/Search/SortComparator.cs				\
-	$(lucenedir)/Search/SortComparatorSource.cs			\
-	$(lucenedir)/Search/SortField.cs				\
-	$(lucenedir)/Search/TermQuery.cs				\
-	$(lucenedir)/Search/TermScorer.cs				\
-	$(lucenedir)/Search/TopDocs.cs					\
-	$(lucenedir)/Search/TopFieldDocs.cs				\
-	$(lucenedir)/Search/Weight.cs					\
-	$(lucenedir)/Search/WildcardQuery.cs				\
-	$(lucenedir)/Search/WildcardTermEnum.cs				\
-	$(lucenedir)/Store/BufferedIndexInput.cs			\
-	$(lucenedir)/Store/BufferedIndexOutput.cs			\
-	$(lucenedir)/Store/Directory.cs					\
-	$(lucenedir)/Store/FSDirectory.cs				\
-	$(lucenedir)/Store/IndexInput.cs				\
-	$(lucenedir)/Store/IndexOutput.cs				\
-	$(lucenedir)/Store/InputStream.cs				\
-	$(lucenedir)/Store/Lock.cs					\
-	$(lucenedir)/Store/MMapDirectory.cs				\
-	$(lucenedir)/Store/OutputStream.cs				\
-	$(lucenedir)/Store/RAMDirectory.cs				\
-	$(lucenedir)/Store/RAMFile.cs					\
-	$(lucenedir)/Store/RAMInputStream.cs				\
-	$(lucenedir)/Store/RAMOutputStream.cs				\
-	$(lucenedir)/Util/BitVector.cs					\
-	$(lucenedir)/Util/Constants.cs					\
-	$(lucenedir)/Util/Parameter.cs					\
-	$(lucenedir)/Util/PriorityQueue.cs				\
-	$(lucenedir)/Util/SmallFloat.cs					\
-	$(lucenedir)/Util/StringHelper.cs				\
-	$(lucenedir)/Util/ToStringUtils.cs				\
-	$(lucenedir)/LucenePackage.cs					\
-	$(lucenedir)/SharpZipLibAdapter.cs				\
-	$(lucenedir)/SupportClass.cs
-
-# Stuff we don't build because we don't use it and it
-# introduces additional library dependencies.
-IGNORED_LUCENE_CSFILES =			\
-        $(lucenedir)/Search/RemoteSearchable.cs
-
-LUCENE_CSFILES = $(LUCENE_1_9_CSFILES)
+LUCENE_CSFILES = $(LUCENE_2_1_CSFILES)
 
 ############################################################
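
The two-hundred-line LUCENE_1_9_CSFILES list moves out of Makefile.am; the
included Lucene.Net/Makefile.include is expected to define the corresponding
LUCENE_2_1_CSFILES list for the upgraded sources.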
 

Modified: branches/beagle-rdf/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/NoiseFilter.cs	(original)
+++ branches/beagle-rdf/beagled/NoiseFilter.cs	Mon Mar  3 23:22:43 2008
@@ -38,7 +38,7 @@
 	// 1. Removes words which are potential noise like dhyhy8ju7q9
 	// 2. Splits email addresses into meaningful tokens
 	// 3. Splits hostnames into subparts
-	class NoiseEmailHostFilter : TokenFilter {
+	public class NoiseEmailHostFilter : TokenFilter {
 			
 		private bool tokenize_email_hostname;
 
@@ -131,13 +131,13 @@
 		// Someone might like to search for emails, hostnames and
 		// phone numbers (which fall under type NUM)
 		private static readonly string tokentype_email
-			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.EMAIL];
+			= LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.EMAIL];
 		private static readonly string tokentype_host 
-			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.HOST];
+			= LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.HOST];
 		private static readonly string tokentype_number 
-			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM];
+			= LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.NUM];
 		private static readonly string tokentype_alphanum
-			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.ALPHANUM];
+			= LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.ALPHANUM];
 
 		private bool ProcessToken (ref Lucene.Net.Analysis.Token token)
 		{
@@ -166,10 +166,10 @@
 				if (begin == 0)
 					return ! IsNoise (text);
 				token = new Lucene.Net.Analysis.Token (
-					token.TermText ().Remove (0, begin),
-					token.StartOffset (),
+					text.Remove (0, begin),
+					begin,
 					token.EndOffset (),
-					token.Type ());
+					type);
 				return true;
 			} else if (type == tokentype_email) {
 				if (tokenize_email_hostname)
@@ -184,27 +184,46 @@
 				return ! IsNoise (token.TermText ());
 		}
 
-		private Queue parts = new Queue ();
-		private Lucene.Net.Analysis.Token token;
+		// State for creating smaller tokens from larger email/hostname tokens
+		private string[] parts = null;
+		private int parts_index = -1;
+		private int last_end_offset = -1;
+		private string token_type = null;
 
 		public override Lucene.Net.Analysis.Token Next ()
 		{
-			if (parts.Count != 0) {
-				string part = (string) parts.Dequeue ();
-				Lucene.Net.Analysis.Token part_token;
-				// FIXME: Searching for google.com will not match www.google.com.
-				// If we decide to allow google-style "abcd.1234" which means
-				// "abcd 1234" as a consequtive phrase, then adjusting
-				// the startOffset and endOffset would enable matching
-				// google.com to www.google.com
-				part_token = new Lucene.Net.Analysis.Token (part,
-								       token.StartOffset (),
-								       token.EndOffset (),
-								       token.Type ());
-				part_token.SetPositionIncrement (0);
-				return part_token;
+			if (parts != null) {
+				if (++parts_index < parts.Length) {
+					string part = parts [parts_index];
+					Lucene.Net.Analysis.Token part_token;
+					// FIXME: Searching for google.com will not match www.google.com.
+					// If we decide to allow google-style "abcd.1234" which means
+					// "abcd 1234" as a consequtive phrase, then adjusting
+					// the startOffset and endOffset would enable matching
+					// google.com to www.google.com
+					int start_offset = (parts_index == 0 && token_type == tokentype_email ?
+						0 :
+						last_end_offset + 1); // assuming only one separator
+					int end_offset = start_offset + part.Length;
+					part_token = new Lucene.Net.Analysis.Token (part,
+									       start_offset,
+									       end_offset,
+									       token_type);
+					part_token.SetPositionIncrement (0);
+					last_end_offset = (parts_index == 0 && token_type == tokentype_email ?
+						-1 :
+						end_offset); // assuming only one separator
+					return part_token;
+				} else {
+					// clear the array
+					parts = null;
+					parts_index = -1;
+					last_end_offset = -1;
+					token_type = null;
+				}
 			}
 
+			Token token;
 			while ( (token = token_stream.Next ()) != null) {
 				//Console.WriteLine ("Found token: [{0}]", token.TermText ());
 				if (ProcessToken (ref token))
@@ -213,41 +232,46 @@
 			return null;
 		}
 
-		char[] replace_array = { '@', '.', '-', '_', '+' };
+		private static readonly char[] replace_array = { '@', '.', '-', '_', '+' };
+
 		private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
 		{
+			token_type = tokentype_email;
+
 			string email = token.TermText ();
-			string[] tmp = email.Split (replace_array);
-			int l = tmp.Length;
+			parts = email.Split (replace_array);
+			if (parts.Length == 1) // safety check
+				return;
 
-			// store username part as a large token
 			int index_at = email.IndexOf ('@');
-			tmp [l-1] = email.Substring (0, index_at);
-
-			foreach (string s in tmp)
-				parts.Enqueue (s);
-			
+			// store username part as a large token
+			// and also remove the final tld part
+			Array.Copy (parts, 0, parts, 1, parts.Length - 1);
+			parts [0] = email.Substring (0, index_at);
 		}
 
 		private void ProcessURLToken (Lucene.Net.Analysis.Token token)
 		{
+			token_type = tokentype_host;
+
 			string hostname = token.TermText ();
-			string[] host_parts = hostname.Split ('.');
+			parts = hostname.Split ('.');
+
+			if (parts [0] != "www")
+				return;
 
 			// remove initial www
-			int begin_index = (host_parts [0] == "www" ? 1 : 0);
+			Array.Copy (parts, 1, parts, 0, parts.Length - 1);
+			Array.Resize (ref parts, parts.Length - 1);
 			// FIXME: Remove final tld
 			// Any string of form "<alnum> '.')+<alnum>" has type HOST
 			// Removing last token might remove important words from non-host
 			// string of that form. To fix that, we need to match against the
 			// huge list of TLDs.
-			for (int i = begin_index; i < host_parts.Length; ++i)
-				parts.Enqueue (host_parts [i]);
-
 		}
 	}
 
-#if false
+#if Noisefilter
 	// To build: gmcs NoiseFilter.cs LuceneCommon.cs -r:../Util/Util.dll -r:../BeagleClient/Beagle.dll -r:BeagleDaemonLib.dll
 	public class AnalyzerTest {
 		public static void Main ()
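
The filter no longer queues pre-built tokens; it keeps the split parts plus
an index and re-derives offsets so the sub-tokens line up with the original
text. A worked example of the splitting (hypothetical inputs):

    // ProcessEmailToken ("john.doe@example.com"):
    //   Split on { '@', '.', '-', '_', '+' }  ->  "john", "doe", "example", "com"
    //   shift right, drop the final tld       ->  "john.doe", "john", "doe", "example"
    //   (parts [0] becomes the whole username; each part is emitted with
    //    position increment 0 so it overlaps the original token)
    //
    // ProcessURLToken ("www.google.com"):
    //   Split on '.'        ->  "www", "google", "com"
    //   strip leading "www" ->  "google", "com"   (tld kept; see the FIXME)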

Modified: branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs	(original)
+++ branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs	Mon Mar  3 23:22:43 2008
@@ -60,7 +60,13 @@
 				throw new System.SystemException(e.ToString());
 			}
 		}
-		
+
+		public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
+		{
+			this.stemmer = stemmer;
+			this.stemMethod = stemMethod;
+		}
+
 		/// <summary>Returns the next input Token, after being stemmed </summary>
         public override Token Next()
 		{
@@ -81,5 +87,12 @@
 			newToken.SetPositionIncrement(token.GetPositionIncrement());
 			return newToken;
 		}
+
+		public override void Close()
+		{
+			// In case stemmer was shared
+			stemmer.SetCurrent(String.Empty);
+			base.Close();
+		}
 	}
-}
\ No newline at end of file
+}
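
The new constructor lets BeagleAnalyzer hand in a cached SnowballProgram and
its reflected Stem() method (see GetStemmer in the LuceneCommon.cs hunk),
and Close() now resets the shared stemmer so it does not pin the last input
string between uses. The call site, per that hunk:

    StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
    outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);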


