beagle r4395 - in branches/beagle-lucene-2_0_004/beagled/Lucene.Net: Index Search



Author: kkubasik
Date: Thu Jan 17 18:54:23 2008
New Revision: 4395
URL: http://svn.gnome.org/viewvc/beagle?rev=4395&view=rev

Log:
Most of our patches fixed/reapplied where still relevant

Modified:
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs
   branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs	Thu Jan 17 18:54:23 2008
@@ -15,11 +15,13 @@
  * limitations under the License.
  */
 
-using System;
+using System;
+using System.Collections;
+using System.Collections.Generic;
 using Document = Lucene.Net.Documents.Document;
 using Field = Lucene.Net.Documents.Field;
 using Directory = Lucene.Net.Store.Directory;
-using IndexInput = Lucene.Net.Store.IndexInput;
+using IndexInput = Lucene.Net.Store.IndexInput;
 
 namespace Lucene.Net.Index
 {
@@ -145,7 +147,151 @@
 			}
 			
 			return doc;
-		}
+		}
+		public /*internal*/ Document Doc(int n, string[] fields)
+		{
+		
+			if (fields.Length == 0)
+				return Doc (n);
+
+			
+			// FIXME: use Hashset
+			
+			System.Collections.Specialized.StringCollection field_list = new System.Collections.Specialized.StringCollection();
+			field_list.AddRange(fields);
+			int num_required_fields = field_list.Count;
+
+			indexStream.Seek(n * 8L);
+			long position = indexStream.ReadLong();
+			fieldsStream.Seek(position);
+			
+			Document doc = new Document();
+			int numFields = fieldsStream.ReadVInt();
+			for (int i = 0; i < numFields && num_required_fields > 0; i++)
+			{
+				int fieldNumber = fieldsStream.ReadVInt();
+				FieldInfo fi = fieldInfos.FieldInfo(fieldNumber);
+				if (field_list.Contains (fi.name)) {
+					num_required_fields --;	
+
+					byte bits = fieldsStream.ReadByte();
+					
+					bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+					bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+					
+					if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0)
+					{
+						byte[] b = new byte[fieldsStream.ReadVInt()];
+						fieldsStream.ReadBytes(b, 0, b.Length);
+						if (compressed)
+							doc.Add(new Field(fi.name, Uncompress(b), Field.Store.COMPRESS));
+						else
+							doc.Add(new Field(fi.name, b, Field.Store.YES));
+					}
+					else
+					{
+						Field.Index index;
+						Field.Store store = Field.Store.YES;
+						
+						if (fi.isIndexed && tokenize)
+							index = Field.Index.TOKENIZED;
+						else if (fi.isIndexed && !tokenize)
+							index = Field.Index.UN_TOKENIZED;
+						else
+							index = Field.Index.NO;
+						
+						Field.TermVector termVector = null;
+						if (fi.storeTermVector)
+						{
+							if (fi.storeOffsetWithTermVector)
+							{
+								if (fi.storePositionWithTermVector)
+								{
+									termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
+								}
+								else
+								{
+									termVector = Field.TermVector.WITH_OFFSETS;
+								}
+							}
+							else if (fi.storePositionWithTermVector)
+							{
+								termVector = Field.TermVector.WITH_POSITIONS;
+							}
+							else
+							{
+								termVector = Field.TermVector.YES;
+							}
+						}
+						else
+						{
+							termVector = Field.TermVector.NO;
+						}
+						
+						if (compressed)
+						{
+							store = Field.Store.COMPRESS;
+							byte[] b = new byte[fieldsStream.ReadVInt()];
+							fieldsStream.ReadBytes(b, 0, b.Length);
+							Field f = new Field(fi.name, System.Text.Encoding.GetEncoding("UTF-8").GetString(Uncompress(b)), store, index, termVector);
+							f.SetOmitNorms(fi.omitNorms);
+							doc.Add(f);
+						}
+						else
+						{
+							Field f = new Field(fi.name, fieldsStream.ReadString(), store, index, termVector);
+							f.SetOmitNorms(fi.omitNorms);
+							doc.Add(f);
+						}
+					}
+				} else {
+					byte bits = fieldsStream.ReadByte();
+					
+					bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+					bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+					
+					if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0)
+					{
+						//byte[] b = new byte[fieldsStream.ReadVInt()];
+						//fieldsStream.ReadBytes(b, 0, b.Length);
+						int length = fieldsStream.ReadVInt();
+						for (int j = 0; j < length; j++)
+							fieldsStream.ReadByte ();
+					}
+					else
+					{
+						if (compressed)
+						{
+							//byte[] b = new byte[fieldsStream.ReadVInt()];
+							//fieldsStream.ReadBytes(b, 0, b.Length);
+							int length = fieldsStream.ReadVInt();
+							for (int j = 0; j < length; j++)
+								fieldsStream.ReadByte ();
+						}
+						else
+						{
+							//fieldsStream.ReadString ();
+							int length = fieldsStream.ReadVInt();
+							for (int j = 0; j < length; j++)
+							{
+								byte b = fieldsStream.ReadByte ();
+								if ((b & 0x80) == 0)
+									continue;
+								else if ((b & 0xE0) != 0xE0) {
+									fieldsStream.ReadByte ();
+								} else {
+									fieldsStream.ReadByte ();
+									fieldsStream.ReadByte ();
+								}
+							}
+						}
+					}
+				}
+			}
+			
+			return doc;
+		}
+		
 		
 		private byte[] Uncompress(byte[] input)
 		{

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs	Thu Jan 17 18:54:23 2008
@@ -155,6 +155,10 @@
 		public override Document Document(int n)
 		{
 			return in_Renamed.Document(n);
+		}
+		public override Document Document(int n, string[] fields)
+		{
+			return in_Renamed.Document(n, fields);
 		}
 		
 		public override bool IsDeleted(int n)

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs	Thu Jan 17 18:54:23 2008
@@ -448,6 +448,12 @@
 		/// <code>Document</code> in this index. 
 		/// </summary>
 		public abstract Document Document(int n);
+		
+		
+		/// <summary>Returns the specified fields of the <code>n</code><sup>th</sup>
+		/// <code>Document</code> in this index. 
+		/// </summary>
+		public abstract Document Document(int n, string[] fields);
 		
 		/// <summary>Returns true if document <i>n</i> has been deleted </summary>
 		public abstract bool IsDeleted(int n);

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs	Thu Jan 17 18:54:23 2008
@@ -228,6 +228,8 @@
 		
 		private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
 		private Directory ramDirectory = new RAMDirectory(); // for temp segs
+		
+		private int singleDocSegmentsCount = 0; // for speeding decision on merge candidates
 		
 		private Lock writeLock;
 		
@@ -626,7 +628,8 @@
 			dw.AddDocument(segmentName, doc);
 			lock (this)
 			{
-				segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
+				segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
+				singleDocSegmentsCount++;
 				MaybeMergeSegments();
 			}
 		}
@@ -691,6 +694,8 @@
 				{
 					int minSegment = segmentInfos.Count - mergeFactor;
 					MergeSegments(minSegment < 0?0:minSegment);
+
+				segmentInfos.Optimize (directory);
 				}
 			}
 		}
@@ -820,9 +825,9 @@
 			long targetMergeDocs = minMergeDocs;
 			while (targetMergeDocs <= maxMergeDocs)
 			{
-				// find segments smaller than current target size
-				int minSegment = segmentInfos.Count;
-				int mergeDocs = 0;
+				// find segments smaller than current target size
+				int minSegment = segmentInfos.Count - singleDocSegmentsCount; // top 1-doc segments are taken for sure
+				int mergeDocs = singleDocSegmentsCount;
 				while (--minSegment >= 0)
 				{
 					SegmentInfo si = segmentInfos.Info(minSegment);
@@ -830,11 +835,12 @@
 						break;
 					mergeDocs += si.docCount;
 				}
-				
-				if (mergeDocs >= targetMergeDocs)
-				// found a merge to do
-					MergeSegments(minSegment + 1);
-				else
+
+				if (mergeDocs >= targetMergeDocs) {
+					// found a merge to do
+ 					MergeSegments(minSegment + 1);
+					singleDocSegmentsCount = 0;
+				}else
 					break;
 				
 				targetMergeDocs *= mergeFactor; // increase target size
@@ -1008,4 +1014,4 @@
 			directory.RenameFile("deleteable.new", IndexFileNames.DELETABLE);
 		}
 	}
-}
\ No newline at end of file
+}

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs	Thu Jan 17 18:54:23 2008
@@ -116,7 +116,11 @@
 			int i = ReaderIndex(n); // find segment num
 			return subReaders[i].Document(n - starts[i]); // dispatch to segment reader
 		}
-		
+		public override Document Document(int n, string[] fields)
+		{
+			int i = ReaderIndex(n); // find segment num
+			return subReaders[i].Document(n - starts[i], fields); // dispatch to segment reader
+		}
 		public override bool IsDeleted(int n)
 		{
 			int i = ReaderIndex(n); // find segment num
@@ -471,14 +475,15 @@
 		
 		/// <summary>As yet unoptimized implementation. </summary>
 		public virtual bool SkipTo(int target)
-		{
-			do 
-			{
-				if (!Next())
-					return false;
-			}
-			while (target > Doc());
-			return true;
+		{
+			if (current != null && current.SkipTo (target - base_Renamed)) {
+				return true;
+			} else if (pointer < readers.Length) {
+				base_Renamed = starts [pointer];
+				current = TermDocs (pointer++);
+				return SkipTo (target);
+			} else
+				return false;
 		}
 		
 		private TermDocs TermDocs(int i)

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs	Thu Jan 17 18:54:23 2008
@@ -159,6 +159,10 @@
 				}
 			}
 			return result;
+		}
+		public override Document Document(int n, string[] fields)
+		{
+			throw new System.NotSupportedException ();
 		}
 		
 		// get all vectors

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs	Thu Jan 17 18:54:23 2008
@@ -150,6 +150,27 @@
 			SegmentInfos sis = new SegmentInfos();
 			sis.Read(directory);
 			return sis.GetVersion();
+		}
+		
+		public void Optimize(Directory directory)
+		{
+			string[] files = directory.List();
+
+			System.Collections.ArrayList segment_names = new System.Collections.ArrayList();
+			foreach (SegmentInfo si in this)
+				segment_names.Add (si.name);
+
+			foreach (string file in files) {
+				string basename = System.IO.Path.GetFileNameWithoutExtension (file);
+				if (segment_names.Contains (basename))
+					continue;
+
+				if (basename == IndexFileNames.DELETABLE || basename == IndexFileNames.SEGMENTS)
+					continue;
+
+				Console.WriteLine ("WARNING! Deleting stale data {0}", file);
+				directory.DeleteFile (file);
+			}
 		}
 	}
 }
\ No newline at end of file

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs	Thu Jan 17 18:54:23 2008
@@ -327,6 +327,15 @@
                 return fieldsReader.Doc(n);
             }
         }
+		public override Document Document(int n, string[] fields)
+		{
+			lock (this)
+			{
+				if (IsDeleted(n))
+					throw new System.ArgumentException("attempt to access a deleted document");
+				return fieldsReader.Doc(n, fields);
+			}
+		}		
 		
         public override bool IsDeleted(int n)
         {

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs	Thu Jan 17 18:54:23 2008
@@ -87,12 +87,13 @@
 			// copy text into the buffer
 			SetTextLength(term.Text().Length);
 
-            System.String sourceString = term.Text();
-            int sourceEnd = term.Text().Length;
-            for (int i = 0; i < sourceEnd; i++)
-            {
-                text[i] = (char) sourceString[i];
-            }
+//            System.String sourceString = term.Text();
+//            int sourceEnd = term.Text().Length;
+//            for (int i = 0; i < sourceEnd; i++)
+//            {
+//                text[i] = (char) sourceString[i];
+//            }
+			text = term.Text().ToCharArray();
 			
 			this.field = term.Field();
 			this.term = term;

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs	Thu Jan 17 18:54:23 2008
@@ -114,7 +114,16 @@
 			}
 			
 			return hitDoc.doc;
-		}
+		}
		
+		/// <summary>Returns the requested fields of the n<sup>th</sup> document in this set.
+		/// <p>Documents are not cached since they could be fetched using different set of fields.
+		/// </summary>
+		public Document Doc(int n, string[] fields)
+		{
+			HitDoc hitDoc = HitDoc(n);
+			
+			return searcher.Doc(hitDoc.id, fields);
+		}		
 		
 		/// <summary>Returns the score for the nth document in this set. </summary>
 		public float Score(int n)

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs	Thu Jan 17 18:54:23 2008
@@ -130,6 +130,11 @@
 		{
 			return reader.Document(i);
 		}
+
+		public override Document Doc(int i, string[] fields)
+		{
+			return reader.Document(i, fields);
+		}
 		
 		// inherit javadoc
 		public override int MaxDoc()

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs	Thu Jan 17 18:54:23 2008
@@ -118,7 +118,12 @@
 			public override Document Doc(int i)
 			{
 				throw new System.NotSupportedException();
-			}
+			}
+			public override Document Doc(int i, string[] fields)
+			{
+				throw new System.NotSupportedException();
+			}
+			
 			
 			public override Explanation Explain(Weight weight, int doc)
 			{
@@ -185,7 +190,14 @@
 			for (int i = 0; i < searchables.Length; i++)
 				docFreq += searchables[i].DocFreq(term);
 			return docFreq;
-		}
+		}
+		
+		public override Document Doc(int n, string[] fields)
+		{
+			int i = SubSearcher(n); // find searcher index
+			return searchables[i].Doc(n - starts[i], fields); // dispatch to searcher
+		}
+		
 		
 		// inherit javadoc
 		public override Document Doc(int n)

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs	Thu Jan 17 18:54:23 2008
@@ -100,7 +100,14 @@
         /// </summary>
         /// <seealso cref="IndexReader#document(int)">
         /// </seealso>
-        Document Doc(int i);
+        Document Doc(int i);
+		
+		 /// <summary>Expert: Returns the requested <code>fields</code> of document <code>i</code>.
+        /// Called by {@link HitCollector} implementations.
+        /// </summary>
+        /// <seealso cref="IndexReader#document(int,string[])">
+        /// </seealso>
+        Document Doc(int i, string[] fields);
 		
         /// <summary>Expert: called to re-write queries into primitive queries.</summary>
         /// <throws>  BooleanQuery.TooManyClauses </throws>

Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs	(original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs	Thu Jan 17 18:54:23 2008
@@ -206,7 +206,8 @@
 		abstract public int DocFreq(Term term);
 		abstract public int MaxDoc();
 		abstract public TopDocs Search(Weight weight, Filter filter, int n);
-		abstract public Document Doc(int i);
+		abstract public Document Doc(int i);
+		abstract public Document Doc(int i, string[] fields);
 		abstract public Query Rewrite(Query query);
 		abstract public Explanation Explain(Weight weight, int doc);
 		abstract public TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]