beagle r4395 - in branches/beagle-lucene-2_0_004/beagled/Lucene.Net: Index Search
- From: kkubasik svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4395 - in branches/beagle-lucene-2_0_004/beagled/Lucene.Net: Index Search
- Date: Thu, 17 Jan 2008 18:54:23 +0000 (GMT)
Author: kkubasik
Date: Thu Jan 17 18:54:23 2008
New Revision: 4395
URL: http://svn.gnome.org/viewvc/beagle?rev=4395&view=rev
Log:
Most of our patches fixed/reapplied where still relevant
Modified:
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs
branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FieldsReader.cs Thu Jan 17 18:54:23 2008
@@ -15,11 +15,13 @@
* limitations under the License.
*/
-using System;
+using System;
+using System.Collections;
+using System.Collections.Generic;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Directory = Lucene.Net.Store.Directory;
-using IndexInput = Lucene.Net.Store.IndexInput;
+using IndexInput = Lucene.Net.Store.IndexInput;
namespace Lucene.Net.Index
{
@@ -145,7 +147,151 @@
}
return doc;
- }
+ }
+ public /*internal*/ Document Doc(int n, string[] fields)
+ {
+
+ if (fields.Length == 0)
+ return Doc (n);
+
+
+ // FIXME: use Hashset
+
+ System.Collections.Specialized.StringCollection field_list = new System.Collections.Specialized.StringCollection();
+ field_list.AddRange(fields);
+ int num_required_fields = field_list.Count;
+
+ indexStream.Seek(n * 8L);
+ long position = indexStream.ReadLong();
+ fieldsStream.Seek(position);
+
+ Document doc = new Document();
+ int numFields = fieldsStream.ReadVInt();
+ for (int i = 0; i < numFields && num_required_fields > 0; i++)
+ {
+ int fieldNumber = fieldsStream.ReadVInt();
+ FieldInfo fi = fieldInfos.FieldInfo(fieldNumber);
+ if (field_list.Contains (fi.name)) {
+ num_required_fields --;
+
+ byte bits = fieldsStream.ReadByte();
+
+ bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+ bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+
+ if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0)
+ {
+ byte[] b = new byte[fieldsStream.ReadVInt()];
+ fieldsStream.ReadBytes(b, 0, b.Length);
+ if (compressed)
+ doc.Add(new Field(fi.name, Uncompress(b), Field.Store.COMPRESS));
+ else
+ doc.Add(new Field(fi.name, b, Field.Store.YES));
+ }
+ else
+ {
+ Field.Index index;
+ Field.Store store = Field.Store.YES;
+
+ if (fi.isIndexed && tokenize)
+ index = Field.Index.TOKENIZED;
+ else if (fi.isIndexed && !tokenize)
+ index = Field.Index.UN_TOKENIZED;
+ else
+ index = Field.Index.NO;
+
+ Field.TermVector termVector = null;
+ if (fi.storeTermVector)
+ {
+ if (fi.storeOffsetWithTermVector)
+ {
+ if (fi.storePositionWithTermVector)
+ {
+ termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
+ }
+ else
+ {
+ termVector = Field.TermVector.WITH_OFFSETS;
+ }
+ }
+ else if (fi.storePositionWithTermVector)
+ {
+ termVector = Field.TermVector.WITH_POSITIONS;
+ }
+ else
+ {
+ termVector = Field.TermVector.YES;
+ }
+ }
+ else
+ {
+ termVector = Field.TermVector.NO;
+ }
+
+ if (compressed)
+ {
+ store = Field.Store.COMPRESS;
+ byte[] b = new byte[fieldsStream.ReadVInt()];
+ fieldsStream.ReadBytes(b, 0, b.Length);
+ Field f = new Field(fi.name, System.Text.Encoding.GetEncoding("UTF-8").GetString(Uncompress(b)), store, index, termVector);
+ f.SetOmitNorms(fi.omitNorms);
+ doc.Add(f);
+ }
+ else
+ {
+ Field f = new Field(fi.name, fieldsStream.ReadString(), store, index, termVector);
+ f.SetOmitNorms(fi.omitNorms);
+ doc.Add(f);
+ }
+ }
+ } else {
+ byte bits = fieldsStream.ReadByte();
+
+ bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+ bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+
+ if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0)
+ {
+ //byte[] b = new byte[fieldsStream.ReadVInt()];
+ //fieldsStream.ReadBytes(b, 0, b.Length);
+ int length = fieldsStream.ReadVInt();
+ for (int j = 0; j < length; j++)
+ fieldsStream.ReadByte ();
+ }
+ else
+ {
+ if (compressed)
+ {
+ //byte[] b = new byte[fieldsStream.ReadVInt()];
+ //fieldsStream.ReadBytes(b, 0, b.Length);
+ int length = fieldsStream.ReadVInt();
+ for (int j = 0; j < length; j++)
+ fieldsStream.ReadByte ();
+ }
+ else
+ {
+ //fieldsStream.ReadString ();
+ int length = fieldsStream.ReadVInt();
+ for (int j = 0; j < length; j++)
+ {
+ byte b = fieldsStream.ReadByte ();
+ if ((b & 0x80) == 0)
+ continue;
+ else if ((b & 0xE0) != 0xE0) {
+ fieldsStream.ReadByte ();
+ } else {
+ fieldsStream.ReadByte ();
+ fieldsStream.ReadByte ();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return doc;
+ }
+
private byte[] Uncompress(byte[] input)
{
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/FilterIndexReader.cs Thu Jan 17 18:54:23 2008
@@ -155,6 +155,10 @@
public override Document Document(int n)
{
return in_Renamed.Document(n);
+ }
+ public override Document Document(int n, string[] fields)
+ {
+ return in_Renamed.Document(n, fields);
}
public override bool IsDeleted(int n)
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexReader.cs Thu Jan 17 18:54:23 2008
@@ -448,6 +448,12 @@
/// <code>Document</code> in this index.
/// </summary>
public abstract Document Document(int n);
+
+
+ /// <summary>Returns the specified fields of the <code>n</code><sup>th</sup>
+ /// <code>Document</code> in this index.
+ /// </summary>
+ public abstract Document Document(int n, string[] fields);
/// <summary>Returns true if document <i>n</i> has been deleted </summary>
public abstract bool IsDeleted(int n);
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/IndexWriter.cs Thu Jan 17 18:54:23 2008
@@ -228,6 +228,8 @@
private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
private Directory ramDirectory = new RAMDirectory(); // for temp segs
+
+ private int singleDocSegmentsCount = 0; // for speeding decision on merge candidates
private Lock writeLock;
@@ -626,7 +628,8 @@
dw.AddDocument(segmentName, doc);
lock (this)
{
- segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
+ segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
+ singleDocSegmentsCount++;
MaybeMergeSegments();
}
}
@@ -691,6 +694,8 @@
{
int minSegment = segmentInfos.Count - mergeFactor;
MergeSegments(minSegment < 0?0:minSegment);
+
+ segmentInfos.Optimize (directory);
}
}
}
@@ -820,9 +825,9 @@
long targetMergeDocs = minMergeDocs;
while (targetMergeDocs <= maxMergeDocs)
{
- // find segments smaller than current target size
- int minSegment = segmentInfos.Count;
- int mergeDocs = 0;
+ // find segments smaller than current target size
+ int minSegment = segmentInfos.Count - singleDocSegmentsCount; // top 1-doc segments are taken for sure
+ int mergeDocs = singleDocSegmentsCount;
while (--minSegment >= 0)
{
SegmentInfo si = segmentInfos.Info(minSegment);
@@ -830,11 +835,12 @@
break;
mergeDocs += si.docCount;
}
-
- if (mergeDocs >= targetMergeDocs)
- // found a merge to do
- MergeSegments(minSegment + 1);
- else
+
+ if (mergeDocs >= targetMergeDocs) {
+ // found a merge to do
+ MergeSegments(minSegment + 1);
+ singleDocSegmentsCount = 0;
+ }else
break;
targetMergeDocs *= mergeFactor; // increase target size
@@ -1008,4 +1014,4 @@
directory.RenameFile("deleteable.new", IndexFileNames.DELETABLE);
}
}
-}
\ No newline at end of file
+}
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/MultiReader.cs Thu Jan 17 18:54:23 2008
@@ -116,7 +116,11 @@
int i = ReaderIndex(n); // find segment num
return subReaders[i].Document(n - starts[i]); // dispatch to segment reader
}
-
+ public override Document Document(int n, string[] fields)
+ {
+ int i = ReaderIndex(n); // find segment num
+ return subReaders[i].Document(n - starts[i], fields); // dispatch to segment reader
+ }
public override bool IsDeleted(int n)
{
int i = ReaderIndex(n); // find segment num
@@ -471,14 +475,15 @@
/// <summary>As yet unoptimized implementation. </summary>
public virtual bool SkipTo(int target)
- {
- do
- {
- if (!Next())
- return false;
- }
- while (target > Doc());
- return true;
+ {
+ if (current != null && current.SkipTo (target - base_Renamed)) {
+ return true;
+ } else if (pointer < readers.Length) {
+ base_Renamed = starts [pointer];
+ current = TermDocs (pointer++);
+ return SkipTo (target);
+ } else
+ return false;
}
private TermDocs TermDocs(int i)
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/ParallelReader.cs Thu Jan 17 18:54:23 2008
@@ -159,6 +159,10 @@
}
}
return result;
+ }
+ public override Document Document(int n, string[] fields)
+ {
+ throw new System.NotSupportedException ();
}
// get all vectors
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentInfos.cs Thu Jan 17 18:54:23 2008
@@ -150,6 +150,27 @@
SegmentInfos sis = new SegmentInfos();
sis.Read(directory);
return sis.GetVersion();
+ }
+
+ public void Optimize(Directory directory)
+ {
+ string[] files = directory.List();
+
+ System.Collections.ArrayList segment_names = new System.Collections.ArrayList();
+ foreach (SegmentInfo si in this)
+ segment_names.Add (si.name);
+
+ foreach (string file in files) {
+ string basename = System.IO.Path.GetFileNameWithoutExtension (file);
+ if (segment_names.Contains (basename))
+ continue;
+
+ if (basename == IndexFileNames.DELETABLE || basename == IndexFileNames.SEGMENTS)
+ continue;
+
+ Console.WriteLine ("WARNING! Deleting stale data {0}", file);
+ directory.DeleteFile (file);
+ }
}
}
}
\ No newline at end of file
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/SegmentReader.cs Thu Jan 17 18:54:23 2008
@@ -327,6 +327,15 @@
return fieldsReader.Doc(n);
}
}
+ public override Document Document(int n, string[] fields)
+ {
+ lock (this)
+ {
+ if (IsDeleted(n))
+ throw new System.ArgumentException("attempt to access a deleted document");
+ return fieldsReader.Doc(n, fields);
+ }
+ }
public override bool IsDeleted(int n)
{
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Index/TermBuffer.cs Thu Jan 17 18:54:23 2008
@@ -87,12 +87,13 @@
// copy text into the buffer
SetTextLength(term.Text().Length);
- System.String sourceString = term.Text();
- int sourceEnd = term.Text().Length;
- for (int i = 0; i < sourceEnd; i++)
- {
- text[i] = (char) sourceString[i];
- }
+// System.String sourceString = term.Text();
+// int sourceEnd = term.Text().Length;
+// for (int i = 0; i < sourceEnd; i++)
+// {
+// text[i] = (char) sourceString[i];
+// }
+ text = term.Text().ToCharArray();
this.field = term.Field();
this.term = term;
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Hits.cs Thu Jan 17 18:54:23 2008
@@ -114,7 +114,16 @@
}
return hitDoc.doc;
- }
+ }
+ /// <summary>Returns the requested fields of the n<sup>th</sup> document in this set.
+ /// <p>Documents are not cached since they could be fetched using a different set of fields.
+ /// </summary>
+ public Document Doc(int n, string[] fields)
+ {
+ HitDoc hitDoc = HitDoc(n);
+
+ return searcher.Doc(hitDoc.id, fields);
+ }
/// <summary>Returns the score for the nth document in this set. </summary>
public float Score(int n)
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/IndexSearcher.cs Thu Jan 17 18:54:23 2008
@@ -130,6 +130,11 @@
{
return reader.Document(i);
}
+
+ public override Document Doc(int i, string[] fields)
+ {
+ return reader.Document(i, fields);
+ }
// inherit javadoc
public override int MaxDoc()
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/MultiSearcher.cs Thu Jan 17 18:54:23 2008
@@ -118,7 +118,12 @@
public override Document Doc(int i)
{
throw new System.NotSupportedException();
- }
+ }
+ public override Document Doc(int i, string[] fields)
+ {
+ throw new System.NotSupportedException();
+ }
+
public override Explanation Explain(Weight weight, int doc)
{
@@ -185,7 +190,14 @@
for (int i = 0; i < searchables.Length; i++)
docFreq += searchables[i].DocFreq(term);
return docFreq;
- }
+ }
+
+ public override Document Doc(int n, string[] fields)
+ {
+ int i = SubSearcher(n); // find searcher index
+ return searchables[i].Doc(n - starts[i], fields); // dispatch to searcher
+ }
+
// inherit javadoc
public override Document Doc(int n)
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searchable.cs Thu Jan 17 18:54:23 2008
@@ -100,7 +100,14 @@
/// </summary>
/// <seealso cref="IndexReader#document(int)">
/// </seealso>
- Document Doc(int i);
+ Document Doc(int i);
+
+ /// <summary>Expert: Returns the requested <code>fields</code> of document <code>i</code>.
+ /// Called by <see cref="HitCollector"/> implementations.
+ /// </summary>
+ /// <seealso cref="IndexReader#document(int,string[])">
+ /// </seealso>
+ Document Doc(int i, string[] fields);
/// <summary>Expert: called to re-write queries into primitive queries.</summary>
/// <throws> BooleanQuery.TooManyClauses </throws>
Modified: branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs
==============================================================================
--- branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs (original)
+++ branches/beagle-lucene-2_0_004/beagled/Lucene.Net/Search/Searcher.cs Thu Jan 17 18:54:23 2008
@@ -206,7 +206,8 @@
abstract public int DocFreq(Term term);
abstract public int MaxDoc();
abstract public TopDocs Search(Weight weight, Filter filter, int n);
- abstract public Document Doc(int i);
+ abstract public Document Doc(int i);
+ abstract public Document Doc(int i, string[] fields);
abstract public Query Rewrite(Query query);
abstract public Explanation Explain(Weight weight, int doc);
abstract public TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort);
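For reference, a minimal sketch of how the field-restricted lookup added in this revision might be used from calling code. It assumes the patched Lucene.Net 2.0 assembly from this branch is referenced; the index path, query term, and field names ("Text", "Uri", "Timestamp") are illustrative only.

using System;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;

class FieldRestrictedDocExample
{
    static void Main()
    {
        // Placeholder path; point this at a real index directory.
        IndexSearcher searcher = new IndexSearcher("/path/to/index");
        Query query = new TermQuery(new Term("Text", "beagle"));

        Hits hits = searcher.Search(query);

        // Only the named stored fields are materialized; the other stored
        // fields are skipped while reading the fields stream, which is what
        // the new FieldsReader.Doc(int, string[]) path is for.
        string[] wanted = new string[] { "Uri", "Timestamp" };

        for (int i = 0; i < hits.Length(); i++)
        {
            Document doc = hits.Doc(i, wanted);
            Console.WriteLine("{0}\t{1}", doc.Get("Uri"), doc.Get("Timestamp"));
        }

        searcher.Close();
    }
}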