IGNORE MY PREVIOUS PATCH [Re: Patch: MS Word Filter using wv1]
- From: Veerapuram Varadhan <vvaradhan novell com>
- To: dashboard-hackers gnome org
- Cc: Jon Trowbridge <trow ximian com>
- Subject: IGNORE MY PREVIOUS PATCH [Re: Patch: MS Word Filter using wv1]
- Date: Tue, 16 Nov 2004 21:59:17 -0800
On Tue, 2004-11-16 at 21:26 -0800, Veerapuram Varadhan wrote:
> Hello Every1,
>
> Please find attached the patch for MS WORD filter.
Kindly *IGNORE* my previous patch; I forgot to *ADD the CHANGELOG* entry to
it. The patch attached herewith is the updated, latest one.
Thanks,
V. Varadhan.
? glue/wv1-glue.c
Index: configure.in
===================================================================
RCS file: /cvs/gnome/beagle/configure.in,v
retrieving revision 1.55
diff -u -r1.55 configure.in
--- configure.in 12 Nov 2004 15:03:22 -0000 1.55
+++ configure.in 16 Nov 2004 16:49:24 -0000
@@ -300,6 +300,16 @@
dnl ----------------------------------------------
+dnl For the wv1 glue
+
+PKG_CHECK_MODULES(WV1, wv-1.0, enable_wv1=yes, enable_wv1=no)
+AM_CONDITIONAL(ENABLE_WV1, test "x$enable_wv1" = "xyes")
+
+WV1_LIBS=`$PKG_CONFIG --libs wv-1.0`
+AC_SUBST(WV1_LIBS)
+
+dnl ----------------------------------------------
+
dnl Searchomatic
PKG_CHECK_MODULES(SEARCHOMATIC, gtk+-2.0)
@@ -350,6 +360,7 @@
gst-sharp? ${enable_gst_sharp}
Epiphany Extension? ${enable_epiphany_extension}
Mozilla Extension? yes
+ wv1? ${enable_wv1}
Enable Network ${enable_network}"
Index: Filters/FilterDOC.cs
===================================================================
RCS file: /cvs/gnome/beagle/Filters/FilterDOC.cs,v
retrieving revision 1.2
diff -u -r1.2 FilterDOC.cs
--- Filters/FilterDOC.cs 15 Oct 2004 13:38:05 -0000 1.2
+++ Filters/FilterDOC.cs 16 Nov 2004 16:49:24 -0000
@@ -1,8 +1,13 @@
//
// FilterDOC.cs
//
-// Copyright (C) 2004 Novell, Inc.
+// FilterDOC.cs : Trivial implementation of a MS Word-document filter.
+// This filter uses wv1 library - http://wvware.sourceforge.net/
+//
+// Author: Veerapuram Varadhan <vvaradhan novell com>
//
+// Copyright (C) 2004 Novell, Inc.
+//
//
// Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,257 +31,190 @@
using System;
-using System.Collections;
using System.IO;
-using System.Text;
-using Gsf;
+using System.Runtime.InteropServices;
+using Beagle.Util;
namespace Beagle.Filters {
- public class FilterDOC : FilterOle {
-
- int fib_ccpText;
- int fib_fcclx;
- Hashtable pieceItems;
- Hashtable pieceIndices;
+ public class FilterDOC : Beagle.Daemon.Filter {
+ string FileName;
- public FilterDOC ()
- {
- AddSupportedMimeType ("application/msword");
- }
+ //////////////////////////////////////////////////////////
- // Note: This code to find the Real "FC" inside a "WordDocument"
- // stream is copied from wv2. Word specification (!) says something about
- // the calculation, but not so clear. Gotta crack it!!
-
- private void FindRealFC (ref int fc, ref bool unicode)
- {
- if ( (fc & 0x40000000) > 0 ) {
- fc = (int)( fc & 0xbfffffff ) >> 1;
- unicode = false;
- }
- else
- unicode = true;
- }
- public string getTableStreamName (Gsf.Input stream)
- {
- int fWhichTblStm;
- int length = 0;
- stream.SeekEmulate (0x0A);
- byte [] data = stream.Read (2);
- if (data == null) {
- Console.WriteLine ("Data is null");
- return "";
- }
+ private delegate void TextHandlerCallback (IntPtr byteArray, int dataLen, bool hotText);
+ //private delegate void TextHandlerCallback (string data, int dataLen, bool hotText);
- int fFlagSet = 0;
+ [DllImport ("wv1glue")]
+ private static extern int wv1_glue_init_doc_parsing (string fname, TextHandlerCallback callback);
- for (int i = 1; i > -1; i--) {
- fFlagSet <<= 8;
- fFlagSet |= 0xff & data[i];
- }
-
- // find out the table stream correspoding to this document
- fWhichTblStm = fFlagSet & 0x0200;
+ [DllImport ("wv1glue")]
+ private static extern IntPtr wv1_glue_get_ole_stream (string fname);
- if (fWhichTblStm > 0)
- return ("1Table");
- else
- return ("0Table");
- }
+ [DllImport ("wv1glue")]
+ private static extern IntPtr wv1_glue_get_ole_summary_stream (IntPtr oleStream);
- private void readPieceTable (Gsf.Input tblStream,
- int fib_fcclx)
- {
- int size = 0;
- tblStream.SeekEmulate (fib_fcclx);
- byte[] data = tblStream.Read (1);
- while (((clxtType)data[0]) == clxtType.clxtGrpprl) {
- data = tblStream.Read (2);
- size = GetInt16 (data, 0);
- tblStream.SeekEmulate (size);
-
- data = tblStream.Read (1);
- }
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_title (IntPtr smryStream);
- if (((clxtType)data[0]) == clxtType.clxtPlcfpcd) {
- int cntPCDs = 0;
- data = tblStream.Read (4);
- size = GetInt32 (data, 0);
-
- pieceItems = new Hashtable ();
- pieceIndices = new Hashtable ();
-
- // 8 is the sizeof a PCD
- if (((size - 4) % (8 + 4)) > 0)
- cntPCDs = 0;
- else
- cntPCDs = (size - 4) / (8 + 4);
-
- // n+1 CP/FCs
- for (int i = 0; i < (cntPCDs+1); i++) {
- data = tblStream.Read (4);
- pieceIndices.Add (i, GetInt32 (data, 0));
- }
-
- // n PCDs
- for (int i = 0; i < cntPCDs; i++) {
- //Console.WriteLine ("Current tablestream pos : {0}",
- //tblStream.Tell());
- //data = tblStream.Read (4);
- pieceItems.Add (i, new PCD (tblStream));
- }
- }
-
- }
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_subject (IntPtr smryStream);
- private StringBuilder StripOffWordSpecificChars (string strValue)
- {
- if (strValue == null)
- return null;
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_author (IntPtr smryStream);
- StringBuilder strText = new StringBuilder ();
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_keywords (IntPtr smryStream);
- for (int i = 0; i < strValue.Length; i++) {
- if (strValue[i] > 31 && strValue[i] < 126)
- strText.Append (strValue[i]);
- else {
- // FIXME: We need to handle the characters appropriately,
- // such that we can extract the attributes of them as well.
- switch (Convert.ToInt32(strValue[i])) {
- case (int) PieceType.PageSectionBreaks:
- case (int) PieceType.ParagraphEnds:
- case (int) PieceType.HardLineBreaks: strText.Append (" ");
- break;
-
- case (int) PieceType.Tab: strText.Append (" ");
- break;
- }
- }
- }
- return strText;
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_comments (IntPtr smryStream);
- }
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_template (IntPtr smryStream);
- public void ExtractText (Gsf.Input docStream,
- Gsf.Input tblStream)
- {
- int pcdTblIndex = 0;
- int chCntPCD = 0;
- StringBuilder strText = new StringBuilder ();
- string val = null;
- int fc;
- bool unicode;
-
- // get the ccpText defined in FIB
- docStream.SeekEmulate (0x004C);
- byte[] data = docStream.Read (4);
- fib_ccpText = GetInt32 (data, 0);
-
- // get the fcclx defined in FIB
- docStream.SeekEmulate (0x01A2);
- data = docStream.Read (4);
- fib_fcclx = GetInt32 (data, 0);
-
- // get the plcpcd list from the tableStream
- readPieceTable (tblStream, fib_fcclx);
-
- data = null;
- unicode = true;
-
- while (fib_ccpText > 0 && pcdTblIndex < pieceItems.Count) {
- chCntPCD = (int) pieceIndices[pcdTblIndex+1] -
- (int) pieceIndices[pcdTblIndex];
- chCntPCD = chCntPCD > fib_ccpText ? fib_ccpText : chCntPCD;
-
- fc = ((PCD)pieceItems[pcdTblIndex]).GetFC();
-
- FindRealFC (ref fc, ref unicode);
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_lastsavedby (IntPtr smryStream);
- if (docStream.SeekEmulate (fc))
- Console.WriteLine ("Seek failed!!");
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_revision_number (IntPtr smryStream);
- //Console.WriteLine ("Number of chars present: {0}", chCntPCD);
- if (unicode) {
- data = docStream.Read (chCntPCD * 2); //unicode chars
- if (data != null)
- val = System.Text.Encoding.Unicode.GetString (data);
- else
- Console.WriteLine ("Unicode Data is Null");
- } else {
-
- data = docStream.Read (chCntPCD);
- if (data != null)
- val = System.Text.Encoding.ASCII.GetString (data);
- else
- Console.WriteLine ("ASCII data is NULL");
- }
- if (val != null) {
- strText = StripOffWordSpecificChars (val);
- AppendText (strText.ToString());
- }
- pcdTblIndex ++;
- data = null;
- val = null;
- }
- AppendWhiteSpace ();
- }
+ [DllImport ("wv1glue")]
+ private static extern string wv1_glue_get_appname (IntPtr smryStream);
- // This enum is a "convenience enum" for reading the piece table
- private enum clxtType {
- clxtGrpprl = 1,
- clxtPlcfpcd = 2
- }
- override protected void DoPull ()
- {
- Input docStream = file.ChildByName ("WordDocument");
- Input tblStream = file.ChildByName (getTableStreamName (docStream));
+ [DllImport ("wv1glue")]
+ private static extern Int32 wv1_glue_get_page_count (IntPtr smryStream);
- if (docStream != null)
- ExtractText (docStream, tblStream);
- Finished();
- }
- // FIXME: there are other specially treated "ASCII" characters
- // available to list here, but, we will content with these
- // and in general, we can replace ASCII 1 thru ASCII 31 with
- // simple spaces, however, word has this wierd logic where-in
- // if chp.fSpec = 1, some ASCII characters between ASCII 32 and
- // ASCII 41 should be interpreted differently.
-
- public enum PieceType {
- Unknown = 0,
- Tab = 0x09,
- HardLineBreaks = 0x0B,
- PageSectionBreaks = 0x0C,
- ParagraphEnds = 0x0D,
- BreakingHyphens = 0x2D,
- NonRequiredHyphens = 0x1F,
- NonBreakingHyphens = 0x1E
+ [DllImport ("wv1glue")]
+ private static extern Int32 wv1_glue_get_word_count (IntPtr smryStream);
+
+ [DllImport ("wv1glue")]
+ private static extern Int32 wv1_glue_get_character_count (IntPtr smryStream);
+
+ [DllImport ("wv1glue")]
+ private static extern Int32 wv1_glue_get_security (IntPtr smryStream);
+
+ [DllImport ("wv1glue")]
+ private static extern Int16 wv1_glue_get_codepage (IntPtr smryStream);
+
+ [DllImport ("wv1glue")]
+ private static extern void wv1_glue_close_stream (IntPtr oleStream, IntPtr summary);
+
+ //////////////////////////////////////////////////////////
+
+		// Registers the MIME types this filter accepts.
+		public FilterDOC ()
+		{
+			string[] mimeTypes = {
+				"application/msword",
+				"application/vnd.ms-word",
+				"application/x-msword"
+			};
+
+			foreach (string mimeType in mimeTypes)
+				AddSupportedMimeType (mimeType);
 		}
+
+		// Callback handed to the wv1 glue library; it is invoked for
+		// each chunk of text extracted from the document.
+		//
+		// byteArray: pointer to dataLen bytes of UTF-8 text held in
+		//            the native buffer; the bytes are copied before use.
+		// dataLen:   number of bytes available at byteArray.
+		// hotText:   true when the glue flagged the chunk as "hot"
+		//            (bold, italic, underlined, super- or subscript).
+		private void IndexText (IntPtr byteArray, int dataLen, bool hotText)
+		{
+			// Nothing to index. A non-positive length must never reach
+			// the array allocation below: this method runs inside a
+			// native callback, so an exception here would have to
+			// unwind through native frames.
+			if (byteArray == IntPtr.Zero || dataLen <= 0)
+				return;
+
+			byte[] data = new byte [dataLen];
+			Marshal.Copy (byteArray, data, 0, dataLen);
+			string text = System.Text.Encoding.UTF8.GetString (data, 0, dataLen);
+
+			if (!hotText) {
+				AppendText (text);
+				return;
+			}
+
+			// Keep HotUp/HotDown balanced even if AppendText throws.
+			HotUp ();
+			try {
+				AppendText (text);
+			} finally {
+				HotDown ();
+			}
+		}
+		// Remembers the full path of the file being filtered; the
+		// native glue functions are later called with this name.
+		override protected void DoOpen (FileInfo info)
+		{
+			this.FileName = info.FullName;
+		}
+		// Reads the OLE summary information of the document and
+		// publishes every non-empty entry as a Beagle property.
+		override protected void DoPullProperties ()
+		{
+			IntPtr oleStream = wv1_glue_get_ole_stream (FileName);
+			if (oleStream == IntPtr.Zero) {
+				Logger.Log.Error ("Could not open OLE stream {0}", FileName);
+				return;
+			}
+
+			IntPtr oleSummaryStream = wv1_glue_get_ole_summary_stream (oleStream);
+			if (oleSummaryStream == IntPtr.Zero) {
+				// NOTE(review): oleStream is not released on this path.
+				// The glue only exposes wv1_glue_close_stream, which
+				// also closes the summary; confirm it tolerates a null
+				// summary before calling it here.
+				Logger.Log.Error ("Could not open OLE Meta data stream from {0}", FileName);
+				return;
+			}
+
+			// Release the native handles even when adding a property throws.
+			try {
+				AddSummaryString ("dc:title", wv1_glue_get_title (oleSummaryStream));
+				AddSummaryString ("dc:subject", wv1_glue_get_subject (oleSummaryStream));
+				AddSummaryString ("dc:comment", wv1_glue_get_comments (oleSummaryStream));
+				AddSummaryString ("fixme:author", wv1_glue_get_author (oleSummaryStream));
+				AddSummaryString ("fixme:lastsavedby", wv1_glue_get_lastsavedby (oleSummaryStream));
+				AddSummaryString ("fixme:appname", wv1_glue_get_appname (oleSummaryStream));
+				AddSummaryString ("fixme:keywords", wv1_glue_get_keywords (oleSummaryStream));
+				AddSummaryString ("fixme:template", wv1_glue_get_template (oleSummaryStream));
+				AddSummaryString ("fixme:revisionnumber", wv1_glue_get_revision_number (oleSummaryStream));
+
+				AddSummaryCount ("fixme:page-count", wv1_glue_get_page_count (oleSummaryStream));
+				AddSummaryCount ("fixme:word-count", wv1_glue_get_word_count (oleSummaryStream));
+			} finally {
+				wv1_glue_close_stream (oleStream, oleSummaryStream);
+			}
+		}
+
+		// Adds a string property unless the value is missing or empty.
+		private void AddSummaryString (string key, string value)
+		{
+			if (value != null && value.Length > 0)
+				AddProperty (Beagle.Property.New (key, value));
+		}
+
+		// Adds a numeric property unless the value is zero.
+		private void AddSummaryCount (string key, Int32 value)
+		{
+			if (value != 0)
+				AddProperty (Beagle.Property.New (key, value));
+		}
- public class PCD {
- /* this structure is of size 8 bytes and other fields do exist..
- I am just having the one that I am interested in.. ;-)
- */
- int _notInterested; // 16 bits
- int fc; // 32 bits
- int _actually_prm; // 16 bits
-
- public PCD (Gsf.Input stream)
- {
- read (stream);
- }
- private void read (Gsf.Input stream)
- {
- byte[] data = stream.Read (2);
- data = stream.Read (4);
- fc = GetInt32 (data, 0);
- //Console.WriteLine ("PCD FC : {0}", fc);
- data = stream.Read (2);
- }
- public int GetFC () { return fc;}
+		// Runs the wv1 glue parser over the file; IndexText receives
+		// each extracted chunk of text. A negative return code is
+		// logged, and Finished () is called in every case.
+		override protected void DoPull ()
+		{
+			TextHandlerCallback onText = new TextHandlerCallback (IndexText);
+
+			switch (wv1_glue_init_doc_parsing (FileName, onText)) {
+			case -1:
+				Logger.Log.Error ("{0} : Unable to read", FileName);
+				break;
+			case -2:
+				Logger.Log.Error ("{0} : is password protected", FileName);
+				break;
+			case -3:
+				Logger.Log.Error ("Unable to initiate the parser for {0}", FileName);
+				break;
+			}
+
+			Finished ();
 		}
}
}
Index: Filters/FilterSource.cs
===================================================================
RCS file: /cvs/gnome/beagle/Filters/FilterSource.cs,v
retrieving revision 1.4
diff -u -r1.4 FilterSource.cs
--- Filters/FilterSource.cs 15 Oct 2004 13:13:15 -0000 1.4
+++ Filters/FilterSource.cs 16 Nov 2004 16:49:24 -0000
@@ -86,16 +86,16 @@
"try", "while", "yield"};
private enum LineType {
+ None,
SingleLineComment,
BlockComment,
- StringConstant,
- None
+ StringConstant
}
private enum LangType {
+ None,
C_Style,
- Python,
- None
+ Python
}
LineType SrcLineType;
Index: Filters/Makefile.am
===================================================================
RCS file: /cvs/gnome/beagle/Filters/Makefile.am,v
retrieving revision 1.24
diff -u -r1.24 Makefile.am
--- Filters/Makefile.am 12 Nov 2004 15:03:22 -0000 1.24
+++ Filters/Makefile.am 16 Nov 2004 16:49:24 -0000
@@ -35,7 +35,6 @@
if ENABLE_GSF_SHARP
CSFILES += \
$(srcdir)/FilterOle.cs \
- $(srcdir)/FilterDOC.cs \
$(srcdir)/FilterPPT.cs
endif
@@ -45,6 +44,11 @@
else
CSFILES += \
$(srcdir)/FilterMusic.cs
+endif
+
+if ENABLE_WV1
+CSFILES += \
+ $(srcdir)/FilterDOC.cs
endif
LOCAL_ASSEMBLIES = \
Index: glue/Makefile.am
===================================================================
RCS file: /cvs/gnome/beagle/glue/Makefile.am,v
retrieving revision 1.5
diff -u -r1.5 Makefile.am
--- glue/Makefile.am 2 Nov 2004 20:22:42 -0000 1.5
+++ glue/Makefile.am 16 Nov 2004 16:49:25 -0000
@@ -14,6 +14,10 @@
-DMOZILLA_HOME=\""$(MOZILLA_HOME)\"" \
-include $(MOZILLA_INCLUDE_ROOT)/mozilla-config.h
+if ENABLE_WV1
+INCLUDES += \
+ $(WV1_CFLAGS)
+endif
gluelibdir = $(pkglibdir)
@@ -23,6 +27,11 @@
libgeckoglue.la \
libkeyglue.la
+if ENABLE_WV1
+gluelib_LTLIBRARIES += \
+ libwv1glue.la
+endif
+
libinotifyglue_la_SOURCES = \
inotify.h \
inotify-glue.c
@@ -43,4 +52,10 @@
eggaccelerators.c \
eggaccelerators.h
+if ENABLE_WV1
+libwv1glue_la_SOURCES = \
+ wv1-glue.c
+libwv1glue_la_LIBADD = @X_LIBS@ \
+ $(WV1_LIBS)
+endif
--- /dev/null 2004-08-25 10:34:59.000000000 -0700
+++ glue/wv1-glue.c 2004-11-16 20:07:37.432313528 -0800
@@ -0,0 +1,471 @@
+/*
+ * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
+ * Microsoft Word documents).
+ *
+ * Copyright (C) 2004 Novell, Inc.
+ *
+ * Author: Veerapuram Varadhan <vvaradhan novell com>
+ * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
+ *
+ */
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <wv.h>
+
+/* Callback to Handle "text" (or words) extracted out of
+ * M$ Word documents
+ *
+ * text: Holds the extracted text/words.
+ *
+ * hotText: Identifies the attributes of the text.
+ * (bold, italic, underline, superscript, subscript)
+ */
+
+typedef void (* wvTextHandlerCallback) (U8* text, int len, U8 hotText);
+
+typedef struct _UserData {
+	/* formatting variables */
+
+	int cFontSize;
+	int cCol;
+
+	/* boolean formats: declared unsigned so that each one-bit field
+	 * holds 0 or 1 (a plain int one-bit field may be signed, in which
+	 * case it cannot represent 1) */
+	unsigned int bIsBold:1;
+	unsigned int bIsItalic:1;
+	unsigned int bIsStrike:1;
+	unsigned int bIsUl:1;
+	unsigned int bIsSup:1;
+	unsigned int bIsSub:1;
+
+	/* beagle specific formats */
+	U8 bIsHot;
+
+	/* buffer to hold text */
+	GString* txtWord;
+
+	/* called with the buffered text each time the buffer is flushed */
+	wvTextHandlerCallback WordHandler;
+
+} UserData;
+
+
+/*
+ * append_char: appends the character 'ch', converted to UTF8
+ * encoding, to the txtWord buffer. The buffer is handed to the
+ * "WordHandler" and emptied after every space, line/paragraph/page
+ * break or new-line, and when the end-of-document marker (0x00)
+ * arrives.
+ *
+ * ud : carries the UserData filled-in appropriately to hold the
+ * character (text) attributes.
+ *
+ * ch : unicode character; 0x00 only triggers a flush and is never
+ * stored in the buffer.
+ *
+ */
+
+void
+append_char (UserData * ud, U16 ch)
+{
+	char tmpBuf[8];	/* g_unichar_to_utf8 writes at most 6 bytes */
+	int len;
+	int flush = 0;
+
+	switch (ch) {
+	case 0x00: /* end of document: flush what is buffered */
+		flush = 1;
+		break;
+	case 0x20: /* space */
+	case 0x0B: /* hard line break */
+	case 0x0D: /* paragraph end */
+	case 0x0C: /* page or section break */
+		/* every break is indexed as a plain space */
+		g_string_append_c (ud->txtWord, 0x20);
+		flush = 1;
+		break;
+	case '\n': /* new-line */
+		g_string_append_c (ud->txtWord, '\n');
+		flush = 1;
+		break;
+	default:
+		len = g_unichar_to_utf8 (ch, tmpBuf);
+		g_string_append_len (ud->txtWord, tmpBuf, len);
+		break;
+	}
+
+	/* an empty buffer carries nothing worth a callback */
+	if (flush && ud->txtWord->len > 0) {
+		(*(ud->WordHandler)) ((U8 *) ud->txtWord->str, ud->txtWord->len, ud->bIsHot);
+		g_string_erase (ud->txtWord, 0, -1);
+	}
+}
+
+/*
+ * fill_UserData: fills the UserData structure from the
+ * CHP structure that represents the Character Property
+ * Information like bold, italic, striked, underlined,
+ * superscript, subscript, fontsize, color, fontface etc.
+ *
+ * ud  : structure to update.
+ * chp : character properties to read from.
+ * ps  : not used.
+ *
+ * bIsHot is set when the text is bold, italic, underlined,
+ * superscript or subscript; strike-through does not count.
+ */
+void
+fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
+{
+	(void) ps;
+
+	ud->cCol = chp->ico ? chp->ico - 1 : 0;
+	ud->cFontSize = chp->hps;
+
+	/* Normalise every source value to 0/1 before storing it in a
+	 * one-bit field. kul is an underline style code rather than a
+	 * flag (see the CHP definition in wv.h), so assigning it directly
+	 * would keep only its lowest bit and lose the even codes. */
+	ud->bIsBold = (chp->fBold != 0);
+	ud->bIsItalic = (chp->fItalic != 0);
+	ud->bIsUl = (chp->kul != 0);
+	ud->bIsStrike = (chp->fStrike != 0);
+	ud->bIsSup = (chp->iss == 1);
+	ud->bIsSub = (chp->iss == 2);
+
+	ud->bIsHot = (ud->bIsBold || ud->bIsItalic || ud->bIsUl
+		      || ud->bIsSup || ud->bIsSub) ? 1 : 0;
+}
+
+/* This is a callback that handles the individual
+ * characters that are extracted from M$ word file.
+ *
+ * Field markers (19, 20, 21) only update the field state kept
+ * in ps and are not indexed; every other character is passed on
+ * to append_char. Always returns 0.
+ */
+static int
+charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
+{
+	/* convert incoming character to unicode */
+	if (chartype)
+		eachchar = wvHandleCodePage (eachchar, lid);
+
+	/* take care of any oddities in Microsoft's character "encoding" */
+	/* TODO: does the above code page handler take care of these? */
+	if (chartype == 1 && eachchar == 146)
+		eachchar = 39;	/* apostrophe */
+
+	switch (eachchar) {
+	case 19: /* field begin */
+		ps->fieldstate++;
+		ps->fieldmiddle = 0;
+		return 0;
+	case 20: /* field separator */
+		ps->fieldmiddle = 1;
+		return 0;
+	case 21: /* field end */
+		ps->fieldstate--;
+		ps->fieldmiddle = 0;
+		return 0;
+	case 14: /* column break: no special handling, forwarded as is */
+	default:
+		break;
+	}
+
+	append_char (ps->userData, eachchar);
+	return 0;
+}
+
+/* This is a callback that handles the special
+ * characters that are specific to M$ word file.
+ *
+ * Only the field markers are acted upon: they update the field
+ * state kept in ps. Nothing is indexed from here. Always returns 0.
+ */
+static int
+specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
+{
+	/* achp->fOle2 is set on a field separator when the field holds an
+	 * embedded OLE2 object; nothing is done with that information. */
+	(void) achp;
+
+	switch (eachchar) {
+	case 19: /* field begin */
+		ps->fieldstate++;
+		ps->fieldmiddle = 0;
+		break;
+	case 20: /* field separator */
+		ps->fieldmiddle = 1;
+		break;
+	case 21: /* field end */
+		ps->fieldstate--;
+		ps->fieldmiddle = 0;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* This is a callback that handles the individual
+ * elements that are marked by libwv1.
+ *
+ * tag   : element being reported.
+ * props : for CHARPROPBEGIN/CHARPROPEND, the CHP describing the
+ *         character properties.
+ * dirty : not used.
+ *
+ * Always returns 0.
+ */
+
+static int
+eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
+{
+	UserData *ud = (UserData *) ps->userData;
+
+	(void) dirty;
+
+	switch (tag) {
+	case SECTIONEND:
+	case PARAEND:
+		/* end the current line so that buffered text is flushed */
+		append_char (ud, '\n');
+		break;
+
+	case CHARPROPBEGIN:
+	case CHARPROPEND:
+		/* refresh the formatting state from the reported properties */
+		fill_UserData (ud, (CHP *) props, ps);
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* This is a callback that handles the document
+ * level tags that are marked by libwv1.
+ *
+ * On DOCEND a 0x00 character is sent to append_char, which makes
+ * it hand the buffered text to the handler. Always returns 0.
+ */
+
+static int
+docProc (wvParseStruct * ps, wvTag tag)
+{
+	if (tag == DOCEND)
+		append_char (ps->userData, 0x00);
+
+	return 0;
+}
+
+/*
+ * wv1_glue_init_doc_parsing: Initiates the document parsing
+ * procedure. Sets up all the required handlers and the parser.
+ *
+ * fname: Name of the file to parse. (essentially a M$ word file)
+ *
+ * wvTextHandlerCallback: The callback routine that will be called
+ * on extraction of each word.
+ *
+ * Return:  0 -> success
+ *         -1 -> fname is NULL or the file cannot be opened for reading
+ *         -2 -> wvInitParser reported bit 0x8000 (the caller reports
+ *               this as a password protected document)
+ *         -3 -> callback is NULL or wvInitParser failed otherwise
+ */
+
+int
+wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
+{
+	FILE *input;
+	int ret;
+
+	wvParseStruct ps;
+	UserData ud;
+
+	if (fname == NULL)
+		return -1;
+	if (callback == NULL)
+		return -3;
+
+	/* probe readability first so that -1 is reported for this case */
+	input = fopen (fname, "rb");
+	if (!input)
+		return -1;
+	fclose (input);
+
+	/* start from a known state so that the clean-up below is safe
+	 * even when the parser fails early */
+	memset (&ps, 0, sizeof (wvParseStruct));
+
+	wvInit ();
+	ret = wvInitParser (&ps, fname);
+	if (ret) {
+		/* release whatever the parser managed to open */
+		wvOLEFree (&ps);
+		return (ret & 0x8000) ? -2 : -3;
+	}
+
+	ps.filename = fname;
+	ps.dir = NULL;
+
+	/* set to 0 */
+	memset (&ud, 0, sizeof (UserData));
+	ud.WordHandler = callback;
+	ud.txtWord = g_string_sized_new (32);
+	ps.userData = &ud;
+
+	wvSetElementHandler (&ps, eleProc);
+	wvSetDocumentHandler (&ps, docProc);
+	wvSetCharHandler (&ps, charProc);
+	wvSetSpecialCharHandler (&ps, specCharProc);
+
+	wvText (&ps);
+
+	/* free associated memory */
+	wvOLEFree (&ps);
+
+	/* free userdata memory */
+	g_string_free (ud.txtWord, TRUE);
+
+	return 0;
+}
+
+/*
+ * wv1_glue_get_ole_stream: opens the OLE container 'fname'.
+ *
+ * Return: the MsOle handle as an opaque pointer, or NULL when
+ * fname is NULL or ms_ole_open reports an error.
+ */
+void *
+wv1_glue_get_ole_stream (const char* fname)
+{
+	MsOle *ole = NULL;
+
+	if (fname == NULL)
+		return NULL;
+
+	/* do not rely on the out-parameter being cleared on failure */
+	if (ms_ole_open (&ole, fname) != MS_OLE_ERR_OK)
+		return NULL;
+
+	return ((void *)ole);
+}
+
+/*
+ * wv1_glue_get_ole_summary_stream: opens the summary information
+ * of an OLE handle obtained from wv1_glue_get_ole_stream and returns
+ * the result of ms_ole_summary_open as an opaque pointer.
+ */
+void *
+wv1_glue_get_ole_summary_stream (MsOle *stream)
+{
+	return (void *) ms_ole_summary_open (stream);
+}
+
+/*
+ * String accessors for the OLE summary information. Each one
+ * returns the result of ms_ole_summary_get_string for a single
+ * summary id; the "available" flag reported by the library is
+ * not looked at.
+ */
+
+/* MS_OLE_SUMMARY_TITLE */
+char *
+wv1_glue_get_title (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TITLE, &available);
+}
+
+/* MS_OLE_SUMMARY_SUBJECT */
+char *
+wv1_glue_get_subject (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_SUBJECT, &available);
+}
+
+/* MS_OLE_SUMMARY_AUTHOR */
+char *
+wv1_glue_get_author (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_AUTHOR, &available);
+}
+
+/* MS_OLE_SUMMARY_KEYWORDS */
+char *
+wv1_glue_get_keywords (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_KEYWORDS, &available);
+}
+
+/* MS_OLE_SUMMARY_COMMENTS */
+char *
+wv1_glue_get_comments (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_COMMENTS, &available);
+}
+
+/* MS_OLE_SUMMARY_TEMPLATE */
+char *
+wv1_glue_get_template (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TEMPLATE, &available);
+}
+
+/* MS_OLE_SUMMARY_LASTAUTHOR */
+char *
+wv1_glue_get_lastsavedby (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_LASTAUTHOR, &available);
+}
+
+/* MS_OLE_SUMMARY_REVNUMBER */
+char *
+wv1_glue_get_revision_number (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_REVNUMBER, &available);
+}
+
+/* MS_OLE_SUMMARY_APPNAME */
+char *
+wv1_glue_get_appname (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_APPNAME, &available);
+}
+
+/*
+ * Numeric accessors for the OLE summary information. Each one
+ * returns the result of ms_ole_summary_get_long (or _get_short for
+ * the code page) for a single summary id; the "available" flag
+ * reported by the library is not looked at.
+ *
+ * NOTE(review): the managed side declares the long-returning
+ * functions as Int32; confirm the two widths agree on every
+ * target platform.
+ */
+
+/* MS_OLE_SUMMARY_PAGECOUNT */
+long
+wv1_glue_get_page_count (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_PAGECOUNT, &available);
+}
+
+/* MS_OLE_SUMMARY_WORDCOUNT */
+long
+wv1_glue_get_word_count (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_WORDCOUNT, &available);
+}
+
+/* MS_OLE_SUMMARY_CHARCOUNT */
+long
+wv1_glue_get_character_count (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_CHARCOUNT, &available);
+}
+
+/* MS_OLE_SUMMARY_SECURITY */
+long
+wv1_glue_get_security (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_SECURITY, &available);
+}
+
+/* MS_OLE_SUMMARY_CODEPAGE */
+short
+wv1_glue_get_codepage (MsOleSummary* smryStream)
+{
+	gboolean available;
+
+	return ms_ole_summary_get_short (smryStream, MS_OLE_SUMMARY_CODEPAGE, &available);
+}
+
+/*
+ * wv1_glue_close_stream: closes the summary information and then
+ * destroys the OLE handle it was opened from. Either argument may
+ * be NULL, in which case that part is skipped.
+ */
+void
+wv1_glue_close_stream (MsOle* oleStream, MsOleSummary* summary)
+{
+	if (summary != NULL)
+		ms_ole_summary_close (summary);
+
+	if (oleStream != NULL)
+		ms_ole_destroy (&oleStream);
+}
Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/beagle/ChangeLog,v
retrieving revision 1.33
diff -r1.33 ChangeLog
0a1,10
> 2004-11-16 Veerapuram Varadhan <vvaradhan novell com>
>
> * configure.in, Filters/Makefile.am, glue/Makefile.am: Added
> support for wv1.
>
> * glue/wv1-glue.c: Wrapper to parse MS word files using wv-1.0
>
> * Filters/FilterDOC.cs: Modified to use wv1 to get meta-data and
> text data from MS Word documents.
>
Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/beagle/ChangeLog,v
retrieving revision 1.33
diff -u -r1.33 ChangeLog
--- ChangeLog 14 Nov 2004 07:53:38 -0000 1.33
+++ ChangeLog 16 Nov 2004 17:58:53 -0000
@@ -1,3 +1,13 @@
+2004-11-16 Veerapuram Varadhan <vvaradhan novell com>
+
+ * configure.in, Filters/Makefile.am, glue/Makefile.am: Added
+ support for wv1.
+
+ * glue/wv1-glue.c: Wrapper to parse MS word files using wv-1.0
+
+ * Filters/FilterDOC.cs: Modified to use wv1 to get meta-data and
+ text data from MS Word documents.
+
2004-11-14 Tuomas Kuosmanen <tigert tigert priv>
* Tiles/template-page.css: made opacity to 0.5 for non-focused
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]