Patch: MS Word Filter using wv1



Hello everyone,

Please find attached the patch for MS WORD filter.

It uses wv1 [http://wvware.sourceforge.net].

You need wv-1.0 to build this filter.  However, this filter will not be
enabled if you don't have wv1.

Kindly let me know your comments and/or any bugs you find.

Thanks,

V. Varadhan.



? glue/wv1-glue.c
Index: configure.in
===================================================================
RCS file: /cvs/gnome/beagle/configure.in,v
retrieving revision 1.55
diff -u -r1.55 configure.in
--- configure.in	12 Nov 2004 15:03:22 -0000	1.55
+++ configure.in	16 Nov 2004 16:49:24 -0000
@@ -300,6 +300,16 @@
 
 dnl ----------------------------------------------
 
+dnl For the wv1 glue
+
+PKG_CHECK_MODULES(WV1, wv-1.0, enable_wv1=yes, enable_wv1=no)
+AM_CONDITIONAL(ENABLE_WV1, test "x$enable_wv1" = "xyes")
+
+WV1_LIBS=`$PKG_CONFIG --libs wv-1.0`
+AC_SUBST(WV1_LIBS)
+
+dnl ----------------------------------------------
+
 dnl Searchomatic
 
 PKG_CHECK_MODULES(SEARCHOMATIC, gtk+-2.0)
@@ -350,6 +360,7 @@
 	gst-sharp?		${enable_gst_sharp}
 	Epiphany Extension?	${enable_epiphany_extension}
 	Mozilla Extension?	yes
+	wv1?			${enable_wv1}
 
 	Enable Network		${enable_network}"
 
Index: Filters/FilterDOC.cs
===================================================================
RCS file: /cvs/gnome/beagle/Filters/FilterDOC.cs,v
retrieving revision 1.2
diff -u -r1.2 FilterDOC.cs
--- Filters/FilterDOC.cs	15 Oct 2004 13:38:05 -0000	1.2
+++ Filters/FilterDOC.cs	16 Nov 2004 16:49:24 -0000
@@ -1,8 +1,13 @@
 //
 // FilterDOC.cs
 //
-// Copyright (C) 2004 Novell, Inc.
+// FilterDOC.cs : Trivial implementation of a MS Word-document filter.
+//                This filter uses wv1 library - http://wvware.sourceforge.net/
+//
+// Author: Veerapuram Varadhan <vvaradhan novell com>
 //
+// Copyright (C) 2004 Novell, Inc.
+// 
 
 //
 // Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,257 +31,190 @@
 
 
 using System;
-using System.Collections;
 using System.IO;
-using System.Text;
-using Gsf;
+using System.Runtime.InteropServices;
+using Beagle.Util;
 
 namespace Beagle.Filters {
     
-	public class FilterDOC : FilterOle {
-
-		int fib_ccpText;
-		int fib_fcclx;
-		Hashtable pieceItems;
-		Hashtable pieceIndices;
+	public class FilterDOC : Beagle.Daemon.Filter {
 
+		string FileName;
 
-		public FilterDOC () 
-		{
-			AddSupportedMimeType ("application/msword");
-		}
+		//////////////////////////////////////////////////////////
 
-		// Note:  This code to find the Real "FC" inside a "WordDocument"
-		// stream is copied from wv2.  Word specification (!) says something about
-		// the calculation, but not so clear.  Gotta crack it!!
-	
-		private void FindRealFC (ref int fc, ref bool unicode)
-		{
-			if ( (fc & 0x40000000) > 0 ) {
-				fc = (int)( fc & 0xbfffffff ) >> 1;
-				unicode = false;
-			}
-			else
-				unicode = true;
-		}
-		public string getTableStreamName (Gsf.Input stream)
-		{
-			int fWhichTblStm;
-			int length = 0;
-			stream.SeekEmulate (0x0A);
-			byte [] data = stream.Read (2);
-			if (data == null) {
-				Console.WriteLine ("Data is null");
-				return "";
-			}
+		// hotText arrives as a one-byte U8 from the C glue, so it is marshalled as U1 and not as a 4-byte BOOL.
+		private delegate void TextHandlerCallback (IntPtr byteArray, int dataLen, [MarshalAs (UnmanagedType.U1)] bool hotText);
 		
-			int fFlagSet = 0;
+		[DllImport ("wv1glue")]
+		private static extern int wv1_glue_init_doc_parsing (string fname, TextHandlerCallback callback);
 
-			for (int i = 1; i > -1; i--) {
-				fFlagSet <<= 8;
-				fFlagSet |= 0xff & data[i];
-			}
-		
-			// find out the table stream correspoding to this document
-			fWhichTblStm = fFlagSet & 0x0200;
+		[DllImport ("wv1glue")]
+		private static extern IntPtr wv1_glue_get_ole_stream (string fname);
 
-			if (fWhichTblStm > 0)
-				return ("1Table");
-			else
-				return ("0Table");
-		}
+		[DllImport ("wv1glue")]
+		private static extern IntPtr wv1_glue_get_ole_summary_stream (IntPtr oleStream);
 
-		private void readPieceTable (Gsf.Input tblStream, 
-					     int fib_fcclx)
-		{
-			int size = 0;
-			tblStream.SeekEmulate (fib_fcclx);
-			byte[] data = tblStream.Read (1);
-			while (((clxtType)data[0]) == clxtType.clxtGrpprl) {
-				data = tblStream.Read (2);
-				size = GetInt16 (data, 0);
-				tblStream.SeekEmulate (size);
-			
-				data = tblStream.Read (1);
-			}
+		[DllImport ("wv1glue")]
+		private static extern string wv1_glue_get_title (IntPtr smryStream);
 
-			if (((clxtType)data[0]) == clxtType.clxtPlcfpcd) {
-				int cntPCDs = 0;
-				data = tblStream.Read (4);
-				size = GetInt32 (data, 0);
-			
-				pieceItems = new Hashtable ();
-				pieceIndices = new Hashtable ();
-			
-				// 8 is the sizeof a PCD
-				if (((size - 4) % (8 + 4)) > 0)
-					cntPCDs = 0;
-				else
-					cntPCDs = (size - 4) / (8 + 4);
-			
-				// n+1 CP/FCs
-				for (int i = 0; i < (cntPCDs+1); i++) {
-					data = tblStream.Read (4);
-					pieceIndices.Add (i, GetInt32 (data, 0));
-				}
-
-				// n PCDs
-				for (int i = 0; i < cntPCDs; i++) {
-					//Console.WriteLine ("Current tablestream pos : {0}", 
-					//tblStream.Tell());
-					//data = tblStream.Read (4);
-					pieceItems.Add (i, new PCD (tblStream));
-				}
-			}
-		
-		}
+		[DllImport ("wv1glue")]
+		private static extern string wv1_glue_get_subject (IntPtr smryStream);
 
-		private StringBuilder StripOffWordSpecificChars (string strValue)
-		{
-			if (strValue == null)
-				return null;
+		[DllImport ("wv1glue")]
+	        private static extern string wv1_glue_get_author (IntPtr smryStream);
 
-			StringBuilder strText = new StringBuilder ();
+		[DllImport ("wv1glue")]
+		private static extern string wv1_glue_get_keywords (IntPtr smryStream);
 
-			for (int i = 0; i < strValue.Length; i++) {
-				if (strValue[i] > 31 && strValue[i] < 126)
-					strText.Append (strValue[i]);
-				else {
-					// FIXME:  We need to handle the characters appropriately,
-					// such that we can extract the attributes of them as well.
-					switch (Convert.ToInt32(strValue[i])) {
-					case (int) PieceType.PageSectionBreaks:
-					case (int) PieceType.ParagraphEnds:
-					case (int) PieceType.HardLineBreaks: strText.Append (" ");
-						break;
-					
-					case (int) PieceType.Tab: strText.Append (" ");
-						break;
-					}
-				}
-			}
-			return strText;
+		[DllImport ("wv1glue")]
+	        private static extern string wv1_glue_get_comments (IntPtr smryStream);
 
-		}
+		[DllImport ("wv1glue")]
+	        private static extern string wv1_glue_get_template (IntPtr smryStream);
 
-		public void ExtractText (Gsf.Input docStream, 
-					   Gsf.Input tblStream)
-		{
-			int pcdTblIndex = 0;
-			int chCntPCD = 0;
-			StringBuilder strText = new StringBuilder ();
-			string val = null;
-			int fc;
-			bool unicode;
-
-			// get the ccpText defined in FIB
-			docStream.SeekEmulate (0x004C);
-			byte[] data = docStream.Read (4);
-			fib_ccpText = GetInt32 (data, 0);
-
-			// get the fcclx defined in FIB
-			docStream.SeekEmulate (0x01A2);
-			data = docStream.Read (4);
-			fib_fcclx = GetInt32 (data, 0);
-
-			// get the plcpcd list from the tableStream
-			readPieceTable (tblStream, fib_fcclx);
-
-			data = null;
-			unicode = true;
-
-			while (fib_ccpText > 0 && pcdTblIndex < pieceItems.Count) {
-				chCntPCD = (int) pieceIndices[pcdTblIndex+1] - 
-					(int) pieceIndices[pcdTblIndex];
-				chCntPCD = chCntPCD > fib_ccpText ? fib_ccpText : chCntPCD;
-			
-				fc = ((PCD)pieceItems[pcdTblIndex]).GetFC();
-			
-				FindRealFC (ref fc, ref unicode);
+		[DllImport ("wv1glue")]
+	        private static extern string wv1_glue_get_lastsavedby (IntPtr smryStream);
 
-				if (docStream.SeekEmulate (fc))
-					Console.WriteLine ("Seek failed!!");
+		[DllImport ("wv1glue")]
+	        private static extern string wv1_glue_get_revision_number (IntPtr smryStream);
 
-				//Console.WriteLine ("Number of chars present: {0}", chCntPCD);
-				if (unicode) {
-					data = docStream.Read (chCntPCD * 2); //unicode chars
-					if (data != null)
-						val = System.Text.Encoding.Unicode.GetString (data);
-					else
-						Console.WriteLine ("Unicode Data is Null");
-				} else {
-			
-					data = docStream.Read (chCntPCD);
-					if (data != null)
-						val = System.Text.Encoding.ASCII.GetString (data);
-					else
-						Console.WriteLine ("ASCII data is NULL");
-				}
-				if (val != null) {
-					strText = StripOffWordSpecificChars (val);
-					AppendText (strText.ToString());
-				}
-				pcdTblIndex ++;
-				data = null;
-				val = null;
-			}
-			AppendWhiteSpace ();
-		}
+		[DllImport ("wv1glue")]
+	        private static extern string wv1_glue_get_appname (IntPtr smryStream);
 
-		// This enum is a "convenience enum" for reading the piece table
-		private enum clxtType {
-			clxtGrpprl = 1,
-			clxtPlcfpcd = 2
-		}
-		override protected void DoPull ()
-		{
-			Input docStream = file.ChildByName ("WordDocument");
-			Input tblStream = file.ChildByName (getTableStreamName (docStream));
+		[DllImport ("wv1glue")]
+	        private static extern Int32 wv1_glue_get_page_count (IntPtr smryStream);
 
-			if (docStream != null)
-				ExtractText (docStream, tblStream);
-			Finished();
-		}
-		// FIXME: there are other specially treated "ASCII" characters
-		// available to list here, but, we will content with these
-		// and in general, we can replace ASCII 1 thru ASCII 31 with
-		// simple spaces, however, word has this wierd logic where-in
-		// if chp.fSpec = 1, some ASCII characters between ASCII 32 and 
-		// ASCII 41 should be interpreted differently.
-
-		public enum PieceType {
-			Unknown = 0,
-			Tab = 0x09,
-			HardLineBreaks = 0x0B,
-			PageSectionBreaks = 0x0C,
-			ParagraphEnds = 0x0D,
-			BreakingHyphens = 0x2D,
-			NonRequiredHyphens = 0x1F,
-			NonBreakingHyphens = 0x1E
+		[DllImport ("wv1glue")]
+	        private static extern Int32 wv1_glue_get_word_count (IntPtr smryStream);
+
+		[DllImport ("wv1glue")]
+		private static extern Int32 wv1_glue_get_character_count (IntPtr smryStream);
+
+		[DllImport ("wv1glue")]
+		private static extern Int32 wv1_glue_get_security (IntPtr smryStream);
+
+		[DllImport ("wv1glue")]
+		private static extern Int16 wv1_glue_get_codepage (IntPtr smryStream);
+
+		[DllImport ("wv1glue")]
+		private static extern void wv1_glue_close_stream (IntPtr oleStream, IntPtr summary);
+
+		//////////////////////////////////////////////////////////
+
+		public FilterDOC () 
+		{
+			AddSupportedMimeType ("application/msword");
+			AddSupportedMimeType ("application/vnd.ms-word");
+			AddSupportedMimeType ("application/x-msword");
 		}
+		
+  		private void IndexText (IntPtr byteArray, int dataLen, bool hotText)
+  		{
+			// Native callback: copies dataLen UTF-8 bytes from byteArray and appends them as text,
+			// bracketed by HotUp/HotDown when hotText is set.  Empty or invalid chunks are ignored
+			// so that "new byte[dataLen]" can never throw while native code is on the stack.
+			if (byteArray == IntPtr.Zero || dataLen <= 0)
+				return;
+			byte[] data = new byte[dataLen];
+			Marshal.Copy (byteArray, data, 0, dataLen);
+			if (hotText) HotUp ();
+			AppendText (System.Text.Encoding.UTF8.GetString (data, 0, dataLen));
+			if (hotText) HotDown ();
+  		}
+		override protected void DoOpen (FileInfo info)
+		{
+			FileName = info.FullName;
+		}
+		override protected void DoPullProperties ()
+		{
+			// Reads the OLE summary information of FileName through the wv1 glue and adds
+			// every non-empty string and every non-zero counter as a Beagle property.
+			// Any native handle that was opened is released again before returning.
+			IntPtr oleStream;
+			IntPtr oleSummaryStream;
+			string strProp;
+			Int32 intProp;
+
+			oleStream = wv1_glue_get_ole_stream (FileName);
+			if (oleStream == IntPtr.Zero) {
+				Logger.Log.Error ("Could not open OLE stream {0}", FileName);
+				return;
+			}
+			oleSummaryStream = wv1_glue_get_ole_summary_stream (oleStream);
+			if (oleSummaryStream == IntPtr.Zero) {
+				Logger.Log.Error ("Could not open OLE Meta data stream from {0}", FileName);
+				// The OLE stream is already open at this point: release it instead of leaking it.
+				// NOTE(review): a null summary is handed to the glue here - confirm that
+				// ms_ole_summary_close () tolerates NULL.
+				wv1_glue_close_stream (oleStream, IntPtr.Zero);
+				return;
+			}
+
+			// String-valued summary fields.
+			strProp = wv1_glue_get_title (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("dc:title", strProp));
+
+			strProp = wv1_glue_get_subject (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("dc:subject", strProp));
+
+			strProp = wv1_glue_get_comments (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("dc:comment", strProp));
+
+			strProp = wv1_glue_get_author (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("fixme:author", strProp));
+
+			strProp = wv1_glue_get_lastsavedby (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("fixme:lastsavedby", strProp));
+
+			strProp = wv1_glue_get_appname (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("fixme:appname", strProp));
+
+			strProp = wv1_glue_get_keywords (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("fixme:keywords", strProp));
+
+			strProp = wv1_glue_get_template (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("fixme:template", strProp));
+
+			strProp = wv1_glue_get_revision_number (oleSummaryStream);
+			if (strProp != null && strProp.Length > 0)
+				AddProperty (Beagle.Property.New ("fixme:revisionnumber", strProp));
+
+			// Counters are only published when they are non-zero.
+			intProp = wv1_glue_get_page_count (oleSummaryStream);
+			if (intProp != 0)
+				AddProperty (Beagle.Property.New ("fixme:page-count", intProp));
+
+			intProp = wv1_glue_get_word_count (oleSummaryStream);
+			if (intProp != 0)
+				AddProperty (Beagle.Property.New ("fixme:word-count", intProp));
 
-		public class PCD {
-			/* this structure is of size 8 bytes and other fields do exist..
-			   I am just having the one that I am interested in.. ;-)
-			*/
-			int _notInterested; // 16 bits
-			int fc; // 32 bits
-			int _actually_prm; // 16 bits
-
-			public PCD (Gsf.Input stream)
-			{
-				read (stream);
-			}
-			private void read (Gsf.Input stream)
-			{
-				byte[] data = stream.Read (2);
-				data = stream.Read (4);
-				fc = GetInt32 (data, 0);
-				//Console.WriteLine ("PCD FC : {0}", fc);
-				data = stream.Read (2);
-			}
+			wv1_glue_close_stream (oleStream, oleSummaryStream);
+		}
 
-			public int GetFC () { return fc;}
+		override protected void DoPull ()
+		{
+			int ret;
+			TextHandlerCallback textHandler;
+			textHandler = new TextHandlerCallback (IndexText);
+			
+			ret = wv1_glue_init_doc_parsing (FileName, textHandler);
+
+			if (ret == -2)
+				Logger.Log.Error ("{0} : is password protected", FileName);
+			else if (ret == -1)
+				Logger.Log.Error ("{0} : Unable to read", FileName);
+			else if (ret == -3)
+				Logger.Log.Error ("Unable to initiate the parser for {0}", FileName);
+			Finished ();
 		}
 	}
 }
Index: Filters/FilterSource.cs
===================================================================
RCS file: /cvs/gnome/beagle/Filters/FilterSource.cs,v
retrieving revision 1.4
diff -u -r1.4 FilterSource.cs
--- Filters/FilterSource.cs	15 Oct 2004 13:13:15 -0000	1.4
+++ Filters/FilterSource.cs	16 Nov 2004 16:49:24 -0000
@@ -86,16 +86,16 @@
 						   "try", "while", "yield"};
 
 		private enum LineType {
+			None,
 			SingleLineComment,
 			BlockComment,
-			StringConstant, 
-			None
+			StringConstant
 		}
 		
 		private enum LangType {
+			None,
 			C_Style,
-			Python,
-			None
+			Python
 		}
 		
 		LineType SrcLineType;
Index: Filters/Makefile.am
===================================================================
RCS file: /cvs/gnome/beagle/Filters/Makefile.am,v
retrieving revision 1.24
diff -u -r1.24 Makefile.am
--- Filters/Makefile.am	12 Nov 2004 15:03:22 -0000	1.24
+++ Filters/Makefile.am	16 Nov 2004 16:49:24 -0000
@@ -35,7 +35,6 @@
 if ENABLE_GSF_SHARP
 CSFILES += 				\
 	$(srcdir)/FilterOle.cs		\
-	$(srcdir)/FilterDOC.cs		\
 	$(srcdir)/FilterPPT.cs
 endif
 
@@ -45,6 +44,11 @@
 else
 CSFILES +=				\
 	$(srcdir)/FilterMusic.cs
+endif
+
+if ENABLE_WV1
+CSFILES += \
+	$(srcdir)/FilterDOC.cs		
 endif
 
 LOCAL_ASSEMBLIES =				\
Index: glue/Makefile.am
===================================================================
RCS file: /cvs/gnome/beagle/glue/Makefile.am,v
retrieving revision 1.5
diff -u -r1.5 Makefile.am
--- glue/Makefile.am	2 Nov 2004 20:22:42 -0000	1.5
+++ glue/Makefile.am	16 Nov 2004 16:49:25 -0000
@@ -14,6 +14,10 @@
         -DMOZILLA_HOME=\""$(MOZILLA_HOME)\""              \
         -include $(MOZILLA_INCLUDE_ROOT)/mozilla-config.h
 
+if ENABLE_WV1
+INCLUDES += \
+	$(WV1_CFLAGS)
+endif
 
 gluelibdir = $(pkglibdir)
 
@@ -23,6 +27,11 @@
 	libgeckoglue.la         \
 	libkeyglue.la
 
+if ENABLE_WV1
+gluelib_LTLIBRARIES += \
+	libwv1glue.la
+endif	
+
 libinotifyglue_la_SOURCES =	\
 	inotify.h		\
 	inotify-glue.c
@@ -43,4 +52,10 @@
 	eggaccelerators.c       \
 	eggaccelerators.h
 
+if ENABLE_WV1
+libwv1glue_la_SOURCES = \
+	wv1-glue.c
+libwv1glue_la_LIBADD = @X_LIBS@ \
+			$(WV1_LIBS)
 
+endif
--- /dev/null	2004-08-25 10:34:59.000000000 -0700
+++ glue/wv1-glue.c	2004-11-16 20:07:37.432313528 -0800
@@ -0,0 +1,471 @@
+/*
+ * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
+ * Microsoft Word documents).
+ *
+ * Copyright (C) 2004 Novell, Inc.
+ *
+ * Author: Veerapuram Varadhan <vvaradhan novell com>
+ * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
+ *
+ */
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <wv.h>
+
+/* Callback to Handle "text" (or words) extracted out of 
+ * M$ Word documents 
+ *
+ * text: Holds the extracted text/words, UTF-8 encoded.
+ * len: Number of bytes in text.
+ * hotText: Identifies the attributes of the text.
+ *          (bold, italic, underline, superscript, subscript)
+ */
+
+typedef void (* wvTextHandlerCallback) (U8* text, int len, U8 hotText);
+
+typedef struct _UserData {
+  /* formatting variables */
+
+  int cFontSize;
+  int cCol;
+
+  /* boolean formats */
+  int bIsBold:1;
+  int bIsItalic:1;
+  int bIsStrike:1;
+  int bIsUl:1;
+  int bIsSup:1;
+  int bIsSub:1;
+
+  /* beagle specifc formats */
+  U8 bIsHot;
+
+  /* buffer to hold text */
+  GString* txtWord;
+
+  wvTextHandlerCallback WordHandler;
+  
+} UserData;
+
+
+/*
+ * append_char: fills the txtWord buffer with the character 'ch'
+ * converted to UTF8 encoding.  Calls the "WordHandler" and empties
+ * the buffer after every space or new-line and when 0x00 (sent by
+ * docProc at DOCEND) arrives; the 0x00 itself is never stored.
+ *
+ * ud : carries the UserData filled-in appropriately to hold the
+ *      character (text) attributes.
+ *
+ * ch : unicode character; 0x0B, 0x0C and 0x0D are stored as a space
+ *
+ */
+
+void
+append_char (UserData * ud, U16 ch)
+{
+  char tmpBuf[64];
+  int len = 0, flush = 0;	/* flush: hand the buffered text to the handler */
+
+  switch (ch) {
+  case 0x00: /* flush request only: a NUL must not end up in the indexed text */
+    flush = 1;
+    break;
+  case 0x20: /* space */
+  case 0x0B: /* hard line break */
+  case 0x0D: /* paragraph end */
+  case 0x0C: /* page/section break */
+    ch = 0x20; /* all four are stored as a single space; fall through */
+  case '\n': /* new-line */
+    g_string_append_c (ud->txtWord, (gchar) ch);
+    flush = 1;
+    break;
+  default: /* ordinary character: store it UTF-8 encoded */
+    len = g_unichar_to_utf8 (ch, tmpBuf);
+    g_string_append_len (ud->txtWord, tmpBuf, len);
+    break;
+  }
+  if (flush && ud->txtWord->len > 0) {
+    (*(ud->WordHandler)) ((U8 *) ud->txtWord->str, ud->txtWord->len, ud->bIsHot);
+    g_string_erase (ud->txtWord, 0, -1);
+  }
+}
+
+/*
+ * fill_UserData: fills the UserData structure from the
+ * CHP structure that represents the Character Property
+ * Information (bold, italic, strike, underline, superscript,
+ * subscript, font size, colour).  bIsHot is set when the text is
+ * bold, italic, underlined, superscript or subscript.
+ * ps is currently unused.
+ */
+void
+fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
+{
+  ud->cCol = chp->ico ? chp->ico - 1 : 0;
+  ud->cFontSize = chp->hps;
+
+  /* Normalise every flag to 0/1 before storing it in a 1-bit field */
+  ud->bIsBold = (chp->fBold != 0);
+  ud->bIsItalic = (chp->fItalic != 0);
+  /* NOTE(review): kul looks like a multi-valued underline code; without the
+   * comparison a 1-bit field would keep only its lowest bit - confirm in wv.h */
+  ud->bIsUl = (chp->kul != 0);
+  ud->bIsStrike = (chp->fStrike != 0);
+  ud->bIsSup = (chp->iss == 1);
+  ud->bIsSub = (chp->iss == 2);
+
+  ud->bIsHot = (ud->bIsBold || ud->bIsItalic || ud->bIsUl
+		|| ud->bIsSup || ud->bIsSub) ? 1 : 0;
+}
+
+/* This is a callback that handles the individual 
+ * character that are extracted from M$ word file.
+ */
+static int
+charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
+{
+
+  /* convert incoming character to unicode */
+  if (chartype) {
+    eachchar = wvHandleCodePage (eachchar, lid);
+  }
+
+  /* take care of any oddities in Microsoft's character "encoding" */
+  /* TODO: does the above code page handler take care of these? */
+  if (chartype == 1 && eachchar == 146)
+    eachchar = 39;		/* apostrophe */
+
+  switch (eachchar)
+    {
+    case 14:			/* column break */
+      break;
+
+    case 19:			/* field begin */
+      /* flush current text buffer */
+      ps->fieldstate++;
+      ps->fieldmiddle = 0;
+      return 0;
+    case 20:			/* field separator */
+      ps->fieldmiddle = 1;
+      return 0;
+    case 21:			/* field end */
+      ps->fieldstate--;
+      ps->fieldmiddle = 0;
+      return 0;
+
+    default:
+      break;
+    }
+
+  if (eachchar == 0x14)
+    return 0;
+
+  append_char (ps->userData, eachchar);
+  return 0;
+}
+
+/* This is a callback that handles the special
+ * characters that are specific to an MS Word file.
+ * Only the field markers (19, 20 and 21) are acted upon; every other
+ * special character is ignored.
+ *
+ * ps       : parser state; its fieldstate / fieldmiddle members are updated
+ * eachchar : the special character being reported
+ * achp     : character properties of eachchar (not used)
+ *
+ * Nothing is appended to the text buffer from here.  Always returns 0.
+ */
+static int
+specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
+{
+  (void) achp;
+
+  switch (eachchar)
+    {
+    case 19:			/* field begin */
+      /* one more field is open and its separator has not been seen yet */
+      ps->fieldstate++;
+      ps->fieldmiddle = 0;
+      break;
+
+    case 20:			/* field separator */
+      /* embedded OLE2 objects (achp->fOle2) get no special treatment */
+      ps->fieldmiddle = 1;
+      break;
+
+    case 21:			/* field end */
+      /* the field opened last is closed again */
+      ps->fieldstate--;
+      ps->fieldmiddle = 0;
+      break;
+
+    default:
+      /* every other special character is ignored */
+      break;
+    }
+
+  return 0;
+}
+
+/* This is a callback that handles the individual
+ * elements that are marked by libwv1.
+ *
+ * ps    : parser state; ps->userData is used as the UserData
+ * tag   : element being reported
+ * props : read as a CHP for the CHARPROP tags; dirty is not used
+ * Always returns 0.
+ */
+
+static int
+eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
+{
+  UserData *ud = (UserData *) ps->userData;
+
+  (void) dirty;
+
+  switch (tag)
+    {
+    case SECTIONEND:
+    case PARAEND:
+      /* close the current line; append_char hands the buffered
+       * text to the word handler when it sees the new-line */
+      append_char (ud, '\n');
+      break;
+
+    case CHARPROPBEGIN:
+    case CHARPROPEND:
+      /* both tags carry a CHP in props: refresh the text attributes */
+      fill_UserData (ud, (CHP *) props, ps);
+      break;
+
+    default:
+      /* all other tags are ignored */
+      break;
+    }
+
+  return 0;
+}
+
+/* This is a callback that handles the document 
+ * level tags that are marked by libwv1.
+ */
+
+static int
+docProc (wvParseStruct * ps, wvTag tag)
+{
+  switch (tag)
+    {
+    case DOCEND:
+      append_char (ps->userData, 0x00);
+      break;
+
+    default:
+      break;
+    }
+
+  return 0;
+}
+
+/*
+ * wv1_glue_init_doc_parsing: Initiates the document parsing 
+ * procedure.  Sets up all the required handlers and the parser.
+ * 
+ * fname: Name of the file to parse. (essentially a M$ word file)
+ *
+ * wvTextHandlerCallback: The callback routine that will be called 
+ * on extraction of each word.
+ *
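+ * Return: 0 -> success, -1 -> file could not be opened,
+ *        -2 -> wvInitParser result has bit 0x8000 set, -3 -> any other wvInitParser error.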
+ */
+
+int
+wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
+{
+  FILE *input;
+  int ret;
+
+  wvParseStruct ps;
+  char *dir = NULL;
+
+  UserData ud;
+
+  input = fopen (fname, "rb");
+  if (!input)
+      return -1;
+  fclose (input);
+
+  wvInit ();
+  ret = wvInitParser (&ps, fname);
+  if (ret & 0x8000)
+    return -2;
+  else if (ret) 
+    return -3;
+
+  ps.filename = fname;
+  ps.dir = dir;
+
+  /* set to 0 */
+  memset (&ud, 0, sizeof (UserData));
+  ud.WordHandler = callback;
+  ud.txtWord = g_string_sized_new (32);
+  ps.userData = &ud;
+
+
+  wvSetElementHandler (&ps, eleProc);
+  wvSetDocumentHandler (&ps, docProc);
+  wvSetCharHandler (&ps, charProc);
+  wvSetSpecialCharHandler (&ps, specCharProc);
+
+  wvText (&ps);
+
+  /* free associated memory */
+  wvOLEFree (&ps);
+  
+  /* free userdata memory */
+  g_string_free (ud.txtWord, TRUE);
+
+  return 0;
+}
+
+void *
+wv1_glue_get_ole_stream (const char* fname)
+{
+    MsOle *ole = NULL;
+    ms_ole_open (&ole, fname);
+    return ((void *)ole);
+}
+
+void *
+wv1_glue_get_ole_summary_stream (MsOle *stream)
+{
+  MsOle *oleStream = (MsOle *)stream;
+  MsOleSummary *summary = NULL;
+  summary = ms_ole_summary_open (oleStream);
+  return ((void *)summary);  
+}
+
+char *
+wv1_glue_get_title (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TITLE, &ret));  
+}
+
+char *
+wv1_glue_get_subject (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_SUBJECT, &ret));
+}
+
+char *
+wv1_glue_get_author (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_AUTHOR, &ret));
+}
+
+char *
+wv1_glue_get_keywords (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_KEYWORDS, &ret));
+}
+
+char *
+wv1_glue_get_comments (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_COMMENTS, &ret));
+}
+                                                                                                                            
+char *
+wv1_glue_get_template (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TEMPLATE, &ret));
+}
+
+char *
+wv1_glue_get_lastsavedby (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_LASTAUTHOR, &ret));
+}
+
+char *
+wv1_glue_get_revision_number (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_REVNUMBER, &ret));
+}
+
+char *
+wv1_glue_get_appname (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_APPNAME, &ret));
+}
+
+long
+wv1_glue_get_page_count (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_PAGECOUNT, &ret));
+}
+
+long
+wv1_glue_get_word_count (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_WORDCOUNT, &ret));
+}
+
+long
+wv1_glue_get_character_count (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_CHARCOUNT, &ret));
+}
+
+long
+wv1_glue_get_security (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_SECURITY, &ret));
+}
+
+short
+wv1_glue_get_codepage (MsOleSummary* smryStream)
+{
+  int ret;
+  return (ms_ole_summary_get_short (smryStream, MS_OLE_SUMMARY_CODEPAGE, &ret));
+}
+
+void
+wv1_glue_close_stream (MsOle* oleStream, MsOleSummary* summary)
+{ /* Releases both handles; either may be NULL and is then skipped. */
+    if (summary) ms_ole_summary_close (summary);
+    if (oleStream) ms_ole_destroy (&oleStream);
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]