Re: Thoughts about FilterChm



>
> I dont have much idea about how the chm filter works. Why dont you
> give it a try, maybe duplicating code on the way ? We can then work on
> that and clean it up. Once I see the code it will be helpful to
> comment how the html filter can be exploited.

Hi!

Well, I've already started the work. I decided to modify the filter to
inherit behaviour from the HTML filter, and I overrode the DoOpen
method, but I'm getting a strange NullReferenceException from
HtmlAgilityPack in methods that previously worked flawlessly when the
chm filter was working. I am reading that code in order to trace the bug.
Did someone change/update the HtmlAgilityPack code?


I found these lines when reading the code:


// SLIM: _text = reader.ReadToEnd();

_text = new StreamAsArray (sr);

That makes me think that maybe someone changed/updated the code,
thereby breaking my code :P



-------------------------------------------------------------------------------------------------------
This is the output:

ceruno neguange:~/desarrollo/beagle_doctest$ beagle-extract-content
O\'Reilly_-_Regular.Expression.Pocket.Reference.eBook-LiB.chm 
Filename: file:///home/ceruno/desarrollo/beagle_doctest/O'Reilly_-_Regular.Expression.Pocket.Reference.eBook-LiB.chm
Debug: Loaded 42 filters from
/home/ceruno/desarrollo/root/lib/beagle/Filters/Filters.dll
Object reference not set to an instance of an object
in [0x0004b] (at
/home/ceruno/desarrollo/beagle/Filters/HtmlAgilityPack/HtmlDocument.cs:205)
HtmlAgilityPack.StreamAsArray:Read (Boolean initial)
in [0x00051] (at
/home/ceruno/desarrollo/beagle/Filters/HtmlAgilityPack/HtmlDocument.cs:194)
HtmlAgilityPack.StreamAsArray:.ctor (System.IO.StreamReader r)
in [0x0009e] (at
/home/ceruno/desarrollo/beagle/Filters/HtmlAgilityPack/HtmlDocument.cs:890)
HtmlAgilityPack.HtmlDocument:Load (System.IO.TextReader reader)
in [0x000aa] (at
/home/ceruno/desarrollo/beagle/Filters/FilterChm.cs:188)
Beagle.Filters.FilterChm:DoOpen (System.IO.FileInfo info)
Filter: Beagle.Filters.FilterChm
MimeType: application/x-chm

Properties:
  dc:title = O'Reilly : Regular Expression Pocket Reference

(no content)
(no hot content)
//
// FilterChm.cs : Trivial implementation of a CHM filter.
//
// Copyright (C) 2005,2006 Miguel Cabrera <mfcabrer unal edu co>
//
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

using System;
using System.Collections;
using System.IO;
using System.Text;
using HtmlAgilityPack;

using Beagle.Util;
using Beagle.Daemon;

namespace Beagle.Filters {

	/// <summary>
	/// Filter for CHM files (MIME type application/x-chm).
	/// Inherits from <see cref="FilterHtml"/> and feeds the CHM's default
	/// page through an HtmlAgilityPack document whose node events are
	/// handled by the inherited HandleNodeEvent.
	/// </summary>
	public class FilterChm : FilterHtml  {
		
		// The CHM archive being filtered; created in DoOpen, released in DoClose.
		ChmFile chmFile;
	
		/// <summary>
		/// Registers the supported MIME type and turns snippet mode on.
		/// </summary>
		public FilterChm () : base()
		{
			RegisterSupportedTypes();
			SnippetMode = true;
		}

		/// <summary>
		/// Descends through the top-level structure of a table-of-contents
		/// document. "html" and "head" elements are recursed into with this
		/// method; every other element (including "body") is handed to
		/// <see cref="WalkToc"/>. Currently unused.
		/// </summary>
		/// <param name="node">Node whose children are inspected.</param>
		public void WalkTocFile(HtmlNode node) 
		{
			foreach (HtmlNode subnode in node.ChildNodes) {
				if (subnode.NodeType != HtmlNodeType.Element)
					continue;

				switch (subnode.Name) {
				case "html":
				case "head":
					WalkTocFile (subnode);
					break;
				case "body":
				default:
					WalkToc (subnode);
					break;
				}
			}
		}

		/// <summary>
		/// Recursively walks document/element nodes. The children of every
		/// "li" element are passed to <see cref="HandleTocEntry"/> before the
		/// recursion continues into all children. Currently unused.
		/// </summary>
		/// <param name="node">Node to walk.</param>
		public void WalkToc(HtmlNode node)
		{
			switch (node.NodeType) {
			case HtmlNodeType.Document:
			case HtmlNodeType.Element:
				if (node.Name == "li") {
					foreach (HtmlNode subnode in node.ChildNodes)
						HandleTocEntry (subnode);
				}

				foreach (HtmlNode subnode in node.ChildNodes)
					WalkToc (subnode);
				break;
			}
		}	
		
		/// <summary>
		/// Handles one candidate TOC entry. For an "object" element whose
		/// "type" attribute equals "text/sitemap" (case-insensitive), the
		/// "value" attribute of each child "param" element named "Name" is
		/// appended as hot text. Currently unused.
		/// </summary>
		/// <param name="node">Node to examine.</param>
		public void HandleTocEntry(HtmlNode node)
		{
			if (node.Name != "object")
				return;

			string attr = node.GetAttributeValue ("type", "");
			if (String.Compare (attr, "text/sitemap", true) != 0)
				return;

			foreach (HtmlNode subnode in node.ChildNodes) {
				if (String.Compare (subnode.Name, "param", true) == 0 &&
				    subnode.GetAttributeValue ("name", "") == "Name") {
					HotUp ();
					AppendText (subnode.GetAttributeValue ("value", ""));
					HotDown ();
				}
			}
		}		
		
		/// <summary>
		/// Parses HTML from <paramref name="reader"/> in stream mode, routing
		/// node events to the inherited HandleNodeEvent. Currently unused.
		/// </summary>
		/// <param name="reader">Source of the HTML text; a null reader is
		/// logged and ignored.</param>
		void ReadHtml(TextReader reader) 
		{
			if (reader == null) {
				Logger.Log.Warn ("FilterChm: no HTML reader available, nothing to parse");
				return;
			}

			HtmlDocument doc = new HtmlDocument ();
			doc.ReportNode += HandleNodeEvent;	
			doc.StreamMode = true;

			try {
				doc.Load (reader);
			} catch (ArgumentNullException e) {
				// Should not happen now that the reader is checked above;
				// kept as a safety net and logged rather than propagated.
				Logger.Log.Warn (e.Message);
			}
		}
		
		/// <summary>
		/// Opens the CHM file and parses its default page. If the archive
		/// cannot be loaded the filter is marked finished. A missing default
		/// page or a parse failure is logged and parsing is skipped, so that
		/// properties can still be pulled afterwards.
		/// </summary>
		/// <param name="info">The CHM file to open.</param>
		override protected void DoOpen (FileInfo info) 
		{
			chmFile = new ChmFile();

			try {
				chmFile.Load(info.FullName);
			}
			catch (Exception e) {
				Logger.Log.Warn ("Could not load {0}: {1}",info.Name,e.Message);
				Finished ();
				return;
			}

			TextReader default_page = chmFile.GetDefaultFile();

			// Passing a null reader to the parser only produces an
			// exception deep inside HtmlAgilityPack; bail out early instead.
			if (default_page == null) {
				Logger.Log.Warn ("No default page found in {0}", info.Name);
				return;
			}
			
			HtmlDocument doc = new HtmlDocument ();
			doc.StreamMode = true;
			doc.ReportNode += HandleNodeEvent;

			try {
				doc.Load (default_page);
			} 
			catch (Exception e) {
				// e.ToString() carries the message and the stack trace.
				Logger.Log.Warn ("Error parsing file contents {0}: {1}", info.Name, e.ToString ());
			}
		}

		/// <summary>
		/// Adds the CHM title as the dc:title property when one is present.
		/// </summary>
		override protected void DoPullProperties() 
		{
			if (chmFile == null)
				return;

			// Title may be null as well as empty; skip both.
			if (! String.IsNullOrEmpty (chmFile.Title))
				AddProperty (Beagle.Property.New ("dc:title", chmFile.Title));
		}

		/// <summary>
		/// Marks the filter as finished; the default page has already been
		/// parsed in <see cref="DoOpen"/>.
		/// </summary>
		override protected void DoPull()
		{
			//Logger.Log.Debug("FilterCHM: Parsing:" + chmFile.Title);
			//chmFile.ParseContents(ReadHtml);

			/*
			  Disabled: we only read the default file and the topic file

			  ReadHtml(chmFile.GetDefaultFile());

			  HtmlDocument doc = new HtmlDocument();
			  doc.Load(chmFile.GetTopicsFile());
			  WalkTocFile(doc.DocumentNode);
			*/
			
			Finished();
		}

		/// <summary>
		/// Releases the CHM file, if one was opened.
		/// </summary>
		override protected void  DoClose() 
		{
			if (chmFile != null)
				chmFile.Dispose();
		}
		
		/// <summary>
		/// Declares application/x-chm as the MIME type this filter handles.
		/// </summary>
		override protected  void  RegisterSupportedTypes()
		{
			AddSupportedFlavor (FilterFlavor.NewFromMimeType ("application/x-chm"));
		}

	}
}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]