[tracker] tracker-extract: Fixed HTML parser to use TrackerSparqlBuilder



commit 12593e8cefdcd2d3f278a22aab1c977b95c05a4b
Author: Martyn Russell <martyn lanedo com>
Date:   Tue Nov 3 17:52:43 2009 +0000

    tracker-extract: Fixed HTML parser to use TrackerSparqlBuilder

 src/tracker-extract/tracker-extract-html.c |  245 ++++++++++++++--------------
 1 files changed, 125 insertions(+), 120 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-html.c b/src/tracker-extract/tracker-extract-html.c
index 514d08d..a90b7ce 100644
--- a/src/tracker-extract/tracker-extract-html.c
+++ b/src/tracker-extract/tracker-extract-html.c
@@ -1,7 +1,7 @@
 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 /*
- * Copyright (C) 2007, Jason Kivlighn (jkivlighn gmail com)
- * Copyright (C) 2008, Nokia
+ * Copyright (C) 2007,      Jason Kivlighn (jkivlighn gmail com)
+ * Copyright (C) 2008-2009, Nokia
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -31,12 +31,7 @@
 
 #include <libtracker-common/tracker-ontology.h>
 
-#define NIE_PREFIX TRACKER_NIE_PREFIX
-#define NFO_PREFIX TRACKER_NFO_PREFIX
-#define NCO_PREFIX TRACKER_NCO_PREFIX
-
-#define RDF_PREFIX TRACKER_RDF_PREFIX
-#define RDF_TYPE RDF_PREFIX "type"
+#define RDF_TYPE TRACKER_RDF_PREFIX "type"
 
 typedef enum {
 	READ_TITLE,
@@ -46,10 +41,10 @@ typedef struct {
 	TrackerSparqlBuilder *metadata;
 	tag_type current;
 	const gchar *uri;
-} HTMLParseInfo;
+} parser_data;
 
-static void extract_html (const gchar *filename,
-			  TrackerSparqlBuilder   *metadata);
+static void extract_html (const gchar          *filename,
+			  TrackerSparqlBuilder *metadata);
 
 static TrackerExtractData data[] = {
 	{ "text/html",		   extract_html },
@@ -58,19 +53,19 @@ static TrackerExtractData data[] = {
 };
 
 static gboolean
-has_attribute (const xmlChar **atts,
-	       const gchar    *attr,
-	       const gchar    *val)
+has_attribute (const gchar **attrs,
+	       const gchar  *attr,
+	       const gchar  *val)
 {
 	gint i;
 
-	if (!(atts && attr && val)) {
+	if (!attrs || !attr || !val) {
 		return FALSE;
 	}
 
-	for (i = 0; atts[i] && atts[i + 1]; i += 2) {
-		if (strcasecmp ((gchar*) atts[i], attr) == 0) {
-			if (strcasecmp ((gchar*) atts[i + 1], val) == 0) {
+	for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
+		if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
+			if (g_ascii_strcasecmp (attrs[i + 1], val) == 0) {
 				return TRUE;
 			}
 		}
@@ -80,174 +75,184 @@ has_attribute (const xmlChar **atts,
 }
 
 static const xmlChar *
-lookup_attribute (const xmlChar **atts,
-		  const gchar	 *attr)
+lookup_attribute (const gchar **attrs,
+		  const gchar  *attr)
 {
 	gint i;
 
-	if (!atts || !attr) {
+	if (!attrs || !attr) {
 		return NULL;
 	}
 
-	for (i = 0; atts[i] && atts[i + 1]; i += 2) {
-		if (strcasecmp ((gchar*) atts[i], attr) == 0) {
-			return atts[i + 1];
+	for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
+		if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
+			return attrs[i + 1];
 		}
 	}
 
 	return NULL;
 }
 
-void
-startElement (void	     *info_,
-	      const xmlChar  *name,
-	      const xmlChar **atts)
+static void
+parser_start_element (void           *data,
+		      const xmlChar  *name_,
+		      const xmlChar **attrs_)
 {
-	HTMLParseInfo* info = info_;
+	parser_data *pd = data;
+	const gchar *name = (const gchar*) name_;
+	const gchar **attrs = (const gchar**) attrs_;
 
-	if (!(info && name)) {
+	if (!pd || !name) {
 		return;
 	}
 
 	/* Look for RDFa triple describing the license */
-	if (strcasecmp ((gchar*) name, "a") == 0) {
+	if (g_ascii_strcasecmp (name, "a") == 0) {
 		/* This tag is a license.  Ignore, however, if it is
 		 * referring to another document.
 		 */
-		if (has_attribute (atts, "rel", "license") &&
-		    has_attribute (atts, "about", NULL) == FALSE) {
+		if (has_attribute (attrs, "rel", "license") &&
+		    has_attribute (attrs, "about", NULL) == FALSE) {
 			const xmlChar *href;
 
-			href = lookup_attribute (atts, "href");
+			href = lookup_attribute (attrs, "href");
 
 			if (href) {
-				tracker_statement_list_insert (info->metadata,
-							  info->uri, NIE_PREFIX "license", 
-							  (const gchar *)  href);
+				tracker_sparql_builder_predicate (pd->metadata, "nie:license");
+				tracker_sparql_builder_object_unvalidated (pd->metadata, href);
 			}
 		}
-	} else if (strcasecmp ((gchar*)name, "title") == 0) {
-		info->current = READ_TITLE;
-	} else if (strcasecmp ((gchar*)name, "meta") == 0) {
-		if (has_attribute (atts, "name", "Author")) {
+	} else if (g_ascii_strcasecmp (name, "title") == 0) {
+		pd->current = READ_TITLE;
+	} else if (g_ascii_strcasecmp (name, "meta") == 0) {
+		if (has_attribute (attrs, "name", "author")) {
 			const xmlChar *author;
 
-			author = lookup_attribute (atts, "content");
+			author = lookup_attribute (attrs, "content");
 
 			if (author) {
-				tracker_statement_list_insert (info->metadata, ":", RDF_TYPE, NCO_PREFIX "Contact");
-				tracker_statement_list_insert (info->metadata, ":", NCO_PREFIX "fullname", author);
-				tracker_statement_list_insert (info->metadata, info->uri, NCO_PREFIX "creator", ":");
+				tracker_sparql_builder_predicate (pd->metadata, "nco:creator");
+				tracker_sparql_builder_object_blank_open (pd->metadata);
+				tracker_sparql_builder_predicate (pd->metadata, "a");
+				tracker_sparql_builder_object (pd->metadata, "nco:Contact");
+				tracker_sparql_builder_predicate (pd->metadata, "nco:fullname");
+				tracker_sparql_builder_object_unvalidated (pd->metadata, author);
+				tracker_sparql_builder_object_blank_close (pd->metadata);
 			}
 		}
 
-		if (has_attribute (atts, "name", "DC.Description")) {
+		if (has_attribute (attrs, "name", "description")) {
 			const xmlChar *desc;
 
-			desc = lookup_attribute (atts,"content");
+			desc = lookup_attribute (attrs,"content");
 
 			if (desc) {
-				tracker_statement_list_insert (info->metadata,
-							  info->uri, NIE_PREFIX "comment",
-							  (const gchar *) desc);
+				tracker_sparql_builder_predicate (pd->metadata, "nie:description");
+				tracker_sparql_builder_object_unvalidated (pd->metadata, desc);
 			}
 		}
 
-		if (has_attribute (atts, "name", "KEYWORDS") ||
-		    has_attribute (atts, "name", "keywords")) {
-			const xmlChar* k = lookup_attribute (atts, "content");
-
-			if (k) {
-				gchar *keywords = g_strdup (k);
-				char *lasts, *keyw;
-
-				for (keyw = strtok_r (keywords, ",;", &lasts); keyw; 
-				     keyw = strtok_r (NULL, ",;", &lasts)) {
-					tracker_statement_list_insert (info->metadata,
-							  info->uri, NIE_PREFIX "keyword",
-							  (const gchar*) keyw);
+		if (has_attribute (attrs, "name", "keywords")) {
+			const xmlChar* content = lookup_attribute (attrs, "content");
+
+			if (content) {
+				gchar **keywords;
+				gint i;
+
+				keywords = g_strsplit (content, ",", -1);
+				if (keywords) {
+					for (i = 0; keywords[i] != NULL; i++) {
+						if (!keywords[i] || keywords[i] == '\0') {
+							continue;
+						}
+
+						tracker_sparql_builder_predicate (pd->metadata, "nie:keyword");
+						tracker_sparql_builder_object_unvalidated (pd->metadata, g_strstrip (keywords[i]));
+					}
+					
+					g_strfreev (keywords);
 				}
-
-				g_free (keywords);
 			}
 		}
 	}
 }
 
-void
-characters (void	  *info_,
-	    const xmlChar *ch,
-	    int		   len)
+static void
+parser_characters (void          *data,
+		   const xmlChar *ch,
+		   int		  len)
 {
-	HTMLParseInfo* info = info_;
+	parser_data *pd = data;
 
-	switch (info->current) {
+	switch (pd->current) {
 	case READ_TITLE:
-		tracker_statement_list_insert (info->metadata,
-					  info->uri, NIE_PREFIX "title",
-					  (const gchar*) ch);
+		tracker_sparql_builder_predicate (pd->metadata, "nie:title");
+		tracker_sparql_builder_object_unvalidated (pd->metadata, ch);
 		break;
 	default:
 		break;
 	}
 
-	info->current = -1;
+	pd->current = -1;
 }
 
 static void
-extract_html (const gchar *uri,
-	      TrackerSparqlBuilder   *metadata)
+extract_html (const gchar          *uri,
+	      TrackerSparqlBuilder *metadata)
 {
-	gchar *filename = g_filename_from_uri (uri, NULL, NULL);
-	xmlSAXHandler SAXHandlerStruct = {
-			NULL, /* internalSubset */
-			NULL, /* isStandalone */
-			NULL, /* hasInternalSubset */
-			NULL, /* hasExternalSubset */
-			NULL, /* resolveEntity */
-			NULL, /* getEntity */
-			NULL, /* entityDecl */
-			NULL, /* notationDecl */
-			NULL, /* attributeDecl */
-			NULL, /* elementDecl */
-			NULL, /* unparsedEntityDecl */
-			NULL, /* setDocumentLocator */
-			NULL, /* startDocument */
-			NULL, /* endDocument */
-			startElement, /* startElement */
-			NULL, /* endElement */
-			NULL, /* reference */
-			characters, /* characters */
-			NULL, /* ignorableWhitespace */
-			NULL, /* processingInstruction */
-			NULL, /* comment */
-			NULL, /* xmlParserWarning */
-			NULL, /* xmlParserError */
-			NULL, /* xmlParserError */
-			NULL, /* getParameterEntity */
-			NULL, /* cdataBlock */
-			NULL, /* externalSubset */
-			1,    /* initialized */
-			NULL, /* private */
-			NULL, /* startElementNsSAX2Func */
-			NULL, /* endElementNsSAX2Func */
-			NULL  /* xmlStructuredErrorFunc */
+	htmlDocPtr doc;
+	parser_data pd;
+	gchar *filename;
+	xmlSAXHandler handler = {
+		NULL, /* internalSubset */
+		NULL, /* isStandalone */
+		NULL, /* hasInternalSubset */
+		NULL, /* hasExternalSubset */
+		NULL, /* resolveEntity */
+		NULL, /* getEntity */
+		NULL, /* entityDecl */
+		NULL, /* notationDecl */
+		NULL, /* attributeDecl */
+		NULL, /* elementDecl */
+		NULL, /* unparsedEntityDecl */
+		NULL, /* setDocumentLocator */
+		NULL, /* startDocument */
+		NULL, /* endDocument */
+		parser_start_element, /* startElement */
+		NULL, /* endElement */
+		NULL, /* reference */
+		parser_characters, /* characters */
+		NULL, /* ignorableWhitespace */
+		NULL, /* processingInstruction */
+		NULL, /* comment */
+		NULL, /* xmlParserWarning */
+		NULL, /* xmlParserError */
+		NULL, /* xmlParserError */
+		NULL, /* getParameterEntity */
+		NULL, /* cdataBlock */
+		NULL, /* externalSubset */
+		1,    /* initialized */
+		NULL, /* private */
+		NULL, /* startElementNsSAX2Func */
+		NULL, /* endElementNsSAX2Func */
+		NULL  /* xmlStructuredErrorFunc */
 	};
 
-	HTMLParseInfo	info = { metadata, -1, uri };
+	tracker_sparql_builder_subject_iri (metadata, uri);
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object_unvalidated (metadata, "nfo:Document");
 
-	htmlDocPtr doc;
-	doc = htmlSAXParseFile (filename, NULL, &SAXHandlerStruct, &info);
-	if (doc) {
+	pd.metadata = metadata;
+	pd.current = -1;
+	pd.uri = uri;
 
-		tracker_statement_list_insert (metadata, uri, 
-		                          RDF_TYPE, 
-		                          NFO_PREFIX "Document");
+	filename = g_filename_from_uri (uri, NULL, NULL);
+	doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
+	g_free (filename);
 
+	if (doc) {
 		xmlFreeDoc (doc);
 	}
-
-	g_free (filename);
 }
 
 TrackerExtractData *



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]