[tracker/parser-libunistring-review] Enabled hexadecimal content debug logs in parser



commit 1aed0ae01c0e5d18aea336451278c39a98767c36
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Wed Apr 28 12:36:55 2010 +0200

    Enabled hexadecimal content debug logs in parser

 src/libtracker-fts/tracker-parser.c |  183 +++++++++++++++++++++++++++++------
 1 files changed, 153 insertions(+), 30 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index ef4344e..64f66c4 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -20,6 +20,7 @@
 
 #include "config.h"
 
+#include <stdio.h>
 #include <string.h>
 
 #ifdef HAVE_UNAC
@@ -39,6 +40,10 @@
 
 #include "tracker-parser.h"
 
+/* Define to 1 if you want to enable debugging logs showing HEX contents
+ * of the words being parsed */
+#define TRACKER_PARSER_DEBUG_HEX 0
+
 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
 #define WORD_BUFFER_LENGTH 512
 
@@ -125,6 +130,48 @@ struct TrackerParser {
 };
 
 
+#if TRACKER_PARSER_DEBUG_HEX
+/* Returns a newly-allocated string with each of the @size bytes of @data as
+ * two uppercase hex digits joined by @delimiter; free it with g_free().
+ * Based on GNU PDF's pdf_text_test_get_hex() */
+static gchar *
+tracker_strhex (const gchar *data,
+                const gsize size,
+                gchar delimiter)
+{
+	static const gchar hex_digits[] = "0123456789ABCDEF";
+	gsize i;
+	gsize j;
+	gchar *new_str;
+
+	/* g_malloc0 (0) returns NULL, which callers would then hand to a
+	 * "%s" format; always return a valid (possibly empty) string. */
+	if (data == NULL || size == 0) {
+		return g_strdup ("");
+	}
+
+	/* If input has N bytes, we need 2N bytes for the hex digits, N-1
+	 * bytes for the delimiters and 1 byte for the last NUL: 3N bytes.
+	 * The buffer is zero-filled, so the terminating NUL is already in
+	 * place once the loop below is done. */
+	new_str = g_malloc0 (3 * size);
+
+	/* Print hexadecimal representation of each byte... */
+	for (i = 0, j = 0; i < size; i++, j += 3) {
+		const guint8 byte = (guint8) data[i];
+		new_str[j] = hex_digits[byte >> 4];
+		new_str[j + 1] = hex_digits[byte & 0x0F];
+		/* And if needed, add separator */
+		if (i + 1 < size) {
+			new_str[j + 2] = delimiter;
+		}
+	}
+
+	return new_str;
+}
+
+#endif /* TRACKER_PARSER_DEBUG_HEX */
+
 
 #ifndef HAVE_LIBUNISTRING
 static inline TrackerParserWordType
@@ -777,48 +824,91 @@ tracker_parser_process_word (TrackerParser *parser,
 {
 	gchar *stem_word;
 	gchar *str;
+	gchar *stripped_word;
 	guint  bytes, len;
 
 	g_return_val_if_fail (parser != NULL, NULL);
 	g_return_val_if_fail (word != NULL, NULL);
 
 	str = NULL;
+	stripped_word = NULL;
 
-	bytes = length == -1 ? strlen (word) : length;
+	if (word) {
+		if (length == -1) {
+			bytes = strlen (word);
+		} else {
+			bytes = length;
+		}
 
-	g_debug ("ORIGINAL word: '%s'", word);
+		/* Log original word */
+#if TRACKER_PARSER_DEBUG_HEX
+		{
+			gchar *aux;
+			aux = tracker_strhex (word, bytes, ':');
+			g_message ("ORIGINAL word: '%s' (%s)",
+			           word, aux);
+			g_free (aux);
+		}
+#endif
 
-	str = g_utf8_normalize (word,
-	                        bytes,
-	                        G_NORMALIZE_NFC);
-	if (!str) {
-		return NULL;
-	}
+		if (do_strip) {
+			stripped_word = strip_word (word, bytes, &len);
+
+			/* Log after UNAC stripping */
+#if TRACKER_PARSER_DEBUG_HEX
+			{
+				gchar *aux;
+				aux = tracker_strhex (stripped_word, len, ':');
+				g_message (" After UNAC stripping: '%s' (%s)",
+				           stripped_word, aux);
+				g_free (aux);
+			}
+#endif
+		} else {
+			stripped_word = NULL;
+		}
 
-	len = strlen (str);
 
-	g_debug ("  After NFC normalization: '%s'", str);
+		if (!stripped_word) {
+			str = g_utf8_normalize (word,
+			                        bytes,
+			                        G_NORMALIZE_NFC);
+		} else {
+			str = g_utf8_normalize (stripped_word,
+			                        len,
+			                        G_NORMALIZE_NFC);
+			g_free (stripped_word);
+		}
 
-	if (do_strip) {
-		gchar *stripped_word;
+		/* Log after normalization */
+#if TRACKER_PARSER_DEBUG_HEX
+		{
+			gchar *aux;
+			aux = tracker_strhex (str, strlen ((gchar *)str), ':');
+			g_message ("  After NFC normalization: '%s' (%s)",
+			           str, aux);
+			g_free (aux);
+		}
+#endif
 
-		stripped_word = strip_word (str, len, &len);
-		g_debug (" After UNAC stripping: '%s'", stripped_word);
-		g_free (str);
-		str = stripped_word;
-	}
 
-	if (!parser->enable_stemmer) {
-		return str;
-	}
+		if (!str) {
+			return NULL;
+		}
 
-	stem_word = tracker_language_stem_word (parser->language, str, len);
-	g_debug ("  After Stemming: '%s'", stem_word);
+		if (!parser->enable_stemmer) {
+			return str;
+		}
+
+		len = strlen (str);
+
+		stem_word = tracker_language_stem_word (parser->language, str, len);
 
-	if (stem_word) {
-		g_free (str);
+		if (stem_word) {
+			g_free (str);
 
-		return stem_word;
+			return stem_word;
+		}
 	}
 
 	return str;
@@ -846,7 +936,16 @@ tracker_parser_process_word (TrackerParser *parser,
 		length = strlen (word);
 	}
 
-	g_debug ("Original word: '%s'", word);
+	/* Log original word */
+#if TRACKER_PARSER_DEBUG_HEX
+	{
+		gchar *aux;
+		aux = tracker_strhex (word, length, ':');
+		g_message ("ORIGINAL word: '%s' (%s)",
+		           word, aux);
+		g_free (aux);
+	}
+#endif
 
 	/* Leave space for last NIL */
 	new_word_length = WORD_BUFFER_LENGTH - 1;
@@ -874,7 +973,15 @@ tracker_parser_process_word (TrackerParser *parser,
 	/* Set output NIL */
 	normalized[new_word_length] = '\0';
 
-	g_debug (" After Casefolding and NFC normalization: '%s'", normalized);
+#if TRACKER_PARSER_DEBUG_HEX
+	{
+		gchar *aux;
+		aux = tracker_strhex (normalized, new_word_length, ':');
+		g_message (" After Casefolding and NFC normalization: '%s' (%s)",
+		           normalized, aux);
+		g_free (aux);
+	}
+#endif
 
 	/* UNAC stripping needed? */
 	if (do_strip) {
@@ -885,7 +992,16 @@ tracker_parser_process_word (TrackerParser *parser,
 		                       &stripped_word_length);
 
 		if (stripped) {
-			g_debug ("  After UNAC stripping: '%s'", stripped);
+			/* Log after UNAC stripping */
+#if TRACKER_PARSER_DEBUG_HEX
+			{
+				gchar *aux;
+				aux = tracker_strhex (stripped, stripped_word_length, ':');
+				g_message ("  After UNAC stripping: '%s' (%s)",
+				           stripped, aux);
+				g_free (aux);
+			}
+#endif
 			new_word_length = stripped_word_length;
 		}
 	}
@@ -895,8 +1011,15 @@ tracker_parser_process_word (TrackerParser *parser,
 		stemmed = tracker_language_stem_word (parser->language,
 		                                      stripped ? stripped : normalized,
 		                                      new_word_length);
-
-		g_debug ("   After Stemming: '%s'", stemmed);
+#if TRACKER_PARSER_DEBUG_HEX
+		if (stemmed) {
+			gchar *aux;
+			aux = tracker_strhex (stemmed, strlen (stemmed), ':');
+			g_message ("   After stemming: '%s' (%s)",
+			           stemmed, aux);
+			g_free (aux);
+		}
+#endif
 	}
 
 	/* If stemmed wanted and succeeded, free previous and return it */



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]