[tracker/parser-libunistring-review] Enabled hexadecimal content debug logs in parser
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-libunistring-review] Enabled hexadecimal content debug logs in parser
- Date: Wed, 28 Apr 2010 18:47:42 +0000 (UTC)
commit 1aed0ae01c0e5d18aea336451278c39a98767c36
Author: Aleksander Morgado <aleksander lanedo com>
Date: Wed Apr 28 12:36:55 2010 +0200
Enabled hexadecimal content debug logs in parser
src/libtracker-fts/tracker-parser.c | 183 +++++++++++++++++++++++++++++------
1 files changed, 153 insertions(+), 30 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index ef4344e..64f66c4 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -20,6 +20,7 @@
#include "config.h"
+#include <stdio.h>
#include <string.h>
#ifdef HAVE_UNAC
@@ -39,6 +40,10 @@
#include "tracker-parser.h"
+/* Define to 1 if you want to enable debugging logs showing HEX contents
+ * of the words being parsed */
+#define TRACKER_PARSER_DEBUG_HEX 0
+
/* Max possible length of a UTF-8 encoded string (just a safety limit) */
#define WORD_BUFFER_LENGTH 512
@@ -125,6 +130,48 @@ struct TrackerParser {
};
+#if TRACKER_PARSER_DEBUG_HEX
+/**
+ * tracker_strhex:
+ * @data: bytes to dump; does not need to be NUL-terminated
+ * @size: number of bytes of @data to dump
+ * @delimiter: character written between consecutive hexadecimal pairs
+ *
+ * Builds a printable, upper-case hexadecimal dump of @data. For example,
+ * the three bytes "abc" with ':' as @delimiter give "61:62:63".
+ *
+ * Based on GNU PDF's pdf_text_test_get_hex().
+ *
+ * Returns: a newly-allocated, NUL-terminated string which must be freed
+ * with g_free(). Never %NULL: an empty string is returned when @data is
+ * %NULL or @size is 0, so the result is always safe to pass to "%s".
+ */
+static gchar *
+tracker_strhex (const gchar *data,
+                const gsize  size,
+                gchar        delimiter)
+{
+        static const gchar hex_digits[] = "0123456789ABCDEF";
+        gchar *new_str;
+        gsize i;
+        gsize j;
+
+        /* g_malloc0 (0) returns NULL, which callers would then hand to a
+         * "%s" conversion; give them a real empty string instead. */
+        if (data == NULL || size == 0) {
+                return g_malloc0 (1);
+        }
+
+        /* If the input has N bytes, we need:
+         *  - 2N bytes for the hexadecimal representation of each byte
+         *  - N-1 bytes for the separators
+         *  - 1 byte for the trailing NUL
+         * So a total of (2N + N-1 + 1) = 3N bytes, all initialized to NUL. */
+        new_str = g_malloc0 (3 * size);
+
+        /* Indices are gsize so they can never be out-run by @size */
+        for (i = 0, j = 0; i < size; i++, j += 3) {
+                /* Go through guint8 so bytes >= 0x80 are not sign-extended */
+                guint8 byte = (guint8) data[i];
+
+                new_str[j] = hex_digits[byte >> 4];
+                new_str[j + 1] = hex_digits[byte & 0x0F];
+
+                /* Separator after every byte except the last one */
+                if (i + 1 < size) {
+                        new_str[j + 2] = delimiter;
+                }
+        }
+
+        return new_str;
+}
+
+#endif /* TRACKER_PARSER_DEBUG_HEX */
+
#ifndef HAVE_LIBUNISTRING
static inline TrackerParserWordType
@@ -777,48 +824,91 @@ tracker_parser_process_word (TrackerParser *parser,
{
gchar *stem_word;
gchar *str;
+ gchar *stripped_word;
guint bytes, len;
g_return_val_if_fail (parser != NULL, NULL);
g_return_val_if_fail (word != NULL, NULL);
str = NULL;
+ stripped_word = NULL;
- bytes = length == -1 ? strlen (word) : length;
+ if (word) {
+ if (length == -1) {
+ bytes = strlen (word);
+ } else {
+ bytes = length;
+ }
- g_debug ("ORIGINAL word: '%s'", word);
+ /* Log original word */
+#if TRACKER_PARSER_DEBUG_HEX
+ {
+ gchar *aux;
+ aux = tracker_strhex (word, bytes, ':');
+ g_message ("ORIGINAL word: '%s' (%s)",
+ word, aux);
+ g_free (aux);
+ }
+#endif
- str = g_utf8_normalize (word,
- bytes,
- G_NORMALIZE_NFC);
- if (!str) {
- return NULL;
- }
+ if (do_strip) {
+ stripped_word = strip_word (word, bytes, &len);
+
+ /* Log after UNAC stripping */
+#if TRACKER_PARSER_DEBUG_HEX
+ {
+ gchar *aux;
+ aux = tracker_strhex (stripped_word, len, ':');
+ g_message (" After UNAC stripping: '%s' (%s)",
+ stripped_word, aux);
+ g_free (aux);
+ }
+#endif
+ } else {
+ stripped_word = NULL;
+ }
- len = strlen (str);
- g_debug (" After NFC normalization: '%s'", str);
+ if (!stripped_word) {
+ str = g_utf8_normalize (word,
+ bytes,
+ G_NORMALIZE_NFC);
+ } else {
+ str = g_utf8_normalize (stripped_word,
+ len,
+ G_NORMALIZE_NFC);
+ g_free (stripped_word);
+ }
- if (do_strip) {
- gchar *stripped_word;
+ /* Log after normalization */
+#if TRACKER_PARSER_DEBUG_HEX
+ {
+ gchar *aux;
+ aux = tracker_strhex (str, strlen ((gchar *)str), ':');
+ g_message (" After NFC normalization: '%s' (%s)",
+ str, aux);
+ g_free (aux);
+ }
+#endif
- stripped_word = strip_word (str, len, &len);
- g_debug (" After UNAC stripping: '%s'", stripped_word);
- g_free (str);
- str = stripped_word;
- }
- if (!parser->enable_stemmer) {
- return str;
- }
+ if (!str) {
+ return NULL;
+ }
- stem_word = tracker_language_stem_word (parser->language, str, len);
- g_debug (" After Stemming: '%s'", stem_word);
+ if (!parser->enable_stemmer) {
+ return str;
+ }
+
+ len = strlen (str);
+
+ stem_word = tracker_language_stem_word (parser->language, str, len);
- if (stem_word) {
- g_free (str);
+ if (stem_word) {
+ g_free (str);
- return stem_word;
+ return stem_word;
+ }
}
return str;
@@ -846,7 +936,16 @@ tracker_parser_process_word (TrackerParser *parser,
length = strlen (word);
}
- g_debug ("Original word: '%s'", word);
+ /* Log original word */
+#if TRACKER_PARSER_DEBUG_HEX
+ {
+ gchar *aux;
+ aux = tracker_strhex (word, length, ':');
+ g_message ("ORIGINAL word: '%s' (%s)",
+ word, aux);
+ g_free (aux);
+ }
+#endif
/* Leave space for last NIL */
new_word_length = WORD_BUFFER_LENGTH - 1;
@@ -874,7 +973,15 @@ tracker_parser_process_word (TrackerParser *parser,
/* Set output NIL */
normalized[new_word_length] = '\0';
- g_debug (" After Casefolding and NFC normalization: '%s'", normalized);
+#if TRACKER_PARSER_DEBUG_HEX
+ {
+ gchar *aux;
+ aux = tracker_strhex (normalized, new_word_length, ':');
+ g_message (" After Casefolding and NFC normalization: '%s' (%s)",
+ normalized, aux);
+ g_free (aux);
+ }
+#endif
/* UNAC stripping needed? */
if (do_strip) {
@@ -885,7 +992,16 @@ tracker_parser_process_word (TrackerParser *parser,
&stripped_word_length);
if (stripped) {
- g_debug (" After UNAC stripping: '%s'", stripped);
+ /* Log after UNAC stripping */
+#if TRACKER_PARSER_DEBUG_HEX
+ {
+ gchar *aux;
+ aux = tracker_strhex (stripped, stripped_word_length, ':');
+ g_message (" After UNAC stripping: '%s' (%s)",
+ stripped, aux);
+ g_free (aux);
+ }
+#endif
new_word_length = stripped_word_length;
}
}
@@ -895,8 +1011,15 @@ tracker_parser_process_word (TrackerParser *parser,
stemmed = tracker_language_stem_word (parser->language,
stripped ? stripped : normalized,
new_word_length);
-
- g_debug (" After Stemming: '%s'", stemmed);
+#if TRACKER_PARSER_DEBUG_HEX
+ if (stemmed) {
+ gchar *aux;
+ aux = tracker_strhex (stemmed, strlen (stemmed), ':');
+ g_message (" After stemming: '%s' (%s)",
+ stemmed, aux);
+ g_free (aux);
+ }
+#endif
}
/* If stemmed wanted and succeeded, free previous and return it */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]