[tracker/parser-libunistring-review] Minor fixes in the libunistring-based parser
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-libunistring-review] Minor fixes in the libunistring-based parser
- Date: Wed, 28 Apr 2010 18:47:47 +0000 (UTC)
commit b97e2bb579007b4d7658afff78fd18231cda1b5c
Author: Aleksander Morgado <aleksander lanedo com>
Date: Wed Apr 28 19:01:31 2010 +0200
Minor fixes in the libunistring-based parser
* Don't perform UNAC stripping if ASCII or CJK
* Reformat the main word-walker loop
src/libtracker-fts/tracker-parser.c | 148 ++++++++++++++++++++++-------------
1 files changed, 95 insertions(+), 53 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index 64f66c4..71c492f 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -44,9 +44,6 @@
* of the words being parsed */
#define TRACKER_PARSER_DEBUG_HEX 0
-/* Max possible length of a UTF-8 encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
#ifndef HAVE_LIBUNISTRING
@@ -89,6 +86,18 @@ typedef enum {
TRACKER_PARSER_ENCODING_OTHER
} TrackerParserEncoding;
+#else
+
+/* ASCII-7 is in range [0x00,0x7F] */
+#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
+
+/* CJK ranges are: [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
+#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
+ ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
+ ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+/* Max size in bytes of a UTF-8 encoded word, NUL included (just a safety limit) */
+#define WORD_BUFFER_LENGTH 512
#endif /* !HAVE_LIBUNISTRING */
@@ -109,12 +118,11 @@ struct TrackerParser {
gchar *word;
gint word_length;
guint word_position;
+
#ifndef HAVE_LIBUNISTRING
TrackerParserEncoding encoding;
const gchar *cursor;
-#endif /* !HAVE_LIBUNISTRING */
-#ifndef HAVE_LIBUNISTRING
/* Pango members for CJK text parsing */
PangoLogAttr *attrs;
guint attr_length;
@@ -582,11 +590,28 @@ parser_next (TrackerParser *parser,
#else
-/* Use libunistring
- * void u8_wordbreaks (const uint8_t *s, size_t n, char *p)
- * int u8_strmbtouc (ucs4_t *puc, const uint8_t *s)
- * uint8_t * u8_casefold (const uint8_t *s, size_t n, const char *iso639_language, uninorm_t nf, uint8_t *resultbuf, size_t *lengthp)
- */
+
+/* Return TRUE if the NUL-terminated UTF-8 word is pure ASCII-7, FALSE
+ * otherwise. Pure ASCII-7 words need no UNAC stripping.
+ * The word is checked byte by byte; as soon as one byte is >127 (see
+ * IS_ASCII_BYTE), the word is not ASCII-7 */
+static gboolean
+is_ascii_word (const gchar *word)
+{
+ guchar *i;
+
+ i = (guchar *)word; /* walk the bytes as guchar for the <= 0x7F test */
+ while (*i != '\0') {
+ if (!IS_ASCII_BYTE (*i)) {
+ return FALSE;
+ }
+ i++;
+ }
+ return TRUE;
+}
+
+
+/* libunistring-based parser */
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -607,10 +632,19 @@ parser_next (TrackerParser *parser,
ucs4_t first_unichar;
gint first_unichar_len;
gsize i;
+ gsize new_length;
+ gchar word_buffer [WORD_BUFFER_LENGTH];
+ gboolean do_strip;
/* Get first character of the word as UCS4 */
first_unichar_len = u8_strmbtouc (&first_unichar,
&(parser->txt[parser->cursor]));
+ if (first_unichar_len <= 0) {
+ /* u8_strmbtouc returns 0 at a NUL character and a negative value
+ * on failure; either way, better just force stop here */
+ parser->cursor = parser->txt_size;
+ break;
+ }
/* Find next word break */
i = parser->cursor + first_unichar_len;
@@ -623,49 +657,56 @@ parser_next (TrackerParser *parser,
* start of next word or the end of the string */
word_length = i - parser->cursor;
- if (first_unichar_len > 0) {
- /* We only want the words where the first character
- * in the word is either a letter, a number or a symbol.
- * This is needed because the word break algorithm also
- * considers word breaks after for example commas or other
- * punctuation marks.
- * Note that looking at the first character in the string
- * should be compatible with all Unicode normalization
- * methods.
- */
- if (uc_is_general_category (first_unichar,
- parser->allowed_start)) {
- gchar word_buffer [WORD_BUFFER_LENGTH];
- gsize new_length;
-
- /* compute truncated word length if needed */
- new_length = (word_length < WORD_BUFFER_LENGTH ?
- word_length :
- WORD_BUFFER_LENGTH - 1);
-
- /* Word here needs always to be NIL-terminated */
- memcpy (word_buffer, &(parser->txt[parser->cursor]), new_length);
- word_buffer[new_length] = '\0';
-
- /* Process the word here. If it fails, we can still go
- * to the next one. Returns newly allocated string
- * always */
- processed_word = tracker_parser_process_word (parser,
- word_buffer,
- new_length,
- TRUE);
- if (!processed_word) {
- /* Skip this word and keep on looping */
- parser->cursor += word_length;
- }
- } else {
- /* Skip this word and keep on looping */
- parser->cursor += word_length;
- }
- } else {
- /* This should only happen if NIL was passed to u8_strmbtouc,
- * so better just force stop here */
- parser->cursor = parser->txt_size;
+ /* We only want the words where the first character
+ * in the word is either a letter, a number or a symbol.
+ * This is needed because the word break algorithm also
+ * considers word breaks after for example commas or other
+ * punctuation marks.
+ * Note that looking at the first character in the string
+ * should be compatible with all Unicode normalization
+ * methods.
+ */
+ if (!uc_is_general_category (first_unichar,
+ parser->allowed_start)) {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ continue;
+ }
+
+ /* check if word is reserved */
+ if (parser->parse_reserved_words &&
+ word_length == 2 &&
+ parser->txt[parser->cursor] == 'o' &&
+ parser->txt[parser->cursor + 1] == 'r') {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ continue;
+ }
+
+ /* compute truncated word length if needed */
+ new_length = (word_length < WORD_BUFFER_LENGTH ?
+ word_length :
+ WORD_BUFFER_LENGTH - 1);
+
+ /* The word must always be NUL-terminated here */
+ memcpy (word_buffer, &(parser->txt[parser->cursor]), new_length);
+ word_buffer[new_length] = '\0';
+
+ /* Enable UNAC stripping only if the word is not pure ASCII and does not start with a CJK character */
+ do_strip = (!is_ascii_word (word_buffer) &&
+ !IS_CJK_UCS4 (first_unichar));
+
+ /* Process the word here. If it fails, we can still go
+ * to the next one. Returns newly allocated string
+ * always */
+ processed_word = tracker_parser_process_word (parser,
+ word_buffer,
+ new_length,
+ do_strip);
+ if (!processed_word) {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ continue;
}
}
@@ -1006,6 +1047,7 @@ tracker_parser_process_word (TrackerParser *parser,
}
}
+
/* Stemming needed? */
if (parser->enable_stemmer) {
stemmed = tracker_language_stem_word (parser->language,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]