[tracker] tracker-parser: Remove unused functions



commit 216b6917db746aeac1ae381379e4a65b1050a831
Author: Jürg Billeter <j bitron ch>
Date:   Tue Sep 1 19:02:12 2009 +0200

    tracker-parser: Remove unused functions

 .../libtracker-common-sections.txt                 |    4 -
 src/libtracker-common/tracker-parser.c             |  569 --------------------
 src/libtracker-common/tracker-parser.h             |   45 --
 tests/libtracker-common/Makefile.am                |   14 -
 tests/libtracker-common/tracker-parser-test.c      |  193 -------
 5 files changed, 0 insertions(+), 825 deletions(-)
---
diff --git a/docs/reference/libtracker-common/libtracker-common-sections.txt b/docs/reference/libtracker-common/libtracker-common-sections.txt
index aeb788e..8a5b079 100644
--- a/docs/reference/libtracker-common/libtracker-common-sections.txt
+++ b/docs/reference/libtracker-common/libtracker-common-sections.txt
@@ -234,14 +234,10 @@ tracker_ontology_add_property
 TrackerParser
 tracker_parser_new
 tracker_parser_free
-tracker_parser_is_stop_word
 tracker_parser_next
 tracker_parser_process_word
 tracker_parser_reset
 tracker_parser_text
-tracker_parser_text_fast
-tracker_parser_text_into_array
-tracker_parser_text_to_string
 </SECTION>
 
 <INCLUDE>libtracker-common/tracker-class.h</INCLUDE>
diff --git a/src/libtracker-common/tracker-parser.c b/src/libtracker-common/tracker-parser.c
index 2ab5758..f1d7a23 100644
--- a/src/libtracker-common/tracker-parser.c
+++ b/src/libtracker-common/tracker-parser.c
@@ -166,29 +166,6 @@ strip_word (const gchar *str,
 #endif
 }
 
-static gboolean
-text_needs_pango (const gchar *text)
-{
-	const gchar *p;
-	gunichar     c;
-	gint	     i = 0;
-
-	/* Grab first 1024 non-whitespace chars and test */
-	for (p = text; *p && i < 1024; p = g_utf8_next_char (p)) {
-		c = g_utf8_get_char (p);
-
-		if (!g_unichar_isspace (c)) {
-			i++;
-		}
-
-		if (NEED_PANGO(c)) {
-			return TRUE;
-		}
-	}
-
-	return FALSE;
-}
-
 static TrackerParserEncoding
 get_encoding (const gchar *txt)
 {
@@ -232,192 +209,6 @@ is_stop_word (TrackerLanguage *language,
 	return g_hash_table_lookup (stop_words, word) != NULL;
 }
 
-static const gchar *
-analyze_text (const gchar      *text,
-	      TrackerLanguage  *language,
-	      gint		max_word_length,
-	      gint		min_word_length,
-	      gboolean		filter_words,
-	      gboolean		filter_numbers,
-	      gboolean		delimit_hyphen,
-	      gchar	      **index_word)
-{
-	TrackerParserWordType word_type;
-	gunichar	      word[64];
-	gboolean	      do_strip;
-	gboolean	      is_valid;
-	gint		      length;
-	glong		      bytes;
-	const char	     *p;
-	const char	     *start;
-
-	*index_word = NULL;
-
-	if (text == NULL || text[0] == '\0') {
-		return NULL;
-	}
-
-	word_type = TRACKER_PARSER_WORD_IGNORE;
-	do_strip = FALSE;
-	is_valid = TRUE;
-	length = 0;
-	bytes = 0;
-	start = NULL;
-
-	for (p = text; *p; p = g_utf8_next_char (p)) {
-		TrackerParserWordType type;
-		gunichar	      c;
-
-		c = g_utf8_get_char (p);
-		type = get_word_type (c);
-
-		if (type == TRACKER_PARSER_WORD_IGNORE ||
-		    type == TRACKER_PARSER_WORD_NEWLINE ||
-		    (delimit_hyphen &&
-		     (type == TRACKER_PARSER_WORD_HYPHEN ||
-		      type == TRACKER_PARSER_WORD_UNDERSCORE))) {
-			if (!start) {
-				continue;
-			} else {
-				break;
-			}
-		}
-
-		if (!is_valid) {
-			continue;
-		}
-
-		if (!start) {
-			start = p;
-
-			/* Valid words must start with an alpha or
-			 * underscore if we are filtering.
-			 */
-			if (filter_numbers) {
-				if (type == TRACKER_PARSER_WORD_NUM) {
-					is_valid = FALSE;
-					continue;
-				} else {
-					if (type == TRACKER_PARSER_WORD_HYPHEN) {
-						is_valid = FALSE;
-						continue;
-					}
-				}
-			}
-		}
-
-		if (length >= max_word_length) {
-			continue;
-		}
-
-		length++;
-
-		switch (type) {
-		case TRACKER_PARSER_WORD_ASCII_HIGHER:
-			c += 32;
-
-		case TRACKER_PARSER_WORD_ASCII_LOWER:
-		case TRACKER_PARSER_WORD_HYPHEN:
-		case TRACKER_PARSER_WORD_UNDERSCORE:
-			if (word_type == TRACKER_PARSER_WORD_NUM ||
-			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
-				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
-			} else {
-				word_type = TRACKER_PARSER_WORD_ALPHA;
-			}
-
-			break;
-
-		case TRACKER_PARSER_WORD_NUM:
-			if (word_type == TRACKER_PARSER_WORD_ALPHA ||
-			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
-				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
-			} else {
-				word_type = TRACKER_PARSER_WORD_NUM;
-			}
-			break;
-
-		case TRACKER_PARSER_WORD_ALPHA_HIGHER:
-			c = g_unichar_tolower (c);
-
-		case TRACKER_PARSER_WORD_ALPHA_LOWER:
-			if (!do_strip) {
-				do_strip = TRUE;
-			}
-
-			if (word_type == TRACKER_PARSER_WORD_NUM ||
-			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
-				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
-			} else {
-				word_type = TRACKER_PARSER_WORD_ALPHA;
-			}
-
-			break;
-			
-		case TRACKER_PARSER_WORD_ALPHA:
-		case TRACKER_PARSER_WORD_ALPHA_NUM:
-		case TRACKER_PARSER_WORD_IGNORE:
-		case TRACKER_PARSER_WORD_NEWLINE:
-		default:
-			break;
-		}
-
-		word[length -1] = c;
-	}
-
-	if (!is_valid) {
-		return p;
-	}
-
-	if (word_type == TRACKER_PARSER_WORD_NUM) {
-		if (!filter_numbers || length >= INDEX_NUMBER_MIN_LENGTH) {
-			*index_word = g_ucs4_to_utf8 (word, length, NULL, NULL, NULL);
-		}
-	} else if (length >= min_word_length) {
-		const gchar *stem_word;
-		gchar	    *stripped_word;
-		gchar	    *str;
-		gchar	    *utf8;
-		guint32      len;
-
-		utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
-
-		if (!utf8) {
-			return p;
-		}
-
-		if (do_strip && get_encoding (utf8) == TRACKER_PARSER_ENCODING_LATIN) {
-			stripped_word = strip_word (utf8, bytes, &len);
-		} else {
-			stripped_word = NULL;
-		}
-
-		if (!stripped_word) {
-			str = g_utf8_normalize (utf8,
-						bytes,
-						G_NORMALIZE_NFC);
-		} else {
-			str = g_utf8_normalize (stripped_word,
-						len,
-						G_NORMALIZE_NFC);
-			g_free (stripped_word);
-		}
-
-		g_free (utf8);
-
-		stem_word = tracker_language_stem_word (language,
-							str,
-							strlen (str));
-		g_free (str);
-
-		if (!filter_words || !is_stop_word (language, stem_word)) {
-			*index_word = g_strdup (stem_word);
-		}
-	}
-
-	return p;
-}
-
 static gboolean
 pango_next (TrackerParser *parser,
 	    gint	  *byte_offset_start,
@@ -691,34 +482,6 @@ parser_next (TrackerParser *parser,
 
 }
 
-static gboolean
-word_table_increment (GHashTable *word_table,
-		      gchar	 *index_word,
-		      gint	  weight,
-		      gint	  total_words,
-		      gint	  max_words_to_index)
-{
-	gboolean update_count;
-
-	update_count = total_words <= max_words_to_index;
-
-	if (update_count) {
-		gpointer p;
-		gint	 count;
-
-		p = g_hash_table_lookup (word_table, index_word);
-		count = GPOINTER_TO_INT (p);
-
-		g_hash_table_replace (word_table,
-				      index_word,
-				      GINT_TO_POINTER (count + weight));
-	} else {
-		g_free (index_word);
-	}
-
-	return update_count;
-}
-
 TrackerParser *
 tracker_parser_new (TrackerLanguage *language,
 		    gint	     max_word_length,
@@ -873,21 +636,6 @@ tracker_parser_process_word (TrackerParser *parser,
 	return str;
 }
 
-gboolean
-tracker_parser_is_stop_word (TrackerParser *parser, const gchar *word)
-{
-	gboolean result;
-	char *processed_word;
-
-	if (get_encoding (word) == TRACKER_PARSER_ENCODING_CJK) return FALSE;
-
-	processed_word = tracker_parser_process_word (parser, word, -1, TRUE);
-	result = is_stop_word (parser->language, processed_word);
-
-	g_free (processed_word);
-	return result;
-}
-
 const gchar *
 tracker_parser_next (TrackerParser *parser,
 		     gint	   *position,
@@ -935,320 +683,3 @@ tracker_parser_next (TrackerParser *parser,
 	return str;
 }
 
-
-gchar *
-tracker_parser_text_to_string (const gchar     *text,
-			       TrackerLanguage *language,
-			       gint		max_word_length,
-			       gint		min_word_length,
-			       gboolean		filter_words,
-			       gboolean		filter_numbers,
-			       gboolean		delimit)
-{
-	const gchar *p;
-	gchar	    *parsed_text;
-	guint32      i = 0;
-	gint	     len;
-
-	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
-	if (text == NULL) {
-		return NULL;
-	}
-
-	if (text[0] == '\0') {
-		return g_strdup ("");
-	}
-
-	p = text;
-	len = strlen (text);
-	len = MIN (len, 500);
-
-	if (!g_utf8_validate (text, len, NULL)) {
-		return NULL;
-	}
-
-	if (text_needs_pango (text)) {
-		/* CJK text does not need stemming or other
-		 * treatment.
-		 */
-		PangoLogAttr *attrs;
-		guint	      str_len, word_start;
-		GString	     *strs;
-
-		str_len = g_utf8_strlen (text, -1);
-
-		strs = g_string_new (" ");
-
-		attrs = g_new0 (PangoLogAttr, str_len + 1);
-
-		pango_get_log_attrs (text,
-				     len,
-				     0,
-				     pango_language_from_string ("C"),
-				     attrs,
-				     str_len + 1);
-
-		word_start = 0;
-
-		for (i = 0; i < str_len + 1; i++) {
-			if (attrs[i].is_word_end) {
-				gchar *start_word, *end_word;
-
-				start_word = g_utf8_offset_to_pointer (text, word_start);
-				end_word = g_utf8_offset_to_pointer (text, i);
-
-				if (start_word != end_word) {
-					/* Normalize word */
-					gchar *s;
-					gchar *index_word;
-
-					s = g_utf8_casefold (start_word, end_word - start_word);
-					index_word  = g_utf8_normalize (s, -1, G_NORMALIZE_NFC);
-					g_free (s);
-
-					strs = g_string_append (strs, index_word);
-					strs = g_string_append_c (strs, ' ');
-					g_free (index_word);
-				}
-
-				word_start = i;
-			}
-
-			if (attrs[i].is_word_start) {
-				word_start = i;
-			}
-		}
-
-		g_free (attrs);
-
-		parsed_text = g_string_free (strs, FALSE);
-		return g_strstrip (parsed_text);
-	} else {
-		GString *str;
-		gchar	*word;
-
-		str = g_string_new (" ");
-
-		while (TRUE) {
-			i++;
-			p = analyze_text (p,
-					  language,
-					  max_word_length,
-					  min_word_length,
-					  filter_words,
-					  filter_numbers,
-					  delimit,
-					  &word);
-
-			if (word) {
-				g_string_append (str, word);
-				g_string_append_c (str, ' ');
-				g_free (word);
-			}
-
-			if (!p || !*p) {
-				parsed_text = g_string_free (str, FALSE);
-				return g_strstrip (parsed_text);
-			}
-		}
-
-		g_string_free (str, TRUE);
-	}
-
-	return NULL;
-}
-
-gchar **
-tracker_parser_text_into_array (const gchar	*text,
-				TrackerLanguage *language,
-				gint		 max_word_length,
-				gint		 min_word_length)
-{
-	gchar  *s;
-	gchar **strv;
-
-	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
-	s = tracker_parser_text_to_string (text,
-					   language,
-					   max_word_length,
-					   min_word_length,
-					   TRUE,
-					   FALSE,
-					   FALSE);
-	strv = g_strsplit (g_strstrip (s), " ", -1);
-	g_free (s);
-
-	return strv;
-}
-
-GHashTable *
-tracker_parser_text_fast (GHashTable  *word_table,
-			  const gchar *txt,
-			  gint	       weight)
-{
-	gchar **array;
-	gchar **p;
-
-	/* Use this for already processed text only */
-	if (!word_table) {
-		word_table = g_hash_table_new_full (g_str_hash,
-						    g_str_equal,
-						    g_free,
-						    NULL);
-	}
-
-	if (!txt || weight == 0) {
-		return word_table;
-	}
-
-	array = g_strsplit (txt, " ", -1);
-	if (!array) {
-		return word_table;
-	}
-
-	for (p = array; *p; p++) {
-		word_table_increment (word_table, *p, weight, 0, 0);
-	}
-
-	g_free (array);
-
-	return word_table;
-}
-
-GHashTable *
-tracker_parser_text (GHashTable      *word_table,
-		     const gchar     *text,
-		     gint	      weight,
-		     TrackerLanguage *language,
-		     gint	      max_words_to_index,
-		     gint	      max_word_length,
-		     gint	      min_word_length,
-		     gboolean	      filter_words,
-		     gboolean	      delimit_words)
-{
-	const gchar *p;
-	guint32      i;
-
-	/* Use this for unprocessed raw text */
-	gint	     total_words;
-
-	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
-	if (!word_table) {
-		word_table = g_hash_table_new_full (g_str_hash,
-						    g_str_equal,
-						    g_free,
-						    NULL);
-		total_words = 0;
-	} else {
-		total_words = g_hash_table_size (word_table);
-	}
-
-	if (text == NULL || text[0] == '\0' || weight == 0) {
-		return word_table;
-	}
-
-	p = text;
-	i = 0;
-
-	if (text_needs_pango (text)) {
-		/* CJK text does not need stemming or other treatment */
-		PangoLogAttr *attrs;
-		guint	      len, str_len, word_start;
-
-		len = strlen (text);
-		str_len = g_utf8_strlen (text, -1);
-
-		attrs = g_new0 (PangoLogAttr, str_len + 1);
-
-		pango_get_log_attrs (text,
-				     len,
-				     0,
-				     pango_language_from_string ("C"),
-				     attrs,
-				     str_len + 1);
-
-		word_start = 0;
-
-		for (i = 0; i < str_len + 1; i++) {
-			if (attrs[i].is_word_end) {
-				gchar *start_word, *end_word;
-
-				start_word = g_utf8_offset_to_pointer (text, word_start);
-				end_word = g_utf8_offset_to_pointer (text, i);
-
-				if (start_word != end_word) {
-					gchar	 *str;
-					gchar	 *index_word;
-					gboolean  was_updated;
-
-					/* Normalize word */
-					str = g_utf8_casefold (start_word, end_word - start_word);
-					if (!str) {
-						continue;
-					}
-
-					index_word = g_utf8_normalize (str, -1, G_NORMALIZE_NFC);
-					g_free (str);
-
-					if (!index_word) {
-						continue;
-					}
-
-					total_words++;
-					was_updated = word_table_increment (word_table,
-									    index_word,
-									    weight,
-									    total_words,
-									    max_words_to_index);
-
-					if (!was_updated) {
-						break;
-					}
-				}
-
-				word_start = i;
-			}
-
-			if (attrs[i].is_word_start) {
-				word_start = i;
-			}
-		}
-
-		g_free (attrs);
-	} else {
-		gchar *word;
-
-		while (TRUE) {
-			i++;
-			p = analyze_text (p,
-					  language,
-					  max_word_length,
-					  min_word_length,
-					  filter_words,
-					  filter_words,
-					  delimit_words,
-					  &word);
-
-			if (word) {
-				total_words++;
-
-				if (!word_table_increment (word_table,
-							   word,
-							   weight,
-							   total_words,
-							   max_words_to_index)) {
-					break;
-				}
-			}
-
-			if (!p || !*p) {
-				break;
-			}
-		}
-	}
-
-	return word_table;
-}
diff --git a/src/libtracker-common/tracker-parser.h b/src/libtracker-common/tracker-parser.h
index 76f8c2e..b5798dc 100644
--- a/src/libtracker-common/tracker-parser.h
+++ b/src/libtracker-common/tracker-parser.h
@@ -51,57 +51,12 @@ const gchar *  tracker_parser_next	      (TrackerParser   *parser,
 					       gboolean        *new_paragraph,
 					       gboolean        *stop_word,
 					       gint	       *word_length);
-void	       tracker_parser_set_posititon   (TrackerParser   *parser,
-					       gint		position);
-gboolean       tracker_parser_is_stop_word    (TrackerParser   *parser,
-					       const gchar     *word);
 gchar *        tracker_parser_process_word    (TrackerParser   *parser,
 					       const char      *word,
 					       gint		length,
 					       gboolean		do_strip);
 void	       tracker_parser_free	      (TrackerParser   *parser);
 
-
-/*
- * Functions to parse supplied text and break into individual words and
- * maintain a count of no of occurences of the word multiplied by a
- * "weight" factor.
- *
- * The word_table - can be NULL. It contains the accumulated parsed words
- * with weighted word counts for the text (useful for indexing stuff
- * line by line)
- *
- *   text   - the text to be parsed
- *   weight - used to multiply the count of a word's occurance to create
- *	      a weighted rank score
- *
- * Returns the word_table.
- */
-GHashTable *   tracker_parser_text	      (GHashTable      *word_table,
-					       const gchar     *txt,
-					       gint		weight,
-					       TrackerLanguage *language,
-					       gint		max_words_to_index,
-					       gint		max_word_length,
-					       gint		min_word_length,
-					       gboolean		filter_words,
-					       gboolean		delimit_words);
-GHashTable *   tracker_parser_text_fast       (GHashTable      *word_table,
-					       const char      *txt,
-					       gint		weight);
-gchar *        tracker_parser_text_to_string  (const gchar     *txt,
-					       TrackerLanguage *language,
-					       gint		max_word_length,
-					       gint		min_word_length,
-					       gboolean		filter_words,
-					       gboolean		filter_numbers,
-					       gboolean		delimit);
-gchar **       tracker_parser_text_into_array (const gchar     *text,
-					       TrackerLanguage *language,
-					       gint		max_word_length,
-					       gint		min_word_length);
-
-
 G_END_DECLS
 
 #endif /* __TRACKERD_PARSER_H__ */
diff --git a/tests/libtracker-common/Makefile.am b/tests/libtracker-common/Makefile.am
index 44aa4c8..0484c19 100644
--- a/tests/libtracker-common/Makefile.am
+++ b/tests/libtracker-common/Makefile.am
@@ -10,7 +10,6 @@ noinst_PROGRAMS = $(TEST_PROGS)
 # 
 #       These tests include:
 #
-#         tracker-parser
 #	  tracker-field
 #
 
@@ -66,19 +65,6 @@ tracker_file_utils_LDADD =						\
 	$(GTHREAD_LIBS)							\
 	$(GLIB2_LIBS)							
 
-# tracker_parser_SOURCES = 						\
-# 	tracker-parser-test.c 
-#
-# tracker_parser_LDADD =						\
-# 	$(top_builddir)/src/libtracker-common/libtracker-common.la 	\
-# 	$(top_builddir)/tests/common/libtracker-testcommon.la 		\
-# 	$(top_builddir)/src/libstemmer/libstemmer.la	 		\
-# 	$(GMODULE_LIBS)							\
-# 	$(GTHREAD_LIBS)							\
-#	$(GCOV_LIBS)							\
-# 	$(PANGO_LIBS)							\
-# 	$(GLIB2_LIBS)							
-#
 # tracker_property_SOURCES = 						\
 # 	tracker-field-test.c 
 #



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]