[tracker] tracker-parser: Remove unused functions
- From: Jürg Billeter <juergbi src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] tracker-parser: Remove unused functions
- Date: Tue, 1 Sep 2009 17:04:47 +0000 (UTC)
commit 216b6917db746aeac1ae381379e4a65b1050a831
Author: Jürg Billeter <j bitron ch>
Date: Tue Sep 1 19:02:12 2009 +0200
tracker-parser: Remove unused functions
.../libtracker-common-sections.txt | 4 -
src/libtracker-common/tracker-parser.c | 569 --------------------
src/libtracker-common/tracker-parser.h | 45 --
tests/libtracker-common/Makefile.am | 14 -
tests/libtracker-common/tracker-parser-test.c | 193 -------
5 files changed, 0 insertions(+), 825 deletions(-)
---
diff --git a/docs/reference/libtracker-common/libtracker-common-sections.txt b/docs/reference/libtracker-common/libtracker-common-sections.txt
index aeb788e..8a5b079 100644
--- a/docs/reference/libtracker-common/libtracker-common-sections.txt
+++ b/docs/reference/libtracker-common/libtracker-common-sections.txt
@@ -234,14 +234,10 @@ tracker_ontology_add_property
TrackerParser
tracker_parser_new
tracker_parser_free
-tracker_parser_is_stop_word
tracker_parser_next
tracker_parser_process_word
tracker_parser_reset
tracker_parser_text
-tracker_parser_text_fast
-tracker_parser_text_into_array
-tracker_parser_text_to_string
</SECTION>
<INCLUDE>libtracker-common/tracker-class.h</INCLUDE>
diff --git a/src/libtracker-common/tracker-parser.c b/src/libtracker-common/tracker-parser.c
index 2ab5758..f1d7a23 100644
--- a/src/libtracker-common/tracker-parser.c
+++ b/src/libtracker-common/tracker-parser.c
@@ -166,29 +166,6 @@ strip_word (const gchar *str,
#endif
}
-static gboolean
-text_needs_pango (const gchar *text)
-{
- const gchar *p;
- gunichar c;
- gint i = 0;
-
- /* Grab first 1024 non-whitespace chars and test */
- for (p = text; *p && i < 1024; p = g_utf8_next_char (p)) {
- c = g_utf8_get_char (p);
-
- if (!g_unichar_isspace (c)) {
- i++;
- }
-
- if (NEED_PANGO(c)) {
- return TRUE;
- }
- }
-
- return FALSE;
-}
-
static TrackerParserEncoding
get_encoding (const gchar *txt)
{
@@ -232,192 +209,6 @@ is_stop_word (TrackerLanguage *language,
return g_hash_table_lookup (stop_words, word) != NULL;
}
-static const gchar *
-analyze_text (const gchar *text,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean filter_numbers,
- gboolean delimit_hyphen,
- gchar **index_word)
-{
- TrackerParserWordType word_type;
- gunichar word[64];
- gboolean do_strip;
- gboolean is_valid;
- gint length;
- glong bytes;
- const char *p;
- const char *start;
-
- *index_word = NULL;
-
- if (text == NULL || text[0] == '\0') {
- return NULL;
- }
-
- word_type = TRACKER_PARSER_WORD_IGNORE;
- do_strip = FALSE;
- is_valid = TRUE;
- length = 0;
- bytes = 0;
- start = NULL;
-
- for (p = text; *p; p = g_utf8_next_char (p)) {
- TrackerParserWordType type;
- gunichar c;
-
- c = g_utf8_get_char (p);
- type = get_word_type (c);
-
- if (type == TRACKER_PARSER_WORD_IGNORE ||
- type == TRACKER_PARSER_WORD_NEWLINE ||
- (delimit_hyphen &&
- (type == TRACKER_PARSER_WORD_HYPHEN ||
- type == TRACKER_PARSER_WORD_UNDERSCORE))) {
- if (!start) {
- continue;
- } else {
- break;
- }
- }
-
- if (!is_valid) {
- continue;
- }
-
- if (!start) {
- start = p;
-
- /* Valid words must start with an alpha or
- * underscore if we are filtering.
- */
- if (filter_numbers) {
- if (type == TRACKER_PARSER_WORD_NUM) {
- is_valid = FALSE;
- continue;
- } else {
- if (type == TRACKER_PARSER_WORD_HYPHEN) {
- is_valid = FALSE;
- continue;
- }
- }
- }
- }
-
- if (length >= max_word_length) {
- continue;
- }
-
- length++;
-
- switch (type) {
- case TRACKER_PARSER_WORD_ASCII_HIGHER:
- c += 32;
-
- case TRACKER_PARSER_WORD_ASCII_LOWER:
- case TRACKER_PARSER_WORD_HYPHEN:
- case TRACKER_PARSER_WORD_UNDERSCORE:
- if (word_type == TRACKER_PARSER_WORD_NUM ||
- word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
- word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
- } else {
- word_type = TRACKER_PARSER_WORD_ALPHA;
- }
-
- break;
-
- case TRACKER_PARSER_WORD_NUM:
- if (word_type == TRACKER_PARSER_WORD_ALPHA ||
- word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
- word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
- } else {
- word_type = TRACKER_PARSER_WORD_NUM;
- }
- break;
-
- case TRACKER_PARSER_WORD_ALPHA_HIGHER:
- c = g_unichar_tolower (c);
-
- case TRACKER_PARSER_WORD_ALPHA_LOWER:
- if (!do_strip) {
- do_strip = TRUE;
- }
-
- if (word_type == TRACKER_PARSER_WORD_NUM ||
- word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
- word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
- } else {
- word_type = TRACKER_PARSER_WORD_ALPHA;
- }
-
- break;
-
- case TRACKER_PARSER_WORD_ALPHA:
- case TRACKER_PARSER_WORD_ALPHA_NUM:
- case TRACKER_PARSER_WORD_IGNORE:
- case TRACKER_PARSER_WORD_NEWLINE:
- default:
- break;
- }
-
- word[length -1] = c;
- }
-
- if (!is_valid) {
- return p;
- }
-
- if (word_type == TRACKER_PARSER_WORD_NUM) {
- if (!filter_numbers || length >= INDEX_NUMBER_MIN_LENGTH) {
- *index_word = g_ucs4_to_utf8 (word, length, NULL, NULL, NULL);
- }
- } else if (length >= min_word_length) {
- const gchar *stem_word;
- gchar *stripped_word;
- gchar *str;
- gchar *utf8;
- guint32 len;
-
- utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
-
- if (!utf8) {
- return p;
- }
-
- if (do_strip && get_encoding (utf8) == TRACKER_PARSER_ENCODING_LATIN) {
- stripped_word = strip_word (utf8, bytes, &len);
- } else {
- stripped_word = NULL;
- }
-
- if (!stripped_word) {
- str = g_utf8_normalize (utf8,
- bytes,
- G_NORMALIZE_NFC);
- } else {
- str = g_utf8_normalize (stripped_word,
- len,
- G_NORMALIZE_NFC);
- g_free (stripped_word);
- }
-
- g_free (utf8);
-
- stem_word = tracker_language_stem_word (language,
- str,
- strlen (str));
- g_free (str);
-
- if (!filter_words || !is_stop_word (language, stem_word)) {
- *index_word = g_strdup (stem_word);
- }
- }
-
- return p;
-}
-
static gboolean
pango_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -691,34 +482,6 @@ parser_next (TrackerParser *parser,
}
-static gboolean
-word_table_increment (GHashTable *word_table,
- gchar *index_word,
- gint weight,
- gint total_words,
- gint max_words_to_index)
-{
- gboolean update_count;
-
- update_count = total_words <= max_words_to_index;
-
- if (update_count) {
- gpointer p;
- gint count;
-
- p = g_hash_table_lookup (word_table, index_word);
- count = GPOINTER_TO_INT (p);
-
- g_hash_table_replace (word_table,
- index_word,
- GINT_TO_POINTER (count + weight));
- } else {
- g_free (index_word);
- }
-
- return update_count;
-}
-
TrackerParser *
tracker_parser_new (TrackerLanguage *language,
gint max_word_length,
@@ -873,21 +636,6 @@ tracker_parser_process_word (TrackerParser *parser,
return str;
}
-gboolean
-tracker_parser_is_stop_word (TrackerParser *parser, const gchar *word)
-{
- gboolean result;
- char *processed_word;
-
- if (get_encoding (word) == TRACKER_PARSER_ENCODING_CJK) return FALSE;
-
- processed_word = tracker_parser_process_word (parser, word, -1, TRUE);
- result = is_stop_word (parser->language, processed_word);
-
- g_free (processed_word);
- return result;
-}
-
const gchar *
tracker_parser_next (TrackerParser *parser,
gint *position,
@@ -935,320 +683,3 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
-
-gchar *
-tracker_parser_text_to_string (const gchar *text,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean filter_numbers,
- gboolean delimit)
-{
- const gchar *p;
- gchar *parsed_text;
- guint32 i = 0;
- gint len;
-
- g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
- if (text == NULL) {
- return NULL;
- }
-
- if (text[0] == '\0') {
- return g_strdup ("");
- }
-
- p = text;
- len = strlen (text);
- len = MIN (len, 500);
-
- if (!g_utf8_validate (text, len, NULL)) {
- return NULL;
- }
-
- if (text_needs_pango (text)) {
- /* CJK text does not need stemming or other
- * treatment.
- */
- PangoLogAttr *attrs;
- guint str_len, word_start;
- GString *strs;
-
- str_len = g_utf8_strlen (text, -1);
-
- strs = g_string_new (" ");
-
- attrs = g_new0 (PangoLogAttr, str_len + 1);
-
- pango_get_log_attrs (text,
- len,
- 0,
- pango_language_from_string ("C"),
- attrs,
- str_len + 1);
-
- word_start = 0;
-
- for (i = 0; i < str_len + 1; i++) {
- if (attrs[i].is_word_end) {
- gchar *start_word, *end_word;
-
- start_word = g_utf8_offset_to_pointer (text, word_start);
- end_word = g_utf8_offset_to_pointer (text, i);
-
- if (start_word != end_word) {
- /* Normalize word */
- gchar *s;
- gchar *index_word;
-
- s = g_utf8_casefold (start_word, end_word - start_word);
- index_word = g_utf8_normalize (s, -1, G_NORMALIZE_NFC);
- g_free (s);
-
- strs = g_string_append (strs, index_word);
- strs = g_string_append_c (strs, ' ');
- g_free (index_word);
- }
-
- word_start = i;
- }
-
- if (attrs[i].is_word_start) {
- word_start = i;
- }
- }
-
- g_free (attrs);
-
- parsed_text = g_string_free (strs, FALSE);
- return g_strstrip (parsed_text);
- } else {
- GString *str;
- gchar *word;
-
- str = g_string_new (" ");
-
- while (TRUE) {
- i++;
- p = analyze_text (p,
- language,
- max_word_length,
- min_word_length,
- filter_words,
- filter_numbers,
- delimit,
- &word);
-
- if (word) {
- g_string_append (str, word);
- g_string_append_c (str, ' ');
- g_free (word);
- }
-
- if (!p || !*p) {
- parsed_text = g_string_free (str, FALSE);
- return g_strstrip (parsed_text);
- }
- }
-
- g_string_free (str, TRUE);
- }
-
- return NULL;
-}
-
-gchar **
-tracker_parser_text_into_array (const gchar *text,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length)
-{
- gchar *s;
- gchar **strv;
-
- g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
- s = tracker_parser_text_to_string (text,
- language,
- max_word_length,
- min_word_length,
- TRUE,
- FALSE,
- FALSE);
- strv = g_strsplit (g_strstrip (s), " ", -1);
- g_free (s);
-
- return strv;
-}
-
-GHashTable *
-tracker_parser_text_fast (GHashTable *word_table,
- const gchar *txt,
- gint weight)
-{
- gchar **array;
- gchar **p;
-
- /* Use this for already processed text only */
- if (!word_table) {
- word_table = g_hash_table_new_full (g_str_hash,
- g_str_equal,
- g_free,
- NULL);
- }
-
- if (!txt || weight == 0) {
- return word_table;
- }
-
- array = g_strsplit (txt, " ", -1);
- if (!array) {
- return word_table;
- }
-
- for (p = array; *p; p++) {
- word_table_increment (word_table, *p, weight, 0, 0);
- }
-
- g_free (array);
-
- return word_table;
-}
-
-GHashTable *
-tracker_parser_text (GHashTable *word_table,
- const gchar *text,
- gint weight,
- TrackerLanguage *language,
- gint max_words_to_index,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean delimit_words)
-{
- const gchar *p;
- guint32 i;
-
- /* Use this for unprocessed raw text */
- gint total_words;
-
- g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
- if (!word_table) {
- word_table = g_hash_table_new_full (g_str_hash,
- g_str_equal,
- g_free,
- NULL);
- total_words = 0;
- } else {
- total_words = g_hash_table_size (word_table);
- }
-
- if (text == NULL || text[0] == '\0' || weight == 0) {
- return word_table;
- }
-
- p = text;
- i = 0;
-
- if (text_needs_pango (text)) {
- /* CJK text does not need stemming or other treatment */
- PangoLogAttr *attrs;
- guint len, str_len, word_start;
-
- len = strlen (text);
- str_len = g_utf8_strlen (text, -1);
-
- attrs = g_new0 (PangoLogAttr, str_len + 1);
-
- pango_get_log_attrs (text,
- len,
- 0,
- pango_language_from_string ("C"),
- attrs,
- str_len + 1);
-
- word_start = 0;
-
- for (i = 0; i < str_len + 1; i++) {
- if (attrs[i].is_word_end) {
- gchar *start_word, *end_word;
-
- start_word = g_utf8_offset_to_pointer (text, word_start);
- end_word = g_utf8_offset_to_pointer (text, i);
-
- if (start_word != end_word) {
- gchar *str;
- gchar *index_word;
- gboolean was_updated;
-
- /* Normalize word */
- str = g_utf8_casefold (start_word, end_word - start_word);
- if (!str) {
- continue;
- }
-
- index_word = g_utf8_normalize (str, -1, G_NORMALIZE_NFC);
- g_free (str);
-
- if (!index_word) {
- continue;
- }
-
- total_words++;
- was_updated = word_table_increment (word_table,
- index_word,
- weight,
- total_words,
- max_words_to_index);
-
- if (!was_updated) {
- break;
- }
- }
-
- word_start = i;
- }
-
- if (attrs[i].is_word_start) {
- word_start = i;
- }
- }
-
- g_free (attrs);
- } else {
- gchar *word;
-
- while (TRUE) {
- i++;
- p = analyze_text (p,
- language,
- max_word_length,
- min_word_length,
- filter_words,
- filter_words,
- delimit_words,
- &word);
-
- if (word) {
- total_words++;
-
- if (!word_table_increment (word_table,
- word,
- weight,
- total_words,
- max_words_to_index)) {
- break;
- }
- }
-
- if (!p || !*p) {
- break;
- }
- }
- }
-
- return word_table;
-}
diff --git a/src/libtracker-common/tracker-parser.h b/src/libtracker-common/tracker-parser.h
index 76f8c2e..b5798dc 100644
--- a/src/libtracker-common/tracker-parser.h
+++ b/src/libtracker-common/tracker-parser.h
@@ -51,57 +51,12 @@ const gchar * tracker_parser_next (TrackerParser *parser,
gboolean *new_paragraph,
gboolean *stop_word,
gint *word_length);
-void tracker_parser_set_posititon (TrackerParser *parser,
- gint position);
-gboolean tracker_parser_is_stop_word (TrackerParser *parser,
- const gchar *word);
gchar * tracker_parser_process_word (TrackerParser *parser,
const char *word,
gint length,
gboolean do_strip);
void tracker_parser_free (TrackerParser *parser);
-
-/*
- * Functions to parse supplied text and break into individual words and
- * maintain a count of no of occurences of the word multiplied by a
- * "weight" factor.
- *
- * The word_table - can be NULL. It contains the accumulated parsed words
- * with weighted word counts for the text (useful for indexing stuff
- * line by line)
- *
- * text - the text to be parsed
- * weight - used to multiply the count of a word's occurance to create
- * a weighted rank score
- *
- * Returns the word_table.
- */
-GHashTable * tracker_parser_text (GHashTable *word_table,
- const gchar *txt,
- gint weight,
- TrackerLanguage *language,
- gint max_words_to_index,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean delimit_words);
-GHashTable * tracker_parser_text_fast (GHashTable *word_table,
- const char *txt,
- gint weight);
-gchar * tracker_parser_text_to_string (const gchar *txt,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length,
- gboolean filter_words,
- gboolean filter_numbers,
- gboolean delimit);
-gchar ** tracker_parser_text_into_array (const gchar *text,
- TrackerLanguage *language,
- gint max_word_length,
- gint min_word_length);
-
-
G_END_DECLS
#endif /* __TRACKERD_PARSER_H__ */
diff --git a/tests/libtracker-common/Makefile.am b/tests/libtracker-common/Makefile.am
index 44aa4c8..0484c19 100644
--- a/tests/libtracker-common/Makefile.am
+++ b/tests/libtracker-common/Makefile.am
@@ -10,7 +10,6 @@ noinst_PROGRAMS = $(TEST_PROGS)
#
# These tests include:
#
-# tracker-parser
# tracker-field
#
@@ -66,19 +65,6 @@ tracker_file_utils_LDADD = \
$(GTHREAD_LIBS) \
$(GLIB2_LIBS)
-# tracker_parser_SOURCES = \
-# tracker-parser-test.c
-#
-# tracker_parser_LDADD = \
-# $(top_builddir)/src/libtracker-common/libtracker-common.la \
-# $(top_builddir)/tests/common/libtracker-testcommon.la \
-# $(top_builddir)/src/libstemmer/libstemmer.la \
-# $(GMODULE_LIBS) \
-# $(GTHREAD_LIBS) \
-# $(GCOV_LIBS) \
-# $(PANGO_LIBS) \
-# $(GLIB2_LIBS)
-#
# tracker_property_SOURCES = \
# tracker-field-test.c
#
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]