Re: [Rhythmbox-devel] Bugzilla Roundup #1 (Fuzzy Matching and TrayIcons)



On Tue, 2005-10-25 at 08:58 -0500, James Cotton wrote:
> I'm in favor of just 1 & 2 because it seems like there would be cases
> where the fuzzy matching might give you artists with similar names or
> something you don't want.  I will often just use a search filter to
> create an on the fly list of songs to hear and wouldn't want to not be
> able to limit it appropriately.  However, not being able to search by
> artist and title at the same time is often frustrating.  #2 should be
> useful too.

Attached is a variant of the second patch, that has the punctuation
stripping code from the first patch.

While true fuzzy matching sounds nice, it would probably be a _lot_ of
work to tune it so that it works well - you wouldn't want it to match too
widely, or too narrowly.


Cheers,

James "Doc" Livingston
-- 
"This label is the target of a goto from outside of the block containing
this label AND this block has an automatic variable with an initializer
AND your window wasn't wide enough to read this whole error message"
    -- MPW C error message
Index: lib/rb-util.c
===================================================================
RCS file: /cvs/gnome/rhythmbox/lib/rb-util.c,v
retrieving revision 1.16
diff -u -u -r1.16 rb-util.c
--- lib/rb-util.c	1 Sep 2005 16:53:49 -0000	1.16
+++ lib/rb-util.c	27 Oct 2005 09:49:07 -0000
@@ -23,6 +23,7 @@
 #include <gtk/gtk.h>
 #include <string.h>
 #include <libgnomevfs/gnome-vfs.h>
+#include "rb-debug.h"
 
 static GPrivate * private_is_primary_thread;
 
@@ -380,3 +381,168 @@
 	 */
 	gdk_threads_init ();
 }
+
+gchar **
+rb_string_split_words (const gchar *string)
+{
+	/* Split a UTF-8 string into words; returns a newly allocated,
+	 * NULL-terminated array of UTF-8 strings (NULL if string is NULL). */
+	GSList *words, *current;
+	gunichar *unicode, *cur_write, *cur_read;
+	gchar **ret;
+	gint i, wordcount = 1;
+	gboolean new_word = TRUE;
+
+	g_return_val_if_fail (string != NULL, NULL);
+
+	cur_write = cur_read = unicode = g_utf8_to_ucs4_fast (string, -1, NULL);
+
+	/* bail out if the conversion gave us nothing; the input is expected to be valid UTF-8 */
+	g_return_val_if_fail (unicode != NULL, NULL);
+
+	words = g_slist_prepend (NULL, unicode);
+
+	/* compact the UCS-4 buffer in place: cur_read scans, cur_write stores kept characters and word terminators */
+	while (*cur_read) {
+		switch (g_unichar_type (*cur_read)) {
+		case G_UNICODE_UNASSIGNED:
+			g_warning ("unassigned unicode character type found");
+			/* fall through */
+		case G_UNICODE_CONTROL:
+		case G_UNICODE_FORMAT:
+		case G_UNICODE_PRIVATE_USE:
+
+		case G_UNICODE_SURROGATE:
+		case G_UNICODE_LINE_SEPARATOR:
+		case G_UNICODE_PARAGRAPH_SEPARATOR:
+		case G_UNICODE_SPACE_SEPARATOR:
+			/* these are word separators: drop them and start a new word */
+			if (!new_word) {
+				/* end current word if it isn't ended yet */
+				*cur_write++ = 0;
+				new_word = TRUE;
+			}
+
+			break;
+		case G_UNICODE_COMBINING_MARK:
+		case G_UNICODE_ENCLOSING_MARK:
+		case G_UNICODE_NON_SPACING_MARK:
+		case G_UNICODE_CONNECT_PUNCTUATION:
+		case G_UNICODE_DASH_PUNCTUATION:
+		case G_UNICODE_CLOSE_PUNCTUATION:
+		case G_UNICODE_FINAL_PUNCTUATION:
+		case G_UNICODE_INITIAL_PUNCTUATION:
+		case G_UNICODE_OTHER_PUNCTUATION:
+		case G_UNICODE_OPEN_PUNCTUATION:
+			/* NOTE: the break that would remove these is disabled, so marks and */
+			/* punctuation fall through and are kept like letters (was: break;) */
+		case G_UNICODE_LOWERCASE_LETTER:
+		case G_UNICODE_MODIFIER_LETTER:
+		case G_UNICODE_OTHER_LETTER:
+		case G_UNICODE_TITLECASE_LETTER:
+		case G_UNICODE_UPPERCASE_LETTER:
+		case G_UNICODE_DECIMAL_NUMBER:
+		case G_UNICODE_LETTER_NUMBER:
+		case G_UNICODE_OTHER_NUMBER:
+		case G_UNICODE_CURRENCY_SYMBOL:
+		case G_UNICODE_MODIFIER_SYMBOL:
+		case G_UNICODE_MATH_SYMBOL:
+		case G_UNICODE_OTHER_SYMBOL:
+			/* copy the character to the write position unchanged */
+			*cur_write = *cur_read;
+			if (new_word) {
+				if (cur_write != unicode) {/* first insert has been done above */
+					words = g_slist_prepend (words, cur_write);
+					wordcount++;
+				}
+				new_word = FALSE;
+			}
+			cur_write++;
+			break;    
+		default:
+			g_warning ("unknown unicode character type found");
+			break;
+		}
+		cur_read++;
+	}
+
+	if (!new_word) { /* terminate the word still in progress at end of input */
+		*cur_write++ = 0;
+	}
+
+	ret = g_new (gchar *, wordcount + 1); 
+	current = words;
+	for (i = wordcount - 1; i >= 0; i--) { /* list was built by prepending, so fill the array back to front */
+		ret[i] = g_ucs4_to_utf8 (current->data, -1, NULL, NULL, NULL);
+		current = g_slist_next (current);
+	}
+	ret[wordcount] = NULL;
+
+	g_slist_free (words);
+	g_free (unicode);
+
+	return ret;
+}
+
+gchar*
+rb_search_fold (const char *original)
+{
+	GString *string;
+	gunichar *unicode, *cur;
+	/* Builds a search key from original: marks and punctuation dropped, letters lowercased. */
+	g_return_val_if_fail (original != NULL, NULL);
+
+	/* old behaviour is equivalent to: return g_utf8_casefold (original, -1); */
+	/* the result accumulates in a GString; its character buffer is what gets returned */
+	string = g_string_new (NULL);
+	unicode = g_utf8_to_ucs4_fast (original, -1, NULL);
+
+	for (cur = unicode; *cur != 0; cur++) {
+		switch (g_unichar_type (*cur)) {
+		case G_UNICODE_COMBINING_MARK:
+		case G_UNICODE_ENCLOSING_MARK:
+		case G_UNICODE_NON_SPACING_MARK:
+		case G_UNICODE_CONNECT_PUNCTUATION:
+		case G_UNICODE_DASH_PUNCTUATION:
+		case G_UNICODE_CLOSE_PUNCTUATION:
+		case G_UNICODE_FINAL_PUNCTUATION:
+		case G_UNICODE_INITIAL_PUNCTUATION:
+		case G_UNICODE_OTHER_PUNCTUATION:
+		case G_UNICODE_OPEN_PUNCTUATION:
+			/* drop these: nothing is appended to the output */
+			break;
+
+		case G_UNICODE_LOWERCASE_LETTER:
+		case G_UNICODE_MODIFIER_LETTER:
+		case G_UNICODE_OTHER_LETTER:
+		case G_UNICODE_TITLECASE_LETTER:
+		case G_UNICODE_UPPERCASE_LETTER:
+			/* convert to lower case */
+			*cur = g_unichar_tolower (*cur);
+			/* ... and fall through (NOTE: the backslash after this comment is a stray line continuation) */\
+		case G_UNICODE_DECIMAL_NUMBER:
+		case G_UNICODE_LETTER_NUMBER:
+		case G_UNICODE_OTHER_NUMBER:
+		/* should we keep symbols? */
+		case G_UNICODE_CURRENCY_SYMBOL:
+		case G_UNICODE_MODIFIER_SYMBOL:
+		case G_UNICODE_MATH_SYMBOL:
+		case G_UNICODE_OTHER_SYMBOL:
+			g_string_append_unichar (string, *cur);
+			break;
+
+		case G_UNICODE_UNASSIGNED:
+			g_warning ("unassigned unicode character type found");
+			/* fall through */
+
+		default:
+			/* every character type not listed above is copied through unchanged */
+			g_string_append_unichar (string, *cur);
+		}
+	}
+	
+	g_free (unicode);
+			
+	return g_string_free (string, FALSE);
+}
+
Index: lib/rb-util.h
===================================================================
RCS file: /cvs/gnome/rhythmbox/lib/rb-util.h,v
retrieving revision 1.12
diff -u -u -r1.12 rb-util.h
--- lib/rb-util.h	1 Sep 2005 16:53:49 -0000	1.12
+++ lib/rb-util.h	27 Oct 2005 09:49:07 -0000
@@ -23,8 +23,7 @@
 #define __RB_UTIL_H
 
 #include <stdarg.h>
-#include <glib/gtypes.h>
-#include <glib-object.h>
+#include <glib.h>
 #include <gtk/gtkimage.h>
 #include <gtk/gtkuimanager.h>
 
@@ -49,8 +48,8 @@
 void rb_threads_init (void);
 gboolean rb_is_main_thread (void);
 
-
-
+gchar* rb_search_fold (const char *original);
+gchar** rb_string_split_words (const gchar *string);
 
 G_END_DECLS
 
Index: rhythmdb/rb-refstring.c
===================================================================
RCS file: /cvs/gnome/rhythmbox/rhythmdb/rb-refstring.c,v
retrieving revision 1.6
diff -u -u -r1.6 rb-refstring.c
--- rhythmdb/rb-refstring.c	20 Oct 2005 11:53:23 -0000	1.6
+++ rhythmdb/rb-refstring.c	27 Oct 2005 09:49:09 -0000
@@ -23,6 +23,7 @@
 
 #include <glib.h>
 #include <string.h>
+#include "rb-util.h"
 #include "rb-cut-and-paste-code.h"
 
 GHashTable *rb_refstrings;
@@ -70,7 +71,7 @@
 	ret->refcount = 1;
 	
 	if (compute_sortdata) {
-		ret->folded = g_utf8_casefold (init, -1);
+		ret->folded = rb_search_fold (init);
 		ret->sortkey = rb_utf8_collate_key_for_filename (ret->folded, -1);
 	} else {
 		ret->folded = NULL;
Index: rhythmdb/rhythmdb-tree.c
===================================================================
RCS file: /cvs/gnome/rhythmbox/rhythmdb/rhythmdb-tree.c,v
retrieving revision 1.76
diff -u -u -r1.76 rhythmdb-tree.c
--- rhythmdb/rhythmdb-tree.c	25 Oct 2005 22:25:16 -0000	1.76
+++ rhythmdb/rhythmdb-tree.c	27 Oct 2005 09:49:09 -0000
@@ -435,6 +435,7 @@
 		case RHYTHMDB_PROP_HIDDEN:
 		case RHYTHMDB_PROP_PLAYBACK_ERROR:
 		case RHYTHMDB_PROP_FIRST_SEEN_STR:
+		case RHYTHMDB_PROP_SEARCH_MATCH:
 		case RHYTHMDB_NUM_PROPERTIES:
 			g_assert_not_reached ();
 			break;
@@ -786,6 +787,7 @@
 		case RHYTHMDB_PROP_HIDDEN:
 		case RHYTHMDB_PROP_PLAYBACK_ERROR:
 		case RHYTHMDB_PROP_FIRST_SEEN_STR:
+		case RHYTHMDB_PROP_SEARCH_MATCH:
 		case RHYTHMDB_NUM_PROPERTIES:
 			break;
 		}
@@ -1363,25 +1365,54 @@
 		case RHYTHMDB_QUERY_PROP_NOT_LIKE:
 		{
 			if (rhythmdb_get_property_type (db, data->propid) == G_TYPE_STRING) {
-				const char *entry_string = rhythmdb_entry_get_string (entry, data->propid);
-				gboolean islike = FALSE;
+				gboolean islike;
 
-				/* check in case the property is NULL, the value should never be NULL */
-				if (entry_string == NULL)
-					return FALSE;
+				if (data->propid == RHYTHMDB_PROP_SEARCH_MATCH) {
+					/* this is a special property, that should match several things */
+					const RhythmDBPropType props[] = {
+						RHYTHMDB_PROP_TITLE_FOLDED,
+						RHYTHMDB_PROP_ALBUM_FOLDED,
+						RHYTHMDB_PROP_ARTIST_FOLDED,
+						RHYTHMDB_PROP_GENRE_FOLDED
+					};
+					gchar **current, **words;
+					int i;
+
+					words = g_value_get_boxed (data->val);
+					islike = TRUE;
+
+					for (current = words; *current != NULL; current++) {
+						gboolean word_found = FALSE;
+
+						for (i = 0; i < G_N_ELEMENTS (props); i++) {
+							const char *entry_string = rhythmdb_entry_get_string (entry, props[i]);
+							if (entry_string && (strstr (entry_string, *current) != NULL)) {
+								/* the word was found, go to the next one */	
+								word_found = TRUE;
+								break;
+							}
+						}
+						if (!word_found) {
+							/* the word wasn't in any of the properties*/
+							islike = FALSE;
+							break;
+						}
+					}
+				} else {
+					const gchar *value_string = g_value_get_string (data->val);
+					const char *entry_string = rhythmdb_entry_get_string (entry, data->propid);
 
-				islike = (strstr (entry_string, g_value_get_string (data->val)) != NULL);
-				if (data->type == RHYTHMDB_QUERY_PROP_LIKE) {
-					if (!islike)
-						return FALSE;
-					else
-						continue;
-				} else if (data->type == RHYTHMDB_QUERY_PROP_NOT_LIKE) {
-					if (islike)
+					/* check in case the property is NULL, the value should never be NULL */
+					if (entry_string == NULL)
 						return FALSE;
-					else
-						continue;
+
+					islike = (strstr (entry_string, value_string) != NULL);
 				}
+
+				if ((data->type == RHYTHMDB_QUERY_PROP_LIKE) ^ islike)
+					return FALSE;
+				else
+					continue;
 				break;
 			} 
 			/* Fall through */
Index: rhythmdb/rhythmdb.c
===================================================================
RCS file: /cvs/gnome/rhythmbox/rhythmdb/rhythmdb.c,v
retrieving revision 1.121
diff -u -u -r1.121 rhythmdb.c
--- rhythmdb/rhythmdb.c	25 Oct 2005 22:25:16 -0000	1.121
+++ rhythmdb/rhythmdb.c	27 Oct 2005 09:49:12 -0000
@@ -2826,6 +2826,8 @@
 			ENUM_ENTRY (RHYTHMDB_PROP_PLAYBACK_ERROR, "Playback error string (gchararray) [playback-error]"),
 			ENUM_ENTRY (RHYTHMDB_PROP_HIDDEN, "Visibility (gboolean) [visibility]"),
 			ENUM_ENTRY (RHYTHMDB_PROP_FIRST_SEEN_STR, "Time Added to Library (gchararray) [first-seen-str]"),
+			ENUM_ENTRY (RHYTHMDB_PROP_SEARCH_MATCH, "Search matching key (gchararray) [search-match]"),
+
 			ENUM_ENTRY (RHYTHMDB_PROP_STATUS, "Status of file (gulong) [status]"),
 			ENUM_ENTRY (RHYTHMDB_PROP_DESCRIPTION, "Podcast description(gchararray) [description]"),
 			ENUM_ENTRY (RHYTHMDB_PROP_SUBTITLE, "Podcast subtitle (gchararray) [subtitle]"),
@@ -3156,12 +3158,28 @@
 			{
 				/* as we are matching against a folded property, the string needs to also be folded */
 				const char *orig = g_value_get_string (data->val);
-				char *folded = g_utf8_casefold (orig, -1);
+				char *folded = rb_search_fold (orig);
 
 				g_value_reset (data->val);
 				g_value_take_string (data->val, folded);
 				break;
 			}
+			case RHYTHMDB_PROP_SEARCH_MATCH:
+			{
+				const char *orig = g_value_get_string (data->val);
+				char *folded = rb_search_fold (orig);
+				char **words = rb_string_split_words (folded);
+
+				g_free (folded);
+				g_value_unset (data->val);
+				g_value_init (data->val, G_TYPE_STRV);
+				g_value_take_boxed (data->val, words);
+				break;
+			}
+			default:
+				/* do nothing */
+				break;
 		}
 	}
 }
+
Index: rhythmdb/rhythmdb.h
===================================================================
RCS file: /cvs/gnome/rhythmbox/rhythmdb/rhythmdb.h,v
retrieving revision 1.53
diff -u -u -r1.53 rhythmdb.h
--- rhythmdb/rhythmdb.h	25 Oct 2005 22:25:16 -0000	1.53
+++ rhythmdb/rhythmdb.h	27 Oct 2005 09:49:12 -0000
@@ -103,6 +103,8 @@
 	RHYTHMDB_PROP_HIDDEN,
 	RHYTHMDB_PROP_PLAYBACK_ERROR,
 	RHYTHMDB_PROP_FIRST_SEEN_STR,
+	RHYTHMDB_PROP_SEARCH_MATCH,
+
 //podcast propriets
 	RHYTHMDB_PROP_STATUS,
 	RHYTHMDB_PROP_DESCRIPTION,
Index: sources/rb-library-source.c
===================================================================
RCS file: /cvs/gnome/rhythmbox/sources/rb-library-source.c,v
retrieving revision 1.120
diff -u -u -r1.120 rb-library-source.c
--- sources/rb-library-source.c	27 Sep 2005 11:30:24 -0000	1.120
+++ sources/rb-library-source.c	27 Oct 2005 09:49:13 -0000
@@ -1401,19 +1401,7 @@
 	if (source->priv->search_text) {
 		GPtrArray *subquery = rhythmdb_query_parse (source->priv->db,
 							    RHYTHMDB_QUERY_PROP_LIKE,
-							    RHYTHMDB_PROP_GENRE_FOLDED,
-							    source->priv->search_text,
-							    RHYTHMDB_QUERY_DISJUNCTION,
-							    RHYTHMDB_QUERY_PROP_LIKE,
-							    RHYTHMDB_PROP_ARTIST_FOLDED,
-							    source->priv->search_text,
-							    RHYTHMDB_QUERY_DISJUNCTION,
-							    RHYTHMDB_QUERY_PROP_LIKE,
-							    RHYTHMDB_PROP_ALBUM_FOLDED,
-							    source->priv->search_text,
-							    RHYTHMDB_QUERY_DISJUNCTION,
-							    RHYTHMDB_QUERY_PROP_LIKE,
-							    RHYTHMDB_PROP_TITLE_FOLDED,
+							    RHYTHMDB_PROP_SEARCH_MATCH,
 							    source->priv->search_text,
 							    RHYTHMDB_QUERY_END);
 		rhythmdb_query_append (source->priv->db,
Index: widgets/rb-entry-view.c
===================================================================
RCS file: /cvs/gnome/rhythmbox/widgets/rb-entry-view.c,v
retrieving revision 1.104
diff -u -u -r1.104 rb-entry-view.c
--- widgets/rb-entry-view.c	26 Oct 2005 07:11:22 -0000	1.104
+++ widgets/rb-entry-view.c	27 Oct 2005 09:49:14 -0000
@@ -32,6 +32,7 @@
 #include "rb-entry-view.h"
 #include "rb-dialog.h"
 #include "rb-debug.h"
+#include "rb-util.h"
 #include "rhythmdb.h"
 #include "rhythmdb-query-model.h"
 #include "rb-cell-renderer-pixbuf.h"
@@ -215,7 +216,7 @@
 	gboolean res;
 
 	gtk_tree_model_get (model, iter, 0, &entry, -1);
-	folded = g_utf8_casefold (key, -1);
+	folded = rb_search_fold (key);
 	res = (strstr (rb_refstring_get_folded (entry->title), folded) == NULL);
 	g_free (folded);
 	return res;

Attachment: signature.asc
Description: This is a digitally signed message part



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]