[rhythmbox] lyrics: use fuzzy matches to check lyrics search results (bug #358313)

From: Jonathan Matthew <jmatthew src gnome org>
To: svn-commits-list gnome org
Subject: [rhythmbox] lyrics: use fuzzy matches to check lyrics search results (bug #358313)
Date: Sun, 14 Jun 2009 03:47:11 -0400 (EDT)
commit 41dd5fadde967ac65e3a8e59b762cac1c3e92ac7
Author: Jonathan Matthew <jonathan d14n org>
Date:   Sun Jun 14 17:34:56 2009 +1000

    lyrics: use fuzzy matches to check lyrics search results (bug #358313)
    
    This only applies to the leoslyrics.com and astraweb.com lyrics
    searches, as the others don't seem to return partial match results, or
    are too weird for me to figure out how to add this.

 plugins/lyrics/lyrics/AstrawebParser.py   |   34 ++++++++++++++--------
 plugins/lyrics/lyrics/LeoslyricsParser.py |   45 +++++++++++++++++++++++------
 2 files changed, 58 insertions(+), 21 deletions(-)
---
diff --git a/plugins/lyrics/lyrics/AstrawebParser.py b/plugins/lyrics/lyrics/AstrawebParser.py
index d5c80c2..d14f56e 100644
--- a/plugins/lyrics/lyrics/AstrawebParser.py
+++ b/plugins/lyrics/lyrics/AstrawebParser.py
@@ -29,6 +29,12 @@ import urllib
 import re
 import rb
 
+from rb.stringmatch import string_match
+
+# these numbers pulled directly from the air
+artist_match = 0.8
+title_match = 0.5
+
 class AstrawebParser (object):
 	def __init__(self, artist, title):
 		self.artist = artist
@@ -54,20 +60,24 @@ class AstrawebParser (object):
 			body = re.split('(<tr><td bgcolor="#BBBBBB".*)(More Songs &gt)', results)[1]
 			entries = re.split('<tr><td bgcolor="#BBBBBB"', body)
 			entries.pop(0)
-			print "found %d entries; looking for [%s,%s]" % (len(entries), self.title.lower().strip(), self.artist.lower().strip())
+			print "found %d entries; looking for [%s,%s]" % (len(entries), self.title, self.artist)
 			for entry in entries:
 				url = re.split('(\/display[^"]*)', entry)[1]
-				artist = re.split('(Artist:.*html">)([^<]*)', entry)[2]
-				title = re.split('(\/display[^>]*)([^<]*)', entry)[2][1:]
-
-				print "checking [%s,%s]" % (title.lower().strip(), artist.lower().strip())
-				if title.lower().find(self.title.lower().strip()) != -1:
-					if artist.lower().find(self.artist.lower().strip()) != -1:
-						loader = rb.Loader()
-						loader.get_url ('http://display.lyrics.astraweb.com' + url, self.parse_lyrics, callback, *data)
-						return
-				
-				continue
+				artist = re.split('(Artist:.*html">)([^<]*)', entry)[2].strip()
+				title = re.split('(\/display[^>]*)([^<]*)', entry)[2][1:].strip()
+
+				if self.artist != "":
+					artist_str = string_match(self.artist, artist)
+				else:
+					artist_str = artist_match + 0.1
+
+				title_str = string_match(self.title, title)
+
+				print "checking [%s,%s]: match strengths [%f,%f]" % (title.strip(), artist.strip(), title_str, artist_str)
+				if title_str > title_match and artist_str > artist_match:
+					loader = rb.Loader()
+					loader.get_url ('http://display.lyrics.astraweb.com' + url, self.parse_lyrics, callback, *data)
+					return
 
 		callback (None, *data)
 		return
diff --git a/plugins/lyrics/lyrics/LeoslyricsParser.py b/plugins/lyrics/lyrics/LeoslyricsParser.py
index 002e83e..8a9d74e 100644
--- a/plugins/lyrics/lyrics/LeoslyricsParser.py
+++ b/plugins/lyrics/lyrics/LeoslyricsParser.py
@@ -31,6 +31,12 @@ import urllib
 import re
 import rb
 
+from rb.stringmatch import string_match
+
+# these numbers pulled directly from the air
+artist_match = 0.8
+title_match = 0.5
+
 # Python 2.4 compatibility
 try:
 	from xml.etree import cElementTree
@@ -64,17 +70,37 @@ class LeoslyricsParser(object):
 			callback (None, *data)
 			return
 
-		#FIXME: check non-exact matches
-		match = element.find("searchResults").find("result")
-		if match.attrib["exactMatch"] is None:
-			print "no exact match:" + lyrics
+		match = None
+		matches = element.find("searchResults").findall("result")
+		print "got %d result(s)" % (len(matches))
+		for m in matches:
+			matchtitle = m.findtext("title")
+			matchartist = m.findtext("artist/name")
+
+			# if we don't know the artist, then anyone will do
+			if self.artist != "":
+				artist_str = string_match(self.artist, matchartist)
+			else:
+				artist_str = artist_match + 0.1
+
+			title_str = string_match(self.title, matchtitle)
+			if artist_str > artist_match and title_str > title_match:
+				print "found acceptable match, artist: %s (%f), title: %s (%f)" % (matchartist, artist_str, matchtitle, title_str)
+				match = m
+				break
+			else:
+				print "skipping match, artist: %s (%f), title: %s (%f)" % (matchartist, artist_str, matchtitle, title_str)
+
+		if match is not None:
+			hid = m.attrib['hid'].encode('utf-8')
+			lurl = "http://api.leoslyrics.com/api_lyrics.php?auth=Rhythmbox&hid=%s" % (urllib.quote(hid))
+			loader = rb.Loader()
+			loader.get_url (lurl, self.parse_lyrics, callback, *data)
+		else:
+			print "no acceptable match found"
 			callback (None, *data)
-			return
 
-		lurl = "http://api.leoslyrics.com/api_lyrics.php?auth=Rhythmbox&hid=%s" % (urllib.quote(match.attrib["hid"].encode('utf-8')))
-		loader = rb.Loader()
-		loader.get_url (lurl, self.parse_lyrics, callback, *data)
-			
+
 	def parse_lyrics(self, result, callback, *data):
 		if result is None:
 			callback (None, *data)
@@ -86,3 +112,4 @@ class LeoslyricsParser(object):
 		lyrics += "\n\nLyrics provided by leoslyrics.com"
 
 		callback (lyrics, *data)
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]