[rhythmbox] artdisplay: add cover art search using discogs.com

From: Jonathan Matthew <jmatthew src gnome org>
To: svn-commits-list gnome org
Cc:
Subject: [rhythmbox] artdisplay: add cover art search using discogs.com
Date: Sat, 29 Aug 2009 09:03:33 +0000 (UTC)
commit a691fc4475bfadb65b41f523bfdcfeceab38bc7e
Author: Jonathan Matthew <jonathan d14n org>
Date:   Sat Aug 29 18:29:31 2009 +1000

    artdisplay: add cover art search using discogs.com

 plugins/artdisplay/artdisplay/CoverArtDatabase.py  |    3 +-
 .../artdisplay/artdisplay/DiscogsCoverArtSearch.py |  279 ++++++++++++++++++++
 plugins/artdisplay/artdisplay/Makefile.am          |    1 +
 3 files changed, 282 insertions(+), 1 deletions(-)
---
diff --git a/plugins/artdisplay/artdisplay/CoverArtDatabase.py b/plugins/artdisplay/artdisplay/CoverArtDatabase.py
index b71676d..1c25ca0 100644
--- a/plugins/artdisplay/artdisplay/CoverArtDatabase.py
+++ b/plugins/artdisplay/artdisplay/CoverArtDatabase.py
@@ -32,6 +32,7 @@ import itertools
 import gobject
 
 from PodcastCoverArtSearch import PodcastCoverArtSearch
+from DiscogsCoverArtSearch import DiscogsCoverArtSearch
 from EmbeddedCoverArtSearch import EmbeddedCoverArtSearch
 
 from urllib import unquote
@@ -46,7 +47,7 @@ except:
 	from LocalCoverArtSearch import LocalCoverArtSearch
 
 ART_SEARCHES_LOCAL = [LocalCoverArtSearch, EmbeddedCoverArtSearch]
-ART_SEARCHES_REMOTE = [PodcastCoverArtSearch]
+ART_SEARCHES_REMOTE = [PodcastCoverArtSearch, DiscogsCoverArtSearch]
 OLD_ART_FOLDER = '~/.gnome2/rhythmbox/covers'
 
 ART_FOLDER = os.path.join(rb.user_cache_dir(), 'covers')
diff --git a/plugins/artdisplay/artdisplay/DiscogsCoverArtSearch.py b/plugins/artdisplay/artdisplay/DiscogsCoverArtSearch.py
new file mode 100644
index 0000000..a363285
--- /dev/null
+++ b/plugins/artdisplay/artdisplay/DiscogsCoverArtSearch.py
@@ -0,0 +1,279 @@
+# -*- Mode: python; coding: utf-8; tab-width: 8; indent-tabs-mode: t; -*-
+#
+# Copyright (C) 2009 Jonathan Matthew  <jonathan d14n org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# The Rhythmbox authors hereby grant permission for non-GPL compatible
+# GStreamer plugins to be used and distributed together with GStreamer
+# and Rhythmbox. This permission is above and beyond the permissions granted
+# by the GPL license by which Rhythmbox is covered. If you modify this code
+# you may extend this exception to your version of the code, but you are not
+# obligated to do so. If you do not wish to do so, delete this exception
+# statement from your version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA.
+
+import urllib
+import xml.dom.minidom as dom
+import re
+import StringIO
+import gzip
+import time
+import httplib
+import threading
+
+import rb
+import rhythmdb
+
+from rb.stringmatch import string_match
+
+# match quality parameters
+DEFAULT_MATCH = 0.35		# used when the item doesn't have the match property
+MINIMUM_MATCH = 0.5		# ignore results below this quality
+REJECT_MATCH = 0.3		# reject results if either match strength is below this
+
+# list of format types to avoid if possible
+# these tend to have poor or non-square cover art
+BAD_FORMAT_LIST = ('Cassette', 'VHS', 'DVD', 'CDr', 'Promo', 'White Label')
+BAD_FORMAT_PENALTY = -0.05
+# slight penalty for vinyl in order to get CD cover images where possible.
+# there tends to be slightly better coverage for CDs.
+VINYL_PENALTY = -0.02
+
+# this API key belongs to jonathan d14n org ('qwe' on discogs)
+# and was generated specifically for this use
+API_KEY = '45be40f6dd'
+
+# regular expressions for disc/CD number markers, which are removed from the
+# album title before searching.  they are compiled with re.IGNORECASE where
+# they're used.  raw strings keep the backslashes intact for the regex engine
+# instead of relying on python passing unknown escapes through.
+DISC_NUMBER_REGEXS = (
+	r"\(disc *[0-9]+\)",
+	r"\(cd *[0-9]+\)",
+	r"\[disc *[0-9]+\]",
+	r"\[cd *[0-9]+\]",
+	r" - disc *[0-9]+$",
+	r" - cd *[0-9]+$",
+	r" disc *[0-9]+$",
+	r" cd *[0-9]+$"
+)
+
+# time.time() value recorded when discogs was last poked.
+# starting at 0 means the first search always pokes.
+last_poke = 0
+
+class PokeThread (threading.Thread):
+	"""
+	Thread that sends a single request for release 1 to www.discogs.com
+	with an 'Accept-Encoding: gzip' header.  The response is not read;
+	only the act of making the request matters.
+	"""
+	def __init__(self):
+		threading.Thread.__init__(self)
+
+	def run(self):
+		c = httplib.HTTPConnection('www.discogs.com')
+		try:
+			c.connect()
+			c.request('GET', '/release/1?f=xml&api_key=' + API_KEY, headers = { 'Accept-Encoding': 'gzip' })
+			c.getresponse()
+		finally:
+			# always release the socket, even if the request failed part way
+			c.close()
+
+class DiscogsCoverArtSearch (object):
+	"""
+	Cover art search that asks discogs.com for releases matching an entry's
+	artist and album, picks the best match, and reports the URLs of that
+	release's primary images.
+
+	Results are delivered by calling callback (self, entry, results, *args):
+	results is [] when nothing usable was found, [image_urls] once a release
+	has been fetched, and None when the entry has neither artist nor album.
+	"""
+	def __init__(self):
+		pass
+
+	def __poke(self):
+		"""
+		Start a PokeThread, unless one was started less than an hour ago.
+		"""
+		global last_poke
+		# an oddity here is that discogs claims to require the 'accept-encoding: gzip'
+		# header in all requests, but after the first, it seems to stop checking.
+		# this works out pretty well for us, because the gvfs http backend doesn't send that
+		# header, but we can understand the responses.
+
+		# assuming it's going to forget after a while, we'll just poke it every hour or so.
+		# this probably actually depends on IP address more than anything else, but we
+		# don't really have the ability to determine when that changes.
+		if time.time() < (last_poke + 3600.0):
+			return
+
+		last_poke = time.time()
+		poker = PokeThread()
+		poker.start()
+
+	def __decompress(self, data):
+		"""
+		Return the uncompressed contents of data, a string holding a gzip stream.
+		Errors from the gzip module are left for the caller to handle.
+		"""
+		sz = gzip.GzipFile(mode = 'r', fileobj = StringIO.StringIO(data))
+		try:
+			return sz.read()
+		finally:
+			sz.close()
+
+	def __child_text(self, node, tagname):
+		"""
+		Return the data of the first child of the first tagname element
+		below node, or None if there's no such element or it is empty.
+		"""
+		tags = node.getElementsByTagName(tagname)
+		if len(tags) == 0 or tags[0].firstChild is None:
+			return None
+		return getattr(tags[0].firstChild, 'data', None)
+
+	def __search_cb (self, data, (artist, album)):
+		"""
+		Handle the response to the search request made by search().
+
+		data is the gzip-compressed XML response, or None if the request
+		returned nothing; (artist, album) are the terms that were searched for.
+		Either requests the details of the best matching release, or reports
+		an empty result through the callback.
+		"""
+		if data is None:
+			print "search returned nothing"
+			self.callback (self, self.entry, [], *self.callback_args)
+			return
+
+		try:
+			parsed = dom.parseString(self.__decompress(data))
+		except Exception, e:
+			print "error processing response data: %s" % e
+			self.callback (self, self.entry, [], *self.callback_args)
+			return
+
+		# probably check for exact matches?
+
+		# track best combined and album matches separately
+		# (best album match works pretty well for multi-artist albums)
+		best_match = 0.0
+		best_id = None
+		best_album_id = None
+		best_album_match = 0.0
+
+		# look for releases that sort of match
+		for r in parsed.getElementsByTagName('result'):
+
+			# check it's a release.  getAttribute returns "" when the
+			# attribute is missing, so odd results are just skipped.
+			if r.getAttribute('type') != u'release':
+				continue
+
+			# split into artist and album, match against the search terms.
+			# results without a uri or an "artist - album" title are skipped;
+			# raising here would mean the callback is never invoked.
+			title = self.__child_text(r, 'title')
+			this_url = self.__child_text(r, 'uri')
+			if title is None or this_url is None or title.find(" - ") == -1:
+				print "result skipped: no usable title or uri"
+				continue
+			(rel_artist, rel_album) = title.split(" - ", 1)
+
+			# calculate the release format penalty
+			# we rely on the format descriptor appearing somewhere in the freeform
+			# 'summary' tag.  we don't care where.
+			summary = self.__child_text(r, 'summary') or ""
+			match_penalty = 0.0
+			for badformat in BAD_FORMAT_LIST:
+				if summary.find(badformat) != -1:
+					match_penalty = BAD_FORMAT_PENALTY
+					break
+
+			# vinyl penalty only applies if the other one doesn't
+			if match_penalty == 0.0 and summary.find('Vinyl') != -1:
+				match_penalty = VINYL_PENALTY
+
+			# search result URLs include artist/title slugs, so they don't work with API requests
+			# the release ID is the last path fragment.
+			this_release_id = this_url.split('/')[-1]
+
+			artist_match = string_match(artist, rel_artist)
+			album_match = string_match(album, rel_album)
+			# this probably isn't a good way to combine matches
+			this_match = ((artist_match + album_match) / 2) + match_penalty
+
+			# is this the new best match?
+			if album_match < REJECT_MATCH or artist_match < REJECT_MATCH:
+				print "result \"%s\" rejected (%f, %f)" % (title, album_match, artist_match)
+			elif this_match > best_match:
+				best_id = this_release_id
+				best_match = this_match
+				print "result \"%s\" is the new best match (%f)" % (title, this_match)
+			else:
+				print "result \"%s\" discarded, %f < %f" % (title, this_match, best_match)
+
+			# is this the new best album match?
+			# (results rejected above can still win here)
+			album_match = album_match + match_penalty
+			if album_match > best_album_match:
+				print "result \"%s\" is the new best album match (%f)" % (title, album_match)
+				best_album_match = album_match
+				best_album_id = this_release_id
+
+		# figure out if we got a result good enough to use
+		fetch_id = None
+		if best_match > MINIMUM_MATCH:
+			print "best result has match strength %f, fetching release %s" % (best_match, best_id)
+			fetch_id = best_id
+		elif best_album_match > MINIMUM_MATCH:
+			print "best album result has match strength %f, fetching release %s" % (best_album_match, best_album_id)
+			fetch_id = best_album_id
+		else:
+			print "no suitable results found"
+
+		# if we did, get the release info, which contains the image URLs
+		if fetch_id is not None:
+			xml_url = "http://www.discogs.com/release/%s?f=xml&api_key=%s" % (fetch_id, API_KEY)
+			loader = rb.Loader()
+			loader.get_url(xml_url, self.__get_release_cb)
+		else:
+			self.callback (self, self.entry, [], *self.callback_args)
+
+
+	def __get_release_cb (self, data):
+		"""
+		Handle the response to the release request made by __search_cb().
+
+		data is the gzip-compressed XML response, or None if the request
+		returned nothing.  Reports [image_urls] through the callback, where
+		image_urls holds the uri of each primary image in the release, or []
+		if the response couldn't be used.
+		"""
+		if data is None:
+			print "release returned nothing"
+			self.callback (self, self.entry, [], *self.callback_args)
+			return
+
+		try:
+			parsed = dom.parseString(self.__decompress(data))
+		except Exception, e:
+			print "error processing response data: %s" % e
+			self.callback (self, self.entry, [], *self.callback_args)
+			return
+
+		# find image URLs.  don't think there's much point using secondary images.
+		image_urls = []
+		for tag in parsed.getElementsByTagName('image'):
+			# getAttribute returns "" for a missing attribute, so images
+			# without a type or uri fall out through the checks below
+			if tag.getAttribute('type') != 'primary':
+				continue
+
+			url = tag.getAttribute('uri').strip()
+			if url != "":
+				print "found image url: %s" % url
+				image_urls.append(url)
+
+		self.callback (self, self.entry, [image_urls], *self.callback_args)
+
+
+
+	def search (self, db, entry, is_playing, callback, *args):
+		"""
+		Start a discogs search for cover art for entry.
+
+		db is used to read the entry's artist and album properties.
+		is_playing is not used by this search.
+		callback is called as callback (self, entry, results, *args), either
+		immediately with None if there is nothing to search for, or later
+		from the response handlers.
+		"""
+		self.entry = entry
+		self.callback = callback
+		self.callback_args = args
+
+		artist = db.entry_get (entry, rhythmdb.PROP_ARTIST)
+		if artist == _("Unknown"):
+			artist = ""
+
+		album = db.entry_get (entry, rhythmdb.PROP_ALBUM)
+		if album == _("Unknown"):
+			album = ""
+
+		# Remove variants of Disc/CD [1-9] from album title before search
+		for exp in DISC_NUMBER_REGEXS:
+			p = re.compile (exp, re.IGNORECASE)
+			album = p.sub ('', album)
+
+		# strings are immutable, so the stripped value has to be kept
+		album = album.strip()
+
+		if (artist, album) == ("", ""):
+			print "can't search: no artist or album"
+			callback (self, entry, None, *args)
+			return
+
+		# trick discogs into handling requests without 'accept-encoding: gzip'
+		self.__poke()
+
+		print "searching for (%s, %s)" % (artist, album)
+		terms = artist + " " + album
+		url = "http://www.discogs.com/search?type=all&f=xml&q=%s&api_key=%s" % (urllib.quote_plus(terms), API_KEY)
+
+		loader = rb.Loader()
+		loader.get_url(url, self.__search_cb, (artist, album))
+
+
+	def search_next (self):
+		"""
+		Return False: this search never has another attempt to make.
+		"""
+		return False
+
+	def get_result_pixbuf (self, search_results):
+		"""
+		Return None: this search only ever provides image URLs.
+		"""
+		return None
+
+	def get_best_match_urls (self, search_results):
+		"""
+		Return the list of image URLs held in search_results, or [] if
+		search_results is empty or None.
+		"""
+		# 'not' covers both [] and the None reported for unsearchable entries
+		if not search_results:
+			return []
+		return search_results[0]
+
+
diff --git a/plugins/artdisplay/artdisplay/Makefile.am b/plugins/artdisplay/artdisplay/Makefile.am
index 58db39d..a318ad2 100644
--- a/plugins/artdisplay/artdisplay/Makefile.am
+++ b/plugins/artdisplay/artdisplay/Makefile.am
@@ -7,6 +7,7 @@ plugin_PYTHON = 			\
 	LocalCoverArtSearch.py		\
 	LocalCoverArtSearchGIO.py	\
 	CoverArtDatabase.py		\
+	DiscogsCoverArtSearch.py	\
 	__init__.py
 
 # the amazon cover art search no longer works
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]