[rhythmbox] lyrics: add a lyrics parser for darklyrics.com (bug #525094)

From: Jonathan Matthew <jmatthew src gnome org>
To: commits-list gnome org
Cc:
Subject: [rhythmbox] lyrics: add a lyrics parser for darklyrics.com (bug #525094)
Date: Sat, 29 May 2010 06:24:22 +0000 (UTC)
commit 06d99662b27d1f8e8cb396f02f62436853b9262a
Author: Edgar Luna <edgar luna gmail com>
Date:   Sat May 29 16:23:02 2010 +1000

    lyrics: add a lyrics parser for darklyrics.com (bug #525094)

 plugins/lyrics/lyrics/DarkLyricsParser.py |  163 +++++++++++++++++++++++++++++
 plugins/lyrics/lyrics/LyricsSites.py      |    4 +-
 plugins/lyrics/lyrics/Makefile.am         |    3 +-
 3 files changed, 168 insertions(+), 2 deletions(-)
---
diff --git a/plugins/lyrics/lyrics/DarkLyricsParser.py b/plugins/lyrics/lyrics/DarkLyricsParser.py
new file mode 100644
index 0000000..7370ece
--- /dev/null
+++ b/plugins/lyrics/lyrics/DarkLyricsParser.py
@@ -0,0 +1,163 @@
+# -*- Mode: python; coding: utf-8; tab-width: 8; indent-tabs-mode: t; -*-
+#
+# Copyright (C) 2008, 2009, 2010 Edgar Luna
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# The Rhythmbox authors hereby grant permission for non-GPL compatible
+# GStreamer plugins to be used and distributed together with GStreamer
+# and Rhythmbox. This permission is above and beyond the permissions granted
+# by the GPL license by which Rhythmbox is covered. If you modify this code
+# you may extend this exception to your version of the code, but you are not
+# obligated to do so. If you do not wish to do so, delete this exception
+# statement from your version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA.
+
+import re
+import string
+import rb
+
+min_artist_match = .5  # an artist link is accepted only if its string_match score exceeds this
+min_song_match = .5  # a song link is accepted only if its string_match score exceeds this
+from rb.stringmatch import string_match
+
+class DarkLyricsParser (object):
+	"""Parser for Lyrics from www.darklyrics.com"""
+
+
+	def __init__(self, artist, title):
+		self.artist = artist
+		self.title = title
+		self.artist_ascii = ''  # lower-cased ASCII letters of the artist name; filled in by search()
+		self.titlenumber = ''  # '#fragment' of the matched song link; filled in by search_song()
+
+	def search(self, callback, *data):
+		"""Request the darklyrics.com index page for the artist's first ASCII letter."""
+		self.artist_ascii = ''.join(c for c in self.artist.lower()
+					    if c in string.ascii_letters)
+		if not self.artist_ascii:
+			callback (None, *data)	# no ASCII letter, so no index page to fetch
+			return
+		firstcharurl = 'http://www.darklyrics.com/%s.html' % (self.artist_ascii[0])
+		rb.Loader().get_url (firstcharurl, self.search_artist, callback, *data)
+
+	def search_artist(self, artist_page, callback, *data):
+		"""Find the best-matching artist link in the index page and fetch it.
+		Calls callback (None, *data) if the page is unusable or nothing matches."""
+		if artist_page is None:
+			callback (None, *data)
+			return
+
+		sections = re.split ('<SCRIPT LANGUAGE="javascript" src="tban2.js"></SCRIPT>',
+				     artist_page, 1)
+		if len (sections) < 2:
+			# Marker not found: the page is not in the layout this parser expects.
+			callback (None, *data)
+			return
+		pattern_link =  '<a href="'
+		pattern_artist = '([^"]*)">*([^<]*)</a><br><br>'
+		links = re.split (pattern_link, sections[1].lower())
+		links.pop(0)
+		best_match = ()
+		for line in links:
+			artist = re.findall(pattern_artist, line)
+			if len(artist) == 0 or artist[0][0][:5] == 'http:':
+				# Not an artist entry, or the lyrics are located in an external site.
+				continue
+			artist_link, artist_name = artist[0]
+			artist_name = artist_name.strip()
+			smvalue = string_match (artist_name, self.artist_ascii)
+			# Keep the highest-scoring artist above the threshold, not merely the last one.
+			if smvalue > min_artist_match and (not best_match or smvalue > best_match[0]):
+				best_match = (smvalue, 'http://www.darklyrics.com/%s' % (artist_link), artist_name)
+		if not best_match:
+			callback (None, *data)
+			return
+		self.artist  = best_match[2]
+		rb.Loader ().get_url (best_match[1], self.search_song, callback, *data)
+
+	class SongFound (object):
+		"""A song link found on an artist page, together with its string_match score."""
+		def __init__ (self, smvalue, title, number, album, artist):
+			self.smvalue = smvalue
+			self.title = title
+			self.number = number
+			self.album = album
+			self.artist = artist
+		def __str__(self):
+			return '(' + str(self.smvalue) + '. ' + self.title + '. ' + self.album + '. ' + self.artist + ')'
+
+	def search_song (self, songlist, callback, *data):
+		"""Find the requested song in the artist page and fetch its album page.
+
+		The artist page contains a list of all the albums with links to
+		the song lyrics; the link whose text best matches self.title wins.
+		Calls callback (None, *data) when the page is missing, is not in
+		the expected layout, or no song scores above min_song_match.
+		"""
+		if songlist is None:
+			callback (None, *data)
+			return
+		sections = re.split('LYRICS<BR></FONT>', songlist)
+		if len (sections) < 2:
+			# Marker not found: the page is not in the layout this parser expects.
+			callback (None, *data)
+			return
+		# Page text and pattern are both lower-cased so matching ignores case.
+		link_section = sections[1].lower()
+		pattern_song = '<a href="../lyrics/(.*)/(.*).html#([^"]+)" target="_blank"><FONT COLOR="#CCCCCC">(.*)</FONT></a><br>'
+		matches = re.findall (pattern_song.lower(), link_section)
+		best_match = None
+		for artist, album, number, title in matches:
+			smvalue = string_match (title.lower().replace(' ', '' ),
+					   self.title.lower().replace(' ', ''))
+			# Keep the highest-scoring song above the threshold, not merely the last one.
+			if smvalue > min_song_match and (best_match is None or
+							 smvalue > best_match.smvalue):
+				best_match = self.SongFound(smvalue, title, number,
+							     album, artist)
+		if best_match is None:
+			callback (None, *data)
+			return
+		url = 'http://www.darklyrics.com/lyrics/%s/%s.html' % (best_match.artist, best_match.album)
+		self.title = best_match.title
+		self.titlenumber = best_match.number
+		loader = rb.Loader ()
+		loader.get_url (url, self.parse_lyrics, callback, *data)
+
+	def parse_lyrics (self, album, callback, *data):
+		"""Extract the lyrics of self.title from the album page.
+
+		The page contains the lyrics of every song on the album; only the
+		text under this song's numbered heading is kept, with HTML tags
+		removed. Calls callback (None, *data) if the heading is not found."""
+		if album is None:
+			callback (None, *data)
+			return
+		# Escape the scraped number and title so they match literally; the lyrics
+		# are captured lazily up to the next tag whose name starts with a or f.
+		titleline = r'(?mis)<a name=%s><FONT color=#DDDDDD><b>%s\. %s</b></font>(.+?)<[af]' % \
+		    (re.escape(self.titlenumber), re.escape(self.titlenumber), re.escape(self.title.title()))
+		lyricmatch = re.split (titleline, album)
+		if len (lyricmatch) < 2:
+			callback (None, *data)
+			return
+		lyrics = lyricmatch[1]
+		lyrics = lyrics.replace ('\r', "")
+		lyrics = re.sub (r'<.*?>', "", lyrics)
+		lyrics = lyrics.strip ("\n")
+		title = "%s - %s\n\n" % (self.artist.title(), self.title.title())
+		lyrics = title + str (lyrics)
+		lyrics += "\n\nLyrics provided by Dark Lyrics"
+		callback (lyrics, *data)
diff --git a/plugins/lyrics/lyrics/LyricsSites.py b/plugins/lyrics/lyrics/LyricsSites.py
index e995e51..eb13011 100644
--- a/plugins/lyrics/lyrics/LyricsSites.py
+++ b/plugins/lyrics/lyrics/LyricsSites.py
@@ -30,13 +30,15 @@ from AstrawebParser import AstrawebParser
 from LeoslyricsParser import LeoslyricsParser
 from WinampcnParser import WinampcnParser
 from TerraParser import TerraParser
+from DarkLyricsParser import DarkLyricsParser
 
 lyrics_sites = [
 	{ 'id': 'lyrc.com.ar', 		'class': LyrcParser, 		'name': _("Lyrc (lyrc.com.ar)") 		},
 	{ 'id': 'astraweb.com', 	'class': AstrawebParser, 	'name': _("Astraweb (www.astraweb.com)") 	},
 	{ 'id': 'leoslyrics.com', 	'class': LeoslyricsParser, 	'name': _("Leo's Lyrics (www.leoslyrics.com)") 	},
 	{ 'id': 'winampcn.com', 	'class': WinampcnParser, 	'name': _("WinampCN (www.winampcn.com)") 	},
-	{ 'id': 'terra.com.br',		'class': TerraParser,		'name': _("TerraBrasil (terra.com.br)")		}
+	{ 'id': 'terra.com.br',		'class': TerraParser,		'name': _("TerraBrasil (terra.com.br)")		},
+	{ 'id': 'darklyrics.com',	'class': DarkLyricsParser,	'name': _("Dark Lyrics (darklyrics.com)")	}
 
 ]
 
diff --git a/plugins/lyrics/lyrics/Makefile.am b/plugins/lyrics/lyrics/Makefile.am
index 3a5dd82..05026bd 100644
--- a/plugins/lyrics/lyrics/Makefile.am
+++ b/plugins/lyrics/lyrics/Makefile.am
@@ -11,4 +11,5 @@ plugin_PYTHON =				\
        LeoslyricsParser.py		\
        LyricWikiParser.py		\
        WinampcnParser.py		\
-       TerraParser.py
+       TerraParser.py			\
+       DarkLyricsParser.py
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]