[rhythmbox] lyrics: update TerraParser to match current layout
- From: Jonathan Matthew <jmatthew src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [rhythmbox] lyrics: update TerraParser to match current layout
- Date: Sun, 16 Mar 2014 02:38:45 +0000 (UTC)
commit cd9eada4577f8033f03e88bceaec585024dbbdc5
Author: Jonathan Matthew <jonathan d14n org>
Date: Sun Mar 16 12:38:22 2014 +1000
lyrics: update TerraParser to match current layout
plugins/lyrics/TerraParser.py | 25 +++++++++++++++----------
1 files changed, 15 insertions(+), 10 deletions(-)
---
diff --git a/plugins/lyrics/TerraParser.py b/plugins/lyrics/TerraParser.py
index d131667..e5e710f 100644
--- a/plugins/lyrics/TerraParser.py
+++ b/plugins/lyrics/TerraParser.py
@@ -96,16 +96,21 @@ class TerraParser (object):
def parse_lyrics(self, source):
+ def unspace(x):
+ return " ".join(x.split())
+ def untag(x):
+ return re.sub('<.*?>', '', x)
+
source = re.split('<div id="letra">', source)[1]
- source = re.split('<p>', source)
- # Parse artist and title
- artistitle = re.sub('<.*?>', '', source[0])
- rx = re.compile('^(\t|\n)+',re.M | re.S)
- artistitle = rx.sub('', artistitle)
- # Parse lyrics
- lyrics = re.split('</p>', source[1])[0]
- lyrics = re.sub('<[Bb][Rr]/>', '', lyrics)
-
- lyrics = unescape_entities(artistitle) + "\n" + unescape_entities(lyrics)
+ source = re.split('</?div.*?>', source)
+ # source[1] = artist+title
+ # source[2] = lyrics
+
+ header = "".join(source[1].splitlines())
+ # <h1><a>title</a></h1> <h2><a>artist</a></h2>
+ bits = re.findall('<h.>(.*?)</h.>', header)
+ artistitle = unspace(untag(" - ".join(bits)))
+
+ lyrics = unescape_entities(artistitle) + "\n" + unescape_entities(untag(source[2]))
lyrics += "\n\nEsta letra foi disponibilizada pelo site\nhttp://letras.mus.br"
return lyrics
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]