[rhythmbox] lyrics: update TerraParser to match current layout



commit cd9eada4577f8033f03e88bceaec585024dbbdc5
Author: Jonathan Matthew <jonathan d14n org>
Date:   Sun Mar 16 12:38:22 2014 +1000

    lyrics: update TerraParser to match current layout

 plugins/lyrics/TerraParser.py |   25 +++++++++++++++----------
 1 files changed, 15 insertions(+), 10 deletions(-)
---
diff --git a/plugins/lyrics/TerraParser.py b/plugins/lyrics/TerraParser.py
index d131667..e5e710f 100644
--- a/plugins/lyrics/TerraParser.py
+++ b/plugins/lyrics/TerraParser.py
@@ -96,16 +96,21 @@ class TerraParser (object):
 
 
        def parse_lyrics(self, source):
+               def unspace(x):
+                       return " ".join(x.split())
+               def untag(x):
+                       return re.sub('<.*?>', '', x)
+
                source = re.split('<div id="letra">', source)[1]
-               source = re.split('<p>', source)
-               # Parse artist and title
-               artistitle = re.sub('<.*?>', '', source[0])
-               rx = re.compile('^(\t|\n)+',re.M | re.S)
-               artistitle = rx.sub('', artistitle)
-               # Parse lyrics
-               lyrics = re.split('</p>', source[1])[0]
-               lyrics = re.sub('<[Bb][Rr]/>', '', lyrics)
-
-               lyrics = unescape_entities(artistitle) + "\n" + unescape_entities(lyrics)
+               source = re.split('</?div.*?>', source)
+               # source[1] = artist+title
+               # source[2] = lyrics
+
+               header = "".join(source[1].splitlines())
+               # <h1><a>title</a></h1> <h2><a>artist</a></h2>
+               bits = re.findall('<h.>(.*?)</h.>', header)
+               artistitle = unspace(untag(" - ".join(bits)))
+
+               lyrics = unescape_entities(artistitle) + "\n" + unescape_entities(untag(source[2]))
                lyrics += "\n\nEsta letra foi disponibilizada pelo site\nhttp://letras.mus.br";
                return lyrics


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]