[gnome-games] tools: Add psxdatacenter-gameinfo.py
- From: Adrien Plazas <aplazas src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-games] tools: Add psxdatacenter-gameinfo.py
- Date: Fri, 19 Aug 2016 21:10:46 +0000 (UTC)
commit d641cbd5f407567a7bfaeb48af8612b539e9a397
Author: Adrien Plazas <kekun plazas laposte net>
Date: Sun Jul 31 00:57:16 2016 +0200
tools: Add psxdatacenter-gameinfo.py
Add the PlayStation DataCenter scraper.
This will be used in a subsequent commit to generate a PlayStation
gameinfo file.
tools/gameinfo/psxdatacenter-gameinfo.py | 153 ++++++++++++++++++++++++++++++
1 files changed, 153 insertions(+), 0 deletions(-)
---
diff --git a/tools/gameinfo/psxdatacenter-gameinfo.py b/tools/gameinfo/psxdatacenter-gameinfo.py
new file mode 100755
index 0000000..4953318
--- /dev/null
+++ b/tools/gameinfo/psxdatacenter-gameinfo.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+from gameinfo import Gameinfo
+from os import makedirs
+from os.path import exists, realpath
+import re
+import requests
+
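+# The script lives in tools/gameinfo/, so the source root is three levels up.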
+def _top_srcdir():
+    return realpath(__file__ + '/../../..')
+
+def _downloaddir():
+    return _top_srcdir() + '/tools/gameinfo/download'
+
+def _outdir():
+    return _top_srcdir() + '/tools/gameinfo/out'
+
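+# requests may not detect the encoding of a fetched page correctly, so try a
+# UTF-16 decode first and fall back to response.text.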
+def _fetch_page(url):
+    user_agent = {'User-Agent': 'Gameinfo PlayStation DataCenter Scrapper 1.0'}
+    response = requests.get(url, headers=user_agent)
+
+    # Try to decode utf-16
+    try:
+        page = response.content.decode('utf-16')
+        if page.startswith('<html'):
+            return page
+    except UnicodeDecodeError:
+        pass
+
+    return response.text
+
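+# Scrape the region list pages: each table row yields an info page URL
+# (col1), one or more disc IDs (col2) and a title (col3).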
+class GamesListScrapper:
+    def _parse_game_list_page(page, gameinfo, verbose=True):
+        skip = '.*?'
+        grab = '(.*?)'
+
+        begin = '<tr>' + skip
+        end = skip + '</tr>'
+
+        info_part = 'col1' + skip + 'href="' + grab + '"'
+        id_part = 'col2' + skip + '>' + grab + '</td>'
+        title_part = 'col3' + skip + '>&nbsp;' + grab + '</td>'
+
+        game_expr = begin + info_part + skip + id_part + skip + title_part + end
+
+        domain = 'http://psxdatacenter.com/'
+
+        for match in re.finditer(game_expr, page, re.DOTALL):
+            comments = {
+                'info': domain + match.group(1),
+            }
+
+            disc_ids = [disc_id.lower() for disc_id in match.group(2).split("<br>")]
+
+            title = match.group(3).split('<br>')[0]
+            title = title.split(' - ')[0]
+
+            if '<u>Includes:</u>' in match.group(3):
+                includes = match.group(3).split('<u>Includes:</u>')[1].replace('\n', ' ').replace('&nbsp;', ' ').replace('</span>', '')
+                includes = re.sub('<span.*?>', '', includes)
+                comments['includes'] = includes.strip()
+
+            if verbose:
+                print("Adding " + " ".join(disc_ids).upper() + ": " + title)
+
+            gameinfo.add_game_discs(title, disc_ids, comments)
+
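+    # Build a temporary Gameinfo from the three regional game lists.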
+    def fetch_tmp_gameinfo():
+        gameinfo = Gameinfo()
+
+        game_lists = [
+            'http://psxdatacenter.com/jlist.html',
+            'http://psxdatacenter.com/plist.html',
+            'http://psxdatacenter.com/ulist.html']
+
+        for url in game_lists:
+            page = _fetch_page(url)
+            GamesListScrapper._parse_game_list_page(page, gameinfo)
+
+        return gameinfo
+
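+# Scrape a single game's info page for its official title.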
+class GamePageScrapper:
+    def _get_game_title(game_page):
+        title_search = 'Official Title.*?<td.*?>(?:&nbsp;)*(.*?)</td>'
+        match = re.search(title_search, game_page, re.DOTALL)
+        if not match:
+            return None
+
+        title = match.group(1)
+
+        title = title.replace('&nbsp;', ' ')
+        title = title.strip()
+
+        return title
+
+    def parse_game_page(gameinfo, game, url):
+        game_page = _fetch_page(url)
+
+        title = GamePageScrapper._get_game_title(game_page)
+        if not title:
+            return
+
+        print(title)
+
+        gameinfo.set_game_title(game, title)
+
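+# Drive the scraping: the temporary gameinfo file doubles as a cache, so an
+# interrupted run picks up where it left off instead of refetching the lists.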
+class Scrapper:
+    _FILENAME = 'playstation.gameinfo.xml.in'
+    _TMP_FILENAME = _FILENAME + '.tmp'
+
+    def _get_tmp_gameinfo():
+        gameinfo_path = _outdir() + '/' + Scrapper._TMP_FILENAME
+        if exists(gameinfo_path):
+            return Gameinfo(gameinfo_path)
+
+        gameinfo = GamesListScrapper.fetch_tmp_gameinfo()
+
+        if not exists(_outdir()):
+            makedirs(_outdir())
+
+        gameinfo.save(gameinfo_path)
+
+        return gameinfo
+
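+    # Replace each game's title with the official one from its info page,
+    # saving every 10 fetched pages so progress survives an interruption.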
+    def scrap():
+        gameinfo = Scrapper._get_tmp_gameinfo()
+
+        gameinfo_path = _outdir() + '/' + Scrapper._TMP_FILENAME
+        if not exists(_outdir()):
+            makedirs(_outdir())
+
+        changed = False
+        i = 0
+        for game in gameinfo.findall('games/game'):
+            for discs in game.findall('./discs[@info]'):
+                info = discs.get('info')
+                if not info:
+                    continue
+
+                GamePageScrapper.parse_game_page(gameinfo, game, info)
+                changed = True
+
+                del discs.attrib['info']
+
+                i = i + 1
+                if i >= 10:
+                    gameinfo.save(gameinfo_path)
+                    i = 0
+        if changed:
+            gameinfo.save(gameinfo_path)
+
+if __name__ == '__main__':
+    Scrapper.scrap()
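
For reference, here is a minimal, self-contained sketch of the row-matching
approach GamesListScrapper uses above, applied to a made-up list row. The
sample HTML is a hypothetical stand-in for the real psxdatacenter.com markup;
the expected output is shown in the trailing comments.

import re

# Hypothetical row in the shape the list-page regex expects.
row = '''<tr>
<td class="col1"><a href="games/J/sample.html">Info</a></td>
<td class="col2">SLPS-00001<br>SLPS-00002</td>
<td class="col3">&nbsp;Sample Game</td>
</tr>'''

skip = '.*?'
grab = '(.*?)'
game_expr = ('<tr>' + skip
             + 'col1' + skip + 'href="' + grab + '"' + skip
             + 'col2' + skip + '>' + grab + '</td>' + skip
             + 'col3' + skip + '>&nbsp;' + grab + '</td>'
             + skip + '</tr>')

for match in re.finditer(game_expr, row, re.DOTALL):
    print(match.group(1))  # games/J/sample.html
    # Disc IDs are lowercased, as in _parse_game_list_page.
    print([i.lower() for i in match.group(2).split('<br>')])
    # -> ['slps-00001', 'slps-00002']
    print(match.group(3))  # Sample Game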