Re: library.gnome.org index



I wrote:

> > Seems a bit nasty. Suggest to use those index.xml.$LANG files. Haven't
> > looked much further. Only that using a minimal XSLT would be better (so
> > we put the top stuff in XSLT and reuse it).
> 
> I agree, I missed the index.xml files at first.  Also I believe the
> OMF files would be useful to get subject/category and document type.

Not much work done yet, but the script now uses the index.xml files to get
the title and abstract (script attached).

Also, I tried to sort by title using strcoll, but that requires calling
  locale.setlocale(locale.LC_COLLATE, xx)
where xx must be a complete and available locale, i.e. not es but es_ES.


The current behaviour is to include links to the English documentation for
applications whose documentation is not available in the user's language.
This is not really nice; perhaps those documents should be put in a
different section?

As for documents where the last version has not been translated, I
think I will add a short sentence such as "the latest version of this
document is not available in this language, here is a link to the
latest version in English"; does it sound ok?


        Frederic
#! /usr/bin/env python

import locale
import os
import re
import sys
import xml.dom.minidom
import xml.sax.saxutils

class Documentation(object):
    """Plain attribute container describing one translated document.

    The class defines no behaviour of its own.  The directory scan in this
    script attaches ``lang``, ``filepath``, ``module``, ``version``, ``url``,
    ``title`` and ``abstract`` attributes to each instance it creates.
    """

def getNodeTextChild(node):
    """Return the text found directly inside *node*, encoded as UTF-8.

    Only the immediate children of *node* that are text nodes contribute;
    children of any other node type (nested elements, for example) are
    skipped rather than descended into.
    """
    pieces = [child.data for child in node.childNodes
              if child.nodeType == child.TEXT_NODE]
    return ''.join(pieces).encode('utf-8')


# NOTE(review): these two patterns are not referenced anywhere in the visible
# part of this script (titles and abstracts are read from the index.xml
# files below).  They are kept so the module-level names stay defined.
title_re = re.compile('<h1>(.*?)</h1>', re.DOTALL)
abstract_re = re.compile('<h3 class="abstract">(.*)</h3', re.DOTALL)

docs = {} # indexed on module
languages = {}

# Scan the tree below the current directory.  A directory is treated as a
# document when it contains an English index.xml.en; its last two path
# components are taken as the module name and the version.
for base, dirs, filenames in os.walk('.'):
    if 'index.xml.en' not in filenames:
        continue
    if base == '.':
        continue
    try:
        ign, module, version = base.rsplit(os.path.sep, 2)
    except ValueError:
        # fewer than three path components: not a <module>/<version> dir
        continue

    for filename in filenames:
        if not filename.startswith('index.xml.'):
            continue

        doc = Documentation()

        # the language code is whatever follows the last dot
        doc.lang = filename.rsplit('.', 1)[-1]
        doc.filepath = os.path.join(base, filename)
        doc.module = module
        doc.version = version
        # drop the leading '.' of the walk root and point at the .html file
        doc.url = doc.filepath[1:].replace('.xml', '.html')

        # minidom.parse() opens the file itself and closes it again when
        # done, so no file handle is left for the garbage collector.
        dom = xml.dom.minidom.parse(doc.filepath)

        try:
            doc.title = getNodeTextChild(dom.getElementsByTagName('title')[0])
        except IndexError:
            # a document without a <title> cannot be listed; skip it
            # (single-argument print() emits the same line under Python 2)
            print('failed to get title for %s %s' % (doc.module, doc.lang))
            continue

        try:
            doc.abstract = getNodeTextChild(dom.getElementsByTagName('abstract')[0])
        except IndexError:
            # the abstract is optional
            doc.abstract = None

        docs.setdefault(module, []).append(doc)

        # dict used as a set of every language seen
        languages[doc.lang] = True


def _version_key(version):
    """Turn a dotted version string into a list that sorts numerically."""
    key = []
    for part in version.split('.'):
        if part.isdigit():
            # numeric components compare as integers, so 10 > 8
            key.append((0, int(part)))
        else:
            # non-numeric components compare as text and sort after numbers;
            # the leading tag keeps ints and strings from being compared
            key.append((1, part))
    return key


def cmpv(x, y):
    """Compare two documents by their ``version`` attribute.

    Versions are split on dots and compared component by component, with
    numeric components compared as numbers.  Comparing the raw strings
    would put "2.10" before "2.8".

    Returns -1, 0 or 1, like the ``cmp`` builtin, so it can be used as a
    comparison function for ``list.sort``.
    """
    a = _version_key(x.version)
    b = _version_key(y.version)
    return (a > b) - (a < b)

def included_file(key, lang):
    """Return the contents of the HTML snippet *key* for language *lang*.

    Reads ``<key>.html.<lang>`` when that file exists and falls back to
    the English ``<key>.html.en`` otherwise.  Any error raised by opening
    or reading the chosen file propagates to the caller.
    """
    path = '%s.html.%s' % (key, lang)
    if not os.path.exists(path):
        path = '%s.html.en' % key
    snippet = open(path)
    try:
        return snippet.read()
    finally:
        # close explicitly instead of relying on garbage collection
        snippet.close()


# Order every module's documents by version once, up front, instead of
# re-sorting the same lists again for each language.
for versions in docs.values():
    versions.sort(cmpv)

# Write one index.html.<lang> per language seen during the scan.
for lang in languages:
    out = open('index.html.%s' % lang, 'w')
    try:
        print >> out, included_file('snippets/index_top', lang)
        print >> out, included_file('header', lang)

        print >> out, '<dl class="doc-index">'

        # pick one document per module: the newest in this language if
        # there is one, the newest English one otherwise
        documents = []

        for module in docs:
            versions = docs[module]

            in_lang = [x for x in versions if x.lang == lang]
            if in_lang:
                # TODO: when in_lang[-1].version != versions[-1].version the
                # latest version in this language is older than in English;
                # it is included as is nevertheless
                d = in_lang[-1]
            else:
                # not available in this language, fallback to English
                english = [x for x in versions if x.lang == 'en']
                if not english:
                    # the English file was skipped during the scan (it had
                    # no title), so there is nothing to link to
                    continue
                d = english[-1]

            documents.append(d)

        try:
            # will fail most of the time, it requires lang -> locale mapping
            locale.setlocale(locale.LC_COLLATE, lang)
        except locale.Error:
            # fall back to plain "C" ordering so the collation rules of a
            # previously processed language are not silently reused
            locale.setlocale(locale.LC_COLLATE, 'C')
        documents.sort(lambda x, y: locale.strcoll(x.title, y.title))

        for d in documents:
            # titles and abstracts are plain text taken from XML, so markup
            # characters such as & and < must be escaped for HTML output
            print >> out, '  <dt><a href="%s">%s</a></dt>' % (
                xml.sax.saxutils.escape(d.url, {'"': '&quot;'}),
                xml.sax.saxutils.escape(d.title))
            if d.abstract:
                print >> out, '  <dd><p>%s</p>' % xml.sax.saxutils.escape(d.abstract)
                print >> out, '</dd>\n'

        print >> out, '</dl>'
        print >> out, included_file('footer', lang)
    finally:
        # always release the output file, even if a snippet is missing
        out.close()



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]