Man page encodings



Hi there,

I've been trying to un-reinvent at least one wheel in yelp's man page
handling and was wondering if someone could have a look at the
result. As far as I can tell, re-encoding man pages is currently broken
(mainly because the language guessing is wrong?).

For example, on Debian with yelp from git, type something like

  LC_ALL=de_DE yelp man:man

and look at the umlauts.

Then I noticed that

  man -R utf-8 man

worked fine.

"Ahah!" I said. "Let's just use man -R". Of course, this isn't enough on
its own, because not all computers that can run yelp have man-db
installed :-( So here's a patch that works out whether it is installed
and, if so, uses it instead of yelp's built-in conversion.

Comments please!

Rupert

From 44521a8b5ffa2f2bc3970aec826627e38fafdf85 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Sun, 12 Dec 2010 01:53:01 +0000
Subject: [PATCH] Try to use man-db to recode man pages to utf8.

This patch adds a test (calling "man -R utf-8 man") to see whether
there's a version of man installed that'll do recoding. If so, we use
that for converting to UTF-8 instead of our builtin glib version.

This patch also changes the built-in version to use GCharsetConverter
so we can chain it with the magic decompressor.
---
 libyelp/yelp-man-document.c |  103 ++---------------
 libyelp/yelp-man-parser.c   |  259 +++++++++++++++++++++++++++++++++++--------
 libyelp/yelp-man-parser.h   |    6 +-
 3 files changed, 231 insertions(+), 137 deletions(-)

diff --git a/libyelp/yelp-man-document.c b/libyelp/yelp-man-document.c
index 14ac8cd..d08f541 100644
--- a/libyelp/yelp-man-document.c
+++ b/libyelp/yelp-man-document.c
@@ -63,62 +63,6 @@ struct _YelpManDocumentPrivate {
     guint          error;
 };
 
-typedef struct _YelpLangEncodings YelpLangEncodings;
-struct _YelpLangEncodings {
-    gchar *language;
-    gchar *encoding;
-};
-/* http://www.w3.org/International/O-charset-lang.html */
-static const YelpLangEncodings langmap[] = {
-    { "C",     "ISO-8859-1" },
-    { "af",    "ISO-8859-1" },
-    { "ar",    "ISO-8859-6" },
-    { "bg",    "ISO-8859-5" },
-    { "be",    "ISO-8859-5" },
-    { "ca",    "ISO-8859-1" },
-    { "cs",    "ISO-8859-2" },
-    { "da",    "ISO-8859-1" },
-    { "de",    "ISO-8859-1" },
-    { "el",    "ISO-8859-7" },
-    { "en",    "ISO-8859-1" },
-    { "eo",    "ISO-8859-3" },
-    { "es",    "ISO-8859-1" },
-    { "et",    "ISO-8859-15" },
-    { "eu",    "ISO-8859-1" },
-    { "fi",    "ISO-8859-1" },
-    { "fo",    "ISO-8859-1" },
-    { "fr",    "ISO-8859-1" },
-    { "ga",    "ISO-8859-1" },
-    { "gd",    "ISO-8859-1" },
-    { "gl",    "ISO-8859-1" },
-    { "hu",    "ISO-8859-2" },
-    { "id",    "ISO-8859-1" }, /* is this right */
-    { "mt",    "ISO-8859-3" },
-    { "is",    "ISO-8859-1" },
-    { "it",    "ISO-8859-1" },
-    { "iw",    "ISO-8859-8" },
-    { "ja",    "EUC-JP" },
-    { "ko",    "EUC-KR" },
-    { "lt",    "ISO-8859-13" },
-    { "lv",    "ISO-8859-13" },
-    { "mk",    "ISO-8859-5" },
-    { "mt",    "ISO-8859-3" },
-    { "no",    "ISO-8859-1" },
-    { "pl",    "ISO-8859-2" },
-    { "pt_BR", "ISO-8859-1" },
-    { "ro",    "ISO-8859-2" },
-    { "ru",    "KOI8-R" },
-    { "sl",    "ISO-8859-2" },
-    { "sr",    "ISO-8859-2" }, /* Latin, not cyrillic */
-    { "sk",    "ISO-8859-2" },
-    { "sv",    "ISO-8859-1" },
-    { "tr",    "ISO-8859-9" },
-    { "uk",    "ISO-8859-5" },
-    { "zh_CN", "BIG5" },
-    { "zh_TW", "BIG5" },
-    { NULL,    NULL },
-};
-
 static void           yelp_man_document_class_init       (YelpManDocumentClass   *klass);
 static void           yelp_man_document_init             (YelpManDocument        *man);
 static void           yelp_man_document_dispose          (GObject                *object);
@@ -390,12 +334,11 @@ man_document_process (YelpManDocument *man)
 {
     YelpManDocumentPrivate *priv = GET_PRIV (man);
     GFile *file = NULL;
-    gchar *filepath = NULL;
+    gchar *filepath;
     GError *error;
     gint  params_i = 0;
     gchar **params = NULL;
     YelpManParser *parser;
-    const gchar *language, *encoding;
 
     file = yelp_uri_get_file (priv->uri);
     if (file == NULL) {
@@ -407,44 +350,22 @@ man_document_process (YelpManDocument *man)
     }
 
     filepath = g_file_get_path (file);
-    g_object_unref (file);
-    if (!g_file_test (filepath, G_FILE_TEST_IS_REGULAR)) {
-        error = g_error_new (YELP_ERROR, YELP_ERROR_NOT_FOUND,
-                             _("The file ‘%s’ does not exist."),
-                             filepath);
-        yelp_document_error_pending ((YelpDocument *) man, error);
-        g_error_free (error);
-        goto done;
-    }
-
-    /* FIXME: get the language */
-    language = "C";
-
-    /* default encoding if the language doesn't match below */
-    encoding = g_getenv("MAN_ENCODING");
-    if (encoding == NULL)
-	encoding = "ISO-8859-1";
-
-    if (language != NULL) {
-        gint i;
-	for (i = 0; langmap[i].language != NULL; i++) {
-	    if (g_str_equal (language, langmap[i].language)) {
-		encoding = langmap[i].encoding;
-		break;
-	    }
-	}
-    }
 
     parser = yelp_man_parser_new ();
-    priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, encoding);
+    priv->xmldoc = yelp_man_parser_parse_file (parser, file,
+                                               filepath, &error);
     yelp_man_parser_free (parser);
 
     if (priv->xmldoc == NULL) {
-	error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,
-                             _("The file ‘%s’ could not be parsed because it is"
-                               " not a well-formed man page."),
-                             filepath);
-	yelp_document_error_pending ((YelpDocument *) man, error);
+        if (!error) {
+            error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,
+                                 _("The file ‘%s’ could not be parsed because it is"
+                                   " not a well-formed man page."),
+                                 filepath);
+        }
+        yelp_document_error_pending ((YelpDocument *) man, error);
+        g_error_free (error);
+        goto done;
     }
 
     g_mutex_lock (priv->mutex);
diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c
index 49efe9f..f04cffa 100644
--- a/libyelp/yelp-man-parser.c
+++ b/libyelp/yelp-man-parser.c
@@ -29,6 +29,7 @@
 #include <libxml/tree.h>
 #include <string.h>
 
+#include "yelp-error.h"
 #include "yelp-man-parser.h"
 #include "yelp-magic-decompressor.h"
 
@@ -83,6 +84,63 @@ struct _YelpManParser {
     GSList       *nodeStack;
 };
 
+typedef struct _YelpLangEncodings YelpLangEncodings;
+struct _YelpLangEncodings {
+    gchar *language;
+    gchar *encoding;
+};
+/* http://www.w3.org/International/O-charset-lang.html */
+static const YelpLangEncodings langmap[] = {
+    { "C",     "ISO-8859-1" },
+    { "af",    "ISO-8859-1" },
+    { "ar",    "ISO-8859-6" },
+    { "bg",    "ISO-8859-5" },
+    { "be",    "ISO-8859-5" },
+    { "ca",    "ISO-8859-1" },
+    { "cs",    "ISO-8859-2" },
+    { "da",    "ISO-8859-1" },
+    { "de",    "ISO-8859-1" },
+    { "el",    "ISO-8859-7" },
+    { "en",    "ISO-8859-1" },
+    { "eo",    "ISO-8859-3" },
+    { "es",    "ISO-8859-1" },
+    { "et",    "ISO-8859-15" },
+    { "eu",    "ISO-8859-1" },
+    { "fi",    "ISO-8859-1" },
+    { "fo",    "ISO-8859-1" },
+    { "fr",    "ISO-8859-1" },
+    { "ga",    "ISO-8859-1" },
+    { "gd",    "ISO-8859-1" },
+    { "gl",    "ISO-8859-1" },
+    { "hu",    "ISO-8859-2" },
+    { "id",    "ISO-8859-1" }, /* is this right */
+    { "mt",    "ISO-8859-3" },
+    { "is",    "ISO-8859-1" },
+    { "it",    "ISO-8859-1" },
+    { "iw",    "ISO-8859-8" },
+    { "ja",    "EUC-JP" },
+    { "ko",    "EUC-KR" },
+    { "lt",    "ISO-8859-13" },
+    { "lv",    "ISO-8859-13" },
+    { "mk",    "ISO-8859-5" },
+    { "mt",    "ISO-8859-3" },
+    { "no",    "ISO-8859-1" },
+    { "pl",    "ISO-8859-2" },
+    { "pt_BR", "ISO-8859-1" },
+    { "ro",    "ISO-8859-2" },
+    { "ru",    "KOI8-R" },
+    { "sl",    "ISO-8859-2" },
+    { "sr",    "ISO-8859-2" }, /* Latin, not cyrillic */
+    { "sk",    "ISO-8859-2" },
+    { "sv",    "ISO-8859-1" },
+    { "tr",    "ISO-8859-9" },
+    { "uk",    "ISO-8859-5" },
+    { "zh_CN", "BIG5" },
+    { "zh_TW", "BIG5" },
+    { NULL,    NULL },
+};
+
+
 YelpManParser *
 yelp_man_parser_new (void)
 {
@@ -91,61 +149,174 @@ yelp_man_parser_new (void)
     return parser;
 }
 
-xmlDocPtr
-yelp_man_parser_parse_file (YelpManParser   *parser,
-			    gchar           *file,
-			    const gchar     *encoding)
+/* Checks (caching the answer) whether a man-db compatible man */
+/* implementation is installed. */
+static gboolean
+man_db_installed_p ()
+{
+    static gint cache = -1;
+    if (cache > -1) return (cache == 0);
+
+    gchar* argv[] = { "man", "-R", "utf-8", "man", NULL };
+
+    /* If we can run "man -R utf-8 man" successfully, then we decide
+     * that man-db (or a plausible lookalike) must be installed. */
+    g_spawn_sync (NULL, argv, NULL,
+                  G_SPAWN_STDOUT_TO_DEV_NULL | G_SPAWN_SEARCH_PATH |
+                  G_SPAWN_STDERR_TO_DEV_NULL,
+                  NULL, NULL, NULL, NULL, &cache, NULL);
+
+    /* If man returns -n, set it to 1: we only care that it was != 0 */
+    if (cache < 0) cache = 1;
+
+    return (cache == 0);
+}
+
+static GInputStream*
+man_db_utf8_recode (const gchar* path, GError **error)
+{
+    gboolean ret;
+    gint stdout;
+    const gchar* argv[] = { "man", "-R", "utf-8", NULL, NULL };
+    GError *err = NULL, *yelp_err = NULL;
+
+    /* I don't have to worry about the lifetime of path, since
+       g_spawn_async_with_pipes works by calling fork() then execv().
+       Fork copies across all pages of memory into my new address
+       space, so path doesn't need to survive past the call below.
+    */
+    argv[3] = path;
+
+    ret = g_spawn_async_with_pipes (NULL, (gchar**)argv, NULL,
+                                    G_SPAWN_SEARCH_PATH,
+                                    NULL, NULL, NULL, NULL, &stdout,
+                                    NULL, &err);
+    if (!ret) {
+        *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN,
+                              err->message);
+        g_error_free (err);
+        return NULL;
+    }
+
+    return (GInputStream*) g_unix_input_stream_new (stdout, TRUE);
+}
+
+/*
+  This function is responsible for returning a utf-8 encoded stream
+  for a man file.
+
+  If we're lucky and man-db is installed, we can call 'man -R' to do
+  the work for us (rather better than our heuristics, maybe).
+
+  Otherwise, we fall back on rather less clever methods.
+
+  If something goes wrong, we return NULL and set error to be a
+  YelpError describing the problem.
+*/
+static GInputStream*
+get_man_utf8 (GFile *file, const gchar *path, GError **error)
 {
-    GFile *gfile;
-    GConverter *converter;
+    GConverter *decompressor, *charconv;
     GFileInputStream *file_stream;
-    GInputStream *stream;
+    GInputStream *ret, *tmp;
+    GError *err = NULL;
+    const gchar *language, *encoding;
+
+    if (man_db_installed_p ())
+        return man_db_utf8_recode (path, error);
+
+    /* Bad news: we've got to do it ourselves. */
+
+    /* FIXME: get the language */
+    language = "C";
+
+    /* default encoding if the language doesn't match below */
+    encoding = g_getenv("MAN_ENCODING");
+    if (encoding == NULL)
+        encoding = "ISO-8859-1";
+
+    if (language != NULL) {
+        gint i;
+        for (i = 0; langmap[i].language != NULL; i++) {
+            if (g_str_equal (language, langmap[i].language)) {
+                encoding = langmap[i].encoding;
+                break;
+            }
+        }
+    }
+
+    file = g_file_new_for_path (path);
+    file_stream = g_file_read (file, NULL, &err);
+    if (!file_stream) {
+        *error = g_error_new (YELP_ERROR, YELP_ERROR_NOT_FOUND,
+                              err->message);
+        g_error_free (err);
+        g_object_unref (file);
+        return NULL;
+    }
+
+    /* Chain converters if necessary with g_converter_input_stream_new
+       (example in gio/tests/filter-cat.c) */
+    decompressor = (GConverter *) yelp_magic_decompressor_new ();
+    ret = g_converter_input_stream_new ((GInputStream *) file_stream,
+                                        decompressor);
+    g_object_unref (decompressor);
+
+    if (!g_str_equal (encoding, "UTF-8")) {
+        charconv =
+            (GConverter *) g_charset_converter_new ("UTF-8", encoding,
+                                                    &err);
+        if (!charconv) {
+            *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN,
+                                  err->message);
+            g_error_free (err);
+            g_object_unref (file);
+            g_object_unref (ret);
+            return NULL;
+        }
+
+        tmp = ret;
+        ret =
+            (GInputStream *) g_converter_input_stream_new (ret,
+                                                           charconv);
+        g_object_unref (charconv);
+        g_object_unref (tmp);
+    }
+
+    return ret;
+}
+
+xmlDocPtr
+yelp_man_parser_parse_file (YelpManParser *parser,
+                            GFile *file,
+                            const gchar *path,
+                            GError **error)
+{
+    GInputStream *recoded_stream;
     gchar *line;
     gsize len;
 
-    gfile = g_file_new_for_path (file);
-    file_stream = g_file_read (gfile, NULL, NULL);
-    converter = (GConverter *) yelp_magic_decompressor_new ();
-    stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter);
-    parser->stream = g_data_input_stream_new (stream);
+    recoded_stream = get_man_utf8 (file, path, error);
+    if (!recoded_stream) return NULL;
+
+    parser->stream = g_data_input_stream_new (recoded_stream);
 
     parser->doc = xmlNewDoc (BAD_CAST "1.0");
     parser->ins = xmlNewNode (NULL, BAD_CAST "Man");
-	xmlDocSetRootElement (parser->doc, parser->ins);
+    xmlDocSetRootElement (parser->doc, parser->ins);
 
     parser->make_links = TRUE;
 
-    while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) {
-	/* convert this line from the encoding indicated to UTF-8 */
-	if (!g_str_equal (encoding, "UTF-8")) {
-	    GError *converr = NULL;
-	    gchar *new_buffer = NULL;
-	    gsize bytes_written = 0;
-
-	    /* We are making the
-	     * assumption that there are no partial characters at the end of this
-	     * string, and therefore can use calls like g_convert() which do not
-	     * preserve state - someone tell me if I'm wrong here */
-	    new_buffer = g_convert (parser->buffer, parser->length, "UTF-8", 
-	                            encoding, NULL, &bytes_written, &converr);
-	    if (converr != NULL) {
-		g_print ("Error occurred converting %s to UTF-8: %s\n", 
-		         encoding, converr->message);
-		g_error_free (converr);
-		break;
-	    } else if (parser->buffer == NULL) {
-		g_print ("parser->buffer == NULL\n");
-		break;
-	    }
+    while (1) {
+       parser->buffer =
+       g_data_input_stream_read_line (parser->stream,
+                                      &(parser->length),
+                                      NULL, NULL);
+       if (parser->buffer == NULL) break;
 
-	    g_free (parser->buffer);
-	    parser->buffer = new_buffer;
-	    parser->length = bytes_written;
-	}
+       parser_parse_line (parser);
 
-	parser_parse_line (parser);
-
-	g_free (parser->buffer);
+       g_free (parser->buffer);
     }
 
     g_object_unref (parser->stream);
diff --git a/libyelp/yelp-man-parser.h b/libyelp/yelp-man-parser.h
index 1901f1b..369ad29 100644
--- a/libyelp/yelp-man-parser.h
+++ b/libyelp/yelp-man-parser.h
@@ -24,14 +24,16 @@
 #define __YELP_MAN_PARSER_H__
 
 #include <glib.h>
+#include <gio/gio.h>
 #include <libxml/tree.h>
 
 typedef struct _YelpManParser YelpManParser;
 
 YelpManParser *     yelp_man_parser_new         (void);
 xmlDocPtr           yelp_man_parser_parse_file  (YelpManParser   *parser,
-						 gchar           *file,
-						 const gchar     *encoding);
+                                                 GFile           *file,
+                                                 const gchar     *path,
+                                                 GError         **error);
 void                yelp_man_parser_free        (YelpManParser   *parser);
 
 #endif /* __YELP_MAN_PARSER_H__ */
-- 
1.7.2.3

Attachment: pgp6PHU3YLuJr.pgp
Description: PGP signature



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]