Hi there,

I've been trying to un-reinvent at least one wheel in yelp's man page handling and was wondering if someone could have a look at the result.

My starting point was that re-encoding of man pages is currently broken (mainly because the language guessing is wrong?). For example, on Debian with yelp from git, run something like "LC_ALL=de_DE yelp man:man" and look at the umlauts. Then I noticed that "man -R utf-8 man" worked fine. "Ahah!" I said. "Let's just use man -R".

Of course, this isn't good enough on its own, because not all computers that can run yelp have man-db installed :-( So here's a patch that works out whether man-db is installed and, if so, uses it instead of yelp's built-in recoding.

Comments please!

Rupert
From 44521a8b5ffa2f2bc3970aec826627e38fafdf85 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Sun, 12 Dec 2010 01:53:01 +0000
Subject: [PATCH] Try to use man-db to recode man pages to utf8.
This patch adds a test (calling "man -R utf-8 man") to see whether
there's a version of man installed that can do the recoding. If so, we
use that to convert to UTF-8 instead of our built-in GLib version.
This patch also changes the built-in version to use GCharsetConverter
so we can chain it with the magic decompressor.
---
libyelp/yelp-man-document.c | 103 ++---------------
libyelp/yelp-man-parser.c | 259 +++++++++++++++++++++++++++++++++++--------
libyelp/yelp-man-parser.h | 6 +-
3 files changed, 231 insertions(+), 137 deletions(-)
diff --git a/libyelp/yelp-man-document.c b/libyelp/yelp-man-document.c
index 14ac8cd..d08f541 100644
--- a/libyelp/yelp-man-document.c
+++ b/libyelp/yelp-man-document.c
@@ -63,62 +63,6 @@ struct _YelpManDocumentPrivate {
guint error;
};
-typedef struct _YelpLangEncodings YelpLangEncodings;
-struct _YelpLangEncodings {
- gchar *language;
- gchar *encoding;
-};
-/* http://www.w3.org/International/O-charset-lang.html */
-static const YelpLangEncodings langmap[] = {
- { "C", "ISO-8859-1" },
- { "af", "ISO-8859-1" },
- { "ar", "ISO-8859-6" },
- { "bg", "ISO-8859-5" },
- { "be", "ISO-8859-5" },
- { "ca", "ISO-8859-1" },
- { "cs", "ISO-8859-2" },
- { "da", "ISO-8859-1" },
- { "de", "ISO-8859-1" },
- { "el", "ISO-8859-7" },
- { "en", "ISO-8859-1" },
- { "eo", "ISO-8859-3" },
- { "es", "ISO-8859-1" },
- { "et", "ISO-8859-15" },
- { "eu", "ISO-8859-1" },
- { "fi", "ISO-8859-1" },
- { "fo", "ISO-8859-1" },
- { "fr", "ISO-8859-1" },
- { "ga", "ISO-8859-1" },
- { "gd", "ISO-8859-1" },
- { "gl", "ISO-8859-1" },
- { "hu", "ISO-8859-2" },
- { "id", "ISO-8859-1" }, /* is this right */
- { "mt", "ISO-8859-3" },
- { "is", "ISO-8859-1" },
- { "it", "ISO-8859-1" },
- { "iw", "ISO-8859-8" },
- { "ja", "EUC-JP" },
- { "ko", "EUC-KR" },
- { "lt", "ISO-8859-13" },
- { "lv", "ISO-8859-13" },
- { "mk", "ISO-8859-5" },
- { "mt", "ISO-8859-3" },
- { "no", "ISO-8859-1" },
- { "pl", "ISO-8859-2" },
- { "pt_BR", "ISO-8859-1" },
- { "ro", "ISO-8859-2" },
- { "ru", "KOI8-R" },
- { "sl", "ISO-8859-2" },
- { "sr", "ISO-8859-2" }, /* Latin, not cyrillic */
- { "sk", "ISO-8859-2" },
- { "sv", "ISO-8859-1" },
- { "tr", "ISO-8859-9" },
- { "uk", "ISO-8859-5" },
- { "zh_CN", "BIG5" },
- { "zh_TW", "BIG5" },
- { NULL, NULL },
-};
-
static void yelp_man_document_class_init (YelpManDocumentClass *klass);
static void yelp_man_document_init (YelpManDocument *man);
static void yelp_man_document_dispose (GObject *object);
@@ -390,12 +334,11 @@ man_document_process (YelpManDocument *man)
{
YelpManDocumentPrivate *priv = GET_PRIV (man);
GFile *file = NULL;
- gchar *filepath = NULL;
+ gchar *filepath;
GError *error;
gint params_i = 0;
gchar **params = NULL;
YelpManParser *parser;
- const gchar *language, *encoding;
file = yelp_uri_get_file (priv->uri);
if (file == NULL) {
@@ -407,44 +350,22 @@ man_document_process (YelpManDocument *man)
}
filepath = g_file_get_path (file);
- g_object_unref (file);
- if (!g_file_test (filepath, G_FILE_TEST_IS_REGULAR)) {
- error = g_error_new (YELP_ERROR, YELP_ERROR_NOT_FOUND,
- _("The file ‘%s’ does not exist."),
- filepath);
- yelp_document_error_pending ((YelpDocument *) man, error);
- g_error_free (error);
- goto done;
- }
-
- /* FIXME: get the language */
- language = "C";
-
- /* default encoding if the language doesn't match below */
- encoding = g_getenv("MAN_ENCODING");
- if (encoding == NULL)
- encoding = "ISO-8859-1";
-
- if (language != NULL) {
- gint i;
- for (i = 0; langmap[i].language != NULL; i++) {
- if (g_str_equal (language, langmap[i].language)) {
- encoding = langmap[i].encoding;
- break;
- }
- }
- }
parser = yelp_man_parser_new ();
- priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, encoding);
+ priv->xmldoc = yelp_man_parser_parse_file (parser, file,
+ filepath, &error);
yelp_man_parser_free (parser);
if (priv->xmldoc == NULL) {
- error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,
- _("The file ‘%s’ could not be parsed because it is"
- " not a well-formed man page."),
- filepath);
- yelp_document_error_pending ((YelpDocument *) man, error);
+ if (!error) {
+ error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,
+ _("The file ‘%s’ could not be parsed because it is"
+ " not a well-formed man page."),
+ filepath);
+ }
+ yelp_document_error_pending ((YelpDocument *) man, error);
+ g_error_free (error);
+ goto done;
}
g_mutex_lock (priv->mutex);
diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c
index 49efe9f..f04cffa 100644
--- a/libyelp/yelp-man-parser.c
+++ b/libyelp/yelp-man-parser.c
@@ -29,6 +29,7 @@
#include <libxml/tree.h>
#include <string.h>
+#include "yelp-error.h"
#include "yelp-man-parser.h"
#include "yelp-magic-decompressor.h"
@@ -83,6 +84,63 @@ struct _YelpManParser {
GSList *nodeStack;
};
+/* One row of the language -> legacy man page charset table below. */
+typedef struct _YelpLangEncodings YelpLangEncodings;
+struct _YelpLangEncodings {
+    const gchar *language;   /* language code, e.g. "de" or "pt_BR" */
+    const gchar *encoding;   /* charset name to convert from */
+};
+
+/*
+ * Charset assumed for a man page written in a given language; consulted
+ * by get_man_utf8() when it has to recode a page itself.  The table is
+ * terminated by a { NULL, NULL } entry.
+ *
+ * http://www.w3.org/International/O-charset-lang.html
+ */
+static const YelpLangEncodings langmap[] = {
+    { "C", "ISO-8859-1" },
+    { "af", "ISO-8859-1" },
+    { "ar", "ISO-8859-6" },
+    { "bg", "ISO-8859-5" },
+    { "be", "ISO-8859-5" },
+    { "ca", "ISO-8859-1" },
+    { "cs", "ISO-8859-2" },
+    { "da", "ISO-8859-1" },
+    { "de", "ISO-8859-1" },
+    { "el", "ISO-8859-7" },
+    { "en", "ISO-8859-1" },
+    { "eo", "ISO-8859-3" },
+    { "es", "ISO-8859-1" },
+    { "et", "ISO-8859-15" },
+    { "eu", "ISO-8859-1" },
+    { "fi", "ISO-8859-1" },
+    { "fo", "ISO-8859-1" },
+    { "fr", "ISO-8859-1" },
+    { "ga", "ISO-8859-1" },
+    { "gd", "ISO-8859-1" },
+    { "gl", "ISO-8859-1" },
+    { "hu", "ISO-8859-2" },
+    { "id", "ISO-8859-1" }, /* is this right */
+    { "mt", "ISO-8859-3" },
+    { "is", "ISO-8859-1" },
+    { "it", "ISO-8859-1" },
+    { "iw", "ISO-8859-8" },
+    { "ja", "EUC-JP" },
+    { "ko", "EUC-KR" },
+    { "lt", "ISO-8859-13" },
+    { "lv", "ISO-8859-13" },
+    { "mk", "ISO-8859-5" },
+    { "no", "ISO-8859-1" },
+    { "pl", "ISO-8859-2" },
+    { "pt_BR", "ISO-8859-1" },
+    { "ro", "ISO-8859-2" },
+    { "ru", "KOI8-R" },
+    { "sl", "ISO-8859-2" },
+    { "sr", "ISO-8859-2" }, /* Latin, not cyrillic */
+    { "sk", "ISO-8859-2" },
+    { "sv", "ISO-8859-1" },
+    { "tr", "ISO-8859-9" },
+    { "uk", "ISO-8859-5" },
+    { "zh_CN", "GB2312" }, /* Simplified Chinese; BIG5 is Traditional */
+    { "zh_TW", "BIG5" },
+    { NULL, NULL },
+};
+
+
YelpManParser *
yelp_man_parser_new (void)
{
@@ -91,61 +149,174 @@ yelp_man_parser_new (void)
return parser;
}
-xmlDocPtr
-yelp_man_parser_parse_file (YelpManParser *parser,
- gchar *file,
- const gchar *encoding)
+/*
+ * Checks (caching the answer) whether a man-db compatible man
+ * implementation is installed.
+ *
+ * The probe runs "man -R utf-8 man" with its output discarded.  If the
+ * command can be spawned and exits with status 0, we decide that man-db
+ * (or a plausible lookalike) must be installed.
+ *
+ * Returns: TRUE if the probe succeeded, FALSE otherwise.
+ */
+static gboolean
+man_db_installed_p (void)
+{
+    /* -1: not probed yet, 0: probe failed, 1: probe succeeded. */
+    static gint cache = -1;
+    gchar *argv[] = { "man", "-R", "utf-8", "man", NULL };
+    gint exit_status = 0;
+    gboolean spawned;
+
+    if (cache >= 0)
+        return cache == 1;
+
+    spawned = g_spawn_sync (NULL, argv, NULL,
+                            G_SPAWN_STDOUT_TO_DEV_NULL | G_SPAWN_SEARCH_PATH |
+                            G_SPAWN_STDERR_TO_DEV_NULL,
+                            NULL, NULL, NULL, NULL, &exit_status, NULL);
+
+    /* exit_status is only meaningful when the spawn itself worked, so
+     * check the return value rather than trusting the out-parameter. */
+    cache = (spawned && exit_status == 0) ? 1 : 0;
+
+    return cache == 1;
+}
+
+/*
+ * Starts "man -R utf-8 <path>" asynchronously and returns a stream that
+ * reads the command's standard output.
+ *
+ * path:  the man page file handed to man as its last argument.
+ * error: return location for a YELP_ERROR_UNKNOWN error, or NULL.
+ *
+ * Returns: a new GInputStream owned by the caller, or NULL (with error
+ * set) if the process could not be spawned.
+ */
+static GInputStream*
+man_db_utf8_recode (const gchar* path, GError **error)
+{
+    const gchar *argv[] = { "man", "-R", "utf-8", NULL, NULL };
+    /* Not called "stdout": that identifier belongs to <stdio.h>. */
+    gint stdout_fd = -1;
+    GError *spawn_err = NULL;
+
+    /* I don't have to worry about the lifetime of path, since
+       g_spawn_async_with_pipes works by calling fork() then execv().
+       Fork copies across all pages of memory into the new address
+       space, so path doesn't need to survive past the call below.
+    */
+    argv[3] = path;
+
+    if (!g_spawn_async_with_pipes (NULL, (gchar **) argv, NULL,
+                                   G_SPAWN_SEARCH_PATH,
+                                   NULL, NULL, NULL, NULL, &stdout_fd,
+                                   NULL, &spawn_err)) {
+        /* Pass the message as an argument, never as the format string:
+         * it may contain '%' characters.  g_set_error also copes with a
+         * NULL error pointer. */
+        g_set_error (error, YELP_ERROR, YELP_ERROR_UNKNOWN,
+                     "%s", spawn_err->message);
+        g_error_free (spawn_err);
+        return NULL;
+    }
+
+    /* TRUE: the stream closes the descriptor when it is closed. */
+    return (GInputStream *) g_unix_input_stream_new (stdout_fd, TRUE);
+}
+
+/*
+  This function is responsible for returning a utf-8 encoded stream
+  for a man file.
+
+  If we're lucky and man-db is installed, we can call 'man -R' to do
+  the work for us (rather better than our heuristics, maybe).
+
+  Otherwise, we fall back on rather less clever methods: the file is
+  read through the magic decompressor and then through a charset
+  converter chosen from langmap.
+
+  file:  the man page to read; if NULL, a GFile is built from path.
+         The caller keeps its own reference.
+  path:  filesystem path of the same man page (passed to man -R).
+  error: return location for a YelpError, or NULL.
+
+  If something goes wrong, we return NULL and set error to be a
+  YelpError describing the problem.  On success the caller owns the
+  returned stream.
+*/
+static GInputStream*
+get_man_utf8 (GFile *file, const gchar *path, GError **error)
+{
+    GConverter *decompressor, *charconv;
+    GFileInputStream *file_stream;
+    GInputStream *ret, *tmp;
+    GError *err = NULL;
+    const gchar *language, *encoding;
+    gint i;
+
+    if (man_db_installed_p ())
+        return man_db_utf8_recode (path, error);
+
+    /* Bad news: we've got to do it ourselves. */
+
+    /* FIXME: get the language */
+    language = "C";
+
+    /* default encoding if the language doesn't match below */
+    encoding = g_getenv ("MAN_ENCODING");
+    if (encoding == NULL)
+        encoding = "ISO-8859-1";
+
+    /* NOTE: while language is hard-coded to "C", which langmap lists,
+     * this lookup always picks ISO-8859-1 and MAN_ENCODING has no
+     * effect.  That changes once the FIXME above is resolved. */
+    for (i = 0; langmap[i].language != NULL; i++) {
+        if (g_str_equal (language, langmap[i].language)) {
+            encoding = langmap[i].encoding;
+            break;
+        }
+    }
+
+    /* Take our own reference so that exactly one unref balances it,
+     * whether the GFile came from the caller or from path. */
+    if (file != NULL)
+        g_object_ref (file);
+    else
+        file = g_file_new_for_path (path);
+
+    file_stream = g_file_read (file, NULL, &err);
+    g_object_unref (file);
+    if (!file_stream) {
+        g_set_error (error, YELP_ERROR, YELP_ERROR_NOT_FOUND,
+                     "%s", err->message);
+        g_error_free (err);
+        return NULL;
+    }
+
+    /* Chain converters if necessary with g_converter_input_stream_new
+       (example in gio/tests/filter-cat.c) */
+    decompressor = (GConverter *) yelp_magic_decompressor_new ();
+    ret = g_converter_input_stream_new ((GInputStream *) file_stream,
+                                        decompressor);
+    /* The converter stream holds its own references to both. */
+    g_object_unref (decompressor);
+    g_object_unref (file_stream);
+
+    if (!g_str_equal (encoding, "UTF-8")) {
+        charconv =
+            (GConverter *) g_charset_converter_new ("UTF-8", encoding,
+                                                    &err);
+        if (!charconv) {
+            g_set_error (error, YELP_ERROR, YELP_ERROR_UNKNOWN,
+                         "%s", err->message);
+            g_error_free (err);
+            g_object_unref (ret);
+            return NULL;
+        }
+
+        /* Wrap the decompressed stream; the wrapper keeps it alive. */
+        tmp = ret;
+        ret = g_converter_input_stream_new (tmp, charconv);
+        g_object_unref (charconv);
+        g_object_unref (tmp);
+    }
+
+    return ret;
+}
+
+xmlDocPtr
+yelp_man_parser_parse_file (YelpManParser *parser,
+ GFile *file,
+ const gchar *path,
+ GError **error)
+{
+ GInputStream *recoded_stream;
gchar *line;
gsize len;
- gfile = g_file_new_for_path (file);
- file_stream = g_file_read (gfile, NULL, NULL);
- converter = (GConverter *) yelp_magic_decompressor_new ();
- stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter);
- parser->stream = g_data_input_stream_new (stream);
+ recoded_stream = get_man_utf8 (file, path, error);
+ if (!recoded_stream) return NULL;
+
+ parser->stream = g_data_input_stream_new (recoded_stream);
parser->doc = xmlNewDoc (BAD_CAST "1.0");
parser->ins = xmlNewNode (NULL, BAD_CAST "Man");
- xmlDocSetRootElement (parser->doc, parser->ins);
+ xmlDocSetRootElement (parser->doc, parser->ins);
parser->make_links = TRUE;
- while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) {
- /* convert this line from the encoding indicated to UTF-8 */
- if (!g_str_equal (encoding, "UTF-8")) {
- GError *converr = NULL;
- gchar *new_buffer = NULL;
- gsize bytes_written = 0;
-
- /* We are making the
- * assumption that there are no partial characters at the end of this
- * string, and therefore can use calls like g_convert() which do not
- * preserve state - someone tell me if I'm wrong here */
- new_buffer = g_convert (parser->buffer, parser->length, "UTF-8",
- encoding, NULL, &bytes_written, &converr);
- if (converr != NULL) {
- g_print ("Error occurred converting %s to UTF-8: %s\n",
- encoding, converr->message);
- g_error_free (converr);
- break;
- } else if (parser->buffer == NULL) {
- g_print ("parser->buffer == NULL\n");
- break;
- }
+ while (1) {
+ parser->buffer =
+ g_data_input_stream_read_line (parser->stream,
+ &(parser->length),
+ NULL, NULL);
+ if (parser->buffer == NULL) break;
- g_free (parser->buffer);
- parser->buffer = new_buffer;
- parser->length = bytes_written;
- }
+ parser_parse_line (parser);
- parser_parse_line (parser);
-
- g_free (parser->buffer);
+ g_free (parser->buffer);
}
g_object_unref (parser->stream);
diff --git a/libyelp/yelp-man-parser.h b/libyelp/yelp-man-parser.h
index 1901f1b..369ad29 100644
--- a/libyelp/yelp-man-parser.h
+++ b/libyelp/yelp-man-parser.h
@@ -24,14 +24,16 @@
#define __YELP_MAN_PARSER_H__
#include <glib.h>
+#include <gio/gio.h>
#include <libxml/tree.h>
typedef struct _YelpManParser YelpManParser;
YelpManParser * yelp_man_parser_new (void);
xmlDocPtr yelp_man_parser_parse_file (YelpManParser *parser,
- gchar *file,
- const gchar *encoding);
+ GFile *file,
+ const gchar *path,
+ GError **error);
void yelp_man_parser_free (YelpManParser *parser);
#endif /* __YELP_MAN_PARSER_H__ */
--
1.7.2.3
Attachment:
pgp6PHU3YLuJr.pgp
Description: PGP signature