I realise that sending yet more patches on this subject makes me sound like a ten-year-old who found the sugar jar and is currently dancing around the room as hyper as a 4-manifold... But I've fixed a bug in the stuff I sent which caused the last line of the manpage to look rubbish. Also (and this took longer!), I wrestled with git cherry-pick long and hard until I got a "sanitised" revision history, which removes all references to logging stuff to "/home/rupert/test.xml" and so on. So I'm attaching this patch series here. Unfortunately, I realise two things: (1) this is going to come in at >100kB and so sit in the moderation queue; (2) as far as I can tell, "git am" and related commands don't deal with MIME messages very well. Sending the patch series as 8 emails via git send-email would fix both of these problems, but I'm worried it would irritate subscribers to the list. If I'm wrong, let me know and I'll do that in future! Rupert
From c942bc75e1a23b2cbd1c68e4dd3f436622c24597 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Thu, 16 Dec 2010 00:57:11 +0000 Subject: [PATCH 1/8] Initial implementation calling man -Z and parsing. Currently doesn't deal with 'special characters' (including umlauts and ... hyphens!) Also seems to fail with pretty simple formatting (see list of sections in man man) --- libyelp/yelp-man-document.c | 6 +- libyelp/yelp-man-parser.c | 2057 +++++++++---------------------------------- libyelp/yelp-man-parser.h | 4 +- stylesheets/man2html.xsl.in | 366 +------- 4 files changed, 470 insertions(+), 1963 deletions(-) diff --git a/libyelp/yelp-man-document.c b/libyelp/yelp-man-document.c index 14ac8cd..4fac05a 100644 --- a/libyelp/yelp-man-document.c +++ b/libyelp/yelp-man-document.c @@ -436,14 +436,10 @@ man_document_process (YelpManDocument *man) } parser = yelp_man_parser_new (); - priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, encoding); + priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, &error); yelp_man_parser_free (parser); if (priv->xmldoc == NULL) { - error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, - _("The file ‘%s’ could not be parsed because it is" - " not a well-formed man page."), - filepath); yelp_document_error_pending ((YelpDocument *) man, error); } diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index 49efe9f..bceb465 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -27,1796 +27,581 @@ #include <glib.h> #include <glib/gi18n.h> #include <libxml/tree.h> +#include <gio/gio.h> #include <string.h> +#include <math.h> +#include "yelp-error.h" #include "yelp-man-parser.h" -#include "yelp-magic-decompressor.h" - -#define PARSER_CUR (g_utf8_get_char (parser->cur) != '\0' \ - && (parser->cur - parser->buffer < parser->length)) - -static void parser_parse_line (YelpManParser *parser); -static void parser_handle_linetag (YelpManParser *parser); -static void 
parser_ensure_P (YelpManParser *parser); -static void parser_read_until (YelpManParser *parser, - gchar delim); -static void parser_escape_tags (YelpManParser *parser, - gchar **tags, - gint ntags); -static xmlNodePtr parser_append_text (YelpManParser *parser); -static xmlNodePtr parser_append_given_text (YelpManParser *parser, - gchar *text); -static void parser_append_given_text_handle_escapes - (YelpManParser *parser, - gchar *text, - gboolean make_links); -static xmlNodePtr parser_append_node (YelpManParser *parser, - gchar *name); -static xmlNodePtr parser_append_node_attr (YelpManParser *parser, - gchar *name, - gchar *attr, - gchar *value); -static void parser_stack_push_node (YelpManParser *parser, - xmlNodePtr node); -static xmlNodePtr parser_stack_pop_node (YelpManParser *parser, - gchar *name); -static void parser_parse_table (YelpManParser *parser); - -typedef struct _StackElem StackElem; -struct _YelpManParser { - xmlDocPtr doc; /* The top-level XML document */ - xmlNodePtr ins; /* The insertion node */ - xmlNodePtr th_node; /* The TH node, or NULL if it doesn't exist */ - GDataInputStream *stream; /* The GIO input stream to read from */ - gchar *buffer; /* The buffer, line at a time */ - gsize length; /* The buffer length */ +#define MAN_FONTS 8 - gchar *anc; /* The anchor point in the document */ - gchar *cur; /* Our current position in the document */ - - gchar *token; /* see ignore flag; we ignore the parsing stream until - * this string is found in the stream */ - gboolean make_links; /* Allow auto-generated hyperlinks to be disabled. */ - gboolean ignore; /* when true, ignore stream until "token" is found */ - - GSList *nodeStack; -}; - -YelpManParser * -yelp_man_parser_new (void) +/* The format has two copies of the title like MAN(1) at the top, + * possibly with a string of text in between for the collection. + * + * Start with the parser on START, then HAVE_TITLE when we've read the + * first word with parentheses. 
At that point, stick new words into + * the "collection" tag. Then finally switch to BODY when we've seen + * the second copy of the one with parentheses. + */ +typedef enum ManParserState { - YelpManParser *parser = g_new0 (YelpManParser, 1); + START, + HAVE_TITLE, + BODY +} ManParserState; - return parser; -} - -xmlDocPtr -yelp_man_parser_parse_file (YelpManParser *parser, - gchar *file, - const gchar *encoding) +/* See parse_body_text for how this is used. */ +typedef enum ManParserSectionState { - GFile *gfile; - GConverter *converter; - GFileInputStream *file_stream; - GInputStream *stream; - gchar *line; - gsize len; - - gfile = g_file_new_for_path (file); - file_stream = g_file_read (gfile, NULL, NULL); - converter = (GConverter *) yelp_magic_decompressor_new (); - stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter); - parser->stream = g_data_input_stream_new (stream); - - parser->doc = xmlNewDoc (BAD_CAST "1.0"); - parser->ins = xmlNewNode (NULL, BAD_CAST "Man"); - xmlDocSetRootElement (parser->doc, parser->ins); - - parser->make_links = TRUE; - - while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) { - /* convert this line from the encoding indicated to UTF-8 */ - if (!g_str_equal (encoding, "UTF-8")) { - GError *converr = NULL; - gchar *new_buffer = NULL; - gsize bytes_written = 0; - - /* We are making the - * assumption that there are no partial characters at the end of this - * string, and therefore can use calls like g_convert() which do not - * preserve state - someone tell me if I'm wrong here */ - new_buffer = g_convert (parser->buffer, parser->length, "UTF-8", - encoding, NULL, &bytes_written, &converr); - if (converr != NULL) { - g_print ("Error occurred converting %s to UTF-8: %s\n", - encoding, converr->message); - g_error_free (converr); - break; - } else if (parser->buffer == NULL) { - g_print ("parser->buffer == NULL\n"); - break; - } - - g_free 
(parser->buffer); - parser->buffer = new_buffer; - parser->length = bytes_written; - } - - parser_parse_line (parser); - - g_free (parser->buffer); - } + SECTION_TITLE, + SECTION_BODY +} ManParserSectionState; - g_object_unref (parser->stream); +struct _YelpManParser { + xmlDocPtr doc; /* The top-level XML document */ + xmlNodePtr header; /* The header node */ + xmlNodePtr section_node; /* The current section */ + xmlNodePtr sheet_node; /* The current sheet */ - return parser->doc; -} + GDataInputStream *stream; /* The GIO input stream to read from */ + gchar *buffer; /* The buffer, line at a time */ + gsize length; /* The buffer length */ -void -yelp_man_parser_free (YelpManParser *parser) -{ - g_free (parser); -} + /* The width and height of a character according to troff. */ + guint char_width; + guint char_height; -/******************************************************************************/ + /* Count the number of lines we've parsed (needed to get prologue) */ + guint lines_parsed; -static void -parser_parse_line (YelpManParser *parser) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - /* check to see if we are ignoring input */ - if (parser->ignore) { - gchar *ptr; - /* needs to be utf-8 compatible */ - ptr = strstr (parser->buffer, parser->token); - if (ptr != NULL) { - while (PARSER_CUR) { - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } - g_free (parser->token); - parser->ignore = FALSE; - } else { - /* return to get another line of input */ - return; - } - } else { - switch (*(parser->buffer)) { - case '.': - parser_handle_linetag (parser); - /* we are ignoring everything until parser->token, - * so return and get next line */ - if (parser->ignore) - return; - break; - case '\0': - parser->ins = xmlDocGetRootElement (parser->doc); - break; - case '\'': - parser->cur = parser->buffer + parser->length - 1; - parser->anc = parser->cur; - default: - break; - } - } - - parser_read_until (parser, '\0'); - 
- if (parser->cur != parser->anc) - parser_append_text (parser); - - if (PARSER_CUR) { - parser->cur = g_utf8_next_char (parser->cur); - parser_append_text (parser); - } -} + /* The x f k name command sets the k'th register to be name. */ + gchar* font_registers[MAN_FONTS]; -/* creates a single string from all the macro arguments */ -static gchar * -args_concat_all (GSList *args) -{ - GSList *ptr = NULL; - gchar **str_array = NULL; - gchar *retval = NULL; - gint i = 0; - - if (!args) - return NULL; + /* The current font. Should be the index of one of the + * font_registers. Starts at 0 (of course!) + */ + guint current_font; - str_array = g_malloc0 ((sizeof (gchar *)) * (g_slist_length (args)+1) ); + /* See description of ManParserState above */ + ManParserState state; - ptr = args; - while (ptr && ptr->data) { - str_array[i++] = ptr->data; - ptr = g_slist_next (ptr); - } - - str_array[i] = NULL; + /* Vertical and horizontal position as far as the troff output is + * concerned. (Measured from top-left). + */ + guint vpos, hpos; - retval = g_strjoinv (" ", str_array); + /* Text accumulator (needed since it comes through in dribs & + * drabs...) */ + GString *accumulator; - g_free (str_array); + /* See parse_body_text for how this is used. */ + ManParserSectionState section_state; - return retval; -} + /* The indent of the current sheet */ + guint sheet_indent; -/* handler to ignore a macro by reading until the null character */ -static void -macro_ignore_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - while (PARSER_CUR) { - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } -} + /* Set to TRUE if there's been a newline since the last text was + * parsed. 
*/ + gboolean newline; +}; -static void -macro_bold_small_italic_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - gchar *str = NULL; +static gboolean parser_parse_line (YelpManParser *parser, GError **error); +static gboolean parse_prologue_line (YelpManParser *parser, GError **error); + +/* Parsers for different types of line */ +typedef gboolean (*LineParser)(YelpManParser *, GError **); +#define DECLARE_LINE_PARSER(name) \ + static gboolean (name) (YelpManParser *parser, GError **error); + +DECLARE_LINE_PARSER (parse_xf); +DECLARE_LINE_PARSER (parse_f); +DECLARE_LINE_PARSER (parse_V); +DECLARE_LINE_PARSER (parse_H); +DECLARE_LINE_PARSER (parse_v); +DECLARE_LINE_PARSER (parse_h); +DECLARE_LINE_PARSER (parse_text); +DECLARE_LINE_PARSER (parse_w); +DECLARE_LINE_PARSER (parse_body_text); +DECLARE_LINE_PARSER (parse_n); + +/* Declare a sort of alist registry of parsers for different lines. */ +struct LineParsePair +{ + const gchar *prefix; + LineParser handler; +}; +static struct LineParsePair line_parsers[] = { + { "x f", parse_xf }, { "f", parse_f }, + { "V", parse_V }, { "H", parse_H }, + { "v", parse_v }, { "h", parse_h }, + { "t", parse_text }, + { "w", parse_w }, + { "n", parse_n }, + { NULL, NULL } +}; - parser_ensure_P (parser); - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - str = args_concat_all (args); - parser_append_given_text_handle_escapes (parser, str, TRUE); - g_free (str); - } - - parser->ins = parser->ins->parent; -} +/******************************************************************************/ +/* Parser helper functions (managing the state of the various parsing + * bits) */ +static void finish_span (YelpManParser *parser); -static void -macro_roman_bold_small_italic_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - GSList *ptr = NULL; - gchar a[2], b[2]; - gboolean toggle = TRUE; - - a[0] = macro[0]; - b[0] = macro[1]; - a[1] = b[1] = '\0'; - - parser_ensure_P (parser); - - 
ptr = args; - while (ptr && ptr->data) { - if (toggle) - parser->ins = parser_append_node (parser, a); - else - parser->ins = parser_append_node (parser, b); - - if (ptr->next) { - gchar *tmp = ptr->next->data; - - if (tmp[0] == '(' && g_ascii_isdigit (tmp[1]) && - (tmp[2] == ')' || (g_ascii_isalpha (tmp[2]) && tmp[3] == ')'))) { - tmp = g_strconcat (ptr->data, " ", tmp, NULL); - parser_append_given_text_handle_escapes (parser, tmp, TRUE); - g_free (tmp); - parser->ins = parser->ins->parent; - ptr = ptr->next->next; - continue; - } - } - - parser_append_given_text_handle_escapes (parser, ptr->data, TRUE); - parser->ins = parser->ins->parent; - - toggle = (toggle) ? 0 : 1; - ptr = g_slist_next (ptr); - } -} +/******************************************************************************/ -static void -macro_new_paragraph_handler (YelpManParser *parser, gchar *macro, GSList *args) +YelpManParser * +yelp_man_parser_new (void) { - xmlNodePtr tmpNode; - - /* Clean up from 'lists'. If this is null we don't care. */ - tmpNode = parser_stack_pop_node (parser, "IP"); - - tmpNode = parser_stack_pop_node (parser, "P"); - if (tmpNode != NULL) { - parser->ins = tmpNode->parent; - } - - parser_ensure_P (parser); + YelpManParser *parser = g_new0 (YelpManParser, 1); + parser->accumulator = g_string_sized_new (1024); + return parser; } -static void -macro_insert_self_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - parser_append_node (parser, macro); -} +/* + This function is responsible for taking a path to a man file and + returning something in the groff intermediate output format for us + to use. -static void -macro_title_header_handler (YelpManParser *parser, gchar *macro, GSList *args) + If something goes wrong, we return NULL and set error to be a + YelpError describing the problem. 
+*/ +static GInputStream* +get_troff (gchar *path, GError **error) { - GSList *ptr = NULL; - gchar *fields[5] = { "Title", "Section", "Date", "Commentary", "Name" }; - gint i; - - parser->ins = parser_append_node (parser, macro); - - ptr = args; - for (i=0; i < 5; i++) { - if (ptr && ptr->data) { - parser->ins = parser_append_node (parser, fields[i]); - parser_append_given_text_handle_escapes (parser, ptr->data, FALSE); - parser->ins = parser->ins->parent; - ptr = g_slist_next (ptr); - } else - break; - } + gint stdout; + GError *err = NULL; + gchar *argv[] = { "man", "-Z", "-Tutf8", NULL, NULL }; - parser->ins = parser->ins->parent; -} + argv[3] = path; -static void -macro_section_header_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - static gint id = 0; - GIOStatus retval; - GError *error = NULL; - gchar *str = NULL; - gchar *macro_uc = g_strdup (macro); - gchar *ptr; - gchar idval[20]; - - if (!args) { - str = g_data_input_stream_read_line (parser->stream, NULL, NULL, &error); - if (error) { - g_warning ("%s\n", error->message); - g_error_free (error); - } + if (!g_spawn_async_with_pipes (NULL, argv, NULL, + G_SPAWN_SEARCH_PATH, NULL, NULL, + NULL, NULL, &stdout, NULL, &err)) { + /* We failed to run the man program. Return a "Huh?" error. 
*/ + *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN, + err->message); + g_error_free (err); + return NULL; } - else - str = args_concat_all (args); - - for (ptr = macro_uc; *ptr != '\0'; ptr++) - /* FIXME: utf-8 */ - *ptr = g_ascii_toupper (*ptr); - - parser_stack_pop_node (parser, "IP"); - g_snprintf (idval, 20, "%d", ++id); - - /* Sections should be their own, well, section */ - parser->ins = xmlDocGetRootElement (parser->doc); - parser->ins = parser_append_node_attr (parser, macro_uc, "id", idval); - parser_append_given_text_handle_escapes (parser, str, FALSE); - parser->ins = parser->ins->parent; - - if (str) - g_free (str); + return (GInputStream*) g_unix_input_stream_new (stdout, TRUE); } -static void -macro_spacing_handler (YelpManParser *parser, gchar *macro, GSList *args) +xmlDocPtr +yelp_man_parser_parse_file (YelpManParser *parser, + gchar *path, + GError **error) { - parser->ins = parser_append_node (parser, macro); + GInputStream *troff_stream; + gchar *line; + gsize len; + gboolean ret; + xmlNodePtr root; - if (args && args->data) { - parser->ins = parser_append_node (parser, "Count"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } + troff_stream = get_troff (path, error); + if (!troff_stream) return NULL; - parser->ins = parser->ins->parent; -} - -/* this is used to define or redefine a macro until ".." - * is reached. 
*/ -static void -macro_define_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - parser->ignore = TRUE; - parser->token = g_strdup(".."); -} - -static void -macro_tp_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode = NULL; - GError **errormsg = NULL; - - tmpNode = parser_stack_pop_node (parser, "IP"); + parser->stream = g_data_input_stream_new (troff_stream); - if (tmpNode != NULL) - parser->ins = tmpNode->parent; + parser->doc = xmlNewDoc (BAD_CAST "1.0"); + root = xmlNewNode (NULL, BAD_CAST "Man"); + xmlDocSetRootElement (parser->doc, root); - parser->ins = parser_append_node (parser, "IP"); + parser->header = xmlNewNode (NULL, BAD_CAST "header"); + xmlAddChild (root, parser->header); - if (args && args->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } + while (1) { + parser->buffer = + g_data_input_stream_read_line (parser->stream, + &(parser->length), + NULL, NULL); + if (parser->buffer == NULL) break; - g_free (parser->buffer); + ret = parser_parse_line (parser, error); - parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); - if (parser->buffer != NULL) { - parser->ins = parser_append_node (parser, "Tag"); - parser_parse_line (parser); - parser->ins = parser->ins->parent; - } + g_free (parser->buffer); - parser_stack_push_node (parser, parser->ins); -} - -static void -macro_ip_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode; - - tmpNode = parser_stack_pop_node (parser, "IP"); - - if (tmpNode != NULL) - parser->ins = tmpNode->parent; - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Tag"); - parser_append_given_text_handle_escapes (parser, args->data, TRUE); - parser->ins = parser->ins->parent; - - if (args->next && args->next->data) { - parser->ins = 
parser_append_node (parser, "Indent"); - parser_append_given_text_handle_escapes (parser, args->next->data, TRUE); - parser->ins = parser->ins->parent; - } + if (!ret) { + xmlFreeDoc (parser->doc); + parser->doc = NULL; + break; + } } - parser_stack_push_node (parser, parser->ins); -} - -static void -macro_hanging_paragraph_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - parser_stack_pop_node (parser, "IP"); - - parser->ins = parser_append_node (parser, macro); + g_object_unref (parser->stream); - if (args && args->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } + return parser->doc; } -static xmlNodePtr -create_th_node (YelpManParser *parser) +void +yelp_man_parser_free (YelpManParser *parser) { - /* Create a TH node if we don't have one already */ - if (!parser->th_node) { - parser->th_node = parser_append_node (parser, "TH"); + guint k; + if (parser) { + for (k=0; k<MAN_FONTS; k++) + g_free (parser->font_registers[k]); } - return parser->th_node; + g_string_free (parser->accumulator, TRUE); + g_free (parser); } +/******************************************************************************/ + +/* Sets the k'th font register to be name. Copies name, so free it + * afterwards. k should be in [0,MAN_FONTS). It seems that man always + * gives us ones at least 1, but groff_out(5) says non-negative. 
+ */ static void -macro_title_handler (YelpManParser *parser, gchar *macro, GSList *args) +set_font_register (YelpManParser *parser, guint k, const gchar* name) { - gchar *str = NULL; - - parser->ins = create_th_node (parser); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Title"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; + if (k > MAN_FONTS) { + g_warning ("Tried to set nonexistant font register %d to %s", + k, name); + return; } - - if (args && args->next && args->next->data) { - parser->ins = parser_append_node (parser, "Section"); - parser_append_given_text (parser, args->next->data); - } - parser->ins = parser->th_node->parent; + g_free (parser->font_registers[k]); + parser->font_registers[k] = g_strdup (name); } -static void -macro_os_handler (YelpManParser *parser, gchar *macro, GSList *args) +static const gchar* +get_font (const YelpManParser *parser) { - gchar *str = NULL; - xmlNodePtr new_ins = parser->ins; - - parser->ins = create_th_node (parser); + guint k = parser->current_font; + if (k > MAN_FONTS || + parser->font_registers[k] == NULL) { + + g_warning ("Tried to get nonexistant font register %d", k); - if (args && args->data) { - parser->ins = parser_append_node (parser, "Os"); - parser_append_given_text (parser, args->data); + return ""; } - parser->ins = parser->th_node->parent; + return parser->font_registers[k]; } -static void -macro_date_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - gchar *str = NULL; - - parser->ins = create_th_node (parser); - - if (args && args->data) { - - str = args_concat_all (args); - - parser->ins = parser_append_node (parser, "Date"); - parser_append_given_text (parser, str); +/******************************************************************************/ - g_free (str); - } +/* + Convenience macros to scan a string, checking for the correct number + of things read. 
- parser->ins = parser->th_node->parent; -} + Also to raise an error. Add an %s to the end of the format string, + which automatically gets given parser->buffer. + */ +#define SSCANF(fmt,num,...) \ + (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num)) +#define PARSE_ERROR(...) \ + g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, \ + __VA_ARGS__, parser->buffer) +#define RAISE_PARSE_ERROR(...) \ + { *error = PARSE_ERROR (__VA_ARGS__); return FALSE; } -static void -macro_url_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parser_parse_line (YelpManParser *parser, GError **error) { - xmlNodePtr tmpNode = NULL; - - if (g_str_equal (macro, "UR")) { - /* If someone wants to do automatic hyperlink wizardry outside - * for the parser, then this should instead generate a tag. - */ - if (args && args->data) { - if (g_str_equal (args->data, ":")) - parser->make_links = FALSE; - else { - parser->ins = parser_append_node (parser, macro); - - parser_stack_push_node (parser, parser->ins); - - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } - } - } - else if (g_str_equal (macro, "UE")) { - - if (parser->make_links) { - tmpNode = parser_stack_pop_node (parser, "UR"); - - if (tmpNode == NULL) - g_warning ("Found unexpected tag: '%s'\n", macro); - else - parser->ins = tmpNode->parent; - } else - parser->make_links = TRUE; - - } - else if (g_str_equal (macro, "UN")) { - - if (args && args->data) { - parser->ins = parser_append_node (parser, macro); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } - + if (parser->lines_parsed < 3) + return parse_prologue_line (parser, error); + + const struct LineParsePair *p = line_parsers; + while (p->handler != NULL) { + if (g_str_has_prefix (parser->buffer, p->prefix)) { + return p->handler(parser, error); + } + p++; } + return TRUE; } -/* relative margin indent; FIXME: this takes a 
parameter that tells - * how many indents to do, which needs to be implemented to fix - * some man page formatting options */ -/*static void -macro_rs_re_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_prologue_line (YelpManParser *parser, GError **error) { - xmlNodePtr tmpNode; - - if (g_str_equal (macro, "RS")) { - parser->ins = parser_append_node (parser, macro); + parser->lines_parsed++; + if (parser->lines_parsed != 2) return TRUE; - parser_stack_push_node (parser, parser->ins); + /* This is the interesting line, which should look like + x res 240 24 40 + The interesting bits are the 24 and the 40, which are the + width and height of a character as far as -Tutf8 is + concerned. + */ + if (SSCANF ("x %*s %*u %u %u", 2, + &parser->char_width, &parser->char_height)) { + RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s"); + } - if (args && args->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } - } - else if (g_str_equal (macro, "RE")) { - parser_stack_pop_node (parser, "IP"); + return TRUE; +} - tmpNode = parser_stack_pop_node (parser, "RS"); +static gboolean +parse_xf (YelpManParser *parser, GError **error) +{ + gchar name[10]; + guint k; - if (tmpNode == NULL) - d (g_warning ("Found unexpected tag: '%s'\n", macro)); - else - parser->ins = tmpNode->parent; + if (SSCANF ("x f%*s %u %10s", 2, &k, name)) { + RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s"); } -}*/ + set_font_register (parser, k, name); + return TRUE; +} -static void -macro_mandoc_list_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_f (YelpManParser *parser, GError **error) { - xmlNodePtr tmpNode; - - if (g_str_equal (macro, "Bl")) { - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - gchar *listtype = (gchar *)args->data; - - if (g_str_equal (listtype, "-hang") || - g_str_equal 
(listtype, "-ohang") || - g_str_equal (listtype, "-tag") || - g_str_equal (listtype, "-diag") || - g_str_equal (listtype, "-inset") - ) { - listtype++; - xmlNewProp (parser->ins, BAD_CAST "listtype", - BAD_CAST listtype); - /* TODO: check for -width, -offset, -compact */ - } else if (g_str_equal (listtype, "-column")) { - /* TODO: support this */; - } else if (g_str_equal (listtype, "-item") || - g_str_equal (listtype, "-bullet") || - g_str_equal (listtype, "-hyphen") || - g_str_equal (listtype, "-dash") - ) { - listtype++; - xmlNewProp (parser->ins, BAD_CAST "listtype", - BAD_CAST listtype); - /* TODO: check for -offset, -compact */ - } - } - - parser_stack_push_node (parser, parser->ins); + guint k; + if (SSCANF ("f%u", 1, &k)) { + RAISE_PARSE_ERROR ("Invalid font line from troff: %s"); } - else if (g_str_equal (macro, "El")) { - - tmpNode = parser_stack_pop_node (parser, "It"); + finish_span (parser); - if (tmpNode != NULL) - parser->ins = tmpNode->parent; + parser->current_font = k; - tmpNode = parser_stack_pop_node (parser, "Bl"); - - if (tmpNode == NULL) - g_warning ("Found unexpected tag: '%s'\n", macro); - else - parser->ins = tmpNode->parent; - } + return TRUE; } -static void -macro_verbatim_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_v (YelpManParser *parser, GError **error) { - xmlNodePtr tmpNode; - - if (g_str_equal (macro, "nf") || g_str_equal (macro, "Vb")) { - parser->ins = parser_append_node (parser, "Verbatim"); - parser_stack_push_node (parser, parser->ins); - } - else if (g_str_equal (macro, "fi") || g_str_equal (macro, "Ve")) { - tmpNode = parser_stack_pop_node (parser, "Verbatim"); - - if (tmpNode == NULL) - g_warning ("Found unexpected tag: '%s'\n", macro); - else - parser->ins = tmpNode->parent; + guint dy; + if (SSCANF ("v%u", 1, &dy)) { + RAISE_PARSE_ERROR ("Invalid v line from troff: %s"); } + parser->vpos += dy; + return TRUE; } -static void -macro_reference_handler (YelpManParser *parser, gchar 
*macro, GSList *args) +static gboolean +parse_h (YelpManParser *parser, GError **error) { - if (g_str_equal (macro, "so")) { - gchar *basename = NULL; - gchar *link = NULL; - - if (args && args->data) { - basename = g_strrstr((const gchar *)args->data, "/"); - - if (basename) { - basename++; - link = g_strdup_printf ("man:%s", basename); - } else { - link = g_strdup_printf ("man:%s", (const gchar *)args->data); - basename = (gchar *)args->data; - } - - parser->ins = create_th_node (parser); - parser->ins = parser_append_node (parser, "Title"); - parser_append_given_text (parser, "REFERENCE"); - parser->ins = parser->ins->parent; - parser->ins = parser->ins->parent; - - parser->ins = parser_append_node_attr (parser, "SH", "id", "9999"); - parser_append_given_text (parser, "REFERENCE"); - parser->ins = parser->ins->parent; - - parser_append_given_text (parser, "See "); - parser->ins = parser_append_node (parser, "UR"); - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, link); - parser->ins = parser->ins->parent; - parser_append_given_text (parser, basename); - parser->ins = parser->ins->parent; - - g_free (link); - } + guint dx; + if (SSCANF ("h%u", 1, &dx)) { + RAISE_PARSE_ERROR ("Invalid h line from troff: %s"); } + parser->hpos += dx; + return TRUE; } - -/* many mandoc macros have their arguments parsed so that other - * macros can be called to operate on their arguments. 
This table - * indicates which macros are _parsed_ for other callable macros, - * and which are _callable_ from other macros: see mdoc(7) for more - * details - */ - -#define MANDOC_NONE 0x01 -#define MANDOC_PARSED 0x01 -#define MANDOC_CALLABLE 0x02 - -struct MandocMacro { - gchar *macro; - gint flags; -}; - -static struct MandocMacro manual_macros[] = { - { "Ad", MANDOC_PARSED | MANDOC_CALLABLE }, - { "An", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ar", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Cd", MANDOC_NONE }, - { "Cm", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Dv", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Er", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ev", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fa", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fd", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fl", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fn", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ic", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Li", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Nd", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Nm", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Op", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ot", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Pa", MANDOC_PARSED | MANDOC_CALLABLE }, - { "St", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Tn", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Va", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Vt", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Xr", MANDOC_PARSED | MANDOC_CALLABLE }, - { NULL, MANDOC_NONE } -}; static gboolean -is_mandoc_manual_macro_parsed (gchar *macro) +parse_V (YelpManParser *parser, GError **error) { - gint i; - - for (i=0; manual_macros[i].macro != NULL; i++) { - if (g_str_equal (macro, manual_macros[i].macro) && - (manual_macros[i].flags & MANDOC_PARSED) == MANDOC_PARSED - ) { - return TRUE; - } + guint y; + if (SSCANF ("V%u", 1, &y)) { + RAISE_PARSE_ERROR ("Invalid V line from troff: %s"); } - - return FALSE; + parser->vpos = y; + return TRUE; } static gboolean -is_mandoc_manual_macro_callable (gchar *macro) +parse_H 
(YelpManParser *parser, GError **error) { - gint i; - - for (i=0; manual_macros[i].macro != NULL; i++) { - if (g_str_equal (macro, manual_macros[i].macro) && - (manual_macros[i].flags & MANDOC_CALLABLE) == MANDOC_CALLABLE - ) { - return TRUE; - } + guint x; + if (SSCANF ("H%u", 1, &x)) { + RAISE_PARSE_ERROR ("Invalid H line from troff: %s"); } - - return FALSE; + parser->hpos = x; + return TRUE; } -static void -macro_mandoc_utility_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_text (YelpManParser *parser, GError **error) { - GSList *ptr = NULL; - gchar *str = NULL; - gchar *manpage, *uri; + gchar *text, *section, *tmp; + xmlNodePtr node; - g_return_if_fail (macro != NULL); - - if (is_mandoc_manual_macro_parsed (macro)) { - parser->ins = parser_append_node (parser, macro); - - ptr = args; - while (ptr && ptr->data) { - if (is_mandoc_manual_macro_callable ((gchar *)ptr->data)) { - macro_mandoc_utility_handler (parser, (gchar *)ptr->data, ptr->next); - break; - } else { - parser_append_given_text_handle_escapes (parser, (gchar *)ptr->data, TRUE); - } - ptr = ptr->next; - if (ptr && ptr->data) - parser_append_given_text (parser, " "); - } - - parser->ins = parser->ins->parent; - } else { - parser->ins = parser_append_node (parser, macro); - str = args_concat_all (args); - parser->ins = parser->ins->parent; - - g_free (str); - } + g_assert (parser->buffer[0] == 't'); - return; - - if (g_str_equal (macro, "Op")) { - - } else if (g_str_equal (macro, "Nm")) { - - if (str) { - parser_ensure_P (parser); - - parser->ins = parser_append_node (parser, "B"); - parser_append_given_text_handle_escapes (parser, str, TRUE); - parser->ins = parser->ins->parent; - } - } - else if (g_str_equal (macro, "Nd")) { - - if (str) { - parser_append_given_text (parser, " -- "); - parser_append_given_text_handle_escapes (parser, str, TRUE); - } - } - else if (g_str_equal (macro, "Xr")) { - - if (args && args->data && args->next && args->next->data) { - - 
manpage = g_strdup_printf ("%s(%s)", (gchar *)args->data, (gchar *)args->next->data); - uri = g_strdup_printf ("man:%s", manpage); - - parser_ensure_P (parser); - - parser->ins = parser_append_node (parser, "UR"); - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, uri); - parser->ins = parser->ins->parent; - parser_append_given_text (parser, manpage); - parser->ins = parser->ins->parent; - - ptr = args->next->next; - - while (ptr && ptr->data) { - parser_append_given_text (parser, ptr->data); - ptr = g_slist_next (ptr); - } - - g_free (uri); - g_free (manpage); - } - } + if (parser->state == START) { + /* With a bit of luck, this will be the tBLAH(1) line. Can't + * use sscanf to chop it up since that needs whitespace. */ + section = strchr (parser->buffer + 1, '('); + if (!section) + RAISE_PARSE_ERROR ("Expected t line with title. Got %s"); + text = g_strndup (parser->buffer + 1, + section - (parser->buffer + 1)); - g_free (str); -} + // Skip over the ( + section++; -static void -macro_mandoc_listitem_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - GSList *ptr = NULL; - xmlNodePtr tmpNode; - - tmpNode = parser_stack_pop_node (parser, "It"); - - if (tmpNode != NULL) - parser->ins = tmpNode->parent; - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "ItTag"); - - ptr = args; - while (ptr && ptr->data) { - if (is_mandoc_manual_macro_callable ((gchar *)ptr->data)) { - macro_mandoc_utility_handler (parser, (gchar *)ptr->data, ptr->next); - break; - } else { - parser_append_given_text (parser, (gchar *)ptr->data); - } - ptr = ptr->next; - if (ptr && ptr->data) - parser_append_given_text (parser, " "); - } - - parser->ins = parser->ins->parent; - } + tmp = strchr (section, ')'); + if (!tmp || (*(tmp+1) != '\0')) + RAISE_PARSE_ERROR ("Strange format for t title line: %s"); + section = g_strndup (section, tmp - section); - 
parser_stack_push_node (parser, parser->ins); -} + parser->state = HAVE_TITLE; -/* the handler functions for each macro all have this form: - * - the calling function, parser_handle_linetag owns the "macro", and "args" - * parameters, so do not free them. - */ -typedef void (*MacroFunc)(YelpManParser *parser, gchar *macro, GSList *args); + xmlNewTextChild (parser->header, + NULL, BAD_CAST "title", text); + xmlNewTextChild (parser->header, + NULL, BAD_CAST "section", section); -struct MacroHandler { - gchar *macro; - MacroFunc handler; -}; + g_free (text); + g_free (section); -/* We are calling all of these macros, when in reality some of them are - * requests (lowercase, defined by groff system), and some of them are - * macros (varying case, defined by man/mdoc/ms/tbl extensions) - * - * A great resource to figure out what each of these does is the groff - * info page. Also groff(7), man(7), and mdoc(7) are useful as well. - */ -static struct MacroHandler macro_handlers[] = { - { "\\\"", macro_ignore_handler }, /* groff: comment */ - { "ad", macro_ignore_handler }, /* groff: set adjusting mode */ - { "Ad", macro_mandoc_utility_handler }, /* mandoc: Address */ - { "An", macro_mandoc_utility_handler }, /* mandoc: Author name */ - { "Ar", macro_mandoc_utility_handler }, /* mandoc: Command line argument */ - { "B", macro_bold_small_italic_handler }, /* man: set bold font */ - { "Bd", macro_ignore_handler }, /* mandoc: Begin-display block */ - { "BI", macro_roman_bold_small_italic_handler }, /* man: bold italic font */ - { "Bl", macro_mandoc_list_handler }, /* mandoc: begin list */ - { "bp", macro_ignore_handler }, /* groff: break page */ - { "br", macro_insert_self_handler }, /* groff: line break */ - { "BR", macro_roman_bold_small_italic_handler }, /* man: set bold roman font */ - { "Cd", macro_mandoc_utility_handler }, /* mandoc: Configuration declaration */ - { "Cm", macro_mandoc_utility_handler }, /* mandoc: Command line argument modifier */ - { "ce", 
macro_ignore_handler }, /* groff: center text */ - { "Dd", macro_date_handler }, /* mandoc: Document date */ - { "de", macro_define_handler }, /* groff: define macro */ - { "ds", macro_ignore_handler }, /* groff: define string variable */ - { "D1", macro_ignore_handler }, /* mandoc: Indent and display one text line */ - { "Dl", macro_ignore_handler }, /* mandoc: Indent and display one line of literal text */ - { "Dt", macro_title_handler }, /* mandoc: Document title */ - { "Dv", macro_mandoc_utility_handler }, /* mandoc: Defined variable */ - { "Ed", macro_ignore_handler }, /* mandoc: End-display block */ - { "El", macro_mandoc_list_handler }, /* mandoc: end list */ - { "Er", macro_mandoc_utility_handler }, /* mandoc: Error number */ - { "Ev", macro_mandoc_utility_handler }, /* mandoc: Environment variable */ - { "Fa", macro_mandoc_utility_handler }, /* mandoc: Function argument */ - { "Fd", macro_mandoc_utility_handler }, /* mandoc: Function declaration */ - { "fi", macro_verbatim_handler }, /* groff: activate fill mode */ - { "Fl", macro_mandoc_utility_handler }, /* mandoc: ? */ - { "Fn", macro_mandoc_utility_handler }, /* mandoc: Function call */ - { "ft", macro_ignore_handler }, /* groff: change font */ - { "HP", macro_hanging_paragraph_handler }, /* man: paragraph with hanging left indentation */ - { "hy", macro_ignore_handler }, /* groff: enable hyphenation */ - { "I", macro_bold_small_italic_handler }, /* man: set italic font */ - { "Ic", macro_mandoc_utility_handler }, /* mandoc: Interactive Command */ - { "ie", macro_ignore_handler }, /* groff: else portion of if-else */ - { "if", macro_ignore_handler }, /* groff: if statement */ - { "ig", macro_ignore_handler }, /* groff: comment until '..' or '.END' */ - { "ih", macro_ignore_handler }, /* ? 
*/ - { "IX", macro_ignore_handler }, /* ms: print index to stderr */ - { "IB", macro_roman_bold_small_italic_handler }, /* man: set italic bold font */ - { "IP", macro_ip_handler }, /* man: indented paragraph */ - { "IR", macro_roman_bold_small_italic_handler }, /* man: set italic roman font */ - { "It", macro_mandoc_listitem_handler }, /* mandoc: item in list */ - { "Li", macro_mandoc_utility_handler }, /* mandoc: Literal text */ - { "LP", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "na", macro_ignore_handler }, /* groff: disable adjusting */ - { "Nd", macro_mandoc_utility_handler }, /* mandoc: description of utility/program */ - { "ne", macro_ignore_handler }, /* groff: force space at bottom of page */ - { "nf", macro_verbatim_handler }, /* groff: no fill mode */ - { "nh", macro_ignore_handler }, /* groff: disable hyphenation */ - { "Nd", macro_mandoc_utility_handler }, /* mandoc: ? */ - { "Nm", macro_mandoc_utility_handler }, /* mandoc: Command/utility/program name*/ - { "Op", macro_mandoc_utility_handler }, /* mandoc: Option */ - { "Os", macro_os_handler }, /* mandoc: Operating System */ - { "Ot", macro_mandoc_utility_handler }, /* mandoc: Old style function type (Fortran) */ - { "P", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "Pa", macro_mandoc_utility_handler }, /* mandoc: Pathname or filename */ - { "PP", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "Pp", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "ps", macro_ignore_handler }, /* groff: change type size */ - { "RB", macro_roman_bold_small_italic_handler }, /* man: set roman bold font */ - { "RE", macro_ignore_handler }, /* man: move left margin back to NNN */ - { "RI", macro_roman_bold_small_italic_handler }, /* man: set roman italic font */ - { "RS", macro_ignore_handler }, /* man: move 
left margin to right by NNN */ - { "SH", macro_section_header_handler }, /* man: unnumbered section heading */ - { "Sh", macro_section_header_handler }, /* man: unnumbered section heading */ - { "SM", macro_bold_small_italic_handler }, /* man: set font size one SMaller */ - { "so", macro_reference_handler }, /* groff: include file */ - { "sp", macro_spacing_handler }, /* groff: */ - { "SS", macro_section_header_handler }, /* man: unnumbered subsection heading */ - { "Ss", macro_section_header_handler }, /* man: unnumbered subsection heading */ - { "St", macro_mandoc_utility_handler }, /* mandoc: Standards (-p1003.2, -p1003.1 or -ansiC) */ - { "TH", macro_title_header_handler }, /* man: set title of man page */ - { "TP", macro_tp_handler }, /* man: set indented paragraph with label */ - { "UR", macro_url_handler }, /* man: URL start hyperlink */ - { "UE", macro_url_handler }, /* man: URL end hyperlink */ - { "UN", macro_ignore_handler }, /* ? */ - { "TE", macro_ignore_handler }, /* ms: table */ - { "Tn", macro_mandoc_utility_handler }, /* mandoc: Trade or type name (small Caps). */ - { "ti", macro_ignore_handler }, /* groff: temporary indent */ - { "tr", macro_ignore_handler }, /* groff: translate characters */ - { "TS", macro_ignore_handler }, /* ms: table with optional header */ - { "Va", macro_mandoc_utility_handler }, /* mandoc: Variable name */ - { "Vb", macro_verbatim_handler }, /* pod2man: start of verbatim text */ - { "Ve", macro_verbatim_handler }, /* pod2man: end of verbatim text */ - { "Vt", macro_mandoc_utility_handler }, /* mandoc: Variable type (Fortran only) */ - { "Xr", macro_mandoc_utility_handler }, /* mandoc: Manual page cross reference */ - { NULL, NULL } -}; + /* The accumulator should currently be "". 
*/ + g_assert (parser->accumulator && + *(parser->accumulator->str) == '\0'); -static void -parser_handle_linetag (YelpManParser *parser) { - gchar c, *str, *ptr, *arg; - GSList *arglist = NULL; - GSList *listptr = NULL; - MacroFunc handler_func = NULL; - - static GHashTable *macro_hash = NULL; - - /* check if we've created the hash of macros yet. If not, make it */ - if (!macro_hash) { - gint i; - - macro_hash = g_hash_table_new (g_str_hash, g_str_equal); - - for (i=0; macro_handlers[i].macro != NULL; i++) { - g_hash_table_insert (macro_hash, - macro_handlers[i].macro, - macro_handlers[i].handler); - } + return TRUE; } + if (parser->state == HAVE_TITLE) { + /* We expect (maybe!) to get some lines tThe wh24 + * tCollection. We've found (and can ignore!) the second + * title line if there's a (). */ + if (strchr (parser->buffer+1, '(') && + strchr (parser->buffer+1, ')')) { + parser->state = BODY; - /* FIXME: figure out a better way to handle these cases */ - /* special case, if the line is simply ".\0" then return */ - if (g_utf8_get_char (g_utf8_next_char (parser->cur)) == '\0') { - parser->cur = g_utf8_next_char (parser->cur); - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - return; - } - /* special case, if the line is simply "..\0" then return */ - else if (g_utf8_get_char (g_utf8_next_char(parser->cur)) == '.' && - g_utf8_get_char (g_utf8_next_char (g_utf8_next_char (parser->cur+2))) == '\0') { - parser->cur = g_utf8_next_char (parser->cur); - parser->cur = g_utf8_next_char (parser->cur); - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } - - /* skip any spaces after the control character . 
*/ - while (PARSER_CUR && g_utf8_get_char (parser->cur) == ' ') - parser->cur = g_utf8_next_char (parser->cur); - - while (PARSER_CUR - && g_utf8_get_char (parser->cur) != ' ' - && ( (g_utf8_get_char (parser->cur) != '\\') || - ( - (g_utf8_get_char(parser->cur) == '\\') && - (g_utf8_get_char(g_utf8_next_char (parser->cur)) == '\"') - ) - ) - && g_utf8_get_char (parser->cur) != '\0') { - if ( - (g_utf8_get_char (parser->cur) == '\\') && - (g_utf8_get_char (g_utf8_next_char (parser->cur)) == '\"') - ) { - parser->cur = g_utf8_next_char (g_utf8_next_char (parser->cur)); - break; - } - parser->cur = g_utf8_next_char (parser->cur); - } + xmlNewTextChild (parser->header, + NULL, BAD_CAST "collection", + parser->accumulator->str); + g_string_truncate (parser->accumulator, 0); - /* copy the macro/request into str */ - c = *(parser->cur); - *(parser->cur) = '\0'; - str = g_strdup (parser->anc + 1); /* skip control character '.' by adding one */ - *(parser->cur) = c; - parser->anc = parser->cur; - - /* FIXME: need to handle escaped characters */ - /* perform argument parsing and store argument in a singly linked list */ - while (PARSER_CUR && g_utf8_get_char (parser->cur) != '\0') { - ptr = NULL; - arg = NULL; - - /* skip any whitespace */ - while (PARSER_CUR && g_utf8_get_char (parser->cur) == ' ') { - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } - -get_argument: - /* search until we hit whitespace or an " */ - while (PARSER_CUR && - g_utf8_get_char (parser->cur) != '\0' && - g_utf8_get_char (parser->cur) != ' ' && - g_utf8_get_char (parser->cur) != '\"') - parser->cur = g_utf8_next_char (parser->cur); - - /* this checks for escaped spaces */ - if (PARSER_CUR && - ((parser->cur - parser->buffer) > 0) && - g_utf8_get_char (parser->cur) == ' ' && - g_utf8_get_char (g_utf8_prev_char (parser->cur)) == '\\') { - parser->cur = g_utf8_next_char (parser->cur); - goto get_argument; - } - - if (g_utf8_get_char (parser->cur) == '\0' && - (parser->cur 
== parser->anc)) - break; - - if (g_utf8_get_char (parser->cur) == '\"' && - g_utf8_get_char (g_utf8_prev_char (parser->cur)) == ' ') { - /* quoted argument */ - ptr = strchr (parser->cur+1, '\"'); - if (ptr != NULL) { - c = *(ptr); - *(ptr) = '\0'; - arg = g_strdup (parser->anc+1); - *(ptr) = c; - parser->cur = ptr; - parser->anc = ++parser->cur; - } else { - /* unmatched double quote: include the " as part of the argument */ - parser->cur++; - goto get_argument; - } - } - else if (*(parser->cur) == '\"') { - /* quote in the middle of an argument */ - c = *(parser->cur+1); - *(parser->cur+1) = '\0'; - arg = g_strdup (parser->anc); - *(parser->cur+1) = c; - parser->anc = ++parser->cur; - } - else if (*(parser->cur) == ' ') { - /* normal space separated argument */ - c = *(parser->cur); - *(parser->cur) = '\0'; - arg = g_strdup (parser->anc); - *(parser->cur) = c; - parser->anc = ++parser->cur; - } - else if (*(parser->cur) == '\0' && *(parser->cur-1) != ' ') { - /* special case for EOL */ - c = *(parser->cur); - *(parser->cur) = '\0'; - arg = g_strdup (parser->anc); - *(parser->cur) = c; - parser->anc = parser->cur; - } else - ; /* FIXME: do we need to handle this case? 
*/ - - arglist = g_slist_append (arglist, arg); - } - - /*g_print ("handling macro (%s)\n", str); - - listptr = arglist; - while (listptr && listptr->data) { - g_print (" arg = %s\n", (gchar *)listptr->data); - listptr = g_slist_next (listptr); - } - */ - - /* lookup the macro handler and call that function */ - handler_func = g_hash_table_lookup (macro_hash, str); - if (handler_func) - (*handler_func) (parser, str, arglist); - - /* in case macro is not defined in hash table, ignore rest of line */ - else - macro_ignore_handler (parser, str, arglist); - - g_free (str); - - listptr = arglist; - while (listptr && listptr->data) { - g_free (listptr->data); - listptr = g_slist_next (listptr); - } - - return; - - if (0) { - } - /* Table (tbl) macros */ - else if (g_str_equal (str, "TS")) { - parser->ins = parser_append_node (parser, "TABLE"); - g_free (str); - - parser_stack_push_node (parser, parser->ins); - g_free (parser->buffer); - parser_parse_table (parser); - } - else if (g_str_equal (str, "TE")) { - /* We should only see this from within parser_parse_table */ - g_warning ("Found unexpected tag: '%s'\n", str); - g_free (str); - } - /* "ie" and "if" are conditional macros in groff - * "ds" is to define a variable; see groff(7) - * ignore anything between the \{ \}, otherwise ignore until - * the end of the linee*/ - else if (g_str_equal (str, "ds") || g_str_equal (str, "ie") - || g_str_equal (str, "if")) { - /* skip any remaining spaces */ - while (PARSER_CUR && (*parser->cur == ' ')) - parser->anc = ++parser->cur; - - /* skip the "stringvar" or "cond"; see groff(7) */ - while (PARSER_CUR && (*parser->cur != ' ')) - parser->anc = ++parser->cur; - - /* skip any remaining spaces */ - while (PARSER_CUR && (*parser->cur == ' ')) - parser->anc = ++parser->cur; - - /* check to see if the next two characters are the - * special "\{" sequence */ - if (*parser->cur == '\\' && *(parser->cur+1) == '{') { - parser->ignore = TRUE; - parser->token = g_strdup ("\\}"); - } else { 
- /* otherwise just ignore till the end of the line */ - while (PARSER_CUR) - parser->anc = ++parser->cur; - } - } - /* else conditional macro */ - else if (g_str_equal (str, "el")) { - /* check to see if the next two characters are the - * special "\{" sequence */ - parser->ignore = 0; - if (*parser->cur == '\\' && *(parser->cur+1) == '{') { - parser->ignore = TRUE; - parser->token = g_strdup ("\\}"); - } else { - /* otherwise just ignore till the end of the line */ - while (PARSER_CUR) - parser->anc = ++parser->cur; - } - } - -} + return TRUE; + } -static void -parser_ensure_P (YelpManParser *parser) -{ - if (xmlStrEqual (parser->ins->name, BAD_CAST "Man")) { - parser->ins = parser_append_node (parser, "P"); - parser_stack_push_node (parser, parser->ins); - } -} + g_string_append (parser->accumulator, parser->buffer+1); -static void -parser_read_until (YelpManParser *parser, - gchar delim) -{ - gchar c; - - while (PARSER_CUR - && g_utf8_get_char (parser->cur) != '\0' - && g_utf8_get_char (parser->cur) != delim) { - parser->cur = g_utf8_next_char (parser->cur); + return TRUE; } - if (parser->anc == parser->cur) - return; - - c = *(parser->cur); - *(parser->cur) = '\0'; - parser_append_given_text_handle_escapes (parser, parser->anc, TRUE); - *(parser->cur) = c; - - parser->anc = parser->cur; + return parse_body_text (parser, error); } -static void -parser_escape_tags (YelpManParser *parser, - gchar **tags, - gint ntags) +/* + w is a sort of prefix argument. It indicates a space, so we register + that here, then call parser_parse_line again on the rest of the + string to deal with that. 
+ */ +static gboolean +parse_w (YelpManParser *parser, GError **error) { - gint i; - xmlNodePtr node = NULL; - xmlNodePtr cur = parser->ins; - GSList *path = NULL; - - /* Find the top node we can escape from */ - while (cur && cur != (xmlNodePtr)parser->doc && - cur->parent && cur->parent != (xmlNodePtr) parser->doc) { - for (i = 0; i < ntags; i++) - if (!xmlStrcmp (cur->name, BAD_CAST tags[i])) { - node = cur; - break; - } - path = g_slist_prepend (path, cur); - cur = cur->parent; - } + gboolean ret; - /* Walk back down, reproducing nodes we aren't escaping */ - if (node) { - GSList *c = path; - while (c && (xmlNodePtr) c->data != node) - c = g_slist_next (c); - - parser->ins = node->parent; - parser_ensure_P (parser); - - while ((c = c->next)) { - gboolean insert = TRUE; - cur = (xmlNodePtr) c->data; - - for (i = 0; i < ntags; i++) - if (!xmlStrcmp (cur->name, BAD_CAST tags[i])) { - insert = FALSE; - break; - } - if (insert) - parser->ins = parser_append_node (parser, (gchar *) cur->name); - } + if (parser->state != START) { + g_string_append_c (parser->accumulator, ' '); } -} - -static void -parser_append_given_text_handle_escapes (YelpManParser *parser, gchar *text, gboolean make_links) -{ - gchar *escape[] = { "fI", "fB" }; - gchar *baseptr, *ptr, *anc, *str; - gint c, len; - - g_return_if_fail (parser != NULL); - - if (!text) - return; - - baseptr = g_strdup (text); - ptr = baseptr; - anc = baseptr; - len = strlen (baseptr); - while (ptr && *ptr != '\0') { - - if (*ptr == '\\') { - - c = *ptr; - *ptr = '\0'; - parser_append_given_text (parser, anc); - *ptr = c; - - anc = ++ptr; - - switch (*ptr) { - case '\0': - break; - case '-': - case '\\': - ptr++; - c = *ptr; - *ptr = '\0'; - parser_append_given_text (parser, anc); - *ptr = c; - anc = ptr; - break; - case 'f': - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - - c = *(ptr); - *(ptr) = '\0'; - str = g_strdup (anc); - *(ptr) = c; - - parser_ensure_P (parser); - parser_escape_tags 
(parser, escape, 2); - - /* the \f escape sequence changes the font - R is Roman, - * B is Bold, and I is italic */ - if (g_str_equal (str, "fI") || g_str_equal (str, "fB")) - parser->ins = parser_append_node (parser, str); - else if (!g_str_equal (str, "fR") && !g_str_equal (str, "fP")) - g_warning ("No rule matching the tag '%s'\n", str); - - g_free (str); - anc = ptr; - break; - case '(': - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - - c = *(ptr); - *(ptr) = '\0'; - str = g_strdup (anc); - *(ptr) = c; - - if (g_str_equal (str, "(co")) - parser_append_given_text (parser, "©"); - else if (g_str_equal (str, "(bu")) - parser_append_given_text (parser, "•"); - else if (g_str_equal (str, "(em")) - parser_append_given_text (parser, "—"); - - g_free (str); - anc = ptr; - break; - case '*': - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - - if (*(ptr) == 'R') { - parser_append_given_text (parser, "®"); - ptr++; - } else if (*(ptr) == '=') { - parser_append_given_text (parser, "--"); - ptr++; - } else if (*(ptr) == '(') { - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - - c = *(ptr); - *(ptr) = '\0'; - str = g_strdup (anc); - *(ptr) = c; - - if (g_str_equal (str, "*(Tm")) - parser_append_given_text (parser, "™"); - else if (g_str_equal (str, "*(lq")) - parser_append_given_text (parser, "“"); - else if (g_str_equal (str, "*(rq")) - parser_append_given_text (parser, "”"); - - g_free (str); - } - - anc = ptr; - break; - case 'e': - anc = ++ptr; - parser_append_given_text (parser, "\\"); - break; - case '&': - anc = ++ptr; - break; - case 's': - /* this handles (actually ignores) the groff macros \s[+-][0-9] */ - ptr++; - if (*(ptr) == '+' || *(ptr) == '-') { - ptr++; - if (g_ascii_isdigit (*ptr)) { - ptr++; - } - } else if (g_ascii_isdigit (*ptr)) { - ptr++; - } - anc = ptr; - break; - case 
'"': - /* Marks comments till end of line. so we can ignore it. */ - while (ptr && *ptr != '\0') - ptr++; - anc = ptr; - break; - case '^': - case '|': - /* 1/12th and 1/16th em respectively - ignore this and simply output a space */ - anc = ++ptr; - break; - default: - ptr++; - c = *(ptr); - *(ptr) = '\0'; - parser_append_given_text (parser, anc); - *(ptr) = c; - - anc++; - break; - } - - } - else if ((make_links) && (*ptr == '(')) { - gchar *space_pos; - gchar *url; - gchar c; - gchar *name_end; - gchar *num_start; - gchar *num_end; - - - space_pos = ptr; - - while (space_pos != anc && *(space_pos - 1) != ' ') { - space_pos--; - } - name_end = space_pos; - - if (space_pos != ptr && - g_ascii_isdigit(*(ptr+1)) && - (*(ptr+2) == ')' || (g_ascii_isalpha (*(ptr+2)) && *(ptr+3) == ')'))) { - num_start = ptr; - if (*(ptr+2) == ')') - num_end = ptr + 2; - else - num_end = ptr + 3; - - ptr+=3; - - parser_ensure_P (parser); - - ptr = space_pos; - - c = (*ptr); - *ptr = '\0'; - parser_append_given_text (parser, anc); - *ptr = c; - anc = ptr; - ptr = num_start; - - c = *name_end; - *name_end = '\0'; - *num_end = '\0'; - url = g_strdup_printf ("man:%s(%s)", anc, num_start + 1); - - - parser->ins = parser_append_node (parser, "UR"); - - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, url); - parser->ins = parser->ins->parent; - - parser_append_given_text (parser, anc); - parser->ins = parser->ins->parent; - - *name_end = c; - *num_end = ')'; - anc = ptr; - - g_free (url); - - } else { - ptr++; - } - } - else { - ptr++; - } - - } /* end while */ - - c = *(ptr); - *(ptr) = '\0'; - parser_append_given_text (parser, anc); - parser_append_given_text (parser, "\n"); - *(ptr) = c; - - g_free (baseptr); + parser->buffer++; + ret = parser_parse_line (parser, error); + parser->buffer--; + return ret; } -static xmlNodePtr -parser_append_text (YelpManParser *parser) +static gboolean +parse_body_text (YelpManParser *parser, GError **error) { - 
xmlNodePtr node; - gchar c; + gchar tmp[64]; - if (parser->anc == parser->cur) - return NULL; + /* + It's this function which is responsible for trying to get *some* + semantic information back out of the manual page. - c = *(parser->cur); - *(parser->cur) = '\0'; + The highest-level chopping up is into sections. We use the + heuristic that if either + (1) We haven't got a section yet or + (2) text starts a line (hpos=0) + then it's a section title. - if (g_utf8_get_char (parser->anc) != '\0') - parser_ensure_P (parser); + It's possible to have spaces in section titles, so we carry on + accumulating the section title until the next newline. + */ + if (parser->section_state != SECTION_TITLE && parser->hpos == 0) { + g_string_truncate (parser->accumulator, 0); + /* End the current sheet & section */ + parser->section_state = SECTION_TITLE; + parser->sheet_node = NULL; - node = xmlNewText (BAD_CAST parser->anc); - xmlAddChild (parser->ins, node); + parser->section_node = + xmlAddChild (xmlDocGetRootElement (parser->doc), + xmlNewNode (NULL, BAD_CAST "section")); + } + if (parser->section_state == SECTION_TITLE) goto do_append; - *(parser->cur) = c; + /* + Here we've got real body text! If newline is true, this is the + first word on a line. - parser->anc = parser->cur; + In which case, we check to see whether hpos agrees with the + current sheet's indent. If so (or if there isn't a sheet yet!), + we just add to the accumulator. If not, start a new sheet with + the correct indent. - return node; -} + If we aren't the first word on the line, just add to the + accumulator. + */ + if ((!parser->sheet_node) || + (parser->newline && (parser->hpos != parser->sheet_indent))) { + /* We don't need to worry about finishing the current sheet, + since the accumulator etc. get cleared on newlines and we + know we're at the start of a line. 
+ */ + parser->sheet_node = + xmlAddChild (parser->section_node, + xmlNewNode (NULL, BAD_CAST "sheet")); + parser->sheet_indent = parser->hpos; -static xmlNodePtr -parser_append_given_text (YelpManParser *parser, - gchar *text) -{ - xmlNodePtr node; + /* The indent is specified in em's. */ + snprintf (tmp, 64, "%d", + (int)(parser->hpos / ((float)parser->char_width) / 1.5)); + xmlNewProp (parser->sheet_node, BAD_CAST "indent", tmp); + } - parser_ensure_P (parser); + do_append: + g_string_append (parser->accumulator, parser->buffer+1); - node = xmlNewText (BAD_CAST text); - xmlAddChild (parser->ins, node); + /* Move hpos forward per char */ + parser->hpos += strlen (parser->buffer+1) * parser->char_width; - return node; -} + parser->newline = FALSE; -static xmlNodePtr -parser_append_node (YelpManParser *parser, - gchar *name) -{ - if (!name) - return NULL; - - return xmlNewChild (parser->ins, NULL, BAD_CAST name, NULL); + return TRUE; } -static xmlNodePtr -parser_append_node_attr (YelpManParser *parser, - gchar *name, - gchar *attr, - gchar *value) +static gboolean +parse_n (YelpManParser *parser, GError **error) { - xmlNodePtr node = NULL; - - node = xmlNewChild (parser->ins, NULL, BAD_CAST name, NULL); - xmlNewProp (node, BAD_CAST attr, BAD_CAST value); + xmlNodePtr node; - return node; -} - -static void -parser_stack_push_node (YelpManParser *parser, - xmlNodePtr node) -{ - parser->nodeStack = g_slist_prepend (parser->nodeStack, node); -} + /* Don't care about newlines in the header bit */ + if (parser->state != BODY) return TRUE; -static xmlNodePtr -parser_stack_pop_node (YelpManParser *parser, - gchar *name) -{ - xmlNodePtr popped; + if (parser->section_state == SECTION_TITLE) { + g_strchomp (parser->accumulator->str); + xmlNewTextChild (parser->section_node, NULL, + BAD_CAST "title", parser->accumulator->str); + g_string_truncate (parser->accumulator, 0); - if (parser->nodeStack == NULL) - return NULL; - - popped = (xmlNodePtr) parser->nodeStack->data; - - if 
(!xmlStrEqual (BAD_CAST name, popped->name)) - return NULL; - - parser->nodeStack = g_slist_remove (parser->nodeStack, popped); - return popped; -} + parser->section_state = SECTION_BODY; + } + else if (parser->sheet_node != NULL) { + /* + In the body of a section, when we get to a newline we should + have an accumulator with text in it and a non-null sheet + (hopefully!). -/* - * Table (tbl) macro package parsing - */ + We know the current font, so add a span for that font + containing the relevant text. Then add a <br/> tag. + */ + finish_span (parser); + node = xmlNewNode (NULL, BAD_CAST "br"); + xmlAddChild (parser->sheet_node, node); + } -static void -parser_handle_table_options (YelpManParser *parser) -{ - /* FIXME: do something with the options */ - g_free (parser->buffer); + parser->newline = TRUE; - return; + return TRUE; } static void -parser_handle_row_options (YelpManParser *parser) +finish_span (YelpManParser *parser) { - /* FIXME: do something with these options */ - - do { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - parser_read_until (parser, '.'); - - if (*(parser->cur) == '.') { - g_free (parser->buffer); - break; - } - - g_free (parser->buffer); - - } while ((parser->buffer = - g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) - != NULL); -} + xmlNodePtr node; -static void -parser_parse_table (YelpManParser *parser) -{ - xmlNodePtr table_start; - gboolean empty_row; - - table_start = parser->ins; - - parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); - if (parser->buffer != NULL) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - parser_read_until (parser, ';'); - - if (*(parser->cur) == ';') { - parser_handle_table_options (parser); - - parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); - if (parser->buffer != NULL) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - 
parser_read_until (parser, '\0'); - } else - return; - } - - parser_handle_row_options (parser); - - /* Now this is where we go through all the rows */ - while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - empty_row = FALSE; - - switch (*(parser->buffer)) { - case '.': - if (*(parser->buffer + 1) == 'T' - && *(parser->buffer + 2) == 'E') { - if (parser_stack_pop_node (parser, "TABLE") == NULL) - g_warning ("Found unexpected tag: 'TE'\n"); - else { - parser->ins = table_start; - - parser->anc = parser->buffer + 3; - parser->cur = parser->buffer + 3; - return; - } - } else if (*(parser->buffer + 1) == 'T' - && *(parser->buffer + 2) == 'H') { - /* Do nothing */ - empty_row = TRUE; - } else { - parser_handle_linetag (parser); - break; - } - case '\0': - empty_row = TRUE; - break; - default: - break; - } - - if (!empty_row) { - parser->ins = parser_append_node (parser, "ROW"); - while (PARSER_CUR && *(parser->cur) != '\0') { - parser_read_until (parser, '\t'); - parser->ins = parser_append_node (parser, "CELL"); - parser_append_text (parser); - parser->ins = parser->ins->parent; - parser->anc++; - parser->cur++; - } - } - - g_free (parser->buffer); - - parser->ins = table_start; - } + if (parser->accumulator->str[0] != '\0') { + node = xmlNewTextChild (parser->sheet_node, NULL, + BAD_CAST "span", + parser->accumulator->str); + xmlNewProp (node, BAD_CAST "class", get_font (parser)); + g_string_truncate (parser->accumulator, 0); } } diff --git a/libyelp/yelp-man-parser.h b/libyelp/yelp-man-parser.h index 1901f1b..963dfbb 100644 --- a/libyelp/yelp-man-parser.h +++ b/libyelp/yelp-man-parser.h @@ -30,8 +30,8 @@ typedef struct _YelpManParser YelpManParser; YelpManParser * yelp_man_parser_new (void); xmlDocPtr yelp_man_parser_parse_file (YelpManParser *parser, - gchar *file, - const gchar *encoding); + gchar *path, + GError **error); void 
yelp_man_parser_free (YelpManParser *parser); #endif /* __YELP_MAN_PARSER_H__ */ diff --git a/stylesheets/man2html.xsl.in b/stylesheets/man2html.xsl.in index 4b21bae..45aea88 100644 --- a/stylesheets/man2html.xsl.in +++ b/stylesheets/man2html.xsl.in @@ -17,349 +17,75 @@ <xsl:param name="linktrail" select="''"/> <xsl:template mode="html.title.mode" match="Man"> - <xsl:value-of select="TH/Title"/> -</xsl:template> - -<xsl:template mode="html.css.mode" match="Man"> - <xsl:param name="direction"/> - <xsl:param name="left"/> - <xsl:param name="right"/> -<xsl:text> -body { font-family: monospace; } -div.hgroup { font-family: sans-serif; } -</xsl:text> -</xsl:template> - -<xsl:template mode="html.header.mode" match="Man"> - <xsl:call-template name="html.linktrail"/> + <xsl:value-of select="header/title"/> </xsl:template> <xsl:template mode="html.body.mode" match="Man"> - <xsl:apply-templates select="TH"/> - <xsl:apply-templates select="SH"/> -</xsl:template> - -<xsl:template name="html.linktrail"> - <div class="linktrail" id="linktrail"> - <xsl:call-template name="html.linktrail.one"> - <xsl:with-param name="str" select="$linktrail"/> - </xsl:call-template> - </div> -</xsl:template> - -<xsl:template name="html.linktrail.one"> - <xsl:param name="str"/> - <xsl:variable name="id" select="substring-before($str, '|')"/> - <xsl:variable name="post_id" select="substring-after($str, '|')"/> - - <span class="linktrail"> - <a class="linktrail" href="x-yelp-toc:{$id}"> - <xsl:choose> - <xsl:when test="contains($post_id, '|')"> - <xsl:value-of select="substring-before($post_id, '|')"/> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="$post_id"/> - </xsl:otherwise> - </xsl:choose> - </a> - </span> - - <xsl:if test="contains($post_id, '|')"> - <xsl:call-template name="html.linktrail.one"> - <xsl:with-param name="str" select="substring-after($post_id, '|')"/> - </xsl:call-template> - </xsl:if> + <xsl:apply-templates select="header"/> + <xsl:apply-templates select="section"/> 
</xsl:template> <!-- ======================================================================= --> -<xsl:template match="br"> - <xsl:apply-templates/><br/> -</xsl:template> - -<!-- ignore anything in the Indent,Count,sp element for now --> -<xsl:template match="Indent" /> -<xsl:template match="Count" /> -<xsl:template match="sp" /> - -<xsl:template match="B | fB"> - <b><xsl:apply-templates/></b> -</xsl:template> - -<xsl:template match="CELL"> - <td><xsl:apply-templates/></td> -</xsl:template> - -<xsl:template match="I | fI"> - <i><xsl:apply-templates/></i> -</xsl:template> - -<xsl:template match="R | fR"> - <span class="R"><xsl:apply-templates/></span> -</xsl:template> - -<xsl:template match="Verbatim"> - <pre> - <xsl:choose> - <xsl:when test="node()[1]/self::text()"> - <xsl:variable name="node" select="node()[1]"/> - <xsl:choose> - <xsl:when test="starts-with(string($node), '
')"> - <xsl:value-of select="substring-after(string($node), '
')"/> - <xsl:apply-templates select="node()[position() != 1]"/> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="string($node)"/> - <xsl:apply-templates select="node()[position() != 1]"/> - </xsl:otherwise> - </xsl:choose> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates /> - </xsl:otherwise> - </xsl:choose> - </pre> -</xsl:template> - -<xsl:template match="IP"> - <xsl:choose> - <xsl:when test="preceding-sibling::*[1][self::IP]"/> - <xsl:otherwise> - <dl> - <xsl:apply-templates mode="IP.mode" select="."/> - </dl> - </xsl:otherwise> - </xsl:choose> -</xsl:template> - -<xsl:template mode="IP.mode" match="IP"> - <dt> - <xsl:choose> - <xsl:when test="Tag"> - <xsl:apply-templates select="Tag"/> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates/> - </xsl:otherwise> - </xsl:choose> - </dt> - <dd> - <xsl:apply-templates select="Tag/following-sibling::node()"/> - </dd> - <xsl:apply-templates mode="IP.mode" - select="following-sibling::*[1][self::IP]"/> -</xsl:template> - -<xsl:template match="P"> - <p><xsl:apply-templates/></p> -</xsl:template> - -<xsl:template match="ROW"> - <tr><xsl:apply-templates/></tr> -</xsl:template> - -<xsl:template match="SS"> - <xsl:variable name="nextSH" select="following-sibling::SH[1]"/> - <xsl:variable name="nextSS" - select="following-sibling::SS[not($nextSH) or - following-sibling::SH[1] = $nextSH][1]"/> - <div class="sect sect-SS"> - <div class="hgroup"> - <h3 class="title"><xsl:apply-templates/></h3> - </div> - <div class="inner"> - <xsl:choose> - <xsl:when test="$nextSS"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SS[1] = $nextSS and - following-sibling::SS[1]/@id = $nextSS/@id]"/> - </xsl:when> - <xsl:when test="$nextSH"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SH[1] = $nextSH and - following-sibling::SH[1]/@id = $nextSH/@id]"/> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates select="following-sibling::*"/> - </xsl:otherwise> - </xsl:choose> - </div> 
- </div> -</xsl:template> - -<xsl:template match="SH"> - <xsl:variable name="nextSH" select="following-sibling::SH[1]"/> - <xsl:variable name="nextSS" - select="following-sibling::SS[not($nextSH) or - following-sibling::SH[1] = $nextSH]"/> - <div class="sect sect-SH"> - <div class="hgroup"> - <h2 class="title"><xsl:apply-templates/></h2> - </div> - <div class="inner"> - <xsl:choose> - <xsl:when test="$nextSS"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SS[1] = $nextSS[1] and - following-sibling::SS[1]/@id = $nextSS[1]/@id]"/> - <xsl:apply-templates select="$nextSS"/> - </xsl:when> - <xsl:when test="$nextSH"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SH[1] = $nextSH and - following-sibling::SH[1]/@id = $nextSH/@id]"/> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates select="following-sibling::*"/> - </xsl:otherwise> - </xsl:choose> - </div> - </div> -</xsl:template> - -<xsl:template match="TABLE"> - <table><xsl:apply-templates/></table> -</xsl:template> - -<xsl:template match="Tag"> - <span class="Tag"><xsl:apply-templates/></span> -</xsl:template> - -<xsl:template match="TH"> +<xsl:template match="header"> <div class="hgroup"> <h1 class="title"> - <span class="Title"> - <xsl:apply-templates select="Title/node()"/> - </span> - <span class="Section"> - <xsl:text>(</xsl:text> - <xsl:apply-templates select="Section/node()"/> - <xsl:text>)</xsl:text> - </span> + <xsl:value-of select="title"/> + <xsl:text>(</xsl:text> + <xsl:value-of select="section"/> + <xsl:text>)</xsl:text> </h1> + <h3 style="text-align: right;"> + <xsl:value-of select="collection"/> + </h3> </div> </xsl:template> -<xsl:template match="UR"> - <a> - <xsl:attribute name="href"> - <xsl:value-of select="URI" /> - </xsl:attribute> - <xsl:apply-templates/> - </a> -</xsl:template> - -<xsl:template match="URI"/> - -<xsl:template match="UN"> - <a name="text()" id="text()"/> -</xsl:template> - -<!-- these are all for mdoc (BSD) man page support 
--> - -<!-- these are just printed out --> -<xsl:template match="An | Dv | Er | Ev | Ic | Li | St"> - <xsl:text> -</xsl:text> - <xsl:apply-templates/> -</xsl:template> - -<!-- these are italicized --> -<xsl:template match="Ad | Ar | Fa | Ot | Pa | Va | Vt"> - <i><xsl:apply-templates/></i> +<xsl:template match="br"> + <br/> </xsl:template> -<!-- these are bold --> -<xsl:template match="Cd | Cm | Fd | Ic | Nm"> - <b><xsl:apply-templates/></b> -</xsl:template> +<xsl:template match="section"> + <div class="section" style="padding-top: 1em;"> + <h2> + <xsl:value-of select="title"/> + </h2> -<!-- Function call - TODO need to do the ( , ) here --> -<xsl:template match="Fn | Fo | Fc"> - <i><xsl:apply-templates/></i> + <xsl:apply-templates select="sheet"/> + </div> </xsl:template> -<!-- Cross reference --> -<xsl:template match="Xr"> - <xsl:variable name="manpage" select="substring-before(string(.), ' ')"/> - <xsl:variable name="section" select="substring-before(substring-after(string(.), ' '), ' ')"/> - <xsl:variable name="extra" select="substring-after(substring-after(string(.), ' '), ' ')"/> - <a> - <xsl:attribute name="href"> - <xsl:text>man:</xsl:text> - <xsl:value-of select="$manpage"/> - <xsl:text>(</xsl:text> - <xsl:value-of select="$section"/> - <xsl:text>)</xsl:text> +<xsl:template match="sheet"> + <xsl:element name="div"> + <xsl:attribute name="style"> + <xsl:text>padding-left: </xsl:text> + <xsl:value-of select="@indent"/> + <xsl:text>em;</xsl:text> </xsl:attribute> - <xsl:value-of select="$manpage"/> - <xsl:text>(</xsl:text> - <xsl:value-of select="$section"/> - <xsl:text>)</xsl:text> - </a> - <xsl:value-of select="$extra"/> -</xsl:template> - -<!-- Option --> -<xsl:template match="Op | Oo | Oc"> - <xsl:text> [</xsl:text> - <xsl:apply-templates/> - <xsl:text>]</xsl:text> -</xsl:template> - -<!-- Trade or type name (small Caps). 
--> -<xsl:template match="Tn"> - <xsl:variable name="txt" select="string(child::text())"/> - <xsl:text> </xsl:text> - <xsl:value-of select="translate($txt, 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')"/> - <xsl:apply-templates select="*"/> -</xsl:template> -<xsl:template match="Nd"> - <xsl:text> - </xsl:text> - <xsl:apply-templates /> + <p> + <xsl:apply-templates select="span|br"/> + </p> + </xsl:element> </xsl:template> -<xsl:template match="Fl"> - <xsl:text>-</xsl:text> - <b><xsl:apply-templates select="child::text()"/></b> - <xsl:apply-templates select="*"/> -</xsl:template> - -<xsl:template match="Bl"> - <dl> - <xsl:for-each select="It"> - <xsl:choose> - <xsl:when test="ItTag"> - <dt><xsl:apply-templates select="ItTag"/></dt> - <dd> - <xsl:apply-templates select="ItTag/following-sibling::node()"/> - </dd> - </xsl:when> - <xsl:otherwise> - <dt> - <xsl:text>•</xsl:text> - </dt> - <dd> - <xsl:apply-templates /> - </dd> - </xsl:otherwise> - </xsl:choose> - </xsl:for-each> - </dl> -</xsl:template> - -<xsl:template match="ItTag"> - <xsl:apply-templates/> -</xsl:template> +<xsl:template match="span"> + <xsl:element name="span"> + <xsl:choose> + <xsl:when test="@class = 'B'"> + <xsl:attribute name="style"> + font-weight: 700; + </xsl:attribute> + </xsl:when> + <xsl:when test="@class = 'I'"> + <xsl:attribute name="style"> + font-style: italic; + </xsl:attribute> + </xsl:when> + </xsl:choose> -<xsl:template match="*"> - <xsl:message> - <xsl:text>Unmatched element: </xsl:text> - <xsl:value-of select="local-name(.)"/> - </xsl:message> - <xsl:apply-templates/> + <xsl:value-of select="."/> + </xsl:element> </xsl:template> </xsl:stylesheet> -- 1.7.2.3
From f802723263adcf6c5f4ecc588b862f8adb9370d1 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Thu, 16 Dec 2010 00:13:17 +0000 Subject: [PATCH 2/8] Use a monospace font and nonbreaking spaces for internal spacing. Basically, to get internal spaces correct (eg in tables etc), we *have* to use a monospace font. --- libyelp/yelp-man-parser.c | 35 ++++++++++++++++++++++++++++++++++- stylesheets/man2html.xsl.in | 2 +- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index bceb465..a02cc14 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -145,6 +145,7 @@ static struct LineParsePair line_parsers[] = { /* Parser helper functions (managing the state of the various parsing * bits) */ static void finish_span (YelpManParser *parser); +static guint dx_to_em_count (YelpManParser *parser, guint dx); /******************************************************************************/ @@ -373,10 +374,36 @@ static gboolean parse_h (YelpManParser *parser, GError **error) { guint dx; + guint k; + const gchar *str; + if (SSCANF ("h%u", 1, &dx)) { RAISE_PARSE_ERROR ("Invalid h line from troff: %s"); } parser->hpos += dx; + + /* This is a bit hackish to be honest but... if we're in something + * that'll end up in a span, a spacing h command means that a gap + * should appear. It seems that the easiest way to get this is to + * insert nonbreaking spaces (eugh!) + * + * Of course we don't want to do this when chained from wh24 or + * whatever, so check that accumulator is nonempty and the last + * character isn't ' '. 
+ */ + str = parser->accumulator->str; + if ((parser->sheet_node) && + (str[0] != '\0') && + (str[strlen (str)-1] != ' ')) { + + dx = dx_to_em_count (parser, dx); + for (k=0; k<dx; k++) { + /* 0xc2 0xa0 is nonbreaking space in utf8 */ + g_string_append_c (parser->accumulator, 0xc2); + g_string_append_c (parser->accumulator, 0xa0); + } + } + return TRUE; } @@ -542,7 +569,7 @@ parse_body_text (YelpManParser *parser, GError **error) /* The indent is specified in em's. */ snprintf (tmp, 64, "%d", - (int)(parser->hpos / ((float)parser->char_width) / 1.5)); + (int)(dx_to_em_count (parser, parser->hpos) / 1.5)); xmlNewProp (parser->sheet_node, BAD_CAST "indent", tmp); } @@ -605,3 +632,9 @@ finish_span (YelpManParser *parser) g_string_truncate (parser->accumulator, 0); } } + +static guint +dx_to_em_count (YelpManParser *parser, guint dx) +{ + return (int)(dx / ((float)parser->char_width)); +} diff --git a/stylesheets/man2html.xsl.in b/stylesheets/man2html.xsl.in index 45aea88..90dbc05 100644 --- a/stylesheets/man2html.xsl.in +++ b/stylesheets/man2html.xsl.in @@ -60,7 +60,7 @@ <xsl:attribute name="style"> <xsl:text>padding-left: </xsl:text> <xsl:value-of select="@indent"/> - <xsl:text>em;</xsl:text> + <xsl:text>em; font-family: monospace;</xsl:text> </xsl:attribute> <p> -- 1.7.2.3
From 8a8f391065e72e5597cbb275798ccdf9c4bb7715 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Thu, 16 Dec 2010 00:50:07 +0000 Subject: [PATCH 3/8] Add support for N and C lines. At the moment, there's a horrible hack with a hardcoded table of character names and unicode code points. Hopefully eventually this can be replaced by parsing a file or calling a program... but I don't know how yet :-( --- libyelp/yelp-man-parser.c | 232 +++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 223 insertions(+), 9 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index a02cc14..645d9fd 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -104,6 +104,15 @@ struct _YelpManParser { /* Set to TRUE if there's been a newline since the last text was * parsed. */ gboolean newline; + + /* Count the number of 'N' lines we've seen since the last h + * command. This is because for some reason N doesn't + * automatically move the position forward. Thus immediately after + * one, you see a h24 or the like. Unless there's a space. Then it + * might be wh48. This is set in parse_N (obviously) and used in + * parse_h. + */ + guint N_count; }; static gboolean parser_parse_line (YelpManParser *parser, GError **error); @@ -124,6 +133,8 @@ DECLARE_LINE_PARSER (parse_text); DECLARE_LINE_PARSER (parse_w); DECLARE_LINE_PARSER (parse_body_text); DECLARE_LINE_PARSER (parse_n); +DECLARE_LINE_PARSER (parse_N); +DECLARE_LINE_PARSER (parse_C); /* Declare a sort of alist registry of parsers for different lines. 
*/ struct LineParsePair @@ -138,6 +149,8 @@ static struct LineParsePair line_parsers[] = { { "t", parse_text }, { "w", parse_w }, { "n", parse_n }, + { "N", parse_N }, + { "C", parse_C }, { NULL, NULL } }; @@ -146,6 +159,134 @@ static struct LineParsePair line_parsers[] = { * bits) */ static void finish_span (YelpManParser *parser); static guint dx_to_em_count (YelpManParser *parser, guint dx); +static void append_nbsps (YelpManParser *parser, guint k); + +/******************************************************************************/ +/* Translations for the 'C' command. This is indeed hackish, but the + * -Tutf8 output doesn't seem to give include files so we can do this + * at runtime :-( + * + * On my machine, this data's at /usr/share/groff/current/tmac/ in + * latin1.tmac, unicode.tmac and I worked out the lq and rq from + * running man: I'm not sure where that comes from! + */ +struct StringPair +{ + const gchar *from; + gunichar to; +}; +static const struct StringPair char_translations[] = { + { "r!", 161 }, + { "ct", 162 }, + { "Po", 163 }, + { "Cs", 164 }, + { "Ye", 165 }, + { "bb", 166 }, + { "sc", 167 }, + { "ad", 168 }, + { "co", 169 }, + { "Of", 170 }, + { "Fo", 171 }, + { "tno", 172 }, + { "%", 173 }, + { "rg", 174 }, + { "a-", 175 }, + { "de", 176 }, + { "t+-", 177 }, + { "S2", 178 }, + { "S3", 179 }, + { "aa", 180 }, + { "mc", 181 }, + { "ps", 182 }, + { "pc", 183 }, + { "ac", 184 }, + { "S1", 185 }, + { "Om", 186 }, + { "Fc", 187 }, + { "14", 188 }, + { "12", 189 }, + { "34", 190 }, + { "r?", 191 }, + { "`A", 192 }, + { "'A", 193 }, + { "^A", 194 }, + { "~A", 195 }, + { ":A", 196 }, + { "oA", 197 }, + { "AE", 198 }, + { ",C", 199 }, + { "`E", 200 }, + { "'E", 201 }, + { "^E", 202 }, + { ":E", 203 }, + { "`I", 204 }, + { "'I", 205 }, + { "^I", 206 }, + { ":I", 207 }, + { "-D", 208 }, + { "~N", 209 }, + { "`O", 210 }, + { "'O", 211 }, + { "^O", 212 }, + { "~O", 213 }, + { ":O", 214 }, + { "tmu", 215 }, + { "/O", 216 }, + { "`U", 217 }, + { "'U", 
218 }, + { "^U", 219 }, + { ":U", 220 }, + { "'Y", 221 }, + { "TP", 222 }, + { "ss", 223 }, + { "`a", 224 }, + { "'a", 225 }, + { "^a", 226 }, + { "~a", 227 }, + { ":a", 228 }, + { "oa", 229 }, + { "ae", 230 }, + { ",c", 231 }, + { "`e", 232 }, + { "'e", 233 }, + { "^e", 234 }, + { ":e", 235 }, + { "`i", 236 }, + { "'i", 237 }, + { "^i", 238 }, + { ":i", 239 }, + { "Sd", 240 }, + { "~n", 241 }, + { "`o", 242 }, + { "'o", 243 }, + { "^o", 244 }, + { "~o", 245 }, + { ":o", 246 }, + { "tdi", 247 }, + { "/o", 248 }, + { "`u", 249 }, + { "'u", 250 }, + { "^u", 251 }, + { ":u", 252 }, + { "'y", 253 }, + { "Tp", 254 }, + { ":y", 255 }, + { "hy", '-' }, + { "oq", '`' }, + { "cq", '\'' }, + { "lq", 8220 }, // left smart quotes + { "rq", 8221 }, // right smart quotes + { "en", 8211 }, // en-dash + { "em", 8212 }, // em-dash + { "la", 10216 }, // left angle bracket + { "ra", 10217 }, // right angle bracket + { "rs", '\\' }, + { "<=", 8804 }, // < or equal to sign + { ">=", 8805 }, // > or equal to sign + { "aq", '\'' }, + { "tm", 8482 }, // trademark symbol + { NULL, 0 } +}; /******************************************************************************/ @@ -170,9 +311,9 @@ get_troff (gchar *path, GError **error) { gint stdout; GError *err = NULL; - gchar *argv[] = { "man", "-Z", "-Tutf8", NULL, NULL }; + gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", NULL, NULL }; - argv[3] = path; + argv[4] = path; if (!g_spawn_async_with_pipes (NULL, argv, NULL, G_SPAWN_SEARCH_PATH, NULL, NULL, @@ -374,7 +515,7 @@ static gboolean parse_h (YelpManParser *parser, GError **error) { guint dx; - guint k; + int k; const gchar *str; if (SSCANF ("h%u", 1, &dx)) { @@ -396,12 +537,11 @@ parse_h (YelpManParser *parser, GError **error) (str[0] != '\0') && (str[strlen (str)-1] != ' ')) { - dx = dx_to_em_count (parser, dx); - for (k=0; k<dx; k++) { - /* 0xc2 0xa0 is nonbreaking space in utf8 */ - g_string_append_c (parser->accumulator, 0xc2); - g_string_append_c (parser->accumulator, 0xa0); - } + k 
= dx_to_em_count (parser, dx) - parser->N_count; + parser->N_count = 0; + if (k < 0) k = 0; + + append_nbsps (parser, k); } return TRUE; @@ -638,3 +778,77 @@ dx_to_em_count (YelpManParser *parser, guint dx) { return (int)(dx / ((float)parser->char_width)); } + +static gboolean +parse_N (YelpManParser *parser, GError **error) +{ + gint n; + if (SSCANF ("N%i", 1, &n)) { + RAISE_PARSE_ERROR ("Strange format for N line: %s"); + } + if (n > 127) { + RAISE_PARSE_ERROR ("N line has non-7-bit character: %s"); + } + if (n < -200) { + RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s"); + } + + if (n < 0) { + append_nbsps (parser, -n); + parser->N_count += -n; + } + else { + g_string_append_c (parser->accumulator, (gchar)n); + parser->N_count++; + } + + return TRUE; +} + +static void +append_nbsps (YelpManParser *parser, guint k) +{ + for (; k > 0; k--) { + /* 0xc2 0xa0 is nonbreaking space in utf8 */ + g_string_append_c (parser->accumulator, 0xc2); + g_string_append_c (parser->accumulator, 0xa0); + } +} + +static gboolean +parse_C (YelpManParser *parser, GError **error) +{ + gchar name[16]; + gunichar code = 0; + guint k; + gint len; + + if (SSCANF ("C%16s", 1, name)) { + RAISE_PARSE_ERROR ("Can't understand special character: %s"); + } + + for (k=0; char_translations[k].from; k++) { + if (g_str_equal (char_translations[k].from, name)) { + code = char_translations[k].to; + break; + } + } + if (sscanf (name, "u%x", &k) == 1) { + code = k; + } + + if (!code) { + g_warning ("Couldn't parse troff special character: '%s'", + name); + code = 65533; /* Unicode replacement character */ + } + + /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */ + len = g_unichar_to_utf8 (code, name); + name[len] = '\0'; + g_string_append (parser->accumulator, name); + + parser->N_count++; + + return TRUE; +} -- 1.7.2.3
From f6bc84033785d1cc567e1c808b5bce82dafbe1c3 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Thu, 16 Dec 2010 00:32:54 +0000 Subject: [PATCH 4/8] Spacing changes to fix horizontal spacing for perl(1) and man(1). --- libyelp/yelp-man-parser.c | 122 +++++++++++++++++++++++++++---------------- stylesheets/man2html.xsl.in | 18 ++---- 2 files changed, 82 insertions(+), 58 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index 645d9fd..03507ac 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -73,7 +73,7 @@ struct _YelpManParser { guint char_height; /* Count the number of lines we've parsed (needed to get prologue) */ - guint lines_parsed; + guint line_no; /* The x f k name command sets the k'th register to be name. */ gchar* font_registers[MAN_FONTS]; @@ -113,6 +113,13 @@ struct _YelpManParser { * parse_h. */ guint N_count; + + /* Keep track of whether the last character was a space. We can't + * just do this by looking at the last char of accumulator, + * because if there's a font change, it gets zeroed. This gets set + * to TRUE by parse_w and is FALSE the rest of the time. + */ + gboolean last_char_was_space; }; static gboolean parser_parse_line (YelpManParser *parser, GError **error); @@ -160,6 +167,7 @@ static struct LineParsePair line_parsers[] = { static void finish_span (YelpManParser *parser); static guint dx_to_em_count (YelpManParser *parser, guint dx); static void append_nbsps (YelpManParser *parser, guint k); +static void deal_with_newlines (YelpManParser *parser); /******************************************************************************/ /* Translations for the 'C' command. 
This is indeed hackish, but the @@ -358,6 +366,7 @@ yelp_man_parser_parse_file (YelpManParser *parser, NULL, NULL); if (parser->buffer == NULL) break; + parser->line_no++; ret = parser_parse_line (parser, error); g_free (parser->buffer); @@ -440,7 +449,7 @@ get_font (const YelpManParser *parser) static gboolean parser_parse_line (YelpManParser *parser, GError **error) { - if (parser->lines_parsed < 3) + if (parser->line_no <= 3) return parse_prologue_line (parser, error); const struct LineParsePair *p = line_parsers; @@ -456,8 +465,7 @@ parser_parse_line (YelpManParser *parser, GError **error) static gboolean parse_prologue_line (YelpManParser *parser, GError **error) { - parser->lines_parsed++; - if (parser->lines_parsed != 2) return TRUE; + if (parser->line_no != 2) return TRUE; /* This is the interesting line, which should look like x res 240 24 40 @@ -516,7 +524,6 @@ parse_h (YelpManParser *parser, GError **error) { guint dx; int k; - const gchar *str; if (SSCANF ("h%u", 1, &dx)) { RAISE_PARSE_ERROR ("Invalid h line from troff: %s"); @@ -529,21 +536,29 @@ parse_h (YelpManParser *parser, GError **error) * insert nonbreaking spaces (eugh!) * * Of course we don't want to do this when chained from wh24 or - * whatever, so check that accumulator is nonempty and the last - * character isn't ' '. + * whatever, so use the last_char_was_space flag + * but... unfortunately some documents actually use stuff like + * wh96 for spacing (eg the lists in perl(1)). So (very hackish!), + * ignore double spaces, since that's probably just been put in to + * make the text justified (eugh), but allow bigger jumps. + * + * Incidentally, the perl manual here has bizarre gaps in the + * synopsis section. God knows why, but man displays them too so + * it's not our fault! 
:-) */ - str = parser->accumulator->str; + k = dx_to_em_count (parser, dx); + if ((parser->sheet_node) && - (str[0] != '\0') && - (str[strlen (str)-1] != ' ')) { + ((!parser->last_char_was_space) || (k > 2))) { - k = dx_to_em_count (parser, dx) - parser->N_count; - parser->N_count = 0; + k -= parser->N_count; if (k < 0) k = 0; append_nbsps (parser, k); } + parser->N_count = 0; + return TRUE; } @@ -649,7 +664,10 @@ parse_w (YelpManParser *parser, GError **error) } parser->buffer++; + parser->last_char_was_space = TRUE; + ret = parser_parse_line (parser, error); + parser->buffer--; return ret; } @@ -657,8 +675,6 @@ parse_w (YelpManParser *parser, GError **error) static gboolean parse_body_text (YelpManParser *parser, GError **error) { - gchar tmp[64]; - /* It's this function which is responsible for trying to get *some* semantic information back out of the manual page. @@ -682,44 +698,16 @@ parse_body_text (YelpManParser *parser, GError **error) xmlAddChild (xmlDocGetRootElement (parser->doc), xmlNewNode (NULL, BAD_CAST "section")); } - if (parser->section_state == SECTION_TITLE) goto do_append; - /* - Here we've got real body text! If newline is true, this is the - first word on a line. - - In which case, we check to see whether hpos agrees with the - current sheet's indent. If so (or if there isn't a sheet yet!), - we just add to the accumulator. If not, start a new sheet with - the correct indent. + if (parser->section_state != SECTION_TITLE) + deal_with_newlines (parser); - If we aren't the first word on the line, just add to the - accumulator. - */ - if ((!parser->sheet_node) || - (parser->newline && (parser->hpos != parser->sheet_indent))) { - /* We don't need to worry about finishing the current sheet, - since the accumulator etc. get cleared on newlines and we - know we're at the start of a line. 
- */ - parser->sheet_node = - xmlAddChild (parser->section_node, - xmlNewNode (NULL, BAD_CAST "sheet")); - parser->sheet_indent = parser->hpos; - - /* The indent is specified in em's. */ - snprintf (tmp, 64, "%d", - (int)(dx_to_em_count (parser, parser->hpos) / 1.5)); - xmlNewProp (parser->sheet_node, BAD_CAST "indent", tmp); - } - - do_append: g_string_append (parser->accumulator, parser->buffer+1); /* Move hpos forward per char */ parser->hpos += strlen (parser->buffer+1) * parser->char_width; - - parser->newline = FALSE; + parser->last_char_was_space = FALSE; + parser->N_count = 0; return TRUE; } @@ -755,6 +743,7 @@ parse_n (YelpManParser *parser, GError **error) } parser->newline = TRUE; + parser->last_char_was_space = FALSE; return TRUE; } @@ -793,6 +782,9 @@ parse_N (YelpManParser *parser, GError **error) RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s"); } + deal_with_newlines (parser); + parser->last_char_was_space = FALSE; + if (n < 0) { append_nbsps (parser, -n); parser->N_count += -n; @@ -843,6 +835,9 @@ parse_C (YelpManParser *parser, GError **error) code = 65533; /* Unicode replacement character */ } + deal_with_newlines (parser); + parser->last_char_was_space = FALSE; + /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */ len = g_unichar_to_utf8 (code, name); name[len] = '\0'; @@ -852,3 +847,38 @@ parse_C (YelpManParser *parser, GError **error) return TRUE; } + +static void +deal_with_newlines (YelpManParser *parser) +{ + /* + If newline is true, this is the first word on a line. + + In which case, we check to see whether hpos agrees with the + current sheet's indent. If so (or if there isn't a sheet yet!), + we just add to the accumulator. If not, start a new sheet with + the correct indent. + + If we aren't the first word on the line, just add to the + accumulator. 
+ */ + gchar tmp[64]; + + if ((!parser->sheet_node) || + (parser->newline && (parser->hpos != parser->sheet_indent))) { + /* We don't need to worry about finishing the current sheet, + since the accumulator etc. get cleared on newlines and we + know we're at the start of a line. + */ + parser->sheet_node = + xmlAddChild (parser->section_node, + xmlNewNode (NULL, BAD_CAST "sheet")); + parser->sheet_indent = parser->hpos; + } + + if (parser->newline) { + append_nbsps (parser, dx_to_em_count (parser, parser->hpos)); + } + + parser->newline = FALSE; +} diff --git a/stylesheets/man2html.xsl.in b/stylesheets/man2html.xsl.in index 90dbc05..14a26c9 100644 --- a/stylesheets/man2html.xsl.in +++ b/stylesheets/man2html.xsl.in @@ -51,22 +51,16 @@ <xsl:value-of select="title"/> </h2> - <xsl:apply-templates select="sheet"/> + <div class="section-contents" style="font-family: monospace;"> + <xsl:apply-templates select="sheet"/> + </div> </div> </xsl:template> <xsl:template match="sheet"> - <xsl:element name="div"> - <xsl:attribute name="style"> - <xsl:text>padding-left: </xsl:text> - <xsl:value-of select="@indent"/> - <xsl:text>em; font-family: monospace;</xsl:text> - </xsl:attribute> - - <p> - <xsl:apply-templates select="span|br"/> - </p> - </xsl:element> + <div style="margin: 0px;"> + <p><xsl:apply-templates select="span|br"/></p> + </div> </xsl:template> <xsl:template match="span"> -- 1.7.2.3
From b121126fdc44835b6955bc56ba08b7bd687b3d89 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Thu, 16 Dec 2010 00:40:12 +0000 Subject: [PATCH 5/8] Draw vertical jumps correctly. This allows us to work out when there should be a wider gap between two paragraphs, for instance. --- libyelp/yelp-man-parser.c | 68 +++++++++++++++++++++++++++++++++++++----- stylesheets/man2html.xsl.in | 8 ++++- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index 03507ac..a39227a 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -120,6 +120,11 @@ struct _YelpManParser { * to TRUE by parse_w and is FALSE the rest of the time. */ gboolean last_char_was_space; + + /* Keep track of the size of the last vertical jump - used to tell + * whether we need to insert extra space above a line. + */ + gint last_vertical_jump; }; static gboolean parser_parse_line (YelpManParser *parser, GError **error); @@ -142,6 +147,7 @@ DECLARE_LINE_PARSER (parse_body_text); DECLARE_LINE_PARSER (parse_n); DECLARE_LINE_PARSER (parse_N); DECLARE_LINE_PARSER (parse_C); +DECLARE_LINE_PARSER (parse_p); /* Declare a sort of alist registry of parsers for different lines. */ struct LineParsePair @@ -158,6 +164,7 @@ static struct LineParsePair line_parsers[] = { { "n", parse_n }, { "N", parse_N }, { "C", parse_C }, + { "p", parse_p }, { NULL, NULL } }; @@ -168,6 +175,7 @@ static void finish_span (YelpManParser *parser); static guint dx_to_em_count (YelpManParser *parser, guint dx); static void append_nbsps (YelpManParser *parser, guint k); static void deal_with_newlines (YelpManParser *parser); +static void new_sheet (YelpManParser *parser); /******************************************************************************/ /* Translations for the 'C' command. 
This is indeed hackish, but the @@ -515,6 +523,7 @@ parse_v (YelpManParser *parser, GError **error) if (SSCANF ("v%u", 1, &dy)) { RAISE_PARSE_ERROR ("Invalid v line from troff: %s"); } + parser->last_vertical_jump += dy; parser->vpos += dy; return TRUE; } @@ -569,6 +578,7 @@ parse_V (YelpManParser *parser, GError **error) if (SSCANF ("V%u", 1, &y)) { RAISE_PARSE_ERROR ("Invalid V line from troff: %s"); } + parser->last_vertical_jump += y - parser->vpos; parser->vpos = y; return TRUE; } @@ -863,22 +873,64 @@ deal_with_newlines (YelpManParser *parser) accumulator. */ gchar tmp[64]; + guint jump_lines; + gboolean made_sheet = FALSE, dont_jump = FALSE; + + /* This only happens at the start of a section, where there's + already a gap + */ + if (!parser->sheet_node) { + dont_jump = TRUE; + } if ((!parser->sheet_node) || (parser->newline && (parser->hpos != parser->sheet_indent))) { - /* We don't need to worry about finishing the current sheet, - since the accumulator etc. get cleared on newlines and we - know we're at the start of a line. 
- */ - parser->sheet_node = - xmlAddChild (parser->section_node, - xmlNewNode (NULL, BAD_CAST "sheet")); - parser->sheet_indent = parser->hpos; + new_sheet (parser); + made_sheet = TRUE; } if (parser->newline) { append_nbsps (parser, dx_to_em_count (parser, parser->hpos)); + + if ((parser->last_vertical_jump > 0) && (!dont_jump)) { + jump_lines = + parser->last_vertical_jump/parser->char_height; + } else { + jump_lines = 1; + } + + if (jump_lines > 1) { + if (!made_sheet) new_sheet (parser); + made_sheet = TRUE; + } + + if (made_sheet) { + snprintf (tmp, 64, "%u", jump_lines-1); + xmlNewProp (parser->sheet_node, BAD_CAST "jump", tmp); + } } parser->newline = FALSE; + parser->last_vertical_jump = 0; +} + +static gboolean +parse_p (YelpManParser *parser, GError **error) +{ + parser->vpos = 0; + parser->hpos = 0; + return TRUE; +} + +static void +new_sheet (YelpManParser *parser) +{ + /* We don't need to worry about finishing the current sheet, + since the accumulator etc. get cleared on newlines and we + know we're at the start of a line. + */ + parser->sheet_node = + xmlAddChild (parser->section_node, + xmlNewNode (NULL, BAD_CAST "sheet")); + parser->sheet_indent = parser->hpos; } diff --git a/stylesheets/man2html.xsl.in b/stylesheets/man2html.xsl.in index 14a26c9..5b3cd59 100644 --- a/stylesheets/man2html.xsl.in +++ b/stylesheets/man2html.xsl.in @@ -58,9 +58,13 @@ </xsl:template> <xsl:template match="sheet"> - <div style="margin: 0px;"> + <xsl:element name="div"> + <xsl:attribute name="style"> + margin-bottom: 0px; + margin-top: <xsl:value-of select="@jump"/>em; + </xsl:attribute> <p><xsl:apply-templates select="span|br"/></p> - </div> + </xsl:element> </xsl:template> <xsl:template match="span"> -- 1.7.2.3
From 029a552af8544f919f80a8722a813531891416bc Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Wed, 15 Dec 2010 00:47:46 +0000 Subject: [PATCH 6/8] Correctly deal with titles with spaces. This works harder to understand the first line of output: the previous code didn't work if there was a space (or hyphen or anything else) in the command name. --- libyelp/yelp-man-parser.c | 159 +++++++++++++++++++++++++++++++++------------ 1 files changed, 116 insertions(+), 43 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index a39227a..350dc81 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -125,6 +125,9 @@ struct _YelpManParser { * whether we need to insert extra space above a line. */ gint last_vertical_jump; + + /* The title we read earlier (eg 'Foo(2)') */ + gchar *title_str; }; static gboolean parser_parse_line (YelpManParser *parser, GError **error); @@ -176,6 +179,9 @@ static guint dx_to_em_count (YelpManParser *parser, guint dx); static void append_nbsps (YelpManParser *parser, guint k); static void deal_with_newlines (YelpManParser *parser); static void new_sheet (YelpManParser *parser); +static void register_title (YelpManParser *parser, + const gchar* name, const gchar* section); +static void right_truncate_common (gchar *dst, const gchar *src); /******************************************************************************/ /* Translations for the 'C' command. This is indeed hackish, but the @@ -400,6 +406,7 @@ yelp_man_parser_free (YelpManParser *parser) g_free (parser->font_registers[k]); } g_string_free (parser->accumulator, TRUE); + g_free (parser->title_str); g_free (parser); } @@ -598,65 +605,68 @@ static gboolean parse_text (YelpManParser *parser, GError **error) { gchar *text, *section, *tmp; - xmlNodePtr node; + const gchar *acc; g_assert (parser->buffer[0] == 't'); if (parser->state == START) { - /* With a bit of luck, this will be the tBLAH(1) line. 
Can't - * use sscanf to chop it up since that needs whitespace. */ - section = strchr (parser->buffer + 1, '('); - if (!section) - RAISE_PARSE_ERROR ("Expected t line with title. Got %s"); - text = g_strndup (parser->buffer + 1, - section - (parser->buffer + 1)); + /* This should be the 'Title String(1)' line. It might come in + * chunks (for example, it might be more than one line + * long!). So just read bits until we get a (blah) bit: stick + * everything in the accumulator and check for + * parentheses. When we've got some, stick the parsed title in + * the header and switch to HAVE_TITLE. + * + * The parse_n code will error out if we didn't manage to get + * a title before the first newline and otherwise is in charge + * of switching to body-parsing mode. + */ + g_string_append (parser->accumulator, parser->buffer+1); - // Skip over the ( - section++; + acc = parser->accumulator->str; - tmp = strchr (section, ')'); - if (!tmp || (*(tmp+1) != '\0')) - RAISE_PARSE_ERROR ("Strange format for t title line: %s"); - section = g_strndup (section, tmp - section); + section = strchr (acc, '('); - parser->state = HAVE_TITLE; + if (section) { + section++; + tmp = strchr (section, ')'); + } - xmlNewTextChild (parser->header, - NULL, BAD_CAST "title", text); - xmlNewTextChild (parser->header, - NULL, BAD_CAST "section", section); + if (section && tmp) { + /* We've got 'Blah (3)' or the like in the accumulator */ + if (*(tmp+1) != '\0') { + RAISE_PARSE_ERROR ("Don't understand title line: '%s'"); + } + parser->state = HAVE_TITLE; + parser->title_str = g_strdup (acc); - g_free (text); - g_free (section); + text = g_strndup (acc, (section - 1) - acc); + section = g_strndup (section, tmp - section); - /* The accumulator should currently be "". */ - g_assert (parser->accumulator && - *(parser->accumulator->str) == '\0'); + register_title (parser, text, section); - return TRUE; - } - if (parser->state == HAVE_TITLE) { - /* We expect (maybe!) 
to get some lines tThe wh24 - * tCollection. We've found (and can ignore!) the second - * title line if there's a (). */ - if (strchr (parser->buffer+1, '(') && - strchr (parser->buffer+1, ')')) { - parser->state = BODY; - - xmlNewTextChild (parser->header, - NULL, BAD_CAST "collection", - parser->accumulator->str); g_string_truncate (parser->accumulator, 0); - return TRUE; + g_free (text); + g_free (section); } - g_string_append (parser->accumulator, parser->buffer+1); - return TRUE; } - return parse_body_text (parser, error); + if (parser->state == BODY) + return parse_body_text (parser, error); + + /* In state HAVE_TITLE */ + else { + /* We expect (maybe!) to get some lines in between the two + * occurrences of the title itself. So collect up all the text + * we get and then we'll remove the copy of the title at the + * end (hopefully) when we find a newline in parse_n. + */ + g_string_append (parser->accumulator, parser->buffer+1); + return TRUE; + } } /* @@ -727,8 +737,40 @@ parse_n (YelpManParser *parser, GError **error) { xmlNodePtr node; - /* Don't care about newlines in the header bit */ - if (parser->state != BODY) return TRUE; + /* When we're in the header, the parse_n is responsible for + * switching to body text. (See the body of parse_text() for more + * of an explanation). + */ + if (parser->state == START) { + /* Oh no! We've not got a proper title yet! Ho hum, let's + stick whatever's going into a 'title title' and have a null + section. Sob. + */ + register_title (parser, + parser->accumulator->str, + "unknown section"); + g_string_truncate (parser->accumulator, 0); + parser->state = BODY; + return TRUE; + } + + if (parser->state == HAVE_TITLE) { + /* What we've got so far is the manual's collection, followed + by the title again. So we want to get rid of the latter if + possible... 
+ */ + right_truncate_common (parser->accumulator->str, + parser->title_str); + + xmlNewTextChild (parser->header, + NULL, BAD_CAST "collection", + parser->accumulator->str); + g_string_truncate (parser->accumulator, 0); + parser->state = BODY; + return TRUE; + } + + /* parser->state == BODY */ if (parser->section_state == SECTION_TITLE) { g_strchomp (parser->accumulator->str); @@ -934,3 +976,34 @@ new_sheet (YelpManParser *parser) xmlNewNode (NULL, BAD_CAST "sheet")); parser->sheet_indent = parser->hpos; } + +static void +register_title (YelpManParser *parser, + const gchar* name, const gchar* section) +{ + xmlNewTextChild (parser->header, + NULL, BAD_CAST "title", name); + xmlNewTextChild (parser->header, + NULL, BAD_CAST "section", section); +} + +static void +right_truncate_common (gchar *dst, const gchar *src) +{ + guint len_src = strlen (src); + guint len_dst = strlen (dst); + + guint k = (len_src < len_dst) ? len_src - 1 : len_dst - 1; + + dst += len_dst - 1; + src += len_src - 1; + + while (k >= 0) { + if (*dst != *src) break; + *dst = '\0'; + + k--; + dst--; + src--; + } +} -- 1.7.2.3
From e9b51f47d45f603df6661e4e8157fa3c1c24b0b4 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Thu, 16 Dec 2010 00:46:23 +0000 Subject: [PATCH 7/8] Treat 'C' and 'N' lines more carefully. Yay! Russian works properly at last! --- libyelp/yelp-man-parser.c | 121 +++++++++++++++++++++++++++++--------------- 1 files changed, 80 insertions(+), 41 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index 350dc81..a821d3a 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -182,6 +182,10 @@ static void new_sheet (YelpManParser *parser); static void register_title (YelpManParser *parser, const gchar* name, const gchar* section); static void right_truncate_common (gchar *dst, const gchar *src); +static gboolean cheeky_call_parse_line (YelpManParser *parser, + GError **error, + gchar first_char, + const gchar *text); /******************************************************************************/ /* Translations for the 'C' command. This is indeed hackish, but the @@ -607,7 +611,13 @@ parse_text (YelpManParser *parser, GError **error) gchar *text, *section, *tmp; const gchar *acc; - g_assert (parser->buffer[0] == 't'); + /* + Sneakily, this might get called with something other than t + starting the buffer: see parse_C and parse_N. + */ + if (parser->buffer[0] == 't') { + parser->N_count = 0; + } if (parser->state == START) { /* This should be the 'Title String(1)' line. It might come in @@ -669,29 +679,6 @@ parse_text (YelpManParser *parser, GError **error) } } -/* - w is a sort of prefix argument. It indicates a space, so we register - that here, then call parser_parse_line again on the rest of the - string to deal with that. 
- */ -static gboolean -parse_w (YelpManParser *parser, GError **error) -{ - gboolean ret; - - if (parser->state != START) { - g_string_append_c (parser->accumulator, ' '); - } - - parser->buffer++; - parser->last_char_was_space = TRUE; - - ret = parser_parse_line (parser, error); - - parser->buffer--; - return ret; -} - static gboolean parse_body_text (YelpManParser *parser, GError **error) { @@ -708,7 +695,8 @@ parse_body_text (YelpManParser *parser, GError **error) It's possible to have spaces in section titles, so we carry on accumulating the section title until the next newline. */ - if (parser->section_state != SECTION_TITLE && parser->hpos == 0) { + if (parser->section_state == SECTION_BODY && + (!parser->section_node || (parser->hpos == 0))) { g_string_truncate (parser->accumulator, 0); /* End the current sheet & section */ parser->section_state = SECTION_TITLE; @@ -726,12 +714,35 @@ parse_body_text (YelpManParser *parser, GError **error) /* Move hpos forward per char */ parser->hpos += strlen (parser->buffer+1) * parser->char_width; + parser->last_char_was_space = FALSE; - parser->N_count = 0; return TRUE; } +/* + w is a sort of prefix argument. It indicates a space, so we register + that here, then call parser_parse_line again on the rest of the + string to deal with that. 
+ */ +static gboolean +parse_w (YelpManParser *parser, GError **error) +{ + gboolean ret; + + if (parser->state != START) { + g_string_append_c (parser->accumulator, ' '); + } + + parser->buffer++; + parser->last_char_was_space = TRUE; + + ret = parser_parse_line (parser, error); + + parser->buffer--; + return ret; +} + static gboolean parse_n (YelpManParser *parser, GError **error) { @@ -767,12 +778,13 @@ parse_n (YelpManParser *parser, GError **error) parser->accumulator->str); g_string_truncate (parser->accumulator, 0); parser->state = BODY; + parser->section_state = SECTION_BODY; return TRUE; } /* parser->state == BODY */ - if (parser->section_state == SECTION_TITLE) { + g_strchomp (parser->accumulator->str); xmlNewTextChild (parser->section_node, NULL, BAD_CAST "title", parser->accumulator->str); @@ -824,6 +836,8 @@ static gboolean parse_N (YelpManParser *parser, GError **error) { gint n; + gchar tmp[2]; + if (SSCANF ("N%i", 1, &n)) { RAISE_PARSE_ERROR ("Strange format for N line: %s"); } @@ -840,13 +854,15 @@ parse_N (YelpManParser *parser, GError **error) if (n < 0) { append_nbsps (parser, -n); parser->N_count += -n; - } - else { - g_string_append_c (parser->accumulator, (gchar)n); - parser->N_count++; + return TRUE; } - return TRUE; + parser->N_count++; + + tmp[0] = (gchar)n; + tmp[1] = '\0'; + + return cheeky_call_parse_line (parser, error, 'N', tmp); } static void @@ -887,17 +903,13 @@ parse_C (YelpManParser *parser, GError **error) code = 65533; /* Unicode replacement character */ } - deal_with_newlines (parser); - parser->last_char_was_space = FALSE; - /* Output buffer must be length >= 6. 16 >= 6, so we're ok. 
*/ len = g_unichar_to_utf8 (code, name); name[len] = '\0'; - g_string_append (parser->accumulator, name); parser->N_count++; - return TRUE; + return cheeky_call_parse_line (parser, error, 'C', name); } static void @@ -967,10 +979,10 @@ parse_p (YelpManParser *parser, GError **error) static void new_sheet (YelpManParser *parser) { - /* We don't need to worry about finishing the current sheet, - since the accumulator etc. get cleared on newlines and we - know we're at the start of a line. - */ + /* We don't need to worry about finishing the current sheet, + since the accumulator etc. get cleared on newlines and we + know we're at the start of a line. + */ parser->sheet_node = xmlAddChild (parser->section_node, xmlNewNode (NULL, BAD_CAST "sheet")); @@ -1007,3 +1019,30 @@ right_truncate_common (gchar *dst, const gchar *src) src--; } } + +static gboolean +cheeky_call_parse_line (YelpManParser *parser, GError **error, + gchar first_char, const gchar* text) +{ + /* Do a cunning trick. There's all sorts of code that parse_text + * does, which we don't want to duplicate in parse_N and + * parse_C. So feed a buffer back to parse_text. Tada! Start it + * with "C" or "N" rather than "t" so clever stuff in parse_text + * can tell the difference. + */ + gchar *tmp; + gboolean ret; + guint len = strlen (text); + + tmp = parser->buffer; + parser->buffer = g_new (gchar, 2 + len); + parser->buffer[0] = first_char; + strncpy (parser->buffer + 1, text, len + 1); + + ret = parse_text (parser, error); + + g_free (parser->buffer); + parser->buffer = tmp; + + return ret; +} -- 1.7.2.3
From 24de7cb75540686fab3fd88c7cc4fd1903348936 Mon Sep 17 00:00:00 2001 From: Rupert Swarbrick <rswarbrick gmail com> Date: Wed, 15 Dec 2010 22:42:56 +0000 Subject: [PATCH 8/8] Deal correctly with the last line of output. Since we're parsing the whole thing in one pass, we can't know this is the last line until we're done, so we go through at the end and check whether there's an empty section. If so, we know what the title should look like, so we extract the information and put it somewhere more useful. --- libyelp/yelp-man-parser.c | 163 +++++++++++++++++++++++++++++++++++++++++-- stylesheets/man2html.xsl.in | 13 +++- 2 files changed, 169 insertions(+), 7 deletions(-) diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index a821d3a..68eac81 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -186,6 +186,9 @@ static gboolean cheeky_call_parse_line (YelpManParser *parser, GError **error, gchar first_char, const gchar *text); +static void cleanup_parsed_page (YelpManParser *parser); +static gboolean parse_last_line (YelpManParser *parser, gchar* line); +static void unicode_strstrip (gchar *str); /******************************************************************************/ /* Translations for the 'C' command. 
This is indeed hackish, but the @@ -396,6 +399,8 @@ yelp_man_parser_parse_file (YelpManParser *parser, } } + cleanup_parsed_page (parser); + g_object_unref (parser->stream); return parser->doc; @@ -568,8 +573,7 @@ parse_h (YelpManParser *parser, GError **error) */ k = dx_to_em_count (parser, dx); - if ((parser->sheet_node) && - ((!parser->last_char_was_space) || (k > 2))) { + if ((!parser->last_char_was_space) || (k > 2)) { k -= parser->N_count; if (k < 0) k = 0; @@ -707,8 +711,9 @@ parse_body_text (YelpManParser *parser, GError **error) xmlNewNode (NULL, BAD_CAST "section")); } - if (parser->section_state != SECTION_TITLE) + if (parser->section_state != SECTION_TITLE) { deal_with_newlines (parser); + } g_string_append (parser->accumulator, parser->buffer+1); @@ -772,6 +777,7 @@ parse_n (YelpManParser *parser, GError **error) */ right_truncate_common (parser->accumulator->str, parser->title_str); + unicode_strstrip (parser->accumulator->str); xmlNewTextChild (parser->header, NULL, BAD_CAST "collection", @@ -848,9 +854,6 @@ parse_N (YelpManParser *parser, GError **error) RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s"); } - deal_with_newlines (parser); - parser->last_char_was_space = FALSE; - if (n < 0) { append_nbsps (parser, -n); parser->N_count += -n; @@ -1046,3 +1049,151 @@ cheeky_call_parse_line (YelpManParser *parser, GError **error, return ret; } + +static void +cleanup_parsed_page (YelpManParser *parser) +{ + /* First job: the last line usually has the version, date and + * title (again!). The code above misunderstands and parses this + * as a section, so we need to "undo" this and stick the data in + * the header where it belongs. + * + * parser->section_node should still point to it. 
We assume this + * has happened if it has exactly one child element (the <title> + * tag) + */ + gchar *lastline; + + if (xmlChildElementCount (parser->section_node) == 1) { + lastline = xmlNodeGetContent (parser->section_node); + + /* If parse_last_line works, it sets the data from it in the + <header> tag, so delete the final section. */ + if (parse_last_line (parser, lastline)) { + xmlUnlinkNode (parser->section_node); + xmlFreeNode (parser->section_node); + } + else { + /* Oh dear. This would be unexpected and doesn't seem to + happen with man on my system. But we probably shouldn't + ditch the info, so let's leave the <section> tag and + print a warning message to the console. + */ + g_warning ("Unexpected final line in man document (%s)\n", + lastline); + } + + xmlFree (lastline); + } +} + +static gchar * +skip_whitespace (gchar *text) +{ + while (g_unichar_isspace (g_utf8_get_char (text))) { + text = g_utf8_next_char (text); + } + return text; +} + +static gchar * +last_non_whitespace (gchar *text) +{ + gchar *end = text + strlen(text); + gchar *prev; + + prev = g_utf8_find_prev_char (text, end); + if (!prev) { + /* The string must have been zero-length. */ + return NULL; + } + + while (g_unichar_isspace (g_utf8_get_char (prev))) { + end = prev; + prev = g_utf8_find_prev_char (text, prev); + if (!prev) return NULL; + } + return end; +} + +static gchar * +find_contiguous_whitespace (gchar *text, guint ws_len) +{ + guint counter = 0; + gchar *ws_start; + while (*text) { + if (g_unichar_isspace (g_utf8_get_char (text))) { + if (!counter) ws_start = text; + counter++; + } + else counter = 0; + + if (counter == ws_len) return ws_start; + + text = g_utf8_next_char (text); + } + return NULL; +} + +static gboolean +parse_last_line (YelpManParser *parser, gchar* line) +{ + /* We expect a line of the form + '1.2.3 blah 2009 libfoo(1)' + where the spaces are all nbsp's. + + Look for a gap of at least 3 in a row. 
If we find that, expand + either side and declare the stuff before to be the version + number and then the stuff afterwards to be the start of the + date. Then do the same thing on the next gap, if there is one. + */ + gchar *gap, *date_start; + + gchar *version; + gchar *date; + + gap = find_contiguous_whitespace (line, 3); + if (!gap) return FALSE; + + version = g_strndup (line, gap - line); + + date_start = skip_whitespace (gap); + + gap = find_contiguous_whitespace (date_start, 3); + if (!gap) return FALSE; + + date = g_strndup (date_start, gap - date_start); + + xmlNewProp (parser->header, BAD_CAST "version", version); + xmlNewProp (parser->header, BAD_CAST "date", date); + + g_free (version); + g_free (date); + + return TRUE; +} + +/* This should work like g_strstrip, but that's an ASCII-only version + * and I want to strip the nbsp's that I so thoughtfully plaster + * stuff with... + */ +static void +unicode_strstrip (gchar *str) +{ + gchar *start, *end; + + if (str == NULL) return; + + end = last_non_whitespace (str); + + if (!end) { + /* String is zero-length or entirely whitespace */ + *str = '\0'; + return; + } + start = skip_whitespace (str); + g_utf8_next_char (end); + + g_memmove (str, start, end - start); + *(str + (end - start)) = '\0'; +} diff --git a/stylesheets/man2html.xsl.in b/stylesheets/man2html.xsl.in index 5b3cd59..cc97e8a 100644 --- a/stylesheets/man2html.xsl.in +++ b/stylesheets/man2html.xsl.in @@ -31,13 +31,24 @@ <div class="hgroup"> <h1 class="title"> <xsl:value-of select="title"/> - <xsl:text>(</xsl:text> + <xsl:text> (</xsl:text> <xsl:value-of select="section"/> <xsl:text>)</xsl:text> </h1> <h3 style="text-align: right;"> <xsl:value-of select="collection"/> </h3> + <xsl:if test="@version or @date"> + <p style="text-align: right"> + <xsl:if test="@version"> + Version: <xsl:value-of select="@version"/> + </xsl:if> + <xsl:if test="@version and @date"><br/></xsl:if> + <xsl:if test="@date"> + Date: <xsl:value-of select="@date"/> + 
</xsl:if> + </p> + </xsl:if> </div> </xsl:template> -- 1.7.2.3
Attachment:
pgpBNNqDhCcbH.pgp
Description: PGP signature