Hi, I've spent a few hours staring at the bug https://bugzilla.gnome.org/show_bug.cgi?id=477788 and thought I could hack together something based on the groff intermediate format. And I've attached the resulting patch. Also, I've attached the new version of yelp-man-parser.c, since the changes are massive enough to make the diff pretty unhelpful. I'd love to hear feedback. It's not ready for merging yet, since there's a complete hack for named character lines (the hack works for English and German, but not for Czech etc.). But already, it looks better than yelp master on man:man and man:perl etc. Have a play and let me know what you think! Rupert P.S. Oh, and one more reason: rupert hake:/git/yelp git diff --stat master libyelp/yelp-man-document.c | 6 +- libyelp/yelp-man-parser.c | 2322 +++++++++++++------------------------------ libyelp/yelp-man-parser.h | 4 +- stylesheets/man2html.xsl.in | 366 +------ 4 files changed, 766 insertions(+), 1932 deletions(-)
diff --git a/libyelp/yelp-man-document.c b/libyelp/yelp-man-document.c index 14ac8cd..4fac05a 100644 --- a/libyelp/yelp-man-document.c +++ b/libyelp/yelp-man-document.c @@ -436,14 +436,10 @@ man_document_process (YelpManDocument *man) } parser = yelp_man_parser_new (); - priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, encoding); + priv->xmldoc = yelp_man_parser_parse_file (parser, filepath, &error); yelp_man_parser_free (parser); if (priv->xmldoc == NULL) { - error = g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, - _("The file ‘%s’ could not be parsed because it is" - " not a well-formed man page."), - filepath); yelp_document_error_pending ((YelpDocument *) man, error); } diff --git a/libyelp/yelp-man-parser.c b/libyelp/yelp-man-parser.c index 49efe9f..af012fd 100644 --- a/libyelp/yelp-man-parser.c +++ b/libyelp/yelp-man-parser.c @@ -27,125 +27,362 @@ #include <glib.h> #include <glib/gi18n.h> #include <libxml/tree.h> +#include <gio/gio.h> #include <string.h> +#include <math.h> +#include "yelp-error.h" #include "yelp-man-parser.h" -#include "yelp-magic-decompressor.h" - -#define PARSER_CUR (g_utf8_get_char (parser->cur) != '\0' \ - && (parser->cur - parser->buffer < parser->length)) - -static void parser_parse_line (YelpManParser *parser); -static void parser_handle_linetag (YelpManParser *parser); -static void parser_ensure_P (YelpManParser *parser); -static void parser_read_until (YelpManParser *parser, - gchar delim); -static void parser_escape_tags (YelpManParser *parser, - gchar **tags, - gint ntags); -static xmlNodePtr parser_append_text (YelpManParser *parser); -static xmlNodePtr parser_append_given_text (YelpManParser *parser, - gchar *text); -static void parser_append_given_text_handle_escapes - (YelpManParser *parser, - gchar *text, - gboolean make_links); -static xmlNodePtr parser_append_node (YelpManParser *parser, - gchar *name); -static xmlNodePtr parser_append_node_attr (YelpManParser *parser, - gchar *name, - gchar *attr, - gchar *value); -static void parser_stack_push_node (YelpManParser *parser, - xmlNodePtr node); -static xmlNodePtr parser_stack_pop_node (YelpManParser *parser, - gchar *name); -static void parser_parse_table (YelpManParser *parser); - -typedef struct _StackElem StackElem; + +#define MAN_FONTS 8 + +/* The format has two copies of the title like MAN(1) at the top, + * possibly with a string of text in between for the collection. + * + * Start with the parser on START, then HAVE_TITLE when we've read the + * first word with parentheses. At that point, stick new words into + * the "collection" tag. Then finally switch to BODY when we've seen + * the second copy of the one with parentheses. + */ +typedef enum ManParserState +{ + START, + HAVE_TITLE, + BODY +} ManParserState; + +/* See parse_body_text for how this is used. */ +typedef enum ManParserSectionState +{ + SECTION_TITLE, + SECTION_BODY +} ManParserSectionState; + struct _YelpManParser { xmlDocPtr doc; /* The top-level XML document */ - xmlNodePtr ins; /* The insertion node */ - xmlNodePtr th_node; /* The TH node, or NULL if it doesn't exist */ + xmlNodePtr header; /* The header node */ + xmlNodePtr section_node; /* The current section */ + xmlNodePtr sheet_node; /* The current sheet */ GDataInputStream *stream; /* The GIO input stream to read from */ gchar *buffer; /* The buffer, line at a time */ gsize length; /* The buffer length */ - gchar *anc; /* The anchor point in the document */ - gchar *cur; /* Our current position in the document */ + /* The width and height of a character according to troff. */ + guint char_width; + guint char_height; + + /* Count the number of lines we've parsed (needed to get prologue) */ + guint line_no; + + /* The x f k name command sets the k'th register to be name. */ + gchar* font_registers[MAN_FONTS]; + + /* The current font. Should be the index of one of the + * font_registers. Starts at 0 (of course!) + */ + guint current_font; + + /* See description of ManParserState above */ + ManParserState state; + + /* Vertical and horizontal position as far as the troff output is + * concerned. (Measured from top-left). + */ + guint vpos, hpos; + + /* Text accumulator (needed since it comes through in dribs & + * drabs...) */ + GString *accumulator; + + /* See parse_body_text for how this is used. */ + ManParserSectionState section_state; + + /* The indent of the current sheet */ + guint sheet_indent; + + /* Set to TRUE if there's been a newline since the last text was + * parsed. */ + gboolean newline; + + /* Count the number of 'N' lines we've seen since the last h + * command. This is because for some reason N doesn't + * automatically move the position forward. Thus immediately after + * one, you see a h24 or the like. Unless there's a space. Then it + * might be wh48. This is set in parse_N (obviously) and used in + * parse_h. + */ + guint N_count; + + /* Keep track of whether the last character was a space. We can't + * just do this by looking at the last char of accumulator, + * because if there's a font change, it gets zeroed. This gets set + * to TRUE by parse_w and is FALSE the rest of the time. + */ + gboolean last_char_was_space; + + /* Keep track of the size of the last vertical jump - used to tell + * whether we need to insert extra space above a line. + */ + gint last_vertical_jump; +}; - gchar *token; /* see ignore flag; we ignore the parsing stream until - * this string is found in the stream */ - gboolean make_links; /* Allow auto-generated hyperlinks to be disabled. */ - gboolean ignore; /* when true, ignore stream until "token" is found */ - - GSList *nodeStack; +static gboolean parser_parse_line (YelpManParser *parser, GError **error); +static gboolean parse_prologue_line (YelpManParser *parser, GError **error); + +/* Parsers for different types of line */ +typedef gboolean (*LineParser)(YelpManParser *, GError **); +#define DECLARE_LINE_PARSER(name) \ + static gboolean (name) (YelpManParser *parser, GError **error); + +DECLARE_LINE_PARSER (parse_xf); +DECLARE_LINE_PARSER (parse_f); +DECLARE_LINE_PARSER (parse_V); +DECLARE_LINE_PARSER (parse_H); +DECLARE_LINE_PARSER (parse_v); +DECLARE_LINE_PARSER (parse_h); +DECLARE_LINE_PARSER (parse_text); +DECLARE_LINE_PARSER (parse_w); +DECLARE_LINE_PARSER (parse_body_text); +DECLARE_LINE_PARSER (parse_n); +DECLARE_LINE_PARSER (parse_N); +DECLARE_LINE_PARSER (parse_C); +DECLARE_LINE_PARSER (parse_p); + +/* Declare a sort of alist registry of parsers for different lines. */ +struct LineParsePair +{ + const gchar *prefix; + LineParser handler; +}; +static struct LineParsePair line_parsers[] = { + { "x f", parse_xf }, { "f", parse_f }, + { "V", parse_V }, { "H", parse_H }, + { "v", parse_v }, { "h", parse_h }, + { "t", parse_text }, + { "w", parse_w }, + { "n", parse_n }, + { "N", parse_N }, + { "C", parse_C }, + { "p", parse_p }, + { NULL, NULL } }; +/******************************************************************************/ +/* Parser helper functions (managing the state of the various parsing + * bits) */ +static void finish_span (YelpManParser *parser); +static guint dx_to_em_count (YelpManParser *parser, guint dx); +static void append_nbsps (YelpManParser *parser, guint k); +static void deal_with_newlines (YelpManParser *parser); +static void new_sheet (YelpManParser *parser); + +/******************************************************************************/ +/* Translations for the 'C' command. This is indeed hackish, but the + * -Tutf8 output doesn't seem to give include files so we can do this + * at runtime :-( + * + * On my machine, this data's at /usr/share/groff/current/tmac/ in + * latin1.tmac, unicode.tmac and I worked out the lq and rq from + * running man: I'm not sure where that comes from! + */ +struct StringPair +{ + const gchar *from; + gunichar to; +}; +static const struct StringPair char_translations[] = { + { "r!", 161 }, + { "ct", 162 }, + { "Po", 163 }, + { "Cs", 164 }, + { "Ye", 165 }, + { "bb", 166 }, + { "sc", 167 }, + { "ad", 168 }, + { "co", 169 }, + { "Of", 170 }, + { "Fo", 171 }, + { "tno", 172 }, + { "%", 173 }, + { "rg", 174 }, + { "a-", 175 }, + { "de", 176 }, + { "t+-", 177 }, + { "S2", 178 }, + { "S3", 179 }, + { "aa", 180 }, + { "mc", 181 }, + { "ps", 182 }, + { "pc", 183 }, + { "ac", 184 }, + { "S1", 185 }, + { "Om", 186 }, + { "Fc", 187 }, + { "14", 188 }, + { "12", 189 }, + { "34", 190 }, + { "r?", 191 }, + { "`A", 192 }, + { "'A", 193 }, + { "^A", 194 }, + { "~A", 195 }, + { ":A", 196 }, + { "oA", 197 }, + { "AE", 198 }, + { ",C", 199 }, + { "`E", 200 }, + { "'E", 201 }, + { "^E", 202 }, + { ":E", 203 }, + { "`I", 204 }, + { "'I", 205 }, + { "^I", 206 }, + { ":I", 207 }, + { "-D", 208 }, + { "~N", 209 }, + { "`O", 210 }, + { "'O", 211 }, + { "^O", 212 }, + { "~O", 213 }, + { ":O", 214 }, + { "tmu", 215 }, + { "/O", 216 }, + { "`U", 217 }, + { "'U", 218 }, + { "^U", 219 }, + { ":U", 220 }, + { "'Y", 221 }, + { "TP", 222 }, + { "ss", 223 }, + { "`a", 224 }, + { "'a", 225 }, + { "^a", 226 }, + { "~a", 227 }, + { ":a", 228 }, + { "oa", 229 }, + { "ae", 230 }, + { ",c", 231 }, + { "`e", 232 }, + { "'e", 233 }, + { "^e", 234 }, + { ":e", 235 }, + { "`i", 236 }, + { "'i", 237 }, + { "^i", 238 }, + { ":i", 239 }, + { "Sd", 240 }, + { "~n", 241 }, + { "`o", 242 }, + { "'o", 243 }, + { "^o", 244 }, + { "~o", 245 }, + { ":o", 246 }, + { "tdi", 247 }, + { "/o", 248 }, + { "`u", 249 }, + { "'u", 250 }, + { "^u", 251 }, + { ":u", 252 }, + { "'y", 253 }, + { "Tp", 254 }, + { ":y", 255 }, + { "hy", '-' }, + { "oq", '`' }, + { "cq", '\'' }, + { "lq", 8220 }, // left smart quotes + { "rq", 8221 }, // right smart quotes + { "em", 8212 }, // em-dash + { "la", 10216 }, // left angle bracket + { "ra", 10217 }, // left angle bracket + { "rs", '\\' }, + { "<=", 8804 }, // < or equal to sign + { ">=", 8805 }, // > or equal to sign + { "aq", '\'' }, + { "tm", 8482 }, // trademark symbol + { NULL, 0 } +}; + +/******************************************************************************/ + YelpManParser * yelp_man_parser_new (void) { YelpManParser *parser = g_new0 (YelpManParser, 1); - + parser->accumulator = g_string_sized_new (1024); return parser; } +/* + This function is responsible for taking a path to a man file and + returning something in the groff intermediate output format for us + to use. + + If something goes wrong, we return NULL and set error to be a + YelpError describing the problem. +*/ +static GInputStream* +get_troff (gchar *path, GError **error) +{ + gint stdout; + GError *err = NULL; + gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", NULL, NULL }; + + argv[4] = path; + + if (!g_spawn_async_with_pipes (NULL, argv, NULL, + G_SPAWN_SEARCH_PATH, NULL, NULL, + NULL, NULL, &stdout, NULL, &err)) { + /* We failed to run the man program. Return a "Huh?" error. */ + *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN, + err->message); + g_error_free (err); + return NULL; + } + + return (GInputStream*) g_unix_input_stream_new (stdout, TRUE); +} + xmlDocPtr -yelp_man_parser_parse_file (YelpManParser *parser, - gchar *file, - const gchar *encoding) +yelp_man_parser_parse_file (YelpManParser *parser, + gchar *path, + GError **error) { - GFile *gfile; - GConverter *converter; - GFileInputStream *file_stream; - GInputStream *stream; + GInputStream *troff_stream; gchar *line; gsize len; + gboolean ret; + xmlNodePtr root; - gfile = g_file_new_for_path (file); - file_stream = g_file_read (gfile, NULL, NULL); - converter = (GConverter *) yelp_magic_decompressor_new (); - stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter); - parser->stream = g_data_input_stream_new (stream); + troff_stream = get_troff (path, error); + if (!troff_stream) return NULL; + + parser->stream = g_data_input_stream_new (troff_stream); parser->doc = xmlNewDoc (BAD_CAST "1.0"); - parser->ins = xmlNewNode (NULL, BAD_CAST "Man"); - xmlDocSetRootElement (parser->doc, parser->ins); - - parser->make_links = TRUE; - - while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) { - /* convert this line from the encoding indicated to UTF-8 */ - if (!g_str_equal (encoding, "UTF-8")) { - GError *converr = NULL; - gchar *new_buffer = NULL; - gsize bytes_written = 0; - - /* We are making the - * assumption that there are no partial characters at the end of this - * string, and therefore can use calls like g_convert() which do not - * preserve state - someone tell me if I'm wrong here */ - new_buffer = g_convert (parser->buffer, parser->length, "UTF-8", - encoding, NULL, &bytes_written, &converr); - if (converr != NULL) { - g_print ("Error occurred converting %s to UTF-8: %s\n", - encoding, converr->message); - g_error_free (converr); - break; - } else if (parser->buffer == NULL) { - g_print ("parser->buffer == NULL\n"); - break; - } - - g_free (parser->buffer); - parser->buffer = new_buffer; - parser->length = bytes_written; - } - - parser_parse_line (parser); - - g_free (parser->buffer); + root = xmlNewNode (NULL, BAD_CAST "Man"); + xmlDocSetRootElement (parser->doc, root); + + parser->header = xmlNewNode (NULL, BAD_CAST "header"); + xmlAddChild (root, parser->header); + + while (1) { + parser->buffer = + g_data_input_stream_read_line (parser->stream, + &(parser->length), + NULL, NULL); + if (parser->buffer == NULL) break; + + parser->line_no++; + ret = parser_parse_line (parser, error); + + g_free (parser->buffer); + + if (!ret) { + xmlFreeDoc (parser->doc); + parser->doc = NULL; + break; + } } g_object_unref (parser->stream); @@ -156,1667 +393,544 @@ yelp_man_parser_parse_file (YelpManParser *parser, void yelp_man_parser_free (YelpManParser *parser) { + guint k; + if (parser) { + for (k=0; k<MAN_FONTS; k++) + g_free (parser->font_registers[k]); + } + g_string_free (parser->accumulator, TRUE); g_free (parser); } /******************************************************************************/ +/* Sets the k'th font register to be name. Copies name, so free it + * afterwards. k should be in [0,MAN_FONTS). It seems that man always + * gives us ones at least 1, but groff_out(5) says non-negative. + */ static void -parser_parse_line (YelpManParser *parser) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - /* check to see if we are ignoring input */ - if (parser->ignore) { - gchar *ptr; - /* needs to be utf-8 compatible */ - ptr = strstr (parser->buffer, parser->token); - if (ptr != NULL) { - while (PARSER_CUR) { - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } - g_free (parser->token); - parser->ignore = FALSE; - } else { - /* return to get another line of input */ - return; - } - } else { - switch (*(parser->buffer)) { - case '.': - parser_handle_linetag (parser); - /* we are ignoring everything until parser->token, - * so return and get next line */ - if (parser->ignore) - return; - break; - case '\0': - parser->ins = xmlDocGetRootElement (parser->doc); - break; - case '\'': - parser->cur = parser->buffer + parser->length - 1; - parser->anc = parser->cur; - default: - break; - } - } - - parser_read_until (parser, '\0'); - - if (parser->cur != parser->anc) - parser_append_text (parser); - - if (PARSER_CUR) { - parser->cur = g_utf8_next_char (parser->cur); - parser_append_text (parser); +set_font_register (YelpManParser *parser, guint k, const gchar* name) +{ + if (k > MAN_FONTS) { + g_warning ("Tried to set nonexistant font register %d to %s", + k, name); + return; } + g_free (parser->font_registers[k]); + parser->font_registers[k] = g_strdup (name); } -/* creates a single string from all the macro arguments */ -static gchar * -args_concat_all (GSList *args) +static const gchar* +get_font (const YelpManParser *parser) { - GSList *ptr = NULL; - gchar **str_array = NULL; - gchar *retval = NULL; - gint i = 0; - - if (!args) - return NULL; - - str_array = g_malloc0 ((sizeof (gchar *)) * (g_slist_length (args)+1) ); - - ptr = args; - while (ptr && ptr->data) { - str_array[i++] = ptr->data; - ptr = g_slist_next (ptr); - } - - str_array[i] = NULL; + guint k = parser->current_font; + if (k > MAN_FONTS || + parser->font_registers[k] == NULL) { - retval = g_strjoinv (" ", str_array); + g_warning ("Tried to get nonexistant font register %d", + k); - g_free (str_array); + return ""; + } - return retval; + return parser->font_registers[k]; } -/* handler to ignore a macro by reading until the null character */ -static void -macro_ignore_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - while (PARSER_CUR) { - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } -} +/******************************************************************************/ -static void -macro_bold_small_italic_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - gchar *str = NULL; - - parser_ensure_P (parser); - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - str = args_concat_all (args); - parser_append_given_text_handle_escapes (parser, str, TRUE); - g_free (str); - } - - parser->ins = parser->ins->parent; -} +/* + Convenience macros to scan a string, checking for the correct number + of things read. -static void -macro_roman_bold_small_italic_handler (YelpManParser *parser, gchar *macro, GSList *args) + Also to raise an error. Add an %s to the end of the format string, + which automatically gets given parser->buffer. + */ +#define SSCANF(fmt,num,...) \ + (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num)) + +#define PARSE_ERROR(...) \ + g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, \ + __VA_ARGS__, parser->buffer) +#define RAISE_PARSE_ERROR(...) \ + { *error = PARSE_ERROR (__VA_ARGS__); return FALSE; } + +static gboolean +parser_parse_line (YelpManParser *parser, GError **error) { - GSList *ptr = NULL; - gchar a[2], b[2]; - gboolean toggle = TRUE; - - a[0] = macro[0]; - b[0] = macro[1]; - a[1] = b[1] = '\0'; - - parser_ensure_P (parser); - - ptr = args; - while (ptr && ptr->data) { - if (toggle) - parser->ins = parser_append_node (parser, a); - else - parser->ins = parser_append_node (parser, b); - - if (ptr->next) { - gchar *tmp = ptr->next->data; - - if (tmp[0] == '(' && g_ascii_isdigit (tmp[1]) && - (tmp[2] == ')' || (g_ascii_isalpha (tmp[2]) && tmp[3] == ')'))) { - tmp = g_strconcat (ptr->data, " ", tmp, NULL); - parser_append_given_text_handle_escapes (parser, tmp, TRUE); - g_free (tmp); - parser->ins = parser->ins->parent; - ptr = ptr->next->next; - continue; - } - } - - parser_append_given_text_handle_escapes (parser, ptr->data, TRUE); - parser->ins = parser->ins->parent; - - toggle = (toggle) ? 0 : 1; - ptr = g_slist_next (ptr); + if (parser->line_no <= 3) + return parse_prologue_line (parser, error); + + const struct LineParsePair *p = line_parsers; + while (p->handler != NULL) { + if (g_str_has_prefix (parser->buffer, p->prefix)) { + return p->handler(parser, error); + } + p++; } + return TRUE; } -static void -macro_new_paragraph_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_prologue_line (YelpManParser *parser, GError **error) { - xmlNodePtr tmpNode; - - /* Clean up from 'lists'. If this is null we don't care. */ - tmpNode = parser_stack_pop_node (parser, "IP"); - - tmpNode = parser_stack_pop_node (parser, "P"); - if (tmpNode != NULL) { - parser->ins = tmpNode->parent; + if (parser->line_no != 2) return TRUE; + + /* This is the interesting line, which should look like + x res 240 24 40 + The interesting bits are the 24 and the 40, which are the + width and height of a character as far as -Tutf8 is + concerned. + */ + if (SSCANF ("x %*s %*u %u %u", 2, + &parser->char_width, &parser->char_height)) { + RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s"); } - parser_ensure_P (parser); + return TRUE; } -static void -macro_insert_self_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_xf (YelpManParser *parser, GError **error) { - parser_append_node (parser, macro); + gchar name[10]; + guint k; + + if (SSCANF ("x f%*s %u %10s", 2, &k, name)) { + RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s"); + } + set_font_register (parser, k, name); + return TRUE; } -static void -macro_title_header_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_f (YelpManParser *parser, GError **error) { - GSList *ptr = NULL; - gchar *fields[5] = { "Title", "Section", "Date", "Commentary", "Name" }; - gint i; - - parser->ins = parser_append_node (parser, macro); - - ptr = args; - for (i=0; i < 5; i++) { - if (ptr && ptr->data) { - parser->ins = parser_append_node (parser, fields[i]); - parser_append_given_text_handle_escapes (parser, ptr->data, FALSE); - parser->ins = parser->ins->parent; - ptr = g_slist_next (ptr); - } else - break; + guint k; + if (SSCANF ("f%u", 1, &k)) { + RAISE_PARSE_ERROR ("Invalid font line from troff: %s"); } + finish_span (parser); - parser->ins = parser->ins->parent; + parser->current_font = k; + + return TRUE; } -static void -macro_section_header_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_v (YelpManParser *parser, GError **error) { - static gint id = 0; - GIOStatus retval; - GError *error = NULL; - gchar *str = NULL; - gchar *macro_uc = g_strdup (macro); - gchar *ptr; - gchar idval[20]; - - if (!args) { - str = g_data_input_stream_read_line (parser->stream, NULL, NULL, &error); - if (error) { - g_warning ("%s\n", error->message); - g_error_free (error); - } + guint dy; + if (SSCANF ("v%u", 1, &dy)) { + RAISE_PARSE_ERROR ("Invalid v line from troff: %s"); } - else - str = args_concat_all (args); - - for (ptr = macro_uc; *ptr != '\0'; ptr++) - /* FIXME: utf-8 */ - *ptr = g_ascii_toupper (*ptr); - - parser_stack_pop_node (parser, "IP"); - - g_snprintf (idval, 20, "%d", ++id); - - /* Sections should be their own, well, section */ - parser->ins = xmlDocGetRootElement (parser->doc); - parser->ins = parser_append_node_attr (parser, macro_uc, "id", idval); - parser_append_given_text_handle_escapes (parser, str, FALSE); - parser->ins = parser->ins->parent; - - if (str) - g_free (str); + parser->last_vertical_jump += dy; + parser->vpos += dy; + return TRUE; } -static void -macro_spacing_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_h (YelpManParser *parser, GError **error) { - parser->ins = parser_append_node (parser, macro); + guint dx; + int k; - if (args && args->data) { - parser->ins = parser_append_node (parser, "Count"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; + if (SSCANF ("h%u", 1, &dx)) { + RAISE_PARSE_ERROR ("Invalid h line from troff: %s"); } + parser->hpos += dx; - parser->ins = parser->ins->parent; -} - -/* this is used to define or redefine a macro until ".." - * is reached. */ -static void -macro_define_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - parser->ignore = TRUE; - parser->token = g_strdup(".."); -} - -static void -macro_tp_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode = NULL; - GError **errormsg = NULL; + /* This is a bit hackish to be honest but... if we're in something + * that'll end up in a span, a spacing h command means that a gap + * should appear. It seems that the easiest way to get this is to + * insert nonbreaking spaces (eugh!) + * + * Of course we don't want to do this when chained from wh24 or + * whatever, so use the last_char_was_space flag + * but... unfortunately some documents actually use stuff like + * wh96 for spacing (eg the lists in perl(1)). So (very hackish!), + * ignore double spaces, since that's probably just been put in to + * make the text justified (eugh), but allow bigger jumps. + * + * Incidentally, the perl manual here has bizarre gaps in the + * synopsis section. God knows why, but man displays them too so + * it's not our fault! :-) + */ + k = dx_to_em_count (parser, dx); - tmpNode = parser_stack_pop_node (parser, "IP"); + if ((parser->sheet_node) && + ((!parser->last_char_was_space) || (k > 2))) { - if (tmpNode != NULL) - parser->ins = tmpNode->parent; + k -= parser->N_count; + if (k < 0) k = 0; - parser->ins = parser_append_node (parser, "IP"); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; + append_nbsps (parser, k); } - g_free (parser->buffer); - - parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); - if (parser->buffer != NULL) { - parser->ins = parser_append_node (parser, "Tag"); - parser_parse_line (parser); - parser->ins = parser->ins->parent; - } + parser->N_count = 0; - parser_stack_push_node (parser, parser->ins); + return TRUE; } - -static void -macro_ip_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode; - - tmpNode = parser_stack_pop_node (parser, "IP"); - - if (tmpNode != NULL) - parser->ins = tmpNode->parent; - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Tag"); - parser_append_given_text_handle_escapes (parser, args->data, TRUE); - parser->ins = parser->ins->parent; - - if (args->next && args->next->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text_handle_escapes (parser, args->next->data, TRUE); - parser->ins = parser->ins->parent; - } - } - parser_stack_push_node (parser, parser->ins); -} - -static void -macro_hanging_paragraph_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_V (YelpManParser *parser, GError **error) { - parser_stack_pop_node (parser, "IP"); - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; + guint y; + if (SSCANF ("V%u", 1, &y)) { + RAISE_PARSE_ERROR ("Invalid V line from troff: %s"); } + parser->last_vertical_jump += y - parser->vpos; + parser->vpos = y; + return TRUE; } -static xmlNodePtr -create_th_node (YelpManParser *parser) +static gboolean +parse_H (YelpManParser *parser, GError **error) { - /* Create a TH node if we don't have one already */ - if (!parser->th_node) { - parser->th_node = parser_append_node (parser, "TH"); + guint x; + if (SSCANF ("H%u", 1, &x)) { + RAISE_PARSE_ERROR ("Invalid H line from troff: %s"); } - return parser->th_node; + parser->hpos = x; + return TRUE; } -static void -macro_title_handler (YelpManParser *parser, gchar *macro, GSList *args) +static gboolean +parse_text (YelpManParser *parser, GError **error) { - gchar *str = NULL; - - parser->ins = create_th_node (parser); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Title"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } + gchar *text, *section, *tmp; + xmlNodePtr node; - if (args && args->next && args->next->data) { - parser->ins = parser_append_node (parser, "Section"); - parser_append_given_text (parser, args->next->data); - } - parser->ins = parser->th_node->parent; -} + g_assert (parser->buffer[0] == 't'); -static void -macro_os_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - gchar *str = NULL; - xmlNodePtr new_ins = parser->ins; - - parser->ins = create_th_node (parser); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Os"); - parser_append_given_text (parser, args->data); - } + if (parser->state == START) { + /* With a bit of luck, this will be the tBLAH(1) line. Can't + * use sscanf to chop it up since that needs whitespace. */ + section = strchr (parser->buffer + 1, '('); + if (!section) + RAISE_PARSE_ERROR ("Expected t line with title. Got %s"); + text = g_strndup (parser->buffer + 1, + section - (parser->buffer + 1)); - parser->ins = parser->th_node->parent; -} + // Skip over the ( + section++; -static void -macro_date_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - gchar *str = NULL; - - parser->ins = create_th_node (parser); + tmp = strchr (section, ')'); + if (!tmp || (*(tmp+1) != '\0')) + RAISE_PARSE_ERROR ("Strange format for t title line: %s"); + section = g_strndup (section, tmp - section); - if (args && args->data) { - - str = args_concat_all (args); - - parser->ins = parser_append_node (parser, "Date"); - parser_append_given_text (parser, str); + parser->state = HAVE_TITLE; - g_free (str); - } + xmlNewTextChild (parser->header, + NULL, BAD_CAST "title", text); + xmlNewTextChild (parser->header, + NULL, BAD_CAST "section", section); - parser->ins = parser->th_node->parent; -} + g_free (text); + g_free (section); + /* The accumulator should currently be "". */ + g_assert (parser->accumulator && + *(parser->accumulator->str) == '\0'); -static void -macro_url_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode = NULL; - - if (g_str_equal (macro, "UR")) { - /* If someone wants to do automatic hyperlink wizardry outside - * for the parser, then this should instead generate a tag. - */ - if (args && args->data) { - if (g_str_equal (args->data, ":")) - parser->make_links = FALSE; - else { - parser->ins = parser_append_node (parser, macro); - - parser_stack_push_node (parser, parser->ins); - - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } - } - } - else if (g_str_equal (macro, "UE")) { - - if (parser->make_links) { - tmpNode = parser_stack_pop_node (parser, "UR"); - - if (tmpNode == NULL) - g_warning ("Found unexpected tag: '%s'\n", macro); - else - parser->ins = tmpNode->parent; - } else - parser->make_links = TRUE; - - } - else if (g_str_equal (macro, "UN")) { - - if (args && args->data) { - parser->ins = parser_append_node (parser, macro); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; - } - + return TRUE; } -} - -/* relative margin indent; FIXME: this takes a parameter that tells - * how many indents to do, which needs to be implemented to fix - * some man page formatting options */ -/*static void -macro_rs_re_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode; + if (parser->state == HAVE_TITLE) { + /* We expect (maybe!) to get some lines tThe wh24 + * tCollection. We've found (and can ignore!) the second + * title line if there's a (). */ + if (strchr (parser->buffer+1, '(') && + strchr (parser->buffer+1, ')')) { + parser->state = BODY; - if (g_str_equal (macro, "RS")) { - parser->ins = parser_append_node (parser, macro); + xmlNewTextChild (parser->header, + NULL, BAD_CAST "collection", + parser->accumulator->str); + g_string_truncate (parser->accumulator, 0); - parser_stack_push_node (parser, parser->ins); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "Indent"); - parser_append_given_text (parser, args->data); - parser->ins = parser->ins->parent; + return TRUE; } - } - else if (g_str_equal (macro, "RE")) { - parser_stack_pop_node (parser, "IP"); - - tmpNode = parser_stack_pop_node (parser, "RS"); - if (tmpNode == NULL) - d (g_warning ("Found unexpected tag: '%s'\n", macro)); - else - parser->ins = tmpNode->parent; - } -}*/ + g_string_append (parser->accumulator, parser->buffer+1); -static void -macro_mandoc_list_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - xmlNodePtr tmpNode; - - if (g_str_equal (macro, "Bl")) { - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - gchar *listtype = (gchar *)args->data; - - if (g_str_equal (listtype, "-hang") || - g_str_equal (listtype, "-ohang") || - g_str_equal (listtype, "-tag") || - g_str_equal (listtype, "-diag") || - g_str_equal (listtype, "-inset") - ) { - listtype++; - xmlNewProp (parser->ins, BAD_CAST "listtype", - BAD_CAST listtype); - /* TODO: check for -width, -offset, -compact */ - } else if (g_str_equal (listtype, "-column")) { - /* TODO: support this */; - } else if (g_str_equal (listtype, "-item") || - g_str_equal (listtype, "-bullet") || - g_str_equal (listtype, "-hyphen") || - g_str_equal (listtype, "-dash") - ) { - listtype++; - xmlNewProp (parser->ins, BAD_CAST "listtype", - BAD_CAST listtype); - /* TODO: check for -offset, -compact */ - } - } - - parser_stack_push_node (parser, parser->ins); + return TRUE; } - else if (g_str_equal (macro, "El")) { - - tmpNode = parser_stack_pop_node (parser, "It"); - - if (tmpNode != NULL) - parser->ins = tmpNode->parent; - - tmpNode = parser_stack_pop_node (parser, "Bl"); - if (tmpNode == NULL) - g_warning ("Found unexpected tag: '%s'\n", macro); - else - parser->ins = tmpNode->parent; - } + return parse_body_text (parser, error); } -static void -macro_verbatim_handler (YelpManParser *parser, gchar *macro, GSList *args) +/* + w is a sort of prefix argument. It indicates a space, so we register + that here, then call parser_parse_line again on the rest of the + string to deal with that. + */ +static gboolean +parse_w (YelpManParser *parser, GError **error) { - xmlNodePtr tmpNode; - - if (g_str_equal (macro, "nf") || g_str_equal (macro, "Vb")) { - parser->ins = parser_append_node (parser, "Verbatim"); - parser_stack_push_node (parser, parser->ins); - } - else if (g_str_equal (macro, "fi") || g_str_equal (macro, "Ve")) { - tmpNode = parser_stack_pop_node (parser, "Verbatim"); - - if (tmpNode == NULL) - g_warning ("Found unexpected tag: '%s'\n", macro); - else - parser->ins = tmpNode->parent; - } -} + gboolean ret; -static void -macro_reference_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - if (g_str_equal (macro, "so")) { - gchar *basename = NULL; - gchar *link = NULL; - - if (args && args->data) { - basename = g_strrstr((const gchar *)args->data, "/"); - - if (basename) { - basename++; - link = g_strdup_printf ("man:%s", basename); - } else { - link = g_strdup_printf ("man:%s", (const gchar *)args->data); - basename = (gchar *)args->data; - } - - parser->ins = create_th_node (parser); - parser->ins = parser_append_node (parser, "Title"); - parser_append_given_text (parser, "REFERENCE"); - parser->ins = parser->ins->parent; - parser->ins = parser->ins->parent; - - parser->ins = parser_append_node_attr (parser, "SH", "id", "9999"); - parser_append_given_text (parser, "REFERENCE"); - parser->ins = parser->ins->parent; - - parser_append_given_text (parser, "See "); - parser->ins = parser_append_node (parser, "UR"); - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, link); - parser->ins = parser->ins->parent; - parser_append_given_text (parser, basename); - parser->ins = parser->ins->parent; - - g_free (link); - } + if (parser->state != START) { + g_string_append_c (parser->accumulator, ' '); } -} - -/* many mandoc macros have their arguments parsed so that other - * macros can be called to operate on their arguments. This table - * indicates which macros are _parsed_ for other callable macros, - * and which are _callable_ from other macros: see mdoc(7) for more - * details - */ - -#define MANDOC_NONE 0x01 -#define MANDOC_PARSED 0x01 -#define MANDOC_CALLABLE 0x02 -struct MandocMacro { - gchar *macro; - gint flags; -}; + parser->buffer++; + parser->last_char_was_space = TRUE; -static struct MandocMacro manual_macros[] = { - { "Ad", MANDOC_PARSED | MANDOC_CALLABLE }, - { "An", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ar", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Cd", MANDOC_NONE }, - { "Cm", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Dv", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Er", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ev", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fa", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fd", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fl", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Fn", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ic", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Li", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Nd", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Nm", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Op", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Ot", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Pa", MANDOC_PARSED | MANDOC_CALLABLE }, - { "St", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Tn", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Va", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Vt", MANDOC_PARSED | MANDOC_CALLABLE }, - { "Xr", MANDOC_PARSED | MANDOC_CALLABLE }, - { NULL, MANDOC_NONE } -}; + ret = parser_parse_line (parser, error); -static gboolean -is_mandoc_manual_macro_parsed (gchar *macro) -{ - gint i; - - for (i=0; manual_macros[i].macro != NULL; i++) { - if (g_str_equal (macro, manual_macros[i].macro) && - (manual_macros[i].flags & MANDOC_PARSED) == MANDOC_PARSED - ) { - return TRUE; - } - } - - return FALSE; + parser->buffer--; + return ret; } static gboolean -is_mandoc_manual_macro_callable (gchar *macro) +parse_body_text (YelpManParser *parser, GError **error) { - gint i; - - for (i=0; manual_macros[i].macro != NULL; i++) { - if (g_str_equal (macro, manual_macros[i].macro) && - (manual_macros[i].flags & MANDOC_CALLABLE) == MANDOC_CALLABLE - ) { - return TRUE; - } - } + /* + It's this function which is responsible for trying to get *some* + semantic information back out of the manual page. - return FALSE; -} + The highest-level chopping up is into sections. We use the + heuristic that if either + (1) We haven't got a section yet or + (2) text starts a line (hpos=0) + then it's a section title. -static void -macro_mandoc_utility_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - GSList *ptr = NULL; - gchar *str = NULL; - gchar *manpage, *uri; - - g_return_if_fail (macro != NULL); - - if (is_mandoc_manual_macro_parsed (macro)) { - parser->ins = parser_append_node (parser, macro); - - ptr = args; - while (ptr && ptr->data) { - if (is_mandoc_manual_macro_callable ((gchar *)ptr->data)) { - macro_mandoc_utility_handler (parser, (gchar *)ptr->data, ptr->next); - break; - } else { - parser_append_given_text_handle_escapes (parser, (gchar *)ptr->data, TRUE); - } - ptr = ptr->next; - if (ptr && ptr->data) - parser_append_given_text (parser, " "); - } - - parser->ins = parser->ins->parent; - } else { - parser->ins = parser_append_node (parser, macro); - str = args_concat_all (args); - parser->ins = parser->ins->parent; - - g_free (str); - } + It's possible to have spaces in section titles, so we carry on + accumulating the section title until the next newline. + */ + if (parser->section_state != SECTION_TITLE && parser->hpos == 0) { + g_string_truncate (parser->accumulator, 0); + /* End the current sheet & section */ + parser->section_state = SECTION_TITLE; + parser->sheet_node = NULL; - return; - - if (g_str_equal (macro, "Op")) { - - } else if (g_str_equal (macro, "Nm")) { - - if (str) { - parser_ensure_P (parser); - - parser->ins = parser_append_node (parser, "B"); - parser_append_given_text_handle_escapes (parser, str, TRUE); - parser->ins = parser->ins->parent; - } - } - else if (g_str_equal (macro, "Nd")) { - - if (str) { - parser_append_given_text (parser, " -- "); - parser_append_given_text_handle_escapes (parser, str, TRUE); - } - } - else if (g_str_equal (macro, "Xr")) { - - if (args && args->data && args->next && args->next->data) { - - manpage = g_strdup_printf ("%s(%s)", (gchar *)args->data, (gchar *)args->next->data); - uri = g_strdup_printf ("man:%s", manpage); - - parser_ensure_P (parser); - - parser->ins = parser_append_node (parser, "UR"); - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, uri); - parser->ins = parser->ins->parent; - parser_append_given_text (parser, manpage); - parser->ins = parser->ins->parent; - - ptr = args->next->next; - - while (ptr && ptr->data) { - parser_append_given_text (parser, ptr->data); - ptr = g_slist_next (ptr); - } - - g_free (uri); - g_free (manpage); - } + parser->section_node = + xmlAddChild (xmlDocGetRootElement (parser->doc), + xmlNewNode (NULL, BAD_CAST "section")); } - g_free (str); -} + if (parser->section_state != SECTION_TITLE) + deal_with_newlines (parser); -static void -macro_mandoc_listitem_handler (YelpManParser *parser, gchar *macro, GSList *args) -{ - GSList *ptr = NULL; - xmlNodePtr tmpNode; - - tmpNode = parser_stack_pop_node (parser, "It"); - - if (tmpNode != NULL) - parser->ins = tmpNode->parent; - - parser->ins = parser_append_node (parser, macro); - - if (args && args->data) { - parser->ins = parser_append_node (parser, "ItTag"); - - ptr = args; - while (ptr && ptr->data) { - if (is_mandoc_manual_macro_callable ((gchar *)ptr->data)) { - macro_mandoc_utility_handler (parser, (gchar *)ptr->data, ptr->next); - break; - } else { - parser_append_given_text (parser, (gchar *)ptr->data); - } - ptr = ptr->next; - if (ptr && ptr->data) - parser_append_given_text (parser, " "); - } - - parser->ins = parser->ins->parent; - } + g_string_append (parser->accumulator, parser->buffer+1); - parser_stack_push_node (parser, parser->ins); + /* Move hpos forward per char */ + parser->hpos += strlen (parser->buffer+1) * parser->char_width; + parser->last_char_was_space = FALSE; + parser->N_count = 0; + + return TRUE; } -/* the handler functions for each macro all have this form: - * - the calling function, parser_handle_linetag owns the "macro", and "args" - * parameters, so do not free them. - */ -typedef void (*MacroFunc)(YelpManParser *parser, gchar *macro, GSList *args); +static gboolean +parse_n (YelpManParser *parser, GError **error) +{ + xmlNodePtr node; -struct MacroHandler { - gchar *macro; - MacroFunc handler; -}; + /* Don't care about newlines in the header bit */ + if (parser->state != BODY) return TRUE; -/* We are calling all of these macros, when in reality some of them are - * requests (lowercase, defined by groff system), and some of them are - * macros (varying case, defined by man/mdoc/ms/tbl extensions) - * - * A great resource to figure out what each of these does is the groff - * info page. Also groff(7), man(7), and mdoc(7) are useful as well. - */ -static struct MacroHandler macro_handlers[] = { - { "\\\"", macro_ignore_handler }, /* groff: comment */ - { "ad", macro_ignore_handler }, /* groff: set adjusting mode */ - { "Ad", macro_mandoc_utility_handler }, /* mandoc: Address */ - { "An", macro_mandoc_utility_handler }, /* mandoc: Author name */ - { "Ar", macro_mandoc_utility_handler }, /* mandoc: Command line argument */ - { "B", macro_bold_small_italic_handler }, /* man: set bold font */ - { "Bd", macro_ignore_handler }, /* mandoc: Begin-display block */ - { "BI", macro_roman_bold_small_italic_handler }, /* man: bold italic font */ - { "Bl", macro_mandoc_list_handler }, /* mandoc: begin list */ - { "bp", macro_ignore_handler }, /* groff: break page */ - { "br", macro_insert_self_handler }, /* groff: line break */ - { "BR", macro_roman_bold_small_italic_handler }, /* man: set bold roman font */ - { "Cd", macro_mandoc_utility_handler }, /* mandoc: Configuration declaration */ - { "Cm", macro_mandoc_utility_handler }, /* mandoc: Command line argument modifier */ - { "ce", macro_ignore_handler }, /* groff: center text */ - { "Dd", macro_date_handler }, /* mandoc: Document date */ - { "de", macro_define_handler }, /* groff: define macro */ - { "ds", macro_ignore_handler }, /* groff: define string variable */ - { "D1", macro_ignore_handler }, /* mandoc: Indent and display one text line */ - { "Dl", macro_ignore_handler }, /* mandoc: Indent and display one line of literal text */ - { "Dt", macro_title_handler }, /* mandoc: Document title */ - { "Dv", macro_mandoc_utility_handler }, /* mandoc: Defined variable */ - { "Ed", macro_ignore_handler }, /* mandoc: End-display block */ - { "El", macro_mandoc_list_handler }, /* mandoc: end list */ - { "Er", macro_mandoc_utility_handler }, /* mandoc: Error number */ - { "Ev", macro_mandoc_utility_handler }, /* mandoc: Environment variable */ - { "Fa", macro_mandoc_utility_handler }, /* mandoc: Function argument */ - { "Fd", macro_mandoc_utility_handler }, /* mandoc: Function declaration */ - { "fi", macro_verbatim_handler }, /* groff: activate fill mode */ - { "Fl", macro_mandoc_utility_handler }, /* mandoc: ? */ - { "Fn", macro_mandoc_utility_handler }, /* mandoc: Function call */ - { "ft", macro_ignore_handler }, /* groff: change font */ - { "HP", macro_hanging_paragraph_handler }, /* man: paragraph with hanging left indentation */ - { "hy", macro_ignore_handler }, /* groff: enable hyphenation */ - { "I", macro_bold_small_italic_handler }, /* man: set italic font */ - { "Ic", macro_mandoc_utility_handler }, /* mandoc: Interactive Command */ - { "ie", macro_ignore_handler }, /* groff: else portion of if-else */ - { "if", macro_ignore_handler }, /* groff: if statement */ - { "ig", macro_ignore_handler }, /* groff: comment until '..' or '.END' */ - { "ih", macro_ignore_handler }, /* ? */ - { "IX", macro_ignore_handler }, /* ms: print index to stderr */ - { "IB", macro_roman_bold_small_italic_handler }, /* man: set italic bold font */ - { "IP", macro_ip_handler }, /* man: indented paragraph */ - { "IR", macro_roman_bold_small_italic_handler }, /* man: set italic roman font */ - { "It", macro_mandoc_listitem_handler }, /* mandoc: item in list */ - { "Li", macro_mandoc_utility_handler }, /* mandoc: Literal text */ - { "LP", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "na", macro_ignore_handler }, /* groff: disable adjusting */ - { "Nd", macro_mandoc_utility_handler }, /* mandoc: description of utility/program */ - { "ne", macro_ignore_handler }, /* groff: force space at bottom of page */ - { "nf", macro_verbatim_handler }, /* groff: no fill mode */ - { "nh", macro_ignore_handler }, /* groff: disable hyphenation */ - { "Nd", macro_mandoc_utility_handler }, /* mandoc: ? */ - { "Nm", macro_mandoc_utility_handler }, /* mandoc: Command/utility/program name*/ - { "Op", macro_mandoc_utility_handler }, /* mandoc: Option */ - { "Os", macro_os_handler }, /* mandoc: Operating System */ - { "Ot", macro_mandoc_utility_handler }, /* mandoc: Old style function type (Fortran) */ - { "P", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "Pa", macro_mandoc_utility_handler }, /* mandoc: Pathname or filename */ - { "PP", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "Pp", macro_new_paragraph_handler }, /* man: line break and left margin and indentation are reset */ - { "ps", macro_ignore_handler }, /* groff: change type size */ - { "RB", macro_roman_bold_small_italic_handler }, /* man: set roman bold font */ - { "RE", macro_ignore_handler }, /* man: move left margin back to NNN */ - { "RI", macro_roman_bold_small_italic_handler }, /* man: set roman italic font */ - { "RS", macro_ignore_handler }, /* man: move left margin to right by NNN */ - { "SH", macro_section_header_handler }, /* man: unnumbered section heading */ - { "Sh", macro_section_header_handler }, /* man: unnumbered section heading */ - { "SM", macro_bold_small_italic_handler }, /* man: set font size one SMaller */ - { "so", macro_reference_handler }, /* groff: include file */ - { "sp", macro_spacing_handler }, /* groff: */ - { "SS", macro_section_header_handler }, /* man: unnumbered subsection heading */ - { "Ss", macro_section_header_handler }, /* man: unnumbered subsection heading */ - { "St", macro_mandoc_utility_handler }, /* mandoc: Standards (-p1003.2, -p1003.1 or -ansiC) */ - { "TH", macro_title_header_handler }, /* man: set title of man page */ - { "TP", macro_tp_handler }, /* man: set indented paragraph with label */ - { "UR", macro_url_handler }, /* man: URL start hyperlink */ - { "UE", macro_url_handler }, /* man: URL end hyperlink */ - { "UN", macro_ignore_handler }, /* ? */ - { "TE", macro_ignore_handler }, /* ms: table */ - { "Tn", macro_mandoc_utility_handler }, /* mandoc: Trade or type name (small Caps). */ - { "ti", macro_ignore_handler }, /* groff: temporary indent */ - { "tr", macro_ignore_handler }, /* groff: translate characters */ - { "TS", macro_ignore_handler }, /* ms: table with optional header */ - { "Va", macro_mandoc_utility_handler }, /* mandoc: Variable name */ - { "Vb", macro_verbatim_handler }, /* pod2man: start of verbatim text */ - { "Ve", macro_verbatim_handler }, /* pod2man: end of verbatim text */ - { "Vt", macro_mandoc_utility_handler }, /* mandoc: Variable type (Fortran only) */ - { "Xr", macro_mandoc_utility_handler }, /* mandoc: Manual page cross reference */ - { NULL, NULL } -}; + if (parser->section_state == SECTION_TITLE) { + g_strchomp (parser->accumulator->str); + xmlNewTextChild (parser->section_node, NULL, + BAD_CAST "title", parser->accumulator->str); + g_string_truncate (parser->accumulator, 0); -static void -parser_handle_linetag (YelpManParser *parser) { - gchar c, *str, *ptr, *arg; - GSList *arglist = NULL; - GSList *listptr = NULL; - MacroFunc handler_func = NULL; - - static GHashTable *macro_hash = NULL; - - /* check if we've created the hash of macros yet. If not, make it */ - if (!macro_hash) { - gint i; - - macro_hash = g_hash_table_new (g_str_hash, g_str_equal); - - for (i=0; macro_handlers[i].macro != NULL; i++) { - g_hash_table_insert (macro_hash, - macro_handlers[i].macro, - macro_handlers[i].handler); - } + parser->section_state = SECTION_BODY; } + else if (parser->sheet_node != NULL) { + /* + In the body of a section, when we get to a newline we should + have an accumulator with text in it and a non-null sheet + (hopefully!). - /* FIXME: figure out a better way to handle these cases */ - /* special case, if the line is simply ".\0" then return */ - if (g_utf8_get_char (g_utf8_next_char (parser->cur)) == '\0') { - parser->cur = g_utf8_next_char (parser->cur); - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - return; - } - /* special case, if the line is simply "..\0" then return */ - else if (g_utf8_get_char (g_utf8_next_char(parser->cur)) == '.' && - g_utf8_get_char (g_utf8_next_char (g_utf8_next_char (parser->cur+2))) == '\0') { - parser->cur = g_utf8_next_char (parser->cur); - parser->cur = g_utf8_next_char (parser->cur); - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } - - /* skip any spaces after the control character . */ - while (PARSER_CUR && g_utf8_get_char (parser->cur) == ' ') - parser->cur = g_utf8_next_char (parser->cur); - - while (PARSER_CUR - && g_utf8_get_char (parser->cur) != ' ' - && ( (g_utf8_get_char (parser->cur) != '\\') || - ( - (g_utf8_get_char(parser->cur) == '\\') && - (g_utf8_get_char(g_utf8_next_char (parser->cur)) == '\"') - ) - ) - && g_utf8_get_char (parser->cur) != '\0') { - if ( - (g_utf8_get_char (parser->cur) == '\\') && - (g_utf8_get_char (g_utf8_next_char (parser->cur)) == '\"') - ) { - parser->cur = g_utf8_next_char (g_utf8_next_char (parser->cur)); - break; - } - parser->cur = g_utf8_next_char (parser->cur); + We know the current font, so add a span for that font + containing the relevant text. Then add a <br/> tag. + */ + finish_span (parser); + node = xmlNewNode (NULL, BAD_CAST "br"); + xmlAddChild (parser->sheet_node, node); } - /* copy the macro/request into str */ - c = *(parser->cur); - *(parser->cur) = '\0'; - str = g_strdup (parser->anc + 1); /* skip control character '.' by adding one */ - *(parser->cur) = c; - parser->anc = parser->cur; - - /* FIXME: need to handle escaped characters */ - /* perform argument parsing and store argument in a singly linked list */ - while (PARSER_CUR && g_utf8_get_char (parser->cur) != '\0') { - ptr = NULL; - arg = NULL; - - /* skip any whitespace */ - while (PARSER_CUR && g_utf8_get_char (parser->cur) == ' ') { - parser->cur = g_utf8_next_char (parser->cur); - parser->anc = parser->cur; - } - -get_argument: - /* search until we hit whitespace or an " */ - while (PARSER_CUR && - g_utf8_get_char (parser->cur) != '\0' && - g_utf8_get_char (parser->cur) != ' ' && - g_utf8_get_char (parser->cur) != '\"') - parser->cur = g_utf8_next_char (parser->cur); - - /* this checks for escaped spaces */ - if (PARSER_CUR && - ((parser->cur - parser->buffer) > 0) && - g_utf8_get_char (parser->cur) == ' ' && - g_utf8_get_char (g_utf8_prev_char (parser->cur)) == '\\') { - parser->cur = g_utf8_next_char (parser->cur); - goto get_argument; - } - - if (g_utf8_get_char (parser->cur) == '\0' && - (parser->cur == parser->anc)) - break; - - if (g_utf8_get_char (parser->cur) == '\"' && - g_utf8_get_char (g_utf8_prev_char (parser->cur)) == ' ') { - /* quoted argument */ - ptr = strchr (parser->cur+1, '\"'); - if (ptr != NULL) { - c = *(ptr); - *(ptr) = '\0'; - arg = g_strdup (parser->anc+1); - *(ptr) = c; - parser->cur = ptr; - parser->anc = ++parser->cur; - } else { - /* unmatched double quote: include the " as part of the argument */ - parser->cur++; - goto get_argument; - } - } - else if (*(parser->cur) == '\"') { - /* quote in the middle of an argument */ - c = *(parser->cur+1); - *(parser->cur+1) = '\0'; - arg = g_strdup (parser->anc); - *(parser->cur+1) = c; - parser->anc = ++parser->cur; - } - else if (*(parser->cur) == ' ') { - /* normal space separated argument */ - c = *(parser->cur); - *(parser->cur) = '\0'; - arg = g_strdup (parser->anc); - *(parser->cur) = c; - parser->anc = ++parser->cur; - } - else if (*(parser->cur) == '\0' && *(parser->cur-1) != ' ') { - /* special case for EOL */ - c = *(parser->cur); - *(parser->cur) = '\0'; - arg = g_strdup (parser->anc); - *(parser->cur) = c; - parser->anc = parser->cur; - } else - ; /* FIXME: do we need to handle this case? */ - - arglist = g_slist_append (arglist, arg); - } - - /*g_print ("handling macro (%s)\n", str); - - listptr = arglist; - while (listptr && listptr->data) { - g_print (" arg = %s\n", (gchar *)listptr->data); - listptr = g_slist_next (listptr); - } - */ - - /* lookup the macro handler and call that function */ - handler_func = g_hash_table_lookup (macro_hash, str); - if (handler_func) - (*handler_func) (parser, str, arglist); - - /* in case macro is not defined in hash table, ignore rest of line */ - else - macro_ignore_handler (parser, str, arglist); - - g_free (str); - - listptr = arglist; - while (listptr && listptr->data) { - g_free (listptr->data); - listptr = g_slist_next (listptr); - } - - return; - - if (0) { - } - /* Table (tbl) macros */ - else if (g_str_equal (str, "TS")) { - parser->ins = parser_append_node (parser, "TABLE"); - g_free (str); - - parser_stack_push_node (parser, parser->ins); - g_free (parser->buffer); - parser_parse_table (parser); - } - else if (g_str_equal (str, "TE")) { - /* We should only see this from within parser_parse_table */ - g_warning ("Found unexpected tag: '%s'\n", str); - g_free (str); - } - /* "ie" and "if" are conditional macros in groff - * "ds" is to define a variable; see groff(7) - * ignore anything between the \{ \}, otherwise ignore until - * the end of the linee*/ - else if (g_str_equal (str, "ds") || g_str_equal (str, "ie") - || g_str_equal (str, "if")) { - /* skip any remaining spaces */ - while (PARSER_CUR && (*parser->cur == ' ')) - parser->anc = ++parser->cur; - - /* skip the "stringvar" or "cond"; see groff(7) */ - while (PARSER_CUR && (*parser->cur != ' ')) - parser->anc = ++parser->cur; - - /* skip any remaining spaces */ - while (PARSER_CUR && (*parser->cur == ' ')) - parser->anc = ++parser->cur; - - /* check to see if the next two characters are the - * special "\{" sequence */ - if (*parser->cur == '\\' && *(parser->cur+1) == '{') { - parser->ignore = TRUE; - parser->token = g_strdup ("\\}"); - } else { - /* otherwise just ignore till the end of the line */ - while (PARSER_CUR) - parser->anc = ++parser->cur; - } - } - /* else conditional macro */ - else if (g_str_equal (str, "el")) { - /* check to see if the next two characters are the - * special "\{" sequence */ - parser->ignore = 0; - if (*parser->cur == '\\' && *(parser->cur+1) == '{') { - parser->ignore = TRUE; - parser->token = g_strdup ("\\}"); - } else { - /* otherwise just ignore till the end of the line */ - while (PARSER_CUR) - parser->anc = ++parser->cur; - } - } + parser->newline = TRUE; + parser->last_char_was_space = FALSE; + return TRUE; } static void -parser_ensure_P (YelpManParser *parser) +finish_span (YelpManParser *parser) { - if (xmlStrEqual (parser->ins->name, BAD_CAST "Man")) { - parser->ins = parser_append_node (parser, "P"); - parser_stack_push_node (parser, parser->ins); + xmlNodePtr node; + + if (parser->accumulator->str[0] != '\0') { + node = xmlNewTextChild (parser->sheet_node, NULL, + BAD_CAST "span", + parser->accumulator->str); + xmlNewProp (node, BAD_CAST "class", get_font (parser)); + g_string_truncate (parser->accumulator, 0); } } -static void -parser_read_until (YelpManParser *parser, - gchar delim) +static guint +dx_to_em_count (YelpManParser *parser, guint dx) { - gchar c; - - while (PARSER_CUR - && g_utf8_get_char (parser->cur) != '\0' - && g_utf8_get_char (parser->cur) != delim) { - parser->cur = g_utf8_next_char (parser->cur); - } - - if (parser->anc == parser->cur) - return; - - c = *(parser->cur); - *(parser->cur) = '\0'; - parser_append_given_text_handle_escapes (parser, parser->anc, TRUE); - *(parser->cur) = c; - - parser->anc = parser->cur; + return (int)(dx / ((float)parser->char_width)); } -static void -parser_escape_tags (YelpManParser *parser, - gchar **tags, - gint ntags) +static gboolean +parse_N (YelpManParser *parser, GError **error) { - gint i; - xmlNodePtr node = NULL; - xmlNodePtr cur = parser->ins; - GSList *path = NULL; - - /* Find the top node we can escape from */ - while (cur && cur != (xmlNodePtr)parser->doc && - cur->parent && cur->parent != (xmlNodePtr) parser->doc) { - for (i = 0; i < ntags; i++) - if (!xmlStrcmp (cur->name, BAD_CAST tags[i])) { - node = cur; - break; - } - path = g_slist_prepend (path, cur); - cur = cur->parent; + gint n; + if (SSCANF ("N%i", 1, &n)) { + RAISE_PARSE_ERROR ("Strange format for N line: %s"); + } + if (n > 127) { + RAISE_PARSE_ERROR ("N line has non-7-bit character: %s"); } + if (n < -200) { + RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s"); + } + + deal_with_newlines (parser); + parser->last_char_was_space = FALSE; - /* Walk back down, reproducing nodes we aren't escaping */ - if (node) { - GSList *c = path; - while (c && (xmlNodePtr) c->data != node) - c = g_slist_next (c); - - parser->ins = node->parent; - parser_ensure_P (parser); - - while ((c = c->next)) { - gboolean insert = TRUE; - cur = (xmlNodePtr) c->data; - - for (i = 0; i < ntags; i++) - if (!xmlStrcmp (cur->name, BAD_CAST tags[i])) { - insert = FALSE; - break; - } - if (insert) - parser->ins = parser_append_node (parser, (gchar *) cur->name); - } + if (n < 0) { + append_nbsps (parser, -n); + parser->N_count += -n; } + else { + g_string_append_c (parser->accumulator, (gchar)n); + parser->N_count++; + } + + return TRUE; } static void -parser_append_given_text_handle_escapes (YelpManParser *parser, gchar *text, gboolean make_links) +append_nbsps (YelpManParser *parser, guint k) { - gchar *escape[] = { "fI", "fB" }; - gchar *baseptr, *ptr, *anc, *str; - gint c, len; - - g_return_if_fail (parser != NULL); - - if (!text) - return; - - baseptr = g_strdup (text); - ptr = baseptr; - anc = baseptr; - len = strlen (baseptr); - - while (ptr && *ptr != '\0') { - - if (*ptr == '\\') { - - c = *ptr; - *ptr = '\0'; - parser_append_given_text (parser, anc); - *ptr = c; - - anc = ++ptr; - - switch (*ptr) { - case '\0': - break; - case '-': - case '\\': - ptr++; - c = *ptr; - *ptr = '\0'; - parser_append_given_text (parser, anc); - *ptr = c; - anc = ptr; - break; - case 'f': - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - - c = *(ptr); - *(ptr) = '\0'; - str = g_strdup (anc); - *(ptr) = c; - - parser_ensure_P (parser); - parser_escape_tags (parser, escape, 2); - - /* the \f escape sequence changes the font - R is Roman, - * B is Bold, and I is italic */ - if (g_str_equal (str, "fI") || g_str_equal (str, "fB")) - parser->ins = parser_append_node (parser, str); - else if (!g_str_equal (str, "fR") && !g_str_equal (str, "fP")) - g_warning ("No rule matching the tag '%s'\n", str); - - g_free (str); - anc = ptr; - break; - case '(': - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - - c = *(ptr); - *(ptr) = '\0'; - str = g_strdup (anc); - *(ptr) = c; - - if (g_str_equal (str, "(co")) - parser_append_given_text (parser, "©"); - else if (g_str_equal (str, "(bu")) - parser_append_given_text (parser, "•"); - else if (g_str_equal (str, "(em")) - parser_append_given_text (parser, "—"); - - g_free (str); - anc = ptr; - break; - case '*': - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - - if (*(ptr) == 'R') { - parser_append_given_text (parser, "®"); - ptr++; - } else if (*(ptr) == '=') { - parser_append_given_text (parser, "--"); - ptr++; - } else if (*(ptr) == '(') { - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - if ((ptr - baseptr) > len || *ptr == '\0') break; - ptr++; - - c = *(ptr); - *(ptr) = '\0'; - str = g_strdup (anc); - *(ptr) = c; - - if (g_str_equal (str, "*(Tm")) - parser_append_given_text (parser, "™"); - else if (g_str_equal (str, "*(lq")) - parser_append_given_text (parser, "“"); - else if (g_str_equal (str, "*(rq")) - parser_append_given_text (parser, "”"); - - g_free (str); - } - - anc = ptr; - break; - case 'e': - anc = ++ptr; - parser_append_given_text (parser, "\\"); - break; - case '&': - anc = ++ptr; - break; - case 's': - /* this handles (actually ignores) the groff macros \s[+-][0-9] */ - ptr++; - if (*(ptr) == '+' || *(ptr) == '-') { - ptr++; - if (g_ascii_isdigit (*ptr)) { - ptr++; - } - } else if (g_ascii_isdigit (*ptr)) { - ptr++; - } - anc = ptr; - break; - case '"': - /* Marks comments till end of line. so we can ignore it. */ - while (ptr && *ptr != '\0') - ptr++; - anc = ptr; - break; - case '^': - case '|': - /* 1/12th and 1/16th em respectively - ignore this and simply output a space */ - anc = ++ptr; - break; - default: - ptr++; - c = *(ptr); - *(ptr) = '\0'; - parser_append_given_text (parser, anc); - *(ptr) = c; - - anc++; - break; - } - - } - else if ((make_links) && (*ptr == '(')) { - gchar *space_pos; - gchar *url; - gchar c; - gchar *name_end; - gchar *num_start; - gchar *num_end; - - - space_pos = ptr; - - while (space_pos != anc && *(space_pos - 1) != ' ') { - space_pos--; - } - name_end = space_pos; - - if (space_pos != ptr && - g_ascii_isdigit(*(ptr+1)) && - (*(ptr+2) == ')' || (g_ascii_isalpha (*(ptr+2)) && *(ptr+3) == ')'))) { - num_start = ptr; - if (*(ptr+2) == ')') - num_end = ptr + 2; - else - num_end = ptr + 3; - - ptr+=3; - - parser_ensure_P (parser); - - ptr = space_pos; - - c = (*ptr); - *ptr = '\0'; - parser_append_given_text (parser, anc); - *ptr = c; - anc = ptr; - ptr = num_start; - - c = *name_end; - *name_end = '\0'; - *num_end = '\0'; - url = g_strdup_printf ("man:%s(%s)", anc, num_start + 1); - - - parser->ins = parser_append_node (parser, "UR"); - - parser->ins = parser_append_node (parser, "URI"); - parser_append_given_text (parser, url); - parser->ins = parser->ins->parent; - - parser_append_given_text (parser, anc); - parser->ins = parser->ins->parent; - - *name_end = c; - *num_end = ')'; - anc = ptr; - - g_free (url); - - } else { - ptr++; - } - } - else { - ptr++; - } - - } /* end while */ - - c = *(ptr); - *(ptr) = '\0'; - parser_append_given_text (parser, anc); - parser_append_given_text (parser, "\n"); - *(ptr) = c; - - g_free (baseptr); + for (; k > 0; k--) { + /* 0xc2 0xa0 is nonbreaking space in utf8 */ + g_string_append_c (parser->accumulator, 0xc2); + g_string_append_c (parser->accumulator, 0xa0); + } } -static xmlNodePtr -parser_append_text (YelpManParser *parser) +static gboolean +parse_C (YelpManParser *parser, GError **error) { - xmlNodePtr node; - gchar c; + gchar name[16]; + gunichar code = 0; + guint k; + gint len; - if (parser->anc == parser->cur) - return NULL; + if (SSCANF ("C%16s", 1, name)) { + RAISE_PARSE_ERROR ("Can't understand special character: %s"); + } - c = *(parser->cur); - *(parser->cur) = '\0'; + for (k=0; char_translations[k].from; k++) { + if (g_str_equal (char_translations[k].from, name)) { + code = char_translations[k].to; + break; + } + } + if (sscanf (name, "u%x", &k) == 1) { + code = k; + } - if (g_utf8_get_char (parser->anc) != '\0') - parser_ensure_P (parser); + if (!code) { + g_warning ("Couldn't parse troff special character: '%s'", + name); + code = 65533; /* Unicode replacement character */ + } - node = xmlNewText (BAD_CAST parser->anc); - xmlAddChild (parser->ins, node); + deal_with_newlines (parser); + parser->last_char_was_space = FALSE; - *(parser->cur) = c; + /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */ + len = g_unichar_to_utf8 (code, name); + name[len] = '\0'; + g_string_append (parser->accumulator, name); - parser->anc = parser->cur; + parser->N_count++; - return node; + return TRUE; } -static xmlNodePtr -parser_append_given_text (YelpManParser *parser, - gchar *text) +static void +deal_with_newlines (YelpManParser *parser) { - xmlNodePtr node; - - parser_ensure_P (parser); + /* + If newline is true, this is the first word on a line. - node = xmlNewText (BAD_CAST text); - xmlAddChild (parser->ins, node); + In which case, we check to see whether hpos agrees with the + current sheet's indent. If so (or if there isn't a sheet yet!), + we just add to the accumulator. If not, start a new sheet with + the correct indent. - return node; -} - -static xmlNodePtr -parser_append_node (YelpManParser *parser, - gchar *name) -{ - if (!name) - return NULL; - - return xmlNewChild (parser->ins, NULL, BAD_CAST name, NULL); -} + If we aren't the first word on the line, just add to the + accumulator. + */ + gchar tmp[64]; + guint jump_lines; + gboolean made_sheet = FALSE, dont_jump = FALSE; -static xmlNodePtr -parser_append_node_attr (YelpManParser *parser, - gchar *name, - gchar *attr, - gchar *value) -{ - xmlNodePtr node = NULL; - - node = xmlNewChild (parser->ins, NULL, BAD_CAST name, NULL); - xmlNewProp (node, BAD_CAST attr, BAD_CAST value); + /* This only happens at the start of a section, where there's + already a gap + */ + if (!parser->sheet_node) { + dont_jump = TRUE; + } - return node; -} + if ((!parser->sheet_node) || + (parser->newline && (parser->hpos != parser->sheet_indent))) { + new_sheet (parser); + made_sheet = TRUE; + } -static void -parser_stack_push_node (YelpManParser *parser, - xmlNodePtr node) -{ - parser->nodeStack = g_slist_prepend (parser->nodeStack, node); -} + if (parser->newline) { + append_nbsps (parser, dx_to_em_count (parser, parser->hpos)); -static xmlNodePtr -parser_stack_pop_node (YelpManParser *parser, - gchar *name) -{ - xmlNodePtr popped; - - if (parser->nodeStack == NULL) - return NULL; - - popped = (xmlNodePtr) parser->nodeStack->data; - - if (!xmlStrEqual (BAD_CAST name, popped->name)) - return NULL; - - parser->nodeStack = g_slist_remove (parser->nodeStack, popped); - return popped; -} + if ((parser->last_vertical_jump > 0) && (!dont_jump)) { + jump_lines = + parser->last_vertical_jump/parser->char_height; + } else { + jump_lines = 1; + } -/* - * Table (tbl) macro package parsing - */ + if (jump_lines > 1) { + if (!made_sheet) new_sheet (parser); + made_sheet = TRUE; + } -static void -parser_handle_table_options (YelpManParser *parser) -{ - /* FIXME: do something with the options */ - g_free (parser->buffer); + if (made_sheet) { + snprintf (tmp, 64, "%u", jump_lines-1); + xmlNewProp (parser->sheet_node, BAD_CAST "jump", tmp); + } + } - return; + parser->newline = FALSE; + parser->last_vertical_jump = 0; } -static void -parser_handle_row_options (YelpManParser *parser) +static gboolean +parse_p (YelpManParser *parser, GError **error) { - /* FIXME: do something with these options */ - - do { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - parser_read_until (parser, '.'); - - if (*(parser->cur) == '.') { - g_free (parser->buffer); - break; - } - - g_free (parser->buffer); - - } while ((parser->buffer = - g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) - != NULL); + parser->vpos = 0; + parser->hpos = 0; + return TRUE; } static void -parser_parse_table (YelpManParser *parser) -{ - xmlNodePtr table_start; - gboolean empty_row; - - table_start = parser->ins; - - parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); - if (parser->buffer != NULL) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - parser_read_until (parser, ';'); - - if (*(parser->cur) == ';') { - parser_handle_table_options (parser); - - parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); - if (parser->buffer != NULL) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - parser_read_until (parser, '\0'); - } else - return; - } - - parser_handle_row_options (parser); - - /* Now this is where we go through all the rows */ - while ((parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL)) != NULL) { - parser->anc = parser->buffer; - parser->cur = parser->buffer; - - empty_row = FALSE; - - switch (*(parser->buffer)) { - case '.': - if (*(parser->buffer + 1) == 'T' - && *(parser->buffer + 2) == 'E') { - if (parser_stack_pop_node (parser, "TABLE") == NULL) - g_warning ("Found unexpected tag: 'TE'\n"); - else { - parser->ins = table_start; - - parser->anc = parser->buffer + 3; - parser->cur = parser->buffer + 3; - return; - } - } else if (*(parser->buffer + 1) == 'T' - && *(parser->buffer + 2) == 'H') { - /* Do nothing */ - empty_row = TRUE; - } else { - parser_handle_linetag (parser); - break; - } - case '\0': - empty_row = TRUE; - break; - default: - break; - } - - if (!empty_row) { - parser->ins = parser_append_node (parser, "ROW"); - while (PARSER_CUR && *(parser->cur) != '\0') { - parser_read_until (parser, '\t'); - parser->ins = parser_append_node (parser, "CELL"); - parser_append_text (parser); - parser->ins = parser->ins->parent; - parser->anc++; - parser->cur++; - } - } - - g_free (parser->buffer); - - parser->ins = table_start; - } - } +new_sheet (YelpManParser *parser) +{ + /* We don't need to worry about finishing the current sheet, + since the accumulator etc. get cleared on newlines and we + know we're at the start of a line. + */ + parser->sheet_node = + xmlAddChild (parser->section_node, + xmlNewNode (NULL, BAD_CAST "sheet")); + parser->sheet_indent = parser->hpos; } diff --git a/libyelp/yelp-man-parser.h b/libyelp/yelp-man-parser.h index 1901f1b..33cb951 100644 --- a/libyelp/yelp-man-parser.h +++ b/libyelp/yelp-man-parser.h @@ -30,8 +30,8 @@ typedef struct _YelpManParser YelpManParser; YelpManParser * yelp_man_parser_new (void); xmlDocPtr yelp_man_parser_parse_file (YelpManParser *parser, - gchar *file, - const gchar *encoding); + gchar *path, + GError **error); void yelp_man_parser_free (YelpManParser *parser); #endif /* __YELP_MAN_PARSER_H__ */ diff --git a/stylesheets/man2html.xsl.in b/stylesheets/man2html.xsl.in index 4b21bae..5b3cd59 100644 --- a/stylesheets/man2html.xsl.in +++ b/stylesheets/man2html.xsl.in @@ -17,349 +17,73 @@ <xsl:param name="linktrail" select="''"/> <xsl:template mode="html.title.mode" match="Man"> - <xsl:value-of select="TH/Title"/> -</xsl:template> - -<xsl:template mode="html.css.mode" match="Man"> - <xsl:param name="direction"/> - <xsl:param name="left"/> - <xsl:param name="right"/> -<xsl:text> -body { font-family: monospace; } -div.hgroup { font-family: sans-serif; } -</xsl:text> -</xsl:template> - -<xsl:template mode="html.header.mode" match="Man"> - <xsl:call-template name="html.linktrail"/> + <xsl:value-of select="header/title"/> </xsl:template> <xsl:template mode="html.body.mode" match="Man"> - <xsl:apply-templates select="TH"/> - <xsl:apply-templates select="SH"/> -</xsl:template> - -<xsl:template name="html.linktrail"> - <div class="linktrail" id="linktrail"> - <xsl:call-template name="html.linktrail.one"> - <xsl:with-param name="str" select="$linktrail"/> - </xsl:call-template> - </div> -</xsl:template> - -<xsl:template name="html.linktrail.one"> - <xsl:param name="str"/> - <xsl:variable name="id" select="substring-before($str, '|')"/> - <xsl:variable name="post_id" select="substring-after($str, '|')"/> - - <span class="linktrail"> - <a class="linktrail" href="x-yelp-toc:{$id}"> - <xsl:choose> - <xsl:when test="contains($post_id, '|')"> - <xsl:value-of select="substring-before($post_id, '|')"/> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="$post_id"/> - </xsl:otherwise> - </xsl:choose> - </a> - </span> - - <xsl:if test="contains($post_id, '|')"> - <xsl:call-template name="html.linktrail.one"> - <xsl:with-param name="str" select="substring-after($post_id, '|')"/> - </xsl:call-template> - </xsl:if> + <xsl:apply-templates select="header"/> + <xsl:apply-templates select="section"/> </xsl:template> <!-- ======================================================================= --> -<xsl:template match="br"> - <xsl:apply-templates/><br/> -</xsl:template> - -<!-- ignore anything in the Indent,Count,sp element for now --> -<xsl:template match="Indent" /> -<xsl:template match="Count" /> -<xsl:template match="sp" /> - -<xsl:template match="B | fB"> - <b><xsl:apply-templates/></b> -</xsl:template> - -<xsl:template match="CELL"> - <td><xsl:apply-templates/></td> -</xsl:template> - -<xsl:template match="I | fI"> - <i><xsl:apply-templates/></i> -</xsl:template> - -<xsl:template match="R | fR"> - <span class="R"><xsl:apply-templates/></span> -</xsl:template> - -<xsl:template match="Verbatim"> - <pre> - <xsl:choose> - <xsl:when test="node()[1]/self::text()"> - <xsl:variable name="node" select="node()[1]"/> - <xsl:choose> - <xsl:when test="starts-with(string($node), '
')"> - <xsl:value-of select="substring-after(string($node), '
')"/> - <xsl:apply-templates select="node()[position() != 1]"/> - </xsl:when> - <xsl:otherwise> - <xsl:value-of select="string($node)"/> - <xsl:apply-templates select="node()[position() != 1]"/> - </xsl:otherwise> - </xsl:choose> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates /> - </xsl:otherwise> - </xsl:choose> - </pre> -</xsl:template> - -<xsl:template match="IP"> - <xsl:choose> - <xsl:when test="preceding-sibling::*[1][self::IP]"/> - <xsl:otherwise> - <dl> - <xsl:apply-templates mode="IP.mode" select="."/> - </dl> - </xsl:otherwise> - </xsl:choose> -</xsl:template> - -<xsl:template mode="IP.mode" match="IP"> - <dt> - <xsl:choose> - <xsl:when test="Tag"> - <xsl:apply-templates select="Tag"/> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates/> - </xsl:otherwise> - </xsl:choose> - </dt> - <dd> - <xsl:apply-templates select="Tag/following-sibling::node()"/> - </dd> - <xsl:apply-templates mode="IP.mode" - select="following-sibling::*[1][self::IP]"/> -</xsl:template> - -<xsl:template match="P"> - <p><xsl:apply-templates/></p> -</xsl:template> - -<xsl:template match="ROW"> - <tr><xsl:apply-templates/></tr> -</xsl:template> - -<xsl:template match="SS"> - <xsl:variable name="nextSH" select="following-sibling::SH[1]"/> - <xsl:variable name="nextSS" - select="following-sibling::SS[not($nextSH) or - following-sibling::SH[1] = $nextSH][1]"/> - <div class="sect sect-SS"> - <div class="hgroup"> - <h3 class="title"><xsl:apply-templates/></h3> - </div> - <div class="inner"> - <xsl:choose> - <xsl:when test="$nextSS"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SS[1] = $nextSS and - following-sibling::SS[1]/@id = $nextSS/@id]"/> - </xsl:when> - <xsl:when test="$nextSH"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SH[1] = $nextSH and - following-sibling::SH[1]/@id = $nextSH/@id]"/> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates select="following-sibling::*"/> - </xsl:otherwise> - </xsl:choose> - </div> - </div> -</xsl:template> - -<xsl:template match="SH"> - <xsl:variable name="nextSH" select="following-sibling::SH[1]"/> - <xsl:variable name="nextSS" - select="following-sibling::SS[not($nextSH) or - following-sibling::SH[1] = $nextSH]"/> - <div class="sect sect-SH"> - <div class="hgroup"> - <h2 class="title"><xsl:apply-templates/></h2> - </div> - <div class="inner"> - <xsl:choose> - <xsl:when test="$nextSS"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SS[1] = $nextSS[1] and - following-sibling::SS[1]/@id = $nextSS[1]/@id]"/> - <xsl:apply-templates select="$nextSS"/> - </xsl:when> - <xsl:when test="$nextSH"> - <xsl:apply-templates - select="following-sibling::*[following-sibling::SH[1] = $nextSH and - following-sibling::SH[1]/@id = $nextSH/@id]"/> - </xsl:when> - <xsl:otherwise> - <xsl:apply-templates select="following-sibling::*"/> - </xsl:otherwise> - </xsl:choose> - </div> - </div> -</xsl:template> - -<xsl:template match="TABLE"> - <table><xsl:apply-templates/></table> -</xsl:template> - -<xsl:template match="Tag"> - <span class="Tag"><xsl:apply-templates/></span> -</xsl:template> - -<xsl:template match="TH"> +<xsl:template match="header"> <div class="hgroup"> <h1 class="title"> - <span class="Title"> - <xsl:apply-templates select="Title/node()"/> - </span> - <span class="Section"> - <xsl:text>(</xsl:text> - <xsl:apply-templates select="Section/node()"/> - <xsl:text>)</xsl:text> - </span> + <xsl:value-of select="title"/> + <xsl:text>(</xsl:text> + <xsl:value-of select="section"/> + <xsl:text>)</xsl:text> </h1> + <h3 style="text-align: right;"> + <xsl:value-of select="collection"/> + </h3> </div> </xsl:template> -<xsl:template match="UR"> - <a> - <xsl:attribute name="href"> - <xsl:value-of select="URI" /> - </xsl:attribute> - <xsl:apply-templates/> - </a> -</xsl:template> - -<xsl:template match="URI"/> - -<xsl:template match="UN"> - <a name="text()" id="text()"/> -</xsl:template> - -<!-- these are all for mdoc (BSD) man page support --> - -<!-- these are just printed out --> -<xsl:template match="An | Dv | Er | Ev | Ic | Li | St"> - <xsl:text> -</xsl:text> - <xsl:apply-templates/> -</xsl:template> - -<!-- these are italicized --> -<xsl:template match="Ad | Ar | Fa | Ot | Pa | Va | Vt"> - <i><xsl:apply-templates/></i> +<xsl:template match="br"> + <br/> </xsl:template> -<!-- these are bold --> -<xsl:template match="Cd | Cm | Fd | Ic | Nm"> - <b><xsl:apply-templates/></b> -</xsl:template> +<xsl:template match="section"> + <div class="section" style="padding-top: 1em;"> + <h2> + <xsl:value-of select="title"/> + </h2> -<!-- Function call - TODO need to do the ( , ) here --> -<xsl:template match="Fn | Fo | Fc"> - <i><xsl:apply-templates/></i> + <div class="section-contents" style="font-family: monospace;"> + <xsl:apply-templates select="sheet"/> + </div> + </div> </xsl:template> -<!-- Cross reference --> -<xsl:template match="Xr"> - <xsl:variable name="manpage" select="substring-before(string(.), ' ')"/> - <xsl:variable name="section" select="substring-before(substring-after(string(.), ' '), ' ')"/> - <xsl:variable name="extra" select="substring-after(substring-after(string(.), ' '), ' ')"/> - <a> - <xsl:attribute name="href"> - <xsl:text>man:</xsl:text> - <xsl:value-of select="$manpage"/> - <xsl:text>(</xsl:text> - <xsl:value-of select="$section"/> - <xsl:text>)</xsl:text> +<xsl:template match="sheet"> + <xsl:element name="div"> + <xsl:attribute name="style"> + margin-bottom: 0px; + margin-top: <xsl:value-of select="@jump"/>em; </xsl:attribute> - <xsl:value-of select="$manpage"/> - <xsl:text>(</xsl:text> - <xsl:value-of select="$section"/> - <xsl:text>)</xsl:text> - </a> - <xsl:value-of select="$extra"/> + <p><xsl:apply-templates select="span|br"/></p> + </xsl:element> </xsl:template> -<!-- Option --> -<xsl:template match="Op | Oo | Oc"> - <xsl:text> [</xsl:text> - <xsl:apply-templates/> - <xsl:text>]</xsl:text> -</xsl:template> - -<!-- Trade or type name (small Caps). --> -<xsl:template match="Tn"> - <xsl:variable name="txt" select="string(child::text())"/> - <xsl:text> </xsl:text> - <xsl:value-of select="translate($txt, 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')"/> - <xsl:apply-templates select="*"/> -</xsl:template> - -<xsl:template match="Nd"> - <xsl:text> - </xsl:text> - <xsl:apply-templates /> -</xsl:template> - -<xsl:template match="Fl"> - <xsl:text>-</xsl:text> - <b><xsl:apply-templates select="child::text()"/></b> - <xsl:apply-templates select="*"/> -</xsl:template> - -<xsl:template match="Bl"> - <dl> - <xsl:for-each select="It"> - <xsl:choose> - <xsl:when test="ItTag"> - <dt><xsl:apply-templates select="ItTag"/></dt> - <dd> - <xsl:apply-templates select="ItTag/following-sibling::node()"/> - </dd> - </xsl:when> - <xsl:otherwise> - <dt> - <xsl:text>•</xsl:text> - </dt> - <dd> - <xsl:apply-templates /> - </dd> - </xsl:otherwise> - </xsl:choose> - </xsl:for-each> - </dl> -</xsl:template> - -<xsl:template match="ItTag"> - <xsl:apply-templates/> -</xsl:template> +<xsl:template match="span"> + <xsl:element name="span"> + <xsl:choose> + <xsl:when test="@class = 'B'"> + <xsl:attribute name="style"> + font-weight: 700; + </xsl:attribute> + </xsl:when> + <xsl:when test="@class = 'I'"> + <xsl:attribute name="style"> + font-style: italic; + </xsl:attribute> + </xsl:when> + </xsl:choose> -<xsl:template match="*"> - <xsl:message> - <xsl:text>Unmatched element: </xsl:text> - <xsl:value-of select="local-name(.)"/> - </xsl:message> - <xsl:apply-templates/> + <xsl:value-of select="."/> + </xsl:element> </xsl:template> </xsl:stylesheet>
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 4 -*- */ /* * Copyright (C) 2003-2010 Shaun McCance <shaunm gnome org> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. * * Author: Shaun McCance <shaunm gnome org> */ #ifdef HAVE_CONFIG_H #include <config.h> #endif #include <glib.h> #include <glib/gi18n.h> #include <libxml/tree.h> #include <gio/gio.h> #include <string.h> #include <math.h> #include "yelp-error.h" #include "yelp-man-parser.h" #define MAN_FONTS 8 /* The format has two copies of the title like MAN(1) at the top, * possibly with a string of text in between for the collection. * * Start with the parser on START, then HAVE_TITLE when we've read the * first word with parentheses. At that point, stick new words into * the "collection" tag. Then finally switch to BODY when we've seen * the second copy of the one with parentheses. */ typedef enum ManParserState { START, HAVE_TITLE, BODY } ManParserState; /* See parse_body_text for how this is used. */ typedef enum ManParserSectionState { SECTION_TITLE, SECTION_BODY } ManParserSectionState; struct _YelpManParser { xmlDocPtr doc; /* The top-level XML document */ xmlNodePtr header; /* The header node */ xmlNodePtr section_node; /* The current section */ xmlNodePtr sheet_node; /* The current sheet */ GDataInputStream *stream; /* The GIO input stream to read from */ gchar *buffer; /* The buffer, line at a time */ gsize length; /* The buffer length */ /* The width and height of a character according to troff. */ guint char_width; guint char_height; /* Count the number of lines we've parsed (needed to get prologue) */ guint line_no; /* The x f k name command sets the k'th register to be name. */ gchar* font_registers[MAN_FONTS]; /* The current font. Should be the index of one of the * font_registers. Starts at 0 (of course!) */ guint current_font; /* See description of ManParserState above */ ManParserState state; /* Vertical and horizontal position as far as the troff output is * concerned. (Measured from top-left). */ guint vpos, hpos; /* Text accumulator (needed since it comes through in dribs & * drabs...) */ GString *accumulator; /* See parse_body_text for how this is used. */ ManParserSectionState section_state; /* The indent of the current sheet */ guint sheet_indent; /* Set to TRUE if there's been a newline since the last text was * parsed. */ gboolean newline; /* Count the number of 'N' lines we've seen since the last h * command. This is because for some reason N doesn't * automatically move the position forward. Thus immediately after * one, you see a h24 or the like. Unless there's a space. Then it * might be wh48. This is set in parse_N (obviously) and used in * parse_h. */ guint N_count; /* Keep track of whether the last character was a space. We can't * just do this by looking at the last char of accumulator, * because if there's a font change, it gets zeroed. This gets set * to TRUE by parse_w and is FALSE the rest of the time. */ gboolean last_char_was_space; /* Keep track of the size of the last vertical jump - used to tell * whether we need to insert extra space above a line. */ gint last_vertical_jump; }; static gboolean parser_parse_line (YelpManParser *parser, GError **error); static gboolean parse_prologue_line (YelpManParser *parser, GError **error); /* Parsers for different types of line */ typedef gboolean (*LineParser)(YelpManParser *, GError **); #define DECLARE_LINE_PARSER(name) \ static gboolean (name) (YelpManParser *parser, GError **error); DECLARE_LINE_PARSER (parse_xf); DECLARE_LINE_PARSER (parse_f); DECLARE_LINE_PARSER (parse_V); DECLARE_LINE_PARSER (parse_H); DECLARE_LINE_PARSER (parse_v); DECLARE_LINE_PARSER (parse_h); DECLARE_LINE_PARSER (parse_text); DECLARE_LINE_PARSER (parse_w); DECLARE_LINE_PARSER (parse_body_text); DECLARE_LINE_PARSER (parse_n); DECLARE_LINE_PARSER (parse_N); DECLARE_LINE_PARSER (parse_C); DECLARE_LINE_PARSER (parse_p); /* Declare a sort of alist registry of parsers for different lines. */ struct LineParsePair { const gchar *prefix; LineParser handler; }; static struct LineParsePair line_parsers[] = { { "x f", parse_xf }, { "f", parse_f }, { "V", parse_V }, { "H", parse_H }, { "v", parse_v }, { "h", parse_h }, { "t", parse_text }, { "w", parse_w }, { "n", parse_n }, { "N", parse_N }, { "C", parse_C }, { "p", parse_p }, { NULL, NULL } }; /******************************************************************************/ /* Parser helper functions (managing the state of the various parsing * bits) */ static void finish_span (YelpManParser *parser); static guint dx_to_em_count (YelpManParser *parser, guint dx); static void append_nbsps (YelpManParser *parser, guint k); static void deal_with_newlines (YelpManParser *parser); static void new_sheet (YelpManParser *parser); /******************************************************************************/ /* Translations for the 'C' command. This is indeed hackish, but the * -Tutf8 output doesn't seem to give include files so we can do this * at runtime :-( * * On my machine, this data's at /usr/share/groff/current/tmac/ in * latin1.tmac, unicode.tmac and I worked out the lq and rq from * running man: I'm not sure where that comes from! */ struct StringPair { const gchar *from; gunichar to; }; static const struct StringPair char_translations[] = { { "r!", 161 }, { "ct", 162 }, { "Po", 163 }, { "Cs", 164 }, { "Ye", 165 }, { "bb", 166 }, { "sc", 167 }, { "ad", 168 }, { "co", 169 }, { "Of", 170 }, { "Fo", 171 }, { "tno", 172 }, { "%", 173 }, { "rg", 174 }, { "a-", 175 }, { "de", 176 }, { "t+-", 177 }, { "S2", 178 }, { "S3", 179 }, { "aa", 180 }, { "mc", 181 }, { "ps", 182 }, { "pc", 183 }, { "ac", 184 }, { "S1", 185 }, { "Om", 186 }, { "Fc", 187 }, { "14", 188 }, { "12", 189 }, { "34", 190 }, { "r?", 191 }, { "`A", 192 }, { "'A", 193 }, { "^A", 194 }, { "~A", 195 }, { ":A", 196 }, { "oA", 197 }, { "AE", 198 }, { ",C", 199 }, { "`E", 200 }, { "'E", 201 }, { "^E", 202 }, { ":E", 203 }, { "`I", 204 }, { "'I", 205 }, { "^I", 206 }, { ":I", 207 }, { "-D", 208 }, { "~N", 209 }, { "`O", 210 }, { "'O", 211 }, { "^O", 212 }, { "~O", 213 }, { ":O", 214 }, { "tmu", 215 }, { "/O", 216 }, { "`U", 217 }, { "'U", 218 }, { "^U", 219 }, { ":U", 220 }, { "'Y", 221 }, { "TP", 222 }, { "ss", 223 }, { "`a", 224 }, { "'a", 225 }, { "^a", 226 }, { "~a", 227 }, { ":a", 228 }, { "oa", 229 }, { "ae", 230 }, { ",c", 231 }, { "`e", 232 }, { "'e", 233 }, { "^e", 234 }, { ":e", 235 }, { "`i", 236 }, { "'i", 237 }, { "^i", 238 }, { ":i", 239 }, { "Sd", 240 }, { "~n", 241 }, { "`o", 242 }, { "'o", 243 }, { "^o", 244 }, { "~o", 245 }, { ":o", 246 }, { "tdi", 247 }, { "/o", 248 }, { "`u", 249 }, { "'u", 250 }, { "^u", 251 }, { ":u", 252 }, { "'y", 253 }, { "Tp", 254 }, { ":y", 255 }, { "hy", '-' }, { "oq", '`' }, { "cq", '\'' }, { "lq", 8220 }, // left smart quotes { "rq", 8221 }, // right smart quotes { "em", 8212 }, // em-dash { "la", 10216 }, // left angle bracket { "ra", 10217 }, // left angle bracket { "rs", '\\' }, { "<=", 8804 }, // < or equal to sign { ">=", 8805 }, // > or equal to sign { "aq", '\'' }, { "tm", 8482 }, // trademark symbol { NULL, 0 } }; /******************************************************************************/ YelpManParser * yelp_man_parser_new (void) { YelpManParser *parser = g_new0 (YelpManParser, 1); parser->accumulator = g_string_sized_new (1024); return parser; } /* This function is responsible for taking a path to a man file and returning something in the groff intermediate output format for us to use. If something goes wrong, we return NULL and set error to be a YelpError describing the problem. */ static GInputStream* get_troff (gchar *path, GError **error) { gint stdout; GError *err = NULL; gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", NULL, NULL }; argv[4] = path; if (!g_spawn_async_with_pipes (NULL, argv, NULL, G_SPAWN_SEARCH_PATH, NULL, NULL, NULL, NULL, &stdout, NULL, &err)) { /* We failed to run the man program. Return a "Huh?" error. */ *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN, err->message); g_error_free (err); return NULL; } return (GInputStream*) g_unix_input_stream_new (stdout, TRUE); } xmlDocPtr yelp_man_parser_parse_file (YelpManParser *parser, gchar *path, GError **error) { GInputStream *troff_stream; gchar *line; gsize len; gboolean ret; xmlNodePtr root; troff_stream = get_troff (path, error); if (!troff_stream) return NULL; parser->stream = g_data_input_stream_new (troff_stream); parser->doc = xmlNewDoc (BAD_CAST "1.0"); root = xmlNewNode (NULL, BAD_CAST "Man"); xmlDocSetRootElement (parser->doc, root); parser->header = xmlNewNode (NULL, BAD_CAST "header"); xmlAddChild (root, parser->header); while (1) { parser->buffer = g_data_input_stream_read_line (parser->stream, &(parser->length), NULL, NULL); if (parser->buffer == NULL) break; parser->line_no++; ret = parser_parse_line (parser, error); g_free (parser->buffer); if (!ret) { xmlFreeDoc (parser->doc); parser->doc = NULL; break; } } g_object_unref (parser->stream); return parser->doc; } void yelp_man_parser_free (YelpManParser *parser) { guint k; if (parser) { for (k=0; k<MAN_FONTS; k++) g_free (parser->font_registers[k]); } g_string_free (parser->accumulator, TRUE); g_free (parser); } /******************************************************************************/ /* Sets the k'th font register to be name. Copies name, so free it * afterwards. k should be in [0,MAN_FONTS). It seems that man always * gives us ones at least 1, but groff_out(5) says non-negative. */ static void set_font_register (YelpManParser *parser, guint k, const gchar* name) { if (k > MAN_FONTS) { g_warning ("Tried to set nonexistant font register %d to %s", k, name); return; } g_free (parser->font_registers[k]); parser->font_registers[k] = g_strdup (name); } static const gchar* get_font (const YelpManParser *parser) { guint k = parser->current_font; if (k > MAN_FONTS || parser->font_registers[k] == NULL) { g_warning ("Tried to get nonexistant font register %d", k); return ""; } return parser->font_registers[k]; } /******************************************************************************/ /* Convenience macros to scan a string, checking for the correct number of things read. Also to raise an error. Add an %s to the end of the format string, which automatically gets given parser->buffer. */ #define SSCANF(fmt,num,...) \ (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num)) #define PARSE_ERROR(...) \ g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING, \ __VA_ARGS__, parser->buffer) #define RAISE_PARSE_ERROR(...) \ { *error = PARSE_ERROR (__VA_ARGS__); return FALSE; } static gboolean parser_parse_line (YelpManParser *parser, GError **error) { if (parser->line_no <= 3) return parse_prologue_line (parser, error); const struct LineParsePair *p = line_parsers; while (p->handler != NULL) { if (g_str_has_prefix (parser->buffer, p->prefix)) { return p->handler(parser, error); } p++; } return TRUE; } static gboolean parse_prologue_line (YelpManParser *parser, GError **error) { if (parser->line_no != 2) return TRUE; /* This is the interesting line, which should look like x res 240 24 40 The interesting bits are the 24 and the 40, which are the width and height of a character as far as -Tutf8 is concerned. */ if (SSCANF ("x %*s %*u %u %u", 2, &parser->char_width, &parser->char_height)) { RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s"); } return TRUE; } static gboolean parse_xf (YelpManParser *parser, GError **error) { gchar name[10]; guint k; if (SSCANF ("x f%*s %u %10s", 2, &k, name)) { RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s"); } set_font_register (parser, k, name); return TRUE; } static gboolean parse_f (YelpManParser *parser, GError **error) { guint k; if (SSCANF ("f%u", 1, &k)) { RAISE_PARSE_ERROR ("Invalid font line from troff: %s"); } finish_span (parser); parser->current_font = k; return TRUE; } static gboolean parse_v (YelpManParser *parser, GError **error) { guint dy; if (SSCANF ("v%u", 1, &dy)) { RAISE_PARSE_ERROR ("Invalid v line from troff: %s"); } parser->last_vertical_jump += dy; parser->vpos += dy; return TRUE; } static gboolean parse_h (YelpManParser *parser, GError **error) { guint dx; int k; if (SSCANF ("h%u", 1, &dx)) { RAISE_PARSE_ERROR ("Invalid h line from troff: %s"); } parser->hpos += dx; /* This is a bit hackish to be honest but... if we're in something * that'll end up in a span, a spacing h command means that a gap * should appear. It seems that the easiest way to get this is to * insert nonbreaking spaces (eugh!) * * Of course we don't want to do this when chained from wh24 or * whatever, so use the last_char_was_space flag * but... unfortunately some documents actually use stuff like * wh96 for spacing (eg the lists in perl(1)). So (very hackish!), * ignore double spaces, since that's probably just been put in to * make the text justified (eugh), but allow bigger jumps. * * Incidentally, the perl manual here has bizarre gaps in the * synopsis section. God knows why, but man displays them too so * it's not our fault! :-) */ k = dx_to_em_count (parser, dx); if ((parser->sheet_node) && ((!parser->last_char_was_space) || (k > 2))) { k -= parser->N_count; if (k < 0) k = 0; append_nbsps (parser, k); } parser->N_count = 0; return TRUE; } static gboolean parse_V (YelpManParser *parser, GError **error) { guint y; if (SSCANF ("V%u", 1, &y)) { RAISE_PARSE_ERROR ("Invalid V line from troff: %s"); } parser->last_vertical_jump += y - parser->vpos; parser->vpos = y; return TRUE; } static gboolean parse_H (YelpManParser *parser, GError **error) { guint x; if (SSCANF ("H%u", 1, &x)) { RAISE_PARSE_ERROR ("Invalid H line from troff: %s"); } parser->hpos = x; return TRUE; } static gboolean parse_text (YelpManParser *parser, GError **error) { gchar *text, *section, *tmp; xmlNodePtr node; g_assert (parser->buffer[0] == 't'); if (parser->state == START) { /* With a bit of luck, this will be the tBLAH(1) line. Can't * use sscanf to chop it up since that needs whitespace. */ section = strchr (parser->buffer + 1, '('); if (!section) RAISE_PARSE_ERROR ("Expected t line with title. Got %s"); text = g_strndup (parser->buffer + 1, section - (parser->buffer + 1)); // Skip over the ( section++; tmp = strchr (section, ')'); if (!tmp || (*(tmp+1) != '\0')) RAISE_PARSE_ERROR ("Strange format for t title line: %s"); section = g_strndup (section, tmp - section); parser->state = HAVE_TITLE; xmlNewTextChild (parser->header, NULL, BAD_CAST "title", text); xmlNewTextChild (parser->header, NULL, BAD_CAST "section", section); g_free (text); g_free (section); /* The accumulator should currently be "". */ g_assert (parser->accumulator && *(parser->accumulator->str) == '\0'); return TRUE; } if (parser->state == HAVE_TITLE) { /* We expect (maybe!) to get some lines tThe wh24 * tCollection. We've found (and can ignore!) the second * title line if there's a (). */ if (strchr (parser->buffer+1, '(') && strchr (parser->buffer+1, ')')) { parser->state = BODY; xmlNewTextChild (parser->header, NULL, BAD_CAST "collection", parser->accumulator->str); g_string_truncate (parser->accumulator, 0); return TRUE; } g_string_append (parser->accumulator, parser->buffer+1); return TRUE; } return parse_body_text (parser, error); } /* w is a sort of prefix argument. It indicates a space, so we register that here, then call parser_parse_line again on the rest of the string to deal with that. */ static gboolean parse_w (YelpManParser *parser, GError **error) { gboolean ret; if (parser->state != START) { g_string_append_c (parser->accumulator, ' '); } parser->buffer++; parser->last_char_was_space = TRUE; ret = parser_parse_line (parser, error); parser->buffer--; return ret; } static gboolean parse_body_text (YelpManParser *parser, GError **error) { /* It's this function which is responsible for trying to get *some* semantic information back out of the manual page. The highest-level chopping up is into sections. We use the heuristic that if either (1) We haven't got a section yet or (2) text starts a line (hpos=0) then it's a section title. It's possible to have spaces in section titles, so we carry on accumulating the section title until the next newline. */ if (parser->section_state != SECTION_TITLE && parser->hpos == 0) { g_string_truncate (parser->accumulator, 0); /* End the current sheet & section */ parser->section_state = SECTION_TITLE; parser->sheet_node = NULL; parser->section_node = xmlAddChild (xmlDocGetRootElement (parser->doc), xmlNewNode (NULL, BAD_CAST "section")); } if (parser->section_state != SECTION_TITLE) deal_with_newlines (parser); g_string_append (parser->accumulator, parser->buffer+1); /* Move hpos forward per char */ parser->hpos += strlen (parser->buffer+1) * parser->char_width; parser->last_char_was_space = FALSE; parser->N_count = 0; return TRUE; } static gboolean parse_n (YelpManParser *parser, GError **error) { xmlNodePtr node; /* Don't care about newlines in the header bit */ if (parser->state != BODY) return TRUE; if (parser->section_state == SECTION_TITLE) { g_strchomp (parser->accumulator->str); xmlNewTextChild (parser->section_node, NULL, BAD_CAST "title", parser->accumulator->str); g_string_truncate (parser->accumulator, 0); parser->section_state = SECTION_BODY; } else if (parser->sheet_node != NULL) { /* In the body of a section, when we get to a newline we should have an accumulator with text in it and a non-null sheet (hopefully!). We know the current font, so add a span for that font containing the relevant text. Then add a <br/> tag. */ finish_span (parser); node = xmlNewNode (NULL, BAD_CAST "br"); xmlAddChild (parser->sheet_node, node); } parser->newline = TRUE; parser->last_char_was_space = FALSE; return TRUE; } static void finish_span (YelpManParser *parser) { xmlNodePtr node; if (parser->accumulator->str[0] != '\0') { node = xmlNewTextChild (parser->sheet_node, NULL, BAD_CAST "span", parser->accumulator->str); xmlNewProp (node, BAD_CAST "class", get_font (parser)); g_string_truncate (parser->accumulator, 0); } } static guint dx_to_em_count (YelpManParser *parser, guint dx) { return (int)(dx / ((float)parser->char_width)); } static gboolean parse_N (YelpManParser *parser, GError **error) { gint n; if (SSCANF ("N%i", 1, &n)) { RAISE_PARSE_ERROR ("Strange format for N line: %s"); } if (n > 127) { RAISE_PARSE_ERROR ("N line has non-7-bit character: %s"); } if (n < -200) { RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s"); } deal_with_newlines (parser); parser->last_char_was_space = FALSE; if (n < 0) { append_nbsps (parser, -n); parser->N_count += -n; } else { g_string_append_c (parser->accumulator, (gchar)n); parser->N_count++; } return TRUE; } static void append_nbsps (YelpManParser *parser, guint k) { for (; k > 0; k--) { /* 0xc2 0xa0 is nonbreaking space in utf8 */ g_string_append_c (parser->accumulator, 0xc2); g_string_append_c (parser->accumulator, 0xa0); } } static gboolean parse_C (YelpManParser *parser, GError **error) { gchar name[16]; gunichar code = 0; guint k; gint len; if (SSCANF ("C%16s", 1, name)) { RAISE_PARSE_ERROR ("Can't understand special character: %s"); } for (k=0; char_translations[k].from; k++) { if (g_str_equal (char_translations[k].from, name)) { code = char_translations[k].to; break; } } if (sscanf (name, "u%x", &k) == 1) { code = k; } if (!code) { g_warning ("Couldn't parse troff special character: '%s'", name); code = 65533; /* Unicode replacement character */ } deal_with_newlines (parser); parser->last_char_was_space = FALSE; /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */ len = g_unichar_to_utf8 (code, name); name[len] = '\0'; g_string_append (parser->accumulator, name); parser->N_count++; return TRUE; } static void deal_with_newlines (YelpManParser *parser) { /* If newline is true, this is the first word on a line. In which case, we check to see whether hpos agrees with the current sheet's indent. If so (or if there isn't a sheet yet!), we just add to the accumulator. If not, start a new sheet with the correct indent. If we aren't the first word on the line, just add to the accumulator. */ gchar tmp[64]; guint jump_lines; gboolean made_sheet = FALSE, dont_jump = FALSE; /* This only happens at the start of a section, where there's already a gap */ if (!parser->sheet_node) { dont_jump = TRUE; } if ((!parser->sheet_node) || (parser->newline && (parser->hpos != parser->sheet_indent))) { new_sheet (parser); made_sheet = TRUE; } if (parser->newline) { append_nbsps (parser, dx_to_em_count (parser, parser->hpos)); if ((parser->last_vertical_jump > 0) && (!dont_jump)) { jump_lines = parser->last_vertical_jump/parser->char_height; } else { jump_lines = 1; } if (jump_lines > 1) { if (!made_sheet) new_sheet (parser); made_sheet = TRUE; } if (made_sheet) { snprintf (tmp, 64, "%u", jump_lines-1); xmlNewProp (parser->sheet_node, BAD_CAST "jump", tmp); } } parser->newline = FALSE; parser->last_vertical_jump = 0; } static gboolean parse_p (YelpManParser *parser, GError **error) { parser->vpos = 0; parser->hpos = 0; return TRUE; } static void new_sheet (YelpManParser *parser) { /* We don't need to worry about finishing the current sheet, since the accumulator etc. get cleared on newlines and we know we're at the start of a line. */ parser->sheet_node = xmlAddChild (parser->section_node, xmlNewNode (NULL, BAD_CAST "sheet")); parser->sheet_indent = parser->hpos; }
Attachment:
pgp5UY9EbZ2GT.pgp
Description: PGP signature