Re: Info parsing

From: Rupert Swarbrick <rswarbrick gmail com>
To: gnome-doc-devel-list gnome org
Subject: Re: Info parsing
Date: Wed, 15 Sep 2010 11:07:41 +0100

I've made a couple more changes and rolled in some existing patches that
I've already posted to Bugzilla. The patches here fix bug 420343 and
include the patches from bugs 621390 and 623304.

Also, there's the new info formatting code. I forgot to mention it in the
previous post, but it also makes nice sexy <h1>, <h2>... headings at the
top of the page.

Are they suitable for use?

Rupert

From 7059753590f6f6371b3b04880e82eeb6edb57ce0 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Wed, 16 Jun 2010 10:32:20 +0100
Subject: [PATCH 1/9] Support for headings in info files.

---
 libyelp/yelp-info-parser.c   |  173 +++++++++++++++++++++++++++++++++++++-----
 stylesheets/info2html.xsl.in |   17 ++++
 2 files changed, 170 insertions(+), 20 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 3310794..7d32905 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -1,4 +1,4 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil -*- */
 /*
  * Copyright (C) 2005 Davyd Madeley <davyd madeley id au>
  *
@@ -58,8 +58,13 @@ void                  fix_tag_table                      (gchar *offset,
 							  TagTableFix *a);
 void   		      info_process_text_notes            (xmlNodePtr *node, 
 							  gchar *content,
-							  GtkTreeStore *tree);
+							  GtkTreeStore
+							  *tree);
 
+/*
+  Used to output the correct <heading level="?" /> tag.
+ */
+static const gchar* level_headings[] = { NULL, "1", "2", "3" };
 
 static GHashTable *
 info_image_get_attributes (gchar const* string)
@@ -141,15 +146,144 @@ info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
 }
 
 /*
-  Convert body text CONTENT to xml nodes, processing info image tags
-  when found.  IWBN add a regex match for *Note: here and call the
-  *Note ==> <a href> logic of info_process_text_notes from here.
+  If every element of `str' is `ch' then return TRUE, else FALSE.
  */
-static xmlNodePtr
-info_body_text (xmlNodePtr parent, xmlNsPtr ns, gchar const *name, gchar const *content)
+static gboolean
+string_all_char_p (const gchar* str, gchar ch)
+{
+  for (; *str; str++) {
+    if (*str != ch) return FALSE;
+  }
+  return TRUE;
+}
+
+/*
+  If `line' is a line of '*', '=' or '-', return 1,2,3 respectively
+  for the heading level. If it's anything else, return 0.
+ */
+static int
+header_underline_level (const gchar* line)
+{
+  if (*line != '*' && *line != '=' && *line != '-')
+    return 0;
+
+  if (string_all_char_p (line, '*')) return 1;
+  if (string_all_char_p (line, '=')) return 2;
+  if (string_all_char_p (line, '-')) return 3;
+
+  return 0;
+}
+
+/*
+  Use g_strjoinv to join up the strings from `strings', but they might
+  not actually be a null-terminated array. `end' should be strings+n,
+  where I want the first n strings (strings+0, ..., strings+(n-1)). It
+  shouldn't point outside of the array allocated, but it can point at
+  the null string at the end.
+ */
+static gchar*
+join_strings_subset (const gchar *separator,
+                     gchar** strings, gchar** end)
+{
+  g_assert(end > strings);
+
+  gchar *ptr = *end;
+  *end = NULL;
+  
+  gchar *glob = g_strjoinv (separator, strings);
+  *end = ptr;
+  return glob;
+}
+
+/*
+  Create a text node, child of `parent', with the lines strictly
+  between `first' and `last'.
+*/
+static void
+lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
+                         gboolean inline_p,
+                         gchar** first, gchar** last)
 {
-  if (!strstr (content, INFO_C_IMAGE_TAG_OPEN))
-    return xmlNewTextChild (parent, ns, BAD_CAST name, BAD_CAST content);
+  /* TODO? Currently we're copying the split strings again, which is
+     less efficient than somehow storing lengths and using a sort of
+     window on `content'. But that's much more difficult, so unless
+     there's a problem, let's go with the stupid approach. */
+  gchar *glob;
+  if (last > first) {
+    glob = join_strings_subset ("\n", first, last);
+    xmlNewTextChild (parent, ns,
+                     inline_p ? BAD_CAST "para1" : BAD_CAST "para",
+                     BAD_CAST glob);
+    g_free (glob);
+  }
+}
+
+/*
+  Convert body text CONTENT to xml nodes. This function is responsible
+  for spotting headings etc and splitting them out correctly.
+
+  If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
+  <para> tag. 
+
+  TODO: IWBN add a regex match for *Note: here and call the *Note ==>
+  <a href> logic of info_process_text_notes from here.
+ */
+static void
+info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
+                      gboolean inline_p, const gchar *content)
+{
+  /* The easiest things to spot are headings: they look like a line of
+   * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
+   * them, we split content into single lines and work with them. */
+  gchar **lines = g_strsplit (content, "\n", 0);
+  gchar **first = lines, **last = lines+1;
+  int header_level;
+  xmlNodePtr header_node;
+
+  /* Deal with the possibility that `content' is empty */
+  if (*lines == NULL) {
+    if (!inline_p) {
+      xmlNewTextChild (parent, NULL, BAD_CAST "para", BAD_CAST "");
+    }
+    return;
+  }
+
+  for (; *last; last++) {
+    header_level = header_underline_level (*last);
+    if (header_level) {
+      /* Write out any lines beforehand */
+      lines_subset_text_child (parent, ns, FALSE, first, last-1);
+      /* Now write out the actual header line */
+      header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
+                                     BAD_CAST *(last-1));
+      xmlNewProp (header_node, BAD_CAST "level",
+                  BAD_CAST level_headings[header_level]);
+      
+      first = last+1;
+      last = first+1;
+    }
+  }
+  /* Write out any lines left */
+  lines_subset_text_child (parent, ns, inline_p, first, last);
+  
+  g_strfreev (lines);
+}
+
+/*
+  info_body_text is responsible for taking a hunk of the info page's
+  body and turning it into paragraph tags. It searches out images and
+  marks them up properly if necessary.
+
+  It uses info_body_parse_text to mark up the actual bits of text.
+ */
+static void
+info_body_text (xmlNodePtr parent, xmlNsPtr ns,
+                gboolean inline_p, gchar const *content)
+{
+  if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
+    info_body_parse_text (parent, ns, inline_p, content);
+    return;
+  }
 
   gint content_len = strlen (content);
   gint pos = 0;
@@ -164,16 +298,15 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns, gchar const *name, gchar const *
 						     &image_start, &image_end);
       gchar *before = g_strndup (&content[pos], image_start - pos);
       pos = image_end + 1;
-      xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST (before));
+      info_body_parse_text (parent, NULL, TRUE, before);
       g_free (before);
       if (image_found)
 	info_insert_image (parent, match_info);
       g_match_info_next (match_info, NULL);
     }
   gchar *after = g_strndup (&content[pos], content_len - pos);
-  xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST (after));
+  info_body_parse_text (parent, NULL, TRUE, after);
   g_free (after);
-  return 0;
 }
 
 /* Part 1: Parse File Into Tree Store */
@@ -840,7 +973,7 @@ parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 					     BAD_CAST "Section",
 					     NULL);
 		  if (!notes)
-		    info_body_text (newnode, NULL, "para", page_content);
+		    info_body_text (newnode, NULL, FALSE, page_content);
 		  
 		  else {
 		    /* Handle notes here */
@@ -1005,7 +1138,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
 
   tmp = g_strconcat (split[0], "\n* Menu:", NULL);
   if (!notes)
-    info_body_text (newnode, NULL, "para", tmp);
+    info_body_text (newnode, NULL, FALSE, tmp);
   else {
     info_process_text_notes (&newnode, tmp, tree);
   }
@@ -1119,7 +1252,7 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 	 * start, so we can just add it and forget about it.
 	 */
 	first = FALSE;
-	info_body_text (holder, NULL, "para1", (*current_real));
+	info_body_text (holder, NULL, TRUE, (*current_real));
 	continue;
       }
       /* If we got to here, we now gotta parse the note reference */
@@ -1128,13 +1261,13 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 	/* Special type of note that isn't really a note, but pretends
 	 * it is
 	 */
-	info_body_text (holder, NULL, "para1",
+	info_body_text (holder, NULL, TRUE,
 			g_strconcat ("*Note", *current_real, NULL));
 	continue;
       }
       append = strchr (*current_real, ':');
       if (!append) {
-	info_body_text (holder, NULL, "para1", *current_real);
+	info_body_text (holder, NULL, TRUE, *current_real);
 	continue;
       }
       append++;
@@ -1149,7 +1282,7 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
       }
       alt_append1 = strchr (alt_append1, ',');
       if (!append && !alt_append && !alt_append1) {
-	info_body_text (holder, NULL, "para1", *current_real);
+	info_body_text (holder, NULL, TRUE, *current_real);
 	continue;
       }
       if (!append || alt_append || alt_append1) {
@@ -1285,14 +1418,14 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 	ref1 = xmlNewTextChild (holder, NULL, BAD_CAST "a",
 				BAD_CAST link_text);
 	if (*(ulink+1) != NULL)
-	  info_body_text (holder, NULL, "para", "");
+	  info_body_text (holder, NULL, FALSE, "");
 
 	g_free (link_text);
 	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
       }
       g_strfreev (urls);
       /* Finally, we can add the text as required */
-      info_body_text (holder, NULL, "para1", append);
+      info_body_text (holder, NULL, TRUE, append);
       g_free (url);
       g_free (href);
     }
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index ec75878..c029148 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -115,6 +115,23 @@ a.navbar-next::after {
   <xsl:value-of select="node()"/>
 </xsl:template>
 
+<xsl:template match="header">
+  <xsl:choose>
+    <xsl:when test='@level = 1'>
+      <h1><xsl:value-of select="node()"/></h1>
+    </xsl:when>
+    <xsl:when test='@level = 2'>
+      <h2><xsl:value-of select="node()"/></h2>
+    </xsl:when>
+    <xsl:when test='@level = 3'>
+      <h3><xsl:value-of select="node()"/></h3>
+    </xsl:when>
+    <xsl:otherwise>
+      <h1>(Unknown heading level) <xsl:value-of select="node()"/></h1>
+    </xsl:otherwise>
+  </xsl:choose>
+</xsl:template>
+
 <xsl:template match="spacing">
   <xsl:value-of select="node()"/>
 </xsl:template>
-- 
1.7.1

From d1369b91a2bbde04a94911123c4d583087b0b692 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Fri, 3 Sep 2010 00:09:31 +0100
Subject: [PATCH 2/9] Display menus as <ul>'s, rather than the original text.

---
 libyelp/yelp-info-parser.c   |   87 +++++++++++++++++++++++++++++++++--------
 stylesheets/info2html.xsl.in |   24 +++++++++--
 2 files changed, 89 insertions(+), 22 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 7d32905..1605ecf 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -248,7 +248,11 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
     return;
   }
 
+  /* Use a pair of pointers, first and last, which point to two lines,
+   * the chunk of the body we're displaying (inclusive) */
   for (; *last; last++) {
+
+    /* Check for a header */
     header_level = header_underline_level (*last);
     if (header_level) {
       /* Write out any lines beforehand */
@@ -1120,6 +1124,16 @@ get_menuoptions (gchar *line, gchar **title, gchar **ref, gchar **desc,
   return TRUE;
 }
 
+/* Find the first non-space character in str or return pointer to the
+ * '\0' if there isn't one. */
+static gchar*
+first_non_space (gchar* str)
+{
+  /* As long as str is null terminated, this is ok! */
+  while (*str == ' ') str++;
+  return str;
+}
+
 xmlNodePtr
 yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node, 
 		      gchar *page_content, gboolean notes)
@@ -1127,7 +1141,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
   gchar **split;
   gchar **menuitems;
   gchar *tmp = NULL;
-  xmlNodePtr newnode;
+  xmlNodePtr newnode, menu_node, mholder = NULL;
   int i=0;
 
   split = g_strsplit (page_content, "* Menu:", 2);
@@ -1136,37 +1150,69 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
 			 BAD_CAST "Section", NULL);
     
 
-  tmp = g_strconcat (split[0], "\n* Menu:", NULL);
   if (!notes)
-    info_body_text (newnode, NULL, FALSE, tmp);
+    info_body_text (newnode, NULL, FALSE, split[0]);
   else {
-    info_process_text_notes (&newnode, tmp, tree);
+    info_process_text_notes (&newnode, split[0], tree);
   }
-  g_free (tmp);
 
   menuitems = g_strsplit (split[1], "\n", -1);
   g_strfreev (split);
 
+  /* The output xml should look something like the following:
+
+     <menu>
+       <menuholder>
+         <a href="xref:Help-Inv">Help-Inv</a>
+         <para1>Invisible text in Emacs Info.</para1>
+       </menuholder>
+       <menuholder>
+         <a href="xref:Help-M">Help-M</a>
+         <para1>Menus.</para1>
+       </menuholder>
+       ...
+     </menu>
+
+     (from the top page of info:info). Note the absence of *'s and
+     ::'s on the links.
+
+     If there's a line with no "* Blah::", it looks like a child of
+     the previous menu item so (for i > 0) deal with that correctly by
+     not "closing" the <menuholder> tag until we find the next
+     start.
+  */
+
+  if (menuitems[0] != NULL) {
+    /* If there are any menu items, make the <menu> node */
+    menu_node = xmlNewChild (newnode, NULL, BAD_CAST "menu", NULL);
+  }
+
   while (menuitems[i] != NULL) {
     gboolean menu = FALSE;
     gchar *title = NULL;
     gchar *ref = NULL;
     gchar *desc = NULL;
     gchar *xref = NULL;
-    xmlNodePtr mholder;
     xmlNodePtr ref1;
 
     menu = get_menuoptions (menuitems[i], &title, &ref, &desc, &xref);
- 
+
+    if (menu && (*title == '\0' || *(title + 1) == '\0')) {
+      g_warning ("Info title unexpectedly short for menu item (%s)",
+                 menuitems[i]);
+      menu = FALSE;
+    }
+
     if (menu) {
-      mholder = xmlNewChild (newnode, NULL, BAD_CAST "menuholder", NULL);
+      mholder = xmlNewChild (menu_node, NULL, BAD_CAST "menuholder", NULL);
       gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &xref);
       
       if (ref == NULL) { /* A standard type menu */
-	tmp = g_strconcat (title, "::", NULL);
+        /* title+2 skips the "* ". We know we haven't jumped over the
+           end of the string because strlen (title) >= 3 */
 	ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
-				BAD_CAST tmp);
-	g_free (tmp);
+				BAD_CAST title+2);
+
         tmp = g_strconcat ("xref:", xref, NULL);
 	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
         g_free (tmp);
@@ -1200,12 +1246,19 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
         g_free (tmp);
 	g_free (sp);
       }
-      xmlNewTextChild (mholder, NULL, BAD_CAST "para",
-		       BAD_CAST desc);
-    } else {
-      xmlNewTextChild (newnode, NULL, BAD_CAST "para",
-		       BAD_CAST menuitems[i]);
-      
+
+      tmp = g_strconcat ("\n", first_non_space (desc), NULL);
+      xmlNewTextChild (mholder, NULL, BAD_CAST "para1",
+		       BAD_CAST tmp);
+      g_free (tmp);
+
+    }
+    else if (*(menuitems[i]) != '\0') {
+      tmp = g_strconcat ("\n", first_non_space (menuitems[i]), NULL);
+      xmlNewTextChild (mholder ? mholder : menu_node,
+                       NULL, BAD_CAST "para1",
+		       BAD_CAST tmp);
+      g_free (tmp);
     }
     i++;
     g_free (title);
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index c029148..1117a80 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -47,7 +47,8 @@
   <xsl:param name="left"/>
   <xsl:param name="right"/>
   <xsl:text>
-div.body { white-space: pre; font-family: monospace; }
+div.body { font-family: monospace; }
+span.fixed { white-space: pre; }
 <!-- navbar from mal2html, possibly move to html.xsl -->
 div.navbar {
   margin: 0 0 1em 0;
@@ -106,13 +107,17 @@ a.navbar-next::after {
 <!-- = Normal Matches = -->
 
 <xsl:template match="para">
-  <xsl:value-of select="node()"/>
-  <xsl:text>
+  <span class="fixed">
+    <xsl:value-of select="node()"/>
+    <xsl:text>
   </xsl:text>
+  </span>
 </xsl:template>
 
 <xsl:template match="para1">
-  <xsl:value-of select="node()"/>
+  <span class="fixed">
+    <xsl:value-of select="node()"/>
+  </span>
 </xsl:template>
 
 <xsl:template match="header">
@@ -156,8 +161,17 @@ a.navbar-next::after {
   </xsl:element>
 </xsl:template>
 
+<xsl:template match="menu">
+  <xsl:element name="p">Menu:</xsl:element>
+  <xsl:element name="ul">
+    <xsl:apply-templates />
+  </xsl:element>
+</xsl:template>
+
 <xsl:template match="menuholder">
-  <xsl:apply-templates select="node()[not(self::menuholder)]"/>
+  <xsl:element name="li">
+    <xsl:apply-templates />
+  </xsl:element>
 </xsl:template>
 
 <xsl:template match="noteholder">
-- 
1.7.1

From 7ede37523e6fe60ae13fe5e9f98b356d9da4b4d4 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Fri, 10 Sep 2010 11:42:23 +0100
Subject: [PATCH 3/9] Parse info files into paragraphs (separated by blank lines).

---
 libyelp/yelp-info-parser.c   |  454 ++++++++++++++++++++++--------------------
 stylesheets/info2html.xsl.in |   15 +-
 2 files changed, 248 insertions(+), 221 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 1605ecf..a85f733 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -128,7 +128,8 @@ info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
     source = (gchar*)g_hash_table_lookup (h, "src");
 
   if (!h || !source || !*source)
-    return xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST "[broken image]");
+    return xmlNewTextChild (parent, NULL, BAD_CAST "para",
+                            BAD_CAST "[broken image]");
 
   gchar *title = (gchar*)g_hash_table_lookup (h, "title");
   gchar *text = (gchar*)g_hash_table_lookup (h, "text");
@@ -201,7 +202,6 @@ join_strings_subset (const gchar *separator,
 */
 static void
 lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
-                         gboolean inline_p,
                          gchar** first, gchar** last)
 {
   /* TODO? Currently we're copying the split strings again, which is
@@ -209,11 +209,10 @@ lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
      window on `content'. But that's much more difficult, so unless
      there's a problem, let's go with the stupid approach. */
   gchar *glob;
+
   if (last > first) {
     glob = join_strings_subset ("\n", first, last);
-    xmlNewTextChild (parent, ns,
-                     inline_p ? BAD_CAST "para1" : BAD_CAST "para",
-                     BAD_CAST glob);
+    xmlAddChild (parent, xmlNewText (BAD_CAST glob));
     g_free (glob);
   }
 }
@@ -222,21 +221,24 @@ lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
   Convert body text CONTENT to xml nodes. This function is responsible
   for spotting headings etc and splitting them out correctly.
 
+  paragraph is as described in info_body_text, but cannot be null.
+
   If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
-  <para> tag. 
+  <para> tag.
 
   TODO: IWBN add a regex match for *Note: here and call the *Note ==>
   <a href> logic of info_process_text_notes from here.
  */
 static void
-info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
+info_body_parse_text (xmlNodePtr parent, xmlNodePtr *paragraph,
+                      xmlNsPtr ns,
                       gboolean inline_p, const gchar *content)
 {
   /* The easiest things to spot are headings: they look like a line of
    * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
    * them, we split content into single lines and work with them. */
   gchar **lines = g_strsplit (content, "\n", 0);
-  gchar **first = lines, **last = lines+1;
+  gchar **first = lines, **last = lines;
   int header_level;
   xmlNodePtr header_node;
 
@@ -252,11 +254,27 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
    * the chunk of the body we're displaying (inclusive) */
   for (; *last; last++) {
 
+    /* Check for a blank line */
+    if (**last == '\0') {
+      if (last != first) {
+        if (!*paragraph) {
+          *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
+        }
+        lines_subset_text_child (*paragraph, ns, first, last);
+      }
+      /* On the next iteration, last==first both pointing at the next
+         line. */
+      first = last+1;
+      *paragraph = NULL;
+
+      continue;
+    }
+
     /* Check for a header */
     header_level = header_underline_level (*last);
     if (header_level) {
       /* Write out any lines beforehand */
-      lines_subset_text_child (parent, ns, FALSE, first, last-1);
+      lines_subset_text_child (parent, ns, first, last-1);
       /* Now write out the actual header line */
       header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
                                      BAD_CAST *(last-1));
@@ -264,11 +282,15 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
                   BAD_CAST level_headings[header_level]);
       
       first = last+1;
-      last = first+1;
+      last = first-1;
     }
   }
+
   /* Write out any lines left */
-  lines_subset_text_child (parent, ns, inline_p, first, last);
+  if (!*paragraph) {
+    *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
+  }
+  lines_subset_text_child (*paragraph, ns, first, last);
   
   g_strfreev (lines);
 }
@@ -278,14 +300,21 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
   body and turning it into paragraph tags. It searches out images and
   marks them up properly if necessary.
 
+  parent should be the node in which we're currently storing text and
+  paragraph a pointer to a <para> tag or NULL. At blank lines, we
+  finish with the current para tag and switch to a new one.
+
   It uses info_body_parse_text to mark up the actual bits of text.
  */
 static void
-info_body_text (xmlNodePtr parent, xmlNsPtr ns,
+info_body_text (xmlNodePtr parent, xmlNodePtr *paragraph, xmlNsPtr ns,
                 gboolean inline_p, gchar const *content)
 {
+  xmlNodePtr thepara = NULL;
+  if (paragraph == NULL) paragraph = &thepara;
+
   if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
-    info_body_parse_text (parent, ns, inline_p, content);
+    info_body_parse_text (parent, paragraph, ns, inline_p, content);
     return;
   }
 
@@ -293,6 +322,7 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns,
   gint pos = 0;
   GRegex *regex = g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE "((?:[^" INFO_TAG_1 "]|[^" INFO_C_TAG_0 "]+" INFO_TAG_1 ")*)" INFO_C_TAG_CLOSE_RE ")", 0, 0, NULL);
   GMatchInfo *match_info;
+
   g_regex_match (regex, content, 0, &match_info);
   while (g_match_info_matches (match_info))
     {
@@ -302,14 +332,18 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns,
 						     &image_start, &image_end);
       gchar *before = g_strndup (&content[pos], image_start - pos);
       pos = image_end + 1;
-      info_body_parse_text (parent, NULL, TRUE, before);
+      info_body_parse_text (parent, paragraph, NULL, TRUE, before);
       g_free (before);
+
+      /* End the paragraph that was before */
+      *paragraph = NULL;
+
       if (image_found)
 	info_insert_image (parent, match_info);
       g_match_info_next (match_info, NULL);
     }
   gchar *after = g_strndup (&content[pos], content_len - pos);
-  info_body_parse_text (parent, NULL, TRUE, after);
+  info_body_parse_text (parent, paragraph, NULL, TRUE, after);
   g_free (after);
 }
 
@@ -977,8 +1011,8 @@ parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 					     BAD_CAST "Section",
 					     NULL);
 		  if (!notes)
-		    info_body_text (newnode, NULL, FALSE, page_content);
-		  
+		    info_body_text (newnode, NULL, NULL, FALSE, page_content);
+
 		  else {
 		    /* Handle notes here */
 		    info_process_text_notes (&newnode, page_content, tree);
@@ -1151,7 +1185,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
     
 
   if (!notes)
-    info_body_text (newnode, NULL, FALSE, split[0]);
+    info_body_text (newnode, NULL, NULL, FALSE, split[0]);
   else {
     info_process_text_notes (&newnode, split[0], tree);
   }
@@ -1277,212 +1311,208 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 {
   gchar **notes;
   gchar **current;
-  xmlNodePtr holder;
   xmlNodePtr ref1;
+  xmlNodePtr paragraph = NULL;
   gboolean first = TRUE;
 
-  notes = g_strsplit (content, "*Note", -1);
-  holder = xmlNewChild (*node, NULL, BAD_CAST "noteholder", NULL);
+  /*
+    Split using the regular expression
+
+      \*[Nn]ote(?!_)
+
+    which deals with either case and the last bit is a lookahead so
+    that we don't split on things of the form *Note:_, which aren't
+    real notes.
+  */
+  notes = g_regex_split_simple ("\\*[Nn]ote(?!_)", content, 0, 0);
 
   for (current = notes; *current != NULL; current++) {
-    /* Since the notes can be either *Note or *note, we handle the second 
-     * variety here
-     */
-    gchar **subnotes;
-    gchar **current_real;
-
-    subnotes = g_strsplit (*current, "*note", -1);
-    for (current_real = subnotes; *current_real != NULL; current_real++) {
-      gchar *url, **urls, **ulink;
-      gchar *append;
-      gchar *alt_append, *alt_append1;
-      gchar *link_text;
-      gchar *href = NULL;
-      gchar *break_point = NULL;
-      gboolean broken = FALSE;
-      if (first) {
-	/* The first node is special.  It doesn't have a note ref at the 
-	 * start, so we can just add it and forget about it.
-	 */
-	first = FALSE;
-	info_body_text (holder, NULL, TRUE, (*current_real));
-	continue;
-      }
-      /* If we got to here, we now gotta parse the note reference */
-
-      if (*current_real[0] == '_') {
-	/* Special type of note that isn't really a note, but pretends
-	 * it is
-	 */
-	info_body_text (holder, NULL, TRUE,
-			g_strconcat ("*Note", *current_real, NULL));
-	continue;
-      }
-      append = strchr (*current_real, ':');
-      if (!append) {
-	info_body_text (holder, NULL, TRUE, *current_real);
-	continue;
-      }
-      append++;
-      alt_append = append;
-      alt_append1 = alt_append;
-      append = strchr (append, ':');
-      alt_append = strchr (alt_append, '.');
-      if (alt_append && g_str_has_prefix (alt_append, ".info")) {
-	broken = TRUE;
-	alt_append++;
-	alt_append = strchr (alt_append, '.');
-      }
-      alt_append1 = strchr (alt_append1, ',');
-      if (!append && !alt_append && !alt_append1) {
-	info_body_text (holder, NULL, TRUE, *current_real);
-	continue;
-      }
-      if (!append || alt_append || alt_append1) {
-	if (!append) {
-	  if (alt_append) append = alt_append;
-	  else append = alt_append1;
-	}
-	if ((alt_append && alt_append < append))
-	  append = alt_append;
-	if (alt_append1 && alt_append1 < append)
-	  append = alt_append1;
-      }
-      append++;
-      url = g_strndup (*current_real, append - (*current_real));
-
-      /* By now, we got 2 things.  First, is append which is the (hopefully)
-       * non-link text.  Second, we got a url.
-       * The url can be in several forms:
-       * 1. linkend::
-       * 2. linkend:(infofile)Linkend.
-       * 3. Title: Linkend.
-       * 4. Title: Linkend, (pretty sure this is just broken)
-       * 5. Title: (infofile.info)Linkend.
-       * All possibilities should have been picked up.
-       * Here:
-       * Clean up the split.  Should be left with a real url and
-       * a list of fragments that should be linked
-       * Also goes through and removes extra spaces, leaving only one 
-       * space in place of many
+    gchar *url, **urls, **ulink;
+    gchar *append;
+    gchar *alt_append, *alt_append1;
+    gchar *link_text;
+    gchar *href = NULL;
+    gchar *break_point = NULL;
+    gboolean broken = FALSE;
+    if (first) {
+      /* The first node is special.  It doesn't have a note ref at the 
+       * start, so we can just add it and forget about it.
        */
-      urls = g_strsplit (url, "\n", -1);
-      break_point = strchr (url, '\n');
-      while (break_point) {
-	*break_point = ' ';
-	break_point = strchr (++break_point, '\n');
+      first = FALSE;
+      info_body_text (*node, &paragraph, NULL, TRUE, (*current));
+      continue;
+    }
+
+    /* If we got to here, we now gotta parse the note reference */
+    append = strchr (*current, ':');
+    if (!append) {
+      info_body_text (*node, &paragraph, NULL, TRUE, *current);
+      continue;
+    }
+    append++;
+    alt_append = append;
+    alt_append1 = alt_append;
+    append = strchr (append, ':');
+    alt_append = strchr (alt_append, '.');
+    if (alt_append && g_str_has_prefix (alt_append, ".info")) {
+      broken = TRUE;
+      alt_append++;
+      alt_append = strchr (alt_append, '.');
+    }
+    alt_append1 = strchr (alt_append1, ',');
+    if (!append && !alt_append && !alt_append1) {
+      info_body_text (*node, &paragraph, NULL, TRUE, *current);
+      continue;
+    }
+    if (!append || alt_append || alt_append1) {
+      if (!append) {
+        if (alt_append) append = alt_append;
+        else append = alt_append1;
       }
-      break_point = strchr (url, ' ');
-      while (break_point) {
-	if (*(break_point+1) == ' ') {
-	  /* Massive space.  Fix. */
-	  gchar *next = break_point;
-	  gchar *url_copy;
-	  while (*next == ' ')
-	    next++;
-	  next--;
-	  url_copy = g_strndup (url, break_point-url);
-	  g_free (url);
-	  url = g_strconcat (url_copy, next, NULL);
-	  break_point = strchr (url, ' ');
-	  g_free (url_copy);
-	} else {
-	  break_point++;
-	  break_point = strchr (break_point, ' ');
-	}
+      if ((alt_append && alt_append < append))
+        append = alt_append;
+      if (alt_append1 && alt_append1 < append)
+        append = alt_append1;
+    }
+    append++;
+    url = g_strndup (*current, append - (*current));
+
+    /* By now, we got 2 things.  First, is append which is the (hopefully)
+     * non-link text.  Second, we got a url.
+     * The url can be in several forms:
+     * 1. linkend::
+     * 2. linkend:(infofile)Linkend.
+     * 3. Title: Linkend.
+     * 4. Title: Linkend, (pretty sure this is just broken)
+     * 5. Title: (infofile.info)Linkend.
+     * All possibilities should have been picked up.
+     * Here:
+     * Clean up the split.  Should be left with a real url and
+     * a list of fragments that should be linked
+     * Also goes through and removes extra spaces, leaving only one 
+     * space in place of many
+     */
+    urls = g_strsplit (url, "\n", -1);
+    break_point = strchr (url, '\n');
+    while (break_point) {
+      *break_point = ' ';
+      break_point = strchr (++break_point, '\n');
+    }
+    break_point = strchr (url, ' ');
+    while (break_point) {
+      if (*(break_point+1) == ' ') {
+        /* Massive space.  Fix. */
+        gchar *next = break_point;
+        gchar *url_copy;
+        while (*next == ' ')
+          next++;
+        next--;
+        url_copy = g_strndup (url, break_point-url);
+        g_free (url);
+        url = g_strconcat (url_copy, next, NULL);
+        break_point = strchr (url, ' ');
+        g_free (url_copy);
+      } else {
+        break_point++;
+        break_point = strchr (break_point, ' ');
       }
-      if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */ 
-	gchar *stop = NULL;
-	gchar *lurl = NULL;
-	gchar *zloc = NULL;
-	stop = strchr (url, ':');
-	lurl = strchr (stop, '(');
-	if (!lurl) { /* 3rd type of link */
-	  gchar *link;
-	  gint length;
-	  stop++;
-	  link = g_strdup (stop);
-	  link = g_strstrip (link);
-	  length = strlen (link) - 1;
-	  link[length] = '\0';	  
-	  href = g_strconcat ("xref:", link, NULL);
-	  link[length] = 'a';
-	  g_free (link);
-
-
-	} else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
-	  if (broken) {
-	    gchar *new_url;
-	    gchar *info;
-	    gchar *stripped;
-
-	    new_url = g_strdup (lurl);
-	    info = strstr (new_url, ".info)");
-	    stripped = g_strndup (new_url, info-new_url);
-	    info +=5;
-	    lurl = g_strconcat (stripped, info, NULL);
-	    g_free (stripped);
-	    g_free (new_url);
-	  }
-	  zloc = &(lurl[strlen(lurl)-1]);
-	  *zloc = '\0';
-	  href = g_strconcat ("info:", lurl, NULL);
-	  *zloc = 'a';
-	}
-      } else { /* First kind of link */
-	gchar *tmp1;
-	gchar *frag;
-
-	tmp1 = strchr (url, ':');
-	if (!tmp1)
-	  frag = g_strdup (url);
-	else 
-	  frag = g_strndup (url, tmp1 - url);
-	g_strstrip (frag);
-	gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
-	href = g_strconcat ("xref:", frag, NULL);
-        g_free (frag);
+    }
+    if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */
+      gchar *stop = NULL;
+      gchar *lurl = NULL;
+      gchar *zloc = NULL;
+      stop = strchr (url, ':');
+      lurl = strchr (stop, '(');
+      if (!lurl) { /* 3rd type of link */
+        gchar *link;
+        gint length;
+        stop++;
+        link = g_strdup (stop);
+        link = g_strstrip (link);
+        length = strlen (link) - 1;
+        link[length] = '\0';
+        href = g_strconcat ("xref:", link, NULL);
+        link[length] = 'a';
+        g_free (link);
+
+
+      } else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
+        if (broken) {
+          gchar *new_url;
+          gchar *info;
+          gchar *stripped;
+
+          new_url = g_strdup (lurl);
+          info = strstr (new_url, ".info)");
+          stripped = g_strndup (new_url, info-new_url);
+          info +=5;
+          lurl = g_strconcat (stripped, info, NULL);
+          g_free (stripped);
+          g_free (new_url);
+        }
+        zloc = &(lurl[strlen(lurl)-1]);
+        *zloc = '\0';
+        href = g_strconcat ("info:", lurl, NULL);
+        *zloc = 'a';
       }
-      for (ulink = urls; *ulink != NULL; ulink++) {
-	if (ulink == urls)
-	  link_text = g_strconcat ("*Note", *ulink, NULL);
-	else {
-	  gchar *spacing = *ulink;
-	  gchar *tmp;
-	  gint count = 0;
-	  while (*spacing == ' ') {
-	    spacing++;
-	    count++;
-	  }
-	  if (spacing != *ulink) {
-	    if (count > 1)
-	      spacing-=2;
-	    tmp = g_strndup (*ulink, spacing-*ulink);
-	    if (count > 1)
-	      spacing+=2;
-	    xmlNewTextChild (holder, NULL, BAD_CAST "spacing",
-			     BAD_CAST tmp);
-	    g_free (tmp);
-	    link_text = g_strdup (spacing);
-	  } else {
-	    link_text = g_strdup (*ulink);
-	  }
-	}
-	ref1 = xmlNewTextChild (holder, NULL, BAD_CAST "a",
-				BAD_CAST link_text);
-	if (*(ulink+1) != NULL)
-	  info_body_text (holder, NULL, FALSE, "");
+    } else { /* First kind of link */
+      gchar *tmp1;
+      gchar *frag;
+
+      tmp1 = strchr (url, ':');
+      if (!tmp1)
+        frag = g_strdup (url);
+      else
+        frag = g_strndup (url, tmp1 - url);
+      g_strstrip (frag);
+      gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
+      href = g_strconcat ("xref:", frag, NULL);
+      g_free (frag);
+    }
 
-	g_free (link_text);
-	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
+    /* Check we've got a valid paragraph node */
+    if (!paragraph) {
+      paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
+    }
+
+    for (ulink = urls; *ulink != NULL; ulink++) {
+      if (ulink == urls)
+        link_text = g_strconcat ("*Note", *ulink, NULL);
+      else {
+        gchar *spacing = *ulink;
+        gchar *tmp;
+        gint count = 0;
+        while (*spacing == ' ') {
+          spacing++;
+          count++;
+        }
+        if (spacing != *ulink) {
+          if (count > 1)
+            spacing-=2;
+          tmp = g_strndup (*ulink, spacing-*ulink);
+          if (count > 1)
+            spacing+=2;
+          xmlNewTextChild (paragraph, NULL, BAD_CAST "spacing",
+                           BAD_CAST tmp);
+          g_free (tmp);
+          link_text = g_strdup (spacing);
+        } else {
+          link_text = g_strdup (*ulink);
+        }
       }
-      g_strfreev (urls);
-      /* Finally, we can add the text as required */
-      info_body_text (holder, NULL, TRUE, append);
-      g_free (url);
-      g_free (href);
+      ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
+                              BAD_CAST link_text);
+      if (*(ulink+1) != NULL)
+        info_body_text (*node, &paragraph, NULL, FALSE, "");
+
+      g_free (link_text);
+      xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
     }
-    g_strfreev (subnotes);
+    g_strfreev (urls);
+    /* Finally, we can add the text as required */
+    info_body_text (*node, &paragraph, NULL, TRUE, append);
+    g_free (url);
+    g_free (href);
   }
   g_strfreev (notes);
 }
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index 1117a80..a97b054 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -107,11 +107,12 @@ a.navbar-next::after {
 <!-- = Normal Matches = -->
 
 <xsl:template match="para">
-  <span class="fixed">
-    <xsl:value-of select="node()"/>
-    <xsl:text>
-  </xsl:text>
-  </span>
+  <p>
+    <span class="fixed">
+      <!-- Apply templates for <a> tags and copy text straight through. -->
+      <xsl:apply-templates select="./text()|*"/>
+    </span>
+  </p>
 </xsl:template>
 
 <xsl:template match="para1">
@@ -174,8 +175,4 @@ a.navbar-next::after {
   </xsl:element>
 </xsl:template>
 
-<xsl:template match="noteholder">
-  <xsl:apply-templates select="node()[not(self::noteholder)]"/>
-</xsl:template>
-
 </xsl:stylesheet>
-- 
1.7.1

From 45762b7f91b57038f893df6e6221db0bd7fbe255 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Sat, 11 Sep 2010 22:21:19 +0100
Subject: [PATCH 4/9] Render multi-line links correctly.

---
 libyelp/yelp-info-parser.c |   50 +++++++++++++++-----------------------------
 1 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index a85f733..d4ef7bc 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -1377,6 +1377,9 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
     append++;
     url = g_strndup (*current, append - (*current));
 
+    /* Save a copy of the unadulterated link text for later. */
+    link_text = g_strconcat ("*Note", url, NULL);
+
     /* By now, we got 2 things.  First, is append which is the (hopefully)
      * non-link text.  Second, we got a url.
      * The url can be in several forms:
@@ -1475,42 +1478,23 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
       paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
     }
 
-    for (ulink = urls; *ulink != NULL; ulink++) {
-      if (ulink == urls)
-        link_text = g_strconcat ("*Note", *ulink, NULL);
-      else {
-        gchar *spacing = *ulink;
-        gchar *tmp;
-        gint count = 0;
-        while (*spacing == ' ') {
-          spacing++;
-          count++;
-        }
-        if (spacing != *ulink) {
-          if (count > 1)
-            spacing-=2;
-          tmp = g_strndup (*ulink, spacing-*ulink);
-          if (count > 1)
-            spacing+=2;
-          xmlNewTextChild (paragraph, NULL, BAD_CAST "spacing",
-                           BAD_CAST tmp);
-          g_free (tmp);
-          link_text = g_strdup (spacing);
-        } else {
-          link_text = g_strdup (*ulink);
-        }
-      }
-      ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
-                              BAD_CAST link_text);
-      if (*(ulink+1) != NULL)
-        info_body_text (*node, &paragraph, NULL, FALSE, "");
+    /*
+      Now we're supposed to actually render the link. I have a list of
+      bits of URL and actually this is really easy - I want to have
+      the link *text* exactly the same as it appeared in the .info
+      file, so don't use the list of strings urls, instead use the
+      whole lot: url (complete with embedded newlines etc.)
+    */
+    ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
+                            BAD_CAST link_text);
+    g_free (link_text);
+    xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
 
-      g_free (link_text);
-      xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
-    }
     g_strfreev (urls);
-    /* Finally, we can add the text as required */
+
+    /* Finally, we can add the following text as required */
     info_body_text (*node, &paragraph, NULL, TRUE, append);
+
     g_free (url);
     g_free (href);
   }
-- 
1.7.1

From ceb73e0c918c516c90fe0d71d1ec42103785769f Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Wed, 15 Sep 2010 00:22:30 +0100
Subject: [PATCH 5/9] Menu formatting.

Checks for general whitespace in menus, so no longer gets confused by
tabs. Also, doesn't write the link text twice if the description is
identical.
---
 libyelp/yelp-info-parser.c |   38 ++++++++++++++++++++++++++++++--------
 1 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index d4ef7bc..edd3812 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -1158,13 +1158,13 @@ get_menuoptions (gchar *line, gchar **title, gchar **ref, gchar **desc,
   return TRUE;
 }
 
-/* Find the first non-space character in str or return pointer to the
+/* Find the first non whitespace character in str or return pointer to the
  * '\0' if there isn't one. */
 static gchar*
 first_non_space (gchar* str)
 {
   /* As long as str is null terminated, this is ok! */
-  while (*str == ' ') str++;
+  while (g_ascii_isspace (*str)) str++;
   return str;
 }
 
@@ -1227,6 +1227,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
     gchar *ref = NULL;
     gchar *desc = NULL;
     gchar *xref = NULL;
+    gchar *link_text = NULL;
     xmlNodePtr ref1;
 
     menu = get_menuoptions (menuitems[i], &title, &ref, &desc, &xref);
@@ -1244,8 +1245,10 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
       if (ref == NULL) { /* A standard type menu */
         /* title+2 skips the "* ". We know we haven't jumped over the
            end of the string because strlen (title) >= 3 */
-	ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
-				BAD_CAST title+2);
+        link_text = g_strdup (title+2);
+
+        ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
+                                BAD_CAST link_text);
 
         tmp = g_strconcat ("xref:", xref, NULL);
 	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
@@ -1263,8 +1266,10 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
 	}
 	sp = g_strndup (ref, c);
 	
+        link_text = g_strdup (title);
+
 	ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
-					BAD_CAST title);
+                                BAD_CAST link_text);
         tmp = g_strconcat ("xref:", xref, NULL);
 	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
         g_free (tmp);
@@ -1282,10 +1287,27 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
       }
 
       tmp = g_strconcat ("\n", first_non_space (desc), NULL);
-      xmlNewTextChild (mholder, NULL, BAD_CAST "para1",
-		       BAD_CAST tmp);
-      g_free (tmp);
 
+      /*
+        Don't print the link text a second time, because that looks
+        really stupid.
+
+        We don't do a straight check for equality because lots of
+        .info files have something like
+
+          * Foo::    Foo.
+
+        Obviously if the longer explanation has more afterwards, we
+        don't want to omit it, which is why there's the strlen test.
+      */
+      if (strncmp (link_text, tmp + 1, strlen (link_text)) ||
+          strlen (link_text) + 1 < strlen (tmp + 1)) {
+        xmlNewTextChild (mholder, NULL,
+                         BAD_CAST "para1", BAD_CAST tmp);
+      }
+
+      g_free (tmp);
+      g_free (link_text);
     }
     else if (*(menuitems[i]) != '\0') {
       tmp = g_strconcat ("\n", first_non_space (menuitems[i]), NULL);
-- 
1.7.1

From d9918c73fb422a36495710472d7f7743723e5440 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Wed, 15 Sep 2010 10:42:56 +0100
Subject: [PATCH 6/9] Allow special characters in info links.

If an info file has a special character such as '<' in a title (section
12.1 of info:autoconf does, for example), then when a link to it is
clicked the url gets "url-encoded" to something like "%3c-...". This
patch adds a url-decode stage to the parsing of info xrefs, which
undoes this mangling.
---
 libyelp/yelp-uri.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 72 insertions(+), 0 deletions(-)

diff --git a/libyelp/yelp-uri.c b/libyelp/yelp-uri.c
index 7010a24..a58e288 100644
--- a/libyelp/yelp-uri.c
+++ b/libyelp/yelp-uri.c
@@ -958,6 +958,66 @@ resolve_man_uri (YelpUri *uri)
     g_free (newarg);
 }
 
+/*
+  Return 1 if ch is a number from 0 to 9 or a letter a-f or A-F and 0
+  otherwise. This is sort of not utf8-safe, but since we are only
+  looking for 7-bit things, it doesn't matter.
+ */
+static int
+is_hex (gchar ch)
+{
+    if (((48 <= ch) && (ch <= 57)) ||
+        ((65 <= ch) && (ch <= 70)) ||
+        ((97 <= ch) && (ch <= 102)))
+        return 1;
+    return 0;
+}
+
+/*
+  Return a newly allocated string, where %ab for a,b in [0, f] is
+  replaced by the character it represents.
+ */
+static gchar*
+decode_url (const gchar *url)
+{
+    if (!url) return NULL;
+
+    unsigned int len = strlen (url) + 1;
+    int hex;
+    gchar *ret = g_new (gchar, len);
+    const gchar *ptr = url, *end = url+len;
+    gchar *retptr = ret, *tmp;
+
+    while (ptr < end) {
+        if (*ptr == '%' && is_hex(*(ptr + 1)) && is_hex(*(ptr + 2))) {
+            *retptr = *(ptr+1);
+            *(retptr+1) = *(ptr+2);
+            *(retptr+2) = '\0';
+
+            sscanf (retptr, "%x", &hex);
+
+            if (hex < 0 || hex > 127) {
+                g_warning ("Skipping non-7-bit character.");
+                ptr++;
+                continue;
+            }
+            *retptr = (gchar)hex;
+
+            retptr++;
+            ptr += 3;
+        }
+        else {
+            tmp = g_utf8_next_char(ptr);
+            memcpy (retptr, ptr, (tmp-ptr));
+            retptr += tmp-ptr;
+            ptr = tmp;
+        }
+    }
+    *retptr = '\0';
+
+    return ret;
+}
+
 static void
 resolve_info_uri (YelpUri *uri)
 {
@@ -1094,6 +1154,18 @@ resolve_xref_uri (YelpUri *uri)
         }
     }
 
+    if (priv->page_id &&
+        g_str_has_prefix (priv->docuri, "info:")) {
+        /*
+          Special characters get url-encoded when they get clicked on
+          as links. Info files, at least, don't want that so decode
+          the url again here.
+         */
+        gchar* tmp = priv->page_id;
+        priv->page_id = decode_url (priv->page_id);
+        g_free (tmp);
+    }
+
     if (g_str_has_prefix (priv->docuri, "ghelp:"))
         priv->fulluri = g_strconcat (priv->docuri,
                                      priv->page_id ? "?" : "",
-- 
1.7.1

From 90f18fa4f9b28dd12295647f6b485faeb2aa4833 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Sat, 12 Jun 2010 17:51:47 +0100
Subject: [PATCH 7/9] Copy the info program for dealing with INFOPATH.

The info program uses the following logic to deal with INFOPATH[1]:
  - If there is no such environment variable, use a default list.
  - If INFOPATH is specified and ends with ':', prepend it to the
    default list
  - Otherwise, use INFOPATH rather than the default.

[1] http://www.gnu.org/software/texinfo/manual/info-stnd/html_node/Invoking-Info.html
---
 libyelp/yelp-uri.c |   22 +++++++++++++++++++---
 1 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/libyelp/yelp-uri.c b/libyelp/yelp-uri.c
index a58e288..02d1841 100644
--- a/libyelp/yelp-uri.c
+++ b/libyelp/yelp-uri.c
@@ -114,6 +114,9 @@ static const gchar *infosuffix[] = {
     NULL
 };
 
+static const gchar default_info_path[] =
+    "/usr/info:/usr/share/info:/usr/local/info:/usr/local/share/info";
+
 /******************************************************************************/
 
 static void
@@ -1029,7 +1032,6 @@ resolve_info_uri (YelpUri *uri)
      * info:(name)
      */
     static gchar **infopath = NULL;
-    const gchar * const * langs = g_get_language_names ();
     gchar *name = NULL;
     gchar *sect = NULL;
     gchar *fullpath = NULL;
@@ -1049,10 +1051,24 @@ resolve_info_uri (YelpUri *uri)
 
     if (!infopath) {
         /* Initialize infopath only once */
+
+        /* Use the same logic as the info program. If INFOPATH is not
+           specified, use the default. If it is specified, just use it
+           unless it ends with a colon, in which case we add the
+           default as a suffix.
+        */
         const gchar *env = g_getenv ("INFOPATH");
+        gchar *paths;
         if (!env || env[0] == '\0')
-            env = "/usr/info:/usr/share/info:/usr/local/info:/usr/local/share/info";
-        infopath = g_strsplit (env, ":", 0);
+            paths = g_strdup (default_info_path);
+        else if (env[strlen (env)-1] == ':')
+            paths = g_strconcat (env, default_info_path, NULL);
+        else
+            paths = g_strdup (env);
+
+        infopath = g_strsplit (paths, ":", 0);
+
+        g_free (paths);
     }
 
     colon = strchr (priv->res_arg, ':');
-- 
1.7.1

From 13b52ee5e90f6f6a3feb887897e995a2c77f2add Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Thu, 10 Jun 2010 11:35:46 +0100
Subject: [PATCH 8/9] Allow the possibility of no compression for info files.

At the moment, the info parser uses YelpMagicDecompressor. If the
decompressor doesn't recognize a stream as BZ2 or LZMA, it assumes the
stream is GZip. This doesn't work if the file is uncompressed. Since
GZip decompression certainly won't work if the file is not GZip
compressed, this patch changes YelpMagicDecompressor to fall back to a
pass-through converter if it doesn't recognize the compression format
(which might mean the file is uncompressed!).
---
 libyelp/yelp-magic-decompressor.c |   86 ++++++++++++++++++++++--------------
 1 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/libyelp/yelp-magic-decompressor.c b/libyelp/yelp-magic-decompressor.c
index 89fa2be..6b3a83c 100644
--- a/libyelp/yelp-magic-decompressor.c
+++ b/libyelp/yelp-magic-decompressor.c
@@ -24,6 +24,7 @@
 #include "config.h"
 
 #include <glib/gi18n.h>
+#include <string.h>
 
 #include "yelp-magic-decompressor.h"
 
@@ -54,7 +55,7 @@ G_DEFINE_TYPE_WITH_CODE (YelpMagicDecompressor, yelp_magic_decompressor, G_TYPE_
 static void
 yelp_magic_decompressor_dispose (GObject *object)
 {
-    YelpMagicDecompressor *decompressor;
+    YelpMagicDecompressor *decompressor = YELP_MAGIC_DECOMPRESSOR (object);
 
     if (decompressor->magic_decoder_ring) {
         g_object_unref (decompressor->magic_decoder_ring);
@@ -98,6 +99,37 @@ yelp_magic_decompressor_reset (GConverter *converter)
         g_converter_reset (decompressor->magic_decoder_ring);
 }
 
+static GConverter*
+yelp_magic_decompressor_choose (const void *inbuf, gsize inbuf_size)
+{
+    /* If input_size is less than two the first time, we end up
+     * not getting detection.  Might be worth addressing.  Not
+     * sure I care.
+     *
+     * The two-byte magic we're doing here is not sufficient in
+     * the general case.  It is sufficient for the specific data
+     * Yelp deals with.
+     */
+    if (inbuf_size <= 2)
+        return NULL;
+
+#ifdef ENABLE_BZ2
+    if (((gchar *) inbuf)[0] == 'B' && ((gchar *) inbuf)[1] == 'Z') {
+        return (GConverter *) yelp_bz2_decompressor_new ();
+    }
+#endif
+#ifdef ENABLE_LZMA
+    if (((gchar *) inbuf)[0] == ']' && ((gchar *) inbuf)[1] == '\0') {
+        return (GConverter *) yelp_lzma_decompressor_new ();
+    }
+#endif
+    if (((guint8*) inbuf)[0] == 0x1F && ((guint8*) inbuf)[1] == 0x8B) {
+        return (GConverter *) g_zlib_decompressor_new (G_ZLIB_COMPRESSOR_FORMAT_GZIP);
+    }
+
+    return NULL;
+}
+
 static GConverterResult
 yelp_magic_decompressor_convert (GConverter *converter,
                                  const void *inbuf,
@@ -110,47 +142,33 @@ yelp_magic_decompressor_convert (GConverter *converter,
                                  GError    **error)
 {
     YelpMagicDecompressor *decompressor;
+    gsize txfer_size;
 
     decompressor = YELP_MAGIC_DECOMPRESSOR (converter);
 
     if (decompressor->first) {
+        decompressor->magic_decoder_ring =
+            yelp_magic_decompressor_choose (inbuf, inbuf_size);
         decompressor->first = FALSE;
-        /* If input_size is less than two the first time, we end up
-         * not getting detection.  Might be worth addressing.  Not
-         * sure I care.
-         *
-         * The two-byte magic we're doing here is not sufficient in
-         * the general case.  It is sufficient for the specific data
-         * Yelp deals with.
-         */
-        if (inbuf_size <= 2)
-            ;
-#ifdef ENABLE_BZ2
-        else if (((gchar *) inbuf)[0] == 'B' &&
-                 ((gchar *) inbuf)[1] == 'Z') {
-            decompressor->magic_decoder_ring = (GConverter *) yelp_bz2_decompressor_new ();
-        }
-#endif
-#ifdef ENABLE_LZMA
-        else if (((gchar *) inbuf)[0] == ']' &&
-                 ((gchar *) inbuf)[1] == '\0') {
-            decompressor->magic_decoder_ring = (GConverter *) yelp_lzma_decompressor_new ();
-        }
-#endif
-        else {
-            decompressor->magic_decoder_ring =
-                (GConverter *) g_zlib_decompressor_new (G_ZLIB_COMPRESSOR_FORMAT_GZIP);
-        }
     }
 
-    return g_converter_convert (decompressor->magic_decoder_ring,
-                                inbuf, inbuf_size,
-                                outbuf, outbuf_size,
-                                flags,
-                                bytes_read, bytes_written,
-                                error);
+    if (decompressor->magic_decoder_ring) {
+        return g_converter_convert (decompressor->magic_decoder_ring,
+                                    inbuf, inbuf_size,
+                                    outbuf, outbuf_size,
+                                    flags,
+                                    bytes_read, bytes_written,
+                                    error);
+    }
 
-    g_assert_not_reached ();
+    /* If there's no magic_decoder_ring, we just copy the data
+     * straight through. */
+    txfer_size = MIN (inbuf_size, outbuf_size);
+    memcpy (outbuf, inbuf, txfer_size);
+    *bytes_read = txfer_size;
+    *bytes_written = txfer_size;
+    
+    return G_CONVERTER_CONVERTED;
 }
 
 static void
-- 
1.7.1

From 616f819a0732ebb2addbe64d44e87e4c62016c9a Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Wed, 15 Sep 2010 11:00:44 +0100
Subject: [PATCH 9/9] Fix returns in info_request_page ()

Return TRUE if we succeed. Previously we returned no value, even though
the function is declared as returning gboolean.
---
 libyelp/yelp-info-document.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/libyelp/yelp-info-document.c b/libyelp/yelp-info-document.c
index 5fb609d..acfb33f 100644
--- a/libyelp/yelp-info-document.c
+++ b/libyelp/yelp-info-document.c
@@ -211,7 +211,7 @@ info_request_page (YelpDocument         *document,
                                                                              callback,
                                                                              user_data);
     if (handled) {
-        return;
+        return TRUE;
     }
 
     g_mutex_lock (priv->mutex);
@@ -241,6 +241,7 @@ info_request_page (YelpDocument         *document,
     }
 
     g_mutex_unlock (priv->mutex);
+    return TRUE;
 }
 
 
-- 
1.7.1

Attachment: pgp6G833kVJjF.pgp
Description: PGP signature

Follow-Ups:
- Re: Info parsing
  - From: Shaun McCance

References:
- Info parsing
  - From: Rupert Swarbrick

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]