Info parsing

From: Rupert Swarbrick <rswarbrick gmail com>
To: gnome-doc-devel-list gnome org
Subject: Info parsing
Date: Sat, 11 Sep 2010 22:33:58 +0100

Hi,

After disappearing for ages (sorry!), I've hacked together some patches
that seem to improve info file handling here.

I can open a bug etc. if it'd be better but maybe this is a good place
for someone to have a look at them?

Patches attached.

What they do:

  - Use blank lines to detect paragraphs and output somewhat more
    semantic HTML.

  - Parse the menus more helpfully and display them as a <ul>

  - Correctly format multi-line links (on my system at least, there's
    one at info:info so it's quite noticeable)

I'd love to hear if I've introduced any regressions. There are still
some problems that pre-date these patches. Most notably, there seems to
be a problem with spaces in links, or maybe with links between different
info files (to reproduce: open info:info, go to expert, then click on a
texinfo link). I'll try to work out what's going on soon.


Rupert

From 7059753590f6f6371b3b04880e82eeb6edb57ce0 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Wed, 16 Jun 2010 10:32:20 +0100
Subject: [PATCH 1/4] Support for headings in info files.

---
 libyelp/yelp-info-parser.c   |  173 +++++++++++++++++++++++++++++++++++++-----
 stylesheets/info2html.xsl.in |   17 ++++
 2 files changed, 170 insertions(+), 20 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 3310794..7d32905 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -1,4 +1,4 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil -*- */
 /*
  * Copyright (C) 2005 Davyd Madeley <davyd madeley id au>
  *
@@ -58,8 +58,13 @@ void                  fix_tag_table                      (gchar *offset,
 							  TagTableFix *a);
 void   		      info_process_text_notes            (xmlNodePtr *node, 
 							  gchar *content,
-							  GtkTreeStore *tree);
+							  GtkTreeStore
+							  *tree);
 
+/*
+  Used to output the correct <heading level="?" /> tag.
+ */
+static const gchar* level_headings[] = { NULL, "1", "2", "3" };
 
 static GHashTable *
 info_image_get_attributes (gchar const* string)
@@ -141,15 +146,144 @@ info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
 }
 
 /*
-  Convert body text CONTENT to xml nodes, processing info image tags
-  when found.  IWBN add a regex match for *Note: here and call the
-  *Note ==> <a href> logic of info_process_text_notes from here.
+  If every element of `str' is `ch' then return TRUE, else FALSE.
  */
-static xmlNodePtr
-info_body_text (xmlNodePtr parent, xmlNsPtr ns, gchar const *name, gchar const *content)
+static gboolean
+string_all_char_p (const gchar* str, gchar ch)
+{
+  for (; *str; str++) {
+    if (*str != ch) return FALSE;
+  }
+  return TRUE;
+}
+
+/*
+  If `line' is a line of '*', '=' or '-', return 1,2,3 respectively
+  for the heading level. If it's anything else, return 0.
+ */
+static int
+header_underline_level (const gchar* line)
+{
+  if (*line != '*' && *line != '=' && *line != '-')
+    return 0;
+
+  if (string_all_char_p (line, '*')) return 1;
+  if (string_all_char_p (line, '=')) return 2;
+  if (string_all_char_p (line, '-')) return 3;
+
+  return 0;
+}
+
+/*
+  Use g_strjoinv to join up the strings from `strings', but they might
+  not actually be a null-terminated array. `end' should be strings+n,
+  where I want the first n strings (strings+0, ..., strings+(n-1)). It
+  shouldn't point outside of the array allocated, but it can point at
+  the null string at the end.
+ */
+static gchar*
+join_strings_subset (const gchar *separator,
+                     gchar** strings, gchar** end)
+{
+  g_assert(end > strings);
+
+  gchar *ptr = *end;
+  *end = NULL;
+  
+  gchar *glob = g_strjoinv (separator, strings);
+  *end = ptr;
+  return glob;
+}
+
+/*
+  Create a text node, child of `parent', with the lines strictly
+  between `first' and `last'.
+*/
+static void
+lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
+                         gboolean inline_p,
+                         gchar** first, gchar** last)
 {
-  if (!strstr (content, INFO_C_IMAGE_TAG_OPEN))
-    return xmlNewTextChild (parent, ns, BAD_CAST name, BAD_CAST content);
+  /* TODO? Currently we're copying the split strings again, which is
+     less efficient than somehow storing lengths and using a sort of
+     window on `content'. But that's much more difficult, so unless
+     there's a problem, let's go with the stupid approach. */
+  gchar *glob;
+  if (last > first) {
+    glob = join_strings_subset ("\n", first, last);
+    xmlNewTextChild (parent, ns,
+                     inline_p ? BAD_CAST "para1" : BAD_CAST "para",
+                     BAD_CAST glob);
+    g_free (glob);
+  }
+}
+
+/*
+  Convert body text CONTENT to xml nodes. This function is responsible
+  for spotting headings etc and splitting them out correctly.
+
+  If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
+  <para> tag. 
+
+  TODO: IWBN add a regex match for *Note: here and call the *Note ==>
+  <a href> logic of info_process_text_notes from here.
+ */
+static void
+info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
+                      gboolean inline_p, const gchar *content)
+{
+  /* The easiest things to spot are headings: they look like a line of
+   * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
+   * them, we split content into single lines and work with them. */
+  gchar **lines = g_strsplit (content, "\n", 0);
+  gchar **first = lines, **last = lines+1;
+  int header_level;
+  xmlNodePtr header_node;
+
+  /* Deal with the possibility that `content' is empty */
+  if (*lines == NULL) {
+    if (!inline_p) {
+      xmlNewTextChild (parent, NULL, BAD_CAST "para", BAD_CAST "");
+    }
+    return;
+  }
+
+  for (; *last; last++) {
+    header_level = header_underline_level (*last);
+    if (header_level) {
+      /* Write out any lines beforehand */
+      lines_subset_text_child (parent, ns, FALSE, first, last-1);
+      /* Now write out the actual header line */
+      header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
+                                     BAD_CAST *(last-1));
+      xmlNewProp (header_node, BAD_CAST "level",
+                  BAD_CAST level_headings[header_level]);
+      
+      first = last+1;
+      last = first+1;
+    }
+  }
+  /* Write out any lines left */
+  lines_subset_text_child (parent, ns, inline_p, first, last);
+  
+  g_strfreev (lines);
+}
+
+/*
+  info_body_text is responsible for taking a hunk of the info page's
+  body and turning it into paragraph tags. It searches out images and
+  marks them up properly if necessary.
+
+  It uses info_body_parse_text to mark up the actual bits of text.
+ */
+static void
+info_body_text (xmlNodePtr parent, xmlNsPtr ns,
+                gboolean inline_p, gchar const *content)
+{
+  if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
+    info_body_parse_text (parent, ns, inline_p, content);
+    return;
+  }
 
   gint content_len = strlen (content);
   gint pos = 0;
@@ -164,16 +298,15 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns, gchar const *name, gchar const *
 						     &image_start, &image_end);
       gchar *before = g_strndup (&content[pos], image_start - pos);
       pos = image_end + 1;
-      xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST (before));
+      info_body_parse_text (parent, NULL, TRUE, before);
       g_free (before);
       if (image_found)
 	info_insert_image (parent, match_info);
       g_match_info_next (match_info, NULL);
     }
   gchar *after = g_strndup (&content[pos], content_len - pos);
-  xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST (after));
+  info_body_parse_text (parent, NULL, TRUE, after);
   g_free (after);
-  return 0;
 }
 
 /* Part 1: Parse File Into Tree Store */
@@ -840,7 +973,7 @@ parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 					     BAD_CAST "Section",
 					     NULL);
 		  if (!notes)
-		    info_body_text (newnode, NULL, "para", page_content);
+		    info_body_text (newnode, NULL, FALSE, page_content);
 		  
 		  else {
 		    /* Handle notes here */
@@ -1005,7 +1138,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
 
   tmp = g_strconcat (split[0], "\n* Menu:", NULL);
   if (!notes)
-    info_body_text (newnode, NULL, "para", tmp);
+    info_body_text (newnode, NULL, FALSE, tmp);
   else {
     info_process_text_notes (&newnode, tmp, tree);
   }
@@ -1119,7 +1252,7 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 	 * start, so we can just add it and forget about it.
 	 */
 	first = FALSE;
-	info_body_text (holder, NULL, "para1", (*current_real));
+	info_body_text (holder, NULL, TRUE, (*current_real));
 	continue;
       }
       /* If we got to here, we now gotta parse the note reference */
@@ -1128,13 +1261,13 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 	/* Special type of note that isn't really a note, but pretends
 	 * it is
 	 */
-	info_body_text (holder, NULL, "para1",
+	info_body_text (holder, NULL, TRUE,
 			g_strconcat ("*Note", *current_real, NULL));
 	continue;
       }
       append = strchr (*current_real, ':');
       if (!append) {
-	info_body_text (holder, NULL, "para1", *current_real);
+	info_body_text (holder, NULL, TRUE, *current_real);
 	continue;
       }
       append++;
@@ -1149,7 +1282,7 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
       }
       alt_append1 = strchr (alt_append1, ',');
       if (!append && !alt_append && !alt_append1) {
-	info_body_text (holder, NULL, "para1", *current_real);
+	info_body_text (holder, NULL, TRUE, *current_real);
 	continue;
       }
       if (!append || alt_append || alt_append1) {
@@ -1285,14 +1418,14 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 	ref1 = xmlNewTextChild (holder, NULL, BAD_CAST "a",
 				BAD_CAST link_text);
 	if (*(ulink+1) != NULL)
-	  info_body_text (holder, NULL, "para", "");
+	  info_body_text (holder, NULL, FALSE, "");
 
 	g_free (link_text);
 	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
       }
       g_strfreev (urls);
       /* Finally, we can add the text as required */
-      info_body_text (holder, NULL, "para1", append);
+      info_body_text (holder, NULL, TRUE, append);
       g_free (url);
       g_free (href);
     }
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index ec75878..c029148 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -115,6 +115,23 @@ a.navbar-next::after {
   <xsl:value-of select="node()"/>
 </xsl:template>
 
+<xsl:template match="header">
+  <xsl:choose>
+    <xsl:when test='@level = 1'>
+      <h1><xsl:value-of select="node()"/></h1>
+    </xsl:when>
+    <xsl:when test='@level = 2'>
+      <h2><xsl:value-of select="node()"/></h2>
+    </xsl:when>
+    <xsl:when test='@level = 3'>
+      <h3><xsl:value-of select="node()"/></h3>
+    </xsl:when>
+    <xsl:otherwise>
+      <h1>(Unknown heading level) <xsl:value-of select="node()"/></h1>
+    </xsl:otherwise>
+  </xsl:choose>
+</xsl:template>
+
 <xsl:template match="spacing">
   <xsl:value-of select="node()"/>
 </xsl:template>
-- 
1.7.1

From d1369b91a2bbde04a94911123c4d583087b0b692 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Fri, 3 Sep 2010 00:09:31 +0100
Subject: [PATCH 2/4] Display menus as <ul>'s, rather than the original text.

---
 libyelp/yelp-info-parser.c   |   87 +++++++++++++++++++++++++++++++++--------
 stylesheets/info2html.xsl.in |   24 +++++++++--
 2 files changed, 89 insertions(+), 22 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 7d32905..1605ecf 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -248,7 +248,11 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
     return;
   }
 
+  /* Use a pair of pointers, first and last, which point to two lines,
+   * the chunk of the body we're displaying (inclusive) */
   for (; *last; last++) {
+
+    /* Check for a header */
     header_level = header_underline_level (*last);
     if (header_level) {
       /* Write out any lines beforehand */
@@ -1120,6 +1124,16 @@ get_menuoptions (gchar *line, gchar **title, gchar **ref, gchar **desc,
   return TRUE;
 }
 
+/* Find the first non-space character in str or return pointer to the
+ * '\0' if there isn't one. */
+static gchar*
+first_non_space (gchar* str)
+{
+  /* As long as str is null terminated, this is ok! */
+  while (*str == ' ') str++;
+  return str;
+}
+
 xmlNodePtr
 yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node, 
 		      gchar *page_content, gboolean notes)
@@ -1127,7 +1141,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
   gchar **split;
   gchar **menuitems;
   gchar *tmp = NULL;
-  xmlNodePtr newnode;
+  xmlNodePtr newnode, menu_node, mholder = NULL;
   int i=0;
 
   split = g_strsplit (page_content, "* Menu:", 2);
@@ -1136,37 +1150,69 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
 			 BAD_CAST "Section", NULL);
     
 
-  tmp = g_strconcat (split[0], "\n* Menu:", NULL);
   if (!notes)
-    info_body_text (newnode, NULL, FALSE, tmp);
+    info_body_text (newnode, NULL, FALSE, split[0]);
   else {
-    info_process_text_notes (&newnode, tmp, tree);
+    info_process_text_notes (&newnode, split[0], tree);
   }
-  g_free (tmp);
 
   menuitems = g_strsplit (split[1], "\n", -1);
   g_strfreev (split);
 
+  /* The output xml should look something like the following:
+
+     <menu>
+       <menuholder>
+         <a href="xref:Help-Inv">Help-Inv</a>
+         <para1>Invisible text in Emacs Info.</para1>
+       </menuholder>
+       <menuholder>
+         <a href="xref:Help-M">Help-M</a>
+         <para1>Menus.</para1>
+       </menuholder>
+       ...
+     </menu>
+
+     (from the top page of info:info). Note the absence of *'s and
+     ::'s on the links.
+
+     If there's a line with no "* Blah::", it looks like a child of
+     the previous menu item so (for i > 0) deal with that correctly by
+     not "closing" the <menuholder> tag until we find the next
+     start.
+  */
+
+  if (menuitems[0] != NULL) {
+    /* If there are any menu items, make the <menu> node */
+    menu_node = xmlNewChild (newnode, NULL, BAD_CAST "menu", NULL);
+  }
+
   while (menuitems[i] != NULL) {
     gboolean menu = FALSE;
     gchar *title = NULL;
     gchar *ref = NULL;
     gchar *desc = NULL;
     gchar *xref = NULL;
-    xmlNodePtr mholder;
     xmlNodePtr ref1;
 
     menu = get_menuoptions (menuitems[i], &title, &ref, &desc, &xref);
- 
+
+    if (menu && (*title == '\0' || *(title + 1) == '\0')) {
+      g_warning ("Info title unexpectedly short for menu item (%s)",
+                 menuitems[i]);
+      menu = FALSE;
+    }
+
     if (menu) {
-      mholder = xmlNewChild (newnode, NULL, BAD_CAST "menuholder", NULL);
+      mholder = xmlNewChild (menu_node, NULL, BAD_CAST "menuholder", NULL);
       gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &xref);
       
       if (ref == NULL) { /* A standard type menu */
-	tmp = g_strconcat (title, "::", NULL);
+        /* title+2 skips the "* ". We know we haven't jumped over the
+           end of the string because strlen (title) >= 3 */
 	ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
-				BAD_CAST tmp);
-	g_free (tmp);
+				BAD_CAST title+2);
+
         tmp = g_strconcat ("xref:", xref, NULL);
 	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
         g_free (tmp);
@@ -1200,12 +1246,19 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
         g_free (tmp);
 	g_free (sp);
       }
-      xmlNewTextChild (mholder, NULL, BAD_CAST "para",
-		       BAD_CAST desc);
-    } else {
-      xmlNewTextChild (newnode, NULL, BAD_CAST "para",
-		       BAD_CAST menuitems[i]);
-      
+
+      tmp = g_strconcat ("\n", first_non_space (desc), NULL);
+      xmlNewTextChild (mholder, NULL, BAD_CAST "para1",
+		       BAD_CAST tmp);
+      g_free (tmp);
+
+    }
+    else if (*(menuitems[i]) != '\0') {
+      tmp = g_strconcat ("\n", first_non_space (menuitems[i]), NULL);
+      xmlNewTextChild (mholder ? mholder : menu_node,
+                       NULL, BAD_CAST "para1",
+		       BAD_CAST tmp);
+      g_free (tmp);
     }
     i++;
     g_free (title);
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index c029148..1117a80 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -47,7 +47,8 @@
   <xsl:param name="left"/>
   <xsl:param name="right"/>
   <xsl:text>
-div.body { white-space: pre; font-family: monospace; }
+div.body { font-family: monospace; }
+span.fixed { white-space: pre; }
 <!-- navbar from mal2html, possibly move to html.xsl -->
 div.navbar {
   margin: 0 0 1em 0;
@@ -106,13 +107,17 @@ a.navbar-next::after {
 <!-- = Normal Matches = -->
 
 <xsl:template match="para">
-  <xsl:value-of select="node()"/>
-  <xsl:text>
+  <span class="fixed">
+    <xsl:value-of select="node()"/>
+    <xsl:text>
   </xsl:text>
+  </span>
 </xsl:template>
 
 <xsl:template match="para1">
-  <xsl:value-of select="node()"/>
+  <span class="fixed">
+    <xsl:value-of select="node()"/>
+  </span>
 </xsl:template>
 
 <xsl:template match="header">
@@ -156,8 +161,17 @@ a.navbar-next::after {
   </xsl:element>
 </xsl:template>
 
+<xsl:template match="menu">
+  <xsl:element name="p">Menu:</xsl:element>
+  <xsl:element name="ul">
+    <xsl:apply-templates />
+  </xsl:element>
+</xsl:template>
+
 <xsl:template match="menuholder">
-  <xsl:apply-templates select="node()[not(self::menuholder)]"/>
+  <xsl:element name="li">
+    <xsl:apply-templates />
+  </xsl:element>
 </xsl:template>
 
 <xsl:template match="noteholder">
-- 
1.7.1

From 7ede37523e6fe60ae13fe5e9f98b356d9da4b4d4 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Fri, 10 Sep 2010 11:42:23 +0100
Subject: [PATCH 3/4] Parse info files into paragraphs (separated by blank lines).

---
 libyelp/yelp-info-parser.c   |  454 ++++++++++++++++++++++--------------------
 stylesheets/info2html.xsl.in |   15 +-
 2 files changed, 248 insertions(+), 221 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 1605ecf..a85f733 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -128,7 +128,8 @@ info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
     source = (gchar*)g_hash_table_lookup (h, "src");
 
   if (!h || !source || !*source)
-    return xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST "[broken image]");
+    return xmlNewTextChild (parent, NULL, BAD_CAST "para",
+                            BAD_CAST "[broken image]");
 
   gchar *title = (gchar*)g_hash_table_lookup (h, "title");
   gchar *text = (gchar*)g_hash_table_lookup (h, "text");
@@ -201,7 +202,6 @@ join_strings_subset (const gchar *separator,
 */
 static void
 lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
-                         gboolean inline_p,
                          gchar** first, gchar** last)
 {
   /* TODO? Currently we're copying the split strings again, which is
@@ -209,11 +209,10 @@ lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
      window on `content'. But that's much more difficult, so unless
      there's a problem, let's go with the stupid approach. */
   gchar *glob;
+
   if (last > first) {
     glob = join_strings_subset ("\n", first, last);
-    xmlNewTextChild (parent, ns,
-                     inline_p ? BAD_CAST "para1" : BAD_CAST "para",
-                     BAD_CAST glob);
+    xmlAddChild (parent, xmlNewText (BAD_CAST glob));
     g_free (glob);
   }
 }
@@ -222,21 +221,24 @@ lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
   Convert body text CONTENT to xml nodes. This function is responsible
   for spotting headings etc and splitting them out correctly.
 
+  paragraph is as described in info_body_text, but cannot be null.
+
   If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
-  <para> tag. 
+  <para> tag.
 
   TODO: IWBN add a regex match for *Note: here and call the *Note ==>
   <a href> logic of info_process_text_notes from here.
  */
 static void
-info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
+info_body_parse_text (xmlNodePtr parent, xmlNodePtr *paragraph,
+                      xmlNsPtr ns,
                       gboolean inline_p, const gchar *content)
 {
   /* The easiest things to spot are headings: they look like a line of
    * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
    * them, we split content into single lines and work with them. */
   gchar **lines = g_strsplit (content, "\n", 0);
-  gchar **first = lines, **last = lines+1;
+  gchar **first = lines, **last = lines;
   int header_level;
   xmlNodePtr header_node;
 
@@ -252,11 +254,27 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
    * the chunk of the body we're displaying (inclusive) */
   for (; *last; last++) {
 
+    /* Check for a blank line */
+    if (**last == '\0') {
+      if (last != first) {
+        if (!*paragraph) {
+          *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
+        }
+        lines_subset_text_child (*paragraph, ns, first, last);
+      }
+      /* On the next iteration, last==first both pointing at the next
+         line. */
+      first = last+1;
+      *paragraph = NULL;
+
+      continue;
+    }
+
     /* Check for a header */
     header_level = header_underline_level (*last);
     if (header_level) {
       /* Write out any lines beforehand */
-      lines_subset_text_child (parent, ns, FALSE, first, last-1);
+      lines_subset_text_child (parent, ns, first, last-1);
       /* Now write out the actual header line */
       header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
                                      BAD_CAST *(last-1));
@@ -264,11 +282,15 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
                   BAD_CAST level_headings[header_level]);
       
       first = last+1;
-      last = first+1;
+      last = first-1;
     }
   }
+
   /* Write out any lines left */
-  lines_subset_text_child (parent, ns, inline_p, first, last);
+  if (!*paragraph) {
+    *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
+  }
+  lines_subset_text_child (*paragraph, ns, first, last);
   
   g_strfreev (lines);
 }
@@ -278,14 +300,21 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
   body and turning it into paragraph tags. It searches out images and
   marks them up properly if necessary.
 
+  parent should be the node in which we're currently storing text and
+  paragraph a pointer to a <para> tag or NULL. At blank lines, we
+  finish with the current para tag and switch to a new one.
+
   It uses info_body_parse_text to mark up the actual bits of text.
  */
 static void
-info_body_text (xmlNodePtr parent, xmlNsPtr ns,
+info_body_text (xmlNodePtr parent, xmlNodePtr *paragraph, xmlNsPtr ns,
                 gboolean inline_p, gchar const *content)
 {
+  xmlNodePtr thepara = NULL;
+  if (paragraph == NULL) paragraph = &thepara;
+
   if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
-    info_body_parse_text (parent, ns, inline_p, content);
+    info_body_parse_text (parent, paragraph, ns, inline_p, content);
     return;
   }
 
@@ -293,6 +322,7 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns,
   gint pos = 0;
   GRegex *regex = g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE "((?:[^" INFO_TAG_1 "]|[^" INFO_C_TAG_0 "]+" INFO_TAG_1 ")*)" INFO_C_TAG_CLOSE_RE ")", 0, 0, NULL);
   GMatchInfo *match_info;
+
   g_regex_match (regex, content, 0, &match_info);
   while (g_match_info_matches (match_info))
     {
@@ -302,14 +332,18 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns,
 						     &image_start, &image_end);
       gchar *before = g_strndup (&content[pos], image_start - pos);
       pos = image_end + 1;
-      info_body_parse_text (parent, NULL, TRUE, before);
+      info_body_parse_text (parent, paragraph, NULL, TRUE, before);
       g_free (before);
+
+      /* End the paragraph that was before */
+      *paragraph = NULL;
+
       if (image_found)
 	info_insert_image (parent, match_info);
       g_match_info_next (match_info, NULL);
     }
   gchar *after = g_strndup (&content[pos], content_len - pos);
-  info_body_parse_text (parent, NULL, TRUE, after);
+  info_body_parse_text (parent, paragraph, NULL, TRUE, after);
   g_free (after);
 }
 
@@ -977,8 +1011,8 @@ parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 					     BAD_CAST "Section",
 					     NULL);
 		  if (!notes)
-		    info_body_text (newnode, NULL, FALSE, page_content);
-		  
+		    info_body_text (newnode, NULL, NULL, FALSE, page_content);
+
 		  else {
 		    /* Handle notes here */
 		    info_process_text_notes (&newnode, page_content, tree);
@@ -1151,7 +1185,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
     
 
   if (!notes)
-    info_body_text (newnode, NULL, FALSE, split[0]);
+    info_body_text (newnode, NULL, NULL, FALSE, split[0]);
   else {
     info_process_text_notes (&newnode, split[0], tree);
   }
@@ -1277,212 +1311,208 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 {
   gchar **notes;
   gchar **current;
-  xmlNodePtr holder;
   xmlNodePtr ref1;
+  xmlNodePtr paragraph = NULL;
   gboolean first = TRUE;
 
-  notes = g_strsplit (content, "*Note", -1);
-  holder = xmlNewChild (*node, NULL, BAD_CAST "noteholder", NULL);
+  /*
+    Split using the regular expression
+
+      \*[Nn]ote(?!_)
+
+    which matches either capitalisation. The trailing negative
+    lookahead stops us splitting on text of the form *Note_, which
+    isn't a real note.
+  */
+  notes = g_regex_split_simple ("\\*[Nn]ote(?!_)", content, 0, 0);
 
   for (current = notes; *current != NULL; current++) {
-    /* Since the notes can be either *Note or *note, we handle the second 
-     * variety here
-     */
-    gchar **subnotes;
-    gchar **current_real;
-
-    subnotes = g_strsplit (*current, "*note", -1);
-    for (current_real = subnotes; *current_real != NULL; current_real++) {
-      gchar *url, **urls, **ulink;
-      gchar *append;
-      gchar *alt_append, *alt_append1;
-      gchar *link_text;
-      gchar *href = NULL;
-      gchar *break_point = NULL;
-      gboolean broken = FALSE;
-      if (first) {
-	/* The first node is special.  It doesn't have a note ref at the 
-	 * start, so we can just add it and forget about it.
-	 */
-	first = FALSE;
-	info_body_text (holder, NULL, TRUE, (*current_real));
-	continue;
-      }
-      /* If we got to here, we now gotta parse the note reference */
-
-      if (*current_real[0] == '_') {
-	/* Special type of note that isn't really a note, but pretends
-	 * it is
-	 */
-	info_body_text (holder, NULL, TRUE,
-			g_strconcat ("*Note", *current_real, NULL));
-	continue;
-      }
-      append = strchr (*current_real, ':');
-      if (!append) {
-	info_body_text (holder, NULL, TRUE, *current_real);
-	continue;
-      }
-      append++;
-      alt_append = append;
-      alt_append1 = alt_append;
-      append = strchr (append, ':');
-      alt_append = strchr (alt_append, '.');
-      if (alt_append && g_str_has_prefix (alt_append, ".info")) {
-	broken = TRUE;
-	alt_append++;
-	alt_append = strchr (alt_append, '.');
-      }
-      alt_append1 = strchr (alt_append1, ',');
-      if (!append && !alt_append && !alt_append1) {
-	info_body_text (holder, NULL, TRUE, *current_real);
-	continue;
-      }
-      if (!append || alt_append || alt_append1) {
-	if (!append) {
-	  if (alt_append) append = alt_append;
-	  else append = alt_append1;
-	}
-	if ((alt_append && alt_append < append))
-	  append = alt_append;
-	if (alt_append1 && alt_append1 < append)
-	  append = alt_append1;
-      }
-      append++;
-      url = g_strndup (*current_real, append - (*current_real));
-
-      /* By now, we got 2 things.  First, is append which is the (hopefully)
-       * non-link text.  Second, we got a url.
-       * The url can be in several forms:
-       * 1. linkend::
-       * 2. linkend:(infofile)Linkend.
-       * 3. Title: Linkend.
-       * 4. Title: Linkend, (pretty sure this is just broken)
-       * 5. Title: (infofile.info)Linkend.
-       * All possibilities should have been picked up.
-       * Here:
-       * Clean up the split.  Should be left with a real url and
-       * a list of fragments that should be linked
-       * Also goes through and removes extra spaces, leaving only one 
-       * space in place of many
+    gchar *url, **urls, **ulink;
+    gchar *append;
+    gchar *alt_append, *alt_append1;
+    gchar *link_text;
+    gchar *href = NULL;
+    gchar *break_point = NULL;
+    gboolean broken = FALSE;
+    if (first) {
+      /* The first node is special.  It doesn't have a note ref at the 
+       * start, so we can just add it and forget about it.
        */
-      urls = g_strsplit (url, "\n", -1);
-      break_point = strchr (url, '\n');
-      while (break_point) {
-	*break_point = ' ';
-	break_point = strchr (++break_point, '\n');
+      first = FALSE;
+      info_body_text (*node, &paragraph, NULL, TRUE, (*current));
+      continue;
+    }
+
+    /* If we got to here, we now gotta parse the note reference */
+    append = strchr (*current, ':');
+    if (!append) {
+      info_body_text (*node, &paragraph, NULL, TRUE, *current);
+      continue;
+    }
+    append++;
+    alt_append = append;
+    alt_append1 = alt_append;
+    append = strchr (append, ':');
+    alt_append = strchr (alt_append, '.');
+    if (alt_append && g_str_has_prefix (alt_append, ".info")) {
+      broken = TRUE;
+      alt_append++;
+      alt_append = strchr (alt_append, '.');
+    }
+    alt_append1 = strchr (alt_append1, ',');
+    if (!append && !alt_append && !alt_append1) {
+      info_body_text (*node, &paragraph, NULL, TRUE, *current);
+      continue;
+    }
+    if (!append || alt_append || alt_append1) {
+      if (!append) {
+        if (alt_append) append = alt_append;
+        else append = alt_append1;
       }
-      break_point = strchr (url, ' ');
-      while (break_point) {
-	if (*(break_point+1) == ' ') {
-	  /* Massive space.  Fix. */
-	  gchar *next = break_point;
-	  gchar *url_copy;
-	  while (*next == ' ')
-	    next++;
-	  next--;
-	  url_copy = g_strndup (url, break_point-url);
-	  g_free (url);
-	  url = g_strconcat (url_copy, next, NULL);
-	  break_point = strchr (url, ' ');
-	  g_free (url_copy);
-	} else {
-	  break_point++;
-	  break_point = strchr (break_point, ' ');
-	}
+      if ((alt_append && alt_append < append))
+        append = alt_append;
+      if (alt_append1 && alt_append1 < append)
+        append = alt_append1;
+    }
+    append++;
+    url = g_strndup (*current, append - (*current));
+
+    /* By now, we got 2 things.  First, is append which is the (hopefully)
+     * non-link text.  Second, we got a url.
+     * The url can be in several forms:
+     * 1. linkend::
+     * 2. linkend:(infofile)Linkend.
+     * 3. Title: Linkend.
+     * 4. Title: Linkend, (pretty sure this is just broken)
+     * 5. Title: (infofile.info)Linkend.
+     * All possibilities should have been picked up.
+     * Here:
+     * Clean up the split.  Should be left with a real url and
+     * a list of fragments that should be linked
+     * Also goes through and removes extra spaces, leaving only one 
+     * space in place of many
+     */
+    urls = g_strsplit (url, "\n", -1);
+    break_point = strchr (url, '\n');
+    while (break_point) {
+      *break_point = ' ';
+      break_point = strchr (++break_point, '\n');
+    }
+    break_point = strchr (url, ' ');
+    while (break_point) {
+      if (*(break_point+1) == ' ') {
+        /* Massive space.  Fix. */
+        gchar *next = break_point;
+        gchar *url_copy;
+        while (*next == ' ')
+          next++;
+        next--;
+        url_copy = g_strndup (url, break_point-url);
+        g_free (url);
+        url = g_strconcat (url_copy, next, NULL);
+        break_point = strchr (url, ' ');
+        g_free (url_copy);
+      } else {
+        break_point++;
+        break_point = strchr (break_point, ' ');
       }
-      if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */ 
-	gchar *stop = NULL;
-	gchar *lurl = NULL;
-	gchar *zloc = NULL;
-	stop = strchr (url, ':');
-	lurl = strchr (stop, '(');
-	if (!lurl) { /* 3rd type of link */
-	  gchar *link;
-	  gint length;
-	  stop++;
-	  link = g_strdup (stop);
-	  link = g_strstrip (link);
-	  length = strlen (link) - 1;
-	  link[length] = '\0';	  
-	  href = g_strconcat ("xref:", link, NULL);
-	  link[length] = 'a';
-	  g_free (link);
-
-
-	} else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
-	  if (broken) {
-	    gchar *new_url;
-	    gchar *info;
-	    gchar *stripped;
-
-	    new_url = g_strdup (lurl);
-	    info = strstr (new_url, ".info)");
-	    stripped = g_strndup (new_url, info-new_url);
-	    info +=5;
-	    lurl = g_strconcat (stripped, info, NULL);
-	    g_free (stripped);
-	    g_free (new_url);
-	  }
-	  zloc = &(lurl[strlen(lurl)-1]);
-	  *zloc = '\0';
-	  href = g_strconcat ("info:", lurl, NULL);
-	  *zloc = 'a';
-	}
-      } else { /* First kind of link */
-	gchar *tmp1;
-	gchar *frag;
-
-	tmp1 = strchr (url, ':');
-	if (!tmp1)
-	  frag = g_strdup (url);
-	else 
-	  frag = g_strndup (url, tmp1 - url);
-	g_strstrip (frag);
-	gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
-	href = g_strconcat ("xref:", frag, NULL);
-        g_free (frag);
+    }
+    if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */
+      gchar *stop = NULL;
+      gchar *lurl = NULL;
+      gchar *zloc = NULL;
+      stop = strchr (url, ':');
+      lurl = strchr (stop, '(');
+      if (!lurl) { /* 3rd type of link */
+        gchar *link;
+        gint length;
+        stop++;
+        link = g_strdup (stop);
+        link = g_strstrip (link);
+        length = strlen (link) - 1;
+        link[length] = '\0';
+        href = g_strconcat ("xref:", link, NULL);
+        link[length] = 'a';
+        g_free (link);
+
+
+      } else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
+        if (broken) {
+          gchar *new_url;
+          gchar *info;
+          gchar *stripped;
+
+          new_url = g_strdup (lurl);
+          info = strstr (new_url, ".info)");
+          stripped = g_strndup (new_url, info-new_url);
+          info +=5;
+          lurl = g_strconcat (stripped, info, NULL);
+          g_free (stripped);
+          g_free (new_url);
+        }
+        zloc = &(lurl[strlen(lurl)-1]);
+        *zloc = '\0';
+        href = g_strconcat ("info:", lurl, NULL);
+        *zloc = 'a';
       }
-      for (ulink = urls; *ulink != NULL; ulink++) {
-	if (ulink == urls)
-	  link_text = g_strconcat ("*Note", *ulink, NULL);
-	else {
-	  gchar *spacing = *ulink;
-	  gchar *tmp;
-	  gint count = 0;
-	  while (*spacing == ' ') {
-	    spacing++;
-	    count++;
-	  }
-	  if (spacing != *ulink) {
-	    if (count > 1)
-	      spacing-=2;
-	    tmp = g_strndup (*ulink, spacing-*ulink);
-	    if (count > 1)
-	      spacing+=2;
-	    xmlNewTextChild (holder, NULL, BAD_CAST "spacing",
-			     BAD_CAST tmp);
-	    g_free (tmp);
-	    link_text = g_strdup (spacing);
-	  } else {
-	    link_text = g_strdup (*ulink);
-	  }
-	}
-	ref1 = xmlNewTextChild (holder, NULL, BAD_CAST "a",
-				BAD_CAST link_text);
-	if (*(ulink+1) != NULL)
-	  info_body_text (holder, NULL, FALSE, "");
+    } else { /* First kind of link */
+      gchar *tmp1;
+      gchar *frag;
+
+      tmp1 = strchr (url, ':');
+      if (!tmp1)
+        frag = g_strdup (url);
+      else
+        frag = g_strndup (url, tmp1 - url);
+      g_strstrip (frag);
+      gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
+      href = g_strconcat ("xref:", frag, NULL);
+      g_free (frag);
+    }
 
-	g_free (link_text);
-	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
+    /* Check we've got a valid paragraph node */
+    if (!paragraph) {
+      paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
+    }
+
+    for (ulink = urls; *ulink != NULL; ulink++) {
+      if (ulink == urls)
+        link_text = g_strconcat ("*Note", *ulink, NULL);
+      else {
+        gchar *spacing = *ulink;
+        gchar *tmp;
+        gint count = 0;
+        while (*spacing == ' ') {
+          spacing++;
+          count++;
+        }
+        if (spacing != *ulink) {
+          if (count > 1)
+            spacing-=2;
+          tmp = g_strndup (*ulink, spacing-*ulink);
+          if (count > 1)
+            spacing+=2;
+          xmlNewTextChild (paragraph, NULL, BAD_CAST "spacing",
+                           BAD_CAST tmp);
+          g_free (tmp);
+          link_text = g_strdup (spacing);
+        } else {
+          link_text = g_strdup (*ulink);
+        }
       }
-      g_strfreev (urls);
-      /* Finally, we can add the text as required */
-      info_body_text (holder, NULL, TRUE, append);
-      g_free (url);
-      g_free (href);
+      ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
+                              BAD_CAST link_text);
+      if (*(ulink+1) != NULL)
+        info_body_text (*node, &paragraph, NULL, FALSE, "");
+
+      g_free (link_text);
+      xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
     }
-    g_strfreev (subnotes);
+    g_strfreev (urls);
+    /* Finally, we can add the text as required */
+    info_body_text (*node, &paragraph, NULL, TRUE, append);
+    g_free (url);
+    g_free (href);
   }
   g_strfreev (notes);
 }
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index 1117a80..a97b054 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -107,11 +107,12 @@ a.navbar-next::after {
 <!-- = Normal Matches = -->
 
 <xsl:template match="para">
-  <span class="fixed">
-    <xsl:value-of select="node()"/>
-    <xsl:text>
-  </xsl:text>
-  </span>
+  <p>
+    <span class="fixed">
+      <!-- Apply templates for <a> tags and copy text straight through. -->
+      <xsl:apply-templates select="./text()|*"/>
+    </span>
+  </p>
 </xsl:template>
 
 <xsl:template match="para1">
@@ -174,8 +175,4 @@ a.navbar-next::after {
   </xsl:element>
 </xsl:template>
 
-<xsl:template match="noteholder">
-  <xsl:apply-templates select="node()[not(self::noteholder)]"/>
-</xsl:template>
-
 </xsl:stylesheet>
-- 
1.7.1

From 45762b7f91b57038f893df6e6221db0bd7fbe255 Mon Sep 17 00:00:00 2001
From: Rupert Swarbrick <rswarbrick gmail com>
Date: Sat, 11 Sep 2010 22:21:19 +0100
Subject: [PATCH 4/4] Render multi-line links correctly.

---
 libyelp/yelp-info-parser.c |   50 +++++++++++++++-----------------------------
 1 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index a85f733..d4ef7bc 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -1377,6 +1377,9 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
     append++;
     url = g_strndup (*current, append - (*current));
 
+    /* Save a copy of the unadulterated link text for later. */
+    link_text = g_strconcat ("*Note", url, NULL);
+
     /* By now, we got 2 things.  First, is append which is the (hopefully)
      * non-link text.  Second, we got a url.
      * The url can be in several forms:
@@ -1475,42 +1478,23 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
       paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
     }
 
-    for (ulink = urls; *ulink != NULL; ulink++) {
-      if (ulink == urls)
-        link_text = g_strconcat ("*Note", *ulink, NULL);
-      else {
-        gchar *spacing = *ulink;
-        gchar *tmp;
-        gint count = 0;
-        while (*spacing == ' ') {
-          spacing++;
-          count++;
-        }
-        if (spacing != *ulink) {
-          if (count > 1)
-            spacing-=2;
-          tmp = g_strndup (*ulink, spacing-*ulink);
-          if (count > 1)
-            spacing+=2;
-          xmlNewTextChild (paragraph, NULL, BAD_CAST "spacing",
-                           BAD_CAST tmp);
-          g_free (tmp);
-          link_text = g_strdup (spacing);
-        } else {
-          link_text = g_strdup (*ulink);
-        }
-      }
-      ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
-                              BAD_CAST link_text);
-      if (*(ulink+1) != NULL)
-        info_body_text (*node, &paragraph, NULL, FALSE, "");
+    /*
+      Now render the link itself. The link *text* should appear
+      exactly as it did in the .info file, so instead of emitting one
+      <a> per entry of the urls list, emit a single <a> whose text is
+      link_text: the copy of url that was saved before url was
+      cleaned up (so it is complete with embedded newlines etc.)
+    */
+    ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
+                            BAD_CAST link_text);
+    g_free (link_text);
+    xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
 
-      g_free (link_text);
-      xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
-    }
     g_strfreev (urls);
-    /* Finally, we can add the text as required */
+
+    /* Finally, we can add the following text as required */
     info_body_text (*node, &paragraph, NULL, TRUE, append);
+
     g_free (url);
     g_free (href);
   }
-- 
1.7.1

Attachment: pgp1Jm02G5iqm.pgp
Description: PGP signature

Follow-Ups:
- Re: Info parsing
  - From: Rupert Swarbrick

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]