[yelp] Parse info files into paragraphs (separated by blank lines).

From: Shaun McCance <shaunm src gnome org>
To: commits-list gnome org
Cc:
Subject: [yelp] Parse info files into paragraphs (separated by blank lines).
Date: Wed, 15 Sep 2010 15:01:44 +0000 (UTC)
commit 73e13001e849ed6abac71a7addddae7de99739aa
Author: Rupert Swarbrick <rswarbrick gmail com>
Date:   Fri Sep 10 11:42:23 2010 +0100

    Parse info files into paragraphs (separated by blank lines).

 libyelp/yelp-info-parser.c   |  454 ++++++++++++++++++++++--------------------
 stylesheets/info2html.xsl.in |   15 +-
 2 files changed, 248 insertions(+), 221 deletions(-)
---
diff --git a/libyelp/yelp-info-parser.c b/libyelp/yelp-info-parser.c
index 1605ecf..a85f733 100644
--- a/libyelp/yelp-info-parser.c
+++ b/libyelp/yelp-info-parser.c
@@ -128,7 +128,8 @@ info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
     source = (gchar*)g_hash_table_lookup (h, "src");
 
   if (!h || !source || !*source)
-    return xmlNewTextChild (parent, NULL, BAD_CAST "para1", BAD_CAST "[broken image]");
+    return xmlNewTextChild (parent, NULL, BAD_CAST "para",
+                            BAD_CAST "[broken image]");
 
   gchar *title = (gchar*)g_hash_table_lookup (h, "title");
   gchar *text = (gchar*)g_hash_table_lookup (h, "text");
@@ -201,7 +202,6 @@ join_strings_subset (const gchar *separator,
 */
 static void
 lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
-                         gboolean inline_p,
                          gchar** first, gchar** last)
 {
   /* TODO? Currently we're copying the split strings again, which is
@@ -209,11 +209,10 @@ lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
      window on `content'. But that's much more difficult, so unless
      there's a problem, let's go with the stupid approach. */
   gchar *glob;
+
   if (last > first) {
     glob = join_strings_subset ("\n", first, last);
-    xmlNewTextChild (parent, ns,
-                     inline_p ? BAD_CAST "para1" : BAD_CAST "para",
-                     BAD_CAST glob);
+    xmlAddChild (parent, xmlNewText (BAD_CAST glob));
     g_free (glob);
   }
 }
@@ -222,21 +221,24 @@ lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
   Convert body text CONTENT to xml nodes. This function is responsible
   for spotting headings etc and splitting them out correctly.
 
+  paragraph is as described in info_body_text, but cannot be null.
+
   If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
-  <para> tag. 
+  <para> tag.
 
   TODO: IWBN add a regex match for *Note: here and call the *Note ==>
   <a href> logic of info_process_text_notes from here.
  */
 static void
-info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
+info_body_parse_text (xmlNodePtr parent, xmlNodePtr *paragraph,
+                      xmlNsPtr ns,
                       gboolean inline_p, const gchar *content)
 {
   /* The easiest things to spot are headings: they look like a line of
    * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
    * them, we split content into single lines and work with them. */
   gchar **lines = g_strsplit (content, "\n", 0);
-  gchar **first = lines, **last = lines+1;
+  gchar **first = lines, **last = lines;
   int header_level;
   xmlNodePtr header_node;
 
@@ -252,11 +254,27 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
    * the chunk of the body we're displaying (inclusive) */
   for (; *last; last++) {
 
+    /* Check for a blank line */
+    if (**last == '\0') {
+      if (last != first) {
+        if (!*paragraph) {
+          *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
+        }
+        lines_subset_text_child (*paragraph, ns, first, last);
+      }
+      /* On the next iteration, last == first, both pointing at the
+         line after this blank one. */
+      first = last+1;
+      *paragraph = NULL;
+
+      continue;
+    }
+
     /* Check for a header */
     header_level = header_underline_level (*last);
     if (header_level) {
       /* Write out any lines beforehand */
-      lines_subset_text_child (parent, ns, FALSE, first, last-1);
+      lines_subset_text_child (parent, ns, first, last-1);
       /* Now write out the actual header line */
       header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
                                      BAD_CAST *(last-1));
@@ -264,11 +282,15 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
                   BAD_CAST level_headings[header_level]);
       
       first = last+1;
-      last = first+1;
+      last = first-1;
     }
   }
+
   /* Write out any lines left */
-  lines_subset_text_child (parent, ns, inline_p, first, last);
+  if (!*paragraph) {
+    *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
+  }
+  lines_subset_text_child (*paragraph, ns, first, last);
   
   g_strfreev (lines);
 }
@@ -278,14 +300,21 @@ info_body_parse_text (xmlNodePtr parent, xmlNsPtr ns,
   body and turning it into paragraph tags. It searches out images and
   marks them up properly if necessary.
 
+  parent should be the node in which we're currently storing text and
+  paragraph a pointer to a <para> tag or NULL. At blank lines, we
+  finish with the current para tag and switch to a new one.
+
   It uses info_body_parse_text to mark up the actual bits of text.
  */
 static void
-info_body_text (xmlNodePtr parent, xmlNsPtr ns,
+info_body_text (xmlNodePtr parent, xmlNodePtr *paragraph, xmlNsPtr ns,
                 gboolean inline_p, gchar const *content)
 {
+  xmlNodePtr thepara = NULL;
+  if (paragraph == NULL) paragraph = &thepara;
+
   if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
-    info_body_parse_text (parent, ns, inline_p, content);
+    info_body_parse_text (parent, paragraph, ns, inline_p, content);
     return;
   }
 
@@ -293,6 +322,7 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns,
   gint pos = 0;
   GRegex *regex = g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE "((?:[^" INFO_TAG_1 "]|[^" INFO_C_TAG_0 "]+" INFO_TAG_1 ")*)" INFO_C_TAG_CLOSE_RE ")", 0, 0, NULL);
   GMatchInfo *match_info;
+
   g_regex_match (regex, content, 0, &match_info);
   while (g_match_info_matches (match_info))
     {
@@ -302,14 +332,18 @@ info_body_text (xmlNodePtr parent, xmlNsPtr ns,
 						     &image_start, &image_end);
       gchar *before = g_strndup (&content[pos], image_start - pos);
       pos = image_end + 1;
-      info_body_parse_text (parent, NULL, TRUE, before);
+      info_body_parse_text (parent, paragraph, NULL, TRUE, before);
       g_free (before);
+
+      /* End the paragraph that was before */
+      *paragraph = NULL;
+
       if (image_found)
 	info_insert_image (parent, match_info);
       g_match_info_next (match_info, NULL);
     }
   gchar *after = g_strndup (&content[pos], content_len - pos);
-  info_body_parse_text (parent, NULL, TRUE, after);
+  info_body_parse_text (parent, paragraph, NULL, TRUE, after);
   g_free (after);
 }
 
@@ -977,8 +1011,8 @@ parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 					     BAD_CAST "Section",
 					     NULL);
 		  if (!notes)
-		    info_body_text (newnode, NULL, FALSE, page_content);
-		  
+		    info_body_text (newnode, NULL, NULL, FALSE, page_content);
+
 		  else {
 		    /* Handle notes here */
 		    info_process_text_notes (&newnode, page_content, tree);
@@ -1151,7 +1185,7 @@ yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
     
 
   if (!notes)
-    info_body_text (newnode, NULL, FALSE, split[0]);
+    info_body_text (newnode, NULL, NULL, FALSE, split[0]);
   else {
     info_process_text_notes (&newnode, split[0], tree);
   }
@@ -1277,212 +1311,208 @@ info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
 {
   gchar **notes;
   gchar **current;
-  xmlNodePtr holder;
   xmlNodePtr ref1;
+  xmlNodePtr paragraph = NULL;
   gboolean first = TRUE;
 
-  notes = g_strsplit (content, "*Note", -1);
-  holder = xmlNewChild (*node, NULL, BAD_CAST "noteholder", NULL);
+  /*
+    Split using the regular expression
+
+      \*[Nn]ote(?!_)
+
+    which matches either capitalization; the trailing negative
+    lookahead stops us splitting on things of the form *Note_, which
+    aren't real notes.
+  */
+  notes = g_regex_split_simple ("\\*[Nn]ote(?!_)", content, 0, 0);
 
   for (current = notes; *current != NULL; current++) {
-    /* Since the notes can be either *Note or *note, we handle the second 
-     * variety here
-     */
-    gchar **subnotes;
-    gchar **current_real;
-
-    subnotes = g_strsplit (*current, "*note", -1);
-    for (current_real = subnotes; *current_real != NULL; current_real++) {
-      gchar *url, **urls, **ulink;
-      gchar *append;
-      gchar *alt_append, *alt_append1;
-      gchar *link_text;
-      gchar *href = NULL;
-      gchar *break_point = NULL;
-      gboolean broken = FALSE;
-      if (first) {
-	/* The first node is special.  It doesn't have a note ref at the 
-	 * start, so we can just add it and forget about it.
-	 */
-	first = FALSE;
-	info_body_text (holder, NULL, TRUE, (*current_real));
-	continue;
-      }
-      /* If we got to here, we now gotta parse the note reference */
-
-      if (*current_real[0] == '_') {
-	/* Special type of note that isn't really a note, but pretends
-	 * it is
-	 */
-	info_body_text (holder, NULL, TRUE,
-			g_strconcat ("*Note", *current_real, NULL));
-	continue;
-      }
-      append = strchr (*current_real, ':');
-      if (!append) {
-	info_body_text (holder, NULL, TRUE, *current_real);
-	continue;
-      }
-      append++;
-      alt_append = append;
-      alt_append1 = alt_append;
-      append = strchr (append, ':');
-      alt_append = strchr (alt_append, '.');
-      if (alt_append && g_str_has_prefix (alt_append, ".info")) {
-	broken = TRUE;
-	alt_append++;
-	alt_append = strchr (alt_append, '.');
-      }
-      alt_append1 = strchr (alt_append1, ',');
-      if (!append && !alt_append && !alt_append1) {
-	info_body_text (holder, NULL, TRUE, *current_real);
-	continue;
-      }
-      if (!append || alt_append || alt_append1) {
-	if (!append) {
-	  if (alt_append) append = alt_append;
-	  else append = alt_append1;
-	}
-	if ((alt_append && alt_append < append))
-	  append = alt_append;
-	if (alt_append1 && alt_append1 < append)
-	  append = alt_append1;
-      }
-      append++;
-      url = g_strndup (*current_real, append - (*current_real));
-
-      /* By now, we got 2 things.  First, is append which is the (hopefully)
-       * non-link text.  Second, we got a url.
-       * The url can be in several forms:
-       * 1. linkend::
-       * 2. linkend:(infofile)Linkend.
-       * 3. Title: Linkend.
-       * 4. Title: Linkend, (pretty sure this is just broken)
-       * 5. Title: (infofile.info)Linkend.
-       * All possibilities should have been picked up.
-       * Here:
-       * Clean up the split.  Should be left with a real url and
-       * a list of fragments that should be linked
-       * Also goes through and removes extra spaces, leaving only one 
-       * space in place of many
+    gchar *url, **urls, **ulink;
+    gchar *append;
+    gchar *alt_append, *alt_append1;
+    gchar *link_text;
+    gchar *href = NULL;
+    gchar *break_point = NULL;
+    gboolean broken = FALSE;
+    if (first) {
+      /* The first node is special.  It doesn't have a note ref at the 
+       * start, so we can just add it and forget about it.
        */
-      urls = g_strsplit (url, "\n", -1);
-      break_point = strchr (url, '\n');
-      while (break_point) {
-	*break_point = ' ';
-	break_point = strchr (++break_point, '\n');
+      first = FALSE;
+      info_body_text (*node, &paragraph, NULL, TRUE, (*current));
+      continue;
+    }
+
+    /* If we got to here, we now gotta parse the note reference */
+    append = strchr (*current, ':');
+    if (!append) {
+      info_body_text (*node, &paragraph, NULL, TRUE, *current);
+      continue;
+    }
+    append++;
+    alt_append = append;
+    alt_append1 = alt_append;
+    append = strchr (append, ':');
+    alt_append = strchr (alt_append, '.');
+    if (alt_append && g_str_has_prefix (alt_append, ".info")) {
+      broken = TRUE;
+      alt_append++;
+      alt_append = strchr (alt_append, '.');
+    }
+    alt_append1 = strchr (alt_append1, ',');
+    if (!append && !alt_append && !alt_append1) {
+      info_body_text (*node, &paragraph, NULL, TRUE, *current);
+      continue;
+    }
+    if (!append || alt_append || alt_append1) {
+      if (!append) {
+        if (alt_append) append = alt_append;
+        else append = alt_append1;
       }
-      break_point = strchr (url, ' ');
-      while (break_point) {
-	if (*(break_point+1) == ' ') {
-	  /* Massive space.  Fix. */
-	  gchar *next = break_point;
-	  gchar *url_copy;
-	  while (*next == ' ')
-	    next++;
-	  next--;
-	  url_copy = g_strndup (url, break_point-url);
-	  g_free (url);
-	  url = g_strconcat (url_copy, next, NULL);
-	  break_point = strchr (url, ' ');
-	  g_free (url_copy);
-	} else {
-	  break_point++;
-	  break_point = strchr (break_point, ' ');
-	}
+      if ((alt_append && alt_append < append))
+        append = alt_append;
+      if (alt_append1 && alt_append1 < append)
+        append = alt_append1;
+    }
+    append++;
+    url = g_strndup (*current, append - (*current));
+
+    /* By now, we got 2 things.  First, is append which is the (hopefully)
+     * non-link text.  Second, we got a url.
+     * The url can be in several forms:
+     * 1. linkend::
+     * 2. linkend:(infofile)Linkend.
+     * 3. Title: Linkend.
+     * 4. Title: Linkend, (pretty sure this is just broken)
+     * 5. Title: (infofile.info)Linkend.
+     * All possibilities should have been picked up.
+     * Here:
+     * Clean up the split.  Should be left with a real url and
+     * a list of fragments that should be linked
+     * Also goes through and removes extra spaces, leaving only one 
+     * space in place of many
+     */
+    urls = g_strsplit (url, "\n", -1);
+    break_point = strchr (url, '\n');
+    while (break_point) {
+      *break_point = ' ';
+      break_point = strchr (++break_point, '\n');
+    }
+    break_point = strchr (url, ' ');
+    while (break_point) {
+      if (*(break_point+1) == ' ') {
+        /* Massive space.  Fix. */
+        gchar *next = break_point;
+        gchar *url_copy;
+        while (*next == ' ')
+          next++;
+        next--;
+        url_copy = g_strndup (url, break_point-url);
+        g_free (url);
+        url = g_strconcat (url_copy, next, NULL);
+        break_point = strchr (url, ' ');
+        g_free (url_copy);
+      } else {
+        break_point++;
+        break_point = strchr (break_point, ' ');
       }
-      if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */ 
-	gchar *stop = NULL;
-	gchar *lurl = NULL;
-	gchar *zloc = NULL;
-	stop = strchr (url, ':');
-	lurl = strchr (stop, '(');
-	if (!lurl) { /* 3rd type of link */
-	  gchar *link;
-	  gint length;
-	  stop++;
-	  link = g_strdup (stop);
-	  link = g_strstrip (link);
-	  length = strlen (link) - 1;
-	  link[length] = '\0';	  
-	  href = g_strconcat ("xref:", link, NULL);
-	  link[length] = 'a';
-	  g_free (link);
-
-
-	} else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
-	  if (broken) {
-	    gchar *new_url;
-	    gchar *info;
-	    gchar *stripped;
-
-	    new_url = g_strdup (lurl);
-	    info = strstr (new_url, ".info)");
-	    stripped = g_strndup (new_url, info-new_url);
-	    info +=5;
-	    lurl = g_strconcat (stripped, info, NULL);
-	    g_free (stripped);
-	    g_free (new_url);
-	  }
-	  zloc = &(lurl[strlen(lurl)-1]);
-	  *zloc = '\0';
-	  href = g_strconcat ("info:", lurl, NULL);
-	  *zloc = 'a';
-	}
-      } else { /* First kind of link */
-	gchar *tmp1;
-	gchar *frag;
-
-	tmp1 = strchr (url, ':');
-	if (!tmp1)
-	  frag = g_strdup (url);
-	else 
-	  frag = g_strndup (url, tmp1 - url);
-	g_strstrip (frag);
-	gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
-	href = g_strconcat ("xref:", frag, NULL);
-        g_free (frag);
+    }
+    if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */
+      gchar *stop = NULL;
+      gchar *lurl = NULL;
+      gchar *zloc = NULL;
+      stop = strchr (url, ':');
+      lurl = strchr (stop, '(');
+      if (!lurl) { /* 3rd type of link */
+        gchar *link;
+        gint length;
+        stop++;
+        link = g_strdup (stop);
+        link = g_strstrip (link);
+        length = strlen (link) - 1;
+        link[length] = '\0';
+        href = g_strconcat ("xref:", link, NULL);
+        link[length] = 'a';
+        g_free (link);
+
+
+      } else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
+        if (broken) {
+          gchar *new_url;
+          gchar *info;
+          gchar *stripped;
+
+          new_url = g_strdup (lurl);
+          info = strstr (new_url, ".info)");
+          stripped = g_strndup (new_url, info-new_url);
+          info +=5;
+          lurl = g_strconcat (stripped, info, NULL);
+          g_free (stripped);
+          g_free (new_url);
+        }
+        zloc = &(lurl[strlen(lurl)-1]);
+        *zloc = '\0';
+        href = g_strconcat ("info:", lurl, NULL);
+        *zloc = 'a';
       }
-      for (ulink = urls; *ulink != NULL; ulink++) {
-	if (ulink == urls)
-	  link_text = g_strconcat ("*Note", *ulink, NULL);
-	else {
-	  gchar *spacing = *ulink;
-	  gchar *tmp;
-	  gint count = 0;
-	  while (*spacing == ' ') {
-	    spacing++;
-	    count++;
-	  }
-	  if (spacing != *ulink) {
-	    if (count > 1)
-	      spacing-=2;
-	    tmp = g_strndup (*ulink, spacing-*ulink);
-	    if (count > 1)
-	      spacing+=2;
-	    xmlNewTextChild (holder, NULL, BAD_CAST "spacing",
-			     BAD_CAST tmp);
-	    g_free (tmp);
-	    link_text = g_strdup (spacing);
-	  } else {
-	    link_text = g_strdup (*ulink);
-	  }
-	}
-	ref1 = xmlNewTextChild (holder, NULL, BAD_CAST "a",
-				BAD_CAST link_text);
-	if (*(ulink+1) != NULL)
-	  info_body_text (holder, NULL, FALSE, "");
+    } else { /* First kind of link */
+      gchar *tmp1;
+      gchar *frag;
+
+      tmp1 = strchr (url, ':');
+      if (!tmp1)
+        frag = g_strdup (url);
+      else
+        frag = g_strndup (url, tmp1 - url);
+      g_strstrip (frag);
+      gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
+      href = g_strconcat ("xref:", frag, NULL);
+      g_free (frag);
+    }
 
-	g_free (link_text);
-	xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
+    /* Check we've got a valid paragraph node */
+    if (!paragraph) {
+      paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
+    }
+
+    for (ulink = urls; *ulink != NULL; ulink++) {
+      if (ulink == urls)
+        link_text = g_strconcat ("*Note", *ulink, NULL);
+      else {
+        gchar *spacing = *ulink;
+        gchar *tmp;
+        gint count = 0;
+        while (*spacing == ' ') {
+          spacing++;
+          count++;
+        }
+        if (spacing != *ulink) {
+          if (count > 1)
+            spacing-=2;
+          tmp = g_strndup (*ulink, spacing-*ulink);
+          if (count > 1)
+            spacing+=2;
+          xmlNewTextChild (paragraph, NULL, BAD_CAST "spacing",
+                           BAD_CAST tmp);
+          g_free (tmp);
+          link_text = g_strdup (spacing);
+        } else {
+          link_text = g_strdup (*ulink);
+        }
       }
-      g_strfreev (urls);
-      /* Finally, we can add the text as required */
-      info_body_text (holder, NULL, TRUE, append);
-      g_free (url);
-      g_free (href);
+      ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
+                              BAD_CAST link_text);
+      if (*(ulink+1) != NULL)
+        info_body_text (*node, &paragraph, NULL, FALSE, "");
+
+      g_free (link_text);
+      xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
     }
-    g_strfreev (subnotes);
+    g_strfreev (urls);
+    /* Finally, we can add the text as required */
+    info_body_text (*node, &paragraph, NULL, TRUE, append);
+    g_free (url);
+    g_free (href);
   }
   g_strfreev (notes);
 }
diff --git a/stylesheets/info2html.xsl.in b/stylesheets/info2html.xsl.in
index 1117a80..a97b054 100644
--- a/stylesheets/info2html.xsl.in
+++ b/stylesheets/info2html.xsl.in
@@ -107,11 +107,12 @@ a.navbar-next::after {
 <!-- = Normal Matches = -->
 
 <xsl:template match="para">
-  <span class="fixed">
-    <xsl:value-of select="node()"/>
-    <xsl:text>
-  </xsl:text>
-  </span>
+  <p>
+    <span class="fixed">
+      <!-- Apply templates for <a> tags and copy text straight through. -->
+      <xsl:apply-templates select="./text()|*"/>
+    </span>
+  </p>
 </xsl:template>
 
 <xsl:template match="para1">
@@ -174,8 +175,4 @@ a.navbar-next::after {
   </xsl:element>
 </xsl:template>
 
-<xsl:template match="noteholder">
-  <xsl:apply-templates select="node()[not(self::noteholder)]"/>
-</xsl:template>
-
 </xsl:stylesheet>
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]