[gtk-doc] New MarkDown parser

From: William Jon McCann <mccann src gnome org>
To: commits-list gnome org
Cc:
Subject: [gtk-doc] New MarkDown parser
Date: Tue, 4 Feb 2014 21:22:26 +0000 (UTC)
commit 973687ea08362961c21e93d273433be70137be9d
Author: William Jon McCann <william jon mccann gmail com>
Date:   Thu Jan 30 18:13:51 2014 -0500

    New MarkDown parser
    
    Much more robust and complete MarkDown parser inspired by
    ParseDown http://parsedown.org/
    
    https://bugzilla.gnome.org/show_bug.cgi?id=723417

 gtkdoc-mkdb.in              |  500 ++++++++++++++++++++++++++++++-------------
 tests/gobject/src/gobject.c |    5 +
 2 files changed, 361 insertions(+), 144 deletions(-)
---
diff --git a/gtkdoc-mkdb.in b/gtkdoc-mkdb.in
index 9b47d0c..31e227b 100755
--- a/gtkdoc-mkdb.in
+++ b/gtkdoc-mkdb.in
@@ -25,6 +25,7 @@
 # Description : This creates the DocBook files from the edited templates.
 #############################################################################
 
+use warnings;
 use strict;
 use Getopt::Long;
 
@@ -4569,120 +4570,391 @@ sub IsEmptyDoc {
     return 0;
 }
 
-my %md_in_tags;
+#############################################################################
+# Function    : ConvertMarkDown
+# Description : Converts mark down syntax to the respective docbook.
+#               http://de.wikipedia.org/wiki/Markdown
+#               Inspired by the design of ParseDown
+#               http://parsedown.org/
+#               Copyright (c) 2013 Emanuil Rusev, erusev.com
+# Arguments   : the doc-string, the symbol name
+#############################################################################
 
-# If the tag is open, close it and update counter
-sub ConvertMarkDownTerminateTag {
-  my ($key) = @_;
-  my $text = "";
+sub ConvertMarkDown {
+    my ($text, $symbol) = @_;
 
-  if ($md_in_tags{$key} > 0) {
-    $text .= "</$key>\n";
-    $md_in_tags{$key}--;
-  }
+    $text = &MarkDownParse ($text);
 
-  return $text;
+    return $text
 }
 
-sub ConvertMarkDownOpenTag {
-  my ($key) = @_;
-  my $text = "<$key>\n";
+# SUPPORTED MARKDOWN
+# ==================
+#
+# Atx-style Headers
+# -----------------
+#
+# # Header 1
+#
+# ## Header 2 ##
+#
+# Setext-style Headers
+# --------------------
+#
+# Header 1
+# ========
+#
+# Header 2
+# --------
+#
+# Ordered (unnested) Lists
+# ------------------------
+#
+# 1. item 1
+#
+# 1. item 2 with loooong
+#    description
+#
+# 3. item 3
+#
+# Note: we require a blank line above the list items
+#
 
-  $md_in_tags{$key}++;
+# TODO(ensonic): it would be nice to add id parameters to the refsect2 elements
+
+sub MarkDownParseBlocks {
+  my ($linesref, $context) = @_;
+  my $line;
+  my @md_blocks = ();
+  my $md_block = { type => "" };
+
+ OUTER: foreach $line (@$linesref) {
+    my $first_char = substr ($line, 0, 1);
+    my $deindented_line = $line;
+    $deindented_line =~ s/^\s+//;
+
+    if ($md_block->{"type"} eq "heading") {
+      # a heading is ended by any level less than or equal
+      if ($md_block->{"level"} == 1) {
+        if ($line =~ /^={4,}[ \t]*$/) {
+          my $text = pop $md_block->{"lines"};
+          $md_block->{"interrupted"} = 0;
+          push @md_blocks, $md_block;
+
+          $md_block = { type => "heading",
+                        text => $text,
+                        lines => [],
+                        level => 1 };
+          next OUTER;
+        } elsif ($line =~ /^[#][ \t]+(.+?)[ \t]*[#]*\s*$/) {
+          $md_block->{"interrupted"} = 0;
+          push @md_blocks, $md_block;
+
+          $md_block = { type => "heading",
+                        text => $1,
+                        lines => [],
+                        level => 1 };
+          next OUTER;
+        } else {
+          # push lines into the block until the end is reached
+          push $md_block->{"lines"}, $line;
+          next OUTER;
+        }
+      } else {
+        if ($line =~ /^[=]{4,}[ \t]*$/) {
+          my $text = pop $md_block->{"lines"};
+          $md_block->{"interrupted"} = 0;
+          push @md_blocks, $md_block;
+
+          $md_block = { type => "heading",
+                        text => $text,
+                        lines => [],
+                        level => 1 };
+          next OUTER;
+        } elsif ($line =~ /^[-]{4,}[ \t]*$/) {
+          my $text = pop $md_block->{"lines"};
+          $md_block->{"interrupted"} = 0;
+          push @md_blocks, $md_block;
+
+          $md_block = { type => "heading",
+                        text => $text,
+                        lines => [],
+                        level => 2 };
+          next OUTER;
+        } elsif ($line =~ /^([#]{1,2})[ \t]+(.+?)[ \t]*[#]*\s*$/) {
+          $md_block->{"interrupted"} = 0;
+          push @md_blocks, $md_block;
+
+          $md_block = { type => "heading",
+                        text => $2,
+                        lines => [],
+                        level => length($1) };
+          next OUTER;
+        } else {
+          # push lines into the block until the end is reached
+          push $md_block->{"lines"}, $line;
+          next OUTER;
+        }
+      }
+    } elsif ($md_block->{"type"} eq "code") {
+      push $md_block->{"lines"}, $line;
+      if ($line =~ /^[ \t]*\]\|/) {
+        push @md_blocks, $md_block;
+        $md_block = { type => "paragraph",
+                      text => "",
+                      lines => [] };
+      }
+      next OUTER;
+    }
 
-  return $text;
-}
+    if ($deindented_line eq "") {
+      $md_block->{"interrupted"} = 1;
+      next;
+    }
 
-#############################################################################
-# Function    : ConvertMarkDown
-# Description : Converts mark down syntax to the respective docbook, but only
-#               outside CDATA and <programlisting> tags.
-#               http://de.wikipedia.org/wiki/Markdown
-#               Code snippets have been takesn from
-#               http://daringfireball.net/projects/markdown/
-#                 Copyright (c) 2004 John Gruber
-# Arguments   : the doc-string, the symbol name
-#############################################################################
-sub ConvertMarkDown {
-    my ($text, $symbol) = @_;
+    if ($md_block->{"type"} eq "li") {
+      if ($line =~ /^([ ]{0,3})(\d+[.]|[*+-])[ ](.*)/) {
+        my $indentation = $1;
+        if ($md_block->{"indentation"} ne $indentation) {
+          push $md_block->{"lines"}, $line;
+        } else {
+          my $lines = $3;
+          my $ordered = $md_block->{"ordered"};
+          $lines =~ s/^[ ]{0,4}//;
+          $md_block->{"last"} = 0;
+          push @md_blocks, $md_block;
+          $md_block = { type => "li",
+                        ordered => $ordered,
+                        indentation => $indentation,
+                        first => 0,
+                        last => 1,
+                        lines => [ $lines ] };
+        }
+        next OUTER;
+      }
 
-    # reset state
-    $md_in_tags{"para"} = 0;
-    $md_in_tags{"refsect2"} = 0;
-    $md_in_tags{"refsect3"} = 0;
-    $md_in_tags{"itemizedlist"} = 0;
-    $md_in_tags{"orderedlist"} = 0;
+      if ($md_block->{"interrupted"}) {
+        if ($first_char eq " ") {
+          push $md_block->{"lines"}, "";
+          $line =~ s/^[ ]{0,4}//;
+          push $md_block->{"lines"}, $line;
+          $md_block->{"interrupted"} = 0;
+          next OUTER;
+        }
+      } else {
+        $line =~ s/^[ ]{0,4}//;
+        push $md_block->{"lines"}, $line;
+        next OUTER;
+      }
+    }
 
-    $text = ConvertMarkDownOpenTag ("para") . $text;
+    # indentation sensitive types
 
-    # convert
-    $text = &ModifyXMLElements ($text, $symbol,
-                               "<!\\[CDATA\\[|<programlisting[^>]*>|\\|\\[",
-                               \&ConvertMarkDownEndTag,
-                               \&ConvertMarkDownCallback);
+    if ($line =~ /^([#]{1,2})[ \t]+(.+?)[ \t]*[#]*\s*$/) {
+      # atx heading (#)
+      push @md_blocks, $md_block;
 
-    $text .= &ConvertMarkDownTerminateTag ("para");
-    $text .= &ConvertMarkDownTerminateTag ("itemizedlist");
-    $text .= &ConvertMarkDownTerminateTag ("refsect3");
-    $text .= &ConvertMarkDownTerminateTag ("refsect2");
+      $md_block = { type => "heading",
+                    text => $2,
+                    lines => [],
+                    level => length($1) };
 
-    return $text
-}
+      next OUTER;
+    } elsif ($line =~ /^={4,}[ \t]*$/) {
+      # setext heading (====)
 
-sub ConvertMarkDownEndTag {
-  if ($_[0] eq "<!\[CDATA\[") {
-    return "]]>";
-  } elsif ($_[0] eq "|[") {
-    return "]\\|";
-  } else {
-    return "</programlisting>";
-  }
-}
+      if ($md_block->{"type"} eq "paragraph" && $md_block->{"interrupted"}) {
+        push @md_blocks, $md_block;
+        $md_block->{"type"} = "heading";
+        $md_block->{"lines"} = [];
+        $md_block->{"level"} = 1;
+      }
 
-sub ReplaceMarkDownSections {
-  my ($title, $depth) = @_;
-  my $result = "";
-  my $tag = "refsect3";
+      next OUTER;
+    } elsif ($line =~ /^-{4,}[ \t]*$/) {
+      # setext heading (-----)
 
-  $result .= &ConvertMarkDownTerminateTag ("para");
-  $result .= &ConvertMarkDownTerminateTag ("refsect3");
+      if ($md_block->{"type"} eq "paragraph" && $md_block->{"interrupted"}) {
+        push @md_blocks, $md_block;
+        $md_block->{"type"} = "heading";
+        $md_block->{"lines"} = [];
+        $md_block->{"level"} = 2;
+      }
 
-  if ($depth == 1) {
-    $tag = "refsect2";
-    $result .= &ConvertMarkDownTerminateTag ("refsect2");
+      next OUTER;
+    } elsif ($line =~ /^[ \t]*\|\[/) {
+      # code
+      $md_block->{"interrupted"} = 1;
+      push @md_blocks, $md_block;
+
+      $md_block = { type => "code",
+                    lines => [ $line ] };
+      next OUTER;
+    }
+
+    # indentation insensitive types
+
+    if ($line =~ /^([ ]*)[*+-][ ](.*)/) {
+      # li
+      push @md_blocks, $md_block;
+      my $lines = $2;
+      my $indentation = $1;
+      $lines =~ s/^[ ]{0,4}//;
+      $md_block = { type => "li",
+                    ordered => 0,
+                    indentation => $indentation,
+                    first => 1,
+                    last => 1,
+                    lines => [ $lines ] };
+      next OUTER;
+    }
+
+    # list item
+
+    if ($line =~ /^([ ]{0,4})\d+[.][ ]+(.*)/) {
+      push @md_blocks, $md_block;
+      my $lines = $2;
+      my $indentation = $1;
+      $lines =~ s/^[ ]{0,4}//;
+
+      $md_block = { type => "li",
+                    ordered => 1,
+                    indentation => $indentation,
+                    first => 1,
+                    last => 1,
+                    lines => [ $lines ] };
+
+      next;
+    }
+
+    # paragraph
+    if ($md_block->{"type"} eq "paragraph") {
+      if ($md_block->{"interrupted"}) {
+        push @md_blocks, $md_block;
+        $md_block = { type => "paragraph",
+                      interrupted => 0,
+                      text => $line };
+      } else {
+        $md_block->{"text"} .= "\n" . $line;
+      }
+    } else {
+      push @md_blocks, $md_block;
+      $md_block = { type => "paragraph",
+                    text => $line };
+    }
   }
 
-  $result .= ConvertMarkDownOpenTag ($tag);
-  $result .= "<title>$title</title>\n";
-  $result .= ConvertMarkDownOpenTag ("para");
+  push @md_blocks, $md_block;
+
+  shift @md_blocks;
 
-  return $result;
+  return @md_blocks;
 }
 
-sub ReplaceMarkDownListItem {
-  my ($block, $type, $is_last) = @_;
-  my $result = "";
+sub MarkDownParseSpanElements {
+  my ($text) = @_;
 
-  if ($md_in_tags{$type} < 1) {
-    $result .= ConvertMarkDownOpenTag ($type);
-  }
+  return $text;
+}
+
+sub MarkDownOutputDocBook {
+  my ($blocksref, $context) = @_;
+  my $output = "";
+  my $block;
+  my @blocks = @$blocksref;
+
+  foreach $block (@blocks) {
+    my $text;
+    my $title;
+
+    if ($block->{"type"} eq "paragraph") {
+      $text = &MarkDownParseSpanElements ($block->{"text"});
+
+      if ($context eq "li" && $output eq "") {
+        if ($block->{"interrupted"}) {
+          $output .= "\n"."<para>".$text."</para>"."\n";
+        } else {
+          $output .= $text;
+          if ($#blocks > 0) {
+            $output .= "\n";
+          }
+        }
+      } else {
+        $output .= "<para>".$text."</para>"."\n";
+      }
+
+    } elsif ($block->{"type"} eq "heading") {
+      my $tag;
+
+      $title = &MarkDownParseSpanElements ($block->{"text"});
+      if ($block->{"level"} == 1) {
+        $tag = "refsect2";
+      } else {
+        $tag = "refsect3";
+      }
 
-  $result .= "<listitem><para>$block</para></listitem>";
+      $text = &MarkDownParseLines ($block->{"lines"}, "heading");
+      $output .= "<".$tag."><title>".$title."</title>".$text."</".$tag.">\n";
+
+    } elsif ($block->{"type"} eq "li") {
+      my $tag = "itemizedlist";
+
+      if ($block->{"first"}) {
+        if ($block->{"ordered"}) {
+          $tag = "orderedlist";
+        }
+        $output .= "<".$tag.">\n";
+      }
+
+      if ($block->{"interrupted"}) {
+        push $block->{"lines"}, "";
+      }
 
-  if ($is_last == 1) {
-    $result .= &ConvertMarkDownTerminateTag ($type);
+      $text = &MarkDownParseLines ($block->{"lines"}, "li");
+      $output .= "<listitem>".$text."</listitem>\n";
+      if ($block->{"last"}) {
+        if ($block->{"ordered"}) {
+          $tag = "orderedlist";
+        }
+        $output .= "</".$tag.">\n";
+      }
+    } elsif ($block->{"type"} eq "code") {
+      foreach (@{$block->{"lines"}}) {
+        $output .= $_ . "\n";
+      }
+    } else {
+      $output .= $block->{"text"}."\n";
+    }
   }
 
-  return $result;
+  return $output;
+}
+
+sub MarkDownParseLines {
+  my ($linesref, $context) = @_;
+  my $output;
+  my @lines = @$linesref;
+  my @blocks;
+
+  @blocks = &MarkDownParseBlocks (\@lines, $context);
+  $output = &MarkDownOutputDocBook (\@blocks, $context);
+
+  return $output;
 }
 
-sub ReplaceMarkDownPara {
-  my $result = "";
+sub MarkDownParse {
+  my ($text) = @_;
+  my @lines;
 
-  $result .= &ConvertMarkDownTerminateTag ("para");
-  $result .= ConvertMarkDownOpenTag ("para");
+  # take out some variability in line endings
+  $text =~ s%\r\n%\n%g;
+  $text =~ s%\r%\n%g;
 
-  return $result;
+  # split lines
+  @lines = split("\n", $text);
+  $text = MarkDownParseLines(\@lines, "");
+
+  return $text;
 }
 
 sub ConvertMarkDownCallback {
@@ -4691,67 +4963,7 @@ sub ConvertMarkDownCallback {
   # If we're not in CDATA or a <programlisting> we convert blank lines so
   # they start a new <para>.
   if ($tag eq "") {
-
-    # TODO(ensonic): it would be nice to add id parameters to the refsect2 elements
-
-    # Setext-style headers:
-    #          Header 1
-    #          ========
-    #
-    #          Header 2
-    #          --------
-    #
-    $text =~ s%(?<=\n)(.+)[ \t]*\n={4,}[ \t]*\n\n%ReplaceMarkDownSections($1, 1);%egm;
-    $text =~ s%(?<=\n)(.+)[ \t]*\n-{4,}[ \t]*\n\n%ReplaceMarkDownSections($1, 2);%egm;
-
-    # atx-style headers:
-    #        # Header 1
-    #        ## Header 2
-    #        ## Header 2 with closing hashes ##
-    #        ...
-    #        ###### Header 6
-    #
-    $text =~ s%(?<=\n)(\#{1,2})[ \t]+(.+?)[ \t]*\#*\n+%ReplaceMarkDownSections($2, length($1));%egm;
-
-    # Simple (unnested) lists:
-    #   Please select:
-    #   - item 1
-    #   - item 2 with loooong
-    #     description
-    #   - item 3
-    #
-    #   New paragraph.
-    $text.="\n"; # we need a new line to avoid too complicated matching rules below
-    our $is_last = 0;
-    $text =~ s%(?<=\n)-\s+(.+?)(?=(?:\n-\s+(?{$is_last=0}))|(?:\n\n[^ \-\t](?{$is_last=1}))|(?:\n$(?{$is_last=1})))%ReplaceMarkDownListItem($1, "itemizedlist", $is_last)%egs;
-    chomp $text;
-
-    # Simple (unnested) lists:
-    #   Please select:
-    #   * item 1
-    #   * item 2 with loooong
-    #     description
-    #   * item 3
-    #
-    #   New paragraph.
-    $text.="\n"; # we need a new line to avoid too complicated matching rules below
-    $text =~ s%(?<=\n)\*\s+(.+?)(?=(?:\n\*\s+(?{$is_last=0}))|(?:\n\n[^ *\t](?{$is_last=1}))|(?:\n$(?{$is_last=1})))%ReplaceMarkDownListItem($1, "itemizedlist", $is_last)%egs;
-    chomp $text;
-
-    # Ordered (unnested) lists:
-    #   Please select:
-    #   1. item 1
-    #   1. item 2 with loooong
-    #     description
-    #   3. item 3
-    #
-    #   New paragraph.
-    $text.="\n"; # we need a new line to avoid too complicated matching rules below
-    $text =~ s%(?<=\n)\d+\.\s+(.+?)(?=(?:\n\d+\.\s+(?{$is_last=0}))|(?:\n\n[^ \d\t](?{$is_last=1}))|(?:\n$(?{$is_last=1})))%ReplaceMarkDownListItem($1, "orderedlist", $is_last)%egs;
-    chomp $text;
-
-    # Make Paragraphs on blank lines
-    $text =~ s%\n{2,}%ReplaceMarkDownPara()%eg;
+    $text = &MarkDownParse ($text);
   }
 
   return $text;
diff --git a/tests/gobject/src/gobject.c b/tests/gobject/src/gobject.c
index 67ee27b..8497b45 100644
--- a/tests/gobject/src/gobject.c
+++ b/tests/gobject/src/gobject.c
@@ -35,9 +35,12 @@
  * </informalexample>
  *
  * This example serves two main purposes:
+ *
  * - testing conversion (long description
  *   follows here)
+ *
  * - catching bugs
+ *
  * - having an example
  *
  * # Discussion
@@ -54,6 +57,7 @@
  * </orderedlist>
  *
  * This example serves two main purposes:
+ *
  * * testing alternate list syntax
  *
  *   With section text in each.
@@ -104,6 +108,7 @@
  * =========
  *
  * All the internal details go here or not:
+ *
  * - single item list
  */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]