[glibmm] Make splitting tokens more robust in GtkDefs.pm and Enum.pm.

From: Krzesimir Nowak <krnowak src gnome org>
To: commits-list gnome org
Cc:
Subject: [glibmm] Make splitting tokens more robust in GtkDefs.pm and Enum.pm.
Date: Tue, 9 Mar 2010 18:23:01 +0000 (UTC)
commit ea4170fa33a528717492b3b399308150a6d81683
Author: Krzesimir Nowak <qdlacz gmail com>
Date:   Tue Jan 5 18:25:24 2010 +0100

    Make splitting tokens more robust in GtkDefs.pm and Enum.pm.
    
    * tools/pm/Enum.pm: Written splitter for values in
    (flags|enum)-extended defs.
    * tools/pm/GtkDefs.pm: Written splitter for general defs.

 tools/pm/Enum.pm    |  111 ++++++++++++++++++++++++++++++++++++++--
 tools/pm/GtkDefs.pm |  141 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 227 insertions(+), 25 deletions(-)
---
diff --git a/tools/pm/Enum.pm b/tools/pm/Enum.pm
index 47485e1..6096645 100644
--- a/tools/pm/Enum.pm
+++ b/tools/pm/Enum.pm
@@ -31,6 +31,106 @@ our @EXPORT_OK;
 #       bool mark;
 #    }
 
+#
+# private functions:
+#
+# split_enum_tokens($str): list of the quoted triples found inside parens.
+sub split_enum_tokens($)
+{
+  my ($token_string) = @_;
+  my @tokens = ();
+  # index of the first opening double quote seen after an opening paren - the
+  # beginning of a new token.
+  my $begin_token = 0;
+  # index of the most recent closing double quote - the end of a token.
+  my $end_token = 0;
+  # whether we are inside a double-quoted string.
+  my $inside_dquotes = 0;
+  # whether we are inside single quotes within a double-quoted string (for
+  # situations like "'"'").
+  my $inside_squotes = 0;
+  my $len = length($token_string);
+  # whether we have seen an opening paren and are still waiting for the first
+  # opening double quote after it.
+  my $near_begin = 0;
+  # number of double-quoted strings opened since the last closing paren.
+  my $dq_count = 0;
+  # whether the previous char was an unescaped backslash - only tracked while
+  # inside double quotes.
+  my $backslash = 0;
+  for (my $index = 0; $index < $len; $index++)
+  {
+    my $char = substr($token_string, $index, 1);
+    if ($inside_dquotes)
+    {
+      # if the previous char was a backslash, the current char is escaped - we
+      # stay inside the double (or double/single) quotes whatever it is.
+      if ($backslash)
+      {
+        $backslash = 0;
+      }
+      # if the current char is a backslash, the next char is escaped.
+      elsif ($char eq '\\')
+      {
+        $backslash = 1;
+      }
+      # if the current char is an unescaped double quote and we are not inside
+      # single quotes, the string ends here. remember this index as the end of
+      # the token in case a closing paren follows.
+      elsif ($char eq '"' and not $inside_squotes)
+      {
+        $inside_dquotes = 0;
+        $end_token = $index;
+      }
+      # if the current char is a single quote, toggle the inside-single-quotes
+      # state. NOTE(review): a lone apostrophe leaves the state set - confirm.
+      elsif ($char eq '\'')
+      {
+        $inside_squotes = not $inside_squotes;
+      }
+    }
+    # the current char is an opening paren - the next opening double quote
+    # will mark the beginning of a token.
+    elsif ($char eq '(')
+    {
+      $near_begin = 1;
+    }
+    # the current char is a closing paren - the token runs from $begin_token
+    # through the last closing double quote ($end_token).
+    elsif ($char eq ')')
+    {
+      my $token_len = $end_token + 1 - $begin_token;
+      my $token = substr($token_string, $begin_token, $token_len);
+      # a valid value statement holds exactly three double-quoted strings.
+      if ($dq_count == 3)
+      {
+        push(@tokens, $token);
+      }
+      else
+      {
+        print STDERR "Wrong value statement while parsing ($token)\n";
+      }
+      $dq_count = 0;
+    }
+    # the current char is an opening double quote - it starts a string, and
+    # possibly (right after an opening paren) a token.
+    elsif ($char eq '"')
+    {
+      if ($near_begin)
+      {
+        $begin_token = $index;
+        $near_begin = 0;
+      }
+      $inside_dquotes = 1;
+      $dq_count++;
+    }
+  }
+  return @tokens;
+}
+
+#
+# end of private functions.
+#
 
 sub new
 {
@@ -82,13 +182,11 @@ sub parse_values($$)
   my $elem_names  = [];
   my $elem_values = [];
   my $common_prefix = undef;
-
-  # break up the value statements
-  foreach(split(/\s*'*[()]\s*/, $value))
+  # break up the value statements - it works with parens inside double quotes
+  # and handles triples like '("dq-token", "MY_SCANNER_DQ_TOKEN", "'"'").
+  foreach (split_enum_tokens($value))
   {
-    next if($_ eq "");
-
-    if(/^"\S+" "(\S+)" "([^"]+)"$/)
+    if (/^"\S+" "(\S+)" "(.+)"$/)
     {
       my ($name, $value) = ($1, $2);
 
@@ -143,6 +241,7 @@ sub beautify_values($)
   # Continuous?  (Aliases to prior enum values are allowed.)
   foreach my $value (@$elem_values)
   {
+    return if ($value =~ /[G-WY-Zg-wy-z_]/);
     return if(($value < $first) || ($value > $prev + 1));
     $prev = $value;
   }
diff --git a/tools/pm/GtkDefs.pm b/tools/pm/GtkDefs.pm
index 7c791b8..b0002e2 100644
--- a/tools/pm/GtkDefs.pm
+++ b/tools/pm/GtkDefs.pm
@@ -95,25 +95,31 @@ sub read_defs($$;$)
   # break the tokens into lisp phrases up to three levels deep.
   #   WARNING: reading the following perl statement may induce seizures,
   #   please flush eyes with water immediately, and consult a mortician.
-  my @tokens = split(
-    m/(
-        \(
-        (?:
-            [^()]*
-            \(
-            (?:
-                [^()]*
-                \(
-                [^()]*
-                \)
-            )*
-            [^()]*
-            \)
-        )*
-        [^()]*
-        \)
-    )/x,
-    read_file($path, $filename));
+  #
+  # this regexp is weak - it does not work on multiple and/or unpaired parens
+  # inside double quotes - those shouldn't be ever considered. i replaced this
+  # splitting with my own function, which does the job very well - krnowak.
+#  my @tokens = split(
+#    m/(
+#        \(
+#        (?:
+#            [^()]*
+#            \(
+#            (?:
+#                [^()]*
+#                \(
+#                [^()]*
+#                \)
+#            )*
+#            [^()]*
+#            \)
+#        )*
+#        [^()]*
+#        \)
+#    )/x,
+#    read_file($path, $filename));
+
+  my @tokens = split_tokens(read_file($path, $filename));
 
   # scan through top level tokens
   while ($#tokens > -1)
@@ -161,6 +167,103 @@ sub read_defs($$;$)
   }
 }
 
+sub split_tokens($)
+{
+  my ($token_string) = @_;
+  my @tokens = ();
+  # whether we are inside a double-quoted string.
+  my $inside_dquotes = 0;
+  # whether we are inside single quotes within a double-quoted string (for
+  # situations like "'"'").
+  my $inside_squotes = 0;
+  # number of opening parens that have not been closed yet.
+  my $parens = 0;
+  my $len = length($token_string);
+  # whether the previous char was an unescaped backslash - only tracked while
+  # inside double quotes.
+  my $backslash = 0;
+  # index of the opening paren that started the current top-level token.
+  my $begin_token = 0;
+
+  for (my $index = 0; $index < $len; $index++)
+  {
+    my $char = substr($token_string, $index, 1);
+    # inside a double-quoted string: only escapes and quotes matter here.
+    if ($inside_dquotes)
+    {
+      # if the previous char was a backslash, the current char is escaped - we
+      # stay inside the double (or double/single) quotes whatever it is.
+      if ($backslash)
+      {
+        $backslash = 0;
+      }
+      # if the current char is a backslash, the next char is escaped.
+      elsif ($char eq '\\')
+      {
+        $backslash = 1;
+      }
+      # if the current char is an unescaped double quote and we are not inside
+      # single quotes, the string ends here.
+      elsif ($char eq '"' and not $inside_squotes)
+      {
+        $inside_dquotes = 0;
+      }
+      # if the current char is an unescaped single quote, there are two cases:
+      # 1. it is just a plain apostrophe.
+      # 2. it is part of a C character literal:
+      #  a) the opening quote,
+      #  b) the closing quote.
+      # if a second quote follows 2 or 3 chars later, then it is 2a;
+      # if 2a occurred earlier, then it is 2b;
+      # otherwise it is 1.
+      elsif ($char eq '\'')
+      {
+        # if we are already inside single quotes, it is 2b.
+        if ($inside_squotes)
+        {
+          $inside_squotes = 0;
+        }
+        else
+        {
+          # if a closing quote is near ('x' or '\x'), it is 2a.
+          if (substr($token_string, $index, 4) =~ /^'\\?.'/)
+          {
+            $inside_squotes = 1;
+          }
+          # otherwise it is case 1 - nothing to do.
+        }
+      }
+    }
+    # an opening double quote - the beginning of a string.
+    elsif ($char eq '"')
+    {
+      $inside_dquotes = 1;
+    }
+    # an opening paren - if no paren is open, a new token begins here.
+    elsif ($char eq '(')
+    {
+      unless ($parens)
+      {
+        $begin_token = $index;
+      }
+      $parens++;
+    }
+    # a closing paren - if it closes the outermost open paren, the token ends
+    # here, so we cut it out of the string and push it onto the token list.
+    elsif ($char eq ')')
+    {
+      $parens--;
+      unless ($parens)
+      {
+        my $token_len = $index + 1 - $begin_token;
+        my $token = substr($token_string, $begin_token, $token_len);
+        push(@tokens, $token);
+      }
+    }
+    # other chars need no action. NOTE(review): stray ')' makes $parens < 0.
+  }
+  return @tokens;
+}
 
 sub read_file($$)
 {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]