[PATCH] Reimplement Roman numeral detection

From: Santtu Lakkala <inz inz fi>
To: easytag-list gnome org
Subject: [PATCH] Reimplement Roman numeral detection
Date: Fri, 4 Apr 2014 08:06:35 +0300
Roman numeral detection code used complicated process of check, parse, build,
compare to detect if a string was a valid roman numeral. Reimplemented as a
single step, that only checks for the validity (and well-formedness) of the
string, without computing its value.

It uses the same modern Roman numeral notation, including subtractive
notation.
---
 src/scan.c | 427 +++++--------------------------------------------------------
 1 file changed, 35 insertions(+), 392 deletions(-)

diff --git a/src/scan.c b/src/scan.c
index a926ddc..37ce750 100644
--- a/src/scan.c
+++ b/src/scan.c
@@ -1831,417 +1831,60 @@ handle_error:
     goto out;
 }
 
-/*
- * Quick roman numeral check (non-roman numerals may also return true)
- * Patch from Slash Bunny (2007.08.12)
- * (http://home.hiwaay.net/~lkseitz/math/roman/numerals.shtml)
- *    I = 1    (one)
- *    V = 5    (five)
- *    X = 10   (ten)
- *    L = 50   (fifty)
- *    C = 100  (one hundred)
- *    D = 500  (five hundred)
- *    M = 1000 (one thousand)
- */
 static gint
 Scan_Word_Is_Roman_Numeral (const gchar *text)
 {
-    const gchar *tmp;
-    gint  len;
-    gchar *buf = NULL;
-    gint   rn_int;
-    gchar *rn_char;
-
-    if (!ScannerWindow
-    ||  !ProcessFieldsDetectRomanNumerals
-    ||  !gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(ProcessFieldsDetectRomanNumerals)))
-        return 0;
-    
-    tmp = text;
-    len = 0;
-
-    while (*tmp)
-    {
-        if (*tmp == (gunichar)'i'
-        ||  *tmp == (gunichar)'v'
-        ||  *tmp == (gunichar)'x'
-        ||  *tmp == (gunichar)'l'
-        ||  *tmp == (gunichar)'c'
-        ||  *tmp == (gunichar)'d'
-        ||  *tmp == (gunichar)'m')
-        {
-            // Found a roman numeral => continue
-            tmp++;
-            len++;
-        } else if (*tmp == ' '
-               ||  *tmp == '_'
-               ||  *tmp == '.'
-               ||  *tmp == ','
-               ||  *tmp == '-')
-        {
-            // A separator was found => end of word
-            // Check if it is a valid roman numeral
-            goto roman_numeral_found;
-            
-        } else
-        {
-            return 0;
-        }
-    }
-
-    // Found in last word of the string
-    
-roman_numeral_found:
-    // Check if it is a valid roman numeral
-    buf = g_strndup(text,len);
-    rn_int = roman2int(buf); // Convert the Roman numeral string to integer
-    
-    if (rn_int >= 0 )
-    {
-        // Some strings as "IIIII" or "CCCCC" are returned as valid, which is not correct...
-        // Same problem with: IC MIC IM MIM IL CIL XM MXM VC MVC VM MVM VL MVL LC MLC LD MLD LM MLM MDM 
-        // So we convert it again to a string, and compare to the initial one.
-        rn_char = (gchar *)int2roman(rn_int); // Convert the Roman numeral integer to string
-        if (rn_char
-        && strcasecmp(buf,rn_char)==0)
-        {
-            g_free(buf);
-            g_free(rn_char);
-            return len; // Roman numeral valid
-        }else
-        {
-            g_free(buf);
-            g_free(rn_char);
-            return 0;
-        }
-    }else
-    {
-        g_free(buf);
-        return 0;
-    }
-}
+    /* No need for caseless strchr */
+    static const gchar romans[] = "MmDdCcLlXxVvIi";
 
+    gsize next_allowed = 0;
+    gsize prev = 0;
+    gsize count = 0;
 
+    const gchar *i;
 
-/*
- * Taken from :
- *   Roman Numeral Conversion API (http://sourceforge.net/project/showfiles.php?group_id=211070)
- *    Copyright (c) 2007 David M. Syzdek <roman-project syzdek net>
- */
-/* Convert Roman numeral from integer to string */
-static const char *
-int2roman (int num)
-{
-    #define ROMAN_BUFF_LEN 512
-    
-    /* buffer for storing conversions */
-    char roman_string[ROMAN_BUFF_LEN];
-
-    /* wrap long2roman_r() with library buffer */
-    char *result = int2roman_r(num, roman_string, ROMAN_BUFF_LEN);
-   
-    if (!result)
-         return NULL;
-    return g_strdup(roman_string);
-}
+    for (i = text; *i; i++) {
+        const char *s = strchr (romans, *i);
+        if (s) {
+            gsize c = (s - romans) / 2;
 
-static char *
-int2roman_r (int num, char * str, size_t len)
-{
-   // local variables
-   unsigned pos;
-   unsigned u;
-   unsigned dividend;
-
-   g_return_val_if_fail (str != NULL, NULL);
-
-   // verify that number is withing boundaries
-   if ((num > 5000) || (num < 0))
-   {
-      return NULL;
-   };
-
-   // sets initial values
-   pos = 0;
-   memset(str, 0, len);
-   len--;
-
-   // checks for nullae
-   if (!(num))
-   {
-      str[0] = 'N';
-      return str;
-   };
-
-   // calculates thousands
-   dividend = num/1000;
-   if (dividend > (len-1))
-   {
-      return NULL;
-   };
-   for (u = 0; u < dividend; u++)
-      str[pos+u] = 'M';
-   num %= 1000;
-   pos += u;
-
-   // calculates hundreds
-   dividend = num/100;
-   if (dividend > (len-1-pos))
-   {
-      return NULL;
-   };
-   if (dividend == 9)
-   {
-      str[pos+0] = 'C';
-      str[pos+1] = 'M';
-      pos += 2;
-      dividend = 0;
-   };
-   if (dividend >= 5)
-   {
-      str[pos] = 'D';
-      dividend -= 5;
-      pos++;
-   };
-   if (dividend == 4)
-   {
-      str[pos+0] = 'C';
-      str[pos+1] = 'D';
-      dividend -= 4;
-      pos += 2;
-   };
-   for(u = 0; u < dividend; u++)
-      str[pos+u] = 'C';
-   pos += u;
-   num %= 100;
-
-   // calculates tens
-   dividend = num/10;
-   if (dividend > (len-1-pos))
-   {
-      return NULL;
-   };
-   if (dividend == 9)
-   {
-      str[pos+0] = 'X';
-      str[pos+1] = 'C';
-      dividend = 0;
-      pos += 2;
-   };
-   if (dividend >= 5)
-   {
-      str[pos+0] = 'L';
-      dividend -= 5;
-      pos++;
-   };
-   if (dividend == 4)
-   {
-      str[pos+0] = 'X';
-      str[pos+1] = 'L';
-      pos += 2;
-      dividend -= 4;
-   };
-   for (u = 0; u < dividend; u++)
-      str[pos+u] = 'X';
-   pos += u;
-   num %= 10;
-
-   // calculates ones
-   dividend = num;
-   if (dividend > (len-1-pos))
-   {
-      return NULL;
-   };
-   if (dividend == 9)
-   {
-      str[pos+0] = 'I';
-      str[pos+1] = 'X';
-      dividend = 0;
-      pos += 2;
-   };
-   if (dividend >= 5)
-   {
-      str[pos+0] = 'V';
-      dividend -= 5;
-      pos++;
-   };
-   if (dividend == 4)
-   {
-      str[pos+0] = 'I';
-      str[pos+1] = 'V';
-      pos += 2;
-      dividend -= 4;
-   };
-   for(u = 0; u < dividend; u++)
-      str[pos+u] = 'I';
-
-   /* ends function */
-   return str;
-}
+            if (c < next_allowed)
+                return 0;
 
-/* Convert Roman numeral from string to integer */
-static int
-roman2int (const char *str)
-{
-   // declares local vars
-   int      num;
-   unsigned i;
-   unsigned len;
-   unsigned last;
-   unsigned prevlast;
-
-   // checks args
-   if (!(str))
-   {
-      return(0);
-   };
-
-   // sets initial values 
-   num  = 0;
-   len  = strlen(str);
-   last = 1000;
-   prevlast = 1000;
-
-   // loops through characters
-   for(i = 0; i < len; i++)
-   {
-      switch(str[i])
-      {
-         case 'n':
-         case 'N':
-            if (strlen(str) > 1)
+            if (c < prev)
             {
-               return(-1);
-            };
-            return(0);
-            break;
-         case 'i':
-         case 'I':
-            num  += 1;
-            // prevent patterns like IXI
-            if ((prevlast == 1) && (last != 1))
-            {
-               return(-1);
-            };
-            // prevent patterns like IIIII and VIIIII
-            if ((!(num%5)) || (!(num%10)))
-            {
-              return(-1);
-            };
-            // rotates value into history
-            prevlast   = last;
-            last  = 1;
-            break;
-         case 'v':
-         case 'V':
-            num += 5;
-            // tests for invalid syntax
-            if ( ((last <= 5) && (last != 1)) || (prevlast <= 5) )
-            {
-               return(-1);
+                /* After subtraction, no more subtracted chars allowed */
+                next_allowed = prev + 1;
             }
-            // applies subtraction method & rotates value into history
-            if (last < 5)
-               num -= (last * 2);
-            prevlast  = last;
-            last = 5;
-            break;
-         case 'x':
-         case 'X':
-            num += 10;
-            // tests for invalid syntax
-            if ( ((prevlast < 10) && (last <= 10)) || ((last < 10) && (last != 1)) )
-            {
-               return(-1);
-            };
-            // prevent patterns like XXXXX and VXXXXX
-            if ((!(num%50)) || (!(num%100)))
+            else if (c == prev)
             {
-               return(-1);
-            };
-            // applies subtraction method & rotates value into history
-            if (last == 1)
-               num -= (last * 2);
-            prevlast  = last;
-            last = 10;
-            break;
-         case 'l':
-         case 'L':
-            num += 50;
-            // tests for invalid syntax
-            if ( ((last <= 50) && (last != 10)) || (prevlast <= 50) )
-            {
-               return(-1);
+                /* Allow indefinite repetition for m; three for c, x and
+                 * i; and none for d, l and v. */
+                if ((c && ++count > 3) || (c & 1))
+                    return 0;
+                /* No more subtraction */
+                next_allowed = c;
             }
-            // applies subtraction method & rotates value into history
-            if (last < 50)
-               num -= (last * 2);
-            prevlast  = last;
-            last = 50;
-            break;
-         case 'c':
-         case 'C':
-            num += 100;
-            // tests for invalid syntax
-            if ( ((prevlast < 100) && (last <= 100)) || ((last < 100) && (last != 10)) )
-            {
-               return(-1);
-            };
-            // prevent patterns like CCCCC and VCCCCC
-            if ((!(num%500)) || (!(num%1000)))
-            {
-               return(-1);
-            };
-            // applies subtraction method & rotates value into history
-            if (last == 10)
-               num -= (last * 2);
-            prevlast  = last;
-            last = 100;
-            break;
-         case 'd':
-         case 'D':
-            num += 500;
-            // tests for invalid syntax
-            if ( ((last <= 500) && (last != 100)) || (prevlast <= 500) )
+            else if (c && !(c & 1))
             {
-               return(-1);
+                /* For first occurrence of c, x and i, allow "substraction"
+                 * from 10 and 5 times self, reset counting */
+                next_allowed = c - 2;
+                count = 1;
             }
-            // applies subtraction method & rotates value into history
-            if (last < 500)
-               num -= (last * 2);
-            prevlast  = last;
-            last = 500;
+            prev = c;
+        } else {
+            if (g_unichar_isalnum (g_utf8_get_char (i)))
+                return 0;
             break;
-         case 'm':
-         case 'M':
-            num += 1000;
-            // tests for invalid syntax
-            if ( ((prevlast < 1000) && (last <= 1000)) || ((last < 1000) && (last != 100)) )
-            {
-               return(-1);
-            };
-            // prevent patterns like MMMMM and VMMMMM
-            if ((!(num%5000)) || (!(num%10000)))
-            {
-               return(-1);
-            };
-            // applies subtraction method & rotates value into history
-            if (last == 100)
-               num -= (last * 2);
-            prevlast  = last;
-            last = 1000;
-            break;
-         default:
-            return(-1);
-      };
-   };
+        }
+    }
 
-   // ends function
-   return(num);
+    /* Return length of found roman numeral */
+    return i - text;
 }
 
 
-
 /*
  * Return the field of a 'File_Tag' structure corresponding to the mask code
  */
-- 
1.8.3.2
Follow-Ups:
- Re: [PATCH] Reimplement Roman numeral detection
  - From: David King
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]