Glib bithacks

From: Behdad Esfahbod <behdad cs toronto edu>
To: performance-list gnome org
Subject: Glib bithacks
Date: Fri, 4 Nov 2005 20:23:05 -0500 (EST)

Hi,

Attaching the patch, copying details from my blog post [1]:


I was looking into replacing the glib Unicode tables with stuff
generated with my awesome compressor (lack of self-confidence!)
when I found that the functions accessing those tables are in
fact more in need of some love.

Here is the problem: There's an enum of some 30 entries, which
are Unicode general categories, like this is a letter, this is a
digit, etc. You want to test whether a character is in one of a
few of these classes. Typically you write something like this:


#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER      \
                      || (Type) == G_UNICODE_LETTER_NUMBER     \
                      || (Type) == G_UNICODE_OTHER_NUMBER)



If you are efficiency-conscious, you may convince yourself that
gcc takes care of it. Which it doesn't.

I solved this problem in FriBidi by assigning specially built
values to my enum entries, but later found that that's overly
complex for the task at hand. And forces you into 32-bit enum
entries too, which may not be suitable. Here is part of the patch
for your visual enjoyment, of my new solution:


-#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER      \
-                      || (Type) == G_UNICODE_LETTER_NUMBER     \
-                      || (Type) == G_UNICODE_OTHER_NUMBER)
-
-#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER    \
-                      || (Type) == G_UNICODE_UPPERCASE_LETTER  \
-                      || (Type) == G_UNICODE_TITLECASE_LETTER  \
-                      || (Type) == G_UNICODE_MODIFIER_LETTER   \
-                      || (Type) == G_UNICODE_OTHER_LETTER)
-
-#define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||  \
-                     (Type) == G_UNICODE_COMBINING_MARK ||     \
-                     (Type) == G_UNICODE_ENCLOSING_MARK)
-
+#define IS(Type, Class)        (((guint)1 << (Type)) & (Class) ? 1 : 0)
+#define OR(Type, Rest) (((guint)1 << (Type)) | (Rest))
+
+
+
+#define ISDIGIT(Type)  IS((Type),                              \
+                          OR(G_UNICODE_DECIMAL_NUMBER,         \
+                          OR(G_UNICODE_LETTER_NUMBER,          \
+                          OR(G_UNICODE_OTHER_NUMBER,           0))))
+
+#define ISALPHA(Type)  IS((Type),                              \
+                          OR(G_UNICODE_LOWERCASE_LETTER,       \
+                          OR(G_UNICODE_UPPERCASE_LETTER,       \
+                          OR(G_UNICODE_TITLECASE_LETTER,       \
+                          OR(G_UNICODE_MODIFIER_LETTER,        \
+                          OR(G_UNICODE_OTHER_LETTER,           0))))))
+
+#define ISMARK(Type)   IS((Type),                              \
+                          OR(G_UNICODE_NON_SPACING_MARK,       \
+                          OR(G_UNICODE_COMBINING_MARK,         \
+                          OR(G_UNICODE_ENCLOSING_MARK,         0))))



Yes, good old Pascal-like bit-sets! The real patch is much
longer. As a side benefit, the macros only expand Type once, so
you don't need to allocate an intermediate variable for it. How's
that?



--behdad
http://behdad.org/

[1] http://mces.blogspot.com/2005/11/glib-bithacks.html

"Commandment Three says Do Not Kill, Amendment Two says Blood Will Spill"
	-- Dan Bern, "New American Language"

Index: glib/guniprop.c
===================================================================
RCS file: /cvs/gnome/glib/glib/guniprop.c,v
retrieving revision 1.37
diff -u -p -r1.37 guniprop.c
--- glib/guniprop.c	14 Mar 2005 04:26:57 -0000	1.37
+++ glib/guniprop.c	4 Nov 2005 20:36:41 -0000
@@ -55,20 +56,37 @@
       : G_UNICODE_UNASSIGNED))
 
 
-#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER	\
-		       || (Type) == G_UNICODE_LETTER_NUMBER	\
-		       || (Type) == G_UNICODE_OTHER_NUMBER)
-
-#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER	\
-		       || (Type) == G_UNICODE_UPPERCASE_LETTER	\
-		       || (Type) == G_UNICODE_TITLECASE_LETTER	\
-		       || (Type) == G_UNICODE_MODIFIER_LETTER	\
-		       || (Type) == G_UNICODE_OTHER_LETTER)
-
-#define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||	\
-		      (Type) == G_UNICODE_COMBINING_MARK ||	\
-		      (Type) == G_UNICODE_ENCLOSING_MARK)
-		      
+#define IS(Type, Class)	(((guint)1 << (Type)) & (Class) ? 1 : 0)
+#define OR(Type, Rest)	(((guint)1 << (Type)) | (Rest))
+
+
+
+#define ISDIGIT(Type)	IS((Type),				\
+			   OR(G_UNICODE_DECIMAL_NUMBER,		\
+			   OR(G_UNICODE_LETTER_NUMBER,		\
+			   OR(G_UNICODE_OTHER_NUMBER,		0))))
+
+#define ISALPHA(Type)	IS((Type),				\
+			   OR(G_UNICODE_LOWERCASE_LETTER,	\
+			   OR(G_UNICODE_UPPERCASE_LETTER,	\
+			   OR(G_UNICODE_TITLECASE_LETTER,	\
+			   OR(G_UNICODE_MODIFIER_LETTER,	\
+			   OR(G_UNICODE_OTHER_LETTER,		0))))))
+
+#define ISALDIGIT(Type)	IS((Type),				\
+			   OR(G_UNICODE_DECIMAL_NUMBER,		\
+			   OR(G_UNICODE_LETTER_NUMBER,		\
+			   OR(G_UNICODE_OTHER_NUMBER,		\
+			   OR(G_UNICODE_LOWERCASE_LETTER,	\
+			   OR(G_UNICODE_UPPERCASE_LETTER,	\
+			   OR(G_UNICODE_TITLECASE_LETTER,	\
+			   OR(G_UNICODE_MODIFIER_LETTER,	\
+			   OR(G_UNICODE_OTHER_LETTER,		0)))))))))
+
+#define ISMARK(Type)	IS((Type),				\
+			   OR(G_UNICODE_NON_SPACING_MARK,	\
+			   OR(G_UNICODE_COMBINING_MARK,		\
+			   OR(G_UNICODE_ENCLOSING_MARK,		0))))
 
 /**
  * g_unichar_isalnum:
@@ -83,8 +101,7 @@
 gboolean
 g_unichar_isalnum (gunichar c)
 {
-  int t = TYPE (c);
-  return ISDIGIT (t) || ISALPHA (t);
+  return ISALDIGIT (TYPE (c));
 }
 
 /**
@@ -100,8 +117,7 @@ g_unichar_isalnum (gunichar c)
 gboolean
 g_unichar_isalpha (gunichar c)
 {
-  int t = TYPE (c);
-  return ISALPHA (t);
+  return ISALPHA (TYPE (c));
 }
 
 
@@ -153,13 +169,14 @@ g_unichar_isdigit (gunichar c)
 gboolean
 g_unichar_isgraph (gunichar c)
 {
-  int t = TYPE (c);
-  return (t != G_UNICODE_CONTROL
-	  && t != G_UNICODE_FORMAT
-	  && t != G_UNICODE_UNASSIGNED
-	  && t != G_UNICODE_PRIVATE_USE
-	  && t != G_UNICODE_SURROGATE
-	  && t != G_UNICODE_SPACE_SEPARATOR);
+  return !IS(TYPE(c),
+	     OR(G_UNICODE_CONTROL,
+	     OR(G_UNICODE_FORMAT,
+	     OR(G_UNICODE_UNASSIGNED,
+	     OR(G_UNICODE_PRIVATE_USE,
+	     OR(G_UNICODE_SURROGATE,
+	     OR(G_UNICODE_SPACE_SEPARATOR,
+	     0)))))));
 }
 
 /**
@@ -193,12 +210,13 @@ g_unichar_islower (gunichar c)
 gboolean
 g_unichar_isprint (gunichar c)
 {
-  int t = TYPE (c);
-  return (t != G_UNICODE_CONTROL
-	  && t != G_UNICODE_FORMAT
-	  && t != G_UNICODE_UNASSIGNED
-	  && t != G_UNICODE_PRIVATE_USE
-	  && t != G_UNICODE_SURROGATE);
+  return !IS(TYPE(c),
+	     OR(G_UNICODE_CONTROL,
+	     OR(G_UNICODE_FORMAT,
+	     OR(G_UNICODE_UNASSIGNED,
+	     OR(G_UNICODE_PRIVATE_USE,
+	     OR(G_UNICODE_SURROGATE,
+	     0))))));
 }
 
 /**
@@ -214,13 +232,19 @@ g_unichar_isprint (gunichar c)
 gboolean
 g_unichar_ispunct (gunichar c)
 {
-  int t = TYPE (c);
-  return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
-	  || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
-	  || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
-	  || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
-	  || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
-	  || t == G_UNICODE_OTHER_SYMBOL);
+  return IS(TYPE(c),
+	    OR(G_UNICODE_CONNECT_PUNCTUATION,
+	    OR(G_UNICODE_DASH_PUNCTUATION,
+	    OR(G_UNICODE_CLOSE_PUNCTUATION,
+	    OR(G_UNICODE_FINAL_PUNCTUATION,
+	    OR(G_UNICODE_INITIAL_PUNCTUATION,
+	    OR(G_UNICODE_OTHER_PUNCTUATION,
+	    OR(G_UNICODE_OPEN_PUNCTUATION,
+	    OR(G_UNICODE_CURRENCY_SYMBOL,
+	    OR(G_UNICODE_MODIFIER_SYMBOL,
+	    OR(G_UNICODE_MATH_SYMBOL,
+	    OR(G_UNICODE_OTHER_SYMBOL,
+	    0))))))))))));
 }
 
 /**
@@ -235,7 +259,7 @@ g_unichar_ispunct (gunichar c)
  * Pango or equivalent to get word breaking right, the algorithm
  * is fairly complex.)
  *  
- * Return value: %TRUE if @c is a punctuation character
+ * Return value: %TRUE if @c is a space character
  **/
 gboolean
 g_unichar_isspace (gunichar c)
@@ -252,9 +276,11 @@ g_unichar_isspace (gunichar c)
       
     default:
       {
-        int t = TYPE (c);
-        return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
-                || t == G_UNICODE_PARAGRAPH_SEPARATOR);
+	return IS(TYPE(c),
+	          OR(G_UNICODE_SPACE_SEPARATOR,
+	          OR(G_UNICODE_LINE_SEPARATOR,
+                  OR(G_UNICODE_PARAGRAPH_SEPARATOR,
+		  0))));
       }
       break;
     }
@@ -308,10 +334,9 @@ g_unichar_istitle (gunichar c)
 gboolean
 g_unichar_isxdigit (gunichar c)
 {
-  int t = TYPE (c);
   return ((c >= 'a' && c <= 'f')
 	  || (c >= 'A' && c <= 'F')
-	  || ISDIGIT (t));
+	  || ISDIGIT (TYPE (c)));
 }
 
 /**
@@ -326,8 +351,7 @@ g_unichar_isxdigit (gunichar c)
 gboolean
 g_unichar_isdefined (gunichar c)
 {
-  int t = TYPE (c);
-  return t != G_UNICODE_UNASSIGNED;
+  return TYPE (c) != G_UNICODE_UNASSIGNED;
 }
 
 /**
@@ -566,9 +590,8 @@ output_marks (const char **p_inout,
   while (*p)
     {
       gunichar c = g_utf8_get_char (p);
-      int t = TYPE(c);
       
-      if (ISMARK(t))
+      if (ISMARK (TYPE (c)))
 	{
 	  if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 	    len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
@@ -652,7 +675,7 @@ real_toupper (const gchar *str,
 		  continue;
 		}
 
-	      if (!ISMARK(t))
+	      if (!ISMARK (t))
 		last_was_i = FALSE;
 	    }
 	}
@@ -672,7 +695,10 @@ real_toupper (const gchar *str,
 	  /* And output as GREEK CAPITAL LETTER IOTA */
 	  len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); 	  
 	}
-      else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
+      else if (IS(t,
+		  OR(G_UNICODE_LOWERCASE_LETTER,
+		  OR(G_UNICODE_TITLECASE_LETTER,
+		  0))))
 	{
 	  val = ATTTABLE (c >> 8, c & 0xff);
 
@@ -844,7 +870,7 @@ real_tolower (const gchar *str,
 	       * sigma, but I don't think that occurs in real text.
 	       * The test here matches that in ICU.
 	       */
-	      if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
+	      if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */
 		val = 0x3c3;	/* GREEK SMALL SIGMA */
 	      else
 		val = 0x3c2;	/* GREEK SMALL FINAL SIGMA */
@@ -854,7 +880,10 @@ real_tolower (const gchar *str,
 
 	  len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 	}
-      else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
+      else if (IS(t,
+		  OR(G_UNICODE_UPPERCASE_LETTER,
+		  OR(G_UNICODE_TITLECASE_LETTER,
+		  0))))
 	{
 	  val = ATTTABLE (c >> 8, c & 0xff);
 
@@ -997,7 +1026,7 @@ g_utf8_casefold (const gchar *str,
 
 /**
  * g_unichar_get_mirror_char:
- * @ch: a unicode character
+ * @ch: a Unicode character
  * @mirrored_ch: location to store the mirrored character
  * 
  * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
Index: glib/gutf8.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gutf8.c,v
retrieving revision 1.45
diff -u -p -r1.45 gutf8.c
--- glib/gutf8.c	30 Oct 2005 03:05:21 -0000	1.45
+++ glib/gutf8.c	4 Nov 2005 20:36:41 -0000
@@ -523,7 +523,7 @@ g_get_charset (G_CONST_RETURN char **cha
 
 /**
  * g_unichar_to_utf8:
- * @c: a ISO10646 character code
+ * @c: a Unicode character code
  * @outbuf: output buffer, must have at least 6 bytes of space.
  *       If %NULL, the length will be computed and returned
  *       and nothing will be written to @outbuf.
@@ -589,9 +589,9 @@ g_unichar_to_utf8 (gunichar c,
  * g_utf8_strchr:
  * @p: a nul-terminated UTF-8 encoded string
  * @len: the maximum length of @p
- * @c: a ISO10646 character
+ * @c: a Unicode character
  * 
- * Finds the leftmost occurrence of the given ISO10646 character
+ * Finds the leftmost occurrence of the given Unicode character
  * in a UTF-8 encoded string, while limiting the search to @len bytes.
  * If @len is -1, allow unbounded search.
  * 
@@ -617,9 +617,9 @@ g_utf8_strchr (const char *p,
  * g_utf8_strrchr:
  * @p: a nul-terminated UTF-8 encoded string
  * @len: the maximum length of @p
- * @c: a ISO10646 character
+ * @c: a Unicode character
  * 
- * Find the rightmost occurrence of the given ISO10646 character
+ * Find the rightmost occurrence of the given Unicode character
  * in a UTF-8 encoded string, while limiting the search to @len bytes.
  * If @len is -1, allow unbounded search.
  *

Follow-Ups:
- Re: Glib bithacks
  - From: Federico Mena Quintero

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]