g_utf8_offset_to_pointer benchmark

From: Behdad Esfahbod <behdad cs toronto edu>
To: performance-list gnome org
Subject: g_utf8_offset_to_pointer benchmark
Date: Thu, 3 Nov 2005 11:57:08 -0500 (EST)

Hi,

I'm attaching my benchmark, which includes pvanhoof's, luis's, hpj's, and
my own implementations. To compile, pass -DWHICH=<name> (for example
-DWHICH=behdad) to choose the implementation.

I've posted my analysis here:

  http://mces.blogspot.com/2005/11/false-alarm-on-gutf8offsettopointer.html


I suggest we close this discussion, leave the current
implementation in place, and go optimize its users instead.

Cheers,

--behdad
http://behdad.org/

"Commandment Three says Do Not Kill, Amendment Two says Blood Will Spill"
	-- Dan Bern, "New American Language"

/*
 * Copyright (C) 2005 Federico Mena-Quintero federico novell com
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/times.h>
#include <glib.h>

#define ALL_LANGUAGES "ALL"
#define DEFAULT_DATA_DIR "po-data"
#define DEFAULT_BENCHMARK_NAME "glib benchmark"
#define DEFAULT_NUM_ITERATIONS 1000

/*
 * Write a printf-style message to stderr, follow it with a newline, and
 * terminate the process with exit status 1.  This function does not return.
 *
 * msg: printf-style format string; the remaining arguments are its operands.
 */
static void
error_and_exit (const char *msg, ...)
{
	va_list ap;

	va_start (ap, msg);
	vfprintf (stderr, msg, ap);
	va_end (ap);

	fputc ('\n', stderr);
	exit (1);
}

/* Remembers the process's user-CPU tick count at the moment a timer was created. */
typedef struct {
	clock_t start_utime;	/* tms_utime value sampled by user_timer_new() */
} UserTimer;

/*
 * Allocate a UserTimer and stamp it with the user-CPU time (tms_utime)
 * reported by times() at the moment of the call.
 *
 * Returns a newly allocated timer; release it with user_timer_destroy().
 */
UserTimer *
user_timer_new (void)
{
	struct tms now;
	UserTimer *timer = g_new0 (UserTimer, 1);

	times (&now);
	timer->start_utime = now.tms_utime;

	return timer;
}

/*
 * Report how much user-CPU time has been consumed since UTIMER was created.
 *
 * The tick difference reported by times() is divided by the value of
 * sysconf (_SC_CLK_TCK), which is looked up once and cached in a static.
 *
 * utimer: timer obtained from user_timer_new().
 * Returns the elapsed user time as a double (ticks / ticks-per-second).
 */
double
user_timer_elapsed (UserTimer *utimer)
{
	static long ticks_per_second;
	struct tms now;
	clock_t delta;

	if (!ticks_per_second)
		ticks_per_second = sysconf (_SC_CLK_TCK);

	times (&now);
	delta = now.tms_utime - utimer->start_utime;

	return (double) delta / ticks_per_second;
}

/* Release a timer that was allocated by user_timer_new(). */
void
user_timer_destroy (UserTimer *utimer)
{
	g_free (utimer);
}

/* One NUL-terminated string from a data file, plus facts computed about it. */
typedef struct {
	char *str;		/* start of the string; points into StringSet.strings_raw */
	int num_chars;		/* number of UTF-8 characters in str */
	int num_bytes;		/* number of bytes in str, not counting the NUL */
	gboolean valid;		/* TRUE if the string should be benchmarked: it is valid
				 * UTF-8 and does not contain "POT-Creation" */
} String;

/* All strings loaded from one data file. */
typedef struct {
	gsize num_strings;	/* number of entries in use in strings[] */
	char *strings_raw;	/* whole file contents; every String.str points in here */
	String *strings;	/* array of per-string descriptors */
} StringSet;

/* Totals produced by measure_strings() for one data file. */
typedef struct {
	double elapsed;		/* user-CPU time reported by user_timer_elapsed() */
	long total_strings;	/* valid strings processed, summed over all iterations */
	long total_chars;	/* characters in those strings, summed over all iterations */
} LanguageResults;

static StringSet *
string_set_read (const char *filename)
{
	GError *error;
	gsize length;
	char *end;
	char *p, *string_start;
	gsize max_strings;
	char *strings_raw;
	gsize num_strings;
	String *strings;
	StringSet *set;

	error = NULL;
	if (!g_file_get_contents (filename, &strings_raw, &length, &error))
		error_and_exit ("Could not read the strings file %s: %s", filename, error->message);

	max_strings = 1024;
	num_strings = 0;
	strings = g_new (String, max_strings);

	string_start = strings_raw;
	end = strings_raw + length;

	for (p = strings_raw; p < end; p++)
		if (*p == 0) {
			if (num_strings == max_strings) {
				max_strings = max_strings * 2;
				strings = g_renew (String, strings, max_strings);
			}

			strings[num_strings].str = string_start;
			strings[num_strings].num_chars = g_utf8_strlen (string_start, -1);
			strings[num_strings].num_bytes = strlen (string_start);
			strings[num_strings].valid = (strstr (string_start, "POT-Creation") == 0
						      && g_utf8_validate (string_start, -1, NULL));
			string_start = p + 1;
			num_strings++;
		}

	set = g_new (StringSet, 1);
	set->num_strings = num_strings;
	set->strings_raw = strings_raw;
	set->strings = strings;

	return set;
}

/*
 * Release a StringSet returned by string_set_read(): the descriptor array,
 * the raw file buffer the descriptors point into, and the set itself.
 */
static void
string_set_free (StringSet *set)
{
	g_free (set->strings);
	g_free (set->strings_raw);
	g_free (set);
}


/* PUT IMPLEMENTATIONS HERE */

/*
 * Skip table indexed by the value of a byte: the number of bytes to advance
 * to get past the UTF-8 sequence that starts with that byte.
 *
 *   0x00-0xBF -> 1   (this includes the continuation bytes 0x80-0xBF)
 *   0xC0-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFB -> 5
 *   0xFC-0xFD -> 6
 *   0xFE-0xFF -> 1
 */
static const gchar utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/* Public name for the skip table above. */
const gchar * const g_utf8_skip = utf8_skip_data;
/* Advance P by the skip-table entry for the byte it points at.  P is
 * evaluated twice, so it must not have side effects. */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
/*
 * Baseline implementation: step forward one character at a time, using the
 * skip table, until OFFSET characters have been passed.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar *
glib_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const gchar *cursor;

  for (cursor = str; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *) cursor;
}

/*
 * Same walk as the baseline, but with the loop unrolled by hand: characters
 * are consumed four per iteration while at least four remain, and the
 * leftover (fewer than four) are consumed one at a time.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar *
glib_unwrapped_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const gchar *cursor = str;

  for (; offset >= 4; offset -= 4)
    {
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
    }

  for (; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *) cursor;
}

/*
 * Variant that avoids the skip-table lookup for bytes below 192: such a
 * byte is stepped over with a plain increment, and only bytes of 192 or
 * above go through the table.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar *
pvanhoof_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const gchar *cursor = str;

  for (; offset != 0; offset--)
    {
      if (*(guchar *) cursor >= 192)
        cursor = g_utf8_next_char (cursor);
      else
        cursor++;
    }

  return (gchar *) cursor;
}

/*
 * Hand-unrolled form of the "increment for bytes below 192, table lookup
 * otherwise" walk: four characters per iteration while at least four
 * remain, then the leftover one at a time.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar *
pvanhoof_unwrapped_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const guchar *bytes = (const guchar *) str;

  for (; offset >= 4; offset -= 4)
    {
      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;
    }

  for (; offset != 0; offset--)
    {
      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;
    }

  return (gchar *) bytes;
}

/* Unsigned integer type used to examine several string bytes at once. */
#define WORDTYPE guint
/* Build a WORDTYPE whose every byte equals X: the all-ones word divided by
 * 0xFF is 0x01 repeated in each byte, and multiplying by X replicates X. */
#define REPEAT(x) (((WORDTYPE)-1 / 0xFF) * (x))

/*
 * Unrolled walk with a word-at-a-time shortcut.  While at least four
 * characters remain, the next four bytes are read as one 32-bit word; if
 * none of them has either of its two top bits set, all four are skipped in
 * one step.  Otherwise the four characters are consumed individually
 * (increment for bytes below 192, skip table for the rest).
 *
 * NOTE(review): the mask is REPEAT(0xC0), so bytes in 0x40-0x7F also take
 * the slow path; REPEAT(0x80) may have been intended -- confirm with the
 * author before changing, since it affects what is being benchmarked.
 * NOTE(review): the 32-bit read goes through a cast of a byte pointer, so
 * it relies on the platform tolerating unaligned word loads.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar *
pvanhoof_unwrapped2_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const guchar *bytes = (const guchar *) str;

  for (; offset >= 4; offset -= 4)
    {
      if ((*(guint32 *) bytes & REPEAT(0xC0)) == 0)
        {
          bytes += 4;
          continue;
        }

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;

      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;
    }

  for (; offset != 0; offset--)
    {
      if (*bytes >= 192)
        bytes = (const guchar *) g_utf8_next_char (bytes);
      else
        bytes++;
    }

  return (gchar *) bytes;
}

/*
 * Word-at-a-time implementation.
 *
 * While at least sizeof (WORDTYPE) characters remain, a whole word of bytes
 * is consumed at once.  OFFSET is first reduced by the number of bytes in
 * the word and then credited back with the number of continuation bytes
 * (bit pattern 10xxxxxx) found in it, so the net reduction equals the
 * number of non-continuation bytes that were passed.
 *
 * After the word loop the pointer may sit in the middle of a character, so
 * any continuation bytes are skipped before the remaining characters are
 * walked with the skip table.
 *
 * NOTE(review): the word load goes through a cast of the string pointer and
 * therefore relies on the platform tolerating unaligned word reads.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar*
behdad_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
  const WORDTYPE *ws;
  const gchar *s;

  ws = (const WORDTYPE *)str;
  /* Compare as signed values: OFFSET is a glong and sizeof yields an
   * unsigned type. */
  while (offset >= (glong) sizeof (WORDTYPE)) {
    register WORDTYPE w = *ws++;
    offset -= (glong) sizeof (WORDTYPE);
    /* Bit 7 of each byte survives only when bit 6 of that byte is clear,
     * i.e. when the byte is 10xxxxxx. */
    w &= ~(w << 1);
    w &= REPEAT(0x80);
    /* Move each surviving flag to bit 0 of its byte... */
    w >>=7;
    /* ...and add the per-byte flags together into the top byte. */
    w *= REPEAT(1);
    w >>= (sizeof (WORDTYPE) - 1) * 8;
    offset += w;
  }

  s = (const gchar *)ws;
  /* Finish the character the word loop may have stopped inside. */
  while ((*(const guchar*)s)>>6==2)
    s++;
  while (offset--)
    s = g_utf8_next_char (s);

  return (gchar *)s;
}

/*
 * Table-free implementation: advance one byte at a time and count a
 * character as passed each time the byte just reached is not a
 * continuation byte (its top two bits are not 10).
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar*
luis_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
	while (offset != 0) {
		guchar reached = (guchar) *++str;

		if ((reached >> 6) != 0x02)
			offset--;
	}

	return (gchar *) str;
}

/*
 * Hand-unrolled form of the byte-at-a-time walk: while at least four
 * characters remain, four bytes are examined per iteration; the rest are
 * examined one byte at a time.  A character is counted as passed whenever
 * the byte just reached is not a continuation byte (10xxxxxx).
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar*
luis_unwrapped_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
	while (offset >= 4) {
		if (((guchar) *++str & 0xC0) != 0x80)
			offset--;
		if (((guchar) *++str & 0xC0) != 0x80)
			offset--;
		if (((guchar) *++str & 0xC0) != 0x80)
			offset--;
		if (((guchar) *++str & 0xC0) != 0x80)
			offset--;
	}

	while (offset != 0) {
		if (((guchar) *++str & 0xC0) != 0x80)
			offset--;
	}

	return (gchar *) str;
}

/*
 * Implementation that classifies four bytes at a time with bit masks and
 * falls back to the skip table when no pattern matches.
 *
 * NOTE(review): the masks and the `(guchar) seg` below treat the low-order
 * byte of the loaded word as the first byte in memory, i.e. they appear to
 * assume a little-endian load -- confirm before using elsewhere.  The word
 * load also reads through a pointer that is not necessarily word-aligned.
 *
 * str:    start of the string.
 * offset: number of characters to advance.
 * Returns a pointer to the position OFFSET characters after STR.
 */
gchar*
hpj_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
  /* The same position viewed either as a 32-bit word pointer or as a byte
   * pointer. */
  union
  {
    const guint32 *p32;
    const gchar   *p8;
  }
  pt;
 
  pt.p8 = str;
 
  while (offset >= 4)
    {
      guint32 seg    = *pt.p32;
      /* Top bit of each of the four bytes. */
      guint32 seg_hi = seg & 0x80808080;
 
      if (!seg_hi)
        {
          /* No byte has its top bit set: four single-byte characters. */
          pt.p32++;
          offset -= 4;
          continue;
        }
      else if G_LIKELY (seg_hi == 0x80808080)
        {
          /* Every byte has its top bit set. */
          if ((seg & 0x80e080e0) == 0x80c080c0)
 	    {
              /* Bytes 0 and 2 are 110xxxxx: two two-byte sequences. */
              pt.p32++;
              offset -= 2;
              continue;
            }
 
          if ((seg & 0xf08080f0) == 0xe08080e0)
            {
              /* Bytes 0 and 3 are both 1110xxxx: one three-byte sequence
               * followed by the start of another.  Six bytes are skipped
               * as two characters; the two bytes beyond this word are not
               * inspected. */
              pt.p8 += 6;
              offset -= 2;
              continue;
            }
        }
 
      /* No pattern matched: advance a single character using the skip
       * table entry for the low-order byte of the word. */
      pt.p8 += g_utf8_skip [(guchar) seg];
      offset--;
    }
 
  /* Fewer than four characters left: walk them one at a time. */
  for ( ; offset; offset--)
    pt.p8 = g_utf8_next_char (pt.p8);
 
  return (gchar *) pt.p8;
}



/****************************/

/* Token pasting in two steps, so that macro arguments (such as WHICH) are
 * expanded before being joined. */
#define JOIN_(a,b) a##b
#define JOIN(a,b) JOIN_(a,b)

/*
 * Run the implementation selected at compile time (WHICH) over every valid
 * string in SET, NUM_ITERS times, and record the totals in RESULTS.
 *
 * Each call asks for the position num_chars characters into the string,
 * i.e. the position of its terminating NUL, and the answer is checked
 * against str + num_bytes.  A wrong answer prints an error and terminates
 * the process.
 *
 * set:       strings to process.
 * results:   receives the elapsed user-CPU time and the number of strings
 *            and characters processed (summed over all iterations).
 * num_iters: number of passes to make over the whole set.
 */
static void
measure_strings (StringSet *set, LanguageResults *results, int num_iters)
{
	int i;
	gsize j;
	UserTimer *utimer;
	gchar*  (*utf8_offset_to_pointer) (const gchar *str, glong        offset);

	/* choose implementation */
	utf8_offset_to_pointer = JOIN(WHICH,_utf8_offset_to_pointer);

	utimer = user_timer_new ();

	results->elapsed = 0.0;
	results->total_strings = 0;
	results->total_chars = 0;

	for (i = 0; i < num_iters; i++)
		for (j = 0; j < set->num_strings; j++) {
			const String *entry = &set->strings[j];
			gchar *expected;
			gchar *actual;

			if (!entry->valid)
				continue;

			expected = entry->str + entry->num_bytes;
			actual = utf8_offset_to_pointer (entry->str, entry->num_chars);

			/* Pointer differences are not int-sized everywhere, so
			 * convert explicitly to match the format. */
			if (actual != expected)
				error_and_exit ("ERROR: expected %ld, got %ld\n",
						(long) (expected - entry->str),
						(long) (actual - entry->str));

			results->total_strings++;
			results->total_chars += entry->num_chars;
		}

	results->elapsed = user_timer_elapsed (utimer);
	user_timer_destroy (utimer);
}

/* Values filled in from the command line; see option_entries. */
static char **option_langs;	/* --lang values; stays NULL when none were given */
static char *option_data_dir = DEFAULT_DATA_DIR;	/* --data-dir */
static char *option_name = DEFAULT_BENCHMARK_NAME;	/* --name */
static char *option_output;	/* --output; NULL means standard output */
/* Number of passes over each data file.  No entry in option_entries refers
 * to this variable, so it keeps its default. */
static int option_num_iterations = DEFAULT_NUM_ITERATIONS;

/* Stream the XML report is written to; set up in main(). */
static FILE *output_file;

/* Command-line options, each bound to one of the option_* variables.  The
 * table ends with an all-NULL/zero entry. */
static GOptionEntry option_entries[] = {
	{ "lang", 'l', 0, G_OPTION_ARG_STRING_ARRAY, &option_langs,
	  "Specify language name (e.g. \"es\" for Spanish), or \"" ALL_LANGUAGES "\"", "string" },
	{ "data-dir", 'd', 0, G_OPTION_ARG_FILENAME, &option_data_dir,
	  "Directory where .dat files live", "dirname" },
	{ "name", 'n', 0, G_OPTION_ARG_STRING, &option_name,
	  "Name for benchmark", "string" },
	{ "output", 'o', 0, G_OPTION_ARG_FILENAME, &option_output,
	  "Output filename.  If not specified, standard output will be used.", "filename" },
	{ NULL, 0, 0, 0, NULL, NULL, NULL }
};

/*
 * Benchmark the strings stored in FILENAME and append one <language>
 * element with the totals to the report.  A progress line is written to
 * stderr first.
 *
 * lang_name: text placed in the <name> element.
 * filename:  path of the data file to load.
 */
static void
run_one_language (const char *lang_name, const char *filename)
{
	LanguageResults totals;
	StringSet *strings;

	fprintf (stderr, "Processing %s\n", filename);

	strings = string_set_read (filename);
	measure_strings (strings, &totals, option_num_iterations);
	string_set_free (strings);

	fprintf (output_file,
		 "  <language>\n"
		 "    <name>%s</name>\n"
		 "    <elapsed>%f</elapsed>\n"
		 "    <total_strings>%ld</total_strings>\n"
		 "    <total_chars>%ld</total_chars>\n"
		 "  </language>\n",
		 lang_name,
		 totals.elapsed,
		 totals.total_strings,
		 totals.total_chars);
}

static void
run_all_languages (void)
{
	GDir *dir;
	GError *error;
	const char *entry;

	error = NULL;
	dir = g_dir_open (option_data_dir, 0, &error);
	if (!dir)
		error_and_exit ("Could not open directory: %s", error->message);

	while ((entry = g_dir_read_name (dir)) != NULL) {
		if (g_str_has_suffix (entry, ".dat")) {
			char *lang_name;
			char *filename;

			lang_name = g_strndup (entry, strlen (entry) - 4);
			filename = g_build_filename (option_data_dir, entry, NULL);

			run_one_language (lang_name, filename);

			g_free (lang_name);
			g_free (filename);
		}
	}

	g_dir_close (dir);
}

static void
run_some_languages (void)
{
	char **langs;

	for (langs = option_langs; *langs; langs++) {
		char *raw_filename;
		char *filename;

		raw_filename = g_strconcat (*langs, ".dat", NULL);
		filename = g_build_filename (option_data_dir, raw_filename, NULL);
		g_free (raw_filename);

		run_one_language (*langs, filename);
		g_free (filename);
	}
}

/*
 * Tell whether the NULL-terminated array LANGS contains the entry "ALL".
 *
 * langs: NULL-terminated array of language names.
 * Returns TRUE if any entry is exactly "ALL", FALSE otherwise.
 */
static gboolean
have_all_languages (char **langs)
{
	char **cursor = langs;

	while (*cursor != NULL) {
		if (!strcmp (*cursor, "ALL"))
			return TRUE;
		cursor++;
	}

	return FALSE;
}

int
main (int argc, char **argv)
{
	GOptionContext *option_ctx;
	
	option_ctx = g_option_context_new ("Options");
	g_option_context_add_main_entries (option_ctx, option_entries, NULL);
	if (!g_option_context_parse (option_ctx, &argc, &argv, NULL)) {
		fprintf (stderr, "Invalid usage; type \"%s --help\" for instructions.\n", argv[0]);
		exit (EXIT_FAILURE);
	}

	if (option_output) {
		output_file = fopen (option_output, "w");
		if (!output_file)
			error_and_exit ("Could not create output file %s", option_output);
	} else
		output_file = stdout;

	fputs ("<?xml version=\"1.0\"?>\n", output_file);
	fputs ("<pango-benchmark>\n", output_file);
	fprintf (output_file, "  <name>%s</name>\n", option_name);

	if (option_langs == NULL || have_all_languages (option_langs))
		run_all_languages ();
	else
		run_some_languages ();

	fputs ("</pango-benchmark>\n", output_file);

	if (output_file != stdout)
		fclose (output_file);

	return 0;
}

Follow-Ups:
- Re: g_utf8_offset_to_pointer benchmark
  - From: Morten Welinder

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]