g_utf8_offset_to_pointer benchmark
- From: Behdad Esfahbod <behdad cs toronto edu>
- To: performance-list gnome org
- Subject: g_utf8_offset_to_pointer benchmark
- Date: Thu, 3 Nov 2005 11:57:08 -0500 (EST)
Hi,
I'm attaching my benchmark, which includes pvanhoof's, luis's, hpj's, and
my own implementations. To compile, pass -DWHICH=behdad (or the name of
another implementation) to choose which implementation is measured.
I've posted my analysis here:
http://mces.blogspot.com/2005/11/false-alarm-on-gutf8offsettopointer.html
I suggest we close this discussion, leave the current
implementation as it is, and go optimize its callers instead.
Cheers,
--behdad
http://behdad.org/
"Commandment Three says Do Not Kill, Amendment Two says Blood Will Spill"
-- Dan Bern, "New American Language"
/*
* Copyright (C) 2005 Federico Mena-Quintero federico novell com
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/times.h>
#include <glib.h>
/* Pseudo-language name; its text is spliced into the --lang help string. */
#define ALL_LANGUAGES "ALL"
/* Initial value of option_data_dir (target of the --data-dir option). */
#define DEFAULT_DATA_DIR "po-data"
/* Initial value of option_name (target of the --name option). */
#define DEFAULT_BENCHMARK_NAME "glib benchmark"
/* Initial value of option_num_iterations. */
#define DEFAULT_NUM_ITERATIONS 1000
/*
 * Report a fatal error and terminate.
 *
 * msg is a printf-style format string consumed together with the variadic
 * arguments.  The formatted text and a trailing newline are written to
 * stderr, then the process exits with status 1.  This function does not
 * return.
 */
static void
error_and_exit (const char *msg, ...)
{
  va_list ap;

  va_start (ap, msg);
  vfprintf (stderr, msg, ap);
  va_end (ap);

  fputc ('\n', stderr);
  exit (1);
}
/* Stopwatch over the process's user CPU time as reported by times(). */
typedef struct {
clock_t start_utime; /* tms_utime sampled when the timer was created */
} UserTimer;
/*
 * Allocate a timer and remember the user CPU time (tms_utime from times())
 * consumed by the process so far.
 *
 * Returns a newly allocated UserTimer; release it with user_timer_destroy().
 */
UserTimer *
user_timer_new (void)
{
  struct tms now;
  UserTimer *timer = g_new0 (UserTimer, 1);

  times (&now);
  timer->start_utime = now.tms_utime;

  return timer;
}
/*
 * Return the user CPU time consumed since utimer was created, computed as
 * the tms_utime difference divided by sysconf(_SC_CLK_TCK).
 *
 * The clock-tick rate is queried on the first call and cached in a
 * function-local static for later calls.
 */
double
user_timer_elapsed (UserTimer *utimer)
{
  static long ticks_per_second;
  struct tms now;
  clock_t used;

  if (ticks_per_second == 0)
    ticks_per_second = sysconf (_SC_CLK_TCK);

  times (&now);
  used = now.tms_utime - utimer->start_utime;

  return (double) used / ticks_per_second;
}
/* Release a timer obtained from user_timer_new(). */
void
user_timer_destroy (UserTimer *utimer)
{
g_free (utimer);
}
/* One string from a data file together with measurements cached at load time. */
typedef struct {
char *str; /* start of the string; string_set_read() points it into the raw file buffer */
int num_chars; /* UTF-8 character count; measure_strings() reads it only when valid is set */
int num_bytes; /* length in bytes, not counting the terminating NUL */
gboolean valid; /* whether measure_strings() should benchmark this string */
} String;
/* All strings loaded from one data file. */
typedef struct {
gsize num_strings; /* number of entries used in strings[] */
char *strings_raw; /* file contents; released by string_set_free() */
String *strings; /* per-string records; released by string_set_free() */
} StringSet;
/* Totals filled in by measure_strings() for one data file. */
typedef struct {
double elapsed; /* value returned by user_timer_elapsed() for the whole run */
long total_strings; /* strings processed, summed over all iterations */
long total_chars; /* num_chars of the processed strings, summed over all iterations */
} LanguageResults;
static StringSet *
string_set_read (const char *filename)
{
GError *error;
gsize length;
char *end;
char *p, *string_start;
gsize max_strings;
char *strings_raw;
gsize num_strings;
String *strings;
StringSet *set;
error = NULL;
if (!g_file_get_contents (filename, &strings_raw, &length, &error))
error_and_exit ("Could not read the strings file %s: %s", filename, error->message);
max_strings = 1024;
num_strings = 0;
strings = g_new (String, max_strings);
string_start = strings_raw;
end = strings_raw + length;
for (p = strings_raw; p < end; p++)
if (*p == 0) {
if (num_strings == max_strings) {
max_strings = max_strings * 2;
strings = g_renew (String, strings, max_strings);
}
strings[num_strings].str = string_start;
strings[num_strings].num_chars = g_utf8_strlen (string_start, -1);
strings[num_strings].num_bytes = strlen (string_start);
strings[num_strings].valid = (strstr (string_start, "POT-Creation") == 0
&& g_utf8_validate (string_start, -1, NULL));
string_start = p + 1;
num_strings++;
}
set = g_new (StringSet, 1);
set->num_strings = num_strings;
set->strings_raw = strings_raw;
set->strings = strings;
return set;
}
/*
 * Release a set returned by string_set_read(): the record array, the raw
 * file buffer the records point into, and the set itself.
 */
static void
string_set_free (StringSet *set)
{
g_free (set->strings);
g_free (set->strings_raw);
g_free (set);
}
/* PUT IMPLEMENTATIONS HERE */
/*
 * Step size in bytes, indexed by the first byte of a UTF-8 sequence:
 *   0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
 *   0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1.
 */
static const gchar utf8_skip_data[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/* Pointer through which the implementations below index the table. */
const gchar * const g_utf8_skip = utf8_skip_data;
/*
 * Advance p by the table entry for the byte it points at.  p is evaluated
 * twice, so it must not have side effects.
 * NOTE(review): <glib.h> normally provides its own g_utf8_skip declaration
 * and g_utf8_next_char macro - confirm this local copy does not conflict.
 */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
/*
 * Baseline implementation: step forward one character at a time using the
 * skip table.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; the loop only terminates
 *         sensibly for non-negative values.
 *
 * Returns a pointer to the character `offset` characters after str.
 */
gchar *
glib_utf8_offset_to_pointer (const gchar *str,
                             glong        offset)
{
  const gchar *cursor;

  for (cursor = str; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *) cursor;
}
/*
 * Variant of the baseline with the stepping loop unrolled four times.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 */
gchar *
glib_unwrapped_utf8_offset_to_pointer (const gchar *str,
                                       glong        offset)
{
  const gchar *cursor = str;

  /* Four characters per iteration while at least four remain. */
  for (; offset >= 4; offset -= 4)
    {
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
    }

  /* Remaining zero to three characters. */
  for (; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *) cursor;
}
/*
 * Variant that avoids the table lookup for bytes below 192 (0xC0): such a
 * byte advances the cursor by one, anything else uses the skip table.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 */
gchar *
pvanhoof_utf8_offset_to_pointer (const gchar *str,
                                 glong        offset)
{
  const guchar *cursor = (const guchar *) str;

  for (; offset != 0; offset--)
    cursor += (*cursor < 192) ? 1 : g_utf8_skip[*cursor];

  return (gchar *) cursor;
}
/*
 * Four-times unrolled form of the "byte below 192 steps by one" variant.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 */
gchar *
pvanhoof_unwrapped_utf8_offset_to_pointer (const gchar *str,
                                           glong        offset)
{
  const guchar *cursor = (const guchar *) str;

  /* Four characters per iteration while at least four remain. */
  for (; offset >= 4; offset -= 4)
    {
      cursor += (*cursor < 192) ? 1 : g_utf8_skip[*cursor];
      cursor += (*cursor < 192) ? 1 : g_utf8_skip[*cursor];
      cursor += (*cursor < 192) ? 1 : g_utf8_skip[*cursor];
      cursor += (*cursor < 192) ? 1 : g_utf8_skip[*cursor];
    }

  /* Remaining zero to three characters. */
  for (; offset != 0; offset--)
    cursor += (*cursor < 192) ? 1 : g_utf8_skip[*cursor];

  return (gchar *) cursor;
}
/* Integer type used by the word-at-a-time implementations below. */
#define WORDTYPE guint
/*
 * Replicate the byte value x (0..0xFF) into every byte of a WORDTYPE:
 * (WORDTYPE)-1 / 0xFF is the word with 0x01 in each byte.
 */
#define REPEAT(x) (((WORDTYPE)-1 / 0xFF) * (x))
/*
 * Unrolled variant with a four-byte fast path: when none of the next four
 * bytes has its high bit set, all four are single-byte characters and the
 * cursor jumps ahead by four at once.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 *
 * The four bytes are fetched with memcpy() rather than through a cast
 * pointer, because the cursor is not necessarily aligned for a 32-bit load.
 * The test mask is 0x80 per byte: a byte below 0x80 always advances by
 * exactly one in the slow path too, so taking the fast path for it never
 * changes the result, and words made of ordinary ASCII letters (>= 0x40)
 * no longer fall through to the byte-by-byte code.
 */
gchar *
pvanhoof_unwrapped2_utf8_offset_to_pointer (const gchar *str,
                                            glong        offset)
{
  const guchar *s = (const guchar *) str;

  while (offset >= 4)
    {
      guint32 word;

      offset -= 4;
      /* At least four characters remain, hence at least four bytes. */
      memcpy (&word, s, sizeof word);

      if (word & 0x80808080u)
        {
          if (*s < 192) s++;
          else s = (const guchar *) g_utf8_next_char (s);
          if (*s < 192) s++;
          else s = (const guchar *) g_utf8_next_char (s);
          if (*s < 192) s++;
          else s = (const guchar *) g_utf8_next_char (s);
          if (*s < 192) s++;
          else s = (const guchar *) g_utf8_next_char (s);
        }
      else
        s += 4;
    }

  while (offset--)
    {
      if (*s < 192) s++;
      else s = (const guchar *) g_utf8_next_char (s);
    }

  return (gchar *) s;
}
/*
 * Word-at-a-time variant: counts, for each WORDTYPE-sized chunk, how many
 * of its bytes start a character, without looking at bytes individually.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; must be non-negative.
 *
 * Returns a pointer to the character `offset` characters after str.
 *
 * Changes from the earlier revision: the result is cast to gchar * like
 * the sibling implementations (it used to return a const pointer
 * unqualified), the comparison against sizeof is done in signed
 * arithmetic, and each word is loaded with memcpy() so no alignment of
 * str is assumed.
 */
gchar *
behdad_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const gchar *s = str;

  while (offset >= (glong) sizeof (WORDTYPE))
    {
      WORDTYPE w;

      memcpy (&w, s, sizeof w);
      s += sizeof w;
      offset -= (glong) sizeof (WORDTYPE);

      /* Keep bit 7 of each byte only where bit 6 is clear, i.e. flag the
       * continuation bytes (10xxxxxx). */
      w &= ~(w << 1);
      w &= REPEAT (0x80);
      /* Turn every flag into 0x01 and add them up into the top byte. */
      w >>= 7;
      w *= REPEAT (1);
      w >>= (sizeof (WORDTYPE) - 1) * 8;

      /* Continuation bytes do not start a character: give them back. */
      offset += (glong) w;
    }

  /* The last word may have ended in the middle of a character. */
  while ((*(const guchar *) s) >> 6 == 2)
    s++;

  while (offset--)
    s = g_utf8_next_char (s);

  return (gchar *) s;
}
/*
 * Table-free variant: walk byte by byte and count a character each time
 * the byte just reached is not a continuation byte (10xxxxxx).
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 */
gchar *
luis_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  while (offset != 0)
    {
      str++;
      if (((guchar) *str & 0xC0) != 0x80)
        offset--;
    }

  return (gchar *) str;
}
/*
 * Four-times unrolled form of the table-free byte walk.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 *
 * One pass of the unrolled loop examines four bytes and so lowers offset
 * by at most four; it is entered only when offset is at least four.
 */
gchar *
luis_unwrapped_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  while (offset >= 4)
    {
      str++;
      if (((guchar) *str & 0xC0) != 0x80)
        offset--;
      str++;
      if (((guchar) *str & 0xC0) != 0x80)
        offset--;
      str++;
      if (((guchar) *str & 0xC0) != 0x80)
        offset--;
      str++;
      if (((guchar) *str & 0xC0) != 0x80)
        offset--;
    }

  while (offset != 0)
    {
      str++;
      if (((guchar) *str & 0xC0) != 0x80)
        offset--;
    }

  return (gchar *) str;
}
/*
 * Pattern-matching variant: inspects four bytes at a time and recognises
 * three common layouts before falling back to the skip table.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance (expected to be non-negative).
 *
 * Returns a pointer to the character `offset` characters after str.
 *
 * Changes from the earlier revision: the word is loaded with memcpy()
 * instead of through a union of pointers (no alignment assumption), and
 * the fallback reads the lead byte from memory instead of taking the low
 * byte of the loaded word, which is the first byte only on little-endian
 * hosts.
 * NOTE(review): the 0x80e080e0 mask is laid out for little-endian byte
 * order; on a big-endian host that fast path is expected simply not to
 * match valid input, leaving the generic step to do the work - confirm if
 * big-endian performance matters.
 */
gchar *
hpj_utf8_offset_to_pointer (const gchar *str, glong offset)
{
  const gchar *p = str;

  while (offset >= 4)
    {
      guint32 seg;
      guint32 seg_hi;

      memcpy (&seg, p, sizeof seg);
      seg_hi = seg & 0x80808080;

      if (!seg_hi)
        {
          /* Four single-byte characters. */
          p += 4;
          offset -= 4;
          continue;
        }
      else if (G_LIKELY (seg_hi == 0x80808080))
        {
          if ((seg & 0x80e080e0) == 0x80c080c0)
            {
              /* Two two-byte characters. */
              p += 4;
              offset -= 2;
              continue;
            }
          if ((seg & 0xf08080f0) == 0xe08080e0)
            {
              /* A three-byte character followed by a three-byte lead:
               * skip both characters, six bytes in total. */
              p += 6;
              offset -= 2;
              continue;
            }
        }

      /* Anything else: advance one character via the skip table. */
      p += g_utf8_skip[*(const guchar *) p];
      offset--;
    }

  for (; offset; offset--)
    p = g_utf8_next_char (p);

  return (gchar *) p;
}
/****************************/
/* Two-level paste so that WHICH is macro-expanded before concatenation. */
#define JOIN_(a,b) a##b
#define JOIN(a,b) JOIN_(a,b)

/*
 * Run the implementation selected at compile time (-DWHICH=<prefix>) over
 * every valid string of the set, num_iters times, and record the totals.
 *
 * set:       strings to process; entries whose valid flag is unset are
 *            skipped.
 * results:   overwritten with the elapsed user CPU time and with the
 *            number of strings and characters processed over all
 *            iterations.
 * num_iters: number of passes over the whole set.
 *
 * Each call asks for the pointer num_chars characters into the string and
 * checks that it equals str + num_bytes (the end of the string); a
 * mismatch aborts the program through error_and_exit().  An alternative
 * probe, used in an earlier revision, asked for offset num_chars - 1 and
 * compared against g_utf8_find_prev_char (str, str + num_bytes), skipping
 * empty strings.
 */
static void
measure_strings (StringSet *set, LanguageResults *results, int num_iters)
{
  gchar *(*utf8_offset_to_pointer) (const gchar *str, glong offset);
  UserTimer *utimer;
  int i;
  gsize j;

  /* choose implementation */
  utf8_offset_to_pointer = JOIN (WHICH, _utf8_offset_to_pointer);

  utimer = user_timer_new ();
  results->elapsed = 0.0;
  results->total_strings = 0;
  results->total_chars = 0;

  for (i = 0; i < num_iters; i++)
    for (j = 0; j < set->num_strings; j++)
      {
        const String *entry = &set->strings[j];
        const gchar *expected;
        const gchar *actual;

        if (!entry->valid)
          continue;

        expected = entry->str + entry->num_bytes;
        actual = utf8_offset_to_pointer (entry->str, entry->num_chars);

        if (actual != expected)
          /* Pointer differences are ptrdiff_t: convert explicitly so the
           * arguments match the conversion specifiers. */
          error_and_exit ("ERROR: expected %ld, got %ld\n",
                          (long) (expected - entry->str),
                          (long) (actual - entry->str));

        results->total_strings++;
        results->total_chars += entry->num_chars;
      }

  results->elapsed = user_timer_elapsed (utimer);
  user_timer_destroy (utimer);
}
/* Command-line state.  The pointers below are handed to the option parser
 * through option_entries; the initialisers are the defaults. */
static char **option_langs;
static char *option_data_dir = DEFAULT_DATA_DIR;
static char *option_name = DEFAULT_BENCHMARK_NAME;
static char *option_output;
/* NOTE(review): no entry in option_entries stores into this variable, so
 * it appears to keep its default value for the whole run - confirm whether
 * an option for it was intended. */
static int option_num_iterations = DEFAULT_NUM_ITERATIONS;
/* Destination for the XML report; assigned in main(). */
static FILE *output_file;
/* Option table registered with the GOptionContext in main(). */
static GOptionEntry option_entries[] = {
{ "lang", 'l', 0, G_OPTION_ARG_STRING_ARRAY, &option_langs,
"Specify language name (e.g. \"es\" for Spanish), or \"" ALL_LANGUAGES "\"", "string" },
{ "data-dir", 'd', 0, G_OPTION_ARG_FILENAME, &option_data_dir,
"Directory where .dat files live", "dirname" },
{ "name", 'n', 0, G_OPTION_ARG_STRING, &option_name,
"Name for benchmark", "string" },
{ "output", 'o', 0, G_OPTION_ARG_FILENAME, &option_output,
"Output filename. If not specified, standard output will be used.", "filename" },
{ NULL, 0, 0, 0, NULL, NULL, NULL }
};
/*
 * Benchmark one data file and append a <language> element to output_file.
 *
 * lang_name: text written inside the <name> element.
 * filename:  path of the data file to load.
 *
 * A progress line naming the file goes to stderr before loading starts.
 */
static void
run_one_language (const char *lang_name, const char *filename)
{
  LanguageResults results;
  StringSet *strings;

  fprintf (stderr, "Processing %s\n", filename);

  strings = string_set_read (filename);
  measure_strings (strings, &results, option_num_iterations);
  string_set_free (strings);

  fprintf (output_file,
           " <language>\n"
           " <name>%s</name>\n"
           " <elapsed>%f</elapsed>\n"
           " <total_strings>%ld</total_strings>\n"
           " <total_chars>%ld</total_chars>\n"
           " </language>\n",
           lang_name,
           results.elapsed,
           results.total_strings,
           results.total_chars);
}
static void
run_all_languages (void)
{
GDir *dir;
GError *error;
const char *entry;
error = NULL;
dir = g_dir_open (option_data_dir, 0, &error);
if (!dir)
error_and_exit ("Could not open directory: %s", error->message);
while ((entry = g_dir_read_name (dir)) != NULL) {
if (g_str_has_suffix (entry, ".dat")) {
char *lang_name;
char *filename;
lang_name = g_strndup (entry, strlen (entry) - 4);
filename = g_build_filename (option_data_dir, entry, NULL);
run_one_language (lang_name, filename);
g_free (lang_name);
g_free (filename);
}
}
g_dir_close (dir);
}
static void
run_some_languages (void)
{
char **langs;
for (langs = option_langs; *langs; langs++) {
char *raw_filename;
char *filename;
raw_filename = g_strconcat (*langs, ".dat", NULL);
filename = g_build_filename (option_data_dir, raw_filename, NULL);
g_free (raw_filename);
run_one_language (*langs, filename);
g_free (filename);
}
}
/*
 * Tell whether the NULL-terminated array langs contains the entry "ALL".
 *
 * Returns TRUE on the first exact match, FALSE if there is none.
 */
static gboolean
have_all_languages (char **langs)
{
  char **cursor = langs;

  while (*cursor != NULL)
    {
      if (strcmp (*cursor, "ALL") == 0)
        return TRUE;
      cursor++;
    }

  return FALSE;
}
int
main (int argc, char **argv)
{
GOptionContext *option_ctx;
option_ctx = g_option_context_new ("Options");
g_option_context_add_main_entries (option_ctx, option_entries, NULL);
if (!g_option_context_parse (option_ctx, &argc, &argv, NULL)) {
fprintf (stderr, "Invalid usage; type \"%s --help\" for instructions.\n", argv[0]);
exit (EXIT_FAILURE);
}
if (option_output) {
output_file = fopen (option_output, "w");
if (!output_file)
error_and_exit ("Could not create output file %s", option_output);
} else
output_file = stdout;
fputs ("<?xml version=\"1.0\"?>\n", output_file);
fputs ("<pango-benchmark>\n", output_file);
fprintf (output_file, " <name>%s</name>\n", option_name);
if (option_langs == NULL || have_all_languages (option_langs))
run_all_languages ();
else
run_some_languages ();
fputs ("</pango-benchmark>\n", output_file);
if (output_file != stdout)
fclose (output_file);
return 0;
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]