Re: g_str_hash




> > This is the exact same hash except operations have been moved to
> > enhance speed.  The difference being the new one keeps the upper bits 
> > and thus has less collisions and operates somewhere between the TCL and 
> > GLib on.  
> 
> I suspect the appeal of the ASU one is solely that it is nice and
> easily analyzable. I won't believe that it creates less collisions
> until somebody shows me a set of (plausible) strings where that is the case.

Well, I have some formulations which should have much better collision
properties.  (This took several hours, so please give it more than
a token glance.)

This hash is based on multiplying by 31.  (and is approx the same speed 
as TCL.)
 
guint hash (gconstpointer v)
{
  const char *p;
  guint h=0;
  for(p = (const char*) v; *p != '\0'; p += 1) {
    h = ( h << 5 ) - h  + *p;
  }
  return h;
}

I include a programs at the end used to measure the frequencies of collisions
on my local dictionary.  Both with long and short phrases.  

The results are gathered using 
  ./hash | awk '{print $1}' | wc
  ./hash | awk '{print $1}' | sort -u |wc


Short word test: (3-16 char)
----------------------------
(with 24474 words, most short)
With Glib current:   86 hits  
With TCL:           250 hits 
With X31:             1 hits  

TCL is quite poor here.  (tcl is X9 which has poor properties on short.)


Long test:  (20-40 char, vary in tail)
----------------------------------------
Using the 2040 longest words in dict and then pairing them
with every other word we get 4330561 phrases.  (we have a lot
more phases so the hit ratio is expected to be a lot higher.)

Glib:  220793
TCL:   132501
X31:   131478

Here, TCL and X31 match closely.  Gtk+ is poor.  
 
> Well, if the Tcl (or the Perl one with the << 5) are as good, why 
> waste the time? And your optimization doesn't help for long strings,
> which take the most time.

My optimization was actually targeted at the long strings while getting
the short ones reasonable close.  It had much better properties than
gtk+ and tcl for the short (8-15 char).  However, it doesn't touch the
X31.

Okay now for some raw speed tests....

       HP-O0  HP-O3  x86-O0  x86-O3
glib:  1.70   0.47   1.90    1.08
tcl:   0.92   0.25   1.52    0.63
X31:   0.82   0.24   1.38    0.79
 
X31 is a bit slower on x86 then tcl but not significantly.
   
Of the 3 options, I would recommend the X31 hash as it is both quick
and has the best overall properties.

--Karl

========================================================================
#include <glib.h>
#include <stdio.h>
#include <time.h>

/*#define GTK_HASH */
/*#define TCL_HASH */
#define X31_HASH
#define SHORT_TEST
#ifdef GTK_HASH
guint hash (gconstpointer v)
{
  const char *s = (char*)v;
  const char *p;
  guint h=0, g;

  for(p = s; *p != '\0'; p += 1) {
    h = ( h << 4 ) + *p;
    if ( ( g = h & 0xf0000000 ) ) {
      h = h ^ (g >> 24);
      h = h ^ g;
    }
  }

  return h /* % M */;
}
#endif

#ifdef TCL_HASH
guint hash(gconstpointer v)
{
  const char* string = (char*)v;
  unsigned int result = 0;
  int c;

  while (1) {
    c = *string;
    string++;
    if (c == 0) {
      break;
    }
    result += (result<<3) + c;
  }
  return result;
}
#endif

#ifdef X31_HASH
guint hash (gconstpointer v)
{
  const char *p;
  guint h=0;
  for(p = (const char*) v; *p != '\0'; p += 1) {
    h = ( h << 5 ) - h + *p;
  }
  return h;
}
#endif

#ifdef SHORT_TEST
int main(int argc,char** argv)
{
  FILE *fptr;
  char c[100];
  fptr=fopen ("/usr/share/dict/words","r");
  while (!feof(fptr))
    {
      fgets(c,100,fptr);
      c[strlen(c)-1]=0;
      printf("%08x\n",hash(c));
    }
  return 0;
}
#else
/* long test */
int main(int argc,char** argv)
{
  FILE *fptr1,*fptr2;
  char c1[100],c2[100],c3[100];
  fptr1=fopen ("/tmp/dict","r");
  while(!feof(fptr1))
    {
      fgets(c1,100,fptr1);
      fptr2=fopen ("/tmp/dict","r");
      c1[strlen(c1)-1]=' ';
      while(!feof(fptr2))
        {
          fgets(c2,100,fptr2);
          c2[strlen(c1)-1]=0;
          strcpy(c3,c1);
          strcat(c3,c2);
          printf("%08x\n",hash(c3));
        }
      fclose(fptr2);
    }
  return 0;
}
#endif



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]