hash: 64-bit hash functions should take 64-bit base.

[ccan] / ccan / hash / hash.c
diff --git a/ccan/hash/hash.c b/ccan/hash/hash.c

index 5e106ad35cc4e4a85d65edc3c7025a0588f2990f..9b3f58b2b20a649785b7f053c4c7828413610148 100644 (file)
--- a/ccan/hash/hash.c
+++ b/ccan/hash/hash.c
@@ -208,63 +208,16 @@ uint32_t        initval)         /* the previous hash, or an arbitrary value */
    return c;
  }
  
-
-#if 0
-/*
---------------------------------------------------------------------
-hash_word2() -- same as hash_word(), but take two seeds and return two
-32-bit values.  pc and pb must both be nonnull, and *pc and *pb must
-both be initialized with seeds.  If you pass in (*pb)==0, the output 
-(*pc) will be the same as the return value from hash_word().
---------------------------------------------------------------------
-*/
-void hash_word2 (
-const uint32_t *k,                   /* the key, an array of uint32_t values */
-size_t          length,               /* the length of the key, in uint32_ts */
-uint32_t       *pc,                      /* IN: seed OUT: primary hash value */
-uint32_t       *pb)               /* IN: more seed OUT: secondary hash value */
-{
-  uint32_t a,b,c;
-
-  /* Set up the internal state */
-  a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
-  c += *pb;
-
-  /*------------------------------------------------- handle most of the key */
-  while (length > 3)
-  {
-    a += k[0];
-    b += k[1];
-    c += k[2];
-    mix(a,b,c);
-    length -= 3;
-    k += 3;
-  }
-
-  /*------------------------------------------- handle the last 3 uint32_t's */
-  switch(length)                     /* all the case statements fall through */
-  { 
-  case 3 : c+=k[2];
-  case 2 : b+=k[1];
-  case 1 : a+=k[0];
-    final(a,b,c);
-  case 0:     /* case 0: nothing left to add */
-    break;
-  }
-  /*------------------------------------------------------ report the result */
-  *pc=c; *pb=b;
-}
-#endif
-
  /*
  -------------------------------------------------------------------------------
  hashlittle() -- hash a variable-length key into a 32-bit value
    k       : the key (the unaligned variable-length array of bytes)
    length  : the length of the key, counting by bytes
-  initval : can be any 4-byte value
+  val2    : IN: can be any 4-byte value OUT: second 32 bit hash.
  Returns a 32-bit value.  Every bit of the key affects every bit of
  the return value.  Two keys differing by one or two bits will have
-totally different hash values.
+totally different hash values.  Note that the return value is better
+mixed than val2, so use that first.
  
  The best hash table sizes are powers of 2.  There is no need to do
  mod a prime (mod is sooo slow!).  If you need less than 32 bits,
@@ -283,13 +236,13 @@ acceptable.  Do NOT use for cryptographic purposes.
  -------------------------------------------------------------------------------
  */
  
-static uint32_t hashlittle( const void *key, size_t length, uint32_t initval)
+static uint32_t hashlittle( const void *key, size_t length, uint32_t *val2 )
  {
    uint32_t a,b,c;                                          /* internal state */
    union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
  
    /* Set up the internal state */
-  a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
+  a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;
  
    u.ptr = key;
    if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
@@ -451,209 +404,23 @@ static uint32_t hashlittle( const void *key, size_t length, uint32_t initval)
    }
  
    final(a,b,c);
+  *val2 = b;
    return c;
  }
  
-#if 0
-/*
- * hashlittle2: return 2 32-bit hash values
- *
- * This is identical to hashlittle(), except it returns two 32-bit hash
- * values instead of just one.  This is good enough for hash table
- * lookup with 2^^64 buckets, or if you want a second hash if you're not
- * happy with the first, or if you want a probably-unique 64-bit ID for
- * the key.  *pc is better mixed than *pb, so use *pc first.  If you want
- * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)".
- */
-void hashlittle2( 
-  const void *key,       /* the key to hash */
-  size_t      length,    /* length of the key */
-  uint32_t   *pc,        /* IN: primary initval, OUT: primary hash */
-  uint32_t   *pb)        /* IN: secondary initval, OUT: secondary hash */
-{
-  uint32_t a,b,c;                                          /* internal state */
-  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
-
-  /* Set up the internal state */
-  a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc;
-  c += *pb;
-
-  u.ptr = key;
-  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
-    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
-    const uint8_t  *k8;
-
-    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
-    while (length > 12)
-    {
-      a += k[0];
-      b += k[1];
-      c += k[2];
-      mix(a,b,c);
-      length -= 12;
-      k += 3;
-    }
-
-    /*----------------------------- handle the last (probably partial) block */
-    /* 
-     * "k[2]&0xffffff" actually reads beyond the end of the string, but
-     * then masks off the part it's not allowed to read.  Because the
-     * string is aligned, the masked-off tail is in the same word as the
-     * rest of the string.  Every machine with memory protection I've seen
-     * does it on word boundaries, so is OK with this.  But VALGRIND will
-     * still catch it and complain.  The masking trick does make the hash
-     * noticably faster for short strings (like English words).
-     */
-#ifndef VALGRIND
-
-    switch(length)
-    {
-    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
-    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
-    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
-    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
-    case 8 : b+=k[1]; a+=k[0]; break;
-    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
-    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
-    case 5 : b+=k[1]&0xff; a+=k[0]; break;
-    case 4 : a+=k[0]; break;
-    case 3 : a+=k[0]&0xffffff; break;
-    case 2 : a+=k[0]&0xffff; break;
-    case 1 : a+=k[0]&0xff; break;
-    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
-    }
-
-#else /* make valgrind happy */
-
-    k8 = (const uint8_t *)k;
-    switch(length)
-    {
-    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
-    case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
-    case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
-    case 9 : c+=k8[8];                   /* fall through */
-    case 8 : b+=k[1]; a+=k[0]; break;
-    case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
-    case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
-    case 5 : b+=k8[4];                   /* fall through */
-    case 4 : a+=k[0]; break;
-    case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
-    case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
-    case 1 : a+=k8[0]; break;
-    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
-    }
-
-#endif /* !valgrind */
-
-  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
-    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
-    const uint8_t  *k8;
-
-    /*--------------- all but last block: aligned reads and different mixing */
-    while (length > 12)
-    {
-      a += k[0] + (((uint32_t)k[1])<<16);
-      b += k[2] + (((uint32_t)k[3])<<16);
-      c += k[4] + (((uint32_t)k[5])<<16);
-      mix(a,b,c);
-      length -= 12;
-      k += 6;
-    }
-
-    /*----------------------------- handle the last (probably partial) block */
-    k8 = (const uint8_t *)k;
-    switch(length)
-    {
-    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
-             b+=k[2]+(((uint32_t)k[3])<<16);
-             a+=k[0]+(((uint32_t)k[1])<<16);
-             break;
-    case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
-    case 10: c+=k[4];
-             b+=k[2]+(((uint32_t)k[3])<<16);
-             a+=k[0]+(((uint32_t)k[1])<<16);
-             break;
-    case 9 : c+=k8[8];                      /* fall through */
-    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
-             a+=k[0]+(((uint32_t)k[1])<<16);
-             break;
-    case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
-    case 6 : b+=k[2];
-             a+=k[0]+(((uint32_t)k[1])<<16);
-             break;
-    case 5 : b+=k8[4];                      /* fall through */
-    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
-             break;
-    case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
-    case 2 : a+=k[0];
-             break;
-    case 1 : a+=k8[0];
-             break;
-    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
-    }
-
-  } else {                        /* need to read the key one byte at a time */
-    const uint8_t *k = (const uint8_t *)key;
-
-    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
-    while (length > 12)
-    {
-      a += k[0];
-      a += ((uint32_t)k[1])<<8;
-      a += ((uint32_t)k[2])<<16;
-      a += ((uint32_t)k[3])<<24;
-      b += k[4];
-      b += ((uint32_t)k[5])<<8;
-      b += ((uint32_t)k[6])<<16;
-      b += ((uint32_t)k[7])<<24;
-      c += k[8];
-      c += ((uint32_t)k[9])<<8;
-      c += ((uint32_t)k[10])<<16;
-      c += ((uint32_t)k[11])<<24;
-      mix(a,b,c);
-      length -= 12;
-      k += 12;
-    }
-
-    /*-------------------------------- last block: affect all 32 bits of (c) */
-    switch(length)                   /* all the case statements fall through */
-    {
-    case 12: c+=((uint32_t)k[11])<<24;
-    case 11: c+=((uint32_t)k[10])<<16;
-    case 10: c+=((uint32_t)k[9])<<8;
-    case 9 : c+=k[8];
-    case 8 : b+=((uint32_t)k[7])<<24;
-    case 7 : b+=((uint32_t)k[6])<<16;
-    case 6 : b+=((uint32_t)k[5])<<8;
-    case 5 : b+=k[4];
-    case 4 : a+=((uint32_t)k[3])<<24;
-    case 3 : a+=((uint32_t)k[2])<<16;
-    case 2 : a+=((uint32_t)k[1])<<8;
-    case 1 : a+=k[0];
-             break;
-    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
-    }
-  }
-
-  final(a,b,c);
-  *pc=c; *pb=b;
-}
-#endif
-
-
  /*
   * hashbig():
   * This is the same as hash_word() on big-endian machines.  It is different
   * from hashlittle() on all machines.  hashbig() takes advantage of
   * big-endian byte ordering. 
   */
-static uint32_t hashbig( const void *key, size_t length, uint32_t initval)
+static uint32_t hashbig( const void *key, size_t length, uint32_t *val2)
  {
    uint32_t a,b,c;
    union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */
  
    /* Set up the internal state */
-  a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
+  a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;
  
    u.ptr = key;
    if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) {
@@ -768,6 +535,7 @@ static uint32_t hashbig( const void *key, size_t length, uint32_t initval)
    }
  
    final(a,b,c);
+  *val2 = b;
    return c;
  }
  
@@ -775,13 +543,13 @@ static uint32_t hashbig( const void *key, size_t length, uint32_t initval)
   * element.  This delivers least-surprise: hash such as "int arr[] = {
   * 1, 2 }; hash_stable(arr, 2, 0);" will be the same on big and little
   * endian machines, even though a bytewise hash wouldn't be. */
-uint32_t hash_stable_64(const void *key, size_t n, uint32_t base)
+uint64_t hash64_stable_64(const void *key, size_t n, uint64_t base)
  {
         const uint64_t *k = key;
         uint32_t a,b,c;
  
         /* Set up the internal state */
-       a = b = c = 0xdeadbeef + ((uint32_t)n*8) + base;
+       a = b = c = 0xdeadbeef + ((uint32_t)n*8) + (base >> 32) + base;
  
         while (n > 3) {
                 a += (uint32_t)k[0];
@@ -811,16 +579,16 @@ uint32_t hash_stable_64(const void *key, size_t n, uint32_t base)
                 return c;
         }
         final(a,b,c);
-       return c;
+       return ((uint64_t)b << 32) | c;
  }
  
-uint32_t hash_stable_32(const void *key, size_t n, uint32_t base)
+uint64_t hash64_stable_32(const void *key, size_t n, uint64_t base)
  {
         const uint32_t *k = key;
         uint32_t a,b,c;
  
         /* Set up the internal state */
-       a = b = c = 0xdeadbeef + ((uint32_t)n*4) + base;
+       a = b = c = 0xdeadbeef + ((uint32_t)n*4) + (base >> 32) + base;
  
         while (n > 3) {
                 a += k[0];
@@ -841,16 +609,16 @@ uint32_t hash_stable_32(const void *key, size_t n, uint32_t base)
                 return c;
         }
         final(a,b,c);
-       return c;
+       return ((uint64_t)b << 32) | c;
  }
  
-uint32_t hash_stable_16(const void *key, size_t n, uint32_t base)
+uint64_t hash64_stable_16(const void *key, size_t n, uint64_t base)
  {
         const uint16_t *k = key;
         uint32_t a,b,c;
  
         /* Set up the internal state */
-       a = b = c = 0xdeadbeef + ((uint32_t)n*2) + base;
+       a = b = c = 0xdeadbeef + ((uint32_t)n*2) + (base >> 32) + base;
  
         while (n > 6) {
                 a += (uint32_t)k[0] + ((uint32_t)k[1] << 16);
@@ -878,20 +646,58 @@ uint32_t hash_stable_16(const void *key, size_t n, uint32_t base)
                 return c;
         }
         final(a,b,c);
-       return c;
+       return ((uint64_t)b << 32) | c;
  }
-       
-uint32_t hash_stable_8(const void *key, size_t n, uint32_t base)
+
+uint64_t hash64_stable_8(const void *key, size_t n, uint64_t base)
  {
-       return hashlittle(key, n, base);
+       uint32_t b32 = base + (base >> 32);
+       uint32_t lower = hashlittle(key, n, &b32);
+
+       return ((uint64_t)b32 << 32) | lower;   
  }
  
  uint32_t hash_any(const void *key, size_t length, uint32_t base)
  {
         if (HASH_BIG_ENDIAN)
-               return hashbig(key, length, base);
+               return hashbig(key, length, &base);
         else
-               return hashlittle(key, length, base);
+               return hashlittle(key, length, &base);
+}
+
+uint32_t hash_stable_64(const void *key, size_t n, uint32_t base)
+{
+       return hash64_stable_64(key, n, base);
+}
+
+uint32_t hash_stable_32(const void *key, size_t n, uint32_t base)
+{
+       return hash64_stable_32(key, n, base);
+}
+
+uint32_t hash_stable_16(const void *key, size_t n, uint32_t base)
+{
+       return hash64_stable_16(key, n, base);
+}
+
+uint32_t hash_stable_8(const void *key, size_t n, uint32_t base)
+{
+       return hashlittle(key, n, &base);
+}
+
+/* Jenkins' lookup8 is a 64 bit hash, but he says it's obsolete.  Use
+ * the plain one and recombine into 64 bits. */
+uint64_t hash64_any(const void *key, size_t length, uint64_t base)
+{
+       uint32_t b32 = base + (base >> 32);
+       uint32_t lower;
+
+       if (HASH_BIG_ENDIAN)
+               lower = hashbig(key, length, &b32);
+       else
+               lower = hashlittle(key, length, &b32);
+
+       return ((uint64_t)b32 << 32) | lower;
  }
  
  #ifdef SELF_TEST