X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fhash%2Fhash.c;h=cb4465405e2eff8022a014e0690583e6d16d9ac0;hp=886101a13ac3a76155fc33c05fe69c385579bd83;hb=7449ae0ff0ef5c96b7a76010986e00ebb28b2a65;hpb=0953f929bc024a9107869a40516b89932d5482e0 diff --git a/ccan/hash/hash.c b/ccan/hash/hash.c index 886101a1..cb446540 100644 --- a/ccan/hash/hash.c +++ b/ccan/hash/hash.c @@ -42,7 +42,7 @@ on 1 byte), but shoehorning those bytes into integers efficiently is messy. #include /* attempt to define endianness */ #endif -#include "hash/hash.h" +#include "hash.h" #ifdef linux # include /* attempt to define endianness */ #endif @@ -63,8 +63,7 @@ on 1 byte), but shoehorning those bytes into integers efficiently is messy. # define HASH_LITTLE_ENDIAN 0 # define HASH_BIG_ENDIAN 1 #else -# define HASH_LITTLE_ENDIAN 0 -# define HASH_BIG_ENDIAN 0 +# error Unknown endian #endif #define hashsize(n) ((uint32_t)1<<(n)) @@ -209,63 +208,16 @@ uint32_t initval) /* the previous hash, or an arbitrary value */ return c; } - -#if 0 -/* --------------------------------------------------------------------- -hash_word2() -- same as hash_word(), but take two seeds and return two -32-bit values. pc and pb must both be nonnull, and *pc and *pb must -both be initialized with seeds. If you pass in (*pb)==0, the output -(*pc) will be the same as the return value from hash_word(). --------------------------------------------------------------------- -*/ -void hash_word2 ( -const uint32_t *k, /* the key, an array of uint32_t values */ -size_t length, /* the length of the key, in uint32_ts */ -uint32_t *pc, /* IN: seed OUT: primary hash value */ -uint32_t *pb) /* IN: more seed OUT: secondary hash value */ -{ - uint32_t a,b,c; - - /* Set up the internal state */ - a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc; - c += *pb; - - /*------------------------------------------------- handle most of the key */ - while (length > 3) - { - a += k[0]; - b += k[1]; - c += k[2]; - mix(a,b,c); - length -= 3; - k += 3; - } - - /*------------------------------------------- handle the last 3 uint32_t's */ - switch(length) /* all the case statements fall through */ - { - case 3 : c+=k[2]; - case 2 : b+=k[1]; - case 1 : a+=k[0]; - final(a,b,c); - case 0: /* case 0: nothing left to add */ - break; - } - /*------------------------------------------------------ report the result */ - *pc=c; *pb=b; -} -#endif - /* ------------------------------------------------------------------------------- hashlittle() -- hash a variable-length key into a 32-bit value k : the key (the unaligned variable-length array of bytes) length : the length of the key, counting by bytes - initval : can be any 4-byte value + val2 : IN: can be any 4-byte value OUT: second 32 bit hash. Returns a 32-bit value. Every bit of the key affects every bit of the return value. Two keys differing by one or two bits will have -totally different hash values. +totally different hash values. Note that the return value is better +mixed than val2, so use that first. The best hash table sizes are powers of 2. There is no need to do mod a prime (mod is sooo slow!). If you need less than 32 bits, @@ -284,13 +236,13 @@ acceptable. Do NOT use for cryptographic purposes. ------------------------------------------------------------------------------- */ -static uint32_t hashlittle( const void *key, size_t length, uint32_t initval) +static uint32_t hashlittle( const void *key, size_t length, uint32_t *val2 ) { uint32_t a,b,c; /* internal state */ union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ /* Set up the internal state */ - a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; + a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2; u.ptr = key; if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { @@ -452,209 +404,23 @@ static uint32_t hashlittle( const void *key, size_t length, uint32_t initval) } final(a,b,c); + *val2 = b; return c; } -#if 0 -/* - * hashlittle2: return 2 32-bit hash values - * - * This is identical to hashlittle(), except it returns two 32-bit hash - * values instead of just one. This is good enough for hash table - * lookup with 2^^64 buckets, or if you want a second hash if you're not - * happy with the first, or if you want a probably-unique 64-bit ID for - * the key. *pc is better mixed than *pb, so use *pc first. If you want - * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)". - */ -void hashlittle2( - const void *key, /* the key to hash */ - size_t length, /* length of the key */ - uint32_t *pc, /* IN: primary initval, OUT: primary hash */ - uint32_t *pb) /* IN: secondary initval, OUT: secondary hash */ -{ - uint32_t a,b,c; /* internal state */ - union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ - - /* Set up the internal state */ - a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc; - c += *pb; - - u.ptr = key; - if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { - const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */ - const uint8_t *k8; - - /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ - while (length > 12) - { - a += k[0]; - b += k[1]; - c += k[2]; - mix(a,b,c); - length -= 12; - k += 3; - } - - /*----------------------------- handle the last (probably partial) block */ - /* - * "k[2]&0xffffff" actually reads beyond the end of the string, but - * then masks off the part it's not allowed to read. Because the - * string is aligned, the masked-off tail is in the same word as the - * rest of the string. Every machine with memory protection I've seen - * does it on word boundaries, so is OK with this. But VALGRIND will - * still catch it and complain. The masking trick does make the hash - * noticably faster for short strings (like English words). - */ -#ifndef VALGRIND - - switch(length) - { - case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; - case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; - case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; - case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; - case 8 : b+=k[1]; a+=k[0]; break; - case 7 : b+=k[1]&0xffffff; a+=k[0]; break; - case 6 : b+=k[1]&0xffff; a+=k[0]; break; - case 5 : b+=k[1]&0xff; a+=k[0]; break; - case 4 : a+=k[0]; break; - case 3 : a+=k[0]&0xffffff; break; - case 2 : a+=k[0]&0xffff; break; - case 1 : a+=k[0]&0xff; break; - case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ - } - -#else /* make valgrind happy */ - - k8 = (const uint8_t *)k; - switch(length) - { - case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; - case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ - case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ - case 9 : c+=k8[8]; /* fall through */ - case 8 : b+=k[1]; a+=k[0]; break; - case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ - case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ - case 5 : b+=k8[4]; /* fall through */ - case 4 : a+=k[0]; break; - case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ - case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ - case 1 : a+=k8[0]; break; - case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ - } - -#endif /* !valgrind */ - - } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { - const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ - const uint8_t *k8; - - /*--------------- all but last block: aligned reads and different mixing */ - while (length > 12) - { - a += k[0] + (((uint32_t)k[1])<<16); - b += k[2] + (((uint32_t)k[3])<<16); - c += k[4] + (((uint32_t)k[5])<<16); - mix(a,b,c); - length -= 12; - k += 6; - } - - /*----------------------------- handle the last (probably partial) block */ - k8 = (const uint8_t *)k; - switch(length) - { - case 12: c+=k[4]+(((uint32_t)k[5])<<16); - b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ - case 10: c+=k[4]; - b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 9 : c+=k8[8]; /* fall through */ - case 8 : b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ - case 6 : b+=k[2]; - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 5 : b+=k8[4]; /* fall through */ - case 4 : a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ - case 2 : a+=k[0]; - break; - case 1 : a+=k8[0]; - break; - case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ - } - - } else { /* need to read the key one byte at a time */ - const uint8_t *k = (const uint8_t *)key; - - /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ - while (length > 12) - { - a += k[0]; - a += ((uint32_t)k[1])<<8; - a += ((uint32_t)k[2])<<16; - a += ((uint32_t)k[3])<<24; - b += k[4]; - b += ((uint32_t)k[5])<<8; - b += ((uint32_t)k[6])<<16; - b += ((uint32_t)k[7])<<24; - c += k[8]; - c += ((uint32_t)k[9])<<8; - c += ((uint32_t)k[10])<<16; - c += ((uint32_t)k[11])<<24; - mix(a,b,c); - length -= 12; - k += 12; - } - - /*-------------------------------- last block: affect all 32 bits of (c) */ - switch(length) /* all the case statements fall through */ - { - case 12: c+=((uint32_t)k[11])<<24; - case 11: c+=((uint32_t)k[10])<<16; - case 10: c+=((uint32_t)k[9])<<8; - case 9 : c+=k[8]; - case 8 : b+=((uint32_t)k[7])<<24; - case 7 : b+=((uint32_t)k[6])<<16; - case 6 : b+=((uint32_t)k[5])<<8; - case 5 : b+=k[4]; - case 4 : a+=((uint32_t)k[3])<<24; - case 3 : a+=((uint32_t)k[2])<<16; - case 2 : a+=((uint32_t)k[1])<<8; - case 1 : a+=k[0]; - break; - case 0 : *pc=c; *pb=b; return; /* zero length strings require no mixing */ - } - } - - final(a,b,c); - *pc=c; *pb=b; -} -#endif - - /* * hashbig(): * This is the same as hash_word() on big-endian machines. It is different * from hashlittle() on all machines. hashbig() takes advantage of * big-endian byte ordering. */ -static uint32_t hashbig( const void *key, size_t length, uint32_t initval) +static uint32_t hashbig( const void *key, size_t length, uint32_t *val2) { uint32_t a,b,c; union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */ /* Set up the internal state */ - a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; + a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2; u.ptr = key; if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) { @@ -769,23 +535,167 @@ static uint32_t hashbig( const void *key, size_t length, uint32_t initval) } final(a,b,c); + *val2 = b; return c; } -uint32_t hash_any_stable(const void *key, size_t length, uint32_t base) +/* I basically use hashlittle here, but use native endian within each + * element. This delivers least-surprise: hash such as "int arr[] = { + * 1, 2 }; hash_stable(arr, 2, 0);" will be the same on big and little + * endian machines, even though a bytewise hash wouldn't be. */ +uint64_t hash64_stable_64(const void *key, size_t n, uint32_t base) +{ + const uint64_t *k = key; + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)n*8) + base; + + while (n > 3) { + a += (uint32_t)k[0]; + b += (uint32_t)(k[0] >> 32); + c += (uint32_t)k[1]; + mix(a,b,c); + a += (uint32_t)(k[1] >> 32); + b += (uint32_t)k[2]; + c += (uint32_t)(k[2] >> 32); + mix(a,b,c); + n -= 3; + k += 3; + } + switch (n) { + case 2: + a += (uint32_t)k[0]; + b += (uint32_t)(k[0] >> 32); + c += (uint32_t)k[1]; + mix(a,b,c); + a += (uint32_t)(k[1] >> 32); + break; + case 1: + a += (uint32_t)k[0]; + b += (uint32_t)(k[0] >> 32); + break; + case 0: + return c; + } + final(a,b,c); + return ((uint64_t)b << 32) | c; +} + +uint64_t hash64_stable_32(const void *key, size_t n, uint32_t base) +{ + const uint32_t *k = key; + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)n*4) + base; + + while (n > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + + n -= 3; + k += 3; + } + switch (n) { + case 2: + b += (uint32_t)k[1]; + case 1: + a += (uint32_t)k[0]; + break; + case 0: + return c; + } + final(a,b,c); + return ((uint64_t)b << 32) | c; +} + +uint64_t hash64_stable_16(const void *key, size_t n, uint32_t base) +{ + const uint16_t *k = key; + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)n*2) + base; + + while (n > 6) { + a += (uint32_t)k[0] + ((uint32_t)k[1] << 16); + b += (uint32_t)k[2] + ((uint32_t)k[3] << 16); + c += (uint32_t)k[4] + ((uint32_t)k[5] << 16); + mix(a,b,c); + + n -= 6; + k += 6; + } + + switch (n) { + case 5: + c += (uint32_t)k[4]; + case 4: + b += ((uint32_t)k[3] << 16); + case 3: + b += (uint32_t)k[2]; + case 2: + a += ((uint32_t)k[1] << 16); + case 1: + a += (uint32_t)k[0]; + break; + case 0: + return c; + } + final(a,b,c); + return ((uint64_t)b << 32) | c; +} + +uint64_t hash64_stable_8(const void *key, size_t n, uint32_t base) { - /* We use hashlittle as our stable hash. */ - return hashlittle(key, length, base); + uint32_t lower = hashlittle(key, n, &base); + + return ((uint64_t)base << 32) | lower; } uint32_t hash_any(const void *key, size_t length, uint32_t base) { if (HASH_BIG_ENDIAN) - return hashbig(key, length, base); + return hashbig(key, length, &base); else - /* We call hash_any_stable not hashlittle. This way we know - * that hashlittle will be inlined in hash_any_stable. */ - return hash_any_stable(key, length, base); + return hashlittle(key, length, &base); +} + +uint32_t hash_stable_64(const void *key, size_t n, uint32_t base) +{ + return hash64_stable_64(key, n, base); +} + +uint32_t hash_stable_32(const void *key, size_t n, uint32_t base) +{ + return hash64_stable_32(key, n, base); +} + +uint32_t hash_stable_16(const void *key, size_t n, uint32_t base) +{ + return hash64_stable_16(key, n, base); +} + +uint32_t hash_stable_8(const void *key, size_t n, uint32_t base) +{ + return hashlittle(key, n, &base); +} + +/* Jenkins' lookup8 is a 64 bit hash, but he says it's obsolete. Use + * the plain one and recombine into 64 bits. */ +uint64_t hash64_any(const void *key, size_t length, uint32_t base) +{ + uint32_t lower; + + if (HASH_BIG_ENDIAN) + lower = hashbig(key, length, &base); + else + lower = hashlittle(key, length, &base); + + return ((uint64_t)base << 32) | lower; } #ifdef SELF_TEST