WSP-Hash-32: The Fastest Portable 32-Bit Hashing Algorithm With Aligned Multi-Byte Memory Reading and Good Quality
William Stafford Parsons developed an optimal 32-bit hashing algorithm as a significant improvement to CityHash32 v1.1, Lookup3, 32-bit MurmurHash3 and XXHash32.
Library
Source
#include <stdint.h>
#include <string.h>
struct wsp_hash_32_s {
uint32_t a;
uint32_t b;
uint32_t c;
uint32_t d;
uint32_t e;
uint32_t f;
uint32_t g;
uint32_t h;
uint32_t mix;
uint32_t mix_offset;
unsigned long input_count_capture;
};
uint32_t wsp_hash_read_32(const uint8_t *input, unsigned long i) {
uint32_t input_aligned;
memcpy(&input_aligned, &input[i], sizeof(input_aligned));
return input_aligned;
}
uint32_t wsp_hash_32(unsigned long input_count, const uint8_t *input) {
uint32_t a = 1;
uint32_t b = 11;
uint32_t c = 111;
uint32_t d = 1111;
uint32_t e = 11111;
uint32_t f = 111111;
uint32_t g = 1111111;
uint32_t h = 11111111;
uint32_t mix = 1111111111;
uint32_t mix_offset = 111111111;
unsigned long input_count_capture = input_count;
unsigned long i = 0;
if (input_count >= 32) {
i = 31;
while (i < input_count) {
mix += a + b + c + d + e + f + g + h;
a += wsp_hash_read_32(input, i - 3) + ((a << 8) | (a >> 24)) + mix;
b += wsp_hash_read_32(input, i - 7) + ((b << 23) | (b >> 9));
c += wsp_hash_read_32(input, i - 11) + ((c << 10) | (c >> 22));
d += wsp_hash_read_32(input, i - 15) + ((d << 21) | (d >> 11));
e += wsp_hash_read_32(input, i - 19) + ((e << 12) | (e >> 20));
f += wsp_hash_read_32(input, i - 23) + ((f << 19) | (f >> 13));
g += wsp_hash_read_32(input, i - 27) + ((g << 14) | (g >> 18));
h += wsp_hash_read_32(input, i - 31) + ((h << 17) | (h >> 15));
i += 32;
}
if (i >= input_count) {
i -= 32;
}
mix_offset += a + b + c + d + e + f + g + h;
i++;
}
if ((input_count - i) >= 16) {
i += 16;
a += wsp_hash_read_32(input, i - 16) + ((a << 8) | (a >> 24));
b += wsp_hash_read_32(input, i - 12) + ((b << 23) | (b >> 9));
c += wsp_hash_read_32(input, i - 8) + ((c << 10) | (c >> 22));
d += wsp_hash_read_32(input, i - 4) + ((d << 21) | (d >> 11));
mix += a + b + c + d;
}
if ((input_count - i) >= 8) {
i += 8;
a += wsp_hash_read_32(input, i - 8) + ((a << 8) | (a >> 24));
b += wsp_hash_read_32(input, i - 4) + ((b << 23) | (b >> 9));
mix += a + b;
}
if (i != input_count) {
mix += (a << 8) | (a >> 24);
input_count -= i;
if (input_count >= 4) {
a += wsp_hash_read_32(input, i) + ((a << 23) | (a >> 9));
if (input_count != 4) {
mix_offset += a + b;
switch (input_count) {
case 7:
b += input[i + 6] << 16;
case 6:
b += input[i + 5] << 8;
case 5:
b += input[i + 4];
}
}
}
switch (input_count) {
case 3:
a ^= input[i + 2] << 16;
case 2:
a ^= input[i + 1] << 8;
case 1:
a ^= input[i];
}
}
a += b + mix_offset;
mix += (a << 8) | (a >> 24);
if (input_count_capture > 12) {
mix += ((c << 22) | (c >> 10)) + ((d << 11) | (d >> 21));
if (input_count_capture >= 32) {
mix_offset += ((e << 20) | (e >> 12)) ^ ((g << 18) | (g >> 14));
mix += (h << 15) | (h >> 17);
}
}
mix_offset += mix + input_count_capture;
mix += ((a << 13) | (a >> 19)) ^ h;
b ^= mix_offset;
mix_offset += a ^ ((b << 18) | (b >> 14));
if (input_count_capture >= 8) {
c ^= mix;
mix += b ^ ((c << 15) | (c >> 17));
e ^= mix_offset;
mix_offset += d ^ ((e << 18) | (e >> 14));
if (input_count_capture >= 32) {
f ^= ((e << 18) | (e >> 14)) + mix;
mix_offset += e ^ ((f << 19) | (f >> 13));
g ^= mix_offset;
mix += f ^ ((g << 21) | (g >> 11));
}
}
return mix + mix_offset;
}
void wsp_hash_32_initialize(struct wsp_hash_32_s *s) {
s->a = 1;
s->b = 11;
s->c = 111;
s->d = 1111;
s->e = 11111;
s->f = 111111;
s->g = 1111111;
s->h = 11111111;
s->mix = 1111111111;
s->mix_offset = 111111111;
s->input_count_capture = 0;
}
void wsp_hash_32_transform(unsigned long i, unsigned long input_count,
const uint8_t *input, struct wsp_hash_32_s *s) {
s->input_count_capture += input_count;
if (input_count >= 32) {
i = 31;
while (i < input_count) {
s->mix += s->a + s->b + s->c + s->d + s->e + s->f + s->g + s->h;
s->a += wsp_hash_read_32(input, i - 3) + ((s->a << 8) | (s->a >> 24))
+ s->mix;
s->b += wsp_hash_read_32(input, i - 7) + ((s->b << 23) | (s->b >> 9));
s->c += wsp_hash_read_32(input, i - 11) + ((s->c << 10) | (s->c >> 22));
s->d += wsp_hash_read_32(input, i - 15) + ((s->d << 21) | (s->d >> 11));
s->e += wsp_hash_read_32(input, i - 19) + ((s->e << 12) | (s->e >> 20));
s->f += wsp_hash_read_32(input, i - 23) + ((s->f << 19) | (s->f >> 13));
s->g += wsp_hash_read_32(input, i - 27) + ((s->g << 14) | (s->g >> 18));
s->h += wsp_hash_read_32(input, i - 31) + ((s->h << 17) | (s->h >> 15));
i += 32;
}
if (i >= input_count) {
i -= 32;
}
s->mix_offset += s->a + s->b + s->c + s->d + s->e + s->f + s->g + s->h;
i++;
}
if ((input_count - i) >= 16) {
i += 16;
s->a += wsp_hash_read_32(input, i - 16) + ((s->a << 8) | (s->a >> 24));
s->b += wsp_hash_read_32(input, i - 12) + ((s->b << 23) | (s->b >> 9));
s->c += wsp_hash_read_32(input, i - 8) + ((s->c << 10) | (s->c >> 22));
s->d += wsp_hash_read_32(input, i - 4) + ((s->d << 21) | (s->d >> 11));
s->mix += s->a + s->b + s->c + s->d;
}
if ((input_count - i) >= 8) {
i += 8;
s->a += wsp_hash_read_32(input, i - 8) + ((s->a << 8) | (s->a >> 24));
s->b += wsp_hash_read_32(input, i - 4) + ((s->b << 23) | (s->b >> 9));
s->mix += s->a + s->b;
}
if (i != input_count) {
s->mix += (s->a << 8) | (s->a >> 24);
input_count -= i;
if (input_count >= 4) {
s->a += wsp_hash_read_32(input, i) + ((s->a << 23) | (s->a >> 9));
if (input_count != 4) {
s->mix_offset += s->a + s->b;
switch (input_count) {
case 7:
s->b += input[i + 6] << 16;
case 6:
s->b += input[i + 5] << 8;
case 5:
s->b += input[i + 4];
}
}
}
switch (input_count) {
case 3:
s->a ^= input[i + 2] << 16;
case 2:
s->a ^= input[i + 1] << 8;
case 1:
s->a ^= input[i];
}
}
}
void wsp_hash_32_finalize(struct wsp_hash_32_s *s) {
s->a += s->b + s->mix_offset;
s->mix += (s->a << 8) | (s->a >> 24);
if (s->input_count_capture > 12) {
s->mix += ((s->c << 22) | (s->c >> 10)) + ((s->d << 11) | (s->d >> 21));
if (s->input_count_capture >= 32) {
s->mix_offset += ((s->e << 20) | (s->e >> 12))
^ ((s->g << 18) | (s->g >> 14));
s->mix += (s->h << 15) | (s->h >> 17);
}
}
s->mix_offset += s->mix + s->input_count_capture;
s->mix += ((s->a << 13) | (s->a >> 19)) ^ s->h;
s->b ^= s->mix_offset;
s->mix_offset += s->a ^ ((s->b << 18) | (s->b >> 14));
if (s->input_count_capture >= 8) {
s->c ^= s->mix;
s->mix += s->b ^ ((s->c << 15) | (s->c >> 17));
s->e ^= s->mix_offset;
s->mix_offset += s->d ^ ((s->e << 18) | (s->e >> 14));
if (s->input_count_capture >= 32) {
s->f ^= ((s->e << 18) | (s->e >> 14)) + s->mix;
s->mix_offset += s->e ^ ((s->f << 19) | (s->f >> 13));
s->g ^= s->mix_offset;
s->mix += s->f ^ ((s->g << 21) | (s->g >> 11));
}
}
s->mix += s->mix_offset;
}
Reference
wsp_hash_32() is the all-at-once hashing function that accepts the 2 following arguments.
input_count is the count of elements in the input array.
input is the const uint8_t array to hash.
The return value data type is uint32_t.
It returns the 32-bit unsigned integer hash digest result.
wsp_hash_32_initialize() is the initialization function that accepts the following argument.
s is a struct wsp_hash_32_s pointer.
wsp_hash_32_transform() is the core hashing loop function that accepts the 4 following arguments.
i is the starting index position of elements in the input array.
input_count is the count of elements in the input array. When hashing in split segments, the value must be a multiple of 32, with the exception of the end segment.
input is the const uint8_t array to hash.
s is a struct wsp_hash_32_s pointer.
wsp_hash_32_finalize() is the finalization function that accepts the following argument.
s is a struct wsp_hash_32_s pointer. s.mix contains the finalized hash digest result.
The return value data type is void.
Requirements
C compiler with C99 (ISO/IEC 9899:1999) standard compatibility.
CPU with single-threaded, instruction-level parallelism support.
Explanation
This 32-bit non-cryptographic hashing algorithm is designed to hash keys of all sizes as quickly as possible with minimal collisions across all truncated bit sizes.
It's the fastest portable 32-bit hashing algorithm with good statistical quality relevant to non-cryptographic hashing.
It's portable for both 32-bit and 64-bit systems.
It meets strict compliance, portability and code security requirements.
Multi-byte memory reading is designed primarily for systems with little-endian byte order, although big-endian memory reading is functional with similar quality and speed results.
In rare instances when hash table states must be saved after program termination and re-used in multiple systems with varying endianness, keys should be re-hashed during initialization instead of slowing down runtime hashing with byte order alignment.
In rare instances when byte order is expected to change during runtime, using a one-at-a-time hashing algorithm is a better alternative.
It doesn't use modulus, multiplication or division arithmetic operations.
It supports unlimited input length by splitting input bits into descending segments of 256, 128, 64 and the remaining 8–63 bits. All-at-once hashing is the faster option, but it isn't required and the digest results are consistent when hashing in partial segments.
Single-threaded, instruction-level parallelism with low-cost addition and bitwise instructions work well on a wide range of CPU queue loads and devices.
Not allowing a seed prevents issues from 2³² additional hashing algorithms, so all seed-based tests are omitted.
It passes the SMHasher differential test and key set collision tests including Cyclic, Permutation, Sparse, Text, Two-Byte, Windowed and Zeroes.
It passes the extended version of these collision tests as well.
There aren't any bit distribution calculation percentages exceeding 10% in the worst instance, which suggests there aren't any critical non-cryptographic distribution issues among each of the aforementioned tests.
The strict avalanche test in SMHasher is omitted as it's only relevant to the analysis of cryptographic hash function properties where the worst instance exposes a critical cryptographic vulnerability.
Furthermore, the following code tests collision counts for truncated digests against each of the 8 bits flipped within a single input byte ranging from 1 to 255 at all positions for all input_count values ranging from 1 to 256.
#include <stdio.h>
#include "wsp_hash_32.h"
int main(void) {
uint8_t input[256];
uint32_t result = 0;
uint32_t result_flipped = 1;
unsigned long bit_collisions_counts[32];
unsigned short sparse_byte = 0;
unsigned short i = 0;
unsigned short j = 0;
unsigned short k = 0;
unsigned short l = 0;
unsigned short m = 0;
while (i != 32) {
bit_collisions_counts[i] = 0;
i++;
}
i = 1;
while (sparse_byte != 16) {
while (i != 256) {
j = 1;
while (j != 256) {
k = 0;
while (k != j) {
while (l != j) {
input[l] = sparse_byte;
l++;
}
if (sparse_byte == i) {
k = j;
l = 0;
continue;
}
input[k] = i;
result = wsp_hash_32(j, (const uint8_t *) input);
l = 0;
while (l != 8) {
input[k] = input[k] ^ (1 << l);
result_flipped = wsp_hash_32(j, (const uint8_t *) input);
input[k] = input[k] ^ (1 << l);
m = 1;
while (m != 32) {
if (
(result & ((2 << m) - 1)) == (result_flipped & ((2 << m) - 1))
) {
bit_collisions_counts[m - 1]++;
}
m++;
}
l++;
m = 0;
}
k++;
l = 0;
}
j++;
k = 0;
}
i++;
}
sparse_byte++;
i = 1;
}
i = 2;
j = 0;
while (i != 33) {
printf("%2u-Bit Segmented Collisions: %9lu\n",
i,
bit_collisions_counts[i - 2]);
i++;
j++;
}
return 0;
}
The following collision results demonstrate a sufficient collision-based avalanche effect in the worst instance with non-cryptographic, universal hashing relevance.
Avalanche Collision Results Among 1 Billion 1-Bit Sparse Keys at Varying Lengths
2-Bit Segmented Collisions: 267930841
3-Bit Segmented Collisions: 135013310
4-Bit Segmented Collisions: 67964843
5-Bit Segmented Collisions: 34162986
6-Bit Segmented Collisions: 17250096
7-Bit Segmented Collisions: 8699038
8-Bit Segmented Collisions: 4354013
9-Bit Segmented Collisions: 2208362
10-Bit Segmented Collisions: 1131627
11-Bit Segmented Collisions: 586244
12-Bit Segmented Collisions: 299872
13-Bit Segmented Collisions: 154559
14-Bit Segmented Collisions: 83441
15-Bit Segmented Collisions: 47000
16-Bit Segmented Collisions: 29634
17-Bit Segmented Collisions: 20410
18-Bit Segmented Collisions: 4062
19-Bit Segmented Collisions: 2080
20-Bit Segmented Collisions: 1015
21-Bit Segmented Collisions: 493
22-Bit Segmented Collisions: 237
23-Bit Segmented Collisions: 132
24-Bit Segmented Collisions: 60
25-Bit Segmented Collisions: 31
26-Bit Segmented Collisions: 19
27-Bit Segmented Collisions: 7
28-Bit Segmented Collisions: 4
29-Bit Segmented Collisions: 4
30-Bit Segmented Collisions: 2
31-Bit Segmented Collisions: 2
32-Bit Segmented Collisions: 2
The 2 32-bit collisions out of 125 million groups of 8 single-bit flip tests proves the worst instances are rare enough to be considered random occurrences in practical implementations.
The semi-linear collision increments after each truncation are within an acceptable range based on the probability of collisions at each bit size. For example, 200-300 million collisions in the lower 2 bits are expected out of 1 billion total keys, but only up to a few are expected in the full 32 bits.
Compared to CityHash32 v1.1, the speed's approximately 68% faster on average for small keys and 71% faster on average for large keys.
Compared to Lookup3, the speed's approximately 20% faster on average for small keys and 300% faster on average for large keys.
Furthermore, Lookup3 relies on direct pointer type conversions and unaligned memory reading.
Compared to 32-bit Murmur3, the speed's approximately 45% faster on average for small keys and 300% faster on average for large keys.
Compared to XXHash32, the speed's at least 46% faster on average for small keys and 30% faster for large keys.
All speed tests were performed locally on a Pixelbook Go M3 using Debian.