WSP-Hash-64: The Fastest Portable 64-Bit Hashing Algorithm With Aligned Multi-Byte Memory Reading and Good Quality
William Stafford Parsons developed this 64-bit hashing algorithm as a substantial improvement over CityHash64, FarmHash64, 64-bit MurmurHash3 x64 and x86, 64-bit SpookyHashV2 and XXHash64.
Library
Source
#include <stdint.h>
#include <string.h>

struct wsp_hash_64_s {
  uint64_t a;
  uint64_t b;
  uint64_t c;
  uint64_t d;
  uint64_t mix;
  uint64_t mix_offset;
  unsigned long input_count_capture;
};

/* memcpy() avoids the undefined behavior of unaligned or aliasing pointer
   casts and typically compiles to a single load instruction. */
uint32_t wsp_hash_read_32(const uint8_t *input, unsigned long i) {
  uint32_t input_aligned;

  memcpy(&input_aligned, &input[i], sizeof(input_aligned));
  return input_aligned;
}

uint64_t wsp_hash_read_64(const uint8_t *input, unsigned long i) {
  uint64_t input_aligned;

  memcpy(&input_aligned, &input[i], sizeof(input_aligned));
  return input_aligned;
}
uint64_t wsp_hash_64(unsigned long input_count, const uint8_t *input) {
  uint64_t input_aligned_capture_a;
  uint64_t input_aligned_capture_b;
  uint64_t input_aligned_capture_c;
  uint64_t input_aligned_capture_d;
  uint64_t a = 1;
  uint64_t b = 11;
  uint64_t c = 111;
  uint64_t d = 1111;
  uint64_t mix = 1111111111;
  uint64_t mix_offset = 111111111;
  unsigned long input_count_capture = input_count;
  unsigned long i = 0;

  if (input_count >= 32) {
    /* Hash 32-byte blocks with 4 independent accumulators. */
    i = 31;

    while (i < input_count) {
      input_aligned_capture_a = wsp_hash_read_64(input, i - 7);
      input_aligned_capture_b = wsp_hash_read_64(input, i - 15);
      input_aligned_capture_c = wsp_hash_read_64(input, i - 23);
      input_aligned_capture_d = wsp_hash_read_64(input, i - 31);
      mix += input_aligned_capture_a + input_aligned_capture_b
        + input_aligned_capture_c + input_aligned_capture_d;
      a += input_aligned_capture_a + ((a << 30) | (a >> 34)) + mix;
      b += input_aligned_capture_b + ((b << 29) | (b >> 35));
      c += input_aligned_capture_c + ((c << 28) | (c >> 36));
      d += input_aligned_capture_d + ((d << 27) | (d >> 37));
      i += 32;
    }

    if (i >= input_count) {
      i -= 32;
    }

    mix_offset += input_aligned_capture_a + input_aligned_capture_b
      + input_aligned_capture_c + input_aligned_capture_d + a + b + c + d;
    i++;
  }
  if ((input_count - i) >= 16) {
    i += 16;
    input_aligned_capture_a = wsp_hash_read_64(input, i - 16);
    input_aligned_capture_b = wsp_hash_read_64(input, i - 8);
    mix += input_aligned_capture_a + input_aligned_capture_b;
    a += input_aligned_capture_a + ((a << 30) | (a >> 34)) + mix;
    b += input_aligned_capture_b + ((b << 28) | (b >> 36)) + mix;
  }

  if ((input_count - i) >= 8) {
    i += 8;
    input_aligned_capture_a = wsp_hash_read_64(input, i - 8);
    mix += input_aligned_capture_a + a;
    a += ((input_aligned_capture_a << 30) | (input_aligned_capture_a >> 34))
      + mix;
  }

  if (i != input_count) {
    /* Hash the remaining 1 to 7 bytes. */
    input_count -= i;

    if (input_count >= 4) {
      mix += (a << 16) | (a >> 48);
      a += wsp_hash_read_32(input, i) + mix_offset;

      if (input_count != 4) {
        mix += (b << 20) | (b >> 44);

        switch (input_count) {
        case 7:
          b += input[i + 6] << 16;
          /* Fall through. */
        case 6:
          b += input[i + 5] << 8;
          /* Fall through. */
        case 5:
          b += input[i + 4];
        }
      }
    }

    switch (input_count) {
    case 3:
      a += input[i + 2] << 16;
      /* Fall through. */
    case 2:
      a += input[i + 1] << 8;
      /* Fall through. */
    case 1:
      a += input[i] + mix;
    }
  }
  /* Finalization mixing. */
  a += b + mix_offset + input_count_capture;
  mix = a ^ (mix + ((a << 16) | (a >> 48)) + ((c << 44) | (c >> 20)));
  mix_offset += ((a << 40) | (a >> 24)) ^ ((b << 36) | (b >> 28));
  b ^= mix_offset + input_count_capture;
  mix_offset += a ^ (((b << 36) | (b >> 28)) + ((d << 14) | (d >> 50)) + mix);
  a = b + d + mix_offset;
  c += (d << 22) | (d >> 42);
  mix_offset += ((a << 40) | (a >> 24)) ^ b;
  b = a ^ ((mix_offset << 36) | (mix_offset >> 28));
  d = (((a << 26) | (a >> 38)) ^ d) + c + mix;
  return b + d + mix_offset;
}
void wsp_hash_64_initialize(struct wsp_hash_64_s *s) {
  s->a = 1;
  s->b = 11;
  s->c = 111;
  s->d = 1111;
  s->mix = 1111111111;
  s->mix_offset = 111111111;
  s->input_count_capture = 0;
}
void wsp_hash_64_transform(unsigned long i, unsigned long input_count,
  const uint8_t *input,
  struct wsp_hash_64_s *s) {
  uint64_t input_aligned_capture_a;
  uint64_t input_aligned_capture_b;
  uint64_t input_aligned_capture_c;
  uint64_t input_aligned_capture_d;

  s->input_count_capture += input_count;

  if (input_count >= 32) {
    /* Hash 32-byte blocks with 4 independent accumulators. */
    i = 31;

    while (i < input_count) {
      input_aligned_capture_a = wsp_hash_read_64(input, i - 7);
      input_aligned_capture_b = wsp_hash_read_64(input, i - 15);
      input_aligned_capture_c = wsp_hash_read_64(input, i - 23);
      input_aligned_capture_d = wsp_hash_read_64(input, i - 31);
      s->mix += input_aligned_capture_a + input_aligned_capture_b
        + input_aligned_capture_c + input_aligned_capture_d;
      s->a += input_aligned_capture_a + ((s->a << 30) | (s->a >> 34)) + s->mix;
      s->b += input_aligned_capture_b + ((s->b << 29) | (s->b >> 35));
      s->c += input_aligned_capture_c + ((s->c << 28) | (s->c >> 36));
      s->d += input_aligned_capture_d + ((s->d << 27) | (s->d >> 37));
      i += 32;
    }

    if (i >= input_count) {
      i -= 32;
    }

    s->mix_offset += input_aligned_capture_a + input_aligned_capture_b
      + input_aligned_capture_c + input_aligned_capture_d + s->a + s->b + s->c
      + s->d;
    i++;
  }
  if ((input_count - i) >= 16) {
    i += 16;
    input_aligned_capture_a = wsp_hash_read_64(input, i - 16);
    input_aligned_capture_b = wsp_hash_read_64(input, i - 8);
    s->mix += input_aligned_capture_a + input_aligned_capture_b;
    s->a += input_aligned_capture_a + ((s->a << 30) | (s->a >> 34)) + s->mix;
    s->b += input_aligned_capture_b + ((s->b << 28) | (s->b >> 36)) + s->mix;
  }

  if ((input_count - i) >= 8) {
    i += 8;
    input_aligned_capture_a = wsp_hash_read_64(input, i - 8);
    s->mix += input_aligned_capture_a + s->a;
    s->a += ((input_aligned_capture_a << 30) | (input_aligned_capture_a >> 34))
      + s->mix;
  }

  if (i != input_count) {
    /* Hash the remaining 1 to 7 bytes. */
    input_count -= i;

    if (input_count >= 4) {
      s->mix += (s->a << 16) | (s->a >> 48);
      s->a += wsp_hash_read_32(input, i) + s->mix_offset;

      if (input_count != 4) {
        s->mix += (s->b << 20) | (s->b >> 44);

        switch (input_count) {
        case 7:
          s->b += input[i + 6] << 16;
          /* Fall through. */
        case 6:
          s->b += input[i + 5] << 8;
          /* Fall through. */
        case 5:
          s->b += input[i + 4];
        }
      }
    }

    switch (input_count) {
    case 3:
      s->a += input[i + 2] << 16;
      /* Fall through. */
    case 2:
      s->a += input[i + 1] << 8;
      /* Fall through. */
    case 1:
      s->a += input[i] + s->mix;
    }
  }
}
void wsp_hash_64_finalize(struct wsp_hash_64_s *s) {
  s->a += s->b + s->mix_offset + s->input_count_capture;
  s->mix = s->a ^ (s->mix + ((s->a << 16) | (s->a >> 48))
    + ((s->c << 44) | (s->c >> 20)));
  s->mix_offset += ((s->a << 40) | (s->a >> 24))
    ^ ((s->b << 36) | (s->b >> 28));
  s->b ^= s->mix_offset + s->input_count_capture;
  s->mix_offset += s->a ^ (((s->b << 36) | (s->b >> 28))
    + ((s->d << 14) | (s->d >> 50)) + s->mix);
  s->a = s->b + s->d + s->mix_offset;
  s->c += (s->d << 22) | (s->d >> 42);
  s->mix_offset += ((s->a << 40) | (s->a >> 24)) ^ s->b;
  s->b = s->a ^ ((s->mix_offset << 36) | (s->mix_offset >> 28));
  s->d = (((s->a << 26) | (s->a >> 38)) ^ s->d) + s->c + s->mix;
  /* The finalized digest is stored in s->mix. */
  s->mix = s->b + s->d + s->mix_offset;
}
Reference
wsp_hash_64() is the all-at-once hashing function that accepts the 2 following arguments and returns the uint64_t hash digest result.
input_count is the count of elements in the input array.
input is the const uint8_t array to hash.
wsp_hash_64_initialize() is the initialization function that accepts the following argument.
s is a struct wsp_hash_64_s pointer.
wsp_hash_64_transform() is the core hashing loop that accepts the 4 following arguments.
i is the starting index position of elements in the input array.
input_count is the count of elements in the input array. When hashing in split segments, the value must be a multiple of 32, with the exception of the end segment.
input is the const uint8_t array to hash.
s is a struct wsp_hash_64_s pointer.
wsp_hash_64_finalize() is the finalization function that accepts the following argument.
s is a struct wsp_hash_64_s pointer. After finalization, s->mix contains the hash digest result.
The return value data type of wsp_hash_64_initialize(), wsp_hash_64_transform() and wsp_hash_64_finalize() is void.
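For example, the following minimal sketch hashes a 48-byte key as a 32-byte segment followed by a 16-byte end segment, passing each segment's base pointer with a starting index of 0 and assuming the library source above is saved as wsp_hash_64.h, which is the header name the test program below also uses.
#include <stdio.h>
#include "wsp_hash_64.h"

int main(void) {
  uint8_t input[48];
  struct wsp_hash_64_s s;
  unsigned short i = 0;

  while (i != 48) {
    input[i] = (uint8_t) i;
    i++;
  }

  wsp_hash_64_initialize(&s);
  /* Segment lengths must be multiples of 32, except the end segment. */
  wsp_hash_64_transform(0, 32, input, &s);
  wsp_hash_64_transform(0, 16, &input[32], &s);
  wsp_hash_64_finalize(&s);
  /* The finalized digest is in s.mix. */
  printf("%016llx\n", (unsigned long long) s.mix);
  return 0;
}
Per the Explanation section below, the finalized digest in s.mix is consistent with hashing the same 48 bytes all at once.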
Requirements
C compiler with C99 (ISO/IEC 9899:1999) standard compatibility.
CPU with single-threaded, instruction-level parallelism support.
Explanation
WSP-Hash-64 is designed to hash keys of all sizes as quickly as possible with minimal collisions across all truncated bit sizes.
It's the fastest portable 64-bit hashing algorithm with good statistical quality relevant to non-cryptographic hashing.
It's portable for 64-bit systems. There's an alternative 32-bit hashing algorithm for 32-bit systems.
It meets strict compliance, portability and code security requirements.
Multi-byte memory reading is designed primarily for systems with little-endian byte order, although big-endian memory reading is functional with similar quality and speed results.
In rare instances when hash table states must be saved after program termination and re-used in multiple systems with varying endianness, keys should be re-hashed during initialization instead of slowing down runtime hashing with byte order alignment.
In rare instances when byte order is expected to change during runtime, using a one-at-a-time hashing algorithm is a better alternative.
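For contrast, runtime byte order alignment would require a normalized read similar to the following hypothetical sketch, which replaces a single aligned load with 8 byte loads, shifts and ORs. wsp_hash_read_64_le() isn't part of the library and only illustrates the per-read cost being avoided.
#include <stdint.h>

uint64_t wsp_hash_read_64_le(const uint8_t *input, unsigned long i) {
  /* Assemble the value byte by byte so the result is identical on
     little-endian and big-endian systems. */
  return (uint64_t) input[i]
    | ((uint64_t) input[i + 1] << 8)
    | ((uint64_t) input[i + 2] << 16)
    | ((uint64_t) input[i + 3] << 24)
    | ((uint64_t) input[i + 4] << 32)
    | ((uint64_t) input[i + 5] << 40)
    | ((uint64_t) input[i + 6] << 48)
    | ((uint64_t) input[i + 7] << 56);
}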
It doesn't use modulus, multiplication or division arithmetic operations.
It supports unlimited input length by splitting input bytes into descending segments of 256, 128 and 64 bits, plus any remaining 1 to 7 bytes. All-at-once hashing is the faster option, but it isn't required and the digest results are consistent when hashing in partial segments.
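For example, a short key can be hashed all at once with a single call, again assuming the library source is saved as wsp_hash_64.h.
#include <stdio.h>
#include "wsp_hash_64.h"

int main(void) {
  const char *key = "example key";

  /* Hash 11 bytes, excluding the null terminator. */
  printf("%016llx\n",
    (unsigned long long) wsp_hash_64(11, (const uint8_t *) key));
  return 0;
}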
Single-threaded, instruction-level parallelism with low-cost addition and bitwise instructions works well on a wide range of CPU queue loads and devices.
Not allowing a seed prevents quality issues from 2³² additional seeded variants of the hashing algorithm, so all seed-based tests are omitted.
It passes the SMHasher differential test and key set collision tests including Cyclic, Permutation, Sparse, Text, Two-Byte, Windowed and Zeroes.
It passes the extended version of these collision tests as well with both 64-bit output and low 32-bit output.
There aren't any bit distribution calculation percentages exceeding 5% in the worst instances for either 64-bit output or low 32-bit output at either byte order, which suggests there aren't any critical non-cryptographic distribution issues among the aforementioned tests.
The strict avalanche test in SMHasher is omitted because it's only relevant to the analysis of cryptographic hash function properties, where the worst instance exposes a critical cryptographic vulnerability.
Furthermore, the following code counts collisions between truncated digests when each of the 8 bits of a single input byte is flipped, for differentiated byte values ranging from 1 to 255 at all positions, input_count values ranging from 1 to 255 and sparse filler bytes ranging from 0 to 15.
#include <stdio.h>
#include "wsp_hash_64.h"
int main(void) {
  uint8_t input[256];
  uint64_t result = 0;
  uint64_t result_flipped = 1;
  unsigned long bit_collisions_counts[64];
  unsigned short sparse_byte = 0;
  unsigned short i = 0;
  unsigned short j = 0;
  unsigned short k = 0;
  unsigned short l = 0;
  unsigned short m = 0;

  /* Zero the per-truncation collision counters. */
  while (i != 64) {
    bit_collisions_counts[i] = 0;
    i++;
  }

  i = 1;
  while (sparse_byte != 16) {
    while (i != 256) {
      j = 1;

      while (j != 256) {
        k = 0;

        while (k != j) {
          /* Fill the key with the sparse byte, then differentiate it with
             byte value i at position k. */
          while (l != j) {
            input[l] = sparse_byte;
            l++;
          }

          if (sparse_byte == i) {
            k = j;
            l = 0;
            continue;
          }

          input[k] = i;
          result = wsp_hash_64(j, (const uint8_t *) input);
          l = 0;

          while (l != 8) {
            /* Flip one bit, hash, then restore the original byte. */
            input[k] = input[k] ^ (1 << l);
            result_flipped = wsp_hash_64(j, (const uint8_t *) input);
            input[k] = input[k] ^ (1 << l);
            m = 1;

            while (m != 64) {
              /* Compare the low m + 1 bits of both digests. */
              if ((result & ((2UL << m) - 1UL))
                == (result_flipped & ((2UL << m) - 1UL))) {
                bit_collisions_counts[m - 1]++;
              }

              m++;
            }

            l++;
          }

          k++;
          l = 0;
        }

        j++;
        k = 0;
      }

      i++;
    }

    sparse_byte++;
    i = 1;
  }
  i = 2;

  while (i != 65) {
    printf("%2u-Bit Segmented Collisions: %9lu\n",
      i,
      bit_collisions_counts[i - 2]);
    i++;
  }

  return 0;
}
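The test program compiles with any C99 compiler, for example, assuming the library source above is saved as wsp_hash_64.h and the test program as avalanche_test.c:
cc -O2 avalanche_test.c -o avalanche_test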
The following collision results demonstrate a sufficient collision-based avalanche effect in the worst instance for non-cryptographic, universal hashing purposes.
Avalanche Collision Results Among 1 Billion 1-Bit Sparse Keys at Varying Lengths
2-Bit Segmented Collisions: 273414977
3-Bit Segmented Collisions: 138972645
4-Bit Segmented Collisions: 71405334
5-Bit Segmented Collisions: 36949894
6-Bit Segmented Collisions: 19201445
7-Bit Segmented Collisions: 10078202
8-Bit Segmented Collisions: 4920417
9-Bit Segmented Collisions: 2552460
10-Bit Segmented Collisions: 1394152
11-Bit Segmented Collisions: 824942
12-Bit Segmented Collisions: 382725
13-Bit Segmented Collisions: 195935
14-Bit Segmented Collisions: 111663
15-Bit Segmented Collisions: 69770
16-Bit Segmented Collisions: 33368
17-Bit Segmented Collisions: 21773
18-Bit Segmented Collisions: 17170
19-Bit Segmented Collisions: 14915
20-Bit Segmented Collisions: 3837
21-Bit Segmented Collisions: 3295
22-Bit Segmented Collisions: 3001
23-Bit Segmented Collisions: 2882
24-Bit Segmented Collisions: 87
25-Bit Segmented Collisions: 38
26-Bit Segmented Collisions: 26
27-Bit Segmented Collisions: 10
28-Bit Segmented Collisions: 2
29-Bit Segmented Collisions: 2
30-Bit Segmented Collisions: 0
31-Bit Segmented Collisions: 0
32-Bit Segmented Collisions: 0
33-Bit Segmented Collisions: 0
34-Bit Segmented Collisions: 0
35-Bit Segmented Collisions: 0
36-Bit Segmented Collisions: 0
37-Bit Segmented Collisions: 0
38-Bit Segmented Collisions: 0
39-Bit Segmented Collisions: 0
40-Bit Segmented Collisions: 0
41-Bit Segmented Collisions: 0
42-Bit Segmented Collisions: 0
43-Bit Segmented Collisions: 0
44-Bit Segmented Collisions: 0
45-Bit Segmented Collisions: 0
46-Bit Segmented Collisions: 0
47-Bit Segmented Collisions: 0
48-Bit Segmented Collisions: 0
49-Bit Segmented Collisions: 0
50-Bit Segmented Collisions: 0
51-Bit Segmented Collisions: 0
52-Bit Segmented Collisions: 0
53-Bit Segmented Collisions: 0
54-Bit Segmented Collisions: 0
55-Bit Segmented Collisions: 0
56-Bit Segmented Collisions: 0
57-Bit Segmented Collisions: 0
58-Bit Segmented Collisions: 0
59-Bit Segmented Collisions: 0
60-Bit Segmented Collisions: 0
61-Bit Segmented Collisions: 0
62-Bit Segmented Collisions: 0
63-Bit Segmented Collisions: 0
64-Bit Segmented Collisions: 0
Zero 64-bit collisions out of 125 million groups of 8 single-bit flip tests indicates the worst instances are rare enough to be considered random occurrences in practical implementations.
The semi-linear collision increments after each truncation are within an acceptable range based on the probability of collisions at each bit size. For example, 200-300 million collisions in the lower 2 bits are expected out of 1 billion total keys, but only up to a few are expected in the range of 32 to 64 bits.
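As a rough reference, assuming idealized uniform and independent digests, the expected number of matches between the low b bits of a digest pair across the roughly 1 billion flip tests is 10⁹ / 2^b. The following hypothetical sketch tabulates this estimate at several truncation widths.
#include <stdio.h>

int main(void) {
  /* 125 million groups of 8 single-bit flips produce roughly 1e9 pairs. */
  double tests = 1e9;
  unsigned int b = 2;

  while (b <= 62) {
    /* Two independent uniform b-bit values match with probability 2^-b. */
    printf("%2u-bit expected collisions: %.2f\n", b,
      tests / (double) (1ULL << b));
    b += 10;
  }

  return 0;
}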
It's compared with the fastest portable 64-bit hashing algorithms that aren't limited by specific programming languages, platform optimizations or portability macros. For example, it's 75% to 300% faster than RapidHash without platform-specific instructions. Furthermore, the expressions in the core mixing loop are structured conveniently for compatibility with SIMD and other intrinsics.
Compared to CityHash64, it's approximately 21% faster on average for small keys and 19% faster on average for large keys.
Compared to FarmHash64, it's approximately 16% faster on average for small keys and 18% faster on average for large keys.
Compared to 64-bit MurmurHash3 x64 and x86, it's approximately 25% faster on average for small keys and 145% faster on average for large keys.
Compared to 64-bit SpookyHashV2, it's approximately 40% faster on average for small keys and 22% faster on average for large keys.
Compared to XXHash64, it's at least 26% faster on average for small keys and 21% faster on average for large keys.
All speed tests were performed locally on a Pixelbook Go M3 using Debian.