From c7d94a66b0ac3c39abb4c0e007c699f93165f59b Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 27 Jun 2020 17:01:31 +0200 Subject: [PATCH] Bump xxh3 to 0.7.4. --- server/tracy_xxh3.h | 1465 +++++++++++++++++++++++++++-------------- server/tracy_xxhash.c | 43 -- server/tracy_xxhash.h | 509 ++++++++------ 3 files changed, 1274 insertions(+), 743 deletions(-) delete mode 100644 server/tracy_xxhash.c diff --git a/server/tracy_xxh3.h b/server/tracy_xxh3.h index 291c4a9d..c132981b 100644 --- a/server/tracy_xxh3.h +++ b/server/tracy_xxh3.h @@ -158,14 +158,17 @@ /* ========================================== * Vectorization detection * ========================================== */ -#define XXH_SCALAR 0 /* Portable scalar version */ -#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ -#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ -#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ -#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */ +#define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */ #ifndef XXH_VECTOR /* can be defined on command line */ -# if defined(__AVX2__) +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) # define XXH_VECTOR XXH_AVX2 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) # define XXH_VECTOR XXH_SSE2 @@ -184,11 +187,13 @@ #endif /* - * Controls the alignment of the accumulator. - * This is for compatibility with aligned vector loads, which are usually faster. + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. */ #ifndef XXH_ACC_ALIGN -# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ # define XXH_ACC_ALIGN 8 # elif XXH_VECTOR == XXH_SSE2 /* sse2 */ # define XXH_ACC_ALIGN 16 @@ -198,9 +203,18 @@ # define XXH_ACC_ALIGN 16 # elif XXH_VECTOR == XXH_VSX /* vsx */ # define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 # endif #endif +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + /* * UGLY HACK: * GCC usually generates the best code with -O3 for xxHash. @@ -402,7 +416,7 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) /* s390x is always big endian, no issue on this platform */ # define XXH_vec_mulo vec_mulo # define XXH_vec_mule vec_mule -# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) /* Clang has a better way to control this, we can just use the builtin which doesn't swap. 
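 *
 * For reference, what vec_mule / vec_mulo compute, as a scalar sketch
 * (not part of the patch): given two xxh_u32x4 vectors a and b, in the
 * platform's element numbering,
 *     mule[i] = (xxh_u64)a[2*i]   * (xxh_u64)b[2*i];    (even lanes)
 *     mulo[i] = (xxh_u64)a[2*i+1] * (xxh_u64)b[2*i+1];  (odd lanes)
 * i.e. widening 32x32->64 multiplies, which is why a compiler-swapped
 * element order changes the result and these wrappers are needed.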
*/ # define XXH_vec_mulo __builtin_altivec_vmulouw # define XXH_vec_mule __builtin_altivec_vmuleuw @@ -452,7 +466,7 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) #endif /* Pseudorandom secret taken directly from FARSH */ -XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, @@ -461,13 +475,17 @@ XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, - 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, }; + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + /* * Calculates a 32-bit to 64-bit long multiply. * @@ -525,8 +543,10 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) && defined(__SIZEOF_INT128__) \ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; - XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) }; + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); return r128; /* @@ -543,7 +563,9 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) #endif xxh_u64 product_high; xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); - XXH128_hash_t const r128 = { product_low, product_high }; + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; return r128; #else @@ -601,7 +623,9 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - XXH128_hash_t r128 = { lower, upper }; + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; return r128; #endif } @@ -627,9 +651,8 @@ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) } /* - * We don't need to (or want to) mix as much as XXH64. - * - * Short hashes are more evenly distributed, so it isn't necessary. 
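 *
 * (Context for this hunk: an "avalanche" stage spreads entropy so that
 *  flipping any single input bit flips roughly half of the output bits.
 *  Assuming the unchanged v0.7.x body, XXH3_avalanche below is a
 *  xorshift / multiply / xorshift sequence:
 *      h64 ^= h64 >> 37;
 *      h64 *= 0x165667919E3779F9ULL;
 *      h64 ^= h64 >> 32;
 *  while XXH3_rrmxmx, added just after, spends two rotates and two
 *  multiplies for a stronger mix of raw, unmixed input.)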
+ * This is a fast avalanche stage, + * suitable when input bits are already partially mixed */ static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) { @@ -639,6 +662,21 @@ static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) return h64; } +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= 0x9FB21C651E98DF25ULL; + h64 ^= (h64 >> 35) + len ; + h64 *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(h64, 28); +} + /* ========================================== * Short keys @@ -684,15 +722,14 @@ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h * len = 2: combined = { input[1], 0x02, input[0], input[1] } * len = 3: combined = { input[2], 0x03, input[0], input[1] } */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; - xxh_u64 const mixed = keyed * PRIME64_1; - return XXH3_avalanche(mixed); + return XXH64_avalanche(keyed); } } @@ -707,13 +744,8 @@ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h xxh_u32 const input2 = XXH_readLE32(input + len - 4); xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); - xxh_u64 x = input64 ^ bitflip; - /* this mix is inspired by Pelle Evensen's rrmxmx */ - x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); - x *= 0x9FB21C651E98DF25ULL; - x ^= (x >> 35) + len ; - x *= 0x9FB21C651E98DF25ULL; - return XXH_xorshift64(x, 28); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); } } @@ -741,7 +773,7 @@ XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); if (len) return XXH3_len_1to3_64b(input, len, secret, seed); - return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); } } @@ -812,7 +844,7 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; XXH_ASSERT(16 < len && len <= 128); - { xxh_u64 acc = len * PRIME64_1; + { xxh_u64 acc = len * XXH_PRIME64_1; if (len > 32) { if (len > 64) { if (len > 96) { @@ -845,7 +877,7 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, #define XXH3_MIDSIZE_STARTOFFSET 3 #define XXH3_MIDSIZE_LASTOFFSET 17 - { xxh_u64 acc = len * PRIME64_1; + { xxh_u64 acc = len * XXH_PRIME64_1; int const nbRounds = (int)len / 16; int i; for (i=0; i<8; i++) { @@ -888,13 +920,36 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, } -/* === Long Keys === */ +/* ======= Long Keys ======= */ -#define STRIPE_LEN 64 +#define XXH_STRIPE_LEN 64 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation 
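 * (concretely: stripe n of a block reads XXH_STRIPE_LEN = 64 input
 * bytes and is keyed with secret bytes [8*n, 8*n + 64), so consecutive
 * stripes reuse 56 of the previous stripe's secret bytes)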
*/ -#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) -typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif /* * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. @@ -918,174 +973,39 @@ typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; * * Both XXH3_64bits and XXH3_128bits use this subroutine. */ -XXH_FORCE_INLINE void -XXH3_accumulate_512( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret, - XXH3_accWidth_e accWidth) + +#if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) { -#if (XXH_VECTOR == XXH_AVX2) + XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); - XXH_ASSERT((((size_t)acc) & 31) == 0); - { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xinput = (const __m256i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
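 *
 * (Every XXH3_accumulate_512 variant performs the same per-lane update,
 *  shown in scalar form by XXH3_accumulate_512_scalar further down:
 *      xxh_u64 const data_key = input64 ^ secret64;
 *      acc[i ^ 1] += input64;                        (lane swap)
 *      acc[i]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);
 *  the SSE2/AVX2/AVX512/NEON/VSX code merely executes 2, 4 or 8 such
 *  lanes per instruction.)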
*/ - const __m256i* const xsecret = (const __m256i *) secret; - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { - /* data_vec = xinput[i]; */ - __m256i const data_vec = _mm256_loadu_si256 (xinput+i); - /* key_vec = xsecret[i]; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); - if (accWidth == XXH3_acc_128bits) { - /* xacc[i] += swap(data_vec); */ - __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm256_add_epi64(product, sum); - } else { /* XXH3_acc_64bits */ - /* xacc[i] += data_vec; */ - __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); - /* xacc[i] += product; */ - xacc[i] = _mm256_add_epi64(product, sum); - } - } } - -#elif (XXH_VECTOR == XXH_SSE2) - - /* SSE2 is just a half-scale version of the AVX2 version. */ - XXH_ASSERT((((size_t)acc) & 15) == 0); - { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xinput = (const __m128i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xsecret = (const __m128i *) secret; - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { - /* data_vec = xinput[i]; */ - __m128i const data_vec = _mm_loadu_si128 (xinput+i); - /* key_vec = xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); - if (accWidth == XXH3_acc_128bits) { - /* xacc[i] += swap(data_vec); */ - __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); - __m128i const sum = _mm_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm_add_epi64(product, sum); - } else { /* XXH3_acc_64bits */ - /* xacc[i] += data_vec; */ - __m128i const sum = _mm_add_epi64(xacc[i], data_vec); - /* xacc[i] += product; */ - xacc[i] = _mm_add_epi64(product, sum); - } - } } - -#elif (XXH_VECTOR == XXH_NEON) - - XXH_ASSERT((((size_t)acc) & 15) == 0); { - XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; - /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ - uint8_t const* const xinput = (const uint8_t *) input; - uint8_t const* const xsecret = (const uint8_t *) secret; - - size_t i; - for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { - /* data_vec = xinput[i]; */ - uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); - /* key_vec = xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); - /* data_key = data_vec ^ key_vec; */ - uint64x2_t data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); - uint32x2_t data_key_lo, data_key_hi; - if (accWidth == XXH3_acc_64bits) { - /* xacc[i] += data_vec; */ - xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); - } else { /* XXH3_acc_128bits */ - /* xacc[i] += swap(data_vec); */ - uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); - uint64x2_t const swapped = vextq_u64(data64, data64, 1); - xacc[i] = vaddq_u64 (xacc[i], swapped); - } - /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); - * data_key_hi = (uint32x2_t) (data_key >> 32); - * data_key = UNDEFINED; */ - XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); - /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ - xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); - - } + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); } - -#elif (XXH_VECTOR == XXH_VSX) - xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ - xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ - xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ - xxh_u64x2 const v32 = { 32, 32 }; - size_t i; - for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { - /* data_vec = xinput[i]; */ - xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); - /* key_vec = xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - /* shuffled = (data_key << 32) | (data_key >> 32); */ - xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); - /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ - xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); - xacc[i] += product; - - if (accWidth == XXH3_acc_64bits) { - xacc[i] += data_vec; - } else { /* XXH3_acc_128bits */ - /* swap high and low halves */ -#ifdef __s390x__ - xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); -#else - xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); -#endif - xacc[i] += data_swapped; - } - } - -#else /* scalar variant of Accumulator - universal */ - - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ - const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ - const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment 
restriction */ - size_t i; - XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); - for (i=0; i < ACC_NB; i++) { - xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); - xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); - - if (accWidth == XXH3_acc_64bits) { - xacc[i] += data_val; - } else { - xacc[i ^ 1] += data_val; /* swap adjacent lanes */ - } - xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); - } -#endif } /* @@ -1108,20 +1028,109 @@ XXH3_accumulate_512( void* XXH_RESTRICT acc, * * Both XXH3_64bits and XXH3_128bits use this subroutine. */ -XXH_FORCE_INLINE void -XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ -#if (XXH_VECTOR == XXH_AVX2) +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64); + + XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret; + XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret; + int i; + for (i=0; i < nbRounds; ++i) { + // GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*', + // this will warn "discards ‘const’ qualifier". + union { + XXH_ALIGN(64) const __m512i* const cp; + XXH_ALIGN(64) void* const p; + } const remote_const_void = { .cp = src + i }; + dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ XXH_ASSERT((((size_t)acc) & 31) == 0); { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ const __m256i* const xsecret = (const __m256i *) secret; - const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { /* xacc[i] ^= (xacc[i] >> 47) */ __m256i const acc_vec = xacc[i]; __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); @@ -1130,25 +1139,105 @@ XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - /* xacc[i] *= PRIME32_1; */ + /* xacc[i] *= XXH_PRIME32_1; */ __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); } } +} -#elif (XXH_VECTOR == XXH_SSE2) +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64); + XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret; + XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. 
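 *
 * (What every initCustomSecret variant computes, in scalar form, for
 *  each 16-byte block i of the default secret:
 *      lo = XXH_readLE64(XXH3_kSecret + 16*i)     + seed64;
 *      hi = XXH_readLE64(XXH3_kSecret + 16*i + 8) - seed64;
 *  which is exactly the (seed64, -seed64) lane pattern passed to
 *  _mm256_set_epi64x() above; see XXH3_initCustomSecret_scalar below.)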
+ */ + __asm__("" : "+r" (dest)); +# endif + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); + } +} + +#endif + +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ XXH_ASSERT((((size_t)acc) & 15) == 0); { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
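 *
 * (Per 64-bit lane, every scramble variant computes, in scalar form,
 *  see XXH3_scrambleAcc_scalar below:
 *      acc64 ^= acc64 >> 47;      xorshift: fold the high bits down
 *      acc64 ^= secret64;         mix in the secret
 *      acc64 *= XXH_PRIME32_1;    multiply by a 32-bit odd prime
 *  the pair of _mm_mul_epu32() calls plus shift/add reconstructs that
 *  64x32-bit multiply from 32-bit halves.)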
*/ const __m128i* const xsecret = (const __m128i *) secret; - const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { /* xacc[i] ^= (xacc[i] >> 47) */ __m128i const acc_vec = xacc[i]; __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); @@ -1157,24 +1246,98 @@ XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) __m128i const key_vec = _mm_loadu_si128 (xsecret+i); __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - /* xacc[i] *= PRIME32_1; */ + /* xacc[i] *= XXH_PRIME32_1; */ __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); } } +} -#elif (XXH_VECTOR == XXH_NEON) +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + // MSVC 32bit mode does not support _mm_set_epi64x before 2015 + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64); +# endif + int i; + + XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret; + XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + __asm__("" : "+r" (dest)); +# endif + + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ XXH_ASSERT((((size_t)acc) & 15) == 0); { uint64x2_t* xacc = (uint64x2_t*) acc; uint8_t const* xsecret = (uint8_t const*) secret; - uint32x2_t prime = vdup_n_u32 (PRIME32_1); + uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); size_t i; - for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) { /* xacc[i] ^= (xacc[i] >> 47); */ uint64x2_t acc_vec = xacc[i]; uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); @@ -1184,14 +1347,14 @@ XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); - /* xacc[i] *= PRIME32_1 */ + /* xacc[i] *= XXH_PRIME32_1 */ uint32x2_t data_key_lo, data_key_hi; /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); * data_key_hi = (uint32x2_t) (xacc[i] >> 32); * xacc[i] = UNDEFINED; */ XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); { /* - * prod_hi = (data_key >> 32) * PRIME32_1; + * prod_hi = (data_key >> 32) * XXH_PRIME32_1; * * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will * incorrectly "optimize" this: @@ -1211,13 +1374,51 @@ XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); /* xacc[i] = prod_hi << 32; */ xacc[i] = vshlq_n_u64(prod_hi, 32); - /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); } } } +} -#elif (XXH_VECTOR == XXH_VSX) +#endif +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ 
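 /* (Why this works: within each 64-bit element, `shuffled` below holds
  *  the two 32-bit halves of data_key swapped, so whichever lane parity
  *  XXH_vec_mulo selects, it multiplies one half of data_key by the
  *  other half, i.e. the same low32 x high32 widening multiply that
  *  _mm_mul_epu32 computes on x86.) */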
+ xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ XXH_ASSERT((((size_t)acc) & 15) == 0); { xxh_u64x2* const xacc = (xxh_u64x2*) acc; @@ -1225,9 +1426,9 @@ XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) /* constants */ xxh_u64x2 const v32 = { 32, 32 }; xxh_u64x2 const v47 = { 47, 47 }; - xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; size_t i; - for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { /* xacc[i] ^= (xacc[i] >> 47); */ xxh_u64x2 const acc_vec = xacc[i]; xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); @@ -1236,33 +1437,175 @@ XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); xxh_u64x2 const data_key = data_vec ^ key_vec; - /* xacc[i] *= PRIME32_1 */ + /* xacc[i] *= XXH_PRIME32_1 */ /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); xacc[i] = prod_odd + (prod_even << v32); } } +} -#else /* scalar variant of Scrambler - universal */ +#endif +/* scalar variants - universal */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < XXH_ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ size_t i; XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); - for (i=0; i < ACC_NB; i++) { + for (i=0; i < XXH_ACC_NB; i++) { xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); xxh_u64 acc64 = xacc[i]; acc64 = XXH_xorshift64(acc64, 47); acc64 ^= key64; - acc64 *= PRIME32_1; + acc64 *= XXH_PRIME32_1; xacc[i] = acc64; } - -#endif } -#define XXH_PREFETCH_DIST 384 +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * 
which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == XXH3_kSecret); + + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + + + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ /* * XXH3_accumulate() @@ -1274,50 +1617,48 @@ XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, const xxh_u8* 
XXH_RESTRICT input, const xxh_u8* XXH_RESTRICT secret, size_t nbStripes, - XXH3_accWidth_e accWidth) + XXH3_f_accumulate_512 f_acc512) { size_t n; for (n = 0; n < nbStripes; n++ ) { - const xxh_u8* const in = input + n*STRIPE_LEN; + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; XXH_PREFETCH(in + XXH_PREFETCH_DIST); - XXH3_accumulate_512(acc, - in, - secret + n*XXH_SECRET_CONSUME_RATE, - accWidth); + f_acc512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE); } } XXH_FORCE_INLINE void -XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_accWidth_e accWidth) + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) { - size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - size_t const block_len = STRIPE_LEN * nb_rounds; - size_t const nb_blocks = len / block_len; + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; size_t n; XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); for (n = 0; n < nb_blocks; n++) { - XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); - XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); } /* last partial block */ - XXH_ASSERT(len > STRIPE_LEN); - { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); - XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); /* last stripe */ - if (len & (STRIPE_LEN - 1)) { - const xxh_u8* const p = input + len - STRIPE_LEN; - /* Do not align on 8, so that the secret is different from the scrambler */ -#define XXH_SECRET_LASTACC_START 7 - XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); } } } @@ -1333,81 +1674,79 @@ static XXH64_hash_t XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) { xxh_u64 result64 = start; + size_t i = 0; - result64 += XXH3_mix2Accs(acc+0, secret + 0); - result64 += XXH3_mix2Accs(acc+2, secret + 16); - result64 += XXH3_mix2Accs(acc+4, secret + 32); - result64 += XXH3_mix2Accs(acc+6, secret + 48); + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } return XXH3_avalanche(result64); } -#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ - PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }; +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize) + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, f_acc512, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); /* do not align on 8, so that the secret is different from the accumulator */ #define XXH_SECRET_MERGEACCS_START 11 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); -} - -XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) -{ - if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); - memcpy(dst, &v64, sizeof(v64)); -} - -/* XXH3_initCustomSecret() : - * destination `customSecret` is presumed allocated and same size as `kSecret`. - */ -XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64) -{ - int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; - int i; - - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - - for (i=0; i < nbRounds; i++) { - XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64); - XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64); - } -} - - -/* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) -{ - return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); } /* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. + * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize) + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { - return XXH3_hashLong_64b_internal(input, len, secret, secretSize); + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + * Since the function is not inlined, the compiler may not be able to understand that, + * in some scenarios, its `secret` argument is actually a compile time constant. 
+ * This variant enforces that the compiler can detect that, + * and uses this opportunity to streamline the generated code for better performance. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_default(const xxh_u8* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); } /* * XXH3_hashLong_64b_withSeed(): - * Generate a custom key based on alteration of default kSecret with the seed, + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, * and then use this key for long mode hashing. * * This operation is decently fast but nonetheless costs a little bit of time. @@ -1416,61 +1755,84 @@ XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, * It's important for performance that XXH3_hashLong is not inlined. Not sure * why (uop cache maybe?), but the difference is large and easily measurable. */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const xxh_u8* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) { - XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); - XXH3_initCustomSecret(secret, seed); - return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc512, f_scramble); + } } +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, + XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const xxh_u8* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. 
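 *
 * (Dispatch summary: lengths 0-16 use the one-shot mixers, 17-128 the
 *  unrolled midrange path, 129-240 (XXH3_MIDSIZE_MAX) the extended
 *  midrange path, and anything longer goes through f_hashLong, i.e.
 *  the striped accumulate/scramble loop.)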
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong((const xxh_u8*)input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + /* === Public entry point === */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) { - if (len <= 16) - return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); - if (len <= 128) - return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); + return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); } XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) { - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secret` conditions are not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. - */ - if (len <= 16) - return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); - if (len <= 128) - return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); + return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); } XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) { - if (len <= 16) - return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); - if (len <= 128) - return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); + return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); } -/* === XXH3 streaming === */ +/* === XXH3 streaming === */ /* * Malloc's a pointer that is always aligned to align. 
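 *
 * (For reference, one-shot use of the public entry points defined just
 *  above; a minimal sketch, with `buf`, `len`, `seed`, `secret` and
 *  `secretSize` invented for illustration:
 *
 *      XXH64_hash_t const h1 = XXH3_64bits(buf, len);
 *      XXH64_hash_t const h2 = XXH3_64bits_withSeed(buf, len, seed);
 *      XXH64_hash_t const h3 = XXH3_64bits_withSecret(buf, len,
 *                                                secret, secretSize);
 *
 *  the withSecret variant requires secretSize >= XXH3_SECRET_SIZE_MIN.)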
@@ -1561,19 +1923,19 @@ XXH3_64bits_reset_internal(XXH3_state_t* statePtr, { XXH_ASSERT(statePtr != NULL); memset(statePtr, 0, sizeof(*statePtr)); - statePtr->acc[0] = PRIME32_3; - statePtr->acc[1] = PRIME64_1; - statePtr->acc[2] = PRIME64_2; - statePtr->acc[3] = PRIME64_3; - statePtr->acc[4] = PRIME64_4; - statePtr->acc[5] = PRIME32_2; - statePtr->acc[6] = PRIME64_5; - statePtr->acc[7] = PRIME32_1; + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; statePtr->seed = seed; XXH_ASSERT(secret != NULL); - statePtr->secret = secret; + statePtr->extSecret = secret; XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; } @@ -1581,7 +1943,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr) { if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } @@ -1599,30 +1961,36 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) { if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_64bits_reset_internal(statePtr, seed, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); XXH3_initCustomSecret(statePtr->customSecret, seed); - statePtr->secret = statePtr->customSecret; + statePtr->extSecret = NULL; return XXH_OK; } +/* Note : when XXH3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte must be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ XXH_FORCE_INLINE void -XXH3_consumeStripes( xxh_u64* acc, - XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, - const xxh_u8* input, size_t totalStripes, - const xxh_u8* secret, size_t secretLimit, - XXH3_accWidth_e accWidth) +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) { + XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); - if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { /* need a scrambling operation */ - size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); - XXH3_scrambleAcc(acc, secret + secretLimit); - XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); - *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, 
nbStripesToEndofBlock, f_acc512); + f_scramble(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); + *nbStripesSoFarPtr = nbStripesAfterBlock; } else { - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); - *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + *nbStripesSoFarPtr += nbStripes; } } @@ -1630,7 +1998,10 @@ XXH3_consumeStripes( xxh_u64* acc, * Both XXH3_64bits_update and XXH3_128bits_update use this routine. */ XXH_FORCE_INLINE XXH_errorcode -XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +XXH3_update(XXH3_state_t* state, + const xxh_u8* input, size_t len, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) { if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) @@ -1640,6 +2011,7 @@ XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_ #endif { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; state->totalLen += len; @@ -1648,14 +2020,14 @@ XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_ state->bufferedSize += (XXH32_hash_t)len; return XXH_OK; } - /* input is now > XXH3_INTERNALBUFFER_SIZE */ + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ - #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) - XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ /* - * There is some input left inside the internal buffer. - * Fill it, then consume it. + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. 
 */
    if (state->bufferedSize) {
        size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
@@ -1664,28 +2036,31 @@ XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_
            XXH3_consumeStripes(state->acc,
                               &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
-                               state->secret, state->secretLimit,
-                               accWidth);
+                               secret, state->secretLimit,
+                               f_acc512, f_scramble);
            state->bufferedSize = 0;
        }
+       XXH_ASSERT(input < bEnd);
-       /* Consume input by full buffer quantities */
-       if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
+       /* Consume input by a multiple of internal buffer size */
+       if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
            do {
                XXH3_consumeStripes(state->acc,
                                   &state->nbStripesSoFar, state->nbStripesPerBlock,
                                    input, XXH3_INTERNALBUFFER_STRIPES,
-                                   state->secret, state->secretLimit,
-                                   accWidth);
+                                   secret, state->secretLimit,
+                                   f_acc512, f_scramble);
                input += XXH3_INTERNALBUFFER_SIZE;
-           } while (input<=limit);
+           } while (input<limit);
+           /* for last partial stripe */
+           memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
        }
+       XXH_ASSERT(input < bEnd);
-       if (input < bEnd) { /* Some remaining input: buffer it */
-           XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
-           state->bufferedSize = (XXH32_hash_t)(bEnd-input);
-       }
+       /* Some remaining input (always): buffer it */
+       XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+       state->bufferedSize = (XXH32_hash_t)(bEnd-input);
    }

    return XXH_OK;
@@ -1694,61 +2069,108 @@ XXH_PUBLIC_API XXH_errorcode
 XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
 {
-    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits);
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate_512, XXH3_scrambleAcc);
 }

 XXH_FORCE_INLINE void
-XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
 {
    /*
     * Digest on a local copy. This way, the state remains unaltered, and it can
     * continue ingesting more input afterwards.
*/ memcpy(acc, state->acc, sizeof(state->acc)); - if (state->bufferedSize >= STRIPE_LEN) { - size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; - XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + if (state->bufferedSize >= XXH_STRIPE_LEN) { + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, totalNbStripes, - state->secret, state->secretLimit, - accWidth); - if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ - XXH3_accumulate_512(acc, - state->buffer + state->bufferedSize - STRIPE_LEN, - state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, - accWidth); - } - } else { /* bufferedSize < STRIPE_LEN */ - if (state->bufferedSize) { /* one last stripe */ - xxh_u8 lastStripe[STRIPE_LEN]; - size_t const catchupSize = STRIPE_LEN - state->bufferedSize; - memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); - memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); - XXH3_accumulate_512(acc, - lastStripe, - state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, - accWidth); - } } + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate_512, XXH3_scrambleAcc); + /* last stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - XXH_STRIPE_LEN, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } else { /* bufferedSize < XXH_STRIPE_LEN */ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } } XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) { + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; - XXH3_digest_long(acc, state, XXH3_acc_64bits); + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); return XXH3_mergeAccs(acc, - state->secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * PRIME64_1); + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); } - /* len <= XXH3_MIDSIZE_MAX: short code */ + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ if (state->seed) return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), - state->secret, state->secretLimit + STRIPE_LEN); + secret, state->secretLimit + XXH_STRIPE_LEN); } + +#define XXH_MIN(x, y) (((x) > (y)) ? 
(y) : (x)) + +XXH_PUBLIC_API void +XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize) +{ + XXH_ASSERT(secretBuffer != NULL); + if (customSeedSize == 0) { + memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return; + } + XXH_ASSERT(customSeed != NULL); + + { size_t const segmentSize = sizeof(XXH128_hash_t); + size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize; + XXH128_canonical_t scrambler; + XXH64_hash_t seeds[12]; + size_t segnb; + XXH_ASSERT(nbSegments == 12); + XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */ + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + + /* + * Copy customSeed to seeds[], truncating or repeating as necessary. + */ + { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds)); + size_t filled = toFill; + memcpy(seeds, customSeed, toFill); + while (filled < sizeof(seeds)) { + toFill = XXH_MIN(filled, sizeof(seeds) - filled); + memcpy((char*)seeds + filled, seeds, toFill); + filled += toFill; + } } + + /* generate secret */ + memcpy(secretBuffer, &scrambler, sizeof(scrambler)); + for (segnb=1; segnb < nbSegments; segnb++) { + size_t const segmentStart = segnb * segmentSize; + XXH128_canonical_t segment; + XXH128_canonicalFromHash(&segment, + XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) ); + memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment)); + } } +} + + /* ========================================== * XXH3 128 bits (a.k.a XXH128) * ========================================== @@ -1788,9 +2210,9 @@ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; - xxh_u64 const mixedl = keyed_lo * PRIME64_1; - xxh_u64 const mixedh = keyed_hi * PRIME64_5; - XXH128_hash_t const h128 = { XXH3_avalanche(mixedl) /*low64*/, XXH3_avalanche(mixedh) /*high64*/ }; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); return h128; } } @@ -1809,7 +2231,7 @@ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ xxh_u64 const keyed = input_64 ^ bitflip; /* Shift len to the left to ensure it is even, this avoids even multiplies. */ - XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); m128.high64 += (m128.low64 << 1); m128.low64 ^= (m128.high64 >> 3); @@ -1832,7 +2254,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; xxh_u64 const input_lo = XXH_readLE64(input); xxh_u64 input_hi = XXH_readLE64(input + len - 8); - XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); /* * Put len in the middle of m128 to ensure that the length gets mixed to * both the low and high bits in the 128x64 multiply below. 
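XXH3_generateSecret(), defined a few hunks above, fills the caller's buffer by hashing the whole customSeed into a 128-bit scrambler, replicating the seed material into a small seeds[] table, and then emitting roughly one canonical XXH128 value per 16-byte segment. A usage sketch, assuming XXH3_SECRET_DEFAULT_SIZE (192 in this version) is visible to the caller through the advanced declarations:

    /* Derive a proper secret from weak seed material, then hash with it. */
    static XXH64_hash_t hash_with_generated_secret(const void* data, size_t len)
    {
        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
        static const char weakSeed[] = "not random at all";

        /* Low-entropy input is fine here: the generator scrambles it
         * through XXH128 before writing any secret bytes. */
        XXH3_generateSecret(secret, weakSeed, sizeof(weakSeed));

        /* sizeof(secret) is 192, comfortably >= XXH3_SECRET_SIZE_MIN (136). */
        return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
    }
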
@@ -1841,7 +2263,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 input_hi ^= bitfliph; /* * Add the high 32 bits of input_hi to the high 32 bits of m128, then - * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to * the high 64 bits of m128. * * The best approach to this operation is different on 32-bit and 64-bit. @@ -1853,7 +2275,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 * On 32-bit, it removes an ADC and delays a dependency between the two * halves of m128.high64, but it generates an extra mask on 64-bit. */ - m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); } else { /* * 64-bit optimized (albeit more confusing) version. @@ -1863,7 +2285,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 * Let: * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) - * c = PRIME32_2 + * c = XXH_PRIME32_2 * * a + (b * c) * Inverse Property: x + y - x == y @@ -1874,19 +2296,19 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 * a + b + (b * (c - 1)) * * Substitute a, b, and c: - * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) * * Since input_hi.hi + input_hi.lo == input_hi, we get this: - * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) */ - m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); } /* m128 ^= XXH_swap64(m128 >> 64); */ m128.low64 ^= XXH_swap64(m128.high64); - { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ - XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); - h128.high64 += m128.high64 * PRIME64_2; + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; h128.low64 = XXH3_avalanche(h128.low64); h128.high64 = XXH3_avalanche(h128.high64); @@ -1907,8 +2329,8 @@ XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 { XXH128_hash_t h128; xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); - h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); - h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); return h128; } } } @@ -1937,7 +2359,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_ASSERT(16 < len && len <= 128); { XXH128_hash_t acc; - acc.low64 = len * PRIME64_1; + acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; if (len > 32) { if (len > 64) { @@ -1949,11 +2371,13 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); } acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); - { xxh_u64 const low64 = acc.low64 + acc.high64; - xxh_u64 const high64 = (acc.low64 * PRIME64_1) - + (acc.high64 * PRIME64_4) - + ((len - seed) * 
PRIME64_2); - XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) }; + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); return h128; } } @@ -1970,7 +2394,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, { XXH128_hash_t acc; int const nbRounds = (int)len / 32; int i; - acc.low64 = len * PRIME64_1; + acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; for (i=0; i<4; i++) { acc = XXH128_mix32B(acc, @@ -1996,11 +2420,13 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0ULL - seed); - { xxh_u64 const low64 = acc.low64 + acc.high64; - xxh_u64 const high64 = (acc.low64 * PRIME64_1) - + (acc.high64 * PRIME64_4) - + ((len - seed) * PRIME64_2); - XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) }; + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); return h128; } } @@ -2008,77 +2434,94 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize) + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, f_acc512, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { xxh_u64 low64 = XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)len * PRIME64_1); - xxh_u64 high64 = XXH3_mergeAccs(acc, - secret + secretSize - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((xxh_u64)len * PRIME64_2)); - XXH128_hash_t h128 = { low64, high64 }; + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * XXH_PRIME64_2)); return h128; } } /* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. + * It's important for performance that XXH3_hashLong is not inlined. 
*/ XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) +XXH3_hashLong_128b_default(const xxh_u8* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { - return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret)); + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate_512, XXH3_scrambleAcc); } /* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. + * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, - const xxh_u8* secret, size_t secretSize) +XXH3_hashLong_128b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { - return XXH3_hashLong_128b_internal(input, len, secret, secretSize); + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, secret, secretLen, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret), + f_acc512, f_scramble); + } } /* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. + * It's important for performance that XXH3_hashLong is not inlined. 
*/ XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { - XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); - XXH3_initCustomSecret(secret, seed); - return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); } +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const xxh_u8* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) { - if (len <= 16) - return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0); - if (len <= 128) - return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); -} - -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); /* * If an action is to be taken if `secret` conditions are not respected, * it should be done here. @@ -2086,24 +2529,38 @@ XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_ * Adding a check and a branch here would cost performance at every hash. 
*/ if (len <= 16) - return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + return XXH3_len_0to16_128b((const xxh_u8*)input, len, secret, seed64); if (len <= 128) - return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_len_17to128_128b((const xxh_u8*)input, len, secret, secretLen, seed64); if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); + return XXH3_len_129to240_128b((const xxh_u8*)input, len, secret, secretLen, seed64); + return f_hl128((const xxh_u8*)input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); } XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) { - if (len <= 16) - return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed); - if (len <= 128) - return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed); + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); } XXH_PUBLIC_API XXH128_hash_t @@ -2132,7 +2589,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr) { if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } @@ -2150,32 +2607,34 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) { if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_128bits_reset_internal(statePtr, seed, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); XXH3_initCustomSecret(statePtr->customSecret, seed); - statePtr->secret = statePtr->customSecret; + statePtr->extSecret = NULL; return XXH_OK; } XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) { - return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits); + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate_512, XXH3_scrambleAcc); } XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) { + const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; - XXH3_digest_long(acc, state, XXH3_acc_128bits); - XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { xxh_u64 low64 = XXH3_mergeAccs(acc, - state->secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * PRIME64_1); - xxh_u64 high64 = XXH3_mergeAccs(acc, - state->secret + state->secretLimit + STRIPE_LEN - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((xxh_u64)state->totalLen * PRIME64_2)); - XXH128_hash_t const h128 = { low64, high64 }; + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); return h128; } } @@ -2183,7 +2642,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) if (state->seed) return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), - state->secret, state->secretLimit + STRIPE_LEN); + secret, state->secretLimit + XXH_STRIPE_LEN); } /* 128-bit utility functions */ diff --git a/server/tracy_xxhash.c b/server/tracy_xxhash.c deleted file mode 100644 index 9ef28e64..00000000 --- a/server/tracy_xxhash.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Copyright (C) 2012-2020 Yann Collet - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - - -/* - * xxhash.c instantiates functions defined in xxhash.h - */ - -#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ -#define XXH_IMPLEMENTATION /* access definitions */ - -#include "tracy_xxhash.h" diff --git a/server/tracy_xxhash.h b/server/tracy_xxhash.h index 6249bea9..d8c32b5d 100644 --- a/server/tracy_xxhash.h +++ b/server/tracy_xxhash.h @@ -367,7 +367,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); /******* Canonical representation *******/ -typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); @@ -451,8 +451,8 @@ struct XXH64_state_s { * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are * explained in the implementation. * - * Optimized implementations are provided for AVX2, SSE2, NEON, POWER8, ZVector, - * and scalar targets. This can be controlled with the XXH_VECTOR macro. + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. * * XXH3 offers 2 variants, _64bits and _128bits. * When only 64 bits are needed, prefer calling the _64bits variant, as it @@ -470,16 +470,11 @@ struct XXH64_state_s { * ephemeral data (local sessions). * * Avoid storing values in long-term storage until the algorithm is finalized. - * - * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if - * everything remains fine, its current format will be "frozen" and become the - * final one. + * XXH3's return values will be officially finalized upon reaching v0.8.0. * * After which, return values of XXH3 and XXH128 will no longer change in * future versions. * - * XXH3's return values will be officially finalized upon reaching v0.8.0. - * * The API supports one-shot hashing, streaming mode, and custom secrets. */ @@ -497,6 +492,8 @@ struct XXH64_state_s { # define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) # define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) # define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) + +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) #endif /* XXH3_64bits(): @@ -504,20 +501,6 @@ struct XXH64_state_s { * It's the fastest variant. */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); -/* - * XXH3_64bits_withSecret(): - * It's possible to provide any blob of bytes as a "secret" to generate the hash. - * This makes it more difficult for an external actor to prepare an intentional - * collision. - * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). - * It should consist of random bytes. - * Avoid trivial sequences, such as repeating sequences and especially '\0', - * as this can cancel out itself. - * Failure to respect these conditions will result in a poor quality hash. 
- */
-#define XXH3_SECRET_SIZE_MIN 136
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
-
 /*
  * XXH3_64bits_withSeed():
  * This variant generates a custom secret on the fly based on the default
@@ -527,6 +510,22 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len,
  */
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);

+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the hash highly depends on the secret's entropy.
+ * Technically, the secret must look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever unsure about the "randomness" of the blob of bytes,
+ * consider relabelling it as a "custom seed" instead,
+ * and employ "XXH3_generateSecret()" (see below)
+ * to generate a high-quality secret derived from this custom seed.
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
 /* streaming 64-bit */
@@ -551,26 +550,25 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X
 typedef struct XXH3_state_s XXH3_state_t;

-#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
 #define XXH3_INTERNALBUFFER_SIZE 256
+#define XXH3_SECRET_DEFAULT_SIZE 192
 struct XXH3_state_s {
    XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
-   /* used to store a custom secret generated from the seed. Makes state larger.
-    * Design might change */
+   /* used to store a custom secret generated from a seed */
    XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
    XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
    XXH32_hash_t bufferedSize;
-   XXH32_hash_t nbStripesPerBlock;
-   XXH32_hash_t nbStripesSoFar;
-   XXH32_hash_t secretLimit;
    XXH32_hash_t reserved32;
-   XXH32_hash_t reserved32_2;
+   size_t nbStripesPerBlock;
+   size_t nbStripesSoFar;
+   size_t secretLimit;
    XXH64_hash_t totalLen;
    XXH64_hash_t seed;
    XXH64_hash_t reserved64;
-   /* note: there is some padding after due to alignment on 64 bytes */
-   const unsigned char* secret;
-};  /* typedef'd to XXH3_state_t */
+   const unsigned char* extSecret;  /* reference to external secret;
+                                     * if == NULL, use .customSecret instead */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+};  /* typedef'd to XXH3_state_t */

 #undef XXH_ALIGN_MEMBER
@@ -599,9 +597,12 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
 /*
  * XXH3_64bits_reset_withSecret():
- * `secret` is referenced, and must outlive the hash streaming session, so
- * be careful when using stack arrays.
- * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
+ * `secret` is referenced; it _must outlive_ the hash streaming session.
+ * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
+ * and the quality of the hash depends on the secret's entropy,
+ * meaning that the secret should look like a bunch of random bytes.
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
  */
 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);

@@ -661,20 +662,56 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
  * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
  *
  * return: >0 if *h128_1  > *h128_2
- *         <0 if *h128_1  < *h128_2
  *         =0 if *h128_1 == *h128_2
+ *         <0 if *h128_1  < *h128_2
  */
 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);

 /*******   Canonical representation   *******/
-typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
 XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);

+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*
+ * XXH3_generateSecret():
+ *
+ * Derive a high-entropy secret from any user-defined content, named customSeed.
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
+ * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
+ * into an already allocated buffer secretBuffer.
+ * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
+ * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
+ * are part of this list. They all accept a `secret` parameter
+ * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so
+ * this function can be used to generate a secret of proper quality.
+ *
+ * customSeed can be anything. It can have any size, even a small one,
+ * and its content can be anything, even a stupidly "low entropy" source such as a bunch of zeroes.
+ * The resulting `secret` will nonetheless provide all expected qualities.
+ *
+ * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
+ * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
+
+
 #endif  /* XXH_NO_LONG_LONG */

+
 #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
 #  define XXH_IMPLEMENTATION
 #endif
@@ -751,7 +788,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t*
 #endif

 /*!
- *XXH_ACCEPT_NULL_INPUT_POINTER:
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
  * If the input pointer is NULL, xxHash's default behavior is to dereference it,
  * triggering a segfault.
  * When this macro is enabled, xxHash actively checks the input for a null pointer.
@@ -763,16 +800,26 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t*

 /*!
 * XXH_FORCE_ALIGN_CHECK:
- * This is a minor performance trick, only useful with lots of very small keys.
- * It means: check for aligned/unaligned input.
- * The check costs one initial branch per hash;
- * Set it to 0 when the input is guaranteed to be aligned or when alignment
- * doesn't matter for performance.
+ * This is an important performance trick
+ * for architectures without decent unaligned memory access performance.
+ * It checks for input alignment, and when conditions are met,
+ * uses a "fast path" employing direct 32-bit/64-bit reads,
+ * resulting in _dramatically faster_ read speed.
  *
- * This option does not affect XXH3.
+ * The check costs one initial branch per hash, which is generally negligible, but not zero.
+ * Moreover, it's not useful to generate binary for an additional code path
+ * if memory access uses the same instruction for both aligned and unaligned addresses.
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * The alignment check is automatically disabled on x86, x64 & arm64,
+ * which are platforms known to offer good unaligned memory access performance.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
  */
-#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
-#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
 #    define XXH_FORCE_ALIGN_CHECK 0
 #  else
 #    define XXH_FORCE_ALIGN_CHECK 1
@@ -852,26 +899,27 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
 #  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
 #endif
-#if XXH_NO_INLINE_HINTS   /* disable inlining hints */
-#  define XXH_FORCE_INLINE static
-#  define XXH_NO_INLINE static
-#elif defined(_MSC_VER)    /* Visual Studio */
-#  define XXH_FORCE_INLINE static __forceinline
-#  define XXH_NO_INLINE static __declspec(noinline)
-#else
-#  if defined (__cplusplus) \
-    || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
-#    ifdef __GNUC__
-#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
-#      define XXH_NO_INLINE static __attribute__((noinline))
-#    else
-#      define XXH_FORCE_INLINE static inline
-#      define XXH_NO_INLINE static
-#    endif
+#if XXH_NO_INLINE_HINTS   /* disable inlining hints */
+#  if defined(__GNUC__)
+#    define XXH_FORCE_INLINE static __attribute__((unused))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(_MSC_VER)    /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined(__GNUC__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+#  define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else
+#  define XXH_FORCE_INLINE static
-#  define XXH_NO_INLINE static
-#  endif /* __STDC_VERSION__ */
+#  define XXH_NO_INLINE static
 #endif
@@ -880,14 +928,18 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
  * Debug
 ***************************************/
 /*
- * DEBUGLEVEL is expected to be defined
externally, typically via the compiler's - * command line options. The value must be a number. + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. */ -#ifndef DEBUGLEVEL -# define DEBUGLEVEL 0 +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif #endif -#if (DEBUGLEVEL>=1) +#if (XXH_DEBUGLEVEL>=1) # include /* note: can still be disabled with NDEBUG */ # define XXH_ASSERT(c) assert(c) #else @@ -895,7 +947,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) #endif /* note: use after variable declarations */ -#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; } +#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0) /* ************************************* @@ -905,12 +957,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # include - typedef uint8_t xxh_u8; + typedef uint8_t xxh_u8; #else - typedef unsigned char xxh_u8; + typedef unsigned char xxh_u8; #endif typedef XXH32_hash_t xxh_u32; +#ifdef XXH_OLD_NAMES +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif /* *** Memory access *** */ @@ -935,8 +992,14 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; * * Currently only defined for GCC and ICC. */ +#ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; -static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign; + return ((const xxh_unalign*)ptr)->u32; +} #else @@ -978,10 +1041,13 @@ typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) # define XXH_CPU_LITTLE_ENDIAN 0 # else +/* + * runtime test, presumed to simplify to a constant by compiler + */ static int XXH_isLittleEndian(void) { /* - * Nonstandard, but well-defined behavior in practice. + * Portable and well-defined behavior. * Don't use static: it is detrimental to performance. 
*/ const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; @@ -999,12 +1065,14 @@ static int XXH_isLittleEndian(void) ******************************************/ #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#ifndef __has_builtin -# define __has_builtin(x) 0 +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 #endif -#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ - && __has_builtin(__builtin_rotateleft64) +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) # define XXH_rotl32 __builtin_rotateleft32 # define XXH_rotl64 __builtin_rotateleft64 /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ @@ -1093,17 +1161,25 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } /* ******************************************************************* * 32-bit hash functions *********************************************************************/ -static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ -static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ -static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ -static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ -static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ +static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ +static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ +static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ +static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ +static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { - acc += input * PRIME32_2; + acc += input * XXH_PRIME32_2; acc = XXH_rotl32(acc, 13); - acc *= PRIME32_1; + acc *= XXH_PRIME32_1; #if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) /* * UGLY HACK: @@ -1159,9 +1235,9 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) static xxh_u32 XXH32_avalanche(xxh_u32 h32) { h32 ^= h32 >> 15; - h32 *= PRIME32_2; + h32 *= XXH_PRIME32_2; h32 ^= h32 >> 13; - h32 *= PRIME32_3; + h32 *= XXH_PRIME32_3; h32 ^= h32 >> 16; return(h32); } @@ -1171,64 +1247,66 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32) static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) { -#define PROCESS1 \ - h32 += (*ptr++) * PRIME32_5; \ - h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; +#define XXH_PROCESS1 do { \ + h32 += (*ptr++) * XXH_PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ +} while (0) -#define PROCESS4 \ - h32 += XXH_get32bits(ptr) * PRIME32_3; \ - ptr+=4; \ - h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; +#define XXH_PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ +} while (0) /* Compact rerolled version */ if (XXH_REROLL) { len &= 15; while (len >= 4) { - PROCESS4; + XXH_PROCESS4; len -= 4; } while (len > 0) { - PROCESS1; + XXH_PROCESS1; 
--len; } return XXH32_avalanche(h32); } else { switch(len&15) /* or switch(bEnd - p) */ { - case 12: PROCESS4; + case 12: XXH_PROCESS4; /* fallthrough */ - case 8: PROCESS4; + case 8: XXH_PROCESS4; /* fallthrough */ - case 4: PROCESS4; + case 4: XXH_PROCESS4; return XXH32_avalanche(h32); - case 13: PROCESS4; + case 13: XXH_PROCESS4; /* fallthrough */ - case 9: PROCESS4; + case 9: XXH_PROCESS4; /* fallthrough */ - case 5: PROCESS4; - PROCESS1; + case 5: XXH_PROCESS4; + XXH_PROCESS1; return XXH32_avalanche(h32); - case 14: PROCESS4; + case 14: XXH_PROCESS4; /* fallthrough */ - case 10: PROCESS4; + case 10: XXH_PROCESS4; /* fallthrough */ - case 6: PROCESS4; - PROCESS1; - PROCESS1; + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; return XXH32_avalanche(h32); - case 15: PROCESS4; + case 15: XXH_PROCESS4; /* fallthrough */ - case 11: PROCESS4; + case 11: XXH_PROCESS4; /* fallthrough */ - case 7: PROCESS4; + case 7: XXH_PROCESS4; /* fallthrough */ - case 3: PROCESS1; + case 3: XXH_PROCESS1; /* fallthrough */ - case 2: PROCESS1; + case 2: XXH_PROCESS1; /* fallthrough */ - case 1: PROCESS1; + case 1: XXH_PROCESS1; /* fallthrough */ case 0: return XXH32_avalanche(h32); } @@ -1237,6 +1315,14 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) } } +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) { @@ -1252,10 +1338,10 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment if (len>=16) { const xxh_u8* const limit = bEnd - 15; - xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; - xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; xxh_u32 v3 = seed + 0; - xxh_u32 v4 = seed - PRIME32_1; + xxh_u32 v4 = seed - XXH_PRIME32_1; do { v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; @@ -1267,7 +1353,7 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); } else { - h32 = seed + PRIME32_5; + h32 = seed + XXH_PRIME32_5; } h32 += (xxh_u32)len; @@ -1319,10 +1405,10 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t s { XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; + state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + state.v2 = seed + XXH_PRIME32_2; state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; + state.v4 = seed - XXH_PRIME32_1; /* do not write into reserved, planned to be removed in a future version */ memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); return XXH_OK; @@ -1403,7 +1489,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); } else { - h32 = state->v3 /* == seed */ + PRIME32_5; + h32 = state->v3 /* == seed */ + XXH_PRIME32_5; } h32 += state->total_len_32; @@ -1450,6 +1536,9 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src typedef XXH64_hash_t xxh_u64; +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif /*! 
* XXH_REROLL_XXH64: @@ -1498,8 +1587,14 @@ static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; * * Currently only defined for GCC and ICC. */ +#ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; -static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; + return ((const xxh_unalign64*)ptr)->u64; +} #else @@ -1588,17 +1683,25 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align) /******* xxh64 *******/ -static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ -static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ -static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ -static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ -static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ +static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) { - acc += input * PRIME64_2; + acc += input * XXH_PRIME64_2; acc = XXH_rotl64(acc, 31); - acc *= PRIME64_1; + acc *= XXH_PRIME64_1; return acc; } @@ -1606,16 +1709,16 @@ static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) { val = XXH64_round(0, val); acc ^= val; - acc = acc * PRIME64_1 + PRIME64_4; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; return acc; } static xxh_u64 XXH64_avalanche(xxh_u64 h64) { h64 ^= h64 >> 33; - h64 *= PRIME64_2; + h64 *= XXH_PRIME64_2; h64 ^= h64 >> 29; - h64 *= PRIME64_3; + h64 *= XXH_PRIME64_3; h64 ^= h64 >> 32; return h64; } @@ -1626,117 +1729,119 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64) static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) { -#define PROCESS1_64 \ - h64 ^= (*ptr++) * PRIME64_5; \ - h64 = XXH_rotl64(h64, 11) * PRIME64_1; +#define XXH_PROCESS1_64 do { \ + h64 ^= (*ptr++) * XXH_PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \ +} while (0) -#define PROCESS4_64 \ - h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ - ptr+=4; \ - h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; +#define XXH_PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \ +} while (0) -#define PROCESS8_64 { \ +#define 
XXH_PROCESS8_64 do { \ xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ - ptr+=8; \ - h64 ^= k1; \ - h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ -} + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; \ +} while (0) /* Rerolled version for 32-bit targets is faster and much smaller. */ if (XXH_REROLL || XXH_REROLL_XXH64) { len &= 31; while (len >= 8) { - PROCESS8_64; + XXH_PROCESS8_64; len -= 8; } if (len >= 4) { - PROCESS4_64; + XXH_PROCESS4_64; len -= 4; } while (len > 0) { - PROCESS1_64; + XXH_PROCESS1_64; --len; } return XXH64_avalanche(h64); } else { switch(len & 31) { - case 24: PROCESS8_64; + case 24: XXH_PROCESS8_64; /* fallthrough */ - case 16: PROCESS8_64; + case 16: XXH_PROCESS8_64; /* fallthrough */ - case 8: PROCESS8_64; + case 8: XXH_PROCESS8_64; return XXH64_avalanche(h64); - case 28: PROCESS8_64; + case 28: XXH_PROCESS8_64; /* fallthrough */ - case 20: PROCESS8_64; + case 20: XXH_PROCESS8_64; /* fallthrough */ - case 12: PROCESS8_64; + case 12: XXH_PROCESS8_64; /* fallthrough */ - case 4: PROCESS4_64; + case 4: XXH_PROCESS4_64; return XXH64_avalanche(h64); - case 25: PROCESS8_64; + case 25: XXH_PROCESS8_64; /* fallthrough */ - case 17: PROCESS8_64; + case 17: XXH_PROCESS8_64; /* fallthrough */ - case 9: PROCESS8_64; - PROCESS1_64; + case 9: XXH_PROCESS8_64; + XXH_PROCESS1_64; return XXH64_avalanche(h64); - case 29: PROCESS8_64; + case 29: XXH_PROCESS8_64; /* fallthrough */ - case 21: PROCESS8_64; + case 21: XXH_PROCESS8_64; /* fallthrough */ - case 13: PROCESS8_64; + case 13: XXH_PROCESS8_64; /* fallthrough */ - case 5: PROCESS4_64; - PROCESS1_64; + case 5: XXH_PROCESS4_64; + XXH_PROCESS1_64; return XXH64_avalanche(h64); - case 26: PROCESS8_64; + case 26: XXH_PROCESS8_64; /* fallthrough */ - case 18: PROCESS8_64; + case 18: XXH_PROCESS8_64; /* fallthrough */ - case 10: PROCESS8_64; - PROCESS1_64; - PROCESS1_64; + case 10: XXH_PROCESS8_64; + XXH_PROCESS1_64; + XXH_PROCESS1_64; return XXH64_avalanche(h64); - case 30: PROCESS8_64; + case 30: XXH_PROCESS8_64; /* fallthrough */ - case 22: PROCESS8_64; + case 22: XXH_PROCESS8_64; /* fallthrough */ - case 14: PROCESS8_64; + case 14: XXH_PROCESS8_64; /* fallthrough */ - case 6: PROCESS4_64; - PROCESS1_64; - PROCESS1_64; + case 6: XXH_PROCESS4_64; + XXH_PROCESS1_64; + XXH_PROCESS1_64; return XXH64_avalanche(h64); - case 27: PROCESS8_64; + case 27: XXH_PROCESS8_64; /* fallthrough */ - case 19: PROCESS8_64; + case 19: XXH_PROCESS8_64; /* fallthrough */ - case 11: PROCESS8_64; - PROCESS1_64; - PROCESS1_64; - PROCESS1_64; + case 11: XXH_PROCESS8_64; + XXH_PROCESS1_64; + XXH_PROCESS1_64; + XXH_PROCESS1_64; return XXH64_avalanche(h64); - case 31: PROCESS8_64; + case 31: XXH_PROCESS8_64; /* fallthrough */ - case 23: PROCESS8_64; + case 23: XXH_PROCESS8_64; /* fallthrough */ - case 15: PROCESS8_64; + case 15: XXH_PROCESS8_64; /* fallthrough */ - case 7: PROCESS4_64; + case 7: XXH_PROCESS4_64; /* fallthrough */ - case 3: PROCESS1_64; + case 3: XXH_PROCESS1_64; /* fallthrough */ - case 2: PROCESS1_64; + case 2: XXH_PROCESS1_64; /* fallthrough */ - case 1: PROCESS1_64; + case 1: XXH_PROCESS1_64; /* fallthrough */ case 0: return XXH64_avalanche(h64); } @@ -1746,6 +1851,16 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) return 0; /* unreachable, but some compilers complain without it */ } +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# 
undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) { @@ -1761,10 +1876,10 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment if (len>=32) { const xxh_u8* const limit = bEnd - 32; - xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; - xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; xxh_u64 v3 = seed + 0; - xxh_u64 v4 = seed - PRIME64_1; + xxh_u64 v4 = seed - XXH_PRIME64_1; do { v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; @@ -1780,7 +1895,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment h64 = XXH64_mergeRound(h64, v4); } else { - h64 = seed + PRIME64_5; + h64 = seed + XXH_PRIME64_5; } h64 += (xxh_u64) len; @@ -1831,10 +1946,10 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s { XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME64_1 + PRIME64_2; - state.v2 = seed + PRIME64_2; + state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + state.v2 = seed + XXH_PRIME64_2; state.v3 = seed + 0; - state.v4 = seed - PRIME64_1; + state.v4 = seed - XXH_PRIME64_1; /* do not write into reserved64, might be removed in a future version */ memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); return XXH_OK; @@ -1917,7 +2032,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) h64 = XXH64_mergeRound(h64, v3); h64 = XXH64_mergeRound(h64, v4); } else { - h64 = state->v3 /*seed*/ + PRIME64_5; + h64 = state->v3 /*seed*/ + XXH_PRIME64_5; } h64 += (xxh_u64) state->total_len;
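The XXH64 hunks above are renames only (PRIME64_* becomes XXH_PRIME64_*, with XXH_OLD_NAMES shims for the old spellings), so streaming results still have to match the one-shot function. A small self-check sketch, assuming the usual create/free state pair from this header:

    #include <assert.h>
    #include <stddef.h>

    /* Feed the same input in two chunks and compare with the one-shot hash;
     * the constant renames in this patch change neither result. */
    static void xxh64_stream_selfcheck(const void* data, size_t len, XXH64_hash_t seed)
    {
        XXH64_state_t* const st = XXH64_createState();
        size_t const half = len / 2;
        assert(st != NULL);
        XXH64_reset(st, seed);
        XXH64_update(st, data, half);
        XXH64_update(st, (const char*)data + half, len - half);
        assert(XXH64_digest(st) == XXH64(data, len, seed));
        XXH64_freeState(st);
    }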