Version 5.5. Added IBM XL compiler intrinsics support, added "always_inline" attribute to internal functions.
avaneev · Aug 9, 2023 · 339a7f5 · 339a7f5
1 parent 7a27918
commit 339a7f5
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 67 deletions.
diff --git a/README.md b/README.md
@@ -121,7 +121,7 @@ Compiler options: `/Ox /arch:sse2`; overhead: `1.8` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|10.3           |12.1           |26.5           |
+|**komihash 5.5**|10.2           |12.1           |26.7           |
 |komihash 4.5    |11.0           |12.7           |26.2           |
 |komihash 4.3    |11.2           |13.0           |26.0           |
 |komihash 3.6    |11.1           |16.9           |27.5           |
@@ -135,7 +135,7 @@ Compiler options: `/Ox -mavx2`; overhead: `1.8` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|10.3           |12.1           |26.5           |
+|**komihash 5.5**|10.2           |12.1           |26.7           |
 |komihash 4.5    |11.1           |12.7           |26.3           |
 |komihash 4.3    |11.2           |13.0           |25.9           |
 |komihash 3.6    |11.0           |16.3           |27.5           |
@@ -151,7 +151,7 @@ Compiler options: `/O3 /QxSSE2`; overhead: `2.0` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|12.6           |14.5           |22.3           |
+|**komihash 5.5**|12.4           |14.5           |22.5           |
 |komihash 4.5    |18.1           |21.9           |16.4           |
 |komihash 4.3    |17.9           |21.6           |16.3           |
 |komihash 3.6    |20.1           |24.0           |16.3           |
@@ -171,7 +171,7 @@ Compiler options: `-O3 -mavx2`; overhead: `5.3` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|12.7           |13.8           |22.8           |
+|**komihash 5.5**|12.7           |13.8           |22.8           |
 |komihash 4.5    |12.8           |14.4           |22.4           |
 |komihash 4.3    |15.3           |16.3           |22.8           |
 |komihash 3.6    |16.0           |19.0           |22.3           |
@@ -187,7 +187,7 @@ Compiler options: `-O3 -msse2`; overhead: `5.8` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|13.3           |14.4           |25.0           |
+|**komihash 5.5**|13.3           |14.4           |25.0           |
 |komihash 4.5    |13.2           |15.1           |24.7           |
 |komihash 4.3    |15.4           |16.2           |24.4           |
 |komihash 3.6    |16.4           |20.3           |24.7           |
@@ -201,7 +201,7 @@ Compiler options: `-O3 -mavx2`; overhead: `5.8` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|13.3           |14.3           |25.3           |
+|**komihash 5.5**|13.3           |14.3           |25.3           |
 |komihash 4.5    |13.8           |15.2           |24.7           |
 |komihash 4.3    |15.3           |16.4           |24.4           |
 |komihash 3.6    |15.8           |20.1           |24.7           |
@@ -217,7 +217,7 @@ Compiler options: `/Ox -mavx2`; overhead: `5.5` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|11.4           |12.7           |22.7           |
+|**komihash 5.5**|11.4           |12.7           |22.7           |
 |komihash 4.5    |12.6           |14.5           |22.2           |
 |komihash 4.3    |14.1           |16.0           |22.0           |
 |komihash 3.6    |14.0           |22.0           |22.9           |
@@ -233,7 +233,7 @@ Compiler options: `/O3 /QxSSE2`; overhead: `5.9` cycles/h.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|15.8           |19.3           |17.1           |
+|**komihash 5.5**|15.8           |19.3           |17.1           |
 |komihash 4.5    |18.1           |21.1           |17.2           |
 |komihash 4.3    |18.7           |21.5           |18.5           |
 |komihash 3.6    |19.5           |23.1           |18.1           |
@@ -249,7 +249,7 @@ Compiler options: `-O3`; overhead: `unestimatable`.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|bulk, GB/s     |
 |----            |----           |----           |----           |
-|**komihash 5.4**|8.2            |8.4            |23.6           |
+|**komihash 5.5**|8.2            |8.4            |23.6           |
 |komihash 4.5    |8.3            |8.7            |23.6           |
 |komihash 4.3    |8.6            |9.0            |23.6           |
 |komihash 3.6    |8.5            |10.7           |23.6           |
@@ -269,7 +269,7 @@ overhead. Measurement error is approximately 3%.
 
 |Hash function   |0-15b, cycles/h|8-28b, cycles/h|
 |----            |----           |----           |
-|**komihash 5.4**|**8.2**        |**9.7**        |
+|**komihash 5.5**|**8.2**        |**9.7**        |
 |komihash 4.5    |9.5            |11.4           |
 |komihash 4.3    |10.4           |12.1           |
 |komihash 3.6    |10.9           |15.4           |

diff --git a/komihash.h b/komihash.h
@@ -1,5 +1,5 @@
 /**
- * komihash.h version 5.4
+ * komihash.h version 5.5
  *
  * The inclusion file for the "komihash" hash function, "komirand" 64-bit
  * PRNG, and streamed "komihash" implementation.
@@ -8,6 +8,7 @@
  * (located in Russia), native to the author.
  *
  * Description is available at https://github.com/avaneev/komihash
+ * E-mail: aleksey.vaneev@gmail.com
  *
  * License
  *
@@ -38,38 +39,6 @@
 #include <stdint.h>
 #include <string.h>
 
-// Macros that apply byte-swapping.
-
-#if defined( __GNUC__ ) || defined( __clang__ )
-
-	#define KOMIHASH_BYTESW32( v ) __builtin_bswap32( v )
-	#define KOMIHASH_BYTESW64( v ) __builtin_bswap64( v )
-
-#elif defined( _MSC_VER )
-
-	#define KOMIHASH_BYTESW32( v ) _byteswap_ulong( v )
-	#define KOMIHASH_BYTESW64( v ) _byteswap_uint64( v )
-
-#else // defined( _MSC_VER )
-
-	#define KOMIHASH_BYTESW32( v ) ( \
-		( v & 0xFF000000 ) >> 24 | \
-		( v & 0x00FF0000 ) >> 8 | \
-		( v & 0x0000FF00 ) << 8 | \
-		( v & 0x000000FF ) << 24 )
-
-	#define KOMIHASH_BYTESW64( v ) ( \
-		( v & 0xFF00000000000000 ) >> 56 | \
-		( v & 0x00FF000000000000 ) >> 40 | \
-		( v & 0x0000FF0000000000 ) >> 24 | \
-		( v & 0x000000FF00000000 ) >> 8 | \
-		( v & 0x00000000FF000000 ) << 8 | \
-		( v & 0x0000000000FF0000 ) << 24 | \
-		( v & 0x000000000000FF00 ) << 40 | \
-		( v & 0x00000000000000FF ) << 56 )
-
-#endif // defined( _MSC_VER )
-
 // Endianness-definition macro, can be defined externally (e.g. =1, if
 // endianness-correction is unnecessary in any case, to reduce its associated
 // overhead).
@@ -80,7 +49,7 @@
 
 		#define KOMIHASH_LITTLE_ENDIAN 1
 
-	#elif defined( __BIG_ENDIAN__ ) || \
+	#elif defined( __BIG_ENDIAN__ ) || defined( _BIG_ENDIAN ) || \
 		( defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
 
 		#define KOMIHASH_LITTLE_ENDIAN 0
@@ -94,6 +63,15 @@
 	#endif // defined( __BIG_ENDIAN__ )
 #endif // !defined( KOMIHASH_LITTLE_ENDIAN )
 
+// Macro that denotes availability of required GCC-style built-in functions.
+
+#if defined( __GNUC__ ) || defined( __clang__ ) || \
+	defined( __IBMC__ ) || defined( __IBMCPP__ )
+
+	#define KOMIHASH_GCC_BUILTINS 1
+
+#endif // GCC built-ins.
+
 // Macros that apply byte-swapping, used for endianness-correction.
 
 #if KOMIHASH_LITTLE_ENDIAN
@@ -103,17 +81,44 @@
 
 #else // KOMIHASH_LITTLE_ENDIAN
 
-	#define KOMIHASH_EC32( v ) KOMIHASH_BYTESW32( v )
-	#define KOMIHASH_EC64( v ) KOMIHASH_BYTESW64( v )
+	#if defined( KOMIHASH_GCC_BUILTINS )
+
+		#define KOMIHASH_EC32( v ) __builtin_bswap32( v )
+		#define KOMIHASH_EC64( v ) __builtin_bswap64( v )
+
+	#elif defined( _MSC_VER )
+
+		#define KOMIHASH_EC32( v ) _byteswap_ulong( v )
+		#define KOMIHASH_EC64( v ) _byteswap_uint64( v )
+
+	#else // defined( _MSC_VER )
+
+		#define KOMIHASH_EC32( v ) ( \
+			( v & 0xFF000000 ) >> 24 | \
+			( v & 0x00FF0000 ) >> 8 | \
+			( v & 0x0000FF00 ) << 8 | \
+			( v & 0x000000FF ) << 24 )
+
+		#define KOMIHASH_EC64( v ) ( \
+			( v & 0xFF00000000000000 ) >> 56 | \
+			( v & 0x00FF000000000000 ) >> 40 | \
+			( v & 0x0000FF0000000000 ) >> 24 | \
+			( v & 0x000000FF00000000 ) >> 8 | \
+			( v & 0x00000000FF000000 ) << 8 | \
+			( v & 0x0000000000FF0000 ) << 24 | \
+			( v & 0x000000000000FF00 ) << 40 | \
+			( v & 0x00000000000000FF ) << 56 )
+
+	#endif // defined( _MSC_VER )
 
 #endif // KOMIHASH_LITTLE_ENDIAN
 
 // Likelihood macros that are used for manually-guided micro-optimization.
 
-#if defined( __GNUC__ ) || defined( __clang__ )
+#if defined( KOMIHASH_GCC_BUILTINS )
 
-	#define KOMIHASH_LIKELY( x )  __builtin_expect( x, 1 )
-	#define KOMIHASH_UNLIKELY( x )  __builtin_expect( x, 0 )
+	#define KOMIHASH_LIKELY( x ) __builtin_expect( x, 1 )
+	#define KOMIHASH_UNLIKELY( x ) __builtin_expect( x, 0 )
 
 #else // likelihood macros
 
@@ -125,7 +130,7 @@
 // Memory address prefetch macro (temporal locality=1, in case a collision
 // resolution would be necessary).
 
-#if defined( __GNUC__ ) || defined( __clang__ )
+#if defined( KOMIHASH_GCC_BUILTINS )
 
 	#define KOMIHASH_PREFETCH( addr ) __builtin_prefetch( addr, 0, 1 )
 
@@ -135,6 +140,18 @@
 
 #endif // prefetch macro
 
+// Macro to force code inlining.
+
+#if defined( KOMIHASH_GCC_BUILTINS )
+
+	#define KOMIHASH_INLINE inline __attribute__((always_inline))
+
+#else // defined( KOMIHASH_GCC_BUILTINS )
+
+	#define KOMIHASH_INLINE inline
+
+#endif // defined( KOMIHASH_GCC_BUILTINS )
+
 /**
  * An auxiliary function that returns an unsigned 32-bit value created out of
  * a sequence of bytes in memory. This function is used to convert endianness
@@ -145,7 +162,7 @@
  * @return Endianness-corrected 32-bit value from memory.
  */
 
-static inline uint32_t kh_lu32ec( const uint8_t* const p )
+static KOMIHASH_INLINE uint32_t kh_lu32ec( const uint8_t* const p )
 {
 	uint32_t v;
 	memcpy( &v, p, 4 );
@@ -163,7 +180,7 @@ static inline uint32_t kh_lu32ec( const uint8_t* const p )
  * @return Endianness-corrected 64-bit value from memory.
  */
 
-static inline uint64_t kh_lu64ec( const uint8_t* const p )
+static KOMIHASH_INLINE uint64_t kh_lu64ec( const uint8_t* const p )
 {
 	uint64_t v;
 	memcpy( &v, p, 8 );
@@ -182,7 +199,7 @@ static inline uint64_t kh_lu64ec( const uint8_t* const p )
  * @return Final byte-padded value from the message.
  */
 
-static inline uint64_t kh_lpu64ec_l3( const uint8_t* const Msg,
+static KOMIHASH_INLINE uint64_t kh_lpu64ec_l3( const uint8_t* const Msg,
 	const size_t MsgLen )
 {
 	const int ml8 = (int) ( MsgLen * 8 );
@@ -213,7 +230,7 @@ static inline uint64_t kh_lpu64ec_l3( const uint8_t* const Msg,
  * @return Final byte-padded value from the message.
  */
 
-static inline uint64_t kh_lpu64ec_nz( const uint8_t* const Msg,
+static KOMIHASH_INLINE uint64_t kh_lpu64ec_nz( const uint8_t* const Msg,
 	const size_t MsgLen )
 {
 	const int ml8 = (int) ( MsgLen * 8 );
@@ -252,7 +269,7 @@ static inline uint64_t kh_lpu64ec_nz( const uint8_t* const Msg,
  * @return Final byte-padded value from the message.
  */
 
-static inline uint64_t kh_lpu64ec_l4( const uint8_t* const Msg,
+static KOMIHASH_INLINE uint64_t kh_lpu64ec_l4( const uint8_t* const Msg,
 	const size_t MsgLen )
 {
 	const int ml8 = (int) ( MsgLen * 8 );
@@ -280,57 +297,67 @@ static inline uint64_t kh_lpu64ec_l4( const uint8_t* const Msg,
 	 * @param[out] rh The higher half of the 128-bit result.
 	 */
 
-	static inline void kh_m128( const uint64_t m1, const uint64_t m2,
+	static KOMIHASH_INLINE void kh_m128( const uint64_t m1, const uint64_t m2,
 		uint64_t* const rl, uint64_t* const rh )
 	{
-		const __uint128_t r = (__uint128_t) m1 * m2;
+		const unsigned __int128 r = (unsigned __int128) m1 * m2;
 
-		*rl = (uint64_t) r;
 		*rh = (uint64_t) ( r >> 64 );
+		*rl = (uint64_t) r;
+	}
+
+#elif ( defined( __IBMC__ ) || defined( __IBMCPP__ )) && defined( __LP64__ )
+
+	static KOMIHASH_INLINE void kh_m128( const uint64_t m1, const uint64_t m2,
+		uint64_t* const rl, uint64_t* const rh )
+	{
+		*rh = __mulhdu( m1, m2 );
+		*rl = m1 * m2;
 	}
 
 #elif defined( _MSC_VER ) && ( defined( _M_ARM64 ) || \
 	( defined( _M_X64 ) && defined( __INTEL_COMPILER )))
 
 	#include <intrin.h>
 
-	static inline void kh_m128( const uint64_t m1, const uint64_t m2,
+	static KOMIHASH_INLINE void kh_m128( const uint64_t m1, const uint64_t m2,
 		uint64_t* const rl, uint64_t* const rh )
 	{
-		*rl = m1 * m2;
 		*rh = __umulh( m1, m2 );
+		*rl = m1 * m2;
 	}
 
-#elif defined( _MSC_VER ) && defined( _M_X64 )
+#elif defined( _MSC_VER ) && ( defined( _M_X64 ) || defined( _M_IA64 ))
 
 	#include <intrin.h>
 	#pragma intrinsic(_umul128)
 
-	static inline void kh_m128( const uint64_t m1, const uint64_t m2,
+	static KOMIHASH_INLINE void kh_m128( const uint64_t m1, const uint64_t m2,
 		uint64_t* const rl, uint64_t* const rh )
 	{
 		*rl = _umul128( m1, m2, rh );
 	}
 
 #else // defined( _MSC_VER ) && defined( _M_X64 )
 
-	// _umul128() code for 32-bit systems, adapted from mullu(),
-	// from https://go.dev/src/runtime/softfloat64.go
-	// Licensed under BSD-style license.
+	// _umul128() code for 32-bit systems, adapted from Hacker's Delight,
+	// Henry S. Warren, Jr.
 
 	#if defined( _MSC_VER ) && !defined( __INTEL_COMPILER )
 
 		#include <intrin.h>
 		#pragma intrinsic(__emulu)
 
-		static inline uint64_t kh__emulu( const uint32_t x, const uint32_t y )
+		static KOMIHASH_INLINE uint64_t kh__emulu( const uint32_t x,
+			const uint32_t y )
 		{
 			return( __emulu( x, y ));
 		}
 
 	#else // defined( _MSC_VER ) && !defined( __INTEL_COMPILER )
 
-		static inline uint64_t kh__emulu( const uint32_t x, const uint32_t y )
+		static KOMIHASH_INLINE uint64_t kh__emulu( const uint32_t x,
+			const uint32_t y )
 		{
 			return( (uint64_t) x * y );
 		}