/* blake3_dispatch.c (7537B) */
1 #include <stdbool.h> 2 #include <stddef.h> 3 #include <stdint.h> 4 5 #include "blake3_impl.h" 6 7 #if defined(IS_X86) 8 #if defined(_MSC_VER) 9 #include <intrin.h> 10 #elif defined(__GNUC__) 11 #include <immintrin.h> 12 #else 13 #error "Unimplemented!" 14 #endif 15 #endif 16 17 #define MAYBE_UNUSED(x) (void)((x)) 18 19 #if defined(IS_X86) 20 static uint64_t xgetbv() { 21 #if defined(_MSC_VER) 22 return _xgetbv(0); 23 #else 24 uint32_t eax = 0, edx = 0; 25 __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); 26 return ((uint64_t)edx << 32) | eax; 27 #endif 28 } 29 30 static void cpuid(uint32_t out[4], uint32_t id) { 31 #if defined(_MSC_VER) 32 __cpuid((int *)out, id); 33 #elif defined(__i386__) || defined(_M_IX86) 34 __asm__ __volatile__("movl %%ebx, %1\n" 35 "cpuid\n" 36 "xchgl %1, %%ebx\n" 37 : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 38 : "a"(id)); 39 #else 40 __asm__ __volatile__("cpuid\n" 41 : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 42 : "a"(id)); 43 #endif 44 } 45 46 static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { 47 #if defined(_MSC_VER) 48 __cpuidex((int *)out, id, sid); 49 #elif defined(__i386__) || defined(_M_IX86) 50 __asm__ __volatile__("movl %%ebx, %1\n" 51 "cpuid\n" 52 "xchgl %1, %%ebx\n" 53 : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 54 : "a"(id), "c"(sid)); 55 #else 56 __asm__ __volatile__("cpuid\n" 57 : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) 58 : "a"(id), "c"(sid)); 59 #endif 60 } 61 62 #endif 63 64 enum cpu_feature { 65 SSE2 = 1 << 0, 66 SSSE3 = 1 << 1, 67 SSE41 = 1 << 2, 68 AVX = 1 << 3, 69 AVX2 = 1 << 4, 70 AVX512F = 1 << 5, 71 AVX512VL = 1 << 6, 72 /* ... 
*/ 73 UNDEFINED = 1 << 30 74 }; 75 76 #if !defined(BLAKE3_TESTING) 77 static /* Allow the variable to be controlled manually for testing */ 78 #endif 79 enum cpu_feature g_cpu_features = UNDEFINED; 80 81 #if !defined(BLAKE3_TESTING) 82 static 83 #endif 84 enum cpu_feature 85 get_cpu_features() { 86 87 if (g_cpu_features != UNDEFINED) { 88 return g_cpu_features; 89 } else { 90 #if defined(IS_X86) 91 uint32_t regs[4] = {0}; 92 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; 93 (void)edx; 94 enum cpu_feature features = 0; 95 cpuid(regs, 0); 96 const int max_id = *eax; 97 cpuid(regs, 1); 98 #if defined(__amd64__) || defined(_M_X64) 99 features |= SSE2; 100 #else 101 if (*edx & (1UL << 26)) 102 features |= SSE2; 103 #endif 104 if (*ecx & (1UL << 0)) 105 features |= SSSE3; 106 if (*ecx & (1UL << 19)) 107 features |= SSE41; 108 109 if (*ecx & (1UL << 27)) { // OSXSAVE 110 const uint64_t mask = xgetbv(); 111 if ((mask & 6) == 6) { // SSE and AVX states 112 if (*ecx & (1UL << 28)) 113 features |= AVX; 114 if (max_id >= 7) { 115 cpuidex(regs, 7, 0); 116 if (*ebx & (1UL << 5)) 117 features |= AVX2; 118 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm 119 if (*ebx & (1UL << 31)) 120 features |= AVX512VL; 121 if (*ebx & (1UL << 16)) 122 features |= AVX512F; 123 } 124 } 125 } 126 } 127 g_cpu_features = features; 128 return features; 129 #else 130 /* How to detect NEON? 
*/ 131 return 0; 132 #endif 133 } 134 } 135 136 void blake3_compress_in_place(uint32_t cv[8], 137 const uint8_t block[BLAKE3_BLOCK_LEN], 138 uint8_t block_len, uint64_t counter, 139 uint8_t flags) { 140 #if defined(IS_X86) 141 const enum cpu_feature features = get_cpu_features(); 142 MAYBE_UNUSED(features); 143 #if !defined(BLAKE3_NO_AVX512) 144 if (features & AVX512VL) { 145 blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); 146 return; 147 } 148 #endif 149 #if !defined(BLAKE3_NO_SSE41) 150 if (features & SSE41) { 151 blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); 152 return; 153 } 154 #endif 155 #if !defined(BLAKE3_NO_SSE2) 156 if (features & SSE2) { 157 blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); 158 return; 159 } 160 #endif 161 #endif 162 blake3_compress_in_place_portable(cv, block, block_len, counter, flags); 163 } 164 165 void blake3_compress_xof(const uint32_t cv[8], 166 const uint8_t block[BLAKE3_BLOCK_LEN], 167 uint8_t block_len, uint64_t counter, uint8_t flags, 168 uint8_t out[64]) { 169 #if defined(IS_X86) 170 const enum cpu_feature features = get_cpu_features(); 171 MAYBE_UNUSED(features); 172 #if !defined(BLAKE3_NO_AVX512) 173 if (features & AVX512VL) { 174 blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); 175 return; 176 } 177 #endif 178 #if !defined(BLAKE3_NO_SSE41) 179 if (features & SSE41) { 180 blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); 181 return; 182 } 183 #endif 184 #if !defined(BLAKE3_NO_SSE2) 185 if (features & SSE2) { 186 blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); 187 return; 188 } 189 #endif 190 #endif 191 blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); 192 } 193 194 void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, 195 size_t blocks, const uint32_t key[8], uint64_t counter, 196 bool increment_counter, uint8_t flags, 197 uint8_t flags_start, uint8_t 
flags_end, uint8_t *out) { 198 #if defined(IS_X86) 199 const enum cpu_feature features = get_cpu_features(); 200 MAYBE_UNUSED(features); 201 #if !defined(BLAKE3_NO_AVX512) 202 if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { 203 blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, 204 increment_counter, flags, flags_start, flags_end, 205 out); 206 return; 207 } 208 #endif 209 #if !defined(BLAKE3_NO_AVX2) 210 if (features & AVX2) { 211 blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, 212 increment_counter, flags, flags_start, flags_end, 213 out); 214 return; 215 } 216 #endif 217 #if !defined(BLAKE3_NO_SSE41) 218 if (features & SSE41) { 219 blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, 220 increment_counter, flags, flags_start, flags_end, 221 out); 222 return; 223 } 224 #endif 225 #if !defined(BLAKE3_NO_SSE2) 226 if (features & SSE2) { 227 blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, 228 increment_counter, flags, flags_start, flags_end, 229 out); 230 return; 231 } 232 #endif 233 #endif 234 235 #if defined(BLAKE3_USE_NEON) 236 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, 237 increment_counter, flags, flags_start, flags_end, out); 238 return; 239 #endif 240 241 blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, 242 increment_counter, flags, flags_start, flags_end, 243 out); 244 } 245 246 // The dynamically detected SIMD degree of the current platform. 
size_t blake3_simd_degree(void) {
  /* Returns 16 for AVX-512 (both AVX512F and AVX512VL present), 8 for AVX2,
   * 4 for SSE4.1 or SSE2, 4 for NEON builds, and 1 otherwise. The feature
   * checks run in the same order as in blake3_hash_many. */
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    return 4;
  }
#endif
#endif
  /* No x86 SIMD path matched (or this is not an x86 build). */
#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}