chibipub

experimental activitypub node in C
git clone git://jb55.com/chibipub
Log | Files | Refs | README | LICENSE

blake3_dispatch.c (7537B)


      1 #include <stdbool.h>
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 
      5 #include "blake3_impl.h"
      6 
      7 #if defined(IS_X86)
      8 #if defined(_MSC_VER)
      9 #include <intrin.h>
     10 #elif defined(__GNUC__)
     11 #include <immintrin.h>
     12 #else
     13 #error "Unimplemented!"
     14 #endif
     15 #endif
     16 
     17 #define MAYBE_UNUSED(x) (void)((x))
     18 
     19 #if defined(IS_X86)
     /*
      * Read an extended control register with the XGETBV instruction, selecting
      * register 0 (ECX = 0), and return it as a 64-bit value built from EDX:EAX.
      *
      * The only caller in this file invokes this after seeing the OSXSAVE bit in
      * CPUID leaf 1, and tests bits 1-2 and 5-7 of the result to decide whether
      * the AVX and AVX-512 register state is enabled.
      */
     static uint64_t xgetbv(void) {
     #if defined(_MSC_VER)
       return _xgetbv(0);
     #else
       uint32_t eax = 0, edx = 0;
       __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
       return ((uint64_t)edx << 32) | eax;
     #endif
     }
     29 
     /*
      * Execute CPUID for leaf `id` and store the resulting EAX, EBX, ECX and EDX
      * in out[0], out[1], out[2] and out[3] respectively.
      */
     static void cpuid(uint32_t out[4], uint32_t id) {
     #if defined(_MSC_VER)
       __cpuid((int *)out, id);
     #else
       uint32_t a, b, c, d;
     #if defined(__i386__) || defined(_M_IX86)
       // EBX is saved into a scratch register before CPUID and swapped back
       // afterwards, so the instruction's EBX result ends up in the scratch
       // register while EBX itself is left untouched. Presumably this is because
       // EBX can be reserved (e.g. as the PIC base) on 32-bit builds.
       __asm__ __volatile__("movl %%ebx, %[ebx_out]\n"
                            "cpuid\n"
                            "xchgl %[ebx_out], %%ebx\n"
                            : "=a"(a), [ebx_out] "=r"(b), "=c"(c), "=d"(d)
                            : "a"(id));
     #else
       __asm__ __volatile__("cpuid\n"
                            : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                            : "a"(id));
     #endif
       out[0] = a;
       out[1] = b;
       out[2] = c;
       out[3] = d;
     #endif
     }
     45 
     /*
      * Execute CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and store
      * the resulting EAX, EBX, ECX and EDX in out[0], out[1], out[2] and out[3]
      * respectively.
      */
     static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
     #if defined(_MSC_VER)
       __cpuidex((int *)out, id, sid);
     #else
       uint32_t a, b, c, d;
     #if defined(__i386__) || defined(_M_IX86)
       // Same EBX save/swap sequence as the 32-bit path of cpuid(): the CPUID
       // result for EBX is delivered through a scratch register and EBX itself
       // is restored.
       __asm__ __volatile__("movl %%ebx, %[ebx_out]\n"
                            "cpuid\n"
                            "xchgl %[ebx_out], %%ebx\n"
                            : "=a"(a), [ebx_out] "=r"(b), "=c"(c), "=d"(d)
                            : "a"(id), "c"(sid));
     #else
       __asm__ __volatile__("cpuid\n"
                            : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                            : "a"(id), "c"(sid));
     #endif
       out[0] = a;
       out[1] = b;
       out[2] = c;
       out[3] = d;
     #endif
     }
     61 
     62 #endif
     63 
     /*
      * Bit flags for the instruction-set extensions the dispatcher knows about.
      * Values are single bits so that several can be OR-ed into one mask.
      * UNDEFINED (bit 30) is the initial value of the feature cache and means
      * that no detection result has been stored yet.
      */
     enum cpu_feature {
       SSE2 = 0x01,
       SSSE3 = 0x02,
       SSE41 = 0x04,
       AVX = 0x08,
       AVX2 = 0x10,
       AVX512F = 0x20,
       AVX512VL = 0x40,
       /* ... */
       UNDEFINED = 0x40000000
     };
     75 
     76 #if !defined(BLAKE3_TESTING)
     77 static /* Allow the variable to be controlled manually for testing */
     78 #endif
     79     enum cpu_feature g_cpu_features = UNDEFINED;
     80 
     81 #if !defined(BLAKE3_TESTING)
     82 static
     83 #endif
     84     enum cpu_feature
     85     get_cpu_features() {
     86 
     87   if (g_cpu_features != UNDEFINED) {
     88     return g_cpu_features;
     89   } else {
     90 #if defined(IS_X86)
     91     uint32_t regs[4] = {0};
     92     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
     93     (void)edx;
     94     enum cpu_feature features = 0;
     95     cpuid(regs, 0);
     96     const int max_id = *eax;
     97     cpuid(regs, 1);
     98 #if defined(__amd64__) || defined(_M_X64)
     99     features |= SSE2;
    100 #else
    101     if (*edx & (1UL << 26))
    102       features |= SSE2;
    103 #endif
    104     if (*ecx & (1UL << 0))
    105       features |= SSSE3;
    106     if (*ecx & (1UL << 19))
    107       features |= SSE41;
    108 
    109     if (*ecx & (1UL << 27)) { // OSXSAVE
    110       const uint64_t mask = xgetbv();
    111       if ((mask & 6) == 6) { // SSE and AVX states
    112         if (*ecx & (1UL << 28))
    113           features |= AVX;
    114         if (max_id >= 7) {
    115           cpuidex(regs, 7, 0);
    116           if (*ebx & (1UL << 5))
    117             features |= AVX2;
    118           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
    119             if (*ebx & (1UL << 31))
    120               features |= AVX512VL;
    121             if (*ebx & (1UL << 16))
    122               features |= AVX512F;
    123           }
    124         }
    125       }
    126     }
    127     g_cpu_features = features;
    128     return features;
    129 #else
    130     /* How to detect NEON? */
    131     return 0;
    132 #endif
    133   }
    134 }
    135 
    136 void blake3_compress_in_place(uint32_t cv[8],
    137                               const uint8_t block[BLAKE3_BLOCK_LEN],
    138                               uint8_t block_len, uint64_t counter,
    139                               uint8_t flags) {
    140 #if defined(IS_X86)
    141   const enum cpu_feature features = get_cpu_features();
    142   MAYBE_UNUSED(features);
    143 #if !defined(BLAKE3_NO_AVX512)
    144   if (features & AVX512VL) {
    145     blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    146     return;
    147   }
    148 #endif
    149 #if !defined(BLAKE3_NO_SSE41)
    150   if (features & SSE41) {
    151     blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    152     return;
    153   }
    154 #endif
    155 #if !defined(BLAKE3_NO_SSE2)
    156   if (features & SSE2) {
    157     blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    158     return;
    159   }
    160 #endif
    161 #endif
    162   blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
    163 }
    164 
    165 void blake3_compress_xof(const uint32_t cv[8],
    166                          const uint8_t block[BLAKE3_BLOCK_LEN],
    167                          uint8_t block_len, uint64_t counter, uint8_t flags,
    168                          uint8_t out[64]) {
    169 #if defined(IS_X86)
    170   const enum cpu_feature features = get_cpu_features();
    171   MAYBE_UNUSED(features);
    172 #if !defined(BLAKE3_NO_AVX512)
    173   if (features & AVX512VL) {
    174     blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    175     return;
    176   }
    177 #endif
    178 #if !defined(BLAKE3_NO_SSE41)
    179   if (features & SSE41) {
    180     blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    181     return;
    182   }
    183 #endif
    184 #if !defined(BLAKE3_NO_SSE2)
    185   if (features & SSE2) {
    186     blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    187     return;
    188   }
    189 #endif
    190 #endif
    191   blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
    192 }
    193 
    /*
     * Runtime dispatcher for the multi-input hashing routine.
     *
     * Forwards its arguments unchanged to the first backend that is both
     * compiled in and supported by the CPU. On x86 the order is AVX-512
     * (requires both AVX512F and AVX512VL), AVX2, SSE4.1, SSE2. If none of
     * those is taken, the NEON backend is used when BLAKE3_USE_NEON is defined,
     * otherwise the portable implementation.
     */
    void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                          size_t blocks, const uint32_t key[8], uint64_t counter,
                          bool increment_counter, uint8_t flags,
                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
    #if defined(IS_X86)
      const enum cpu_feature cpu = get_cpu_features();
      MAYBE_UNUSED(cpu); // every backend below can be compiled out

    #if !defined(BLAKE3_NO_AVX512)
      const int avx512_needed = AVX512F | AVX512VL;
      if ((cpu & avx512_needed) == avx512_needed) {
        blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                                increment_counter, flags, flags_start, flags_end,
                                out);
        return;
      }
    #endif

    #if !defined(BLAKE3_NO_AVX2)
      if ((cpu & AVX2) != 0) {
        blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start, flags_end,
                              out);
        return;
      }
    #endif

    #if !defined(BLAKE3_NO_SSE41)
      if ((cpu & SSE41) != 0) {
        blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                               increment_counter, flags, flags_start, flags_end,
                               out);
        return;
      }
    #endif

    #if !defined(BLAKE3_NO_SSE2)
      if ((cpu & SSE2) != 0) {
        blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start, flags_end,
                              out);
        return;
      }
    #endif
    #endif /* IS_X86 */

    #if defined(BLAKE3_USE_NEON)
      blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end, out);
    #else
      blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                                increment_counter, flags, flags_start, flags_end,
                                out);
    #endif
    }
    245 
    // The dynamically detected SIMD degree of the current platform.
    //
    // Returns 16 when AVX-512 (both AVX512F and AVX512VL) is usable, 8 for AVX2,
    // 4 for SSE4.1 or SSE2, 4 when BLAKE3_USE_NEON is defined, and 1 otherwise.
    // Backends disabled with the BLAKE3_NO_* macros are not considered.
    size_t blake3_simd_degree(void) {
      // Start from the non-x86 answer, then let each usable x86 backend override
      // it. The checks run from narrowest to widest so the widest one wins.
    #if defined(BLAKE3_USE_NEON)
      size_t degree = 4;
    #else
      size_t degree = 1;
    #endif

    #if defined(IS_X86)
      const enum cpu_feature detected = get_cpu_features();
      MAYBE_UNUSED(detected); // every check below can be compiled out

    #if !defined(BLAKE3_NO_SSE2)
      if ((detected & SSE2) != 0) {
        degree = 4;
      }
    #endif
    #if !defined(BLAKE3_NO_SSE41)
      if ((detected & SSE41) != 0) {
        degree = 4;
      }
    #endif
    #if !defined(BLAKE3_NO_AVX2)
      if ((detected & AVX2) != 0) {
        degree = 8;
      }
    #endif
    #if !defined(BLAKE3_NO_AVX512)
      if ((detected & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
        degree = 16;
      }
    #endif
    #endif /* IS_X86 */

      return degree;
    }