commit 46c3f95e36432af593b18ed0a6acd007026fda6e
parent fbd9c3b10b2f2ebbe20b74ac5892c5d9614e6627
Author: William Casarin <jb55@jb55.com>
Date: Sun, 27 Dec 2020 14:55:43 -0800
blake3
Diffstat:
27 files changed, 31319 insertions(+), 5 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
.build-result
/wolfsocks
*.o
+*.a
+*.so
/tags
/test_out.ubjson
src/test_json
diff --git a/Makefile b/Makefile
@@ -1,22 +1,47 @@
-CFLAGS = -Og -ggdb -Wall -Werror -Isrc $(shell pkg-config --cflags openssl)
+CFLAGS = -Og -ggdb -Wall -Werror -Ideps -Isrc $(shell pkg-config --cflags openssl)
LDFLAGS = $(shell pkg-config --libs openssl)
-OBJS = src/http.o src/base64.o src/inbox.o src/json.o src/ubjson.o
+OBJS = src/http.o \
+ src/base64.o \
+ src/inbox.o \
+ src/json.o \
+ src/ubjson.o \
+ deps/blake3/blake3.a
-HEADERS = $(wildcard src/*.h)
+BLAKE3_OBJS = deps/blake3/blake3.o \
+ deps/blake3/blake3_dispatch.o \
+ deps/blake3/blake3_portable.o \
+ deps/blake3/blake3_sse2_x86-64_unix.o \
+ deps/blake3/blake3_sse41_x86-64_unix.o \
+ deps/blake3/blake3_avx2_x86-64_unix.o
+
+HEADERS = $(wildcard src/*.h)
all: wolfsocks
+%.o: %.c
+ @echo "cc $<"
+ @$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
+
%.o: %.c %.h
@echo "cc $<"
@$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
+%.o: %.S
+ @echo "cc $<"
+ @$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
+
+deps/blake3/blake3.a: $(BLAKE3_OBJS)
+ @echo "ar $@"
+ @ar rcs $@ $(BLAKE3_OBJS)
+
corpus/math.json:
curl --compressed -sL 'https://jb55.com/s/5aaaae6d64be61fd.json' > $@
wolfsocks: src/wolfsocks.c $(OBJS) $(HEADERS)
- $(CC) $(CFLAGS) $< $(OBJS) $(LDFLAGS) -o $@
+ @echo "ld $@"
+ @$(CC) $(CFLAGS) $< $(OBJS) $(LDFLAGS) -o $@
src/test_json: src/test_json.c $(OBJS) $(HEADERS) corpus/math.json
$(CC) $(CFLAGS) $< $(OBJS) $(LDFLAGS) -o $@
@@ -25,7 +50,7 @@ check: src/test_json
./src/test_json
clean: fake
- rm -f $(OBJS) wolfsocks src/test_json
+ rm -f $(OBJS) $(BLAKE3_OBJS) deps/blake3/blake3.a wolfsocks src/test_json
tags: fake
ctags src/*.c src/*.h
diff --git a/deps/blake3/README.md b/deps/blake3/README.md
@@ -0,0 +1,270 @@
+The official C implementation of BLAKE3.
+
+# Example
+
+An example program that hashes bytes from standard input and prints the
+result:
+
+```c
+#include "blake3.h"
+#include <stdio.h>
+#include <unistd.h>
+
+int main() {
+ // Initialize the hasher.
+ blake3_hasher hasher;
+ blake3_hasher_init(&hasher);
+
+ // Read input bytes from stdin.
+ unsigned char buf[65536];
+ ssize_t n;
+ while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) {
+ blake3_hasher_update(&hasher, buf, n);
+ }
+
+ // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
+ uint8_t output[BLAKE3_OUT_LEN];
+ blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+
+ // Print the hash as hexadecimal.
+ for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
+ printf("%02x", output[i]);
+ }
+ printf("\n");
+ return 0;
+}
+```
+
+The code above is included in this directory as `example.c`. If you're
+on x86\_64 with a Unix-like OS, you can compile a working binary like
+this:
+
+```bash
+gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
+ blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
+ blake3_avx512_x86-64_unix.S
+```
+
+# API
+
+## The Struct
+
+```c
+typedef struct {
+ // private fields
+} blake3_hasher;
+```
+
+An incremental BLAKE3 hashing state, which can accept any number of
+updates. This implementation doesn't allocate any heap memory, but
+`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes
+on x86-64. This size can be reduced by restricting the maximum input
+length, as described in Section 5.4 of [the BLAKE3
+spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
+but this implementation doesn't currently support that strategy.
+
+## Common API Functions
+
+```c
+void blake3_hasher_init(
+ blake3_hasher *self);
+```
+
+Initialize a `blake3_hasher` in the default hashing mode.
+
+---
+
+```c
+void blake3_hasher_update(
+ blake3_hasher *self,
+ const void *input,
+ size_t input_len);
+```
+
+Add input to the hasher. This can be called any number of times.
+
+---
+
+```c
+void blake3_hasher_finalize(
+ const blake3_hasher *self,
+ uint8_t *out,
+ size_t out_len);
+```
+
+Finalize the hasher and emit an output of any length. This doesn't
+modify the hasher itself, and it's possible to finalize again after
+adding more input. The constant `BLAKE3_OUT_LEN` provides the default
+output length, 32 bytes.
+
+## Less Common API Functions
+
+```c
+void blake3_hasher_init_keyed(
+ blake3_hasher *self,
+ const uint8_t key[BLAKE3_KEY_LEN]);
+```
+
+Initialize a `blake3_hasher` in the keyed hashing mode. The key must be
+exactly 32 bytes.
+
+---
+
+```c
+void blake3_hasher_init_derive_key(
+ blake3_hasher *self,
+ const char *context);
+```
+
+Initialize a `blake3_hasher` in the key derivation mode. The context
+string is given as an initialization parameter, and afterwards input key
+material should be given with `blake3_hasher_update`. The context string
+is a null-terminated C string which should be **hardcoded, globally
+unique, and application-specific**. The context string should not
+include any dynamic input like salts, nonces, or identifiers read from a
+database at runtime. A good default format for the context string is
+`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
+2019-12-25 16:18:03 session tokens v1"`.
+
+This function is intended for application code written in C. For
+language bindings, see `blake3_hasher_init_derive_key_raw` below.
+
+---
+
+```c
+void blake3_hasher_init_derive_key_raw(
+ blake3_hasher *self,
+ const void *context,
+ size_t context_len);
+```
+
+As `blake3_hasher_init_derive_key` above, except that the context string
+is given as a pointer to an array of arbitrary bytes with a provided
+length. This is intended for writing language bindings, where C string
+conversion would add unnecessary overhead and new error cases. Unicode
+strings should be encoded as UTF-8.
+
+Application code in C should prefer `blake3_hasher_init_derive_key`,
+which takes the context as a C string. If you need to use arbitrary
+bytes as a context string in application code, consider whether you're
+violating the requirement that context strings should be hardcoded.
+
+---
+
+```c
+void blake3_hasher_finalize_seek(
+ const blake3_hasher *self,
+ uint64_t seek,
+ uint8_t *out,
+ size_t out_len);
+```
+
+The same as `blake3_hasher_finalize`, but with an additional `seek`
+parameter for the starting byte position in the output stream. To
+efficiently stream a large output without allocating memory, call this
+function in a loop, incrementing `seek` by the output length each time.
+
+# Building
+
+This implementation is just C and assembly files. It doesn't include a
+public-facing build system. (The `Makefile` in this directory is only
+for testing.) Instead, the intention is that you can include these files
+in whatever build system you're already using. This section describes
+the commands your build system should execute, or which you can execute
+by hand. Note that these steps may change in future versions.
+
+## x86
+
+Dynamic dispatch is enabled by default on x86. The implementation will
+query the CPU at runtime to detect SIMD support, and it will use the
+widest instruction set available. By default, `blake3_dispatch.c`
+expects to be linked with code for five different instruction sets:
+portable C, SSE2, SSE4.1, AVX2, and AVX-512.
+
+For each of the x86 SIMD instruction sets, two versions are available,
+one in assembly (which is further divided into three flavors: Unix,
+Windows MSVC, and Windows GNU) and one using C intrinsics. The assembly
+versions are generally preferred: they perform better, they perform more
+consistently across different compilers, and they build more quickly. On
+the other hand, the assembly versions are x86\_64-only, and you need to
+select the right flavor for your target platform.
+
+Here's an example of building a shared library on x86\_64 Linux using
+the assembly implementations:
+
+```bash
+gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
+ blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
+ blake3_avx512_x86-64_unix.S
+```
+
+When building the intrinsics-based implementations, you need to build
+each implementation separately, with the corresponding instruction set
+explicitly enabled in the compiler. Here's the same shared library using
+the intrinsics-based implementations:
+
+```bash
+gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
+gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
+gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
+gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
+gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
+ blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
+```
+
+Note above that building `blake3_avx512.c` requires both `-mavx512f` and
+`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512`
+flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`.
+MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a
+corresponding flag.
+
+If you want to omit SIMD code entirely, you need to explicitly disable
+each instruction set. Here's an example of building a shared library on
+x86 with only portable code:
+
+```bash
+gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
+ -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
+```
+
+## ARM NEON
+
+The NEON implementation is not enabled by default on ARM, since not all
+ARM targets support it. To enable it, set `BLAKE3_USE_NEON=1`. Here's an
+example of building a shared library on ARM Linux with NEON support:
+
+```bash
+gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON blake3.c blake3_dispatch.c \
+ blake3_portable.c blake3_neon.c
+```
+
+Note that on some targets (ARMv7 in particular), extra flags may be
+required to activate NEON support in the compiler. If you see an error
+like...
+
+```
+/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed
+in call to always_inline ‘vaddq_u32’: target specific option mismatch
+```
+
+...then you may need to add something like `-mfpu=neon-vfpv4
+-mfloat-abi=hard`.
+
+## Other Platforms
+
+The portable implementation should work on most other architectures. For
+example:
+
+```bash
+gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c
+```
+
+# Differences from the Rust Implementation
+
+The single-threaded Rust and C implementations use the same algorithms,
+and their performance is the same if you use the assembly
+implementations or if you compile the intrinsics-based implementations
+with Clang. (Both Clang and rustc are LLVM-based.)
+
+The C implementation doesn't currently include any multithreading
+optimizations. OpenMP support or similar might be added in the future.
diff --git a/deps/blake3/blake3.c b/deps/blake3/blake3.c
@@ -0,0 +1,607 @@
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "blake3.h"
+#include "blake3_impl.h"
+
+const char * blake3_version(void) {
+ return BLAKE3_VERSION_STRING;
+}
+
+INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
+ uint8_t flags) {
+ memcpy(self->cv, key, BLAKE3_KEY_LEN);
+ self->chunk_counter = 0;
+ memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+ self->buf_len = 0;
+ self->blocks_compressed = 0;
+ self->flags = flags;
+}
+
+INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
+ uint64_t chunk_counter) {
+ memcpy(self->cv, key, BLAKE3_KEY_LEN);
+ self->chunk_counter = chunk_counter;
+ self->blocks_compressed = 0;
+ memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+ self->buf_len = 0;
+}
+
+INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
+ return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
+ ((size_t)self->buf_len);
+}
+
+INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
+ const uint8_t *input, size_t input_len) {
+ size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
+ if (take > input_len) {
+ take = input_len;
+ }
+ uint8_t *dest = self->buf + ((size_t)self->buf_len);
+ memcpy(dest, input, take);
+ self->buf_len += (uint8_t)take;
+ return take;
+}
+
+INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
+ if (self->blocks_compressed == 0) {
+ return CHUNK_START;
+ } else {
+ return 0;
+ }
+}
+
+typedef struct {
+ uint32_t input_cv[8];
+ uint64_t counter;
+ uint8_t block[BLAKE3_BLOCK_LEN];
+ uint8_t block_len;
+ uint8_t flags;
+} output_t;
+
+INLINE output_t make_output(const uint32_t input_cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags) {
+ output_t ret;
+ memcpy(ret.input_cv, input_cv, 32);
+ memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
+ ret.block_len = block_len;
+ ret.counter = counter;
+ ret.flags = flags;
+ return ret;
+}
+
+// Chaining values within a given chunk (specifically the compress_in_place
+// interface) are represented as words. This avoids unnecessary bytes<->words
+// conversion overhead in the portable implementation. However, the hash_many
+// interface handles both user input and parent node blocks, so it accepts
+// bytes. For that reason, chaining values in the CV stack are represented as
+// bytes.
+INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
+ uint32_t cv_words[8];
+ memcpy(cv_words, self->input_cv, 32);
+ blake3_compress_in_place(cv_words, self->block, self->block_len,
+ self->counter, self->flags);
+ store_cv_words(cv, cv_words);
+}
+
+INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
+ size_t out_len) {
+ uint64_t output_block_counter = seek / 64;
+ size_t offset_within_block = seek % 64;
+ uint8_t wide_buf[64];
+ while (out_len > 0) {
+ blake3_compress_xof(self->input_cv, self->block, self->block_len,
+ output_block_counter, self->flags | ROOT, wide_buf);
+ size_t available_bytes = 64 - offset_within_block;
+ size_t memcpy_len;
+ if (out_len > available_bytes) {
+ memcpy_len = available_bytes;
+ } else {
+ memcpy_len = out_len;
+ }
+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
+ out += memcpy_len;
+ out_len -= memcpy_len;
+ output_block_counter += 1;
+ offset_within_block = 0;
+ }
+}
+
+INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
+ size_t input_len) {
+ if (self->buf_len > 0) {
+ size_t take = chunk_state_fill_buf(self, input, input_len);
+ input += take;
+ input_len -= take;
+ if (input_len > 0) {
+ blake3_compress_in_place(
+ self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
+ self->flags | chunk_state_maybe_start_flag(self));
+ self->blocks_compressed += 1;
+ self->buf_len = 0;
+ memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+ }
+ }
+
+ while (input_len > BLAKE3_BLOCK_LEN) {
+ blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
+ self->chunk_counter,
+ self->flags | chunk_state_maybe_start_flag(self));
+ self->blocks_compressed += 1;
+ input += BLAKE3_BLOCK_LEN;
+ input_len -= BLAKE3_BLOCK_LEN;
+ }
+
+ size_t take = chunk_state_fill_buf(self, input, input_len);
+ input += take;
+ input_len -= take;
+}
+
+INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
+ uint8_t block_flags =
+ self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
+ return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
+ block_flags);
+}
+
+INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
+ const uint32_t key[8], uint8_t flags) {
+ return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
+}
+
+// Given some input larger than one chunk, return the number of bytes that
+// should go in the left subtree. This is the largest power-of-2 number of
+// chunks that leaves at least 1 byte for the right subtree.
+INLINE size_t left_len(size_t content_len) {
+ // Subtract 1 to reserve at least one byte for the right side. content_len
+ // should always be greater than BLAKE3_CHUNK_LEN.
+ size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+ return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
+}
+
+// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
+// on a single thread. Write out the chunk chaining values and return the
+// number of chunks hashed. These chunks are never the root and never empty;
+// those cases use a different codepath.
+INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
+ const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags,
+ uint8_t *out) {
+#if defined(BLAKE3_TESTING)
+ assert(0 < input_len);
+ assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
+#endif
+
+ const uint8_t *chunks_array[MAX_SIMD_DEGREE];
+ size_t input_position = 0;
+ size_t chunks_array_len = 0;
+ while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
+ chunks_array[chunks_array_len] = &input[input_position];
+ input_position += BLAKE3_CHUNK_LEN;
+ chunks_array_len += 1;
+ }
+
+ blake3_hash_many(chunks_array, chunks_array_len,
+ BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
+ true, flags, CHUNK_START, CHUNK_END, out);
+
+ // Hash the remaining partial chunk, if there is one. Note that the empty
+ // chunk (meaning the empty message) is a different codepath.
+ if (input_len > input_position) {
+ uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
+ blake3_chunk_state chunk_state;
+ chunk_state_init(&chunk_state, key, flags);
+ chunk_state.chunk_counter = counter;
+ chunk_state_update(&chunk_state, &input[input_position],
+ input_len - input_position);
+ output_t output = chunk_state_output(&chunk_state);
+ output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
+ return chunks_array_len + 1;
+ } else {
+ return chunks_array_len;
+ }
+}
+
+// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
+// on a single thread. Write out the parent chaining values and return the
+// number of parents hashed. (If there's an odd input chaining value left over,
+// return it as an additional output.) These parents are never the root and
+// never empty; those cases use a different codepath.
+INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
+ size_t num_chaining_values,
+ const uint32_t key[8], uint8_t flags,
+ uint8_t *out) {
+#if defined(BLAKE3_TESTING)
+ assert(2 <= num_chaining_values);
+ assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
+#endif
+
+ const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
+ size_t parents_array_len = 0;
+ while (num_chaining_values - (2 * parents_array_len) >= 2) {
+ parents_array[parents_array_len] =
+ &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
+ parents_array_len += 1;
+ }
+
+ blake3_hash_many(parents_array, parents_array_len, 1, key,
+ 0, // Parents always use counter 0.
+ false, flags | PARENT,
+ 0, // Parents have no start flags.
+ 0, // Parents have no end flags.
+ out);
+
+ // If there's an odd child left over, it becomes an output.
+ if (num_chaining_values > 2 * parents_array_len) {
+ memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
+ &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
+ BLAKE3_OUT_LEN);
+ return parents_array_len + 1;
+ } else {
+ return parents_array_len;
+ }
+}
+
+// The wide helper function returns (writes out) an array of chaining values
+// and returns the length of that array. The number of chaining values returned
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// if the input is shorter than that many chunks. The reason for maintaining a
+// wide array of chaining values going back up the tree, is to allow the
+// implementation to hash as many parents in parallel as possible.
+//
+// As a special case when the SIMD degree is 1, this function will still return
+// at least 2 outputs. This guarantees that this function doesn't perform the
+// root compression. (If it did, it would use the wrong flags, and also we
+// wouldn't be able to implement extendable output.) Note that this function is
+// not used when the whole input is only 1 chunk long; that's a different
+// codepath.
+//
+// Why not just have the caller split the input on the first update(), instead
+// of implementing this special rule? Because we don't want to limit SIMD or
+// multi-threading parallelism for that update().
+static size_t blake3_compress_subtree_wide(const uint8_t *input,
+ size_t input_len,
+ const uint32_t key[8],
+ uint64_t chunk_counter,
+ uint8_t flags, uint8_t *out) {
+ // Note that the single chunk case does *not* bump the SIMD degree up to 2
+ // when it is 1. If this implementation adds multi-threading in the future,
+ // this gives us the option of multi-threading even the 2-chunk case, which
+ // can help performance on smaller platforms.
+ if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
+ return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
+ out);
+ }
+
+ // With more than simd_degree chunks, we need to recurse. Start by dividing
+ // the input into left and right subtrees. (Note that this is only optimal
+ // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
+ // of 3 or something, we'll need a more complicated strategy.)
+ size_t left_input_len = left_len(input_len);
+ size_t right_input_len = input_len - left_input_len;
+ const uint8_t *right_input = &input[left_input_len];
+ uint64_t right_chunk_counter =
+ chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
+
+ // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
+ // account for the special case of returning 2 outputs when the SIMD degree
+ // is 1.
+ uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t degree = blake3_simd_degree();
+ if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
+ // The special case: We always use a degree of at least two, to make
+ // sure there are two outputs. Except, as noted above, at the chunk
+ // level, where we allow degree=1. (Note that the 1-chunk-input case is
+ // a different codepath.)
+ degree = 2;
+ }
+ uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
+
+ // Recurse! If this implementation adds multi-threading support in the
+ // future, this is where it will go.
+ size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
+ chunk_counter, flags, cv_array);
+ size_t right_n = blake3_compress_subtree_wide(
+ right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+
+ // The special case again. If simd_degree=1, then we'll have left_n=1 and
+ // right_n=1. Rather than compressing them into a single output, return
+ // them directly, to make sure we always have at least two outputs.
+ if (left_n == 1) {
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+ return 2;
+ }
+
+ // Otherwise, do one layer of parent node compression.
+ size_t num_chaining_values = left_n + right_n;
+ return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
+ out);
+}
+
+// Hash a subtree with compress_subtree_wide(), and then condense the resulting
+// list of chaining values down to a single parent node. Don't compress that
+// last parent node, however. Instead, return its message bytes (the
+// concatenated chaining values of its children). This is necessary when the
+// first call to update() supplies a complete subtree, because the topmost
+// parent node of that subtree could end up being the root. It's also necessary
+// for extended output in the general case.
+//
+// As with compress_subtree_wide(), this function is not used on inputs of 1
+// chunk or less. That's a different codepath.
+INLINE void compress_subtree_to_parent_node(
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
+#if defined(BLAKE3_TESTING)
+ assert(input_len > BLAKE3_CHUNK_LEN);
+#endif
+
+ uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
+ chunk_counter, flags, cv_array);
+
+ // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+ // compress_subtree_wide() returns more than 2 chaining values. Condense
+ // them into 2 by forming parent nodes repeatedly.
+ uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
+ while (num_cvs > 2) {
+ num_cvs =
+ compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
+ memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
+ }
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
+ uint8_t flags) {
+ memcpy(self->key, key, BLAKE3_KEY_LEN);
+ chunk_state_init(&self->chunk, key, flags);
+ self->cv_stack_len = 0;
+}
+
+void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
+
+void blake3_hasher_init_keyed(blake3_hasher *self,
+ const uint8_t key[BLAKE3_KEY_LEN]) {
+ uint32_t key_words[8];
+ load_key_words(key, key_words);
+ hasher_init_base(self, key_words, KEYED_HASH);
+}
+
+void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
+ size_t context_len) {
+ blake3_hasher context_hasher;
+ hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
+ blake3_hasher_update(&context_hasher, context, context_len);
+ uint8_t context_key[BLAKE3_KEY_LEN];
+ blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
+ uint32_t context_key_words[8];
+ load_key_words(context_key, context_key_words);
+ hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
+}
+
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
+ blake3_hasher_init_derive_key_raw(self, context, strlen(context));
+}
+
+// As described in hasher_push_cv() below, we do "lazy merging", delaying
+// merges until right before the next CV is about to be added. This is
+// different from the reference implementation. Another difference is that we
+// aren't always merging 1 chunk at a time. Instead, each CV might represent
+// any power-of-two number of chunks, as long as the smaller-above-larger stack
+// order is maintained. Instead of the "count the trailing 0-bits" algorithm
+// described in the spec, we use a "count the total number of 1-bits" variant
+// that doesn't require us to retain the subtree size of the CV on top of the
+// stack. The principle is the same: each CV that should remain in the stack is
+// represented by a 1-bit in the total number of chunks (or bytes) so far.
+INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
+ size_t post_merge_stack_len = (size_t)popcnt(total_len);
+ while (self->cv_stack_len > post_merge_stack_len) {
+ uint8_t *parent_node =
+ &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
+ output_t output = parent_output(parent_node, self->key, self->chunk.flags);
+ output_chaining_value(&output, parent_node);
+ self->cv_stack_len -= 1;
+ }
+}
+
+// In reference_impl.rs, we merge the new CV with existing CVs from the stack
+// before pushing it. We can do that because we know more input is coming, so
+// we know none of the merges are root.
+//
+// This setting is different. We want to feed as much input as possible to
+// compress_subtree_wide(), without setting aside anything for the chunk_state.
+// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
+// as a single subtree, if at all possible.
+//
+// This leads to two problems:
+// 1) This 64 KiB input might be the only call that ever gets made to update.
+// In this case, the root node of the 64 KiB subtree would be the root node
+// of the whole tree, and it would need to be ROOT finalized. We can't
+// compress it until we know.
+// 2) This 64 KiB input might complete a larger tree, whose root node is
+// similarly going to be the root of the whole tree. For example, maybe
+// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
+// node at the root of the 256 KiB subtree until we know how to finalize it.
+//
+// The second problem is solved with "lazy merging". That is, when we're about
+// to add a CV to the stack, we don't merge it with anything first, as the
+// reference impl does. Instead we do merges using the *previous* CV that was
+// added, which is sitting on top of the stack, and we put the new CV
+// (unmerged) on top of the stack afterwards. This guarantees that we never
+// merge the root node until finalize().
+//
+// Solving the first problem requires an additional tool,
+// compress_subtree_to_parent_node(). That function always returns the top
+// *two* chaining values of the subtree it's compressing. We then do lazy
+// merging with each of them separately, so that the second CV will always
+// remain unmerged. (That also helps us support extendable output when we're
+// hashing an input all-at-once.)
+INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
+ uint64_t chunk_counter) {
+ hasher_merge_cv_stack(self, chunk_counter);
+ memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
+ BLAKE3_OUT_LEN);
+ self->cv_stack_len += 1;
+}
+
+void blake3_hasher_update(blake3_hasher *self, const void *input,
+ size_t input_len) {
+ // Explicitly checking for zero avoids causing UB by passing a null pointer
+ // to memcpy. This comes up in practice with things like:
+ // std::vector<uint8_t> v;
+ // blake3_hasher_update(&hasher, v.data(), v.size());
+ if (input_len == 0) {
+ return;
+ }
+
+ const uint8_t *input_bytes = (const uint8_t *)input;
+
+ // If we have some partial chunk bytes in the internal chunk_state, we need
+ // to finish that chunk first.
+ if (chunk_state_len(&self->chunk) > 0) {
+ size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
+ if (take > input_len) {
+ take = input_len;
+ }
+ chunk_state_update(&self->chunk, input_bytes, take);
+ input_bytes += take;
+ input_len -= take;
+ // If we've filled the current chunk and there's more coming, finalize this
+ // chunk and proceed. In this case we know it's not the root.
+ if (input_len > 0) {
+ output_t output = chunk_state_output(&self->chunk);
+ uint8_t chunk_cv[32];
+ output_chaining_value(&output, chunk_cv);
+ hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
+ chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
+ } else {
+ return;
+ }
+ }
+
+ // Now the chunk_state is clear, and we have more input. If there's more than
+ // a single chunk (so, definitely not the root chunk), hash the largest whole
+ // subtree we can, with the full benefits of SIMD (and maybe in the future,
+ // multi-threading) parallelism. Two restrictions:
+ // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
+ // the right edge can be incomplete, and we don't know where the right edge
+ // is going to be until we get to finalize().
+ // - The subtree must evenly divide the total number of chunks up until this
+ // point (if total is not 0). If the current incomplete subtree is only
+ // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
+ // to complete the current subtree first.
+ // Because we might need to break up the input to form powers of 2, or to
+ // evenly divide what we already have, this part runs in a loop.
+ while (input_len > BLAKE3_CHUNK_LEN) {
+ size_t subtree_len = round_down_to_power_of_2(input_len);
+ uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+ // Shrink the subtree_len until it evenly divides the count so far. We know
+ // that subtree_len itself is a power of 2, so we can use a bitmasking
+ // trick instead of an actual remainder operation. (Note that if the caller
+ // consistently passes power-of-2 inputs of the same size, as is hopefully
+ // typical, this loop condition will always fail, and subtree_len will
+ // always be the full length of the input.)
+ //
+ // An aside: We don't have to shrink subtree_len quite this much. For
+ // example, if count_so_far is 1, we could pass 2 chunks to
+ // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
+ // get the right answer in the end, and we might get to use 2-way SIMD
+ // parallelism. The problem with this optimization, is that it gets us
+ // stuck always hashing 2 chunks. The total number of chunks will remain
+ // odd, and we'll never graduate to higher degrees of parallelism. See
+ // https://github.com/BLAKE3-team/BLAKE3/issues/69.
+ while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
+ subtree_len /= 2;
+ }
+ // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
+ // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
+ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+ if (subtree_len <= BLAKE3_CHUNK_LEN) {
+ blake3_chunk_state chunk_state;
+ chunk_state_init(&chunk_state, self->key, self->chunk.flags);
+ chunk_state.chunk_counter = self->chunk.chunk_counter;
+ chunk_state_update(&chunk_state, input_bytes, subtree_len);
+ output_t output = chunk_state_output(&chunk_state);
+ uint8_t cv[BLAKE3_OUT_LEN];
+ output_chaining_value(&output, cv);
+ hasher_push_cv(self, cv, chunk_state.chunk_counter);
+ } else {
+ // This is the high-performance happy path, though getting here depends
+ // on the caller giving us a long enough input.
+ uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
+ compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
+ self->chunk.chunk_counter,
+ self->chunk.flags, cv_pair);
+ hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
+ hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
+ self->chunk.chunk_counter + (subtree_chunks / 2));
+ }
+ self->chunk.chunk_counter += subtree_chunks;
+ input_bytes += subtree_len;
+ input_len -= subtree_len;
+ }
+
+ // If there's any remaining input less than a full chunk, add it to the chunk
+ // state. In that case, also do a final merge loop to make sure the subtree
+ // stack doesn't contain any unmerged pairs. The remaining input means we
+ // know these merges are non-root. This merge loop isn't strictly necessary
+ // here, because hasher_push_cv already does its own merge loop, but it
+ // simplifies blake3_hasher_finalize below.
+ if (input_len > 0) {
+ chunk_state_update(&self->chunk, input_bytes, input_len);
+ hasher_merge_cv_stack(self, self->chunk.chunk_counter);
+ }
+}
+
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
+ size_t out_len) {
+ blake3_hasher_finalize_seek(self, 0, out, out_len); // Convenience wrapper: same call with seek fixed at 0.
+}
+
+void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
+                                 uint8_t *out, size_t out_len) {
+  // Nothing to write. Bailing out here also means a null `out` never reaches
+  // memcpy (which would be UB). Callers hit this with code such as:
+  //   std::vector<uint8_t> v;
+  //   blake3_hasher_finalize(&hasher, v.data(), v.size());
+  if (out_len == 0) {
+    return;
+  }
+
+  // An empty subtree stack means the current chunk is itself the root.
+  if (self->cv_stack_len == 0) {
+    output_t root = chunk_state_output(&self->chunk);
+    output_root_bytes(&root, seek, out, out_len);
+    return;
+  }
+  // Choose the node that the roll-up starts from:
+  // - Chunk state holds bytes: finalize that chunk and merge it with every
+  //   subtree on the stack. The extra merge loop at the end of
+  //   blake3_hasher_update guarantees the stacked subtrees do not first need
+  //   to be merged with each other.
+  // - Chunk state is empty: the top of the stack is a chunk hash, so start
+  //   from the parent of the top two stack entries.
+  output_t node;
+  size_t pending;
+  if (chunk_state_len(&self->chunk) > 0) {
+    pending = self->cv_stack_len;
+    node = chunk_state_output(&self->chunk);
+  } else {
+    // There are always at least 2 CVs in the stack in this case.
+    pending = self->cv_stack_len - 2;
+    node = parent_output(&self->cv_stack[pending * 32], self->key,
+                         self->chunk.flags);
+  }
+  // Each stacked CV fills the first half of a parent block; the running
+  // node's CV fills the second half.
+  while (pending-- > 0) {
+    uint8_t parent_block[BLAKE3_BLOCK_LEN];
+    memcpy(parent_block, &self->cv_stack[pending * 32], 32);
+    output_chaining_value(&node, &parent_block[32]);
+    node = parent_output(parent_block, self->key, self->chunk.flags);
+  }
+  output_root_bytes(&node, seek, out, out_len);
+}
diff --git a/deps/blake3/blake3.h b/deps/blake3/blake3.h
@@ -0,0 +1,60 @@
+#ifndef BLAKE3_H
+#define BLAKE3_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLAKE3_VERSION_STRING "0.3.7"
+#define BLAKE3_KEY_LEN 32 // bytes in the key taken by blake3_hasher_init_keyed
+#define BLAKE3_OUT_LEN 32 // bytes per chaining value (one cv_stack entry)
+#define BLAKE3_BLOCK_LEN 64 // bytes in blake3_chunk_state.buf
+#define BLAKE3_CHUNK_LEN 1024 // bytes of input per chunk
+#define BLAKE3_MAX_DEPTH 54 // cv_stack has room for BLAKE3_MAX_DEPTH + 1 chaining values
+#define BLAKE3_MAX_SIMD_DEGREE 16
+
+// This struct is a private implementation detail. It has to be here because
+// it's part of blake3_hasher below.
+typedef struct {
+ uint32_t cv[8];
+ uint64_t chunk_counter; // index of the current chunk; blake3_hasher_update advances it as whole chunks are hashed
+ uint8_t buf[BLAKE3_BLOCK_LEN]; // NOTE(review): presumably buffers a partial block — confirm in blake3.c
+ uint8_t buf_len; // NOTE(review): presumably the number of valid bytes in buf — confirm in blake3.c
+ uint8_t blocks_compressed;
+ uint8_t flags; // the hasher forwards these to chunk_state_init and parent_output
+} blake3_chunk_state;
+
+typedef struct {
+ uint32_t key[8]; // key words handed to chunk_state_reset and parent_output
+ blake3_chunk_state chunk; // state of the chunk currently being filled
+ uint8_t cv_stack_len; // number of chaining values currently on cv_stack (not a byte count)
+ // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
+ // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
+ // requires a 4th entry, rather than merging everything down to 1, because we
+ // don't know whether more input is coming. This is different from how the
+ // reference implementation does things.
+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
+} blake3_hasher;
+
+const char * blake3_version(void);
+void blake3_hasher_init(blake3_hasher *self);
+void blake3_hasher_init_keyed(blake3_hasher *self,
+ const uint8_t key[BLAKE3_KEY_LEN]); // key is BLAKE3_KEY_LEN bytes long
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
+void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
+ size_t context_len);
+void blake3_hasher_update(blake3_hasher *self, const void *input,
+ size_t input_len); // a trailing partial chunk of input is kept in self->chunk
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
+ size_t out_len); // forwards to blake3_hasher_finalize_seek with seek = 0
+void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
+ uint8_t *out, size_t out_len); // returns without touching out when out_len == 0; self is const
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_H */
diff --git a/deps/blake3/blake3_avx2.c b/deps/blake3/blake3_avx2.c
@@ -0,0 +1,325 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 8 // inputs consumed by each blake3_hash8_avx2 call in blake3_hash_many_avx2
+
+INLINE __m256i loadu(const uint8_t src[32]) { // unaligned load of 32 bytes into one 256-bit vector
+ return _mm256_loadu_si256((const __m256i *)src);
+}
+
+INLINE void storeu(__m256i src, uint8_t dest[32]) { // unaligned store: writes all 32 bytes of src to dest
+  _mm256_storeu_si256((__m256i *)dest, src); // dest needs room for a full 256-bit vector, hence the [32] bound
+}
+
+INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } // add per 32-bit lane
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } // bitwise XOR of the two vectors
+
+INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } // broadcast x to every 32-bit lane
+
+INLINE __m256i rot16(__m256i x) { // rotate every 32-bit lane by 16 bits using a byte shuffle
+ return _mm256_shuffle_epi8(
+ x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
+ 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // within each 4-byte group, result byte 0 comes from source byte 2, 1 from 3, 2 from 0, 3 from 1
+}
+
+INLINE __m256i rot12(__m256i x) { // rotate every 32-bit lane right by 12 bits
+ return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m256i rot8(__m256i x) { // rotate every 32-bit lane right by 8 bits using a byte shuffle
+ return _mm256_shuffle_epi8(
+ x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
+ 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); // within each 4-byte group, result byte 0 comes from source byte 1, 1 from 2, 2 from 3, 3 from 0
+}
+
+INLINE __m256i rot7(__m256i x) { // rotate every 32-bit lane right by 7 bits
+ return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
+}
+
+INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { // One round over the 16 state vectors v, updated in place; MSG_SCHEDULE[r] selects which m[] entry feeds each step.
+ v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // First half: mix the quartets (v[i], v[i+4], v[i+8], v[i+12]) for i = 0..3, interleaved step by step.
+ v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+ v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+ v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+ v[0] = addv(v[0], v[4]);
+ v[1] = addv(v[1], v[5]);
+ v[2] = addv(v[2], v[6]);
+ v[3] = addv(v[3], v[7]);
+ v[12] = xorv(v[12], v[0]);
+ v[13] = xorv(v[13], v[1]);
+ v[14] = xorv(v[14], v[2]);
+ v[15] = xorv(v[15], v[3]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[15] = rot16(v[15]);
+ v[8] = addv(v[8], v[12]);
+ v[9] = addv(v[9], v[13]);
+ v[10] = addv(v[10], v[14]);
+ v[11] = addv(v[11], v[15]);
+ v[4] = xorv(v[4], v[8]);
+ v[5] = xorv(v[5], v[9]);
+ v[6] = xorv(v[6], v[10]);
+ v[7] = xorv(v[7], v[11]);
+ v[4] = rot12(v[4]);
+ v[5] = rot12(v[5]);
+ v[6] = rot12(v[6]);
+ v[7] = rot12(v[7]);
+ v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // Same quartets again with the odd-numbered schedule entries and rotations of 8 and 7.
+ v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+ v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+ v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+ v[0] = addv(v[0], v[4]);
+ v[1] = addv(v[1], v[5]);
+ v[2] = addv(v[2], v[6]);
+ v[3] = addv(v[3], v[7]);
+ v[12] = xorv(v[12], v[0]);
+ v[13] = xorv(v[13], v[1]);
+ v[14] = xorv(v[14], v[2]);
+ v[15] = xorv(v[15], v[3]);
+ v[12] = rot8(v[12]);
+ v[13] = rot8(v[13]);
+ v[14] = rot8(v[14]);
+ v[15] = rot8(v[15]);
+ v[8] = addv(v[8], v[12]);
+ v[9] = addv(v[9], v[13]);
+ v[10] = addv(v[10], v[14]);
+ v[11] = addv(v[11], v[15]);
+ v[4] = xorv(v[4], v[8]);
+ v[5] = xorv(v[5], v[9]);
+ v[6] = xorv(v[6], v[10]);
+ v[7] = xorv(v[7], v[11]);
+ v[4] = rot7(v[4]);
+ v[5] = rot7(v[5]);
+ v[6] = rot7(v[6]);
+ v[7] = rot7(v[7]);
+
+ v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); // Second half: the same step sequence on (v0,v5,v10,v15), (v1,v6,v11,v12), (v2,v7,v8,v13), (v3,v4,v9,v14).
+ v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+ v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+ v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+ v[0] = addv(v[0], v[5]);
+ v[1] = addv(v[1], v[6]);
+ v[2] = addv(v[2], v[7]);
+ v[3] = addv(v[3], v[4]);
+ v[15] = xorv(v[15], v[0]);
+ v[12] = xorv(v[12], v[1]);
+ v[13] = xorv(v[13], v[2]);
+ v[14] = xorv(v[14], v[3]);
+ v[15] = rot16(v[15]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[10] = addv(v[10], v[15]);
+ v[11] = addv(v[11], v[12]);
+ v[8] = addv(v[8], v[13]);
+ v[9] = addv(v[9], v[14]);
+ v[5] = xorv(v[5], v[10]);
+ v[6] = xorv(v[6], v[11]);
+ v[7] = xorv(v[7], v[8]);
+ v[4] = xorv(v[4], v[9]);
+ v[5] = rot12(v[5]);
+ v[6] = rot12(v[6]);
+ v[7] = rot12(v[7]);
+ v[4] = rot12(v[4]);
+ v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); // Same four groups again with the odd-numbered schedule entries and rotations of 8 and 7.
+ v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+ v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+ v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+ v[0] = addv(v[0], v[5]);
+ v[1] = addv(v[1], v[6]);
+ v[2] = addv(v[2], v[7]);
+ v[3] = addv(v[3], v[4]);
+ v[15] = xorv(v[15], v[0]);
+ v[12] = xorv(v[12], v[1]);
+ v[13] = xorv(v[13], v[2]);
+ v[14] = xorv(v[14], v[3]);
+ v[15] = rot8(v[15]);
+ v[12] = rot8(v[12]);
+ v[13] = rot8(v[13]);
+ v[14] = rot8(v[14]);
+ v[10] = addv(v[10], v[15]);
+ v[11] = addv(v[11], v[12]);
+ v[8] = addv(v[8], v[13]);
+ v[9] = addv(v[9], v[14]);
+ v[5] = xorv(v[5], v[10]);
+ v[6] = xorv(v[6], v[11]);
+ v[7] = xorv(v[7], v[8]);
+ v[4] = xorv(v[4], v[9]);
+ v[5] = rot7(v[5]);
+ v[6] = rot7(v[6]);
+ v[7] = rot7(v[7]);
+ v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m256i vecs[DEGREE]) { // Transposes, in place, the 8x8 grid of 32-bit words held in vecs[0..7].
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+ // is 22/33/66/77.
+ __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+ __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+ __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+ __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+ __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+ __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+ __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+ __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
+ // 11/33.
+ __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+ __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+ __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+ __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+ __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+ __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+ __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+ __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+ // Interleave 128-bit lanes. Selector 0x20 pairs the low 128-bit halves of
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); // both operands; 0x31 pairs the high halves.
+ vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+ vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+ vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+ vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
+ vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+ vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+ vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
+
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m256i out[16]) {
+  // For each of the eight inputs, read the 64 bytes that start at
+  // block_offset: the first 32 bytes go to out[i] and the following 32 bytes
+  // go to out[i + 8].
+  const size_t half = sizeof(__m256i);
+  for (size_t i = 0; i < 8; ++i) {
+    const uint8_t *block = &inputs[i][block_offset];
+    out[i] = loadu(&block[0]);
+    out[i + 8] = loadu(&block[half]);
+  }
+
+  // Prefetch 256 bytes past block_offset in every input.
+  for (size_t i = 0; i < 8; ++i) {
+    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+
+  // Transpose each group of eight vectors on its own. Before, out[i] held
+  // eight consecutive 32-bit words of input i; after, each vector holds the
+  // same word position from all eight inputs (out[0..7] for words 0-7,
+  // out[8..15] for words 8-15).
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[8]);
+}
+
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m256i *out_lo, __m256i *out_hi) {
+  // Lane i gets counter + i, or just counter when increment_counter is false.
+  const __m256i enable = _mm256_set1_epi32(-(int32_t)increment_counter);
+  const __m256i step = _mm256_and_si256(enable, _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+  const __m256i sign = _mm256_set1_epi32(0x80000000);
+  const __m256i low = _mm256_add_epi32(_mm256_set1_epi32(counter), step);
+  // Sign-bit flip turns the signed compare unsigned: all-ones where low wrapped.
+  const __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(step, sign), _mm256_xor_si256(low, sign));
+  *out_lo = low;
+  *out_hi = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); // minus all-ones (-1) adds the carry
+}
+
+void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  // Every lane starts from the same key words.
+  __m256i h_vecs[8];
+  for (size_t i = 0; i < 8; i++) {
+    h_vecs[i] = set1(key[i]);
+  }
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  const __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+
+  // flags_start is applied to the first block only and flags_end to the last
+  // one; when blocks == 1 that single block gets both.
+  uint8_t block_flags = flags | flags_start;
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m256i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    // State layout: v[0..7] chaining value, v[8..11] IV words, v[12..15]
+    // counter low, counter high, block length, block flags.
+    __m256i v[16];
+    for (size_t i = 0; i < 8; i++) {
+      v[i] = h_vecs[i];
+    }
+    for (size_t i = 0; i < 4; i++) {
+      v[8 + i] = set1(IV[i]);
+    }
+    v[12] = counter_low_vec;
+    v[13] = counter_high_vec;
+    v[14] = block_len_vec;
+    v[15] = set1(block_flags);
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    // Feed-forward: XOR the two halves of the state into the chaining value.
+    for (size_t i = 0; i < 8; i++) {
+      h_vecs[i] = xorv(v[i], v[i + 8]);
+    }
+    block_flags = flags;
+  }
+
+  // Transpose back so each vector is one input's 32-byte result, then store.
+  transpose_vecs(h_vecs);
+  for (size_t i = 0; i < 8; i++) {
+    storeu(h_vecs[i], &out[i * sizeof(__m256i)]);
+  }
+}
+
+#if !defined(BLAKE3_NO_SSE41) // declare whichever fallback blake3_hash_many_avx2 calls for the inputs left after its 8-wide loop
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+ size_t blocks, const uint32_t key[8],
+ uint64_t counter, bool increment_counter,
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out);
+#else // BLAKE3_NO_SSE41 is defined: the portable implementation is the fallback
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+ size_t blocks, const uint32_t key[8],
+ uint64_t counter, bool increment_counter,
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out);
+#endif // !defined(BLAKE3_NO_SSE41)
+
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  // Hash the inputs eight at a time with the 8-way kernel.
+  for (; num_inputs >= DEGREE; num_inputs -= DEGREE) {
+    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    counter += increment_counter ? DEGREE : 0;
+    inputs += DEGREE;
+    out += DEGREE * BLAKE3_OUT_LEN;
+  }
+  // Whatever is left (fewer than DEGREE inputs, possibly none) goes to a
+  // narrower implementation.
+#if defined(BLAKE3_NO_SSE41)
+  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+#else
+  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                         increment_counter, flags, flags_start, flags_end, out);
+#endif
+}
diff --git a/deps/blake3/blake3_avx2_x86-64_unix.S b/deps/blake3/blake3_avx2_x86-64_unix.S
@@ -0,0 +1,1815 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global _blake3_hash_many_avx2
+.global blake3_hash_many_avx2
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+ .p2align 6
+_blake3_hash_many_avx2:
+blake3_hash_many_avx2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 680
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ vmovd xmm0, r9d
+ vpbroadcastd ymm0, xmm0
+ vmovdqa ymmword ptr [rsp+0x280], ymm0
+ vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
+ vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
+ vmovdqa ymmword ptr [rsp+0x220], ymm2
+ vmovd xmm2, r8d
+ vpbroadcastd ymm2, xmm2
+ vpaddd ymm2, ymm2, ymm1
+ vmovdqa ymmword ptr [rsp+0x240], ymm2
+ vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm1, ymm2
+ shr r8, 32
+ vmovd xmm3, r8d
+ vpbroadcastd ymm3, xmm3
+ vpsubd ymm3, ymm3, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm3
+ shl rdx, 6
+ mov qword ptr [rsp+0x2A0], rdx
+ cmp rsi, 8
+ jc 3f
+2:
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x2A0]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x200], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x20], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x40], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x60], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x80], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0xA0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0xC0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0xE0], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x100], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x120], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x140], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x160], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x180], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x1A0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x1C0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x1E0], ymm11
+ vpbroadcastd ymm15, dword ptr [rsp+0x200]
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
+ vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
+ vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpxor ymm15, ymm3, ymm15
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
+ vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
+ vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
+ vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp+0x220]
+ vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
+ vmovdqa ymmword ptr [rsp+0x240], ymm1
+ vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm0, ymm2
+ vmovdqa ymm0, ymmword ptr [rsp+0x260]
+ vpsubd ymm2, ymm0, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm2
+ add rdi, 64
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 8
+ cmp rsi, 8
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x2A0]
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ test rsi, 0x4
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovdqa ymm8, ymm0
+ vmovdqa ymm9, ymm1
+ vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
+ vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
+ vpunpckldq ymm14, ymm12, ymm13
+ vpunpckhdq ymm15, ymm12, ymm13
+ vpermq ymm14, ymm14, 0x50
+ vpermq ymm15, ymm15, 0x50
+ vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpblendd ymm14, ymm14, ymm12, 0x44
+ vpblendd ymm15, ymm15, ymm12, 0x44
+ vmovdqa ymmword ptr [rsp], ymm14
+ vmovdqa ymmword ptr [rsp+0x20], ymm15
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vmovups ymm2, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm2, ymm3, 136
+ vshufps ymm5, ymm2, ymm3, 221
+ vmovups ymm2, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm2, ymm3, 136
+ vshufps ymm7, ymm2, ymm3, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ vmovups ymm10, ymmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
+ vshufps ymm12, ymm10, ymm11, 136
+ vshufps ymm13, ymm10, ymm11, 221
+ vmovups ymm10, ymmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
+ vshufps ymm14, ymm10, ymm11, 136
+ vshufps ymm15, ymm10, ymm11, 221
+ vpshufd ymm14, ymm14, 0x93
+ vpshufd ymm15, ymm15, 0x93
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ vpbroadcastd ymm2, dword ptr [rsp+0x200]
+ vmovdqa ymm3, ymmword ptr [rsp]
+ vmovdqa ymm11, ymmword ptr [rsp+0x20]
+ vpblendd ymm3, ymm3, ymm2, 0x88
+ vpblendd ymm11, ymm11, ymm2, 0x88
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa ymm10, ymm2
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm8, ymm8, ymm12
+ vmovdqa ymmword ptr [rsp+0x40], ymm4
+ nop
+ vmovdqa ymmword ptr [rsp+0x60], ymm12
+ nop
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vmovdqa ymmword ptr [rsp+0x80], ymm5
+ vmovdqa ymmword ptr [rsp+0xA0], ymm13
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm8, ymm8, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpshufd ymm10, ymm10, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm8, ymm8, ymm15
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm8, ymm8, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ vpshufd ymm10, ymm10, 0x93
+ dec al
+ je 9f
+ vmovdqa ymm4, ymmword ptr [rsp+0x40]
+ vmovdqa ymm5, ymmword ptr [rsp+0x80]
+ vshufps ymm12, ymm4, ymm5, 214
+ vpshufd ymm13, ymm4, 0x0F
+ vpshufd ymm4, ymm12, 0x39
+ vshufps ymm12, ymm6, ymm7, 250
+ vpblendd ymm13, ymm13, ymm12, 0xAA
+ vpunpcklqdq ymm12, ymm7, ymm5
+ vpblendd ymm12, ymm12, ymm6, 0x88
+ vpshufd ymm12, ymm12, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymmword ptr [rsp+0x40], ymm13
+ vmovdqa ymmword ptr [rsp+0x80], ymm12
+ vmovdqa ymm12, ymmword ptr [rsp+0x60]
+ vmovdqa ymm13, ymmword ptr [rsp+0xA0]
+ vshufps ymm5, ymm12, ymm13, 214
+ vpshufd ymm6, ymm12, 0x0F
+ vpshufd ymm12, ymm5, 0x39
+ vshufps ymm5, ymm14, ymm15, 250
+ vpblendd ymm6, ymm6, ymm5, 0xAA
+ vpunpcklqdq ymm5, ymm15, ymm13
+ vpblendd ymm5, ymm5, ymm14, 0x88
+ vpshufd ymm5, ymm5, 0x78
+ vpunpckhdq ymm13, ymm13, ymm15
+ vpunpckldq ymm14, ymm14, ymm13
+ vpshufd ymm15, ymm14, 0x1E
+ vmovdqa ymm13, ymm6
+ vmovdqa ymm14, ymm5
+ vmovdqa ymm5, ymmword ptr [rsp+0x40]
+ vmovdqa ymm6, ymmword ptr [rsp+0x80]
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ vpxor ymm8, ymm8, ymm10
+ vpxor ymm9, ymm9, ymm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqu xmmword ptr [rbx+0x40], xmm8
+ vmovdqu xmmword ptr [rbx+0x50], xmm9
+ vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
+ vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
+ vmovaps xmm8, xmmword ptr [rsp+0x280]
+ vmovaps xmm0, xmmword ptr [rsp+0x240]
+ vmovaps xmm1, xmmword ptr [rsp+0x250]
+ vmovaps xmm2, xmmword ptr [rsp+0x260]
+ vmovaps xmm3, xmmword ptr [rsp+0x270]
+ vblendvps xmm0, xmm0, xmm1, xmm8
+ vblendvps xmm2, xmm2, xmm3, xmm8
+ vmovaps xmmword ptr [rsp+0x240], xmm0
+ vmovaps xmmword ptr [rsp+0x260], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test rsi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp+0x240]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x244]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
+ vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x200]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovaps ymm8, ymmword ptr [rsp+0x280]
+ vmovaps ymm0, ymmword ptr [rsp+0x240]
+ vmovups ymm1, ymmword ptr [rsp+0x248]
+ vmovaps ymm2, ymmword ptr [rsp+0x260]
+ vmovups ymm3, ymmword ptr [rsp+0x268]
+ vblendvps ymm0, ymm0, ymm1, ymm8
+ vblendvps ymm2, ymm2, ymm3, ymm8
+ vmovaps ymmword ptr [rsp+0x240], ymm0
+ vmovaps ymmword ptr [rsp+0x260], ymm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test rsi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm3, dword ptr [rsp+0x240]
+ vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm14, xmmword ptr [ROT16+rip]
+ vmovdqa xmm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa xmm3, xmm13
+ vpinsrd xmm3, xmm3, eax, 3
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+
+#ifdef __APPLE__ /* Constant tables for the AVX2 code above, which addresses them as [LABEL+rip]. */
+.static_data /* Section used when __APPLE__ is defined. */
+#else
+.section .rodata /* Section used for every other target. */
+#endif
+.p2align 6 /* 64-byte alignment. The vmovdqa loads of ROT16/ROT8/BLAKE3_IV above need 16-byte alignment, so keep the table sizes intact. */
+ADD0: /* Eight dwords holding the lane indices 0..7. NOTE(review): users are outside this chunk; presumably per-lane counter offsets - confirm. */
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ADD1: /* Eight dwords, each 8. NOTE(review): users are outside this chunk; presumably the counter step per 8-lane batch - confirm. */
+ .long 8, 8, 8, 8, 8, 8, 8, 8
+BLAKE3_IV_0: /* 0x6A09E667 (first word of BLAKE3_IV below) replicated into all eight dwords of a 256-bit vector. */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1: /* 0xBB67AE85 (second word of BLAKE3_IV) replicated eight times. */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2: /* 0x3C6EF372 (third word of BLAKE3_IV) replicated eight times. */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3: /* 0xA54FF53A (fourth word of BLAKE3_IV) replicated eight times. */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN: /* 0x40 (64) in every dword; the remainder paths above vpinsrd one dword of it into element 2 of xmm13/xmm14. */
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ROT16: /* vpshufb control: result bytes 0..3 of each dword come from source bytes 2,3,0,1, i.e. every 32-bit lane is rotated by 16 bits. */
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8: /* vpshufb control: result bytes 0..3 of each dword come from source bytes 1,2,3,0, i.e. every 32-bit lane is rotated right by 8 bits. */
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+CMP_MSB_MASK: /* Sign bit (0x80000000) set in each of eight dwords. NOTE(review): users are outside this chunk; presumably biases operands for an unsigned compare - confirm. */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+BLAKE3_IV: /* The same four IV words, one dword each; the code above loads them into xmm2 (vmovdqa) or broadcasts them into ymm2 (vbroadcasti128) for each block. */
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+
diff --git a/deps/blake3/blake3_avx2_x86-64_windows_gnu.S b/deps/blake3/blake3_avx2_x86-64_windows_gnu.S
@@ -0,0 +1,1817 @@
+.intel_syntax noprefix
+.global _blake3_hash_many_avx2
+.global blake3_hash_many_avx2
+.section .text
+ .p2align 6
+_blake3_hash_many_avx2:
+blake3_hash_many_avx2:
+ push r15
+ push r14
+ push r13
+ push r12
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 880
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ vmovdqa xmmword ptr [rsp+0x2D0], xmm6
+ vmovdqa xmmword ptr [rsp+0x2E0], xmm7
+ vmovdqa xmmword ptr [rsp+0x2F0], xmm8
+ vmovdqa xmmword ptr [rsp+0x300], xmm9
+ vmovdqa xmmword ptr [rsp+0x310], xmm10
+ vmovdqa xmmword ptr [rsp+0x320], xmm11
+ vmovdqa xmmword ptr [rsp+0x330], xmm12
+ vmovdqa xmmword ptr [rsp+0x340], xmm13
+ vmovdqa xmmword ptr [rsp+0x350], xmm14
+ vmovdqa xmmword ptr [rsp+0x360], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+0x68]
+ movzx r9, byte ptr [rbp+0x70]
+ neg r9d
+ vmovd xmm0, r9d
+ vpbroadcastd ymm0, xmm0
+ vmovdqa ymmword ptr [rsp+0x260], ymm0
+ vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
+ vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
+ vmovdqa ymmword ptr [rsp+0x2A0], ymm2
+ vmovd xmm2, r8d
+ vpbroadcastd ymm2, xmm2
+ vpaddd ymm2, ymm2, ymm1
+ vmovdqa ymmword ptr [rsp+0x220], ymm2
+ vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm1, ymm2
+ shr r8, 32
+ vmovd xmm3, r8d
+ vpbroadcastd ymm3, xmm3
+ vpsubd ymm3, ymm3, ymm2
+ vmovdqa ymmword ptr [rsp+0x240], ymm3
+ shl rdx, 6
+ mov qword ptr [rsp+0x2C0], rdx
+ cmp rsi, 8
+ jc 3f
+2:
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x78]
+ movzx ebx, byte ptr [rbp+0x80]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x88]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x2C0]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x200], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x20], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x40], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x60], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x80], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0xA0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0xC0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0xE0], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x100], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x120], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x140], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x160], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x180], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x1A0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x1C0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x1E0], ymm11
+ vpbroadcastd ymm15, dword ptr [rsp+0x200]
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm0, ymmword ptr [rsp+0x220]
+ vpxor ymm13, ymm1, ymmword ptr [rsp+0x240]
+ vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpxor ymm15, ymm3, ymm15
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
+ vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
+ vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
+ vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x78]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x90]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp+0x2A0]
+ vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220]
+ vmovdqa ymmword ptr [rsp+0x220], ymm1
+ vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm0, ymm2
+ vmovdqa ymm0, ymmword ptr [rsp+0x240]
+ vpsubd ymm2, ymm0, ymm2
+ vmovdqa ymmword ptr [rsp+0x240], ymm2
+ add rdi, 64
+ add rbx, 256
+ mov qword ptr [rbp+0x90], rbx
+ sub rsi, 8
+ cmp rsi, 8
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ vmovdqa xmm6, xmmword ptr [rsp+0x2D0]
+ vmovdqa xmm7, xmmword ptr [rsp+0x2E0]
+ vmovdqa xmm8, xmmword ptr [rsp+0x2F0]
+ vmovdqa xmm9, xmmword ptr [rsp+0x300]
+ vmovdqa xmm10, xmmword ptr [rsp+0x310]
+ vmovdqa xmm11, xmmword ptr [rsp+0x320]
+ vmovdqa xmm12, xmmword ptr [rsp+0x330]
+ vmovdqa xmm13, xmmword ptr [rsp+0x340]
+ vmovdqa xmm14, xmmword ptr [rsp+0x350]
+ vmovdqa xmm15, xmmword ptr [rsp+0x360]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ mov rbx, qword ptr [rbp+0x90]
+ mov r15, qword ptr [rsp+0x2C0]
+ movzx r13d, byte ptr [rbp+0x78]
+ movzx r12d, byte ptr [rbp+0x88]
+ test rsi, 0x4
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovdqa ymm8, ymm0
+ vmovdqa ymm9, ymm1
+ vbroadcasti128 ymm12, xmmword ptr [rsp+0x220]
+ vbroadcasti128 ymm13, xmmword ptr [rsp+0x240]
+ vpunpckldq ymm14, ymm12, ymm13
+ vpunpckhdq ymm15, ymm12, ymm13
+ vpermq ymm14, ymm14, 0x50
+ vpermq ymm15, ymm15, 0x50
+ vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpblendd ymm14, ymm14, ymm12, 0x44
+ vpblendd ymm15, ymm15, ymm12, 0x44
+ vmovdqa ymmword ptr [rsp], ymm14
+ vmovdqa ymmword ptr [rsp+0x20], ymm15
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vmovups ymm2, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm2, ymm3, 136
+ vshufps ymm5, ymm2, ymm3, 221
+ vmovups ymm2, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm2, ymm3, 136
+ vshufps ymm7, ymm2, ymm3, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ vmovups ymm10, ymmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
+ vshufps ymm12, ymm10, ymm11, 136
+ vshufps ymm13, ymm10, ymm11, 221
+ vmovups ymm10, ymmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
+ vshufps ymm14, ymm10, ymm11, 136
+ vshufps ymm15, ymm10, ymm11, 221
+ vpshufd ymm14, ymm14, 0x93
+ vpshufd ymm15, ymm15, 0x93
+ vpbroadcastd ymm2, dword ptr [rsp+0x200]
+ vmovdqa ymm3, ymmword ptr [rsp]
+ vmovdqa ymm11, ymmword ptr [rsp+0x20]
+ vpblendd ymm3, ymm3, ymm2, 0x88
+ vpblendd ymm11, ymm11, ymm2, 0x88
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa ymm10, ymm2
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm8, ymm8, ymm12
+ vmovdqa ymmword ptr [rsp+0x40], ymm4
+ nop
+ vmovdqa ymmword ptr [rsp+0x60], ymm12
+ nop
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vmovdqa ymmword ptr [rsp+0x80], ymm5
+ vmovdqa ymmword ptr [rsp+0xA0], ymm13
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm8, ymm8, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpshufd ymm10, ymm10, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm8, ymm8, ymm15
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm8, ymm8, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ vpshufd ymm10, ymm10, 0x93
+ dec al
+ je 9f
+ vmovdqa ymm4, ymmword ptr [rsp+0x40]
+ vmovdqa ymm5, ymmword ptr [rsp+0x80]
+ vshufps ymm12, ymm4, ymm5, 214
+ vpshufd ymm13, ymm4, 0x0F
+ vpshufd ymm4, ymm12, 0x39
+ vshufps ymm12, ymm6, ymm7, 250
+ vpblendd ymm13, ymm13, ymm12, 0xAA
+ vpunpcklqdq ymm12, ymm7, ymm5
+ vpblendd ymm12, ymm12, ymm6, 0x88
+ vpshufd ymm12, ymm12, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymmword ptr [rsp+0x40], ymm13
+ vmovdqa ymmword ptr [rsp+0x80], ymm12
+ vmovdqa ymm12, ymmword ptr [rsp+0x60]
+ vmovdqa ymm13, ymmword ptr [rsp+0xA0]
+ vshufps ymm5, ymm12, ymm13, 214
+ vpshufd ymm6, ymm12, 0x0F
+ vpshufd ymm12, ymm5, 0x39
+ vshufps ymm5, ymm14, ymm15, 250
+ vpblendd ymm6, ymm6, ymm5, 0xAA
+ vpunpcklqdq ymm5, ymm15, ymm13
+ vpblendd ymm5, ymm5, ymm14, 0x88
+ vpshufd ymm5, ymm5, 0x78
+ vpunpckhdq ymm13, ymm13, ymm15
+ vpunpckldq ymm14, ymm14, ymm13
+ vpshufd ymm15, ymm14, 0x1E
+ vmovdqa ymm13, ymm6
+ vmovdqa ymm14, ymm5
+ vmovdqa ymm5, ymmword ptr [rsp+0x40]
+ vmovdqa ymm6, ymmword ptr [rsp+0x80]
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ vpxor ymm8, ymm8, ymm10
+ vpxor ymm9, ymm9, ymm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqu xmmword ptr [rbx+0x40], xmm8
+ vmovdqu xmmword ptr [rbx+0x50], xmm9
+ vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
+ vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
+ vmovaps xmm8, xmmword ptr [rsp+0x260]
+ vmovaps xmm0, xmmword ptr [rsp+0x220]
+ vmovaps xmm1, xmmword ptr [rsp+0x230]
+ vmovaps xmm2, xmmword ptr [rsp+0x240]
+ vmovaps xmm3, xmmword ptr [rsp+0x250]
+ vblendvps xmm0, xmm0, xmm1, xmm8
+ vblendvps xmm2, xmm2, xmm3, xmm8
+ vmovaps xmmword ptr [rsp+0x220], xmm0
+ vmovaps xmmword ptr [rsp+0x240], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test rsi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp+0x220]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x224]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
+ vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x200]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovaps ymm8, ymmword ptr [rsp+0x260]
+ vmovaps ymm0, ymmword ptr [rsp+0x220]
+ vmovups ymm1, ymmword ptr [rsp+0x228]
+ vmovaps ymm2, ymmword ptr [rsp+0x240]
+ vmovups ymm3, ymmword ptr [rsp+0x248]
+ vblendvps ymm0, ymm0, ymm1, ymm8
+ vblendvps ymm2, ymm2, ymm3, ymm8
+ vmovaps ymmword ptr [rsp+0x220], ymm0
+ vmovaps ymmword ptr [rsp+0x240], ymm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test rsi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm3, dword ptr [rsp+0x220]
+ vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1
+ vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm14, xmmword ptr [ROT16+rip]
+ vmovdqa xmm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa xmm3, xmm13
+ vpinsrd xmm3, xmm3, eax, 3
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.section .rodata  /* Read-only constant tables referenced RIP-relative by the AVX2 code above. */
+.p2align 6  /* Start on a 64-byte boundary; with the table sizes below every label stays 16-byte aligned, which the vmovdqa loads of ROT16, ROT8 and BLAKE3_IV need. Keep the order and sizes. */
+ADD0:  /* Lane indices 0..7. NOTE(review): presumably the per-input offsets added to the base counter; the use is outside this chunk, confirm. */
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ADD1:  /* Eight copies of 8. NOTE(review): presumably the counter step applied after each group of 8 inputs; the load is outside this chunk, confirm. */
+ .long 8, 8, 8, 8, 8, 8, 8, 8
+BLAKE3_IV_0:  /* Word 0 of BLAKE3_IV repeated in all 8 dword lanes. */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:  /* Word 1 of BLAKE3_IV repeated in all 8 dword lanes. */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:  /* Word 2 of BLAKE3_IV repeated in all 8 dword lanes. */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:  /* Word 3 of BLAKE3_IV repeated in all 8 dword lanes. */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:  /* 64 (0x40) in every dword lane; the 4-, 2- and 1-input tail paths place it in dword 2 of the vectors they build from [rsp+0x220] and [rsp+0x240]. */
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ROT16:  /* vpshufb control: swaps the two 16-bit halves of each dword, i.e. rotates every 32-bit lane by 16 bits. */
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:  /* vpshufb control: result byte 0 takes source byte 1 and so on, i.e. rotates every 32-bit lane right by 8 bits. */
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+CMP_MSB_MASK:  /* Sign-bit mask XORed into both operands before vpcmpgtd so the signed compare orders them as unsigned values (used where [rsp+0x220] is advanced and the result is folded into [rsp+0x240]). */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+BLAKE3_IV:  /* The four IV words (same values as BLAKE3_IV_0..3) packed as one 128-bit vector; loaded with vmovdqa or vbroadcasti128 in the tail paths. */
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+
diff --git a/deps/blake3/blake3_avx2_x86-64_windows_msvc.asm b/deps/blake3/blake3_avx2_x86-64_windows_msvc.asm
@@ -0,0 +1,1828 @@
+public _blake3_hash_many_avx2 ; export the underscore-prefixed spelling of the entry point
+public blake3_hash_many_avx2 ; export the plain spelling; the two PROC labels below are adjacent, so both names refer to the same address
+
+_TEXT SEGMENT ALIGN(16) 'CODE'
+
+ALIGN 16
+blake3_hash_many_avx2 PROC
+_blake3_hash_many_avx2 PROC
+ push r15
+ push r14
+ push r13
+ push r12
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 880
+ and rsp, 0FFFFFFFFFFFFFFC0H
+ vmovdqa xmmword ptr [rsp+2D0H], xmm6
+ vmovdqa xmmword ptr [rsp+2E0H], xmm7
+ vmovdqa xmmword ptr [rsp+2F0H], xmm8
+ vmovdqa xmmword ptr [rsp+300H], xmm9
+ vmovdqa xmmword ptr [rsp+310H], xmm10
+ vmovdqa xmmword ptr [rsp+320H], xmm11
+ vmovdqa xmmword ptr [rsp+330H], xmm12
+ vmovdqa xmmword ptr [rsp+340H], xmm13
+ vmovdqa xmmword ptr [rsp+350H], xmm14
+ vmovdqa xmmword ptr [rsp+360H], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+68H]
+ movzx r9, byte ptr [rbp+70H]
+ neg r9d
+ vmovd xmm0, r9d
+ vpbroadcastd ymm0, xmm0
+ vmovdqa ymmword ptr [rsp+260H], ymm0
+ vpand ymm1, ymm0, ymmword ptr [ADD0]
+ vpand ymm2, ymm0, ymmword ptr [ADD1]
+ vmovdqa ymmword ptr [rsp+2A0H], ymm2
+ vmovd xmm2, r8d
+ vpbroadcastd ymm2, xmm2
+ vpaddd ymm2, ymm2, ymm1
+ vmovdqa ymmword ptr [rsp+220H], ymm2
+ vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK]
+ vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK]
+ vpcmpgtd ymm2, ymm1, ymm2
+ shr r8, 32
+ vmovd xmm3, r8d
+ vpbroadcastd ymm3, xmm3
+ vpsubd ymm3, ymm3, ymm2
+ vmovdqa ymmword ptr [rsp+240H], ymm3
+ shl rdx, 6
+ mov qword ptr [rsp+2C0H], rdx
+ cmp rsi, 8
+ jc final7blocks
+outerloop8:
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+4H]
+ vpbroadcastd ymm2, dword ptr [rcx+8H]
+ vpbroadcastd ymm3, dword ptr [rcx+0CH]
+ vpbroadcastd ymm4, dword ptr [rcx+10H]
+ vpbroadcastd ymm5, dword ptr [rcx+14H]
+ vpbroadcastd ymm6, dword ptr [rcx+18H]
+ vpbroadcastd ymm7, dword ptr [rcx+1CH]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ mov r12, qword ptr [rdi+20H]
+ mov r13, qword ptr [rdi+28H]
+ mov r14, qword ptr [rdi+30H]
+ mov r15, qword ptr [rdi+38H]
+ movzx eax, byte ptr [rbp+78H]
+ movzx ebx, byte ptr [rbp+80H]
+ or eax, ebx
+ xor edx, edx
+ALIGN 16
+innerloop8:
+ movzx ebx, byte ptr [rbp+88H]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+2C0H]
+ cmove eax, ebx
+ mov dword ptr [rsp+200H], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-40H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-40H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-40H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-40H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+20H], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+40H], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+60H], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-30H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-30H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-30H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-30H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+80H], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0A0H], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0C0H], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0E0H], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-20H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-20H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-20H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-20H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+100H], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+120H], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+140H], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+160H], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-10H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-10H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-10H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-10H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+180H], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+1A0H], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+1C0H], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+1E0H], ymm11
+ vpbroadcastd ymm15, dword ptr [rsp+200H]
+ prefetcht0 byte ptr [r8+rdx+80H]
+ prefetcht0 byte ptr [r12+rdx+80H]
+ prefetcht0 byte ptr [r9+rdx+80H]
+ prefetcht0 byte ptr [r13+rdx+80H]
+ prefetcht0 byte ptr [r10+rdx+80H]
+ prefetcht0 byte ptr [r14+rdx+80H]
+ prefetcht0 byte ptr [r11+rdx+80H]
+ prefetcht0 byte ptr [r15+rdx+80H]
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+40H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+80H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm0, ymmword ptr [rsp+220H]
+ vpxor ymm13, ymm1, ymmword ptr [rsp+240H]
+ vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN]
+ vpxor ymm15, ymm3, ymm15
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0]
+ vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1]
+ vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2]
+ vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3]
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+20H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+60H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+100H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+180H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+120H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+40H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+60H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+80H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+20H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+120H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+160H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+60H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+80H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+40H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+120H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+160H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+20H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+140H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+120H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+60H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+80H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+20H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+40H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+100H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+180H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+120H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+140H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+40H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+60H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+20H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+80H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+120H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+160H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+100H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+180H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+20H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+40H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+80H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+60H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+140H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+160H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+20H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+100H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+120H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+40H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+60H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+200H], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+140H]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+180H]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+80H]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+200H]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+78H]
+ jne innerloop8
+ mov rbx, qword ptr [rbp+90H]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0CCH
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0CCH
+ vblendps ymm3, ymm12, ymm9, 0CCH
+ vperm2f128 ymm12, ymm1, ymm2, 20H
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0CCH
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 20H
+ vmovups ymmword ptr [rbx+20H], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0CCH
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0CCH
+ vblendps ymm14, ymm14, ymm13, 0CCH
+ vperm2f128 ymm8, ymm10, ymm14, 20H
+ vmovups ymmword ptr [rbx+40H], ymm8
+ vblendps ymm15, ymm13, ymm15, 0CCH
+ vperm2f128 ymm13, ymm6, ymm15, 20H
+ vmovups ymmword ptr [rbx+60H], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 31H
+ vperm2f128 ymm11, ymm3, ymm4, 31H
+ vmovups ymmword ptr [rbx+80H], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 31H
+ vperm2f128 ymm15, ymm6, ymm15, 31H
+ vmovups ymmword ptr [rbx+0A0H], ymm11
+ vmovups ymmword ptr [rbx+0C0H], ymm14
+ vmovups ymmword ptr [rbx+0E0H], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp+2A0H]
+ vpaddd ymm1, ymm0, ymmword ptr [rsp+220H]
+ vmovdqa ymmword ptr [rsp+220H], ymm1
+ vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK]
+ vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK]
+ vpcmpgtd ymm2, ymm0, ymm2
+ vmovdqa ymm0, ymmword ptr [rsp+240H]
+ vpsubd ymm2, ymm0, ymm2
+ vmovdqa ymmword ptr [rsp+240H], ymm2
+ add rdi, 64
+ add rbx, 256
+ mov qword ptr [rbp+90H], rbx
+ sub rsi, 8
+ cmp rsi, 8
+ jnc outerloop8
+ test rsi, rsi
+ jnz final7blocks
+unwind:
+ vzeroupper
+ vmovdqa xmm6, xmmword ptr [rsp+2D0H]
+ vmovdqa xmm7, xmmword ptr [rsp+2E0H]
+ vmovdqa xmm8, xmmword ptr [rsp+2F0H]
+ vmovdqa xmm9, xmmword ptr [rsp+300H]
+ vmovdqa xmm10, xmmword ptr [rsp+310H]
+ vmovdqa xmm11, xmmword ptr [rsp+320H]
+ vmovdqa xmm12, xmmword ptr [rsp+330H]
+ vmovdqa xmm13, xmmword ptr [rsp+340H]
+ vmovdqa xmm14, xmmword ptr [rsp+350H]
+ vmovdqa xmm15, xmmword ptr [rsp+360H]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+ALIGN 16
+final7blocks:
+ mov rbx, qword ptr [rbp+90H]
+ mov r15, qword ptr [rsp+2C0H]
+ movzx r13d, byte ptr [rbp+78H]
+ movzx r12d, byte ptr [rbp+88H]
+ test rsi, 4H
+ je final3blocks
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+10H]
+ vmovdqa ymm8, ymm0
+ vmovdqa ymm9, ymm1
+ vbroadcasti128 ymm12, xmmword ptr [rsp+220H]
+ vbroadcasti128 ymm13, xmmword ptr [rsp+240H]
+ vpunpckldq ymm14, ymm12, ymm13
+ vpunpckhdq ymm15, ymm12, ymm13
+ vpermq ymm14, ymm14, 50H
+ vpermq ymm15, ymm15, 50H
+ vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN]
+ vpblendd ymm14, ymm14, ymm12, 44H
+ vpblendd ymm15, ymm15, ymm12, 44H
+ vmovdqa ymmword ptr [rsp], ymm14
+ vmovdqa ymmword ptr [rsp+20H], ymm15
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+ALIGN 16
+innerloop4:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+200H], eax
+ vmovups ymm2, ymmword ptr [r8+rdx-40H]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H
+ vmovups ymm3, ymmword ptr [r8+rdx-30H]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H
+ vshufps ymm4, ymm2, ymm3, 136
+ vshufps ymm5, ymm2, ymm3, 221
+ vmovups ymm2, ymmword ptr [r8+rdx-20H]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H
+ vmovups ymm3, ymmword ptr [r8+rdx-10H]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H
+ vshufps ymm6, ymm2, ymm3, 136
+ vshufps ymm7, ymm2, ymm3, 221
+ vpshufd ymm6, ymm6, 93H
+ vpshufd ymm7, ymm7, 93H
+ vmovups ymm10, ymmword ptr [r10+rdx-40H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H
+ vmovups ymm11, ymmword ptr [r10+rdx-30H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H
+ vshufps ymm12, ymm10, ymm11, 136
+ vshufps ymm13, ymm10, ymm11, 221
+ vmovups ymm10, ymmword ptr [r10+rdx-20H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H
+ vmovups ymm11, ymmword ptr [r10+rdx-10H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H
+ vshufps ymm14, ymm10, ymm11, 136
+ vshufps ymm15, ymm10, ymm11, 221
+ vpshufd ymm14, ymm14, 93H
+ vpshufd ymm15, ymm15, 93H
+ vpbroadcastd ymm2, dword ptr [rsp+200H]
+ vmovdqa ymm3, ymmword ptr [rsp]
+ vmovdqa ymm11, ymmword ptr [rsp+20H]
+ vpblendd ymm3, ymm3, ymm2, 88H
+ vpblendd ymm11, ymm11, ymm2, 88H
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV]
+ vmovdqa ymm10, ymm2
+ mov al, 7
+roundloop4:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm8, ymm8, ymm12
+ vmovdqa ymmword ptr [rsp+40H], ymm4
+ nop
+ vmovdqa ymmword ptr [rsp+60H], ymm12
+ nop
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vmovdqa ymmword ptr [rsp+80H], ymm5
+ vmovdqa ymmword ptr [rsp+0A0H], ymm13
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 93H
+ vpshufd ymm8, ymm8, 93H
+ vpshufd ymm3, ymm3, 4EH
+ vpshufd ymm11, ymm11, 4EH
+ vpshufd ymm2, ymm2, 39H
+ vpshufd ymm10, ymm10, 39H
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm8, ymm8, ymm15
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 39H
+ vpshufd ymm8, ymm8, 39H
+ vpshufd ymm3, ymm3, 4EH
+ vpshufd ymm11, ymm11, 4EH
+ vpshufd ymm2, ymm2, 93H
+ vpshufd ymm10, ymm10, 93H
+ dec al
+ je endroundloop4
+ vmovdqa ymm4, ymmword ptr [rsp+40H]
+ vmovdqa ymm5, ymmword ptr [rsp+80H]
+ vshufps ymm12, ymm4, ymm5, 214
+ vpshufd ymm13, ymm4, 0FH
+ vpshufd ymm4, ymm12, 39H
+ vshufps ymm12, ymm6, ymm7, 250
+ vpblendd ymm13, ymm13, ymm12, 0AAH
+ vpunpcklqdq ymm12, ymm7, ymm5
+ vpblendd ymm12, ymm12, ymm6, 88H
+ vpshufd ymm12, ymm12, 78H
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 1EH
+ vmovdqa ymmword ptr [rsp+40H], ymm13
+ vmovdqa ymmword ptr [rsp+80H], ymm12
+ vmovdqa ymm12, ymmword ptr [rsp+60H]
+ vmovdqa ymm13, ymmword ptr [rsp+0A0H]
+ vshufps ymm5, ymm12, ymm13, 214
+ vpshufd ymm6, ymm12, 0FH
+ vpshufd ymm12, ymm5, 39H
+ vshufps ymm5, ymm14, ymm15, 250
+ vpblendd ymm6, ymm6, ymm5, 0AAH
+ vpunpcklqdq ymm5, ymm15, ymm13
+ vpblendd ymm5, ymm5, ymm14, 88H
+ vpshufd ymm5, ymm5, 78H
+ vpunpckhdq ymm13, ymm13, ymm15
+ vpunpckldq ymm14, ymm14, ymm13
+ vpshufd ymm15, ymm14, 1EH
+ vmovdqa ymm13, ymm6
+ vmovdqa ymm14, ymm5
+ vmovdqa ymm5, ymmword ptr [rsp+40H]
+ vmovdqa ymm6, ymmword ptr [rsp+80H]
+ jmp roundloop4
+endroundloop4:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ vpxor ymm8, ymm8, ymm10
+ vpxor ymm9, ymm9, ymm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop4
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+10H], xmm1
+ vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
+ vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
+ vmovdqu xmmword ptr [rbx+40H], xmm8
+ vmovdqu xmmword ptr [rbx+50H], xmm9
+ vextracti128 xmmword ptr [rbx+60H], ymm8, 01H
+ vextracti128 xmmword ptr [rbx+70H], ymm9, 01H
+ vmovaps xmm8, xmmword ptr [rsp+260H]
+ vmovaps xmm0, xmmword ptr [rsp+220H]
+ vmovaps xmm1, xmmword ptr [rsp+230H]
+ vmovaps xmm2, xmmword ptr [rsp+240H]
+ vmovaps xmm3, xmmword ptr [rsp+250H]
+ vblendvps xmm0, xmm0, xmm1, xmm8
+ vblendvps xmm2, xmm2, xmm3, xmm8
+ vmovaps xmmword ptr [rsp+220H], xmm0
+ vmovaps xmmword ptr [rsp+240H], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+final3blocks:
+ test rsi, 2H
+ je final1blocks
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+10H]
+ vmovd xmm13, dword ptr [rsp+220H]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+ vmovd xmm14, dword ptr [rsp+224H]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
+ vinserti128 ymm13, ymm13, xmm14, 01H
+ vbroadcasti128 ymm14, xmmword ptr [ROT16]
+ vbroadcasti128 ymm15, xmmword ptr [ROT8]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+ALIGN 16
+innerloop2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+200H], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV]
+ vpbroadcastd ymm8, dword ptr [rsp+200H]
+ vpblendd ymm3, ymm13, ymm8, 88H
+ vmovups ymm8, ymmword ptr [r8+rdx-40H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H
+ vmovups ymm9, ymmword ptr [r8+rdx-30H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-20H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H
+ vmovups ymm9, ymmword ptr [r8+rdx-10H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 93H
+ vpshufd ymm7, ymm7, 93H
+ mov al, 7
+roundloop2:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 93H
+ vpshufd ymm3, ymm3, 4EH
+ vpshufd ymm2, ymm2, 39H
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 39H
+ vpshufd ymm3, ymm3, 4EH
+ vpshufd ymm2, ymm2, 93H
+ dec al
+ jz endroundloop2
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0FH
+ vpshufd ymm4, ymm8, 39H
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0AAH
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 88H
+ vpshufd ymm8, ymm8, 78H
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 1EH
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp roundloop2
+endroundloop2:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop2
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+10H], xmm1
+ vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
+ vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
+ vmovaps ymm8, ymmword ptr [rsp+260H]
+ vmovaps ymm0, ymmword ptr [rsp+220H]
+ vmovups ymm1, ymmword ptr [rsp+228H]
+ vmovaps ymm2, ymmword ptr [rsp+240H]
+ vmovups ymm3, ymmword ptr [rsp+248H]
+ vblendvps ymm0, ymm0, ymm1, ymm8
+ vblendvps ymm2, ymm2, ymm3, ymm8
+ vmovaps ymmword ptr [rsp+220H], ymm0
+ vmovaps ymmword ptr [rsp+240H], ymm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+final1blocks:
+ test rsi, 1H
+ je unwind
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+10H]
+ vmovd xmm3, dword ptr [rsp+220H]
+ vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1
+ vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2
+ vmovdqa xmm14, xmmword ptr [ROT16]
+ vmovdqa xmm15, xmmword ptr [ROT8]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+ALIGN 16
+innerloop1:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vmovdqa xmm2, xmmword ptr [BLAKE3_IV]
+ vmovdqa xmm3, xmm13
+ vpinsrd xmm3, xmm3, eax, 3
+ vmovups xmm8, xmmword ptr [r8+rdx-40H]
+ vmovups xmm9, xmmword ptr [r8+rdx-30H]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-20H]
+ vmovups xmm9, xmmword ptr [r8+rdx-10H]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 93H
+ vpshufd xmm7, xmm7, 93H
+ mov al, 7
+roundloop1:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 93H
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 39H
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 39H
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 93H
+ dec al
+ jz endroundloop1
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0FH
+ vpshufd xmm4, xmm8, 39H
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0AAH
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 88H
+ vpshufd xmm8, xmm8, 78H
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 1EH
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp roundloop1
+endroundloop1:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop1
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+10H], xmm1
+ jmp unwind
+
+_blake3_hash_many_avx2 ENDP
+blake3_hash_many_avx2 ENDP
+_TEXT ENDS
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constant tables referenced by the AVX2 code in this file
+ALIGN 64 ; start the tables on a 64-byte boundary
+ADD0: ; dwords 0..7, one distinct value per 32-bit lane; NOTE(review): presumably per-lane counter offsets - use site is outside this view
+ dd 0, 1, 2, 3, 4, 5, 6, 7
+
+ADD1: ; the value 8 replicated to eight dwords; NOTE(review): presumably a per-batch counter step - confirm at the use site
+ dd 8 dup (8)
+
+BLAKE3_IV_0: ; same value as the first dword of BLAKE3_IV, replicated to eight dwords
+ dd 8 dup (6A09E667H)
+
+BLAKE3_IV_1: ; same value as the second dword of BLAKE3_IV, replicated to eight dwords
+ dd 8 dup (0BB67AE85H)
+
+BLAKE3_IV_2: ; same value as the third dword of BLAKE3_IV, replicated to eight dwords
+ dd 8 dup (3C6EF372H)
+
+BLAKE3_IV_3: ; same value as the fourth dword of BLAKE3_IV, replicated to eight dwords
+ dd 8 dup (0A54FF53AH)
+
+BLAKE3_BLOCK_LEN: ; the value 64 replicated to eight dwords; blended/inserted into the counter-and-length row by the tail paths
+ dd 8 dup (64)
+
+ROT16: ; vpshufb control mask: swaps the two 16-bit halves of every dword, i.e. a 16-bit rotate of each 32-bit lane
+ db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+
+ROT8: ; vpshufb control mask: moves bytes 1,2,3,0 of every dword into positions 0..3, i.e. an 8-bit right rotate of each 32-bit lane
+ db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+
+CMP_MSB_MASK: ; sign-bit mask XORed into both operands before vpcmpgtd so the signed compare orders them as unsigned values
+ dd 8 dup(80000000H)
+
+BLAKE3_IV: ; four IV dwords; the partial-batch paths load these into the third state row before the rounds
+ dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
+
+_RDATA ENDS
+END
diff --git a/deps/blake3/blake3_avx512.c b/deps/blake3/blake3_avx512.c
@@ -0,0 +1,1204 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+// Integer-vector wrapper around _mm_shuffle_ps: both __m128i operands are
+// reinterpreted as float vectors, shuffled by the immediate c, and the result
+// is reinterpreted back as __m128i. No lane values are converted.
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a),                        \
+                                   _mm_castsi128_ps(b), (c))))
+
+// Reads 16 bytes starting at src into a 128-bit vector using an unaligned load.
+INLINE __m128i loadu_128(const uint8_t src[16]) {
+  const __m128i *unaligned = (const __m128i *)src;
+  return _mm_loadu_si128(unaligned);
+}
+
+// Reads 32 bytes starting at src into a 256-bit vector using an unaligned load.
+INLINE __m256i loadu_256(const uint8_t src[32]) {
+  const __m256i *unaligned = (const __m256i *)src;
+  return _mm256_loadu_si256(unaligned);
+}
+
+// Reads 64 bytes starting at src into a 512-bit vector using an unaligned load.
+INLINE __m512i loadu_512(const uint8_t src[64]) {
+  const __m512i *unaligned = (const __m512i *)src;
+  return _mm512_loadu_si512(unaligned);
+}
+
+// Writes the 16 bytes of src to dest using an unaligned store.
+INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
+  __m128i *unaligned = (__m128i *)dest;
+  _mm_storeu_si128(unaligned, src);
+}
+
+// Writes the 32 bytes of src to dest using an unaligned store.
+// The declared bound is 32 to match the number of bytes actually written (it
+// previously read 16). Array parameters decay to pointers, so existing callers
+// are unaffected.
+INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+// Adds a and b lane by lane, treating each vector as four 32-bit integers.
+INLINE __m128i add_128(__m128i a, __m128i b) {
+  const __m128i sum = _mm_add_epi32(a, b);
+  return sum;
+}
+
+// Adds a and b lane by lane, treating each vector as eight 32-bit integers.
+INLINE __m256i add_256(__m256i a, __m256i b) {
+  const __m256i sum = _mm256_add_epi32(a, b);
+  return sum;
+}
+
+// Adds a and b lane by lane, treating each vector as sixteen 32-bit integers.
+INLINE __m512i add_512(__m512i a, __m512i b) {
+  const __m512i sum = _mm512_add_epi32(a, b);
+  return sum;
+}
+
+// Bitwise XOR of two 128-bit vectors.
+INLINE __m128i xor_128(__m128i a, __m128i b) {
+  const __m128i mixed = _mm_xor_si128(a, b);
+  return mixed;
+}
+
+// Bitwise XOR of two 256-bit vectors.
+INLINE __m256i xor_256(__m256i a, __m256i b) {
+  const __m256i mixed = _mm256_xor_si256(a, b);
+  return mixed;
+}
+
+// Bitwise XOR of two 512-bit vectors.
+INLINE __m512i xor_512(__m512i a, __m512i b) {
+  const __m512i mixed = _mm512_xor_si512(a, b);
+  return mixed;
+}
+
+// Returns a 128-bit vector with x copied into each of its four 32-bit lanes.
+INLINE __m128i set1_128(uint32_t x) {
+  const int32_t lane = (int32_t)x;
+  return _mm_set1_epi32(lane);
+}
+
+// Returns a 256-bit vector with x copied into each of its eight 32-bit lanes.
+INLINE __m256i set1_256(uint32_t x) {
+  const int32_t lane = (int32_t)x;
+  return _mm256_set1_epi32(lane);
+}
+
+// Returns a 512-bit vector with x copied into each of its sixteen 32-bit lanes.
+INLINE __m512i set1_512(uint32_t x) {
+  const int32_t lane = (int32_t)x;
+  return _mm512_set1_epi32(lane);
+}
+
+// Packs four 32-bit words into one 128-bit vector. _mm_setr_epi32 takes its
+// arguments lowest lane first, so a lands in lane 0 and d in lane 3.
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  const int32_t lane0 = (int32_t)a;
+  const int32_t lane1 = (int32_t)b;
+  const int32_t lane2 = (int32_t)c;
+  const int32_t lane3 = (int32_t)d;
+  return _mm_setr_epi32(lane0, lane1, lane2, lane3);
+}
+
+// Rotates each 32-bit lane of a 128-bit vector right by 16 bits.
+INLINE __m128i rot16_128(__m128i x) {
+  const __m128i rotated = _mm_ror_epi32(x, 16);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 256-bit vector right by 16 bits.
+INLINE __m256i rot16_256(__m256i x) {
+  const __m256i rotated = _mm256_ror_epi32(x, 16);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 512-bit vector right by 16 bits.
+INLINE __m512i rot16_512(__m512i x) {
+  const __m512i rotated = _mm512_ror_epi32(x, 16);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 128-bit vector right by 12 bits.
+INLINE __m128i rot12_128(__m128i x) {
+  const __m128i rotated = _mm_ror_epi32(x, 12);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 256-bit vector right by 12 bits.
+INLINE __m256i rot12_256(__m256i x) {
+  const __m256i rotated = _mm256_ror_epi32(x, 12);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 512-bit vector right by 12 bits.
+INLINE __m512i rot12_512(__m512i x) {
+  const __m512i rotated = _mm512_ror_epi32(x, 12);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 128-bit vector right by 8 bits.
+INLINE __m128i rot8_128(__m128i x) {
+  const __m128i rotated = _mm_ror_epi32(x, 8);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 256-bit vector right by 8 bits.
+INLINE __m256i rot8_256(__m256i x) {
+  const __m256i rotated = _mm256_ror_epi32(x, 8);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 512-bit vector right by 8 bits.
+INLINE __m512i rot8_512(__m512i x) {
+  const __m512i rotated = _mm512_ror_epi32(x, 8);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 128-bit vector right by 7 bits.
+INLINE __m128i rot7_128(__m128i x) {
+  const __m128i rotated = _mm_ror_epi32(x, 7);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 256-bit vector right by 7 bits.
+INLINE __m256i rot7_256(__m256i x) {
+  const __m256i rotated = _mm256_ror_epi32(x, 7);
+  return rotated;
+}
+
+// Rotates each 32-bit lane of a 512-bit vector right by 7 bits.
+INLINE __m512i rot7_512(__m512i x) {
+  const __m512i rotated = _mm512_ror_epi32(x, 7);
+  return rotated;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * compress_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+// First half of the mixing step, applied to all four lanes of each row at
+// once: folds the message vector m into row0, then updates row3, row2 and row1
+// in that order using right-rotations by 16 and 12 bits. Rows are updated in
+// place through the pointers.
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = add_128(add_128(*row0, m), *row1);
+  *row3 = rot16_128(xor_128(*row3, *row0));
+  *row2 = add_128(*row2, *row3);
+  *row1 = rot12_128(xor_128(*row1, *row2));
+}
+
+// Second half of the mixing step, applied to all four lanes of each row at
+// once: same data flow as g1 but with right-rotations by 8 and 7 bits. Rows
+// are updated in place through the pointers.
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = add_128(add_128(*row0, m), *row1);
+  *row3 = rot8_128(xor_128(*row3, *row0));
+  *row2 = add_128(*row2, *row3);
+  *row1 = rot7_128(xor_128(*row1, *row2));
+}
+
+// Rotates the lanes of rows 0, 3 and 2 so that the following g1/g2 calls mix
+// along the diagonals of the state. As an optimization, row1 is the row left
+// unrotated (rather than row0); every message load in this file is arranged to
+// compensate. See discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  // Shuffle immediates, named by where each source lane ends up.
+  enum {
+    LANES_UP_ONE = _MM_SHUFFLE(2, 1, 0, 3),     // lane i moves to lane i + 1
+    LANES_SWAP_HALVES = _MM_SHUFFLE(1, 0, 3, 2), // lane i moves to lane i + 2
+    LANES_DOWN_ONE = _MM_SHUFFLE(0, 3, 2, 1)    // lane i moves to lane i - 1
+  };
+  *row0 = _mm_shuffle_epi32(*row0, LANES_UP_ONE);
+  *row3 = _mm_shuffle_epi32(*row3, LANES_SWAP_HALVES);
+  *row2 = _mm_shuffle_epi32(*row2, LANES_DOWN_ONE);
+}
+
+// Inverse of diagonalize: rotates the lanes of rows 0, 3 and 2 back by the
+// opposite amounts so the state is column-aligned again.
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  // Shuffle immediates, named by where each source lane ends up.
+  enum {
+    LANES_DOWN_ONE = _MM_SHUFFLE(0, 3, 2, 1),    // lane i moves to lane i - 1
+    LANES_SWAP_HALVES = _MM_SHUFFLE(1, 0, 3, 2), // lane i moves to lane i + 2
+    LANES_UP_ONE = _MM_SHUFFLE(2, 1, 0, 3)       // lane i moves to lane i + 1
+  };
+  *row0 = _mm_shuffle_epi32(*row0, LANES_DOWN_ONE);
+  *row3 = _mm_shuffle_epi32(*row3, LANES_SWAP_HALVES);
+  *row2 = _mm_shuffle_epi32(*row2, LANES_UP_ONE);
+}
+
+// Runs all seven rounds of the compression function for a single block and
+// leaves the resulting 4x4 word state in rows[0..3]. The final feed-forward
+// XORs are NOT applied here; each caller combines the rows itself.
+//
+//   rows      - output: four 128-bit vectors, one state row each
+//   cv        - input chaining value; words 0-3 seed row 0, words 4-7 row 1
+//   block     - the BLAKE3_BLOCK_LEN-byte message block
+//   block_len - block length value, stored in the third word of row 3
+//   counter   - 64-bit counter, split into low/high words at the start of row 3
+//   flags     - flag byte, stored in the last word of row 3
+//
+// The rounds are deliberately left unrolled. Rounds 2-7 are textually
+// identical: each permutes the message vectors left by the previous round and
+// then performs a column step (g1/g2) and a diagonal step (g1/g2 between
+// diagonalize/undiagonalize).
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  // View the chaining value as bytes without discarding its const qualifier.
+  rows[0] = loadu_128((const uint8_t *)&cv[0]);
+  rows[1] = loadu_128((const uint8_t *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (uint32_t)block_len, (uint32_t)flags);
+
+  // The 64-byte block as four vectors of four message words each.
+  __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
+
+  // t0..t3 hold the message vectors fed to the four g calls of a round; tt is
+  // scratch for building them.
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  // Carry this round's message vectors forward as input to the next
+  // permutation.
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7. Last round: the message vectors are not carried forward because
+  // nothing reads them afterwards.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_xof_avx512(const uint32_t cv[8], // Compresses one block and writes 64 bytes to out; cv is only read.
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags, uint8_t out[64]) {
+ __m128i rows[4];
+ compress_pre(rows, cv, block, block_len, counter, flags); // fills rows[0..3] from all the inputs
+ storeu_128(xor_128(rows[0], rows[2]), &out[0]); // out[0..15]: rows[0] combined with rows[2]
+ storeu_128(xor_128(rows[1], rows[3]), &out[16]); // out[16..31]: rows[1] combined with rows[3]
+ storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); // out[32..47]: rows[2] combined with cv[0..3]
+ storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); // out[48..63]: rows[3] combined with cv[4..7]
+}
+
+void blake3_compress_in_place_avx512(uint32_t cv[8], // Compresses one block and overwrites all 8 words of cv with the result.
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags) {
+ __m128i rows[4];
+ compress_pre(rows, cv, block, block_len, counter, flags); // reads the old cv before it is overwritten below
+ storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); // new cv[0..3]: rows[0] combined with rows[2]
+ storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); // new cv[4..7]: rows[1] combined with rows[3]
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { // One round: updates the 16 state vectors v in place; m is indexed through MSG_SCHEDULE[r].
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // Column step on (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15); schedule entries 0,2,4,6 first.
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+ v[0] = add_128(v[0], v[4]);
+ v[1] = add_128(v[1], v[5]);
+ v[2] = add_128(v[2], v[6]);
+ v[3] = add_128(v[3], v[7]);
+ v[12] = xor_128(v[12], v[0]);
+ v[13] = xor_128(v[13], v[1]);
+ v[14] = xor_128(v[14], v[2]);
+ v[15] = xor_128(v[15], v[3]);
+ v[12] = rot16_128(v[12]);
+ v[13] = rot16_128(v[13]);
+ v[14] = rot16_128(v[14]);
+ v[15] = rot16_128(v[15]);
+ v[8] = add_128(v[8], v[12]);
+ v[9] = add_128(v[9], v[13]);
+ v[10] = add_128(v[10], v[14]);
+ v[11] = add_128(v[11], v[15]);
+ v[4] = xor_128(v[4], v[8]);
+ v[5] = xor_128(v[5], v[9]);
+ v[6] = xor_128(v[6], v[10]);
+ v[7] = xor_128(v[7], v[11]);
+ v[4] = rot12_128(v[4]);
+ v[5] = rot12_128(v[5]);
+ v[6] = rot12_128(v[6]);
+ v[7] = rot12_128(v[7]);
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // second half of the column step: schedule entries 1,3,5,7
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+ v[0] = add_128(v[0], v[4]);
+ v[1] = add_128(v[1], v[5]);
+ v[2] = add_128(v[2], v[6]);
+ v[3] = add_128(v[3], v[7]);
+ v[12] = xor_128(v[12], v[0]);
+ v[13] = xor_128(v[13], v[1]);
+ v[14] = xor_128(v[14], v[2]);
+ v[15] = xor_128(v[15], v[3]);
+ v[12] = rot8_128(v[12]);
+ v[13] = rot8_128(v[13]);
+ v[14] = rot8_128(v[14]);
+ v[15] = rot8_128(v[15]);
+ v[8] = add_128(v[8], v[12]);
+ v[9] = add_128(v[9], v[13]);
+ v[10] = add_128(v[10], v[14]);
+ v[11] = add_128(v[11], v[15]);
+ v[4] = xor_128(v[4], v[8]);
+ v[5] = xor_128(v[5], v[9]);
+ v[6] = xor_128(v[6], v[10]);
+ v[7] = xor_128(v[7], v[11]);
+ v[4] = rot7_128(v[4]);
+ v[5] = rot7_128(v[5]);
+ v[6] = rot7_128(v[6]);
+ v[7] = rot7_128(v[7]);
+
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); // Diagonal step on (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14); schedule entries 8,10,12,14 first.
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+ v[0] = add_128(v[0], v[5]);
+ v[1] = add_128(v[1], v[6]);
+ v[2] = add_128(v[2], v[7]);
+ v[3] = add_128(v[3], v[4]);
+ v[15] = xor_128(v[15], v[0]);
+ v[12] = xor_128(v[12], v[1]);
+ v[13] = xor_128(v[13], v[2]);
+ v[14] = xor_128(v[14], v[3]);
+ v[15] = rot16_128(v[15]);
+ v[12] = rot16_128(v[12]);
+ v[13] = rot16_128(v[13]);
+ v[14] = rot16_128(v[14]);
+ v[10] = add_128(v[10], v[15]);
+ v[11] = add_128(v[11], v[12]);
+ v[8] = add_128(v[8], v[13]);
+ v[9] = add_128(v[9], v[14]);
+ v[5] = xor_128(v[5], v[10]);
+ v[6] = xor_128(v[6], v[11]);
+ v[7] = xor_128(v[7], v[8]);
+ v[4] = xor_128(v[4], v[9]);
+ v[5] = rot12_128(v[5]);
+ v[6] = rot12_128(v[6]);
+ v[7] = rot12_128(v[7]);
+ v[4] = rot12_128(v[4]);
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); // second half of the diagonal step: schedule entries 9,11,13,15
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+ v[0] = add_128(v[0], v[5]);
+ v[1] = add_128(v[1], v[6]);
+ v[2] = add_128(v[2], v[7]);
+ v[3] = add_128(v[3], v[4]);
+ v[15] = xor_128(v[15], v[0]);
+ v[12] = xor_128(v[12], v[1]);
+ v[13] = xor_128(v[13], v[2]);
+ v[14] = xor_128(v[14], v[3]);
+ v[15] = rot8_128(v[15]);
+ v[12] = rot8_128(v[12]);
+ v[13] = rot8_128(v[13]);
+ v[14] = rot8_128(v[14]);
+ v[10] = add_128(v[10], v[15]);
+ v[11] = add_128(v[11], v[12]);
+ v[8] = add_128(v[8], v[13]);
+ v[9] = add_128(v[9], v[14]);
+ v[5] = xor_128(v[5], v[10]);
+ v[6] = xor_128(v[6], v[11]);
+ v[7] = xor_128(v[7], v[8]);
+ v[4] = xor_128(v[4], v[9]);
+ v[5] = rot7_128(v[5]);
+ v[6] = rot7_128(v[6]);
+ v[7] = rot7_128(v[7]);
+ v[4] = rot7_128(v[4]);
+}
+
+INLINE void transpose_vecs_128(__m128i vecs[4]) { // In-place 4x4 transpose of the 32-bit words held in vecs[0..3].
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+ // 22/33. Note that this doesn't split the vector into two lanes, as the
+ // AVX2 counterparts do.
+ __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+ __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+ __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+ __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+ // Interleave 64-bit lanes, producing one fully transposed row per result.
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+ __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+ __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+ __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+ vecs[0] = abcd_0; // word 0 of each of the four original vectors
+ vecs[1] = abcd_1; // word 1 of each
+ vecs[2] = abcd_2; // word 2 of each
+ vecs[3] = abcd_3; // word 3 of each
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, // Loads 64 bytes at block_offset from each of inputs[0..3];
+ size_t block_offset, __m128i out[16]) { // afterwards out[k] holds 32-bit word k of that block for all four inputs.
+ out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); // bytes 0..15 of each input's block
+ out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+ out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+ out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+ out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); // bytes 16..31
+ out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+ out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+ out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+ out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); // bytes 32..47
+ out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+ out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+ out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+ out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); // bytes 48..63
+ out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+ out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+ out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+ for (size_t i = 0; i < 4; ++i) {
+ _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); // prefetch hint for data 256 bytes past this block's start
+ }
+ transpose_vecs_128(&out[0]); // each group of four vectors is transposed independently
+ transpose_vecs_128(&out[4]);
+ transpose_vecs_128(&out[8]);
+ transpose_vecs_128(&out[12]);
+}
+
+INLINE void load_counters4(uint64_t counter, bool increment_counter, // Builds four 64-bit counters and returns them split into
+ __m128i *out_lo, __m128i *out_hi) { // low 32-bit halves (*out_lo) and high 32-bit halves (*out_hi).
+ uint64_t mask = (increment_counter ? ~0 : 0); // all ones when incrementing (int -1 widens to all ones), else zero
+ __m256i mask_vec = _mm256_set1_epi64x(mask);
+ __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); // lane i would add i to the counter
+ deltas = _mm256_and_si256(mask_vec, deltas); // deltas become all zero when increment_counter is false
+ __m256i counters =
+ _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); // full 64-bit add, so carries reach the high half
+ *out_lo = _mm256_cvtepi64_epi32(counters); // truncate each 64-bit lane to its low 32 bits
+ *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); // shift the high 32 bits down, then truncate
+}
+
+void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, // Hashes inputs[0..3] in parallel, `blocks` blocks each,
+ const uint32_t key[8], uint64_t counter, // starting every lane from key; writes 8 x 16 = 128 bytes to out.
+ bool increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ __m128i h_vecs[8] = { // h_vecs[i] carries state word i for all four inputs
+ set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+ set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+ };
+ __m128i counter_low_vec, counter_high_vec;
+ load_counters4(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec); // computed once and reused for every block
+ uint8_t block_flags = flags | flags_start; // only the first block carries flags_start
+
+ for (size_t block = 0; block < blocks; block++) {
+ if (block + 1 == blocks) {
+ block_flags |= flags_end; // last block; with blocks == 1 it carries both start and end flags
+ }
+ __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+ __m128i block_flags_vec = set1_128(block_flags);
+ __m128i msg_vecs[16];
+ transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+ __m128i v[16] = { // state: 8 chaining words, IV[0..3], counter low/high, block length, flags
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+ };
+ round_fn4(v, msg_vecs, 0); // seven rounds, r = 0..6
+ round_fn4(v, msg_vecs, 1);
+ round_fn4(v, msg_vecs, 2);
+ round_fn4(v, msg_vecs, 3);
+ round_fn4(v, msg_vecs, 4);
+ round_fn4(v, msg_vecs, 5);
+ round_fn4(v, msg_vecs, 6);
+ h_vecs[0] = xor_128(v[0], v[8]); // new chaining word i is formed from v[i] and v[i + 8]
+ h_vecs[1] = xor_128(v[1], v[9]);
+ h_vecs[2] = xor_128(v[2], v[10]);
+ h_vecs[3] = xor_128(v[3], v[11]);
+ h_vecs[4] = xor_128(v[4], v[12]);
+ h_vecs[5] = xor_128(v[5], v[13]);
+ h_vecs[6] = xor_128(v[6], v[14]);
+ h_vecs[7] = xor_128(v[7], v[15]);
+
+ block_flags = flags; // drop flags_start for all following blocks
+ }
+
+ transpose_vecs_128(&h_vecs[0]);
+ transpose_vecs_128(&h_vecs[4]);
+ // The first four vecs now contain the first half of each output, and the
+ // second four vecs contain the second half of each output.
+ storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); // stores alternate halves so each input's two halves are adjacent in out
+ storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
+ storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
+ storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
+ storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
+ storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
+ storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
+ storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash8_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { // One round: updates the 16 state vectors v in place; m is indexed through MSG_SCHEDULE[r].
+ v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // Column step on (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15); schedule entries 0,2,4,6 first.
+ v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+ v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+ v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+ v[0] = add_256(v[0], v[4]);
+ v[1] = add_256(v[1], v[5]);
+ v[2] = add_256(v[2], v[6]);
+ v[3] = add_256(v[3], v[7]);
+ v[12] = xor_256(v[12], v[0]);
+ v[13] = xor_256(v[13], v[1]);
+ v[14] = xor_256(v[14], v[2]);
+ v[15] = xor_256(v[15], v[3]);
+ v[12] = rot16_256(v[12]);
+ v[13] = rot16_256(v[13]);
+ v[14] = rot16_256(v[14]);
+ v[15] = rot16_256(v[15]);
+ v[8] = add_256(v[8], v[12]);
+ v[9] = add_256(v[9], v[13]);
+ v[10] = add_256(v[10], v[14]);
+ v[11] = add_256(v[11], v[15]);
+ v[4] = xor_256(v[4], v[8]);
+ v[5] = xor_256(v[5], v[9]);
+ v[6] = xor_256(v[6], v[10]);
+ v[7] = xor_256(v[7], v[11]);
+ v[4] = rot12_256(v[4]);
+ v[5] = rot12_256(v[5]);
+ v[6] = rot12_256(v[6]);
+ v[7] = rot12_256(v[7]);
+ v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // second half of the column step: schedule entries 1,3,5,7
+ v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+ v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+ v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+ v[0] = add_256(v[0], v[4]);
+ v[1] = add_256(v[1], v[5]);
+ v[2] = add_256(v[2], v[6]);
+ v[3] = add_256(v[3], v[7]);
+ v[12] = xor_256(v[12], v[0]);
+ v[13] = xor_256(v[13], v[1]);
+ v[14] = xor_256(v[14], v[2]);
+ v[15] = xor_256(v[15], v[3]);
+ v[12] = rot8_256(v[12]);
+ v[13] = rot8_256(v[13]);
+ v[14] = rot8_256(v[14]);
+ v[15] = rot8_256(v[15]);
+ v[8] = add_256(v[8], v[12]);
+ v[9] = add_256(v[9], v[13]);
+ v[10] = add_256(v[10], v[14]);
+ v[11] = add_256(v[11], v[15]);
+ v[4] = xor_256(v[4], v[8]);
+ v[5] = xor_256(v[5], v[9]);
+ v[6] = xor_256(v[6], v[10]);
+ v[7] = xor_256(v[7], v[11]);
+ v[4] = rot7_256(v[4]);
+ v[5] = rot7_256(v[5]);
+ v[6] = rot7_256(v[6]);
+ v[7] = rot7_256(v[7]);
+
+ v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); // Diagonal step on (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14); schedule entries 8,10,12,14 first.
+ v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+ v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+ v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+ v[0] = add_256(v[0], v[5]);
+ v[1] = add_256(v[1], v[6]);
+ v[2] = add_256(v[2], v[7]);
+ v[3] = add_256(v[3], v[4]);
+ v[15] = xor_256(v[15], v[0]);
+ v[12] = xor_256(v[12], v[1]);
+ v[13] = xor_256(v[13], v[2]);
+ v[14] = xor_256(v[14], v[3]);
+ v[15] = rot16_256(v[15]);
+ v[12] = rot16_256(v[12]);
+ v[13] = rot16_256(v[13]);
+ v[14] = rot16_256(v[14]);
+ v[10] = add_256(v[10], v[15]);
+ v[11] = add_256(v[11], v[12]);
+ v[8] = add_256(v[8], v[13]);
+ v[9] = add_256(v[9], v[14]);
+ v[5] = xor_256(v[5], v[10]);
+ v[6] = xor_256(v[6], v[11]);
+ v[7] = xor_256(v[7], v[8]);
+ v[4] = xor_256(v[4], v[9]);
+ v[5] = rot12_256(v[5]);
+ v[6] = rot12_256(v[6]);
+ v[7] = rot12_256(v[7]);
+ v[4] = rot12_256(v[4]);
+ v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); // second half of the diagonal step: schedule entries 9,11,13,15
+ v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+ v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+ v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+ v[0] = add_256(v[0], v[5]);
+ v[1] = add_256(v[1], v[6]);
+ v[2] = add_256(v[2], v[7]);
+ v[3] = add_256(v[3], v[4]);
+ v[15] = xor_256(v[15], v[0]);
+ v[12] = xor_256(v[12], v[1]);
+ v[13] = xor_256(v[13], v[2]);
+ v[14] = xor_256(v[14], v[3]);
+ v[15] = rot8_256(v[15]);
+ v[12] = rot8_256(v[12]);
+ v[13] = rot8_256(v[13]);
+ v[14] = rot8_256(v[14]);
+ v[10] = add_256(v[10], v[15]);
+ v[11] = add_256(v[11], v[12]);
+ v[8] = add_256(v[8], v[13]);
+ v[9] = add_256(v[9], v[14]);
+ v[5] = xor_256(v[5], v[10]);
+ v[6] = xor_256(v[6], v[11]);
+ v[7] = xor_256(v[7], v[8]);
+ v[4] = xor_256(v[4], v[9]);
+ v[5] = rot7_256(v[5]);
+ v[6] = rot7_256(v[6]);
+ v[7] = rot7_256(v[7]);
+ v[4] = rot7_256(v[4]);
+}
+
+INLINE void transpose_vecs_256(__m256i vecs[8]) { // In-place 8x8 transpose of the 32-bit words held in vecs[0..7].
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+ // is 22/33/66/77.
+ __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+ __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+ __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+ __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+ __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+ __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+ __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+ __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
+ // 11/33.
+ __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+ __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+ __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+ __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+ __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+ __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+ __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+ __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+ // Interleave 128-bit lanes.
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); // 0x20: low 128-bit half of each operand
+ vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+ vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+ vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+ vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); // 0x31: high 128-bit half of each operand
+ vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+ vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+ vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
+
+INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, // Loads 64 bytes at block_offset from each of inputs[0..7];
+ size_t block_offset, __m256i out[16]) { // afterwards out[k] holds 32-bit word k of that block for all eight inputs.
+ out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); // bytes 0..31 of each input's block
+ out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
+ out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
+ out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
+ out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
+ out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
+ out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
+ out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
+ out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); // bytes 32..63
+ out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
+ out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
+ out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
+ out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
+ out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
+ out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
+ out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
+ for (size_t i = 0; i < 8; ++i) {
+ _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); // prefetch hint for data 256 bytes past this block's start
+ }
+ transpose_vecs_256(&out[0]); // each group of eight vectors is transposed independently
+ transpose_vecs_256(&out[8]);
+}
+
+INLINE void load_counters8(uint64_t counter, bool increment_counter, // Builds eight 64-bit counters and returns them split into
+ __m256i *out_lo, __m256i *out_hi) { // low 32-bit halves (*out_lo) and high 32-bit halves (*out_hi).
+ uint64_t mask = (increment_counter ? ~0 : 0); // all ones when incrementing (int -1 widens to all ones), else zero
+ __m512i mask_vec = _mm512_set1_epi64(mask);
+ __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); // lane i would add i to the counter
+ deltas = _mm512_and_si512(mask_vec, deltas); // deltas become all zero when increment_counter is false
+ __m512i counters =
+ _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); // full 64-bit add, so carries reach the high half
+ *out_lo = _mm512_cvtepi64_epi32(counters); // truncate each 64-bit lane to its low 32 bits
+ *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); // shift the high 32 bits down, then truncate
+}
+
+void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, // Hashes inputs[0..7] in parallel, `blocks` blocks each,
+ const uint32_t key[8], uint64_t counter, // starting every lane from key; writes 8 x 32 = 256 bytes to out.
+ bool increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ __m256i h_vecs[8] = { // h_vecs[i] carries state word i for all eight inputs
+ set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
+ set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
+ };
+ __m256i counter_low_vec, counter_high_vec;
+ load_counters8(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec); // computed once and reused for every block
+ uint8_t block_flags = flags | flags_start; // only the first block carries flags_start
+
+ for (size_t block = 0; block < blocks; block++) {
+ if (block + 1 == blocks) {
+ block_flags |= flags_end; // last block; with blocks == 1 it carries both start and end flags
+ }
+ __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
+ __m256i block_flags_vec = set1_256(block_flags);
+ __m256i msg_vecs[16];
+ transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+ __m256i v[16] = { // state: 8 chaining words, IV[0..3], counter low/high, block length, flags
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+ };
+ round_fn8(v, msg_vecs, 0); // seven rounds, r = 0..6
+ round_fn8(v, msg_vecs, 1);
+ round_fn8(v, msg_vecs, 2);
+ round_fn8(v, msg_vecs, 3);
+ round_fn8(v, msg_vecs, 4);
+ round_fn8(v, msg_vecs, 5);
+ round_fn8(v, msg_vecs, 6);
+ h_vecs[0] = xor_256(v[0], v[8]); // new chaining word i is formed from v[i] and v[i + 8]
+ h_vecs[1] = xor_256(v[1], v[9]);
+ h_vecs[2] = xor_256(v[2], v[10]);
+ h_vecs[3] = xor_256(v[3], v[11]);
+ h_vecs[4] = xor_256(v[4], v[12]);
+ h_vecs[5] = xor_256(v[5], v[13]);
+ h_vecs[6] = xor_256(v[6], v[14]);
+ h_vecs[7] = xor_256(v[7], v[15]);
+
+ block_flags = flags; // drop flags_start for all following blocks
+ }
+
+ transpose_vecs_256(h_vecs); // after this, h_vecs[i] holds the eight state words of input i
+ storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); // results are written back to back, 32 bytes per input
+ storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
+ storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
+ storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
+ storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
+ storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
+ storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
+ storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash16_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { // One round: updates the 16 state vectors v in place; m is indexed through MSG_SCHEDULE[r].
+ v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // Column step on (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15); schedule entries 0,2,4,6 first.
+ v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+ v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+ v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+ v[0] = add_512(v[0], v[4]);
+ v[1] = add_512(v[1], v[5]);
+ v[2] = add_512(v[2], v[6]);
+ v[3] = add_512(v[3], v[7]);
+ v[12] = xor_512(v[12], v[0]);
+ v[13] = xor_512(v[13], v[1]);
+ v[14] = xor_512(v[14], v[2]);
+ v[15] = xor_512(v[15], v[3]);
+ v[12] = rot16_512(v[12]);
+ v[13] = rot16_512(v[13]);
+ v[14] = rot16_512(v[14]);
+ v[15] = rot16_512(v[15]);
+ v[8] = add_512(v[8], v[12]);
+ v[9] = add_512(v[9], v[13]);
+ v[10] = add_512(v[10], v[14]);
+ v[11] = add_512(v[11], v[15]);
+ v[4] = xor_512(v[4], v[8]);
+ v[5] = xor_512(v[5], v[9]);
+ v[6] = xor_512(v[6], v[10]);
+ v[7] = xor_512(v[7], v[11]);
+ v[4] = rot12_512(v[4]);
+ v[5] = rot12_512(v[5]);
+ v[6] = rot12_512(v[6]);
+ v[7] = rot12_512(v[7]);
+ v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // second half of the column step: schedule entries 1,3,5,7
+ v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+ v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+ v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+ v[0] = add_512(v[0], v[4]);
+ v[1] = add_512(v[1], v[5]);
+ v[2] = add_512(v[2], v[6]);
+ v[3] = add_512(v[3], v[7]);
+ v[12] = xor_512(v[12], v[0]);
+ v[13] = xor_512(v[13], v[1]);
+ v[14] = xor_512(v[14], v[2]);
+ v[15] = xor_512(v[15], v[3]);
+ v[12] = rot8_512(v[12]);
+ v[13] = rot8_512(v[13]);
+ v[14] = rot8_512(v[14]);
+ v[15] = rot8_512(v[15]);
+ v[8] = add_512(v[8], v[12]);
+ v[9] = add_512(v[9], v[13]);
+ v[10] = add_512(v[10], v[14]);
+ v[11] = add_512(v[11], v[15]);
+ v[4] = xor_512(v[4], v[8]);
+ v[5] = xor_512(v[5], v[9]);
+ v[6] = xor_512(v[6], v[10]);
+ v[7] = xor_512(v[7], v[11]);
+ v[4] = rot7_512(v[4]);
+ v[5] = rot7_512(v[5]);
+ v[6] = rot7_512(v[6]);
+ v[7] = rot7_512(v[7]);
+
+ v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); // Diagonal step on (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14); schedule entries 8,10,12,14 first.
+ v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+ v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+ v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+ v[0] = add_512(v[0], v[5]);
+ v[1] = add_512(v[1], v[6]);
+ v[2] = add_512(v[2], v[7]);
+ v[3] = add_512(v[3], v[4]);
+ v[15] = xor_512(v[15], v[0]);
+ v[12] = xor_512(v[12], v[1]);
+ v[13] = xor_512(v[13], v[2]);
+ v[14] = xor_512(v[14], v[3]);
+ v[15] = rot16_512(v[15]);
+ v[12] = rot16_512(v[12]);
+ v[13] = rot16_512(v[13]);
+ v[14] = rot16_512(v[14]);
+ v[10] = add_512(v[10], v[15]);
+ v[11] = add_512(v[11], v[12]);
+ v[8] = add_512(v[8], v[13]);
+ v[9] = add_512(v[9], v[14]);
+ v[5] = xor_512(v[5], v[10]);
+ v[6] = xor_512(v[6], v[11]);
+ v[7] = xor_512(v[7], v[8]);
+ v[4] = xor_512(v[4], v[9]);
+ v[5] = rot12_512(v[5]);
+ v[6] = rot12_512(v[6]);
+ v[7] = rot12_512(v[7]);
+ v[4] = rot12_512(v[4]);
+ v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); // second half of the diagonal step: schedule entries 9,11,13,15
+ v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+ v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+ v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+ v[0] = add_512(v[0], v[5]);
+ v[1] = add_512(v[1], v[6]);
+ v[2] = add_512(v[2], v[7]);
+ v[3] = add_512(v[3], v[4]);
+ v[15] = xor_512(v[15], v[0]);
+ v[12] = xor_512(v[12], v[1]);
+ v[13] = xor_512(v[13], v[2]);
+ v[14] = xor_512(v[14], v[3]);
+ v[15] = rot8_512(v[15]);
+ v[12] = rot8_512(v[12]);
+ v[13] = rot8_512(v[13]);
+ v[14] = rot8_512(v[14]);
+ v[10] = add_512(v[10], v[15]);
+ v[11] = add_512(v[11], v[12]);
+ v[8] = add_512(v[8], v[13]);
+ v[9] = add_512(v[9], v[14]);
+ v[5] = xor_512(v[5], v[10]);
+ v[6] = xor_512(v[6], v[11]);
+ v[7] = xor_512(v[7], v[8]);
+ v[4] = xor_512(v[4], v[9]);
+ v[5] = rot7_512(v[5]);
+ v[6] = rot7_512(v[6]);
+ v[7] = rot7_512(v[7]);
+ v[4] = rot7_512(v[4]);
+}
+
+// 0b10001000: selects 128-bit lanes a0/a2/b0/b2, listed in little-endian order
+#define LO_IMM8 0x88
+
+INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { // Result is the even-numbered 128-bit lanes of a, then those of b.
+ return _mm512_shuffle_i32x4(a, b, LO_IMM8);
+}
+
+// 0b11011101: selects 128-bit lanes a1/a3/b1/b3, listed in little-endian order
+#define HI_IMM8 0xdd
+
+INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { // Result is the odd-numbered 128-bit lanes of a, then those of b.
+ return _mm512_shuffle_i32x4(a, b, HI_IMM8);
+}
+
+INLINE void transpose_vecs_512(__m512i vecs[16]) { // In-place 16x16 transpose of the 32-bit words held in vecs[0..15].
+ // Interleave 32-bit lanes. The _0 unpack is lanes
+ // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
+ // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
+ __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
+ __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
+ __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
+ __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
+ __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
+ __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
+ __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
+ __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
+ __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
+ __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
+ __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
+ __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
+ __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
+ __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
+ __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
+ __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
+
+ // Interleave 64-bit lanes. The _0 unpack is lanes
+ // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
+ // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
+ // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
+ // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
+ __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
+ __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
+ __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
+ __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
+ __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
+ __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
+ __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
+ __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
+ __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
+ __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
+ __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
+ __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
+ __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
+ __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
+ __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
+ __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
+
+ // Interleave 128-bit lanes. The _0 unpack is
+ // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
+ // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
+ __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
+ __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
+ __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
+ __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
+ __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
+ __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
+ __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
+ __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
+ __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
+ __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
+ __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
+ __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
+ __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
+ __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
+ __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
+ __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
+
+ // Interleave 128-bit lanes again for the final outputs.
+ vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
+ vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
+ vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
+ vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
+ vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
+ vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
+ vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
+ vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
+ vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
+ vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
+ vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
+ vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
+ vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
+ vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
+ vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
+ vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
+}
+
+INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, // Loads 64 bytes at block_offset from each of inputs[0..15];
+ size_t block_offset, __m512i out[16]) { // afterwards out[k] holds 32-bit word k of that block for all sixteen inputs.
+ out[0] = loadu_512(&inputs[0][block_offset]); // one whole 64-byte vector per input
+ out[1] = loadu_512(&inputs[1][block_offset]);
+ out[2] = loadu_512(&inputs[2][block_offset]);
+ out[3] = loadu_512(&inputs[3][block_offset]);
+ out[4] = loadu_512(&inputs[4][block_offset]);
+ out[5] = loadu_512(&inputs[5][block_offset]);
+ out[6] = loadu_512(&inputs[6][block_offset]);
+ out[7] = loadu_512(&inputs[7][block_offset]);
+ out[8] = loadu_512(&inputs[8][block_offset]);
+ out[9] = loadu_512(&inputs[9][block_offset]);
+ out[10] = loadu_512(&inputs[10][block_offset]);
+ out[11] = loadu_512(&inputs[11][block_offset]);
+ out[12] = loadu_512(&inputs[12][block_offset]);
+ out[13] = loadu_512(&inputs[13][block_offset]);
+ out[14] = loadu_512(&inputs[14][block_offset]);
+ out[15] = loadu_512(&inputs[15][block_offset]);
+ for (size_t i = 0; i < 16; ++i) {
+ _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); // prefetch hint for data 256 bytes past this block's start
+ }
+ transpose_vecs_512(out); // a single 16x16 transpose covers the whole block
+}
+
+INLINE void load_counters16(uint64_t counter, bool increment_counter, // Builds sixteen counters as separate 32-bit halves:
+ __m512i *out_lo, __m512i *out_hi) { // *out_lo gets the low words, *out_hi the high words with carry applied.
+ const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); // -1 (all ones) in every lane when incrementing, else 0
+ const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // lane i holds i (set_epi32 lists the highest lane first)
+ const __m512i add1 = _mm512_and_si512(mask, add0); // per-lane offsets, or all zero when not incrementing
+ __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); // low 32 bits of counter plus offset, wrapping modulo 2^32
+ __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); // unsigned sum < addend exactly when the 32-bit add wrapped
+ __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); // high word, +1 only in lanes flagged by carry
+ *out_lo = l;
+ *out_hi = h;
+}
+
+void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, // Hashes inputs[0..15] in parallel, `blocks` blocks each,
+ const uint32_t key[8], uint64_t counter, // starting every lane from key; writes 16 x 32 = 512 bytes to out.
+ bool increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end,
+ uint8_t *out) {
+ __m512i h_vecs[8] = { // h_vecs[i] carries state word i for all sixteen inputs
+ set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
+ set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
+ };
+ __m512i counter_low_vec, counter_high_vec;
+ load_counters16(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec); // computed once and reused for every block
+ uint8_t block_flags = flags | flags_start; // only the first block carries flags_start
+
+ for (size_t block = 0; block < blocks; block++) {
+ if (block + 1 == blocks) {
+ block_flags |= flags_end; // last block; with blocks == 1 it carries both start and end flags
+ }
+ __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
+ __m512i block_flags_vec = set1_512(block_flags);
+ __m512i msg_vecs[16];
+ transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+ __m512i v[16] = { // state: 8 chaining words, IV[0..3], counter low/high, block length, flags
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+ };
+ round_fn16(v, msg_vecs, 0); // seven rounds, r = 0..6
+ round_fn16(v, msg_vecs, 1);
+ round_fn16(v, msg_vecs, 2);
+ round_fn16(v, msg_vecs, 3);
+ round_fn16(v, msg_vecs, 4);
+ round_fn16(v, msg_vecs, 5);
+ round_fn16(v, msg_vecs, 6);
+ h_vecs[0] = xor_512(v[0], v[8]); // new chaining word i is formed from v[i] and v[i + 8]
+ h_vecs[1] = xor_512(v[1], v[9]);
+ h_vecs[2] = xor_512(v[2], v[10]);
+ h_vecs[3] = xor_512(v[3], v[11]);
+ h_vecs[4] = xor_512(v[4], v[12]);
+ h_vecs[5] = xor_512(v[5], v[13]);
+ h_vecs[6] = xor_512(v[6], v[14]);
+ h_vecs[7] = xor_512(v[7], v[15]);
+
+ block_flags = flags; // drop flags_start for all following blocks
+ }
+
+ // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
+ // state vectors. Pad the matrix with zeros. After transposition, store the
+ // lower half of each vector.
+ __m512i padded[16] = {
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_512(0), set1_512(0), set1_512(0), set1_512(0),
+ set1_512(0), set1_512(0), set1_512(0), set1_512(0),
+ };
+ transpose_vecs_512(padded); // padded[i] now has input i's eight state words in its low 256 bits, zeros above
+ _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); // mask 0xFF writes all eight 32-bit lanes
+ _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
+ _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
+ _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
+ _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
+ _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
+ _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
+ _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
+ _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
+ _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
+ _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
+ _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
+ _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
+ _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
+ _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
+ _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, // Hashes a single input serially, one block per loop iteration,
+ const uint32_t key[8], uint64_t counter, // chaining through a local copy of key (key itself is not modified).
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+ uint32_t cv[8];
+ memcpy(cv, key, BLAKE3_KEY_LEN); // chaining value starts as the key
+ uint8_t block_flags = flags | flags_start; // only the first block carries flags_start
+ while (blocks > 0) {
+ if (blocks == 1) {
+ block_flags |= flags_end; // last block; a single-block input carries both start and end flags
+ }
+ blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, // the same counter value is passed for every block
+ block_flags);
+ input = &input[BLAKE3_BLOCK_LEN]; // advance to the next block
+ blocks -= 1;
+ block_flags = flags; // drop flags_start for all following blocks
+ }
+ memcpy(out, cv, BLAKE3_OUT_LEN); // with blocks == 0 this copies the key bytes through unchanged
+}
+
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, // Hashes num_inputs inputs of `blocks` blocks each,
+ size_t blocks, const uint32_t key[8], // using the widest kernel that still fits the remaining count;
+ uint64_t counter, bool increment_counter, // writes BLAKE3_OUT_LEN bytes per input, back to back, into out.
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out) {
+ while (num_inputs >= 16) { // batches of 16
+ blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
+ flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 16; // the batch consumed counter .. counter + 15
+ }
+ inputs += 16;
+ num_inputs -= 16;
+ out = &out[16 * BLAKE3_OUT_LEN];
+ }
+ while (num_inputs >= 8) { // at most one batch of 8 can remain here
+ blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
+ flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 8;
+ }
+ inputs += 8;
+ num_inputs -= 8;
+ out = &out[8 * BLAKE3_OUT_LEN];
+ }
+ while (num_inputs >= 4) { // at most one batch of 4 can remain here
+ blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
+ flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 4;
+ }
+ inputs += 4;
+ num_inputs -= 4;
+ out = &out[4 * BLAKE3_OUT_LEN];
+ }
+ while (num_inputs > 0) { // leftover inputs (fewer than 4) are hashed one at a time
+ hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
+ flags_end, out);
+ if (increment_counter) {
+ counter += 1;
+ }
+ inputs += 1;
+ num_inputs -= 1;
+ out = &out[BLAKE3_OUT_LEN];
+ }
+}
diff --git a/deps/blake3/blake3_avx512_x86-64_unix.S b/deps/blake3/blake3_avx512_x86-64_unix.S
@@ -0,0 +1,2585 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits /* empty note section; GNU linkers read it as: this object does not need an executable stack */
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* only look for cet.h when CET is enabled and __has_include is available */
+#if __has_include(<cet.h>)
+#include <cet.h> /* expected to supply _CET_ENDBR - TODO confirm against the toolchain header */
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR /* fallback: expands to nothing, so the marker at each entry point emits no instruction */
+#endif
+
+.intel_syntax noprefix /* Intel operand order (destination first), register names without a percent prefix */
+.global _blake3_hash_many_avx512 /* each symbol is exported twice: plain and with a leading underscore */
+.global blake3_hash_many_avx512 /* presumably so both ELF and Mach-O C symbol naming resolve - TODO confirm */
+.global blake3_compress_in_place_avx512
+.global _blake3_compress_in_place_avx512
+.global blake3_compress_xof_avx512
+.global _blake3_compress_xof_avx512
+
+#ifdef __APPLE__
+.text /* Apple toolchains: plain .text directive */
+#else
+.section .text /* everything else: explicit section directive */
+#endif
+.p2align 6
+_blake3_hash_many_avx512:
+blake3_hash_many_avx512:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 144
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9
+ kmovw k1, r9d
+ vmovd xmm0, r8d
+ vpbroadcastd ymm0, xmm0
+ shr r8, 32
+ vmovd xmm1, r8d
+ vpbroadcastd ymm1, xmm1
+ vmovdqa ymm4, ymm1
+ vmovdqa ymm5, ymm1
+ vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
+ vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+ vpcmpltud k2, ymm2, ymm0
+ vpcmpltud k3, ymm3, ymm0
+ vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+ vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+ knotw k2, k1
+ vmovdqa32 ymm2 {k2}, ymm0
+ vmovdqa32 ymm3 {k2}, ymm0
+ vmovdqa32 ymm4 {k2}, ymm1
+ vmovdqa32 ymm5 {k2}, ymm1
+ vmovdqa ymmword ptr [rsp], ymm2
+ vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
+ vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
+ shl rdx, 6
+ mov qword ptr [rsp+0x80], rdx
+ cmp rsi, 16
+ jc 3f
+2:
+ vpbroadcastd zmm0, dword ptr [rcx]
+ vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+ vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+ vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
+ vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
+ vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
+ vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
+ vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm8, zmm16, zmm17
+ vpunpckhqdq zmm9, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm10, zmm18, zmm19
+ vpunpckhqdq zmm11, zmm18, zmm19
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm12, zmm16, zmm17
+ vpunpckhqdq zmm13, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm14, zmm18, zmm19
+ vpunpckhqdq zmm15, zmm18, zmm19
+ vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
+ vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
+ vshufps zmm16, zmm8, zmm10, 136
+ vshufps zmm17, zmm12, zmm14, 136
+ vmovdqa32 zmm20, zmm16
+ vpermt2d zmm16, zmm27, zmm17
+ vpermt2d zmm20, zmm31, zmm17
+ vshufps zmm17, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm21, zmm17
+ vpermt2d zmm17, zmm27, zmm30
+ vpermt2d zmm21, zmm31, zmm30
+ vshufps zmm18, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm22, zmm18
+ vpermt2d zmm18, zmm27, zmm8
+ vpermt2d zmm22, zmm31, zmm8
+ vshufps zmm19, zmm9, zmm11, 221
+ vshufps zmm8, zmm13, zmm15, 221
+ vmovdqa32 zmm23, zmm19
+ vpermt2d zmm19, zmm27, zmm8
+ vpermt2d zmm23, zmm31, zmm8
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm8, zmm24, zmm25
+ vpunpckhqdq zmm9, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm10, zmm24, zmm25
+ vpunpckhqdq zmm11, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm12, zmm24, zmm25
+ vpunpckhqdq zmm13, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm14, zmm24, zmm25
+ vpunpckhqdq zmm15, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vshufps zmm24, zmm8, zmm10, 136
+ vshufps zmm30, zmm12, zmm14, 136
+ vmovdqa32 zmm28, zmm24
+ vpermt2d zmm24, zmm27, zmm30
+ vpermt2d zmm28, zmm31, zmm30
+ vshufps zmm25, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm29, zmm25
+ vpermt2d zmm25, zmm27, zmm30
+ vpermt2d zmm29, zmm31, zmm30
+ vshufps zmm26, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm30, zmm26
+ vpermt2d zmm26, zmm27, zmm8
+ vpermt2d zmm30, zmm31, zmm8
+ vshufps zmm8, zmm9, zmm11, 221
+ vshufps zmm10, zmm13, zmm15, 221
+ vpermi2d zmm27, zmm8, zmm10
+ vpermi2d zmm31, zmm8, zmm10
+ vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa32 zmm12, zmmword ptr [rsp]
+ vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
+ vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm24
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm23
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm27
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm21
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm28
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm26
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm22
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm31
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpxord zmm0, zmm0, zmm8
+ vpxord zmm1, zmm1, zmm9
+ vpxord zmm2, zmm2, zmm10
+ vpxord zmm3, zmm3, zmm11
+ vpxord zmm4, zmm4, zmm12
+ vpxord zmm5, zmm5, zmm13
+ vpxord zmm6, zmm6, zmm14
+ vpxord zmm7, zmm7, zmm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vpunpckldq zmm16, zmm0, zmm1
+ vpunpckhdq zmm17, zmm0, zmm1
+ vpunpckldq zmm18, zmm2, zmm3
+ vpunpckhdq zmm19, zmm2, zmm3
+ vpunpckldq zmm20, zmm4, zmm5
+ vpunpckhdq zmm21, zmm4, zmm5
+ vpunpckldq zmm22, zmm6, zmm7
+ vpunpckhdq zmm23, zmm6, zmm7
+ vpunpcklqdq zmm0, zmm16, zmm18
+ vpunpckhqdq zmm1, zmm16, zmm18
+ vpunpcklqdq zmm2, zmm17, zmm19
+ vpunpckhqdq zmm3, zmm17, zmm19
+ vpunpcklqdq zmm4, zmm20, zmm22
+ vpunpckhqdq zmm5, zmm20, zmm22
+ vpunpcklqdq zmm6, zmm21, zmm23
+ vpunpckhqdq zmm7, zmm21, zmm23
+ vshufi32x4 zmm16, zmm0, zmm4, 0x88
+ vshufi32x4 zmm17, zmm1, zmm5, 0x88
+ vshufi32x4 zmm18, zmm2, zmm6, 0x88
+ vshufi32x4 zmm19, zmm3, zmm7, 0x88
+ vshufi32x4 zmm20, zmm0, zmm4, 0xDD
+ vshufi32x4 zmm21, zmm1, zmm5, 0xDD
+ vshufi32x4 zmm22, zmm2, zmm6, 0xDD
+ vshufi32x4 zmm23, zmm3, zmm7, 0xDD
+ vshufi32x4 zmm0, zmm16, zmm17, 0x88
+ vshufi32x4 zmm1, zmm18, zmm19, 0x88
+ vshufi32x4 zmm2, zmm20, zmm21, 0x88
+ vshufi32x4 zmm3, zmm22, zmm23, 0x88
+ vshufi32x4 zmm4, zmm16, zmm17, 0xDD
+ vshufi32x4 zmm5, zmm18, zmm19, 0xDD
+ vshufi32x4 zmm6, zmm20, zmm21, 0xDD
+ vshufi32x4 zmm7, zmm22, zmm23, 0xDD
+ vmovdqu32 zmmword ptr [rbx], zmm0
+ vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
+ vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
+ vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
+ vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
+ vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
+ vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
+ vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
+ vmovdqa32 zmm0, zmmword ptr [rsp]
+ vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
+ vmovdqa32 zmm2, zmm0
+ vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
+ vpcmpltud k2, zmm2, zmm0
+ vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
+ vmovdqa32 zmmword ptr [rsp], zmm2
+ vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+ add rdi, 128
+ add rbx, 512
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 16
+ cmp rsi, 16
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 6
+3:
+ test esi, 0x8
+ je 3f
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+2:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm16, ymm12, ymm14, 136
+ vshufps ymm17, ymm12, ymm14, 221
+ vshufps ymm18, ymm13, ymm15, 136
+ vshufps ymm19, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm20, ymm12, ymm14, 136
+ vshufps ymm21, ymm12, ymm14, 221
+ vshufps ymm22, ymm13, ymm15, 136
+ vshufps ymm23, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm24, ymm12, ymm14, 136
+ vshufps ymm25, ymm12, ymm14, 221
+ vshufps ymm26, ymm13, ymm15, 136
+ vshufps ymm27, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm28, ymm12, ymm14, 136
+ vshufps ymm29, ymm12, ymm14, 221
+ vshufps ymm30, ymm13, ymm15, 136
+ vshufps ymm31, ymm13, ymm15, 221
+ vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa ymm12, ymmword ptr [rsp]
+ vmovdqa ymm13, ymmword ptr [rsp+0x40]
+ vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd ymm15, dword ptr [rsp+0x88]
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm24
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm23
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm27
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm21
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm28
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm26
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm22
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm31
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 2b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp]
+ vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
+ vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
+ vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
+ vmovdqa ymmword ptr [rsp], ymm0
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ add rdi, 64
+ sub rsi, 8
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x80]
+ movzx r13, byte ptr [rbp+0x38]
+ movzx r12, byte ptr [rbp+0x48]
+ test esi, 0x4
+ je 3f
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+ vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
+ vmovdqa xmm12, xmmword ptr [rsp]
+ vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
+ vpunpckldq xmm14, xmm12, xmm13
+ vpunpckhdq xmm15, xmm12, xmm13
+ vpermq ymm14, ymm14, 0xDC
+ vpermq ymm15, ymm15, 0xDC
+ vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
+ mov eax, 17476
+ kmovw k2, eax
+ vpblendmd zmm13 {k2}, zmm13, zmm12
+ vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov eax, 43690
+ kmovw k3, eax
+ mov eax, 34952
+ kmovw k4, eax
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vmovdqa32 zmm2, zmm15
+ vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
+ vpblendmd zmm3 {k4}, zmm13, zmm8
+ vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x30]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
+ vshufps zmm4, zmm8, zmm9, 136
+ vshufps zmm5, zmm8, zmm9, 221
+ vmovups zmm8, zmmword ptr [r8+rdx-0x20]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x10]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
+ vshufps zmm6, zmm8, zmm9, 136
+ vshufps zmm7, zmm8, zmm9, 221
+ vpshufd zmm6, zmm6, 0x93
+ vpshufd zmm7, zmm7, 0x93
+ mov al, 7
+9:
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x93
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x39
+ vpaddd zmm0, zmm0, zmm6
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm7
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x39
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x93
+ dec al
+ jz 9f
+ vshufps zmm8, zmm4, zmm5, 214
+ vpshufd zmm9, zmm4, 0x0F
+ vpshufd zmm4, zmm8, 0x39
+ vshufps zmm8, zmm6, zmm7, 250
+ vpblendmd zmm9 {k3}, zmm9, zmm8
+ vpunpcklqdq zmm8, zmm7, zmm5
+ vpblendmd zmm8 {k4}, zmm8, zmm6
+ vpshufd zmm8, zmm8, 0x78
+ vpunpckhdq zmm5, zmm5, zmm7
+ vpunpckldq zmm6, zmm6, zmm5
+ vpshufd zmm7, zmm6, 0x1E
+ vmovdqa32 zmm5, zmm9
+ vmovdqa32 zmm6, zmm8
+ jmp 9b
+9:
+ vpxord zmm0, zmm0, zmm2
+ vpxord zmm1, zmm1, zmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
+ vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x40]
+ vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
+ vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x40], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test esi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x4]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x88]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
+ vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
+ vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm14, dword ptr [rsp]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vpinsrd xmm3, xmm14, eax, 3
+ vmovdqa xmm2, xmm15
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+.p2align 6 /* align the entry point to a 2^6 = 64-byte boundary */
+_blake3_compress_in_place_avx512: /* underscore-prefixed label for the same entry point (presumably for targets that prepend an underscore to C symbols -- confirm) */
+blake3_compress_in_place_avx512: /* Mixes the 64-byte block at [rsi] into the 32-byte state at [rdi] over 7 rounds and overwrites [rdi] with the result. Inputs read: rdi, rsi, dl, rcx, r8b. */
+ _CET_ENDBR /* macro defined outside this excerpt; presumably an indirect-branch landing pad when CET is enabled -- confirm */
+ vmovdqu xmm0, xmmword ptr [rdi] /* row 0 = state bytes 0..15 */
+ vmovdqu xmm1, xmmword ptr [rdi+0x10] /* row 1 = state bytes 16..31 */
+ movzx eax, r8b /* eax = low byte of r8 (NOTE(review): presumably the flags argument -- confirm against the C prototype) */
+ movzx edx, dl /* edx = low byte of rdx (NOTE(review): presumably the block length -- confirm) */
+ shl rax, 32
+ add rdx, rax /* rdx = (r8b << 32) + dl */
+ vmovq xmm3, rcx /* low qword of row 3 = rcx (NOTE(review): presumably a 64-bit counter -- confirm) */
+ vmovq xmm4, rdx
+ vpunpcklqdq xmm3, xmm3, xmm4 /* row 3 = { rcx, (r8b << 32) + dl } */
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 = the four dwords at BLAKE3_IV (aligned load) */
+ vmovups xmm8, xmmword ptr [rsi] /* block bytes 0..15 */
+ vmovups xmm9, xmmword ptr [rsi+0x10] /* block bytes 16..31 */
+ vshufps xmm4, xmm8, xmm9, 136 /* xmm4 = block dwords 0, 2, 4, 6 */
+ vshufps xmm5, xmm8, xmm9, 221 /* xmm5 = block dwords 1, 3, 5, 7 */
+ vmovups xmm8, xmmword ptr [rsi+0x20] /* block bytes 32..47 */
+ vmovups xmm9, xmmword ptr [rsi+0x30] /* block bytes 48..63 */
+ vshufps xmm6, xmm8, xmm9, 136 /* xmm6 = block dwords 8, 10, 12, 14 */
+ vshufps xmm7, xmm8, xmm9, 221 /* xmm7 = block dwords 9, 11, 13, 15 */
+ vpshufd xmm6, xmm6, 0x93 /* rotate lanes by one: xmm6 = dwords 14, 8, 10, 12 */
+ vpshufd xmm7, xmm7, 0x93 /* rotate lanes by one: xmm7 = dwords 15, 9, 11, 13 */
+ mov al, 7 /* round counter = 7; rax is free here because its value was already folded into rdx */
+9: /* top of the round loop */
+ vpaddd xmm0, xmm0, xmm4 /* row 0 += message words in xmm4 */
+ vpaddd xmm0, xmm0, xmm1 /* row 0 += row 1 */
+ vpxord xmm3, xmm3, xmm0 /* row 3 ^= row 0 */
+ vprord xmm3, xmm3, 16 /* rotate each dword of row 3 right by 16 */
+ vpaddd xmm2, xmm2, xmm3 /* row 2 += row 3 */
+ vpxord xmm1, xmm1, xmm2 /* row 1 ^= row 2 */
+ vprord xmm1, xmm1, 12 /* rotate each dword of row 1 right by 12 */
+ vpaddd xmm0, xmm0, xmm5 /* row 0 += message words in xmm5 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8 /* rotate right by 8 */
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7 /* rotate right by 7 */
+ vpshufd xmm0, xmm0, 0x93 /* rotate the lanes of row 0 by one position (lane 3 -> lane 0) */
+ vpshufd xmm3, xmm3, 0x4E /* swap the two 64-bit halves of row 3 */
+ vpshufd xmm2, xmm2, 0x39 /* rotate the lanes of row 2 by one position the other way (lane 0 -> lane 3) */
+ vpaddd xmm0, xmm0, xmm6 /* second half of the round: same add/xor/rotate sequence with xmm6 ... */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7 /* ... and then with xmm7 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39 /* undo the lane rotation applied to row 0 above */
+ vpshufd xmm3, xmm3, 0x4E /* swap the halves of row 3 back */
+ vpshufd xmm2, xmm2, 0x93 /* undo the lane rotation applied to row 2 above */
+ dec al /* one round done */
+ jz 9f /* after the 7th round skip the message shuffle and go to the final xor */
+ vshufps xmm8, xmm4, xmm5, 214 /* from here to the jmp: rearrange the message words in xmm4..xmm7 for the next round (xmm8/xmm9 are scratch) */
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39 /* new xmm4 */
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA /* xmm9 holds the new xmm5 */
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78 /* xmm8 holds the new xmm6 */
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E /* new xmm7 */
+ vmovdqa xmm5, xmm9 /* commit new xmm5 */
+ vmovdqa xmm6, xmm8 /* commit new xmm6 */
+ jmp 9b /* next round */
+9: /* loop exit */
+ vpxor xmm0, xmm0, xmm2 /* output bytes 0..15 = row 0 ^ row 2 */
+ vpxor xmm1, xmm1, xmm3 /* output bytes 16..31 = row 1 ^ row 3 */
+ vmovdqu xmmword ptr [rdi], xmm0 /* write the result back over the input state */
+ vmovdqu xmmword ptr [rdi+0x10], xmm1
+ ret
+
+.p2align 6
+_blake3_compress_xof_avx512: /* underscore-prefixed label for the same entry point (presumably for targets that prepend an underscore to C symbols -- confirm) */
+blake3_compress_xof_avx512: /* Same 7-round mixing as the in-place variant, but leaves [rdi] untouched and writes a 64-byte result to [r9]. Inputs read: rdi, rsi, dl, rcx, r8b, r9. */
+ _CET_ENDBR /* macro defined outside this excerpt; presumably an indirect-branch landing pad when CET is enabled -- confirm */
+ vmovdqu xmm0, xmmword ptr [rdi] /* row 0 = state bytes 0..15 */
+ vmovdqu xmm1, xmmword ptr [rdi+0x10] /* row 1 = state bytes 16..31 */
+ movzx eax, r8b /* eax = low byte of r8 (NOTE(review): presumably the flags argument -- confirm against the C prototype) */
+ movzx edx, dl /* edx = low byte of rdx (NOTE(review): presumably the block length -- confirm) */
+ shl rax, 32
+ add rdx, rax /* rdx = (r8b << 32) + dl */
+ vmovq xmm3, rcx /* low qword of row 3 = rcx (NOTE(review): presumably a 64-bit counter -- confirm) */
+ vmovq xmm4, rdx
+ vpunpcklqdq xmm3, xmm3, xmm4 /* row 3 = { rcx, (r8b << 32) + dl } */
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 = the four dwords at BLAKE3_IV (aligned load) */
+ vmovups xmm8, xmmword ptr [rsi] /* block bytes 0..15 */
+ vmovups xmm9, xmmword ptr [rsi+0x10] /* block bytes 16..31 */
+ vshufps xmm4, xmm8, xmm9, 136 /* xmm4 = block dwords 0, 2, 4, 6 */
+ vshufps xmm5, xmm8, xmm9, 221 /* xmm5 = block dwords 1, 3, 5, 7 */
+ vmovups xmm8, xmmword ptr [rsi+0x20] /* block bytes 32..47 */
+ vmovups xmm9, xmmword ptr [rsi+0x30] /* block bytes 48..63 */
+ vshufps xmm6, xmm8, xmm9, 136 /* xmm6 = block dwords 8, 10, 12, 14 */
+ vshufps xmm7, xmm8, xmm9, 221 /* xmm7 = block dwords 9, 11, 13, 15 */
+ vpshufd xmm6, xmm6, 0x93 /* rotate lanes by one: xmm6 = dwords 14, 8, 10, 12 */
+ vpshufd xmm7, xmm7, 0x93 /* rotate lanes by one: xmm7 = dwords 15, 9, 11, 13 */
+ mov al, 7 /* round counter = 7; rax is free here because its value was already folded into rdx */
+9: /* top of the round loop */
+ vpaddd xmm0, xmm0, xmm4 /* row 0 += message words in xmm4 */
+ vpaddd xmm0, xmm0, xmm1 /* row 0 += row 1 */
+ vpxord xmm3, xmm3, xmm0 /* row 3 ^= row 0 */
+ vprord xmm3, xmm3, 16 /* rotate each dword of row 3 right by 16 */
+ vpaddd xmm2, xmm2, xmm3 /* row 2 += row 3 */
+ vpxord xmm1, xmm1, xmm2 /* row 1 ^= row 2 */
+ vprord xmm1, xmm1, 12 /* rotate each dword of row 1 right by 12 */
+ vpaddd xmm0, xmm0, xmm5 /* row 0 += message words in xmm5 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8 /* rotate right by 8 */
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7 /* rotate right by 7 */
+ vpshufd xmm0, xmm0, 0x93 /* rotate the lanes of row 0 by one position (lane 3 -> lane 0) */
+ vpshufd xmm3, xmm3, 0x4E /* swap the two 64-bit halves of row 3 */
+ vpshufd xmm2, xmm2, 0x39 /* rotate the lanes of row 2 by one position the other way (lane 0 -> lane 3) */
+ vpaddd xmm0, xmm0, xmm6 /* second half of the round: same add/xor/rotate sequence with xmm6 ... */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7 /* ... and then with xmm7 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39 /* undo the lane rotation applied to row 0 above */
+ vpshufd xmm3, xmm3, 0x4E /* swap the halves of row 3 back */
+ vpshufd xmm2, xmm2, 0x93 /* undo the lane rotation applied to row 2 above */
+ dec al /* one round done */
+ jz 9f /* after the 7th round skip the message shuffle and go to the output stage */
+ vshufps xmm8, xmm4, xmm5, 214 /* from here to the jmp: rearrange the message words in xmm4..xmm7 for the next round (xmm8/xmm9 are scratch) */
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39 /* new xmm4 */
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA /* xmm9 holds the new xmm5 */
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78 /* xmm8 holds the new xmm6 */
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E /* new xmm7 */
+ vmovdqa xmm5, xmm9 /* commit new xmm5 */
+ vmovdqa xmm6, xmm8 /* commit new xmm6 */
+ jmp 9b /* next round */
+9: /* loop exit */
+ vpxor xmm0, xmm0, xmm2 /* output bytes 0..15 = row 0 ^ row 2 */
+ vpxor xmm1, xmm1, xmm3 /* output bytes 16..31 = row 1 ^ row 3 */
+ vpxor xmm2, xmm2, [rdi] /* output bytes 32..47 = row 2 ^ input state bytes 0..15 (re-read from [rdi], which this routine never writes) */
+ vpxor xmm3, xmm3, [rdi+0x10] /* output bytes 48..63 = row 3 ^ input state bytes 16..31 */
+ vmovdqu xmmword ptr [r9], xmm0 /* store all 64 output bytes at [r9] (unaligned stores) */
+ vmovdqu xmmword ptr [r9+0x10], xmm1
+ vmovdqu xmmword ptr [r9+0x20], xmm2
+ vmovdqu xmmword ptr [r9+0x30], xmm3
+ ret
+
+#ifdef __APPLE__
+.static_data /* Apple toolchains: put the constant tables below in the static data section */
+#else
+.section .rodata /* everywhere else: put them in read-only data */
+#endif
+.p2align 6 /* 64-byte alignment for the tables that follow */
+INDEX0: /* 16 dwords (64 bytes); NOTE(review): the consuming code lies outside this excerpt -- values look like two-source dword permute indices, confirm */
+ .long 0, 1, 2, 3, 16, 17, 18, 19
+ .long 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1: /* 16 dwords (64 bytes); companion table to INDEX0 covering the remaining index values 4..7, 20..23, 12..15, 28..31 */
+ .long 4, 5, 6, 7, 20, 21, 22, 23
+ .long 12, 13, 14, 15, 28, 29, 30, 31
+ADD0: /* the dwords 0..15 in order (64 bytes); NOTE(review): presumably per-lane offsets added to a broadcast base value -- confirm at the use site */
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ .long 8, 9, 10, 11, 12, 13, 14, 15
+ADD1: .long 1 /* single dword constant 1 */
+
+ADD16: .long 16 /* single dword constant 16 */
+BLAKE3_BLOCK_LEN: /* single dword constant 64, loaded by the compression code as a state word */
+ .long 64
+.p2align 6 /* realign to 64 bytes so BLAKE3_IV can be read with aligned 16-byte loads (vmovaps / vmovdqa) */
+BLAKE3_IV: /* four consecutive dwords; BLAKE3_IV_0..BLAKE3_IV_3 label the individual words */
+BLAKE3_IV_0:
+ .long 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A
diff --git a/deps/blake3/blake3_avx512_x86-64_windows_gnu.S b/deps/blake3/blake3_avx512_x86-64_windows_gnu.S
@@ -0,0 +1,2615 @@
+.intel_syntax noprefix /* Intel operand order (destination first); register names written without a % prefix */
+
+.global _blake3_hash_many_avx512 /* each entry point is exported under both a plain and an underscore-prefixed name */
+.global blake3_hash_many_avx512
+.global blake3_compress_in_place_avx512
+.global _blake3_compress_in_place_avx512
+.global blake3_compress_xof_avx512
+.global _blake3_compress_xof_avx512
+
+.section .text /* executable code follows */
+.p2align 6
+_blake3_hash_many_avx512:
+blake3_hash_many_avx512:
+ push r15
+ push r14
+ push r13
+ push r12
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 304
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ vmovdqa xmmword ptr [rsp+0x90], xmm6
+ vmovdqa xmmword ptr [rsp+0xA0], xmm7
+ vmovdqa xmmword ptr [rsp+0xB0], xmm8
+ vmovdqa xmmword ptr [rsp+0xC0], xmm9
+ vmovdqa xmmword ptr [rsp+0xD0], xmm10
+ vmovdqa xmmword ptr [rsp+0xE0], xmm11
+ vmovdqa xmmword ptr [rsp+0xF0], xmm12
+ vmovdqa xmmword ptr [rsp+0x100], xmm13
+ vmovdqa xmmword ptr [rsp+0x110], xmm14
+ vmovdqa xmmword ptr [rsp+0x120], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+0x68]
+ movzx r9, byte ptr [rbp+0x70]
+ neg r9
+ kmovw k1, r9d
+ vmovd xmm0, r8d
+ vpbroadcastd ymm0, xmm0
+ shr r8, 32
+ vmovd xmm1, r8d
+ vpbroadcastd ymm1, xmm1
+ vmovdqa ymm4, ymm1
+ vmovdqa ymm5, ymm1
+ vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
+ vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+ vpcmpltud k2, ymm2, ymm0
+ vpcmpltud k3, ymm3, ymm0
+ vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+ vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+ knotw k2, k1
+ vmovdqa32 ymm2 {k2}, ymm0
+ vmovdqa32 ymm3 {k2}, ymm0
+ vmovdqa32 ymm4 {k2}, ymm1
+ vmovdqa32 ymm5 {k2}, ymm1
+ vmovdqa ymmword ptr [rsp], ymm2
+ vmovdqa ymmword ptr [rsp+0x20], ymm3
+ vmovdqa ymmword ptr [rsp+0x40], ymm4
+ vmovdqa ymmword ptr [rsp+0x60], ymm5
+ shl rdx, 6
+ mov qword ptr [rsp+0x80], rdx
+ cmp rsi, 16
+ jc 3f
+2:
+ vpbroadcastd zmm0, dword ptr [rcx]
+ vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+ vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+ vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
+ vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
+ vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
+ vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
+ vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
+ movzx eax, byte ptr [rbp+0x78]
+ movzx ebx, byte ptr [rbp+0x80]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x88]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm8, zmm16, zmm17
+ vpunpckhqdq zmm9, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm10, zmm18, zmm19
+ vpunpckhqdq zmm11, zmm18, zmm19
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm12, zmm16, zmm17
+ vpunpckhqdq zmm13, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm14, zmm18, zmm19
+ vpunpckhqdq zmm15, zmm18, zmm19
+ vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
+ vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
+ vshufps zmm16, zmm8, zmm10, 136
+ vshufps zmm17, zmm12, zmm14, 136
+ vmovdqa32 zmm20, zmm16
+ vpermt2d zmm16, zmm27, zmm17
+ vpermt2d zmm20, zmm31, zmm17
+ vshufps zmm17, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm21, zmm17
+ vpermt2d zmm17, zmm27, zmm30
+ vpermt2d zmm21, zmm31, zmm30
+ vshufps zmm18, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm22, zmm18
+ vpermt2d zmm18, zmm27, zmm8
+ vpermt2d zmm22, zmm31, zmm8
+ vshufps zmm19, zmm9, zmm11, 221
+ vshufps zmm8, zmm13, zmm15, 221
+ vmovdqa32 zmm23, zmm19
+ vpermt2d zmm19, zmm27, zmm8
+ vpermt2d zmm23, zmm31, zmm8
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm8, zmm24, zmm25
+ vpunpckhqdq zmm9, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm10, zmm24, zmm25
+ vpunpckhqdq zmm11, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm12, zmm24, zmm25
+ vpunpckhqdq zmm13, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm14, zmm24, zmm25
+ vpunpckhqdq zmm15, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vshufps zmm24, zmm8, zmm10, 136
+ vshufps zmm30, zmm12, zmm14, 136
+ vmovdqa32 zmm28, zmm24
+ vpermt2d zmm24, zmm27, zmm30
+ vpermt2d zmm28, zmm31, zmm30
+ vshufps zmm25, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm29, zmm25
+ vpermt2d zmm25, zmm27, zmm30
+ vpermt2d zmm29, zmm31, zmm30
+ vshufps zmm26, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm30, zmm26
+ vpermt2d zmm26, zmm27, zmm8
+ vpermt2d zmm30, zmm31, zmm8
+ vshufps zmm8, zmm9, zmm11, 221
+ vshufps zmm10, zmm13, zmm15, 221
+ vpermi2d zmm27, zmm8, zmm10
+ vpermi2d zmm31, zmm8, zmm10
+ vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa32 zmm12, zmmword ptr [rsp]
+ vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
+ vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm24
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm23
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm27
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm21
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm28
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm26
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm22
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm31
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpxord zmm0, zmm0, zmm8
+ vpxord zmm1, zmm1, zmm9
+ vpxord zmm2, zmm2, zmm10
+ vpxord zmm3, zmm3, zmm11
+ vpxord zmm4, zmm4, zmm12
+ vpxord zmm5, zmm5, zmm13
+ vpxord zmm6, zmm6, zmm14
+ vpxord zmm7, zmm7, zmm15
+ movzx eax, byte ptr [rbp+0x78]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x90]
+ vpunpckldq zmm16, zmm0, zmm1
+ vpunpckhdq zmm17, zmm0, zmm1
+ vpunpckldq zmm18, zmm2, zmm3
+ vpunpckhdq zmm19, zmm2, zmm3
+ vpunpckldq zmm20, zmm4, zmm5
+ vpunpckhdq zmm21, zmm4, zmm5
+ vpunpckldq zmm22, zmm6, zmm7
+ vpunpckhdq zmm23, zmm6, zmm7
+ vpunpcklqdq zmm0, zmm16, zmm18
+ vpunpckhqdq zmm1, zmm16, zmm18
+ vpunpcklqdq zmm2, zmm17, zmm19
+ vpunpckhqdq zmm3, zmm17, zmm19
+ vpunpcklqdq zmm4, zmm20, zmm22
+ vpunpckhqdq zmm5, zmm20, zmm22
+ vpunpcklqdq zmm6, zmm21, zmm23
+ vpunpckhqdq zmm7, zmm21, zmm23
+ vshufi32x4 zmm16, zmm0, zmm4, 0x88
+ vshufi32x4 zmm17, zmm1, zmm5, 0x88
+ vshufi32x4 zmm18, zmm2, zmm6, 0x88
+ vshufi32x4 zmm19, zmm3, zmm7, 0x88
+ vshufi32x4 zmm20, zmm0, zmm4, 0xDD
+ vshufi32x4 zmm21, zmm1, zmm5, 0xDD
+ vshufi32x4 zmm22, zmm2, zmm6, 0xDD
+ vshufi32x4 zmm23, zmm3, zmm7, 0xDD
+ vshufi32x4 zmm0, zmm16, zmm17, 0x88
+ vshufi32x4 zmm1, zmm18, zmm19, 0x88
+ vshufi32x4 zmm2, zmm20, zmm21, 0x88
+ vshufi32x4 zmm3, zmm22, zmm23, 0x88
+ vshufi32x4 zmm4, zmm16, zmm17, 0xDD
+ vshufi32x4 zmm5, zmm18, zmm19, 0xDD
+ vshufi32x4 zmm6, zmm20, zmm21, 0xDD
+ vshufi32x4 zmm7, zmm22, zmm23, 0xDD
+ vmovdqu32 zmmword ptr [rbx], zmm0
+ vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
+ vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
+ vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
+ vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
+ vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
+ vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
+ vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
+ vmovdqa32 zmm0, zmmword ptr [rsp]
+ vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
+ vmovdqa32 zmm2, zmm0
+ vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
+ vpcmpltud k2, zmm2, zmm0
+ vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
+ vmovdqa32 zmmword ptr [rsp], zmm2
+ vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+ add rdi, 128
+ add rbx, 512
+ mov qword ptr [rbp+0x90], rbx
+ sub rsi, 16
+ cmp rsi, 16
+ jnc 2b
+ test rsi, rsi
+ jne 3f
+4:
+ vzeroupper
+ vmovdqa xmm6, xmmword ptr [rsp+0x90]
+ vmovdqa xmm7, xmmword ptr [rsp+0xA0]
+ vmovdqa xmm8, xmmword ptr [rsp+0xB0]
+ vmovdqa xmm9, xmmword ptr [rsp+0xC0]
+ vmovdqa xmm10, xmmword ptr [rsp+0xD0]
+ vmovdqa xmm11, xmmword ptr [rsp+0xE0]
+ vmovdqa xmm12, xmmword ptr [rsp+0xF0]
+ vmovdqa xmm13, xmmword ptr [rsp+0x100]
+ vmovdqa xmm14, xmmword ptr [rsp+0x110]
+ vmovdqa xmm15, xmmword ptr [rsp+0x120]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 6
+3:
+ test esi, 0x8
+ je 3f
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x78]
+ movzx ebx, byte ptr [rbp+0x80]
+ or eax, ebx
+ xor edx, edx
+2:
+ movzx ebx, byte ptr [rbp+0x88]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm16, ymm12, ymm14, 136
+ vshufps ymm17, ymm12, ymm14, 221
+ vshufps ymm18, ymm13, ymm15, 136
+ vshufps ymm19, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm20, ymm12, ymm14, 136
+ vshufps ymm21, ymm12, ymm14, 221
+ vshufps ymm22, ymm13, ymm15, 136
+ vshufps ymm23, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm24, ymm12, ymm14, 136
+ vshufps ymm25, ymm12, ymm14, 221
+ vshufps ymm26, ymm13, ymm15, 136
+ vshufps ymm27, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm28, ymm12, ymm14, 136
+ vshufps ymm29, ymm12, ymm14, 221
+ vshufps ymm30, ymm13, ymm15, 136
+ vshufps ymm31, ymm13, ymm15, 221
+ vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa ymm12, ymmword ptr [rsp]
+ vmovdqa ymm13, ymmword ptr [rsp+0x40]
+ vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd ymm15, dword ptr [rsp+0x88]
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm24
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm23
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm27
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm21
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm28
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm26
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm22
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm31
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x78]
+ jne 2b
+ mov rbx, qword ptr [rbp+0x90]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp]
+ vmovdqa ymm2, ymmword ptr [rsp+0x40]
+ vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
+ vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
+ vmovdqa ymmword ptr [rsp], ymm0
+ vmovdqa ymmword ptr [rsp+0x40], ymm2
+ add rbx, 256
+ mov qword ptr [rbp+0x90], rbx
+ add rdi, 64
+ sub rsi, 8
+3:
+ mov rbx, qword ptr [rbp+0x90]
+ mov r15, qword ptr [rsp+0x80]
+ movzx r13, byte ptr [rbp+0x78]
+ movzx r12, byte ptr [rbp+0x88]
+ test esi, 0x4
+ je 3f
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+ vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
+ vmovdqa xmm12, xmmword ptr [rsp]
+ vmovdqa xmm13, xmmword ptr [rsp+0x40]
+ vpunpckldq xmm14, xmm12, xmm13
+ vpunpckhdq xmm15, xmm12, xmm13
+ vpermq ymm14, ymm14, 0xDC
+ vpermq ymm15, ymm15, 0xDC
+ vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
+ mov eax, 17476
+ kmovw k2, eax
+ vpblendmd zmm13 {k2}, zmm13, zmm12
+ vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov eax, 43690
+ kmovw k3, eax
+ mov eax, 34952
+ kmovw k4, eax
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vmovdqa32 zmm2, zmm15
+ vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
+ vpblendmd zmm3 {k4}, zmm13, zmm8
+ vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x30]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
+ vshufps zmm4, zmm8, zmm9, 136
+ vshufps zmm5, zmm8, zmm9, 221
+ vmovups zmm8, zmmword ptr [r8+rdx-0x20]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x10]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
+ vshufps zmm6, zmm8, zmm9, 136
+ vshufps zmm7, zmm8, zmm9, 221
+ vpshufd zmm6, zmm6, 0x93
+ vpshufd zmm7, zmm7, 0x93
+ mov al, 7
+9:
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x93
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x39
+ vpaddd zmm0, zmm0, zmm6
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm7
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x39
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x93
+ dec al
+ jz 9f
+ vshufps zmm8, zmm4, zmm5, 214
+ vpshufd zmm9, zmm4, 0x0F
+ vpshufd zmm4, zmm8, 0x39
+ vshufps zmm8, zmm6, zmm7, 250
+ vpblendmd zmm9 {k3}, zmm9, zmm8
+ vpunpcklqdq zmm8, zmm7, zmm5
+ vpblendmd zmm8 {k4}, zmm8, zmm6
+ vpshufd zmm8, zmm8, 0x78
+ vpunpckhdq zmm5, zmm5, zmm7
+ vpunpckldq zmm6, zmm6, zmm5
+ vpshufd zmm7, zmm6, 0x1E
+ vmovdqa32 zmm5, zmm9
+ vmovdqa32 zmm6, zmm8
+ jmp 9b
+9:
+ vpxord zmm0, zmm0, zmm2
+ vpxord zmm1, zmm1, zmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
+ vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x40]
+ vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
+ vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x40], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test esi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x4]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x88]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x40]
+ vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
+ vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x40], xmm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm14, dword ptr [rsp]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vpinsrd xmm3, xmm14, eax, 3
+ vmovdqa xmm2, xmm15
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+
+.p2align 6
+_blake3_compress_in_place_avx512: /* Compresses one 64-byte message block into the 32-byte state at [rcx], overwriting that state with the result. */
+blake3_compress_in_place_avx512: /* Reads rcx = state, rdx = message block, and r8b / r9 / the byte at [rsp+0x70] as parameter words (presumably block_len / counter / flags - confirm against the C prototype). */
+ sub rsp, 72 /* 64 bytes of XMM save area plus 8; NOTE(review): the extra 8 presumably restores 16-byte rsp alignment for the vmovdqa saves - confirm */
+ vmovdqa xmmword ptr [rsp], xmm6 /* xmm6-xmm9 are clobbered below, so save them (they are callee-saved under the Microsoft x64 convention) */
+ vmovdqa xmmword ptr [rsp+0x10], xmm7
+ vmovdqa xmmword ptr [rsp+0x20], xmm8
+ vmovdqa xmmword ptr [rsp+0x30], xmm9
+ vmovdqu xmm0, xmmword ptr [rcx] /* row 0 = state words 0-3 */
+ vmovdqu xmm1, xmmword ptr [rcx+0x10] /* row 1 = state words 4-7 */
+ movzx eax, byte ptr [rsp+0x70] /* byte-sized stack argument: 0x70 = 72 local + 8 return address + 32 (the register home area of the Microsoft x64 convention) */
+ movzx r8d, r8b /* keep only the low byte of r8 */
+ shl rax, 32
+ add r8, rax /* r8 = (stack byte << 32) | r8b */
+ vmovq xmm3, r9
+ vmovq xmm4, r8
+ vpunpcklqdq xmm3, xmm3, xmm4 /* row 3 = { r9 low dword, r9 high dword, r8b, stack byte } */
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 = the four IV words */
+ vmovups xmm8, xmmword ptr [rdx]
+ vmovups xmm9, xmmword ptr [rdx+0x10]
+ vshufps xmm4, xmm8, xmm9, 136 /* xmm4 = message words 0, 2, 4, 6 */
+ vshufps xmm5, xmm8, xmm9, 221 /* xmm5 = message words 1, 3, 5, 7 */
+ vmovups xmm8, xmmword ptr [rdx+0x20]
+ vmovups xmm9, xmmword ptr [rdx+0x30]
+ vshufps xmm6, xmm8, xmm9, 136 /* xmm6 = message words 8, 10, 12, 14 */
+ vshufps xmm7, xmm8, xmm9, 221 /* xmm7 = message words 9, 11, 13, 15 */
+ vpshufd xmm6, xmm6, 0x93 /* rotate lanes by one: xmm6 = words 14, 8, 10, 12 */
+ vpshufd xmm7, xmm7, 0x93 /* likewise: xmm7 = words 15, 9, 11, 13 */
+ mov al, 7 /* round counter: the loop body at 9: executes 7 times */
+9:
+ vpaddd xmm0, xmm0, xmm4 /* first half-round: row0 += message words in xmm4, then += row1 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16 /* row3 = (row3 ^ row0) rotated right by 16 */
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12 /* row1 = (row1 ^ row2) rotated right by 12 */
+ vpaddd xmm0, xmm0, xmm5 /* same add/xor/rotate sequence with xmm5 and rotations 8 and 7 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93 /* rotate the lanes of row 0 / row 3 / row 2 by one / two / three positions before the second half-round */
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6 /* second half-round: identical sequence using message words in xmm6 and xmm7 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39 /* undo the lane rotations applied before the second half-round */
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* leave after the 7th round; the message shuffle below therefore runs only between rounds */
+ vshufps xmm8, xmm4, xmm5, 214 /* rearrange the message words held in xmm4-xmm7 for the next round (xmm8, xmm9 are scratch) */
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2 /* new state words 0-3 = row0 ^ row2 */
+ vpxor xmm1, xmm1, xmm3 /* new state words 4-7 = row1 ^ row3 */
+ vmovdqu xmmword ptr [rcx], xmm0 /* write the result back over the input state */
+ vmovdqu xmmword ptr [rcx+0x10], xmm1
+ vmovdqa xmm6, xmmword ptr [rsp] /* restore the saved xmm6-xmm9 */
+ vmovdqa xmm7, xmmword ptr [rsp+0x10]
+ vmovdqa xmm8, xmmword ptr [rsp+0x20]
+ vmovdqa xmm9, xmmword ptr [rsp+0x30]
+ add rsp, 72
+ ret
+
+
+.p2align 6
+_blake3_compress_xof_avx512: /* Compresses one 64-byte message block against the 32-byte state at [rcx] and writes a 64-byte result to the pointer read from [rsp+0x78]; the state at [rcx] is only read. */
+blake3_compress_xof_avx512: /* Reads rcx = state, rdx = message block, and r8b / r9 / the byte at [rsp+0x70] as parameter words (presumably block_len / counter / flags - confirm against the C prototype). */
+ sub rsp, 72 /* 64 bytes of XMM save area plus 8; NOTE(review): the extra 8 presumably restores 16-byte rsp alignment for the vmovdqa saves - confirm */
+ vmovdqa xmmword ptr [rsp], xmm6 /* xmm6-xmm9 are clobbered below, so save them (they are callee-saved under the Microsoft x64 convention) */
+ vmovdqa xmmword ptr [rsp+0x10], xmm7
+ vmovdqa xmmword ptr [rsp+0x20], xmm8
+ vmovdqa xmmword ptr [rsp+0x30], xmm9
+ vmovdqu xmm0, xmmword ptr [rcx] /* row 0 = state words 0-3 */
+ vmovdqu xmm1, xmmword ptr [rcx+0x10] /* row 1 = state words 4-7 */
+ movzx eax, byte ptr [rsp+0x70] /* byte-sized stack argument: 0x70 = 72 local + 8 return address + 32 (the register home area of the Microsoft x64 convention) */
+ movzx r8d, r8b /* keep only the low byte of r8 */
+ mov r10, qword ptr [rsp+0x78] /* r10 = output pointer, taken from the stack slot after the byte argument */
+ shl rax, 32
+ add r8, rax /* r8 = (stack byte << 32) | r8b */
+ vmovq xmm3, r9
+ vmovq xmm4, r8
+ vpunpcklqdq xmm3, xmm3, xmm4 /* row 3 = { r9 low dword, r9 high dword, r8b, stack byte } */
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* row 2 = the four IV words */
+ vmovups xmm8, xmmword ptr [rdx]
+ vmovups xmm9, xmmword ptr [rdx+0x10]
+ vshufps xmm4, xmm8, xmm9, 136 /* xmm4 = message words 0, 2, 4, 6 */
+ vshufps xmm5, xmm8, xmm9, 221 /* xmm5 = message words 1, 3, 5, 7 */
+ vmovups xmm8, xmmword ptr [rdx+0x20]
+ vmovups xmm9, xmmword ptr [rdx+0x30]
+ vshufps xmm6, xmm8, xmm9, 136 /* xmm6 = message words 8, 10, 12, 14 */
+ vshufps xmm7, xmm8, xmm9, 221 /* xmm7 = message words 9, 11, 13, 15 */
+ vpshufd xmm6, xmm6, 0x93 /* rotate lanes by one: xmm6 = words 14, 8, 10, 12 */
+ vpshufd xmm7, xmm7, 0x93 /* likewise: xmm7 = words 15, 9, 11, 13 */
+ mov al, 7 /* round counter: the loop body at 9: executes 7 times */
+9:
+ vpaddd xmm0, xmm0, xmm4 /* first half-round: row0 += message words in xmm4, then += row1 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16 /* row3 = (row3 ^ row0) rotated right by 16 */
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12 /* row1 = (row1 ^ row2) rotated right by 12 */
+ vpaddd xmm0, xmm0, xmm5 /* same add/xor/rotate sequence with xmm5 and rotations 8 and 7 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93 /* rotate the lanes of row 0 / row 3 / row 2 by one / two / three positions before the second half-round */
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6 /* second half-round: identical sequence using message words in xmm6 and xmm7 */
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39 /* undo the lane rotations applied before the second half-round */
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* leave after the 7th round; the message shuffle below therefore runs only between rounds */
+ vshufps xmm8, xmm4, xmm5, 214 /* rearrange the message words held in xmm4-xmm7 for the next round (xmm8, xmm9 are scratch) */
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2 /* output words 0-3 = row0 ^ row2 (uses row2 before it is modified below) */
+ vpxor xmm1, xmm1, xmm3 /* output words 4-7 = row1 ^ row3 */
+ vpxor xmm2, xmm2, xmmword ptr [rcx] /* output words 8-11 = row2 ^ input state words 0-3 */
+ vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] /* output words 12-15 = row3 ^ input state words 4-7 */
+ vmovdqu xmmword ptr [r10], xmm0 /* store all 64 output bytes through r10 */
+ vmovdqu xmmword ptr [r10+0x10], xmm1
+ vmovdqu xmmword ptr [r10+0x20], xmm2
+ vmovdqu xmmword ptr [r10+0x30], xmm3
+ vmovdqa xmm6, xmmword ptr [rsp] /* restore the saved xmm6-xmm9 */
+ vmovdqa xmm7, xmmword ptr [rsp+0x10]
+ vmovdqa xmm8, xmmword ptr [rsp+0x20]
+ vmovdqa xmm9, xmmword ptr [rsp+0x30]
+ add rsp, 72
+ ret
+
+.section .rodata
+.p2align 6 /* 64-byte alignment for the tables that follow */
+INDEX0: /* 16 dword indices. NOTE(review): presumably a two-source permute selector (values 16-31 would pick from the second source) - its users are outside this section; confirm. */
+ .long 0, 1, 2, 3, 16, 17, 18, 19
+ .long 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1: /* companion 16-dword index table: every entry equals the matching INDEX0 entry plus 4 */
+ .long 4, 5, 6, 7, 20, 21, 22, 23
+ .long 12, 13, 14, 15, 28, 29, 30, 31
+ADD0: /* the dwords 0..15 in order; presumably per-lane counter offsets - confirm at the use site */
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ .long 8, 9, 10, 11, 12, 13, 14, 15
+ADD1: .long 1 /* the dword constant 1 */
+
+ADD16: .long 16 /* the dword constant 16 */
+BLAKE3_BLOCK_LEN: /* the dword 64; the hash-many code inserts or broadcasts it into its parameter vectors */
+ .long 64
+.p2align 6 /* BLAKE3_IV is loaded with aligned moves (vmovaps / vmovdqa), so it needs at least 16-byte alignment */
+BLAKE3_IV: /* four consecutive IV dwords; each is also addressable on its own through the BLAKE3_IV_n labels */
+BLAKE3_IV_0:
+ .long 0x6A09E667 /* these four values match the first four SHA-256 initial hash words */
+BLAKE3_IV_1:
+ .long 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A
diff --git a/deps/blake3/blake3_avx512_x86-64_windows_msvc.asm b/deps/blake3/blake3_avx512_x86-64_windows_msvc.asm
@@ -0,0 +1,2634 @@
+public _blake3_hash_many_avx512
+public blake3_hash_many_avx512
+public blake3_compress_in_place_avx512
+public _blake3_compress_in_place_avx512
+public blake3_compress_xof_avx512
+public _blake3_compress_xof_avx512
+
+_TEXT SEGMENT ALIGN(16) 'CODE'
+
+ALIGN 16
+blake3_hash_many_avx512 PROC
+_blake3_hash_many_avx512 PROC
+ push r15
+ push r14
+ push r13
+ push r12
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 304
+ and rsp, 0FFFFFFFFFFFFFFC0H
+ vmovdqa xmmword ptr [rsp+90H], xmm6
+ vmovdqa xmmword ptr [rsp+0A0H], xmm7
+ vmovdqa xmmword ptr [rsp+0B0H], xmm8
+ vmovdqa xmmword ptr [rsp+0C0H], xmm9
+ vmovdqa xmmword ptr [rsp+0D0H], xmm10
+ vmovdqa xmmword ptr [rsp+0E0H], xmm11
+ vmovdqa xmmword ptr [rsp+0F0H], xmm12
+ vmovdqa xmmword ptr [rsp+100H], xmm13
+ vmovdqa xmmword ptr [rsp+110H], xmm14
+ vmovdqa xmmword ptr [rsp+120H], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+68H]
+ movzx r9, byte ptr [rbp+70H]
+ neg r9
+ kmovw k1, r9d
+ vmovd xmm0, r8d
+ vpbroadcastd ymm0, xmm0
+ shr r8, 32
+ vmovd xmm1, r8d
+ vpbroadcastd ymm1, xmm1
+ vmovdqa ymm4, ymm1
+ vmovdqa ymm5, ymm1
+ vpaddd ymm2, ymm0, ymmword ptr [ADD0]
+ vpaddd ymm3, ymm0, ymmword ptr [ADD0+32]
+ vpcmpud k2, ymm2, ymm0, 1
+ vpcmpud k3, ymm3, ymm0, 1
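+ ; XXX: ml64.exe does not currently understand the embedded-broadcast operand syntax ({1to8}) used in the commented-out instructions below, so ADD1 is broadcast into ymm6 first as a workaround.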
+ vpbroadcastd ymm6, dword ptr [ADD1]
+ vpaddd ymm4 {k2}, ymm4, ymm6
+ vpaddd ymm5 {k3}, ymm5, ymm6
+ ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8}
+ ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8}
+ knotw k2, k1
+ vmovdqa32 ymm2 {k2}, ymm0
+ vmovdqa32 ymm3 {k2}, ymm0
+ vmovdqa32 ymm4 {k2}, ymm1
+ vmovdqa32 ymm5 {k2}, ymm1
+ vmovdqa ymmword ptr [rsp], ymm2
+ vmovdqa ymmword ptr [rsp+20H], ymm3
+ vmovdqa ymmword ptr [rsp+40H], ymm4
+ vmovdqa ymmword ptr [rsp+60H], ymm5
+ shl rdx, 6
+ mov qword ptr [rsp+80H], rdx
+ cmp rsi, 16
+ jc final15blocks
+outerloop16:
+ vpbroadcastd zmm0, dword ptr [rcx]
+ vpbroadcastd zmm1, dword ptr [rcx+1H*4H]
+ vpbroadcastd zmm2, dword ptr [rcx+2H*4H]
+ vpbroadcastd zmm3, dword ptr [rcx+3H*4H]
+ vpbroadcastd zmm4, dword ptr [rcx+4H*4H]
+ vpbroadcastd zmm5, dword ptr [rcx+5H*4H]
+ vpbroadcastd zmm6, dword ptr [rcx+6H*4H]
+ vpbroadcastd zmm7, dword ptr [rcx+7H*4H]
+ movzx eax, byte ptr [rbp+78H]
+ movzx ebx, byte ptr [rbp+80H]
+ or eax, ebx
+ xor edx, edx
+ALIGN 16
+innerloop16:
+ movzx ebx, byte ptr [rbp+88H]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+80H]
+ cmove eax, ebx
+ mov dword ptr [rsp+88H], eax
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ mov r12, qword ptr [rdi+40H]
+ mov r13, qword ptr [rdi+48H]
+ mov r14, qword ptr [rdi+50H]
+ mov r15, qword ptr [rdi+58H]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
+ vpunpcklqdq zmm8, zmm16, zmm17
+ vpunpckhqdq zmm9, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
+ vpunpcklqdq zmm10, zmm18, zmm19
+ vpunpckhqdq zmm11, zmm18, zmm19
+ mov r8, qword ptr [rdi+20H]
+ mov r9, qword ptr [rdi+28H]
+ mov r10, qword ptr [rdi+30H]
+ mov r11, qword ptr [rdi+38H]
+ mov r12, qword ptr [rdi+60H]
+ mov r13, qword ptr [rdi+68H]
+ mov r14, qword ptr [rdi+70H]
+ mov r15, qword ptr [rdi+78H]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
+ vpunpcklqdq zmm12, zmm16, zmm17
+ vpunpckhqdq zmm13, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
+ vpunpcklqdq zmm14, zmm18, zmm19
+ vpunpckhqdq zmm15, zmm18, zmm19
+ vmovdqa32 zmm27, zmmword ptr [INDEX0]
+ vmovdqa32 zmm31, zmmword ptr [INDEX1]
+ vshufps zmm16, zmm8, zmm10, 136
+ vshufps zmm17, zmm12, zmm14, 136
+ vmovdqa32 zmm20, zmm16
+ vpermt2d zmm16, zmm27, zmm17
+ vpermt2d zmm20, zmm31, zmm17
+ vshufps zmm17, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm21, zmm17
+ vpermt2d zmm17, zmm27, zmm30
+ vpermt2d zmm21, zmm31, zmm30
+ vshufps zmm18, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm22, zmm18
+ vpermt2d zmm18, zmm27, zmm8
+ vpermt2d zmm22, zmm31, zmm8
+ vshufps zmm19, zmm9, zmm11, 221
+ vshufps zmm8, zmm13, zmm15, 221
+ vmovdqa32 zmm23, zmm19
+ vpermt2d zmm19, zmm27, zmm8
+ vpermt2d zmm23, zmm31, zmm8
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ mov r12, qword ptr [rdi+40H]
+ mov r13, qword ptr [rdi+48H]
+ mov r14, qword ptr [rdi+50H]
+ mov r15, qword ptr [rdi+58H]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
+ vpunpcklqdq zmm8, zmm24, zmm25
+ vpunpckhqdq zmm9, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
+ vpunpcklqdq zmm10, zmm24, zmm25
+ vpunpckhqdq zmm11, zmm24, zmm25
+ prefetcht0 byte ptr [r8+rdx+80H]
+ prefetcht0 byte ptr [r12+rdx+80H]
+ prefetcht0 byte ptr [r9+rdx+80H]
+ prefetcht0 byte ptr [r13+rdx+80H]
+ prefetcht0 byte ptr [r10+rdx+80H]
+ prefetcht0 byte ptr [r14+rdx+80H]
+ prefetcht0 byte ptr [r11+rdx+80H]
+ prefetcht0 byte ptr [r15+rdx+80H]
+ mov r8, qword ptr [rdi+20H]
+ mov r9, qword ptr [rdi+28H]
+ mov r10, qword ptr [rdi+30H]
+ mov r11, qword ptr [rdi+38H]
+ mov r12, qword ptr [rdi+60H]
+ mov r13, qword ptr [rdi+68H]
+ mov r14, qword ptr [rdi+70H]
+ mov r15, qword ptr [rdi+78H]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
+ vpunpcklqdq zmm12, zmm24, zmm25
+ vpunpckhqdq zmm13, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
+ vpunpcklqdq zmm14, zmm24, zmm25
+ vpunpckhqdq zmm15, zmm24, zmm25
+ prefetcht0 byte ptr [r8+rdx+80H]
+ prefetcht0 byte ptr [r12+rdx+80H]
+ prefetcht0 byte ptr [r9+rdx+80H]
+ prefetcht0 byte ptr [r13+rdx+80H]
+ prefetcht0 byte ptr [r10+rdx+80H]
+ prefetcht0 byte ptr [r14+rdx+80H]
+ prefetcht0 byte ptr [r11+rdx+80H]
+ prefetcht0 byte ptr [r15+rdx+80H]
+ vshufps zmm24, zmm8, zmm10, 136
+ vshufps zmm30, zmm12, zmm14, 136
+ vmovdqa32 zmm28, zmm24
+ vpermt2d zmm24, zmm27, zmm30
+ vpermt2d zmm28, zmm31, zmm30
+ vshufps zmm25, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm29, zmm25
+ vpermt2d zmm25, zmm27, zmm30
+ vpermt2d zmm29, zmm31, zmm30
+ vshufps zmm26, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm30, zmm26
+ vpermt2d zmm26, zmm27, zmm8
+ vpermt2d zmm30, zmm31, zmm8
+ vshufps zmm8, zmm9, zmm11, 221
+ vshufps zmm10, zmm13, zmm15, 221
+ vpermi2d zmm27, zmm8, zmm10
+ vpermi2d zmm31, zmm8, zmm10
+ vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0]
+ vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1]
+ vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2]
+ vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3]
+ vmovdqa32 zmm12, zmmword ptr [rsp]
+ vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H]
+ vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN]
+ vpbroadcastd zmm15, dword ptr [rsp+22H*4H]
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm24
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm23
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm27
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm21
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm28
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm26
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm22
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm31
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpxord zmm0, zmm0, zmm8
+ vpxord zmm1, zmm1, zmm9
+ vpxord zmm2, zmm2, zmm10
+ vpxord zmm3, zmm3, zmm11
+ vpxord zmm4, zmm4, zmm12
+ vpxord zmm5, zmm5, zmm13
+ vpxord zmm6, zmm6, zmm14
+ vpxord zmm7, zmm7, zmm15
+ movzx eax, byte ptr [rbp+78H]
+ jne innerloop16
+ mov rbx, qword ptr [rbp+90H]
+ vpunpckldq zmm16, zmm0, zmm1
+ vpunpckhdq zmm17, zmm0, zmm1
+ vpunpckldq zmm18, zmm2, zmm3
+ vpunpckhdq zmm19, zmm2, zmm3
+ vpunpckldq zmm20, zmm4, zmm5
+ vpunpckhdq zmm21, zmm4, zmm5
+ vpunpckldq zmm22, zmm6, zmm7
+ vpunpckhdq zmm23, zmm6, zmm7
+ vpunpcklqdq zmm0, zmm16, zmm18
+ vpunpckhqdq zmm1, zmm16, zmm18
+ vpunpcklqdq zmm2, zmm17, zmm19
+ vpunpckhqdq zmm3, zmm17, zmm19
+ vpunpcklqdq zmm4, zmm20, zmm22
+ vpunpckhqdq zmm5, zmm20, zmm22
+ vpunpcklqdq zmm6, zmm21, zmm23
+ vpunpckhqdq zmm7, zmm21, zmm23
+ vshufi32x4 zmm16, zmm0, zmm4, 88H
+ vshufi32x4 zmm17, zmm1, zmm5, 88H
+ vshufi32x4 zmm18, zmm2, zmm6, 88H
+ vshufi32x4 zmm19, zmm3, zmm7, 88H
+ vshufi32x4 zmm20, zmm0, zmm4, 0DDH
+ vshufi32x4 zmm21, zmm1, zmm5, 0DDH
+ vshufi32x4 zmm22, zmm2, zmm6, 0DDH
+ vshufi32x4 zmm23, zmm3, zmm7, 0DDH
+ vshufi32x4 zmm0, zmm16, zmm17, 88H
+ vshufi32x4 zmm1, zmm18, zmm19, 88H
+ vshufi32x4 zmm2, zmm20, zmm21, 88H
+ vshufi32x4 zmm3, zmm22, zmm23, 88H
+ vshufi32x4 zmm4, zmm16, zmm17, 0DDH
+ vshufi32x4 zmm5, zmm18, zmm19, 0DDH
+ vshufi32x4 zmm6, zmm20, zmm21, 0DDH
+ vshufi32x4 zmm7, zmm22, zmm23, 0DDH
+ vmovdqu32 zmmword ptr [rbx], zmm0
+ vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1
+ vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2
+ vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3
+ vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4
+ vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5
+ vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6
+ vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7
+ vmovdqa32 zmm0, zmmword ptr [rsp]
+ vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H]
+ vmovdqa32 zmm2, zmm0
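+ ; XXX: ml64.exe does not currently understand the {1to16} embedded-broadcast memory operand syntax (see the commented-out vpaddd forms below). As a workaround, ADD16 and ADD1 are first broadcast into zmm4 and zmm5 and used as register operands.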
+ vpbroadcastd zmm4, dword ptr [ADD16]
+ vpbroadcastd zmm5, dword ptr [ADD1]
+ vpaddd zmm2{k1}, zmm0, zmm4
+ ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16}
+ vpcmpud k2, zmm2, zmm0, 1
+ vpaddd zmm1 {k2}, zmm1, zmm5
+ ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16}
+ vmovdqa32 zmmword ptr [rsp], zmm2
+ vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1
+ add rdi, 128
+ add rbx, 512
+ mov qword ptr [rbp+90H], rbx
+ sub rsi, 16
+ cmp rsi, 16
+ jnc outerloop16
+ test rsi, rsi
+ jne final15blocks
+unwind:
+ vzeroupper
+ vmovdqa xmm6, xmmword ptr [rsp+90H]
+ vmovdqa xmm7, xmmword ptr [rsp+0A0H]
+ vmovdqa xmm8, xmmword ptr [rsp+0B0H]
+ vmovdqa xmm9, xmmword ptr [rsp+0C0H]
+ vmovdqa xmm10, xmmword ptr [rsp+0D0H]
+ vmovdqa xmm11, xmmword ptr [rsp+0E0H]
+ vmovdqa xmm12, xmmword ptr [rsp+0F0H]
+ vmovdqa xmm13, xmmword ptr [rsp+100H]
+ vmovdqa xmm14, xmmword ptr [rsp+110H]
+ vmovdqa xmm15, xmmword ptr [rsp+120H]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+ALIGN 16
+final15blocks:
+ test esi, 8H
+ je final7blocks
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+4H]
+ vpbroadcastd ymm2, dword ptr [rcx+8H]
+ vpbroadcastd ymm3, dword ptr [rcx+0CH]
+ vpbroadcastd ymm4, dword ptr [rcx+10H]
+ vpbroadcastd ymm5, dword ptr [rcx+14H]
+ vpbroadcastd ymm6, dword ptr [rcx+18H]
+ vpbroadcastd ymm7, dword ptr [rcx+1CH]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ mov r12, qword ptr [rdi+20H]
+ mov r13, qword ptr [rdi+28H]
+ mov r14, qword ptr [rdi+30H]
+ mov r15, qword ptr [rdi+38H]
+ movzx eax, byte ptr [rbp+78H]
+ movzx ebx, byte ptr [rbp+80H]
+ or eax, ebx
+ xor edx, edx
+innerloop8:
+ movzx ebx, byte ptr [rbp+88H]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+80H]
+ cmove eax, ebx
+ mov dword ptr [rsp+88H], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-40H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-40H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-40H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-40H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm16, ymm12, ymm14, 136
+ vshufps ymm17, ymm12, ymm14, 221
+ vshufps ymm18, ymm13, ymm15, 136
+ vshufps ymm19, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-30H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-30H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-30H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-30H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm20, ymm12, ymm14, 136
+ vshufps ymm21, ymm12, ymm14, 221
+ vshufps ymm22, ymm13, ymm15, 136
+ vshufps ymm23, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-20H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-20H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-20H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-20H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm24, ymm12, ymm14, 136
+ vshufps ymm25, ymm12, ymm14, 221
+ vshufps ymm26, ymm13, ymm15, 136
+ vshufps ymm27, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-10H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H
+ vmovups xmm9, xmmword ptr [r9+rdx-10H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-10H]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H
+ vmovups xmm11, xmmword ptr [r11+rdx-10H]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm28, ymm12, ymm14, 136
+ vshufps ymm29, ymm12, ymm14, 221
+ vshufps ymm30, ymm13, ymm15, 136
+ vshufps ymm31, ymm13, ymm15, 221
+ vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0]
+ vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1]
+ vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2]
+ vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3]
+ vmovdqa ymm12, ymmword ptr [rsp]
+ vmovdqa ymm13, ymmword ptr [rsp+40H]
+ vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN]
+ vpbroadcastd ymm15, dword ptr [rsp+88H]
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm24
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm23
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm27
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm21
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm28
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm26
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm22
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm31
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+78H]
+ jne innerloop8
+ mov rbx, qword ptr [rbp+90H]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0CCH
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0CCH
+ vblendps ymm3, ymm12, ymm9, 0CCH
+ vperm2f128 ymm12, ymm1, ymm2, 20H
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0CCH
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 20H
+ vmovups ymmword ptr [rbx+20H], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0CCH
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0CCH
+ vblendps ymm14, ymm14, ymm13, 0CCH
+ vperm2f128 ymm8, ymm10, ymm14, 20H
+ vmovups ymmword ptr [rbx+40H], ymm8
+ vblendps ymm15, ymm13, ymm15, 0CCH
+ vperm2f128 ymm13, ymm6, ymm15, 20H
+ vmovups ymmword ptr [rbx+60H], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 31H
+ vperm2f128 ymm11, ymm3, ymm4, 31H
+ vmovups ymmword ptr [rbx+80H], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 31H
+ vperm2f128 ymm15, ymm6, ymm15, 31H
+ vmovups ymmword ptr [rbx+0A0H], ymm11
+ vmovups ymmword ptr [rbx+0C0H], ymm14
+ vmovups ymmword ptr [rbx+0E0H], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp]
+ vmovdqa ymm2, ymmword ptr [rsp+40H]
+ vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H]
+ vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H]
+ vmovdqa ymmword ptr [rsp], ymm0
+ vmovdqa ymmword ptr [rsp+40H], ymm2
+ add rbx, 256
+ mov qword ptr [rbp+90H], rbx
+ add rdi, 64
+ sub rsi, 8
+final7blocks:
+ mov rbx, qword ptr [rbp+90H]
+ mov r15, qword ptr [rsp+80H]
+ movzx r13, byte ptr [rbp+78H]
+ movzx r12, byte ptr [rbp+88H]
+ test esi, 4H
+ je final3blocks
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+ vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H]
+ vmovdqa xmm12, xmmword ptr [rsp]
+ vmovdqa xmm13, xmmword ptr [rsp+40H]
+ vpunpckldq xmm14, xmm12, xmm13
+ vpunpckhdq xmm15, xmm12, xmm13
+ vpermq ymm14, ymm14, 0DCH
+ vpermq ymm15, ymm15, 0DCH
+ vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
+ vinserti64x4 zmm13, zmm14, ymm15, 01H
+ mov eax, 17476
+ kmovw k2, eax
+ vpblendmd zmm13 {k2}, zmm13, zmm12
+ vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ mov eax, 43690
+ kmovw k3, eax
+ mov eax, 34952
+ kmovw k4, eax
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+ALIGN 16
+innerloop4:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+88H], eax
+ vmovdqa32 zmm2, zmm15
+ vpbroadcastd zmm8, dword ptr [rsp+22H*4H]
+ vpblendmd zmm3 {k4}, zmm13, zmm8
+ vmovups zmm8, zmmword ptr [r8+rdx-1H*40H]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H
+ vmovups zmm9, zmmword ptr [r8+rdx-30H]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H
+ vshufps zmm4, zmm8, zmm9, 136
+ vshufps zmm5, zmm8, zmm9, 221
+ vmovups zmm8, zmmword ptr [r8+rdx-20H]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H
+ vmovups zmm9, zmmword ptr [r8+rdx-10H]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H
+ vshufps zmm6, zmm8, zmm9, 136
+ vshufps zmm7, zmm8, zmm9, 221
+ vpshufd zmm6, zmm6, 93H
+ vpshufd zmm7, zmm7, 93H
+ mov al, 7
+roundloop4:
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 93H
+ vpshufd zmm3, zmm3, 4EH
+ vpshufd zmm2, zmm2, 39H
+ vpaddd zmm0, zmm0, zmm6
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm7
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 39H
+ vpshufd zmm3, zmm3, 4EH
+ vpshufd zmm2, zmm2, 93H
+ dec al
+ jz endroundloop4
+ vshufps zmm8, zmm4, zmm5, 214
+ vpshufd zmm9, zmm4, 0FH
+ vpshufd zmm4, zmm8, 39H
+ vshufps zmm8, zmm6, zmm7, 250
+ vpblendmd zmm9 {k3}, zmm9, zmm8
+ vpunpcklqdq zmm8, zmm7, zmm5
+ vpblendmd zmm8 {k4}, zmm8, zmm6
+ vpshufd zmm8, zmm8, 78H
+ vpunpckhdq zmm5, zmm5, zmm7
+ vpunpckldq zmm6, zmm6, zmm5
+ vpshufd zmm7, zmm6, 1EH
+ vmovdqa32 zmm5, zmm9
+ vmovdqa32 zmm6, zmm8
+ jmp roundloop4
+endroundloop4:
+ vpxord zmm0, zmm0, zmm2
+ vpxord zmm1, zmm1, zmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop4
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+10H], xmm1
+ vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
+ vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
+ vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H
+ vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H
+ vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H
+ vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+40H]
+ vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H]
+ vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+40H], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+final3blocks:
+ test esi, 2H
+ je final1block
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+10H]
+ vmovd xmm13, dword ptr [rsp]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+ vmovd xmm14, dword ptr [rsp+4H]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
+ vinserti128 ymm13, ymm13, xmm14, 01H
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+ALIGN 16
+innerloop2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+88H], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV]
+ vpbroadcastd ymm8, dword ptr [rsp+88H]
+ vpblendd ymm3, ymm13, ymm8, 88H
+ vmovups ymm8, ymmword ptr [r8+rdx-40H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H
+ vmovups ymm9, ymmword ptr [r8+rdx-30H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-20H]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H
+ vmovups ymm9, ymmword ptr [r8+rdx-10H]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 93H
+ vpshufd ymm7, ymm7, 93H
+ mov al, 7
+roundloop2:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 93H
+ vpshufd ymm3, ymm3, 4EH
+ vpshufd ymm2, ymm2, 39H
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 39H
+ vpshufd ymm3, ymm3, 4EH
+ vpshufd ymm2, ymm2, 93H
+ dec al
+ jz endroundloop2
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0FH
+ vpshufd ymm4, ymm8, 39H
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0AAH
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 88H
+ vpshufd ymm8, ymm8, 78H
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 1EH
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp roundloop2
+endroundloop2:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop2
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+10H], xmm1
+ vextracti128 xmmword ptr [rbx+20H], ymm0, 01H
+ vextracti128 xmmword ptr [rbx+30H], ymm1, 01H
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+40H]
+ vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H]
+ vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+40H], xmm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+final1block:
+ test esi, 1H
+ je unwind
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+10H]
+ vmovd xmm14, dword ptr [rsp]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
+ vmovdqa xmm15, xmmword ptr [BLAKE3_IV]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+ALIGN 16
+innerloop1:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vpinsrd xmm3, xmm14, eax, 3
+ vmovdqa xmm2, xmm15
+ vmovups xmm8, xmmword ptr [r8+rdx-40H]
+ vmovups xmm9, xmmword ptr [r8+rdx-30H]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-20H]
+ vmovups xmm9, xmmword ptr [r8+rdx-10H]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 93H
+ vpshufd xmm7, xmm7, 93H
+ mov al, 7
+roundloop1:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 93H
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 39H
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 39H
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 93H
+ dec al
+ jz endroundloop1
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0FH
+ vpshufd xmm4, xmm8, 39H
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0AAH
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 88H
+ vpshufd xmm8, xmm8, 78H
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 1EH
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp roundloop1
+endroundloop1:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop1
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+10H], xmm1
+ jmp unwind
+
+_blake3_hash_many_avx512 ENDP
+blake3_hash_many_avx512 ENDP
+
+ALIGN 16
+blake3_compress_in_place_avx512 PROC ; Mixes the 32-byte state at [rcx] with the 64-byte block at [rdx] for 7 rounds and writes 32 bytes back to [rcx].
+_blake3_compress_in_place_avx512 PROC ; second PROC label on the same code (underscore-prefixed name)
+ sub rsp, 72 ; 64 bytes are used for the XMM saves below; presumably 72 keeps rsp 16-byte aligned for vmovdqa - TODO confirm
+ vmovdqa xmmword ptr [rsp], xmm6 ; save xmm6-xmm9; they are reloaded just before ret
+ vmovdqa xmmword ptr [rsp+10H], xmm7
+ vmovdqa xmmword ptr [rsp+20H], xmm8
+ vmovdqa xmmword ptr [rsp+30H], xmm9
+ vmovdqu xmm0, xmmword ptr [rcx] ; row 0 = first 16 bytes at rcx
+ vmovdqu xmm1, xmmword ptr [rcx+10H] ; row 1 = next 16 bytes at rcx
+ movzx eax, byte ptr [rsp+70H] ; byte argument read from the stack; presumably `flags` (5th C argument, Microsoft x64 convention) - TODO confirm
+ movzx r8d, r8b ; keep only the low byte of r8 (presumably block_len)
+ shl rax, 32
+ add r8, rax ; r8 = (stack byte << 32) | low byte of r8
+ vmovq xmm3, r9 ; low qword of row 3 = r9 (presumably the 64-bit counter)
+ vmovq xmm4, r8
+ vpunpcklqdq xmm3, xmm3, xmm4 ; row 3 = { r9 low dword, r9 high dword, r8 byte, stack byte }
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV] ; row 2 = the four dwords at BLAKE3_IV (aligned load)
+ vmovups xmm8, xmmword ptr [rdx] ; block dwords 0-3
+ vmovups xmm9, xmmword ptr [rdx+10H] ; block dwords 4-7
+ vshufps xmm4, xmm8, xmm9, 136 ; imm 88H: xmm4 = block dwords 0, 2, 4, 6
+ vshufps xmm5, xmm8, xmm9, 221 ; imm 0DDH: xmm5 = block dwords 1, 3, 5, 7
+ vmovups xmm8, xmmword ptr [rdx+20H] ; block dwords 8-11
+ vmovups xmm9, xmmword ptr [rdx+30H] ; block dwords 12-15
+ vshufps xmm6, xmm8, xmm9, 136 ; block dwords 8, 10, 12, 14
+ vshufps xmm7, xmm8, xmm9, 221 ; block dwords 9, 11, 13, 15
+ vpshufd xmm6, xmm6, 93H ; rotate lanes by one: xmm6 = dwords 14, 8, 10, 12
+ vpshufd xmm7, xmm7, 93H ; rotate lanes by one: xmm7 = dwords 15, 9, 11, 13
+ mov al, 7 ; round counter; overwrites the byte loaded into eax above, which is already merged into r8
+@@: ; top of the round loop
+ vpaddd xmm0, xmm0, xmm4 ; row0 += message words in xmm4
+ vpaddd xmm0, xmm0, xmm1 ; row0 += row1
+ vpxord xmm3, xmm3, xmm0 ; row3 ^= row0
+ vprord xmm3, xmm3, 16 ; rotate each dword of row3 right by 16
+ vpaddd xmm2, xmm2, xmm3 ; row2 += row3
+ vpxord xmm1, xmm1, xmm2 ; row1 ^= row2
+ vprord xmm1, xmm1, 12 ; rotate each dword of row1 right by 12
+ vpaddd xmm0, xmm0, xmm5 ; same add/xor/rotate pattern with message words in xmm5, rotations 8 and 7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 93H ; rotate the lanes of rows 0, 3 and 2 (by one, two and three positions) before the second half
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 39H
+ vpaddd xmm0, xmm0, xmm6 ; second half of the round: same pattern with message words in xmm6 ...
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7 ; ... and then xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 39H ; inverse lane rotations: rows 0, 3 and 2 return to their original lane order
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 93H
+ dec al ; one round finished
+ jz @F ; after the 7th round skip the message reshuffle and finish
+ vshufps xmm8, xmm4, xmm5, 214 ; reshuffle the message words in xmm4-xmm7 for the next round (xmm8/xmm9 are temporaries)
+ vpshufd xmm9, xmm4, 0FH
+ vpshufd xmm4, xmm8, 39H
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0AAH
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 88H
+ vpshufd xmm8, xmm8, 78H
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 1EH
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp @B ; next round
+@@: ; all 7 rounds done
+ vpxor xmm0, xmm0, xmm2 ; row0 ^= row2
+ vpxor xmm1, xmm1, xmm3 ; row1 ^= row3
+ vmovdqu xmmword ptr [rcx], xmm0 ; write the 32-byte result over the input state
+ vmovdqu xmmword ptr [rcx+10H], xmm1
+ vmovdqa xmm6, xmmword ptr [rsp] ; restore xmm6-xmm9 saved on entry
+ vmovdqa xmm7, xmmword ptr [rsp+10H]
+ vmovdqa xmm8, xmmword ptr [rsp+20H]
+ vmovdqa xmm9, xmmword ptr [rsp+30H]
+ add rsp, 72 ; release the save area
+ ret
+_blake3_compress_in_place_avx512 ENDP
+blake3_compress_in_place_avx512 ENDP
+
+ALIGN 16
+blake3_compress_xof_avx512 PROC ; Same 7-round mixing as the in-place variant, but writes 64 output bytes to a separate buffer and leaves [rcx] unmodified.
+_blake3_compress_xof_avx512 PROC ; second PROC label on the same code (underscore-prefixed name)
+ sub rsp, 72 ; 64 bytes are used for the XMM saves below; presumably 72 keeps rsp 16-byte aligned for vmovdqa - TODO confirm
+ vmovdqa xmmword ptr [rsp], xmm6 ; save xmm6-xmm9; they are reloaded just before ret
+ vmovdqa xmmword ptr [rsp+10H], xmm7
+ vmovdqa xmmword ptr [rsp+20H], xmm8
+ vmovdqa xmmword ptr [rsp+30H], xmm9
+ vmovdqu xmm0, xmmword ptr [rcx] ; row 0 = first 16 bytes at rcx
+ vmovdqu xmm1, xmmword ptr [rcx+10H] ; row 1 = next 16 bytes at rcx
+ movzx eax, byte ptr [rsp+70H] ; byte argument read from the stack; presumably `flags` (5th C argument, Microsoft x64 convention) - TODO confirm
+ movzx r8d, r8b ; keep only the low byte of r8 (presumably block_len)
+ mov r10, qword ptr [rsp+78H] ; r10 = pointer argument from the next stack slot; used below as the 64-byte output buffer (presumably `out`)
+ shl rax, 32
+ add r8, rax ; r8 = (stack byte << 32) | low byte of r8
+ vmovq xmm3, r9 ; low qword of row 3 = r9 (presumably the 64-bit counter)
+ vmovq xmm4, r8
+ vpunpcklqdq xmm3, xmm3, xmm4 ; row 3 = { r9 low dword, r9 high dword, r8 byte, stack byte }
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV] ; row 2 = the four dwords at BLAKE3_IV (aligned load)
+ vmovups xmm8, xmmword ptr [rdx] ; block dwords 0-3
+ vmovups xmm9, xmmword ptr [rdx+10H] ; block dwords 4-7
+ vshufps xmm4, xmm8, xmm9, 136 ; imm 88H: xmm4 = block dwords 0, 2, 4, 6
+ vshufps xmm5, xmm8, xmm9, 221 ; imm 0DDH: xmm5 = block dwords 1, 3, 5, 7
+ vmovups xmm8, xmmword ptr [rdx+20H] ; block dwords 8-11
+ vmovups xmm9, xmmword ptr [rdx+30H] ; block dwords 12-15
+ vshufps xmm6, xmm8, xmm9, 136 ; block dwords 8, 10, 12, 14
+ vshufps xmm7, xmm8, xmm9, 221 ; block dwords 9, 11, 13, 15
+ vpshufd xmm6, xmm6, 93H ; rotate lanes by one: xmm6 = dwords 14, 8, 10, 12
+ vpshufd xmm7, xmm7, 93H ; rotate lanes by one: xmm7 = dwords 15, 9, 11, 13
+ mov al, 7 ; round counter; overwrites the byte loaded into eax above, which is already merged into r8
+@@: ; top of the round loop
+ vpaddd xmm0, xmm0, xmm4 ; row0 += message words in xmm4
+ vpaddd xmm0, xmm0, xmm1 ; row0 += row1
+ vpxord xmm3, xmm3, xmm0 ; row3 ^= row0
+ vprord xmm3, xmm3, 16 ; rotate each dword of row3 right by 16
+ vpaddd xmm2, xmm2, xmm3 ; row2 += row3
+ vpxord xmm1, xmm1, xmm2 ; row1 ^= row2
+ vprord xmm1, xmm1, 12 ; rotate each dword of row1 right by 12
+ vpaddd xmm0, xmm0, xmm5 ; same add/xor/rotate pattern with message words in xmm5, rotations 8 and 7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 93H ; rotate the lanes of rows 0, 3 and 2 (by one, two and three positions) before the second half
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 39H
+ vpaddd xmm0, xmm0, xmm6 ; second half of the round: same pattern with message words in xmm6 ...
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7 ; ... and then xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 39H ; inverse lane rotations: rows 0, 3 and 2 return to their original lane order
+ vpshufd xmm3, xmm3, 4EH
+ vpshufd xmm2, xmm2, 93H
+ dec al ; one round finished
+ jz @F ; after the 7th round skip the message reshuffle and finish
+ vshufps xmm8, xmm4, xmm5, 214 ; reshuffle the message words in xmm4-xmm7 for the next round (xmm8/xmm9 are temporaries)
+ vpshufd xmm9, xmm4, 0FH
+ vpshufd xmm4, xmm8, 39H
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0AAH
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 88H
+ vpshufd xmm8, xmm8, 78H
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 1EH
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp @B ; next round
+@@: ; all 7 rounds done
+ vpxor xmm0, xmm0, xmm2 ; output bytes 0-15 = row0 ^ row2
+ vpxor xmm1, xmm1, xmm3 ; output bytes 16-31 = row1 ^ row3
+ vpxor xmm2, xmm2, xmmword ptr [rcx] ; output bytes 32-47 = row2 ^ the original first 16 bytes at rcx
+ vpxor xmm3, xmm3, xmmword ptr [rcx+10H] ; output bytes 48-63 = row3 ^ the original next 16 bytes at rcx
+ vmovdqu xmmword ptr [r10], xmm0 ; store all 64 bytes to the buffer at r10
+ vmovdqu xmmword ptr [r10+10H], xmm1
+ vmovdqu xmmword ptr [r10+20H], xmm2
+ vmovdqu xmmword ptr [r10+30H], xmm3
+ vmovdqa xmm6, xmmword ptr [rsp] ; restore xmm6-xmm9 saved on entry
+ vmovdqa xmm7, xmmword ptr [rsp+10H]
+ vmovdqa xmm8, xmmword ptr [rsp+20H]
+ vmovdqa xmm9, xmmword ptr [rsp+30H]
+ add rsp, 72 ; release the save area
+ ret
+_blake3_compress_xof_avx512 ENDP
+blake3_compress_xof_avx512 ENDP
+
+_TEXT ENDS
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constants used by the AVX-512 routines in this file
+ALIGN 64
+INDEX0: ; 16 dword indices; NOTE(review): referenced outside the portion reviewed here - values 16-31 suggest a two-source permute, confirm at the use site
+ dd 0, 1, 2, 3, 16, 17, 18, 19
+ dd 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1: ; companion table to INDEX0 holding the remaining indices (4-7, 20-23, 12-15, 28-31)
+ dd 4, 5, 6, 7, 20, 21, 22, 23
+ dd 12, 13, 14, 15, 28, 29, 30, 31
+ADD0: ; the dwords 0..15; presumably per-lane counter offsets - TODO confirm at the use site
+ dd 0, 1, 2, 3, 4, 5, 6, 7
+ dd 8, 9, 10, 11, 12, 13, 14, 15
+ADD1: ; single dword 1
+ dd 1
+ADD16: ; single dword 16
+ dd 16
+BLAKE3_BLOCK_LEN: ; dword 64; inserted/broadcast into the state by the final-block paths of blake3_hash_many_avx512
+ dd 64
+ALIGN 64 ; BLAKE3_IV is read with aligned 16-byte loads (vmovaps / vmovdqa), so it must stay aligned
+BLAKE3_IV: ; four dwords loaded together as one xmmword
+BLAKE3_IV_0:
+ dd 06A09E667H
+BLAKE3_IV_1:
+ dd 0BB67AE85H
+BLAKE3_IV_2:
+ dd 03C6EF372H
+BLAKE3_IV_3:
+ dd 0A54FF53AH
+
+_RDATA ENDS
+END
diff --git a/deps/blake3/blake3_dispatch.c b/deps/blake3/blake3_dispatch.c
@@ -0,0 +1,276 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "blake3_impl.h"
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <immintrin.h>
+#else
+#error "Unimplemented!"
+#endif
+#endif
+
+#define MAYBE_UNUSED(x) (void)((x)) // evaluates x and discards it; applied to `features` below, which goes unused when every SIMD branch is compiled out
+
+#if defined(IS_X86)
+static uint64_t xgetbv() { // Returns extended control register 0 (XGETBV with ECX = 0) as a 64-bit value.
+#if defined(_MSC_VER)
+ return _xgetbv(0); // MSVC intrinsic
+#else
+ uint32_t eax = 0, edx = 0;
+ __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); // ECX = 0 in; result comes back in EDX:EAX
+ return ((uint64_t)edx << 32) | eax; // EDX is the high half, EAX the low half
+#endif
+}
+
+static void cpuid(uint32_t out[4], uint32_t id) { // Executes CPUID with EAX = id; out[0..3] receive EAX, EBX, ECX, EDX.
+#if defined(_MSC_VER)
+ __cpuid((int *)out, id); // MSVC intrinsic
+#elif defined(__i386__) || defined(_M_IX86)
+ __asm__ __volatile__("movl %%ebx, %1\n" // 32-bit path: stash EBX in the register bound to out[1]
+ "cpuid\n"
+ "xchgl %1, %%ebx\n" // swap: out[1] gets CPUID's EBX, EBX gets its old value back (presumably because EBX may be reserved, e.g. for PIC)
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id));
+#else
+ __asm__ __volatile__("cpuid\n" // other GCC-style targets: EBX is bound directly as an output
+ : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id));
+#endif
+}
+
+static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { // Like cpuid(), but also passes sub-leaf `sid` in ECX.
+#if defined(_MSC_VER)
+ __cpuidex((int *)out, id, sid); // MSVC intrinsic
+#elif defined(__i386__) || defined(_M_IX86)
+ __asm__ __volatile__("movl %%ebx, %1\n" // 32-bit path: stash EBX in the register bound to out[1]
+ "cpuid\n"
+ "xchgl %1, %%ebx\n" // swap: out[1] gets CPUID's EBX, EBX gets its old value back
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id), "c"(sid));
+#else
+ __asm__ __volatile__("cpuid\n" // other GCC-style targets: EBX is bound directly as an output
+ : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+ : "a"(id), "c"(sid));
+#endif
+}
+
+#endif
+
+enum cpu_feature { // One bit per SIMD extension; get_cpu_features() ORs the detected ones together.
+ SSE2 = 1 << 0,
+ SSSE3 = 1 << 1, // set by get_cpu_features(); not tested by any dispatcher in this file
+ SSE41 = 1 << 2,
+ AVX = 1 << 3, // set by get_cpu_features(); not tested by any dispatcher in this file
+ AVX2 = 1 << 4,
+ AVX512F = 1 << 5,
+ AVX512VL = 1 << 6,
+ /* ... */
+ UNDEFINED = 1 << 30 // sentinel: initial value of g_cpu_features, meaning detection has not run yet
+};
+
+#if !defined(BLAKE3_TESTING) // file-local unless built for testing
+static /* Allow the variable to be controlled manually for testing */
+#endif
+ enum cpu_feature g_cpu_features = UNDEFINED; // cached result of get_cpu_features(); UNDEFINED until the x86 detection path stores a value
+
+#if !defined(BLAKE3_TESTING) // file-local unless built for testing
+static
+#endif
+ enum cpu_feature
+ get_cpu_features() {
+ // Returns the cpu_feature bits of the running CPU. On x86 the bits are detected once via CPUID/XGETBV and cached in g_cpu_features; on other targets this returns 0.
+ if (g_cpu_features != UNDEFINED) {
+ return g_cpu_features; // already detected (or set manually under BLAKE3_TESTING)
+ } else {
+#if defined(IS_X86)
+ uint32_t regs[4] = {0};
+ uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; // fix: the address-of operators had been garbled into the (R) sign
+ (void)edx; // edx is only read on the 32-bit path below
+ enum cpu_feature features = 0;
+ cpuid(regs, 0);
+ const int max_id = *eax; // leaf 0 reports the highest supported basic leaf in EAX
+ cpuid(regs, 1);
+#if defined(__amd64__) || defined(_M_X64)
+ features |= SSE2; // set unconditionally on 64-bit builds
+#else
+ if (*edx & (1UL << 26)) // CPUID.1:EDX bit 26 = SSE2
+ features |= SSE2;
+#endif
+ if (*ecx & (1UL << 9)) // CPUID.1:ECX bit 9 = SSSE3 (fix: bit 0, tested before, is SSE3)
+ features |= SSSE3;
+ if (*ecx & (1UL << 19)) // CPUID.1:ECX bit 19 = SSE4.1
+ features |= SSE41;
+ // AVX-family features additionally need the OS to save/restore the wider registers, so check XCR0 first.
+ if (*ecx & (1UL << 27)) { // OSXSAVE
+ const uint64_t mask = xgetbv();
+ if ((mask & 6) == 6) { // SSE and AVX states
+ if (*ecx & (1UL << 28)) // CPUID.1:ECX bit 28 = AVX
+ features |= AVX;
+ if (max_id >= 7) { // leaf 7 exists
+ cpuidex(regs, 7, 0);
+ if (*ebx & (1UL << 5)) // CPUID.7.0:EBX bit 5 = AVX2
+ features |= AVX2;
+ if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
+ if (*ebx & (1UL << 31)) // CPUID.7.0:EBX bit 31 = AVX512VL
+ features |= AVX512VL;
+ if (*ebx & (1UL << 16)) // CPUID.7.0:EBX bit 16 = AVX512F
+ features |= AVX512F;
+ }
+ }
+ }
+ }
+ g_cpu_features = features; // cache for later calls
+ return features;
+#else
+ /* How to detect NEON? */
+ return 0; // nothing detected and nothing cached on non-x86 builds
+#endif
+ }
+}
+// Compresses one block into cv in place using the best compiled-in backend the CPU supports: AVX-512, then SSE4.1, then SSE2, else portable C.
+void blake3_compress_in_place(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features); // unused if every BLAKE3_NO_* macro below is defined
+#if !defined(BLAKE3_NO_AVX512)
+ if (features & AVX512VL) { // only the VL bit is tested here (blake3_hash_many also requires AVX512F)
+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
+ return;
+ }
+#endif
+#endif
+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags); // fallback: non-x86 build or no usable SIMD backend
+}
+// Compresses one block and writes 64 output bytes to out, using the same backend priority as blake3_compress_in_place (AVX-512, SSE4.1, SSE2, portable).
+void blake3_compress_xof(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[64]) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features); // unused if every BLAKE3_NO_* macro below is defined
+#if !defined(BLAKE3_NO_AVX512)
+ if (features & AVX512VL) { // only the VL bit is tested here (blake3_hash_many also requires AVX512F)
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
+ return;
+ }
+#endif
+#endif
+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); // fallback: non-x86 build or no usable SIMD backend
+}
+// Forwards all arguments unchanged to the widest available hash_many backend: AVX-512, AVX2, SSE4.1, SSE2 on x86; NEON when BLAKE3_USE_NEON is defined; otherwise portable C.
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+ size_t blocks, const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features); // unused if every BLAKE3_NO_* macro below is defined
+#if !defined(BLAKE3_NO_AVX512)
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { // both bits required
+ blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+ if (features & AVX2) {
+ blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+ return;
+ }
+#endif
+#endif
+
+#if defined(BLAKE3_USE_NEON) // compile-time choice: there is no runtime NEON check in this file
+ blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ return; // with NEON enabled the portable call below is never reached
+#endif
+
+ blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end,
+ out);
+}
+
+// The dynamically detected SIMD degree of the current platform: 16 (AVX-512), 8 (AVX2), 4 (SSE4.1, SSE2 or NEON), or 1 (portable).
+size_t blake3_simd_degree(void) {
+#if defined(IS_X86)
+ const enum cpu_feature features = get_cpu_features();
+ MAYBE_UNUSED(features); // unused if every BLAKE3_NO_* macro below is defined
+#if !defined(BLAKE3_NO_AVX512)
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { // same condition blake3_hash_many uses for its AVX-512 branch
+ return 16;
+ }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+ if (features & AVX2) {
+ return 8;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+ if (features & SSE41) {
+ return 4;
+ }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+ if (features & SSE2) {
+ return 4;
+ }
+#endif
+#endif
+#if defined(BLAKE3_USE_NEON)
+ return 4; // compile-time choice; the `return 1` below is then unreachable
+#endif
+ return 1; // portable fallback
+}
diff --git a/deps/blake3/blake3_impl.h b/deps/blake3/blake3_impl.h
@@ -0,0 +1,269 @@
+#ifndef BLAKE3_IMPL_H
+#define BLAKE3_IMPL_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "blake3.h"
+
+// Internal flag bits: seven distinct single-bit masks, 0x01 through 0x40.
+enum blake3_flags {
+  CHUNK_START = 0x01,
+  CHUNK_END = 0x02,
+  PARENT = 0x04,
+  ROOT = 0x08,
+  KEYED_HASH = 0x10,
+  DERIVE_KEY_CONTEXT = 0x20,
+  DERIVE_KEY_MATERIAL = 0x40,
+};
+
+// This C implementation tries to support recent versions of GCC, Clang, and
+// MSVC. INLINE marks a static function that the compiler is told to inline.
+#if defined(_MSC_VER)
+#define INLINE static __forceinline
+#else
+#define INLINE static inline __attribute__((always_inline))
+#endif // _MSC_VER
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define IS_X86
+#define IS_X86_64
+#endif // 64-bit x86
+
+#if defined(__i386__) || defined(_M_IX86)
+#define IS_X86
+#define IS_X86_32
+#endif // 32-bit x86
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif // _MSC_VER
+#include <immintrin.h>
+#endif // IS_X86
+
+#if defined(IS_X86)
+#define MAX_SIMD_DEGREE 16 // largest value blake3_simd_degree() returns
+#elif defined(BLAKE3_USE_NEON)
+#define MAX_SIMD_DEGREE 4
+#else
+#define MAX_SIMD_DEGREE 1
+#endif // MAX_SIMD_DEGREE
+
+// There are some places where we want a static size that's equal to the
+// MAX_SIMD_DEGREE, but also at least 2.
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+// Constants; the compress functions load IV[0..3] into state words 8..11.
+static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
+                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
+                               0x1F83D9ABUL, 0x5BE0CD19UL};
+// MSG_SCHEDULE[r][i] is the index of the i-th message word used in round r.
+static const uint8_t MSG_SCHEDULE[7][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
+};
+
+// Returns the zero-based index of the most significant set bit of x.
+// The caller must pass a nonzero x.
+static unsigned int highest_one(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+  return (unsigned int)(63 - __builtin_clzll(x));
+#elif defined(_MSC_VER) && defined(IS_X86_64)
+  unsigned long msb;
+  _BitScanReverse64(&msb, x);
+  return msb;
+#elif defined(_MSC_VER) && defined(IS_X86_32)
+  unsigned long msb;
+  const unsigned long upper = (unsigned long)(x >> 32);
+  if (upper != 0) {
+    _BitScanReverse(&msb, upper);
+    return 32 + msb;
+  }
+  // The upper half is empty, so only the lower 32 bits need scanning.
+  _BitScanReverse(&msb, (unsigned long)x);
+  return msb;
+#else
+  unsigned int c = 0;
+  for (unsigned int step = 32; step != 0; step >>= 1) {
+    if ((x >> step) != 0) {
+      x >>= step;
+      c += step;
+    }
+  }
+  return c;
+#endif
+}
+
+// Returns how many bits of x are set.
+INLINE unsigned int popcnt(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+  return __builtin_popcountll(x);
+#else
+  // Clearing the lowest set bit once per iteration visits each 1 bit once.
+  unsigned int ones = 0;
+  for (; x != 0; x &= x - 1) {
+    ones++;
+  }
+  return ones;
+#endif
+}
+
+// Rounds x down to the nearest power of two. OR-ing in the low bit keeps the
+// argument of highest_one() nonzero, so an input of 0 yields 1.
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
+  return (uint64_t)1 << highest_one(x | 1);
+}
+
+INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
+// counter_low() keeps bits 0-31 of the counter, counter_high() bits 32-63.
+INLINE uint32_t counter_high(uint64_t counter) {
+  return (uint32_t)(counter >> 32);
+}
+
+INLINE uint32_t load32(const void *src) {
+  const uint8_t *b = (const uint8_t *)src; // bytes are read little-endian
+  return (uint32_t)b[0] | ((uint32_t)b[1] << 8) | ((uint32_t)b[2] << 16) |
+         ((uint32_t)b[3] << 24);
+}
+
+INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+                           uint32_t key_words[8]) {
+  // Decode the key as eight consecutive 32-bit words. load32() assembles each
+  // word least-significant byte first, so the result is independent of the
+  // host byte order.
+  // Word i is built from key bytes 4*i .. 4*i + 3.
+  for (size_t word = 0; word < 8; word++) {
+    const uint8_t *word_bytes = &key[word * 4];
+    key_words[word] = load32(word_bytes);
+  }
+}
+
+INLINE void store32(void *dst, uint32_t w) {
+  // Emit w least-significant byte first (little-endian), one byte at a time.
+  uint8_t *out = (uint8_t *)dst;
+  for (size_t i = 0; i < 4; i++) {
+    out[i] = (uint8_t)(w >> (8 * i));
+  }
+}
+
+INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
+  // Serialize the eight words of cv_words back to back into bytes_out.
+  // store32() writes each word least-significant byte first, so word i
+  // occupies bytes_out[4*i .. 4*i + 3] and the layout does not depend on
+  // the host byte order.
+  for (size_t word = 0; word < 8; word++) {
+    uint8_t *word_bytes = &bytes_out[word * 4];
+    store32(word_bytes, cv_words[word]);
+  }
+}
+// Generic compress entry point. NOTE(review): defined outside this chunk.
+void blake3_compress_in_place(uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags);
+// Variant that takes cv as const and has a separate 64-byte out buffer.
+void blake3_compress_xof(const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags,
+                         uint8_t out[64]);
+// Forwards to the AVX-512, AVX2, SSE4.1, SSE2, NEON or portable hash_many.
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                      size_t blocks, const uint32_t key[8], uint64_t counter,
+                      bool increment_counter, uint8_t flags,
+                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+// Returns 16, 8, 4 or 1 depending on the build and the detected CPU features.
+size_t blake3_simd_degree(void);
+
+
+// Implementation-specific functions; the portable ones are always declared.
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]);
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out);
+
+#if defined(IS_X86)
+#if !defined(BLAKE3_NO_SSE2)
+void blake3_compress_in_place_sse2(uint32_t cv[8],
+                                   const uint8_t block[BLAKE3_BLOCK_LEN],
+                                   uint8_t block_len, uint64_t counter,
+                                   uint8_t flags);
+void blake3_compress_xof_sse2(const uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags, uint8_t out[64]);
+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif // !BLAKE3_NO_SSE2
+#if !defined(BLAKE3_NO_SSE41)
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+                                    const uint8_t block[BLAKE3_BLOCK_LEN],
+                                    uint8_t block_len, uint64_t counter,
+                                    uint8_t flags);
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+                               const uint8_t block[BLAKE3_BLOCK_LEN],
+                               uint8_t block_len, uint64_t counter,
+                               uint8_t flags, uint8_t out[64]);
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+#endif // !BLAKE3_NO_SSE41
+#if !defined(BLAKE3_NO_AVX2) // only hash_many is declared for AVX2
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif // !BLAKE3_NO_AVX2
+#if !defined(BLAKE3_NO_AVX512)
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+                                     const uint8_t block[BLAKE3_BLOCK_LEN],
+                                     uint8_t block_len, uint64_t counter,
+                                     uint8_t flags);
+
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+                                const uint8_t block[BLAKE3_BLOCK_LEN],
+                                uint8_t block_len, uint64_t counter,
+                                uint8_t flags, uint8_t out[64]);
+
+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
+                             size_t blocks, const uint32_t key[8],
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out);
+#endif // !BLAKE3_NO_AVX512
+#endif // IS_X86
+
+#if defined(BLAKE3_USE_NEON) // only hash_many is declared for NEON
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
+#endif // BLAKE3_USE_NEON
+
+
+#endif /* BLAKE3_IMPL_H */
diff --git a/deps/blake3/blake3_neon.c b/deps/blake3/blake3_neon.c
@@ -0,0 +1,346 @@
+#include "blake3_impl.h"
+
+#include <arm_neon.h>
+
+// TODO: This is probably incorrect for big-endian ARM. How should that work?
+INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
+  // memcpy instead of vld1q_u32, which has alignment requirements.
+  uint32x4_t lanes;
+  memcpy(&lanes, src, sizeof lanes);
+  return lanes;
+}
+
+INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
+  // memcpy instead of vst1q_u32, which has alignment requirements.
+  memcpy(dest, &src, sizeof src);
+}
+
+INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
+  return vaddq_u32(a, b); // per-lane 32-bit addition
+}
+
+INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
+  return veorq_u32(a, b); // bitwise XOR
+}
+// Broadcasts x to all four lanes.
+INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
+
+INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  const uint32_t lanes[4] = {a, b, c, d}; // loaded in this order
+  return vld1q_u32(lanes);
+}
+
+INLINE uint32x4_t rot16_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); // rotr 16
+}
+
+INLINE uint32x4_t rot12_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); // rotr 12
+}
+
+INLINE uint32x4_t rot8_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); // rotr 8
+}
+
+INLINE uint32x4_t rot7_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); // rotr 7
+}
+
+// TODO: compress_neon
+
+// TODO: hash2_neon
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_neon
+ * ----------------------------------------------------------------------------
+ */
+// One round applied to all lanes of the 16 state vectors, using schedule r.
+INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // columns: a += m
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]); // a += b
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]); // d ^= a
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]); // d = rotr(d, 16)
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]); // c += d
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]); // b ^= c
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]); // b = rotr(b, 12)
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // a += next m
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]); // d = rotr(d, 8)
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]); // b = rotr(b, 7)
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  // Same sequence on the diagonals (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
+
+INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
+  // vtrnq_u32 transposes the 2x2 sub-matrices of each pair of rows.
+  const uint32x4x2_t upper = vtrnq_u32(vecs[0], vecs[1]);
+  const uint32x4x2_t lower = vtrnq_u32(vecs[2], vecs[3]);
+
+  // Recombine halves so the top-right and bottom-left 2x2 blocks trade places.
+  vecs[0] =
+      vcombine_u32(vget_low_u32(upper.val[0]), vget_low_u32(lower.val[0]));
+  vecs[1] =
+      vcombine_u32(vget_low_u32(upper.val[1]), vget_low_u32(lower.val[1]));
+  vecs[2] =
+      vcombine_u32(vget_high_u32(upper.val[0]), vget_high_u32(lower.val[0]));
+  vecs[3] =
+      vcombine_u32(vget_high_u32(upper.val[1]), vget_high_u32(lower.val[1]));
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
+                                size_t block_offset, uint32x4_t out[16]) {
+  // The 64 bytes at block_offset are read as four 16-byte quarters. For
+  // quarter q, out[4*q + i] is loaded from input i, so each group of four
+  // vectors forms a 4x4 matrix of 32-bit words with one input per row.
+  // See the big-endian TODO on loadu_128() regarding lane order.
+  for (size_t quarter = 0; quarter < 4; quarter++) {
+    const size_t offset = block_offset + quarter * sizeof(uint32x4_t);
+    uint32x4_t *group = &out[4 * quarter];
+
+    group[0] = loadu_128(&inputs[0][offset]);
+    group[1] = loadu_128(&inputs[1][offset]);
+    group[2] = loadu_128(&inputs[2][offset]);
+    group[3] = loadu_128(&inputs[3][offset]);
+  }
+
+  // Transposing each group leaves one message word per vector: out[w] then
+  // holds word w of all four inputs, with input i in lane i. The groups are
+  // independent of each other.
+  for (size_t first = 0; first < 16; first += 4) {
+    transpose_vecs_128(&out[first]);
+  }
+}
+
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           uint32x4_t *out_low, uint32x4_t *out_high) {
+  const uint64_t step = increment_counter ? 1 : 0; // per-lane increment
+  const uint64_t c[4] = {counter, counter + step, counter + 2 * step,
+                         counter + 3 * step};
+  *out_low = set4(counter_low(c[0]), counter_low(c[1]), counter_low(c[2]),
+                  counter_low(c[3]));
+  *out_high = set4(counter_high(c[0]), counter_high(c[1]), counter_high(c[2]),
+                   counter_high(c[3]));
+}
+
+void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  // Lane i of every vector belongs to inputs[i]. All four lanes start from
+  // the same key words.
+  uint32x4_t h_vecs[8];
+  for (size_t i = 0; i < 8; i++) {
+    h_vecs[i] = set1_128(key[i]);
+  }
+
+  uint32x4_t counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+
+  for (size_t block = 0; block < blocks; block++) {
+    // flags_start applies only to the first block, flags_end only to the
+    // last one; a single-block input gets both.
+    uint8_t block_flags = flags;
+    if (block == 0) {
+      block_flags |= flags_start;
+    }
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+
+    uint32x4_t msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    // State words 0-7: h_vecs, 8-11: IV[0..3], 12-13: counter low/high,
+    // 14: block length, 15: flags.
+    uint32x4_t v[16] = {
+        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec,
+        set1_128(BLAKE3_BLOCK_LEN), set1_128(block_flags),
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+
+    // XOR the two halves of the state to get the input of the next block.
+    for (size_t i = 0; i < 8; i++) {
+      h_vecs[i] = xor_128(v[i], v[i + 8]);
+    }
+  }
+
+  // After the transposes h_vecs[i] and h_vecs[i + 4] are the two halves of output i.
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  for (size_t i = 0; i < 4; i++) {
+    storeu_128(h_vecs[i], &out[(2 * i) * sizeof(uint32x4_t)]);
+    storeu_128(h_vecs[i + 4], &out[(2 * i + 1) * sizeof(uint32x4_t)]);
+  }
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_neon
+ * ----------------------------------------------------------------------------
+ */
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags);
+// Hashes one input with the portable compress function (no NEON compress yet).
+INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    // TODO: Implement compress_neon. However note that according to
+    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
+    // compress_neon might not be any faster than compress_portable.
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags; // flags_start applies to the first block only
+  }
+  store_cv_words(out, cv); // little-endian output, as in hash_one_portable()
+}
+
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  size_t done = 0;
+
+  // Full groups of four inputs go through the 4-lane kernel.
+  for (; num_inputs - done >= 4; done += 4) {
+    blake3_hash4_neon(&inputs[done], blocks, key, counter, increment_counter,
+                      flags, flags_start, flags_end,
+                      &out[done * BLAKE3_OUT_LEN]);
+    if (increment_counter) {
+      counter += 4;
+    }
+  }
+
+  // The remaining zero to three inputs are hashed one at a time.
+  for (; done < num_inputs; done++) {
+    hash_one_neon(inputs[done], blocks, key, counter, flags, flags_start,
+                  flags_end, &out[done * BLAKE3_OUT_LEN]);
+    if (increment_counter) {
+      counter += 1;
+    }
+  }
+}
diff --git a/deps/blake3/blake3_portable.c b/deps/blake3/blake3_portable.c
@@ -0,0 +1,160 @@
+#include "blake3_impl.h"
+#include <string.h>
+// Rotates w right by c bits. c is reduced modulo 32, so c == 0 is well defined.
+INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
+  return (w >> (c & 31)) | (w << ((32 - c) & 31));
+}
+
+INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+              uint32_t x, uint32_t y) {
+  state[a] += state[b] + x; // first half: mix in x, rotate by 16 and 12
+  state[d] = rotr32(state[d] ^ state[a], 16);
+  state[c] += state[d];
+  state[b] = rotr32(state[b] ^ state[c], 12);
+  state[a] += state[b] + y; // second half: mix in y, rotate by 8 and 7
+  state[d] = rotr32(state[d] ^ state[a], 8);
+  state[c] += state[d];
+  state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
+  // s[i] is the index of the message word consumed at position i this round.
+  const uint8_t *s = MSG_SCHEDULE[round];
+
+  // Column step: each call works on one column of the 4x4 state.
+  g(state, 0, 4, 8, 12, msg[s[0]], msg[s[1]]);
+  g(state, 1, 5, 9, 13, msg[s[2]], msg[s[3]]);
+  g(state, 2, 6, 10, 14, msg[s[4]], msg[s[5]]);
+  g(state, 3, 7, 11, 15, msg[s[6]], msg[s[7]]);
+
+  // Diagonal step: each call works on one diagonal of the 4x4 state.
+  g(state, 0, 5, 10, 15, msg[s[8]], msg[s[9]]);
+  g(state, 1, 6, 11, 12, msg[s[10]], msg[s[11]]);
+  g(state, 2, 7, 8, 13, msg[s[12]], msg[s[13]]);
+  g(state, 3, 4, 9, 14, msg[s[14]], msg[s[15]]);
+}
+
+INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  // Builds the 16-word state from cv, IV, counter, block_len and flags, then
+  // runs seven rounds over it. The callers combine the two halves of the
+  // resulting state themselves.
+
+  // Decode the block into sixteen message words; load32() reads each word
+  // least-significant byte first.
+  uint32_t block_words[16];
+  for (size_t i = 0; i < 16; i++) {
+    block_words[i] = load32(block + 4 * i);
+  }
+
+  // Words 0-7: the incoming cv.
+  state[0] = cv[0];
+  state[1] = cv[1];
+  state[2] = cv[2];
+  state[3] = cv[3];
+  state[4] = cv[4];
+  state[5] = cv[5];
+  state[6] = cv[6];
+  state[7] = cv[7];
+
+  // Words 8-11: the first four IV constants.
+  state[8] = IV[0];
+  state[9] = IV[1];
+  state[10] = IV[2];
+  state[11] = IV[3];
+
+  // Words 12-15: counter (low half, then high half), block length, flags.
+  state[12] = counter_low(counter);
+  state[13] = counter_high(counter);
+  state[14] = (uint32_t)block_len;
+  state[15] = (uint32_t)flags;
+
+  // One call per row of MSG_SCHEDULE; the calls are kept unrolled so that
+  // every round index is a compile-time constant.
+  round_fn(state, block_words, 0);
+  round_fn(state, block_words, 1);
+  round_fn(state, block_words, 2);
+  round_fn(state, block_words, 3);
+  round_fn(state, block_words, 4);
+  round_fn(state, block_words, 5);
+  round_fn(state, block_words, 6);
+}
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+
+  // The updated cv is the XOR of the two halves of the final state:
+  // cv[i] = state[i] ^ state[i + 8]. compress_pre() has finished reading cv
+  // by this point, so it is overwritten in place and stays in native word
+  // form (no byte serialization happens here).
+  for (size_t i = 0; i < 8; i++) {
+    cv[i] = state[i] ^ state[i + 8];
+  }
+}
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+
+  // The 64-byte output consists of sixteen 32-bit words. Words 0-7 are the
+  // XOR of the two state halves, the same values that
+  // blake3_compress_in_place_portable() stores back into cv. Words 8-15 are
+  // the upper state half XORed with the input cv, which this function leaves
+  // unmodified.
+  uint32_t words[16];
+  for (size_t i = 0; i < 8; i++) {
+    words[i] = state[i] ^ state[i + 8];
+    words[i + 8] = state[i + 8] ^ cv[i];
+  }
+
+  // store32() writes each word least-significant byte first, so the bytes
+  // in out do not depend on the host byte order.
+  for (size_t i = 0; i < 16; i++) {
+    store32(&out[i * 4], words[i]);
+  }
+}
+
+INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              uint8_t flags, uint8_t flags_start,
+                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+
+  // flags_start is added for the first block only and flags_end for the last
+  // block only; an input of exactly one block receives both.
+  for (size_t i = 0; i < blocks; i++) {
+    uint8_t block_flags = flags;
+    block_flags |= (i == 0) ? flags_start : 0;
+    block_flags |= (i + 1 == blocks) ? flags_end : 0;
+    blake3_compress_in_place_portable(cv, &input[i * BLAKE3_BLOCK_LEN],
+                                      BLAKE3_BLOCK_LEN, counter, block_flags);
+  }
+  // Serialize the final cv words least-significant byte first.
+  store_cv_words(out, cv);
+}
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out) {
+  // Inputs are hashed one after another; result i is written to
+  // out[i * BLAKE3_OUT_LEN]. When increment_counter is set, input i is hashed
+  // with counter + i, otherwise every input uses the same counter.
+  for (size_t i = 0; i < num_inputs; i++) {
+    hash_one_portable(inputs[i], blocks, key, counter, flags, flags_start,
+                      flags_end, &out[i * BLAKE3_OUT_LEN]);
+    if (increment_counter) {
+      counter += 1;
+    }
+  }
+}
diff --git a/deps/blake3/blake3_sse2.c b/deps/blake3/blake3_sse2.c
@@ -0,0 +1,565 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4 // number of inputs blake3_hash4_sse2 hashes per call
+
+#define _mm_shuffle_ps2(a, b, c) /* _mm_shuffle_ps on integer vectors a, b with selector c */ \
+ (_mm_castps_si128( \
+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+// Load 16 bytes from `src` into a vector; `src` does not need to be 16-byte
+// aligned.
+INLINE __m128i loadu(const uint8_t src[16]) {
+  const __m128i *unaligned = (const __m128i *)src;
+  return _mm_loadu_si128(unaligned);
+}
+
+// Store the 16 bytes of `src` to `dest`; `dest` does not need to be 16-byte
+// aligned.
+INLINE void storeu(__m128i src, uint8_t dest[16]) {
+  __m128i *unaligned = (__m128i *)dest;
+  _mm_storeu_si128(unaligned, src);
+}
+
+// Add the four 32-bit lanes of `a` and `b` lane by lane.
+INLINE __m128i addv(__m128i a, __m128i b) {
+  return _mm_add_epi32(a, b);
+}
+
+// Bitwise XOR of two vectors. It is called "xorv" because clang-format
+// doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) {
+  return _mm_xor_si128(a, b);
+}
+
+// Broadcast `x` to all four 32-bit lanes of a vector.
+INLINE __m128i set1(uint32_t x) {
+  const int32_t lane = (int32_t)x;
+  return _mm_set1_epi32(lane);
+}
+
+// Build a vector whose 32-bit lanes, from lowest to highest, are a, b, c, d.
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  const int32_t lane0 = (int32_t)a;
+  const int32_t lane1 = (int32_t)b;
+  const int32_t lane2 = (int32_t)c;
+  const int32_t lane3 = (int32_t)d;
+  return _mm_setr_epi32(lane0, lane1, lane2, lane3);
+}
+
+// Rotate every 32-bit lane by 16 bits. This is done by swapping the two
+// 16-bit halves of each lane, first in the low 64 bits, then in the high 64.
+INLINE __m128i rot16(__m128i x) {
+  const __m128i low_swapped = _mm_shufflelo_epi16(x, 0xB1);
+  return _mm_shufflehi_epi16(low_swapped, 0xB1);
+}
+
+// Rotate every 32-bit lane right by 12 bits.
+INLINE __m128i rot12(__m128i x) {
+  const __m128i low_part = _mm_srli_epi32(x, 12);
+  const __m128i high_part = _mm_slli_epi32(x, 32 - 12);
+  return xorv(low_part, high_part);
+}
+
+// Rotate every 32-bit lane right by 8 bits.
+INLINE __m128i rot8(__m128i x) {
+  const __m128i low_part = _mm_srli_epi32(x, 8);
+  const __m128i high_part = _mm_slli_epi32(x, 32 - 8);
+  return xorv(low_part, high_part);
+}
+
+// Rotate every 32-bit lane right by 7 bits.
+INLINE __m128i rot7(__m128i x) {
+  const __m128i low_part = _mm_srli_epi32(x, 7);
+  const __m128i high_part = _mm_slli_epi32(x, 32 - 7);
+  return xorv(low_part, high_part);
+}
+
+// First half of the mixing step, applied to all four lanes of the rows at
+// once: adds message vector `m` and row1 into row0, then updates row3, row2
+// and row1 in turn using the 16-bit and 12-bit rotations.
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = rot16(xorv(*row3, *row0));
+  *row2 = addv(*row2, *row3);
+  *row1 = rot12(xorv(*row1, *row2));
+}
+
+// Second half of the mixing step, applied to all four lanes of the rows at
+// once: same data flow as g1, but with the 8-bit and 7-bit rotations.
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = rot8(xorv(*row3, *row0));
+  *row2 = addv(*row2, *row3);
+  *row1 = rot7(xorv(*row1, *row2));
+}
+
+// Rotate the lanes of row0, row2 and row3 by different amounts so that a
+// following g1/g2 pair mixes the diagonals of the state instead of its
+// columns. As an optimization, row1 is the row left unrotated, rather than
+// row0; all the message loads in compress_pre are adjusted to compensate.
+// See discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  const __m128i r0 = *row0;
+  const __m128i r2 = *row2;
+  const __m128i r3 = *row3;
+  *row0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(1, 0, 3, 2));
+}
+
+// Undo diagonalize(): apply the inverse lane rotation to row0, row2 and row3
+// so the state is back in column order. Row1 is never rotated.
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  const __m128i r0 = *row0;
+  const __m128i r2 = *row2;
+  const __m128i r3 = *row3;
+  *row0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(1, 0, 3, 2));
+}
+
+// Blend two vectors at 16-bit granularity using only SSE2 operations: for
+// each of the eight 16-bit lanes i, the result takes lane i from `b` when bit
+// i of `imm8` is set and from `a` otherwise.
+INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) {
+  // One distinct bit per 16-bit lane: lane 0 holds 0x01 ... lane 7 holds 0x80.
+  const __m128i lane_bit =
+      _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+  // All-ones in every lane whose bit is set in imm8, all-zeros elsewhere.
+  const __m128i selected = _mm_cmpeq_epi16(
+      _mm_and_si128(_mm_set1_epi16(imm8), lane_bit), lane_bit);
+  const __m128i from_b = _mm_and_si128(selected, b);
+  const __m128i from_a = _mm_andnot_si128(selected, a);
+  return _mm_or_si128(from_b, from_a);
+}
+
+// Run the seven rounds of the compression function on one block. On return
+// rows[0..3] hold the 16-word state as four vectors of four 32-bit words; the
+// callers in this file XOR the rows together to form their outputs.
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  // Initial state: the chaining value fills rows 0-1, row 2 is IV[0..3], and
+  // row 3 is {counter_low, counter_high, block_len, flags}.
+  rows[0] = loadu((const uint8_t *)&cv[0]);
+  rows[1] = loadu((const uint8_t *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (uint32_t)block_len, (uint32_t)flags);
+
+  // The first 64 bytes of `block`, as four vectors of four message words.
+  __m128i m0 = loadu(&block[0 * sizeof(__m128i)]);
+  __m128i m1 = loadu(&block[1 * sizeof(__m128i)]);
+  __m128i m2 = loadu(&block[2 * sizeof(__m128i)]);
+  __m128i m3 = loadu(&block[3 * sizeof(__m128i)]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Rounds 2 through 7. Each of these rounds applies the same fixed
+  // permutation to the message words left by the round before, so the six
+  // rounds share one loop body. The message copy at the end of the last
+  // iteration is not used afterwards.
+  for (int round = 2; round <= 7; round++) {
+    // Column step.
+    t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+    g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+    t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+    t1 = blend_epi16(tt, t1, 0xCC);
+    g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+
+    // Diagonal step.
+    diagonalize(&rows[0], &rows[2], &rows[3]);
+    t2 = _mm_unpacklo_epi64(m3, m1);
+    tt = blend_epi16(t2, m2, 0xC0);
+    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+    g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+    t3 = _mm_unpackhi_epi32(m1, m3);
+    tt = _mm_unpacklo_epi32(m2, t3);
+    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+    g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+    undiagonalize(&rows[0], &rows[2], &rows[3]);
+
+    // The permuted words become the next round's input.
+    m0 = t0;
+    m1 = t1;
+    m2 = t2;
+    m3 = t3;
+  }
+}
+
+// Compress one block and overwrite `cv` with the result: words 0-3 become
+// row0 ^ row2 and words 4-7 become row1 ^ row3 of the post-round state.
+void blake3_compress_in_place_sse2(uint32_t cv[8],
+                                   const uint8_t block[BLAKE3_BLOCK_LEN],
+                                   uint8_t block_len, uint64_t counter,
+                                   uint8_t flags) {
+  __m128i state[4];
+  compress_pre(state, cv, block, block_len, counter, flags);
+  const __m128i cv_lo = xorv(state[0], state[2]);
+  const __m128i cv_hi = xorv(state[1], state[3]);
+  storeu(cv_lo, (uint8_t *)&cv[0]);
+  storeu(cv_hi, (uint8_t *)&cv[4]);
+}
+
+// Compress one block and write the full 64-byte output to `out`. The first 32
+// bytes are row0 ^ row2 and row1 ^ row3 (the same values that
+// blake3_compress_in_place_sse2 stores); the last 32 bytes are row2 and row3
+// XOR-ed with the input chaining value. `cv` itself is not modified.
+void blake3_compress_xof_sse2(const uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags, uint8_t out[64]) {
+  __m128i state[4];
+  compress_pre(state, cv, block, block_len, counter, flags);
+  storeu(xorv(state[0], state[2]), out + 0);
+  storeu(xorv(state[1], state[3]), out + 16);
+  const __m128i cv_lo = loadu((const uint8_t *)&cv[0]);
+  storeu(xorv(state[2], cv_lo), out + 32);
+  const __m128i cv_hi = loadu((const uint8_t *)&cv[4]);
+  storeu(xorv(state[3], cv_hi), out + 48);
+}
+
+// Full mixing step on one quartet of state vectors, v[a], v[b], v[c], v[d],
+// using message vectors `mx` (first half) and `my` (second half).
+INLINE void round_fn_mix(__m128i v[16], size_t a, size_t b, size_t c,
+                         size_t d, __m128i mx, __m128i my) {
+  v[a] = addv(addv(v[a], mx), v[b]);
+  v[d] = rot16(xorv(v[d], v[a]));
+  v[c] = addv(v[c], v[d]);
+  v[b] = rot12(xorv(v[b], v[c]));
+  v[a] = addv(addv(v[a], my), v[b]);
+  v[d] = rot8(xorv(v[d], v[a]));
+  v[c] = addv(v[c], v[d]);
+  v[b] = rot7(xorv(v[b], v[c]));
+}
+
+// Apply round `r` to the 16 state vectors in `v`. Each v[i] holds state word
+// i for four independent inputs, one input per 32-bit lane, and m[j] holds
+// message word j in the same layout. Row r of MSG_SCHEDULE selects which
+// message words feed each mixing step.
+INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
+  // Column step: mix the quartets (i, i + 4, i + 8, i + 12).
+  for (size_t col = 0; col < 4; col++) {
+    round_fn_mix(v, col, col + 4, col + 8, col + 12,
+                 m[(size_t)MSG_SCHEDULE[r][2 * col]],
+                 m[(size_t)MSG_SCHEDULE[r][2 * col + 1]]);
+  }
+
+  // Diagonal step: mix the quartets (0, 5, 10, 15), (1, 6, 11, 12),
+  // (2, 7, 8, 13) and (3, 4, 9, 14).
+  for (size_t diag = 0; diag < 4; diag++) {
+    round_fn_mix(v, diag, 4 + (diag + 1) % 4, 8 + (diag + 2) % 4,
+                 12 + (diag + 3) % 4,
+                 m[(size_t)MSG_SCHEDULE[r][8 + 2 * diag]],
+                 m[(size_t)MSG_SCHEDULE[r][9 + 2 * diag]]);
+  }
+}
+
+// Transpose, in place, the 4x4 matrix of 32-bit words held in vecs[0..3]:
+// afterwards lane j of vecs[i] is what lane i of vecs[j] was before.
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  const __m128i rows01_lo = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  const __m128i rows01_hi = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  const __m128i rows23_lo = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  const __m128i rows23_hi = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  vecs[0] = _mm_unpacklo_epi64(rows01_lo, rows23_lo);
+  vecs[1] = _mm_unpackhi_epi64(rows01_lo, rows23_lo);
+  vecs[2] = _mm_unpacklo_epi64(rows01_hi, rows23_hi);
+  vecs[3] = _mm_unpackhi_epi64(rows01_hi, rows23_hi);
+}
+
+// Load the 64-byte block at `block_offset` from each of the four inputs and
+// transpose the words, so that out[j] holds message word j of all four
+// inputs, one input per 32-bit lane.
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m128i out[16]) {
+  // out[4 * q + i] receives the q-th 16-byte quarter of input i's block.
+  for (size_t quarter = 0; quarter < 4; quarter++) {
+    const size_t offset = block_offset + quarter * sizeof(__m128i);
+    for (size_t input = 0; input < 4; input++) {
+      out[4 * quarter + input] = loadu(&inputs[input][offset]);
+    }
+  }
+  // Prefetch each input 256 bytes beyond the start of the current block.
+  for (size_t i = 0; i < 4; ++i) {
+    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  // Transposing each group of four vectors turns "four words of one input"
+  // into "one word of four inputs".
+  for (size_t group = 0; group < 16; group += 4) {
+    transpose_vecs(&out[group]);
+  }
+}
+
+// Build the per-lane 64-bit counters for four parallel inputs. Lane i gets
+// `counter + i` when `increment_counter` is true, and `counter` in every lane
+// otherwise. The low 32 bits of each lane's counter are written to *out_lo
+// and the high 32 bits to *out_hi.
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m128i *out_lo, __m128i *out_hi) {
+  // All-ones when incrementing, all-zeros otherwise.
+  const __m128i incr_mask = _mm_set1_epi32(-(int32_t)increment_counter);
+  const __m128i lane_offsets =
+      _mm_and_si128(incr_mask, _mm_set_epi32(3, 2, 1, 0));
+  const __m128i low =
+      _mm_add_epi32(_mm_set1_epi32((int32_t)counter), lane_offsets);
+
+  // The 32-bit compare used here is signed, so flip the sign bit of both
+  // operands to get an unsigned "offset > low" test. It yields all-ones (-1)
+  // exactly in the lanes where the 32-bit addition above wrapped around.
+  const __m128i sign_bit = _mm_set1_epi32((int32_t)0x80000000);
+  const __m128i wrapped = _mm_cmpgt_epi32(_mm_xor_si128(lane_offsets, sign_bit),
+                                          _mm_xor_si128(low, sign_bit));
+
+  // Subtracting -1 adds the carry into the high word.
+  const __m128i high =
+      _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), wrapped);
+
+  *out_lo = low;
+  *out_hi = high;
+}
+
+// Hash four inputs of `blocks` blocks each in parallel, one input per 32-bit
+// lane, all starting from `key`. The four 32-byte results are written back
+// to back to `out` (128 bytes in total), in input order. `flags_start` is
+// OR-ed into the flags of the first block and `flags_end` into those of the
+// last block.
+void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  // h_vecs[i] holds chaining-value word i for all four inputs.
+  __m128i h_vecs[8];
+  for (size_t i = 0; i < 8; i++) {
+    h_vecs[i] = set1(key[i]);
+  }
+
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  const __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+
+  for (size_t block = 0; block < blocks; block++) {
+    uint8_t block_flags = flags;
+    if (block == 0) {
+      block_flags |= flags_start;
+    }
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+
+    __m128i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
+        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, set1(block_flags),
+    };
+    for (size_t r = 0; r < 7; r++) {
+      round_fn(v, msg_vecs, r);
+    }
+    // New chaining value: upper half of the state XOR lower half.
+    for (size_t i = 0; i < 8; i++) {
+      h_vecs[i] = xorv(v[i], v[i + 8]);
+    }
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  for (size_t i = 0; i < 4; i++) {
+    storeu(h_vecs[i], &out[(2 * i) * sizeof(__m128i)]);
+    storeu(h_vecs[i + 4], &out[(2 * i + 1) * sizeof(__m128i)]);
+  }
+}
+
+// Compress `blocks` whole blocks of `input`, chaining from `key`, and copy
+// the final chaining value to `out`. `flags_start` is OR-ed into the flags of
+// the first block and `flags_end` into those of the last block (a one-block
+// input gets both). Every block is compressed with the same `counter`.
+INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start,
+                          uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t chain[8];
+  memcpy(chain, key, BLAKE3_KEY_LEN);
+  for (size_t left = blocks; left > 0; left--) {
+    const bool first = (left == blocks);
+    const bool last = (left == 1);
+    const uint8_t cur_flags =
+        flags | (first ? flags_start : 0) | (last ? flags_end : 0);
+    blake3_compress_in_place_sse2(chain, input, BLAKE3_BLOCK_LEN, counter,
+                                  cur_flags);
+    input += BLAKE3_BLOCK_LEN;
+  }
+  memcpy(out, chain, BLAKE3_OUT_LEN);
+}
+
+// Hash `num_inputs` inputs of `blocks` blocks each and write BLAKE3_OUT_LEN
+// bytes per input to consecutive slots of `out`. Inputs are processed DEGREE
+// at a time with blake3_hash4_sse2 while at least DEGREE remain, and the
+// leftovers one by one. When `increment_counter` is set, the counter
+// advances by one per input.
+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  size_t done = 0;
+
+  // Full batches of DEGREE inputs.
+  for (; num_inputs - done >= DEGREE; done += DEGREE) {
+    blake3_hash4_sse2(&inputs[done], blocks, key, counter, increment_counter,
+                      flags, flags_start, flags_end,
+                      &out[done * BLAKE3_OUT_LEN]);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+  }
+
+  // Remaining inputs, fewer than DEGREE of them.
+  for (; done < num_inputs; done++) {
+    hash_one_sse2(inputs[done], blocks, key, counter, flags, flags_start,
+                  flags_end, &out[done * BLAKE3_OUT_LEN]);
+    if (increment_counter) {
+      counter += 1;
+    }
+  }
+}
diff --git a/deps/blake3/blake3_sse2_x86-64_unix.S b/deps/blake3/blake3_sse2_x86-64_unix.S
@@ -0,0 +1,2291 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global blake3_hash_many_sse2
+.global _blake3_hash_many_sse2
+.global blake3_compress_in_place_sse2
+.global _blake3_compress_in_place_sse2
+.global blake3_compress_xof_sse2
+.global _blake3_compress_xof_sse2
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+ .p2align 6
+_blake3_hash_many_sse2:
+blake3_hash_many_sse2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ movdqa xmm13, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm12, xmm13
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.p2align 6
+blake3_compress_in_place_sse2: /* Single-block compression: mixes the 32 bytes at [rdi] with the 64 bytes at [rsi] for 7 rounds, then writes 32 bytes back to [rdi]. Reads rdi, rsi, rdx, rcx, r8; modifies rax (al), rdx, r8, xmm0-xmm11 and xmm14. */
+_blake3_compress_in_place_sse2: /* Second label for the same entry point (underscore-prefixed symbol name). */
+ _CET_ENDBR /* Macro defined outside this chunk. */
+ movups xmm0, xmmword ptr [rdi] /* xmm0 = state row 0 (bytes 0-15 at rdi). NOTE(review): presumably rdi=cv, rsi=block, rdx=block_len, rcx=counter, r8=flags per the C declaration - confirm. */
+ movups xmm1, xmmword ptr [rdi+0x10] /* xmm1 = state row 1 (bytes 16-31 at rdi). */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = state row 2 = the four BLAKE3_IV dwords. */
+ shl r8, 32 /* NOTE(review): r8 and rdx are used at full 64-bit width here; blake3_compress_xof_sse2 zero-extends r8b and dl first. Confirm callers pass these zero-extended. */
+ add rdx, r8 /* rdx = rdx + (r8 << 32). */
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4 /* xmm3 = state row 3 = { rcx low dword, rcx high dword, rdx low dword, rdx high dword }. */
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* 136 = 0x88: xmm4 = dwords 0,2,4,6 of the block at rsi. */
+ shufps xmm8, xmm5, 221 /* 221 = 0xDD: xmm8 = dwords 1,3,5,7 of the block. */
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 /* dwords 8,10,12,14 ... */
+ pshufd xmm6, xmm6, 0x93 /* ... rotated by one lane: xmm6 = dwords 14,8,10,12. */
+ shufps xmm8, xmm7, 221 /* dwords 9,11,13,15 ... */
+ pshufd xmm7, xmm8, 0x93 /* ... rotated by one lane: xmm7 = dwords 15,9,11,13. */
+ mov al, 7 /* al = number of rounds left. */
+9: /* Top of the round loop. */
+ paddd xmm0, xmm4 /* Column step, first half: row0 += xmm4 + row1. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1 /* Swapping the 16-bit halves of every dword = rotate row3 by 16. */
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20 /* (x << 20) | (x >> 12) = rotate row1 right by 12. */
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5 /* Column step, second half: row0 += xmm5 + row1. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8 /* (x >> 8) combined with (x << 24) = rotate row3 right by 8. */
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25 /* (x << 25) | (x >> 7) = rotate row1 right by 7. */
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93 /* Rotate rows 0, 3 and 2 by 1, 2 and 3 lanes so the diagonals of the 4x4 state line up as columns. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* Diagonal step, first half: same sequence as above with xmm6 as the added message vector. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7 /* Diagonal step, second half, with xmm7. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39 /* Inverse lane rotations: rows return to column order. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* Leave after the 7th round; the reshuffle below is skipped on the last pass. */
+ movdqa xmm8, xmm4 /* From here to the jmp: rebuild xmm4-xmm7 from their current contents for the next round (presumably the BLAKE3 message permutation - confirm against the spec). */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39 /* New xmm4. */
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* Keep dwords 0 and 2 of xmm9 ... */
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] /* ... and dwords 1 and 3 of xmm8 ... */
+ por xmm9, xmm8 /* ... merged into xmm9, which becomes the new xmm5 below. */
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* Keep dwords 0-2 of xmm8 ... */
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] /* ... and dword 3 of xmm10 ... */
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78 /* ... then reorder lanes; xmm8 becomes the new xmm6 below. */
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E /* New xmm7. */
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9: /* All rounds done. */
+ pxor xmm0, xmm2 /* Output dwords 0-3 = row0 ^ row2. */
+ pxor xmm1, xmm3 /* Output dwords 4-7 = row1 ^ row3. */
+ movups xmmword ptr [rdi], xmm0 /* Overwrite the 32 input bytes at rdi with the result. */
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+
+.p2align 6
+blake3_compress_xof_sse2: /* Single-block compression with 64-byte output: mixes the 32 bytes at [rdi] with the 64 bytes at [rsi] for 7 rounds and stores 64 bytes at [r9]; nothing is stored through rdi. Reads rdi, rsi, dl, rcx, r8b, r9; modifies rax, rdx, xmm0-xmm11 and xmm14. */
+_blake3_compress_xof_sse2: /* Second label for the same entry point (underscore-prefixed symbol name). */
+ _CET_ENDBR /* Macro defined outside this chunk. */
+ movups xmm0, xmmword ptr [rdi] /* xmm0 = state row 0 (bytes 0-15 at rdi). NOTE(review): presumably rdi=cv, rsi=block, dl=block_len, rcx=counter, r8b=flags, r9=out per the C declaration - confirm. */
+ movups xmm1, xmmword ptr [rdi+0x10] /* xmm1 = state row 1 (bytes 16-31 at rdi). */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = state row 2 = the four BLAKE3_IV dwords. */
+ movzx eax, r8b /* Only the low byte of r8 is used. */
+ movzx edx, dl /* Only the low byte of rdx is used. */
+ shl rax, 32
+ add rdx, rax /* rdx = dl | (r8b << 32). */
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4 /* xmm3 = state row 3 = { rcx low dword, rcx high dword, dl, r8b }. */
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* 136 = 0x88: xmm4 = dwords 0,2,4,6 of the block at rsi. */
+ shufps xmm8, xmm5, 221 /* 221 = 0xDD: xmm8 = dwords 1,3,5,7 of the block. */
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 /* dwords 8,10,12,14 ... */
+ pshufd xmm6, xmm6, 0x93 /* ... rotated by one lane: xmm6 = dwords 14,8,10,12. */
+ shufps xmm8, xmm7, 221 /* dwords 9,11,13,15 ... */
+ pshufd xmm7, xmm8, 0x93 /* ... rotated by one lane: xmm7 = dwords 15,9,11,13. */
+ mov al, 7 /* al = number of rounds left (the earlier rax value is already folded into rdx). */
+9: /* Top of the round loop. */
+ paddd xmm0, xmm4 /* Column step, first half: row0 += xmm4 + row1. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1 /* Swapping the 16-bit halves of every dword = rotate row3 by 16. */
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20 /* (x << 20) | (x >> 12) = rotate row1 right by 12. */
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5 /* Column step, second half: row0 += xmm5 + row1. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8 /* (x >> 8) combined with (x << 24) = rotate row3 right by 8. */
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25 /* (x << 25) | (x >> 7) = rotate row1 right by 7. */
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93 /* Rotate rows 0, 3 and 2 by 1, 2 and 3 lanes so the diagonals of the 4x4 state line up as columns. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* Diagonal step, first half: same sequence as above with xmm6 as the added message vector. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7 /* Diagonal step, second half, with xmm7. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39 /* Inverse lane rotations: rows return to column order. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* Leave after the 7th round; the reshuffle below is skipped on the last pass. */
+ movdqa xmm8, xmm4 /* From here to the jmp: rebuild xmm4-xmm7 from their current contents for the next round (presumably the BLAKE3 message permutation - confirm against the spec). */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39 /* New xmm4. */
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* Keep dwords 0 and 2 of xmm9 ... */
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] /* ... and dwords 1 and 3 of xmm8 ... */
+ por xmm9, xmm8 /* ... merged into xmm9, which becomes the new xmm5 below. */
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* Keep dwords 0-2 of xmm8 ... */
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] /* ... and dword 3 of xmm10 ... */
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78 /* ... then reorder lanes; xmm8 becomes the new xmm6 below. */
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E /* New xmm7. */
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9: /* All rounds done. */
+ movdqu xmm4, xmmword ptr [rdi] /* Reload the original 32 input bytes; the message registers are no longer needed. */
+ movdqu xmm5, xmmword ptr [rdi+0x10]
+ pxor xmm0, xmm2 /* Output dwords 0-3 = row0 ^ row2 (uses row2 before it is modified below). */
+ pxor xmm1, xmm3 /* Output dwords 4-7 = row1 ^ row3. */
+ pxor xmm2, xmm4 /* Output dwords 8-11 = row2 ^ original bytes 0-15 at rdi. */
+ pxor xmm3, xmm5 /* Output dwords 12-15 = row3 ^ original bytes 16-31 at rdi. */
+ movups xmmword ptr [r9], xmm0 /* Store all 64 output bytes at r9. */
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6 /* Align the tables to 64 bytes (2^6). Every table below is 16 bytes, so each label stays 16-byte aligned for the movaps and pand memory operands that read them. */
+BLAKE3_IV: /* Four dwords loaded as one vector into xmm2 (state row 2) by the compress routines. */
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ADD0: /* Lane indices 0..3. Use site is outside the visible part of this file (presumably per-lane counter offsets in blake3_hash_many_sse2 - confirm). */
+ .long 0, 1, 2, 3
+ADD1: /* The value 4 in every lane. Use site not visible here (presumably the per-iteration counter step - confirm). */
+ .long 4, 4, 4, 4
+BLAKE3_IV_0: /* First dword of BLAKE3_IV repeated in all four lanes. */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1: /* Second dword of BLAKE3_IV repeated in all four lanes. */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2: /* Third dword of BLAKE3_IV repeated in all four lanes. */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3: /* Fourth dword of BLAKE3_IV repeated in all four lanes. */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN: /* The value 64 in every lane. */
+ .long 64, 64, 64, 64
+CMP_MSB_MASK: /* Bit 31 set in every lane. Use site not visible here (presumably XORed into both operands so a signed pcmpgtd behaves as an unsigned compare - confirm). */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK: /* Selects dwords 0 and 2. The pand/pand/por sequences in the code use these four masks to merge lanes from two registers; each hex value in the label is the 16-bit-word selection the mask corresponds to. */
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK: /* Selects dwords 1 and 3 (complement of the 0x33 mask). */
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK: /* Selects dwords 0, 1 and 2. */
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK: /* Selects dword 3 (complement of the 0x3F mask). */
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/deps/blake3/blake3_sse2_x86-64_windows_gnu.S b/deps/blake3/blake3_sse2_x86-64_windows_gnu.S
@@ -0,0 +1,2332 @@
+.intel_syntax noprefix /* Intel operand order (destination first), register names written without a % prefix. */
+.global blake3_hash_many_sse2 /* Each routine is exported twice: under its plain name and under an underscore-prefixed name. */
+.global _blake3_hash_many_sse2
+.global blake3_compress_in_place_sse2
+.global _blake3_compress_in_place_sse2
+.global blake3_compress_xof_sse2
+.global _blake3_compress_xof_sse2
+.section .text /* Everything that follows is assembled into the code section. */
+ .p2align 6
+_blake3_hash_many_sse2:
+blake3_hash_many_sse2:
+ push r15
+ push r14
+ push r13
+ push r12
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 528
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ movdqa xmmword ptr [rsp+0x170], xmm6
+ movdqa xmmword ptr [rsp+0x180], xmm7
+ movdqa xmmword ptr [rsp+0x190], xmm8
+ movdqa xmmword ptr [rsp+0x1A0], xmm9
+ movdqa xmmword ptr [rsp+0x1B0], xmm10
+ movdqa xmmword ptr [rsp+0x1C0], xmm11
+ movdqa xmmword ptr [rsp+0x1D0], xmm12
+ movdqa xmmword ptr [rsp+0x1E0], xmm13
+ movdqa xmmword ptr [rsp+0x1F0], xmm14
+ movdqa xmmword ptr [rsp+0x200], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+0x68]
+ movzx r9, byte ptr [rbp+0x70]
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x90]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x78]
+ movzx r12d, byte ptr [rbp+0x88]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jne 3f
+4:
+ movdqa xmm6, xmmword ptr [rsp+0x170]
+ movdqa xmm7, xmmword ptr [rsp+0x180]
+ movdqa xmm8, xmmword ptr [rsp+0x190]
+ movdqa xmm9, xmmword ptr [rsp+0x1A0]
+ movdqa xmm10, xmmword ptr [rsp+0x1B0]
+ movdqa xmm11, xmmword ptr [rsp+0x1C0]
+ movdqa xmm12, xmmword ptr [rsp+0x1D0]
+ movdqa xmm13, xmmword ptr [rsp+0x1E0]
+ movdqa xmm14, xmmword ptr [rsp+0x1F0]
+ movdqa xmm15, xmmword ptr [rsp+0x200]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ movdqa xmm13, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm12, xmm13
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.p2align 6
+blake3_compress_in_place_sse2: /* compress the 64-byte block at [rdx] into the 32-byte state at [rcx], overwriting it */
+_blake3_compress_in_place_sse2: /* underscore-prefixed alias of the same entry point */
+ sub rsp, 120 /* seven 16-byte save slots plus 8 bytes of padding */
+ movdqa xmmword ptr [rsp], xmm6 /* spill each xmm6+ register that this routine writes */
+ movdqa xmmword ptr [rsp+0x10], xmm7
+ movdqa xmmword ptr [rsp+0x20], xmm8
+ movdqa xmmword ptr [rsp+0x30], xmm9
+ movdqa xmmword ptr [rsp+0x40], xmm11
+ movdqa xmmword ptr [rsp+0x50], xmm14
+ movdqa xmmword ptr [rsp+0x60], xmm10 /* xmm10 is the blend scratch in the permutation below; xmm15 is never written here */
+ movups xmm0, xmmword ptr [rcx] /* state rows 0 and 1 = the 8 words at [rcx] */
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* state row 2 = the four words at BLAKE3_IV */
+ movzx eax, byte ptr [rsp+0xA0] /* byte-sized first stack argument (presumably the flags byte - confirm against the C prototype) */
+ movzx r8d, r8b /* keep only the low byte of r8 (presumably block_len) */
+ shl rax, 32
+ add r8, rax /* r8 = r8 byte | stack byte << 32 */
+ movq xmm3, r9 /* 64-bit value from r9 (presumably the counter) */
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 /* state row 3 = { r9 low, r9 high, r8 byte, stack byte } */
+ movups xmm4, xmmword ptr [rdx] /* message words 0-7 are split by parity below */
+ movups xmm5, xmmword ptr [rdx+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* xmm4 = words 0,2,4,6 */
+ shufps xmm8, xmm5, 221 /* xmm8 = words 1,3,5,7 */
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+0x20] /* message words 8-15, same split, then rotated one lane */
+ movups xmm7, xmmword ptr [rdx+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93 /* xmm6 = words 14,8,10,12 */
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93 /* xmm7 = words 15,9,11,13 */
+ mov al, 7 /* round counter: the loop body runs 7 times */
+9:
+ paddd xmm0, xmm4 /* first half-round: add message words, then mix rows 0-3 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1 /* swap the 16-bit halves of every dword = rotate by 16 */
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 /* rotate right by 12 via shift pair, xmm11 as scratch */
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3 /* rotate right by 8 via shift pair, xmm14 as scratch */
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 /* rotate right by 7 */
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93 /* rotate lanes of rows 0, 3 and 2 before the second half-round */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* second half-round: same mixing with the other two message vectors */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39 /* undo the lane rotation applied before the second half-round */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* leave after the 7th round without permuting the message again */
+ movdqa xmm8, xmm4 /* permute the four message vectors xmm4-xmm7 for the next round */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* pand/pand/por keeps dwords 0,2 of xmm9 and dwords 1,3 of xmm8 */
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6 /* xmm10 carries the copy of xmm6 that supplies dword 3 of the blend */
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* keep dwords 0-2 of xmm8 and dword 3 of xmm10 */
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2 /* fold rows 2-3 into rows 0-1 */
+ pxor xmm1, xmm3
+ movups xmmword ptr [rcx], xmm0 /* write the 32-byte result back over the input state */
+ movups xmmword ptr [rcx+0x10], xmm1
+ movdqa xmm6, xmmword ptr [rsp] /* reload the spilled registers from the same slots */
+ movdqa xmm7, xmmword ptr [rsp+0x10]
+ movdqa xmm8, xmmword ptr [rsp+0x20]
+ movdqa xmm9, xmmword ptr [rsp+0x30]
+ movdqa xmm11, xmmword ptr [rsp+0x40]
+ movdqa xmm14, xmmword ptr [rsp+0x50]
+ movdqa xmm10, xmmword ptr [rsp+0x60] /* matches the prologue: slot 0x60 holds xmm10 */
+ add rsp, 120
+ ret
+
+
+.p2align 6
+_blake3_compress_xof_sse2: /* underscore-prefixed alias of the entry point below */
+blake3_compress_xof_sse2: /* compress the 64-byte block at [rdx] with the state at [rcx]; write 64 bytes to the stack-passed pointer */
+ sub rsp, 120 /* seven 16-byte save slots plus 8 bytes of padding */
+ movdqa xmmword ptr [rsp], xmm6 /* spill each xmm6+ register that this routine writes */
+ movdqa xmmword ptr [rsp+0x10], xmm7
+ movdqa xmmword ptr [rsp+0x20], xmm8
+ movdqa xmmword ptr [rsp+0x30], xmm9
+ movdqa xmmword ptr [rsp+0x40], xmm11
+ movdqa xmmword ptr [rsp+0x50], xmm14
+ movdqa xmmword ptr [rsp+0x60], xmm10 /* xmm10 is the blend scratch in the permutation below; xmm15 is never written here */
+ movups xmm0, xmmword ptr [rcx] /* state rows 0 and 1 = the 8 words at [rcx] */
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* state row 2 = the four words at BLAKE3_IV */
+ movzx eax, byte ptr [rsp+0xA0] /* byte-sized first stack argument (presumably the flags byte - confirm against the C prototype) */
+ movzx r8d, r8b /* keep only the low byte of r8 (presumably block_len) */
+ mov r10, qword ptr [rsp+0xA8] /* second stack argument: destination pointer for the 64 output bytes */
+ shl rax, 32
+ add r8, rax /* r8 = r8 byte | stack byte << 32 */
+ movq xmm3, r9 /* 64-bit value from r9 (presumably the counter) */
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 /* state row 3 = { r9 low, r9 high, r8 byte, stack byte } */
+ movups xmm4, xmmword ptr [rdx] /* message words 0-7 are split by parity below */
+ movups xmm5, xmmword ptr [rdx+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* xmm4 = words 0,2,4,6 */
+ shufps xmm8, xmm5, 221 /* xmm8 = words 1,3,5,7 */
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+0x20] /* message words 8-15, same split, then rotated one lane */
+ movups xmm7, xmmword ptr [rdx+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93 /* xmm6 = words 14,8,10,12 */
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93 /* xmm7 = words 15,9,11,13 */
+ mov al, 7 /* round counter: the loop body runs 7 times */
+9:
+ paddd xmm0, xmm4 /* first half-round: add message words, then mix rows 0-3 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1 /* swap the 16-bit halves of every dword = rotate by 16 */
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 /* rotate right by 12 via shift pair, xmm11 as scratch */
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3 /* rotate right by 8 via shift pair, xmm14 as scratch */
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 /* rotate right by 7 */
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93 /* rotate lanes of rows 0, 3 and 2 before the second half-round */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* second half-round: same mixing with the other two message vectors */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39 /* undo the lane rotation applied before the second half-round */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* leave after the 7th round without permuting the message again */
+ movdqa xmm8, xmm4 /* permute the four message vectors xmm4-xmm7 for the next round */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* pand/pand/por keeps dwords 0,2 of xmm9 and dwords 1,3 of xmm8 */
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6 /* xmm10 carries the copy of xmm6 that supplies dword 3 of the blend */
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* keep dwords 0-2 of xmm8 and dword 3 of xmm10 */
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ movdqu xmm4, xmmword ptr [rcx] /* reload the unmodified input state for the upper half of the output */
+ movdqu xmm5, xmmword ptr [rcx+0x10]
+ pxor xmm0, xmm2 /* output bytes 0-31: rows 0-1 xor rows 2-3 */
+ pxor xmm1, xmm3
+ pxor xmm2, xmm4 /* output bytes 32-63: rows 2-3 xor the input state */
+ pxor xmm3, xmm5
+ movups xmmword ptr [r10], xmm0 /* [rcx] itself is left untouched; all 64 bytes go to [r10] */
+ movups xmmword ptr [r10+0x10], xmm1
+ movups xmmword ptr [r10+0x20], xmm2
+ movups xmmword ptr [r10+0x30], xmm3
+ movdqa xmm6, xmmword ptr [rsp] /* reload the spilled registers from the same slots */
+ movdqa xmm7, xmmword ptr [rsp+0x10]
+ movdqa xmm8, xmmword ptr [rsp+0x20]
+ movdqa xmm9, xmmword ptr [rsp+0x30]
+ movdqa xmm11, xmmword ptr [rsp+0x40]
+ movdqa xmm14, xmmword ptr [rsp+0x50]
+ movdqa xmm10, xmmword ptr [rsp+0x60] /* matches the prologue: slot 0x60 holds xmm10 */
+ add rsp, 120
+ ret
+
+
+.section .rodata /* constant tables for the SSE2 routines. NOTE(review): PE/COFF targets usually name read-only data .rdata - confirm how the linker maps .rodata */
+.p2align 6 /* 64-byte alignment; every table below is 16 bytes, so each stays 16-byte aligned for movaps/movdqa and memory-operand use */
+BLAKE3_IV: /* four IV words as one vector; the compress routines load it as state row 2 */
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ADD0: /* lane indices 0..3; presumably per-lane counter offsets for the 4-way hash_many path - confirm at its setup code */
+ .long 0, 1, 2, 3
+ADD1: /* 4 in every lane; presumably the counter step per 4-input batch - confirm at the hash_many setup code */
+ .long 4, 4, 4, 4
+BLAKE3_IV_0: /* first BLAKE3_IV word repeated in all four lanes */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1: /* second BLAKE3_IV word repeated in all four lanes */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2: /* third BLAKE3_IV word repeated in all four lanes */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3: /* fourth BLAKE3_IV word repeated in all four lanes */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN: /* 64 in every lane; presumably the block length for the 4-way path - confirm at its use site */
+ .long 64, 64, 64, 64
+CMP_MSB_MASK: /* sign-bit flip applied to both operands so the signed pcmpgtd orders them as unsigned (counter carry check) */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK: /* keeps dwords 0 and 2; pand operand of the pand/pand/por blends */
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK: /* keeps dwords 1 and 3; complement of the 0x33 mask */
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK: /* keeps dwords 0, 1 and 2 */
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK: /* keeps dword 3 only; complement of the 0x3F mask */
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/deps/blake3/blake3_sse2_x86-64_windows_msvc.asm b/deps/blake3/blake3_sse2_x86-64_windows_msvc.asm
@@ -0,0 +1,2350 @@
+public _blake3_hash_many_sse2 ; each routine is exported twice: with and without a leading underscore
+public blake3_hash_many_sse2 ; both hash_many PROC labels are declared back-to-back at the same entry point below
+public blake3_compress_in_place_sse2 ; NOTE(review): definition is outside this view
+public _blake3_compress_in_place_sse2 ; underscore-prefixed spelling of the name above
+public blake3_compress_xof_sse2 ; NOTE(review): definition is outside this view
+public _blake3_compress_xof_sse2 ; underscore-prefixed spelling of the name above
+
+_TEXT SEGMENT ALIGN(16) 'CODE'
+
+ALIGN 16
+blake3_hash_many_sse2 PROC
+_blake3_hash_many_sse2 PROC
+ push r15
+ push r14
+ push r13
+ push r12
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 528
+ and rsp, 0FFFFFFFFFFFFFFC0H
+ movdqa xmmword ptr [rsp+170H], xmm6
+ movdqa xmmword ptr [rsp+180H], xmm7
+ movdqa xmmword ptr [rsp+190H], xmm8
+ movdqa xmmword ptr [rsp+1A0H], xmm9
+ movdqa xmmword ptr [rsp+1B0H], xmm10
+ movdqa xmmword ptr [rsp+1C0H], xmm11
+ movdqa xmmword ptr [rsp+1D0H], xmm12
+ movdqa xmmword ptr [rsp+1E0H], xmm13
+ movdqa xmmword ptr [rsp+1F0H], xmm14
+ movdqa xmmword ptr [rsp+200H], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+68H]
+ movzx r9, byte ptr [rbp+70H]
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 00H
+ movdqa xmmword ptr [rsp+130H], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0]
+ pand xmm0, xmmword ptr [ADD1]
+ movdqa xmmword ptr [rsp+150H], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 00H
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+110H], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 00H
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+120H], xmm2
+ mov rbx, qword ptr [rbp+90H]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+78H]
+ movzx r12d, byte ptr [rbp+88H]
+ cmp rsi, 4
+ jc final3blocks
+outerloop4:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 00H
+ pshufd xmm1, xmm3, 55H
+ pshufd xmm2, xmm3, 0AAH
+ pshufd xmm3, xmm3, 0FFH
+ movdqu xmm7, xmmword ptr [rcx+10H]
+ pshufd xmm4, xmm7, 00H
+ pshufd xmm5, xmm7, 55H
+ pshufd xmm6, xmm7, 0AAH
+ pshufd xmm7, xmm7, 0FFH
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+innerloop4:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-40H]
+ movdqu xmm9, xmmword ptr [r9+rdx-40H]
+ movdqu xmm10, xmmword ptr [r10+rdx-40H]
+ movdqu xmm11, xmmword ptr [r11+rdx-40H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+10H], xmm9
+ movdqa xmmword ptr [rsp+20H], xmm12
+ movdqa xmmword ptr [rsp+30H], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-30H]
+ movdqu xmm9, xmmword ptr [r9+rdx-30H]
+ movdqu xmm10, xmmword ptr [r10+rdx-30H]
+ movdqu xmm11, xmmword ptr [r11+rdx-30H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+40H], xmm8
+ movdqa xmmword ptr [rsp+50H], xmm9
+ movdqa xmmword ptr [rsp+60H], xmm12
+ movdqa xmmword ptr [rsp+70H], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-20H]
+ movdqu xmm9, xmmword ptr [r9+rdx-20H]
+ movdqu xmm10, xmmword ptr [r10+rdx-20H]
+ movdqu xmm11, xmmword ptr [r11+rdx-20H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+80H], xmm8
+ movdqa xmmword ptr [rsp+90H], xmm9
+ movdqa xmmword ptr [rsp+0A0H], xmm12
+ movdqa xmmword ptr [rsp+0B0H], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-10H]
+ movdqu xmm9, xmmword ptr [r9+rdx-10H]
+ movdqu xmm10, xmmword ptr [r10+rdx-10H]
+ movdqu xmm11, xmmword ptr [r11+rdx-10H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0C0H], xmm8
+ movdqa xmmword ptr [rsp+0D0H], xmm9
+ movdqa xmmword ptr [rsp+0E0H], xmm12
+ movdqa xmmword ptr [rsp+0F0H], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
+ movdqa xmm12, xmmword ptr [rsp+110H]
+ movdqa xmm13, xmmword ptr [rsp+120H]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 00H
+ prefetcht0 byte ptr [r8+rdx+80H]
+ prefetcht0 byte ptr [r9+rdx+80H]
+ prefetcht0 byte ptr [r10+rdx+80H]
+ prefetcht0 byte ptr [r11+rdx+80H]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+20H]
+ paddd xmm2, xmmword ptr [rsp+40H]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+10H]
+ paddd xmm1, xmmword ptr [rsp+30H]
+ paddd xmm2, xmmword ptr [rsp+50H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+80H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp+0C0H]
+ paddd xmm3, xmmword ptr [rsp+0E0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+90H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+0D0H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+20H]
+ paddd xmm1, xmmword ptr [rsp+30H]
+ paddd xmm2, xmmword ptr [rsp+70H]
+ paddd xmm3, xmmword ptr [rsp+40H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+60H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0D0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+10H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+90H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0B0H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp+0E0H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+30H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp+0D0H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+40H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+20H]
+ paddd xmm3, xmmword ptr [rsp+0E0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+60H]
+ paddd xmm1, xmmword ptr [rsp+90H]
+ paddd xmm2, xmmword ptr [rsp+0B0H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+50H]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0F0H]
+ paddd xmm3, xmmword ptr [rsp+10H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0A0H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+0E0H]
+ paddd xmm3, xmmword ptr [rsp+0D0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+70H]
+ paddd xmm1, xmmword ptr [rsp+90H]
+ paddd xmm2, xmmword ptr [rsp+30H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+40H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+50H]
+ paddd xmm3, xmmword ptr [rsp+10H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+20H]
+ paddd xmm2, xmmword ptr [rsp+80H]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0C0H]
+ paddd xmm1, xmmword ptr [rsp+90H]
+ paddd xmm2, xmmword ptr [rsp+0F0H]
+ paddd xmm3, xmmword ptr [rsp+0E0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0D0H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+0A0H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+70H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+20H]
+ paddd xmm1, xmmword ptr [rsp+30H]
+ paddd xmm2, xmmword ptr [rsp+10H]
+ paddd xmm3, xmmword ptr [rsp+40H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+90H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+80H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0E0H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp+0C0H]
+ paddd xmm3, xmmword ptr [rsp+10H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0D0H]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+20H]
+ paddd xmm3, xmmword ptr [rsp+40H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+30H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp+60H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0B0H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp+10H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0F0H]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+90H]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0E0H]
+ paddd xmm1, xmmword ptr [rsp+20H]
+ paddd xmm2, xmmword ptr [rsp+30H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0A0H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+40H]
+ paddd xmm3, xmmword ptr [rsp+0D0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne innerloop4
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+20H], xmm1
+ movdqu xmmword ptr [rbx+40H], xmm9
+ movdqu xmmword ptr [rbx+60H], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+10H], xmm4
+ movdqu xmmword ptr [rbx+30H], xmm5
+ movdqu xmmword ptr [rbx+50H], xmm9
+ movdqu xmmword ptr [rbx+70H], xmm7
+ movdqa xmm1, xmmword ptr [rsp+110H]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+150H]
+ movdqa xmmword ptr [rsp+110H], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+120H]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+120H], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc outerloop4
+ test rsi, rsi
+ jne final3blocks
+unwind:
+ movdqa xmm6, xmmword ptr [rsp+170H]
+ movdqa xmm7, xmmword ptr [rsp+180H]
+ movdqa xmm8, xmmword ptr [rsp+190H]
+ movdqa xmm9, xmmword ptr [rsp+1A0H]
+ movdqa xmm10, xmmword ptr [rsp+1B0H]
+ movdqa xmm11, xmmword ptr [rsp+1C0H]
+ movdqa xmm12, xmmword ptr [rsp+1D0H]
+ movdqa xmm13, xmmword ptr [rsp+1E0H]
+ movdqa xmm14, xmmword ptr [rsp+1F0H]
+ movdqa xmm15, xmmword ptr [rsp+200H]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+ALIGN 16
+final3blocks:
+ test esi, 2H
+ je final1block
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+10H]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+110H]
+ movd xmm14, dword ptr [rsp+120H]
+ punpckldq xmm13, xmm14
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+114H]
+ movd xmm13, dword ptr [rsp+124H]
+ punpckldq xmm14, xmm13
+ movaps xmmword ptr [rsp+10H], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+innerloop2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-40H]
+ movups xmm5, xmmword ptr [r8+rdx-30H]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-20H]
+ movups xmm7, xmmword ptr [r8+rdx-10H]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 93H
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 93H
+ movups xmm12, xmmword ptr [r9+rdx-40H]
+ movups xmm13, xmmword ptr [r9+rdx-30H]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-20H]
+ movups xmm15, xmmword ptr [r9+rdx-10H]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 93H
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 93H
+ shl rax, 20H
+ or rax, 40H
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+20H], xmm3
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+10H]
+ punpcklqdq xmm3, xmmword ptr [rsp+20H]
+ punpcklqdq xmm11, xmmword ptr [rsp+20H]
+ mov al, 7
+roundloop2:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+20H], xmm4
+ movaps xmmword ptr [rsp+30H], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0B1H
+ pshufhw xmm3, xmm3, 0B1H
+ pshuflw xmm11, xmm11, 0B1H
+ pshufhw xmm11, xmm11, 0B1H
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+40H], xmm5
+ movaps xmmword ptr [rsp+50H], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 93H
+ pshufd xmm8, xmm8, 93H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm11, xmm11, 4EH
+ pshufd xmm2, xmm2, 39H
+ pshufd xmm10, xmm10, 39H
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0B1H
+ pshufhw xmm3, xmm3, 0B1H
+ pshuflw xmm11, xmm11, 0B1H
+ pshufhw xmm11, xmm11, 0B1H
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 39H
+ pshufd xmm8, xmm8, 39H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm11, xmm11, 4EH
+ pshufd xmm2, xmm2, 93H
+ pshufd xmm10, xmm10, 93H
+ dec al
+ je endroundloop2
+ movdqa xmm12, xmmword ptr [rsp+20H]
+ movdqa xmm5, xmmword ptr [rsp+40H]
+ pshufd xmm13, xmm12, 0FH
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 39H
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm13, xmm12
+ movdqa xmmword ptr [rsp+20H], xmm13
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ movdqa xmm13, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm12, xmm13
+ pshufd xmm12, xmm12, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmmword ptr [rsp+40H], xmm12
+ movdqa xmm5, xmmword ptr [rsp+30H]
+ movdqa xmm13, xmmword ptr [rsp+50H]
+ pshufd xmm6, xmm5, 0FH
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 39H
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm6, xmm5
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ movdqa xmmword ptr [rsp+30H], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+30H]
+ pshufd xmm5, xmm5, 78H
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 1EH
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+20H]
+ movdqa xmm6, xmmword ptr [rsp+40H]
+ jmp roundloop2
+endroundloop2:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop2
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+10H], xmm1
+ movups xmmword ptr [rbx+20H], xmm8
+ movups xmmword ptr [rbx+30H], xmm9
+ mov eax, dword ptr [rsp+130H]
+ neg eax
+ mov r10d, dword ptr [rsp+110H+8*rax]
+ mov r11d, dword ptr [rsp+120H+8*rax]
+ mov dword ptr [rsp+110H], r10d
+ mov dword ptr [rsp+120H], r11d
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+final1block:
+ test esi, 1H
+ je unwind
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+10H]
+ movd xmm13, dword ptr [rsp+110H]
+ movd xmm14, dword ptr [rsp+120H]
+ punpckldq xmm13, xmm14
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+innerloop1:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
+ movups xmm4, xmmword ptr [r8+rdx-40H]
+ movups xmm5, xmmword ptr [r8+rdx-30H]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-20H]
+ movups xmm7, xmmword ptr [r8+rdx-10H]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 93H
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 93H
+ mov al, 7
+roundloop1:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0B1H
+ pshufhw xmm3, xmm3, 0B1H
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 93H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 39H
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0B1H
+ pshufhw xmm3, xmm3, 0B1H
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 39H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 93H
+ dec al
+ jz endroundloop1
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0FH
+ pshufd xmm4, xmm8, 39H
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp roundloop1
+endroundloop1:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop1
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+10H], xmm1
+ jmp unwind
+_blake3_hash_many_sse2 ENDP
+blake3_hash_many_sse2 ENDP
+
+blake3_compress_in_place_sse2 PROC ; Win64 args: rcx=cv (updated in place), rdx=block, r8b=block_len, r9=counter, 5th stack arg=flags
+_blake3_compress_in_place_sse2 PROC
+ sub rsp, 120 ; 7 x 16-byte save slots + 8 bytes so the movdqa slots are 16-byte aligned
+ movdqa xmmword ptr [rsp], xmm6 ; xmm6-xmm15 are nonvolatile on Win64: save every one this routine writes
+ movdqa xmmword ptr [rsp+10H], xmm7
+ movdqa xmmword ptr [rsp+20H], xmm8
+ movdqa xmmword ptr [rsp+30H], xmm9
+ movdqa xmmword ptr [rsp+40H], xmm11
+ movdqa xmmword ptr [rsp+50H], xmm14
+ movdqa xmmword ptr [rsp+60H], xmm10 ; xmm10 is clobbered by the message permutation below; xmm15 is never touched here, so its slot is reused
+ movups xmm0, xmmword ptr [rcx] ; row 0 = cv[0..3]
+ movups xmm1, xmmword ptr [rcx+10H] ; row 1 = cv[4..7]
+ movaps xmm2, xmmword ptr [BLAKE3_IV] ; row 2 = IV
+ movzx eax, byte ptr [rsp+0A0H] ; flags: 120 (frame) + 8 (return address) + 32 (shadow space) = 0A0H
+ movzx r8d, r8b
+ shl rax, 32
+ add r8, rax ; r8 = block_len | (flags << 32)
+ movq xmm3, r9
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 ; row 3 = counter_lo, counter_hi, block_len, flags
+ movups xmm4, xmmword ptr [rdx]
+ movups xmm5, xmmword ptr [rdx+10H]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 ; even-indexed message words of the first 32 bytes
+ shufps xmm8, xmm5, 221 ; odd-indexed message words of the first 32 bytes
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+20H]
+ movups xmm7, xmmword ptr [rdx+30H]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 93H
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 93H
+ mov al, 7 ; round counter
+@@:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of every dword: rotate by 16
+ pshufhw xmm3, xmm3, 0B1H
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 ; rotate right by 12
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14 ; rotate right by 8
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 ; rotate right by 7
+ pshufd xmm0, xmm0, 93H ; diagonalize rows 0, 3, 2 (row 1 stays put)
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 39H
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0B1H
+ pshufhw xmm3, xmm3, 0B1H
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 39H ; undiagonalize
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 93H
+ dec al
+ jz @F ; after the 7th round skip the message permutation
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0FH
+ pshufd xmm4, xmm8, 39H
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] ; pand/pand/por stands in for a word blend with immediate 0CCH
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6 ; scratch use of nonvolatile xmm10 (saved in the prologue)
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] ; pand/pand/por stands in for a word blend with immediate 0C0H
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp @B
+@@:
+ pxor xmm0, xmm2 ; new cv[0..3] = row 0 ^ row 2
+ pxor xmm1, xmm3 ; new cv[4..7] = row 1 ^ row 3
+ movups xmmword ptr [rcx], xmm0
+ movups xmmword ptr [rcx+10H], xmm1
+ movdqa xmm6, xmmword ptr [rsp]
+ movdqa xmm7, xmmword ptr [rsp+10H]
+ movdqa xmm8, xmmword ptr [rsp+20H]
+ movdqa xmm9, xmmword ptr [rsp+30H]
+ movdqa xmm11, xmmword ptr [rsp+40H]
+ movdqa xmm14, xmmword ptr [rsp+50H]
+ movdqa xmm10, xmmword ptr [rsp+60H] ; restore xmm10 to honor the Win64 nonvolatile-register contract
+ add rsp, 120
+ ret
+_blake3_compress_in_place_sse2 ENDP
+blake3_compress_in_place_sse2 ENDP
+
+ALIGN 16
+blake3_compress_xof_sse2 PROC ; Win64 args: rcx=cv (read only), rdx=block, r8b=block_len, r9=counter, stack args: flags, out (64 bytes written)
+_blake3_compress_xof_sse2 PROC
+ sub rsp, 120 ; 7 x 16-byte save slots + 8 bytes so the movdqa slots are 16-byte aligned
+ movdqa xmmword ptr [rsp], xmm6 ; xmm6-xmm15 are nonvolatile on Win64: save every one this routine writes
+ movdqa xmmword ptr [rsp+10H], xmm7
+ movdqa xmmword ptr [rsp+20H], xmm8
+ movdqa xmmword ptr [rsp+30H], xmm9
+ movdqa xmmword ptr [rsp+40H], xmm11
+ movdqa xmmword ptr [rsp+50H], xmm14
+ movdqa xmmword ptr [rsp+60H], xmm10 ; xmm10 is clobbered by the message permutation below; xmm15 is never touched here, so its slot is reused
+ movups xmm0, xmmword ptr [rcx] ; row 0 = cv[0..3]
+ movups xmm1, xmmword ptr [rcx+10H] ; row 1 = cv[4..7]
+ movaps xmm2, xmmword ptr [BLAKE3_IV] ; row 2 = IV
+ movzx eax, byte ptr [rsp+0A0H] ; flags: 120 (frame) + 8 (return address) + 32 (shadow space) = 0A0H
+ movzx r8d, r8b
+ mov r10, qword ptr [rsp+0A8H] ; out pointer: the stack argument after flags
+ shl rax, 32
+ add r8, rax ; r8 = block_len | (flags << 32)
+ movq xmm3, r9
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 ; row 3 = counter_lo, counter_hi, block_len, flags
+ movups xmm4, xmmword ptr [rdx]
+ movups xmm5, xmmword ptr [rdx+10H]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 ; even-indexed message words of the first 32 bytes
+ shufps xmm8, xmm5, 221 ; odd-indexed message words of the first 32 bytes
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+20H]
+ movups xmm7, xmmword ptr [rdx+30H]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 93H
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 93H
+ mov al, 7 ; round counter
+@@:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of every dword: rotate by 16
+ pshufhw xmm3, xmm3, 0B1H
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 ; rotate right by 12
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14 ; rotate right by 8
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 ; rotate right by 7
+ pshufd xmm0, xmm0, 93H ; diagonalize rows 0, 3, 2 (row 1 stays put)
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 39H
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0B1H
+ pshufhw xmm3, xmm3, 0B1H
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 39H ; undiagonalize
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 93H
+ dec al
+ jz @F ; after the 7th round skip the message permutation
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0FH
+ pshufd xmm4, xmm8, 39H
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] ; pand/pand/por stands in for a word blend with immediate 0CCH
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6 ; scratch use of nonvolatile xmm10 (saved in the prologue)
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] ; pand/pand/por stands in for a word blend with immediate 0C0H
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp @B
+@@:
+ movdqu xmm4, xmmword ptr [rcx] ; reload the input cv for the second half of the output
+ movdqu xmm5, xmmword ptr [rcx+10H]
+ pxor xmm0, xmm2 ; out[0..15] = row 0 ^ row 2
+ pxor xmm1, xmm3 ; out[16..31] = row 1 ^ row 3
+ pxor xmm2, xmm4 ; out[32..47] = row 2 ^ cv[0..3]
+ pxor xmm3, xmm5 ; out[48..63] = row 3 ^ cv[4..7]
+ movups xmmword ptr [r10], xmm0
+ movups xmmword ptr [r10+10H], xmm1
+ movups xmmword ptr [r10+20H], xmm2
+ movups xmmword ptr [r10+30H], xmm3
+ movdqa xmm6, xmmword ptr [rsp]
+ movdqa xmm7, xmmword ptr [rsp+10H]
+ movdqa xmm8, xmmword ptr [rsp+20H]
+ movdqa xmm9, xmmword ptr [rsp+30H]
+ movdqa xmm11, xmmword ptr [rsp+40H]
+ movdqa xmm14, xmmword ptr [rsp+50H]
+ movdqa xmm10, xmmword ptr [rsp+60H] ; restore xmm10 to honor the Win64 nonvolatile-register contract
+ add rsp, 120
+ ret
+_blake3_compress_xof_sse2 ENDP
+blake3_compress_xof_sse2 ENDP
+
+_TEXT ENDS
+
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constants referenced by the SSE2 routines in _TEXT
+ALIGN 64
+BLAKE3_IV: ; four IV words, loaded as row 2 of the state by the single-block compress routines
+ dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
+
+ADD0: ; lane indices 0..3 (NOTE(review): presumably per-lane counter offsets for hash_many - confirm against its prologue)
+ dd 0, 1, 2, 3
+
+ADD1: ; the value 4 in every lane
+ dd 4 dup (4)
+
+BLAKE3_IV_0: ; first IV word broadcast to all four lanes
+ dd 4 dup (6A09E667H)
+
+BLAKE3_IV_1: ; second IV word broadcast to all four lanes
+ dd 4 dup (0BB67AE85H)
+
+BLAKE3_IV_2: ; third IV word broadcast to all four lanes
+ dd 4 dup (3C6EF372H)
+
+BLAKE3_IV_3: ; fourth IV word broadcast to all four lanes
+ dd 4 dup (0A54FF53AH)
+
+BLAKE3_BLOCK_LEN: ; the value 64 in every lane
+ dd 4 dup (64)
+
+CMP_MSB_MASK: ; sign-bit mask: hash_many XORs both operands with it before pcmpgtd when updating the counter
+ dd 8 dup(80000000H)
+
+PBLENDW_0x33_MASK: ; dword masks used with pand/por in the message permutation: keeps dwords 0 and 2
+ dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
+PBLENDW_0xCC_MASK: ; keeps dwords 1 and 3
+ dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
+PBLENDW_0x3F_MASK: ; keeps dwords 0, 1 and 2
+ dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
+PBLENDW_0xC0_MASK: ; keeps dword 3
+ dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
+
+_RDATA ENDS
+END
diff --git a/deps/blake3/blake3_sse41.c b/deps/blake3/blake3_sse41.c
@@ -0,0 +1,559 @@
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
+#define _mm_shuffle_ps2(a, b, c) \
+ (_mm_castps_si128( \
+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) // _mm_shuffle_ps on __m128i operands: cast to float vectors, shuffle, cast back
+
+INLINE __m128i loadu(const uint8_t src[16]) { // read 16 bytes from src using the unaligned-load intrinsic
+ return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, uint8_t dest[16]) { // write the 16 bytes of src to dest using the unaligned-store intrinsic
+ _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } // lane-wise 32-bit addition
+
+// Bitwise XOR of two vectors. Named xorv because clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } // broadcast x to all four 32-bit lanes
+
+INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { // build a vector with a in lane 0 through d in lane 3
+ return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+INLINE __m128i rot16(__m128i x) { // rotate every 32-bit lane by 16 bits with a single byte shuffle
+ return _mm_shuffle_epi8(
+ x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // swaps the two 16-bit halves of each lane
+}
+
+INLINE __m128i rot12(__m128i x) { // rotate every 32-bit lane right by 12 bits
+ return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); // the shifted halves share no bits, so XOR merges them like OR
+}
+
+INLINE __m128i rot8(__m128i x) { // rotate every 32-bit lane right by 8 bits with a single byte shuffle
+ return _mm_shuffle_epi8(
+ x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); // each lane's byte 0 moves to byte 3, the others move down one
+}
+
+INLINE __m128i rot7(__m128i x) { // rotate every 32-bit lane right by 7 bits
+ return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); // the shifted halves share no bits, so XOR merges them like OR
+}
+
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+ __m128i m) {
+ // First half of the mixing step on four lanes at once:
+ // row0 += m + row1; row3 = rot16(row3 ^ row0); row2 += row3; row1 = rot12(row1 ^ row2).
+ *row0 = addv(*row1, addv(m, *row0));
+ *row3 = rot16(xorv(*row0, *row3));
+ *row2 = addv(*row3, *row2);
+ *row1 = rot12(xorv(*row2, *row1));
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+ __m128i m) {
+ // Second half of the mixing step on four lanes at once:
+ // row0 += m + row1; row3 = rot8(row3 ^ row0); row2 += row3; row1 = rot7(row1 ^ row2).
+ *row0 = addv(*row1, addv(m, *row0));
+ *row3 = rot8(xorv(*row0, *row3));
+ *row2 = addv(*row3, *row2);
+ *row1 = rot7(xorv(*row2, *row1));
+}
+
+// Rotate the lanes of rows 0, 2 and 3 so the next g1/g2 pair mixes diagonals.
+// row1 (not row0) is the row left unrotated; the message loads are arranged to
+// compensate. See discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); // lane i -> lane i+1
+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); // lane i -> lane i-1
+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); // swap the 64-bit halves
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); // inverse of diagonalize: lane i -> lane i+1
+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); // swapping halves is its own inverse
+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); // lane i -> lane i-1
+}
+
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], // runs all seven rounds and leaves the state in rows[]; callers apply the final XORs
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags) {
+ rows[0] = loadu((uint8_t *)&cv[0]); // rows 0-1: the input chaining value
+ rows[1] = loadu((uint8_t *)&cv[4]);
+ rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); // row 2: first four IV words
+ rows[3] = set4(counter_low(counter), counter_high(counter),
+ (uint32_t)block_len, (uint32_t)flags); // row 3: counter, block length, flags
+
+ __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); // the 64-byte block as four unaligned 16-byte loads
+ __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
+ __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
+ __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
+
+ __m128i t0, t1, t2, t3, tt;
+
+ // Round 1. The first round permutes the message words from the original
+ // input order, into the groups that get mixed in parallel.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+ m0 = t0; // the permuted words become the input to the next round's permutation
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 2. This round and all following rounds apply a fixed permutation
+ // to the message words from the round before.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 3
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 4
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 5
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 6
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 7. Last round: the permuted words are not carried forward.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags) {
+ __m128i state[4]; // the four state rows produced by compress_pre
+ compress_pre(state, cv, block, block_len, counter, flags);
+ storeu(xorv(state[0], state[2]), (uint8_t *)cv); // cv[0..3] = row 0 ^ row 2
+ storeu(xorv(state[1], state[3]), (uint8_t *)(cv + 4)); // cv[4..7] = row 1 ^ row 3
+}
+
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags, uint8_t out[64]) {
+ __m128i state[4]; // the four state rows produced by compress_pre
+ compress_pre(state, cv, block, block_len, counter, flags);
+ storeu(xorv(state[0], state[2]), out); // out[0..15] = row 0 ^ row 2
+ storeu(xorv(state[1], state[3]), out + 16); // out[16..31] = row 1 ^ row 3
+ storeu(xorv(state[2], loadu((const uint8_t *)cv)), out + 32); // out[32..47] = row 2 ^ cv[0..3]
+ storeu(xorv(state[3], loadu((const uint8_t *)(cv + 4))), out + 48); // out[48..63] = row 3 ^ cv[4..7]
+}
+
+INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { // One mixing round over the 16-vector state v; r selects the row of MSG_SCHEDULE used to index m.
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // Column step, first half: add message vectors at schedule slots 0, 2, 4, 6.
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]); // a += b, with a = v[0..3], b = v[4..7], c = v[8..11], d = v[12..15].
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]); // d ^= a
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]); // d = rot16(d); the rot* helpers are defined outside this function.
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]); // c += d
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]); // b ^= c
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]); // b = rot12(b)
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // Column step, second half: schedule slots 1, 3, 5, 7.
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]); // a += b
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]); // d ^= a
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]); // d = rot8(d)
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]); // c += d
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]); // b ^= c
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]); // b = rot7(b)
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); // Diagonal step, first half: schedule slots 8, 10, 12, 14; groups are (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]); // a += b along each diagonal
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]); // d ^= a
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]); // d = rot16(d)
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]); // c += d
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]); // b ^= c
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]); // b = rot12(b)
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); // Diagonal step, second half: schedule slots 9, 11, 13, 15.
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]); // a += b
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]); // d ^= a
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]); // d = rot8(d)
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]); // c += d
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]); // b ^= c
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]); // b = rot7(b)
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+// Transpose, in place, the 4x4 matrix of 32-bit words held in vecs[0..3].
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Stage 1: interleave 32-bit lanes of each pair of rows. The low unpack
+  // yields lanes 0 and 1 of both rows, the high unpack lanes 2 and 3. Unlike
+  // the AVX2 counterparts, this does not split the vector into two lanes.
+  const __m128i rows01_lo = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  const __m128i rows01_hi = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  const __m128i rows23_lo = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  const __m128i rows23_hi = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Stage 2: interleave 64-bit halves so each result holds one column of
+  // the original matrix. All inputs were read above, so writing straight
+  // back into vecs is safe.
+  vecs[0] = _mm_unpacklo_epi64(rows01_lo, rows23_lo);
+  vecs[1] = _mm_unpackhi_epi64(rows01_lo, rows23_lo);
+  vecs[2] = _mm_unpacklo_epi64(rows01_hi, rows23_hi);
+  vecs[3] = _mm_unpackhi_epi64(rows01_hi, rows23_hi);
+}
+
+// Load the 64-byte block at `block_offset` from each of the four inputs and
+// transpose it, so that on return out[k] holds 32-bit message word k of all
+// four inputs (lane i belongs to inputs[i]).
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m128i out[16]) {
+  // out[4 * q + i] receives 16-byte quarter q of input i's block.
+  for (size_t q = 0; q < 4; ++q) {
+    for (size_t i = 0; i < 4; ++i) {
+      out[4 * q + i] = loadu(&inputs[i][block_offset + q * sizeof(__m128i)]);
+    }
+  }
+  for (size_t i = 0; i < 4; ++i) {
+    // _mm_prefetch is documented as taking `char const *`; cast explicitly
+    // so the `const uint8_t *` argument does not trigger pointer-type
+    // diagnostics on toolchains where it is a real function.
+    _mm_prefetch((const char *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  // Each group of four vectors is an independent 4x4 word matrix.
+  for (size_t q = 0; q < 4; ++q) {
+    transpose_vecs(&out[4 * q]);
+  }
+}
+
+// Build the per-lane 64-bit block counters, split into 32-bit halves.
+// With increment_counter set, lane i (i = 0..3) gets counter + i; otherwise
+// every lane gets counter. *out_lo receives the low words, *out_hi the high
+// words including any carry out of the low-word addition.
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m128i *out_lo, __m128i *out_hi) {
+  // All-ones when incrementing, all-zeros otherwise.
+  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
+  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
+  const __m128i add1 = _mm_and_si128(mask, add0);
+  // Truncating to the low 32 bits is intentional; make the narrowing explicit.
+  const __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
+  // Only a signed 32-bit compare is used here. Flipping the sign bit of both
+  // operands turns it into an unsigned compare: a lane wrapped around
+  // exactly when its addend is (unsigned) greater than the sum.
+  const __m128i sign_bit = _mm_set1_epi32(INT32_MIN);
+  const __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, sign_bit),
+                                        _mm_xor_si128(l, sign_bit));
+  // Carry lanes are all-ones (-1), so subtracting them adds one to the high word.
+  const __m128i h =
+      _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
+  *out_lo = l;
+  *out_hi = h;
+}
+
+// Hash four inputs of `blocks` blocks each in parallel, one input per 32-bit
+// lane. Every lane starts from `key`; `flags_start` is OR-ed into the flags of
+// the first block and `flags_end` into those of the last. The four results
+// are written back to back to `out` (8 * sizeof(__m128i) bytes in total).
+void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
+                        const uint32_t key[8], uint64_t counter,
+                        bool increment_counter, uint8_t flags,
+                        uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  // Broadcast each key word so all lanes share the initial chaining value.
+  __m128i h_vecs[8];
+  for (size_t w = 0; w < 8; w++) {
+    h_vecs[w] = set1(key[w]);
+  }
+
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+
+  uint8_t block_flags = flags | flags_start;
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+
+    __m128i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    // State layout: chaining value, first four IV words, counter low/high,
+    // block length, flags.
+    __m128i v[16];
+    for (size_t w = 0; w < 8; w++) {
+      v[w] = h_vecs[w];
+    }
+    v[8] = set1(IV[0]);
+    v[9] = set1(IV[1]);
+    v[10] = set1(IV[2]);
+    v[11] = set1(IV[3]);
+    v[12] = counter_low_vec;
+    v[13] = counter_high_vec;
+    v[14] = set1(BLAKE3_BLOCK_LEN);
+    v[15] = set1(block_flags);
+
+    for (size_t r = 0; r < 7; r++) {
+      round_fn(v, msg_vecs, r);
+    }
+
+    // Feed-forward: fold the upper half of the state into the lower half.
+    for (size_t w = 0; w < 8; w++) {
+      h_vecs[w] = xorv(v[w], v[w + 8]);
+    }
+
+    // Only the first block carries flags_start.
+    block_flags = flags;
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // After the transposes, h_vecs[i] is the first half of output i and
+  // h_vecs[i + 4] its second half; emit them interleaved.
+  for (size_t lane = 0; lane < 4; lane++) {
+    uint8_t *dst = &out[2 * lane * sizeof(__m128i)];
+    storeu(h_vecs[lane], dst);
+    storeu(h_vecs[lane + 4], dst + sizeof(__m128i));
+  }
+}
+
+// Hash a single input of `blocks` consecutive blocks, starting from `key`,
+// and copy the final chaining value to `out`. `flags_start` applies to the
+// first block only and `flags_end` to the last block only.
+INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
+                           const uint32_t key[8], uint64_t counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+
+  uint8_t block_flags = flags | flags_start;
+  for (size_t remaining = blocks; remaining > 0; remaining--) {
+    if (remaining == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                   block_flags);
+    input += BLAKE3_BLOCK_LEN;
+    // Blocks after the first no longer carry flags_start.
+    block_flags = flags;
+  }
+
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+// Hash `num_inputs` inputs of `blocks` blocks each, writing BLAKE3_OUT_LEN
+// bytes per input to `out` in order. Inputs are consumed DEGREE at a time by
+// the parallel routine; any remainder is hashed one input at a time. When
+// `increment_counter` is set, the counter advances by one per input.
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out) {
+  // Full groups of DEGREE inputs.
+  for (; num_inputs >= DEGREE; num_inputs -= DEGREE) {
+    blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
+                       flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    out += DEGREE * BLAKE3_OUT_LEN;
+  }
+
+  // Leftover inputs, fewer than DEGREE of them.
+  for (; num_inputs > 0; num_inputs--) {
+    hash_one_sse41(*inputs, blocks, key, counter, flags, flags_start,
+                   flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs++;
+    out += BLAKE3_OUT_LEN;
+  }
+}
diff --git a/deps/blake3/blake3_sse41_x86-64_unix.S b/deps/blake3/blake3_sse41_x86-64_unix.S
@@ -0,0 +1,2028 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global blake3_hash_many_sse41
+.global _blake3_hash_many_sse41
+.global blake3_compress_in_place_sse41
+.global _blake3_compress_in_place_sse41
+.global blake3_compress_xof_sse41
+.global _blake3_compress_xof_sse41
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+ .p2align 6
+_blake3_hash_many_sse41:
+blake3_hash_many_sse41:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ pinsrd xmm3, eax, 3
+ pinsrd xmm11, eax, 3
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm12, xmmword ptr [ROT16+rip]
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm13, xmmword ptr [ROT8+rip]
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pblendw xmm13, xmm12, 0xCC
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ pblendw xmm12, xmm6, 0xC0
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pblendw xmm6, xmm5, 0xCC
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ pblendw xmm5, xmm14, 0xC0
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ movdqa xmm0, xmmword ptr [rsp+0x130]
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm2, xmmword ptr [rsp+0x120]
+ movdqu xmm3, xmmword ptr [rsp+0x118]
+ movdqu xmm4, xmmword ptr [rsp+0x128]
+ blendvps xmm1, xmm3, xmm0
+ blendvps xmm2, xmm4, xmm0
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm3, xmm13
+ pinsrd xmm3, eax, 3
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+/*
+ * blake3_compress_in_place_sse41 -- run the 7-round compression below on one
+ * 64-byte block and overwrite the 32-byte chaining value with the result.
+ *
+ * Registers on entry, as consumed by this code:
+ *   rdi  pointer to the 32-byte chaining value (read, then overwritten)
+ *   rsi  pointer to the 64-byte message block (unaligned loads are used)
+ *   dl   byte placed in lane 2 of the fourth state row
+ *        (presumably the block length -- confirm against the C prototype)
+ *   rcx  64-bit value placed in lanes 0-1 of the fourth state row
+ *        (presumably the block counter -- confirm against the C prototype)
+ *   r8b  byte placed in lane 3 of the fourth state row
+ *        (presumably the flags byte -- confirm against the C prototype)
+ *
+ * Clobbers rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15 and the flags.
+ */
+.p2align 6
+blake3_compress_in_place_sse41:
+_blake3_compress_in_place_sse41:
+ _CET_ENDBR
+ /* State rows 0-1 are the incoming chaining value, row 2 is the IV. */
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /*
+  * Build the upper half of row 3 as (dl, r8b). Only the low byte of each
+  * register is taken, exactly as blake3_compress_xof_sse41 does: narrow
+  * integer arguments may arrive with unspecified upper bits, and stale bits
+  * would either land directly in these lanes or carry from lane 2 into
+  * lane 3 during the add.
+  */
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ /* Row 3 = { rcx (64 bits), dl, r8b }. */
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ /* Message words 0-7: xmm4 = even-indexed words, xmm5 = odd-indexed words. */
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ /*
+  * Message words 8-15, split even/odd the same way and then rotated by one
+  * lane (pshufd 0x93) to line up with the rotated rows of the second
+  * half-round.
+  */
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ /* Keep both byte-rotation masks in registers for the whole loop. */
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ /* al counts the remaining rounds. */
+ mov al, 7
+9:
+ /* First half-round, step 1: add message, rotate row 3 by 16, row 1 right by 12. */
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* First half-round, step 2: add message, rotate row 3 right by 8, row 1 right by 7. */
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Rotate the lanes of rows 0, 3 and 2 so the next half-round mixes diagonals. */
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ /* Second half-round, step 1 (same operations, message vector xmm6). */
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* Second half-round, step 2 (message vector xmm7). */
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Undo the lane rotation so the rows are column-aligned again. */
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ /* Leave after the seventh round; no message shuffle is needed then. */
+ dec al
+ jz 9f
+ /* Re-shuffle the four message vectors (xmm4-xmm7) for the next round. */
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ /* New chaining value = (row0 ^ row2, row1 ^ row3), stored back through rdi. */
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ movups xmmword ptr [rdi], xmm0
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+
+/*
+ * blake3_compress_xof_sse41 -- run the 7-round compression below on one
+ * 64-byte block and write a 64-byte result; the input chaining value is only
+ * read, never modified.
+ *
+ * Registers on entry, as consumed by this code:
+ *   rdi  pointer to the 32-byte chaining value (read twice, not written)
+ *   rsi  pointer to the 64-byte message block (unaligned loads are used)
+ *   dl   byte placed in lane 2 of the fourth state row
+ *        (presumably the block length -- confirm against the C prototype)
+ *   rcx  64-bit value placed in lanes 0-1 of the fourth state row
+ *        (presumably the block counter -- confirm against the C prototype)
+ *   r8b  byte placed in lane 3 of the fourth state row
+ *        (presumably the flags byte -- confirm against the C prototype)
+ *   r9   pointer to the 64-byte output buffer (unaligned stores are used)
+ *
+ * Clobbers rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15 and the flags.
+ */
+.p2align 6
+blake3_compress_xof_sse41:
+_blake3_compress_xof_sse41:
+ _CET_ENDBR
+ /* State rows 0-1 are the incoming chaining value, row 2 is the IV. */
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* Zero-extend both byte arguments, then pack them as rdx = dl | (r8b << 32). */
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ /* Row 3 = { rcx (64 bits), dl, r8b }. */
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ /* Message words 0-7: xmm4 = even-indexed words, xmm5 = odd-indexed words. */
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ /*
+  * Message words 8-15, split even/odd the same way and then rotated by one
+  * lane (pshufd 0x93) to line up with the rotated rows of the second
+  * half-round.
+  */
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ /* Keep both byte-rotation masks in registers for the whole loop. */
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ /* al counts the remaining rounds (this overwrites the low byte of rax only). */
+ mov al, 7
+9:
+ /* First half-round, step 1: add message, rotate row 3 by 16, row 1 right by 12. */
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* First half-round, step 2: add message, rotate row 3 right by 8, row 1 right by 7. */
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Rotate the lanes of rows 0, 3 and 2 so the next half-round mixes diagonals. */
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ /* Second half-round, step 1 (same operations, message vector xmm6). */
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* Second half-round, step 2 (message vector xmm7). */
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Undo the lane rotation so the rows are column-aligned again. */
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ /* Leave after the seventh round; no message shuffle is needed then. */
+ dec al
+ jz 9f
+ /* Re-shuffle the four message vectors (xmm4-xmm7) for the next round. */
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ /* Reload the untouched input chaining value for the upper output half. */
+ movdqu xmm4, xmmword ptr [rdi]
+ movdqu xmm5, xmmword ptr [rdi+0x10]
+ /* Lower 32 output bytes: row0 ^ row2 and row1 ^ row3. */
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ /* Upper 32 output bytes: row2 ^ cv[0..3] and row3 ^ cv[4..7]. */
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movups xmmword ptr [r9], xmm0
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+
+/*
+ * Read-only constant tables referenced RIP-relative by the routines above.
+ * The section starts 64-byte aligned and every table is exactly 16 bytes, so
+ * each label stays 16-byte aligned, which the aligned SSE memory operands
+ * (movaps/movdqa/pxor) that reference them require.
+ */
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6
+/* Four 32-bit IV words; loaded as the third state row by the compress routines. */
+BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+/* pshufb mask: rotates every 32-bit lane by 16 bits. */
+ROT16:
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
+ROT8:
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+/* NOTE(review): looks like per-lane counter offsets 0..3; its use is outside this excerpt. */
+ADD0:
+ .long 0, 1, 2, 3
+/* NOTE(review): looks like the counter step for a four-lane batch; its use is outside this excerpt. */
+ADD1:
+ .long 4, 4, 4, 4
+/* The four IV words again, each one broadcast to all four lanes. */
+BLAKE3_IV_0:
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+/* The value 64 in every lane; also read as a single dword via pinsrd. */
+BLAKE3_BLOCK_LEN:
+ .long 64, 64, 64, 64
+/*
+ * Sign-bit mask. XORing it into both operands before pcmpgtd makes the signed
+ * compare order the lanes as unsigned values; the hash_many code uses that to
+ * detect wrap-around of the low counter word.
+ */
+CMP_MSB_MASK:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
diff --git a/deps/blake3/blake3_sse41_x86-64_windows_gnu.S b/deps/blake3/blake3_sse41_x86-64_windows_gnu.S
@@ -0,0 +1,2069 @@
+/*
+ * File preamble: switch the GNU assembler to Intel operand order without
+ * register prefixes, then export the three SSE4.1 entry points. Each one is
+ * exported under its plain name and an underscore-prefixed alias
+ * (presumably so the object links under either C symbol-naming convention --
+ * confirm against the toolchains this file targets).
+ */
+.intel_syntax noprefix
+.global blake3_hash_many_sse41
+.global _blake3_hash_many_sse41
+.global blake3_compress_in_place_sse41
+.global _blake3_compress_in_place_sse41
+.global blake3_compress_xof_sse41
+.global _blake3_compress_xof_sse41
+.section .text
+ .p2align 6
+_blake3_hash_many_sse41:
+blake3_hash_many_sse41:
+ push r15
+ push r14
+ push r13
+ push r12
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 528
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ movdqa xmmword ptr [rsp+0x170], xmm6
+ movdqa xmmword ptr [rsp+0x180], xmm7
+ movdqa xmmword ptr [rsp+0x190], xmm8
+ movdqa xmmword ptr [rsp+0x1A0], xmm9
+ movdqa xmmword ptr [rsp+0x1B0], xmm10
+ movdqa xmmword ptr [rsp+0x1C0], xmm11
+ movdqa xmmword ptr [rsp+0x1D0], xmm12
+ movdqa xmmword ptr [rsp+0x1E0], xmm13
+ movdqa xmmword ptr [rsp+0x1F0], xmm14
+ movdqa xmmword ptr [rsp+0x200], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+0x68]
+ movzx r9, byte ptr [rbp+0x70]
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x90]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x78]
+ movzx r12d, byte ptr [rbp+0x88]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jne 3f
+4:
+ movdqa xmm6, xmmword ptr [rsp+0x170]
+ movdqa xmm7, xmmword ptr [rsp+0x180]
+ movdqa xmm8, xmmword ptr [rsp+0x190]
+ movdqa xmm9, xmmword ptr [rsp+0x1A0]
+ movdqa xmm10, xmmword ptr [rsp+0x1B0]
+ movdqa xmm11, xmmword ptr [rsp+0x1C0]
+ movdqa xmm12, xmmword ptr [rsp+0x1D0]
+ movdqa xmm13, xmmword ptr [rsp+0x1E0]
+ movdqa xmm14, xmmword ptr [rsp+0x1F0]
+ movdqa xmm15, xmmword ptr [rsp+0x200]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ pinsrd xmm3, eax, 3
+ pinsrd xmm11, eax, 3
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm12, xmmword ptr [ROT16+rip]
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm13, xmmword ptr [ROT8+rip]
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pblendw xmm13, xmm12, 0xCC
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ pblendw xmm12, xmm6, 0xC0
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pblendw xmm6, xmm5, 0xCC
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ pblendw xmm5, xmm14, 0xC0
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ movdqa xmm0, xmmword ptr [rsp+0x130]
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm2, xmmword ptr [rsp+0x120]
+ movdqu xmm3, xmmword ptr [rsp+0x118]
+ movdqu xmm4, xmmword ptr [rsp+0x128]
+ blendvps xmm1, xmm3, xmm0
+ blendvps xmm2, xmm4, xmm0
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x80]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm3, xmm13
+ pinsrd xmm3, eax, 3
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.p2align 6 /* Align the entry point to a 64-byte (2^6) boundary. */
+blake3_compress_in_place_sse41: /* Single-block compression: loads the 32-byte state at [rcx] and the 64-byte block at [rdx], runs 7 rounds, and stores the new 32-byte state back to [rcx]. */
+_blake3_compress_in_place_sse41: /* Second label for the same entry point, with a leading underscore. */
+ sub rsp, 120 /* Reserve 7 x 16 bytes for the XMM saves plus 8 more; presumably the extra 8 re-aligns rsp to 16 after the return-address push, as the movdqa saves need - TODO confirm. */
+ movdqa xmmword ptr [rsp], xmm6 /* Save xmm6-xmm9, xmm11, xmm14, xmm15: the registers from xmm6 upward that this routine overwrites (presumably callee-saved under the Win64 ABI - confirm). */
+ movdqa xmmword ptr [rsp+0x10], xmm7
+ movdqa xmmword ptr [rsp+0x20], xmm8
+ movdqa xmmword ptr [rsp+0x30], xmm9
+ movdqa xmmword ptr [rsp+0x40], xmm11
+ movdqa xmmword ptr [rsp+0x50], xmm14
+ movdqa xmmword ptr [rsp+0x60], xmm15
+ movups xmm0, xmmword ptr [rcx] /* Row 0 = state words 0-3 (unaligned load). */
+ movups xmm1, xmmword ptr [rcx+0x10] /* Row 1 = state words 4-7. */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* Row 2 = the four BLAKE3_IV constants. */
+ movzx eax, byte ptr [rsp+0xA0] /* Stack byte at entry-rsp + 0x28 (0xA0 - 120); presumably the 5th argument (flags) - confirm against the C prototype. */
+ movzx r8d, r8b /* Keep only the low byte of r8 (presumably block_len). */
+ shl rax, 32
+ add r8, rax /* r8 = (stack byte << 32) + r8b. */
+ movq xmm3, r9 /* Low 64 bits of xmm3 = r9 (presumably the 64-bit counter). */
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 /* Row 3 = [r9 low, r9 high, r8b, stack byte]. */
+ movups xmm4, xmmword ptr [rdx] /* Block words m0-m3. */
+ movups xmm5, xmmword ptr [rdx+0x10] /* Block words m4-m7. */
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* xmm4 = [m0, m2, m4, m6] (even words). */
+ shufps xmm8, xmm5, 221 /* xmm8 = [m1, m3, m5, m7] (odd words). */
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+0x20] /* Block words m8-m11. */
+ movups xmm7, xmmword ptr [rdx+0x30] /* Block words m12-m15. */
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 /* xmm6 = [m8, m10, m12, m14]. */
+ pshufd xmm6, xmm6, 0x93 /* xmm6 = [m14, m8, m10, m12], pre-rotated to match the lane-rotated rows below. */
+ shufps xmm8, xmm7, 221 /* xmm8 = [m9, m11, m13, m15]. */
+ pshufd xmm7, xmm8, 0x93 /* xmm7 = [m15, m9, m11, m13]. */
+ movaps xmm14, xmmword ptr [ROT8+rip] /* pshufb mask: rotate each dword right by 8 bits. */
+ movaps xmm15, xmmword ptr [ROT16+rip] /* pshufb mask: rotate each dword by 16 bits. */
+ mov al, 7 /* al = round counter (7 rounds); the upper bits of rax are no longer needed. */
+9: /* Top of the round loop. */
+ paddd xmm0, xmm4 /* Row 0 += message vector xmm4 ... */
+ paddd xmm0, xmm1 /* ... + row 1. */
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15 /* Row 3 = (row 3 ^ row 0) rotated by 16. */
+ paddd xmm2, xmm3 /* Row 2 += row 3. */
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 /* xmm11 is scratch for the shift-based rotates. */
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* Row 1 = (row 1 ^ row 2) rotated right by 12. */
+ paddd xmm0, xmm5 /* Second half of the step: row 0 += message vector xmm5 + row 1. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14 /* Row 3 = (row 3 ^ row 0) rotated right by 8. */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* Row 1 = (row 1 ^ row 2) rotated right by 7. */
+ pshufd xmm0, xmm0, 0x93 /* Rotate the lanes of rows 0, 3 and 2 by 3, 2 and 1 positions (row 1 stays) so the next step mixes diagonals of the 4x4 state. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* Same mixing sequence again, now with message vectors xmm6 and xmm7. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15 /* Rotate by 16. */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* Rotate right by 12. */
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14 /* Rotate right by 8. */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* Rotate right by 7. */
+ pshufd xmm0, xmm0, 0x39 /* Undo the lane rotation of rows 0, 3 and 2. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* Leave the loop after the 7th round; the final round skips the message permutation. */
+ movdqa xmm8, xmm4 /* Permute the message vectors xmm4-xmm7 for the next round; xmm8 and xmm9 are scratch. */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39 /* New xmm4. */
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC /* xmm9 holds the new xmm5 (copied below). */
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78 /* xmm8 holds the new xmm6 (copied below). */
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E /* New xmm7. */
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b /* Next round. */
+9: /* All rounds done. */
+ pxor xmm0, xmm2 /* Output words 0-3 = row 0 ^ row 2. */
+ pxor xmm1, xmm3 /* Output words 4-7 = row 1 ^ row 3. */
+ movups xmmword ptr [rcx], xmm0 /* Overwrite the 32-byte state at [rcx] in place. */
+ movups xmmword ptr [rcx+0x10], xmm1
+ movdqa xmm6, xmmword ptr [rsp] /* Restore the XMM registers saved on entry. */
+ movdqa xmm7, xmmword ptr [rsp+0x10]
+ movdqa xmm8, xmmword ptr [rsp+0x20]
+ movdqa xmm9, xmmword ptr [rsp+0x30]
+ movdqa xmm11, xmmword ptr [rsp+0x40]
+ movdqa xmm14, xmmword ptr [rsp+0x50]
+ movdqa xmm15, xmmword ptr [rsp+0x60]
+ add rsp, 120 /* Release the save area. */
+ ret
+
+
+.p2align 6 /* Align the entry point to a 64-byte (2^6) boundary. */
+_blake3_compress_xof_sse41: /* Single-block compression with 64-byte output: loads the 32-byte state at [rcx] and the 64-byte block at [rdx], runs 7 rounds, and stores 64 bytes through the pointer taken from the stack. [rcx] is only read. */
+blake3_compress_xof_sse41: /* Second label for the same entry point, without the leading underscore. */
+ sub rsp, 120 /* Reserve 7 x 16 bytes for the XMM saves plus 8 more; presumably the extra 8 re-aligns rsp to 16 after the return-address push, as the movdqa saves need - TODO confirm. */
+ movdqa xmmword ptr [rsp], xmm6 /* Save xmm6-xmm9, xmm11, xmm14, xmm15: the registers from xmm6 upward that this routine overwrites (presumably callee-saved under the Win64 ABI - confirm). */
+ movdqa xmmword ptr [rsp+0x10], xmm7
+ movdqa xmmword ptr [rsp+0x20], xmm8
+ movdqa xmmword ptr [rsp+0x30], xmm9
+ movdqa xmmword ptr [rsp+0x40], xmm11
+ movdqa xmmword ptr [rsp+0x50], xmm14
+ movdqa xmmword ptr [rsp+0x60], xmm15
+ movups xmm0, xmmword ptr [rcx] /* Row 0 = state words 0-3 (unaligned load). */
+ movups xmm1, xmmword ptr [rcx+0x10] /* Row 1 = state words 4-7. */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* Row 2 = the four BLAKE3_IV constants. */
+ movzx eax, byte ptr [rsp+0xA0] /* Stack byte at entry-rsp + 0x28 (0xA0 - 120); presumably the 5th argument (flags) - confirm against the C prototype. */
+ movzx r8d, r8b /* Keep only the low byte of r8 (presumably block_len). */
+ mov r10, qword ptr [rsp+0xA8] /* r10 = stack qword at entry-rsp + 0x30; used below as the 64-byte output pointer (presumably the 6th argument - confirm). */
+ shl rax, 32
+ add r8, rax /* r8 = (stack byte << 32) + r8b. */
+ movq xmm3, r9 /* Low 64 bits of xmm3 = r9 (presumably the 64-bit counter). */
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 /* Row 3 = [r9 low, r9 high, r8b, stack byte]. */
+ movups xmm4, xmmword ptr [rdx] /* Block words m0-m3. */
+ movups xmm5, xmmword ptr [rdx+0x10] /* Block words m4-m7. */
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* xmm4 = [m0, m2, m4, m6] (even words). */
+ shufps xmm8, xmm5, 221 /* xmm8 = [m1, m3, m5, m7] (odd words). */
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+0x20] /* Block words m8-m11. */
+ movups xmm7, xmmword ptr [rdx+0x30] /* Block words m12-m15. */
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 /* xmm6 = [m8, m10, m12, m14]. */
+ pshufd xmm6, xmm6, 0x93 /* xmm6 = [m14, m8, m10, m12], pre-rotated to match the lane-rotated rows below. */
+ shufps xmm8, xmm7, 221 /* xmm8 = [m9, m11, m13, m15]. */
+ pshufd xmm7, xmm8, 0x93 /* xmm7 = [m15, m9, m11, m13]. */
+ movaps xmm14, xmmword ptr [ROT8+rip] /* pshufb mask: rotate each dword right by 8 bits. */
+ movaps xmm15, xmmword ptr [ROT16+rip] /* pshufb mask: rotate each dword by 16 bits. */
+ mov al, 7 /* al = round counter (7 rounds); the upper bits of rax are no longer needed. */
+9: /* Top of the round loop. */
+ paddd xmm0, xmm4 /* Row 0 += message vector xmm4 ... */
+ paddd xmm0, xmm1 /* ... + row 1. */
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15 /* Row 3 = (row 3 ^ row 0) rotated by 16. */
+ paddd xmm2, xmm3 /* Row 2 += row 3. */
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 /* xmm11 is scratch for the shift-based rotates. */
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* Row 1 = (row 1 ^ row 2) rotated right by 12. */
+ paddd xmm0, xmm5 /* Second half of the step: row 0 += message vector xmm5 + row 1. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14 /* Row 3 = (row 3 ^ row 0) rotated right by 8. */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* Row 1 = (row 1 ^ row 2) rotated right by 7. */
+ pshufd xmm0, xmm0, 0x93 /* Rotate the lanes of rows 0, 3 and 2 by 3, 2 and 1 positions (row 1 stays) so the next step mixes diagonals of the 4x4 state. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* Same mixing sequence again, now with message vectors xmm6 and xmm7. */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15 /* Rotate by 16. */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* Rotate right by 12. */
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14 /* Rotate right by 8. */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* Rotate right by 7. */
+ pshufd xmm0, xmm0, 0x39 /* Undo the lane rotation of rows 0, 3 and 2. */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* Leave the loop after the 7th round; the final round skips the message permutation. */
+ movdqa xmm8, xmm4 /* Permute the message vectors xmm4-xmm7 for the next round; xmm8 and xmm9 are scratch. */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39 /* New xmm4. */
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC /* xmm9 holds the new xmm5 (copied below). */
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78 /* xmm8 holds the new xmm6 (copied below). */
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E /* New xmm7. */
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b /* Next round. */
+9: /* All rounds done. */
+ movdqu xmm4, xmmword ptr [rcx] /* Reload the unmodified input state (the message vectors are dead now). */
+ movdqu xmm5, xmmword ptr [rcx+0x10]
+ pxor xmm0, xmm2 /* Output bytes 0-15 = row 0 ^ row 2. */
+ pxor xmm1, xmm3 /* Output bytes 16-31 = row 1 ^ row 3. */
+ pxor xmm2, xmm4 /* Output bytes 32-47 = row 2 ^ input state words 0-3. */
+ pxor xmm3, xmm5 /* Output bytes 48-63 = row 3 ^ input state words 4-7. */
+ movups xmmword ptr [r10], xmm0 /* Store the 64-byte result through r10 (unaligned stores). */
+ movups xmmword ptr [r10+0x10], xmm1
+ movups xmmword ptr [r10+0x20], xmm2
+ movups xmmword ptr [r10+0x30], xmm3
+ movdqa xmm6, xmmword ptr [rsp] /* Restore the XMM registers saved on entry. */
+ movdqa xmm7, xmmword ptr [rsp+0x10]
+ movdqa xmm8, xmmword ptr [rsp+0x20]
+ movdqa xmm9, xmmword ptr [rsp+0x30]
+ movdqa xmm11, xmmword ptr [rsp+0x40]
+ movdqa xmm14, xmmword ptr [rsp+0x50]
+ movdqa xmm15, xmmword ptr [rsp+0x60]
+ add rsp, 120 /* Release the save area. */
+ ret
+
+
+.section .rodata /* Read-only constant tables; every table below is exactly 16 bytes. */
+.p2align 6 /* 64-byte alignment of the first table keeps each 16-byte table 16-byte aligned for the aligned (movaps) loads. */
+BLAKE3_IV: /* Four 32-bit constants loaded as row 2 of the state by the compress routines. */
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ROT16: /* pshufb mask that swaps the 16-bit halves of each dword (rotate by 16). */
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8: /* pshufb mask that rotates each dword right by 8 bits. */
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+ADD0: /* Lane indices 0..3. Not referenced by the two compress routines; presumably per-lane counter offsets for the 4-way hash-many code - confirm. */
+ .long 0, 1, 2, 3
+ADD1: /* Constant 4 in every lane; presumably the per-iteration counter increment for the 4-way code - confirm. */
+ .long 4, 4, 4, 4
+BLAKE3_IV_0: /* BLAKE3_IV word 0 replicated into all four lanes. */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1: /* BLAKE3_IV word 1 replicated into all four lanes. */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2: /* BLAKE3_IV word 2 replicated into all four lanes. */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3: /* BLAKE3_IV word 3 replicated into all four lanes. */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN: /* The value 64 in every lane. */
+ .long 64, 64, 64, 64
+CMP_MSB_MASK: /* Sign bit of every lane; presumably XORed into operands so the signed pcmpgtd acts as an unsigned compare - confirm. */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
diff --git a/deps/blake3/blake3_sse41_x86-64_windows_msvc.asm b/deps/blake3/blake3_sse41_x86-64_windows_msvc.asm
@@ -0,0 +1,2089 @@
+public _blake3_hash_many_sse41
+public blake3_hash_many_sse41
+public blake3_compress_in_place_sse41
+public _blake3_compress_in_place_sse41
+public blake3_compress_xof_sse41
+public _blake3_compress_xof_sse41
+
+_TEXT SEGMENT ALIGN(16) 'CODE'
+
+ALIGN 16
+blake3_hash_many_sse41 PROC
+_blake3_hash_many_sse41 PROC
+ push r15
+ push r14
+ push r13
+ push r12
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 528
+ and rsp, 0FFFFFFFFFFFFFFC0H
+ movdqa xmmword ptr [rsp+170H], xmm6
+ movdqa xmmword ptr [rsp+180H], xmm7
+ movdqa xmmword ptr [rsp+190H], xmm8
+ movdqa xmmword ptr [rsp+1A0H], xmm9
+ movdqa xmmword ptr [rsp+1B0H], xmm10
+ movdqa xmmword ptr [rsp+1C0H], xmm11
+ movdqa xmmword ptr [rsp+1D0H], xmm12
+ movdqa xmmword ptr [rsp+1E0H], xmm13
+ movdqa xmmword ptr [rsp+1F0H], xmm14
+ movdqa xmmword ptr [rsp+200H], xmm15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, qword ptr [rbp+68H]
+ movzx r9, byte ptr [rbp+70H]
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 00H
+ movdqa xmmword ptr [rsp+130H], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0]
+ pand xmm0, xmmword ptr [ADD1]
+ movdqa xmmword ptr [rsp+150H], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 00H
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+110H], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 00H
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+120H], xmm2
+ mov rbx, qword ptr [rbp+90H]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+78H]
+ movzx r12d, byte ptr [rbp+88H]
+ cmp rsi, 4
+ jc final3blocks
+outerloop4:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 00H
+ pshufd xmm1, xmm3, 55H
+ pshufd xmm2, xmm3, 0AAH
+ pshufd xmm3, xmm3, 0FFH
+ movdqu xmm7, xmmword ptr [rcx+10H]
+ pshufd xmm4, xmm7, 00H
+ pshufd xmm5, xmm7, 55H
+ pshufd xmm6, xmm7, 0AAH
+ pshufd xmm7, xmm7, 0FFH
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ mov r10, qword ptr [rdi+10H]
+ mov r11, qword ptr [rdi+18H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+innerloop4:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-40H]
+ movdqu xmm9, xmmword ptr [r9+rdx-40H]
+ movdqu xmm10, xmmword ptr [r10+rdx-40H]
+ movdqu xmm11, xmmword ptr [r11+rdx-40H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+10H], xmm9
+ movdqa xmmword ptr [rsp+20H], xmm12
+ movdqa xmmword ptr [rsp+30H], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-30H]
+ movdqu xmm9, xmmword ptr [r9+rdx-30H]
+ movdqu xmm10, xmmword ptr [r10+rdx-30H]
+ movdqu xmm11, xmmword ptr [r11+rdx-30H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+40H], xmm8
+ movdqa xmmword ptr [rsp+50H], xmm9
+ movdqa xmmword ptr [rsp+60H], xmm12
+ movdqa xmmword ptr [rsp+70H], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-20H]
+ movdqu xmm9, xmmword ptr [r9+rdx-20H]
+ movdqu xmm10, xmmword ptr [r10+rdx-20H]
+ movdqu xmm11, xmmword ptr [r11+rdx-20H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+80H], xmm8
+ movdqa xmmword ptr [rsp+90H], xmm9
+ movdqa xmmword ptr [rsp+0A0H], xmm12
+ movdqa xmmword ptr [rsp+0B0H], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-10H]
+ movdqu xmm9, xmmword ptr [r9+rdx-10H]
+ movdqu xmm10, xmmword ptr [r10+rdx-10H]
+ movdqu xmm11, xmmword ptr [r11+rdx-10H]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0C0H], xmm8
+ movdqa xmmword ptr [rsp+0D0H], xmm9
+ movdqa xmmword ptr [rsp+0E0H], xmm12
+ movdqa xmmword ptr [rsp+0F0H], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
+ movdqa xmm12, xmmword ptr [rsp+110H]
+ movdqa xmm13, xmmword ptr [rsp+120H]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 00H
+ prefetcht0 byte ptr [r8+rdx+80H]
+ prefetcht0 byte ptr [r9+rdx+80H]
+ prefetcht0 byte ptr [r10+rdx+80H]
+ prefetcht0 byte ptr [r11+rdx+80H]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+20H]
+ paddd xmm2, xmmword ptr [rsp+40H]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+10H]
+ paddd xmm1, xmmword ptr [rsp+30H]
+ paddd xmm2, xmmword ptr [rsp+50H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+80H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp+0C0H]
+ paddd xmm3, xmmword ptr [rsp+0E0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+90H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+0D0H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+20H]
+ paddd xmm1, xmmword ptr [rsp+30H]
+ paddd xmm2, xmmword ptr [rsp+70H]
+ paddd xmm3, xmmword ptr [rsp+40H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+60H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0D0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+10H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+90H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0B0H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp+0E0H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+30H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp+0D0H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+40H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+20H]
+ paddd xmm3, xmmword ptr [rsp+0E0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+60H]
+ paddd xmm1, xmmword ptr [rsp+90H]
+ paddd xmm2, xmmword ptr [rsp+0B0H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+50H]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0F0H]
+ paddd xmm3, xmmword ptr [rsp+10H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0A0H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+0E0H]
+ paddd xmm3, xmmword ptr [rsp+0D0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+70H]
+ paddd xmm1, xmmword ptr [rsp+90H]
+ paddd xmm2, xmmword ptr [rsp+30H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+40H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+50H]
+ paddd xmm3, xmmword ptr [rsp+10H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+20H]
+ paddd xmm2, xmmword ptr [rsp+80H]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0C0H]
+ paddd xmm1, xmmword ptr [rsp+90H]
+ paddd xmm2, xmmword ptr [rsp+0F0H]
+ paddd xmm3, xmmword ptr [rsp+0E0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0D0H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+0A0H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+70H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+20H]
+ paddd xmm1, xmmword ptr [rsp+30H]
+ paddd xmm2, xmmword ptr [rsp+10H]
+ paddd xmm3, xmmword ptr [rsp+40H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+90H]
+ paddd xmm1, xmmword ptr [rsp+0B0H]
+ paddd xmm2, xmmword ptr [rsp+80H]
+ paddd xmm3, xmmword ptr [rsp+0F0H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0E0H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp+0C0H]
+ paddd xmm3, xmmword ptr [rsp+10H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0D0H]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+20H]
+ paddd xmm3, xmmword ptr [rsp+40H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+30H]
+ paddd xmm1, xmmword ptr [rsp+0A0H]
+ paddd xmm2, xmmword ptr [rsp+60H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0B0H]
+ paddd xmm1, xmmword ptr [rsp+50H]
+ paddd xmm2, xmmword ptr [rsp+10H]
+ paddd xmm3, xmmword ptr [rsp+80H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0F0H]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+90H]
+ paddd xmm3, xmmword ptr [rsp+60H]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0E0H]
+ paddd xmm1, xmmword ptr [rsp+20H]
+ paddd xmm2, xmmword ptr [rsp+30H]
+ paddd xmm3, xmmword ptr [rsp+70H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+100H], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0A0H]
+ paddd xmm1, xmmword ptr [rsp+0C0H]
+ paddd xmm2, xmmword ptr [rsp+40H]
+ paddd xmm3, xmmword ptr [rsp+0D0H]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+100H]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne innerloop4
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+20H], xmm1
+ movdqu xmmword ptr [rbx+40H], xmm9
+ movdqu xmmword ptr [rbx+60H], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+10H], xmm4
+ movdqu xmmword ptr [rbx+30H], xmm5
+ movdqu xmmword ptr [rbx+50H], xmm9
+ movdqu xmmword ptr [rbx+70H], xmm7
+ movdqa xmm1, xmmword ptr [rsp+110H]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+150H]
+ movdqa xmmword ptr [rsp+110H], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+120H]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+120H], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc outerloop4
+ test rsi, rsi
+ jne final3blocks
+unwind:
+ movdqa xmm6, xmmword ptr [rsp+170H]
+ movdqa xmm7, xmmword ptr [rsp+180H]
+ movdqa xmm8, xmmword ptr [rsp+190H]
+ movdqa xmm9, xmmword ptr [rsp+1A0H]
+ movdqa xmm10, xmmword ptr [rsp+1B0H]
+ movdqa xmm11, xmmword ptr [rsp+1C0H]
+ movdqa xmm12, xmmword ptr [rsp+1D0H]
+ movdqa xmm13, xmmword ptr [rsp+1E0H]
+ movdqa xmm14, xmmword ptr [rsp+1F0H]
+ movdqa xmm15, xmmword ptr [rsp+200H]
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+ALIGN 16
+final3blocks:
+ test esi, 2H
+ je final1block
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+10H]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+110H]
+ pinsrd xmm13, dword ptr [rsp+120H], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+114H]
+ pinsrd xmm14, dword ptr [rsp+124H], 1
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
+ movaps xmmword ptr [rsp+10H], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+8H]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+innerloop2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-40H]
+ movups xmm5, xmmword ptr [r8+rdx-30H]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-20H]
+ movups xmm7, xmmword ptr [r8+rdx-10H]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 93H
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 93H
+ movups xmm12, xmmword ptr [r9+rdx-40H]
+ movups xmm13, xmmword ptr [r9+rdx-30H]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-20H]
+ movups xmm15, xmmword ptr [r9+rdx-10H]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 93H
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 93H
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+10H]
+ pinsrd xmm3, eax, 3
+ pinsrd xmm11, eax, 3
+ mov al, 7
+roundloop2:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+20H], xmm4
+ movaps xmmword ptr [rsp+30H], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm12, xmmword ptr [ROT16]
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+40H], xmm5
+ movaps xmmword ptr [rsp+50H], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm13, xmmword ptr [ROT8]
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 93H
+ pshufd xmm8, xmm8, 93H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm11, xmm11, 4EH
+ pshufd xmm2, xmm2, 39H
+ pshufd xmm10, xmm10, 39H
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 39H
+ pshufd xmm8, xmm8, 39H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm11, xmm11, 4EH
+ pshufd xmm2, xmm2, 93H
+ pshufd xmm10, xmm10, 93H
+ dec al
+ je endroundloop2
+ movdqa xmm12, xmmword ptr [rsp+20H]
+ movdqa xmm5, xmmword ptr [rsp+40H]
+ pshufd xmm13, xmm12, 0FH
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 39H
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pblendw xmm13, xmm12, 0CCH
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ pblendw xmm12, xmm6, 0C0H
+ pshufd xmm12, xmm12, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmmword ptr [rsp+20H], xmm13
+ movdqa xmmword ptr [rsp+40H], xmm12
+ movdqa xmm5, xmmword ptr [rsp+30H]
+ movdqa xmm13, xmmword ptr [rsp+50H]
+ pshufd xmm6, xmm5, 0FH
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 39H
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pblendw xmm6, xmm5, 0CCH
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ pblendw xmm5, xmm14, 0C0H
+ pshufd xmm5, xmm5, 78H
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 1EH
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+20H]
+ movdqa xmm6, xmmword ptr [rsp+40H]
+ jmp roundloop2
+endroundloop2:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop2
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+10H], xmm1
+ movups xmmword ptr [rbx+20H], xmm8
+ movups xmmword ptr [rbx+30H], xmm9
+ movdqa xmm0, xmmword ptr [rsp+130H]
+ movdqa xmm1, xmmword ptr [rsp+110H]
+ movdqa xmm2, xmmword ptr [rsp+120H]
+ movdqu xmm3, xmmword ptr [rsp+118H]
+ movdqu xmm4, xmmword ptr [rsp+128H]
+ blendvps xmm1, xmm3, xmm0
+ blendvps xmm2, xmm4, xmm0
+ movdqa xmmword ptr [rsp+110H], xmm1
+ movdqa xmmword ptr [rsp+120H], xmm2
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+final1block:
+ test esi, 1H
+ je unwind
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+10H]
+ movd xmm13, dword ptr [rsp+110H]
+ pinsrd xmm13, dword ptr [rsp+120H], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+ movaps xmm14, xmmword ptr [ROT8]
+ movaps xmm15, xmmword ptr [ROT16]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+80H]
+ or eax, r13d
+ xor edx, edx
+innerloop1:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
+ movaps xmm3, xmm13
+ pinsrd xmm3, eax, 3
+ movups xmm4, xmmword ptr [r8+rdx-40H]
+ movups xmm5, xmmword ptr [r8+rdx-30H]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-20H]
+ movups xmm7, xmmword ptr [r8+rdx-10H]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 93H
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 93H
+ mov al, 7
+roundloop1:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 93H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 39H
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 39H
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 93H
+ dec al
+ jz endroundloop1
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0FH
+ pshufd xmm4, xmm8, 39H
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0CCH
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0C0H
+ pshufd xmm8, xmm8, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp roundloop1
+endroundloop1:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne innerloop1
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+10H], xmm1
+ jmp unwind
+_blake3_hash_many_sse41 ENDP
+blake3_hash_many_sse41 ENDP
+
+blake3_compress_in_place_sse41 PROC ; runs one 7-round compression and overwrites the 32 bytes at [rcx] with the result
+_blake3_compress_in_place_sse41 PROC ; second name for the same entry address (no code between the two PROC lines)
+ sub rsp, 120 ; 7 x 16-byte save slots (112) + 8; presumably the extra 8 re-aligns rsp to 16 for movdqa (entry rsp = 16n+8) - confirm
+ movdqa xmmword ptr [rsp], xmm6 ; save the registers from xmm6-xmm15 that this routine overwrites (nonvolatile in the Microsoft x64 convention)
+ movdqa xmmword ptr [rsp+10H], xmm7
+ movdqa xmmword ptr [rsp+20H], xmm8
+ movdqa xmmword ptr [rsp+30H], xmm9
+ movdqa xmmword ptr [rsp+40H], xmm11
+ movdqa xmmword ptr [rsp+50H], xmm14
+ movdqa xmmword ptr [rsp+60H], xmm15
+ movups xmm0, xmmword ptr [rcx] ; state row 0 = first 16 bytes at rcx (unaligned load)
+ movups xmm1, xmmword ptr [rcx+10H] ; state row 1 = next 16 bytes at rcx
+ movaps xmm2, xmmword ptr [BLAKE3_IV] ; state row 2 = the four BLAKE3_IV constants
+ movzx eax, byte ptr [rsp+0A0H] ; fifth argument: 0A0H = 120 (frame) + 8 (return address) + 32 (shadow space)
+ movzx r8d, r8b ; third argument is a byte; zero-extend it
+ shl rax, 32
+ add r8, rax ; r8 = arg3 in the low dword, arg5 in the high dword
+ movq xmm3, r9 ; low qword of row 3 = fourth argument (64 bits)
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 ; row 3 = { r9 low, r9 high, arg3, arg5 } - presumably counter, block length, flags; confirm against the C prototype
+ movups xmm4, xmmword ptr [rdx] ; load the 64 bytes at rdx (message block) as sixteen dwords
+ movups xmm5, xmmword ptr [rdx+10H]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 ; xmm4 = even-indexed dwords of the first 32 bytes
+ shufps xmm8, xmm5, 221 ; xmm8 = odd-indexed dwords of the first 32 bytes
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+20H]
+ movups xmm7, xmmword ptr [rdx+30H]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 ; even-indexed dwords of the last 32 bytes
+ pshufd xmm6, xmm6, 93H ; ...rotated up by one lane
+ shufps xmm8, xmm7, 221 ; odd-indexed dwords of the last 32 bytes
+ pshufd xmm7, xmm8, 93H ; ...rotated up by one lane
+ movaps xmm14, xmmword ptr [ROT8] ; pshufb mask: rotate each dword right by 8
+ movaps xmm15, xmmword ptr [ROT16] ; pshufb mask: rotate each dword by 16
+ mov al, 7 ; round counter: the loop below executes 7 times
+@@: ; top of one round
+ paddd xmm0, xmm4 ; first mixing pass, first half: row0 += message vector xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15 ; row3 rotated by 16 bits per dword
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 ; xmm11 is scratch for the shift-based rotate
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 ; row1 rotated right by 12 bits per dword
+ paddd xmm0, xmm5 ; first mixing pass, second half: row0 += message vector xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14 ; row3 rotated right by 8 bits per dword
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 ; row1 rotated right by 7 bits per dword
+ pshufd xmm0, xmm0, 93H ; rotate the lanes of rows 0, 3 and 2 so the second pass combines different words
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 39H
+ paddd xmm0, xmm6 ; second mixing pass: same sequence with message vectors xmm6 / xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 39H ; undo the lane rotations applied before the second pass
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 93H
+ dec al
+ jz @F ; all 7 rounds done: skip the message reshuffle
+ movdqa xmm8, xmm4 ; another round follows: rearrange the message words held in xmm4-xmm7 (xmm8/xmm9 are scratch)
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0FH
+ pshufd xmm4, xmm8, 39H
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0CCH
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0C0H
+ pshufd xmm8, xmm8, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp @B ; back to the top of the round loop
+@@: ; rounds finished
+ pxor xmm0, xmm2 ; result low half = row0 xor row2
+ pxor xmm1, xmm3 ; result high half = row1 xor row3
+ movups xmmword ptr [rcx], xmm0 ; write the 32-byte result back over the input at rcx
+ movups xmmword ptr [rcx+10H], xmm1
+ movdqa xmm6, xmmword ptr [rsp] ; restore the saved xmm registers
+ movdqa xmm7, xmmword ptr [rsp+10H]
+ movdqa xmm8, xmmword ptr [rsp+20H]
+ movdqa xmm9, xmmword ptr [rsp+30H]
+ movdqa xmm11, xmmword ptr [rsp+40H]
+ movdqa xmm14, xmmword ptr [rsp+50H]
+ movdqa xmm15, xmmword ptr [rsp+60H]
+ add rsp, 120 ; release the save area
+ ret
+_blake3_compress_in_place_sse41 ENDP
+blake3_compress_in_place_sse41 ENDP
+
+ALIGN 16
+blake3_compress_xof_sse41 PROC ; runs one 7-round compression and writes a 64-byte result to the pointer passed as the sixth argument
+_blake3_compress_xof_sse41 PROC ; second name for the same entry address (no code between the two PROC lines)
+ sub rsp, 120 ; 7 x 16-byte save slots (112) + 8; presumably the extra 8 re-aligns rsp to 16 for movdqa (entry rsp = 16n+8) - confirm
+ movdqa xmmword ptr [rsp], xmm6 ; save the registers from xmm6-xmm15 that this routine overwrites (nonvolatile in the Microsoft x64 convention)
+ movdqa xmmword ptr [rsp+10H], xmm7
+ movdqa xmmword ptr [rsp+20H], xmm8
+ movdqa xmmword ptr [rsp+30H], xmm9
+ movdqa xmmword ptr [rsp+40H], xmm11
+ movdqa xmmword ptr [rsp+50H], xmm14
+ movdqa xmmword ptr [rsp+60H], xmm15
+ movups xmm0, xmmword ptr [rcx] ; state row 0 = first 16 bytes at rcx (unaligned load)
+ movups xmm1, xmmword ptr [rcx+10H] ; state row 1 = next 16 bytes at rcx
+ movaps xmm2, xmmword ptr [BLAKE3_IV] ; state row 2 = the four BLAKE3_IV constants
+ movzx eax, byte ptr [rsp+0A0H] ; fifth argument: 0A0H = 120 (frame) + 8 (return address) + 32 (shadow space)
+ movzx r8d, r8b ; third argument is a byte; zero-extend it
+ mov r10, qword ptr [rsp+0A8H] ; sixth argument (next stack slot): destination pointer for the 64-byte output
+ shl rax, 32
+ add r8, rax ; r8 = arg3 in the low dword, arg5 in the high dword
+ movq xmm3, r9 ; low qword of row 3 = fourth argument (64 bits)
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4 ; row 3 = { r9 low, r9 high, arg3, arg5 } - presumably counter, block length, flags; confirm against the C prototype
+ movups xmm4, xmmword ptr [rdx] ; load the 64 bytes at rdx (message block) as sixteen dwords
+ movups xmm5, xmmword ptr [rdx+10H]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 ; xmm4 = even-indexed dwords of the first 32 bytes
+ shufps xmm8, xmm5, 221 ; xmm8 = odd-indexed dwords of the first 32 bytes
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+20H]
+ movups xmm7, xmmword ptr [rdx+30H]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 ; even-indexed dwords of the last 32 bytes
+ pshufd xmm6, xmm6, 93H ; ...rotated up by one lane
+ shufps xmm8, xmm7, 221 ; odd-indexed dwords of the last 32 bytes
+ pshufd xmm7, xmm8, 93H ; ...rotated up by one lane
+ movaps xmm14, xmmword ptr [ROT8] ; pshufb mask: rotate each dword right by 8
+ movaps xmm15, xmmword ptr [ROT16] ; pshufb mask: rotate each dword by 16
+ mov al, 7 ; round counter: the loop below executes 7 times
+@@: ; top of one round
+ paddd xmm0, xmm4 ; first mixing pass, first half: row0 += message vector xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15 ; row3 rotated by 16 bits per dword
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1 ; xmm11 is scratch for the shift-based rotate
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 ; row1 rotated right by 12 bits per dword
+ paddd xmm0, xmm5 ; first mixing pass, second half: row0 += message vector xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14 ; row3 rotated right by 8 bits per dword
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 ; row1 rotated right by 7 bits per dword
+ pshufd xmm0, xmm0, 93H ; rotate the lanes of rows 0, 3 and 2 so the second pass combines different words
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 39H
+ paddd xmm0, xmm6 ; second mixing pass: same sequence with message vectors xmm6 / xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 39H ; undo the lane rotations applied before the second pass
+ pshufd xmm3, xmm3, 4EH
+ pshufd xmm2, xmm2, 93H
+ dec al
+ jz @F ; all 7 rounds done: skip the message reshuffle
+ movdqa xmm8, xmm4 ; another round follows: rearrange the message words held in xmm4-xmm7 (xmm8/xmm9 are scratch)
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0FH
+ pshufd xmm4, xmm8, 39H
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0CCH
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0C0H
+ pshufd xmm8, xmm8, 78H
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 1EH
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp @B ; back to the top of the round loop
+@@: ; rounds finished
+ movdqu xmm4, xmmword ptr [rcx] ; reload the 32 input bytes at rcx (this routine never writes to rcx)
+ movdqu xmm5, xmmword ptr [rcx+10H]
+ pxor xmm0, xmm2 ; output bytes 0-15 = row0 xor row2
+ pxor xmm1, xmm3 ; output bytes 16-31 = row1 xor row3
+ pxor xmm2, xmm4 ; output bytes 32-47 = row2 xor first half of the input at rcx
+ pxor xmm3, xmm5 ; output bytes 48-63 = row3 xor second half of the input at rcx
+ movups xmmword ptr [r10], xmm0 ; store all 64 output bytes (unaligned stores)
+ movups xmmword ptr [r10+10H], xmm1
+ movups xmmword ptr [r10+20H], xmm2
+ movups xmmword ptr [r10+30H], xmm3
+ movdqa xmm6, xmmword ptr [rsp] ; restore the saved xmm registers
+ movdqa xmm7, xmmword ptr [rsp+10H]
+ movdqa xmm8, xmmword ptr [rsp+20H]
+ movdqa xmm9, xmmword ptr [rsp+30H]
+ movdqa xmm11, xmmword ptr [rsp+40H]
+ movdqa xmm14, xmmword ptr [rsp+50H]
+ movdqa xmm15, xmmword ptr [rsp+60H]
+ add rsp, 120 ; release the save area
+ ret
+_blake3_compress_xof_sse41 ENDP
+blake3_compress_xof_sse41 ENDP
+
+_TEXT ENDS
+
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constant tables referenced by the SSE4.1 routines
+ALIGN 64
+BLAKE3_IV: ; four 32-bit constants, loaded as one xmmword into state row 2 by the compress routines
+ dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
+
+ADD0: ; lane values 0..3; not referenced in the part of the file reviewed here - presumably per-lane counter offsets, confirm
+ dd 0, 1, 2, 3
+
+ADD1: ; the value 4 in all four lanes; not referenced in the part of the file reviewed here - confirm its use
+ dd 4 dup (4)
+
+BLAKE3_IV_0: ; first BLAKE3_IV constant repeated in all four lanes
+ dd 4 dup (6A09E667H)
+
+BLAKE3_IV_1: ; second BLAKE3_IV constant repeated in all four lanes
+ dd 4 dup (0BB67AE85H)
+
+BLAKE3_IV_2: ; third BLAKE3_IV constant repeated in all four lanes
+ dd 4 dup (3C6EF372H)
+
+BLAKE3_IV_3: ; fourth BLAKE3_IV constant repeated in all four lanes
+ dd 4 dup (0A54FF53AH)
+
+BLAKE3_BLOCK_LEN: ; the value 64 in all four lanes; its first dword is inserted into lane 2 of the parameter row
+ dd 4 dup (64)
+
+ROT16: ; pshufb byte-index mask: swaps the 16-bit halves of each dword (rotate by 16)
+ db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+
+ROT8: ; pshufb byte-index mask: rotates each dword right by 8 bits
+ db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+
+CMP_MSB_MASK: ; sign-bit mask XORed into both operands before pcmpgtd so the signed compare orders them as unsigned
+ dd 8 dup(80000000H)
+
+_RDATA ENDS
+END
+
diff --git a/deps/blake3/example.c b/deps/blake3/example.c
@@ -0,0 +1,34 @@
+#include "blake3.h"
+#include <stdio.h>
+#include <unistd.h>
+
+// Example: hash everything read from stdin and print the digest as hex.
+// Returns 0 on success, 1 if reading stdin fails.
+int main(void) {
+  // Initialize the hasher.
+  blake3_hasher hasher;
+  blake3_hasher_init(&hasher);
+
+  // Read input bytes from stdin until end of file.
+  unsigned char buf[65536];
+  ssize_t n;
+  while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) {
+    blake3_hasher_update(&hasher, buf, (size_t)n);
+  }
+  if (n < 0) {
+    // read() failed: report it rather than print a hash of partial input.
+    perror("read");
+    return 1;
+  }
+
+  // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
+  uint8_t output[BLAKE3_OUT_LEN];
+  blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+
+  // Print the hash as hexadecimal.
+  for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
+    printf("%02x", output[i]);
+  }
+  printf("\n");
+  return 0;
+}
diff --git a/deps/blake3/main.c b/deps/blake3/main.c
@@ -0,0 +1,166 @@
+/*
+ * This main file is intended for testing via `make test`. It does not build in
+ * other settings. See README.md in this directory for examples of how to build
+ * C code.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "blake3.h"
+#include "blake3_impl.h"
+
+#define HASH_MODE 0
+#define KEYED_HASH_MODE 1
+#define DERIVE_KEY_MODE 2
+
+static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) {
+  /* Decode one lowercase hex digit; *value is left untouched when invalid. */
+  const bool is_decimal = (c >= '0' && c <= '9');
+  const bool is_letter = (c >= 'a' && c <= 'f');
+  *valid = is_decimal || is_letter;
+  if (is_decimal) {
+    *value = c - '0';
+  } else if (is_letter) {
+    *value = 10 + c - 'a';
+  }
+}
+
+/* Parses a 64-char lowercase hex string into out[0..31]; 0 = ok, 1 = error. */
+static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) {
+  const size_t hex_len = strlen(hex_key);
+  if (hex_len != 64) {
+    fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n",
+            hex_len);
+    return 1;
+  }
+  for (size_t pos = 0; pos < 64; pos++) {
+    uint8_t nibble;
+    bool ok;
+    hex_char_value(hex_key[pos], &nibble, &ok);
+    if (!ok) {
+      fprintf(stderr, "Invalid hex char.\n");
+      return 1;
+    }
+    /* Even positions start a byte with the high nibble; odd ones add the low. */
+    const size_t byte = pos / 2;
+    out[byte] = (pos % 2 == 0) ? (uint8_t)(nibble << 4)
+                               : (uint8_t)(out[byte] + nibble);
+  }
+  return 0;
+}
+
+/* CPU feature bit flags, repeated locally (presumably from the dispatch code - keep in sync). */
+enum cpu_feature {
+  SSE2 = 0x00000001,
+  SSSE3 = 0x00000002,
+  SSE41 = 0x00000004,
+  AVX = 0x00000008,
+  AVX2 = 0x00000010,
+  AVX512F = 0x00000020,
+  AVX512VL = 0x00000040,
+  /* ... */
+  UNDEFINED = 0x40000000
+};
+
+extern enum cpu_feature g_cpu_features; /* overwritten by main() before each run */
+enum cpu_feature get_cpu_features();
+
+/* Test driver: hash stdin once per subset of the detected CPU features. */
+int main(int argc, char **argv) {
+  size_t out_len = BLAKE3_OUT_LEN;
+  uint8_t key[BLAKE3_KEY_LEN];
+  char *context = "";
+  uint8_t mode = HASH_MODE;
+  while (argc > 1) { /* options are consumed as "--flag value" pairs */
+    if (argc <= 2) {
+      fprintf(stderr, "Odd number of arguments.\n");
+      return 1;
+    }
+    if (strcmp("--length", argv[1]) == 0) {
+      char *endptr = NULL;
+      errno = 0;
+      unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10);
+      if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] ||
+          *endptr != 0) { /* overflow, too big for size_t, empty, or junk */
+        fprintf(stderr, "Bad length argument.\n");
+        return 1;
+      }
+      out_len = (size_t)out_len_ll;
+    } else if (strcmp("--keyed", argv[1]) == 0) {
+      mode = KEYED_HASH_MODE;
+      int ret = parse_key(argv[2], key);
+      if (ret != 0) {
+        return ret;
+      }
+    } else if (strcmp("--derive-key", argv[1]) == 0) {
+      mode = DERIVE_KEY_MODE;
+      context = argv[2];
+    } else {
+      fprintf(stderr, "Unknown flag.\n");
+      return 1;
+    }
+    argc -= 2;
+    argv += 2;
+  }
+
+  /*
+   * The input is hashed once per feature subset, so buffer all of it first.
+   * Test inputs are assumed to be smaller than 1 MiB (asserted below).
+   */
+  size_t buf_capacity = 1 << 20;
+  uint8_t *buf = malloc(buf_capacity);
+  assert(buf != NULL);
+  size_t buf_len = 0;
+  while (1) {
+    size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin);
+    if (n == 0) { /* end of input (or read error) */
+      break;
+    }
+    buf_len += n;
+    assert(buf_len < buf_capacity);
+  }
+
+  const int mask = get_cpu_features();
+  int feature = 0; /* start with no features enabled */
+  do {
+    fprintf(stderr, "Testing 0x%08X\n", feature);
+    g_cpu_features = feature; /* override the feature set used for this run */
+    blake3_hasher hasher;
+    switch (mode) {
+    case HASH_MODE:
+      blake3_hasher_init(&hasher);
+      break;
+    case KEYED_HASH_MODE:
+      blake3_hasher_init_keyed(&hasher, key);
+      break;
+    case DERIVE_KEY_MODE:
+      blake3_hasher_init_derive_key(&hasher, context);
+      break;
+    default:
+      abort();
+    }
+
+    blake3_hasher_update(&hasher, buf, buf_len);
+
+    /* TODO: An incremental output reader API to avoid this allocation. */
+    uint8_t *out = malloc(out_len);
+    if (out_len > 0 && out == NULL) { /* malloc(0) may legitimately be NULL */
+      fprintf(stderr, "malloc() failed.\n");
+      return 1;
+    }
+    blake3_hasher_finalize(&hasher, out, out_len);
+    for (size_t i = 0; i < out_len; i++) {
+      printf("%02x", out[i]);
+    }
+    printf("\n");
+    free(out);
+    feature = (feature - mask) & mask; /* next subset of mask; wraps to 0 */
+  } while (feature != 0);
+  free(buf);
+  return 0;
+}