chibipub

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 46c3f95e36432af593b18ed0a6acd007026fda6e
parent fbd9c3b10b2f2ebbe20b74ac5892c5d9614e6627
Author: William Casarin <jb55@jb55.com>
Date:   Sun, 27 Dec 2020 14:55:43 -0800

blake3

Diffstat:
M.gitignore | 2++
MMakefile | 35++++++++++++++++++++++++++++++-----
Adeps/blake3/README.md | 270+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3.c | 607+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3.h | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx2.c | 325+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx2_x86-64_unix.S | 1815+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx2_x86-64_windows_gnu.S | 1817+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx2_x86-64_windows_msvc.asm | 1828+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx512.c | 1204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx512_x86-64_unix.S | 2585+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx512_x86-64_windows_gnu.S | 2615+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_avx512_x86-64_windows_msvc.asm | 2634+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_dispatch.c | 276+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_impl.h | 269+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_neon.c | 346+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_portable.c | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse2.c | 565+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse2_x86-64_unix.S | 2291++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse2_x86-64_windows_gnu.S | 2332+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse2_x86-64_windows_msvc.asm | 2350+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse41.c | 559+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse41_x86-64_unix.S | 2028+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse41_x86-64_windows_gnu.S | 2069+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/blake3_sse41_x86-64_windows_msvc.asm | 2089+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adeps/blake3/example.c | 27+++++++++++++++++++++++++++
Adeps/blake3/main.c | 166+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
27 files changed, 31319 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,6 +1,8 @@ .build-result /wolfsocks *.o +*.a +*.so /tags /test_out.ubjson src/test_json diff --git a/Makefile b/Makefile @@ -1,22 +1,47 @@ -CFLAGS = -Og -ggdb -Wall -Werror -Isrc $(shell pkg-config --cflags openssl) +CFLAGS = -Og -ggdb -Wall -Werror -Ideps -Isrc $(shell pkg-config --cflags openssl) LDFLAGS = $(shell pkg-config --libs openssl) -OBJS = src/http.o src/base64.o src/inbox.o src/json.o src/ubjson.o +OBJS = src/http.o \ + src/base64.o \ + src/inbox.o \ + src/json.o \ + src/ubjson.o \ + deps/blake3/blake3.a -HEADERS = $(wildcard src/*.h) +BLAKE3_OBJS = deps/blake3/blake3.o \ + deps/blake3/blake3_dispatch.o \ + deps/blake3/blake3_portable.o \ + deps/blake3/blake3_sse2_x86-64_unix.o \ + deps/blake3/blake3_sse41_x86-64_unix.o \ + deps/blake3/blake3_avx2_x86-64_unix.o + +HEADERS = $(wildcard src/*.h) all: wolfsocks +%.o: %.c + @echo "cc $<" + @$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $< + %.o: %.c %.h @echo "cc $<" @$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $< +%.o: %.S + @echo "cc $<" + @$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $< + +deps/blake3/blake3.a: $(BLAKE3_OBJS) + @echo "ar $@" + @ar rcs $@ $(BLAKE3_OBJS) + corpus/math.json: curl --compressed -sL 'https://jb55.com/s/5aaaae6d64be61fd.json' > $@ wolfsocks: src/wolfsocks.c $(OBJS) $(HEADERS) - $(CC) $(CFLAGS) $< $(OBJS) $(LDFLAGS) -o $@ + @echo "ld $@" + @$(CC) $(CFLAGS) $< $(OBJS) $(LDFLAGS) -o $@ src/test_json: src/test_json.c $(OBJS) $(HEADERS) corpus/math.json $(CC) $(CFLAGS) $< $(OBJS) $(LDFLAGS) -o $@ @@ -25,7 +50,7 @@ check: src/test_json ./src/test_json clean: fake - rm -f $(OBJS) wolfsocks src/test_json + rm -f $(OBJS) $(BLAKE3_OBJS) deps/blake3/blake3.a wolfsocks src/test_json tags: fake ctags src/*.c src/*.h diff --git a/deps/blake3/README.md b/deps/blake3/README.md @@ -0,0 +1,270 @@ +The official C implementation of BLAKE3. + +# Example + +An example program that hashes bytes from standard input and prints the +result: + +```c +#include "blake3.h" +#include <stdio.h> +#include <unistd.h> + +int main() { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + ssize_t n; + while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { + blake3_hasher_update(&hasher, buf, n); + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} +``` + +The code above is included in this directory as `example.c`. If you're +on x86\_64 with a Unix-like OS, you can compile a working binary like +this: + +```bash +gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +# API + +## The Struct + +```c +typedef struct { + // private fields +} blake3_hasher; +``` + +An incremental BLAKE3 hashing state, which can accept any number of +updates. This implementation doesn't allocate any heap memory, but +`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes +on x86-64. This size can be reduced by restricting the maximum input +length, as described in Section 5.4 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), +but this implementation doesn't currently support that strategy. + +## Common API Functions + +```c +void blake3_hasher_init( + blake3_hasher *self); +``` + +Initialize a `blake3_hasher` in the default hashing mode. + +--- + +```c +void blake3_hasher_update( + blake3_hasher *self, + const void *input, + size_t input_len); +``` + +Add input to the hasher. This can be called any number of times. + +--- + +```c +void blake3_hasher_finalize( + const blake3_hasher *self, + uint8_t *out, + size_t out_len); +``` + +Finalize the hasher and emit an output of any length. This doesn't +modify the hasher itself, and it's possible to finalize again after +adding more input. The constant `BLAKE3_OUT_LEN` provides the default +output length, 32 bytes. + +## Less Common API Functions + +```c +void blake3_hasher_init_keyed( + blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +``` + +Initialize a `blake3_hasher` in the keyed hashing mode. The key must be +exactly 32 bytes. + +--- + +```c +void blake3_hasher_init_derive_key( + blake3_hasher *self, + const char *context); +``` + +Initialize a `blake3_hasher` in the key derivation mode. The context +string is given as an initialization parameter, and afterwards input key +material should be given with `blake3_hasher_update`. The context string +is a null-terminated C string which should be **hardcoded, globally +unique, and application-specific**. The context string should not +include any dynamic input like salts, nonces, or identifiers read from a +database at runtime. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com +2019-12-25 16:18:03 session tokens v1"`. + +This function is intended for application code written in C. For +language bindings, see `blake3_hasher_init_derive_key_raw` below. + +--- + +```c +void blake3_hasher_init_derive_key_raw( + blake3_hasher *self, + const void *context, + size_t context_len); +``` + +As `blake3_hasher_init_derive_key` above, except that the context string +is given as a pointer to an array of arbitrary bytes with a provided +length. This is intended for writing language bindings, where C string +conversion would add unnecessary overhead and new error cases. Unicode +strings should be encoded as UTF-8. + +Application code in C should prefer `blake3_hasher_init_derive_key`, +which takes the context as a C string. If you need to use arbitrary +bytes as a context string in application code, consider whether you're +violating the requirement that context strings should be hardcoded. + +--- + +```c +void blake3_hasher_finalize_seek( + const blake3_hasher *self, + uint64_t seek, + uint8_t *out, + size_t out_len); +``` + +The same as `blake3_hasher_finalize`, but with an additional `seek` +parameter for the starting byte position in the output stream. To +efficiently stream a large output without allocating memory, call this +function in a loop, incrementing `seek` by the output length each time. + +# Building + +This implementation is just C and assembly files. It doesn't include a +public-facing build system. (The `Makefile` in this directory is only +for testing.) Instead, the intention is that you can include these files +in whatever build system you're already using. This section describes +the commands your build system should execute, or which you can execute +by hand. Note that these steps may change in future versions. + +## x86 + +Dynamic dispatch is enabled by default on x86. The implementation will +query the CPU at runtime to detect SIMD support, and it will use the +widest instruction set available. By default, `blake3_dispatch.c` +expects to be linked with code for five different instruction sets: +portable C, SSE2, SSE4.1, AVX2, and AVX-512. + +For each of the x86 SIMD instruction sets, two versions are available, +one in assembly (which is further divided into three flavors: Unix, +Windows MSVC, and Windows GNU) and one using C intrinsics. The assembly +versions are generally preferred: they perform better, they perform more +consistently across different compilers, and they build more quickly. On +the other hand, the assembly versions are x86\_64-only, and you need to +select the right flavor for your target platform. + +Here's an example of building a shared library on x86\_64 Linux using +the assembly implementations: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +When building the intrinsics-based implementations, you need to build +each implementation separately, with the corresponding instruction set +explicitly enabled in the compiler. Here's the same shared library using +the intrinsics-based implementations: + +```bash +gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o +gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o +gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o +gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o +``` + +Note above that building `blake3_avx512.c` requires both `-mavx512f` and +`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` +flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. +MSVC enables SSE2 and SSE4.1 by defaut, and it doesn't have a +corresponding flag. + +If you want to omit SIMD code entirely, you need to explicitly disable +each instruction set. Here's an example of building a shared library on +x86 with only portable code: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ + -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c +``` + +## ARM NEON + +The NEON implementation is not enabled by default on ARM, since not all +ARM targets support it. To enable it, set `BLAKE3_USE_NEON=1`. Here's an +example of building a shared library on ARM Linux with NEON support: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON blake3.c blake3_dispatch.c \ + blake3_portable.c blake3_neon.c +``` + +Note that on some targets (ARMv7 in particular), extra flags may be +required to activate NEON support in the compiler. If you see an error +like... + +``` +/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed +in call to always_inline ‘vaddq_u32’: target specific option mismatch +``` + +...then you may need to add something like `-mfpu=neon-vfpv4 +-mfloat-abi=hard`. + +## Other Platforms + +The portable implementation should work on most other architectures. For +example: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c +``` + +# Differences from the Rust Implementation + +The single-threaded Rust and C implementations use the same algorithms, +and their performance is the same if you use the assembly +implementations or if you compile the intrinsics-based implementations +with Clang. (Both Clang and rustc are LLVM-based.) + +The C implementation doesn't currently include any multithreading +optimizations. OpenMP support or similar might be added in the future. diff --git a/deps/blake3/blake3.c b/deps/blake3/blake3.c @@ -0,0 +1,607 @@ +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include "blake3.h" +#include "blake3_impl.h" + +const char * blake3_version(void) { + return BLAKE3_VERSION_STRING; +} + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement exendable ouput.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the the root of the whole tree. For example, maybe +// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_chunk_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. + cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} diff --git a/deps/blake3/blake3.h b/deps/blake3/blake3.h @@ -0,0 +1,60 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "0.3.7" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 +#define BLAKE3_MAX_SIMD_DEGREE 16 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +const char * blake3_version(void); +void blake3_hasher_init(blake3_hasher *self); +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/deps/blake3/blake3_avx2.c b/deps/blake3/blake3_avx2.c @@ -0,0 +1,325 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 8 + +INLINE __m256i loadu(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE void storeu(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. +INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m256i rot16(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + +INLINE __m256i rot12(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); +} + +INLINE __m256i rot8(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, + 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + +INLINE __m256i rot7(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); +} + +INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m256i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. + vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[8]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); + *out_lo = l; + *out_hi = h; +} + +void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(h_vecs); + storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +#if !defined(BLAKE3_NO_SSE41) +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#else +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + +void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } +#if !defined(BLAKE3_NO_SSE41) + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); +#else + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +#endif +} diff --git a/deps/blake3/blake3_avx2_x86-64_unix.S b/deps/blake3/blake3_avx2_x86-64_unix.S @@ -0,0 +1,1815 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/deps/blake3/blake3_avx2_x86-64_windows_gnu.S b/deps/blake3/blake3_avx2_x86-64_windows_gnu.S @@ -0,0 +1,1817 @@ +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +.section .text + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x2D0], xmm6 + vmovdqa xmmword ptr [rsp+0x2E0], xmm7 + vmovdqa xmmword ptr [rsp+0x2F0], xmm8 + vmovdqa xmmword ptr [rsp+0x300], xmm9 + vmovdqa xmmword ptr [rsp+0x310], xmm10 + vmovdqa xmmword ptr [rsp+0x320], xmm11 + vmovdqa xmmword ptr [rsp+0x330], xmm12 + vmovdqa xmmword ptr [rsp+0x340], xmm13 + vmovdqa xmmword ptr [rsp+0x350], xmm14 + vmovdqa xmmword ptr [rsp+0x360], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x260], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x2A0], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2C0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2C0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x2A0] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220] + vmovdqa ymmword ptr [rsp+0x220], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x240] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x2D0] + vmovdqa xmm7, xmmword ptr [rsp+0x2E0] + vmovdqa xmm8, xmmword ptr [rsp+0x2F0] + vmovdqa xmm9, xmmword ptr [rsp+0x300] + vmovdqa xmm10, xmmword ptr [rsp+0x310] + vmovdqa xmm11, xmmword ptr [rsp+0x320] + vmovdqa xmm12, xmmword ptr [rsp+0x330] + vmovdqa xmm13, xmmword ptr [rsp+0x340] + vmovdqa xmm14, xmmword ptr [rsp+0x350] + vmovdqa xmm15, xmmword ptr [rsp+0x360] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x2C0] + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x240] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x260] + vmovaps xmm0, xmmword ptr [rsp+0x220] + vmovaps xmm1, xmmword ptr [rsp+0x230] + vmovaps xmm2, xmmword ptr [rsp+0x240] + vmovaps xmm3, xmmword ptr [rsp+0x250] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x220], xmm0 + vmovaps xmmword ptr [rsp+0x240], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x220] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x224] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x260] + vmovaps ymm0, ymmword ptr [rsp+0x220] + vmovups ymm1, ymmword ptr [rsp+0x228] + vmovaps ymm2, ymmword ptr [rsp+0x240] + vmovups ymm3, ymmword ptr [rsp+0x248] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x220], ymm0 + vmovaps ymmword ptr [rsp+0x240], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x220] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.section .rodata +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/deps/blake3/blake3_avx2_x86-64_windows_msvc.asm b/deps/blake3/blake3_avx2_x86-64_windows_msvc.asm @@ -0,0 +1,1828 @@ +public _blake3_hash_many_avx2 +public blake3_hash_many_avx2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx2 PROC +_blake3_hash_many_avx2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+2D0H], xmm6 + vmovdqa xmmword ptr [rsp+2E0H], xmm7 + vmovdqa xmmword ptr [rsp+2F0H], xmm8 + vmovdqa xmmword ptr [rsp+300H], xmm9 + vmovdqa xmmword ptr [rsp+310H], xmm10 + vmovdqa xmmword ptr [rsp+320H], xmm11 + vmovdqa xmmword ptr [rsp+330H], xmm12 + vmovdqa xmmword ptr [rsp+340H], xmm13 + vmovdqa xmmword ptr [rsp+350H], xmm14 + vmovdqa xmmword ptr [rsp+360H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+260H], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0] + vpand ymm2, ymm0, ymmword ptr [ADD1] + vmovdqa ymmword ptr [rsp+2A0H], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+220H], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm3 + shl rdx, 6 + mov qword ptr [rsp+2C0H], rdx + cmp rsi, 8 + jc final7blocks +outerloop8: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+2C0H] + cmove eax, ebx + mov dword ptr [rsp+200H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+20H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+40H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+60H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+80H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0E0H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+100H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+120H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+140H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+160H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+180H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+1A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+1C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+1E0H], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+200H] + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+220H] + vpxor ymm13, ymm1, ymmword ptr [rsp+240H] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+2A0H] + vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] + vmovdqa ymmword ptr [rsp+220H], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+240H] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + sub rsi, 8 + cmp rsi, 8 + jnc outerloop8 + test rsi, rsi + jnz final7blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+2D0H] + vmovdqa xmm7, xmmword ptr [rsp+2E0H] + vmovdqa xmm8, xmmword ptr [rsp+2F0H] + vmovdqa xmm9, xmmword ptr [rsp+300H] + vmovdqa xmm10, xmmword ptr [rsp+310H] + vmovdqa xmm11, xmmword ptr [rsp+320H] + vmovdqa xmm12, xmmword ptr [rsp+330H] + vmovdqa xmm13, xmmword ptr [rsp+340H] + vmovdqa xmm14, xmmword ptr [rsp+350H] + vmovdqa xmm15, xmmword ptr [rsp+360H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+2C0H] + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + test rsi, 4H + je final3blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+220H] + vbroadcasti128 ymm13, xmmword ptr [rsp+240H] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 50H + vpermq ymm15, ymm15, 50H + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] + vpblendd ymm14, ymm14, ymm12, 44H + vpblendd ymm15, ymm15, ymm12, 44H + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+20H], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vmovups ymm2, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + vmovups ymm10, ymmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 93H + vpshufd ymm15, ymm15, 93H + vpbroadcastd ymm2, dword ptr [rsp+200H] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+20H] + vpblendd ymm3, ymm3, ymm2, 88H + vpblendd ymm11, ymm11, ymm2, 88H + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vmovdqa ymm10, ymm2 + mov al, 7 +roundloop4: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+40H], ymm4 + nop + vmovdqa ymmword ptr [rsp+60H], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+80H], ymm5 + vmovdqa ymmword ptr [rsp+0A0H], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 93H + vpshufd ymm8, ymm8, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 39H + vpshufd ymm10, ymm10, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 39H + vpshufd ymm8, ymm8, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 93H + vpshufd ymm10, ymm10, 93H + dec al + je endroundloop4 + vmovdqa ymm4, ymmword ptr [rsp+40H] + vmovdqa ymm5, ymmword ptr [rsp+80H] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0FH + vpshufd ymm4, ymm12, 39H + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0AAH + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 88H + vpshufd ymm12, ymm12, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymmword ptr [rsp+40H], ymm13 + vmovdqa ymmword ptr [rsp+80H], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+60H] + vmovdqa ymm13, ymmword ptr [rsp+0A0H] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0FH + vpshufd ymm12, ymm5, 39H + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0AAH + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 88H + vpshufd ymm5, ymm5, 78H + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 1EH + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+40H] + vmovdqa ymm6, ymmword ptr [rsp+80H] + jmp roundloop4 +endroundloop4: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqu xmmword ptr [rbx+40H], xmm8 + vmovdqu xmmword ptr [rbx+50H], xmm9 + vextracti128 xmmword ptr [rbx+60H], ymm8, 01H + vextracti128 xmmword ptr [rbx+70H], ymm9, 01H + vmovaps xmm8, xmmword ptr [rsp+260H] + vmovaps xmm0, xmmword ptr [rsp+220H] + vmovaps xmm1, xmmword ptr [rsp+230H] + vmovaps xmm2, xmmword ptr [rsp+240H] + vmovaps xmm3, xmmword ptr [rsp+250H] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+220H], xmm0 + vmovaps xmmword ptr [rsp+240H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test rsi, 2H + je final1blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp+220H] + vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+224H] + vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + vbroadcasti128 ymm14, xmmword ptr [ROT16] + vbroadcasti128 ymm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+200H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovaps ymm8, ymmword ptr [rsp+260H] + vmovaps ymm0, ymmword ptr [rsp+220H] + vmovups ymm1, ymmword ptr [rsp+228H] + vmovaps ymm2, ymmword ptr [rsp+240H] + vmovups ymm3, ymmword ptr [rsp+248H] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+220H], ymm0 + vmovaps ymmword ptr [rsp+240H], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1blocks: + test rsi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm3, dword ptr [rsp+220H] + vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm14, xmmword ptr [ROT16] + vmovdqa xmm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx2 ENDP +blake3_hash_many_avx2 ENDP +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + +ADD1: + dd 8 dup (8) + +BLAKE3_IV_0: + dd 8 dup (6A09E667H) + +BLAKE3_IV_1: + dd 8 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 8 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 8 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 8 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +_RDATA ENDS +END diff --git a/deps/blake3/blake3_avx512.c b/deps/blake3/blake3_avx512.c @@ -0,0 +1,1204 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu_128(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE __m256i loadu_256(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE __m512i loadu_512(const uint8_t src[64]) { + return _mm512_loadu_si512((const __m512i *)src); +} + +INLINE void storeu_128(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE void storeu_256(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } + +INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } + +INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } + +INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } + +INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } + +INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } + +INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } + +INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } + +INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } + +INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } + +INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } + +INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } + +INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } + +INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } + +/* + * ---------------------------------------------------------------------------- + * compress_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} + +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +/* + * ---------------------------------------------------------------------------- + * hash4_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(__m128i vecs[4]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); +} + +void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1_128(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash8_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[15] = rot16_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot12_256(v[4]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[15] = rot8_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot7_256(v[4]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot16_256(v[15]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[4] = rot12_256(v[4]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot8_256(v[15]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + v[4] = rot7_256(v[4]); +} + +INLINE void transpose_vecs_256(__m256i vecs[8]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. + vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_256(&out[0]); + transpose_vecs_256(&out[8]); +} + +INLINE void load_counters8(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); +} + +void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1_256(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + h_vecs[0] = xor_256(v[0], v[8]); + h_vecs[1] = xor_256(v[1], v[9]); + h_vecs[2] = xor_256(v[2], v[10]); + h_vecs[3] = xor_256(v[3], v[11]); + h_vecs[4] = xor_256(v[4], v[12]); + h_vecs[5] = xor_256(v[5], v[13]); + h_vecs[6] = xor_256(v[6], v[14]); + h_vecs[7] = xor_256(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_256(h_vecs); + storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash16_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[15] = rot16_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot12_512(v[4]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[15] = rot8_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot7_512(v[4]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot16_512(v[15]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[4] = rot12_512(v[4]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot8_512(v[15]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + v[4] = rot7_512(v[4]); +} + +// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order +#define LO_IMM8 0x88 + +INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, LO_IMM8); +} + +// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order +#define HI_IMM8 0xdd + +INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, HI_IMM8); +} + +INLINE void transpose_vecs_512(__m512i vecs[16]) { + // Interleave 32-bit lanes. The _0 unpack is lanes + // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes + // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. + __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); + __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); + __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); + __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); + __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); + __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); + __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); + __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); + __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); + __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); + __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); + __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); + __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); + __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); + __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); + __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); + + // Interleave 64-bit lates. The _0 unpack is lanes + // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes + // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes + // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes + // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. + __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); + __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); + __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); + __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); + __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); + __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); + __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); + __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); + __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); + __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); + __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); + __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); + __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); + __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); + __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); + __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); + + // Interleave 128-bit lanes. The _0 unpack is + // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is + // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. + __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); + __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); + __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); + __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); + __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); + __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); + __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); + __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); + __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); + __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); + __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); + __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); + __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); + __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); + __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); + __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); + + // Interleave 128-bit lanes again for the final outputs. + vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); + vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); + vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); + vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); + vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); + vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); + vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); + vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); + vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); + vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); + vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); + vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); + vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); + vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); + vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); + vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); +} + +INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, + size_t block_offset, __m512i out[16]) { + out[0] = loadu_512(&inputs[0][block_offset]); + out[1] = loadu_512(&inputs[1][block_offset]); + out[2] = loadu_512(&inputs[2][block_offset]); + out[3] = loadu_512(&inputs[3][block_offset]); + out[4] = loadu_512(&inputs[4][block_offset]); + out[5] = loadu_512(&inputs[5][block_offset]); + out[6] = loadu_512(&inputs[6][block_offset]); + out[7] = loadu_512(&inputs[7][block_offset]); + out[8] = loadu_512(&inputs[8][block_offset]); + out[9] = loadu_512(&inputs[9][block_offset]); + out[10] = loadu_512(&inputs[10][block_offset]); + out[11] = loadu_512(&inputs[11][block_offset]); + out[12] = loadu_512(&inputs[12][block_offset]); + out[13] = loadu_512(&inputs[13][block_offset]); + out[14] = loadu_512(&inputs[14][block_offset]); + out[15] = loadu_512(&inputs[15][block_offset]); + for (size_t i = 0; i < 16; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_512(out); +} + +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); + const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i add1 = _mm512_and_si512(mask, add0); + __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); + __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); + __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); + *out_lo = l; + *out_hi = h; +} + +void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + __m512i h_vecs[8] = { + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), + }; + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); + __m512i block_flags_vec = set1_512(block_flags); + __m512i msg_vecs[16]; + transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + h_vecs[0] = xor_512(v[0], v[8]); + h_vecs[1] = xor_512(v[1], v[9]); + h_vecs[2] = xor_512(v[2], v[10]); + h_vecs[3] = xor_512(v[3], v[11]); + h_vecs[4] = xor_512(v[4], v[12]); + h_vecs[5] = xor_512(v[5], v[13]); + h_vecs[6] = xor_512(v[6], v[14]); + h_vecs[7] = xor_512(v[7], v[15]); + + block_flags = flags; + } + + // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 + // state vectors. Pad the matrix with zeros. After transposition, store the + // lower half of each vector. + __m512i padded[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + }; + transpose_vecs_512(padded); + _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); + _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); + _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); + _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); + _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); + _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); + _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); + _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); + _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); + _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); + _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); + _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); + _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); + _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); + _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); + _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 16) { + blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 16; + } + inputs += 16; + num_inputs -= 16; + out = &out[16 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 8) { + blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 8; + } + inputs += 8; + num_inputs -= 8; + out = &out[8 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 4) { + blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/deps/blake3/blake3_avx512_x86-64_unix.S b/deps/blake3/blake3_avx512_x86-64_unix.S @@ -0,0 +1,2585 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +#ifdef __APPLE__ +.text +#else +.section .text +#endif +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/deps/blake3/blake3_avx512_x86-64_windows_gnu.S b/deps/blake3/blake3_avx512_x86-64_windows_gnu.S @@ -0,0 +1,2615 @@ +.intel_syntax noprefix + +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +.section .text +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x90], xmm6 + vmovdqa xmmword ptr [rsp+0xA0], xmm7 + vmovdqa xmmword ptr [rsp+0xB0], xmm8 + vmovdqa xmmword ptr [rsp+0xC0], xmm9 + vmovdqa xmmword ptr [rsp+0xD0], xmm10 + vmovdqa xmmword ptr [rsp+0xE0], xmm11 + vmovdqa xmmword ptr [rsp+0xF0], xmm12 + vmovdqa xmmword ptr [rsp+0x100], xmm13 + vmovdqa xmmword ptr [rsp+0x110], xmm14 + vmovdqa xmmword ptr [rsp+0x120], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + vmovdqa ymmword ptr [rsp+0x60], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x90], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jne 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x90] + vmovdqa xmm7, xmmword ptr [rsp+0xA0] + vmovdqa xmm8, xmmword ptr [rsp+0xB0] + vmovdqa xmm9, xmmword ptr [rsp+0xC0] + vmovdqa xmm10, xmmword ptr [rsp+0xD0] + vmovdqa xmm11, xmmword ptr [rsp+0xE0] + vmovdqa xmm12, xmmword ptr [rsp+0xF0] + vmovdqa xmm13, xmmword ptr [rsp+0x100] + vmovdqa xmm14, xmmword ptr [rsp+0x110] + vmovdqa xmm15, xmmword ptr [rsp+0x120] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6