chibipub

experimental activitypub node in C
git clone git://jb55.com/chibipub
Log | Files | Refs | README | LICENSE

blake3_sse2_x86-64_unix.S (68858B)


      1 #if defined(__ELF__) && defined(__linux__)
        /* An empty .note.GNU-stack section is the GNU toolchain marker that this
           object does not require an executable stack. */
      2 .section .note.GNU-stack,"",%progbits
      3 #endif
      4 
        /* Include <cet.h> only for ELF builds with CET enabled, and only when the
           preprocessor supports __has_include and reports the header as present.
           The header is expected to supply _CET_ENDBR. */
      5 #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
      6 #if __has_include(<cet.h>)
      7 #include <cet.h>
      8 #endif
      9 #endif
     10 
        /* Fallback: when nothing above defined _CET_ENDBR, define it as empty so
           that its use at function entry expands to nothing. */
     11 #if !defined(_CET_ENDBR)
     12 #define _CET_ENDBR
     13 #endif
     14 
        /* Intel operand order (destination first) and bare register names are
           used for the rest of the file. */
     15 .intel_syntax noprefix
        /* Each entry point is exported under two names, plain and with a leading
           underscore. NOTE(review): the underscore form presumably matches
           platforms whose C symbols carry that prefix (e.g. Mach-O); confirm
           against the callers. */
     16 .global blake3_hash_many_sse2
     17 .global _blake3_hash_many_sse2
     18 .global blake3_compress_in_place_sse2
     19 .global _blake3_compress_in_place_sse2
     20 .global blake3_compress_xof_sse2
     21 .global _blake3_compress_xof_sse2
        /* Switch to the code section; Apple builds use the bare .text directive,
           every other target uses the .section form. */
     22 #ifdef __APPLE__
     23 .text
     24 #else
     25 .section .text
     26 #endif
        /* Align the entry point that follows to a 2^6 = 64-byte boundary. */
     27         .p2align  6
     28 _blake3_hash_many_sse2:
     29 blake3_hash_many_sse2:
     30         _CET_ENDBR
     31         push    r15
     32         push    r14
     33         push    r13
     34         push    r12
     35         push    rbx
     36         push    rbp
     37         mov     rbp, rsp
     38         sub     rsp, 360
     39         and     rsp, 0xFFFFFFFFFFFFFFC0
     40         neg     r9d
     41         movd    xmm0, r9d
     42         pshufd  xmm0, xmm0, 0x00
     43         movdqa  xmmword ptr [rsp+0x130], xmm0
     44         movdqa  xmm1, xmm0
     45         pand    xmm1, xmmword ptr [ADD0+rip]
     46         pand    xmm0, xmmword ptr [ADD1+rip]
     47         movdqa  xmmword ptr [rsp+0x150], xmm0
     48         movd    xmm0, r8d
     49         pshufd  xmm0, xmm0, 0x00
     50         paddd   xmm0, xmm1
     51         movdqa  xmmword ptr [rsp+0x110], xmm0
     52         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
     53         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
     54         pcmpgtd xmm1, xmm0
     55         shr     r8, 32
     56         movd    xmm2, r8d
     57         pshufd  xmm2, xmm2, 0x00
     58         psubd   xmm2, xmm1
     59         movdqa  xmmword ptr [rsp+0x120], xmm2
     60         mov     rbx, qword ptr [rbp+0x50]
     61         mov     r15, rdx
     62         shl     r15, 6
     63         movzx   r13d, byte ptr [rbp+0x38]
     64         movzx   r12d, byte ptr [rbp+0x48]
     65         cmp     rsi, 4
     66         jc      3f
     67 2:
     68         movdqu  xmm3, xmmword ptr [rcx]
     69         pshufd  xmm0, xmm3, 0x00
     70         pshufd  xmm1, xmm3, 0x55
     71         pshufd  xmm2, xmm3, 0xAA
     72         pshufd  xmm3, xmm3, 0xFF
     73         movdqu  xmm7, xmmword ptr [rcx+0x10]
     74         pshufd  xmm4, xmm7, 0x00
     75         pshufd  xmm5, xmm7, 0x55
     76         pshufd  xmm6, xmm7, 0xAA
     77         pshufd  xmm7, xmm7, 0xFF
     78         mov     r8, qword ptr [rdi]
     79         mov     r9, qword ptr [rdi+0x8]
     80         mov     r10, qword ptr [rdi+0x10]
     81         mov     r11, qword ptr [rdi+0x18]
     82         movzx   eax, byte ptr [rbp+0x40]
     83         or      eax, r13d
     84         xor     edx, edx
     85 9:
     86         mov     r14d, eax
     87         or      eax, r12d
     88         add     rdx, 64
     89         cmp     rdx, r15
     90         cmovne  eax, r14d
     91         movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
     92         movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
     93         movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
     94         movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
     95         movdqa  xmm12, xmm8
     96         punpckldq xmm8, xmm9
     97         punpckhdq xmm12, xmm9
     98         movdqa  xmm14, xmm10
     99         punpckldq xmm10, xmm11
    100         punpckhdq xmm14, xmm11
    101         movdqa  xmm9, xmm8
    102         punpcklqdq xmm8, xmm10
    103         punpckhqdq xmm9, xmm10
    104         movdqa  xmm13, xmm12
    105         punpcklqdq xmm12, xmm14
    106         punpckhqdq xmm13, xmm14
    107         movdqa  xmmword ptr [rsp], xmm8
    108         movdqa  xmmword ptr [rsp+0x10], xmm9
    109         movdqa  xmmword ptr [rsp+0x20], xmm12
    110         movdqa  xmmword ptr [rsp+0x30], xmm13
    111         movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
    112         movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
    113         movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
    114         movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
    115         movdqa  xmm12, xmm8
    116         punpckldq xmm8, xmm9
    117         punpckhdq xmm12, xmm9
    118         movdqa  xmm14, xmm10
    119         punpckldq xmm10, xmm11
    120         punpckhdq xmm14, xmm11
    121         movdqa  xmm9, xmm8
    122         punpcklqdq xmm8, xmm10
    123         punpckhqdq xmm9, xmm10
    124         movdqa  xmm13, xmm12
    125         punpcklqdq xmm12, xmm14
    126         punpckhqdq xmm13, xmm14
    127         movdqa  xmmword ptr [rsp+0x40], xmm8
    128         movdqa  xmmword ptr [rsp+0x50], xmm9
    129         movdqa  xmmword ptr [rsp+0x60], xmm12
    130         movdqa  xmmword ptr [rsp+0x70], xmm13
    131         movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
    132         movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
    133         movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
    134         movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
    135         movdqa  xmm12, xmm8
    136         punpckldq xmm8, xmm9
    137         punpckhdq xmm12, xmm9
    138         movdqa  xmm14, xmm10
    139         punpckldq xmm10, xmm11
    140         punpckhdq xmm14, xmm11
    141         movdqa  xmm9, xmm8
    142         punpcklqdq xmm8, xmm10
    143         punpckhqdq xmm9, xmm10
    144         movdqa  xmm13, xmm12
    145         punpcklqdq xmm12, xmm14
    146         punpckhqdq xmm13, xmm14
    147         movdqa  xmmword ptr [rsp+0x80], xmm8
    148         movdqa  xmmword ptr [rsp+0x90], xmm9
    149         movdqa  xmmword ptr [rsp+0xA0], xmm12
    150         movdqa  xmmword ptr [rsp+0xB0], xmm13
    151         movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
    152         movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
    153         movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
    154         movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
    155         movdqa  xmm12, xmm8
    156         punpckldq xmm8, xmm9
    157         punpckhdq xmm12, xmm9
    158         movdqa  xmm14, xmm10
    159         punpckldq xmm10, xmm11
    160         punpckhdq xmm14, xmm11
    161         movdqa  xmm9, xmm8
    162         punpcklqdq xmm8, xmm10
    163         punpckhqdq xmm9, xmm10
    164         movdqa  xmm13, xmm12
    165         punpcklqdq xmm12, xmm14
    166         punpckhqdq xmm13, xmm14
    167         movdqa  xmmword ptr [rsp+0xC0], xmm8
    168         movdqa  xmmword ptr [rsp+0xD0], xmm9
    169         movdqa  xmmword ptr [rsp+0xE0], xmm12
    170         movdqa  xmmword ptr [rsp+0xF0], xmm13
    171         movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
    172         movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
    173         movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
    174         movdqa  xmm12, xmmword ptr [rsp+0x110]
    175         movdqa  xmm13, xmmword ptr [rsp+0x120]
    176         movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
    177         movd    xmm15, eax
    178         pshufd  xmm15, xmm15, 0x00
    179         prefetcht0 [r8+rdx+0x80]
    180         prefetcht0 [r9+rdx+0x80]
    181         prefetcht0 [r10+rdx+0x80]
    182         prefetcht0 [r11+rdx+0x80]
    183         paddd   xmm0, xmmword ptr [rsp]
    184         paddd   xmm1, xmmword ptr [rsp+0x20]
    185         paddd   xmm2, xmmword ptr [rsp+0x40]
    186         paddd   xmm3, xmmword ptr [rsp+0x60]
    187         paddd   xmm0, xmm4
    188         paddd   xmm1, xmm5
    189         paddd   xmm2, xmm6
    190         paddd   xmm3, xmm7
    191         pxor    xmm12, xmm0
    192         pxor    xmm13, xmm1
    193         pxor    xmm14, xmm2
    194         pxor    xmm15, xmm3
    195         pshuflw xmm12, xmm12, 0xB1
    196         pshufhw xmm12, xmm12, 0xB1
    197         pshuflw xmm13, xmm13, 0xB1
    198         pshufhw xmm13, xmm13, 0xB1
    199         pshuflw xmm14, xmm14, 0xB1
    200         pshufhw xmm14, xmm14, 0xB1
    201         pshuflw xmm15, xmm15, 0xB1
    202         pshufhw xmm15, xmm15, 0xB1
    203         movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
    204         paddd   xmm8, xmm12
    205         paddd   xmm9, xmm13
    206         paddd   xmm10, xmm14
    207         paddd   xmm11, xmm15
    208         pxor    xmm4, xmm8
    209         pxor    xmm5, xmm9
    210         pxor    xmm6, xmm10
    211         pxor    xmm7, xmm11
    212         movdqa  xmmword ptr [rsp+0x100], xmm8
    213         movdqa  xmm8, xmm4
    214         psrld   xmm8, 12
    215         pslld   xmm4, 20
    216         por     xmm4, xmm8
    217         movdqa  xmm8, xmm5
    218         psrld   xmm8, 12
    219         pslld   xmm5, 20
    220         por     xmm5, xmm8
    221         movdqa  xmm8, xmm6
    222         psrld   xmm8, 12
    223         pslld   xmm6, 20
    224         por     xmm6, xmm8
    225         movdqa  xmm8, xmm7
    226         psrld   xmm8, 12
    227         pslld   xmm7, 20
    228         por     xmm7, xmm8
    229         paddd   xmm0, xmmword ptr [rsp+0x10]
    230         paddd   xmm1, xmmword ptr [rsp+0x30]
    231         paddd   xmm2, xmmword ptr [rsp+0x50]
    232         paddd   xmm3, xmmword ptr [rsp+0x70]
    233         paddd   xmm0, xmm4
    234         paddd   xmm1, xmm5
    235         paddd   xmm2, xmm6
    236         paddd   xmm3, xmm7
    237         pxor    xmm12, xmm0
    238         pxor    xmm13, xmm1
    239         pxor    xmm14, xmm2
    240         pxor    xmm15, xmm3
    241         movdqa  xmm8, xmm12
    242         psrld   xmm12, 8
    243         pslld   xmm8, 24
    244         pxor    xmm12, xmm8
    245         movdqa  xmm8, xmm13
    246         psrld   xmm13, 8
    247         pslld   xmm8, 24
    248         pxor    xmm13, xmm8
    249         movdqa  xmm8, xmm14
    250         psrld   xmm14, 8
    251         pslld   xmm8, 24
    252         pxor    xmm14, xmm8
    253         movdqa  xmm8, xmm15
    254         psrld   xmm15, 8
    255         pslld   xmm8, 24
    256         pxor    xmm15, xmm8
    257         movdqa  xmm8, xmmword ptr [rsp+0x100]
    258         paddd   xmm8, xmm12
    259         paddd   xmm9, xmm13
    260         paddd   xmm10, xmm14
    261         paddd   xmm11, xmm15
    262         pxor    xmm4, xmm8
    263         pxor    xmm5, xmm9
    264         pxor    xmm6, xmm10
    265         pxor    xmm7, xmm11
    266         movdqa  xmmword ptr [rsp+0x100], xmm8
    267         movdqa  xmm8, xmm4
    268         psrld   xmm8, 7
    269         pslld   xmm4, 25
    270         por     xmm4, xmm8
    271         movdqa  xmm8, xmm5
    272         psrld   xmm8, 7
    273         pslld   xmm5, 25
    274         por     xmm5, xmm8
    275         movdqa  xmm8, xmm6
    276         psrld   xmm8, 7
    277         pslld   xmm6, 25
    278         por     xmm6, xmm8
    279         movdqa  xmm8, xmm7
    280         psrld   xmm8, 7
    281         pslld   xmm7, 25
    282         por     xmm7, xmm8
    283         paddd   xmm0, xmmword ptr [rsp+0x80]
    284         paddd   xmm1, xmmword ptr [rsp+0xA0]
    285         paddd   xmm2, xmmword ptr [rsp+0xC0]
    286         paddd   xmm3, xmmword ptr [rsp+0xE0]
    287         paddd   xmm0, xmm5
    288         paddd   xmm1, xmm6
    289         paddd   xmm2, xmm7
    290         paddd   xmm3, xmm4
    291         pxor    xmm15, xmm0
    292         pxor    xmm12, xmm1
    293         pxor    xmm13, xmm2
    294         pxor    xmm14, xmm3
    295         pshuflw xmm15, xmm15, 0xB1
    296         pshufhw xmm15, xmm15, 0xB1
    297         pshuflw xmm12, xmm12, 0xB1
    298         pshufhw xmm12, xmm12, 0xB1
    299         pshuflw xmm13, xmm13, 0xB1
    300         pshufhw xmm13, xmm13, 0xB1
    301         pshuflw xmm14, xmm14, 0xB1
    302         pshufhw xmm14, xmm14, 0xB1
    303         paddd   xmm10, xmm15
    304         paddd   xmm11, xmm12
    305         movdqa  xmm8, xmmword ptr [rsp+0x100]
    306         paddd   xmm8, xmm13
    307         paddd   xmm9, xmm14
    308         pxor    xmm5, xmm10
    309         pxor    xmm6, xmm11
    310         pxor    xmm7, xmm8
    311         pxor    xmm4, xmm9
    312         movdqa  xmmword ptr [rsp+0x100], xmm8
    313         movdqa  xmm8, xmm5
    314         psrld   xmm8, 12
    315         pslld   xmm5, 20
    316         por     xmm5, xmm8
    317         movdqa  xmm8, xmm6
    318         psrld   xmm8, 12
    319         pslld   xmm6, 20
    320         por     xmm6, xmm8
    321         movdqa  xmm8, xmm7
    322         psrld   xmm8, 12
    323         pslld   xmm7, 20
    324         por     xmm7, xmm8
    325         movdqa  xmm8, xmm4
    326         psrld   xmm8, 12
    327         pslld   xmm4, 20
    328         por     xmm4, xmm8
    329         paddd   xmm0, xmmword ptr [rsp+0x90]
    330         paddd   xmm1, xmmword ptr [rsp+0xB0]
    331         paddd   xmm2, xmmword ptr [rsp+0xD0]
    332         paddd   xmm3, xmmword ptr [rsp+0xF0]
    333         paddd   xmm0, xmm5
    334         paddd   xmm1, xmm6
    335         paddd   xmm2, xmm7
    336         paddd   xmm3, xmm4
    337         pxor    xmm15, xmm0
    338         pxor    xmm12, xmm1
    339         pxor    xmm13, xmm2
    340         pxor    xmm14, xmm3
    341         movdqa  xmm8, xmm15
    342         psrld   xmm15, 8
    343         pslld   xmm8, 24
    344         pxor    xmm15, xmm8
    345         movdqa  xmm8, xmm12
    346         psrld   xmm12, 8
    347         pslld   xmm8, 24
    348         pxor    xmm12, xmm8
    349         movdqa  xmm8, xmm13
    350         psrld   xmm13, 8
    351         pslld   xmm8, 24
    352         pxor    xmm13, xmm8
    353         movdqa  xmm8, xmm14
    354         psrld   xmm14, 8
    355         pslld   xmm8, 24
    356         pxor    xmm14, xmm8
    357         paddd   xmm10, xmm15
    358         paddd   xmm11, xmm12
    359         movdqa  xmm8, xmmword ptr [rsp+0x100]
    360         paddd   xmm8, xmm13
    361         paddd   xmm9, xmm14
    362         pxor    xmm5, xmm10
    363         pxor    xmm6, xmm11
    364         pxor    xmm7, xmm8
    365         pxor    xmm4, xmm9
    366         movdqa  xmmword ptr [rsp+0x100], xmm8
    367         movdqa  xmm8, xmm5
    368         psrld   xmm8, 7
    369         pslld   xmm5, 25
    370         por     xmm5, xmm8
    371         movdqa  xmm8, xmm6
    372         psrld   xmm8, 7
    373         pslld   xmm6, 25
    374         por     xmm6, xmm8
    375         movdqa  xmm8, xmm7
    376         psrld   xmm8, 7
    377         pslld   xmm7, 25
    378         por     xmm7, xmm8
    379         movdqa  xmm8, xmm4
    380         psrld   xmm8, 7
    381         pslld   xmm4, 25
    382         por     xmm4, xmm8
    383         paddd   xmm0, xmmword ptr [rsp+0x20]
    384         paddd   xmm1, xmmword ptr [rsp+0x30]
    385         paddd   xmm2, xmmword ptr [rsp+0x70]
    386         paddd   xmm3, xmmword ptr [rsp+0x40]
    387         paddd   xmm0, xmm4
    388         paddd   xmm1, xmm5
    389         paddd   xmm2, xmm6
    390         paddd   xmm3, xmm7
    391         pxor    xmm12, xmm0
    392         pxor    xmm13, xmm1
    393         pxor    xmm14, xmm2
    394         pxor    xmm15, xmm3
    395         pshuflw xmm12, xmm12, 0xB1
    396         pshufhw xmm12, xmm12, 0xB1
    397         pshuflw xmm13, xmm13, 0xB1
    398         pshufhw xmm13, xmm13, 0xB1
    399         pshuflw xmm14, xmm14, 0xB1
    400         pshufhw xmm14, xmm14, 0xB1
    401         pshuflw xmm15, xmm15, 0xB1
    402         pshufhw xmm15, xmm15, 0xB1
    403         movdqa  xmm8, xmmword ptr [rsp+0x100]
    404         paddd   xmm8, xmm12
    405         paddd   xmm9, xmm13
    406         paddd   xmm10, xmm14
    407         paddd   xmm11, xmm15
    408         pxor    xmm4, xmm8
    409         pxor    xmm5, xmm9
    410         pxor    xmm6, xmm10
    411         pxor    xmm7, xmm11
    412         movdqa  xmmword ptr [rsp+0x100], xmm8
    413         movdqa  xmm8, xmm4
    414         psrld   xmm8, 12
    415         pslld   xmm4, 20
    416         por     xmm4, xmm8
    417         movdqa  xmm8, xmm5
    418         psrld   xmm8, 12
    419         pslld   xmm5, 20
    420         por     xmm5, xmm8
    421         movdqa  xmm8, xmm6
    422         psrld   xmm8, 12
    423         pslld   xmm6, 20
    424         por     xmm6, xmm8
    425         movdqa  xmm8, xmm7
    426         psrld   xmm8, 12
    427         pslld   xmm7, 20
    428         por     xmm7, xmm8
    429         paddd   xmm0, xmmword ptr [rsp+0x60]
    430         paddd   xmm1, xmmword ptr [rsp+0xA0]
    431         paddd   xmm2, xmmword ptr [rsp]
    432         paddd   xmm3, xmmword ptr [rsp+0xD0]
    433         paddd   xmm0, xmm4
    434         paddd   xmm1, xmm5
    435         paddd   xmm2, xmm6
    436         paddd   xmm3, xmm7
    437         pxor    xmm12, xmm0
    438         pxor    xmm13, xmm1
    439         pxor    xmm14, xmm2
    440         pxor    xmm15, xmm3
    441         movdqa  xmm8, xmm12
    442         psrld   xmm12, 8
    443         pslld   xmm8, 24
    444         pxor    xmm12, xmm8
    445         movdqa  xmm8, xmm13
    446         psrld   xmm13, 8
    447         pslld   xmm8, 24
    448         pxor    xmm13, xmm8
    449         movdqa  xmm8, xmm14
    450         psrld   xmm14, 8
    451         pslld   xmm8, 24
    452         pxor    xmm14, xmm8
    453         movdqa  xmm8, xmm15
    454         psrld   xmm15, 8
    455         pslld   xmm8, 24
    456         pxor    xmm15, xmm8
    457         movdqa  xmm8, xmmword ptr [rsp+0x100]
    458         paddd   xmm8, xmm12
    459         paddd   xmm9, xmm13
    460         paddd   xmm10, xmm14
    461         paddd   xmm11, xmm15
    462         pxor    xmm4, xmm8
    463         pxor    xmm5, xmm9
    464         pxor    xmm6, xmm10
    465         pxor    xmm7, xmm11
    466         movdqa  xmmword ptr [rsp+0x100], xmm8
    467         movdqa  xmm8, xmm4
    468         psrld   xmm8, 7
    469         pslld   xmm4, 25
    470         por     xmm4, xmm8
    471         movdqa  xmm8, xmm5
    472         psrld   xmm8, 7
    473         pslld   xmm5, 25
    474         por     xmm5, xmm8
    475         movdqa  xmm8, xmm6
    476         psrld   xmm8, 7
    477         pslld   xmm6, 25
    478         por     xmm6, xmm8
    479         movdqa  xmm8, xmm7
    480         psrld   xmm8, 7
    481         pslld   xmm7, 25
    482         por     xmm7, xmm8
    483         paddd   xmm0, xmmword ptr [rsp+0x10]
    484         paddd   xmm1, xmmword ptr [rsp+0xC0]
    485         paddd   xmm2, xmmword ptr [rsp+0x90]
    486         paddd   xmm3, xmmword ptr [rsp+0xF0]
    487         paddd   xmm0, xmm5
    488         paddd   xmm1, xmm6
    489         paddd   xmm2, xmm7
    490         paddd   xmm3, xmm4
    491         pxor    xmm15, xmm0
    492         pxor    xmm12, xmm1
    493         pxor    xmm13, xmm2
    494         pxor    xmm14, xmm3
    495         pshuflw xmm15, xmm15, 0xB1
    496         pshufhw xmm15, xmm15, 0xB1
    497         pshuflw xmm12, xmm12, 0xB1
    498         pshufhw xmm12, xmm12, 0xB1
    499         pshuflw xmm13, xmm13, 0xB1
    500         pshufhw xmm13, xmm13, 0xB1
    501         pshuflw xmm14, xmm14, 0xB1
    502         pshufhw xmm14, xmm14, 0xB1
    503         paddd   xmm10, xmm15
    504         paddd   xmm11, xmm12
    505         movdqa  xmm8, xmmword ptr [rsp+0x100]
    506         paddd   xmm8, xmm13
    507         paddd   xmm9, xmm14
    508         pxor    xmm5, xmm10
    509         pxor    xmm6, xmm11
    510         pxor    xmm7, xmm8
    511         pxor    xmm4, xmm9
    512         movdqa  xmmword ptr [rsp+0x100], xmm8
    513         movdqa  xmm8, xmm5
    514         psrld   xmm8, 12
    515         pslld   xmm5, 20
    516         por     xmm5, xmm8
    517         movdqa  xmm8, xmm6
    518         psrld   xmm8, 12
    519         pslld   xmm6, 20
    520         por     xmm6, xmm8
    521         movdqa  xmm8, xmm7
    522         psrld   xmm8, 12
    523         pslld   xmm7, 20
    524         por     xmm7, xmm8
    525         movdqa  xmm8, xmm4
    526         psrld   xmm8, 12
    527         pslld   xmm4, 20
    528         por     xmm4, xmm8
    529         paddd   xmm0, xmmword ptr [rsp+0xB0]
    530         paddd   xmm1, xmmword ptr [rsp+0x50]
    531         paddd   xmm2, xmmword ptr [rsp+0xE0]
    532         paddd   xmm3, xmmword ptr [rsp+0x80]
    533         paddd   xmm0, xmm5
    534         paddd   xmm1, xmm6
    535         paddd   xmm2, xmm7
    536         paddd   xmm3, xmm4
    537         pxor    xmm15, xmm0
    538         pxor    xmm12, xmm1
    539         pxor    xmm13, xmm2
    540         pxor    xmm14, xmm3
    541         movdqa  xmm8, xmm15
    542         psrld   xmm15, 8
    543         pslld   xmm8, 24
    544         pxor    xmm15, xmm8
    545         movdqa  xmm8, xmm12
    546         psrld   xmm12, 8
    547         pslld   xmm8, 24
    548         pxor    xmm12, xmm8
    549         movdqa  xmm8, xmm13
    550         psrld   xmm13, 8
    551         pslld   xmm8, 24
    552         pxor    xmm13, xmm8
    553         movdqa  xmm8, xmm14
    554         psrld   xmm14, 8
    555         pslld   xmm8, 24
    556         pxor    xmm14, xmm8
    557         paddd   xmm10, xmm15
    558         paddd   xmm11, xmm12
    559         movdqa  xmm8, xmmword ptr [rsp+0x100]
    560         paddd   xmm8, xmm13
    561         paddd   xmm9, xmm14
    562         pxor    xmm5, xmm10
    563         pxor    xmm6, xmm11
    564         pxor    xmm7, xmm8
    565         pxor    xmm4, xmm9
    566         movdqa  xmmword ptr [rsp+0x100], xmm8
    567         movdqa  xmm8, xmm5
    568         psrld   xmm8, 7
    569         pslld   xmm5, 25
    570         por     xmm5, xmm8
    571         movdqa  xmm8, xmm6
    572         psrld   xmm8, 7
    573         pslld   xmm6, 25
    574         por     xmm6, xmm8
    575         movdqa  xmm8, xmm7
    576         psrld   xmm8, 7
    577         pslld   xmm7, 25
    578         por     xmm7, xmm8
    579         movdqa  xmm8, xmm4
    580         psrld   xmm8, 7
    581         pslld   xmm4, 25
    582         por     xmm4, xmm8
    583         paddd   xmm0, xmmword ptr [rsp+0x30]
    584         paddd   xmm1, xmmword ptr [rsp+0xA0]
    585         paddd   xmm2, xmmword ptr [rsp+0xD0]
    586         paddd   xmm3, xmmword ptr [rsp+0x70]
    587         paddd   xmm0, xmm4
    588         paddd   xmm1, xmm5
    589         paddd   xmm2, xmm6
    590         paddd   xmm3, xmm7
    591         pxor    xmm12, xmm0
    592         pxor    xmm13, xmm1
    593         pxor    xmm14, xmm2
    594         pxor    xmm15, xmm3
    595         pshuflw xmm12, xmm12, 0xB1
    596         pshufhw xmm12, xmm12, 0xB1
    597         pshuflw xmm13, xmm13, 0xB1
    598         pshufhw xmm13, xmm13, 0xB1
    599         pshuflw xmm14, xmm14, 0xB1
    600         pshufhw xmm14, xmm14, 0xB1
    601         pshuflw xmm15, xmm15, 0xB1
    602         pshufhw xmm15, xmm15, 0xB1
    603         movdqa  xmm8, xmmword ptr [rsp+0x100]
    604         paddd   xmm8, xmm12
    605         paddd   xmm9, xmm13
    606         paddd   xmm10, xmm14
    607         paddd   xmm11, xmm15
    608         pxor    xmm4, xmm8
    609         pxor    xmm5, xmm9
    610         pxor    xmm6, xmm10
    611         pxor    xmm7, xmm11
    612         movdqa  xmmword ptr [rsp+0x100], xmm8
    613         movdqa  xmm8, xmm4
    614         psrld   xmm8, 12
    615         pslld   xmm4, 20
    616         por     xmm4, xmm8
    617         movdqa  xmm8, xmm5
    618         psrld   xmm8, 12
    619         pslld   xmm5, 20
    620         por     xmm5, xmm8
    621         movdqa  xmm8, xmm6
    622         psrld   xmm8, 12
    623         pslld   xmm6, 20
    624         por     xmm6, xmm8
    625         movdqa  xmm8, xmm7
    626         psrld   xmm8, 12
    627         pslld   xmm7, 20
    628         por     xmm7, xmm8
    629         paddd   xmm0, xmmword ptr [rsp+0x40]
    630         paddd   xmm1, xmmword ptr [rsp+0xC0]
    631         paddd   xmm2, xmmword ptr [rsp+0x20]
    632         paddd   xmm3, xmmword ptr [rsp+0xE0]
    633         paddd   xmm0, xmm4
    634         paddd   xmm1, xmm5
    635         paddd   xmm2, xmm6
    636         paddd   xmm3, xmm7
    637         pxor    xmm12, xmm0
    638         pxor    xmm13, xmm1
    639         pxor    xmm14, xmm2
    640         pxor    xmm15, xmm3
    641         movdqa  xmm8, xmm12
    642         psrld   xmm12, 8
    643         pslld   xmm8, 24
    644         pxor    xmm12, xmm8
    645         movdqa  xmm8, xmm13
    646         psrld   xmm13, 8
    647         pslld   xmm8, 24
    648         pxor    xmm13, xmm8
    649         movdqa  xmm8, xmm14
    650         psrld   xmm14, 8
    651         pslld   xmm8, 24
    652         pxor    xmm14, xmm8
    653         movdqa  xmm8, xmm15
    654         psrld   xmm15, 8
    655         pslld   xmm8, 24
    656         pxor    xmm15, xmm8
    657         movdqa  xmm8, xmmword ptr [rsp+0x100]
    658         paddd   xmm8, xmm12
    659         paddd   xmm9, xmm13
    660         paddd   xmm10, xmm14
    661         paddd   xmm11, xmm15
    662         pxor    xmm4, xmm8
    663         pxor    xmm5, xmm9
    664         pxor    xmm6, xmm10
    665         pxor    xmm7, xmm11
    666         movdqa  xmmword ptr [rsp+0x100], xmm8
    667         movdqa  xmm8, xmm4
    668         psrld   xmm8, 7
    669         pslld   xmm4, 25
    670         por     xmm4, xmm8
    671         movdqa  xmm8, xmm5
    672         psrld   xmm8, 7
    673         pslld   xmm5, 25
    674         por     xmm5, xmm8
    675         movdqa  xmm8, xmm6
    676         psrld   xmm8, 7
    677         pslld   xmm6, 25
    678         por     xmm6, xmm8
    679         movdqa  xmm8, xmm7
    680         psrld   xmm8, 7
    681         pslld   xmm7, 25
    682         por     xmm7, xmm8
    683         paddd   xmm0, xmmword ptr [rsp+0x60]
    684         paddd   xmm1, xmmword ptr [rsp+0x90]
    685         paddd   xmm2, xmmword ptr [rsp+0xB0]
    686         paddd   xmm3, xmmword ptr [rsp+0x80]
    687         paddd   xmm0, xmm5
    688         paddd   xmm1, xmm6
    689         paddd   xmm2, xmm7
    690         paddd   xmm3, xmm4
    691         pxor    xmm15, xmm0
    692         pxor    xmm12, xmm1
    693         pxor    xmm13, xmm2
    694         pxor    xmm14, xmm3
    695         pshuflw xmm15, xmm15, 0xB1
    696         pshufhw xmm15, xmm15, 0xB1
    697         pshuflw xmm12, xmm12, 0xB1
    698         pshufhw xmm12, xmm12, 0xB1
    699         pshuflw xmm13, xmm13, 0xB1
    700         pshufhw xmm13, xmm13, 0xB1
    701         pshuflw xmm14, xmm14, 0xB1
    702         pshufhw xmm14, xmm14, 0xB1
    703         paddd   xmm10, xmm15
    704         paddd   xmm11, xmm12
    705         movdqa  xmm8, xmmword ptr [rsp+0x100]
    706         paddd   xmm8, xmm13
    707         paddd   xmm9, xmm14
    708         pxor    xmm5, xmm10
    709         pxor    xmm6, xmm11
    710         pxor    xmm7, xmm8
    711         pxor    xmm4, xmm9
    712         movdqa  xmmword ptr [rsp+0x100], xmm8
    713         movdqa  xmm8, xmm5
    714         psrld   xmm8, 12
    715         pslld   xmm5, 20
    716         por     xmm5, xmm8
    717         movdqa  xmm8, xmm6
    718         psrld   xmm8, 12
    719         pslld   xmm6, 20
    720         por     xmm6, xmm8
    721         movdqa  xmm8, xmm7
    722         psrld   xmm8, 12
    723         pslld   xmm7, 20
    724         por     xmm7, xmm8
    725         movdqa  xmm8, xmm4
    726         psrld   xmm8, 12
    727         pslld   xmm4, 20
    728         por     xmm4, xmm8
    729         paddd   xmm0, xmmword ptr [rsp+0x50]
    730         paddd   xmm1, xmmword ptr [rsp]
    731         paddd   xmm2, xmmword ptr [rsp+0xF0]
    732         paddd   xmm3, xmmword ptr [rsp+0x10]
    733         paddd   xmm0, xmm5
    734         paddd   xmm1, xmm6
    735         paddd   xmm2, xmm7
    736         paddd   xmm3, xmm4
    737         pxor    xmm15, xmm0
    738         pxor    xmm12, xmm1
    739         pxor    xmm13, xmm2
    740         pxor    xmm14, xmm3
    741         movdqa  xmm8, xmm15
    742         psrld   xmm15, 8
    743         pslld   xmm8, 24
    744         pxor    xmm15, xmm8
    745         movdqa  xmm8, xmm12
    746         psrld   xmm12, 8
    747         pslld   xmm8, 24
    748         pxor    xmm12, xmm8
    749         movdqa  xmm8, xmm13
    750         psrld   xmm13, 8
    751         pslld   xmm8, 24
    752         pxor    xmm13, xmm8
    753         movdqa  xmm8, xmm14
    754         psrld   xmm14, 8
    755         pslld   xmm8, 24
    756         pxor    xmm14, xmm8
    757         paddd   xmm10, xmm15
    758         paddd   xmm11, xmm12
    759         movdqa  xmm8, xmmword ptr [rsp+0x100]
    760         paddd   xmm8, xmm13
    761         paddd   xmm9, xmm14
    762         pxor    xmm5, xmm10
    763         pxor    xmm6, xmm11
    764         pxor    xmm7, xmm8
    765         pxor    xmm4, xmm9
    766         movdqa  xmmword ptr [rsp+0x100], xmm8
    767         movdqa  xmm8, xmm5
    768         psrld   xmm8, 7
    769         pslld   xmm5, 25
    770         por     xmm5, xmm8
    771         movdqa  xmm8, xmm6
    772         psrld   xmm8, 7
    773         pslld   xmm6, 25
    774         por     xmm6, xmm8
    775         movdqa  xmm8, xmm7
    776         psrld   xmm8, 7
    777         pslld   xmm7, 25
    778         por     xmm7, xmm8
    779         movdqa  xmm8, xmm4
    780         psrld   xmm8, 7
    781         pslld   xmm4, 25
    782         por     xmm4, xmm8
    783         paddd   xmm0, xmmword ptr [rsp+0xA0]
    784         paddd   xmm1, xmmword ptr [rsp+0xC0]
    785         paddd   xmm2, xmmword ptr [rsp+0xE0]
    786         paddd   xmm3, xmmword ptr [rsp+0xD0]
    787         paddd   xmm0, xmm4
    788         paddd   xmm1, xmm5
    789         paddd   xmm2, xmm6
    790         paddd   xmm3, xmm7
    791         pxor    xmm12, xmm0
    792         pxor    xmm13, xmm1
    793         pxor    xmm14, xmm2
    794         pxor    xmm15, xmm3
    795         pshuflw xmm12, xmm12, 0xB1
    796         pshufhw xmm12, xmm12, 0xB1
    797         pshuflw xmm13, xmm13, 0xB1
    798         pshufhw xmm13, xmm13, 0xB1
    799         pshuflw xmm14, xmm14, 0xB1
    800         pshufhw xmm14, xmm14, 0xB1
    801         pshuflw xmm15, xmm15, 0xB1
    802         pshufhw xmm15, xmm15, 0xB1
    803         movdqa  xmm8, xmmword ptr [rsp+0x100]
    804         paddd   xmm8, xmm12
    805         paddd   xmm9, xmm13
    806         paddd   xmm10, xmm14
    807         paddd   xmm11, xmm15
    808         pxor    xmm4, xmm8
    809         pxor    xmm5, xmm9
    810         pxor    xmm6, xmm10
    811         pxor    xmm7, xmm11
    812         movdqa  xmmword ptr [rsp+0x100], xmm8
    813         movdqa  xmm8, xmm4
    814         psrld   xmm8, 12
    815         pslld   xmm4, 20
    816         por     xmm4, xmm8
    817         movdqa  xmm8, xmm5
    818         psrld   xmm8, 12
    819         pslld   xmm5, 20
    820         por     xmm5, xmm8
    821         movdqa  xmm8, xmm6
    822         psrld   xmm8, 12
    823         pslld   xmm6, 20
    824         por     xmm6, xmm8
    825         movdqa  xmm8, xmm7
    826         psrld   xmm8, 12
    827         pslld   xmm7, 20
    828         por     xmm7, xmm8
    829         paddd   xmm0, xmmword ptr [rsp+0x70]
    830         paddd   xmm1, xmmword ptr [rsp+0x90]
    831         paddd   xmm2, xmmword ptr [rsp+0x30]
    832         paddd   xmm3, xmmword ptr [rsp+0xF0]
    833         paddd   xmm0, xmm4
    834         paddd   xmm1, xmm5
    835         paddd   xmm2, xmm6
    836         paddd   xmm3, xmm7
    837         pxor    xmm12, xmm0
    838         pxor    xmm13, xmm1
    839         pxor    xmm14, xmm2
    840         pxor    xmm15, xmm3
    841         movdqa  xmm8, xmm12
    842         psrld   xmm12, 8
    843         pslld   xmm8, 24
    844         pxor    xmm12, xmm8
    845         movdqa  xmm8, xmm13
    846         psrld   xmm13, 8
    847         pslld   xmm8, 24
    848         pxor    xmm13, xmm8
    849         movdqa  xmm8, xmm14
    850         psrld   xmm14, 8
    851         pslld   xmm8, 24
    852         pxor    xmm14, xmm8
    853         movdqa  xmm8, xmm15
    854         psrld   xmm15, 8
    855         pslld   xmm8, 24
    856         pxor    xmm15, xmm8
    857         movdqa  xmm8, xmmword ptr [rsp+0x100]
    858         paddd   xmm8, xmm12
    859         paddd   xmm9, xmm13
    860         paddd   xmm10, xmm14
    861         paddd   xmm11, xmm15
    862         pxor    xmm4, xmm8
    863         pxor    xmm5, xmm9
    864         pxor    xmm6, xmm10
    865         pxor    xmm7, xmm11
    866         movdqa  xmmword ptr [rsp+0x100], xmm8
    867         movdqa  xmm8, xmm4
    868         psrld   xmm8, 7
    869         pslld   xmm4, 25
    870         por     xmm4, xmm8
    871         movdqa  xmm8, xmm5
    872         psrld   xmm8, 7
    873         pslld   xmm5, 25
    874         por     xmm5, xmm8
    875         movdqa  xmm8, xmm6
    876         psrld   xmm8, 7
    877         pslld   xmm6, 25
    878         por     xmm6, xmm8
    879         movdqa  xmm8, xmm7
    880         psrld   xmm8, 7
    881         pslld   xmm7, 25
    882         por     xmm7, xmm8
    883         paddd   xmm0, xmmword ptr [rsp+0x40]
    884         paddd   xmm1, xmmword ptr [rsp+0xB0]
    885         paddd   xmm2, xmmword ptr [rsp+0x50]
    886         paddd   xmm3, xmmword ptr [rsp+0x10]
    887         paddd   xmm0, xmm5
    888         paddd   xmm1, xmm6
    889         paddd   xmm2, xmm7
    890         paddd   xmm3, xmm4
    891         pxor    xmm15, xmm0
    892         pxor    xmm12, xmm1
    893         pxor    xmm13, xmm2
    894         pxor    xmm14, xmm3
    895         pshuflw xmm15, xmm15, 0xB1
    896         pshufhw xmm15, xmm15, 0xB1
    897         pshuflw xmm12, xmm12, 0xB1
    898         pshufhw xmm12, xmm12, 0xB1
    899         pshuflw xmm13, xmm13, 0xB1
    900         pshufhw xmm13, xmm13, 0xB1
    901         pshuflw xmm14, xmm14, 0xB1
    902         pshufhw xmm14, xmm14, 0xB1
    903         paddd   xmm10, xmm15
    904         paddd   xmm11, xmm12
    905         movdqa  xmm8, xmmword ptr [rsp+0x100]
    906         paddd   xmm8, xmm13
    907         paddd   xmm9, xmm14
    908         pxor    xmm5, xmm10
    909         pxor    xmm6, xmm11
    910         pxor    xmm7, xmm8
    911         pxor    xmm4, xmm9
    912         movdqa  xmmword ptr [rsp+0x100], xmm8
    913         movdqa  xmm8, xmm5
    914         psrld   xmm8, 12
    915         pslld   xmm5, 20
    916         por     xmm5, xmm8
    917         movdqa  xmm8, xmm6
    918         psrld   xmm8, 12
    919         pslld   xmm6, 20
    920         por     xmm6, xmm8
    921         movdqa  xmm8, xmm7
    922         psrld   xmm8, 12
    923         pslld   xmm7, 20
    924         por     xmm7, xmm8
    925         movdqa  xmm8, xmm4
    926         psrld   xmm8, 12
    927         pslld   xmm4, 20
    928         por     xmm4, xmm8
    929         paddd   xmm0, xmmword ptr [rsp]
    930         paddd   xmm1, xmmword ptr [rsp+0x20]
    931         paddd   xmm2, xmmword ptr [rsp+0x80]
    932         paddd   xmm3, xmmword ptr [rsp+0x60]
    933         paddd   xmm0, xmm5
    934         paddd   xmm1, xmm6
    935         paddd   xmm2, xmm7
    936         paddd   xmm3, xmm4
    937         pxor    xmm15, xmm0
    938         pxor    xmm12, xmm1
    939         pxor    xmm13, xmm2
    940         pxor    xmm14, xmm3
    941         movdqa  xmm8, xmm15
    942         psrld   xmm15, 8
    943         pslld   xmm8, 24
    944         pxor    xmm15, xmm8
    945         movdqa  xmm8, xmm12
    946         psrld   xmm12, 8
    947         pslld   xmm8, 24
    948         pxor    xmm12, xmm8
    949         movdqa  xmm8, xmm13
    950         psrld   xmm13, 8
    951         pslld   xmm8, 24
    952         pxor    xmm13, xmm8
    953         movdqa  xmm8, xmm14
    954         psrld   xmm14, 8
    955         pslld   xmm8, 24
    956         pxor    xmm14, xmm8
    957         paddd   xmm10, xmm15
    958         paddd   xmm11, xmm12
    959         movdqa  xmm8, xmmword ptr [rsp+0x100]
    960         paddd   xmm8, xmm13
    961         paddd   xmm9, xmm14
    962         pxor    xmm5, xmm10
    963         pxor    xmm6, xmm11
    964         pxor    xmm7, xmm8
    965         pxor    xmm4, xmm9
    966         movdqa  xmmword ptr [rsp+0x100], xmm8
    967         movdqa  xmm8, xmm5
    968         psrld   xmm8, 7
    969         pslld   xmm5, 25
    970         por     xmm5, xmm8
    971         movdqa  xmm8, xmm6
    972         psrld   xmm8, 7
    973         pslld   xmm6, 25
    974         por     xmm6, xmm8
    975         movdqa  xmm8, xmm7
    976         psrld   xmm8, 7
    977         pslld   xmm7, 25
    978         por     xmm7, xmm8
    979         movdqa  xmm8, xmm4
    980         psrld   xmm8, 7
    981         pslld   xmm4, 25
    982         por     xmm4, xmm8
    983         paddd   xmm0, xmmword ptr [rsp+0xC0]
    984         paddd   xmm1, xmmword ptr [rsp+0x90]
    985         paddd   xmm2, xmmword ptr [rsp+0xF0]
    986         paddd   xmm3, xmmword ptr [rsp+0xE0]
    987         paddd   xmm0, xmm4
    988         paddd   xmm1, xmm5
    989         paddd   xmm2, xmm6
    990         paddd   xmm3, xmm7
    991         pxor    xmm12, xmm0
    992         pxor    xmm13, xmm1
    993         pxor    xmm14, xmm2
    994         pxor    xmm15, xmm3
    995         pshuflw xmm12, xmm12, 0xB1
    996         pshufhw xmm12, xmm12, 0xB1
    997         pshuflw xmm13, xmm13, 0xB1
    998         pshufhw xmm13, xmm13, 0xB1
    999         pshuflw xmm14, xmm14, 0xB1
   1000         pshufhw xmm14, xmm14, 0xB1
   1001         pshuflw xmm15, xmm15, 0xB1
   1002         pshufhw xmm15, xmm15, 0xB1
   1003         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1004         paddd   xmm8, xmm12
   1005         paddd   xmm9, xmm13
   1006         paddd   xmm10, xmm14
   1007         paddd   xmm11, xmm15
   1008         pxor    xmm4, xmm8
   1009         pxor    xmm5, xmm9
   1010         pxor    xmm6, xmm10
   1011         pxor    xmm7, xmm11
   1012         movdqa  xmmword ptr [rsp+0x100], xmm8
   1013         movdqa  xmm8, xmm4
   1014         psrld   xmm8, 12
   1015         pslld   xmm4, 20
   1016         por     xmm4, xmm8
   1017         movdqa  xmm8, xmm5
   1018         psrld   xmm8, 12
   1019         pslld   xmm5, 20
   1020         por     xmm5, xmm8
   1021         movdqa  xmm8, xmm6
   1022         psrld   xmm8, 12
   1023         pslld   xmm6, 20
   1024         por     xmm6, xmm8
   1025         movdqa  xmm8, xmm7
   1026         psrld   xmm8, 12
   1027         pslld   xmm7, 20
   1028         por     xmm7, xmm8
   1029         paddd   xmm0, xmmword ptr [rsp+0xD0]
   1030         paddd   xmm1, xmmword ptr [rsp+0xB0]
   1031         paddd   xmm2, xmmword ptr [rsp+0xA0]
   1032         paddd   xmm3, xmmword ptr [rsp+0x80]
   1033         paddd   xmm0, xmm4
   1034         paddd   xmm1, xmm5
   1035         paddd   xmm2, xmm6
   1036         paddd   xmm3, xmm7
   1037         pxor    xmm12, xmm0
   1038         pxor    xmm13, xmm1
   1039         pxor    xmm14, xmm2
   1040         pxor    xmm15, xmm3
   1041         movdqa  xmm8, xmm12
   1042         psrld   xmm12, 8
   1043         pslld   xmm8, 24
   1044         pxor    xmm12, xmm8
   1045         movdqa  xmm8, xmm13
   1046         psrld   xmm13, 8
   1047         pslld   xmm8, 24
   1048         pxor    xmm13, xmm8
   1049         movdqa  xmm8, xmm14
   1050         psrld   xmm14, 8
   1051         pslld   xmm8, 24
   1052         pxor    xmm14, xmm8
   1053         movdqa  xmm8, xmm15
   1054         psrld   xmm15, 8
   1055         pslld   xmm8, 24
   1056         pxor    xmm15, xmm8
   1057         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1058         paddd   xmm8, xmm12
   1059         paddd   xmm9, xmm13
   1060         paddd   xmm10, xmm14
   1061         paddd   xmm11, xmm15
   1062         pxor    xmm4, xmm8
   1063         pxor    xmm5, xmm9
   1064         pxor    xmm6, xmm10
   1065         pxor    xmm7, xmm11
   1066         movdqa  xmmword ptr [rsp+0x100], xmm8
   1067         movdqa  xmm8, xmm4
   1068         psrld   xmm8, 7
   1069         pslld   xmm4, 25
   1070         por     xmm4, xmm8
   1071         movdqa  xmm8, xmm5
   1072         psrld   xmm8, 7
   1073         pslld   xmm5, 25
   1074         por     xmm5, xmm8
   1075         movdqa  xmm8, xmm6
   1076         psrld   xmm8, 7
   1077         pslld   xmm6, 25
   1078         por     xmm6, xmm8
   1079         movdqa  xmm8, xmm7
   1080         psrld   xmm8, 7
   1081         pslld   xmm7, 25
   1082         por     xmm7, xmm8
   1083         paddd   xmm0, xmmword ptr [rsp+0x70]
   1084         paddd   xmm1, xmmword ptr [rsp+0x50]
   1085         paddd   xmm2, xmmword ptr [rsp]
   1086         paddd   xmm3, xmmword ptr [rsp+0x60]
   1087         paddd   xmm0, xmm5
   1088         paddd   xmm1, xmm6
   1089         paddd   xmm2, xmm7
   1090         paddd   xmm3, xmm4
   1091         pxor    xmm15, xmm0
   1092         pxor    xmm12, xmm1
   1093         pxor    xmm13, xmm2
   1094         pxor    xmm14, xmm3
   1095         pshuflw xmm15, xmm15, 0xB1
   1096         pshufhw xmm15, xmm15, 0xB1
   1097         pshuflw xmm12, xmm12, 0xB1
   1098         pshufhw xmm12, xmm12, 0xB1
   1099         pshuflw xmm13, xmm13, 0xB1
   1100         pshufhw xmm13, xmm13, 0xB1
   1101         pshuflw xmm14, xmm14, 0xB1
   1102         pshufhw xmm14, xmm14, 0xB1
   1103         paddd   xmm10, xmm15
   1104         paddd   xmm11, xmm12
   1105         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1106         paddd   xmm8, xmm13
   1107         paddd   xmm9, xmm14
   1108         pxor    xmm5, xmm10
   1109         pxor    xmm6, xmm11
   1110         pxor    xmm7, xmm8
   1111         pxor    xmm4, xmm9
   1112         movdqa  xmmword ptr [rsp+0x100], xmm8
   1113         movdqa  xmm8, xmm5
   1114         psrld   xmm8, 12
   1115         pslld   xmm5, 20
   1116         por     xmm5, xmm8
   1117         movdqa  xmm8, xmm6
   1118         psrld   xmm8, 12
   1119         pslld   xmm6, 20
   1120         por     xmm6, xmm8
   1121         movdqa  xmm8, xmm7
   1122         psrld   xmm8, 12
   1123         pslld   xmm7, 20
   1124         por     xmm7, xmm8
   1125         movdqa  xmm8, xmm4
   1126         psrld   xmm8, 12
   1127         pslld   xmm4, 20
   1128         por     xmm4, xmm8
   1129         paddd   xmm0, xmmword ptr [rsp+0x20]
   1130         paddd   xmm1, xmmword ptr [rsp+0x30]
   1131         paddd   xmm2, xmmword ptr [rsp+0x10]
   1132         paddd   xmm3, xmmword ptr [rsp+0x40]
   1133         paddd   xmm0, xmm5
   1134         paddd   xmm1, xmm6
   1135         paddd   xmm2, xmm7
   1136         paddd   xmm3, xmm4
   1137         pxor    xmm15, xmm0
   1138         pxor    xmm12, xmm1
   1139         pxor    xmm13, xmm2
   1140         pxor    xmm14, xmm3
   1141         movdqa  xmm8, xmm15
   1142         psrld   xmm15, 8
   1143         pslld   xmm8, 24
   1144         pxor    xmm15, xmm8
   1145         movdqa  xmm8, xmm12
   1146         psrld   xmm12, 8
   1147         pslld   xmm8, 24
   1148         pxor    xmm12, xmm8
   1149         movdqa  xmm8, xmm13
   1150         psrld   xmm13, 8
   1151         pslld   xmm8, 24
   1152         pxor    xmm13, xmm8
   1153         movdqa  xmm8, xmm14
   1154         psrld   xmm14, 8
   1155         pslld   xmm8, 24
   1156         pxor    xmm14, xmm8
   1157         paddd   xmm10, xmm15
   1158         paddd   xmm11, xmm12
   1159         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1160         paddd   xmm8, xmm13
   1161         paddd   xmm9, xmm14
   1162         pxor    xmm5, xmm10
   1163         pxor    xmm6, xmm11
   1164         pxor    xmm7, xmm8
   1165         pxor    xmm4, xmm9
   1166         movdqa  xmmword ptr [rsp+0x100], xmm8
   1167         movdqa  xmm8, xmm5
   1168         psrld   xmm8, 7
   1169         pslld   xmm5, 25
   1170         por     xmm5, xmm8
   1171         movdqa  xmm8, xmm6
   1172         psrld   xmm8, 7
   1173         pslld   xmm6, 25
   1174         por     xmm6, xmm8
   1175         movdqa  xmm8, xmm7
   1176         psrld   xmm8, 7
   1177         pslld   xmm7, 25
   1178         por     xmm7, xmm8
   1179         movdqa  xmm8, xmm4
   1180         psrld   xmm8, 7
   1181         pslld   xmm4, 25
   1182         por     xmm4, xmm8
   1183         paddd   xmm0, xmmword ptr [rsp+0x90]
   1184         paddd   xmm1, xmmword ptr [rsp+0xB0]
   1185         paddd   xmm2, xmmword ptr [rsp+0x80]
   1186         paddd   xmm3, xmmword ptr [rsp+0xF0]
   1187         paddd   xmm0, xmm4
   1188         paddd   xmm1, xmm5
   1189         paddd   xmm2, xmm6
   1190         paddd   xmm3, xmm7
   1191         pxor    xmm12, xmm0
   1192         pxor    xmm13, xmm1
   1193         pxor    xmm14, xmm2
   1194         pxor    xmm15, xmm3
   1195         pshuflw xmm12, xmm12, 0xB1
   1196         pshufhw xmm12, xmm12, 0xB1
   1197         pshuflw xmm13, xmm13, 0xB1
   1198         pshufhw xmm13, xmm13, 0xB1
   1199         pshuflw xmm14, xmm14, 0xB1
   1200         pshufhw xmm14, xmm14, 0xB1
   1201         pshuflw xmm15, xmm15, 0xB1
   1202         pshufhw xmm15, xmm15, 0xB1
   1203         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1204         paddd   xmm8, xmm12
   1205         paddd   xmm9, xmm13
   1206         paddd   xmm10, xmm14
   1207         paddd   xmm11, xmm15
   1208         pxor    xmm4, xmm8
   1209         pxor    xmm5, xmm9
   1210         pxor    xmm6, xmm10
   1211         pxor    xmm7, xmm11
   1212         movdqa  xmmword ptr [rsp+0x100], xmm8
   1213         movdqa  xmm8, xmm4
   1214         psrld   xmm8, 12
   1215         pslld   xmm4, 20
   1216         por     xmm4, xmm8
   1217         movdqa  xmm8, xmm5
   1218         psrld   xmm8, 12
   1219         pslld   xmm5, 20
   1220         por     xmm5, xmm8
   1221         movdqa  xmm8, xmm6
   1222         psrld   xmm8, 12
   1223         pslld   xmm6, 20
   1224         por     xmm6, xmm8
   1225         movdqa  xmm8, xmm7
   1226         psrld   xmm8, 12
   1227         pslld   xmm7, 20
   1228         por     xmm7, xmm8
   1229         paddd   xmm0, xmmword ptr [rsp+0xE0]
   1230         paddd   xmm1, xmmword ptr [rsp+0x50]
   1231         paddd   xmm2, xmmword ptr [rsp+0xC0]
   1232         paddd   xmm3, xmmword ptr [rsp+0x10]
   1233         paddd   xmm0, xmm4
   1234         paddd   xmm1, xmm5
   1235         paddd   xmm2, xmm6
   1236         paddd   xmm3, xmm7
   1237         pxor    xmm12, xmm0
   1238         pxor    xmm13, xmm1
   1239         pxor    xmm14, xmm2
   1240         pxor    xmm15, xmm3
   1241         movdqa  xmm8, xmm12
   1242         psrld   xmm12, 8
   1243         pslld   xmm8, 24
   1244         pxor    xmm12, xmm8
   1245         movdqa  xmm8, xmm13
   1246         psrld   xmm13, 8
   1247         pslld   xmm8, 24
   1248         pxor    xmm13, xmm8
   1249         movdqa  xmm8, xmm14
   1250         psrld   xmm14, 8
   1251         pslld   xmm8, 24
   1252         pxor    xmm14, xmm8
   1253         movdqa  xmm8, xmm15
   1254         psrld   xmm15, 8
   1255         pslld   xmm8, 24
   1256         pxor    xmm15, xmm8
   1257         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1258         paddd   xmm8, xmm12
   1259         paddd   xmm9, xmm13
   1260         paddd   xmm10, xmm14
   1261         paddd   xmm11, xmm15
   1262         pxor    xmm4, xmm8
   1263         pxor    xmm5, xmm9
   1264         pxor    xmm6, xmm10
   1265         pxor    xmm7, xmm11
   1266         movdqa  xmmword ptr [rsp+0x100], xmm8
   1267         movdqa  xmm8, xmm4
   1268         psrld   xmm8, 7
   1269         pslld   xmm4, 25
   1270         por     xmm4, xmm8
   1271         movdqa  xmm8, xmm5
   1272         psrld   xmm8, 7
   1273         pslld   xmm5, 25
   1274         por     xmm5, xmm8
   1275         movdqa  xmm8, xmm6
   1276         psrld   xmm8, 7
   1277         pslld   xmm6, 25
   1278         por     xmm6, xmm8
   1279         movdqa  xmm8, xmm7
   1280         psrld   xmm8, 7
   1281         pslld   xmm7, 25
   1282         por     xmm7, xmm8
   1283         paddd   xmm0, xmmword ptr [rsp+0xD0]
   1284         paddd   xmm1, xmmword ptr [rsp]
   1285         paddd   xmm2, xmmword ptr [rsp+0x20]
   1286         paddd   xmm3, xmmword ptr [rsp+0x40]
   1287         paddd   xmm0, xmm5
   1288         paddd   xmm1, xmm6
   1289         paddd   xmm2, xmm7
   1290         paddd   xmm3, xmm4
   1291         pxor    xmm15, xmm0
   1292         pxor    xmm12, xmm1
   1293         pxor    xmm13, xmm2
   1294         pxor    xmm14, xmm3
   1295         pshuflw xmm15, xmm15, 0xB1
   1296         pshufhw xmm15, xmm15, 0xB1
   1297         pshuflw xmm12, xmm12, 0xB1
   1298         pshufhw xmm12, xmm12, 0xB1
   1299         pshuflw xmm13, xmm13, 0xB1
   1300         pshufhw xmm13, xmm13, 0xB1
   1301         pshuflw xmm14, xmm14, 0xB1
   1302         pshufhw xmm14, xmm14, 0xB1
   1303         paddd   xmm10, xmm15
   1304         paddd   xmm11, xmm12
   1305         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1306         paddd   xmm8, xmm13
   1307         paddd   xmm9, xmm14
   1308         pxor    xmm5, xmm10
   1309         pxor    xmm6, xmm11
   1310         pxor    xmm7, xmm8
   1311         pxor    xmm4, xmm9
   1312         movdqa  xmmword ptr [rsp+0x100], xmm8
   1313         movdqa  xmm8, xmm5
   1314         psrld   xmm8, 12
   1315         pslld   xmm5, 20
   1316         por     xmm5, xmm8
   1317         movdqa  xmm8, xmm6
   1318         psrld   xmm8, 12
   1319         pslld   xmm6, 20
   1320         por     xmm6, xmm8
   1321         movdqa  xmm8, xmm7
   1322         psrld   xmm8, 12
   1323         pslld   xmm7, 20
   1324         por     xmm7, xmm8
   1325         movdqa  xmm8, xmm4
   1326         psrld   xmm8, 12
   1327         pslld   xmm4, 20
   1328         por     xmm4, xmm8
   1329         paddd   xmm0, xmmword ptr [rsp+0x30]
   1330         paddd   xmm1, xmmword ptr [rsp+0xA0]
   1331         paddd   xmm2, xmmword ptr [rsp+0x60]
   1332         paddd   xmm3, xmmword ptr [rsp+0x70]
   1333         paddd   xmm0, xmm5
   1334         paddd   xmm1, xmm6
   1335         paddd   xmm2, xmm7
   1336         paddd   xmm3, xmm4
   1337         pxor    xmm15, xmm0
   1338         pxor    xmm12, xmm1
   1339         pxor    xmm13, xmm2
   1340         pxor    xmm14, xmm3
   1341         movdqa  xmm8, xmm15
   1342         psrld   xmm15, 8
   1343         pslld   xmm8, 24
   1344         pxor    xmm15, xmm8
   1345         movdqa  xmm8, xmm12
   1346         psrld   xmm12, 8
   1347         pslld   xmm8, 24
   1348         pxor    xmm12, xmm8
   1349         movdqa  xmm8, xmm13
   1350         psrld   xmm13, 8
   1351         pslld   xmm8, 24
   1352         pxor    xmm13, xmm8
   1353         movdqa  xmm8, xmm14
   1354         psrld   xmm14, 8
   1355         pslld   xmm8, 24
   1356         pxor    xmm14, xmm8
   1357         paddd   xmm10, xmm15
   1358         paddd   xmm11, xmm12
   1359         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1360         paddd   xmm8, xmm13
   1361         paddd   xmm9, xmm14
   1362         pxor    xmm5, xmm10
   1363         pxor    xmm6, xmm11
   1364         pxor    xmm7, xmm8
   1365         pxor    xmm4, xmm9
   1366         movdqa  xmmword ptr [rsp+0x100], xmm8
   1367         movdqa  xmm8, xmm5
   1368         psrld   xmm8, 7
   1369         pslld   xmm5, 25
   1370         por     xmm5, xmm8
   1371         movdqa  xmm8, xmm6
   1372         psrld   xmm8, 7
   1373         pslld   xmm6, 25
   1374         por     xmm6, xmm8
   1375         movdqa  xmm8, xmm7
   1376         psrld   xmm8, 7
   1377         pslld   xmm7, 25
   1378         por     xmm7, xmm8
   1379         movdqa  xmm8, xmm4
   1380         psrld   xmm8, 7
   1381         pslld   xmm4, 25
   1382         por     xmm4, xmm8
   1383         paddd   xmm0, xmmword ptr [rsp+0xB0]
   1384         paddd   xmm1, xmmword ptr [rsp+0x50]
   1385         paddd   xmm2, xmmword ptr [rsp+0x10]
   1386         paddd   xmm3, xmmword ptr [rsp+0x80]
   1387         paddd   xmm0, xmm4
   1388         paddd   xmm1, xmm5
   1389         paddd   xmm2, xmm6
   1390         paddd   xmm3, xmm7
   1391         pxor    xmm12, xmm0
   1392         pxor    xmm13, xmm1
   1393         pxor    xmm14, xmm2
   1394         pxor    xmm15, xmm3
   1395         pshuflw xmm12, xmm12, 0xB1
   1396         pshufhw xmm12, xmm12, 0xB1
   1397         pshuflw xmm13, xmm13, 0xB1
   1398         pshufhw xmm13, xmm13, 0xB1
   1399         pshuflw xmm14, xmm14, 0xB1
   1400         pshufhw xmm14, xmm14, 0xB1
   1401         pshuflw xmm15, xmm15, 0xB1
   1402         pshufhw xmm15, xmm15, 0xB1
   1403         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1404         paddd   xmm8, xmm12
   1405         paddd   xmm9, xmm13
   1406         paddd   xmm10, xmm14
   1407         paddd   xmm11, xmm15
   1408         pxor    xmm4, xmm8
   1409         pxor    xmm5, xmm9
   1410         pxor    xmm6, xmm10
   1411         pxor    xmm7, xmm11
   1412         movdqa  xmmword ptr [rsp+0x100], xmm8
   1413         movdqa  xmm8, xmm4
   1414         psrld   xmm8, 12
   1415         pslld   xmm4, 20
   1416         por     xmm4, xmm8
   1417         movdqa  xmm8, xmm5
   1418         psrld   xmm8, 12
   1419         pslld   xmm5, 20
   1420         por     xmm5, xmm8
   1421         movdqa  xmm8, xmm6
   1422         psrld   xmm8, 12
   1423         pslld   xmm6, 20
   1424         por     xmm6, xmm8
   1425         movdqa  xmm8, xmm7
   1426         psrld   xmm8, 12
   1427         pslld   xmm7, 20
   1428         por     xmm7, xmm8
   1429         paddd   xmm0, xmmword ptr [rsp+0xF0]
   1430         paddd   xmm1, xmmword ptr [rsp]
   1431         paddd   xmm2, xmmword ptr [rsp+0x90]
   1432         paddd   xmm3, xmmword ptr [rsp+0x60]
   1433         paddd   xmm0, xmm4
   1434         paddd   xmm1, xmm5
   1435         paddd   xmm2, xmm6
   1436         paddd   xmm3, xmm7
   1437         pxor    xmm12, xmm0
   1438         pxor    xmm13, xmm1
   1439         pxor    xmm14, xmm2
   1440         pxor    xmm15, xmm3
   1441         movdqa  xmm8, xmm12
   1442         psrld   xmm12, 8
   1443         pslld   xmm8, 24
   1444         pxor    xmm12, xmm8
   1445         movdqa  xmm8, xmm13
   1446         psrld   xmm13, 8
   1447         pslld   xmm8, 24
   1448         pxor    xmm13, xmm8
   1449         movdqa  xmm8, xmm14
   1450         psrld   xmm14, 8
   1451         pslld   xmm8, 24
   1452         pxor    xmm14, xmm8
   1453         movdqa  xmm8, xmm15
   1454         psrld   xmm15, 8
   1455         pslld   xmm8, 24
   1456         pxor    xmm15, xmm8
   1457         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1458         paddd   xmm8, xmm12
   1459         paddd   xmm9, xmm13
   1460         paddd   xmm10, xmm14
   1461         paddd   xmm11, xmm15
   1462         pxor    xmm4, xmm8
   1463         pxor    xmm5, xmm9
   1464         pxor    xmm6, xmm10
   1465         pxor    xmm7, xmm11
   1466         movdqa  xmmword ptr [rsp+0x100], xmm8
   1467         movdqa  xmm8, xmm4
   1468         psrld   xmm8, 7
   1469         pslld   xmm4, 25
   1470         por     xmm4, xmm8
   1471         movdqa  xmm8, xmm5
   1472         psrld   xmm8, 7
   1473         pslld   xmm5, 25
   1474         por     xmm5, xmm8
   1475         movdqa  xmm8, xmm6
   1476         psrld   xmm8, 7
   1477         pslld   xmm6, 25
   1478         por     xmm6, xmm8
   1479         movdqa  xmm8, xmm7
   1480         psrld   xmm8, 7
   1481         pslld   xmm7, 25
   1482         por     xmm7, xmm8
   1483         paddd   xmm0, xmmword ptr [rsp+0xE0]
   1484         paddd   xmm1, xmmword ptr [rsp+0x20]
   1485         paddd   xmm2, xmmword ptr [rsp+0x30]
   1486         paddd   xmm3, xmmword ptr [rsp+0x70]
   1487         paddd   xmm0, xmm5
   1488         paddd   xmm1, xmm6
   1489         paddd   xmm2, xmm7
   1490         paddd   xmm3, xmm4
   1491         pxor    xmm15, xmm0
   1492         pxor    xmm12, xmm1
   1493         pxor    xmm13, xmm2
   1494         pxor    xmm14, xmm3
   1495         pshuflw xmm15, xmm15, 0xB1
   1496         pshufhw xmm15, xmm15, 0xB1
   1497         pshuflw xmm12, xmm12, 0xB1
   1498         pshufhw xmm12, xmm12, 0xB1
   1499         pshuflw xmm13, xmm13, 0xB1
   1500         pshufhw xmm13, xmm13, 0xB1
   1501         pshuflw xmm14, xmm14, 0xB1
   1502         pshufhw xmm14, xmm14, 0xB1
   1503         paddd   xmm10, xmm15
   1504         paddd   xmm11, xmm12
   1505         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1506         paddd   xmm8, xmm13
   1507         paddd   xmm9, xmm14
   1508         pxor    xmm5, xmm10
   1509         pxor    xmm6, xmm11
   1510         pxor    xmm7, xmm8
   1511         pxor    xmm4, xmm9
   1512         movdqa  xmmword ptr [rsp+0x100], xmm8
   1513         movdqa  xmm8, xmm5
   1514         psrld   xmm8, 12
   1515         pslld   xmm5, 20
   1516         por     xmm5, xmm8
   1517         movdqa  xmm8, xmm6
   1518         psrld   xmm8, 12
   1519         pslld   xmm6, 20
   1520         por     xmm6, xmm8
   1521         movdqa  xmm8, xmm7
   1522         psrld   xmm8, 12
   1523         pslld   xmm7, 20
   1524         por     xmm7, xmm8
   1525         movdqa  xmm8, xmm4
   1526         psrld   xmm8, 12
   1527         pslld   xmm4, 20
   1528         por     xmm4, xmm8
   1529         paddd   xmm0, xmmword ptr [rsp+0xA0]
   1530         paddd   xmm1, xmmword ptr [rsp+0xC0]
   1531         paddd   xmm2, xmmword ptr [rsp+0x40]
   1532         paddd   xmm3, xmmword ptr [rsp+0xD0]
   1533         paddd   xmm0, xmm5
   1534         paddd   xmm1, xmm6
   1535         paddd   xmm2, xmm7
   1536         paddd   xmm3, xmm4
   1537         pxor    xmm15, xmm0
   1538         pxor    xmm12, xmm1
   1539         pxor    xmm13, xmm2
   1540         pxor    xmm14, xmm3
   1541         movdqa  xmm8, xmm15
   1542         psrld   xmm15, 8
   1543         pslld   xmm8, 24
   1544         pxor    xmm15, xmm8
   1545         movdqa  xmm8, xmm12
   1546         psrld   xmm12, 8
   1547         pslld   xmm8, 24
   1548         pxor    xmm12, xmm8
   1549         movdqa  xmm8, xmm13
   1550         psrld   xmm13, 8
   1551         pslld   xmm8, 24
   1552         pxor    xmm13, xmm8
   1553         movdqa  xmm8, xmm14
   1554         psrld   xmm14, 8
   1555         pslld   xmm8, 24
   1556         pxor    xmm14, xmm8
   1557         paddd   xmm10, xmm15
   1558         paddd   xmm11, xmm12
   1559         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1560         paddd   xmm8, xmm13
   1561         paddd   xmm9, xmm14
   1562         pxor    xmm5, xmm10
   1563         pxor    xmm6, xmm11
   1564         pxor    xmm7, xmm8
   1565         pxor    xmm4, xmm9
   1566         pxor    xmm0, xmm8
   1567         pxor    xmm1, xmm9
   1568         pxor    xmm2, xmm10
   1569         pxor    xmm3, xmm11
   1570         movdqa  xmm8, xmm5
   1571         psrld   xmm8, 7
   1572         pslld   xmm5, 25
   1573         por     xmm5, xmm8
   1574         movdqa  xmm8, xmm6
   1575         psrld   xmm8, 7
   1576         pslld   xmm6, 25
   1577         por     xmm6, xmm8
   1578         movdqa  xmm8, xmm7
   1579         psrld   xmm8, 7
   1580         pslld   xmm7, 25
   1581         por     xmm7, xmm8
   1582         movdqa  xmm8, xmm4
   1583         psrld   xmm8, 7
   1584         pslld   xmm4, 25
   1585         por     xmm4, xmm8
   1586         pxor    xmm4, xmm12
   1587         pxor    xmm5, xmm13
   1588         pxor    xmm6, xmm14
   1589         pxor    xmm7, xmm15
   1590         mov     eax, r13d
   1591         jne     9b
   1592         movdqa  xmm9, xmm0
   1593         punpckldq xmm0, xmm1
   1594         punpckhdq xmm9, xmm1
   1595         movdqa  xmm11, xmm2
   1596         punpckldq xmm2, xmm3
   1597         punpckhdq xmm11, xmm3
   1598         movdqa  xmm1, xmm0
   1599         punpcklqdq xmm0, xmm2
   1600         punpckhqdq xmm1, xmm2
   1601         movdqa  xmm3, xmm9
   1602         punpcklqdq xmm9, xmm11
   1603         punpckhqdq xmm3, xmm11
   1604         movdqu  xmmword ptr [rbx], xmm0
   1605         movdqu  xmmword ptr [rbx+0x20], xmm1
   1606         movdqu  xmmword ptr [rbx+0x40], xmm9
   1607         movdqu  xmmword ptr [rbx+0x60], xmm3
   1608         movdqa  xmm9, xmm4
   1609         punpckldq xmm4, xmm5
   1610         punpckhdq xmm9, xmm5
   1611         movdqa  xmm11, xmm6
   1612         punpckldq xmm6, xmm7
   1613         punpckhdq xmm11, xmm7
   1614         movdqa  xmm5, xmm4
   1615         punpcklqdq xmm4, xmm6
   1616         punpckhqdq xmm5, xmm6
   1617         movdqa  xmm7, xmm9
   1618         punpcklqdq xmm9, xmm11
   1619         punpckhqdq xmm7, xmm11
   1620         movdqu  xmmword ptr [rbx+0x10], xmm4
   1621         movdqu  xmmword ptr [rbx+0x30], xmm5
   1622         movdqu  xmmword ptr [rbx+0x50], xmm9
   1623         movdqu  xmmword ptr [rbx+0x70], xmm7
   1624         movdqa  xmm1, xmmword ptr [rsp+0x110]
   1625         movdqa  xmm0, xmm1
   1626         paddd   xmm1, xmmword ptr [rsp+0x150]
   1627         movdqa  xmmword ptr [rsp+0x110], xmm1
   1628         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
   1629         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
   1630         pcmpgtd xmm0, xmm1
   1631         movdqa  xmm1, xmmword ptr [rsp+0x120]
   1632         psubd   xmm1, xmm0
   1633         movdqa  xmmword ptr [rsp+0x120], xmm1
   1634         add     rbx, 128
   1635         add     rdi, 32
   1636         sub     rsi, 4
   1637         cmp     rsi, 4
   1638         jnc     2b
   1639         test    rsi, rsi
   1640         jnz     3f
   1641 4:
   1642         mov     rsp, rbp
   1643         pop     rbp
   1644         pop     rbx
   1645         pop     r12
   1646         pop     r13
   1647         pop     r14
   1648         pop     r15
   1649         ret
   1650 .p2align 5
   1651 3:
   1652         test    esi, 0x2
   1653         je      3f
   1654         movups  xmm0, xmmword ptr [rcx]
   1655         movups  xmm1, xmmword ptr [rcx+0x10]
   1656         movaps  xmm8, xmm0
   1657         movaps  xmm9, xmm1
   1658         movd    xmm13, dword ptr [rsp+0x110]
   1659         movd    xmm14, dword ptr [rsp+0x120]
   1660         punpckldq xmm13, xmm14
   1661         movaps  xmmword ptr [rsp], xmm13
   1662         movd    xmm14, dword ptr [rsp+0x114]
   1663         movd    xmm13, dword ptr [rsp+0x124]
   1664         punpckldq xmm14, xmm13
   1665         movaps  xmmword ptr [rsp+0x10], xmm14
   1666         mov     r8, qword ptr [rdi]
   1667         mov     r9, qword ptr [rdi+0x8]
   1668         movzx   eax, byte ptr [rbp+0x40]
   1669         or      eax, r13d
   1670         xor     edx, edx
   1671 2:
   1672         mov     r14d, eax
   1673         or      eax, r12d
   1674         add     rdx, 64
   1675         cmp     rdx, r15
   1676         cmovne  eax, r14d
   1677         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
   1678         movaps  xmm10, xmm2
   1679         movups  xmm4, xmmword ptr [r8+rdx-0x40]
   1680         movups  xmm5, xmmword ptr [r8+rdx-0x30]
   1681         movaps  xmm3, xmm4
   1682         shufps  xmm4, xmm5, 136
   1683         shufps  xmm3, xmm5, 221
   1684         movaps  xmm5, xmm3
   1685         movups  xmm6, xmmword ptr [r8+rdx-0x20]
   1686         movups  xmm7, xmmword ptr [r8+rdx-0x10]
   1687         movaps  xmm3, xmm6
   1688         shufps  xmm6, xmm7, 136
   1689         pshufd  xmm6, xmm6, 0x93
   1690         shufps  xmm3, xmm7, 221
   1691         pshufd  xmm7, xmm3, 0x93
   1692         movups  xmm12, xmmword ptr [r9+rdx-0x40]
   1693         movups  xmm13, xmmword ptr [r9+rdx-0x30]
   1694         movaps  xmm11, xmm12
   1695         shufps  xmm12, xmm13, 136
   1696         shufps  xmm11, xmm13, 221
   1697         movaps  xmm13, xmm11
   1698         movups  xmm14, xmmword ptr [r9+rdx-0x20]
   1699         movups  xmm15, xmmword ptr [r9+rdx-0x10]
   1700         movaps  xmm11, xmm14
   1701         shufps  xmm14, xmm15, 136
   1702         pshufd  xmm14, xmm14, 0x93
   1703         shufps  xmm11, xmm15, 221
   1704         pshufd  xmm15, xmm11, 0x93
   1705         shl     rax, 0x20
   1706         or      rax, 0x40
   1707         movd    xmm3, rax
   1708         movdqa  xmmword ptr [rsp+0x20], xmm3
   1709         movaps  xmm3, xmmword ptr [rsp]
   1710         movaps  xmm11, xmmword ptr [rsp+0x10]
   1711         punpcklqdq xmm3, xmmword ptr [rsp+0x20]
   1712         punpcklqdq xmm11, xmmword ptr [rsp+0x20]
   1713         mov     al, 7
   1714 9:
   1715         paddd   xmm0, xmm4
   1716         paddd   xmm8, xmm12
   1717         movaps  xmmword ptr [rsp+0x20], xmm4
   1718         movaps  xmmword ptr [rsp+0x30], xmm12
   1719         paddd   xmm0, xmm1
   1720         paddd   xmm8, xmm9
   1721         pxor    xmm3, xmm0
   1722         pxor    xmm11, xmm8
   1723         pshuflw xmm3, xmm3, 0xB1
   1724         pshufhw xmm3, xmm3, 0xB1
   1725         pshuflw xmm11, xmm11, 0xB1
   1726         pshufhw xmm11, xmm11, 0xB1
   1727         paddd   xmm2, xmm3
   1728         paddd   xmm10, xmm11
   1729         pxor    xmm1, xmm2
   1730         pxor    xmm9, xmm10
   1731         movdqa  xmm4, xmm1
   1732         pslld   xmm1, 20
   1733         psrld   xmm4, 12
   1734         por     xmm1, xmm4
   1735         movdqa  xmm4, xmm9
   1736         pslld   xmm9, 20
   1737         psrld   xmm4, 12
   1738         por     xmm9, xmm4
   1739         paddd   xmm0, xmm5
   1740         paddd   xmm8, xmm13
   1741         movaps  xmmword ptr [rsp+0x40], xmm5
   1742         movaps  xmmword ptr [rsp+0x50], xmm13
   1743         paddd   xmm0, xmm1
   1744         paddd   xmm8, xmm9
   1745         pxor    xmm3, xmm0
   1746         pxor    xmm11, xmm8
   1747         movdqa  xmm13, xmm3
   1748         psrld   xmm3, 8
   1749         pslld   xmm13, 24
   1750         pxor    xmm3, xmm13
   1751         movdqa  xmm13, xmm11
   1752         psrld   xmm11, 8
   1753         pslld   xmm13, 24
   1754         pxor    xmm11, xmm13
   1755         paddd   xmm2, xmm3
   1756         paddd   xmm10, xmm11
   1757         pxor    xmm1, xmm2
   1758         pxor    xmm9, xmm10
   1759         movdqa  xmm4, xmm1
   1760         pslld   xmm1, 25
   1761         psrld   xmm4, 7
   1762         por     xmm1, xmm4
   1763         movdqa  xmm4, xmm9
   1764         pslld   xmm9, 25
   1765         psrld   xmm4, 7
   1766         por     xmm9, xmm4
   1767         pshufd  xmm0, xmm0, 0x93
   1768         pshufd  xmm8, xmm8, 0x93
   1769         pshufd  xmm3, xmm3, 0x4E
   1770         pshufd  xmm11, xmm11, 0x4E
   1771         pshufd  xmm2, xmm2, 0x39
   1772         pshufd  xmm10, xmm10, 0x39
   1773         paddd   xmm0, xmm6
   1774         paddd   xmm8, xmm14
   1775         paddd   xmm0, xmm1
   1776         paddd   xmm8, xmm9
   1777         pxor    xmm3, xmm0
   1778         pxor    xmm11, xmm8
   1779         pshuflw xmm3, xmm3, 0xB1
   1780         pshufhw xmm3, xmm3, 0xB1
   1781         pshuflw xmm11, xmm11, 0xB1
   1782         pshufhw xmm11, xmm11, 0xB1
   1783         paddd   xmm2, xmm3
   1784         paddd   xmm10, xmm11
   1785         pxor    xmm1, xmm2
   1786         pxor    xmm9, xmm10
   1787         movdqa  xmm4, xmm1
   1788         pslld   xmm1, 20
   1789         psrld   xmm4, 12
   1790         por     xmm1, xmm4
   1791         movdqa  xmm4, xmm9
   1792         pslld   xmm9, 20
   1793         psrld   xmm4, 12
   1794         por     xmm9, xmm4
   1795         paddd   xmm0, xmm7
   1796         paddd   xmm8, xmm15
   1797         paddd   xmm0, xmm1
   1798         paddd   xmm8, xmm9
   1799         pxor    xmm3, xmm0
   1800         pxor    xmm11, xmm8
   1801         movdqa  xmm13, xmm3
   1802         psrld   xmm3, 8
   1803         pslld   xmm13, 24
   1804         pxor    xmm3, xmm13
   1805         movdqa  xmm13, xmm11
   1806         psrld   xmm11, 8
   1807         pslld   xmm13, 24
   1808         pxor    xmm11, xmm13
   1809         paddd   xmm2, xmm3
   1810         paddd   xmm10, xmm11
   1811         pxor    xmm1, xmm2
   1812         pxor    xmm9, xmm10
   1813         movdqa  xmm4, xmm1
   1814         pslld   xmm1, 25
   1815         psrld   xmm4, 7
   1816         por     xmm1, xmm4
   1817         movdqa  xmm4, xmm9
   1818         pslld   xmm9, 25
   1819         psrld   xmm4, 7
   1820         por     xmm9, xmm4
   1821         pshufd  xmm0, xmm0, 0x39
   1822         pshufd  xmm8, xmm8, 0x39
   1823         pshufd  xmm3, xmm3, 0x4E
   1824         pshufd  xmm11, xmm11, 0x4E
   1825         pshufd  xmm2, xmm2, 0x93
   1826         pshufd  xmm10, xmm10, 0x93
   1827         dec     al
   1828         je      9f
   1829         movdqa  xmm12, xmmword ptr [rsp+0x20]
   1830         movdqa  xmm5, xmmword ptr [rsp+0x40]
   1831         pshufd  xmm13, xmm12, 0x0F
   1832         shufps  xmm12, xmm5, 214
   1833         pshufd  xmm4, xmm12, 0x39
   1834         movdqa  xmm12, xmm6
   1835         shufps  xmm12, xmm7, 250
   1836         pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
   1837         pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   1838         por     xmm13, xmm12
   1839         movdqa  xmmword ptr [rsp+0x20], xmm13
   1840         movdqa  xmm12, xmm7
   1841         punpcklqdq xmm12, xmm5
   1842         movdqa  xmm13, xmm6
   1843         pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   1844         pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   1845         por     xmm12, xmm13
   1846         pshufd  xmm12, xmm12, 0x78
   1847         punpckhdq xmm5, xmm7
   1848         punpckldq xmm6, xmm5
   1849         pshufd  xmm7, xmm6, 0x1E
   1850         movdqa  xmmword ptr [rsp+0x40], xmm12
   1851         movdqa  xmm5, xmmword ptr [rsp+0x30]
   1852         movdqa  xmm13, xmmword ptr [rsp+0x50]
   1853         pshufd  xmm6, xmm5, 0x0F
   1854         shufps  xmm5, xmm13, 214
   1855         pshufd  xmm12, xmm5, 0x39
   1856         movdqa  xmm5, xmm14
   1857         shufps  xmm5, xmm15, 250
   1858         pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
   1859         pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   1860         por     xmm6, xmm5
   1861         movdqa  xmm5, xmm15
   1862         punpcklqdq xmm5, xmm13
   1863         movdqa  xmmword ptr [rsp+0x30], xmm2
   1864         movdqa  xmm2, xmm14
   1865         pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   1866         pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   1867         por     xmm5, xmm2
   1868         movdqa  xmm2, xmmword ptr [rsp+0x30]
   1869         pshufd  xmm5, xmm5, 0x78
   1870         punpckhdq xmm13, xmm15
   1871         punpckldq xmm14, xmm13
   1872         pshufd  xmm15, xmm14, 0x1E
   1873         movdqa  xmm13, xmm6
   1874         movdqa  xmm14, xmm5
   1875         movdqa  xmm5, xmmword ptr [rsp+0x20]
   1876         movdqa  xmm6, xmmword ptr [rsp+0x40]
   1877         jmp     9b
   1878 9:
   1879         pxor    xmm0, xmm2
   1880         pxor    xmm1, xmm3
   1881         pxor    xmm8, xmm10
   1882         pxor    xmm9, xmm11
   1883         mov     eax, r13d
   1884         cmp     rdx, r15
   1885         jne     2b
   1886         movups  xmmword ptr [rbx], xmm0
   1887         movups  xmmword ptr [rbx+0x10], xmm1
   1888         movups  xmmword ptr [rbx+0x20], xmm8
   1889         movups  xmmword ptr [rbx+0x30], xmm9
   1890         mov     eax, dword ptr [rsp+0x130]
   1891         neg     eax
   1892         mov    r10d, dword ptr [rsp+0x110+8*rax]
   1893         mov    r11d, dword ptr [rsp+0x120+8*rax]
   1894         mov dword ptr [rsp+0x110], r10d
   1895         mov dword ptr [rsp+0x120], r11d
   1896         add     rdi, 16
   1897         add     rbx, 64
   1898         sub     rsi, 2
   1899 3:
   1900         test    esi, 0x1
   1901         je      4b
   1902         movups  xmm0, xmmword ptr [rcx]
   1903         movups  xmm1, xmmword ptr [rcx+0x10]
   1904         movd    xmm13, dword ptr [rsp+0x110]
   1905         movd    xmm14, dword ptr [rsp+0x120]
   1906         punpckldq xmm13, xmm14
   1907         mov     r8, qword ptr [rdi]
   1908         movzx   eax, byte ptr [rbp+0x40]
   1909         or      eax, r13d
   1910         xor     edx, edx
   1911 2:
   1912         mov     r14d, eax
   1913         or      eax, r12d
   1914         add     rdx, 64
   1915         cmp     rdx, r15
   1916         cmovne  eax, r14d
   1917         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
   1918         shl     rax, 32
   1919         or      rax, 64
   1920         movd    xmm12, rax
   1921         movdqa  xmm3, xmm13
   1922         punpcklqdq xmm3, xmm12
   1923         movups  xmm4, xmmword ptr [r8+rdx-0x40]
   1924         movups  xmm5, xmmword ptr [r8+rdx-0x30]
   1925         movaps  xmm8, xmm4
   1926         shufps  xmm4, xmm5, 136
   1927         shufps  xmm8, xmm5, 221
   1928         movaps  xmm5, xmm8
   1929         movups  xmm6, xmmword ptr [r8+rdx-0x20]
   1930         movups  xmm7, xmmword ptr [r8+rdx-0x10]
   1931         movaps  xmm8, xmm6
   1932         shufps  xmm6, xmm7, 136
   1933         pshufd  xmm6, xmm6, 0x93
   1934         shufps  xmm8, xmm7, 221
   1935         pshufd  xmm7, xmm8, 0x93
   1936         mov     al, 7
   1937 9:
   1938         paddd   xmm0, xmm4
   1939         paddd   xmm0, xmm1
   1940         pxor    xmm3, xmm0
   1941         pshuflw xmm3, xmm3, 0xB1
   1942         pshufhw xmm3, xmm3, 0xB1
   1943         paddd   xmm2, xmm3
   1944         pxor    xmm1, xmm2
   1945         movdqa  xmm11, xmm1
   1946         pslld   xmm1, 20
   1947         psrld   xmm11, 12
   1948         por     xmm1, xmm11
   1949         paddd   xmm0, xmm5
   1950         paddd   xmm0, xmm1
   1951         pxor    xmm3, xmm0
   1952         movdqa  xmm14, xmm3
   1953         psrld   xmm3, 8
   1954         pslld   xmm14, 24
   1955         pxor    xmm3, xmm14
   1956         paddd   xmm2, xmm3
   1957         pxor    xmm1, xmm2
   1958         movdqa  xmm11, xmm1
   1959         pslld   xmm1, 25
   1960         psrld   xmm11, 7
   1961         por     xmm1, xmm11
   1962         pshufd  xmm0, xmm0, 0x93
   1963         pshufd  xmm3, xmm3, 0x4E
   1964         pshufd  xmm2, xmm2, 0x39
   1965         paddd   xmm0, xmm6
   1966         paddd   xmm0, xmm1
   1967         pxor    xmm3, xmm0
   1968         pshuflw xmm3, xmm3, 0xB1
   1969         pshufhw xmm3, xmm3, 0xB1
   1970         paddd   xmm2, xmm3
   1971         pxor    xmm1, xmm2
   1972         movdqa  xmm11, xmm1
   1973         pslld   xmm1, 20
   1974         psrld   xmm11, 12
   1975         por     xmm1, xmm11
   1976         paddd   xmm0, xmm7
   1977         paddd   xmm0, xmm1
   1978         pxor    xmm3, xmm0
   1979         movdqa  xmm14, xmm3
   1980         psrld   xmm3, 8
   1981         pslld   xmm14, 24
   1982         pxor    xmm3, xmm14
   1983         paddd   xmm2, xmm3
   1984         pxor    xmm1, xmm2
   1985         movdqa  xmm11, xmm1
   1986         pslld   xmm1, 25
   1987         psrld   xmm11, 7
   1988         por     xmm1, xmm11
   1989         pshufd  xmm0, xmm0, 0x39
   1990         pshufd  xmm3, xmm3, 0x4E
   1991         pshufd  xmm2, xmm2, 0x93
   1992         dec     al
   1993         jz      9f
   1994         movdqa  xmm8, xmm4
   1995         shufps  xmm8, xmm5, 214
   1996         pshufd  xmm9, xmm4, 0x0F
   1997         pshufd  xmm4, xmm8, 0x39
   1998         movdqa  xmm8, xmm6
   1999         shufps  xmm8, xmm7, 250
   2000         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
   2001         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   2002         por     xmm9, xmm8
   2003         movdqa  xmm8, xmm7
   2004         punpcklqdq xmm8, xmm5
   2005         movdqa  xmm10, xmm6
   2006         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   2007         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   2008         por     xmm8, xmm10
   2009         pshufd  xmm8, xmm8, 0x78
   2010         punpckhdq xmm5, xmm7
   2011         punpckldq xmm6, xmm5
   2012         pshufd  xmm7, xmm6, 0x1E
   2013         movdqa  xmm5, xmm9
   2014         movdqa  xmm6, xmm8
   2015         jmp     9b
   2016 9:
   2017         pxor    xmm0, xmm2
   2018         pxor    xmm1, xmm3
   2019         mov     eax, r13d
   2020         cmp     rdx, r15
   2021         jne     2b
   2022         movups  xmmword ptr [rbx], xmm0
   2023         movups  xmmword ptr [rbx+0x10], xmm1
   2024         jmp     4b
   2025 
   2026 .p2align 6
        /*
         * blake3_compress_in_place_sse2
         * (the plain and the underscore-prefixed label name the same entry point)
         *
         * Runs the 7-round SSE2 compression loop below on one 64-byte block
         * and overwrites the 32-byte state at [rdi] with the result.
         *
         * Register use, as read from the code (the C-level parameter names
         * are assumptions - confirm against the C prototype):
         *   rdi  32-byte state; read at entry, written at exit (unaligned moves)
         *   rsi  64-byte message block (unaligned loads)
         *   dl   block length   - only the low byte of rdx is used
         *   rcx  64-bit counter
         *   r8b  flags          - only the low byte of r8 is used
         * Out:      [rdi .. rdi+31] updated in place
         * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
         * Stack:    none used
         *
         * Working layout: xmm0..xmm3 are the four rows (a, b, c, d) of the
         * 4x4 word state, xmm4..xmm7 hold the message words for the current
         * round, al is the round counter.
         */
   2027 blake3_compress_in_place_sse2:
   2028 _blake3_compress_in_place_sse2:
   2029         _CET_ENDBR
   2030         movups  xmm0, xmmword ptr [rdi]                 /* row a = state words 0-3 */
   2031         movups  xmm1, xmmword ptr [rdi+0x10]            /* row b = state words 4-7 */
   2032         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row c = IV table (aligned load) */
                /*
                 * Pack flags:length into one qword. Zero-extend from the low
                 * bytes first, exactly as blake3_compress_xof_sse2 does, so
                 * stray upper bits in rdx/r8 cannot leak into the state. The
                 * result is unchanged whenever those upper bits are zero.
                 */
   2033         movzx   eax, r8b
                movzx   edx, dl
                shl     rax, 32
   2034         add     rdx, rax                                /* rdx = (flags << 32) | length */
   2035         movq    xmm3, rcx
   2036         movq    xmm4, rdx
   2037         punpcklqdq xmm3, xmm4                           /* row d = [ctr lo, ctr hi, length, flags] */
                /* Load the 16 message words and split them by position. */
   2038         movups  xmm4, xmmword ptr [rsi]
   2039         movups  xmm5, xmmword ptr [rsi+0x10]
   2040         movaps  xmm8, xmm4
   2041         shufps  xmm4, xmm5, 136                         /* xmm4 = m0, m2, m4, m6 */
   2042         shufps  xmm8, xmm5, 221                         /* xmm8 = m1, m3, m5, m7 */
   2043         movaps  xmm5, xmm8
   2044         movups  xmm6, xmmword ptr [rsi+0x20]
   2045         movups  xmm7, xmmword ptr [rsi+0x30]
   2046         movaps  xmm8, xmm6
   2047         shufps  xmm6, xmm7, 136                         /* xmm6 = m8, m10, m12, m14 */
   2048         pshufd  xmm6, xmm6, 0x93                        /* -> m14, m8, m10, m12 */
   2049         shufps  xmm8, xmm7, 221                         /* xmm8 = m9, m11, m13, m15 */
   2050         pshufd  xmm7, xmm8, 0x93                        /* xmm7 = m15, m9, m11, m13 */
   2051         mov     al, 7                                   /* round counter */
   2052 9:
                /* First half of the round: mix the four columns. */
   2053         paddd   xmm0, xmm4
   2054         paddd   xmm0, xmm1                              /* a += m + b */
   2055         pxor    xmm3, xmm0                              /* d ^= a */
   2056         pshuflw xmm3, xmm3, 0xB1
   2057         pshufhw xmm3, xmm3, 0xB1                        /* d = rotate each dword by 16 */
   2058         paddd   xmm2, xmm3                              /* c += d */
   2059         pxor    xmm1, xmm2                              /* b ^= c */
   2060         movdqa  xmm11, xmm1
   2061         pslld   xmm1, 20
   2062         psrld   xmm11, 12
   2063         por     xmm1, xmm11                             /* b = rotate right by 12 */
   2064         paddd   xmm0, xmm5
   2065         paddd   xmm0, xmm1                              /* a += m + b */
   2066         pxor    xmm3, xmm0                              /* d ^= a */
   2067         movdqa  xmm14, xmm3
   2068         psrld   xmm3, 8
   2069         pslld   xmm14, 24
   2070         pxor    xmm3, xmm14                             /* d = rotate right by 8 */
   2071         paddd   xmm2, xmm3                              /* c += d */
   2072         pxor    xmm1, xmm2                              /* b ^= c */
   2073         movdqa  xmm11, xmm1
   2074         pslld   xmm1, 25
   2075         psrld   xmm11, 7
   2076         por     xmm1, xmm11                             /* b = rotate right by 7 */
                /* Rotate rows a, d, c across lanes so the same code mixes the diagonals. */
   2077         pshufd  xmm0, xmm0, 0x93
   2078         pshufd  xmm3, xmm3, 0x4E
   2079         pshufd  xmm2, xmm2, 0x39
                /* Second half of the round: same steps with the other two message vectors. */
   2080         paddd   xmm0, xmm6
   2081         paddd   xmm0, xmm1
   2082         pxor    xmm3, xmm0
   2083         pshuflw xmm3, xmm3, 0xB1
   2084         pshufhw xmm3, xmm3, 0xB1
   2085         paddd   xmm2, xmm3
   2086         pxor    xmm1, xmm2
   2087         movdqa  xmm11, xmm1
   2088         pslld   xmm1, 20
   2089         psrld   xmm11, 12
   2090         por     xmm1, xmm11
   2091         paddd   xmm0, xmm7
   2092         paddd   xmm0, xmm1
   2093         pxor    xmm3, xmm0
   2094         movdqa  xmm14, xmm3
   2095         psrld   xmm3, 8
   2096         pslld   xmm14, 24
   2097         pxor    xmm3, xmm14
   2098         paddd   xmm2, xmm3
   2099         pxor    xmm1, xmm2
   2100         movdqa  xmm11, xmm1
   2101         pslld   xmm1, 25
   2102         psrld   xmm11, 7
   2103         por     xmm1, xmm11
                /* Undo the lane rotation (inverse shuffles of the ones above). */
   2104         pshufd  xmm0, xmm0, 0x39
   2105         pshufd  xmm3, xmm3, 0x4E
   2106         pshufd  xmm2, xmm2, 0x93
   2107         dec     al
   2108         jz      9f                                      /* all 7 rounds done */
                /*
                 * Rearrange the message vectors xmm4..xmm7 for the next round.
                 * Each pand/pand/por triple blends two registers lane-wise
                 * using a pair of complementary PBLENDW_* masks.
                 */
   2109         movdqa  xmm8, xmm4
   2110         shufps  xmm8, xmm5, 214
   2111         pshufd  xmm9, xmm4, 0x0F
   2112         pshufd  xmm4, xmm8, 0x39
   2113         movdqa  xmm8, xmm6
   2114         shufps  xmm8, xmm7, 250
   2115         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
   2116         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   2117         por     xmm9, xmm8
   2118         movdqa  xmm8, xmm7
   2119         punpcklqdq xmm8, xmm5
   2120         movdqa  xmm10, xmm6
   2121         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   2122         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   2123         por     xmm8, xmm10
   2124         pshufd  xmm8, xmm8, 0x78
   2125         punpckhdq xmm5, xmm7
   2126         punpckldq xmm6, xmm5
   2127         pshufd  xmm7, xmm6, 0x1E
   2128         movdqa  xmm5, xmm9
   2129         movdqa  xmm6, xmm8
   2130         jmp     9b
   2131 9:
                /* Fold rows c and d into rows a and b and write the 32-byte result back. */
   2132         pxor    xmm0, xmm2
   2133         pxor    xmm1, xmm3
   2134         movups  xmmword ptr [rdi], xmm0
   2135         movups  xmmword ptr [rdi+0x10], xmm1
   2136         ret
   2137 
   2138 .p2align 6
        /*
         * blake3_compress_xof_sse2
         * (the plain and the underscore-prefixed label name the same entry point)
         *
         * Runs the 7-round SSE2 compression loop below on one 64-byte block
         * and writes a 64-byte result to [r9]. The state at [rdi] is only
         * read, never written.
         *
         * Register use, as read from the code (the C-level parameter names
         * are assumptions - confirm against the C prototype):
         *   rdi  32-byte input state (unaligned loads, read twice)
         *   rsi  64-byte message block (unaligned loads)
         *   dl   block length   - zero-extended from the low byte of rdx
         *   rcx  64-bit counter
         *   r8b  flags          - zero-extended from the low byte of r8
         *   r9   64-byte output buffer (unaligned stores)
         * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
         * Stack:    none used
         *
         * Working layout: xmm0..xmm3 are the four rows (a, b, c, d) of the
         * 4x4 word state, xmm4..xmm7 hold the message words for the current
         * round, al is the round counter.
         */
   2139 blake3_compress_xof_sse2:
   2140 _blake3_compress_xof_sse2:
   2141         _CET_ENDBR
   2142         movups  xmm0, xmmword ptr [rdi]                 /* row a = state words 0-3 */
   2143         movups  xmm1, xmmword ptr [rdi+0x10]            /* row b = state words 4-7 */
   2144         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row c = IV table (aligned load) */
   2145         movzx   eax, r8b                                /* keep only the low byte of r8 */
   2146         movzx   edx, dl                                 /* keep only the low byte of rdx */
   2147         shl     rax, 32
   2148         add     rdx, rax                                /* rdx = (flags << 32) | length */
   2149         movq    xmm3, rcx
   2150         movq    xmm4, rdx
   2151         punpcklqdq xmm3, xmm4                           /* row d = [ctr lo, ctr hi, length, flags] */
                /* Load the 16 message words and split them by position. */
   2152         movups  xmm4, xmmword ptr [rsi]
   2153         movups  xmm5, xmmword ptr [rsi+0x10]
   2154         movaps  xmm8, xmm4
   2155         shufps  xmm4, xmm5, 136                         /* xmm4 = m0, m2, m4, m6 */
   2156         shufps  xmm8, xmm5, 221                         /* xmm8 = m1, m3, m5, m7 */
   2157         movaps  xmm5, xmm8
   2158         movups  xmm6, xmmword ptr [rsi+0x20]
   2159         movups  xmm7, xmmword ptr [rsi+0x30]
   2160         movaps  xmm8, xmm6
   2161         shufps  xmm6, xmm7, 136                         /* xmm6 = m8, m10, m12, m14 */
   2162         pshufd  xmm6, xmm6, 0x93                        /* -> m14, m8, m10, m12 */
   2163         shufps  xmm8, xmm7, 221                         /* xmm8 = m9, m11, m13, m15 */
   2164         pshufd  xmm7, xmm8, 0x93                        /* xmm7 = m15, m9, m11, m13 */
   2165         mov     al, 7                                   /* round counter */
   2166 9:
                /* First half of the round: mix the four columns. */
   2167         paddd   xmm0, xmm4
   2168         paddd   xmm0, xmm1                              /* a += m + b */
   2169         pxor    xmm3, xmm0                              /* d ^= a */
   2170         pshuflw xmm3, xmm3, 0xB1
   2171         pshufhw xmm3, xmm3, 0xB1                        /* d = rotate each dword by 16 */
   2172         paddd   xmm2, xmm3                              /* c += d */
   2173         pxor    xmm1, xmm2                              /* b ^= c */
   2174         movdqa  xmm11, xmm1
   2175         pslld   xmm1, 20
   2176         psrld   xmm11, 12
   2177         por     xmm1, xmm11                             /* b = rotate right by 12 */
   2178         paddd   xmm0, xmm5
   2179         paddd   xmm0, xmm1                              /* a += m + b */
   2180         pxor    xmm3, xmm0                              /* d ^= a */
   2181         movdqa  xmm14, xmm3
   2182         psrld   xmm3, 8
   2183         pslld   xmm14, 24
   2184         pxor    xmm3, xmm14                             /* d = rotate right by 8 */
   2185         paddd   xmm2, xmm3                              /* c += d */
   2186         pxor    xmm1, xmm2                              /* b ^= c */
   2187         movdqa  xmm11, xmm1
   2188         pslld   xmm1, 25
   2189         psrld   xmm11, 7
   2190         por     xmm1, xmm11                             /* b = rotate right by 7 */
                /* Rotate rows a, d, c across lanes so the same code mixes the diagonals. */
   2191         pshufd  xmm0, xmm0, 0x93
   2192         pshufd  xmm3, xmm3, 0x4E
   2193         pshufd  xmm2, xmm2, 0x39
                /* Second half of the round: same steps with the other two message vectors. */
   2194         paddd   xmm0, xmm6
   2195         paddd   xmm0, xmm1
   2196         pxor    xmm3, xmm0
   2197         pshuflw xmm3, xmm3, 0xB1
   2198         pshufhw xmm3, xmm3, 0xB1
   2199         paddd   xmm2, xmm3
   2200         pxor    xmm1, xmm2
   2201         movdqa  xmm11, xmm1
   2202         pslld   xmm1, 20
   2203         psrld   xmm11, 12
   2204         por     xmm1, xmm11
   2205         paddd   xmm0, xmm7
   2206         paddd   xmm0, xmm1
   2207         pxor    xmm3, xmm0
   2208         movdqa  xmm14, xmm3
   2209         psrld   xmm3, 8
   2210         pslld   xmm14, 24
   2211         pxor    xmm3, xmm14
   2212         paddd   xmm2, xmm3
   2213         pxor    xmm1, xmm2
   2214         movdqa  xmm11, xmm1
   2215         pslld   xmm1, 25
   2216         psrld   xmm11, 7
   2217         por     xmm1, xmm11
                /* Undo the lane rotation (inverse shuffles of the ones above). */
   2218         pshufd  xmm0, xmm0, 0x39
   2219         pshufd  xmm3, xmm3, 0x4E
   2220         pshufd  xmm2, xmm2, 0x93
   2221         dec     al
   2222         jz      9f                                      /* all 7 rounds done */
                /*
                 * Rearrange the message vectors xmm4..xmm7 for the next round.
                 * Each pand/pand/por triple blends two registers lane-wise
                 * using a pair of complementary PBLENDW_* masks.
                 */
   2223         movdqa  xmm8, xmm4
   2224         shufps  xmm8, xmm5, 214
   2225         pshufd  xmm9, xmm4, 0x0F
   2226         pshufd  xmm4, xmm8, 0x39
   2227         movdqa  xmm8, xmm6
   2228         shufps  xmm8, xmm7, 250
   2229         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
   2230         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   2231         por     xmm9, xmm8
   2232         movdqa  xmm8, xmm7
   2233         punpcklqdq xmm8, xmm5
   2234         movdqa  xmm10, xmm6
   2235         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   2236         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   2237         por     xmm8, xmm10
   2238         pshufd  xmm8, xmm8, 0x78
   2239         punpckhdq xmm5, xmm7
   2240         punpckldq xmm6, xmm5
   2241         pshufd  xmm7, xmm6, 0x1E
   2242         movdqa  xmm5, xmm9
   2243         movdqa  xmm6, xmm8
   2244         jmp     9b
   2245 9:
                /*
                 * Output bytes 0-31:  rows a,b XOR rows c,d.
                 * Output bytes 32-63: rows c,d XOR the original input state,
                 * which is reloaded from [rdi] here.
                 */
   2246         movdqu  xmm4, xmmword ptr [rdi]
   2247         movdqu  xmm5, xmmword ptr [rdi+0x10]
   2248         pxor    xmm0, xmm2
   2249         pxor    xmm1, xmm3
   2250         pxor    xmm2, xmm4
   2251         pxor    xmm3, xmm5
   2252         movups  xmmword ptr [r9], xmm0
   2253         movups  xmmword ptr [r9+0x10], xmm1
   2254         movups  xmmword ptr [r9+0x20], xmm2
   2255         movups  xmmword ptr [r9+0x30], xmm3
   2256         ret
   2257 
   2258 
   2259 #ifdef __APPLE__
   2260 .static_data
   2261 #else
   2262 .section .rodata
   2263 #endif
        /*
         * Constant tables used by the routines in this file.
         *
         * The first table is aligned to 64 bytes and every table is exactly
         * four 32-bit words (16 bytes), so each label below stays 16-byte
         * aligned. The code depends on this: the tables are used as memory
         * operands of movaps/pand/pxor, which need 16-byte alignment.
         * Do not insert or resize entries without keeping that property.
         */
   2264 .p2align  6
        /* Four IV words; loaded as row c of the state by the compress routines. */
   2265 BLAKE3_IV:
   2266         .long  0x6A09E667, 0xBB67AE85
   2267         .long  0x3C6EF372, 0xA54FF53A
        /*
         * Lane offsets 0..3 (ADD0) and a step of 4 per lane (ADD1). At the top
         * of blake3_hash_many_sse2 both are ANDed with a broadcast mask and
         * ADD0 is added to the broadcast low counter word.
         */
   2268 ADD0:	
   2269         .long  0, 1, 2, 3
   2270 ADD1:
   2271 	.long  4, 4, 4, 4
        /*
         * Each IV word, and the value 64, replicated into all four lanes.
         * Not referenced in the visible part of the file - presumably used by
         * the 4-way loop of blake3_hash_many_sse2; verify before removing.
         */
   2272 BLAKE3_IV_0:
   2273 	.long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
   2274 BLAKE3_IV_1:
   2275 	.long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
   2276 BLAKE3_IV_2:
   2277 	.long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
   2278 BLAKE3_IV_3:
   2279 	.long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
   2280 BLAKE3_BLOCK_LEN:
   2281 	.long  64, 64, 64, 64
        /*
         * Top bit of every lane. XORed into both operands before pcmpgtd:
         * flipping the sign bit makes the signed compare order the lanes as
         * unsigned values.
         */
   2282 CMP_MSB_MASK:
   2283 	.long  0x80000000, 0x80000000, 0x80000000, 0x80000000
        /*
         * Lane-select masks used in pand/pand/por triples to merge two
         * registers. Lanes kept: 0x33 -> 0 and 2, 0xCC -> 1 and 3,
         * 0x3F -> 0 to 2, 0xC0 -> 3. The names presumably refer to the
         * pblendw immediates these masks stand in for.
         */
   2284 PBLENDW_0x33_MASK:
   2285 	.long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
   2286 PBLENDW_0xCC_MASK:
   2287 	.long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
   2288 PBLENDW_0x3F_MASK:
   2289 	.long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
   2290 PBLENDW_0xC0_MASK:
   2291 	.long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF