chibipub

experimental activitypub node in C
git clone git://jb55.com/chibipub
Log | Files | Refs | README | LICENSE

blake3_sse2_x86-64_windows_gnu.S (71203B)


      1 .intel_syntax noprefix                        # GAS Intel mode: "op dst, src", registers written without a % prefix
      2 .global blake3_hash_many_sse2                 # each entry point is exported twice: plain name ...
      3 .global _blake3_hash_many_sse2                # ... and with a leading underscore (presumably for toolchains that decorate C symbols; TODO confirm)
      4 .global blake3_compress_in_place_sse2         # label for this symbol is not within the visible part of the file
      5 .global _blake3_compress_in_place_sse2        # underscore-prefixed alias of the symbol above
      6 .global blake3_compress_xof_sse2              # label for this symbol is not within the visible part of the file
      7 .global _blake3_compress_xof_sse2             # underscore-prefixed alias of the symbol above
      8 .section .text                                # everything that follows is emitted into the code section
      9         .p2align  6                           # align the next label to 2^6 = 64 bytes
     10 _blake3_hash_many_sse2:
     11 blake3_hash_many_sse2:
     12         push    r15
     13         push    r14
     14         push    r13
     15         push    r12
     16         push    rsi
     17         push    rdi
     18         push    rbx
     19         push    rbp
     20         mov     rbp, rsp
     21         sub     rsp, 528
     22         and     rsp, 0xFFFFFFFFFFFFFFC0
     23         movdqa  xmmword ptr [rsp+0x170], xmm6
     24         movdqa  xmmword ptr [rsp+0x180], xmm7
     25         movdqa  xmmword ptr [rsp+0x190], xmm8
     26         movdqa  xmmword ptr [rsp+0x1A0], xmm9
     27         movdqa  xmmword ptr [rsp+0x1B0], xmm10
     28         movdqa  xmmword ptr [rsp+0x1C0], xmm11
     29         movdqa  xmmword ptr [rsp+0x1D0], xmm12
     30         movdqa  xmmword ptr [rsp+0x1E0], xmm13
     31         movdqa  xmmword ptr [rsp+0x1F0], xmm14
     32         movdqa  xmmword ptr [rsp+0x200], xmm15
     33         mov     rdi, rcx
     34         mov     rsi, rdx
     35         mov     rdx, r8
     36         mov     rcx, r9
     37         mov     r8, qword ptr [rbp+0x68]
     38         movzx   r9, byte ptr [rbp+0x70]
     39         neg     r9d
     40         movd    xmm0, r9d
     41         pshufd  xmm0, xmm0, 0x00
     42         movdqa  xmmword ptr [rsp+0x130], xmm0
     43         movdqa  xmm1, xmm0
     44         pand    xmm1, xmmword ptr [ADD0+rip]
     45         pand    xmm0, xmmword ptr [ADD1+rip]
     46         movdqa  xmmword ptr [rsp+0x150], xmm0
     47         movd    xmm0, r8d
     48         pshufd  xmm0, xmm0, 0x00
     49         paddd   xmm0, xmm1
     50         movdqa  xmmword ptr [rsp+0x110], xmm0
     51         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
     52         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
     53         pcmpgtd xmm1, xmm0
     54         shr     r8, 32
     55         movd    xmm2, r8d
     56         pshufd  xmm2, xmm2, 0x00
     57         psubd   xmm2, xmm1
     58         movdqa  xmmword ptr [rsp+0x120], xmm2
     59         mov     rbx, qword ptr [rbp+0x90]
     60         mov     r15, rdx
     61         shl     r15, 6
     62         movzx   r13d, byte ptr [rbp+0x78]
     63         movzx   r12d, byte ptr [rbp+0x88]
     64         cmp     rsi, 4
     65         jc      3f
     66 2:
     67         movdqu  xmm3, xmmword ptr [rcx]
     68         pshufd  xmm0, xmm3, 0x00
     69         pshufd  xmm1, xmm3, 0x55
     70         pshufd  xmm2, xmm3, 0xAA
     71         pshufd  xmm3, xmm3, 0xFF
     72         movdqu  xmm7, xmmword ptr [rcx+0x10]
     73         pshufd  xmm4, xmm7, 0x00
     74         pshufd  xmm5, xmm7, 0x55
     75         pshufd  xmm6, xmm7, 0xAA
     76         pshufd  xmm7, xmm7, 0xFF
     77         mov     r8, qword ptr [rdi]
     78         mov     r9, qword ptr [rdi+0x8]
     79         mov     r10, qword ptr [rdi+0x10]
     80         mov     r11, qword ptr [rdi+0x18]
     81         movzx   eax, byte ptr [rbp+0x80]
     82         or      eax, r13d
     83         xor     edx, edx
     84 9:
     85         mov     r14d, eax
     86         or      eax, r12d
     87         add     rdx, 64
     88         cmp     rdx, r15
     89         cmovne  eax, r14d
     90         movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
     91         movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
     92         movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
     93         movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
     94         movdqa  xmm12, xmm8
     95         punpckldq xmm8, xmm9
     96         punpckhdq xmm12, xmm9
     97         movdqa  xmm14, xmm10
     98         punpckldq xmm10, xmm11
     99         punpckhdq xmm14, xmm11
    100         movdqa  xmm9, xmm8
    101         punpcklqdq xmm8, xmm10
    102         punpckhqdq xmm9, xmm10
    103         movdqa  xmm13, xmm12
    104         punpcklqdq xmm12, xmm14
    105         punpckhqdq xmm13, xmm14
    106         movdqa  xmmword ptr [rsp], xmm8
    107         movdqa  xmmword ptr [rsp+0x10], xmm9
    108         movdqa  xmmword ptr [rsp+0x20], xmm12
    109         movdqa  xmmword ptr [rsp+0x30], xmm13
    110         movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
    111         movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
    112         movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
    113         movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
    114         movdqa  xmm12, xmm8
    115         punpckldq xmm8, xmm9
    116         punpckhdq xmm12, xmm9
    117         movdqa  xmm14, xmm10
    118         punpckldq xmm10, xmm11
    119         punpckhdq xmm14, xmm11
    120         movdqa  xmm9, xmm8
    121         punpcklqdq xmm8, xmm10
    122         punpckhqdq xmm9, xmm10
    123         movdqa  xmm13, xmm12
    124         punpcklqdq xmm12, xmm14
    125         punpckhqdq xmm13, xmm14
    126         movdqa  xmmword ptr [rsp+0x40], xmm8
    127         movdqa  xmmword ptr [rsp+0x50], xmm9
    128         movdqa  xmmword ptr [rsp+0x60], xmm12
    129         movdqa  xmmword ptr [rsp+0x70], xmm13
    130         movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
    131         movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
    132         movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
    133         movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
    134         movdqa  xmm12, xmm8
    135         punpckldq xmm8, xmm9
    136         punpckhdq xmm12, xmm9
    137         movdqa  xmm14, xmm10
    138         punpckldq xmm10, xmm11
    139         punpckhdq xmm14, xmm11
    140         movdqa  xmm9, xmm8
    141         punpcklqdq xmm8, xmm10
    142         punpckhqdq xmm9, xmm10
    143         movdqa  xmm13, xmm12
    144         punpcklqdq xmm12, xmm14
    145         punpckhqdq xmm13, xmm14
    146         movdqa  xmmword ptr [rsp+0x80], xmm8
    147         movdqa  xmmword ptr [rsp+0x90], xmm9
    148         movdqa  xmmword ptr [rsp+0xA0], xmm12
    149         movdqa  xmmword ptr [rsp+0xB0], xmm13
    150         movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
    151         movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
    152         movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
    153         movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
    154         movdqa  xmm12, xmm8
    155         punpckldq xmm8, xmm9
    156         punpckhdq xmm12, xmm9
    157         movdqa  xmm14, xmm10
    158         punpckldq xmm10, xmm11
    159         punpckhdq xmm14, xmm11
    160         movdqa  xmm9, xmm8
    161         punpcklqdq xmm8, xmm10
    162         punpckhqdq xmm9, xmm10
    163         movdqa  xmm13, xmm12
    164         punpcklqdq xmm12, xmm14
    165         punpckhqdq xmm13, xmm14
    166         movdqa  xmmword ptr [rsp+0xC0], xmm8
    167         movdqa  xmmword ptr [rsp+0xD0], xmm9
    168         movdqa  xmmword ptr [rsp+0xE0], xmm12
    169         movdqa  xmmword ptr [rsp+0xF0], xmm13
    170         movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
    171         movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
    172         movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
    173         movdqa  xmm12, xmmword ptr [rsp+0x110]
    174         movdqa  xmm13, xmmword ptr [rsp+0x120]
    175         movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
    176         movd    xmm15, eax
    177         pshufd  xmm15, xmm15, 0x00
    178         prefetcht0 [r8+rdx+0x80]
    179         prefetcht0 [r9+rdx+0x80]
    180         prefetcht0 [r10+rdx+0x80]
    181         prefetcht0 [r11+rdx+0x80]
    182         paddd   xmm0, xmmword ptr [rsp]
    183         paddd   xmm1, xmmword ptr [rsp+0x20]
    184         paddd   xmm2, xmmword ptr [rsp+0x40]
    185         paddd   xmm3, xmmword ptr [rsp+0x60]
    186         paddd   xmm0, xmm4
    187         paddd   xmm1, xmm5
    188         paddd   xmm2, xmm6
    189         paddd   xmm3, xmm7
    190         pxor    xmm12, xmm0
    191         pxor    xmm13, xmm1
    192         pxor    xmm14, xmm2
    193         pxor    xmm15, xmm3
    194         pshuflw xmm12, xmm12, 0xB1
    195         pshufhw xmm12, xmm12, 0xB1
    196         pshuflw xmm13, xmm13, 0xB1
    197         pshufhw xmm13, xmm13, 0xB1
    198         pshuflw xmm14, xmm14, 0xB1
    199         pshufhw xmm14, xmm14, 0xB1
    200         pshuflw xmm15, xmm15, 0xB1
    201         pshufhw xmm15, xmm15, 0xB1
    202         movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
    203         paddd   xmm8, xmm12
    204         paddd   xmm9, xmm13
    205         paddd   xmm10, xmm14
    206         paddd   xmm11, xmm15
    207         pxor    xmm4, xmm8
    208         pxor    xmm5, xmm9
    209         pxor    xmm6, xmm10
    210         pxor    xmm7, xmm11
    211         movdqa  xmmword ptr [rsp+0x100], xmm8
    212         movdqa  xmm8, xmm4
    213         psrld   xmm8, 12
    214         pslld   xmm4, 20
    215         por     xmm4, xmm8
    216         movdqa  xmm8, xmm5
    217         psrld   xmm8, 12
    218         pslld   xmm5, 20
    219         por     xmm5, xmm8
    220         movdqa  xmm8, xmm6
    221         psrld   xmm8, 12
    222         pslld   xmm6, 20
    223         por     xmm6, xmm8
    224         movdqa  xmm8, xmm7
    225         psrld   xmm8, 12
    226         pslld   xmm7, 20
    227         por     xmm7, xmm8
    228         paddd   xmm0, xmmword ptr [rsp+0x10]
    229         paddd   xmm1, xmmword ptr [rsp+0x30]
    230         paddd   xmm2, xmmword ptr [rsp+0x50]
    231         paddd   xmm3, xmmword ptr [rsp+0x70]
    232         paddd   xmm0, xmm4
    233         paddd   xmm1, xmm5
    234         paddd   xmm2, xmm6
    235         paddd   xmm3, xmm7
    236         pxor    xmm12, xmm0
    237         pxor    xmm13, xmm1
    238         pxor    xmm14, xmm2
    239         pxor    xmm15, xmm3
    240         movdqa  xmm8, xmm12
    241         psrld   xmm12, 8
    242         pslld   xmm8, 24
    243         pxor    xmm12, xmm8
    244         movdqa  xmm8, xmm13
    245         psrld   xmm13, 8
    246         pslld   xmm8, 24
    247         pxor    xmm13, xmm8
    248         movdqa  xmm8, xmm14
    249         psrld   xmm14, 8
    250         pslld   xmm8, 24
    251         pxor    xmm14, xmm8
    252         movdqa  xmm8, xmm15
    253         psrld   xmm15, 8
    254         pslld   xmm8, 24
    255         pxor    xmm15, xmm8
    256         movdqa  xmm8, xmmword ptr [rsp+0x100]
    257         paddd   xmm8, xmm12
    258         paddd   xmm9, xmm13
    259         paddd   xmm10, xmm14
    260         paddd   xmm11, xmm15
    261         pxor    xmm4, xmm8
    262         pxor    xmm5, xmm9
    263         pxor    xmm6, xmm10
    264         pxor    xmm7, xmm11
    265         movdqa  xmmword ptr [rsp+0x100], xmm8
    266         movdqa  xmm8, xmm4
    267         psrld   xmm8, 7
    268         pslld   xmm4, 25
    269         por     xmm4, xmm8
    270         movdqa  xmm8, xmm5
    271         psrld   xmm8, 7
    272         pslld   xmm5, 25
    273         por     xmm5, xmm8
    274         movdqa  xmm8, xmm6
    275         psrld   xmm8, 7
    276         pslld   xmm6, 25
    277         por     xmm6, xmm8
    278         movdqa  xmm8, xmm7
    279         psrld   xmm8, 7
    280         pslld   xmm7, 25
    281         por     xmm7, xmm8
    282         paddd   xmm0, xmmword ptr [rsp+0x80]
    283         paddd   xmm1, xmmword ptr [rsp+0xA0]
    284         paddd   xmm2, xmmword ptr [rsp+0xC0]
    285         paddd   xmm3, xmmword ptr [rsp+0xE0]
    286         paddd   xmm0, xmm5
    287         paddd   xmm1, xmm6
    288         paddd   xmm2, xmm7
    289         paddd   xmm3, xmm4
    290         pxor    xmm15, xmm0
    291         pxor    xmm12, xmm1
    292         pxor    xmm13, xmm2
    293         pxor    xmm14, xmm3
    294         pshuflw xmm15, xmm15, 0xB1
    295         pshufhw xmm15, xmm15, 0xB1
    296         pshuflw xmm12, xmm12, 0xB1
    297         pshufhw xmm12, xmm12, 0xB1
    298         pshuflw xmm13, xmm13, 0xB1
    299         pshufhw xmm13, xmm13, 0xB1
    300         pshuflw xmm14, xmm14, 0xB1
    301         pshufhw xmm14, xmm14, 0xB1
    302         paddd   xmm10, xmm15
    303         paddd   xmm11, xmm12
    304         movdqa  xmm8, xmmword ptr [rsp+0x100]
    305         paddd   xmm8, xmm13
    306         paddd   xmm9, xmm14
    307         pxor    xmm5, xmm10
    308         pxor    xmm6, xmm11
    309         pxor    xmm7, xmm8
    310         pxor    xmm4, xmm9
    311         movdqa  xmmword ptr [rsp+0x100], xmm8
    312         movdqa  xmm8, xmm5
    313         psrld   xmm8, 12
    314         pslld   xmm5, 20
    315         por     xmm5, xmm8
    316         movdqa  xmm8, xmm6
    317         psrld   xmm8, 12
    318         pslld   xmm6, 20
    319         por     xmm6, xmm8
    320         movdqa  xmm8, xmm7
    321         psrld   xmm8, 12
    322         pslld   xmm7, 20
    323         por     xmm7, xmm8
    324         movdqa  xmm8, xmm4
    325         psrld   xmm8, 12
    326         pslld   xmm4, 20
    327         por     xmm4, xmm8
    328         paddd   xmm0, xmmword ptr [rsp+0x90]
    329         paddd   xmm1, xmmword ptr [rsp+0xB0]
    330         paddd   xmm2, xmmword ptr [rsp+0xD0]
    331         paddd   xmm3, xmmword ptr [rsp+0xF0]
    332         paddd   xmm0, xmm5
    333         paddd   xmm1, xmm6
    334         paddd   xmm2, xmm7
    335         paddd   xmm3, xmm4
    336         pxor    xmm15, xmm0
    337         pxor    xmm12, xmm1
    338         pxor    xmm13, xmm2
    339         pxor    xmm14, xmm3
    340         movdqa  xmm8, xmm15
    341         psrld   xmm15, 8
    342         pslld   xmm8, 24
    343         pxor    xmm15, xmm8
    344         movdqa  xmm8, xmm12
    345         psrld   xmm12, 8
    346         pslld   xmm8, 24
    347         pxor    xmm12, xmm8
    348         movdqa  xmm8, xmm13
    349         psrld   xmm13, 8
    350         pslld   xmm8, 24
    351         pxor    xmm13, xmm8
    352         movdqa  xmm8, xmm14
    353         psrld   xmm14, 8
    354         pslld   xmm8, 24
    355         pxor    xmm14, xmm8
    356         paddd   xmm10, xmm15
    357         paddd   xmm11, xmm12
    358         movdqa  xmm8, xmmword ptr [rsp+0x100]
    359         paddd   xmm8, xmm13
    360         paddd   xmm9, xmm14
    361         pxor    xmm5, xmm10
    362         pxor    xmm6, xmm11
    363         pxor    xmm7, xmm8
    364         pxor    xmm4, xmm9
    365         movdqa  xmmword ptr [rsp+0x100], xmm8
    366         movdqa  xmm8, xmm5
    367         psrld   xmm8, 7
    368         pslld   xmm5, 25
    369         por     xmm5, xmm8
    370         movdqa  xmm8, xmm6
    371         psrld   xmm8, 7
    372         pslld   xmm6, 25
    373         por     xmm6, xmm8
    374         movdqa  xmm8, xmm7
    375         psrld   xmm8, 7
    376         pslld   xmm7, 25
    377         por     xmm7, xmm8
    378         movdqa  xmm8, xmm4
    379         psrld   xmm8, 7
    380         pslld   xmm4, 25
    381         por     xmm4, xmm8
    382         paddd   xmm0, xmmword ptr [rsp+0x20]
    383         paddd   xmm1, xmmword ptr [rsp+0x30]
    384         paddd   xmm2, xmmword ptr [rsp+0x70]
    385         paddd   xmm3, xmmword ptr [rsp+0x40]
    386         paddd   xmm0, xmm4
    387         paddd   xmm1, xmm5
    388         paddd   xmm2, xmm6
    389         paddd   xmm3, xmm7
    390         pxor    xmm12, xmm0
    391         pxor    xmm13, xmm1
    392         pxor    xmm14, xmm2
    393         pxor    xmm15, xmm3
    394         pshuflw xmm12, xmm12, 0xB1
    395         pshufhw xmm12, xmm12, 0xB1
    396         pshuflw xmm13, xmm13, 0xB1
    397         pshufhw xmm13, xmm13, 0xB1
    398         pshuflw xmm14, xmm14, 0xB1
    399         pshufhw xmm14, xmm14, 0xB1
    400         pshuflw xmm15, xmm15, 0xB1
    401         pshufhw xmm15, xmm15, 0xB1
    402         movdqa  xmm8, xmmword ptr [rsp+0x100]
    403         paddd   xmm8, xmm12
    404         paddd   xmm9, xmm13
    405         paddd   xmm10, xmm14
    406         paddd   xmm11, xmm15
    407         pxor    xmm4, xmm8
    408         pxor    xmm5, xmm9
    409         pxor    xmm6, xmm10
    410         pxor    xmm7, xmm11
    411         movdqa  xmmword ptr [rsp+0x100], xmm8
    412         movdqa  xmm8, xmm4
    413         psrld   xmm8, 12
    414         pslld   xmm4, 20
    415         por     xmm4, xmm8
    416         movdqa  xmm8, xmm5
    417         psrld   xmm8, 12
    418         pslld   xmm5, 20
    419         por     xmm5, xmm8
    420         movdqa  xmm8, xmm6
    421         psrld   xmm8, 12
    422         pslld   xmm6, 20
    423         por     xmm6, xmm8
    424         movdqa  xmm8, xmm7
    425         psrld   xmm8, 12
    426         pslld   xmm7, 20
    427         por     xmm7, xmm8
    428         paddd   xmm0, xmmword ptr [rsp+0x60]
    429         paddd   xmm1, xmmword ptr [rsp+0xA0]
    430         paddd   xmm2, xmmword ptr [rsp]
    431         paddd   xmm3, xmmword ptr [rsp+0xD0]
    432         paddd   xmm0, xmm4
    433         paddd   xmm1, xmm5
    434         paddd   xmm2, xmm6
    435         paddd   xmm3, xmm7
    436         pxor    xmm12, xmm0
    437         pxor    xmm13, xmm1
    438         pxor    xmm14, xmm2
    439         pxor    xmm15, xmm3
    440         movdqa  xmm8, xmm12
    441         psrld   xmm12, 8
    442         pslld   xmm8, 24
    443         pxor    xmm12, xmm8
    444         movdqa  xmm8, xmm13
    445         psrld   xmm13, 8
    446         pslld   xmm8, 24
    447         pxor    xmm13, xmm8
    448         movdqa  xmm8, xmm14
    449         psrld   xmm14, 8
    450         pslld   xmm8, 24
    451         pxor    xmm14, xmm8
    452         movdqa  xmm8, xmm15
    453         psrld   xmm15, 8
    454         pslld   xmm8, 24
    455         pxor    xmm15, xmm8
    456         movdqa  xmm8, xmmword ptr [rsp+0x100]
    457         paddd   xmm8, xmm12
    458         paddd   xmm9, xmm13
    459         paddd   xmm10, xmm14
    460         paddd   xmm11, xmm15
    461         pxor    xmm4, xmm8
    462         pxor    xmm5, xmm9
    463         pxor    xmm6, xmm10
    464         pxor    xmm7, xmm11
    465         movdqa  xmmword ptr [rsp+0x100], xmm8
    466         movdqa  xmm8, xmm4
    467         psrld   xmm8, 7
    468         pslld   xmm4, 25
    469         por     xmm4, xmm8
    470         movdqa  xmm8, xmm5
    471         psrld   xmm8, 7
    472         pslld   xmm5, 25
    473         por     xmm5, xmm8
    474         movdqa  xmm8, xmm6
    475         psrld   xmm8, 7
    476         pslld   xmm6, 25
    477         por     xmm6, xmm8
    478         movdqa  xmm8, xmm7
    479         psrld   xmm8, 7
    480         pslld   xmm7, 25
    481         por     xmm7, xmm8
    482         paddd   xmm0, xmmword ptr [rsp+0x10]
    483         paddd   xmm1, xmmword ptr [rsp+0xC0]
    484         paddd   xmm2, xmmword ptr [rsp+0x90]
    485         paddd   xmm3, xmmword ptr [rsp+0xF0]
    486         paddd   xmm0, xmm5
    487         paddd   xmm1, xmm6
    488         paddd   xmm2, xmm7
    489         paddd   xmm3, xmm4
    490         pxor    xmm15, xmm0
    491         pxor    xmm12, xmm1
    492         pxor    xmm13, xmm2
    493         pxor    xmm14, xmm3
    494         pshuflw xmm15, xmm15, 0xB1
    495         pshufhw xmm15, xmm15, 0xB1
    496         pshuflw xmm12, xmm12, 0xB1
    497         pshufhw xmm12, xmm12, 0xB1
    498         pshuflw xmm13, xmm13, 0xB1
    499         pshufhw xmm13, xmm13, 0xB1
    500         pshuflw xmm14, xmm14, 0xB1
    501         pshufhw xmm14, xmm14, 0xB1
    502         paddd   xmm10, xmm15
    503         paddd   xmm11, xmm12
    504         movdqa  xmm8, xmmword ptr [rsp+0x100]
    505         paddd   xmm8, xmm13
    506         paddd   xmm9, xmm14
    507         pxor    xmm5, xmm10
    508         pxor    xmm6, xmm11
    509         pxor    xmm7, xmm8
    510         pxor    xmm4, xmm9
    511         movdqa  xmmword ptr [rsp+0x100], xmm8
    512         movdqa  xmm8, xmm5
    513         psrld   xmm8, 12
    514         pslld   xmm5, 20
    515         por     xmm5, xmm8
    516         movdqa  xmm8, xmm6
    517         psrld   xmm8, 12
    518         pslld   xmm6, 20
    519         por     xmm6, xmm8
    520         movdqa  xmm8, xmm7
    521         psrld   xmm8, 12
    522         pslld   xmm7, 20
    523         por     xmm7, xmm8
    524         movdqa  xmm8, xmm4
    525         psrld   xmm8, 12
    526         pslld   xmm4, 20
    527         por     xmm4, xmm8
    528         paddd   xmm0, xmmword ptr [rsp+0xB0]
    529         paddd   xmm1, xmmword ptr [rsp+0x50]
    530         paddd   xmm2, xmmword ptr [rsp+0xE0]
    531         paddd   xmm3, xmmword ptr [rsp+0x80]
    532         paddd   xmm0, xmm5
    533         paddd   xmm1, xmm6
    534         paddd   xmm2, xmm7
    535         paddd   xmm3, xmm4
    536         pxor    xmm15, xmm0
    537         pxor    xmm12, xmm1
    538         pxor    xmm13, xmm2
    539         pxor    xmm14, xmm3
    540         movdqa  xmm8, xmm15
    541         psrld   xmm15, 8
    542         pslld   xmm8, 24
    543         pxor    xmm15, xmm8
    544         movdqa  xmm8, xmm12
    545         psrld   xmm12, 8
    546         pslld   xmm8, 24
    547         pxor    xmm12, xmm8
    548         movdqa  xmm8, xmm13
    549         psrld   xmm13, 8
    550         pslld   xmm8, 24
    551         pxor    xmm13, xmm8
    552         movdqa  xmm8, xmm14
    553         psrld   xmm14, 8
    554         pslld   xmm8, 24
    555         pxor    xmm14, xmm8
    556         paddd   xmm10, xmm15
    557         paddd   xmm11, xmm12
    558         movdqa  xmm8, xmmword ptr [rsp+0x100]
    559         paddd   xmm8, xmm13
    560         paddd   xmm9, xmm14
    561         pxor    xmm5, xmm10
    562         pxor    xmm6, xmm11
    563         pxor    xmm7, xmm8
    564         pxor    xmm4, xmm9
    565         movdqa  xmmword ptr [rsp+0x100], xmm8
    566         movdqa  xmm8, xmm5
    567         psrld   xmm8, 7
    568         pslld   xmm5, 25
    569         por     xmm5, xmm8
    570         movdqa  xmm8, xmm6
    571         psrld   xmm8, 7
    572         pslld   xmm6, 25
    573         por     xmm6, xmm8
    574         movdqa  xmm8, xmm7
    575         psrld   xmm8, 7
    576         pslld   xmm7, 25
    577         por     xmm7, xmm8
    578         movdqa  xmm8, xmm4
    579         psrld   xmm8, 7
    580         pslld   xmm4, 25
    581         por     xmm4, xmm8
    582         paddd   xmm0, xmmword ptr [rsp+0x30]
    583         paddd   xmm1, xmmword ptr [rsp+0xA0]
    584         paddd   xmm2, xmmword ptr [rsp+0xD0]
    585         paddd   xmm3, xmmword ptr [rsp+0x70]
    586         paddd   xmm0, xmm4
    587         paddd   xmm1, xmm5
    588         paddd   xmm2, xmm6
    589         paddd   xmm3, xmm7
    590         pxor    xmm12, xmm0
    591         pxor    xmm13, xmm1
    592         pxor    xmm14, xmm2
    593         pxor    xmm15, xmm3
    594         pshuflw xmm12, xmm12, 0xB1
    595         pshufhw xmm12, xmm12, 0xB1
    596         pshuflw xmm13, xmm13, 0xB1
    597         pshufhw xmm13, xmm13, 0xB1
    598         pshuflw xmm14, xmm14, 0xB1
    599         pshufhw xmm14, xmm14, 0xB1
    600         pshuflw xmm15, xmm15, 0xB1
    601         pshufhw xmm15, xmm15, 0xB1
    602         movdqa  xmm8, xmmword ptr [rsp+0x100]
    603         paddd   xmm8, xmm12
    604         paddd   xmm9, xmm13
    605         paddd   xmm10, xmm14
    606         paddd   xmm11, xmm15
    607         pxor    xmm4, xmm8
    608         pxor    xmm5, xmm9
    609         pxor    xmm6, xmm10
    610         pxor    xmm7, xmm11
    611         movdqa  xmmword ptr [rsp+0x100], xmm8
    612         movdqa  xmm8, xmm4
    613         psrld   xmm8, 12
    614         pslld   xmm4, 20
    615         por     xmm4, xmm8
    616         movdqa  xmm8, xmm5
    617         psrld   xmm8, 12
    618         pslld   xmm5, 20
    619         por     xmm5, xmm8
    620         movdqa  xmm8, xmm6
    621         psrld   xmm8, 12
    622         pslld   xmm6, 20
    623         por     xmm6, xmm8
    624         movdqa  xmm8, xmm7
    625         psrld   xmm8, 12
    626         pslld   xmm7, 20
    627         por     xmm7, xmm8
    628         paddd   xmm0, xmmword ptr [rsp+0x40]
    629         paddd   xmm1, xmmword ptr [rsp+0xC0]
    630         paddd   xmm2, xmmword ptr [rsp+0x20]
    631         paddd   xmm3, xmmword ptr [rsp+0xE0]
    632         paddd   xmm0, xmm4
    633         paddd   xmm1, xmm5
    634         paddd   xmm2, xmm6
    635         paddd   xmm3, xmm7
    636         pxor    xmm12, xmm0
    637         pxor    xmm13, xmm1
    638         pxor    xmm14, xmm2
    639         pxor    xmm15, xmm3
    640         movdqa  xmm8, xmm12
    641         psrld   xmm12, 8
    642         pslld   xmm8, 24
    643         pxor    xmm12, xmm8
    644         movdqa  xmm8, xmm13
    645         psrld   xmm13, 8
    646         pslld   xmm8, 24
    647         pxor    xmm13, xmm8
    648         movdqa  xmm8, xmm14
    649         psrld   xmm14, 8
    650         pslld   xmm8, 24
    651         pxor    xmm14, xmm8
    652         movdqa  xmm8, xmm15
    653         psrld   xmm15, 8
    654         pslld   xmm8, 24
    655         pxor    xmm15, xmm8
    656         movdqa  xmm8, xmmword ptr [rsp+0x100]
    657         paddd   xmm8, xmm12
    658         paddd   xmm9, xmm13
    659         paddd   xmm10, xmm14
    660         paddd   xmm11, xmm15
    661         pxor    xmm4, xmm8
    662         pxor    xmm5, xmm9
    663         pxor    xmm6, xmm10
    664         pxor    xmm7, xmm11
    665         movdqa  xmmword ptr [rsp+0x100], xmm8
    666         movdqa  xmm8, xmm4
    667         psrld   xmm8, 7
    668         pslld   xmm4, 25
    669         por     xmm4, xmm8
    670         movdqa  xmm8, xmm5
    671         psrld   xmm8, 7
    672         pslld   xmm5, 25
    673         por     xmm5, xmm8
    674         movdqa  xmm8, xmm6
    675         psrld   xmm8, 7
    676         pslld   xmm6, 25
    677         por     xmm6, xmm8
    678         movdqa  xmm8, xmm7
    679         psrld   xmm8, 7
    680         pslld   xmm7, 25
    681         por     xmm7, xmm8
    682         paddd   xmm0, xmmword ptr [rsp+0x60]
    683         paddd   xmm1, xmmword ptr [rsp+0x90]
    684         paddd   xmm2, xmmword ptr [rsp+0xB0]
    685         paddd   xmm3, xmmword ptr [rsp+0x80]
    686         paddd   xmm0, xmm5
    687         paddd   xmm1, xmm6
    688         paddd   xmm2, xmm7
    689         paddd   xmm3, xmm4
    690         pxor    xmm15, xmm0
    691         pxor    xmm12, xmm1
    692         pxor    xmm13, xmm2
    693         pxor    xmm14, xmm3
    694         pshuflw xmm15, xmm15, 0xB1
    695         pshufhw xmm15, xmm15, 0xB1
    696         pshuflw xmm12, xmm12, 0xB1
    697         pshufhw xmm12, xmm12, 0xB1
    698         pshuflw xmm13, xmm13, 0xB1
    699         pshufhw xmm13, xmm13, 0xB1
    700         pshuflw xmm14, xmm14, 0xB1
    701         pshufhw xmm14, xmm14, 0xB1
    702         paddd   xmm10, xmm15
    703         paddd   xmm11, xmm12
    704         movdqa  xmm8, xmmword ptr [rsp+0x100]
    705         paddd   xmm8, xmm13
    706         paddd   xmm9, xmm14
    707         pxor    xmm5, xmm10
    708         pxor    xmm6, xmm11
    709         pxor    xmm7, xmm8
    710         pxor    xmm4, xmm9
    711         movdqa  xmmword ptr [rsp+0x100], xmm8
    712         movdqa  xmm8, xmm5
    713         psrld   xmm8, 12
    714         pslld   xmm5, 20
    715         por     xmm5, xmm8
    716         movdqa  xmm8, xmm6
    717         psrld   xmm8, 12
    718         pslld   xmm6, 20
    719         por     xmm6, xmm8
    720         movdqa  xmm8, xmm7
    721         psrld   xmm8, 12
    722         pslld   xmm7, 20
    723         por     xmm7, xmm8
    724         movdqa  xmm8, xmm4
    725         psrld   xmm8, 12
    726         pslld   xmm4, 20
    727         por     xmm4, xmm8
    728         paddd   xmm0, xmmword ptr [rsp+0x50]
    729         paddd   xmm1, xmmword ptr [rsp]
    730         paddd   xmm2, xmmword ptr [rsp+0xF0]
    731         paddd   xmm3, xmmword ptr [rsp+0x10]
    732         paddd   xmm0, xmm5
    733         paddd   xmm1, xmm6
    734         paddd   xmm2, xmm7
    735         paddd   xmm3, xmm4
    736         pxor    xmm15, xmm0
    737         pxor    xmm12, xmm1
    738         pxor    xmm13, xmm2
    739         pxor    xmm14, xmm3
    740         movdqa  xmm8, xmm15
    741         psrld   xmm15, 8
    742         pslld   xmm8, 24
    743         pxor    xmm15, xmm8
    744         movdqa  xmm8, xmm12
    745         psrld   xmm12, 8
    746         pslld   xmm8, 24
    747         pxor    xmm12, xmm8
    748         movdqa  xmm8, xmm13
    749         psrld   xmm13, 8
    750         pslld   xmm8, 24
    751         pxor    xmm13, xmm8
    752         movdqa  xmm8, xmm14
    753         psrld   xmm14, 8
    754         pslld   xmm8, 24
    755         pxor    xmm14, xmm8
    756         paddd   xmm10, xmm15
    757         paddd   xmm11, xmm12
    758         movdqa  xmm8, xmmword ptr [rsp+0x100]
    759         paddd   xmm8, xmm13
    760         paddd   xmm9, xmm14
    761         pxor    xmm5, xmm10
    762         pxor    xmm6, xmm11
    763         pxor    xmm7, xmm8
    764         pxor    xmm4, xmm9
    765         movdqa  xmmword ptr [rsp+0x100], xmm8
    766         movdqa  xmm8, xmm5
    767         psrld   xmm8, 7
    768         pslld   xmm5, 25
    769         por     xmm5, xmm8
    770         movdqa  xmm8, xmm6
    771         psrld   xmm8, 7
    772         pslld   xmm6, 25
    773         por     xmm6, xmm8
    774         movdqa  xmm8, xmm7
    775         psrld   xmm8, 7
    776         pslld   xmm7, 25
    777         por     xmm7, xmm8
    778         movdqa  xmm8, xmm4
    779         psrld   xmm8, 7
    780         pslld   xmm4, 25
    781         por     xmm4, xmm8
    782         paddd   xmm0, xmmword ptr [rsp+0xA0]
    783         paddd   xmm1, xmmword ptr [rsp+0xC0]
    784         paddd   xmm2, xmmword ptr [rsp+0xE0]
    785         paddd   xmm3, xmmword ptr [rsp+0xD0]
    786         paddd   xmm0, xmm4
    787         paddd   xmm1, xmm5
    788         paddd   xmm2, xmm6
    789         paddd   xmm3, xmm7
    790         pxor    xmm12, xmm0
    791         pxor    xmm13, xmm1
    792         pxor    xmm14, xmm2
    793         pxor    xmm15, xmm3
    794         pshuflw xmm12, xmm12, 0xB1
    795         pshufhw xmm12, xmm12, 0xB1
    796         pshuflw xmm13, xmm13, 0xB1
    797         pshufhw xmm13, xmm13, 0xB1
    798         pshuflw xmm14, xmm14, 0xB1
    799         pshufhw xmm14, xmm14, 0xB1
    800         pshuflw xmm15, xmm15, 0xB1
    801         pshufhw xmm15, xmm15, 0xB1
    802         movdqa  xmm8, xmmword ptr [rsp+0x100]
    803         paddd   xmm8, xmm12
    804         paddd   xmm9, xmm13
    805         paddd   xmm10, xmm14
    806         paddd   xmm11, xmm15
    807         pxor    xmm4, xmm8
    808         pxor    xmm5, xmm9
    809         pxor    xmm6, xmm10
    810         pxor    xmm7, xmm11
    811         movdqa  xmmword ptr [rsp+0x100], xmm8
    812         movdqa  xmm8, xmm4
    813         psrld   xmm8, 12
    814         pslld   xmm4, 20
    815         por     xmm4, xmm8
    816         movdqa  xmm8, xmm5
    817         psrld   xmm8, 12
    818         pslld   xmm5, 20
    819         por     xmm5, xmm8
    820         movdqa  xmm8, xmm6
    821         psrld   xmm8, 12
    822         pslld   xmm6, 20
    823         por     xmm6, xmm8
    824         movdqa  xmm8, xmm7
    825         psrld   xmm8, 12
    826         pslld   xmm7, 20
    827         por     xmm7, xmm8
    828         paddd   xmm0, xmmword ptr [rsp+0x70]
    829         paddd   xmm1, xmmword ptr [rsp+0x90]
    830         paddd   xmm2, xmmword ptr [rsp+0x30]
    831         paddd   xmm3, xmmword ptr [rsp+0xF0]
    832         paddd   xmm0, xmm4
    833         paddd   xmm1, xmm5
    834         paddd   xmm2, xmm6
    835         paddd   xmm3, xmm7
    836         pxor    xmm12, xmm0
    837         pxor    xmm13, xmm1
    838         pxor    xmm14, xmm2
    839         pxor    xmm15, xmm3
    840         movdqa  xmm8, xmm12
    841         psrld   xmm12, 8
    842         pslld   xmm8, 24
    843         pxor    xmm12, xmm8
    844         movdqa  xmm8, xmm13
    845         psrld   xmm13, 8
    846         pslld   xmm8, 24
    847         pxor    xmm13, xmm8
    848         movdqa  xmm8, xmm14
    849         psrld   xmm14, 8
    850         pslld   xmm8, 24
    851         pxor    xmm14, xmm8
    852         movdqa  xmm8, xmm15
    853         psrld   xmm15, 8
    854         pslld   xmm8, 24
    855         pxor    xmm15, xmm8
    856         movdqa  xmm8, xmmword ptr [rsp+0x100]
    857         paddd   xmm8, xmm12
    858         paddd   xmm9, xmm13
    859         paddd   xmm10, xmm14
    860         paddd   xmm11, xmm15
    861         pxor    xmm4, xmm8
    862         pxor    xmm5, xmm9
    863         pxor    xmm6, xmm10
    864         pxor    xmm7, xmm11
    865         movdqa  xmmword ptr [rsp+0x100], xmm8
    866         movdqa  xmm8, xmm4
    867         psrld   xmm8, 7
    868         pslld   xmm4, 25
    869         por     xmm4, xmm8
    870         movdqa  xmm8, xmm5
    871         psrld   xmm8, 7
    872         pslld   xmm5, 25
    873         por     xmm5, xmm8
    874         movdqa  xmm8, xmm6
    875         psrld   xmm8, 7
    876         pslld   xmm6, 25
    877         por     xmm6, xmm8
    878         movdqa  xmm8, xmm7
    879         psrld   xmm8, 7
    880         pslld   xmm7, 25
    881         por     xmm7, xmm8
    882         paddd   xmm0, xmmword ptr [rsp+0x40]
    883         paddd   xmm1, xmmword ptr [rsp+0xB0]
    884         paddd   xmm2, xmmword ptr [rsp+0x50]
    885         paddd   xmm3, xmmword ptr [rsp+0x10]
    886         paddd   xmm0, xmm5
    887         paddd   xmm1, xmm6
    888         paddd   xmm2, xmm7
    889         paddd   xmm3, xmm4
    890         pxor    xmm15, xmm0
    891         pxor    xmm12, xmm1
    892         pxor    xmm13, xmm2
    893         pxor    xmm14, xmm3
    894         pshuflw xmm15, xmm15, 0xB1
    895         pshufhw xmm15, xmm15, 0xB1
    896         pshuflw xmm12, xmm12, 0xB1
    897         pshufhw xmm12, xmm12, 0xB1
    898         pshuflw xmm13, xmm13, 0xB1
    899         pshufhw xmm13, xmm13, 0xB1
    900         pshuflw xmm14, xmm14, 0xB1
    901         pshufhw xmm14, xmm14, 0xB1
    902         paddd   xmm10, xmm15
    903         paddd   xmm11, xmm12
    904         movdqa  xmm8, xmmword ptr [rsp+0x100]
    905         paddd   xmm8, xmm13
    906         paddd   xmm9, xmm14
    907         pxor    xmm5, xmm10
    908         pxor    xmm6, xmm11
    909         pxor    xmm7, xmm8
    910         pxor    xmm4, xmm9
    911         movdqa  xmmword ptr [rsp+0x100], xmm8
    912         movdqa  xmm8, xmm5
    913         psrld   xmm8, 12
    914         pslld   xmm5, 20
    915         por     xmm5, xmm8
    916         movdqa  xmm8, xmm6
    917         psrld   xmm8, 12
    918         pslld   xmm6, 20
    919         por     xmm6, xmm8
    920         movdqa  xmm8, xmm7
    921         psrld   xmm8, 12
    922         pslld   xmm7, 20
    923         por     xmm7, xmm8
    924         movdqa  xmm8, xmm4
    925         psrld   xmm8, 12
    926         pslld   xmm4, 20
    927         por     xmm4, xmm8
    928         paddd   xmm0, xmmword ptr [rsp]
    929         paddd   xmm1, xmmword ptr [rsp+0x20]
    930         paddd   xmm2, xmmword ptr [rsp+0x80]
    931         paddd   xmm3, xmmword ptr [rsp+0x60]
    932         paddd   xmm0, xmm5
    933         paddd   xmm1, xmm6
    934         paddd   xmm2, xmm7
    935         paddd   xmm3, xmm4
    936         pxor    xmm15, xmm0
    937         pxor    xmm12, xmm1
    938         pxor    xmm13, xmm2
    939         pxor    xmm14, xmm3
    940         movdqa  xmm8, xmm15
    941         psrld   xmm15, 8
    942         pslld   xmm8, 24
    943         pxor    xmm15, xmm8
    944         movdqa  xmm8, xmm12
    945         psrld   xmm12, 8
    946         pslld   xmm8, 24
    947         pxor    xmm12, xmm8
    948         movdqa  xmm8, xmm13
    949         psrld   xmm13, 8
    950         pslld   xmm8, 24
    951         pxor    xmm13, xmm8
    952         movdqa  xmm8, xmm14
    953         psrld   xmm14, 8
    954         pslld   xmm8, 24
    955         pxor    xmm14, xmm8
    956         paddd   xmm10, xmm15
    957         paddd   xmm11, xmm12
    958         movdqa  xmm8, xmmword ptr [rsp+0x100]
    959         paddd   xmm8, xmm13
    960         paddd   xmm9, xmm14
    961         pxor    xmm5, xmm10
    962         pxor    xmm6, xmm11
    963         pxor    xmm7, xmm8
    964         pxor    xmm4, xmm9
    965         movdqa  xmmword ptr [rsp+0x100], xmm8
    966         movdqa  xmm8, xmm5
    967         psrld   xmm8, 7
    968         pslld   xmm5, 25
    969         por     xmm5, xmm8
    970         movdqa  xmm8, xmm6
    971         psrld   xmm8, 7
    972         pslld   xmm6, 25
    973         por     xmm6, xmm8
    974         movdqa  xmm8, xmm7
    975         psrld   xmm8, 7
    976         pslld   xmm7, 25
    977         por     xmm7, xmm8
    978         movdqa  xmm8, xmm4
    979         psrld   xmm8, 7
    980         pslld   xmm4, 25
    981         por     xmm4, xmm8
    982         paddd   xmm0, xmmword ptr [rsp+0xC0]
    983         paddd   xmm1, xmmword ptr [rsp+0x90]
    984         paddd   xmm2, xmmword ptr [rsp+0xF0]
    985         paddd   xmm3, xmmword ptr [rsp+0xE0]
    986         paddd   xmm0, xmm4
    987         paddd   xmm1, xmm5
    988         paddd   xmm2, xmm6
    989         paddd   xmm3, xmm7
    990         pxor    xmm12, xmm0
    991         pxor    xmm13, xmm1
    992         pxor    xmm14, xmm2
    993         pxor    xmm15, xmm3
    994         pshuflw xmm12, xmm12, 0xB1
    995         pshufhw xmm12, xmm12, 0xB1
    996         pshuflw xmm13, xmm13, 0xB1
    997         pshufhw xmm13, xmm13, 0xB1
    998         pshuflw xmm14, xmm14, 0xB1
    999         pshufhw xmm14, xmm14, 0xB1
   1000         pshuflw xmm15, xmm15, 0xB1
   1001         pshufhw xmm15, xmm15, 0xB1
   1002         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1003         paddd   xmm8, xmm12
   1004         paddd   xmm9, xmm13
   1005         paddd   xmm10, xmm14
   1006         paddd   xmm11, xmm15
   1007         pxor    xmm4, xmm8
   1008         pxor    xmm5, xmm9
   1009         pxor    xmm6, xmm10
   1010         pxor    xmm7, xmm11
   1011         movdqa  xmmword ptr [rsp+0x100], xmm8
   1012         movdqa  xmm8, xmm4
   1013         psrld   xmm8, 12
   1014         pslld   xmm4, 20
   1015         por     xmm4, xmm8
   1016         movdqa  xmm8, xmm5
   1017         psrld   xmm8, 12
   1018         pslld   xmm5, 20
   1019         por     xmm5, xmm8
   1020         movdqa  xmm8, xmm6
   1021         psrld   xmm8, 12
   1022         pslld   xmm6, 20
   1023         por     xmm6, xmm8
   1024         movdqa  xmm8, xmm7
   1025         psrld   xmm8, 12
   1026         pslld   xmm7, 20
   1027         por     xmm7, xmm8
   1028         paddd   xmm0, xmmword ptr [rsp+0xD0]
   1029         paddd   xmm1, xmmword ptr [rsp+0xB0]
   1030         paddd   xmm2, xmmword ptr [rsp+0xA0]
   1031         paddd   xmm3, xmmword ptr [rsp+0x80]
   1032         paddd   xmm0, xmm4
   1033         paddd   xmm1, xmm5
   1034         paddd   xmm2, xmm6
   1035         paddd   xmm3, xmm7
   1036         pxor    xmm12, xmm0
   1037         pxor    xmm13, xmm1
   1038         pxor    xmm14, xmm2
   1039         pxor    xmm15, xmm3
   1040         movdqa  xmm8, xmm12
   1041         psrld   xmm12, 8
   1042         pslld   xmm8, 24
   1043         pxor    xmm12, xmm8
   1044         movdqa  xmm8, xmm13
   1045         psrld   xmm13, 8
   1046         pslld   xmm8, 24
   1047         pxor    xmm13, xmm8
   1048         movdqa  xmm8, xmm14
   1049         psrld   xmm14, 8
   1050         pslld   xmm8, 24
   1051         pxor    xmm14, xmm8
   1052         movdqa  xmm8, xmm15
   1053         psrld   xmm15, 8
   1054         pslld   xmm8, 24
   1055         pxor    xmm15, xmm8
   1056         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1057         paddd   xmm8, xmm12
   1058         paddd   xmm9, xmm13
   1059         paddd   xmm10, xmm14
   1060         paddd   xmm11, xmm15
   1061         pxor    xmm4, xmm8
   1062         pxor    xmm5, xmm9
   1063         pxor    xmm6, xmm10
   1064         pxor    xmm7, xmm11
   1065         movdqa  xmmword ptr [rsp+0x100], xmm8
   1066         movdqa  xmm8, xmm4
   1067         psrld   xmm8, 7
   1068         pslld   xmm4, 25
   1069         por     xmm4, xmm8
   1070         movdqa  xmm8, xmm5
   1071         psrld   xmm8, 7
   1072         pslld   xmm5, 25
   1073         por     xmm5, xmm8
   1074         movdqa  xmm8, xmm6
   1075         psrld   xmm8, 7
   1076         pslld   xmm6, 25
   1077         por     xmm6, xmm8
   1078         movdqa  xmm8, xmm7
   1079         psrld   xmm8, 7
   1080         pslld   xmm7, 25
   1081         por     xmm7, xmm8
   1082         paddd   xmm0, xmmword ptr [rsp+0x70]
   1083         paddd   xmm1, xmmword ptr [rsp+0x50]
   1084         paddd   xmm2, xmmword ptr [rsp]
   1085         paddd   xmm3, xmmword ptr [rsp+0x60]
   1086         paddd   xmm0, xmm5
   1087         paddd   xmm1, xmm6
   1088         paddd   xmm2, xmm7
   1089         paddd   xmm3, xmm4
   1090         pxor    xmm15, xmm0
   1091         pxor    xmm12, xmm1
   1092         pxor    xmm13, xmm2
   1093         pxor    xmm14, xmm3
   1094         pshuflw xmm15, xmm15, 0xB1
   1095         pshufhw xmm15, xmm15, 0xB1
   1096         pshuflw xmm12, xmm12, 0xB1
   1097         pshufhw xmm12, xmm12, 0xB1
   1098         pshuflw xmm13, xmm13, 0xB1
   1099         pshufhw xmm13, xmm13, 0xB1
   1100         pshuflw xmm14, xmm14, 0xB1
   1101         pshufhw xmm14, xmm14, 0xB1
   1102         paddd   xmm10, xmm15
   1103         paddd   xmm11, xmm12
   1104         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1105         paddd   xmm8, xmm13
   1106         paddd   xmm9, xmm14
   1107         pxor    xmm5, xmm10
   1108         pxor    xmm6, xmm11
   1109         pxor    xmm7, xmm8
   1110         pxor    xmm4, xmm9
   1111         movdqa  xmmword ptr [rsp+0x100], xmm8
   1112         movdqa  xmm8, xmm5
   1113         psrld   xmm8, 12
   1114         pslld   xmm5, 20
   1115         por     xmm5, xmm8
   1116         movdqa  xmm8, xmm6
   1117         psrld   xmm8, 12
   1118         pslld   xmm6, 20
   1119         por     xmm6, xmm8
   1120         movdqa  xmm8, xmm7
   1121         psrld   xmm8, 12
   1122         pslld   xmm7, 20
   1123         por     xmm7, xmm8
   1124         movdqa  xmm8, xmm4
   1125         psrld   xmm8, 12
   1126         pslld   xmm4, 20
   1127         por     xmm4, xmm8
   1128         paddd   xmm0, xmmword ptr [rsp+0x20]
   1129         paddd   xmm1, xmmword ptr [rsp+0x30]
   1130         paddd   xmm2, xmmword ptr [rsp+0x10]
   1131         paddd   xmm3, xmmword ptr [rsp+0x40]
   1132         paddd   xmm0, xmm5
   1133         paddd   xmm1, xmm6
   1134         paddd   xmm2, xmm7
   1135         paddd   xmm3, xmm4
   1136         pxor    xmm15, xmm0
   1137         pxor    xmm12, xmm1
   1138         pxor    xmm13, xmm2
   1139         pxor    xmm14, xmm3
   1140         movdqa  xmm8, xmm15
   1141         psrld   xmm15, 8
   1142         pslld   xmm8, 24
   1143         pxor    xmm15, xmm8
   1144         movdqa  xmm8, xmm12
   1145         psrld   xmm12, 8
   1146         pslld   xmm8, 24
   1147         pxor    xmm12, xmm8
   1148         movdqa  xmm8, xmm13
   1149         psrld   xmm13, 8
   1150         pslld   xmm8, 24
   1151         pxor    xmm13, xmm8
   1152         movdqa  xmm8, xmm14
   1153         psrld   xmm14, 8
   1154         pslld   xmm8, 24
   1155         pxor    xmm14, xmm8
   1156         paddd   xmm10, xmm15
   1157         paddd   xmm11, xmm12
   1158         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1159         paddd   xmm8, xmm13
   1160         paddd   xmm9, xmm14
   1161         pxor    xmm5, xmm10
   1162         pxor    xmm6, xmm11
   1163         pxor    xmm7, xmm8
   1164         pxor    xmm4, xmm9
   1165         movdqa  xmmword ptr [rsp+0x100], xmm8
   1166         movdqa  xmm8, xmm5
   1167         psrld   xmm8, 7
   1168         pslld   xmm5, 25
   1169         por     xmm5, xmm8
   1170         movdqa  xmm8, xmm6
   1171         psrld   xmm8, 7
   1172         pslld   xmm6, 25
   1173         por     xmm6, xmm8
   1174         movdqa  xmm8, xmm7
   1175         psrld   xmm8, 7
   1176         pslld   xmm7, 25
   1177         por     xmm7, xmm8
   1178         movdqa  xmm8, xmm4
   1179         psrld   xmm8, 7
   1180         pslld   xmm4, 25
   1181         por     xmm4, xmm8
   1182         paddd   xmm0, xmmword ptr [rsp+0x90]
   1183         paddd   xmm1, xmmword ptr [rsp+0xB0]
   1184         paddd   xmm2, xmmword ptr [rsp+0x80]
   1185         paddd   xmm3, xmmword ptr [rsp+0xF0]
   1186         paddd   xmm0, xmm4
   1187         paddd   xmm1, xmm5
   1188         paddd   xmm2, xmm6
   1189         paddd   xmm3, xmm7
   1190         pxor    xmm12, xmm0
   1191         pxor    xmm13, xmm1
   1192         pxor    xmm14, xmm2
   1193         pxor    xmm15, xmm3
   1194         pshuflw xmm12, xmm12, 0xB1
   1195         pshufhw xmm12, xmm12, 0xB1
   1196         pshuflw xmm13, xmm13, 0xB1
   1197         pshufhw xmm13, xmm13, 0xB1
   1198         pshuflw xmm14, xmm14, 0xB1
   1199         pshufhw xmm14, xmm14, 0xB1
   1200         pshuflw xmm15, xmm15, 0xB1
   1201         pshufhw xmm15, xmm15, 0xB1
   1202         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1203         paddd   xmm8, xmm12
   1204         paddd   xmm9, xmm13
   1205         paddd   xmm10, xmm14
   1206         paddd   xmm11, xmm15
   1207         pxor    xmm4, xmm8
   1208         pxor    xmm5, xmm9
   1209         pxor    xmm6, xmm10
   1210         pxor    xmm7, xmm11
   1211         movdqa  xmmword ptr [rsp+0x100], xmm8
   1212         movdqa  xmm8, xmm4
   1213         psrld   xmm8, 12
   1214         pslld   xmm4, 20
   1215         por     xmm4, xmm8
   1216         movdqa  xmm8, xmm5
   1217         psrld   xmm8, 12
   1218         pslld   xmm5, 20
   1219         por     xmm5, xmm8
   1220         movdqa  xmm8, xmm6
   1221         psrld   xmm8, 12
   1222         pslld   xmm6, 20
   1223         por     xmm6, xmm8
   1224         movdqa  xmm8, xmm7
   1225         psrld   xmm8, 12
   1226         pslld   xmm7, 20
   1227         por     xmm7, xmm8
   1228         paddd   xmm0, xmmword ptr [rsp+0xE0]
   1229         paddd   xmm1, xmmword ptr [rsp+0x50]
   1230         paddd   xmm2, xmmword ptr [rsp+0xC0]
   1231         paddd   xmm3, xmmword ptr [rsp+0x10]
   1232         paddd   xmm0, xmm4
   1233         paddd   xmm1, xmm5
   1234         paddd   xmm2, xmm6
   1235         paddd   xmm3, xmm7
   1236         pxor    xmm12, xmm0
   1237         pxor    xmm13, xmm1
   1238         pxor    xmm14, xmm2
   1239         pxor    xmm15, xmm3
   1240         movdqa  xmm8, xmm12
   1241         psrld   xmm12, 8
   1242         pslld   xmm8, 24
   1243         pxor    xmm12, xmm8
   1244         movdqa  xmm8, xmm13
   1245         psrld   xmm13, 8
   1246         pslld   xmm8, 24
   1247         pxor    xmm13, xmm8
   1248         movdqa  xmm8, xmm14
   1249         psrld   xmm14, 8
   1250         pslld   xmm8, 24
   1251         pxor    xmm14, xmm8
   1252         movdqa  xmm8, xmm15
   1253         psrld   xmm15, 8
   1254         pslld   xmm8, 24
   1255         pxor    xmm15, xmm8
   1256         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1257         paddd   xmm8, xmm12
   1258         paddd   xmm9, xmm13
   1259         paddd   xmm10, xmm14
   1260         paddd   xmm11, xmm15
   1261         pxor    xmm4, xmm8
   1262         pxor    xmm5, xmm9
   1263         pxor    xmm6, xmm10
   1264         pxor    xmm7, xmm11
   1265         movdqa  xmmword ptr [rsp+0x100], xmm8
   1266         movdqa  xmm8, xmm4
   1267         psrld   xmm8, 7
   1268         pslld   xmm4, 25
   1269         por     xmm4, xmm8
   1270         movdqa  xmm8, xmm5
   1271         psrld   xmm8, 7
   1272         pslld   xmm5, 25
   1273         por     xmm5, xmm8
   1274         movdqa  xmm8, xmm6
   1275         psrld   xmm8, 7
   1276         pslld   xmm6, 25
   1277         por     xmm6, xmm8
   1278         movdqa  xmm8, xmm7
   1279         psrld   xmm8, 7
   1280         pslld   xmm7, 25
   1281         por     xmm7, xmm8
   1282         paddd   xmm0, xmmword ptr [rsp+0xD0]
   1283         paddd   xmm1, xmmword ptr [rsp]
   1284         paddd   xmm2, xmmword ptr [rsp+0x20]
   1285         paddd   xmm3, xmmword ptr [rsp+0x40]
   1286         paddd   xmm0, xmm5
   1287         paddd   xmm1, xmm6
   1288         paddd   xmm2, xmm7
   1289         paddd   xmm3, xmm4
   1290         pxor    xmm15, xmm0
   1291         pxor    xmm12, xmm1
   1292         pxor    xmm13, xmm2
   1293         pxor    xmm14, xmm3
   1294         pshuflw xmm15, xmm15, 0xB1
   1295         pshufhw xmm15, xmm15, 0xB1
   1296         pshuflw xmm12, xmm12, 0xB1
   1297         pshufhw xmm12, xmm12, 0xB1
   1298         pshuflw xmm13, xmm13, 0xB1
   1299         pshufhw xmm13, xmm13, 0xB1
   1300         pshuflw xmm14, xmm14, 0xB1
   1301         pshufhw xmm14, xmm14, 0xB1
   1302         paddd   xmm10, xmm15
   1303         paddd   xmm11, xmm12
   1304         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1305         paddd   xmm8, xmm13
   1306         paddd   xmm9, xmm14
   1307         pxor    xmm5, xmm10
   1308         pxor    xmm6, xmm11
   1309         pxor    xmm7, xmm8
   1310         pxor    xmm4, xmm9
   1311         movdqa  xmmword ptr [rsp+0x100], xmm8
   1312         movdqa  xmm8, xmm5
   1313         psrld   xmm8, 12
   1314         pslld   xmm5, 20
   1315         por     xmm5, xmm8
   1316         movdqa  xmm8, xmm6
   1317         psrld   xmm8, 12
   1318         pslld   xmm6, 20
   1319         por     xmm6, xmm8
   1320         movdqa  xmm8, xmm7
   1321         psrld   xmm8, 12
   1322         pslld   xmm7, 20
   1323         por     xmm7, xmm8
   1324         movdqa  xmm8, xmm4
   1325         psrld   xmm8, 12
   1326         pslld   xmm4, 20
   1327         por     xmm4, xmm8
   1328         paddd   xmm0, xmmword ptr [rsp+0x30]
   1329         paddd   xmm1, xmmword ptr [rsp+0xA0]
   1330         paddd   xmm2, xmmword ptr [rsp+0x60]
   1331         paddd   xmm3, xmmword ptr [rsp+0x70]
   1332         paddd   xmm0, xmm5
   1333         paddd   xmm1, xmm6
   1334         paddd   xmm2, xmm7
   1335         paddd   xmm3, xmm4
   1336         pxor    xmm15, xmm0
   1337         pxor    xmm12, xmm1
   1338         pxor    xmm13, xmm2
   1339         pxor    xmm14, xmm3
   1340         movdqa  xmm8, xmm15
   1341         psrld   xmm15, 8
   1342         pslld   xmm8, 24
   1343         pxor    xmm15, xmm8
   1344         movdqa  xmm8, xmm12
   1345         psrld   xmm12, 8
   1346         pslld   xmm8, 24
   1347         pxor    xmm12, xmm8
   1348         movdqa  xmm8, xmm13
   1349         psrld   xmm13, 8
   1350         pslld   xmm8, 24
   1351         pxor    xmm13, xmm8
   1352         movdqa  xmm8, xmm14
   1353         psrld   xmm14, 8
   1354         pslld   xmm8, 24
   1355         pxor    xmm14, xmm8
   1356         paddd   xmm10, xmm15
   1357         paddd   xmm11, xmm12
   1358         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1359         paddd   xmm8, xmm13
   1360         paddd   xmm9, xmm14
   1361         pxor    xmm5, xmm10
   1362         pxor    xmm6, xmm11
   1363         pxor    xmm7, xmm8
   1364         pxor    xmm4, xmm9
   1365         movdqa  xmmword ptr [rsp+0x100], xmm8
   1366         movdqa  xmm8, xmm5
   1367         psrld   xmm8, 7
   1368         pslld   xmm5, 25
   1369         por     xmm5, xmm8
   1370         movdqa  xmm8, xmm6
   1371         psrld   xmm8, 7
   1372         pslld   xmm6, 25
   1373         por     xmm6, xmm8
   1374         movdqa  xmm8, xmm7
   1375         psrld   xmm8, 7
   1376         pslld   xmm7, 25
   1377         por     xmm7, xmm8
   1378         movdqa  xmm8, xmm4
   1379         psrld   xmm8, 7
   1380         pslld   xmm4, 25
   1381         por     xmm4, xmm8
   1382         paddd   xmm0, xmmword ptr [rsp+0xB0]
   1383         paddd   xmm1, xmmword ptr [rsp+0x50]
   1384         paddd   xmm2, xmmword ptr [rsp+0x10]
   1385         paddd   xmm3, xmmword ptr [rsp+0x80]
   1386         paddd   xmm0, xmm4
   1387         paddd   xmm1, xmm5
   1388         paddd   xmm2, xmm6
   1389         paddd   xmm3, xmm7
   1390         pxor    xmm12, xmm0
   1391         pxor    xmm13, xmm1
   1392         pxor    xmm14, xmm2
   1393         pxor    xmm15, xmm3
   1394         pshuflw xmm12, xmm12, 0xB1
   1395         pshufhw xmm12, xmm12, 0xB1
   1396         pshuflw xmm13, xmm13, 0xB1
   1397         pshufhw xmm13, xmm13, 0xB1
   1398         pshuflw xmm14, xmm14, 0xB1
   1399         pshufhw xmm14, xmm14, 0xB1
   1400         pshuflw xmm15, xmm15, 0xB1
   1401         pshufhw xmm15, xmm15, 0xB1
   1402         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1403         paddd   xmm8, xmm12
   1404         paddd   xmm9, xmm13
   1405         paddd   xmm10, xmm14
   1406         paddd   xmm11, xmm15
   1407         pxor    xmm4, xmm8
   1408         pxor    xmm5, xmm9
   1409         pxor    xmm6, xmm10
   1410         pxor    xmm7, xmm11
   1411         movdqa  xmmword ptr [rsp+0x100], xmm8
   1412         movdqa  xmm8, xmm4
   1413         psrld   xmm8, 12
   1414         pslld   xmm4, 20
   1415         por     xmm4, xmm8
   1416         movdqa  xmm8, xmm5
   1417         psrld   xmm8, 12
   1418         pslld   xmm5, 20
   1419         por     xmm5, xmm8
   1420         movdqa  xmm8, xmm6
   1421         psrld   xmm8, 12
   1422         pslld   xmm6, 20
   1423         por     xmm6, xmm8
   1424         movdqa  xmm8, xmm7
   1425         psrld   xmm8, 12
   1426         pslld   xmm7, 20
   1427         por     xmm7, xmm8
   1428         paddd   xmm0, xmmword ptr [rsp+0xF0]
   1429         paddd   xmm1, xmmword ptr [rsp]
   1430         paddd   xmm2, xmmword ptr [rsp+0x90]
   1431         paddd   xmm3, xmmword ptr [rsp+0x60]
   1432         paddd   xmm0, xmm4
   1433         paddd   xmm1, xmm5
   1434         paddd   xmm2, xmm6
   1435         paddd   xmm3, xmm7
   1436         pxor    xmm12, xmm0
   1437         pxor    xmm13, xmm1
   1438         pxor    xmm14, xmm2
   1439         pxor    xmm15, xmm3
   1440         movdqa  xmm8, xmm12
   1441         psrld   xmm12, 8
   1442         pslld   xmm8, 24
   1443         pxor    xmm12, xmm8
   1444         movdqa  xmm8, xmm13
   1445         psrld   xmm13, 8
   1446         pslld   xmm8, 24
   1447         pxor    xmm13, xmm8
   1448         movdqa  xmm8, xmm14
   1449         psrld   xmm14, 8
   1450         pslld   xmm8, 24
   1451         pxor    xmm14, xmm8
   1452         movdqa  xmm8, xmm15
   1453         psrld   xmm15, 8
   1454         pslld   xmm8, 24
   1455         pxor    xmm15, xmm8
   1456         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1457         paddd   xmm8, xmm12
   1458         paddd   xmm9, xmm13
   1459         paddd   xmm10, xmm14
   1460         paddd   xmm11, xmm15
   1461         pxor    xmm4, xmm8
   1462         pxor    xmm5, xmm9
   1463         pxor    xmm6, xmm10
   1464         pxor    xmm7, xmm11
   1465         movdqa  xmmword ptr [rsp+0x100], xmm8
   1466         movdqa  xmm8, xmm4
   1467         psrld   xmm8, 7
   1468         pslld   xmm4, 25
   1469         por     xmm4, xmm8
   1470         movdqa  xmm8, xmm5
   1471         psrld   xmm8, 7
   1472         pslld   xmm5, 25
   1473         por     xmm5, xmm8
   1474         movdqa  xmm8, xmm6
   1475         psrld   xmm8, 7
   1476         pslld   xmm6, 25
   1477         por     xmm6, xmm8
   1478         movdqa  xmm8, xmm7
   1479         psrld   xmm8, 7
   1480         pslld   xmm7, 25
   1481         por     xmm7, xmm8
   1482         paddd   xmm0, xmmword ptr [rsp+0xE0]
   1483         paddd   xmm1, xmmword ptr [rsp+0x20]
   1484         paddd   xmm2, xmmword ptr [rsp+0x30]
   1485         paddd   xmm3, xmmword ptr [rsp+0x70]
   1486         paddd   xmm0, xmm5
   1487         paddd   xmm1, xmm6
   1488         paddd   xmm2, xmm7
   1489         paddd   xmm3, xmm4
   1490         pxor    xmm15, xmm0
   1491         pxor    xmm12, xmm1
   1492         pxor    xmm13, xmm2
   1493         pxor    xmm14, xmm3
   1494         pshuflw xmm15, xmm15, 0xB1
   1495         pshufhw xmm15, xmm15, 0xB1
   1496         pshuflw xmm12, xmm12, 0xB1
   1497         pshufhw xmm12, xmm12, 0xB1
   1498         pshuflw xmm13, xmm13, 0xB1
   1499         pshufhw xmm13, xmm13, 0xB1
   1500         pshuflw xmm14, xmm14, 0xB1
   1501         pshufhw xmm14, xmm14, 0xB1
   1502         paddd   xmm10, xmm15
   1503         paddd   xmm11, xmm12
   1504         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1505         paddd   xmm8, xmm13
   1506         paddd   xmm9, xmm14
   1507         pxor    xmm5, xmm10
   1508         pxor    xmm6, xmm11
   1509         pxor    xmm7, xmm8
   1510         pxor    xmm4, xmm9
   1511         movdqa  xmmword ptr [rsp+0x100], xmm8
   1512         movdqa  xmm8, xmm5
   1513         psrld   xmm8, 12
   1514         pslld   xmm5, 20
   1515         por     xmm5, xmm8
   1516         movdqa  xmm8, xmm6
   1517         psrld   xmm8, 12
   1518         pslld   xmm6, 20
   1519         por     xmm6, xmm8
   1520         movdqa  xmm8, xmm7
   1521         psrld   xmm8, 12
   1522         pslld   xmm7, 20
   1523         por     xmm7, xmm8
   1524         movdqa  xmm8, xmm4
   1525         psrld   xmm8, 12
   1526         pslld   xmm4, 20
   1527         por     xmm4, xmm8
   1528         paddd   xmm0, xmmword ptr [rsp+0xA0]
   1529         paddd   xmm1, xmmword ptr [rsp+0xC0]
   1530         paddd   xmm2, xmmword ptr [rsp+0x40]
   1531         paddd   xmm3, xmmword ptr [rsp+0xD0]
   1532         paddd   xmm0, xmm5
   1533         paddd   xmm1, xmm6
   1534         paddd   xmm2, xmm7
   1535         paddd   xmm3, xmm4
   1536         pxor    xmm15, xmm0
   1537         pxor    xmm12, xmm1
   1538         pxor    xmm13, xmm2
   1539         pxor    xmm14, xmm3
   1540         movdqa  xmm8, xmm15
   1541         psrld   xmm15, 8
   1542         pslld   xmm8, 24
   1543         pxor    xmm15, xmm8
   1544         movdqa  xmm8, xmm12
   1545         psrld   xmm12, 8
   1546         pslld   xmm8, 24
   1547         pxor    xmm12, xmm8
   1548         movdqa  xmm8, xmm13
   1549         psrld   xmm13, 8
   1550         pslld   xmm8, 24
   1551         pxor    xmm13, xmm8
   1552         movdqa  xmm8, xmm14
   1553         psrld   xmm14, 8
   1554         pslld   xmm8, 24
   1555         pxor    xmm14, xmm8
   1556         paddd   xmm10, xmm15
   1557         paddd   xmm11, xmm12
   1558         movdqa  xmm8, xmmword ptr [rsp+0x100]
   1559         paddd   xmm8, xmm13
   1560         paddd   xmm9, xmm14
   1561         pxor    xmm5, xmm10
   1562         pxor    xmm6, xmm11
   1563         pxor    xmm7, xmm8
   1564         pxor    xmm4, xmm9
   1565         pxor    xmm0, xmm8
   1566         pxor    xmm1, xmm9
   1567         pxor    xmm2, xmm10
   1568         pxor    xmm3, xmm11
   1569         movdqa  xmm8, xmm5
   1570         psrld   xmm8, 7
   1571         pslld   xmm5, 25
   1572         por     xmm5, xmm8
   1573         movdqa  xmm8, xmm6
   1574         psrld   xmm8, 7
   1575         pslld   xmm6, 25
   1576         por     xmm6, xmm8
   1577         movdqa  xmm8, xmm7
   1578         psrld   xmm8, 7
   1579         pslld   xmm7, 25
   1580         por     xmm7, xmm8
   1581         movdqa  xmm8, xmm4
   1582         psrld   xmm8, 7
   1583         pslld   xmm4, 25
   1584         por     xmm4, xmm8
   1585         pxor    xmm4, xmm12
   1586         pxor    xmm5, xmm13
   1587         pxor    xmm6, xmm14
   1588         pxor    xmm7, xmm15
   1589         mov     eax, r13d
   1590         jne     9b
   1591         movdqa  xmm9, xmm0
   1592         punpckldq xmm0, xmm1
   1593         punpckhdq xmm9, xmm1
   1594         movdqa  xmm11, xmm2
   1595         punpckldq xmm2, xmm3
   1596         punpckhdq xmm11, xmm3
   1597         movdqa  xmm1, xmm0
   1598         punpcklqdq xmm0, xmm2
   1599         punpckhqdq xmm1, xmm2
   1600         movdqa  xmm3, xmm9
   1601         punpcklqdq xmm9, xmm11
   1602         punpckhqdq xmm3, xmm11
   1603         movdqu  xmmword ptr [rbx], xmm0
   1604         movdqu  xmmword ptr [rbx+0x20], xmm1
   1605         movdqu  xmmword ptr [rbx+0x40], xmm9
   1606         movdqu  xmmword ptr [rbx+0x60], xmm3
   1607         movdqa  xmm9, xmm4
   1608         punpckldq xmm4, xmm5
   1609         punpckhdq xmm9, xmm5
   1610         movdqa  xmm11, xmm6
   1611         punpckldq xmm6, xmm7
   1612         punpckhdq xmm11, xmm7
   1613         movdqa  xmm5, xmm4
   1614         punpcklqdq xmm4, xmm6
   1615         punpckhqdq xmm5, xmm6
   1616         movdqa  xmm7, xmm9
   1617         punpcklqdq xmm9, xmm11
   1618         punpckhqdq xmm7, xmm11
   1619         movdqu  xmmword ptr [rbx+0x10], xmm4
   1620         movdqu  xmmword ptr [rbx+0x30], xmm5
   1621         movdqu  xmmword ptr [rbx+0x50], xmm9
   1622         movdqu  xmmword ptr [rbx+0x70], xmm7
   1623         movdqa  xmm1, xmmword ptr [rsp+0x110]
   1624         movdqa  xmm0, xmm1
   1625         paddd   xmm1, xmmword ptr [rsp+0x150]
   1626         movdqa  xmmword ptr [rsp+0x110], xmm1
   1627         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
   1628         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
   1629         pcmpgtd xmm0, xmm1
   1630         movdqa  xmm1, xmmword ptr [rsp+0x120]
   1631         psubd   xmm1, xmm0
   1632         movdqa  xmmword ptr [rsp+0x120], xmm1
   1633         add     rbx, 128
   1634         add     rdi, 32
   1635         sub     rsi, 4
   1636         cmp     rsi, 4
   1637         jnc     2b
   1638         test    rsi, rsi
   1639         jne     3f
   1640 4:
   1641         movdqa  xmm6, xmmword ptr [rsp+0x170]
   1642         movdqa  xmm7, xmmword ptr [rsp+0x180]
   1643         movdqa  xmm8, xmmword ptr [rsp+0x190]
   1644         movdqa  xmm9, xmmword ptr [rsp+0x1A0]
   1645         movdqa  xmm10, xmmword ptr [rsp+0x1B0]
   1646         movdqa  xmm11, xmmword ptr [rsp+0x1C0]
   1647         movdqa  xmm12, xmmword ptr [rsp+0x1D0]
   1648         movdqa  xmm13, xmmword ptr [rsp+0x1E0]
   1649         movdqa  xmm14, xmmword ptr [rsp+0x1F0]
   1650         movdqa  xmm15, xmmword ptr [rsp+0x200]
   1651         mov     rsp, rbp
   1652         pop     rbp
   1653         pop     rbx
   1654         pop     rdi
   1655         pop     rsi
   1656         pop     r12
   1657         pop     r13
   1658         pop     r14
   1659         pop     r15
   1660         ret
   1661 .p2align 5
   1662 3:
   1663         test    esi, 0x2
   1664         je      3f
   1665         movups  xmm0, xmmword ptr [rcx]
   1666         movups  xmm1, xmmword ptr [rcx+0x10]
   1667         movaps  xmm8, xmm0
   1668         movaps  xmm9, xmm1
   1669         movd    xmm13, dword ptr [rsp+0x110]
   1670         movd    xmm14, dword ptr [rsp+0x120]
   1671         punpckldq xmm13, xmm14
   1672         movaps  xmmword ptr [rsp], xmm13
   1673         movd    xmm14, dword ptr [rsp+0x114]
   1674         movd    xmm13, dword ptr [rsp+0x124]
   1675         punpckldq xmm14, xmm13
   1676         movaps  xmmword ptr [rsp+0x10], xmm14
   1677         mov     r8, qword ptr [rdi]
   1678         mov     r9, qword ptr [rdi+0x8]
   1679         movzx   eax, byte ptr [rbp+0x80]
   1680         or      eax, r13d
   1681         xor     edx, edx
   1682 2:
   1683         mov     r14d, eax
   1684         or      eax, r12d
   1685         add     rdx, 64
   1686         cmp     rdx, r15
   1687         cmovne  eax, r14d
   1688         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
   1689         movaps  xmm10, xmm2
   1690         movups  xmm4, xmmword ptr [r8+rdx-0x40]
   1691         movups  xmm5, xmmword ptr [r8+rdx-0x30]
   1692         movaps  xmm3, xmm4
   1693         shufps  xmm4, xmm5, 136
   1694         shufps  xmm3, xmm5, 221
   1695         movaps  xmm5, xmm3
   1696         movups  xmm6, xmmword ptr [r8+rdx-0x20]
   1697         movups  xmm7, xmmword ptr [r8+rdx-0x10]
   1698         movaps  xmm3, xmm6
   1699         shufps  xmm6, xmm7, 136
   1700         pshufd  xmm6, xmm6, 0x93
   1701         shufps  xmm3, xmm7, 221
   1702         pshufd  xmm7, xmm3, 0x93
   1703         movups  xmm12, xmmword ptr [r9+rdx-0x40]
   1704         movups  xmm13, xmmword ptr [r9+rdx-0x30]
   1705         movaps  xmm11, xmm12
   1706         shufps  xmm12, xmm13, 136
   1707         shufps  xmm11, xmm13, 221
   1708         movaps  xmm13, xmm11
   1709         movups  xmm14, xmmword ptr [r9+rdx-0x20]
   1710         movups  xmm15, xmmword ptr [r9+rdx-0x10]
   1711         movaps  xmm11, xmm14
   1712         shufps  xmm14, xmm15, 136
   1713         pshufd  xmm14, xmm14, 0x93
   1714         shufps  xmm11, xmm15, 221
   1715         pshufd  xmm15, xmm11, 0x93
   1716         shl     rax, 0x20
   1717         or      rax, 0x40
   1718         movd    xmm3, rax
   1719         movdqa  xmmword ptr [rsp+0x20], xmm3
   1720         movaps  xmm3, xmmword ptr [rsp]
   1721         movaps  xmm11, xmmword ptr [rsp+0x10]
   1722         punpcklqdq xmm3, xmmword ptr [rsp+0x20]
   1723         punpcklqdq xmm11, xmmword ptr [rsp+0x20]
   1724         mov     al, 7
   1725 9:
   1726         paddd   xmm0, xmm4
   1727         paddd   xmm8, xmm12
   1728         movaps  xmmword ptr [rsp+0x20], xmm4
   1729         movaps  xmmword ptr [rsp+0x30], xmm12
   1730         paddd   xmm0, xmm1
   1731         paddd   xmm8, xmm9
   1732         pxor    xmm3, xmm0
   1733         pxor    xmm11, xmm8
   1734         pshuflw xmm3, xmm3, 0xB1
   1735         pshufhw xmm3, xmm3, 0xB1
   1736         pshuflw xmm11, xmm11, 0xB1
   1737         pshufhw xmm11, xmm11, 0xB1
   1738         paddd   xmm2, xmm3
   1739         paddd   xmm10, xmm11
   1740         pxor    xmm1, xmm2
   1741         pxor    xmm9, xmm10
   1742         movdqa  xmm4, xmm1
   1743         pslld   xmm1, 20
   1744         psrld   xmm4, 12
   1745         por     xmm1, xmm4
   1746         movdqa  xmm4, xmm9
   1747         pslld   xmm9, 20
   1748         psrld   xmm4, 12
   1749         por     xmm9, xmm4
   1750         paddd   xmm0, xmm5
   1751         paddd   xmm8, xmm13
   1752         movaps  xmmword ptr [rsp+0x40], xmm5
   1753         movaps  xmmword ptr [rsp+0x50], xmm13
   1754         paddd   xmm0, xmm1
   1755         paddd   xmm8, xmm9
   1756         pxor    xmm3, xmm0
   1757         pxor    xmm11, xmm8
   1758         movdqa  xmm13, xmm3
   1759         psrld   xmm3, 8
   1760         pslld   xmm13, 24
   1761         pxor    xmm3, xmm13
   1762         movdqa  xmm13, xmm11
   1763         psrld   xmm11, 8
   1764         pslld   xmm13, 24
   1765         pxor    xmm11, xmm13
   1766         paddd   xmm2, xmm3
   1767         paddd   xmm10, xmm11
   1768         pxor    xmm1, xmm2
   1769         pxor    xmm9, xmm10
   1770         movdqa  xmm4, xmm1
   1771         pslld   xmm1, 25
   1772         psrld   xmm4, 7
   1773         por     xmm1, xmm4
   1774         movdqa  xmm4, xmm9
   1775         pslld   xmm9, 25
   1776         psrld   xmm4, 7
   1777         por     xmm9, xmm4
   1778         pshufd  xmm0, xmm0, 0x93
   1779         pshufd  xmm8, xmm8, 0x93
   1780         pshufd  xmm3, xmm3, 0x4E
   1781         pshufd  xmm11, xmm11, 0x4E
   1782         pshufd  xmm2, xmm2, 0x39
   1783         pshufd  xmm10, xmm10, 0x39
   1784         paddd   xmm0, xmm6
   1785         paddd   xmm8, xmm14
   1786         paddd   xmm0, xmm1
   1787         paddd   xmm8, xmm9
   1788         pxor    xmm3, xmm0
   1789         pxor    xmm11, xmm8
   1790         pshuflw xmm3, xmm3, 0xB1
   1791         pshufhw xmm3, xmm3, 0xB1
   1792         pshuflw xmm11, xmm11, 0xB1
   1793         pshufhw xmm11, xmm11, 0xB1
   1794         paddd   xmm2, xmm3
   1795         paddd   xmm10, xmm11
   1796         pxor    xmm1, xmm2
   1797         pxor    xmm9, xmm10
   1798         movdqa  xmm4, xmm1
   1799         pslld   xmm1, 20
   1800         psrld   xmm4, 12
   1801         por     xmm1, xmm4
   1802         movdqa  xmm4, xmm9
   1803         pslld   xmm9, 20
   1804         psrld   xmm4, 12
   1805         por     xmm9, xmm4
   1806         paddd   xmm0, xmm7
   1807         paddd   xmm8, xmm15
   1808         paddd   xmm0, xmm1
   1809         paddd   xmm8, xmm9
   1810         pxor    xmm3, xmm0
   1811         pxor    xmm11, xmm8
   1812         movdqa  xmm13, xmm3
   1813         psrld   xmm3, 8
   1814         pslld   xmm13, 24
   1815         pxor    xmm3, xmm13
   1816         movdqa  xmm13, xmm11
   1817         psrld   xmm11, 8
   1818         pslld   xmm13, 24
   1819         pxor    xmm11, xmm13
   1820         paddd   xmm2, xmm3
   1821         paddd   xmm10, xmm11
   1822         pxor    xmm1, xmm2
   1823         pxor    xmm9, xmm10
   1824         movdqa  xmm4, xmm1
   1825         pslld   xmm1, 25
   1826         psrld   xmm4, 7
   1827         por     xmm1, xmm4
   1828         movdqa  xmm4, xmm9
   1829         pslld   xmm9, 25
   1830         psrld   xmm4, 7
   1831         por     xmm9, xmm4
   1832         pshufd  xmm0, xmm0, 0x39
   1833         pshufd  xmm8, xmm8, 0x39
   1834         pshufd  xmm3, xmm3, 0x4E
   1835         pshufd  xmm11, xmm11, 0x4E
   1836         pshufd  xmm2, xmm2, 0x93
   1837         pshufd  xmm10, xmm10, 0x93
   1838         dec     al
   1839         je      9f
   1840         movdqa  xmm12, xmmword ptr [rsp+0x20]
   1841         movdqa  xmm5, xmmword ptr [rsp+0x40]
   1842         pshufd  xmm13, xmm12, 0x0F
   1843         shufps  xmm12, xmm5, 214
   1844         pshufd  xmm4, xmm12, 0x39
   1845         movdqa  xmm12, xmm6
   1846         shufps  xmm12, xmm7, 250
   1847         pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
   1848         pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   1849         por     xmm13, xmm12
   1850         movdqa  xmmword ptr [rsp+0x20], xmm13
   1851         movdqa  xmm12, xmm7
   1852         punpcklqdq xmm12, xmm5
   1853         movdqa  xmm13, xmm6
   1854         pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   1855         pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   1856         por     xmm12, xmm13
   1857         pshufd  xmm12, xmm12, 0x78
   1858         punpckhdq xmm5, xmm7
   1859         punpckldq xmm6, xmm5
   1860         pshufd  xmm7, xmm6, 0x1E
   1861         movdqa  xmmword ptr [rsp+0x40], xmm12
   1862         movdqa  xmm5, xmmword ptr [rsp+0x30]
   1863         movdqa  xmm13, xmmword ptr [rsp+0x50]
   1864         pshufd  xmm6, xmm5, 0x0F
   1865         shufps  xmm5, xmm13, 214
   1866         pshufd  xmm12, xmm5, 0x39
   1867         movdqa  xmm5, xmm14
   1868         shufps  xmm5, xmm15, 250
   1869         pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
   1870         pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   1871         por     xmm6, xmm5
   1872         movdqa  xmm5, xmm15
   1873         punpcklqdq xmm5, xmm13
   1874         movdqa  xmmword ptr [rsp+0x30], xmm2
   1875         movdqa  xmm2, xmm14
   1876         pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   1877         pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   1878         por     xmm5, xmm2
   1879         movdqa  xmm2, xmmword ptr [rsp+0x30]
   1880         pshufd  xmm5, xmm5, 0x78
   1881         punpckhdq xmm13, xmm15
   1882         punpckldq xmm14, xmm13
   1883         pshufd  xmm15, xmm14, 0x1E
   1884         movdqa  xmm13, xmm6
   1885         movdqa  xmm14, xmm5
   1886         movdqa  xmm5, xmmword ptr [rsp+0x20]
   1887         movdqa  xmm6, xmmword ptr [rsp+0x40]
   1888         jmp     9b
   1889 9:
   1890         pxor    xmm0, xmm2
   1891         pxor    xmm1, xmm3
   1892         pxor    xmm8, xmm10
   1893         pxor    xmm9, xmm11
   1894         mov     eax, r13d
   1895         cmp     rdx, r15
   1896         jne     2b
   1897         movups  xmmword ptr [rbx], xmm0
   1898         movups  xmmword ptr [rbx+0x10], xmm1
   1899         movups  xmmword ptr [rbx+0x20], xmm8
   1900         movups  xmmword ptr [rbx+0x30], xmm9
   1901         mov     eax, dword ptr [rsp+0x130]
   1902         neg     eax
   1903         mov    r10d, dword ptr [rsp+0x110+8*rax]
   1904         mov    r11d, dword ptr [rsp+0x120+8*rax]
   1905         mov dword ptr [rsp+0x110], r10d
   1906         mov dword ptr [rsp+0x120], r11d
   1907         add     rdi, 16
   1908         add     rbx, 64
   1909         sub     rsi, 2
   1910 3:
   1911         test    esi, 0x1
   1912         je      4b
   1913         movups  xmm0, xmmword ptr [rcx]
   1914         movups  xmm1, xmmword ptr [rcx+0x10]
   1915         movd    xmm13, dword ptr [rsp+0x110]
   1916         movd    xmm14, dword ptr [rsp+0x120]
   1917         punpckldq xmm13, xmm14
   1918         mov     r8, qword ptr [rdi]
   1919         movzx   eax, byte ptr [rbp+0x80]
   1920         or      eax, r13d
   1921         xor     edx, edx
   1922 2:
   1923         mov     r14d, eax
   1924         or      eax, r12d
   1925         add     rdx, 64
   1926         cmp     rdx, r15
   1927         cmovne  eax, r14d
   1928         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
   1929         shl     rax, 32
   1930         or      rax, 64
   1931         movd    xmm12, rax
   1932         movdqa  xmm3, xmm13
   1933         punpcklqdq xmm3, xmm12
   1934         movups  xmm4, xmmword ptr [r8+rdx-0x40]
   1935         movups  xmm5, xmmword ptr [r8+rdx-0x30]
   1936         movaps  xmm8, xmm4
   1937         shufps  xmm4, xmm5, 136
   1938         shufps  xmm8, xmm5, 221
   1939         movaps  xmm5, xmm8
   1940         movups  xmm6, xmmword ptr [r8+rdx-0x20]
   1941         movups  xmm7, xmmword ptr [r8+rdx-0x10]
   1942         movaps  xmm8, xmm6
   1943         shufps  xmm6, xmm7, 136
   1944         pshufd  xmm6, xmm6, 0x93
   1945         shufps  xmm8, xmm7, 221
   1946         pshufd  xmm7, xmm8, 0x93
   1947         mov     al, 7
   1948 9:
   1949         paddd   xmm0, xmm4
   1950         paddd   xmm0, xmm1
   1951         pxor    xmm3, xmm0
   1952         pshuflw xmm3, xmm3, 0xB1
   1953         pshufhw xmm3, xmm3, 0xB1
   1954         paddd   xmm2, xmm3
   1955         pxor    xmm1, xmm2
   1956         movdqa  xmm11, xmm1
   1957         pslld   xmm1, 20
   1958         psrld   xmm11, 12
   1959         por     xmm1, xmm11
   1960         paddd   xmm0, xmm5
   1961         paddd   xmm0, xmm1
   1962         pxor    xmm3, xmm0
   1963         movdqa  xmm14, xmm3
   1964         psrld   xmm3, 8
   1965         pslld   xmm14, 24
   1966         pxor    xmm3, xmm14
   1967         paddd   xmm2, xmm3
   1968         pxor    xmm1, xmm2
   1969         movdqa  xmm11, xmm1
   1970         pslld   xmm1, 25
   1971         psrld   xmm11, 7
   1972         por     xmm1, xmm11
   1973         pshufd  xmm0, xmm0, 0x93
   1974         pshufd  xmm3, xmm3, 0x4E
   1975         pshufd  xmm2, xmm2, 0x39
   1976         paddd   xmm0, xmm6
   1977         paddd   xmm0, xmm1
   1978         pxor    xmm3, xmm0
   1979         pshuflw xmm3, xmm3, 0xB1
   1980         pshufhw xmm3, xmm3, 0xB1
   1981         paddd   xmm2, xmm3
   1982         pxor    xmm1, xmm2
   1983         movdqa  xmm11, xmm1
   1984         pslld   xmm1, 20
   1985         psrld   xmm11, 12
   1986         por     xmm1, xmm11
   1987         paddd   xmm0, xmm7
   1988         paddd   xmm0, xmm1
   1989         pxor    xmm3, xmm0
   1990         movdqa  xmm14, xmm3
   1991         psrld   xmm3, 8
   1992         pslld   xmm14, 24
   1993         pxor    xmm3, xmm14
   1994         paddd   xmm2, xmm3
   1995         pxor    xmm1, xmm2
   1996         movdqa  xmm11, xmm1
   1997         pslld   xmm1, 25
   1998         psrld   xmm11, 7
   1999         por     xmm1, xmm11
   2000         pshufd  xmm0, xmm0, 0x39
   2001         pshufd  xmm3, xmm3, 0x4E
   2002         pshufd  xmm2, xmm2, 0x93
   2003         dec     al
   2004         jz      9f
   2005         movdqa  xmm8, xmm4
   2006         shufps  xmm8, xmm5, 214
   2007         pshufd  xmm9, xmm4, 0x0F
   2008         pshufd  xmm4, xmm8, 0x39
   2009         movdqa  xmm8, xmm6
   2010         shufps  xmm8, xmm7, 250
   2011         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
   2012         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   2013         por     xmm9, xmm8
   2014         movdqa  xmm8, xmm7
   2015         punpcklqdq xmm8, xmm5
   2016         movdqa  xmm10, xmm6
   2017         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   2018         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   2019         por     xmm8, xmm10
   2020         pshufd  xmm8, xmm8, 0x78
   2021         punpckhdq xmm5, xmm7
   2022         punpckldq xmm6, xmm5
   2023         pshufd  xmm7, xmm6, 0x1E
   2024         movdqa  xmm5, xmm9
   2025         movdqa  xmm6, xmm8
   2026         jmp     9b
   2027 9:
   2028         pxor    xmm0, xmm2
   2029         pxor    xmm1, xmm3
   2030         mov     eax, r13d
   2031         cmp     rdx, r15
   2032         jne     2b
   2033         movups  xmmword ptr [rbx], xmm0
   2034         movups  xmmword ptr [rbx+0x10], xmm1
   2035         jmp     4b
   2036 
   2037 .p2align 6
        /*
         * blake3_compress_in_place_sse2 -- SSE2 single-block compression;
         * the result is written back over the input state.
         * ABI: Microsoft x64 (args in rcx, rdx, r8, r9, then caller stack).
         *
         * In:  rcx   -> 32-byte state; loaded into xmm0/xmm1, overwritten on exit
         *      rdx   -> 64-byte message block (read with unaligned loads)
         *      r8b   =  byte placed in dword 2 of xmm3
         *      r9    =  64-bit value placed in dwords 0-1 of xmm3
         *      arg 5 =  byte read from [rsp+0xA0] once the frame is set up
         *               (entry rsp + 0x28), placed in dword 3 of xmm3
         *      NOTE(review): in the BLAKE3 C prototype these are presumably
         *      cv, block, block_len, counter and flags -- confirm there.
         * Out: 32 bytes at [rcx] replaced; no register return value.
         * Clobbers: rax, r8, xmm0-xmm5, flags.
         * Preserves: xmm6-xmm11 and xmm14 are used as scratch, so they are
         *      spilled to the local frame and reloaded before returning
         *      (xmm6-xmm15 are callee-saved in this ABI). xmm10 belongs to
         *      that set because the message permutation below writes it.
         *      xmm12, xmm13 and xmm15 are never touched by this routine.
         */
   2038 blake3_compress_in_place_sse2:
   2039 _blake3_compress_in_place_sse2:
   2040         sub     rsp, 120                        /* 7 x 16-byte save slots + 8 so rsp is 16-byte aligned for movdqa */
   2041         movdqa  xmmword ptr [rsp], xmm6
   2042         movdqa  xmmword ptr [rsp+0x10], xmm7
   2043         movdqa  xmmword ptr [rsp+0x20], xmm8
   2044         movdqa  xmmword ptr [rsp+0x30], xmm9
   2045         movdqa  xmmword ptr [rsp+0x40], xmm10
   2046         movdqa  xmmword ptr [rsp+0x50], xmm11
   2047         movdqa  xmmword ptr [rsp+0x60], xmm14
                /* State rows: xmm0, xmm1 = input state; xmm2 = IV; xmm3 = {r9 lo, r9 hi, r8b, arg 5}. */
   2048         movups  xmm0, xmmword ptr [rcx]
   2049         movups  xmm1, xmmword ptr [rcx+0x10]
   2050         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
   2051         movzx   eax, byte ptr [rsp+0xA0]        /* fifth argument (stack) */
   2052         movzx   r8d, r8b
   2053         shl     rax, 32
   2054         add     r8, rax                         /* r8 = (arg 5 << 32) | r8b */
   2055         movq    xmm3, r9
   2056         movq    xmm4, r8
   2057         punpcklqdq xmm3, xmm4
                /* Load the block and split it: xmm4/xmm5 = even/odd dwords of bytes 0-31, */
                /* xmm6/xmm7 = even/odd dwords of bytes 32-63, each rotated one lane (0x93). */
   2058         movups  xmm4, xmmword ptr [rdx]
   2059         movups  xmm5, xmmword ptr [rdx+0x10]
   2060         movaps  xmm8, xmm4
   2061         shufps  xmm4, xmm5, 136
   2062         shufps  xmm8, xmm5, 221
   2063         movaps  xmm5, xmm8
   2064         movups  xmm6, xmmword ptr [rdx+0x20]
   2065         movups  xmm7, xmmword ptr [rdx+0x30]
   2066         movaps  xmm8, xmm6
   2067         shufps  xmm6, xmm7, 136
   2068         pshufd  xmm6, xmm6, 0x93
   2069         shufps  xmm8, xmm7, 221
   2070         pshufd  xmm7, xmm8, 0x93
   2071         mov     al, 7                           /* round counter: the loop body runs 7 times */
   2072 9:
                /* First half-round: mix in xmm4 then xmm5. */
   2073         paddd   xmm0, xmm4
   2074         paddd   xmm0, xmm1
   2075         pxor    xmm3, xmm0
   2076         pshuflw xmm3, xmm3, 0xB1                /* with the pshufhw below: rotate each dword by 16 */
   2077         pshufhw xmm3, xmm3, 0xB1
   2078         paddd   xmm2, xmm3
   2079         pxor    xmm1, xmm2
   2080         movdqa  xmm11, xmm1                     /* xmm1 = rotate-right(xmm1, 12) via shift pair + por */
   2081         pslld   xmm1, 20
   2082         psrld   xmm11, 12
   2083         por     xmm1, xmm11
   2084         paddd   xmm0, xmm5
   2085         paddd   xmm0, xmm1
   2086         pxor    xmm3, xmm0
   2087         movdqa  xmm14, xmm3                     /* xmm3 = rotate-right(xmm3, 8) */
   2088         psrld   xmm3, 8
   2089         pslld   xmm14, 24
   2090         pxor    xmm3, xmm14
   2091         paddd   xmm2, xmm3
   2092         pxor    xmm1, xmm2
   2093         movdqa  xmm11, xmm1                     /* xmm1 = rotate-right(xmm1, 7) */
   2094         pslld   xmm1, 25
   2095         psrld   xmm11, 7
   2096         por     xmm1, xmm11
                /* Rotate the lanes of rows xmm0/xmm3/xmm2 so the second half-round pairs different lanes. */
   2097         pshufd  xmm0, xmm0, 0x93
   2098         pshufd  xmm3, xmm3, 0x4E
   2099         pshufd  xmm2, xmm2, 0x39
                /* Second half-round: same sequence, mixing in xmm6 then xmm7. */
   2100         paddd   xmm0, xmm6
   2101         paddd   xmm0, xmm1
   2102         pxor    xmm3, xmm0
   2103         pshuflw xmm3, xmm3, 0xB1
   2104         pshufhw xmm3, xmm3, 0xB1
   2105         paddd   xmm2, xmm3
   2106         pxor    xmm1, xmm2
   2107         movdqa  xmm11, xmm1
   2108         pslld   xmm1, 20
   2109         psrld   xmm11, 12
   2110         por     xmm1, xmm11
   2111         paddd   xmm0, xmm7
   2112         paddd   xmm0, xmm1
   2113         pxor    xmm3, xmm0
   2114         movdqa  xmm14, xmm3
   2115         psrld   xmm3, 8
   2116         pslld   xmm14, 24
   2117         pxor    xmm3, xmm14
   2118         paddd   xmm2, xmm3
   2119         pxor    xmm1, xmm2
   2120         movdqa  xmm11, xmm1
   2121         pslld   xmm1, 25
   2122         psrld   xmm11, 7
   2123         por     xmm1, xmm11
                /* Undo the lane rotation applied before the second half-round. */
   2124         pshufd  xmm0, xmm0, 0x39
   2125         pshufd  xmm3, xmm3, 0x4E
   2126         pshufd  xmm2, xmm2, 0x93
   2127         dec     al
   2128         jz      9f                              /* after the 7th round skip the permutation */
                /* Rearrange the message words in xmm4-xmm7 for the next round. Each */
                /* pand/pand/por triple selects dwords from two sources using the */
                /* PBLENDW_* masks. Scratch: xmm8, xmm9, xmm10. */
   2129         movdqa  xmm8, xmm4
   2130         shufps  xmm8, xmm5, 214
   2131         pshufd  xmm9, xmm4, 0x0F
   2132         pshufd  xmm4, xmm8, 0x39
   2133         movdqa  xmm8, xmm6
   2134         shufps  xmm8, xmm7, 250
   2135         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
   2136         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   2137         por     xmm9, xmm8
   2138         movdqa  xmm8, xmm7
   2139         punpcklqdq xmm8, xmm5
   2140         movdqa  xmm10, xmm6
   2141         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   2142         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   2143         por     xmm8, xmm10
   2144         pshufd  xmm8, xmm8, 0x78
   2145         punpckhdq xmm5, xmm7
   2146         punpckldq xmm6, xmm5
   2147         pshufd  xmm7, xmm6, 0x1E
   2148         movdqa  xmm5, xmm9
   2149         movdqa  xmm6, xmm8
   2150         jmp     9b
   2151 9:
                /* Fold rows 2-3 into rows 0-1 and store the 32-byte result over the input state. */
   2152         pxor    xmm0, xmm2
   2153         pxor    xmm1, xmm3
   2154         movups  xmmword ptr [rcx], xmm0
   2155         movups  xmmword ptr [rcx+0x10], xmm1
                /* Reload every callee-saved vector register this routine modified. */
   2156         movdqa  xmm6, xmmword ptr [rsp]
   2157         movdqa  xmm7, xmmword ptr [rsp+0x10]
   2158         movdqa  xmm8, xmmword ptr [rsp+0x20]
   2159         movdqa  xmm9, xmmword ptr [rsp+0x30]
   2160         movdqa  xmm10, xmmword ptr [rsp+0x40]
   2161         movdqa  xmm11, xmmword ptr [rsp+0x50]
   2162         movdqa  xmm14, xmmword ptr [rsp+0x60]
   2163         add     rsp, 120
   2164         ret
   2165 
   2166 
   2167 .p2align 6
        /*
         * blake3_compress_xof_sse2 -- SSE2 single-block compression that
         * writes a 64-byte result to a separate output buffer and leaves the
         * input state untouched.
         * ABI: Microsoft x64 (args in rcx, rdx, r8, r9, then caller stack).
         *
         * In:  rcx   -> 32-byte state; read only (loaded at entry and again at exit)
         *      rdx   -> 64-byte message block (read with unaligned loads)
         *      r8b   =  byte placed in dword 2 of xmm3
         *      r9    =  64-bit value placed in dwords 0-1 of xmm3
         *      arg 5 =  byte read from [rsp+0xA0] once the frame is set up
         *               (entry rsp + 0x28), placed in dword 3 of xmm3
         *      arg 6 =  qword read from [rsp+0xA8] (entry rsp + 0x30):
         *               pointer to the 64-byte output buffer
         *      NOTE(review): in the BLAKE3 C prototype these are presumably
         *      cv, block, block_len, counter, flags and out -- confirm there.
         * Out: 64 bytes stored at the arg 6 pointer; no register return value.
         * Clobbers: rax, r8, r10, xmm0-xmm5, flags.
         * Preserves: xmm6-xmm11 and xmm14 are used as scratch, so they are
         *      spilled to the local frame and reloaded before returning
         *      (xmm6-xmm15 are callee-saved in this ABI). xmm10 belongs to
         *      that set because the message permutation below writes it.
         *      xmm12, xmm13 and xmm15 are never touched by this routine.
         */
   2168 _blake3_compress_xof_sse2:
   2169 blake3_compress_xof_sse2:
   2170         sub     rsp, 120                        /* 7 x 16-byte save slots + 8 so rsp is 16-byte aligned for movdqa */
   2171         movdqa  xmmword ptr [rsp], xmm6
   2172         movdqa  xmmword ptr [rsp+0x10], xmm7
   2173         movdqa  xmmword ptr [rsp+0x20], xmm8
   2174         movdqa  xmmword ptr [rsp+0x30], xmm9
   2175         movdqa  xmmword ptr [rsp+0x40], xmm10
   2176         movdqa  xmmword ptr [rsp+0x50], xmm11
   2177         movdqa  xmmword ptr [rsp+0x60], xmm14
                /* State rows: xmm0, xmm1 = input state; xmm2 = IV; xmm3 = {r9 lo, r9 hi, r8b, arg 5}. */
   2178         movups  xmm0, xmmword ptr [rcx]
   2179         movups  xmm1, xmmword ptr [rcx+0x10]
   2180         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
   2181         movzx   eax, byte ptr [rsp+0xA0]        /* fifth argument (stack) */
   2182         movzx   r8d, r8b
   2183         mov     r10, qword ptr [rsp+0xA8]       /* sixth argument (stack): output pointer */
   2184         shl     rax, 32
   2185         add     r8, rax                         /* r8 = (arg 5 << 32) | r8b */
   2186         movq    xmm3, r9
   2187         movq    xmm4, r8
   2188         punpcklqdq xmm3, xmm4
                /* Load the block and split it: xmm4/xmm5 = even/odd dwords of bytes 0-31, */
                /* xmm6/xmm7 = even/odd dwords of bytes 32-63, each rotated one lane (0x93). */
   2189         movups  xmm4, xmmword ptr [rdx]
   2190         movups  xmm5, xmmword ptr [rdx+0x10]
   2191         movaps  xmm8, xmm4
   2192         shufps  xmm4, xmm5, 136
   2193         shufps  xmm8, xmm5, 221
   2194         movaps  xmm5, xmm8
   2195         movups  xmm6, xmmword ptr [rdx+0x20]
   2196         movups  xmm7, xmmword ptr [rdx+0x30]
   2197         movaps  xmm8, xmm6
   2198         shufps  xmm6, xmm7, 136
   2199         pshufd  xmm6, xmm6, 0x93
   2200         shufps  xmm8, xmm7, 221
   2201         pshufd  xmm7, xmm8, 0x93
   2202         mov     al, 7                           /* round counter: the loop body runs 7 times */
   2203 9:
                /* First half-round: mix in xmm4 then xmm5. */
   2204         paddd   xmm0, xmm4
   2205         paddd   xmm0, xmm1
   2206         pxor    xmm3, xmm0
   2207         pshuflw xmm3, xmm3, 0xB1                /* with the pshufhw below: rotate each dword by 16 */
   2208         pshufhw xmm3, xmm3, 0xB1
   2209         paddd   xmm2, xmm3
   2210         pxor    xmm1, xmm2
   2211         movdqa  xmm11, xmm1                     /* xmm1 = rotate-right(xmm1, 12) via shift pair + por */
   2212         pslld   xmm1, 20
   2213         psrld   xmm11, 12
   2214         por     xmm1, xmm11
   2215         paddd   xmm0, xmm5
   2216         paddd   xmm0, xmm1
   2217         pxor    xmm3, xmm0
   2218         movdqa  xmm14, xmm3                     /* xmm3 = rotate-right(xmm3, 8) */
   2219         psrld   xmm3, 8
   2220         pslld   xmm14, 24
   2221         pxor    xmm3, xmm14
   2222         paddd   xmm2, xmm3
   2223         pxor    xmm1, xmm2
   2224         movdqa  xmm11, xmm1                     /* xmm1 = rotate-right(xmm1, 7) */
   2225         pslld   xmm1, 25
   2226         psrld   xmm11, 7
   2227         por     xmm1, xmm11
                /* Rotate the lanes of rows xmm0/xmm3/xmm2 so the second half-round pairs different lanes. */
   2228         pshufd  xmm0, xmm0, 0x93
   2229         pshufd  xmm3, xmm3, 0x4E
   2230         pshufd  xmm2, xmm2, 0x39
                /* Second half-round: same sequence, mixing in xmm6 then xmm7. */
   2231         paddd   xmm0, xmm6
   2232         paddd   xmm0, xmm1
   2233         pxor    xmm3, xmm0
   2234         pshuflw xmm3, xmm3, 0xB1
   2235         pshufhw xmm3, xmm3, 0xB1
   2236         paddd   xmm2, xmm3
   2237         pxor    xmm1, xmm2
   2238         movdqa  xmm11, xmm1
   2239         pslld   xmm1, 20
   2240         psrld   xmm11, 12
   2241         por     xmm1, xmm11
   2242         paddd   xmm0, xmm7
   2243         paddd   xmm0, xmm1
   2244         pxor    xmm3, xmm0
   2245         movdqa  xmm14, xmm3
   2246         psrld   xmm3, 8
   2247         pslld   xmm14, 24
   2248         pxor    xmm3, xmm14
   2249         paddd   xmm2, xmm3
   2250         pxor    xmm1, xmm2
   2251         movdqa  xmm11, xmm1
   2252         pslld   xmm1, 25
   2253         psrld   xmm11, 7
   2254         por     xmm1, xmm11
                /* Undo the lane rotation applied before the second half-round. */
   2255         pshufd  xmm0, xmm0, 0x39
   2256         pshufd  xmm3, xmm3, 0x4E
   2257         pshufd  xmm2, xmm2, 0x93
   2258         dec     al
   2259         jz      9f                              /* after the 7th round skip the permutation */
                /* Rearrange the message words in xmm4-xmm7 for the next round. Each */
                /* pand/pand/por triple selects dwords from two sources using the */
                /* PBLENDW_* masks. Scratch: xmm8, xmm9, xmm10. */
   2260         movdqa  xmm8, xmm4
   2261         shufps  xmm8, xmm5, 214
   2262         pshufd  xmm9, xmm4, 0x0F
   2263         pshufd  xmm4, xmm8, 0x39
   2264         movdqa  xmm8, xmm6
   2265         shufps  xmm8, xmm7, 250
   2266         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
   2267         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
   2268         por     xmm9, xmm8
   2269         movdqa  xmm8, xmm7
   2270         punpcklqdq xmm8, xmm5
   2271         movdqa  xmm10, xmm6
   2272         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
   2273         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
   2274         por     xmm8, xmm10
   2275         pshufd  xmm8, xmm8, 0x78
   2276         punpckhdq xmm5, xmm7
   2277         punpckldq xmm6, xmm5
   2278         pshufd  xmm7, xmm6, 0x1E
   2279         movdqa  xmm5, xmm9
   2280         movdqa  xmm6, xmm8
   2281         jmp     9b
   2282 9:
                /* Output bytes 0-31 = rows 0-1 xor rows 2-3; bytes 32-63 = rows 2-3 xor the original input state. */
   2283         movdqu  xmm4, xmmword ptr [rcx]
   2284         movdqu  xmm5, xmmword ptr [rcx+0x10]
   2285         pxor    xmm0, xmm2
   2286         pxor    xmm1, xmm3
   2287         pxor    xmm2, xmm4
   2288         pxor    xmm3, xmm5
   2289         movups  xmmword ptr [r10], xmm0
   2290         movups  xmmword ptr [r10+0x10], xmm1
   2291         movups  xmmword ptr [r10+0x20], xmm2
   2292         movups  xmmword ptr [r10+0x30], xmm3
                /* Reload every callee-saved vector register this routine modified. */
   2293         movdqa  xmm6, xmmword ptr [rsp]
   2294         movdqa  xmm7, xmmword ptr [rsp+0x10]
   2295         movdqa  xmm8, xmmword ptr [rsp+0x20]
   2296         movdqa  xmm9, xmmword ptr [rsp+0x30]
   2297         movdqa  xmm10, xmmword ptr [rsp+0x40]
   2298         movdqa  xmm11, xmmword ptr [rsp+0x50]
   2299         movdqa  xmm14, xmmword ptr [rsp+0x60]
   2300         add     rsp, 120
   2301         ret
   2302 
   2303 
   2304 .section .rodata
   2305 .p2align  6
        /*
         * Read-only constant tables. The section data starts on a 64-byte
         * boundary and every table below is exactly 16 bytes, so each label is
         * 16-byte aligned. The code references them as aligned 128-bit memory
         * operands (movaps / pand / pxor with [label+rip]), so keep every
         * table at four dwords and do not insert odd-sized data between them.
         */
        /* Four 32-bit IV words; the compress routines load them into xmm2 with movaps. */
   2306 BLAKE3_IV:
   2307         .long  0x6A09E667, 0xBB67AE85
   2308         .long  0x3C6EF372, 0xA54FF53A
        /* Per-lane offsets 0..3; ANDed with a broadcast mask in the blake3_hash_many_sse2 prologue. */
   2309 ADD0:   
   2310         .long  0, 1, 2, 3
        /* Step of 4 per lane; the masked copy is added to the vector at [rsp+0x110] after each group of four inputs. */
   2311 ADD1:
   2312         .long  4, 4, 4, 4
        /* Each IV word broadcast to all four lanes. Not referenced in the compress routines; */
        /* presumably used by the 4-way path of blake3_hash_many_sse2 -- confirm. */
   2313 BLAKE3_IV_0:
   2314         .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
   2315 BLAKE3_IV_1:
   2316         .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
   2317 BLAKE3_IV_2:
   2318         .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
   2319 BLAKE3_IV_3:
   2320         .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
        /* The value 64 in every lane; presumably the block length for the 4-way path -- confirm. */
   2321 BLAKE3_BLOCK_LEN:
   2322         .long  64, 64, 64, 64
        /* Sign-bit mask: XORed into both operands before pcmpgtd so the signed */
        /* compare behaves as an unsigned one (used to detect carry out of the counter add). */
   2323 CMP_MSB_MASK:
   2324         .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
        /* Dword-select masks used in pand/pand/por triples. The hex in each name is a */
        /* per-16-bit-word bit mask (pblendw-style immediate): 0x33 keeps dwords 0 and 2, */
        /* 0xCC keeps dwords 1 and 3, 0x3F keeps dwords 0-2, 0xC0 keeps dword 3. */
   2325 PBLENDW_0x33_MASK:
   2326         .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
   2327 PBLENDW_0xCC_MASK:
   2328         .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
   2329 PBLENDW_0x3F_MASK:
   2330         .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
   2331 PBLENDW_0xC0_MASK:
   2332         .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF