chibipub

experimental activitypub node in C
git clone git://jb55.com/chibipub
Log | Files | Refs | README | LICENSE

blake3_sse41_x86-64_windows_msvc.asm (63194B)


      1 public _blake3_hash_many_sse41                  ; exported symbols: each routine name is made public twice,
      2 public blake3_hash_many_sse41                   ;   once plain and once with a leading underscore
      3 public blake3_compress_in_place_sse41           ; NOTE(review): the definitions of the compress_* routines are
      4 public _blake3_compress_in_place_sse41          ;   not visible in this part of the file - confirm they exist below
      5 public blake3_compress_xof_sse41
      6 public _blake3_compress_xof_sse41
      7 
      8 _TEXT   SEGMENT ALIGN(16) 'CODE'                ; open segment _TEXT, class 'CODE', with 16-byte segment alignment
      9 
     10 ALIGN   16                                      ; pad to a 16-byte boundary so the PROC label that follows starts aligned
     11 blake3_hash_many_sse41 PROC
     12 _blake3_hash_many_sse41 PROC
     13         push    r15
     14         push    r14
     15         push    r13
     16         push    r12
     17         push    rsi
     18         push    rdi
     19         push    rbx
     20         push    rbp
     21         mov     rbp, rsp
     22         sub     rsp, 528
     23         and     rsp, 0FFFFFFFFFFFFFFC0H
     24         movdqa  xmmword ptr [rsp+170H], xmm6
     25         movdqa  xmmword ptr [rsp+180H], xmm7
     26         movdqa  xmmword ptr [rsp+190H], xmm8
     27         movdqa  xmmword ptr [rsp+1A0H], xmm9
     28         movdqa  xmmword ptr [rsp+1B0H], xmm10
     29         movdqa  xmmword ptr [rsp+1C0H], xmm11
     30         movdqa  xmmword ptr [rsp+1D0H], xmm12
     31         movdqa  xmmword ptr [rsp+1E0H], xmm13
     32         movdqa  xmmword ptr [rsp+1F0H], xmm14
     33         movdqa  xmmword ptr [rsp+200H], xmm15
     34         mov     rdi, rcx
     35         mov     rsi, rdx
     36         mov     rdx, r8
     37         mov     rcx, r9
     38         mov     r8, qword ptr [rbp+68H]
     39         movzx   r9, byte ptr [rbp+70H]
     40         neg     r9d
     41         movd    xmm0, r9d
     42         pshufd  xmm0, xmm0, 00H
     43         movdqa  xmmword ptr [rsp+130H], xmm0
     44         movdqa  xmm1, xmm0
     45         pand    xmm1, xmmword ptr [ADD0]
     46         pand    xmm0, xmmword ptr [ADD1]
     47         movdqa  xmmword ptr [rsp+150H], xmm0
     48         movd    xmm0, r8d
     49         pshufd  xmm0, xmm0, 00H
     50         paddd   xmm0, xmm1
     51         movdqa  xmmword ptr [rsp+110H], xmm0
     52         pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
     53         pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
     54         pcmpgtd xmm1, xmm0
     55         shr     r8, 32
     56         movd    xmm2, r8d
     57         pshufd  xmm2, xmm2, 00H
     58         psubd   xmm2, xmm1
     59         movdqa  xmmword ptr [rsp+120H], xmm2
     60         mov     rbx, qword ptr [rbp+90H]
     61         mov     r15, rdx
     62         shl     r15, 6
     63         movzx   r13d, byte ptr [rbp+78H]
     64         movzx   r12d, byte ptr [rbp+88H]
     65         cmp     rsi, 4
     66         jc      final3blocks
     67 outerloop4:
     68         movdqu  xmm3, xmmword ptr [rcx]
     69         pshufd  xmm0, xmm3, 00H
     70         pshufd  xmm1, xmm3, 55H
     71         pshufd  xmm2, xmm3, 0AAH
     72         pshufd  xmm3, xmm3, 0FFH
     73         movdqu  xmm7, xmmword ptr [rcx+10H]
     74         pshufd  xmm4, xmm7, 00H
     75         pshufd  xmm5, xmm7, 55H
     76         pshufd  xmm6, xmm7, 0AAH
     77         pshufd  xmm7, xmm7, 0FFH
     78         mov     r8, qword ptr [rdi]
     79         mov     r9, qword ptr [rdi+8H]
     80         mov     r10, qword ptr [rdi+10H]
     81         mov     r11, qword ptr [rdi+18H]
     82         movzx   eax, byte ptr [rbp+80H]
     83         or      eax, r13d
     84         xor     edx, edx
     85 innerloop4:
     86         mov     r14d, eax
     87         or      eax, r12d
     88         add     rdx, 64
     89         cmp     rdx, r15
     90         cmovne  eax, r14d
     91         movdqu  xmm8, xmmword ptr [r8+rdx-40H]
     92         movdqu  xmm9, xmmword ptr [r9+rdx-40H]
     93         movdqu  xmm10, xmmword ptr [r10+rdx-40H]
     94         movdqu  xmm11, xmmword ptr [r11+rdx-40H]
     95         movdqa  xmm12, xmm8
     96         punpckldq xmm8, xmm9
     97         punpckhdq xmm12, xmm9
     98         movdqa  xmm14, xmm10
     99         punpckldq xmm10, xmm11
    100         punpckhdq xmm14, xmm11
    101         movdqa  xmm9, xmm8
    102         punpcklqdq xmm8, xmm10
    103         punpckhqdq xmm9, xmm10
    104         movdqa  xmm13, xmm12
    105         punpcklqdq xmm12, xmm14
    106         punpckhqdq xmm13, xmm14
    107         movdqa  xmmword ptr [rsp], xmm8
    108         movdqa  xmmword ptr [rsp+10H], xmm9
    109         movdqa  xmmword ptr [rsp+20H], xmm12
    110         movdqa  xmmword ptr [rsp+30H], xmm13
    111         movdqu  xmm8, xmmword ptr [r8+rdx-30H]
    112         movdqu  xmm9, xmmword ptr [r9+rdx-30H]
    113         movdqu  xmm10, xmmword ptr [r10+rdx-30H]
    114         movdqu  xmm11, xmmword ptr [r11+rdx-30H]
    115         movdqa  xmm12, xmm8
    116         punpckldq xmm8, xmm9
    117         punpckhdq xmm12, xmm9
    118         movdqa  xmm14, xmm10
    119         punpckldq xmm10, xmm11
    120         punpckhdq xmm14, xmm11
    121         movdqa  xmm9, xmm8
    122         punpcklqdq xmm8, xmm10
    123         punpckhqdq xmm9, xmm10
    124         movdqa  xmm13, xmm12
    125         punpcklqdq xmm12, xmm14
    126         punpckhqdq xmm13, xmm14
    127         movdqa  xmmword ptr [rsp+40H], xmm8
    128         movdqa  xmmword ptr [rsp+50H], xmm9
    129         movdqa  xmmword ptr [rsp+60H], xmm12
    130         movdqa  xmmword ptr [rsp+70H], xmm13
    131         movdqu  xmm8, xmmword ptr [r8+rdx-20H]
    132         movdqu  xmm9, xmmword ptr [r9+rdx-20H]
    133         movdqu  xmm10, xmmword ptr [r10+rdx-20H]
    134         movdqu  xmm11, xmmword ptr [r11+rdx-20H]
    135         movdqa  xmm12, xmm8
    136         punpckldq xmm8, xmm9
    137         punpckhdq xmm12, xmm9
    138         movdqa  xmm14, xmm10
    139         punpckldq xmm10, xmm11
    140         punpckhdq xmm14, xmm11
    141         movdqa  xmm9, xmm8
    142         punpcklqdq xmm8, xmm10
    143         punpckhqdq xmm9, xmm10
    144         movdqa  xmm13, xmm12
    145         punpcklqdq xmm12, xmm14
    146         punpckhqdq xmm13, xmm14
    147         movdqa  xmmword ptr [rsp+80H], xmm8
    148         movdqa  xmmword ptr [rsp+90H], xmm9
    149         movdqa  xmmword ptr [rsp+0A0H], xmm12
    150         movdqa  xmmword ptr [rsp+0B0H], xmm13
    151         movdqu  xmm8, xmmword ptr [r8+rdx-10H]
    152         movdqu  xmm9, xmmword ptr [r9+rdx-10H]
    153         movdqu  xmm10, xmmword ptr [r10+rdx-10H]
    154         movdqu  xmm11, xmmword ptr [r11+rdx-10H]
    155         movdqa  xmm12, xmm8
    156         punpckldq xmm8, xmm9
    157         punpckhdq xmm12, xmm9
    158         movdqa  xmm14, xmm10
    159         punpckldq xmm10, xmm11
    160         punpckhdq xmm14, xmm11
    161         movdqa  xmm9, xmm8
    162         punpcklqdq xmm8, xmm10
    163         punpckhqdq xmm9, xmm10
    164         movdqa  xmm13, xmm12
    165         punpcklqdq xmm12, xmm14
    166         punpckhqdq xmm13, xmm14
    167         movdqa  xmmword ptr [rsp+0C0H], xmm8
    168         movdqa  xmmword ptr [rsp+0D0H], xmm9
    169         movdqa  xmmword ptr [rsp+0E0H], xmm12
    170         movdqa  xmmword ptr [rsp+0F0H], xmm13
    171         movdqa  xmm9, xmmword ptr [BLAKE3_IV_1]
    172         movdqa  xmm10, xmmword ptr [BLAKE3_IV_2]
    173         movdqa  xmm11, xmmword ptr [BLAKE3_IV_3]
    174         movdqa  xmm12, xmmword ptr [rsp+110H]
    175         movdqa  xmm13, xmmword ptr [rsp+120H]
    176         movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
    177         movd    xmm15, eax
    178         pshufd  xmm15, xmm15, 00H
    179         prefetcht0 byte ptr [r8+rdx+80H]
    180         prefetcht0 byte ptr [r9+rdx+80H]
    181         prefetcht0 byte ptr [r10+rdx+80H]
    182         prefetcht0 byte ptr [r11+rdx+80H]
    183         paddd   xmm0, xmmword ptr [rsp]
    184         paddd   xmm1, xmmword ptr [rsp+20H]
    185         paddd   xmm2, xmmword ptr [rsp+40H]
    186         paddd   xmm3, xmmword ptr [rsp+60H]
    187         paddd   xmm0, xmm4
    188         paddd   xmm1, xmm5
    189         paddd   xmm2, xmm6
    190         paddd   xmm3, xmm7
    191         pxor    xmm12, xmm0
    192         pxor    xmm13, xmm1
    193         pxor    xmm14, xmm2
    194         pxor    xmm15, xmm3
    195         movdqa  xmm8, xmmword ptr [ROT16]
    196         pshufb  xmm12, xmm8
    197         pshufb  xmm13, xmm8
    198         pshufb  xmm14, xmm8
    199         pshufb  xmm15, xmm8
    200         movdqa  xmm8, xmmword ptr [BLAKE3_IV_0]
    201         paddd   xmm8, xmm12
    202         paddd   xmm9, xmm13
    203         paddd   xmm10, xmm14
    204         paddd   xmm11, xmm15
    205         pxor    xmm4, xmm8
    206         pxor    xmm5, xmm9
    207         pxor    xmm6, xmm10
    208         pxor    xmm7, xmm11
    209         movdqa  xmmword ptr [rsp+100H], xmm8
    210         movdqa  xmm8, xmm4
    211         psrld   xmm8, 12
    212         pslld   xmm4, 20
    213         por     xmm4, xmm8
    214         movdqa  xmm8, xmm5
    215         psrld   xmm8, 12
    216         pslld   xmm5, 20
    217         por     xmm5, xmm8
    218         movdqa  xmm8, xmm6
    219         psrld   xmm8, 12
    220         pslld   xmm6, 20
    221         por     xmm6, xmm8
    222         movdqa  xmm8, xmm7
    223         psrld   xmm8, 12
    224         pslld   xmm7, 20
    225         por     xmm7, xmm8
    226         paddd   xmm0, xmmword ptr [rsp+10H]
    227         paddd   xmm1, xmmword ptr [rsp+30H]
    228         paddd   xmm2, xmmword ptr [rsp+50H]
    229         paddd   xmm3, xmmword ptr [rsp+70H]
    230         paddd   xmm0, xmm4
    231         paddd   xmm1, xmm5
    232         paddd   xmm2, xmm6
    233         paddd   xmm3, xmm7
    234         pxor    xmm12, xmm0
    235         pxor    xmm13, xmm1
    236         pxor    xmm14, xmm2
    237         pxor    xmm15, xmm3
    238         movdqa  xmm8, xmmword ptr [ROT8]
    239         pshufb  xmm12, xmm8
    240         pshufb  xmm13, xmm8
    241         pshufb  xmm14, xmm8
    242         pshufb  xmm15, xmm8
    243         movdqa  xmm8, xmmword ptr [rsp+100H]
    244         paddd   xmm8, xmm12
    245         paddd   xmm9, xmm13
    246         paddd   xmm10, xmm14
    247         paddd   xmm11, xmm15
    248         pxor    xmm4, xmm8
    249         pxor    xmm5, xmm9
    250         pxor    xmm6, xmm10
    251         pxor    xmm7, xmm11
    252         movdqa  xmmword ptr [rsp+100H], xmm8
    253         movdqa  xmm8, xmm4
    254         psrld   xmm8, 7
    255         pslld   xmm4, 25
    256         por     xmm4, xmm8
    257         movdqa  xmm8, xmm5
    258         psrld   xmm8, 7
    259         pslld   xmm5, 25
    260         por     xmm5, xmm8
    261         movdqa  xmm8, xmm6
    262         psrld   xmm8, 7
    263         pslld   xmm6, 25
    264         por     xmm6, xmm8
    265         movdqa  xmm8, xmm7
    266         psrld   xmm8, 7
    267         pslld   xmm7, 25
    268         por     xmm7, xmm8
    269         paddd   xmm0, xmmword ptr [rsp+80H]
    270         paddd   xmm1, xmmword ptr [rsp+0A0H]
    271         paddd   xmm2, xmmword ptr [rsp+0C0H]
    272         paddd   xmm3, xmmword ptr [rsp+0E0H]
    273         paddd   xmm0, xmm5
    274         paddd   xmm1, xmm6
    275         paddd   xmm2, xmm7
    276         paddd   xmm3, xmm4
    277         pxor    xmm15, xmm0
    278         pxor    xmm12, xmm1
    279         pxor    xmm13, xmm2
    280         pxor    xmm14, xmm3
    281         movdqa  xmm8, xmmword ptr [ROT16]
    282         pshufb  xmm15, xmm8
    283         pshufb  xmm12, xmm8
    284         pshufb  xmm13, xmm8
    285         pshufb  xmm14, xmm8
    286         paddd   xmm10, xmm15
    287         paddd   xmm11, xmm12
    288         movdqa  xmm8, xmmword ptr [rsp+100H]
    289         paddd   xmm8, xmm13
    290         paddd   xmm9, xmm14
    291         pxor    xmm5, xmm10
    292         pxor    xmm6, xmm11
    293         pxor    xmm7, xmm8
    294         pxor    xmm4, xmm9
    295         movdqa  xmmword ptr [rsp+100H], xmm8
    296         movdqa  xmm8, xmm5
    297         psrld   xmm8, 12
    298         pslld   xmm5, 20
    299         por     xmm5, xmm8
    300         movdqa  xmm8, xmm6
    301         psrld   xmm8, 12
    302         pslld   xmm6, 20
    303         por     xmm6, xmm8
    304         movdqa  xmm8, xmm7
    305         psrld   xmm8, 12
    306         pslld   xmm7, 20
    307         por     xmm7, xmm8
    308         movdqa  xmm8, xmm4
    309         psrld   xmm8, 12
    310         pslld   xmm4, 20
    311         por     xmm4, xmm8
    312         paddd   xmm0, xmmword ptr [rsp+90H]
    313         paddd   xmm1, xmmword ptr [rsp+0B0H]
    314         paddd   xmm2, xmmword ptr [rsp+0D0H]
    315         paddd   xmm3, xmmword ptr [rsp+0F0H]
    316         paddd   xmm0, xmm5
    317         paddd   xmm1, xmm6
    318         paddd   xmm2, xmm7
    319         paddd   xmm3, xmm4
    320         pxor    xmm15, xmm0
    321         pxor    xmm12, xmm1
    322         pxor    xmm13, xmm2
    323         pxor    xmm14, xmm3
    324         movdqa  xmm8, xmmword ptr [ROT8]
    325         pshufb  xmm15, xmm8
    326         pshufb  xmm12, xmm8
    327         pshufb  xmm13, xmm8
    328         pshufb  xmm14, xmm8
    329         paddd   xmm10, xmm15
    330         paddd   xmm11, xmm12
    331         movdqa  xmm8, xmmword ptr [rsp+100H]
    332         paddd   xmm8, xmm13
    333         paddd   xmm9, xmm14
    334         pxor    xmm5, xmm10
    335         pxor    xmm6, xmm11
    336         pxor    xmm7, xmm8
    337         pxor    xmm4, xmm9
    338         movdqa  xmmword ptr [rsp+100H], xmm8
    339         movdqa  xmm8, xmm5
    340         psrld   xmm8, 7
    341         pslld   xmm5, 25
    342         por     xmm5, xmm8
    343         movdqa  xmm8, xmm6
    344         psrld   xmm8, 7
    345         pslld   xmm6, 25
    346         por     xmm6, xmm8
    347         movdqa  xmm8, xmm7
    348         psrld   xmm8, 7
    349         pslld   xmm7, 25
    350         por     xmm7, xmm8
    351         movdqa  xmm8, xmm4
    352         psrld   xmm8, 7
    353         pslld   xmm4, 25
    354         por     xmm4, xmm8
    355         paddd   xmm0, xmmword ptr [rsp+20H]
    356         paddd   xmm1, xmmword ptr [rsp+30H]
    357         paddd   xmm2, xmmword ptr [rsp+70H]
    358         paddd   xmm3, xmmword ptr [rsp+40H]
    359         paddd   xmm0, xmm4
    360         paddd   xmm1, xmm5
    361         paddd   xmm2, xmm6
    362         paddd   xmm3, xmm7
    363         pxor    xmm12, xmm0
    364         pxor    xmm13, xmm1
    365         pxor    xmm14, xmm2
    366         pxor    xmm15, xmm3
    367         movdqa  xmm8, xmmword ptr [ROT16]
    368         pshufb  xmm12, xmm8
    369         pshufb  xmm13, xmm8
    370         pshufb  xmm14, xmm8
    371         pshufb  xmm15, xmm8
    372         movdqa  xmm8, xmmword ptr [rsp+100H]
    373         paddd   xmm8, xmm12
    374         paddd   xmm9, xmm13
    375         paddd   xmm10, xmm14
    376         paddd   xmm11, xmm15
    377         pxor    xmm4, xmm8
    378         pxor    xmm5, xmm9
    379         pxor    xmm6, xmm10
    380         pxor    xmm7, xmm11
    381         movdqa  xmmword ptr [rsp+100H], xmm8
    382         movdqa  xmm8, xmm4
    383         psrld   xmm8, 12
    384         pslld   xmm4, 20
    385         por     xmm4, xmm8
    386         movdqa  xmm8, xmm5
    387         psrld   xmm8, 12
    388         pslld   xmm5, 20
    389         por     xmm5, xmm8
    390         movdqa  xmm8, xmm6
    391         psrld   xmm8, 12
    392         pslld   xmm6, 20
    393         por     xmm6, xmm8
    394         movdqa  xmm8, xmm7
    395         psrld   xmm8, 12
    396         pslld   xmm7, 20
    397         por     xmm7, xmm8
    398         paddd   xmm0, xmmword ptr [rsp+60H]
    399         paddd   xmm1, xmmword ptr [rsp+0A0H]
    400         paddd   xmm2, xmmword ptr [rsp]
    401         paddd   xmm3, xmmword ptr [rsp+0D0H]
    402         paddd   xmm0, xmm4
    403         paddd   xmm1, xmm5
    404         paddd   xmm2, xmm6
    405         paddd   xmm3, xmm7
    406         pxor    xmm12, xmm0
    407         pxor    xmm13, xmm1
    408         pxor    xmm14, xmm2
    409         pxor    xmm15, xmm3
    410         movdqa  xmm8, xmmword ptr [ROT8]
    411         pshufb  xmm12, xmm8
    412         pshufb  xmm13, xmm8
    413         pshufb  xmm14, xmm8
    414         pshufb  xmm15, xmm8
    415         movdqa  xmm8, xmmword ptr [rsp+100H]
    416         paddd   xmm8, xmm12
    417         paddd   xmm9, xmm13
    418         paddd   xmm10, xmm14
    419         paddd   xmm11, xmm15
    420         pxor    xmm4, xmm8
    421         pxor    xmm5, xmm9
    422         pxor    xmm6, xmm10
    423         pxor    xmm7, xmm11
    424         movdqa  xmmword ptr [rsp+100H], xmm8
    425         movdqa  xmm8, xmm4
    426         psrld   xmm8, 7
    427         pslld   xmm4, 25
    428         por     xmm4, xmm8
    429         movdqa  xmm8, xmm5
    430         psrld   xmm8, 7
    431         pslld   xmm5, 25
    432         por     xmm5, xmm8
    433         movdqa  xmm8, xmm6
    434         psrld   xmm8, 7
    435         pslld   xmm6, 25
    436         por     xmm6, xmm8
    437         movdqa  xmm8, xmm7
    438         psrld   xmm8, 7
    439         pslld   xmm7, 25
    440         por     xmm7, xmm8
    441         paddd   xmm0, xmmword ptr [rsp+10H]
    442         paddd   xmm1, xmmword ptr [rsp+0C0H]
    443         paddd   xmm2, xmmword ptr [rsp+90H]
    444         paddd   xmm3, xmmword ptr [rsp+0F0H]
    445         paddd   xmm0, xmm5
    446         paddd   xmm1, xmm6
    447         paddd   xmm2, xmm7
    448         paddd   xmm3, xmm4
    449         pxor    xmm15, xmm0
    450         pxor    xmm12, xmm1
    451         pxor    xmm13, xmm2
    452         pxor    xmm14, xmm3
    453         movdqa  xmm8, xmmword ptr [ROT16]
    454         pshufb  xmm15, xmm8
    455         pshufb  xmm12, xmm8
    456         pshufb  xmm13, xmm8
    457         pshufb  xmm14, xmm8
    458         paddd   xmm10, xmm15
    459         paddd   xmm11, xmm12
    460         movdqa  xmm8, xmmword ptr [rsp+100H]
    461         paddd   xmm8, xmm13
    462         paddd   xmm9, xmm14
    463         pxor    xmm5, xmm10
    464         pxor    xmm6, xmm11
    465         pxor    xmm7, xmm8
    466         pxor    xmm4, xmm9
    467         movdqa  xmmword ptr [rsp+100H], xmm8
    468         movdqa  xmm8, xmm5
    469         psrld   xmm8, 12
    470         pslld   xmm5, 20
    471         por     xmm5, xmm8
    472         movdqa  xmm8, xmm6
    473         psrld   xmm8, 12
    474         pslld   xmm6, 20
    475         por     xmm6, xmm8
    476         movdqa  xmm8, xmm7
    477         psrld   xmm8, 12
    478         pslld   xmm7, 20
    479         por     xmm7, xmm8
    480         movdqa  xmm8, xmm4
    481         psrld   xmm8, 12
    482         pslld   xmm4, 20
    483         por     xmm4, xmm8
    484         paddd   xmm0, xmmword ptr [rsp+0B0H]
    485         paddd   xmm1, xmmword ptr [rsp+50H]
    486         paddd   xmm2, xmmword ptr [rsp+0E0H]
    487         paddd   xmm3, xmmword ptr [rsp+80H]
    488         paddd   xmm0, xmm5
    489         paddd   xmm1, xmm6
    490         paddd   xmm2, xmm7
    491         paddd   xmm3, xmm4
    492         pxor    xmm15, xmm0
    493         pxor    xmm12, xmm1
    494         pxor    xmm13, xmm2
    495         pxor    xmm14, xmm3
    496         movdqa  xmm8, xmmword ptr [ROT8]
    497         pshufb  xmm15, xmm8
    498         pshufb  xmm12, xmm8
    499         pshufb  xmm13, xmm8
    500         pshufb  xmm14, xmm8
    501         paddd   xmm10, xmm15
    502         paddd   xmm11, xmm12
    503         movdqa  xmm8, xmmword ptr [rsp+100H]
    504         paddd   xmm8, xmm13
    505         paddd   xmm9, xmm14
    506         pxor    xmm5, xmm10
    507         pxor    xmm6, xmm11
    508         pxor    xmm7, xmm8
    509         pxor    xmm4, xmm9
    510         movdqa  xmmword ptr [rsp+100H], xmm8
    511         movdqa  xmm8, xmm5
    512         psrld   xmm8, 7
    513         pslld   xmm5, 25
    514         por     xmm5, xmm8
    515         movdqa  xmm8, xmm6
    516         psrld   xmm8, 7
    517         pslld   xmm6, 25
    518         por     xmm6, xmm8
    519         movdqa  xmm8, xmm7
    520         psrld   xmm8, 7
    521         pslld   xmm7, 25
    522         por     xmm7, xmm8
    523         movdqa  xmm8, xmm4
    524         psrld   xmm8, 7
    525         pslld   xmm4, 25
    526         por     xmm4, xmm8
    527         paddd   xmm0, xmmword ptr [rsp+30H]
    528         paddd   xmm1, xmmword ptr [rsp+0A0H]
    529         paddd   xmm2, xmmword ptr [rsp+0D0H]
    530         paddd   xmm3, xmmword ptr [rsp+70H]
    531         paddd   xmm0, xmm4
    532         paddd   xmm1, xmm5
    533         paddd   xmm2, xmm6
    534         paddd   xmm3, xmm7
    535         pxor    xmm12, xmm0
    536         pxor    xmm13, xmm1
    537         pxor    xmm14, xmm2
    538         pxor    xmm15, xmm3
    539         movdqa  xmm8, xmmword ptr [ROT16]
    540         pshufb  xmm12, xmm8
    541         pshufb  xmm13, xmm8
    542         pshufb  xmm14, xmm8
    543         pshufb  xmm15, xmm8
    544         movdqa  xmm8, xmmword ptr [rsp+100H]
    545         paddd   xmm8, xmm12
    546         paddd   xmm9, xmm13
    547         paddd   xmm10, xmm14
    548         paddd   xmm11, xmm15
    549         pxor    xmm4, xmm8
    550         pxor    xmm5, xmm9
    551         pxor    xmm6, xmm10
    552         pxor    xmm7, xmm11
    553         movdqa  xmmword ptr [rsp+100H], xmm8
    554         movdqa  xmm8, xmm4
    555         psrld   xmm8, 12
    556         pslld   xmm4, 20
    557         por     xmm4, xmm8
    558         movdqa  xmm8, xmm5
    559         psrld   xmm8, 12
    560         pslld   xmm5, 20
    561         por     xmm5, xmm8
    562         movdqa  xmm8, xmm6
    563         psrld   xmm8, 12
    564         pslld   xmm6, 20
    565         por     xmm6, xmm8
    566         movdqa  xmm8, xmm7
    567         psrld   xmm8, 12
    568         pslld   xmm7, 20
    569         por     xmm7, xmm8
    570         paddd   xmm0, xmmword ptr [rsp+40H]
    571         paddd   xmm1, xmmword ptr [rsp+0C0H]
    572         paddd   xmm2, xmmword ptr [rsp+20H]
    573         paddd   xmm3, xmmword ptr [rsp+0E0H]
    574         paddd   xmm0, xmm4
    575         paddd   xmm1, xmm5
    576         paddd   xmm2, xmm6
    577         paddd   xmm3, xmm7
    578         pxor    xmm12, xmm0
    579         pxor    xmm13, xmm1
    580         pxor    xmm14, xmm2
    581         pxor    xmm15, xmm3
    582         movdqa  xmm8, xmmword ptr [ROT8]
    583         pshufb  xmm12, xmm8
    584         pshufb  xmm13, xmm8
    585         pshufb  xmm14, xmm8
    586         pshufb  xmm15, xmm8
    587         movdqa  xmm8, xmmword ptr [rsp+100H]
    588         paddd   xmm8, xmm12
    589         paddd   xmm9, xmm13
    590         paddd   xmm10, xmm14
    591         paddd   xmm11, xmm15
    592         pxor    xmm4, xmm8
    593         pxor    xmm5, xmm9
    594         pxor    xmm6, xmm10
    595         pxor    xmm7, xmm11
    596         movdqa  xmmword ptr [rsp+100H], xmm8
    597         movdqa  xmm8, xmm4
    598         psrld   xmm8, 7
    599         pslld   xmm4, 25
    600         por     xmm4, xmm8
    601         movdqa  xmm8, xmm5
    602         psrld   xmm8, 7
    603         pslld   xmm5, 25
    604         por     xmm5, xmm8
    605         movdqa  xmm8, xmm6
    606         psrld   xmm8, 7
    607         pslld   xmm6, 25
    608         por     xmm6, xmm8
    609         movdqa  xmm8, xmm7
    610         psrld   xmm8, 7
    611         pslld   xmm7, 25
    612         por     xmm7, xmm8
    613         paddd   xmm0, xmmword ptr [rsp+60H]
    614         paddd   xmm1, xmmword ptr [rsp+90H]
    615         paddd   xmm2, xmmword ptr [rsp+0B0H]
    616         paddd   xmm3, xmmword ptr [rsp+80H]
    617         paddd   xmm0, xmm5
    618         paddd   xmm1, xmm6
    619         paddd   xmm2, xmm7
    620         paddd   xmm3, xmm4
    621         pxor    xmm15, xmm0
    622         pxor    xmm12, xmm1
    623         pxor    xmm13, xmm2
    624         pxor    xmm14, xmm3
    625         movdqa  xmm8, xmmword ptr [ROT16]
    626         pshufb  xmm15, xmm8
    627         pshufb  xmm12, xmm8
    628         pshufb  xmm13, xmm8
    629         pshufb  xmm14, xmm8
    630         paddd   xmm10, xmm15
    631         paddd   xmm11, xmm12
    632         movdqa  xmm8, xmmword ptr [rsp+100H]
    633         paddd   xmm8, xmm13
    634         paddd   xmm9, xmm14
    635         pxor    xmm5, xmm10
    636         pxor    xmm6, xmm11
    637         pxor    xmm7, xmm8
    638         pxor    xmm4, xmm9
    639         movdqa  xmmword ptr [rsp+100H], xmm8
    640         movdqa  xmm8, xmm5
    641         psrld   xmm8, 12
    642         pslld   xmm5, 20
    643         por     xmm5, xmm8
    644         movdqa  xmm8, xmm6
    645         psrld   xmm8, 12
    646         pslld   xmm6, 20
    647         por     xmm6, xmm8
    648         movdqa  xmm8, xmm7
    649         psrld   xmm8, 12
    650         pslld   xmm7, 20
    651         por     xmm7, xmm8
    652         movdqa  xmm8, xmm4
    653         psrld   xmm8, 12
    654         pslld   xmm4, 20
    655         por     xmm4, xmm8
    656         paddd   xmm0, xmmword ptr [rsp+50H]
    657         paddd   xmm1, xmmword ptr [rsp]
    658         paddd   xmm2, xmmword ptr [rsp+0F0H]
    659         paddd   xmm3, xmmword ptr [rsp+10H]
    660         paddd   xmm0, xmm5
    661         paddd   xmm1, xmm6
    662         paddd   xmm2, xmm7
    663         paddd   xmm3, xmm4
    664         pxor    xmm15, xmm0
    665         pxor    xmm12, xmm1
    666         pxor    xmm13, xmm2
    667         pxor    xmm14, xmm3
    668         movdqa  xmm8, xmmword ptr [ROT8]
    669         pshufb  xmm15, xmm8
    670         pshufb  xmm12, xmm8
    671         pshufb  xmm13, xmm8
    672         pshufb  xmm14, xmm8
    673         paddd   xmm10, xmm15
    674         paddd   xmm11, xmm12
    675         movdqa  xmm8, xmmword ptr [rsp+100H]
    676         paddd   xmm8, xmm13
    677         paddd   xmm9, xmm14
    678         pxor    xmm5, xmm10
    679         pxor    xmm6, xmm11
    680         pxor    xmm7, xmm8
    681         pxor    xmm4, xmm9
    682         movdqa  xmmword ptr [rsp+100H], xmm8
    683         movdqa  xmm8, xmm5
    684         psrld   xmm8, 7
    685         pslld   xmm5, 25
    686         por     xmm5, xmm8
    687         movdqa  xmm8, xmm6
    688         psrld   xmm8, 7
    689         pslld   xmm6, 25
    690         por     xmm6, xmm8
    691         movdqa  xmm8, xmm7
    692         psrld   xmm8, 7
    693         pslld   xmm7, 25
    694         por     xmm7, xmm8
    695         movdqa  xmm8, xmm4
    696         psrld   xmm8, 7
    697         pslld   xmm4, 25
    698         por     xmm4, xmm8
    699         paddd   xmm0, xmmword ptr [rsp+0A0H]
    700         paddd   xmm1, xmmword ptr [rsp+0C0H]
    701         paddd   xmm2, xmmword ptr [rsp+0E0H]
    702         paddd   xmm3, xmmword ptr [rsp+0D0H]
    703         paddd   xmm0, xmm4
    704         paddd   xmm1, xmm5
    705         paddd   xmm2, xmm6
    706         paddd   xmm3, xmm7
    707         pxor    xmm12, xmm0
    708         pxor    xmm13, xmm1
    709         pxor    xmm14, xmm2
    710         pxor    xmm15, xmm3
    711         movdqa  xmm8, xmmword ptr [ROT16]
    712         pshufb  xmm12, xmm8
    713         pshufb  xmm13, xmm8
    714         pshufb  xmm14, xmm8
    715         pshufb  xmm15, xmm8
    716         movdqa  xmm8, xmmword ptr [rsp+100H]
    717         paddd   xmm8, xmm12
    718         paddd   xmm9, xmm13
    719         paddd   xmm10, xmm14
    720         paddd   xmm11, xmm15
    721         pxor    xmm4, xmm8
    722         pxor    xmm5, xmm9
    723         pxor    xmm6, xmm10
    724         pxor    xmm7, xmm11
    725         movdqa  xmmword ptr [rsp+100H], xmm8
    726         movdqa  xmm8, xmm4
    727         psrld   xmm8, 12
    728         pslld   xmm4, 20
    729         por     xmm4, xmm8
    730         movdqa  xmm8, xmm5
    731         psrld   xmm8, 12
    732         pslld   xmm5, 20
    733         por     xmm5, xmm8
    734         movdqa  xmm8, xmm6
    735         psrld   xmm8, 12
    736         pslld   xmm6, 20
    737         por     xmm6, xmm8
    738         movdqa  xmm8, xmm7
    739         psrld   xmm8, 12
    740         pslld   xmm7, 20
    741         por     xmm7, xmm8
    742         paddd   xmm0, xmmword ptr [rsp+70H]
    743         paddd   xmm1, xmmword ptr [rsp+90H]
    744         paddd   xmm2, xmmword ptr [rsp+30H]
    745         paddd   xmm3, xmmword ptr [rsp+0F0H]
    746         paddd   xmm0, xmm4
    747         paddd   xmm1, xmm5
    748         paddd   xmm2, xmm6
    749         paddd   xmm3, xmm7
    750         pxor    xmm12, xmm0
    751         pxor    xmm13, xmm1
    752         pxor    xmm14, xmm2
    753         pxor    xmm15, xmm3
    754         movdqa  xmm8, xmmword ptr [ROT8]
    755         pshufb  xmm12, xmm8
    756         pshufb  xmm13, xmm8
    757         pshufb  xmm14, xmm8
    758         pshufb  xmm15, xmm8
    759         movdqa  xmm8, xmmword ptr [rsp+100H]
    760         paddd   xmm8, xmm12
    761         paddd   xmm9, xmm13
    762         paddd   xmm10, xmm14
    763         paddd   xmm11, xmm15
    764         pxor    xmm4, xmm8
    765         pxor    xmm5, xmm9
    766         pxor    xmm6, xmm10
    767         pxor    xmm7, xmm11
    768         movdqa  xmmword ptr [rsp+100H], xmm8
    769         movdqa  xmm8, xmm4
    770         psrld   xmm8, 7
    771         pslld   xmm4, 25
    772         por     xmm4, xmm8
    773         movdqa  xmm8, xmm5
    774         psrld   xmm8, 7
    775         pslld   xmm5, 25
    776         por     xmm5, xmm8
    777         movdqa  xmm8, xmm6
    778         psrld   xmm8, 7
    779         pslld   xmm6, 25
    780         por     xmm6, xmm8
    781         movdqa  xmm8, xmm7
    782         psrld   xmm8, 7
    783         pslld   xmm7, 25
    784         por     xmm7, xmm8
    785         paddd   xmm0, xmmword ptr [rsp+40H]
    786         paddd   xmm1, xmmword ptr [rsp+0B0H]
    787         paddd   xmm2, xmmword ptr [rsp+50H]
    788         paddd   xmm3, xmmword ptr [rsp+10H]
    789         paddd   xmm0, xmm5
    790         paddd   xmm1, xmm6
    791         paddd   xmm2, xmm7
    792         paddd   xmm3, xmm4
    793         pxor    xmm15, xmm0
    794         pxor    xmm12, xmm1
    795         pxor    xmm13, xmm2
    796         pxor    xmm14, xmm3
    797         movdqa  xmm8, xmmword ptr [ROT16]
    798         pshufb  xmm15, xmm8
    799         pshufb  xmm12, xmm8
    800         pshufb  xmm13, xmm8
    801         pshufb  xmm14, xmm8
    802         paddd   xmm10, xmm15
    803         paddd   xmm11, xmm12
    804         movdqa  xmm8, xmmword ptr [rsp+100H]
    805         paddd   xmm8, xmm13
    806         paddd   xmm9, xmm14
    807         pxor    xmm5, xmm10
    808         pxor    xmm6, xmm11
    809         pxor    xmm7, xmm8
    810         pxor    xmm4, xmm9
    811         movdqa  xmmword ptr [rsp+100H], xmm8
    812         movdqa  xmm8, xmm5
    813         psrld   xmm8, 12
    814         pslld   xmm5, 20
    815         por     xmm5, xmm8
    816         movdqa  xmm8, xmm6
    817         psrld   xmm8, 12
    818         pslld   xmm6, 20
    819         por     xmm6, xmm8
    820         movdqa  xmm8, xmm7
    821         psrld   xmm8, 12
    822         pslld   xmm7, 20
    823         por     xmm7, xmm8
    824         movdqa  xmm8, xmm4
    825         psrld   xmm8, 12
    826         pslld   xmm4, 20
    827         por     xmm4, xmm8
    828         paddd   xmm0, xmmword ptr [rsp]
    829         paddd   xmm1, xmmword ptr [rsp+20H]
    830         paddd   xmm2, xmmword ptr [rsp+80H]
    831         paddd   xmm3, xmmword ptr [rsp+60H]
    832         paddd   xmm0, xmm5
    833         paddd   xmm1, xmm6
    834         paddd   xmm2, xmm7
    835         paddd   xmm3, xmm4
    836         pxor    xmm15, xmm0
    837         pxor    xmm12, xmm1
    838         pxor    xmm13, xmm2
    839         pxor    xmm14, xmm3
    840         movdqa  xmm8, xmmword ptr [ROT8]
    841         pshufb  xmm15, xmm8
    842         pshufb  xmm12, xmm8
    843         pshufb  xmm13, xmm8
    844         pshufb  xmm14, xmm8
    845         paddd   xmm10, xmm15
    846         paddd   xmm11, xmm12
    847         movdqa  xmm8, xmmword ptr [rsp+100H]
    848         paddd   xmm8, xmm13
    849         paddd   xmm9, xmm14
    850         pxor    xmm5, xmm10
    851         pxor    xmm6, xmm11
    852         pxor    xmm7, xmm8
    853         pxor    xmm4, xmm9
    854         movdqa  xmmword ptr [rsp+100H], xmm8
    855         movdqa  xmm8, xmm5
    856         psrld   xmm8, 7
    857         pslld   xmm5, 25
    858         por     xmm5, xmm8
    859         movdqa  xmm8, xmm6
    860         psrld   xmm8, 7
    861         pslld   xmm6, 25
    862         por     xmm6, xmm8
    863         movdqa  xmm8, xmm7
    864         psrld   xmm8, 7
    865         pslld   xmm7, 25
    866         por     xmm7, xmm8
    867         movdqa  xmm8, xmm4
    868         psrld   xmm8, 7
    869         pslld   xmm4, 25
    870         por     xmm4, xmm8
    871         paddd   xmm0, xmmword ptr [rsp+0C0H]
    872         paddd   xmm1, xmmword ptr [rsp+90H]
    873         paddd   xmm2, xmmword ptr [rsp+0F0H]
    874         paddd   xmm3, xmmword ptr [rsp+0E0H]
    875         paddd   xmm0, xmm4
    876         paddd   xmm1, xmm5
    877         paddd   xmm2, xmm6
    878         paddd   xmm3, xmm7
    879         pxor    xmm12, xmm0
    880         pxor    xmm13, xmm1
    881         pxor    xmm14, xmm2
    882         pxor    xmm15, xmm3
    883         movdqa  xmm8, xmmword ptr [ROT16]
    884         pshufb  xmm12, xmm8
    885         pshufb  xmm13, xmm8
    886         pshufb  xmm14, xmm8
    887         pshufb  xmm15, xmm8
    888         movdqa  xmm8, xmmword ptr [rsp+100H]
    889         paddd   xmm8, xmm12
    890         paddd   xmm9, xmm13
    891         paddd   xmm10, xmm14
    892         paddd   xmm11, xmm15
    893         pxor    xmm4, xmm8
    894         pxor    xmm5, xmm9
    895         pxor    xmm6, xmm10
    896         pxor    xmm7, xmm11
    897         movdqa  xmmword ptr [rsp+100H], xmm8
    898         movdqa  xmm8, xmm4
    899         psrld   xmm8, 12
    900         pslld   xmm4, 20
    901         por     xmm4, xmm8
    902         movdqa  xmm8, xmm5
    903         psrld   xmm8, 12
    904         pslld   xmm5, 20
    905         por     xmm5, xmm8
    906         movdqa  xmm8, xmm6
    907         psrld   xmm8, 12
    908         pslld   xmm6, 20
    909         por     xmm6, xmm8
    910         movdqa  xmm8, xmm7
    911         psrld   xmm8, 12
    912         pslld   xmm7, 20
    913         por     xmm7, xmm8
    914         paddd   xmm0, xmmword ptr [rsp+0D0H]
    915         paddd   xmm1, xmmword ptr [rsp+0B0H]
    916         paddd   xmm2, xmmword ptr [rsp+0A0H]
    917         paddd   xmm3, xmmword ptr [rsp+80H]
    918         paddd   xmm0, xmm4
    919         paddd   xmm1, xmm5
    920         paddd   xmm2, xmm6
    921         paddd   xmm3, xmm7
    922         pxor    xmm12, xmm0
    923         pxor    xmm13, xmm1
    924         pxor    xmm14, xmm2
    925         pxor    xmm15, xmm3
    926         movdqa  xmm8, xmmword ptr [ROT8]
    927         pshufb  xmm12, xmm8
    928         pshufb  xmm13, xmm8
    929         pshufb  xmm14, xmm8
    930         pshufb  xmm15, xmm8
    931         movdqa  xmm8, xmmword ptr [rsp+100H]
    932         paddd   xmm8, xmm12
    933         paddd   xmm9, xmm13
    934         paddd   xmm10, xmm14
    935         paddd   xmm11, xmm15
    936         pxor    xmm4, xmm8
    937         pxor    xmm5, xmm9
    938         pxor    xmm6, xmm10
    939         pxor    xmm7, xmm11
    940         movdqa  xmmword ptr [rsp+100H], xmm8
    941         movdqa  xmm8, xmm4
    942         psrld   xmm8, 7
    943         pslld   xmm4, 25
    944         por     xmm4, xmm8
    945         movdqa  xmm8, xmm5
    946         psrld   xmm8, 7
    947         pslld   xmm5, 25
    948         por     xmm5, xmm8
    949         movdqa  xmm8, xmm6
    950         psrld   xmm8, 7
    951         pslld   xmm6, 25
    952         por     xmm6, xmm8
    953         movdqa  xmm8, xmm7
    954         psrld   xmm8, 7
    955         pslld   xmm7, 25
    956         por     xmm7, xmm8
    957         paddd   xmm0, xmmword ptr [rsp+70H]
    958         paddd   xmm1, xmmword ptr [rsp+50H]
    959         paddd   xmm2, xmmword ptr [rsp]
    960         paddd   xmm3, xmmword ptr [rsp+60H]
    961         paddd   xmm0, xmm5
    962         paddd   xmm1, xmm6
    963         paddd   xmm2, xmm7
    964         paddd   xmm3, xmm4
    965         pxor    xmm15, xmm0
    966         pxor    xmm12, xmm1
    967         pxor    xmm13, xmm2
    968         pxor    xmm14, xmm3
    969         movdqa  xmm8, xmmword ptr [ROT16]
    970         pshufb  xmm15, xmm8
    971         pshufb  xmm12, xmm8
    972         pshufb  xmm13, xmm8
    973         pshufb  xmm14, xmm8
    974         paddd   xmm10, xmm15
    975         paddd   xmm11, xmm12
    976         movdqa  xmm8, xmmword ptr [rsp+100H]
    977         paddd   xmm8, xmm13
    978         paddd   xmm9, xmm14
    979         pxor    xmm5, xmm10
    980         pxor    xmm6, xmm11
    981         pxor    xmm7, xmm8
    982         pxor    xmm4, xmm9
    983         movdqa  xmmword ptr [rsp+100H], xmm8
    984         movdqa  xmm8, xmm5
    985         psrld   xmm8, 12
    986         pslld   xmm5, 20
    987         por     xmm5, xmm8
    988         movdqa  xmm8, xmm6
    989         psrld   xmm8, 12
    990         pslld   xmm6, 20
    991         por     xmm6, xmm8
    992         movdqa  xmm8, xmm7
    993         psrld   xmm8, 12
    994         pslld   xmm7, 20
    995         por     xmm7, xmm8
    996         movdqa  xmm8, xmm4
    997         psrld   xmm8, 12
    998         pslld   xmm4, 20
    999         por     xmm4, xmm8
   1000         paddd   xmm0, xmmword ptr [rsp+20H]
   1001         paddd   xmm1, xmmword ptr [rsp+30H]
   1002         paddd   xmm2, xmmword ptr [rsp+10H]
   1003         paddd   xmm3, xmmword ptr [rsp+40H]
   1004         paddd   xmm0, xmm5
   1005         paddd   xmm1, xmm6
   1006         paddd   xmm2, xmm7
   1007         paddd   xmm3, xmm4
   1008         pxor    xmm15, xmm0
   1009         pxor    xmm12, xmm1
   1010         pxor    xmm13, xmm2
   1011         pxor    xmm14, xmm3
   1012         movdqa  xmm8, xmmword ptr [ROT8]
   1013         pshufb  xmm15, xmm8
   1014         pshufb  xmm12, xmm8
   1015         pshufb  xmm13, xmm8
   1016         pshufb  xmm14, xmm8
   1017         paddd   xmm10, xmm15
   1018         paddd   xmm11, xmm12
   1019         movdqa  xmm8, xmmword ptr [rsp+100H]
   1020         paddd   xmm8, xmm13
   1021         paddd   xmm9, xmm14
   1022         pxor    xmm5, xmm10
   1023         pxor    xmm6, xmm11
   1024         pxor    xmm7, xmm8
   1025         pxor    xmm4, xmm9
   1026         movdqa  xmmword ptr [rsp+100H], xmm8
   1027         movdqa  xmm8, xmm5
   1028         psrld   xmm8, 7
   1029         pslld   xmm5, 25
   1030         por     xmm5, xmm8
   1031         movdqa  xmm8, xmm6
   1032         psrld   xmm8, 7
   1033         pslld   xmm6, 25
   1034         por     xmm6, xmm8
   1035         movdqa  xmm8, xmm7
   1036         psrld   xmm8, 7
   1037         pslld   xmm7, 25
   1038         por     xmm7, xmm8
   1039         movdqa  xmm8, xmm4
   1040         psrld   xmm8, 7
   1041         pslld   xmm4, 25
   1042         por     xmm4, xmm8
   1043         paddd   xmm0, xmmword ptr [rsp+90H]
   1044         paddd   xmm1, xmmword ptr [rsp+0B0H]
   1045         paddd   xmm2, xmmword ptr [rsp+80H]
   1046         paddd   xmm3, xmmword ptr [rsp+0F0H]
   1047         paddd   xmm0, xmm4
   1048         paddd   xmm1, xmm5
   1049         paddd   xmm2, xmm6
   1050         paddd   xmm3, xmm7
   1051         pxor    xmm12, xmm0
   1052         pxor    xmm13, xmm1
   1053         pxor    xmm14, xmm2
   1054         pxor    xmm15, xmm3
   1055         movdqa  xmm8, xmmword ptr [ROT16]
   1056         pshufb  xmm12, xmm8
   1057         pshufb  xmm13, xmm8
   1058         pshufb  xmm14, xmm8
   1059         pshufb  xmm15, xmm8
   1060         movdqa  xmm8, xmmword ptr [rsp+100H]
   1061         paddd   xmm8, xmm12
   1062         paddd   xmm9, xmm13
   1063         paddd   xmm10, xmm14
   1064         paddd   xmm11, xmm15
   1065         pxor    xmm4, xmm8
   1066         pxor    xmm5, xmm9
   1067         pxor    xmm6, xmm10
   1068         pxor    xmm7, xmm11
   1069         movdqa  xmmword ptr [rsp+100H], xmm8
   1070         movdqa  xmm8, xmm4
   1071         psrld   xmm8, 12
   1072         pslld   xmm4, 20
   1073         por     xmm4, xmm8
   1074         movdqa  xmm8, xmm5
   1075         psrld   xmm8, 12
   1076         pslld   xmm5, 20
   1077         por     xmm5, xmm8
   1078         movdqa  xmm8, xmm6
   1079         psrld   xmm8, 12
   1080         pslld   xmm6, 20
   1081         por     xmm6, xmm8
   1082         movdqa  xmm8, xmm7
   1083         psrld   xmm8, 12
   1084         pslld   xmm7, 20
   1085         por     xmm7, xmm8
   1086         paddd   xmm0, xmmword ptr [rsp+0E0H]
   1087         paddd   xmm1, xmmword ptr [rsp+50H]
   1088         paddd   xmm2, xmmword ptr [rsp+0C0H]
   1089         paddd   xmm3, xmmword ptr [rsp+10H]
   1090         paddd   xmm0, xmm4
   1091         paddd   xmm1, xmm5
   1092         paddd   xmm2, xmm6
   1093         paddd   xmm3, xmm7
   1094         pxor    xmm12, xmm0
   1095         pxor    xmm13, xmm1
   1096         pxor    xmm14, xmm2
   1097         pxor    xmm15, xmm3
   1098         movdqa  xmm8, xmmword ptr [ROT8]
   1099         pshufb  xmm12, xmm8
   1100         pshufb  xmm13, xmm8
   1101         pshufb  xmm14, xmm8
   1102         pshufb  xmm15, xmm8
   1103         movdqa  xmm8, xmmword ptr [rsp+100H]
   1104         paddd   xmm8, xmm12
   1105         paddd   xmm9, xmm13
   1106         paddd   xmm10, xmm14
   1107         paddd   xmm11, xmm15
   1108         pxor    xmm4, xmm8
   1109         pxor    xmm5, xmm9
   1110         pxor    xmm6, xmm10
   1111         pxor    xmm7, xmm11
   1112         movdqa  xmmword ptr [rsp+100H], xmm8
   1113         movdqa  xmm8, xmm4
   1114         psrld   xmm8, 7
   1115         pslld   xmm4, 25
   1116         por     xmm4, xmm8
   1117         movdqa  xmm8, xmm5
   1118         psrld   xmm8, 7
   1119         pslld   xmm5, 25
   1120         por     xmm5, xmm8
   1121         movdqa  xmm8, xmm6
   1122         psrld   xmm8, 7
   1123         pslld   xmm6, 25
   1124         por     xmm6, xmm8
   1125         movdqa  xmm8, xmm7
   1126         psrld   xmm8, 7
   1127         pslld   xmm7, 25
   1128         por     xmm7, xmm8
   1129         paddd   xmm0, xmmword ptr [rsp+0D0H]
   1130         paddd   xmm1, xmmword ptr [rsp]
   1131         paddd   xmm2, xmmword ptr [rsp+20H]
   1132         paddd   xmm3, xmmword ptr [rsp+40H]
   1133         paddd   xmm0, xmm5
   1134         paddd   xmm1, xmm6
   1135         paddd   xmm2, xmm7
   1136         paddd   xmm3, xmm4
   1137         pxor    xmm15, xmm0
   1138         pxor    xmm12, xmm1
   1139         pxor    xmm13, xmm2
   1140         pxor    xmm14, xmm3
   1141         movdqa  xmm8, xmmword ptr [ROT16]
   1142         pshufb  xmm15, xmm8
   1143         pshufb  xmm12, xmm8
   1144         pshufb  xmm13, xmm8
   1145         pshufb  xmm14, xmm8
   1146         paddd   xmm10, xmm15
   1147         paddd   xmm11, xmm12
   1148         movdqa  xmm8, xmmword ptr [rsp+100H]
   1149         paddd   xmm8, xmm13
   1150         paddd   xmm9, xmm14
   1151         pxor    xmm5, xmm10
   1152         pxor    xmm6, xmm11
   1153         pxor    xmm7, xmm8
   1154         pxor    xmm4, xmm9
   1155         movdqa  xmmword ptr [rsp+100H], xmm8
   1156         movdqa  xmm8, xmm5
   1157         psrld   xmm8, 12
   1158         pslld   xmm5, 20
   1159         por     xmm5, xmm8
   1160         movdqa  xmm8, xmm6
   1161         psrld   xmm8, 12
   1162         pslld   xmm6, 20
   1163         por     xmm6, xmm8
   1164         movdqa  xmm8, xmm7
   1165         psrld   xmm8, 12
   1166         pslld   xmm7, 20
   1167         por     xmm7, xmm8
   1168         movdqa  xmm8, xmm4
   1169         psrld   xmm8, 12
   1170         pslld   xmm4, 20
   1171         por     xmm4, xmm8
   1172         paddd   xmm0, xmmword ptr [rsp+30H]
   1173         paddd   xmm1, xmmword ptr [rsp+0A0H]
   1174         paddd   xmm2, xmmword ptr [rsp+60H]
   1175         paddd   xmm3, xmmword ptr [rsp+70H]
   1176         paddd   xmm0, xmm5
   1177         paddd   xmm1, xmm6
   1178         paddd   xmm2, xmm7
   1179         paddd   xmm3, xmm4
   1180         pxor    xmm15, xmm0
   1181         pxor    xmm12, xmm1
   1182         pxor    xmm13, xmm2
   1183         pxor    xmm14, xmm3
   1184         movdqa  xmm8, xmmword ptr [ROT8]
   1185         pshufb  xmm15, xmm8
   1186         pshufb  xmm12, xmm8
   1187         pshufb  xmm13, xmm8
   1188         pshufb  xmm14, xmm8
   1189         paddd   xmm10, xmm15
   1190         paddd   xmm11, xmm12
   1191         movdqa  xmm8, xmmword ptr [rsp+100H]
   1192         paddd   xmm8, xmm13
   1193         paddd   xmm9, xmm14
   1194         pxor    xmm5, xmm10
   1195         pxor    xmm6, xmm11
   1196         pxor    xmm7, xmm8
   1197         pxor    xmm4, xmm9
   1198         movdqa  xmmword ptr [rsp+100H], xmm8
   1199         movdqa  xmm8, xmm5
   1200         psrld   xmm8, 7
   1201         pslld   xmm5, 25
   1202         por     xmm5, xmm8
   1203         movdqa  xmm8, xmm6
   1204         psrld   xmm8, 7
   1205         pslld   xmm6, 25
   1206         por     xmm6, xmm8
   1207         movdqa  xmm8, xmm7
   1208         psrld   xmm8, 7
   1209         pslld   xmm7, 25
   1210         por     xmm7, xmm8
   1211         movdqa  xmm8, xmm4
   1212         psrld   xmm8, 7
   1213         pslld   xmm4, 25
   1214         por     xmm4, xmm8
   1215         paddd   xmm0, xmmword ptr [rsp+0B0H]
   1216         paddd   xmm1, xmmword ptr [rsp+50H]
   1217         paddd   xmm2, xmmword ptr [rsp+10H]
   1218         paddd   xmm3, xmmword ptr [rsp+80H]
   1219         paddd   xmm0, xmm4
   1220         paddd   xmm1, xmm5
   1221         paddd   xmm2, xmm6
   1222         paddd   xmm3, xmm7
   1223         pxor    xmm12, xmm0
   1224         pxor    xmm13, xmm1
   1225         pxor    xmm14, xmm2
   1226         pxor    xmm15, xmm3
   1227         movdqa  xmm8, xmmword ptr [ROT16]
   1228         pshufb  xmm12, xmm8
   1229         pshufb  xmm13, xmm8
   1230         pshufb  xmm14, xmm8
   1231         pshufb  xmm15, xmm8
   1232         movdqa  xmm8, xmmword ptr [rsp+100H]
   1233         paddd   xmm8, xmm12
   1234         paddd   xmm9, xmm13
   1235         paddd   xmm10, xmm14
   1236         paddd   xmm11, xmm15
   1237         pxor    xmm4, xmm8
   1238         pxor    xmm5, xmm9
   1239         pxor    xmm6, xmm10
   1240         pxor    xmm7, xmm11
   1241         movdqa  xmmword ptr [rsp+100H], xmm8
   1242         movdqa  xmm8, xmm4
   1243         psrld   xmm8, 12
   1244         pslld   xmm4, 20
   1245         por     xmm4, xmm8
   1246         movdqa  xmm8, xmm5
   1247         psrld   xmm8, 12
   1248         pslld   xmm5, 20
   1249         por     xmm5, xmm8
   1250         movdqa  xmm8, xmm6
   1251         psrld   xmm8, 12
   1252         pslld   xmm6, 20
   1253         por     xmm6, xmm8
   1254         movdqa  xmm8, xmm7
   1255         psrld   xmm8, 12
   1256         pslld   xmm7, 20
   1257         por     xmm7, xmm8
   1258         paddd   xmm0, xmmword ptr [rsp+0F0H]
   1259         paddd   xmm1, xmmword ptr [rsp]
   1260         paddd   xmm2, xmmword ptr [rsp+90H]
   1261         paddd   xmm3, xmmword ptr [rsp+60H]
   1262         paddd   xmm0, xmm4
   1263         paddd   xmm1, xmm5
   1264         paddd   xmm2, xmm6
   1265         paddd   xmm3, xmm7
   1266         pxor    xmm12, xmm0
   1267         pxor    xmm13, xmm1
   1268         pxor    xmm14, xmm2
   1269         pxor    xmm15, xmm3
   1270         movdqa  xmm8, xmmword ptr [ROT8]
   1271         pshufb  xmm12, xmm8
   1272         pshufb  xmm13, xmm8
   1273         pshufb  xmm14, xmm8
   1274         pshufb  xmm15, xmm8
   1275         movdqa  xmm8, xmmword ptr [rsp+100H]
   1276         paddd   xmm8, xmm12
   1277         paddd   xmm9, xmm13
   1278         paddd   xmm10, xmm14
   1279         paddd   xmm11, xmm15
   1280         pxor    xmm4, xmm8
   1281         pxor    xmm5, xmm9
   1282         pxor    xmm6, xmm10
   1283         pxor    xmm7, xmm11
   1284         movdqa  xmmword ptr [rsp+100H], xmm8
   1285         movdqa  xmm8, xmm4
   1286         psrld   xmm8, 7
   1287         pslld   xmm4, 25
   1288         por     xmm4, xmm8
   1289         movdqa  xmm8, xmm5
   1290         psrld   xmm8, 7
   1291         pslld   xmm5, 25
   1292         por     xmm5, xmm8
   1293         movdqa  xmm8, xmm6
   1294         psrld   xmm8, 7
   1295         pslld   xmm6, 25
   1296         por     xmm6, xmm8
   1297         movdqa  xmm8, xmm7
   1298         psrld   xmm8, 7
   1299         pslld   xmm7, 25
   1300         por     xmm7, xmm8
   1301         paddd   xmm0, xmmword ptr [rsp+0E0H]
   1302         paddd   xmm1, xmmword ptr [rsp+20H]
   1303         paddd   xmm2, xmmword ptr [rsp+30H]
   1304         paddd   xmm3, xmmword ptr [rsp+70H]
   1305         paddd   xmm0, xmm5
   1306         paddd   xmm1, xmm6
   1307         paddd   xmm2, xmm7
   1308         paddd   xmm3, xmm4
   1309         pxor    xmm15, xmm0
   1310         pxor    xmm12, xmm1
   1311         pxor    xmm13, xmm2
   1312         pxor    xmm14, xmm3
   1313         movdqa  xmm8, xmmword ptr [ROT16]
   1314         pshufb  xmm15, xmm8
   1315         pshufb  xmm12, xmm8
   1316         pshufb  xmm13, xmm8
   1317         pshufb  xmm14, xmm8
   1318         paddd   xmm10, xmm15
   1319         paddd   xmm11, xmm12
   1320         movdqa  xmm8, xmmword ptr [rsp+100H]
   1321         paddd   xmm8, xmm13
   1322         paddd   xmm9, xmm14
   1323         pxor    xmm5, xmm10
   1324         pxor    xmm6, xmm11
   1325         pxor    xmm7, xmm8
   1326         pxor    xmm4, xmm9
   1327         movdqa  xmmword ptr [rsp+100H], xmm8
   1328         movdqa  xmm8, xmm5
   1329         psrld   xmm8, 12
   1330         pslld   xmm5, 20
   1331         por     xmm5, xmm8
   1332         movdqa  xmm8, xmm6
   1333         psrld   xmm8, 12
   1334         pslld   xmm6, 20
   1335         por     xmm6, xmm8
   1336         movdqa  xmm8, xmm7
   1337         psrld   xmm8, 12
   1338         pslld   xmm7, 20
   1339         por     xmm7, xmm8
   1340         movdqa  xmm8, xmm4
   1341         psrld   xmm8, 12
   1342         pslld   xmm4, 20
   1343         por     xmm4, xmm8
   1344         paddd   xmm0, xmmword ptr [rsp+0A0H]
   1345         paddd   xmm1, xmmword ptr [rsp+0C0H]
   1346         paddd   xmm2, xmmword ptr [rsp+40H]
   1347         paddd   xmm3, xmmword ptr [rsp+0D0H]
   1348         paddd   xmm0, xmm5
   1349         paddd   xmm1, xmm6
   1350         paddd   xmm2, xmm7
   1351         paddd   xmm3, xmm4
   1352         pxor    xmm15, xmm0
   1353         pxor    xmm12, xmm1
   1354         pxor    xmm13, xmm2
   1355         pxor    xmm14, xmm3
   1356         movdqa  xmm8, xmmword ptr [ROT8]
   1357         pshufb  xmm15, xmm8
   1358         pshufb  xmm12, xmm8
   1359         pshufb  xmm13, xmm8
   1360         pshufb  xmm14, xmm8
   1361         paddd   xmm10, xmm15
   1362         paddd   xmm11, xmm12
   1363         movdqa  xmm8, xmmword ptr [rsp+100H]
   1364         paddd   xmm8, xmm13
   1365         paddd   xmm9, xmm14
   1366         pxor    xmm5, xmm10
   1367         pxor    xmm6, xmm11
   1368         pxor    xmm7, xmm8
   1369         pxor    xmm4, xmm9
   1370         pxor    xmm0, xmm8
   1371         pxor    xmm1, xmm9
   1372         pxor    xmm2, xmm10
   1373         pxor    xmm3, xmm11
   1374         movdqa  xmm8, xmm5
   1375         psrld   xmm8, 7
   1376         pslld   xmm5, 25
   1377         por     xmm5, xmm8
   1378         movdqa  xmm8, xmm6
   1379         psrld   xmm8, 7
   1380         pslld   xmm6, 25
   1381         por     xmm6, xmm8
   1382         movdqa  xmm8, xmm7
   1383         psrld   xmm8, 7
   1384         pslld   xmm7, 25
   1385         por     xmm7, xmm8
   1386         movdqa  xmm8, xmm4
   1387         psrld   xmm8, 7
   1388         pslld   xmm4, 25
   1389         por     xmm4, xmm8
   1390         pxor    xmm4, xmm12
   1391         pxor    xmm5, xmm13
   1392         pxor    xmm6, xmm14
   1393         pxor    xmm7, xmm15
   1394         mov     eax, r13d
   1395         jne     innerloop4
   1396         movdqa  xmm9, xmm0
   1397         punpckldq xmm0, xmm1
   1398         punpckhdq xmm9, xmm1
   1399         movdqa  xmm11, xmm2
   1400         punpckldq xmm2, xmm3
   1401         punpckhdq xmm11, xmm3
   1402         movdqa  xmm1, xmm0
   1403         punpcklqdq xmm0, xmm2
   1404         punpckhqdq xmm1, xmm2
   1405         movdqa  xmm3, xmm9
   1406         punpcklqdq xmm9, xmm11
   1407         punpckhqdq xmm3, xmm11
   1408         movdqu  xmmword ptr [rbx], xmm0
   1409         movdqu  xmmword ptr [rbx+20H], xmm1
   1410         movdqu  xmmword ptr [rbx+40H], xmm9
   1411         movdqu  xmmword ptr [rbx+60H], xmm3
   1412         movdqa  xmm9, xmm4
   1413         punpckldq xmm4, xmm5
   1414         punpckhdq xmm9, xmm5
   1415         movdqa  xmm11, xmm6
   1416         punpckldq xmm6, xmm7
   1417         punpckhdq xmm11, xmm7
   1418         movdqa  xmm5, xmm4
   1419         punpcklqdq xmm4, xmm6
   1420         punpckhqdq xmm5, xmm6
   1421         movdqa  xmm7, xmm9
   1422         punpcklqdq xmm9, xmm11
   1423         punpckhqdq xmm7, xmm11
   1424         movdqu  xmmword ptr [rbx+10H], xmm4
   1425         movdqu  xmmword ptr [rbx+30H], xmm5
   1426         movdqu  xmmword ptr [rbx+50H], xmm9
   1427         movdqu  xmmword ptr [rbx+70H], xmm7
   1428         movdqa  xmm1, xmmword ptr [rsp+110H]
   1429         movdqa  xmm0, xmm1
   1430         paddd   xmm1, xmmword ptr [rsp+150H]
   1431         movdqa  xmmword ptr [rsp+110H], xmm1
   1432         pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
   1433         pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
   1434         pcmpgtd xmm0, xmm1
   1435         movdqa  xmm1, xmmword ptr [rsp+120H]
   1436         psubd   xmm1, xmm0
   1437         movdqa  xmmword ptr [rsp+120H], xmm1
   1438         add     rbx, 128
   1439         add     rdi, 32
   1440         sub     rsi, 4
   1441         cmp     rsi, 4
   1442         jnc     outerloop4
   1443         test    rsi, rsi
   1444         jne     final3blocks
   1445 unwind:
                ; Common exit path. Reached, among other paths, by fall-through
                ; when no inputs remain (rsi == 0 after the 4-wide loop) and by
                ; `je unwind` in final1block.
                ; Reload xmm6-xmm15 from the aligned stack slots the prologue
                ; stored them in ([rsp+170H]..[rsp+200H]); these registers are
                ; callee-saved under the Microsoft x64 calling convention.
   1446         movdqa  xmm6, xmmword ptr [rsp+170H]
   1447         movdqa  xmm7, xmmword ptr [rsp+180H]
   1448         movdqa  xmm8, xmmword ptr [rsp+190H]
   1449         movdqa  xmm9, xmmword ptr [rsp+1A0H]
   1450         movdqa  xmm10, xmmword ptr [rsp+1B0H]
   1451         movdqa  xmm11, xmmword ptr [rsp+1C0H]
   1452         movdqa  xmm12, xmmword ptr [rsp+1D0H]
   1453         movdqa  xmm13, xmmword ptr [rsp+1E0H]
   1454         movdqa  xmm14, xmmword ptr [rsp+1F0H]
   1455         movdqa  xmm15, xmmword ptr [rsp+200H]
                ; rbp holds the stack pointer captured right after the prologue
                ; pushes, so this drops the 64-byte-aligned scratch area
                ; (undoes `sub rsp, 528` / `and rsp, ...C0H`).
   1456         mov     rsp, rbp
                ; Pop the integer callee-saved registers in the exact reverse
                ; of the prologue's push order (r15, r14, r13, r12, rsi, rdi,
                ; rbx, rbp).
   1457         pop     rbp
   1458         pop     rbx
   1459         pop     rdi
   1460         pop     rsi
   1461         pop     r12
   1462         pop     r13
   1463         pop     r14
   1464         pop     r15
   1465         ret
   1466 ALIGN   16
   1467 final3blocks:
                ; Tail handling when fewer than four inputs remain (rsi is
                ; expected to be 1..3 here). Bit 1 of the count selects the
                ; two-input path that follows; if it is clear, go straight to
                ; the single-input path.
   1468         test    esi, 2H
   1469         je      final1block
                ; Both states start from the same 32 bytes at [rcx]
                ; (rcx = 4th argument, moved there by the prologue; presumably
                ; the key / initial chaining value - confirm against caller).
                ; Input A uses xmm0/xmm1, input B uses xmm8/xmm9.
   1470         movups  xmm0, xmmword ptr [rcx]
   1471         movups  xmm1, xmmword ptr [rcx+10H]
   1472         movaps  xmm8, xmm0
   1473         movaps  xmm9, xmm1
                ; Build a row-3 template for input A:
                ;   { dword [rsp+110H], dword [rsp+120H], BLAKE3_BLOCK_LEN, 0 }
                ; [rsp+110H]/[rsp+120H] appear to be the per-lane low/high
                ; counter dwords (the 4-wide loop advances 110H and carries
                ; into 120H). movd zeroes the upper lanes; lane 3 is filled in
                ; per block by innerloop2. The template is parked at [rsp].
   1474         movd    xmm13, dword ptr [rsp+110H]
   1475         pinsrd  xmm13, dword ptr [rsp+120H], 1
   1476         pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
   1477         movaps  xmmword ptr [rsp], xmm13
                ; Same template for input B, using lane 1 of both vectors
                ; (+4 bytes); parked at [rsp+10H].
   1478         movd    xmm14, dword ptr [rsp+114H]
   1479         pinsrd  xmm14, dword ptr [rsp+124H], 1
   1480         pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
   1481         movaps  xmmword ptr [rsp+10H], xmm14
                ; rdi points at an array of input pointers: r8 = input A,
                ; r9 = input B.
   1482         mov     r8, qword ptr [rdi]
   1483         mov     r9, qword ptr [rdi+8H]
                ; Flags for the first block: the byte stack argument at
                ; [rbp+80H] OR'ed with r13d (r13d is set up earlier in the
                ; procedure; presumably [rbp+80H] is flags_start and r13d the
                ; base flags - confirm).
   1484         movzx   eax, byte ptr [rbp+80H]
   1485         or      eax, r13d
                ; rdx = byte offset into the inputs, starting at 0.
   1486         xor     edx, edx
   1487 innerloop2:
                ; Per-block setup for the two-input path. On entry eax holds
                ; the flags for this block and rdx the offset of the block.
                ; Advance rdx by one 64-byte block and keep the r12d bits in
                ; eax only when the new offset equals r15, i.e. for the last
                ; block (r15 is the end offset, set earlier in the procedure;
                ; r12d presumably carries the end-of-input flags - confirm).
   1488         mov     r14d, eax
   1489         or      eax, r12d
   1490         add     rdx, 64
   1491         cmp     rdx, r15
   1492         cmovne  eax, r14d
                ; Row 2 of both states starts from BLAKE3_IV
                ; (A: xmm2, B: xmm10).
   1493         movaps  xmm2, xmmword ptr [BLAKE3_IV]
   1494         movaps  xmm10, xmm2
                ; Load input A's 64-byte block (rdx already points past it,
                ; hence the negative displacements) and split it into message
                ; vectors. shufps imm 136 gathers the even-indexed dwords of
                ; two 16-byte loads, imm 221 the odd-indexed ones:
                ;   xmm4 = words 0,2,4,6      xmm5 = words 1,3,5,7
   1495         movups  xmm4, xmmword ptr [r8+rdx-40H]
   1496         movups  xmm5, xmmword ptr [r8+rdx-30H]
   1497         movaps  xmm3, xmm4
   1498         shufps  xmm4, xmm5, 136
   1499         shufps  xmm3, xmm5, 221
   1500         movaps  xmm5, xmm3
                ; Second half of the block; pshufd 93H then rotates the lanes
                ; up by one:
                ;   xmm6 = words 14,8,10,12   xmm7 = words 15,9,11,13
   1501         movups  xmm6, xmmword ptr [r8+rdx-20H]
   1502         movups  xmm7, xmmword ptr [r8+rdx-10H]
   1503         movaps  xmm3, xmm6
   1504         shufps  xmm6, xmm7, 136
   1505         pshufd  xmm6, xmm6, 93H
   1506         shufps  xmm3, xmm7, 221
   1507         pshufd  xmm7, xmm3, 93H
                ; Same split for input B's block (pointer r9) into
                ; xmm12..xmm15, using xmm11 as the temporary.
   1508         movups  xmm12, xmmword ptr [r9+rdx-40H]
   1509         movups  xmm13, xmmword ptr [r9+rdx-30H]
   1510         movaps  xmm11, xmm12
   1511         shufps  xmm12, xmm13, 136
   1512         shufps  xmm11, xmm13, 221
   1513         movaps  xmm13, xmm11
   1514         movups  xmm14, xmmword ptr [r9+rdx-20H]
   1515         movups  xmm15, xmmword ptr [r9+rdx-10H]
   1516         movaps  xmm11, xmm14
   1517         shufps  xmm14, xmm15, 136
   1518         pshufd  xmm14, xmm14, 93H
   1519         shufps  xmm11, xmm15, 221
   1520         pshufd  xmm15, xmm11, 93H
                ; Row 3 of each state: reload the templates stored at [rsp]
                ; and [rsp+10H] and put this block's flags (eax) in lane 3.
   1521         movaps  xmm3, xmmword ptr [rsp]
   1522         movaps  xmm11, xmmword ptr [rsp+10H]
   1523         pinsrd  xmm3, eax, 3
   1524         pinsrd  xmm11, eax, 3
                ; al becomes the round counter (7 rounds). The flags in eax
                ; are no longer needed; eax is reloaded from r13d after the
                ; rounds.
   1525         mov     al, 7
   1526 roundloop2:
                ; One round per iteration, applied to two independent states:
                ;   state A: rows xmm0..xmm3,  message vectors xmm4..xmm7
                ;   state B: rows xmm8..xmm11, message vectors xmm12..xmm15
                ; Every step is issued once for A and once for B.
                ;
                ; Column step, first half: row0 += msg0 + row1. xmm4/xmm12 are
                ; spilled to [rsp+20H]/[rsp+30H] because both registers are
                ; reused as scratch below.
   1527         paddd   xmm0, xmm4
   1528         paddd   xmm8, xmm12
   1529         movaps  xmmword ptr [rsp+20H], xmm4
   1530         movaps  xmmword ptr [rsp+30H], xmm12
   1531         paddd   xmm0, xmm1
   1532         paddd   xmm8, xmm9
   1533         pxor    xmm3, xmm0
   1534         pxor    xmm11, xmm8
                ; row3 is byte-shuffled with the ROT16 table (presumably a
                ; 16-bit rotate of each dword); xmm12 keeps the table for the
                ; rest of the round.
   1535         movaps  xmm12, xmmword ptr [ROT16]
   1536         pshufb  xmm3, xmm12
   1537         pshufb  xmm11, xmm12
   1538         paddd   xmm2, xmm3
   1539         paddd   xmm10, xmm11
   1540         pxor    xmm1, xmm2
   1541         pxor    xmm9, xmm10
                ; row1 = rotate-right(row1, 12), built from shift-left 20 /
                ; shift-right 12 with xmm4 as the temporary.
   1542         movdqa  xmm4, xmm1
   1543         pslld   xmm1, 20
   1544         psrld   xmm4, 12
   1545         por     xmm1, xmm4
   1546         movdqa  xmm4, xmm9
   1547         pslld   xmm9, 20
   1548         psrld   xmm4, 12
   1549         por     xmm9, xmm4
                ; Column step, second half: row0 += msg1 + row1. xmm5/xmm13
                ; are spilled to [rsp+40H]/[rsp+50H]; xmm13 then holds the ROT8
                ; table for the rest of the round.
   1550         paddd   xmm0, xmm5
   1551         paddd   xmm8, xmm13
   1552         movaps  xmmword ptr [rsp+40H], xmm5
   1553         movaps  xmmword ptr [rsp+50H], xmm13
   1554         paddd   xmm0, xmm1
   1555         paddd   xmm8, xmm9
   1556         pxor    xmm3, xmm0
   1557         pxor    xmm11, xmm8
   1558         movaps  xmm13, xmmword ptr [ROT8]
   1559         pshufb  xmm3, xmm13
   1560         pshufb  xmm11, xmm13
   1561         paddd   xmm2, xmm3
   1562         paddd   xmm10, xmm11
   1563         pxor    xmm1, xmm2
   1564         pxor    xmm9, xmm10
                ; row1 = rotate-right(row1, 7).
   1565         movdqa  xmm4, xmm1
   1566         pslld   xmm1, 25
   1567         psrld   xmm4, 7
   1568         por     xmm1, xmm4
   1569         movdqa  xmm4, xmm9
   1570         pslld   xmm9, 25
   1571         psrld   xmm4, 7
   1572         por     xmm9, xmm4
                ; Rotate the lanes of rows 0, 3 and 2 (pshufd 93H / 4EH / 39H)
                ; so that the same column-wise code below operates on the
                ; diagonals of the 4x4 state.
   1573         pshufd  xmm0, xmm0, 93H
   1574         pshufd  xmm8, xmm8, 93H
   1575         pshufd  xmm3, xmm3, 4EH
   1576         pshufd  xmm11, xmm11, 4EH
   1577         pshufd  xmm2, xmm2, 39H
   1578         pshufd  xmm10, xmm10, 39H
                ; Diagonal step, first half: row0 += msg2 + row1, then the
                ; ROT16 shuffle and rotate-right 12 as above.
   1579         paddd   xmm0, xmm6
   1580         paddd   xmm8, xmm14
   1581         paddd   xmm0, xmm1
   1582         paddd   xmm8, xmm9
   1583         pxor    xmm3, xmm0
   1584         pxor    xmm11, xmm8
   1585         pshufb  xmm3, xmm12
   1586         pshufb  xmm11, xmm12
   1587         paddd   xmm2, xmm3
   1588         paddd   xmm10, xmm11
   1589         pxor    xmm1, xmm2
   1590         pxor    xmm9, xmm10
   1591         movdqa  xmm4, xmm1
   1592         pslld   xmm1, 20
   1593         psrld   xmm4, 12
   1594         por     xmm1, xmm4
   1595         movdqa  xmm4, xmm9
   1596         pslld   xmm9, 20
   1597         psrld   xmm4, 12
   1598         por     xmm9, xmm4
                ; Diagonal step, second half: row0 += msg3 + row1, then the
                ; ROT8 shuffle and rotate-right 7.
   1599         paddd   xmm0, xmm7
   1600         paddd   xmm8, xmm15
   1601         paddd   xmm0, xmm1
   1602         paddd   xmm8, xmm9
   1603         pxor    xmm3, xmm0
   1604         pxor    xmm11, xmm8
   1605         pshufb  xmm3, xmm13
   1606         pshufb  xmm11, xmm13
   1607         paddd   xmm2, xmm3
   1608         paddd   xmm10, xmm11
   1609         pxor    xmm1, xmm2
   1610         pxor    xmm9, xmm10
   1611         movdqa  xmm4, xmm1
   1612         pslld   xmm1, 25
   1613         psrld   xmm4, 7
   1614         por     xmm1, xmm4
   1615         movdqa  xmm4, xmm9
   1616         pslld   xmm9, 25
   1617         psrld   xmm4, 7
   1618         por     xmm9, xmm4
                ; Undo the lane rotation (39H / 4EH / 93H are the inverses of
                ; 93H / 4EH / 39H) so the rows are column-aligned again.
   1619         pshufd  xmm0, xmm0, 39H
   1620         pshufd  xmm8, xmm8, 39H
   1621         pshufd  xmm3, xmm3, 4EH
   1622         pshufd  xmm11, xmm11, 4EH
   1623         pshufd  xmm2, xmm2, 93H
   1624         pshufd  xmm10, xmm10, 93H
                ; al counts the remaining rounds; after the last one leave
                ; without reordering the message words.
   1625         dec     al
   1626         je      endroundloop2
                ; Reorder state A's message words for the next round. The
                ; inputs are the spilled vectors at [rsp+20H]/[rsp+40H] plus
                ; the still-live xmm6/xmm7. Results: xmm4 and xmm7 are final;
                ; the new second and third vectors are built in xmm13/xmm12 and
                ; parked at [rsp+20H]/[rsp+40H] until B has been processed.
   1627         movdqa  xmm12, xmmword ptr [rsp+20H]
   1628         movdqa  xmm5, xmmword ptr [rsp+40H]
   1629         pshufd  xmm13, xmm12, 0FH
   1630         shufps  xmm12, xmm5, 214
   1631         pshufd  xmm4, xmm12, 39H
   1632         movdqa  xmm12, xmm6
   1633         shufps  xmm12, xmm7, 250
   1634         pblendw xmm13, xmm12, 0CCH
   1635         movdqa  xmm12, xmm7
   1636         punpcklqdq xmm12, xmm5
   1637         pblendw xmm12, xmm6, 0C0H
   1638         pshufd  xmm12, xmm12, 78H
   1639         punpckhdq xmm5, xmm7
   1640         punpckldq xmm6, xmm5
   1641         pshufd  xmm7, xmm6, 1EH
   1642         movdqa  xmmword ptr [rsp+20H], xmm13
   1643         movdqa  xmmword ptr [rsp+40H], xmm12
                ; Same reordering for state B, from the spills at
                ; [rsp+30H]/[rsp+50H] plus xmm14/xmm15, using xmm5/xmm6 as
                ; scratch. Results: xmm12 and xmm15 are final; the new second
                ; and third vectors are built in xmm6/xmm5.
   1644         movdqa  xmm5, xmmword ptr [rsp+30H]
   1645         movdqa  xmm13, xmmword ptr [rsp+50H]
   1646         pshufd  xmm6, xmm5, 0FH
   1647         shufps  xmm5, xmm13, 214
   1648         pshufd  xmm12, xmm5, 39H
   1649         movdqa  xmm5, xmm14
   1650         shufps  xmm5, xmm15, 250
   1651         pblendw xmm6, xmm5, 0CCH
   1652         movdqa  xmm5, xmm15
   1653         punpcklqdq xmm5, xmm13
   1654         pblendw xmm5, xmm14, 0C0H
   1655         pshufd  xmm5, xmm5, 78H
   1656         punpckhdq xmm13, xmm15
   1657         punpckldq xmm14, xmm13
   1658         pshufd  xmm15, xmm14, 1EH
                ; Move B's results into xmm13/xmm14 and bring A's parked
                ; vectors back into xmm5/xmm6, restoring the register layout
                ; expected at the top of the loop.
   1659         movdqa  xmm13, xmm6
   1660         movdqa  xmm14, xmm5
   1661         movdqa  xmm5, xmmword ptr [rsp+20H]
   1662         movdqa  xmm6, xmmword ptr [rsp+40H]
   1663         jmp     roundloop2
   1664 endroundloop2:
   1665         pxor    xmm0, xmm2
   1666         pxor    xmm1, xmm3
   1667         pxor    xmm8, xmm10
   1668         pxor    xmm9, xmm11
   1669         mov     eax, r13d
   1670         cmp     rdx, r15
   1671         jne     innerloop2
   1672         movups  xmmword ptr [rbx], xmm0
   1673         movups  xmmword ptr [rbx+10H], xmm1
   1674         movups  xmmword ptr [rbx+20H], xmm8
   1675         movups  xmmword ptr [rbx+30H], xmm9
   1676         movdqa  xmm0, xmmword ptr [rsp+130H]
   1677         movdqa  xmm1, xmmword ptr [rsp+110H]
   1678         movdqa  xmm2, xmmword ptr [rsp+120H]
   1679         movdqu  xmm3, xmmword ptr [rsp+118H]
   1680         movdqu  xmm4, xmmword ptr [rsp+128H]
   1681         blendvps xmm1, xmm3, xmm0
   1682         blendvps xmm2, xmm4, xmm0
   1683         movdqa  xmmword ptr [rsp+110H], xmm1
   1684         movdqa  xmmword ptr [rsp+120H], xmm2
   1685         add     rdi, 16
   1686         add     rbx, 64
   1687         sub     rsi, 2
   1688 final1block:
   1689         test    esi, 1H
   1690         je      unwind
   1691         movups  xmm0, xmmword ptr [rcx]
   1692         movups  xmm1, xmmword ptr [rcx+10H]
   1693         movd    xmm13, dword ptr [rsp+110H]
   1694         pinsrd  xmm13, dword ptr [rsp+120H], 1
   1695         pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
   1696         movaps  xmm14, xmmword ptr [ROT8]
   1697         movaps  xmm15, xmmword ptr [ROT16]
   1698         mov     r8, qword ptr [rdi]
   1699         movzx   eax, byte ptr [rbp+80H]
   1700         or      eax, r13d
   1701         xor     edx, edx
   1702 innerloop1:
   1703         mov     r14d, eax
   1704         or      eax, r12d
   1705         add     rdx, 64
   1706         cmp     rdx, r15
   1707         cmovne  eax, r14d
   1708         movaps  xmm2, xmmword ptr [BLAKE3_IV]
   1709         movaps  xmm3, xmm13
   1710         pinsrd  xmm3, eax, 3
   1711         movups  xmm4, xmmword ptr [r8+rdx-40H]
   1712         movups  xmm5, xmmword ptr [r8+rdx-30H]
   1713         movaps  xmm8, xmm4
   1714         shufps  xmm4, xmm5, 136
   1715         shufps  xmm8, xmm5, 221
   1716         movaps  xmm5, xmm8
   1717         movups  xmm6, xmmword ptr [r8+rdx-20H]
   1718         movups  xmm7, xmmword ptr [r8+rdx-10H]
   1719         movaps  xmm8, xmm6
   1720         shufps  xmm6, xmm7, 136
   1721         pshufd  xmm6, xmm6, 93H
   1722         shufps  xmm8, xmm7, 221
   1723         pshufd  xmm7, xmm8, 93H
   1724         mov     al, 7
   1725 roundloop1:
   1726         paddd   xmm0, xmm4
   1727         paddd   xmm0, xmm1
   1728         pxor    xmm3, xmm0
   1729         pshufb  xmm3, xmm15
   1730         paddd   xmm2, xmm3
   1731         pxor    xmm1, xmm2
   1732         movdqa  xmm11, xmm1
   1733         pslld   xmm1, 20
   1734         psrld   xmm11, 12
   1735         por     xmm1, xmm11
   1736         paddd   xmm0, xmm5
   1737         paddd   xmm0, xmm1
   1738         pxor    xmm3, xmm0
   1739         pshufb  xmm3, xmm14
   1740         paddd   xmm2, xmm3
   1741         pxor    xmm1, xmm2
   1742         movdqa  xmm11, xmm1
   1743         pslld   xmm1, 25
   1744         psrld   xmm11, 7
   1745         por     xmm1, xmm11
   1746         pshufd  xmm0, xmm0, 93H
   1747         pshufd  xmm3, xmm3, 4EH
   1748         pshufd  xmm2, xmm2, 39H
   1749         paddd   xmm0, xmm6
   1750         paddd   xmm0, xmm1
   1751         pxor    xmm3, xmm0
   1752         pshufb  xmm3, xmm15
   1753         paddd   xmm2, xmm3
   1754         pxor    xmm1, xmm2
   1755         movdqa  xmm11, xmm1
   1756         pslld   xmm1, 20
   1757         psrld   xmm11, 12
   1758         por     xmm1, xmm11
   1759         paddd   xmm0, xmm7
   1760         paddd   xmm0, xmm1
   1761         pxor    xmm3, xmm0
   1762         pshufb  xmm3, xmm14
   1763         paddd   xmm2, xmm3
   1764         pxor    xmm1, xmm2
   1765         movdqa  xmm11, xmm1
   1766         pslld   xmm1, 25
   1767         psrld   xmm11, 7
   1768         por     xmm1, xmm11
   1769         pshufd  xmm0, xmm0, 39H
   1770         pshufd  xmm3, xmm3, 4EH
   1771         pshufd  xmm2, xmm2, 93H
   1772         dec     al
   1773         jz      endroundloop1
   1774         movdqa  xmm8, xmm4
   1775         shufps  xmm8, xmm5, 214
   1776         pshufd  xmm9, xmm4, 0FH
   1777         pshufd  xmm4, xmm8, 39H
   1778         movdqa  xmm8, xmm6
   1779         shufps  xmm8, xmm7, 250
   1780         pblendw xmm9, xmm8, 0CCH
   1781         movdqa  xmm8, xmm7
   1782         punpcklqdq xmm8, xmm5
   1783         pblendw xmm8, xmm6, 0C0H
   1784         pshufd  xmm8, xmm8, 78H
   1785         punpckhdq xmm5, xmm7
   1786         punpckldq xmm6, xmm5
   1787         pshufd  xmm7, xmm6, 1EH
   1788         movdqa  xmm5, xmm9
   1789         movdqa  xmm6, xmm8
   1790         jmp     roundloop1
   1791 endroundloop1:
   1792         pxor    xmm0, xmm2
   1793         pxor    xmm1, xmm3
   1794         mov     eax, r13d
   1795         cmp     rdx, r15
   1796         jne     innerloop1
   1797         movups  xmmword ptr [rbx], xmm0
   1798         movups  xmmword ptr [rbx+10H], xmm1
   1799         jmp     unwind
   1800 _blake3_hash_many_sse41 ENDP
   1801 blake3_hash_many_sse41 ENDP
   1802 
   1803 blake3_compress_in_place_sse41 PROC
   1804 _blake3_compress_in_place_sse41 PROC
        ;-----------------------------------------------------------------------
        ; blake3_compress_in_place_sse41 (the underscore-prefixed PROC is a second
        ; name for the same entry point)
        ;
        ; Runs 7 rounds of the compression function over one 64-byte block and
        ; overwrites the 32-byte state at rcx with the result.
        ;
        ; ABI:   Microsoft x64 (arguments in rcx, rdx, r8, r9, then stack).
        ; In:    rcx     -> 32 bytes of state; loaded into rows 0-1, rewritten at exit
        ;        rdx     -> 64-byte message block (read with unaligned loads)
        ;        r8b     =  byte stored in lane 2 of row 3
        ;                   (presumably block_len -- confirm against the C prototype)
        ;        r9      =  64-bit value stored in lanes 0-1 of row 3
        ;                   (presumably the block counter -- confirm)
        ;        arg 5   =  stack byte stored in lane 3 of row 3
        ;                   (presumably the flags byte -- confirm)
        ; Out:   [rcx]     = row0 xor row2
        ;        [rcx+10H] = row1 xor row3
        ; Clobb: rax, r8, xmm0-xmm5, flags.
        ; Saved: xmm6-xmm9, xmm11, xmm14, xmm15 (spilled to the local frame, reloaded
        ;        before ret).
        ; Stack: 120 bytes. The movdqa spills need rsp 16-byte aligned after the
        ;        sub, i.e. rsp = 8 (mod 16) on entry, as the ABI provides at a call.
        ; NOTE(review): rsp is moved and nonvolatile xmm registers are spilled, but
        ;        the PROC carries no FRAME/unwind directives -- confirm this is
        ;        acceptable for the intended build.
        ;
        ; Register roles inside the round loop:
        ;        xmm0-xmm3   state rows 0-3       xmm4-xmm7   message words
        ;        xmm8, xmm9  permutation temps    xmm11       rotate temp
        ;        xmm14       ROT8 pshufb mask     xmm15       ROT16 pshufb mask
        ;        al          rounds remaining
        ;-----------------------------------------------------------------------
   1805         sub     rsp, 120                        ; 7 x 16-byte spill slots, +8 to realign rsp
   1806         movdqa  xmmword ptr [rsp], xmm6
   1807         movdqa  xmmword ptr [rsp+10H], xmm7
   1808         movdqa  xmmword ptr [rsp+20H], xmm8
   1809         movdqa  xmmword ptr [rsp+30H], xmm9
   1810         movdqa  xmmword ptr [rsp+40H], xmm11
   1811         movdqa  xmmword ptr [rsp+50H], xmm14
   1812         movdqa  xmmword ptr [rsp+60H], xmm15
                ; Build the 4x4 state: rows 0-1 from [rcx], row 2 from BLAKE3_IV,
                ; row 3 = (r9 low, r9 high, r8b, 5th-argument byte).
   1813         movups  xmm0, xmmword ptr [rcx]         ; row 0 = state words 0-3
   1814         movups  xmm1, xmmword ptr [rcx+10H]     ; row 1 = state words 4-7
   1815         movaps  xmm2, xmmword ptr [BLAKE3_IV]   ; row 2 = IV constants
   1816         movzx   eax, byte ptr [rsp+0A0H]        ; 5th arg: 0A0H = 120 local + 8 return addr + 32 home space
   1817         movzx   r8d, r8b                        ; keep only the low byte of the 3rd argument
   1818         shl     rax, 32
   1819         add     r8, rax                         ; r8 = (arg5 byte << 32) | arg3 byte
   1820         movq    xmm3, r9
   1821         movq    xmm4, r8
   1822         punpcklqdq xmm3, xmm4                   ; row 3 = low qword r9, high qword r8
                ; Load the 16 message dwords m0..m15 and de-interleave them:
                ;   xmm4 = (m0, m2, m4, m6)      xmm5 = (m1, m3, m5, m7)
                ;   xmm6 = (m14, m8, m10, m12)   xmm7 = (m15, m9, m11, m13)
   1823         movups  xmm4, xmmword ptr [rdx]
   1824         movups  xmm5, xmmword ptr [rdx+10H]
   1825         movaps  xmm8, xmm4
   1826         shufps  xmm4, xmm5, 136                 ; 88H: even-indexed dwords of both sources
   1827         shufps  xmm8, xmm5, 221                 ; DDH: odd-indexed dwords of both sources
   1828         movaps  xmm5, xmm8
   1829         movups  xmm6, xmmword ptr [rdx+20H]
   1830         movups  xmm7, xmmword ptr [rdx+30H]
   1831         movaps  xmm8, xmm6
   1832         shufps  xmm6, xmm7, 136
   1833         pshufd  xmm6, xmm6, 93H                 ; rotate lanes: (a,b,c,d) -> (d,a,b,c)
   1834         shufps  xmm8, xmm7, 221
   1835         pshufd  xmm7, xmm8, 93H
   1836         movaps  xmm14, xmmword ptr [ROT8]
   1837         movaps  xmm15, xmmword ptr [ROT16]
   1838         mov     al, 7                           ; round counter (only al is used from here on)
   1839 @@:                                             ; ---- top of one round ----
                ; Column step, first half (message words in xmm4):
                ;   row0 += m + row1; row3 = ror32(row3 ^ row0, 16);
                ;   row2 += row3;     row1 = ror32(row1 ^ row2, 12)
   1840         paddd   xmm0, xmm4
   1841         paddd   xmm0, xmm1
   1842         pxor    xmm3, xmm0
   1843         pshufb  xmm3, xmm15                     ; rotate each dword by 16 via byte shuffle
   1844         paddd   xmm2, xmm3
   1845         pxor    xmm1, xmm2
   1846         movdqa  xmm11, xmm1
   1847         pslld   xmm1, 20
   1848         psrld   xmm11, 12
   1849         por     xmm1, xmm11                     ; (x << 20) | (x >> 12) = rotate right 12
                ; Column step, second half (message words in xmm5): same shape with
                ; rotations of 8 and 7.
   1850         paddd   xmm0, xmm5
   1851         paddd   xmm0, xmm1
   1852         pxor    xmm3, xmm0
   1853         pshufb  xmm3, xmm14                     ; rotate each dword right by 8 via byte shuffle
   1854         paddd   xmm2, xmm3
   1855         pxor    xmm1, xmm2
   1856         movdqa  xmm11, xmm1
   1857         pslld   xmm1, 25
   1858         psrld   xmm11, 7
   1859         por     xmm1, xmm11                     ; (x << 25) | (x >> 7) = rotate right 7
                ; Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so the next
                ; vertical pass operates on the diagonals of the state.
   1860         pshufd  xmm0, xmm0, 93H
   1861         pshufd  xmm3, xmm3, 4EH
   1862         pshufd  xmm2, xmm2, 39H
                ; Diagonal step, first half (message words in xmm6).
   1863         paddd   xmm0, xmm6
   1864         paddd   xmm0, xmm1
   1865         pxor    xmm3, xmm0
   1866         pshufb  xmm3, xmm15
   1867         paddd   xmm2, xmm3
   1868         pxor    xmm1, xmm2
   1869         movdqa  xmm11, xmm1
   1870         pslld   xmm1, 20
   1871         psrld   xmm11, 12
   1872         por     xmm1, xmm11
                ; Diagonal step, second half (message words in xmm7).
   1873         paddd   xmm0, xmm7
   1874         paddd   xmm0, xmm1
   1875         pxor    xmm3, xmm0
   1876         pshufb  xmm3, xmm14
   1877         paddd   xmm2, xmm3
   1878         pxor    xmm1, xmm2
   1879         movdqa  xmm11, xmm1
   1880         pslld   xmm1, 25
   1881         psrld   xmm11, 7
   1882         por     xmm1, xmm11
                ; Undo the lane rotation (39H inverts 93H, 4EH is its own inverse).
   1883         pshufd  xmm0, xmm0, 39H
   1884         pshufd  xmm3, xmm3, 4EH
   1885         pshufd  xmm2, xmm2, 93H
   1886         dec     al
   1887         jz      @F                              ; all 7 rounds done: skip the reshuffle
                ; Not the last round: reorder the message words held in xmm4-xmm7
                ; for the next round, using xmm8/xmm9 as temporaries. The result is
                ; kept in the same four-register layout the round body consumes.
   1888         movdqa  xmm8, xmm4
   1889         shufps  xmm8, xmm5, 214
   1890         pshufd  xmm9, xmm4, 0FH
   1891         pshufd  xmm4, xmm8, 39H                 ; new xmm4
   1892         movdqa  xmm8, xmm6
   1893         shufps  xmm8, xmm7, 250
   1894         pblendw xmm9, xmm8, 0CCH                ; xmm9 = future xmm5
   1895         movdqa  xmm8, xmm7
   1896         punpcklqdq xmm8, xmm5
   1897         pblendw xmm8, xmm6, 0C0H
   1898         pshufd  xmm8, xmm8, 78H                 ; xmm8 = future xmm6
   1899         punpckhdq xmm5, xmm7
   1900         punpckldq xmm6, xmm5
   1901         pshufd  xmm7, xmm6, 1EH                 ; new xmm7
   1902         movdqa  xmm5, xmm9
   1903         movdqa  xmm6, xmm8
   1904         jmp     @B
   1905 @@:                                             ; ---- rounds finished ----
                ; Fold the lower half of the state into the upper half and store it
                ; back over the input state.
   1906         pxor    xmm0, xmm2
   1907         pxor    xmm1, xmm3
   1908         movups  xmmword ptr [rcx], xmm0
   1909         movups  xmmword ptr [rcx+10H], xmm1
                ; Reload the nonvolatile xmm registers spilled in the prologue.
   1910         movdqa  xmm6, xmmword ptr [rsp]
   1911         movdqa  xmm7, xmmword ptr [rsp+10H]
   1912         movdqa  xmm8, xmmword ptr [rsp+20H]
   1913         movdqa  xmm9, xmmword ptr [rsp+30H]
   1914         movdqa  xmm11, xmmword ptr [rsp+40H]
   1915         movdqa  xmm14, xmmword ptr [rsp+50H]
   1916         movdqa  xmm15, xmmword ptr [rsp+60H]
   1917         add     rsp, 120
   1918         ret
   1919 _blake3_compress_in_place_sse41 ENDP
   1920 blake3_compress_in_place_sse41 ENDP
   1921 
   1922 ALIGN 16                                        ; start the entry point on a 16-byte boundary
   1923 blake3_compress_xof_sse41 PROC
   1924 _blake3_compress_xof_sse41 PROC
        ;-----------------------------------------------------------------------
        ; blake3_compress_xof_sse41 (the underscore-prefixed PROC is a second
        ; name for the same entry point)
        ;
        ; Runs 7 rounds of the compression function over one 64-byte block and
        ; writes a 64-byte result to a separate output buffer. The state at rcx
        ; is only read, never written, by this routine.
        ;
        ; ABI:   Microsoft x64 (arguments in rcx, rdx, r8, r9, then stack).
        ; In:    rcx     -> 32 bytes of input state (rows 0-1; re-read at the end)
        ;        rdx     -> 64-byte message block (read with unaligned loads)
        ;        r8b     =  byte stored in lane 2 of row 3
        ;                   (presumably block_len -- confirm against the C prototype)
        ;        r9      =  64-bit value stored in lanes 0-1 of row 3
        ;                   (presumably the block counter -- confirm)
        ;        arg 5   =  stack byte stored in lane 3 of row 3
        ;                   (presumably the flags byte -- confirm)
        ;        arg 6   =  stack qword: pointer to the 64-byte output buffer
        ; Out:   [out]     = row0 xor row2
        ;        [out+10H] = row1 xor row3
        ;        [out+20H] = row2 xor input state words 0-3
        ;        [out+30H] = row3 xor input state words 4-7
        ; Clobb: rax, r8, r10, xmm0-xmm5, flags.
        ; Saved: xmm6-xmm9, xmm11, xmm14, xmm15 (spilled to the local frame, reloaded
        ;        before ret).
        ; Stack: 120 bytes. The movdqa spills need rsp 16-byte aligned after the
        ;        sub, i.e. rsp = 8 (mod 16) on entry, as the ABI provides at a call.
        ; NOTE(review): rsp is moved and nonvolatile xmm registers are spilled, but
        ;        the PROC carries no FRAME/unwind directives -- confirm this is
        ;        acceptable for the intended build.
        ;
        ; Register roles inside the round loop:
        ;        xmm0-xmm3   state rows 0-3       xmm4-xmm7   message words
        ;        xmm8, xmm9  permutation temps    xmm11       rotate temp
        ;        xmm14       ROT8 pshufb mask     xmm15       ROT16 pshufb mask
        ;        al          rounds remaining     r10         output pointer
        ;-----------------------------------------------------------------------
   1925         sub     rsp, 120                        ; 7 x 16-byte spill slots, +8 to realign rsp
   1926         movdqa  xmmword ptr [rsp], xmm6
   1927         movdqa  xmmword ptr [rsp+10H], xmm7
   1928         movdqa  xmmword ptr [rsp+20H], xmm8
   1929         movdqa  xmmword ptr [rsp+30H], xmm9
   1930         movdqa  xmmword ptr [rsp+40H], xmm11
   1931         movdqa  xmmword ptr [rsp+50H], xmm14
   1932         movdqa  xmmword ptr [rsp+60H], xmm15
                ; Build the 4x4 state: rows 0-1 from [rcx], row 2 from BLAKE3_IV,
                ; row 3 = (r9 low, r9 high, r8b, 5th-argument byte).
   1933         movups  xmm0, xmmword ptr [rcx]         ; row 0 = state words 0-3
   1934         movups  xmm1, xmmword ptr [rcx+10H]     ; row 1 = state words 4-7
   1935         movaps  xmm2, xmmword ptr [BLAKE3_IV]   ; row 2 = IV constants
   1936         movzx   eax, byte ptr [rsp+0A0H]        ; 5th arg: 0A0H = 120 local + 8 return addr + 32 home space
   1937         movzx   r8d, r8b                        ; keep only the low byte of the 3rd argument
   1938         mov     r10, qword ptr [rsp+0A8H]       ; 6th arg (next stack slot): output pointer
   1939         shl     rax, 32
   1940         add     r8, rax                         ; r8 = (arg5 byte << 32) | arg3 byte
   1941         movq    xmm3, r9
   1942         movq    xmm4, r8
   1943         punpcklqdq xmm3, xmm4                   ; row 3 = low qword r9, high qword r8
                ; Load the 16 message dwords m0..m15 and de-interleave them:
                ;   xmm4 = (m0, m2, m4, m6)      xmm5 = (m1, m3, m5, m7)
                ;   xmm6 = (m14, m8, m10, m12)   xmm7 = (m15, m9, m11, m13)
   1944         movups  xmm4, xmmword ptr [rdx]
   1945         movups  xmm5, xmmword ptr [rdx+10H]
   1946         movaps  xmm8, xmm4
   1947         shufps  xmm4, xmm5, 136                 ; 88H: even-indexed dwords of both sources
   1948         shufps  xmm8, xmm5, 221                 ; DDH: odd-indexed dwords of both sources
   1949         movaps  xmm5, xmm8
   1950         movups  xmm6, xmmword ptr [rdx+20H]
   1951         movups  xmm7, xmmword ptr [rdx+30H]
   1952         movaps  xmm8, xmm6
   1953         shufps  xmm6, xmm7, 136
   1954         pshufd  xmm6, xmm6, 93H                 ; rotate lanes: (a,b,c,d) -> (d,a,b,c)
   1955         shufps  xmm8, xmm7, 221
   1956         pshufd  xmm7, xmm8, 93H
   1957         movaps  xmm14, xmmword ptr [ROT8]
   1958         movaps  xmm15, xmmword ptr [ROT16]
   1959         mov     al, 7                           ; round counter (only al is used from here on)
   1960 @@:                                             ; ---- top of one round ----
                ; Column step, first half (message words in xmm4):
                ;   row0 += m + row1; row3 = ror32(row3 ^ row0, 16);
                ;   row2 += row3;     row1 = ror32(row1 ^ row2, 12)
   1961         paddd   xmm0, xmm4
   1962         paddd   xmm0, xmm1
   1963         pxor    xmm3, xmm0
   1964         pshufb  xmm3, xmm15                     ; rotate each dword by 16 via byte shuffle
   1965         paddd   xmm2, xmm3
   1966         pxor    xmm1, xmm2
   1967         movdqa  xmm11, xmm1
   1968         pslld   xmm1, 20
   1969         psrld   xmm11, 12
   1970         por     xmm1, xmm11                     ; (x << 20) | (x >> 12) = rotate right 12
                ; Column step, second half (message words in xmm5): same shape with
                ; rotations of 8 and 7.
   1971         paddd   xmm0, xmm5
   1972         paddd   xmm0, xmm1
   1973         pxor    xmm3, xmm0
   1974         pshufb  xmm3, xmm14                     ; rotate each dword right by 8 via byte shuffle
   1975         paddd   xmm2, xmm3
   1976         pxor    xmm1, xmm2
   1977         movdqa  xmm11, xmm1
   1978         pslld   xmm1, 25
   1979         psrld   xmm11, 7
   1980         por     xmm1, xmm11                     ; (x << 25) | (x >> 7) = rotate right 7
                ; Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so the next
                ; vertical pass operates on the diagonals of the state.
   1981         pshufd  xmm0, xmm0, 93H
   1982         pshufd  xmm3, xmm3, 4EH
   1983         pshufd  xmm2, xmm2, 39H
                ; Diagonal step, first half (message words in xmm6).
   1984         paddd   xmm0, xmm6
   1985         paddd   xmm0, xmm1
   1986         pxor    xmm3, xmm0
   1987         pshufb  xmm3, xmm15
   1988         paddd   xmm2, xmm3
   1989         pxor    xmm1, xmm2
   1990         movdqa  xmm11, xmm1
   1991         pslld   xmm1, 20
   1992         psrld   xmm11, 12
   1993         por     xmm1, xmm11
                ; Diagonal step, second half (message words in xmm7).
   1994         paddd   xmm0, xmm7
   1995         paddd   xmm0, xmm1
   1996         pxor    xmm3, xmm0
   1997         pshufb  xmm3, xmm14
   1998         paddd   xmm2, xmm3
   1999         pxor    xmm1, xmm2
   2000         movdqa  xmm11, xmm1
   2001         pslld   xmm1, 25
   2002         psrld   xmm11, 7
   2003         por     xmm1, xmm11
                ; Undo the lane rotation (39H inverts 93H, 4EH is its own inverse).
   2004         pshufd  xmm0, xmm0, 39H
   2005         pshufd  xmm3, xmm3, 4EH
   2006         pshufd  xmm2, xmm2, 93H
   2007         dec     al
   2008         jz      @F                              ; all 7 rounds done: skip the reshuffle
                ; Not the last round: reorder the message words held in xmm4-xmm7
                ; for the next round, using xmm8/xmm9 as temporaries. The result is
                ; kept in the same four-register layout the round body consumes.
   2009         movdqa  xmm8, xmm4
   2010         shufps  xmm8, xmm5, 214
   2011         pshufd  xmm9, xmm4, 0FH
   2012         pshufd  xmm4, xmm8, 39H                 ; new xmm4
   2013         movdqa  xmm8, xmm6
   2014         shufps  xmm8, xmm7, 250
   2015         pblendw xmm9, xmm8, 0CCH                ; xmm9 = future xmm5
   2016         movdqa  xmm8, xmm7
   2017         punpcklqdq xmm8, xmm5
   2018         pblendw xmm8, xmm6, 0C0H
   2019         pshufd  xmm8, xmm8, 78H                 ; xmm8 = future xmm6
   2020         punpckhdq xmm5, xmm7
   2021         punpckldq xmm6, xmm5
   2022         pshufd  xmm7, xmm6, 1EH                 ; new xmm7
   2023         movdqa  xmm5, xmm9
   2024         movdqa  xmm6, xmm8
   2025         jmp     @B
   2026 @@:                                             ; ---- rounds finished ----
                ; The message registers are dead now; reuse xmm4/xmm5 to re-read the
                ; input state. These loads come before any store to [r10].
   2027         movdqu  xmm4, xmmword ptr [rcx]
   2028         movdqu  xmm5, xmmword ptr [rcx+10H]
   2029         pxor    xmm0, xmm2                      ; output words 0-3   = row0 ^ row2
   2030         pxor    xmm1, xmm3                      ; output words 4-7   = row1 ^ row3
   2031         pxor    xmm2, xmm4                      ; output words 8-11  = row2 ^ input words 0-3
   2032         pxor    xmm3, xmm5                      ; output words 12-15 = row3 ^ input words 4-7
   2033         movups  xmmword ptr [r10], xmm0
   2034         movups  xmmword ptr [r10+10H], xmm1
   2035         movups  xmmword ptr [r10+20H], xmm2
   2036         movups  xmmword ptr [r10+30H], xmm3
                ; Reload the nonvolatile xmm registers spilled in the prologue.
   2037         movdqa  xmm6, xmmword ptr [rsp]
   2038         movdqa  xmm7, xmmword ptr [rsp+10H]
   2039         movdqa  xmm8, xmmword ptr [rsp+20H]
   2040         movdqa  xmm9, xmmword ptr [rsp+30H]
   2041         movdqa  xmm11, xmmword ptr [rsp+40H]
   2042         movdqa  xmm14, xmmword ptr [rsp+50H]
   2043         movdqa  xmm15, xmmword ptr [rsp+60H]
   2044         add     rsp, 120
   2045         ret
   2046 _blake3_compress_xof_sse41 ENDP
   2047 blake3_compress_xof_sse41 ENDP
   2048 
   2049 _TEXT ENDS
   2050 
   2051 
   2052 _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
        ; Read-only constant tables for the SSE4.1 routines in this file.
        ; The first table is 64-byte aligned and every table is a multiple of
        ; 16 bytes long, so each label below stays 16-byte aligned. The code
        ; relies on that: it reads these tables with movaps and as direct
        ; memory operands of SSE instructions. Keep sizes and order intact
        ; when editing.
   2053 ALIGN   64
   2054 BLAKE3_IV:
        ; Four IV dwords loaded as state row 2 by the single-block paths.
   2055         dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
   2056 
   2057 ADD0:
        ; Per-lane values 0..3; and-ed with a broadcast mask in the
        ; blake3_hash_many_sse41 prologue (presumably per-lane counter
        ; offsets -- confirm against that routine).
   2058         dd 0, 1, 2, 3
   2059 
   2060 ADD1:
        ; The value 4 in every lane; masked the same way as ADD0 in the
        ; blake3_hash_many_sse41 prologue (presumably the counter stride for
        ; a 4-wide batch -- confirm).
   2061         dd 4 dup (4)
   2062 
   2063 BLAKE3_IV_0:
        ; BLAKE3_IV_0..BLAKE3_IV_3: each dword of BLAKE3_IV replicated across
        ; all four lanes. Their users are outside the part of the file
        ; reviewed here (presumably the multi-input path -- confirm).
   2064         dd 4 dup (6A09E667H)
   2065 
   2066 BLAKE3_IV_1:
   2067         dd 4 dup (0BB67AE85H)
   2068 
   2069 BLAKE3_IV_2:
   2070         dd 4 dup (3C6EF372H)
   2071 
   2072 BLAKE3_IV_3:
   2073         dd 4 dup (0A54FF53AH)
   2074 
   2075 BLAKE3_BLOCK_LEN:
        ; The value 64 in every lane. The single-input tail of
        ; blake3_hash_many_sse41 reads just the first dword of this table.
   2076         dd 4 dup (64)
   2077 
   2078 ROT16:
        ; pshufb control: swaps the two 16-bit halves of every dword, i.e.
        ; rotates each 32-bit lane by 16 bits.
   2079         db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
   2080 
   2081 ROT8:
        ; pshufb control: result byte i takes source byte (i+1) mod 4 within
        ; each dword, i.e. rotates each 32-bit lane right by 8 bits.
   2082         db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
   2083 
   2084 CMP_MSB_MASK:
        ; Eight dwords (32 bytes) with only the sign bit set. Its users are
        ; outside the part of the file reviewed here (presumably used to turn
        ; a signed dword compare into an unsigned one -- confirm).
   2085         dd 8 dup(80000000H)
   2086 
   2087 _RDATA ENDS
   2088 END
   2089