chibipub

experimental activitypub node in C
git clone git://jb55.com/chibipub
Log | Files | Refs | README | LICENSE

blake3_sse2_x86-64_windows_msvc.asm (70982B)


      1 public _blake3_hash_many_sse2           ; exported; underscore-prefixed name for the same entry point as blake3_hash_many_sse2 (both PROC labels sit back to back below)
      2 public blake3_hash_many_sse2            ; exported; entry point of the hash_many routine defined directly below
      3 public blake3_compress_in_place_sse2    ; exported; definition is not visible in this excerpt
      4 public _blake3_compress_in_place_sse2   ; exported; underscore-prefixed counterpart -- presumably an alias of the line above, TODO confirm at its PROC
      5 public blake3_compress_xof_sse2         ; exported; definition is not visible in this excerpt
      6 public _blake3_compress_xof_sse2        ; exported; underscore-prefixed counterpart -- presumably an alias of the line above, TODO confirm at its PROC
      7 
      8 _TEXT   SEGMENT ALIGN(16) 'CODE'
      9 
     10 ALIGN   16
     11 blake3_hash_many_sse2 PROC
     12 _blake3_hash_many_sse2 PROC
     13         push    r15
     14         push    r14
     15         push    r13
     16         push    r12
     17         push    rsi
     18         push    rdi
     19         push    rbx
     20         push    rbp
     21         mov     rbp, rsp
     22         sub     rsp, 528
     23         and     rsp, 0FFFFFFFFFFFFFFC0H
     24         movdqa  xmmword ptr [rsp+170H], xmm6
     25         movdqa  xmmword ptr [rsp+180H], xmm7
     26         movdqa  xmmword ptr [rsp+190H], xmm8
     27         movdqa  xmmword ptr [rsp+1A0H], xmm9
     28         movdqa  xmmword ptr [rsp+1B0H], xmm10
     29         movdqa  xmmword ptr [rsp+1C0H], xmm11
     30         movdqa  xmmword ptr [rsp+1D0H], xmm12
     31         movdqa  xmmword ptr [rsp+1E0H], xmm13
     32         movdqa  xmmword ptr [rsp+1F0H], xmm14
     33         movdqa  xmmword ptr [rsp+200H], xmm15
     34         mov     rdi, rcx
     35         mov     rsi, rdx
     36         mov     rdx, r8
     37         mov     rcx, r9
     38         mov     r8, qword ptr [rbp+68H]
     39         movzx   r9, byte ptr [rbp+70H]
     40         neg     r9d
     41         movd    xmm0, r9d
     42         pshufd  xmm0, xmm0, 00H
     43         movdqa  xmmword ptr [rsp+130H], xmm0
     44         movdqa  xmm1, xmm0
     45         pand    xmm1, xmmword ptr [ADD0]
     46         pand    xmm0, xmmword ptr [ADD1]
     47         movdqa  xmmword ptr [rsp+150H], xmm0
     48         movd    xmm0, r8d
     49         pshufd  xmm0, xmm0, 00H
     50         paddd   xmm0, xmm1
     51         movdqa  xmmword ptr [rsp+110H], xmm0
     52         pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
     53         pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
     54         pcmpgtd xmm1, xmm0
     55         shr     r8, 32
     56         movd    xmm2, r8d
     57         pshufd  xmm2, xmm2, 00H
     58         psubd   xmm2, xmm1
     59         movdqa  xmmword ptr [rsp+120H], xmm2
     60         mov     rbx, qword ptr [rbp+90H]
     61         mov     r15, rdx
     62         shl     r15, 6
     63         movzx   r13d, byte ptr [rbp+78H]
     64         movzx   r12d, byte ptr [rbp+88H]
     65         cmp     rsi, 4
     66         jc      final3blocks
     67 outerloop4:
     68         movdqu  xmm3, xmmword ptr [rcx]
     69         pshufd  xmm0, xmm3, 00H
     70         pshufd  xmm1, xmm3, 55H
     71         pshufd  xmm2, xmm3, 0AAH
     72         pshufd  xmm3, xmm3, 0FFH
     73         movdqu  xmm7, xmmword ptr [rcx+10H]
     74         pshufd  xmm4, xmm7, 00H
     75         pshufd  xmm5, xmm7, 55H
     76         pshufd  xmm6, xmm7, 0AAH
     77         pshufd  xmm7, xmm7, 0FFH
     78         mov     r8, qword ptr [rdi]
     79         mov     r9, qword ptr [rdi+8H]
     80         mov     r10, qword ptr [rdi+10H]
     81         mov     r11, qword ptr [rdi+18H]
     82         movzx   eax, byte ptr [rbp+80H]
     83         or      eax, r13d
     84         xor     edx, edx
     85 innerloop4:
     86         mov     r14d, eax
     87         or      eax, r12d
     88         add     rdx, 64
     89         cmp     rdx, r15
     90         cmovne  eax, r14d
     91         movdqu  xmm8, xmmword ptr [r8+rdx-40H]
     92         movdqu  xmm9, xmmword ptr [r9+rdx-40H]
     93         movdqu  xmm10, xmmword ptr [r10+rdx-40H]
     94         movdqu  xmm11, xmmword ptr [r11+rdx-40H]
     95         movdqa  xmm12, xmm8
     96         punpckldq xmm8, xmm9
     97         punpckhdq xmm12, xmm9
     98         movdqa  xmm14, xmm10
     99         punpckldq xmm10, xmm11
    100         punpckhdq xmm14, xmm11
    101         movdqa  xmm9, xmm8
    102         punpcklqdq xmm8, xmm10
    103         punpckhqdq xmm9, xmm10
    104         movdqa  xmm13, xmm12
    105         punpcklqdq xmm12, xmm14
    106         punpckhqdq xmm13, xmm14
    107         movdqa  xmmword ptr [rsp], xmm8
    108         movdqa  xmmword ptr [rsp+10H], xmm9
    109         movdqa  xmmword ptr [rsp+20H], xmm12
    110         movdqa  xmmword ptr [rsp+30H], xmm13
    111         movdqu  xmm8, xmmword ptr [r8+rdx-30H]
    112         movdqu  xmm9, xmmword ptr [r9+rdx-30H]
    113         movdqu  xmm10, xmmword ptr [r10+rdx-30H]
    114         movdqu  xmm11, xmmword ptr [r11+rdx-30H]
    115         movdqa  xmm12, xmm8
    116         punpckldq xmm8, xmm9
    117         punpckhdq xmm12, xmm9
    118         movdqa  xmm14, xmm10
    119         punpckldq xmm10, xmm11
    120         punpckhdq xmm14, xmm11
    121         movdqa  xmm9, xmm8
    122         punpcklqdq xmm8, xmm10
    123         punpckhqdq xmm9, xmm10
    124         movdqa  xmm13, xmm12
    125         punpcklqdq xmm12, xmm14
    126         punpckhqdq xmm13, xmm14
    127         movdqa  xmmword ptr [rsp+40H], xmm8
    128         movdqa  xmmword ptr [rsp+50H], xmm9
    129         movdqa  xmmword ptr [rsp+60H], xmm12
    130         movdqa  xmmword ptr [rsp+70H], xmm13
    131         movdqu  xmm8, xmmword ptr [r8+rdx-20H]
    132         movdqu  xmm9, xmmword ptr [r9+rdx-20H]
    133         movdqu  xmm10, xmmword ptr [r10+rdx-20H]
    134         movdqu  xmm11, xmmword ptr [r11+rdx-20H]
    135         movdqa  xmm12, xmm8
    136         punpckldq xmm8, xmm9
    137         punpckhdq xmm12, xmm9
    138         movdqa  xmm14, xmm10
    139         punpckldq xmm10, xmm11
    140         punpckhdq xmm14, xmm11
    141         movdqa  xmm9, xmm8
    142         punpcklqdq xmm8, xmm10
    143         punpckhqdq xmm9, xmm10
    144         movdqa  xmm13, xmm12
    145         punpcklqdq xmm12, xmm14
    146         punpckhqdq xmm13, xmm14
    147         movdqa  xmmword ptr [rsp+80H], xmm8
    148         movdqa  xmmword ptr [rsp+90H], xmm9
    149         movdqa  xmmword ptr [rsp+0A0H], xmm12
    150         movdqa  xmmword ptr [rsp+0B0H], xmm13
    151         movdqu  xmm8, xmmword ptr [r8+rdx-10H]
    152         movdqu  xmm9, xmmword ptr [r9+rdx-10H]
    153         movdqu  xmm10, xmmword ptr [r10+rdx-10H]
    154         movdqu  xmm11, xmmword ptr [r11+rdx-10H]
    155         movdqa  xmm12, xmm8
    156         punpckldq xmm8, xmm9
    157         punpckhdq xmm12, xmm9
    158         movdqa  xmm14, xmm10
    159         punpckldq xmm10, xmm11
    160         punpckhdq xmm14, xmm11
    161         movdqa  xmm9, xmm8
    162         punpcklqdq xmm8, xmm10
    163         punpckhqdq xmm9, xmm10
    164         movdqa  xmm13, xmm12
    165         punpcklqdq xmm12, xmm14
    166         punpckhqdq xmm13, xmm14
    167         movdqa  xmmword ptr [rsp+0C0H], xmm8
    168         movdqa  xmmword ptr [rsp+0D0H], xmm9
    169         movdqa  xmmword ptr [rsp+0E0H], xmm12
    170         movdqa  xmmword ptr [rsp+0F0H], xmm13
    171         movdqa  xmm9, xmmword ptr [BLAKE3_IV_1]
    172         movdqa  xmm10, xmmword ptr [BLAKE3_IV_2]
    173         movdqa  xmm11, xmmword ptr [BLAKE3_IV_3]
    174         movdqa  xmm12, xmmword ptr [rsp+110H]
    175         movdqa  xmm13, xmmword ptr [rsp+120H]
    176         movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
    177         movd    xmm15, eax
    178         pshufd  xmm15, xmm15, 00H
    179         prefetcht0 byte ptr [r8+rdx+80H]
    180         prefetcht0 byte ptr [r9+rdx+80H]
    181         prefetcht0 byte ptr [r10+rdx+80H]
    182         prefetcht0 byte ptr [r11+rdx+80H]
    183         paddd   xmm0, xmmword ptr [rsp]
    184         paddd   xmm1, xmmword ptr [rsp+20H]
    185         paddd   xmm2, xmmword ptr [rsp+40H]
    186         paddd   xmm3, xmmword ptr [rsp+60H]
    187         paddd   xmm0, xmm4
    188         paddd   xmm1, xmm5
    189         paddd   xmm2, xmm6
    190         paddd   xmm3, xmm7
    191         pxor    xmm12, xmm0
    192         pxor    xmm13, xmm1
    193         pxor    xmm14, xmm2
    194         pxor    xmm15, xmm3
    195         pshuflw xmm12, xmm12, 0B1H
    196         pshufhw xmm12, xmm12, 0B1H
    197         pshuflw xmm13, xmm13, 0B1H
    198         pshufhw xmm13, xmm13, 0B1H
    199         pshuflw xmm14, xmm14, 0B1H
    200         pshufhw xmm14, xmm14, 0B1H
    201         pshuflw xmm15, xmm15, 0B1H
    202         pshufhw xmm15, xmm15, 0B1H
    203         movdqa  xmm8, xmmword ptr [BLAKE3_IV_0]
    204         paddd   xmm8, xmm12
    205         paddd   xmm9, xmm13
    206         paddd   xmm10, xmm14
    207         paddd   xmm11, xmm15
    208         pxor    xmm4, xmm8
    209         pxor    xmm5, xmm9
    210         pxor    xmm6, xmm10
    211         pxor    xmm7, xmm11
    212         movdqa  xmmword ptr [rsp+100H], xmm8
    213         movdqa  xmm8, xmm4
    214         psrld   xmm8, 12
    215         pslld   xmm4, 20
    216         por     xmm4, xmm8
    217         movdqa  xmm8, xmm5
    218         psrld   xmm8, 12
    219         pslld   xmm5, 20
    220         por     xmm5, xmm8
    221         movdqa  xmm8, xmm6
    222         psrld   xmm8, 12
    223         pslld   xmm6, 20
    224         por     xmm6, xmm8
    225         movdqa  xmm8, xmm7
    226         psrld   xmm8, 12
    227         pslld   xmm7, 20
    228         por     xmm7, xmm8
    229         paddd   xmm0, xmmword ptr [rsp+10H]
    230         paddd   xmm1, xmmword ptr [rsp+30H]
    231         paddd   xmm2, xmmword ptr [rsp+50H]
    232         paddd   xmm3, xmmword ptr [rsp+70H]
    233         paddd   xmm0, xmm4
    234         paddd   xmm1, xmm5
    235         paddd   xmm2, xmm6
    236         paddd   xmm3, xmm7
    237         pxor    xmm12, xmm0
    238         pxor    xmm13, xmm1
    239         pxor    xmm14, xmm2
    240         pxor    xmm15, xmm3
    241         movdqa  xmm8, xmm12
    242         psrld   xmm12, 8
    243         pslld   xmm8, 24
    244         pxor    xmm12, xmm8
    245         movdqa  xmm8, xmm13
    246         psrld   xmm13, 8
    247         pslld   xmm8, 24
    248         pxor    xmm13, xmm8
    249         movdqa  xmm8, xmm14
    250         psrld   xmm14, 8
    251         pslld   xmm8, 24
    252         pxor    xmm14, xmm8
    253         movdqa  xmm8, xmm15
    254         psrld   xmm15, 8
    255         pslld   xmm8, 24
    256         pxor    xmm15, xmm8
    257         movdqa  xmm8, xmmword ptr [rsp+100H]
    258         paddd   xmm8, xmm12
    259         paddd   xmm9, xmm13
    260         paddd   xmm10, xmm14
    261         paddd   xmm11, xmm15
    262         pxor    xmm4, xmm8
    263         pxor    xmm5, xmm9
    264         pxor    xmm6, xmm10
    265         pxor    xmm7, xmm11
    266         movdqa  xmmword ptr [rsp+100H], xmm8
    267         movdqa  xmm8, xmm4
    268         psrld   xmm8, 7
    269         pslld   xmm4, 25
    270         por     xmm4, xmm8
    271         movdqa  xmm8, xmm5
    272         psrld   xmm8, 7
    273         pslld   xmm5, 25
    274         por     xmm5, xmm8
    275         movdqa  xmm8, xmm6
    276         psrld   xmm8, 7
    277         pslld   xmm6, 25
    278         por     xmm6, xmm8
    279         movdqa  xmm8, xmm7
    280         psrld   xmm8, 7
    281         pslld   xmm7, 25
    282         por     xmm7, xmm8
    283         paddd   xmm0, xmmword ptr [rsp+80H]
    284         paddd   xmm1, xmmword ptr [rsp+0A0H]
    285         paddd   xmm2, xmmword ptr [rsp+0C0H]
    286         paddd   xmm3, xmmword ptr [rsp+0E0H]
    287         paddd   xmm0, xmm5
    288         paddd   xmm1, xmm6
    289         paddd   xmm2, xmm7
    290         paddd   xmm3, xmm4
    291         pxor    xmm15, xmm0
    292         pxor    xmm12, xmm1
    293         pxor    xmm13, xmm2
    294         pxor    xmm14, xmm3
    295         pshuflw xmm15, xmm15, 0B1H
    296         pshufhw xmm15, xmm15, 0B1H
    297         pshuflw xmm12, xmm12, 0B1H
    298         pshufhw xmm12, xmm12, 0B1H
    299         pshuflw xmm13, xmm13, 0B1H
    300         pshufhw xmm13, xmm13, 0B1H
    301         pshuflw xmm14, xmm14, 0B1H
    302         pshufhw xmm14, xmm14, 0B1H
    303         paddd   xmm10, xmm15
    304         paddd   xmm11, xmm12
    305         movdqa  xmm8, xmmword ptr [rsp+100H]
    306         paddd   xmm8, xmm13
    307         paddd   xmm9, xmm14
    308         pxor    xmm5, xmm10
    309         pxor    xmm6, xmm11
    310         pxor    xmm7, xmm8
    311         pxor    xmm4, xmm9
    312         movdqa  xmmword ptr [rsp+100H], xmm8
    313         movdqa  xmm8, xmm5
    314         psrld   xmm8, 12
    315         pslld   xmm5, 20
    316         por     xmm5, xmm8
    317         movdqa  xmm8, xmm6
    318         psrld   xmm8, 12
    319         pslld   xmm6, 20
    320         por     xmm6, xmm8
    321         movdqa  xmm8, xmm7
    322         psrld   xmm8, 12
    323         pslld   xmm7, 20
    324         por     xmm7, xmm8
    325         movdqa  xmm8, xmm4
    326         psrld   xmm8, 12
    327         pslld   xmm4, 20
    328         por     xmm4, xmm8
    329         paddd   xmm0, xmmword ptr [rsp+90H]
    330         paddd   xmm1, xmmword ptr [rsp+0B0H]
    331         paddd   xmm2, xmmword ptr [rsp+0D0H]
    332         paddd   xmm3, xmmword ptr [rsp+0F0H]
    333         paddd   xmm0, xmm5
    334         paddd   xmm1, xmm6
    335         paddd   xmm2, xmm7
    336         paddd   xmm3, xmm4
    337         pxor    xmm15, xmm0
    338         pxor    xmm12, xmm1
    339         pxor    xmm13, xmm2
    340         pxor    xmm14, xmm3
    341         movdqa  xmm8, xmm15
    342         psrld   xmm15, 8
    343         pslld   xmm8, 24
    344         pxor    xmm15, xmm8
    345         movdqa  xmm8, xmm12
    346         psrld   xmm12, 8
    347         pslld   xmm8, 24
    348         pxor    xmm12, xmm8
    349         movdqa  xmm8, xmm13
    350         psrld   xmm13, 8
    351         pslld   xmm8, 24
    352         pxor    xmm13, xmm8
    353         movdqa  xmm8, xmm14
    354         psrld   xmm14, 8
    355         pslld   xmm8, 24
    356         pxor    xmm14, xmm8
    357         paddd   xmm10, xmm15
    358         paddd   xmm11, xmm12
    359         movdqa  xmm8, xmmword ptr [rsp+100H]
    360         paddd   xmm8, xmm13
    361         paddd   xmm9, xmm14
    362         pxor    xmm5, xmm10
    363         pxor    xmm6, xmm11
    364         pxor    xmm7, xmm8
    365         pxor    xmm4, xmm9
    366         movdqa  xmmword ptr [rsp+100H], xmm8
    367         movdqa  xmm8, xmm5
    368         psrld   xmm8, 7
    369         pslld   xmm5, 25
    370         por     xmm5, xmm8
    371         movdqa  xmm8, xmm6
    372         psrld   xmm8, 7
    373         pslld   xmm6, 25
    374         por     xmm6, xmm8
    375         movdqa  xmm8, xmm7
    376         psrld   xmm8, 7
    377         pslld   xmm7, 25
    378         por     xmm7, xmm8
    379         movdqa  xmm8, xmm4
    380         psrld   xmm8, 7
    381         pslld   xmm4, 25
    382         por     xmm4, xmm8
    383         paddd   xmm0, xmmword ptr [rsp+20H]
    384         paddd   xmm1, xmmword ptr [rsp+30H]
    385         paddd   xmm2, xmmword ptr [rsp+70H]
    386         paddd   xmm3, xmmword ptr [rsp+40H]
    387         paddd   xmm0, xmm4
    388         paddd   xmm1, xmm5
    389         paddd   xmm2, xmm6
    390         paddd   xmm3, xmm7
    391         pxor    xmm12, xmm0
    392         pxor    xmm13, xmm1
    393         pxor    xmm14, xmm2
    394         pxor    xmm15, xmm3
    395         pshuflw xmm12, xmm12, 0B1H
    396         pshufhw xmm12, xmm12, 0B1H
    397         pshuflw xmm13, xmm13, 0B1H
    398         pshufhw xmm13, xmm13, 0B1H
    399         pshuflw xmm14, xmm14, 0B1H
    400         pshufhw xmm14, xmm14, 0B1H
    401         pshuflw xmm15, xmm15, 0B1H
    402         pshufhw xmm15, xmm15, 0B1H
    403         movdqa  xmm8, xmmword ptr [rsp+100H]
    404         paddd   xmm8, xmm12
    405         paddd   xmm9, xmm13
    406         paddd   xmm10, xmm14
    407         paddd   xmm11, xmm15
    408         pxor    xmm4, xmm8
    409         pxor    xmm5, xmm9
    410         pxor    xmm6, xmm10
    411         pxor    xmm7, xmm11
    412         movdqa  xmmword ptr [rsp+100H], xmm8
    413         movdqa  xmm8, xmm4
    414         psrld   xmm8, 12
    415         pslld   xmm4, 20
    416         por     xmm4, xmm8
    417         movdqa  xmm8, xmm5
    418         psrld   xmm8, 12
    419         pslld   xmm5, 20
    420         por     xmm5, xmm8
    421         movdqa  xmm8, xmm6
    422         psrld   xmm8, 12
    423         pslld   xmm6, 20
    424         por     xmm6, xmm8
    425         movdqa  xmm8, xmm7
    426         psrld   xmm8, 12
    427         pslld   xmm7, 20
    428         por     xmm7, xmm8
    429         paddd   xmm0, xmmword ptr [rsp+60H]
    430         paddd   xmm1, xmmword ptr [rsp+0A0H]
    431         paddd   xmm2, xmmword ptr [rsp]
    432         paddd   xmm3, xmmword ptr [rsp+0D0H]
    433         paddd   xmm0, xmm4
    434         paddd   xmm1, xmm5
    435         paddd   xmm2, xmm6
    436         paddd   xmm3, xmm7
    437         pxor    xmm12, xmm0
    438         pxor    xmm13, xmm1
    439         pxor    xmm14, xmm2
    440         pxor    xmm15, xmm3
    441         movdqa  xmm8, xmm12
    442         psrld   xmm12, 8
    443         pslld   xmm8, 24
    444         pxor    xmm12, xmm8
    445         movdqa  xmm8, xmm13
    446         psrld   xmm13, 8
    447         pslld   xmm8, 24
    448         pxor    xmm13, xmm8
    449         movdqa  xmm8, xmm14
    450         psrld   xmm14, 8
    451         pslld   xmm8, 24
    452         pxor    xmm14, xmm8
    453         movdqa  xmm8, xmm15
    454         psrld   xmm15, 8
    455         pslld   xmm8, 24
    456         pxor    xmm15, xmm8
    457         movdqa  xmm8, xmmword ptr [rsp+100H]
    458         paddd   xmm8, xmm12
    459         paddd   xmm9, xmm13
    460         paddd   xmm10, xmm14
    461         paddd   xmm11, xmm15
    462         pxor    xmm4, xmm8
    463         pxor    xmm5, xmm9
    464         pxor    xmm6, xmm10
    465         pxor    xmm7, xmm11
    466         movdqa  xmmword ptr [rsp+100H], xmm8
    467         movdqa  xmm8, xmm4
    468         psrld   xmm8, 7
    469         pslld   xmm4, 25
    470         por     xmm4, xmm8
    471         movdqa  xmm8, xmm5
    472         psrld   xmm8, 7
    473         pslld   xmm5, 25
    474         por     xmm5, xmm8
    475         movdqa  xmm8, xmm6
    476         psrld   xmm8, 7
    477         pslld   xmm6, 25
    478         por     xmm6, xmm8
    479         movdqa  xmm8, xmm7
    480         psrld   xmm8, 7
    481         pslld   xmm7, 25
    482         por     xmm7, xmm8
    483         paddd   xmm0, xmmword ptr [rsp+10H]
    484         paddd   xmm1, xmmword ptr [rsp+0C0H]
    485         paddd   xmm2, xmmword ptr [rsp+90H]
    486         paddd   xmm3, xmmword ptr [rsp+0F0H]
    487         paddd   xmm0, xmm5
    488         paddd   xmm1, xmm6
    489         paddd   xmm2, xmm7
    490         paddd   xmm3, xmm4
    491         pxor    xmm15, xmm0
    492         pxor    xmm12, xmm1
    493         pxor    xmm13, xmm2
    494         pxor    xmm14, xmm3
    495         pshuflw xmm15, xmm15, 0B1H
    496         pshufhw xmm15, xmm15, 0B1H
    497         pshuflw xmm12, xmm12, 0B1H
    498         pshufhw xmm12, xmm12, 0B1H
    499         pshuflw xmm13, xmm13, 0B1H
    500         pshufhw xmm13, xmm13, 0B1H
    501         pshuflw xmm14, xmm14, 0B1H
    502         pshufhw xmm14, xmm14, 0B1H
    503         paddd   xmm10, xmm15
    504         paddd   xmm11, xmm12
    505         movdqa  xmm8, xmmword ptr [rsp+100H]
    506         paddd   xmm8, xmm13
    507         paddd   xmm9, xmm14
    508         pxor    xmm5, xmm10
    509         pxor    xmm6, xmm11
    510         pxor    xmm7, xmm8
    511         pxor    xmm4, xmm9
    512         movdqa  xmmword ptr [rsp+100H], xmm8
    513         movdqa  xmm8, xmm5
    514         psrld   xmm8, 12
    515         pslld   xmm5, 20
    516         por     xmm5, xmm8
    517         movdqa  xmm8, xmm6
    518         psrld   xmm8, 12
    519         pslld   xmm6, 20
    520         por     xmm6, xmm8
    521         movdqa  xmm8, xmm7
    522         psrld   xmm8, 12
    523         pslld   xmm7, 20
    524         por     xmm7, xmm8
    525         movdqa  xmm8, xmm4
    526         psrld   xmm8, 12
    527         pslld   xmm4, 20
    528         por     xmm4, xmm8
    529         paddd   xmm0, xmmword ptr [rsp+0B0H]
    530         paddd   xmm1, xmmword ptr [rsp+50H]
    531         paddd   xmm2, xmmword ptr [rsp+0E0H]
    532         paddd   xmm3, xmmword ptr [rsp+80H]
    533         paddd   xmm0, xmm5
    534         paddd   xmm1, xmm6
    535         paddd   xmm2, xmm7
    536         paddd   xmm3, xmm4
    537         pxor    xmm15, xmm0
    538         pxor    xmm12, xmm1
    539         pxor    xmm13, xmm2
    540         pxor    xmm14, xmm3
    541         movdqa  xmm8, xmm15
    542         psrld   xmm15, 8
    543         pslld   xmm8, 24
    544         pxor    xmm15, xmm8
    545         movdqa  xmm8, xmm12
    546         psrld   xmm12, 8
    547         pslld   xmm8, 24
    548         pxor    xmm12, xmm8
    549         movdqa  xmm8, xmm13
    550         psrld   xmm13, 8
    551         pslld   xmm8, 24
    552         pxor    xmm13, xmm8
    553         movdqa  xmm8, xmm14
    554         psrld   xmm14, 8
    555         pslld   xmm8, 24
    556         pxor    xmm14, xmm8
    557         paddd   xmm10, xmm15
    558         paddd   xmm11, xmm12
    559         movdqa  xmm8, xmmword ptr [rsp+100H]
    560         paddd   xmm8, xmm13
    561         paddd   xmm9, xmm14
    562         pxor    xmm5, xmm10
    563         pxor    xmm6, xmm11
    564         pxor    xmm7, xmm8
    565         pxor    xmm4, xmm9
    566         movdqa  xmmword ptr [rsp+100H], xmm8
    567         movdqa  xmm8, xmm5
    568         psrld   xmm8, 7
    569         pslld   xmm5, 25
    570         por     xmm5, xmm8
    571         movdqa  xmm8, xmm6
    572         psrld   xmm8, 7
    573         pslld   xmm6, 25
    574         por     xmm6, xmm8
    575         movdqa  xmm8, xmm7
    576         psrld   xmm8, 7
    577         pslld   xmm7, 25
    578         por     xmm7, xmm8
    579         movdqa  xmm8, xmm4
    580         psrld   xmm8, 7
    581         pslld   xmm4, 25
    582         por     xmm4, xmm8
    583         paddd   xmm0, xmmword ptr [rsp+30H]
    584         paddd   xmm1, xmmword ptr [rsp+0A0H]
    585         paddd   xmm2, xmmword ptr [rsp+0D0H]
    586         paddd   xmm3, xmmword ptr [rsp+70H]
    587         paddd   xmm0, xmm4
    588         paddd   xmm1, xmm5
    589         paddd   xmm2, xmm6
    590         paddd   xmm3, xmm7
    591         pxor    xmm12, xmm0
    592         pxor    xmm13, xmm1
    593         pxor    xmm14, xmm2
    594         pxor    xmm15, xmm3
    595         pshuflw xmm12, xmm12, 0B1H
    596         pshufhw xmm12, xmm12, 0B1H
    597         pshuflw xmm13, xmm13, 0B1H
    598         pshufhw xmm13, xmm13, 0B1H
    599         pshuflw xmm14, xmm14, 0B1H
    600         pshufhw xmm14, xmm14, 0B1H
    601         pshuflw xmm15, xmm15, 0B1H
    602         pshufhw xmm15, xmm15, 0B1H
    603         movdqa  xmm8, xmmword ptr [rsp+100H]
    604         paddd   xmm8, xmm12
    605         paddd   xmm9, xmm13
    606         paddd   xmm10, xmm14
    607         paddd   xmm11, xmm15
    608         pxor    xmm4, xmm8
    609         pxor    xmm5, xmm9
    610         pxor    xmm6, xmm10
    611         pxor    xmm7, xmm11
    612         movdqa  xmmword ptr [rsp+100H], xmm8
    613         movdqa  xmm8, xmm4
    614         psrld   xmm8, 12
    615         pslld   xmm4, 20
    616         por     xmm4, xmm8
    617         movdqa  xmm8, xmm5
    618         psrld   xmm8, 12
    619         pslld   xmm5, 20
    620         por     xmm5, xmm8
    621         movdqa  xmm8, xmm6
    622         psrld   xmm8, 12
    623         pslld   xmm6, 20
    624         por     xmm6, xmm8
    625         movdqa  xmm8, xmm7
    626         psrld   xmm8, 12
    627         pslld   xmm7, 20
    628         por     xmm7, xmm8
    629         paddd   xmm0, xmmword ptr [rsp+40H]
    630         paddd   xmm1, xmmword ptr [rsp+0C0H]
    631         paddd   xmm2, xmmword ptr [rsp+20H]
    632         paddd   xmm3, xmmword ptr [rsp+0E0H]
    633         paddd   xmm0, xmm4
    634         paddd   xmm1, xmm5
    635         paddd   xmm2, xmm6
    636         paddd   xmm3, xmm7
    637         pxor    xmm12, xmm0
    638         pxor    xmm13, xmm1
    639         pxor    xmm14, xmm2
    640         pxor    xmm15, xmm3
    641         movdqa  xmm8, xmm12
    642         psrld   xmm12, 8
    643         pslld   xmm8, 24
    644         pxor    xmm12, xmm8
    645         movdqa  xmm8, xmm13
    646         psrld   xmm13, 8
    647         pslld   xmm8, 24
    648         pxor    xmm13, xmm8
    649         movdqa  xmm8, xmm14
    650         psrld   xmm14, 8
    651         pslld   xmm8, 24
    652         pxor    xmm14, xmm8
    653         movdqa  xmm8, xmm15
    654         psrld   xmm15, 8
    655         pslld   xmm8, 24
    656         pxor    xmm15, xmm8
    657         movdqa  xmm8, xmmword ptr [rsp+100H]
    658         paddd   xmm8, xmm12
    659         paddd   xmm9, xmm13
    660         paddd   xmm10, xmm14
    661         paddd   xmm11, xmm15
    662         pxor    xmm4, xmm8
    663         pxor    xmm5, xmm9
    664         pxor    xmm6, xmm10
    665         pxor    xmm7, xmm11
    666         movdqa  xmmword ptr [rsp+100H], xmm8
    667         movdqa  xmm8, xmm4
    668         psrld   xmm8, 7
    669         pslld   xmm4, 25
    670         por     xmm4, xmm8
    671         movdqa  xmm8, xmm5
    672         psrld   xmm8, 7
    673         pslld   xmm5, 25
    674         por     xmm5, xmm8
    675         movdqa  xmm8, xmm6
    676         psrld   xmm8, 7
    677         pslld   xmm6, 25
    678         por     xmm6, xmm8
    679         movdqa  xmm8, xmm7
    680         psrld   xmm8, 7
    681         pslld   xmm7, 25
    682         por     xmm7, xmm8
    683         paddd   xmm0, xmmword ptr [rsp+60H]
    684         paddd   xmm1, xmmword ptr [rsp+90H]
    685         paddd   xmm2, xmmword ptr [rsp+0B0H]
    686         paddd   xmm3, xmmword ptr [rsp+80H]
    687         paddd   xmm0, xmm5
    688         paddd   xmm1, xmm6
    689         paddd   xmm2, xmm7
    690         paddd   xmm3, xmm4
    691         pxor    xmm15, xmm0
    692         pxor    xmm12, xmm1
    693         pxor    xmm13, xmm2
    694         pxor    xmm14, xmm3
    695         pshuflw xmm15, xmm15, 0B1H
    696         pshufhw xmm15, xmm15, 0B1H
    697         pshuflw xmm12, xmm12, 0B1H
    698         pshufhw xmm12, xmm12, 0B1H
    699         pshuflw xmm13, xmm13, 0B1H
    700         pshufhw xmm13, xmm13, 0B1H
    701         pshuflw xmm14, xmm14, 0B1H
    702         pshufhw xmm14, xmm14, 0B1H
    703         paddd   xmm10, xmm15
    704         paddd   xmm11, xmm12
    705         movdqa  xmm8, xmmword ptr [rsp+100H]
    706         paddd   xmm8, xmm13
    707         paddd   xmm9, xmm14
    708         pxor    xmm5, xmm10
    709         pxor    xmm6, xmm11
    710         pxor    xmm7, xmm8
    711         pxor    xmm4, xmm9
    712         movdqa  xmmword ptr [rsp+100H], xmm8
    713         movdqa  xmm8, xmm5
    714         psrld   xmm8, 12
    715         pslld   xmm5, 20
    716         por     xmm5, xmm8
    717         movdqa  xmm8, xmm6
    718         psrld   xmm8, 12
    719         pslld   xmm6, 20
    720         por     xmm6, xmm8
    721         movdqa  xmm8, xmm7
    722         psrld   xmm8, 12
    723         pslld   xmm7, 20
    724         por     xmm7, xmm8
    725         movdqa  xmm8, xmm4
    726         psrld   xmm8, 12
    727         pslld   xmm4, 20
    728         por     xmm4, xmm8
    729         paddd   xmm0, xmmword ptr [rsp+50H]
    730         paddd   xmm1, xmmword ptr [rsp]
    731         paddd   xmm2, xmmword ptr [rsp+0F0H]
    732         paddd   xmm3, xmmword ptr [rsp+10H]
    733         paddd   xmm0, xmm5
    734         paddd   xmm1, xmm6
    735         paddd   xmm2, xmm7
    736         paddd   xmm3, xmm4
    737         pxor    xmm15, xmm0
    738         pxor    xmm12, xmm1
    739         pxor    xmm13, xmm2
    740         pxor    xmm14, xmm3
    741         movdqa  xmm8, xmm15
    742         psrld   xmm15, 8
    743         pslld   xmm8, 24
    744         pxor    xmm15, xmm8
    745         movdqa  xmm8, xmm12
    746         psrld   xmm12, 8
    747         pslld   xmm8, 24
    748         pxor    xmm12, xmm8
    749         movdqa  xmm8, xmm13
    750         psrld   xmm13, 8
    751         pslld   xmm8, 24
    752         pxor    xmm13, xmm8
    753         movdqa  xmm8, xmm14
    754         psrld   xmm14, 8
    755         pslld   xmm8, 24
    756         pxor    xmm14, xmm8
    757         paddd   xmm10, xmm15
    758         paddd   xmm11, xmm12
    759         movdqa  xmm8, xmmword ptr [rsp+100H]
    760         paddd   xmm8, xmm13
    761         paddd   xmm9, xmm14
    762         pxor    xmm5, xmm10
    763         pxor    xmm6, xmm11
    764         pxor    xmm7, xmm8
    765         pxor    xmm4, xmm9
    766         movdqa  xmmword ptr [rsp+100H], xmm8
    767         movdqa  xmm8, xmm5
    768         psrld   xmm8, 7
    769         pslld   xmm5, 25
    770         por     xmm5, xmm8
    771         movdqa  xmm8, xmm6
    772         psrld   xmm8, 7
    773         pslld   xmm6, 25
    774         por     xmm6, xmm8
    775         movdqa  xmm8, xmm7
    776         psrld   xmm8, 7
    777         pslld   xmm7, 25
    778         por     xmm7, xmm8
    779         movdqa  xmm8, xmm4
    780         psrld   xmm8, 7
    781         pslld   xmm4, 25
    782         por     xmm4, xmm8
    783         paddd   xmm0, xmmword ptr [rsp+0A0H]
    784         paddd   xmm1, xmmword ptr [rsp+0C0H]
    785         paddd   xmm2, xmmword ptr [rsp+0E0H]
    786         paddd   xmm3, xmmword ptr [rsp+0D0H]
    787         paddd   xmm0, xmm4
    788         paddd   xmm1, xmm5
    789         paddd   xmm2, xmm6
    790         paddd   xmm3, xmm7
    791         pxor    xmm12, xmm0
    792         pxor    xmm13, xmm1
    793         pxor    xmm14, xmm2
    794         pxor    xmm15, xmm3
    795         pshuflw xmm12, xmm12, 0B1H
    796         pshufhw xmm12, xmm12, 0B1H
    797         pshuflw xmm13, xmm13, 0B1H
    798         pshufhw xmm13, xmm13, 0B1H
    799         pshuflw xmm14, xmm14, 0B1H
    800         pshufhw xmm14, xmm14, 0B1H
    801         pshuflw xmm15, xmm15, 0B1H
    802         pshufhw xmm15, xmm15, 0B1H
    803         movdqa  xmm8, xmmword ptr [rsp+100H]
    804         paddd   xmm8, xmm12
    805         paddd   xmm9, xmm13
    806         paddd   xmm10, xmm14
    807         paddd   xmm11, xmm15
    808         pxor    xmm4, xmm8
    809         pxor    xmm5, xmm9
    810         pxor    xmm6, xmm10
    811         pxor    xmm7, xmm11
    812         movdqa  xmmword ptr [rsp+100H], xmm8
    813         movdqa  xmm8, xmm4
    814         psrld   xmm8, 12
    815         pslld   xmm4, 20
    816         por     xmm4, xmm8
    817         movdqa  xmm8, xmm5
    818         psrld   xmm8, 12
    819         pslld   xmm5, 20
    820         por     xmm5, xmm8
    821         movdqa  xmm8, xmm6
    822         psrld   xmm8, 12
    823         pslld   xmm6, 20
    824         por     xmm6, xmm8
    825         movdqa  xmm8, xmm7
    826         psrld   xmm8, 12
    827         pslld   xmm7, 20
    828         por     xmm7, xmm8
    829         paddd   xmm0, xmmword ptr [rsp+70H]
    830         paddd   xmm1, xmmword ptr [rsp+90H]
    831         paddd   xmm2, xmmword ptr [rsp+30H]
    832         paddd   xmm3, xmmword ptr [rsp+0F0H]
    833         paddd   xmm0, xmm4
    834         paddd   xmm1, xmm5
    835         paddd   xmm2, xmm6
    836         paddd   xmm3, xmm7
    837         pxor    xmm12, xmm0
    838         pxor    xmm13, xmm1
    839         pxor    xmm14, xmm2
    840         pxor    xmm15, xmm3
    841         movdqa  xmm8, xmm12
    842         psrld   xmm12, 8
    843         pslld   xmm8, 24
    844         pxor    xmm12, xmm8
    845         movdqa  xmm8, xmm13
    846         psrld   xmm13, 8
    847         pslld   xmm8, 24
    848         pxor    xmm13, xmm8
    849         movdqa  xmm8, xmm14
    850         psrld   xmm14, 8
    851         pslld   xmm8, 24
    852         pxor    xmm14, xmm8
    853         movdqa  xmm8, xmm15
    854         psrld   xmm15, 8
    855         pslld   xmm8, 24
    856         pxor    xmm15, xmm8
    857         movdqa  xmm8, xmmword ptr [rsp+100H]
    858         paddd   xmm8, xmm12
    859         paddd   xmm9, xmm13
    860         paddd   xmm10, xmm14
    861         paddd   xmm11, xmm15
    862         pxor    xmm4, xmm8
    863         pxor    xmm5, xmm9
    864         pxor    xmm6, xmm10
    865         pxor    xmm7, xmm11
    866         movdqa  xmmword ptr [rsp+100H], xmm8
    867         movdqa  xmm8, xmm4
    868         psrld   xmm8, 7
    869         pslld   xmm4, 25
    870         por     xmm4, xmm8
    871         movdqa  xmm8, xmm5
    872         psrld   xmm8, 7
    873         pslld   xmm5, 25
    874         por     xmm5, xmm8
    875         movdqa  xmm8, xmm6
    876         psrld   xmm8, 7
    877         pslld   xmm6, 25
    878         por     xmm6, xmm8
    879         movdqa  xmm8, xmm7
    880         psrld   xmm8, 7
    881         pslld   xmm7, 25
    882         por     xmm7, xmm8
    883         paddd   xmm0, xmmword ptr [rsp+40H]
    884         paddd   xmm1, xmmword ptr [rsp+0B0H]
    885         paddd   xmm2, xmmword ptr [rsp+50H]
    886         paddd   xmm3, xmmword ptr [rsp+10H]
    887         paddd   xmm0, xmm5
    888         paddd   xmm1, xmm6
    889         paddd   xmm2, xmm7
    890         paddd   xmm3, xmm4
    891         pxor    xmm15, xmm0
    892         pxor    xmm12, xmm1
    893         pxor    xmm13, xmm2
    894         pxor    xmm14, xmm3
    895         pshuflw xmm15, xmm15, 0B1H
    896         pshufhw xmm15, xmm15, 0B1H
    897         pshuflw xmm12, xmm12, 0B1H
    898         pshufhw xmm12, xmm12, 0B1H
    899         pshuflw xmm13, xmm13, 0B1H
    900         pshufhw xmm13, xmm13, 0B1H
    901         pshuflw xmm14, xmm14, 0B1H
    902         pshufhw xmm14, xmm14, 0B1H
    903         paddd   xmm10, xmm15
    904         paddd   xmm11, xmm12
    905         movdqa  xmm8, xmmword ptr [rsp+100H]
    906         paddd   xmm8, xmm13
    907         paddd   xmm9, xmm14
    908         pxor    xmm5, xmm10
    909         pxor    xmm6, xmm11
    910         pxor    xmm7, xmm8
    911         pxor    xmm4, xmm9
    912         movdqa  xmmword ptr [rsp+100H], xmm8
    913         movdqa  xmm8, xmm5
    914         psrld   xmm8, 12
    915         pslld   xmm5, 20
    916         por     xmm5, xmm8
    917         movdqa  xmm8, xmm6
    918         psrld   xmm8, 12
    919         pslld   xmm6, 20
    920         por     xmm6, xmm8
    921         movdqa  xmm8, xmm7
    922         psrld   xmm8, 12
    923         pslld   xmm7, 20
    924         por     xmm7, xmm8
    925         movdqa  xmm8, xmm4
    926         psrld   xmm8, 12
    927         pslld   xmm4, 20
    928         por     xmm4, xmm8
    929         paddd   xmm0, xmmword ptr [rsp]
    930         paddd   xmm1, xmmword ptr [rsp+20H]
    931         paddd   xmm2, xmmword ptr [rsp+80H]
    932         paddd   xmm3, xmmword ptr [rsp+60H]
    933         paddd   xmm0, xmm5
    934         paddd   xmm1, xmm6
    935         paddd   xmm2, xmm7
    936         paddd   xmm3, xmm4
    937         pxor    xmm15, xmm0
    938         pxor    xmm12, xmm1
    939         pxor    xmm13, xmm2
    940         pxor    xmm14, xmm3
    941         movdqa  xmm8, xmm15
    942         psrld   xmm15, 8
    943         pslld   xmm8, 24
    944         pxor    xmm15, xmm8
    945         movdqa  xmm8, xmm12
    946         psrld   xmm12, 8
    947         pslld   xmm8, 24
    948         pxor    xmm12, xmm8
    949         movdqa  xmm8, xmm13
    950         psrld   xmm13, 8
    951         pslld   xmm8, 24
    952         pxor    xmm13, xmm8
    953         movdqa  xmm8, xmm14
    954         psrld   xmm14, 8
    955         pslld   xmm8, 24
    956         pxor    xmm14, xmm8
    957         paddd   xmm10, xmm15
    958         paddd   xmm11, xmm12
    959         movdqa  xmm8, xmmword ptr [rsp+100H]
    960         paddd   xmm8, xmm13
    961         paddd   xmm9, xmm14
    962         pxor    xmm5, xmm10
    963         pxor    xmm6, xmm11
    964         pxor    xmm7, xmm8
    965         pxor    xmm4, xmm9
    966         movdqa  xmmword ptr [rsp+100H], xmm8
    967         movdqa  xmm8, xmm5
    968         psrld   xmm8, 7
    969         pslld   xmm5, 25
    970         por     xmm5, xmm8
    971         movdqa  xmm8, xmm6
    972         psrld   xmm8, 7
    973         pslld   xmm6, 25
    974         por     xmm6, xmm8
    975         movdqa  xmm8, xmm7
    976         psrld   xmm8, 7
    977         pslld   xmm7, 25
    978         por     xmm7, xmm8
    979         movdqa  xmm8, xmm4
    980         psrld   xmm8, 7
    981         pslld   xmm4, 25
    982         por     xmm4, xmm8
    983         paddd   xmm0, xmmword ptr [rsp+0C0H]
    984         paddd   xmm1, xmmword ptr [rsp+90H]
    985         paddd   xmm2, xmmword ptr [rsp+0F0H]
    986         paddd   xmm3, xmmword ptr [rsp+0E0H]
    987         paddd   xmm0, xmm4
    988         paddd   xmm1, xmm5
    989         paddd   xmm2, xmm6
    990         paddd   xmm3, xmm7
    991         pxor    xmm12, xmm0
    992         pxor    xmm13, xmm1
    993         pxor    xmm14, xmm2
    994         pxor    xmm15, xmm3
    995         pshuflw xmm12, xmm12, 0B1H
    996         pshufhw xmm12, xmm12, 0B1H
    997         pshuflw xmm13, xmm13, 0B1H
    998         pshufhw xmm13, xmm13, 0B1H
    999         pshuflw xmm14, xmm14, 0B1H
   1000         pshufhw xmm14, xmm14, 0B1H
   1001         pshuflw xmm15, xmm15, 0B1H
   1002         pshufhw xmm15, xmm15, 0B1H
   1003         movdqa  xmm8, xmmword ptr [rsp+100H]
   1004         paddd   xmm8, xmm12
   1005         paddd   xmm9, xmm13
   1006         paddd   xmm10, xmm14
   1007         paddd   xmm11, xmm15
   1008         pxor    xmm4, xmm8
   1009         pxor    xmm5, xmm9
   1010         pxor    xmm6, xmm10
   1011         pxor    xmm7, xmm11
   1012         movdqa  xmmword ptr [rsp+100H], xmm8
   1013         movdqa  xmm8, xmm4
   1014         psrld   xmm8, 12
   1015         pslld   xmm4, 20
   1016         por     xmm4, xmm8
   1017         movdqa  xmm8, xmm5
   1018         psrld   xmm8, 12
   1019         pslld   xmm5, 20
   1020         por     xmm5, xmm8
   1021         movdqa  xmm8, xmm6
   1022         psrld   xmm8, 12
   1023         pslld   xmm6, 20
   1024         por     xmm6, xmm8
   1025         movdqa  xmm8, xmm7
   1026         psrld   xmm8, 12
   1027         pslld   xmm7, 20
   1028         por     xmm7, xmm8
   1029         paddd   xmm0, xmmword ptr [rsp+0D0H]
   1030         paddd   xmm1, xmmword ptr [rsp+0B0H]
   1031         paddd   xmm2, xmmword ptr [rsp+0A0H]
   1032         paddd   xmm3, xmmword ptr [rsp+80H]
   1033         paddd   xmm0, xmm4
   1034         paddd   xmm1, xmm5
   1035         paddd   xmm2, xmm6
   1036         paddd   xmm3, xmm7
   1037         pxor    xmm12, xmm0
   1038         pxor    xmm13, xmm1
   1039         pxor    xmm14, xmm2
   1040         pxor    xmm15, xmm3
   1041         movdqa  xmm8, xmm12
   1042         psrld   xmm12, 8
   1043         pslld   xmm8, 24
   1044         pxor    xmm12, xmm8
   1045         movdqa  xmm8, xmm13
   1046         psrld   xmm13, 8
   1047         pslld   xmm8, 24
   1048         pxor    xmm13, xmm8
   1049         movdqa  xmm8, xmm14
   1050         psrld   xmm14, 8
   1051         pslld   xmm8, 24
   1052         pxor    xmm14, xmm8
   1053         movdqa  xmm8, xmm15
   1054         psrld   xmm15, 8
   1055         pslld   xmm8, 24
   1056         pxor    xmm15, xmm8
   1057         movdqa  xmm8, xmmword ptr [rsp+100H]
   1058         paddd   xmm8, xmm12
   1059         paddd   xmm9, xmm13
   1060         paddd   xmm10, xmm14
   1061         paddd   xmm11, xmm15
   1062         pxor    xmm4, xmm8
   1063         pxor    xmm5, xmm9
   1064         pxor    xmm6, xmm10
   1065         pxor    xmm7, xmm11
   1066         movdqa  xmmword ptr [rsp+100H], xmm8
   1067         movdqa  xmm8, xmm4
   1068         psrld   xmm8, 7
   1069         pslld   xmm4, 25
   1070         por     xmm4, xmm8
   1071         movdqa  xmm8, xmm5
   1072         psrld   xmm8, 7
   1073         pslld   xmm5, 25
   1074         por     xmm5, xmm8
   1075         movdqa  xmm8, xmm6
   1076         psrld   xmm8, 7
   1077         pslld   xmm6, 25
   1078         por     xmm6, xmm8
   1079         movdqa  xmm8, xmm7
   1080         psrld   xmm8, 7
   1081         pslld   xmm7, 25
   1082         por     xmm7, xmm8
   1083         paddd   xmm0, xmmword ptr [rsp+70H]
   1084         paddd   xmm1, xmmword ptr [rsp+50H]
   1085         paddd   xmm2, xmmword ptr [rsp]
   1086         paddd   xmm3, xmmword ptr [rsp+60H]
   1087         paddd   xmm0, xmm5
   1088         paddd   xmm1, xmm6
   1089         paddd   xmm2, xmm7
   1090         paddd   xmm3, xmm4
   1091         pxor    xmm15, xmm0
   1092         pxor    xmm12, xmm1
   1093         pxor    xmm13, xmm2
   1094         pxor    xmm14, xmm3
   1095         pshuflw xmm15, xmm15, 0B1H
   1096         pshufhw xmm15, xmm15, 0B1H
   1097         pshuflw xmm12, xmm12, 0B1H
   1098         pshufhw xmm12, xmm12, 0B1H
   1099         pshuflw xmm13, xmm13, 0B1H
   1100         pshufhw xmm13, xmm13, 0B1H
   1101         pshuflw xmm14, xmm14, 0B1H
   1102         pshufhw xmm14, xmm14, 0B1H
   1103         paddd   xmm10, xmm15
   1104         paddd   xmm11, xmm12
   1105         movdqa  xmm8, xmmword ptr [rsp+100H]
   1106         paddd   xmm8, xmm13
   1107         paddd   xmm9, xmm14
   1108         pxor    xmm5, xmm10
   1109         pxor    xmm6, xmm11
   1110         pxor    xmm7, xmm8
   1111         pxor    xmm4, xmm9
   1112         movdqa  xmmword ptr [rsp+100H], xmm8
   1113         movdqa  xmm8, xmm5
   1114         psrld   xmm8, 12
   1115         pslld   xmm5, 20
   1116         por     xmm5, xmm8
   1117         movdqa  xmm8, xmm6
   1118         psrld   xmm8, 12
   1119         pslld   xmm6, 20
   1120         por     xmm6, xmm8
   1121         movdqa  xmm8, xmm7
   1122         psrld   xmm8, 12
   1123         pslld   xmm7, 20
   1124         por     xmm7, xmm8
   1125         movdqa  xmm8, xmm4
   1126         psrld   xmm8, 12
   1127         pslld   xmm4, 20
   1128         por     xmm4, xmm8
   1129         paddd   xmm0, xmmword ptr [rsp+20H]
   1130         paddd   xmm1, xmmword ptr [rsp+30H]
   1131         paddd   xmm2, xmmword ptr [rsp+10H]
   1132         paddd   xmm3, xmmword ptr [rsp+40H]
   1133         paddd   xmm0, xmm5
   1134         paddd   xmm1, xmm6
   1135         paddd   xmm2, xmm7
   1136         paddd   xmm3, xmm4
   1137         pxor    xmm15, xmm0
   1138         pxor    xmm12, xmm1
   1139         pxor    xmm13, xmm2
   1140         pxor    xmm14, xmm3
   1141         movdqa  xmm8, xmm15
   1142         psrld   xmm15, 8
   1143         pslld   xmm8, 24
   1144         pxor    xmm15, xmm8
   1145         movdqa  xmm8, xmm12
   1146         psrld   xmm12, 8
   1147         pslld   xmm8, 24
   1148         pxor    xmm12, xmm8
   1149         movdqa  xmm8, xmm13
   1150         psrld   xmm13, 8
   1151         pslld   xmm8, 24
   1152         pxor    xmm13, xmm8
   1153         movdqa  xmm8, xmm14
   1154         psrld   xmm14, 8
   1155         pslld   xmm8, 24
   1156         pxor    xmm14, xmm8
   1157         paddd   xmm10, xmm15
   1158         paddd   xmm11, xmm12
   1159         movdqa  xmm8, xmmword ptr [rsp+100H]
   1160         paddd   xmm8, xmm13
   1161         paddd   xmm9, xmm14
   1162         pxor    xmm5, xmm10
   1163         pxor    xmm6, xmm11
   1164         pxor    xmm7, xmm8
   1165         pxor    xmm4, xmm9
   1166         movdqa  xmmword ptr [rsp+100H], xmm8
   1167         movdqa  xmm8, xmm5
   1168         psrld   xmm8, 7
   1169         pslld   xmm5, 25
   1170         por     xmm5, xmm8
   1171         movdqa  xmm8, xmm6
   1172         psrld   xmm8, 7
   1173         pslld   xmm6, 25
   1174         por     xmm6, xmm8
   1175         movdqa  xmm8, xmm7
   1176         psrld   xmm8, 7
   1177         pslld   xmm7, 25
   1178         por     xmm7, xmm8
   1179         movdqa  xmm8, xmm4
   1180         psrld   xmm8, 7
   1181         pslld   xmm4, 25
   1182         por     xmm4, xmm8
   1183         paddd   xmm0, xmmword ptr [rsp+90H]
   1184         paddd   xmm1, xmmword ptr [rsp+0B0H]
   1185         paddd   xmm2, xmmword ptr [rsp+80H]
   1186         paddd   xmm3, xmmword ptr [rsp+0F0H]
   1187         paddd   xmm0, xmm4
   1188         paddd   xmm1, xmm5
   1189         paddd   xmm2, xmm6
   1190         paddd   xmm3, xmm7
   1191         pxor    xmm12, xmm0
   1192         pxor    xmm13, xmm1
   1193         pxor    xmm14, xmm2
   1194         pxor    xmm15, xmm3
   1195         pshuflw xmm12, xmm12, 0B1H
   1196         pshufhw xmm12, xmm12, 0B1H
   1197         pshuflw xmm13, xmm13, 0B1H
   1198         pshufhw xmm13, xmm13, 0B1H
   1199         pshuflw xmm14, xmm14, 0B1H
   1200         pshufhw xmm14, xmm14, 0B1H
   1201         pshuflw xmm15, xmm15, 0B1H
   1202         pshufhw xmm15, xmm15, 0B1H
   1203         movdqa  xmm8, xmmword ptr [rsp+100H]
   1204         paddd   xmm8, xmm12
   1205         paddd   xmm9, xmm13
   1206         paddd   xmm10, xmm14
   1207         paddd   xmm11, xmm15
   1208         pxor    xmm4, xmm8
   1209         pxor    xmm5, xmm9
   1210         pxor    xmm6, xmm10
   1211         pxor    xmm7, xmm11
   1212         movdqa  xmmword ptr [rsp+100H], xmm8
   1213         movdqa  xmm8, xmm4
   1214         psrld   xmm8, 12
   1215         pslld   xmm4, 20
   1216         por     xmm4, xmm8
   1217         movdqa  xmm8, xmm5
   1218         psrld   xmm8, 12
   1219         pslld   xmm5, 20
   1220         por     xmm5, xmm8
   1221         movdqa  xmm8, xmm6
   1222         psrld   xmm8, 12
   1223         pslld   xmm6, 20
   1224         por     xmm6, xmm8
   1225         movdqa  xmm8, xmm7
   1226         psrld   xmm8, 12
   1227         pslld   xmm7, 20
   1228         por     xmm7, xmm8
   1229         paddd   xmm0, xmmword ptr [rsp+0E0H]
   1230         paddd   xmm1, xmmword ptr [rsp+50H]
   1231         paddd   xmm2, xmmword ptr [rsp+0C0H]
   1232         paddd   xmm3, xmmword ptr [rsp+10H]
   1233         paddd   xmm0, xmm4
   1234         paddd   xmm1, xmm5
   1235         paddd   xmm2, xmm6
   1236         paddd   xmm3, xmm7
   1237         pxor    xmm12, xmm0
   1238         pxor    xmm13, xmm1
   1239         pxor    xmm14, xmm2
   1240         pxor    xmm15, xmm3
   1241         movdqa  xmm8, xmm12
   1242         psrld   xmm12, 8
   1243         pslld   xmm8, 24
   1244         pxor    xmm12, xmm8
   1245         movdqa  xmm8, xmm13
   1246         psrld   xmm13, 8
   1247         pslld   xmm8, 24
   1248         pxor    xmm13, xmm8
   1249         movdqa  xmm8, xmm14
   1250         psrld   xmm14, 8
   1251         pslld   xmm8, 24
   1252         pxor    xmm14, xmm8
   1253         movdqa  xmm8, xmm15
   1254         psrld   xmm15, 8
   1255         pslld   xmm8, 24
   1256         pxor    xmm15, xmm8
   1257         movdqa  xmm8, xmmword ptr [rsp+100H]
   1258         paddd   xmm8, xmm12
   1259         paddd   xmm9, xmm13
   1260         paddd   xmm10, xmm14
   1261         paddd   xmm11, xmm15
   1262         pxor    xmm4, xmm8
   1263         pxor    xmm5, xmm9
   1264         pxor    xmm6, xmm10
   1265         pxor    xmm7, xmm11
   1266         movdqa  xmmword ptr [rsp+100H], xmm8
   1267         movdqa  xmm8, xmm4
   1268         psrld   xmm8, 7
   1269         pslld   xmm4, 25
   1270         por     xmm4, xmm8
   1271         movdqa  xmm8, xmm5
   1272         psrld   xmm8, 7
   1273         pslld   xmm5, 25
   1274         por     xmm5, xmm8
   1275         movdqa  xmm8, xmm6
   1276         psrld   xmm8, 7
   1277         pslld   xmm6, 25
   1278         por     xmm6, xmm8
   1279         movdqa  xmm8, xmm7
   1280         psrld   xmm8, 7
   1281         pslld   xmm7, 25
   1282         por     xmm7, xmm8
   1283         paddd   xmm0, xmmword ptr [rsp+0D0H]
   1284         paddd   xmm1, xmmword ptr [rsp]
   1285         paddd   xmm2, xmmword ptr [rsp+20H]
   1286         paddd   xmm3, xmmword ptr [rsp+40H]
   1287         paddd   xmm0, xmm5
   1288         paddd   xmm1, xmm6
   1289         paddd   xmm2, xmm7
   1290         paddd   xmm3, xmm4
   1291         pxor    xmm15, xmm0
   1292         pxor    xmm12, xmm1
   1293         pxor    xmm13, xmm2
   1294         pxor    xmm14, xmm3
   1295         pshuflw xmm15, xmm15, 0B1H
   1296         pshufhw xmm15, xmm15, 0B1H
   1297         pshuflw xmm12, xmm12, 0B1H
   1298         pshufhw xmm12, xmm12, 0B1H
   1299         pshuflw xmm13, xmm13, 0B1H
   1300         pshufhw xmm13, xmm13, 0B1H
   1301         pshuflw xmm14, xmm14, 0B1H
   1302         pshufhw xmm14, xmm14, 0B1H
   1303         paddd   xmm10, xmm15
   1304         paddd   xmm11, xmm12
   1305         movdqa  xmm8, xmmword ptr [rsp+100H]
   1306         paddd   xmm8, xmm13
   1307         paddd   xmm9, xmm14
   1308         pxor    xmm5, xmm10
   1309         pxor    xmm6, xmm11
   1310         pxor    xmm7, xmm8
   1311         pxor    xmm4, xmm9
   1312         movdqa  xmmword ptr [rsp+100H], xmm8
   1313         movdqa  xmm8, xmm5
   1314         psrld   xmm8, 12
   1315         pslld   xmm5, 20
   1316         por     xmm5, xmm8
   1317         movdqa  xmm8, xmm6
   1318         psrld   xmm8, 12
   1319         pslld   xmm6, 20
   1320         por     xmm6, xmm8
   1321         movdqa  xmm8, xmm7
   1322         psrld   xmm8, 12
   1323         pslld   xmm7, 20
   1324         por     xmm7, xmm8
   1325         movdqa  xmm8, xmm4
   1326         psrld   xmm8, 12
   1327         pslld   xmm4, 20
   1328         por     xmm4, xmm8
   1329         paddd   xmm0, xmmword ptr [rsp+30H]
   1330         paddd   xmm1, xmmword ptr [rsp+0A0H]
   1331         paddd   xmm2, xmmword ptr [rsp+60H]
   1332         paddd   xmm3, xmmword ptr [rsp+70H]
   1333         paddd   xmm0, xmm5
   1334         paddd   xmm1, xmm6
   1335         paddd   xmm2, xmm7
   1336         paddd   xmm3, xmm4
   1337         pxor    xmm15, xmm0
   1338         pxor    xmm12, xmm1
   1339         pxor    xmm13, xmm2
   1340         pxor    xmm14, xmm3
   1341         movdqa  xmm8, xmm15
   1342         psrld   xmm15, 8
   1343         pslld   xmm8, 24
   1344         pxor    xmm15, xmm8
   1345         movdqa  xmm8, xmm12
   1346         psrld   xmm12, 8
   1347         pslld   xmm8, 24
   1348         pxor    xmm12, xmm8
   1349         movdqa  xmm8, xmm13
   1350         psrld   xmm13, 8
   1351         pslld   xmm8, 24
   1352         pxor    xmm13, xmm8
   1353         movdqa  xmm8, xmm14
   1354         psrld   xmm14, 8
   1355         pslld   xmm8, 24
   1356         pxor    xmm14, xmm8
   1357         paddd   xmm10, xmm15
   1358         paddd   xmm11, xmm12
   1359         movdqa  xmm8, xmmword ptr [rsp+100H]
   1360         paddd   xmm8, xmm13
   1361         paddd   xmm9, xmm14
   1362         pxor    xmm5, xmm10
   1363         pxor    xmm6, xmm11
   1364         pxor    xmm7, xmm8
   1365         pxor    xmm4, xmm9
   1366         movdqa  xmmword ptr [rsp+100H], xmm8
   1367         movdqa  xmm8, xmm5
   1368         psrld   xmm8, 7
   1369         pslld   xmm5, 25
   1370         por     xmm5, xmm8
   1371         movdqa  xmm8, xmm6
   1372         psrld   xmm8, 7
   1373         pslld   xmm6, 25
   1374         por     xmm6, xmm8
   1375         movdqa  xmm8, xmm7
   1376         psrld   xmm8, 7
   1377         pslld   xmm7, 25
   1378         por     xmm7, xmm8
   1379         movdqa  xmm8, xmm4
   1380         psrld   xmm8, 7
   1381         pslld   xmm4, 25
   1382         por     xmm4, xmm8
   1383         paddd   xmm0, xmmword ptr [rsp+0B0H]
   1384         paddd   xmm1, xmmword ptr [rsp+50H]
   1385         paddd   xmm2, xmmword ptr [rsp+10H]
   1386         paddd   xmm3, xmmword ptr [rsp+80H]
   1387         paddd   xmm0, xmm4
   1388         paddd   xmm1, xmm5
   1389         paddd   xmm2, xmm6
   1390         paddd   xmm3, xmm7
   1391         pxor    xmm12, xmm0
   1392         pxor    xmm13, xmm1
   1393         pxor    xmm14, xmm2
   1394         pxor    xmm15, xmm3
   1395         pshuflw xmm12, xmm12, 0B1H
   1396         pshufhw xmm12, xmm12, 0B1H
   1397         pshuflw xmm13, xmm13, 0B1H
   1398         pshufhw xmm13, xmm13, 0B1H
   1399         pshuflw xmm14, xmm14, 0B1H
   1400         pshufhw xmm14, xmm14, 0B1H
   1401         pshuflw xmm15, xmm15, 0B1H
   1402         pshufhw xmm15, xmm15, 0B1H
   1403         movdqa  xmm8, xmmword ptr [rsp+100H]
   1404         paddd   xmm8, xmm12
   1405         paddd   xmm9, xmm13
   1406         paddd   xmm10, xmm14
   1407         paddd   xmm11, xmm15
   1408         pxor    xmm4, xmm8
   1409         pxor    xmm5, xmm9
   1410         pxor    xmm6, xmm10
   1411         pxor    xmm7, xmm11
   1412         movdqa  xmmword ptr [rsp+100H], xmm8
   1413         movdqa  xmm8, xmm4
   1414         psrld   xmm8, 12
   1415         pslld   xmm4, 20
   1416         por     xmm4, xmm8
   1417         movdqa  xmm8, xmm5
   1418         psrld   xmm8, 12
   1419         pslld   xmm5, 20
   1420         por     xmm5, xmm8
   1421         movdqa  xmm8, xmm6
   1422         psrld   xmm8, 12
   1423         pslld   xmm6, 20
   1424         por     xmm6, xmm8
   1425         movdqa  xmm8, xmm7
   1426         psrld   xmm8, 12
   1427         pslld   xmm7, 20
   1428         por     xmm7, xmm8
   1429         paddd   xmm0, xmmword ptr [rsp+0F0H]
   1430         paddd   xmm1, xmmword ptr [rsp]
   1431         paddd   xmm2, xmmword ptr [rsp+90H]
   1432         paddd   xmm3, xmmword ptr [rsp+60H]
   1433         paddd   xmm0, xmm4
   1434         paddd   xmm1, xmm5
   1435         paddd   xmm2, xmm6
   1436         paddd   xmm3, xmm7
   1437         pxor    xmm12, xmm0
   1438         pxor    xmm13, xmm1
   1439         pxor    xmm14, xmm2
   1440         pxor    xmm15, xmm3
   1441         movdqa  xmm8, xmm12
   1442         psrld   xmm12, 8
   1443         pslld   xmm8, 24
   1444         pxor    xmm12, xmm8
   1445         movdqa  xmm8, xmm13
   1446         psrld   xmm13, 8
   1447         pslld   xmm8, 24
   1448         pxor    xmm13, xmm8
   1449         movdqa  xmm8, xmm14
   1450         psrld   xmm14, 8
   1451         pslld   xmm8, 24
   1452         pxor    xmm14, xmm8
   1453         movdqa  xmm8, xmm15
   1454         psrld   xmm15, 8
   1455         pslld   xmm8, 24
   1456         pxor    xmm15, xmm8
   1457         movdqa  xmm8, xmmword ptr [rsp+100H]
   1458         paddd   xmm8, xmm12
   1459         paddd   xmm9, xmm13
   1460         paddd   xmm10, xmm14
   1461         paddd   xmm11, xmm15
   1462         pxor    xmm4, xmm8
   1463         pxor    xmm5, xmm9
   1464         pxor    xmm6, xmm10
   1465         pxor    xmm7, xmm11
   1466         movdqa  xmmword ptr [rsp+100H], xmm8
   1467         movdqa  xmm8, xmm4
   1468         psrld   xmm8, 7
   1469         pslld   xmm4, 25
   1470         por     xmm4, xmm8
   1471         movdqa  xmm8, xmm5
   1472         psrld   xmm8, 7
   1473         pslld   xmm5, 25
   1474         por     xmm5, xmm8
   1475         movdqa  xmm8, xmm6
   1476         psrld   xmm8, 7
   1477         pslld   xmm6, 25
   1478         por     xmm6, xmm8
   1479         movdqa  xmm8, xmm7
   1480         psrld   xmm8, 7
   1481         pslld   xmm7, 25
   1482         por     xmm7, xmm8
   1483         paddd   xmm0, xmmword ptr [rsp+0E0H]
   1484         paddd   xmm1, xmmword ptr [rsp+20H]
   1485         paddd   xmm2, xmmword ptr [rsp+30H]
   1486         paddd   xmm3, xmmword ptr [rsp+70H]
   1487         paddd   xmm0, xmm5
   1488         paddd   xmm1, xmm6
   1489         paddd   xmm2, xmm7
   1490         paddd   xmm3, xmm4
   1491         pxor    xmm15, xmm0
   1492         pxor    xmm12, xmm1
   1493         pxor    xmm13, xmm2
   1494         pxor    xmm14, xmm3
   1495         pshuflw xmm15, xmm15, 0B1H
   1496         pshufhw xmm15, xmm15, 0B1H
   1497         pshuflw xmm12, xmm12, 0B1H
   1498         pshufhw xmm12, xmm12, 0B1H
   1499         pshuflw xmm13, xmm13, 0B1H
   1500         pshufhw xmm13, xmm13, 0B1H
   1501         pshuflw xmm14, xmm14, 0B1H
   1502         pshufhw xmm14, xmm14, 0B1H
   1503         paddd   xmm10, xmm15
   1504         paddd   xmm11, xmm12
   1505         movdqa  xmm8, xmmword ptr [rsp+100H]
   1506         paddd   xmm8, xmm13
   1507         paddd   xmm9, xmm14
   1508         pxor    xmm5, xmm10
   1509         pxor    xmm6, xmm11
   1510         pxor    xmm7, xmm8
   1511         pxor    xmm4, xmm9
   1512         movdqa  xmmword ptr [rsp+100H], xmm8
   1513         movdqa  xmm8, xmm5
   1514         psrld   xmm8, 12
   1515         pslld   xmm5, 20
   1516         por     xmm5, xmm8
   1517         movdqa  xmm8, xmm6
   1518         psrld   xmm8, 12
   1519         pslld   xmm6, 20
   1520         por     xmm6, xmm8
   1521         movdqa  xmm8, xmm7
   1522         psrld   xmm8, 12
   1523         pslld   xmm7, 20
   1524         por     xmm7, xmm8
   1525         movdqa  xmm8, xmm4
   1526         psrld   xmm8, 12
   1527         pslld   xmm4, 20
   1528         por     xmm4, xmm8
   1529         paddd   xmm0, xmmword ptr [rsp+0A0H]
   1530         paddd   xmm1, xmmword ptr [rsp+0C0H]
   1531         paddd   xmm2, xmmword ptr [rsp+40H]
   1532         paddd   xmm3, xmmword ptr [rsp+0D0H]
   1533         paddd   xmm0, xmm5
   1534         paddd   xmm1, xmm6
   1535         paddd   xmm2, xmm7
   1536         paddd   xmm3, xmm4
   1537         pxor    xmm15, xmm0
   1538         pxor    xmm12, xmm1
   1539         pxor    xmm13, xmm2
   1540         pxor    xmm14, xmm3
   1541         movdqa  xmm8, xmm15
   1542         psrld   xmm15, 8
   1543         pslld   xmm8, 24
   1544         pxor    xmm15, xmm8
   1545         movdqa  xmm8, xmm12
   1546         psrld   xmm12, 8
   1547         pslld   xmm8, 24
   1548         pxor    xmm12, xmm8
   1549         movdqa  xmm8, xmm13
   1550         psrld   xmm13, 8
   1551         pslld   xmm8, 24
   1552         pxor    xmm13, xmm8
   1553         movdqa  xmm8, xmm14
   1554         psrld   xmm14, 8
   1555         pslld   xmm8, 24
   1556         pxor    xmm14, xmm8
   1557         paddd   xmm10, xmm15
   1558         paddd   xmm11, xmm12
   1559         movdqa  xmm8, xmmword ptr [rsp+100H]
   1560         paddd   xmm8, xmm13
   1561         paddd   xmm9, xmm14
   1562         pxor    xmm5, xmm10
   1563         pxor    xmm6, xmm11
   1564         pxor    xmm7, xmm8
   1565         pxor    xmm4, xmm9
   1566         pxor    xmm0, xmm8
   1567         pxor    xmm1, xmm9
   1568         pxor    xmm2, xmm10
   1569         pxor    xmm3, xmm11
   1570         movdqa  xmm8, xmm5
   1571         psrld   xmm8, 7
   1572         pslld   xmm5, 25
   1573         por     xmm5, xmm8
   1574         movdqa  xmm8, xmm6
   1575         psrld   xmm8, 7
   1576         pslld   xmm6, 25
   1577         por     xmm6, xmm8
   1578         movdqa  xmm8, xmm7
   1579         psrld   xmm8, 7
   1580         pslld   xmm7, 25
   1581         por     xmm7, xmm8
   1582         movdqa  xmm8, xmm4
   1583         psrld   xmm8, 7
   1584         pslld   xmm4, 25
   1585         por     xmm4, xmm8
   1586         pxor    xmm4, xmm12
   1587         pxor    xmm5, xmm13
   1588         pxor    xmm6, xmm14
   1589         pxor    xmm7, xmm15
   1590         mov     eax, r13d
   1591         jne     innerloop4
   1592         movdqa  xmm9, xmm0
   1593         punpckldq xmm0, xmm1
   1594         punpckhdq xmm9, xmm1
   1595         movdqa  xmm11, xmm2
   1596         punpckldq xmm2, xmm3
   1597         punpckhdq xmm11, xmm3
   1598         movdqa  xmm1, xmm0
   1599         punpcklqdq xmm0, xmm2
   1600         punpckhqdq xmm1, xmm2
   1601         movdqa  xmm3, xmm9
   1602         punpcklqdq xmm9, xmm11
   1603         punpckhqdq xmm3, xmm11
   1604         movdqu  xmmword ptr [rbx], xmm0
   1605         movdqu  xmmword ptr [rbx+20H], xmm1
   1606         movdqu  xmmword ptr [rbx+40H], xmm9
   1607         movdqu  xmmword ptr [rbx+60H], xmm3
   1608         movdqa  xmm9, xmm4
   1609         punpckldq xmm4, xmm5
   1610         punpckhdq xmm9, xmm5
   1611         movdqa  xmm11, xmm6
   1612         punpckldq xmm6, xmm7
   1613         punpckhdq xmm11, xmm7
   1614         movdqa  xmm5, xmm4
   1615         punpcklqdq xmm4, xmm6
   1616         punpckhqdq xmm5, xmm6
   1617         movdqa  xmm7, xmm9
   1618         punpcklqdq xmm9, xmm11
   1619         punpckhqdq xmm7, xmm11
   1620         movdqu  xmmword ptr [rbx+10H], xmm4
   1621         movdqu  xmmword ptr [rbx+30H], xmm5
   1622         movdqu  xmmword ptr [rbx+50H], xmm9
   1623         movdqu  xmmword ptr [rbx+70H], xmm7
   1624         movdqa  xmm1, xmmword ptr [rsp+110H]
   1625         movdqa  xmm0, xmm1
   1626         paddd   xmm1, xmmword ptr [rsp+150H]
   1627         movdqa  xmmword ptr [rsp+110H], xmm1
   1628         pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
   1629         pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
   1630         pcmpgtd xmm0, xmm1
   1631         movdqa  xmm1, xmmword ptr [rsp+120H]
   1632         psubd   xmm1, xmm0
   1633         movdqa  xmmword ptr [rsp+120H], xmm1
   1634         add     rbx, 128
   1635         add     rdi, 32
   1636         sub     rsi, 4
   1637         cmp     rsi, 4
   1638         jnc     outerloop4
   1639         test    rsi, rsi
   1640         jne     final3blocks
    1641 unwind:                                 ; Epilogue: restore callee-saved state and return to the caller.
                 ; xmm6-xmm15 are callee-saved under the Microsoft x64 convention; reload them
                 ; from the same stack slots (rsp+170H .. rsp+200H) the prologue stored them to.
                 ; movdqa is safe here because the prologue aligned rsp down to 64 bytes.
    1642         movdqa  xmm6, xmmword ptr [rsp+170H]
    1643         movdqa  xmm7, xmmword ptr [rsp+180H]
    1644         movdqa  xmm8, xmmword ptr [rsp+190H]
    1645         movdqa  xmm9, xmmword ptr [rsp+1A0H]
    1646         movdqa  xmm10, xmmword ptr [rsp+1B0H]
    1647         movdqa  xmm11, xmmword ptr [rsp+1C0H]
    1648         movdqa  xmm12, xmmword ptr [rsp+1D0H]
    1649         movdqa  xmm13, xmmword ptr [rsp+1E0H]
    1650         movdqa  xmm14, xmmword ptr [rsp+1F0H]
    1651         movdqa  xmm15, xmmword ptr [rsp+200H]
    1652         mov     rsp, rbp                ; drop the aligned scratch area; rbp holds rsp as it was right after the pushes
                 ; Pop the general-purpose registers in the exact reverse of the prologue's
                 ; push order (r15, r14, r13, r12, rsi, rdi, rbx, rbp).
    1653         pop     rbp
    1654         pop     rbx
    1655         pop     rdi
    1656         pop     rsi
    1657         pop     r12
    1658         pop     r13
    1659         pop     r14
    1660         pop     r15
    1661         ret
   1662 ALIGN   16
    1663 final3blocks:                           ; Tail setup: the 4-wide loop jumps here when rsi (remaining count) is nonzero but below 4.
    1664         test    esi, 2H                 ; bit 1 of the remaining count clear -> nothing to do two-at-a-time
    1665         je      final1block
                 ; Load 32 bytes from rcx (the prologue copied the fourth argument into rcx) and
                 ; duplicate them into xmm8/xmm9.
                 ; NOTE(review): presumably one copy of the starting state per lane, since innerloop2
                 ; loads data for xmm4-xmm7 via r8 and for xmm12-xmm15 via r9 -- confirm.
    1666         movups  xmm0, xmmword ptr [rcx]
    1667         movups  xmm1, xmmword ptr [rcx+10H]
    1668         movaps  xmm8, xmm0
    1669         movaps  xmm9, xmm1
                 ; [rsp+110H] / [rsp+120H] are the low / high dword vectors of the per-lane
                 ; counters (the 4-wide loop adds to 110H and carries into 120H). Pair up
                 ; dword 0 of each as {low, high, 0, 0} and park it at [rsp].
    1670         movd    xmm13, dword ptr [rsp+110H]
    1671         movd    xmm14, dword ptr [rsp+120H]
    1672         punpckldq xmm13, xmm14
    1673         movaps  xmmword ptr [rsp], xmm13
                 ; Same for dword 1 (offsets +4): {low, high, 0, 0} stored at [rsp+10H].
    1674         movd    xmm14, dword ptr [rsp+114H]
    1675         movd    xmm13, dword ptr [rsp+124H]
    1676         punpckldq xmm14, xmm13
    1677         movaps  xmmword ptr [rsp+10H], xmm14
    1678         mov     r8, qword ptr [rdi]     ; first of two pointers taken from the array at rdi; innerloop2 reads 64-byte blocks through it
    1679         mov     r9, qword ptr [rdi+8H]  ; second pointer, read the same way by innerloop2
                 ; eax = stack-passed byte argument at [rbp+80H] (three 8-byte slots above the
                 ; fifth argument the prologue reads at [rbp+68H]) ORed with r13d.
                 ; NOTE(review): presumably the flag bits for the first block -- r13d is set
                 ; outside this view; confirm.
    1680         movzx   eax, byte ptr [rbp+80H]
    1681         or      eax, r13d
    1682         xor     edx, edx                ; rdx = 0: byte offset that innerloop2 advances by 64 and compares against r15
   1683 innerloop2:
   1684         mov     r14d, eax
   1685         or      eax, r12d
   1686         add     rdx, 64
   1687         cmp     rdx, r15
   1688         cmovne  eax, r14d
   1689         movaps  xmm2, xmmword ptr [BLAKE3_IV]
   1690         movaps  xmm10, xmm2
   1691         movups  xmm4, xmmword ptr [r8+rdx-40H]
   1692         movups  xmm5, xmmword ptr [r8+rdx-30H]
   1693         movaps  xmm3, xmm4
   1694         shufps  xmm4, xmm5, 136
   1695         shufps  xmm3, xmm5, 221
   1696         movaps  xmm5, xmm3
   1697         movups  xmm6, xmmword ptr [r8+rdx-20H]
   1698         movups  xmm7, xmmword ptr [r8+rdx-10H]
   1699         movaps  xmm3, xmm6
   1700         shufps  xmm6, xmm7, 136
   1701         pshufd  xmm6, xmm6, 93H
   1702         shufps  xmm3, xmm7, 221
   1703         pshufd  xmm7, xmm3, 93H
   1704         movups  xmm12, xmmword ptr [r9+rdx-40H]
   1705         movups  xmm13, xmmword ptr [r9+rdx-30H]
   1706         movaps  xmm11, xmm12
   1707         shufps  xmm12, xmm13, 136
   1708         shufps  xmm11, xmm13, 221
   1709         movaps  xmm13, xmm11
   1710         movups  xmm14, xmmword ptr [r9+rdx-20H]
   1711         movups  xmm15, xmmword ptr [r9+rdx-10H]
   1712         movaps  xmm11, xmm14
   1713         shufps  xmm14, xmm15, 136
   1714         pshufd  xmm14, xmm14, 93H
   1715         shufps  xmm11, xmm15, 221
   1716         pshufd  xmm15, xmm11, 93H
   1717         shl     rax, 20H
   1718         or      rax, 40H
   1719         movd    xmm3, rax
   1720         movdqa  xmmword ptr [rsp+20H], xmm3
   1721         movaps  xmm3, xmmword ptr [rsp]
   1722         movaps  xmm11, xmmword ptr [rsp+10H]
   1723         punpcklqdq xmm3, xmmword ptr [rsp+20H]
   1724         punpcklqdq xmm11, xmmword ptr [rsp+20H]
   1725         mov     al, 7
   1726 roundloop2:
   1727         paddd   xmm0, xmm4
   1728         paddd   xmm8, xmm12
   1729         movaps  xmmword ptr [rsp+20H], xmm4
   1730         movaps  xmmword ptr [rsp+30H], xmm12
   1731         paddd   xmm0, xmm1
   1732         paddd   xmm8, xmm9
   1733         pxor    xmm3, xmm0
   1734         pxor    xmm11, xmm8
   1735         pshuflw xmm3, xmm3, 0B1H
   1736         pshufhw xmm3, xmm3, 0B1H
   1737         pshuflw xmm11, xmm11, 0B1H
   1738         pshufhw xmm11, xmm11, 0B1H
   1739         paddd   xmm2, xmm3
   1740         paddd   xmm10, xmm11
   1741         pxor    xmm1, xmm2
   1742         pxor    xmm9, xmm10
   1743         movdqa  xmm4, xmm1
   1744         pslld   xmm1, 20
   1745         psrld   xmm4, 12
   1746         por     xmm1, xmm4
   1747         movdqa  xmm4, xmm9
   1748         pslld   xmm9, 20
   1749         psrld   xmm4, 12
   1750         por     xmm9, xmm4
   1751         paddd   xmm0, xmm5
   1752         paddd   xmm8, xmm13
   1753         movaps  xmmword ptr [rsp+40H], xmm5
   1754         movaps  xmmword ptr [rsp+50H], xmm13
   1755         paddd   xmm0, xmm1
   1756         paddd   xmm8, xmm9
   1757         pxor    xmm3, xmm0
   1758         pxor    xmm11, xmm8
   1759         movdqa  xmm13, xmm3
   1760         psrld   xmm3, 8
   1761         pslld   xmm13, 24
   1762         pxor    xmm3, xmm13
   1763         movdqa  xmm13, xmm11
   1764         psrld   xmm11, 8
   1765         pslld   xmm13, 24
   1766         pxor    xmm11, xmm13
   1767         paddd   xmm2, xmm3
   1768         paddd   xmm10, xmm11
   1769         pxor    xmm1, xmm2
   1770         pxor    xmm9, xmm10
   1771         movdqa  xmm4, xmm1
   1772         pslld   xmm1, 25
   1773         psrld   xmm4, 7
   1774         por     xmm1, xmm4
   1775         movdqa  xmm4, xmm9
   1776         pslld   xmm9, 25
   1777         psrld   xmm4, 7
   1778         por     xmm9, xmm4
   1779         pshufd  xmm0, xmm0, 93H
   1780         pshufd  xmm8, xmm8, 93H
   1781         pshufd  xmm3, xmm3, 4EH
   1782         pshufd  xmm11, xmm11, 4EH
   1783         pshufd  xmm2, xmm2, 39H
   1784         pshufd  xmm10, xmm10, 39H
   1785         paddd   xmm0, xmm6
   1786         paddd   xmm8, xmm14
   1787         paddd   xmm0, xmm1
   1788         paddd   xmm8, xmm9
   1789         pxor    xmm3, xmm0
   1790         pxor    xmm11, xmm8
   1791         pshuflw xmm3, xmm3, 0B1H
   1792         pshufhw xmm3, xmm3, 0B1H
   1793         pshuflw xmm11, xmm11, 0B1H
   1794         pshufhw xmm11, xmm11, 0B1H
   1795         paddd   xmm2, xmm3
   1796         paddd   xmm10, xmm11
   1797         pxor    xmm1, xmm2
   1798         pxor    xmm9, xmm10
   1799         movdqa  xmm4, xmm1
   1800         pslld   xmm1, 20
   1801         psrld   xmm4, 12
   1802         por     xmm1, xmm4
   1803         movdqa  xmm4, xmm9
   1804         pslld   xmm9, 20
   1805         psrld   xmm4, 12
   1806         por     xmm9, xmm4
   1807         paddd   xmm0, xmm7
   1808         paddd   xmm8, xmm15
   1809         paddd   xmm0, xmm1
   1810         paddd   xmm8, xmm9
   1811         pxor    xmm3, xmm0
   1812         pxor    xmm11, xmm8
   1813         movdqa  xmm13, xmm3
   1814         psrld   xmm3, 8
   1815         pslld   xmm13, 24
   1816         pxor    xmm3, xmm13
   1817         movdqa  xmm13, xmm11
   1818         psrld   xmm11, 8
   1819         pslld   xmm13, 24
   1820         pxor    xmm11, xmm13
   1821         paddd   xmm2, xmm3
   1822         paddd   xmm10, xmm11
   1823         pxor    xmm1, xmm2
   1824         pxor    xmm9, xmm10
   1825         movdqa  xmm4, xmm1
   1826         pslld   xmm1, 25
   1827         psrld   xmm4, 7
   1828         por     xmm1, xmm4
   1829         movdqa  xmm4, xmm9
   1830         pslld   xmm9, 25
   1831         psrld   xmm4, 7
   1832         por     xmm9, xmm4
   1833         pshufd  xmm0, xmm0, 39H
   1834         pshufd  xmm8, xmm8, 39H
   1835         pshufd  xmm3, xmm3, 4EH
   1836         pshufd  xmm11, xmm11, 4EH
   1837         pshufd  xmm2, xmm2, 93H
   1838         pshufd  xmm10, xmm10, 93H
   1839         dec     al
   1840         je      endroundloop2
   1841         movdqa  xmm12, xmmword ptr [rsp+20H]
   1842         movdqa  xmm5, xmmword ptr [rsp+40H]
   1843         pshufd  xmm13, xmm12, 0FH
   1844         shufps  xmm12, xmm5, 214
   1845         pshufd  xmm4, xmm12, 39H
   1846         movdqa  xmm12, xmm6
   1847         shufps  xmm12, xmm7, 250
   1848         pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK]
   1849         pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
   1850         por     xmm13, xmm12
   1851         movdqa  xmmword ptr [rsp+20H], xmm13
   1852         movdqa  xmm12, xmm7
   1853         punpcklqdq xmm12, xmm5
   1854         movdqa  xmm13, xmm6
   1855         pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
   1856         pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
   1857         por     xmm12, xmm13
   1858         pshufd  xmm12, xmm12, 78H
   1859         punpckhdq xmm5, xmm7
   1860         punpckldq xmm6, xmm5
   1861         pshufd  xmm7, xmm6, 1EH
   1862         movdqa  xmmword ptr [rsp+40H], xmm12
   1863         movdqa  xmm5, xmmword ptr [rsp+30H]
   1864         movdqa  xmm13, xmmword ptr [rsp+50H]
   1865         pshufd  xmm6, xmm5, 0FH
   1866         shufps  xmm5, xmm13, 214
   1867         pshufd  xmm12, xmm5, 39H
   1868         movdqa  xmm5, xmm14
   1869         shufps  xmm5, xmm15, 250
   1870         pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK]
   1871         pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
   1872         por     xmm6, xmm5
   1873         movdqa  xmm5, xmm15
   1874         punpcklqdq xmm5, xmm13
   1875         movdqa  xmmword ptr [rsp+30H], xmm2
   1876         movdqa  xmm2, xmm14
   1877         pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
   1878         pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
   1879         por     xmm5, xmm2
   1880         movdqa  xmm2, xmmword ptr [rsp+30H]
   1881         pshufd  xmm5, xmm5, 78H
   1882         punpckhdq xmm13, xmm15
   1883         punpckldq xmm14, xmm13
   1884         pshufd  xmm15, xmm14, 1EH
   1885         movdqa  xmm13, xmm6
   1886         movdqa  xmm14, xmm5
   1887         movdqa  xmm5, xmmword ptr [rsp+20H]
   1888         movdqa  xmm6, xmmword ptr [rsp+40H]
   1889         jmp     roundloop2
   1890 endroundloop2:
   1891         pxor    xmm0, xmm2
   1892         pxor    xmm1, xmm3
   1893         pxor    xmm8, xmm10
   1894         pxor    xmm9, xmm11
   1895         mov     eax, r13d
   1896         cmp     rdx, r15
   1897         jne     innerloop2
   1898         movups  xmmword ptr [rbx], xmm0
   1899         movups  xmmword ptr [rbx+10H], xmm1
   1900         movups  xmmword ptr [rbx+20H], xmm8
   1901         movups  xmmword ptr [rbx+30H], xmm9
   1902         mov     eax, dword ptr [rsp+130H]
   1903         neg     eax
   1904         mov    r10d, dword ptr [rsp+110H+8*rax]
   1905         mov    r11d, dword ptr [rsp+120H+8*rax]
   1906         mov dword ptr [rsp+110H], r10d
   1907         mov dword ptr [rsp+120H], r11d
   1908         add     rdi, 16
   1909         add     rbx, 64
   1910         sub     rsi, 2
   1911 final1block:
   1912         test    esi, 1H
   1913         je      unwind
   1914         movups  xmm0, xmmword ptr [rcx]
   1915         movups  xmm1, xmmword ptr [rcx+10H]
   1916         movd    xmm13, dword ptr [rsp+110H]
   1917         movd    xmm14, dword ptr [rsp+120H]
   1918         punpckldq xmm13, xmm14
   1919         mov     r8, qword ptr [rdi]
   1920         movzx   eax, byte ptr [rbp+80H]
   1921         or      eax, r13d
   1922         xor     edx, edx
   1923 innerloop1:
   1924         mov     r14d, eax
   1925         or      eax, r12d
   1926         add     rdx, 64
   1927         cmp     rdx, r15
   1928         cmovne  eax, r14d
   1929         movaps  xmm2, xmmword ptr [BLAKE3_IV]
   1930         shl     rax, 32
   1931         or      rax, 64
   1932         movd    xmm12, rax
   1933         movdqa  xmm3, xmm13
   1934         punpcklqdq xmm3, xmm12
   1935         movups  xmm4, xmmword ptr [r8+rdx-40H]
   1936         movups  xmm5, xmmword ptr [r8+rdx-30H]
   1937         movaps  xmm8, xmm4
   1938         shufps  xmm4, xmm5, 136
   1939         shufps  xmm8, xmm5, 221
   1940         movaps  xmm5, xmm8
   1941         movups  xmm6, xmmword ptr [r8+rdx-20H]
   1942         movups  xmm7, xmmword ptr [r8+rdx-10H]
   1943         movaps  xmm8, xmm6
   1944         shufps  xmm6, xmm7, 136
   1945         pshufd  xmm6, xmm6, 93H
   1946         shufps  xmm8, xmm7, 221
   1947         pshufd  xmm7, xmm8, 93H
   1948         mov     al, 7
   1949 roundloop1:
   1950         paddd   xmm0, xmm4
   1951         paddd   xmm0, xmm1
   1952         pxor    xmm3, xmm0
   1953         pshuflw xmm3, xmm3, 0B1H
   1954         pshufhw xmm3, xmm3, 0B1H
   1955         paddd   xmm2, xmm3
   1956         pxor    xmm1, xmm2
   1957         movdqa  xmm11, xmm1
   1958         pslld   xmm1, 20
   1959         psrld   xmm11, 12
   1960         por     xmm1, xmm11
   1961         paddd   xmm0, xmm5
   1962         paddd   xmm0, xmm1
   1963         pxor    xmm3, xmm0
   1964         movdqa  xmm14, xmm3
   1965         psrld   xmm3, 8
   1966         pslld   xmm14, 24
   1967         pxor    xmm3, xmm14
   1968         paddd   xmm2, xmm3
   1969         pxor    xmm1, xmm2
   1970         movdqa  xmm11, xmm1
   1971         pslld   xmm1, 25
   1972         psrld   xmm11, 7
   1973         por     xmm1, xmm11
   1974         pshufd  xmm0, xmm0, 93H
   1975         pshufd  xmm3, xmm3, 4EH
   1976         pshufd  xmm2, xmm2, 39H
   1977         paddd   xmm0, xmm6
   1978         paddd   xmm0, xmm1
   1979         pxor    xmm3, xmm0
   1980         pshuflw xmm3, xmm3, 0B1H
   1981         pshufhw xmm3, xmm3, 0B1H
   1982         paddd   xmm2, xmm3
   1983         pxor    xmm1, xmm2
   1984         movdqa  xmm11, xmm1
   1985         pslld   xmm1, 20
   1986         psrld   xmm11, 12
   1987         por     xmm1, xmm11
   1988         paddd   xmm0, xmm7
   1989         paddd   xmm0, xmm1
   1990         pxor    xmm3, xmm0
   1991         movdqa  xmm14, xmm3
   1992         psrld   xmm3, 8
   1993         pslld   xmm14, 24
   1994         pxor    xmm3, xmm14
   1995         paddd   xmm2, xmm3
   1996         pxor    xmm1, xmm2
   1997         movdqa  xmm11, xmm1
   1998         pslld   xmm1, 25
   1999         psrld   xmm11, 7
   2000         por     xmm1, xmm11
   2001         pshufd  xmm0, xmm0, 39H
   2002         pshufd  xmm3, xmm3, 4EH
   2003         pshufd  xmm2, xmm2, 93H
   2004         dec     al
   2005         jz      endroundloop1
   2006         movdqa  xmm8, xmm4
   2007         shufps  xmm8, xmm5, 214
   2008         pshufd  xmm9, xmm4, 0FH
   2009         pshufd  xmm4, xmm8, 39H
   2010         movdqa  xmm8, xmm6
   2011         shufps  xmm8, xmm7, 250
   2012         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
   2013         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
   2014         por     xmm9, xmm8
   2015         movdqa  xmm8, xmm7
   2016         punpcklqdq xmm8, xmm5
   2017         movdqa  xmm10, xmm6
   2018         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
   2019         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
   2020         por     xmm8, xmm10
   2021         pshufd  xmm8, xmm8, 78H
   2022         punpckhdq xmm5, xmm7
   2023         punpckldq xmm6, xmm5
   2024         pshufd  xmm7, xmm6, 1EH
   2025         movdqa  xmm5, xmm9
   2026         movdqa  xmm6, xmm8
   2027         jmp     roundloop1
   2028 endroundloop1:
   2029         pxor    xmm0, xmm2
   2030         pxor    xmm1, xmm3
   2031         mov     eax, r13d
   2032         cmp     rdx, r15
   2033         jne     innerloop1
   2034         movups  xmmword ptr [rbx], xmm0
   2035         movups  xmmword ptr [rbx+10H], xmm1
   2036         jmp     unwind
   2037 _blake3_hash_many_sse2 ENDP
   2038 blake3_hash_many_sse2 ENDP
   2039 
   2040 blake3_compress_in_place_sse2 PROC
   2041 _blake3_compress_in_place_sse2 PROC
   2042         sub     rsp, 120
   2043         movdqa  xmmword ptr [rsp], xmm6
   2044         movdqa  xmmword ptr [rsp+10H], xmm7
   2045         movdqa  xmmword ptr [rsp+20H], xmm8
   2046         movdqa  xmmword ptr [rsp+30H], xmm9
   2047         movdqa  xmmword ptr [rsp+40H], xmm11
   2048         movdqa  xmmword ptr [rsp+50H], xmm14
   2049         movdqa  xmmword ptr [rsp+60H], xmm15
   2050         movups  xmm0, xmmword ptr [rcx]
   2051         movups  xmm1, xmmword ptr [rcx+10H]
   2052         movaps  xmm2, xmmword ptr [BLAKE3_IV]
   2053         movzx   eax, byte ptr [rsp+0A0H]
   2054         movzx   r8d, r8b
   2055         shl     rax, 32
   2056         add     r8, rax
   2057         movq    xmm3, r9
   2058         movq    xmm4, r8
   2059         punpcklqdq xmm3, xmm4
   2060         movups  xmm4, xmmword ptr [rdx]
   2061         movups  xmm5, xmmword ptr [rdx+10H]
   2062         movaps  xmm8, xmm4
   2063         shufps  xmm4, xmm5, 136
   2064         shufps  xmm8, xmm5, 221
   2065         movaps  xmm5, xmm8
   2066         movups  xmm6, xmmword ptr [rdx+20H]
   2067         movups  xmm7, xmmword ptr [rdx+30H]
   2068         movaps  xmm8, xmm6
   2069         shufps  xmm6, xmm7, 136
   2070         pshufd  xmm6, xmm6, 93H
   2071         shufps  xmm8, xmm7, 221
   2072         pshufd  xmm7, xmm8, 93H
   2073         mov     al, 7
   2074 @@:
   2075         paddd   xmm0, xmm4
   2076         paddd   xmm0, xmm1
   2077         pxor    xmm3, xmm0
   2078         pshuflw xmm3, xmm3, 0B1H
   2079         pshufhw xmm3, xmm3, 0B1H
   2080         paddd   xmm2, xmm3
   2081         pxor    xmm1, xmm2
   2082         movdqa  xmm11, xmm1
   2083         pslld   xmm1, 20
   2084         psrld   xmm11, 12
   2085         por     xmm1, xmm11
   2086         paddd   xmm0, xmm5
   2087         paddd   xmm0, xmm1
   2088         pxor    xmm3, xmm0
   2089         movdqa  xmm14, xmm3
   2090         psrld   xmm3, 8
   2091         pslld   xmm14, 24
   2092         pxor    xmm3, xmm14
   2093         paddd   xmm2, xmm3
   2094         pxor    xmm1, xmm2
   2095         movdqa  xmm11, xmm1
   2096         pslld   xmm1, 25
   2097         psrld   xmm11, 7
   2098         por     xmm1, xmm11
   2099         pshufd  xmm0, xmm0, 93H
   2100         pshufd  xmm3, xmm3, 4EH
   2101         pshufd  xmm2, xmm2, 39H
   2102         paddd   xmm0, xmm6
   2103         paddd   xmm0, xmm1
   2104         pxor    xmm3, xmm0
   2105         pshuflw xmm3, xmm3, 0B1H
   2106         pshufhw xmm3, xmm3, 0B1H
   2107         paddd   xmm2, xmm3
   2108         pxor    xmm1, xmm2
   2109         movdqa  xmm11, xmm1
   2110         pslld   xmm1, 20
   2111         psrld   xmm11, 12
   2112         por     xmm1, xmm11
   2113         paddd   xmm0, xmm7
   2114         paddd   xmm0, xmm1
   2115         pxor    xmm3, xmm0
   2116         movdqa  xmm14, xmm3
   2117         psrld   xmm3, 8
   2118         pslld   xmm14, 24
   2119         pxor    xmm3, xmm14
   2120         paddd   xmm2, xmm3
   2121         pxor    xmm1, xmm2
   2122         movdqa  xmm11, xmm1
   2123         pslld   xmm1, 25
   2124         psrld   xmm11, 7
   2125         por     xmm1, xmm11
   2126         pshufd  xmm0, xmm0, 39H
   2127         pshufd  xmm3, xmm3, 4EH
   2128         pshufd  xmm2, xmm2, 93H
   2129         dec     al
   2130         jz      @F
   2131         movdqa  xmm8, xmm4
   2132         shufps  xmm8, xmm5, 214
   2133         pshufd  xmm9, xmm4, 0FH
   2134         pshufd  xmm4, xmm8, 39H
   2135         movdqa  xmm8, xmm6
   2136         shufps  xmm8, xmm7, 250
   2137         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
   2138         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
   2139         por     xmm9, xmm8
   2140         movdqa  xmm8, xmm7
   2141         punpcklqdq xmm8, xmm5
   2142         movdqa  xmm10, xmm6
   2143         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
   2144         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
   2145         por     xmm8, xmm10
   2146         pshufd  xmm8, xmm8, 78H
   2147         punpckhdq xmm5, xmm7
   2148         punpckldq xmm6, xmm5
   2149         pshufd  xmm7, xmm6, 1EH
   2150         movdqa  xmm5, xmm9
   2151         movdqa  xmm6, xmm8
   2152         jmp     @B
   2153 @@:
   2154         pxor    xmm0, xmm2
   2155         pxor    xmm1, xmm3
   2156         movups  xmmword ptr [rcx], xmm0
   2157         movups  xmmword ptr [rcx+10H], xmm1
   2158         movdqa  xmm6, xmmword ptr [rsp]
   2159         movdqa  xmm7, xmmword ptr [rsp+10H]
   2160         movdqa  xmm8, xmmword ptr [rsp+20H]
   2161         movdqa  xmm9, xmmword ptr [rsp+30H]
   2162         movdqa  xmm11, xmmword ptr [rsp+40H]
   2163         movdqa  xmm14, xmmword ptr [rsp+50H]
   2164         movdqa  xmm15, xmmword ptr [rsp+60H]
   2165         add     rsp, 120
   2166         ret
   2167 _blake3_compress_in_place_sse2 ENDP
   2168 blake3_compress_in_place_sse2 ENDP
   2169 
   2170 ALIGN 16
   2171 blake3_compress_xof_sse2 PROC
   2172 _blake3_compress_xof_sse2 PROC
   2173         sub     rsp, 120
   2174         movdqa  xmmword ptr [rsp], xmm6
   2175         movdqa  xmmword ptr [rsp+10H], xmm7
   2176         movdqa  xmmword ptr [rsp+20H], xmm8
   2177         movdqa  xmmword ptr [rsp+30H], xmm9
   2178         movdqa  xmmword ptr [rsp+40H], xmm11
   2179         movdqa  xmmword ptr [rsp+50H], xmm14
   2180         movdqa  xmmword ptr [rsp+60H], xmm15
   2181         movups  xmm0, xmmword ptr [rcx]
   2182         movups  xmm1, xmmword ptr [rcx+10H]
   2183         movaps  xmm2, xmmword ptr [BLAKE3_IV]
   2184         movzx   eax, byte ptr [rsp+0A0H]
   2185         movzx   r8d, r8b
   2186         mov     r10, qword ptr [rsp+0A8H]
   2187         shl     rax, 32
   2188         add     r8, rax
   2189         movq    xmm3, r9
   2190         movq    xmm4, r8
   2191         punpcklqdq xmm3, xmm4
   2192         movups  xmm4, xmmword ptr [rdx]
   2193         movups  xmm5, xmmword ptr [rdx+10H]
   2194         movaps  xmm8, xmm4
   2195         shufps  xmm4, xmm5, 136
   2196         shufps  xmm8, xmm5, 221
   2197         movaps  xmm5, xmm8
   2198         movups  xmm6, xmmword ptr [rdx+20H]
   2199         movups  xmm7, xmmword ptr [rdx+30H]
   2200         movaps  xmm8, xmm6
   2201         shufps  xmm6, xmm7, 136
   2202         pshufd  xmm6, xmm6, 93H
   2203         shufps  xmm8, xmm7, 221
   2204         pshufd  xmm7, xmm8, 93H
   2205         mov     al, 7
   2206 @@:
   2207         paddd   xmm0, xmm4
   2208         paddd   xmm0, xmm1
   2209         pxor    xmm3, xmm0
   2210         pshuflw xmm3, xmm3, 0B1H
   2211         pshufhw xmm3, xmm3, 0B1H
   2212         paddd   xmm2, xmm3
   2213         pxor    xmm1, xmm2
   2214         movdqa  xmm11, xmm1
   2215         pslld   xmm1, 20
   2216         psrld   xmm11, 12
   2217         por     xmm1, xmm11
   2218         paddd   xmm0, xmm5
   2219         paddd   xmm0, xmm1
   2220         pxor    xmm3, xmm0
   2221         movdqa  xmm14, xmm3
   2222         psrld   xmm3, 8
   2223         pslld   xmm14, 24
   2224         pxor    xmm3, xmm14
   2225         paddd   xmm2, xmm3
   2226         pxor    xmm1, xmm2
   2227         movdqa  xmm11, xmm1
   2228         pslld   xmm1, 25
   2229         psrld   xmm11, 7
   2230         por     xmm1, xmm11
   2231         pshufd  xmm0, xmm0, 93H
   2232         pshufd  xmm3, xmm3, 4EH
   2233         pshufd  xmm2, xmm2, 39H
   2234         paddd   xmm0, xmm6
   2235         paddd   xmm0, xmm1
   2236         pxor    xmm3, xmm0
   2237         pshuflw xmm3, xmm3, 0B1H
   2238         pshufhw xmm3, xmm3, 0B1H
   2239         paddd   xmm2, xmm3
   2240         pxor    xmm1, xmm2
   2241         movdqa  xmm11, xmm1
   2242         pslld   xmm1, 20
   2243         psrld   xmm11, 12
   2244         por     xmm1, xmm11
   2245         paddd   xmm0, xmm7
   2246         paddd   xmm0, xmm1
   2247         pxor    xmm3, xmm0
   2248         movdqa  xmm14, xmm3
   2249         psrld   xmm3, 8
   2250         pslld   xmm14, 24
   2251         pxor    xmm3, xmm14
   2252         paddd   xmm2, xmm3
   2253         pxor    xmm1, xmm2
   2254         movdqa  xmm11, xmm1
   2255         pslld   xmm1, 25
   2256         psrld   xmm11, 7
   2257         por     xmm1, xmm11
   2258         pshufd  xmm0, xmm0, 39H
   2259         pshufd  xmm3, xmm3, 4EH
   2260         pshufd  xmm2, xmm2, 93H
   2261         dec     al
   2262         jz      @F
   2263         movdqa  xmm8, xmm4
   2264         shufps  xmm8, xmm5, 214
   2265         pshufd  xmm9, xmm4, 0FH
   2266         pshufd  xmm4, xmm8, 39H
   2267         movdqa  xmm8, xmm6
   2268         shufps  xmm8, xmm7, 250
   2269         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK]
   2270         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
   2271         por     xmm9, xmm8
   2272         movdqa  xmm8, xmm7
   2273         punpcklqdq xmm8, xmm5
   2274         movdqa  xmm10, xmm6
   2275         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
   2276         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
   2277         por     xmm8, xmm10
   2278         pshufd  xmm8, xmm8, 78H
   2279         punpckhdq xmm5, xmm7
   2280         punpckldq xmm6, xmm5
   2281         pshufd  xmm7, xmm6, 1EH
   2282         movdqa  xmm5, xmm9
   2283         movdqa  xmm6, xmm8
   2284         jmp     @B
   2285 @@:
   2286         movdqu  xmm4, xmmword ptr [rcx]
   2287         movdqu  xmm5, xmmword ptr [rcx+10H]
   2288         pxor    xmm0, xmm2
   2289         pxor    xmm1, xmm3
   2290         pxor    xmm2, xmm4
   2291         pxor    xmm3, xmm5
   2292         movups  xmmword ptr [r10], xmm0
   2293         movups  xmmword ptr [r10+10H], xmm1
   2294         movups  xmmword ptr [r10+20H], xmm2
   2295         movups  xmmword ptr [r10+30H], xmm3
   2296         movdqa  xmm6, xmmword ptr [rsp]
   2297         movdqa  xmm7, xmmword ptr [rsp+10H]
   2298         movdqa  xmm8, xmmword ptr [rsp+20H]
   2299         movdqa  xmm9, xmmword ptr [rsp+30H]
   2300         movdqa  xmm11, xmmword ptr [rsp+40H]
   2301         movdqa  xmm14, xmmword ptr [rsp+50H]
   2302         movdqa  xmm15, xmmword ptr [rsp+60H]
   2303         add     rsp, 120
   2304         ret
   2305 _blake3_compress_xof_sse2 ENDP
   2306 blake3_compress_xof_sse2 ENDP
   2307 
   2308 _TEXT ENDS
   2309 
   2310 
   2311 _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
   2312 ALIGN   64
   2313 BLAKE3_IV:
   2314         dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
   2315 
   2316 ADD0:
   2317         dd 0, 1, 2, 3
   2318 
   2319 ADD1:
   2320         dd 4 dup (4)
   2321 
   2322 BLAKE3_IV_0:
   2323         dd 4 dup (6A09E667H)
   2324 
   2325 BLAKE3_IV_1:
   2326         dd 4 dup (0BB67AE85H)
   2327 
   2328 BLAKE3_IV_2:
   2329         dd 4 dup (3C6EF372H)
   2330 
   2331 BLAKE3_IV_3:
   2332         dd 4 dup (0A54FF53AH)
   2333 
   2334 BLAKE3_BLOCK_LEN:
   2335         dd 4 dup (64)
   2336 
   2337 CMP_MSB_MASK:
   2338         dd 8 dup(80000000H)
   2339 
   2340 PBLENDW_0x33_MASK:
   2341        dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
   2342 PBLENDW_0xCC_MASK:
   2343        dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
   2344 PBLENDW_0x3F_MASK:
   2345 	dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
   2346 PBLENDW_0xC0_MASK:
   2347        dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
   2348 
   2349 _RDATA ENDS
   2350 END