blake3_sse41_x86-64_windows_msvc.asm (63194B)
1 public _blake3_hash_many_sse41 2 public blake3_hash_many_sse41 3 public blake3_compress_in_place_sse41 4 public _blake3_compress_in_place_sse41 5 public blake3_compress_xof_sse41 6 public _blake3_compress_xof_sse41 7 8 _TEXT SEGMENT ALIGN(16) 'CODE' 9 10 ALIGN 16 11 blake3_hash_many_sse41 PROC 12 _blake3_hash_many_sse41 PROC 13 push r15 14 push r14 15 push r13 16 push r12 17 push rsi 18 push rdi 19 push rbx 20 push rbp 21 mov rbp, rsp 22 sub rsp, 528 23 and rsp, 0FFFFFFFFFFFFFFC0H 24 movdqa xmmword ptr [rsp+170H], xmm6 25 movdqa xmmword ptr [rsp+180H], xmm7 26 movdqa xmmword ptr [rsp+190H], xmm8 27 movdqa xmmword ptr [rsp+1A0H], xmm9 28 movdqa xmmword ptr [rsp+1B0H], xmm10 29 movdqa xmmword ptr [rsp+1C0H], xmm11 30 movdqa xmmword ptr [rsp+1D0H], xmm12 31 movdqa xmmword ptr [rsp+1E0H], xmm13 32 movdqa xmmword ptr [rsp+1F0H], xmm14 33 movdqa xmmword ptr [rsp+200H], xmm15 34 mov rdi, rcx 35 mov rsi, rdx 36 mov rdx, r8 37 mov rcx, r9 38 mov r8, qword ptr [rbp+68H] 39 movzx r9, byte ptr [rbp+70H] 40 neg r9d 41 movd xmm0, r9d 42 pshufd xmm0, xmm0, 00H 43 movdqa xmmword ptr [rsp+130H], xmm0 44 movdqa xmm1, xmm0 45 pand xmm1, xmmword ptr [ADD0] 46 pand xmm0, xmmword ptr [ADD1] 47 movdqa xmmword ptr [rsp+150H], xmm0 48 movd xmm0, r8d 49 pshufd xmm0, xmm0, 00H 50 paddd xmm0, xmm1 51 movdqa xmmword ptr [rsp+110H], xmm0 52 pxor xmm0, xmmword ptr [CMP_MSB_MASK] 53 pxor xmm1, xmmword ptr [CMP_MSB_MASK] 54 pcmpgtd xmm1, xmm0 55 shr r8, 32 56 movd xmm2, r8d 57 pshufd xmm2, xmm2, 00H 58 psubd xmm2, xmm1 59 movdqa xmmword ptr [rsp+120H], xmm2 60 mov rbx, qword ptr [rbp+90H] 61 mov r15, rdx 62 shl r15, 6 63 movzx r13d, byte ptr [rbp+78H] 64 movzx r12d, byte ptr [rbp+88H] 65 cmp rsi, 4 66 jc final3blocks 67 outerloop4: 68 movdqu xmm3, xmmword ptr [rcx] 69 pshufd xmm0, xmm3, 00H 70 pshufd xmm1, xmm3, 55H 71 pshufd xmm2, xmm3, 0AAH 72 pshufd xmm3, xmm3, 0FFH 73 movdqu xmm7, xmmword ptr [rcx+10H] 74 pshufd xmm4, xmm7, 00H 75 pshufd xmm5, xmm7, 55H 76 pshufd xmm6, xmm7, 0AAH 77 pshufd 
xmm7, xmm7, 0FFH 78 mov r8, qword ptr [rdi] 79 mov r9, qword ptr [rdi+8H] 80 mov r10, qword ptr [rdi+10H] 81 mov r11, qword ptr [rdi+18H] 82 movzx eax, byte ptr [rbp+80H] 83 or eax, r13d 84 xor edx, edx 85 innerloop4: 86 mov r14d, eax 87 or eax, r12d 88 add rdx, 64 89 cmp rdx, r15 90 cmovne eax, r14d 91 movdqu xmm8, xmmword ptr [r8+rdx-40H] 92 movdqu xmm9, xmmword ptr [r9+rdx-40H] 93 movdqu xmm10, xmmword ptr [r10+rdx-40H] 94 movdqu xmm11, xmmword ptr [r11+rdx-40H] 95 movdqa xmm12, xmm8 96 punpckldq xmm8, xmm9 97 punpckhdq xmm12, xmm9 98 movdqa xmm14, xmm10 99 punpckldq xmm10, xmm11 100 punpckhdq xmm14, xmm11 101 movdqa xmm9, xmm8 102 punpcklqdq xmm8, xmm10 103 punpckhqdq xmm9, xmm10 104 movdqa xmm13, xmm12 105 punpcklqdq xmm12, xmm14 106 punpckhqdq xmm13, xmm14 107 movdqa xmmword ptr [rsp], xmm8 108 movdqa xmmword ptr [rsp+10H], xmm9 109 movdqa xmmword ptr [rsp+20H], xmm12 110 movdqa xmmword ptr [rsp+30H], xmm13 111 movdqu xmm8, xmmword ptr [r8+rdx-30H] 112 movdqu xmm9, xmmword ptr [r9+rdx-30H] 113 movdqu xmm10, xmmword ptr [r10+rdx-30H] 114 movdqu xmm11, xmmword ptr [r11+rdx-30H] 115 movdqa xmm12, xmm8 116 punpckldq xmm8, xmm9 117 punpckhdq xmm12, xmm9 118 movdqa xmm14, xmm10 119 punpckldq xmm10, xmm11 120 punpckhdq xmm14, xmm11 121 movdqa xmm9, xmm8 122 punpcklqdq xmm8, xmm10 123 punpckhqdq xmm9, xmm10 124 movdqa xmm13, xmm12 125 punpcklqdq xmm12, xmm14 126 punpckhqdq xmm13, xmm14 127 movdqa xmmword ptr [rsp+40H], xmm8 128 movdqa xmmword ptr [rsp+50H], xmm9 129 movdqa xmmword ptr [rsp+60H], xmm12 130 movdqa xmmword ptr [rsp+70H], xmm13 131 movdqu xmm8, xmmword ptr [r8+rdx-20H] 132 movdqu xmm9, xmmword ptr [r9+rdx-20H] 133 movdqu xmm10, xmmword ptr [r10+rdx-20H] 134 movdqu xmm11, xmmword ptr [r11+rdx-20H] 135 movdqa xmm12, xmm8 136 punpckldq xmm8, xmm9 137 punpckhdq xmm12, xmm9 138 movdqa xmm14, xmm10 139 punpckldq xmm10, xmm11 140 punpckhdq xmm14, xmm11 141 movdqa xmm9, xmm8 142 punpcklqdq xmm8, xmm10 143 punpckhqdq xmm9, xmm10 144 movdqa xmm13, xmm12 145 
punpcklqdq xmm12, xmm14 146 punpckhqdq xmm13, xmm14 147 movdqa xmmword ptr [rsp+80H], xmm8 148 movdqa xmmword ptr [rsp+90H], xmm9 149 movdqa xmmword ptr [rsp+0A0H], xmm12 150 movdqa xmmword ptr [rsp+0B0H], xmm13 151 movdqu xmm8, xmmword ptr [r8+rdx-10H] 152 movdqu xmm9, xmmword ptr [r9+rdx-10H] 153 movdqu xmm10, xmmword ptr [r10+rdx-10H] 154 movdqu xmm11, xmmword ptr [r11+rdx-10H] 155 movdqa xmm12, xmm8 156 punpckldq xmm8, xmm9 157 punpckhdq xmm12, xmm9 158 movdqa xmm14, xmm10 159 punpckldq xmm10, xmm11 160 punpckhdq xmm14, xmm11 161 movdqa xmm9, xmm8 162 punpcklqdq xmm8, xmm10 163 punpckhqdq xmm9, xmm10 164 movdqa xmm13, xmm12 165 punpcklqdq xmm12, xmm14 166 punpckhqdq xmm13, xmm14 167 movdqa xmmword ptr [rsp+0C0H], xmm8 168 movdqa xmmword ptr [rsp+0D0H], xmm9 169 movdqa xmmword ptr [rsp+0E0H], xmm12 170 movdqa xmmword ptr [rsp+0F0H], xmm13 171 movdqa xmm9, xmmword ptr [BLAKE3_IV_1] 172 movdqa xmm10, xmmword ptr [BLAKE3_IV_2] 173 movdqa xmm11, xmmword ptr [BLAKE3_IV_3] 174 movdqa xmm12, xmmword ptr [rsp+110H] 175 movdqa xmm13, xmmword ptr [rsp+120H] 176 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] 177 movd xmm15, eax 178 pshufd xmm15, xmm15, 00H 179 prefetcht0 byte ptr [r8+rdx+80H] 180 prefetcht0 byte ptr [r9+rdx+80H] 181 prefetcht0 byte ptr [r10+rdx+80H] 182 prefetcht0 byte ptr [r11+rdx+80H] 183 paddd xmm0, xmmword ptr [rsp] 184 paddd xmm1, xmmword ptr [rsp+20H] 185 paddd xmm2, xmmword ptr [rsp+40H] 186 paddd xmm3, xmmword ptr [rsp+60H] 187 paddd xmm0, xmm4 188 paddd xmm1, xmm5 189 paddd xmm2, xmm6 190 paddd xmm3, xmm7 191 pxor xmm12, xmm0 192 pxor xmm13, xmm1 193 pxor xmm14, xmm2 194 pxor xmm15, xmm3 195 movdqa xmm8, xmmword ptr [ROT16] 196 pshufb xmm12, xmm8 197 pshufb xmm13, xmm8 198 pshufb xmm14, xmm8 199 pshufb xmm15, xmm8 200 movdqa xmm8, xmmword ptr [BLAKE3_IV_0] 201 paddd xmm8, xmm12 202 paddd xmm9, xmm13 203 paddd xmm10, xmm14 204 paddd xmm11, xmm15 205 pxor xmm4, xmm8 206 pxor xmm5, xmm9 207 pxor xmm6, xmm10 208 pxor xmm7, xmm11 209 movdqa xmmword ptr 
[rsp+100H], xmm8 210 movdqa xmm8, xmm4 211 psrld xmm8, 12 212 pslld xmm4, 20 213 por xmm4, xmm8 214 movdqa xmm8, xmm5 215 psrld xmm8, 12 216 pslld xmm5, 20 217 por xmm5, xmm8 218 movdqa xmm8, xmm6 219 psrld xmm8, 12 220 pslld xmm6, 20 221 por xmm6, xmm8 222 movdqa xmm8, xmm7 223 psrld xmm8, 12 224 pslld xmm7, 20 225 por xmm7, xmm8 226 paddd xmm0, xmmword ptr [rsp+10H] 227 paddd xmm1, xmmword ptr [rsp+30H] 228 paddd xmm2, xmmword ptr [rsp+50H] 229 paddd xmm3, xmmword ptr [rsp+70H] 230 paddd xmm0, xmm4 231 paddd xmm1, xmm5 232 paddd xmm2, xmm6 233 paddd xmm3, xmm7 234 pxor xmm12, xmm0 235 pxor xmm13, xmm1 236 pxor xmm14, xmm2 237 pxor xmm15, xmm3 238 movdqa xmm8, xmmword ptr [ROT8] 239 pshufb xmm12, xmm8 240 pshufb xmm13, xmm8 241 pshufb xmm14, xmm8 242 pshufb xmm15, xmm8 243 movdqa xmm8, xmmword ptr [rsp+100H] 244 paddd xmm8, xmm12 245 paddd xmm9, xmm13 246 paddd xmm10, xmm14 247 paddd xmm11, xmm15 248 pxor xmm4, xmm8 249 pxor xmm5, xmm9 250 pxor xmm6, xmm10 251 pxor xmm7, xmm11 252 movdqa xmmword ptr [rsp+100H], xmm8 253 movdqa xmm8, xmm4 254 psrld xmm8, 7 255 pslld xmm4, 25 256 por xmm4, xmm8 257 movdqa xmm8, xmm5 258 psrld xmm8, 7 259 pslld xmm5, 25 260 por xmm5, xmm8 261 movdqa xmm8, xmm6 262 psrld xmm8, 7 263 pslld xmm6, 25 264 por xmm6, xmm8 265 movdqa xmm8, xmm7 266 psrld xmm8, 7 267 pslld xmm7, 25 268 por xmm7, xmm8 269 paddd xmm0, xmmword ptr [rsp+80H] 270 paddd xmm1, xmmword ptr [rsp+0A0H] 271 paddd xmm2, xmmword ptr [rsp+0C0H] 272 paddd xmm3, xmmword ptr [rsp+0E0H] 273 paddd xmm0, xmm5 274 paddd xmm1, xmm6 275 paddd xmm2, xmm7 276 paddd xmm3, xmm4 277 pxor xmm15, xmm0 278 pxor xmm12, xmm1 279 pxor xmm13, xmm2 280 pxor xmm14, xmm3 281 movdqa xmm8, xmmword ptr [ROT16] 282 pshufb xmm15, xmm8 283 pshufb xmm12, xmm8 284 pshufb xmm13, xmm8 285 pshufb xmm14, xmm8 286 paddd xmm10, xmm15 287 paddd xmm11, xmm12 288 movdqa xmm8, xmmword ptr [rsp+100H] 289 paddd xmm8, xmm13 290 paddd xmm9, xmm14 291 pxor xmm5, xmm10 292 pxor xmm6, xmm11 293 pxor xmm7, xmm8 294 pxor 
xmm4, xmm9 295 movdqa xmmword ptr [rsp+100H], xmm8 296 movdqa xmm8, xmm5 297 psrld xmm8, 12 298 pslld xmm5, 20 299 por xmm5, xmm8 300 movdqa xmm8, xmm6 301 psrld xmm8, 12 302 pslld xmm6, 20 303 por xmm6, xmm8 304 movdqa xmm8, xmm7 305 psrld xmm8, 12 306 pslld xmm7, 20 307 por xmm7, xmm8 308 movdqa xmm8, xmm4 309 psrld xmm8, 12 310 pslld xmm4, 20 311 por xmm4, xmm8 312 paddd xmm0, xmmword ptr [rsp+90H] 313 paddd xmm1, xmmword ptr [rsp+0B0H] 314 paddd xmm2, xmmword ptr [rsp+0D0H] 315 paddd xmm3, xmmword ptr [rsp+0F0H] 316 paddd xmm0, xmm5 317 paddd xmm1, xmm6 318 paddd xmm2, xmm7 319 paddd xmm3, xmm4 320 pxor xmm15, xmm0 321 pxor xmm12, xmm1 322 pxor xmm13, xmm2 323 pxor xmm14, xmm3 324 movdqa xmm8, xmmword ptr [ROT8] 325 pshufb xmm15, xmm8 326 pshufb xmm12, xmm8 327 pshufb xmm13, xmm8 328 pshufb xmm14, xmm8 329 paddd xmm10, xmm15 330 paddd xmm11, xmm12 331 movdqa xmm8, xmmword ptr [rsp+100H] 332 paddd xmm8, xmm13 333 paddd xmm9, xmm14 334 pxor xmm5, xmm10 335 pxor xmm6, xmm11 336 pxor xmm7, xmm8 337 pxor xmm4, xmm9 338 movdqa xmmword ptr [rsp+100H], xmm8 339 movdqa xmm8, xmm5 340 psrld xmm8, 7 341 pslld xmm5, 25 342 por xmm5, xmm8 343 movdqa xmm8, xmm6 344 psrld xmm8, 7 345 pslld xmm6, 25 346 por xmm6, xmm8 347 movdqa xmm8, xmm7 348 psrld xmm8, 7 349 pslld xmm7, 25 350 por xmm7, xmm8 351 movdqa xmm8, xmm4 352 psrld xmm8, 7 353 pslld xmm4, 25 354 por xmm4, xmm8 355 paddd xmm0, xmmword ptr [rsp+20H] 356 paddd xmm1, xmmword ptr [rsp+30H] 357 paddd xmm2, xmmword ptr [rsp+70H] 358 paddd xmm3, xmmword ptr [rsp+40H] 359 paddd xmm0, xmm4 360 paddd xmm1, xmm5 361 paddd xmm2, xmm6 362 paddd xmm3, xmm7 363 pxor xmm12, xmm0 364 pxor xmm13, xmm1 365 pxor xmm14, xmm2 366 pxor xmm15, xmm3 367 movdqa xmm8, xmmword ptr [ROT16] 368 pshufb xmm12, xmm8 369 pshufb xmm13, xmm8 370 pshufb xmm14, xmm8 371 pshufb xmm15, xmm8 372 movdqa xmm8, xmmword ptr [rsp+100H] 373 paddd xmm8, xmm12 374 paddd xmm9, xmm13 375 paddd xmm10, xmm14 376 paddd xmm11, xmm15 377 pxor xmm4, xmm8 378 pxor xmm5, 
xmm9 379 pxor xmm6, xmm10 380 pxor xmm7, xmm11 381 movdqa xmmword ptr [rsp+100H], xmm8 382 movdqa xmm8, xmm4 383 psrld xmm8, 12 384 pslld xmm4, 20 385 por xmm4, xmm8 386 movdqa xmm8, xmm5 387 psrld xmm8, 12 388 pslld xmm5, 20 389 por xmm5, xmm8 390 movdqa xmm8, xmm6 391 psrld xmm8, 12 392 pslld xmm6, 20 393 por xmm6, xmm8 394 movdqa xmm8, xmm7 395 psrld xmm8, 12 396 pslld xmm7, 20 397 por xmm7, xmm8 398 paddd xmm0, xmmword ptr [rsp+60H] 399 paddd xmm1, xmmword ptr [rsp+0A0H] 400 paddd xmm2, xmmword ptr [rsp] 401 paddd xmm3, xmmword ptr [rsp+0D0H] 402 paddd xmm0, xmm4 403 paddd xmm1, xmm5 404 paddd xmm2, xmm6 405 paddd xmm3, xmm7 406 pxor xmm12, xmm0 407 pxor xmm13, xmm1 408 pxor xmm14, xmm2 409 pxor xmm15, xmm3 410 movdqa xmm8, xmmword ptr [ROT8] 411 pshufb xmm12, xmm8 412 pshufb xmm13, xmm8 413 pshufb xmm14, xmm8 414 pshufb xmm15, xmm8 415 movdqa xmm8, xmmword ptr [rsp+100H] 416 paddd xmm8, xmm12 417 paddd xmm9, xmm13 418 paddd xmm10, xmm14 419 paddd xmm11, xmm15 420 pxor xmm4, xmm8 421 pxor xmm5, xmm9 422 pxor xmm6, xmm10 423 pxor xmm7, xmm11 424 movdqa xmmword ptr [rsp+100H], xmm8 425 movdqa xmm8, xmm4 426 psrld xmm8, 7 427 pslld xmm4, 25 428 por xmm4, xmm8 429 movdqa xmm8, xmm5 430 psrld xmm8, 7 431 pslld xmm5, 25 432 por xmm5, xmm8 433 movdqa xmm8, xmm6 434 psrld xmm8, 7 435 pslld xmm6, 25 436 por xmm6, xmm8 437 movdqa xmm8, xmm7 438 psrld xmm8, 7 439 pslld xmm7, 25 440 por xmm7, xmm8 441 paddd xmm0, xmmword ptr [rsp+10H] 442 paddd xmm1, xmmword ptr [rsp+0C0H] 443 paddd xmm2, xmmword ptr [rsp+90H] 444 paddd xmm3, xmmword ptr [rsp+0F0H] 445 paddd xmm0, xmm5 446 paddd xmm1, xmm6 447 paddd xmm2, xmm7 448 paddd xmm3, xmm4 449 pxor xmm15, xmm0 450 pxor xmm12, xmm1 451 pxor xmm13, xmm2 452 pxor xmm14, xmm3 453 movdqa xmm8, xmmword ptr [ROT16] 454 pshufb xmm15, xmm8 455 pshufb xmm12, xmm8 456 pshufb xmm13, xmm8 457 pshufb xmm14, xmm8 458 paddd xmm10, xmm15 459 paddd xmm11, xmm12 460 movdqa xmm8, xmmword ptr [rsp+100H] 461 paddd xmm8, xmm13 462 paddd xmm9, xmm14 463 
pxor xmm5, xmm10 464 pxor xmm6, xmm11 465 pxor xmm7, xmm8 466 pxor xmm4, xmm9 467 movdqa xmmword ptr [rsp+100H], xmm8 468 movdqa xmm8, xmm5 469 psrld xmm8, 12 470 pslld xmm5, 20 471 por xmm5, xmm8 472 movdqa xmm8, xmm6 473 psrld xmm8, 12 474 pslld xmm6, 20 475 por xmm6, xmm8 476 movdqa xmm8, xmm7 477 psrld xmm8, 12 478 pslld xmm7, 20 479 por xmm7, xmm8 480 movdqa xmm8, xmm4 481 psrld xmm8, 12 482 pslld xmm4, 20 483 por xmm4, xmm8 484 paddd xmm0, xmmword ptr [rsp+0B0H] 485 paddd xmm1, xmmword ptr [rsp+50H] 486 paddd xmm2, xmmword ptr [rsp+0E0H] 487 paddd xmm3, xmmword ptr [rsp+80H] 488 paddd xmm0, xmm5 489 paddd xmm1, xmm6 490 paddd xmm2, xmm7 491 paddd xmm3, xmm4 492 pxor xmm15, xmm0 493 pxor xmm12, xmm1 494 pxor xmm13, xmm2 495 pxor xmm14, xmm3 496 movdqa xmm8, xmmword ptr [ROT8] 497 pshufb xmm15, xmm8 498 pshufb xmm12, xmm8 499 pshufb xmm13, xmm8 500 pshufb xmm14, xmm8 501 paddd xmm10, xmm15 502 paddd xmm11, xmm12 503 movdqa xmm8, xmmword ptr [rsp+100H] 504 paddd xmm8, xmm13 505 paddd xmm9, xmm14 506 pxor xmm5, xmm10 507 pxor xmm6, xmm11 508 pxor xmm7, xmm8 509 pxor xmm4, xmm9 510 movdqa xmmword ptr [rsp+100H], xmm8 511 movdqa xmm8, xmm5 512 psrld xmm8, 7 513 pslld xmm5, 25 514 por xmm5, xmm8 515 movdqa xmm8, xmm6 516 psrld xmm8, 7 517 pslld xmm6, 25 518 por xmm6, xmm8 519 movdqa xmm8, xmm7 520 psrld xmm8, 7 521 pslld xmm7, 25 522 por xmm7, xmm8 523 movdqa xmm8, xmm4 524 psrld xmm8, 7 525 pslld xmm4, 25 526 por xmm4, xmm8 527 paddd xmm0, xmmword ptr [rsp+30H] 528 paddd xmm1, xmmword ptr [rsp+0A0H] 529 paddd xmm2, xmmword ptr [rsp+0D0H] 530 paddd xmm3, xmmword ptr [rsp+70H] 531 paddd xmm0, xmm4 532 paddd xmm1, xmm5 533 paddd xmm2, xmm6 534 paddd xmm3, xmm7 535 pxor xmm12, xmm0 536 pxor xmm13, xmm1 537 pxor xmm14, xmm2 538 pxor xmm15, xmm3 539 movdqa xmm8, xmmword ptr [ROT16] 540 pshufb xmm12, xmm8 541 pshufb xmm13, xmm8 542 pshufb xmm14, xmm8 543 pshufb xmm15, xmm8 544 movdqa xmm8, xmmword ptr [rsp+100H] 545 paddd xmm8, xmm12 546 paddd xmm9, xmm13 547 paddd xmm10, 
xmm14 548 paddd xmm11, xmm15 549 pxor xmm4, xmm8 550 pxor xmm5, xmm9 551 pxor xmm6, xmm10 552 pxor xmm7, xmm11 553 movdqa xmmword ptr [rsp+100H], xmm8 554 movdqa xmm8, xmm4 555 psrld xmm8, 12 556 pslld xmm4, 20 557 por xmm4, xmm8 558 movdqa xmm8, xmm5 559 psrld xmm8, 12 560 pslld xmm5, 20 561 por xmm5, xmm8 562 movdqa xmm8, xmm6 563 psrld xmm8, 12 564 pslld xmm6, 20 565 por xmm6, xmm8 566 movdqa xmm8, xmm7 567 psrld xmm8, 12 568 pslld xmm7, 20 569 por xmm7, xmm8 570 paddd xmm0, xmmword ptr [rsp+40H] 571 paddd xmm1, xmmword ptr [rsp+0C0H] 572 paddd xmm2, xmmword ptr [rsp+20H] 573 paddd xmm3, xmmword ptr [rsp+0E0H] 574 paddd xmm0, xmm4 575 paddd xmm1, xmm5 576 paddd xmm2, xmm6 577 paddd xmm3, xmm7 578 pxor xmm12, xmm0 579 pxor xmm13, xmm1 580 pxor xmm14, xmm2 581 pxor xmm15, xmm3 582 movdqa xmm8, xmmword ptr [ROT8] 583 pshufb xmm12, xmm8 584 pshufb xmm13, xmm8 585 pshufb xmm14, xmm8 586 pshufb xmm15, xmm8 587 movdqa xmm8, xmmword ptr [rsp+100H] 588 paddd xmm8, xmm12 589 paddd xmm9, xmm13 590 paddd xmm10, xmm14 591 paddd xmm11, xmm15 592 pxor xmm4, xmm8 593 pxor xmm5, xmm9 594 pxor xmm6, xmm10 595 pxor xmm7, xmm11 596 movdqa xmmword ptr [rsp+100H], xmm8 597 movdqa xmm8, xmm4 598 psrld xmm8, 7 599 pslld xmm4, 25 600 por xmm4, xmm8 601 movdqa xmm8, xmm5 602 psrld xmm8, 7 603 pslld xmm5, 25 604 por xmm5, xmm8 605 movdqa xmm8, xmm6 606 psrld xmm8, 7 607 pslld xmm6, 25 608 por xmm6, xmm8 609 movdqa xmm8, xmm7 610 psrld xmm8, 7 611 pslld xmm7, 25 612 por xmm7, xmm8 613 paddd xmm0, xmmword ptr [rsp+60H] 614 paddd xmm1, xmmword ptr [rsp+90H] 615 paddd xmm2, xmmword ptr [rsp+0B0H] 616 paddd xmm3, xmmword ptr [rsp+80H] 617 paddd xmm0, xmm5 618 paddd xmm1, xmm6 619 paddd xmm2, xmm7 620 paddd xmm3, xmm4 621 pxor xmm15, xmm0 622 pxor xmm12, xmm1 623 pxor xmm13, xmm2 624 pxor xmm14, xmm3 625 movdqa xmm8, xmmword ptr [ROT16] 626 pshufb xmm15, xmm8 627 pshufb xmm12, xmm8 628 pshufb xmm13, xmm8 629 pshufb xmm14, xmm8 630 paddd xmm10, xmm15 631 paddd xmm11, xmm12 632 movdqa xmm8, 
xmmword ptr [rsp+100H] 633 paddd xmm8, xmm13 634 paddd xmm9, xmm14 635 pxor xmm5, xmm10 636 pxor xmm6, xmm11 637 pxor xmm7, xmm8 638 pxor xmm4, xmm9 639 movdqa xmmword ptr [rsp+100H], xmm8 640 movdqa xmm8, xmm5 641 psrld xmm8, 12 642 pslld xmm5, 20 643 por xmm5, xmm8 644 movdqa xmm8, xmm6 645 psrld xmm8, 12 646 pslld xmm6, 20 647 por xmm6, xmm8 648 movdqa xmm8, xmm7 649 psrld xmm8, 12 650 pslld xmm7, 20 651 por xmm7, xmm8 652 movdqa xmm8, xmm4 653 psrld xmm8, 12 654 pslld xmm4, 20 655 por xmm4, xmm8 656 paddd xmm0, xmmword ptr [rsp+50H] 657 paddd xmm1, xmmword ptr [rsp] 658 paddd xmm2, xmmword ptr [rsp+0F0H] 659 paddd xmm3, xmmword ptr [rsp+10H] 660 paddd xmm0, xmm5 661 paddd xmm1, xmm6 662 paddd xmm2, xmm7 663 paddd xmm3, xmm4 664 pxor xmm15, xmm0 665 pxor xmm12, xmm1 666 pxor xmm13, xmm2 667 pxor xmm14, xmm3 668 movdqa xmm8, xmmword ptr [ROT8] 669 pshufb xmm15, xmm8 670 pshufb xmm12, xmm8 671 pshufb xmm13, xmm8 672 pshufb xmm14, xmm8 673 paddd xmm10, xmm15 674 paddd xmm11, xmm12 675 movdqa xmm8, xmmword ptr [rsp+100H] 676 paddd xmm8, xmm13 677 paddd xmm9, xmm14 678 pxor xmm5, xmm10 679 pxor xmm6, xmm11 680 pxor xmm7, xmm8 681 pxor xmm4, xmm9 682 movdqa xmmword ptr [rsp+100H], xmm8 683 movdqa xmm8, xmm5 684 psrld xmm8, 7 685 pslld xmm5, 25 686 por xmm5, xmm8 687 movdqa xmm8, xmm6 688 psrld xmm8, 7 689 pslld xmm6, 25 690 por xmm6, xmm8 691 movdqa xmm8, xmm7 692 psrld xmm8, 7 693 pslld xmm7, 25 694 por xmm7, xmm8 695 movdqa xmm8, xmm4 696 psrld xmm8, 7 697 pslld xmm4, 25 698 por xmm4, xmm8 699 paddd xmm0, xmmword ptr [rsp+0A0H] 700 paddd xmm1, xmmword ptr [rsp+0C0H] 701 paddd xmm2, xmmword ptr [rsp+0E0H] 702 paddd xmm3, xmmword ptr [rsp+0D0H] 703 paddd xmm0, xmm4 704 paddd xmm1, xmm5 705 paddd xmm2, xmm6 706 paddd xmm3, xmm7 707 pxor xmm12, xmm0 708 pxor xmm13, xmm1 709 pxor xmm14, xmm2 710 pxor xmm15, xmm3 711 movdqa xmm8, xmmword ptr [ROT16] 712 pshufb xmm12, xmm8 713 pshufb xmm13, xmm8 714 pshufb xmm14, xmm8 715 pshufb xmm15, xmm8 716 movdqa xmm8, xmmword ptr 
[rsp+100H] 717 paddd xmm8, xmm12 718 paddd xmm9, xmm13 719 paddd xmm10, xmm14 720 paddd xmm11, xmm15 721 pxor xmm4, xmm8 722 pxor xmm5, xmm9 723 pxor xmm6, xmm10 724 pxor xmm7, xmm11 725 movdqa xmmword ptr [rsp+100H], xmm8 726 movdqa xmm8, xmm4 727 psrld xmm8, 12 728 pslld xmm4, 20 729 por xmm4, xmm8 730 movdqa xmm8, xmm5 731 psrld xmm8, 12 732 pslld xmm5, 20 733 por xmm5, xmm8 734 movdqa xmm8, xmm6 735 psrld xmm8, 12 736 pslld xmm6, 20 737 por xmm6, xmm8 738 movdqa xmm8, xmm7 739 psrld xmm8, 12 740 pslld xmm7, 20 741 por xmm7, xmm8 742 paddd xmm0, xmmword ptr [rsp+70H] 743 paddd xmm1, xmmword ptr [rsp+90H] 744 paddd xmm2, xmmword ptr [rsp+30H] 745 paddd xmm3, xmmword ptr [rsp+0F0H] 746 paddd xmm0, xmm4 747 paddd xmm1, xmm5 748 paddd xmm2, xmm6 749 paddd xmm3, xmm7 750 pxor xmm12, xmm0 751 pxor xmm13, xmm1 752 pxor xmm14, xmm2 753 pxor xmm15, xmm3 754 movdqa xmm8, xmmword ptr [ROT8] 755 pshufb xmm12, xmm8 756 pshufb xmm13, xmm8 757 pshufb xmm14, xmm8 758 pshufb xmm15, xmm8 759 movdqa xmm8, xmmword ptr [rsp+100H] 760 paddd xmm8, xmm12 761 paddd xmm9, xmm13 762 paddd xmm10, xmm14 763 paddd xmm11, xmm15 764 pxor xmm4, xmm8 765 pxor xmm5, xmm9 766 pxor xmm6, xmm10 767 pxor xmm7, xmm11 768 movdqa xmmword ptr [rsp+100H], xmm8 769 movdqa xmm8, xmm4 770 psrld xmm8, 7 771 pslld xmm4, 25 772 por xmm4, xmm8 773 movdqa xmm8, xmm5 774 psrld xmm8, 7 775 pslld xmm5, 25 776 por xmm5, xmm8 777 movdqa xmm8, xmm6 778 psrld xmm8, 7 779 pslld xmm6, 25 780 por xmm6, xmm8 781 movdqa xmm8, xmm7 782 psrld xmm8, 7 783 pslld xmm7, 25 784 por xmm7, xmm8 785 paddd xmm0, xmmword ptr [rsp+40H] 786 paddd xmm1, xmmword ptr [rsp+0B0H] 787 paddd xmm2, xmmword ptr [rsp+50H] 788 paddd xmm3, xmmword ptr [rsp+10H] 789 paddd xmm0, xmm5 790 paddd xmm1, xmm6 791 paddd xmm2, xmm7 792 paddd xmm3, xmm4 793 pxor xmm15, xmm0 794 pxor xmm12, xmm1 795 pxor xmm13, xmm2 796 pxor xmm14, xmm3 797 movdqa xmm8, xmmword ptr [ROT16] 798 pshufb xmm15, xmm8 799 pshufb xmm12, xmm8 800 pshufb xmm13, xmm8 801 pshufb xmm14, 
xmm8 802 paddd xmm10, xmm15 803 paddd xmm11, xmm12 804 movdqa xmm8, xmmword ptr [rsp+100H] 805 paddd xmm8, xmm13 806 paddd xmm9, xmm14 807 pxor xmm5, xmm10 808 pxor xmm6, xmm11 809 pxor xmm7, xmm8 810 pxor xmm4, xmm9 811 movdqa xmmword ptr [rsp+100H], xmm8 812 movdqa xmm8, xmm5 813 psrld xmm8, 12 814 pslld xmm5, 20 815 por xmm5, xmm8 816 movdqa xmm8, xmm6 817 psrld xmm8, 12 818 pslld xmm6, 20 819 por xmm6, xmm8 820 movdqa xmm8, xmm7 821 psrld xmm8, 12 822 pslld xmm7, 20 823 por xmm7, xmm8 824 movdqa xmm8, xmm4 825 psrld xmm8, 12 826 pslld xmm4, 20 827 por xmm4, xmm8 828 paddd xmm0, xmmword ptr [rsp] 829 paddd xmm1, xmmword ptr [rsp+20H] 830 paddd xmm2, xmmword ptr [rsp+80H] 831 paddd xmm3, xmmword ptr [rsp+60H] 832 paddd xmm0, xmm5 833 paddd xmm1, xmm6 834 paddd xmm2, xmm7 835 paddd xmm3, xmm4 836 pxor xmm15, xmm0 837 pxor xmm12, xmm1 838 pxor xmm13, xmm2 839 pxor xmm14, xmm3 840 movdqa xmm8, xmmword ptr [ROT8] 841 pshufb xmm15, xmm8 842 pshufb xmm12, xmm8 843 pshufb xmm13, xmm8 844 pshufb xmm14, xmm8 845 paddd xmm10, xmm15 846 paddd xmm11, xmm12 847 movdqa xmm8, xmmword ptr [rsp+100H] 848 paddd xmm8, xmm13 849 paddd xmm9, xmm14 850 pxor xmm5, xmm10 851 pxor xmm6, xmm11 852 pxor xmm7, xmm8 853 pxor xmm4, xmm9 854 movdqa xmmword ptr [rsp+100H], xmm8 855 movdqa xmm8, xmm5 856 psrld xmm8, 7 857 pslld xmm5, 25 858 por xmm5, xmm8 859 movdqa xmm8, xmm6 860 psrld xmm8, 7 861 pslld xmm6, 25 862 por xmm6, xmm8 863 movdqa xmm8, xmm7 864 psrld xmm8, 7 865 pslld xmm7, 25 866 por xmm7, xmm8 867 movdqa xmm8, xmm4 868 psrld xmm8, 7 869 pslld xmm4, 25 870 por xmm4, xmm8 871 paddd xmm0, xmmword ptr [rsp+0C0H] 872 paddd xmm1, xmmword ptr [rsp+90H] 873 paddd xmm2, xmmword ptr [rsp+0F0H] 874 paddd xmm3, xmmword ptr [rsp+0E0H] 875 paddd xmm0, xmm4 876 paddd xmm1, xmm5 877 paddd xmm2, xmm6 878 paddd xmm3, xmm7 879 pxor xmm12, xmm0 880 pxor xmm13, xmm1 881 pxor xmm14, xmm2 882 pxor xmm15, xmm3 883 movdqa xmm8, xmmword ptr [ROT16] 884 pshufb xmm12, xmm8 885 pshufb xmm13, xmm8 886 pshufb 
xmm14, xmm8 887 pshufb xmm15, xmm8 888 movdqa xmm8, xmmword ptr [rsp+100H] 889 paddd xmm8, xmm12 890 paddd xmm9, xmm13 891 paddd xmm10, xmm14 892 paddd xmm11, xmm15 893 pxor xmm4, xmm8 894 pxor xmm5, xmm9 895 pxor xmm6, xmm10 896 pxor xmm7, xmm11 897 movdqa xmmword ptr [rsp+100H], xmm8 898 movdqa xmm8, xmm4 899 psrld xmm8, 12 900 pslld xmm4, 20 901 por xmm4, xmm8 902 movdqa xmm8, xmm5 903 psrld xmm8, 12 904 pslld xmm5, 20 905 por xmm5, xmm8 906 movdqa xmm8, xmm6 907 psrld xmm8, 12 908 pslld xmm6, 20 909 por xmm6, xmm8 910 movdqa xmm8, xmm7 911 psrld xmm8, 12 912 pslld xmm7, 20 913 por xmm7, xmm8 914 paddd xmm0, xmmword ptr [rsp+0D0H] 915 paddd xmm1, xmmword ptr [rsp+0B0H] 916 paddd xmm2, xmmword ptr [rsp+0A0H] 917 paddd xmm3, xmmword ptr [rsp+80H] 918 paddd xmm0, xmm4 919 paddd xmm1, xmm5 920 paddd xmm2, xmm6 921 paddd xmm3, xmm7 922 pxor xmm12, xmm0 923 pxor xmm13, xmm1 924 pxor xmm14, xmm2 925 pxor xmm15, xmm3 926 movdqa xmm8, xmmword ptr [ROT8] 927 pshufb xmm12, xmm8 928 pshufb xmm13, xmm8 929 pshufb xmm14, xmm8 930 pshufb xmm15, xmm8 931 movdqa xmm8, xmmword ptr [rsp+100H] 932 paddd xmm8, xmm12 933 paddd xmm9, xmm13 934 paddd xmm10, xmm14 935 paddd xmm11, xmm15 936 pxor xmm4, xmm8 937 pxor xmm5, xmm9 938 pxor xmm6, xmm10 939 pxor xmm7, xmm11 940 movdqa xmmword ptr [rsp+100H], xmm8 941 movdqa xmm8, xmm4 942 psrld xmm8, 7 943 pslld xmm4, 25 944 por xmm4, xmm8 945 movdqa xmm8, xmm5 946 psrld xmm8, 7 947 pslld xmm5, 25 948 por xmm5, xmm8 949 movdqa xmm8, xmm6 950 psrld xmm8, 7 951 pslld xmm6, 25 952 por xmm6, xmm8 953 movdqa xmm8, xmm7 954 psrld xmm8, 7 955 pslld xmm7, 25 956 por xmm7, xmm8 957 paddd xmm0, xmmword ptr [rsp+70H] 958 paddd xmm1, xmmword ptr [rsp+50H] 959 paddd xmm2, xmmword ptr [rsp] 960 paddd xmm3, xmmword ptr [rsp+60H] 961 paddd xmm0, xmm5 962 paddd xmm1, xmm6 963 paddd xmm2, xmm7 964 paddd xmm3, xmm4 965 pxor xmm15, xmm0 966 pxor xmm12, xmm1 967 pxor xmm13, xmm2 968 pxor xmm14, xmm3 969 movdqa xmm8, xmmword ptr [ROT16] 970 pshufb xmm15, xmm8 971 
pshufb xmm12, xmm8 972 pshufb xmm13, xmm8 973 pshufb xmm14, xmm8 974 paddd xmm10, xmm15 975 paddd xmm11, xmm12 976 movdqa xmm8, xmmword ptr [rsp+100H] 977 paddd xmm8, xmm13 978 paddd xmm9, xmm14 979 pxor xmm5, xmm10 980 pxor xmm6, xmm11 981 pxor xmm7, xmm8 982 pxor xmm4, xmm9 983 movdqa xmmword ptr [rsp+100H], xmm8 984 movdqa xmm8, xmm5 985 psrld xmm8, 12 986 pslld xmm5, 20 987 por xmm5, xmm8 988 movdqa xmm8, xmm6 989 psrld xmm8, 12 990 pslld xmm6, 20 991 por xmm6, xmm8 992 movdqa xmm8, xmm7 993 psrld xmm8, 12 994 pslld xmm7, 20 995 por xmm7, xmm8 996 movdqa xmm8, xmm4 997 psrld xmm8, 12 998 pslld xmm4, 20 999 por xmm4, xmm8 1000 paddd xmm0, xmmword ptr [rsp+20H] 1001 paddd xmm1, xmmword ptr [rsp+30H] 1002 paddd xmm2, xmmword ptr [rsp+10H] 1003 paddd xmm3, xmmword ptr [rsp+40H] 1004 paddd xmm0, xmm5 1005 paddd xmm1, xmm6 1006 paddd xmm2, xmm7 1007 paddd xmm3, xmm4 1008 pxor xmm15, xmm0 1009 pxor xmm12, xmm1 1010 pxor xmm13, xmm2 1011 pxor xmm14, xmm3 1012 movdqa xmm8, xmmword ptr [ROT8] 1013 pshufb xmm15, xmm8 1014 pshufb xmm12, xmm8 1015 pshufb xmm13, xmm8 1016 pshufb xmm14, xmm8 1017 paddd xmm10, xmm15 1018 paddd xmm11, xmm12 1019 movdqa xmm8, xmmword ptr [rsp+100H] 1020 paddd xmm8, xmm13 1021 paddd xmm9, xmm14 1022 pxor xmm5, xmm10 1023 pxor xmm6, xmm11 1024 pxor xmm7, xmm8 1025 pxor xmm4, xmm9 1026 movdqa xmmword ptr [rsp+100H], xmm8 1027 movdqa xmm8, xmm5 1028 psrld xmm8, 7 1029 pslld xmm5, 25 1030 por xmm5, xmm8 1031 movdqa xmm8, xmm6 1032 psrld xmm8, 7 1033 pslld xmm6, 25 1034 por xmm6, xmm8 1035 movdqa xmm8, xmm7 1036 psrld xmm8, 7 1037 pslld xmm7, 25 1038 por xmm7, xmm8 1039 movdqa xmm8, xmm4 1040 psrld xmm8, 7 1041 pslld xmm4, 25 1042 por xmm4, xmm8 1043 paddd xmm0, xmmword ptr [rsp+90H] 1044 paddd xmm1, xmmword ptr [rsp+0B0H] 1045 paddd xmm2, xmmword ptr [rsp+80H] 1046 paddd xmm3, xmmword ptr [rsp+0F0H] 1047 paddd xmm0, xmm4 1048 paddd xmm1, xmm5 1049 paddd xmm2, xmm6 1050 paddd xmm3, xmm7 1051 pxor xmm12, xmm0 1052 pxor xmm13, xmm1 1053 pxor xmm14, xmm2 
1054 pxor xmm15, xmm3 1055 movdqa xmm8, xmmword ptr [ROT16] 1056 pshufb xmm12, xmm8 1057 pshufb xmm13, xmm8 1058 pshufb xmm14, xmm8 1059 pshufb xmm15, xmm8 1060 movdqa xmm8, xmmword ptr [rsp+100H] 1061 paddd xmm8, xmm12 1062 paddd xmm9, xmm13 1063 paddd xmm10, xmm14 1064 paddd xmm11, xmm15 1065 pxor xmm4, xmm8 1066 pxor xmm5, xmm9 1067 pxor xmm6, xmm10 1068 pxor xmm7, xmm11 1069 movdqa xmmword ptr [rsp+100H], xmm8 1070 movdqa xmm8, xmm4 1071 psrld xmm8, 12 1072 pslld xmm4, 20 1073 por xmm4, xmm8 1074 movdqa xmm8, xmm5 1075 psrld xmm8, 12 1076 pslld xmm5, 20 1077 por xmm5, xmm8 1078 movdqa xmm8, xmm6 1079 psrld xmm8, 12 1080 pslld xmm6, 20 1081 por xmm6, xmm8 1082 movdqa xmm8, xmm7 1083 psrld xmm8, 12 1084 pslld xmm7, 20 1085 por xmm7, xmm8 1086 paddd xmm0, xmmword ptr [rsp+0E0H] 1087 paddd xmm1, xmmword ptr [rsp+50H] 1088 paddd xmm2, xmmword ptr [rsp+0C0H] 1089 paddd xmm3, xmmword ptr [rsp+10H] 1090 paddd xmm0, xmm4 1091 paddd xmm1, xmm5 1092 paddd xmm2, xmm6 1093 paddd xmm3, xmm7 1094 pxor xmm12, xmm0 1095 pxor xmm13, xmm1 1096 pxor xmm14, xmm2 1097 pxor xmm15, xmm3 1098 movdqa xmm8, xmmword ptr [ROT8] 1099 pshufb xmm12, xmm8 1100 pshufb xmm13, xmm8 1101 pshufb xmm14, xmm8 1102 pshufb xmm15, xmm8 1103 movdqa xmm8, xmmword ptr [rsp+100H] 1104 paddd xmm8, xmm12 1105 paddd xmm9, xmm13 1106 paddd xmm10, xmm14 1107 paddd xmm11, xmm15 1108 pxor xmm4, xmm8 1109 pxor xmm5, xmm9 1110 pxor xmm6, xmm10 1111 pxor xmm7, xmm11 1112 movdqa xmmword ptr [rsp+100H], xmm8 1113 movdqa xmm8, xmm4 1114 psrld xmm8, 7 1115 pslld xmm4, 25 1116 por xmm4, xmm8 1117 movdqa xmm8, xmm5 1118 psrld xmm8, 7 1119 pslld xmm5, 25 1120 por xmm5, xmm8 1121 movdqa xmm8, xmm6 1122 psrld xmm8, 7 1123 pslld xmm6, 25 1124 por xmm6, xmm8 1125 movdqa xmm8, xmm7 1126 psrld xmm8, 7 1127 pslld xmm7, 25 1128 por xmm7, xmm8 1129 paddd xmm0, xmmword ptr [rsp+0D0H] 1130 paddd xmm1, xmmword ptr [rsp] 1131 paddd xmm2, xmmword ptr [rsp+20H] 1132 paddd xmm3, xmmword ptr [rsp+40H] 1133 paddd xmm0, xmm5 1134 paddd xmm1, 
xmm6 1135 paddd xmm2, xmm7 1136 paddd xmm3, xmm4 1137 pxor xmm15, xmm0 1138 pxor xmm12, xmm1 1139 pxor xmm13, xmm2 1140 pxor xmm14, xmm3 1141 movdqa xmm8, xmmword ptr [ROT16] 1142 pshufb xmm15, xmm8 1143 pshufb xmm12, xmm8 1144 pshufb xmm13, xmm8 1145 pshufb xmm14, xmm8 1146 paddd xmm10, xmm15 1147 paddd xmm11, xmm12 1148 movdqa xmm8, xmmword ptr [rsp+100H] 1149 paddd xmm8, xmm13 1150 paddd xmm9, xmm14 1151 pxor xmm5, xmm10 1152 pxor xmm6, xmm11 1153 pxor xmm7, xmm8 1154 pxor xmm4, xmm9 1155 movdqa xmmword ptr [rsp+100H], xmm8 1156 movdqa xmm8, xmm5 1157 psrld xmm8, 12 1158 pslld xmm5, 20 1159 por xmm5, xmm8 1160 movdqa xmm8, xmm6 1161 psrld xmm8, 12 1162 pslld xmm6, 20 1163 por xmm6, xmm8 1164 movdqa xmm8, xmm7 1165 psrld xmm8, 12 1166 pslld xmm7, 20 1167 por xmm7, xmm8 1168 movdqa xmm8, xmm4 1169 psrld xmm8, 12 1170 pslld xmm4, 20 1171 por xmm4, xmm8 1172 paddd xmm0, xmmword ptr [rsp+30H] 1173 paddd xmm1, xmmword ptr [rsp+0A0H] 1174 paddd xmm2, xmmword ptr [rsp+60H] 1175 paddd xmm3, xmmword ptr [rsp+70H] 1176 paddd xmm0, xmm5 1177 paddd xmm1, xmm6 1178 paddd xmm2, xmm7 1179 paddd xmm3, xmm4 1180 pxor xmm15, xmm0 1181 pxor xmm12, xmm1 1182 pxor xmm13, xmm2 1183 pxor xmm14, xmm3 1184 movdqa xmm8, xmmword ptr [ROT8] 1185 pshufb xmm15, xmm8 1186 pshufb xmm12, xmm8 1187 pshufb xmm13, xmm8 1188 pshufb xmm14, xmm8 1189 paddd xmm10, xmm15 1190 paddd xmm11, xmm12 1191 movdqa xmm8, xmmword ptr [rsp+100H] 1192 paddd xmm8, xmm13 1193 paddd xmm9, xmm14 1194 pxor xmm5, xmm10 1195 pxor xmm6, xmm11 1196 pxor xmm7, xmm8 1197 pxor xmm4, xmm9 1198 movdqa xmmword ptr [rsp+100H], xmm8 1199 movdqa xmm8, xmm5 1200 psrld xmm8, 7 1201 pslld xmm5, 25 1202 por xmm5, xmm8 1203 movdqa xmm8, xmm6 1204 psrld xmm8, 7 1205 pslld xmm6, 25 1206 por xmm6, xmm8 1207 movdqa xmm8, xmm7 1208 psrld xmm8, 7 1209 pslld xmm7, 25 1210 por xmm7, xmm8 1211 movdqa xmm8, xmm4 1212 psrld xmm8, 7 1213 pslld xmm4, 25 1214 por xmm4, xmm8 1215 paddd xmm0, xmmword ptr [rsp+0B0H] 1216 paddd xmm1, xmmword ptr [rsp+50H] 
1217 paddd xmm2, xmmword ptr [rsp+10H] 1218 paddd xmm3, xmmword ptr [rsp+80H] 1219 paddd xmm0, xmm4 1220 paddd xmm1, xmm5 1221 paddd xmm2, xmm6 1222 paddd xmm3, xmm7 1223 pxor xmm12, xmm0 1224 pxor xmm13, xmm1 1225 pxor xmm14, xmm2 1226 pxor xmm15, xmm3 1227 movdqa xmm8, xmmword ptr [ROT16] 1228 pshufb xmm12, xmm8 1229 pshufb xmm13, xmm8 1230 pshufb xmm14, xmm8 1231 pshufb xmm15, xmm8 1232 movdqa xmm8, xmmword ptr [rsp+100H] 1233 paddd xmm8, xmm12 1234 paddd xmm9, xmm13 1235 paddd xmm10, xmm14 1236 paddd xmm11, xmm15 1237 pxor xmm4, xmm8 1238 pxor xmm5, xmm9 1239 pxor xmm6, xmm10 1240 pxor xmm7, xmm11 1241 movdqa xmmword ptr [rsp+100H], xmm8 1242 movdqa xmm8, xmm4 1243 psrld xmm8, 12 1244 pslld xmm4, 20 1245 por xmm4, xmm8 1246 movdqa xmm8, xmm5 1247 psrld xmm8, 12 1248 pslld xmm5, 20 1249 por xmm5, xmm8 1250 movdqa xmm8, xmm6 1251 psrld xmm8, 12 1252 pslld xmm6, 20 1253 por xmm6, xmm8 1254 movdqa xmm8, xmm7 1255 psrld xmm8, 12 1256 pslld xmm7, 20 1257 por xmm7, xmm8 1258 paddd xmm0, xmmword ptr [rsp+0F0H] 1259 paddd xmm1, xmmword ptr [rsp] 1260 paddd xmm2, xmmword ptr [rsp+90H] 1261 paddd xmm3, xmmword ptr [rsp+60H] 1262 paddd xmm0, xmm4 1263 paddd xmm1, xmm5 1264 paddd xmm2, xmm6 1265 paddd xmm3, xmm7 1266 pxor xmm12, xmm0 1267 pxor xmm13, xmm1 1268 pxor xmm14, xmm2 1269 pxor xmm15, xmm3 1270 movdqa xmm8, xmmword ptr [ROT8] 1271 pshufb xmm12, xmm8 1272 pshufb xmm13, xmm8 1273 pshufb xmm14, xmm8 1274 pshufb xmm15, xmm8 1275 movdqa xmm8, xmmword ptr [rsp+100H] 1276 paddd xmm8, xmm12 1277 paddd xmm9, xmm13 1278 paddd xmm10, xmm14 1279 paddd xmm11, xmm15 1280 pxor xmm4, xmm8 1281 pxor xmm5, xmm9 1282 pxor xmm6, xmm10 1283 pxor xmm7, xmm11 1284 movdqa xmmword ptr [rsp+100H], xmm8 1285 movdqa xmm8, xmm4 1286 psrld xmm8, 7 1287 pslld xmm4, 25 1288 por xmm4, xmm8 1289 movdqa xmm8, xmm5 1290 psrld xmm8, 7 1291 pslld xmm5, 25 1292 por xmm5, xmm8 1293 movdqa xmm8, xmm6 1294 psrld xmm8, 7 1295 pslld xmm6, 25 1296 por xmm6, xmm8 1297 movdqa xmm8, xmm7 1298 psrld xmm8, 7 1299 
pslld xmm7, 25 1300 por xmm7, xmm8 1301 paddd xmm0, xmmword ptr [rsp+0E0H] 1302 paddd xmm1, xmmword ptr [rsp+20H] 1303 paddd xmm2, xmmword ptr [rsp+30H] 1304 paddd xmm3, xmmword ptr [rsp+70H] 1305 paddd xmm0, xmm5 1306 paddd xmm1, xmm6 1307 paddd xmm2, xmm7 1308 paddd xmm3, xmm4 1309 pxor xmm15, xmm0 1310 pxor xmm12, xmm1 1311 pxor xmm13, xmm2 1312 pxor xmm14, xmm3 1313 movdqa xmm8, xmmword ptr [ROT16] 1314 pshufb xmm15, xmm8 1315 pshufb xmm12, xmm8 1316 pshufb xmm13, xmm8 1317 pshufb xmm14, xmm8 1318 paddd xmm10, xmm15 1319 paddd xmm11, xmm12 1320 movdqa xmm8, xmmword ptr [rsp+100H] 1321 paddd xmm8, xmm13 1322 paddd xmm9, xmm14 1323 pxor xmm5, xmm10 1324 pxor xmm6, xmm11 1325 pxor xmm7, xmm8 1326 pxor xmm4, xmm9 1327 movdqa xmmword ptr [rsp+100H], xmm8 1328 movdqa xmm8, xmm5 1329 psrld xmm8, 12 1330 pslld xmm5, 20 1331 por xmm5, xmm8 1332 movdqa xmm8, xmm6 1333 psrld xmm8, 12 1334 pslld xmm6, 20 1335 por xmm6, xmm8 1336 movdqa xmm8, xmm7 1337 psrld xmm8, 12 1338 pslld xmm7, 20 1339 por xmm7, xmm8 1340 movdqa xmm8, xmm4 1341 psrld xmm8, 12 1342 pslld xmm4, 20 1343 por xmm4, xmm8 1344 paddd xmm0, xmmword ptr [rsp+0A0H] 1345 paddd xmm1, xmmword ptr [rsp+0C0H] 1346 paddd xmm2, xmmword ptr [rsp+40H] 1347 paddd xmm3, xmmword ptr [rsp+0D0H] 1348 paddd xmm0, xmm5 1349 paddd xmm1, xmm6 1350 paddd xmm2, xmm7 1351 paddd xmm3, xmm4 1352 pxor xmm15, xmm0 1353 pxor xmm12, xmm1 1354 pxor xmm13, xmm2 1355 pxor xmm14, xmm3 1356 movdqa xmm8, xmmword ptr [ROT8] 1357 pshufb xmm15, xmm8 1358 pshufb xmm12, xmm8 1359 pshufb xmm13, xmm8 1360 pshufb xmm14, xmm8 1361 paddd xmm10, xmm15 1362 paddd xmm11, xmm12 1363 movdqa xmm8, xmmword ptr [rsp+100H] 1364 paddd xmm8, xmm13 1365 paddd xmm9, xmm14 1366 pxor xmm5, xmm10 1367 pxor xmm6, xmm11 1368 pxor xmm7, xmm8 1369 pxor xmm4, xmm9 1370 pxor xmm0, xmm8 1371 pxor xmm1, xmm9 1372 pxor xmm2, xmm10 1373 pxor xmm3, xmm11 1374 movdqa xmm8, xmm5 1375 psrld xmm8, 7 1376 pslld xmm5, 25 1377 por xmm5, xmm8 1378 movdqa xmm8, xmm6 1379 psrld xmm8, 7 1380 
pslld xmm6, 25 1381 por xmm6, xmm8 1382 movdqa xmm8, xmm7 1383 psrld xmm8, 7 1384 pslld xmm7, 25 1385 por xmm7, xmm8 1386 movdqa xmm8, xmm4 1387 psrld xmm8, 7 1388 pslld xmm4, 25 1389 por xmm4, xmm8 1390 pxor xmm4, xmm12 1391 pxor xmm5, xmm13 1392 pxor xmm6, xmm14 1393 pxor xmm7, xmm15 1394 mov eax, r13d 1395 jne innerloop4 1396 movdqa xmm9, xmm0 1397 punpckldq xmm0, xmm1 1398 punpckhdq xmm9, xmm1 1399 movdqa xmm11, xmm2 1400 punpckldq xmm2, xmm3 1401 punpckhdq xmm11, xmm3 1402 movdqa xmm1, xmm0 1403 punpcklqdq xmm0, xmm2 1404 punpckhqdq xmm1, xmm2 1405 movdqa xmm3, xmm9 1406 punpcklqdq xmm9, xmm11 1407 punpckhqdq xmm3, xmm11 1408 movdqu xmmword ptr [rbx], xmm0 1409 movdqu xmmword ptr [rbx+20H], xmm1 1410 movdqu xmmword ptr [rbx+40H], xmm9 1411 movdqu xmmword ptr [rbx+60H], xmm3 1412 movdqa xmm9, xmm4 1413 punpckldq xmm4, xmm5 1414 punpckhdq xmm9, xmm5 1415 movdqa xmm11, xmm6 1416 punpckldq xmm6, xmm7 1417 punpckhdq xmm11, xmm7 1418 movdqa xmm5, xmm4 1419 punpcklqdq xmm4, xmm6 1420 punpckhqdq xmm5, xmm6 1421 movdqa xmm7, xmm9 1422 punpcklqdq xmm9, xmm11 1423 punpckhqdq xmm7, xmm11 1424 movdqu xmmword ptr [rbx+10H], xmm4 1425 movdqu xmmword ptr [rbx+30H], xmm5 1426 movdqu xmmword ptr [rbx+50H], xmm9 1427 movdqu xmmword ptr [rbx+70H], xmm7 1428 movdqa xmm1, xmmword ptr [rsp+110H] 1429 movdqa xmm0, xmm1 1430 paddd xmm1, xmmword ptr [rsp+150H] 1431 movdqa xmmword ptr [rsp+110H], xmm1 1432 pxor xmm0, xmmword ptr [CMP_MSB_MASK] 1433 pxor xmm1, xmmword ptr [CMP_MSB_MASK] 1434 pcmpgtd xmm0, xmm1 1435 movdqa xmm1, xmmword ptr [rsp+120H] 1436 psubd xmm1, xmm0 1437 movdqa xmmword ptr [rsp+120H], xmm1 1438 add rbx, 128 1439 add rdi, 32 1440 sub rsi, 4 1441 cmp rsi, 4 1442 jnc outerloop4 1443 test rsi, rsi 1444 jne final3blocks 1445 unwind: 1446 movdqa xmm6, xmmword ptr [rsp+170H] 1447 movdqa xmm7, xmmword ptr [rsp+180H] 1448 movdqa xmm8, xmmword ptr [rsp+190H] 1449 movdqa xmm9, xmmword ptr [rsp+1A0H] 1450 movdqa xmm10, xmmword ptr [rsp+1B0H] 1451 movdqa xmm11, xmmword ptr 
[rsp+1C0H] 1452 movdqa xmm12, xmmword ptr [rsp+1D0H] 1453 movdqa xmm13, xmmword ptr [rsp+1E0H] 1454 movdqa xmm14, xmmword ptr [rsp+1F0H] 1455 movdqa xmm15, xmmword ptr [rsp+200H] 1456 mov rsp, rbp 1457 pop rbp 1458 pop rbx 1459 pop rdi 1460 pop rsi 1461 pop r12 1462 pop r13 1463 pop r14 1464 pop r15 1465 ret 1466 ALIGN 16 1467 final3blocks: 1468 test esi, 2H 1469 je final1block 1470 movups xmm0, xmmword ptr [rcx] 1471 movups xmm1, xmmword ptr [rcx+10H] 1472 movaps xmm8, xmm0 1473 movaps xmm9, xmm1 1474 movd xmm13, dword ptr [rsp+110H] 1475 pinsrd xmm13, dword ptr [rsp+120H], 1 1476 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 1477 movaps xmmword ptr [rsp], xmm13 1478 movd xmm14, dword ptr [rsp+114H] 1479 pinsrd xmm14, dword ptr [rsp+124H], 1 1480 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 1481 movaps xmmword ptr [rsp+10H], xmm14 1482 mov r8, qword ptr [rdi] 1483 mov r9, qword ptr [rdi+8H] 1484 movzx eax, byte ptr [rbp+80H] 1485 or eax, r13d 1486 xor edx, edx 1487 innerloop2: 1488 mov r14d, eax 1489 or eax, r12d 1490 add rdx, 64 1491 cmp rdx, r15 1492 cmovne eax, r14d 1493 movaps xmm2, xmmword ptr [BLAKE3_IV] 1494 movaps xmm10, xmm2 1495 movups xmm4, xmmword ptr [r8+rdx-40H] 1496 movups xmm5, xmmword ptr [r8+rdx-30H] 1497 movaps xmm3, xmm4 1498 shufps xmm4, xmm5, 136 1499 shufps xmm3, xmm5, 221 1500 movaps xmm5, xmm3 1501 movups xmm6, xmmword ptr [r8+rdx-20H] 1502 movups xmm7, xmmword ptr [r8+rdx-10H] 1503 movaps xmm3, xmm6 1504 shufps xmm6, xmm7, 136 1505 pshufd xmm6, xmm6, 93H 1506 shufps xmm3, xmm7, 221 1507 pshufd xmm7, xmm3, 93H 1508 movups xmm12, xmmword ptr [r9+rdx-40H] 1509 movups xmm13, xmmword ptr [r9+rdx-30H] 1510 movaps xmm11, xmm12 1511 shufps xmm12, xmm13, 136 1512 shufps xmm11, xmm13, 221 1513 movaps xmm13, xmm11 1514 movups xmm14, xmmword ptr [r9+rdx-20H] 1515 movups xmm15, xmmword ptr [r9+rdx-10H] 1516 movaps xmm11, xmm14 1517 shufps xmm14, xmm15, 136 1518 pshufd xmm14, xmm14, 93H 1519 shufps xmm11, xmm15, 221 1520 pshufd xmm15, xmm11, 93H 1521 
movaps xmm3, xmmword ptr [rsp] 1522 movaps xmm11, xmmword ptr [rsp+10H] 1523 pinsrd xmm3, eax, 3 1524 pinsrd xmm11, eax, 3 1525 mov al, 7 1526 roundloop2: 1527 paddd xmm0, xmm4 1528 paddd xmm8, xmm12 1529 movaps xmmword ptr [rsp+20H], xmm4 1530 movaps xmmword ptr [rsp+30H], xmm12 1531 paddd xmm0, xmm1 1532 paddd xmm8, xmm9 1533 pxor xmm3, xmm0 1534 pxor xmm11, xmm8 1535 movaps xmm12, xmmword ptr [ROT16] 1536 pshufb xmm3, xmm12 1537 pshufb xmm11, xmm12 1538 paddd xmm2, xmm3 1539 paddd xmm10, xmm11 1540 pxor xmm1, xmm2 1541 pxor xmm9, xmm10 1542 movdqa xmm4, xmm1 1543 pslld xmm1, 20 1544 psrld xmm4, 12 1545 por xmm1, xmm4 1546 movdqa xmm4, xmm9 1547 pslld xmm9, 20 1548 psrld xmm4, 12 1549 por xmm9, xmm4 1550 paddd xmm0, xmm5 1551 paddd xmm8, xmm13 1552 movaps xmmword ptr [rsp+40H], xmm5 1553 movaps xmmword ptr [rsp+50H], xmm13 1554 paddd xmm0, xmm1 1555 paddd xmm8, xmm9 1556 pxor xmm3, xmm0 1557 pxor xmm11, xmm8 1558 movaps xmm13, xmmword ptr [ROT8] 1559 pshufb xmm3, xmm13 1560 pshufb xmm11, xmm13 1561 paddd xmm2, xmm3 1562 paddd xmm10, xmm11 1563 pxor xmm1, xmm2 1564 pxor xmm9, xmm10 1565 movdqa xmm4, xmm1 1566 pslld xmm1, 25 1567 psrld xmm4, 7 1568 por xmm1, xmm4 1569 movdqa xmm4, xmm9 1570 pslld xmm9, 25 1571 psrld xmm4, 7 1572 por xmm9, xmm4 1573 pshufd xmm0, xmm0, 93H 1574 pshufd xmm8, xmm8, 93H 1575 pshufd xmm3, xmm3, 4EH 1576 pshufd xmm11, xmm11, 4EH 1577 pshufd xmm2, xmm2, 39H 1578 pshufd xmm10, xmm10, 39H 1579 paddd xmm0, xmm6 1580 paddd xmm8, xmm14 1581 paddd xmm0, xmm1 1582 paddd xmm8, xmm9 1583 pxor xmm3, xmm0 1584 pxor xmm11, xmm8 1585 pshufb xmm3, xmm12 1586 pshufb xmm11, xmm12 1587 paddd xmm2, xmm3 1588 paddd xmm10, xmm11 1589 pxor xmm1, xmm2 1590 pxor xmm9, xmm10 1591 movdqa xmm4, xmm1 1592 pslld xmm1, 20 1593 psrld xmm4, 12 1594 por xmm1, xmm4 1595 movdqa xmm4, xmm9 1596 pslld xmm9, 20 1597 psrld xmm4, 12 1598 por xmm9, xmm4 1599 paddd xmm0, xmm7 1600 paddd xmm8, xmm15 1601 paddd xmm0, xmm1 1602 paddd xmm8, xmm9 1603 pxor xmm3, xmm0 1604 pxor xmm11, 
xmm8 1605 pshufb xmm3, xmm13 1606 pshufb xmm11, xmm13 1607 paddd xmm2, xmm3 1608 paddd xmm10, xmm11 1609 pxor xmm1, xmm2 1610 pxor xmm9, xmm10 1611 movdqa xmm4, xmm1 1612 pslld xmm1, 25 1613 psrld xmm4, 7 1614 por xmm1, xmm4 1615 movdqa xmm4, xmm9 1616 pslld xmm9, 25 1617 psrld xmm4, 7 1618 por xmm9, xmm4 1619 pshufd xmm0, xmm0, 39H 1620 pshufd xmm8, xmm8, 39H 1621 pshufd xmm3, xmm3, 4EH 1622 pshufd xmm11, xmm11, 4EH 1623 pshufd xmm2, xmm2, 93H 1624 pshufd xmm10, xmm10, 93H 1625 dec al 1626 je endroundloop2 1627 movdqa xmm12, xmmword ptr [rsp+20H] 1628 movdqa xmm5, xmmword ptr [rsp+40H] 1629 pshufd xmm13, xmm12, 0FH 1630 shufps xmm12, xmm5, 214 1631 pshufd xmm4, xmm12, 39H 1632 movdqa xmm12, xmm6 1633 shufps xmm12, xmm7, 250 1634 pblendw xmm13, xmm12, 0CCH 1635 movdqa xmm12, xmm7 1636 punpcklqdq xmm12, xmm5 1637 pblendw xmm12, xmm6, 0C0H 1638 pshufd xmm12, xmm12, 78H 1639 punpckhdq xmm5, xmm7 1640 punpckldq xmm6, xmm5 1641 pshufd xmm7, xmm6, 1EH 1642 movdqa xmmword ptr [rsp+20H], xmm13 1643 movdqa xmmword ptr [rsp+40H], xmm12 1644 movdqa xmm5, xmmword ptr [rsp+30H] 1645 movdqa xmm13, xmmword ptr [rsp+50H] 1646 pshufd xmm6, xmm5, 0FH 1647 shufps xmm5, xmm13, 214 1648 pshufd xmm12, xmm5, 39H 1649 movdqa xmm5, xmm14 1650 shufps xmm5, xmm15, 250 1651 pblendw xmm6, xmm5, 0CCH 1652 movdqa xmm5, xmm15 1653 punpcklqdq xmm5, xmm13 1654 pblendw xmm5, xmm14, 0C0H 1655 pshufd xmm5, xmm5, 78H 1656 punpckhdq xmm13, xmm15 1657 punpckldq xmm14, xmm13 1658 pshufd xmm15, xmm14, 1EH 1659 movdqa xmm13, xmm6 1660 movdqa xmm14, xmm5 1661 movdqa xmm5, xmmword ptr [rsp+20H] 1662 movdqa xmm6, xmmword ptr [rsp+40H] 1663 jmp roundloop2 1664 endroundloop2: 1665 pxor xmm0, xmm2 1666 pxor xmm1, xmm3 1667 pxor xmm8, xmm10 1668 pxor xmm9, xmm11 1669 mov eax, r13d 1670 cmp rdx, r15 1671 jne innerloop2 1672 movups xmmword ptr [rbx], xmm0 1673 movups xmmword ptr [rbx+10H], xmm1 1674 movups xmmword ptr [rbx+20H], xmm8 1675 movups xmmword ptr [rbx+30H], xmm9 1676 movdqa xmm0, xmmword ptr [rsp+130H] 
1677 movdqa xmm1, xmmword ptr [rsp+110H] 1678 movdqa xmm2, xmmword ptr [rsp+120H] 1679 movdqu xmm3, xmmword ptr [rsp+118H] 1680 movdqu xmm4, xmmword ptr [rsp+128H] 1681 blendvps xmm1, xmm3, xmm0 1682 blendvps xmm2, xmm4, xmm0 1683 movdqa xmmword ptr [rsp+110H], xmm1 1684 movdqa xmmword ptr [rsp+120H], xmm2 1685 add rdi, 16 1686 add rbx, 64 1687 sub rsi, 2 1688 final1block: 1689 test esi, 1H 1690 je unwind 1691 movups xmm0, xmmword ptr [rcx] 1692 movups xmm1, xmmword ptr [rcx+10H] 1693 movd xmm13, dword ptr [rsp+110H] 1694 pinsrd xmm13, dword ptr [rsp+120H], 1 1695 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 1696 movaps xmm14, xmmword ptr [ROT8] 1697 movaps xmm15, xmmword ptr [ROT16] 1698 mov r8, qword ptr [rdi] 1699 movzx eax, byte ptr [rbp+80H] 1700 or eax, r13d 1701 xor edx, edx 1702 innerloop1: 1703 mov r14d, eax 1704 or eax, r12d 1705 add rdx, 64 1706 cmp rdx, r15 1707 cmovne eax, r14d 1708 movaps xmm2, xmmword ptr [BLAKE3_IV] 1709 movaps xmm3, xmm13 1710 pinsrd xmm3, eax, 3 1711 movups xmm4, xmmword ptr [r8+rdx-40H] 1712 movups xmm5, xmmword ptr [r8+rdx-30H] 1713 movaps xmm8, xmm4 1714 shufps xmm4, xmm5, 136 1715 shufps xmm8, xmm5, 221 1716 movaps xmm5, xmm8 1717 movups xmm6, xmmword ptr [r8+rdx-20H] 1718 movups xmm7, xmmword ptr [r8+rdx-10H] 1719 movaps xmm8, xmm6 1720 shufps xmm6, xmm7, 136 1721 pshufd xmm6, xmm6, 93H 1722 shufps xmm8, xmm7, 221 1723 pshufd xmm7, xmm8, 93H 1724 mov al, 7 1725 roundloop1: 1726 paddd xmm0, xmm4 1727 paddd xmm0, xmm1 1728 pxor xmm3, xmm0 1729 pshufb xmm3, xmm15 1730 paddd xmm2, xmm3 1731 pxor xmm1, xmm2 1732 movdqa xmm11, xmm1 1733 pslld xmm1, 20 1734 psrld xmm11, 12 1735 por xmm1, xmm11 1736 paddd xmm0, xmm5 1737 paddd xmm0, xmm1 1738 pxor xmm3, xmm0 1739 pshufb xmm3, xmm14 1740 paddd xmm2, xmm3 1741 pxor xmm1, xmm2 1742 movdqa xmm11, xmm1 1743 pslld xmm1, 25 1744 psrld xmm11, 7 1745 por xmm1, xmm11 1746 pshufd xmm0, xmm0, 93H 1747 pshufd xmm3, xmm3, 4EH 1748 pshufd xmm2, xmm2, 39H 1749 paddd xmm0, xmm6 1750 paddd xmm0, xmm1 
;---------------------------------------------------------------------------
; blake3_hash_many_sse41 (continued)
;
; Remainder of the single-input round loop (roundloop1) whose first part
; precedes this point in the file.  In this path xmm14 = ROT8 and
; xmm15 = ROT16 (pshufb masks), xmm0-xmm3 are the four state rows and
; xmm4-xmm7 hold the message words.
;---------------------------------------------------------------------------
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             ; row3 = ror32(row3, 16)
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             ; row1 = ror32(row1, 12)
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             ; row3 = ror32(row3, 8)
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             ; row1 = ror32(row1, 7)
        pshufd  xmm0, xmm0, 39H         ; rotate the lanes back after the
        pshufd  xmm3, xmm3, 4EH         ; diagonal half-round
        pshufd  xmm2, xmm2, 93H
        dec     al                      ; al = rounds remaining
        jz      endroundloop1
        ; Shuffle the message vectors xmm4-xmm7 for the next round.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0CCH
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0C0H
        pshufd  xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     roundloop1
endroundloop1:
        pxor    xmm0, xmm2              ; fold rows 2/3 into rows 0/1
        pxor    xmm1, xmm3
        mov     eax, r13d               ; reload the base flags for the next block
        cmp     rdx, r15                ; "dec al" clobbered ZF, so compare again
        jne     innerloop1
        movups  xmmword ptr [rbx], xmm0 ; store the 32-byte result
        movups  xmmword ptr [rbx+10H], xmm1
        jmp     unwind
_blake3_hash_many_sse41 ENDP
blake3_hash_many_sse41 ENDP

;---------------------------------------------------------------------------
; blake3_compress_in_place_sse41 / _blake3_compress_in_place_sse41
;
; Runs the 7-round compression on one 64-byte block and writes the 32-byte
; result back over the input state.
;
; Presumed C prototype (confirm against the C header that declares it):
;   void blake3_compress_in_place_sse41(uint32_t cv[8],
;                                       const uint8_t block[64],
;                                       uint8_t block_len,
;                                       uint64_t counter,
;                                       uint8_t flags);
;
; ABI:   Microsoft x64
; In:    rcx       -> 32-byte state; loaded into rows 0/1, overwritten on exit
;        rdx       -> 64-byte message block (unaligned loads)
;        r8b       =  byte placed in lane 2 of row 3
;        r9        =  64-bit value placed in lanes 0-1 of row 3
;        5th arg   =  byte placed in lane 3 of row 3; it is the first stack
;                     argument, found at [rsp+0A0H] after the 120-byte frame
;                     is allocated (120 locals + 8 return address + 32 home
;                     space = 160)
; Out:   [rcx .. rcx+1FH] = (row0 xor row2), (row1 xor row3)
; Clobb: rax, r8, xmm0-xmm5, flags.  xmm6-xmm9, xmm11, xmm14 and xmm15 are
;        non-volatile under this ABI and are saved/restored in the frame.
; Stack: rsp is 8 mod 16 at entry, so "sub rsp, 120" makes it 16-byte
;        aligned, which the movdqa spills below require.
;
; NOTE(review): the PROC is declared without FRAME/.allocstack/.savexmm128,
; so no unwind data is emitted for it - confirm that is acceptable.
;---------------------------------------------------------------------------
ALIGN   16                              ; match the other two entry points
blake3_compress_in_place_sse41 PROC
_blake3_compress_in_place_sse41 PROC
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15

        ; State rows: xmm0/xmm1 = caller's state, xmm2 = IV, xmm3 = built below.
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+10H]
        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        movzx   eax, byte ptr [rsp+0A0H] ; 5th argument
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax                 ; r8 = (5th arg << 32) | r8b
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4           ; row3 = { r9 lo, r9 hi, r8b, 5th arg }

        ; Load the 16 message words and de-interleave them:
        ; shufps 136 (88H) keeps the even-indexed dwords, 221 (DDH) the odd.
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 93H
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 93H
        movaps  xmm14, xmmword ptr [ROT8]
        movaps  xmm15, xmmword ptr [ROT16]
        mov     al, 7                   ; round counter
@@:
        ; ---- first half-round: message words from xmm4 / xmm5 ----
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             ; ror 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             ; ror 12
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             ; ror 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             ; ror 7
        ; Rotate the lanes of rows 0/2/3 for the second half-round.
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H
        ; ---- second half-round: message words from xmm6 / xmm7 ----
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             ; ror 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             ; ror 12
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             ; ror 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             ; ror 7
        ; Undo the lane rotation.
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H
        dec     al
        jz      @F                      ; last round done: skip the shuffle
        ; Shuffle the message vectors xmm4-xmm7 for the next round.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0CCH
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0C0H
        pshufd  xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     @B
@@:
        pxor    xmm0, xmm2              ; result = rows 0/1 xor rows 2/3
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+10H], xmm1
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_blake3_compress_in_place_sse41 ENDP
blake3_compress_in_place_sse41 ENDP

;---------------------------------------------------------------------------
; blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
;
; Same 7-round compression as blake3_compress_in_place_sse41, but the input
; state is left untouched and a 64-byte result is written to a separate
; output buffer.
;
; Presumed C prototype (confirm against the C header that declares it):
;   void blake3_compress_xof_sse41(const uint32_t cv[8],
;                                  const uint8_t block[64],
;                                  uint8_t block_len,
;                                  uint64_t counter,
;                                  uint8_t flags,
;                                  uint8_t out[64]);
;
; ABI:   Microsoft x64
; In:    rcx       -> 32-byte state (read only; re-read for the final xor)
;        rdx       -> 64-byte message block (unaligned loads)
;        r8b       =  byte placed in lane 2 of row 3
;        r9        =  64-bit value placed in lanes 0-1 of row 3
;        5th arg   =  byte at [rsp+0A0H] after the frame is allocated,
;                     placed in lane 3 of row 3
;        6th arg   -> 64-byte output buffer, at [rsp+0A8H] after the frame
;                     is allocated
; Out:   out[00H..1FH] = (row0 xor row2), (row1 xor row3)
;        out[20H..3FH] = (row2 xor state[0..3]), (row3 xor state[4..7])
; Clobb: rax, r8, r10, xmm0-xmm5, flags.  xmm6-xmm9, xmm11, xmm14 and xmm15
;        are saved/restored in the frame.
; Stack: 120-byte frame; rsp is 16-byte aligned after the sub.
;
; NOTE(review): the PROC is declared without FRAME/.allocstack/.savexmm128,
; so no unwind data is emitted for it - confirm that is acceptable.
;---------------------------------------------------------------------------
ALIGN   16
blake3_compress_xof_sse41 PROC
_blake3_compress_xof_sse41 PROC
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15

        ; State rows: xmm0/xmm1 = caller's state, xmm2 = IV, xmm3 = built below.
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+10H]
        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        movzx   eax, byte ptr [rsp+0A0H] ; 5th argument
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+0A8H] ; 6th argument: output pointer
        shl     rax, 32
        add     r8, rax                 ; r8 = (5th arg << 32) | r8b
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4           ; row3 = { r9 lo, r9 hi, r8b, 5th arg }

        ; Load the 16 message words and de-interleave them:
        ; shufps 136 (88H) keeps the even-indexed dwords, 221 (DDH) the odd.
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 93H
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 93H
        movaps  xmm14, xmmword ptr [ROT8]
        movaps  xmm15, xmmword ptr [ROT16]
        mov     al, 7                   ; round counter
@@:
        ; ---- first half-round: message words from xmm4 / xmm5 ----
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             ; ror 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             ; ror 12
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             ; ror 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             ; ror 7
        ; Rotate the lanes of rows 0/2/3 for the second half-round.
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H
        ; ---- second half-round: message words from xmm6 / xmm7 ----
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             ; ror 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             ; ror 12
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             ; ror 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             ; ror 7
        ; Undo the lane rotation.
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H
        dec     al
        jz      @F                      ; last round done: skip the shuffle
        ; Shuffle the message vectors xmm4-xmm7 for the next round.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0CCH
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0C0H
        pshufd  xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     @B
@@:
        movdqu  xmm4, xmmword ptr [rcx] ; reload the original input state
        movdqu  xmm5, xmmword ptr [rcx+10H]
        pxor    xmm0, xmm2              ; first 32 output bytes
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4              ; second 32 output bytes
        pxor    xmm3, xmm5
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+10H], xmm1
        movups  xmmword ptr [r10+20H], xmm2
        movups  xmmword ptr [r10+30H], xmm3
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_blake3_compress_xof_sse41 ENDP
blake3_compress_xof_sse41 ENDP

_TEXT ENDS


;---------------------------------------------------------------------------
; Read-only constants.
;
; The segment starts 64-byte aligned and every table up to CMP_MSB_MASK is
; exactly 16 bytes, so each label is 16-byte aligned.  The code relies on
; that: the tables are used as movaps/movdqa sources and as memory operands
; of pand/pxor.
;---------------------------------------------------------------------------
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN   64
BLAKE3_IV:                              ; loaded into state row 2
        dd      6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH

ADD0:                                   ; per-lane offsets 0..3
        dd      0, 1, 2, 3

ADD1:                                   ; 4 in every lane
        dd      4 dup (4)

BLAKE3_IV_0:                            ; each IV word repeated in all four lanes
        dd      4 dup (6A09E667H)

BLAKE3_IV_1:
        dd      4 dup (0BB67AE85H)

BLAKE3_IV_2:
        dd      4 dup (3C6EF372H)

BLAKE3_IV_3:
        dd      4 dup (0A54FF53AH)

BLAKE3_BLOCK_LEN:                       ; 64 in every lane
        dd      4 dup (64)

ROT16:                                  ; pshufb mask: rotate each dword right by 16
        db      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13

ROT8:                                   ; pshufb mask: rotate each dword right by 8
        db      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12

CMP_MSB_MASK:                           ; sign-bit flip applied before pcmpgtd
        dd      8 dup(80000000H)

_RDATA ENDS
END