blake3_sse2_x86-64_unix.S (68858B)
1 #if defined(__ELF__) && defined(__linux__) 2 .section .note.GNU-stack,"",%progbits 3 #endif 4 5 #if defined(__ELF__) && defined(__CET__) && defined(__has_include) 6 #if __has_include(<cet.h>) 7 #include <cet.h> 8 #endif 9 #endif 10 11 #if !defined(_CET_ENDBR) 12 #define _CET_ENDBR 13 #endif 14 15 .intel_syntax noprefix 16 .global blake3_hash_many_sse2 17 .global _blake3_hash_many_sse2 18 .global blake3_compress_in_place_sse2 19 .global _blake3_compress_in_place_sse2 20 .global blake3_compress_xof_sse2 21 .global _blake3_compress_xof_sse2 22 #ifdef __APPLE__ 23 .text 24 #else 25 .section .text 26 #endif 27 .p2align 6 28 _blake3_hash_many_sse2: 29 blake3_hash_many_sse2: 30 _CET_ENDBR 31 push r15 32 push r14 33 push r13 34 push r12 35 push rbx 36 push rbp 37 mov rbp, rsp 38 sub rsp, 360 39 and rsp, 0xFFFFFFFFFFFFFFC0 40 neg r9d 41 movd xmm0, r9d 42 pshufd xmm0, xmm0, 0x00 43 movdqa xmmword ptr [rsp+0x130], xmm0 44 movdqa xmm1, xmm0 45 pand xmm1, xmmword ptr [ADD0+rip] 46 pand xmm0, xmmword ptr [ADD1+rip] 47 movdqa xmmword ptr [rsp+0x150], xmm0 48 movd xmm0, r8d 49 pshufd xmm0, xmm0, 0x00 50 paddd xmm0, xmm1 51 movdqa xmmword ptr [rsp+0x110], xmm0 52 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 53 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 54 pcmpgtd xmm1, xmm0 55 shr r8, 32 56 movd xmm2, r8d 57 pshufd xmm2, xmm2, 0x00 58 psubd xmm2, xmm1 59 movdqa xmmword ptr [rsp+0x120], xmm2 60 mov rbx, qword ptr [rbp+0x50] 61 mov r15, rdx 62 shl r15, 6 63 movzx r13d, byte ptr [rbp+0x38] 64 movzx r12d, byte ptr [rbp+0x48] 65 cmp rsi, 4 66 jc 3f 67 2: 68 movdqu xmm3, xmmword ptr [rcx] 69 pshufd xmm0, xmm3, 0x00 70 pshufd xmm1, xmm3, 0x55 71 pshufd xmm2, xmm3, 0xAA 72 pshufd xmm3, xmm3, 0xFF 73 movdqu xmm7, xmmword ptr [rcx+0x10] 74 pshufd xmm4, xmm7, 0x00 75 pshufd xmm5, xmm7, 0x55 76 pshufd xmm6, xmm7, 0xAA 77 pshufd xmm7, xmm7, 0xFF 78 mov r8, qword ptr [rdi] 79 mov r9, qword ptr [rdi+0x8] 80 mov r10, qword ptr [rdi+0x10] 81 mov r11, qword ptr [rdi+0x18] 82 movzx eax, byte ptr 
[rbp+0x40] 83 or eax, r13d 84 xor edx, edx 85 9: 86 mov r14d, eax 87 or eax, r12d 88 add rdx, 64 89 cmp rdx, r15 90 cmovne eax, r14d 91 movdqu xmm8, xmmword ptr [r8+rdx-0x40] 92 movdqu xmm9, xmmword ptr [r9+rdx-0x40] 93 movdqu xmm10, xmmword ptr [r10+rdx-0x40] 94 movdqu xmm11, xmmword ptr [r11+rdx-0x40] 95 movdqa xmm12, xmm8 96 punpckldq xmm8, xmm9 97 punpckhdq xmm12, xmm9 98 movdqa xmm14, xmm10 99 punpckldq xmm10, xmm11 100 punpckhdq xmm14, xmm11 101 movdqa xmm9, xmm8 102 punpcklqdq xmm8, xmm10 103 punpckhqdq xmm9, xmm10 104 movdqa xmm13, xmm12 105 punpcklqdq xmm12, xmm14 106 punpckhqdq xmm13, xmm14 107 movdqa xmmword ptr [rsp], xmm8 108 movdqa xmmword ptr [rsp+0x10], xmm9 109 movdqa xmmword ptr [rsp+0x20], xmm12 110 movdqa xmmword ptr [rsp+0x30], xmm13 111 movdqu xmm8, xmmword ptr [r8+rdx-0x30] 112 movdqu xmm9, xmmword ptr [r9+rdx-0x30] 113 movdqu xmm10, xmmword ptr [r10+rdx-0x30] 114 movdqu xmm11, xmmword ptr [r11+rdx-0x30] 115 movdqa xmm12, xmm8 116 punpckldq xmm8, xmm9 117 punpckhdq xmm12, xmm9 118 movdqa xmm14, xmm10 119 punpckldq xmm10, xmm11 120 punpckhdq xmm14, xmm11 121 movdqa xmm9, xmm8 122 punpcklqdq xmm8, xmm10 123 punpckhqdq xmm9, xmm10 124 movdqa xmm13, xmm12 125 punpcklqdq xmm12, xmm14 126 punpckhqdq xmm13, xmm14 127 movdqa xmmword ptr [rsp+0x40], xmm8 128 movdqa xmmword ptr [rsp+0x50], xmm9 129 movdqa xmmword ptr [rsp+0x60], xmm12 130 movdqa xmmword ptr [rsp+0x70], xmm13 131 movdqu xmm8, xmmword ptr [r8+rdx-0x20] 132 movdqu xmm9, xmmword ptr [r9+rdx-0x20] 133 movdqu xmm10, xmmword ptr [r10+rdx-0x20] 134 movdqu xmm11, xmmword ptr [r11+rdx-0x20] 135 movdqa xmm12, xmm8 136 punpckldq xmm8, xmm9 137 punpckhdq xmm12, xmm9 138 movdqa xmm14, xmm10 139 punpckldq xmm10, xmm11 140 punpckhdq xmm14, xmm11 141 movdqa xmm9, xmm8 142 punpcklqdq xmm8, xmm10 143 punpckhqdq xmm9, xmm10 144 movdqa xmm13, xmm12 145 punpcklqdq xmm12, xmm14 146 punpckhqdq xmm13, xmm14 147 movdqa xmmword ptr [rsp+0x80], xmm8 148 movdqa xmmword ptr [rsp+0x90], xmm9 149 movdqa xmmword ptr 
[rsp+0xA0], xmm12 150 movdqa xmmword ptr [rsp+0xB0], xmm13 151 movdqu xmm8, xmmword ptr [r8+rdx-0x10] 152 movdqu xmm9, xmmword ptr [r9+rdx-0x10] 153 movdqu xmm10, xmmword ptr [r10+rdx-0x10] 154 movdqu xmm11, xmmword ptr [r11+rdx-0x10] 155 movdqa xmm12, xmm8 156 punpckldq xmm8, xmm9 157 punpckhdq xmm12, xmm9 158 movdqa xmm14, xmm10 159 punpckldq xmm10, xmm11 160 punpckhdq xmm14, xmm11 161 movdqa xmm9, xmm8 162 punpcklqdq xmm8, xmm10 163 punpckhqdq xmm9, xmm10 164 movdqa xmm13, xmm12 165 punpcklqdq xmm12, xmm14 166 punpckhqdq xmm13, xmm14 167 movdqa xmmword ptr [rsp+0xC0], xmm8 168 movdqa xmmword ptr [rsp+0xD0], xmm9 169 movdqa xmmword ptr [rsp+0xE0], xmm12 170 movdqa xmmword ptr [rsp+0xF0], xmm13 171 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] 172 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] 173 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] 174 movdqa xmm12, xmmword ptr [rsp+0x110] 175 movdqa xmm13, xmmword ptr [rsp+0x120] 176 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] 177 movd xmm15, eax 178 pshufd xmm15, xmm15, 0x00 179 prefetcht0 [r8+rdx+0x80] 180 prefetcht0 [r9+rdx+0x80] 181 prefetcht0 [r10+rdx+0x80] 182 prefetcht0 [r11+rdx+0x80] 183 paddd xmm0, xmmword ptr [rsp] 184 paddd xmm1, xmmword ptr [rsp+0x20] 185 paddd xmm2, xmmword ptr [rsp+0x40] 186 paddd xmm3, xmmword ptr [rsp+0x60] 187 paddd xmm0, xmm4 188 paddd xmm1, xmm5 189 paddd xmm2, xmm6 190 paddd xmm3, xmm7 191 pxor xmm12, xmm0 192 pxor xmm13, xmm1 193 pxor xmm14, xmm2 194 pxor xmm15, xmm3 195 pshuflw xmm12, xmm12, 0xB1 196 pshufhw xmm12, xmm12, 0xB1 197 pshuflw xmm13, xmm13, 0xB1 198 pshufhw xmm13, xmm13, 0xB1 199 pshuflw xmm14, xmm14, 0xB1 200 pshufhw xmm14, xmm14, 0xB1 201 pshuflw xmm15, xmm15, 0xB1 202 pshufhw xmm15, xmm15, 0xB1 203 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] 204 paddd xmm8, xmm12 205 paddd xmm9, xmm13 206 paddd xmm10, xmm14 207 paddd xmm11, xmm15 208 pxor xmm4, xmm8 209 pxor xmm5, xmm9 210 pxor xmm6, xmm10 211 pxor xmm7, xmm11 212 movdqa xmmword ptr [rsp+0x100], xmm8 213 movdqa xmm8, 
xmm4 214 psrld xmm8, 12 215 pslld xmm4, 20 216 por xmm4, xmm8 217 movdqa xmm8, xmm5 218 psrld xmm8, 12 219 pslld xmm5, 20 220 por xmm5, xmm8 221 movdqa xmm8, xmm6 222 psrld xmm8, 12 223 pslld xmm6, 20 224 por xmm6, xmm8 225 movdqa xmm8, xmm7 226 psrld xmm8, 12 227 pslld xmm7, 20 228 por xmm7, xmm8 229 paddd xmm0, xmmword ptr [rsp+0x10] 230 paddd xmm1, xmmword ptr [rsp+0x30] 231 paddd xmm2, xmmword ptr [rsp+0x50] 232 paddd xmm3, xmmword ptr [rsp+0x70] 233 paddd xmm0, xmm4 234 paddd xmm1, xmm5 235 paddd xmm2, xmm6 236 paddd xmm3, xmm7 237 pxor xmm12, xmm0 238 pxor xmm13, xmm1 239 pxor xmm14, xmm2 240 pxor xmm15, xmm3 241 movdqa xmm8, xmm12 242 psrld xmm12, 8 243 pslld xmm8, 24 244 pxor xmm12, xmm8 245 movdqa xmm8, xmm13 246 psrld xmm13, 8 247 pslld xmm8, 24 248 pxor xmm13, xmm8 249 movdqa xmm8, xmm14 250 psrld xmm14, 8 251 pslld xmm8, 24 252 pxor xmm14, xmm8 253 movdqa xmm8, xmm15 254 psrld xmm15, 8 255 pslld xmm8, 24 256 pxor xmm15, xmm8 257 movdqa xmm8, xmmword ptr [rsp+0x100] 258 paddd xmm8, xmm12 259 paddd xmm9, xmm13 260 paddd xmm10, xmm14 261 paddd xmm11, xmm15 262 pxor xmm4, xmm8 263 pxor xmm5, xmm9 264 pxor xmm6, xmm10 265 pxor xmm7, xmm11 266 movdqa xmmword ptr [rsp+0x100], xmm8 267 movdqa xmm8, xmm4 268 psrld xmm8, 7 269 pslld xmm4, 25 270 por xmm4, xmm8 271 movdqa xmm8, xmm5 272 psrld xmm8, 7 273 pslld xmm5, 25 274 por xmm5, xmm8 275 movdqa xmm8, xmm6 276 psrld xmm8, 7 277 pslld xmm6, 25 278 por xmm6, xmm8 279 movdqa xmm8, xmm7 280 psrld xmm8, 7 281 pslld xmm7, 25 282 por xmm7, xmm8 283 paddd xmm0, xmmword ptr [rsp+0x80] 284 paddd xmm1, xmmword ptr [rsp+0xA0] 285 paddd xmm2, xmmword ptr [rsp+0xC0] 286 paddd xmm3, xmmword ptr [rsp+0xE0] 287 paddd xmm0, xmm5 288 paddd xmm1, xmm6 289 paddd xmm2, xmm7 290 paddd xmm3, xmm4 291 pxor xmm15, xmm0 292 pxor xmm12, xmm1 293 pxor xmm13, xmm2 294 pxor xmm14, xmm3 295 pshuflw xmm15, xmm15, 0xB1 296 pshufhw xmm15, xmm15, 0xB1 297 pshuflw xmm12, xmm12, 0xB1 298 pshufhw xmm12, xmm12, 0xB1 299 pshuflw xmm13, xmm13, 0xB1 300 
pshufhw xmm13, xmm13, 0xB1 301 pshuflw xmm14, xmm14, 0xB1 302 pshufhw xmm14, xmm14, 0xB1 303 paddd xmm10, xmm15 304 paddd xmm11, xmm12 305 movdqa xmm8, xmmword ptr [rsp+0x100] 306 paddd xmm8, xmm13 307 paddd xmm9, xmm14 308 pxor xmm5, xmm10 309 pxor xmm6, xmm11 310 pxor xmm7, xmm8 311 pxor xmm4, xmm9 312 movdqa xmmword ptr [rsp+0x100], xmm8 313 movdqa xmm8, xmm5 314 psrld xmm8, 12 315 pslld xmm5, 20 316 por xmm5, xmm8 317 movdqa xmm8, xmm6 318 psrld xmm8, 12 319 pslld xmm6, 20 320 por xmm6, xmm8 321 movdqa xmm8, xmm7 322 psrld xmm8, 12 323 pslld xmm7, 20 324 por xmm7, xmm8 325 movdqa xmm8, xmm4 326 psrld xmm8, 12 327 pslld xmm4, 20 328 por xmm4, xmm8 329 paddd xmm0, xmmword ptr [rsp+0x90] 330 paddd xmm1, xmmword ptr [rsp+0xB0] 331 paddd xmm2, xmmword ptr [rsp+0xD0] 332 paddd xmm3, xmmword ptr [rsp+0xF0] 333 paddd xmm0, xmm5 334 paddd xmm1, xmm6 335 paddd xmm2, xmm7 336 paddd xmm3, xmm4 337 pxor xmm15, xmm0 338 pxor xmm12, xmm1 339 pxor xmm13, xmm2 340 pxor xmm14, xmm3 341 movdqa xmm8, xmm15 342 psrld xmm15, 8 343 pslld xmm8, 24 344 pxor xmm15, xmm8 345 movdqa xmm8, xmm12 346 psrld xmm12, 8 347 pslld xmm8, 24 348 pxor xmm12, xmm8 349 movdqa xmm8, xmm13 350 psrld xmm13, 8 351 pslld xmm8, 24 352 pxor xmm13, xmm8 353 movdqa xmm8, xmm14 354 psrld xmm14, 8 355 pslld xmm8, 24 356 pxor xmm14, xmm8 357 paddd xmm10, xmm15 358 paddd xmm11, xmm12 359 movdqa xmm8, xmmword ptr [rsp+0x100] 360 paddd xmm8, xmm13 361 paddd xmm9, xmm14 362 pxor xmm5, xmm10 363 pxor xmm6, xmm11 364 pxor xmm7, xmm8 365 pxor xmm4, xmm9 366 movdqa xmmword ptr [rsp+0x100], xmm8 367 movdqa xmm8, xmm5 368 psrld xmm8, 7 369 pslld xmm5, 25 370 por xmm5, xmm8 371 movdqa xmm8, xmm6 372 psrld xmm8, 7 373 pslld xmm6, 25 374 por xmm6, xmm8 375 movdqa xmm8, xmm7 376 psrld xmm8, 7 377 pslld xmm7, 25 378 por xmm7, xmm8 379 movdqa xmm8, xmm4 380 psrld xmm8, 7 381 pslld xmm4, 25 382 por xmm4, xmm8 383 paddd xmm0, xmmword ptr [rsp+0x20] 384 paddd xmm1, xmmword ptr [rsp+0x30] 385 paddd xmm2, xmmword ptr [rsp+0x70] 386 
paddd xmm3, xmmword ptr [rsp+0x40] 387 paddd xmm0, xmm4 388 paddd xmm1, xmm5 389 paddd xmm2, xmm6 390 paddd xmm3, xmm7 391 pxor xmm12, xmm0 392 pxor xmm13, xmm1 393 pxor xmm14, xmm2 394 pxor xmm15, xmm3 395 pshuflw xmm12, xmm12, 0xB1 396 pshufhw xmm12, xmm12, 0xB1 397 pshuflw xmm13, xmm13, 0xB1 398 pshufhw xmm13, xmm13, 0xB1 399 pshuflw xmm14, xmm14, 0xB1 400 pshufhw xmm14, xmm14, 0xB1 401 pshuflw xmm15, xmm15, 0xB1 402 pshufhw xmm15, xmm15, 0xB1 403 movdqa xmm8, xmmword ptr [rsp+0x100] 404 paddd xmm8, xmm12 405 paddd xmm9, xmm13 406 paddd xmm10, xmm14 407 paddd xmm11, xmm15 408 pxor xmm4, xmm8 409 pxor xmm5, xmm9 410 pxor xmm6, xmm10 411 pxor xmm7, xmm11 412 movdqa xmmword ptr [rsp+0x100], xmm8 413 movdqa xmm8, xmm4 414 psrld xmm8, 12 415 pslld xmm4, 20 416 por xmm4, xmm8 417 movdqa xmm8, xmm5 418 psrld xmm8, 12 419 pslld xmm5, 20 420 por xmm5, xmm8 421 movdqa xmm8, xmm6 422 psrld xmm8, 12 423 pslld xmm6, 20 424 por xmm6, xmm8 425 movdqa xmm8, xmm7 426 psrld xmm8, 12 427 pslld xmm7, 20 428 por xmm7, xmm8 429 paddd xmm0, xmmword ptr [rsp+0x60] 430 paddd xmm1, xmmword ptr [rsp+0xA0] 431 paddd xmm2, xmmword ptr [rsp] 432 paddd xmm3, xmmword ptr [rsp+0xD0] 433 paddd xmm0, xmm4 434 paddd xmm1, xmm5 435 paddd xmm2, xmm6 436 paddd xmm3, xmm7 437 pxor xmm12, xmm0 438 pxor xmm13, xmm1 439 pxor xmm14, xmm2 440 pxor xmm15, xmm3 441 movdqa xmm8, xmm12 442 psrld xmm12, 8 443 pslld xmm8, 24 444 pxor xmm12, xmm8 445 movdqa xmm8, xmm13 446 psrld xmm13, 8 447 pslld xmm8, 24 448 pxor xmm13, xmm8 449 movdqa xmm8, xmm14 450 psrld xmm14, 8 451 pslld xmm8, 24 452 pxor xmm14, xmm8 453 movdqa xmm8, xmm15 454 psrld xmm15, 8 455 pslld xmm8, 24 456 pxor xmm15, xmm8 457 movdqa xmm8, xmmword ptr [rsp+0x100] 458 paddd xmm8, xmm12 459 paddd xmm9, xmm13 460 paddd xmm10, xmm14 461 paddd xmm11, xmm15 462 pxor xmm4, xmm8 463 pxor xmm5, xmm9 464 pxor xmm6, xmm10 465 pxor xmm7, xmm11 466 movdqa xmmword ptr [rsp+0x100], xmm8 467 movdqa xmm8, xmm4 468 psrld xmm8, 7 469 pslld xmm4, 25 470 por xmm4, xmm8 
471 movdqa xmm8, xmm5 472 psrld xmm8, 7 473 pslld xmm5, 25 474 por xmm5, xmm8 475 movdqa xmm8, xmm6 476 psrld xmm8, 7 477 pslld xmm6, 25 478 por xmm6, xmm8 479 movdqa xmm8, xmm7 480 psrld xmm8, 7 481 pslld xmm7, 25 482 por xmm7, xmm8 483 paddd xmm0, xmmword ptr [rsp+0x10] 484 paddd xmm1, xmmword ptr [rsp+0xC0] 485 paddd xmm2, xmmword ptr [rsp+0x90] 486 paddd xmm3, xmmword ptr [rsp+0xF0] 487 paddd xmm0, xmm5 488 paddd xmm1, xmm6 489 paddd xmm2, xmm7 490 paddd xmm3, xmm4 491 pxor xmm15, xmm0 492 pxor xmm12, xmm1 493 pxor xmm13, xmm2 494 pxor xmm14, xmm3 495 pshuflw xmm15, xmm15, 0xB1 496 pshufhw xmm15, xmm15, 0xB1 497 pshuflw xmm12, xmm12, 0xB1 498 pshufhw xmm12, xmm12, 0xB1 499 pshuflw xmm13, xmm13, 0xB1 500 pshufhw xmm13, xmm13, 0xB1 501 pshuflw xmm14, xmm14, 0xB1 502 pshufhw xmm14, xmm14, 0xB1 503 paddd xmm10, xmm15 504 paddd xmm11, xmm12 505 movdqa xmm8, xmmword ptr [rsp+0x100] 506 paddd xmm8, xmm13 507 paddd xmm9, xmm14 508 pxor xmm5, xmm10 509 pxor xmm6, xmm11 510 pxor xmm7, xmm8 511 pxor xmm4, xmm9 512 movdqa xmmword ptr [rsp+0x100], xmm8 513 movdqa xmm8, xmm5 514 psrld xmm8, 12 515 pslld xmm5, 20 516 por xmm5, xmm8 517 movdqa xmm8, xmm6 518 psrld xmm8, 12 519 pslld xmm6, 20 520 por xmm6, xmm8 521 movdqa xmm8, xmm7 522 psrld xmm8, 12 523 pslld xmm7, 20 524 por xmm7, xmm8 525 movdqa xmm8, xmm4 526 psrld xmm8, 12 527 pslld xmm4, 20 528 por xmm4, xmm8 529 paddd xmm0, xmmword ptr [rsp+0xB0] 530 paddd xmm1, xmmword ptr [rsp+0x50] 531 paddd xmm2, xmmword ptr [rsp+0xE0] 532 paddd xmm3, xmmword ptr [rsp+0x80] 533 paddd xmm0, xmm5 534 paddd xmm1, xmm6 535 paddd xmm2, xmm7 536 paddd xmm3, xmm4 537 pxor xmm15, xmm0 538 pxor xmm12, xmm1 539 pxor xmm13, xmm2 540 pxor xmm14, xmm3 541 movdqa xmm8, xmm15 542 psrld xmm15, 8 543 pslld xmm8, 24 544 pxor xmm15, xmm8 545 movdqa xmm8, xmm12 546 psrld xmm12, 8 547 pslld xmm8, 24 548 pxor xmm12, xmm8 549 movdqa xmm8, xmm13 550 psrld xmm13, 8 551 pslld xmm8, 24 552 pxor xmm13, xmm8 553 movdqa xmm8, xmm14 554 psrld xmm14, 8 555 pslld 
xmm8, 24 556 pxor xmm14, xmm8 557 paddd xmm10, xmm15 558 paddd xmm11, xmm12 559 movdqa xmm8, xmmword ptr [rsp+0x100] 560 paddd xmm8, xmm13 561 paddd xmm9, xmm14 562 pxor xmm5, xmm10 563 pxor xmm6, xmm11 564 pxor xmm7, xmm8 565 pxor xmm4, xmm9 566 movdqa xmmword ptr [rsp+0x100], xmm8 567 movdqa xmm8, xmm5 568 psrld xmm8, 7 569 pslld xmm5, 25 570 por xmm5, xmm8 571 movdqa xmm8, xmm6 572 psrld xmm8, 7 573 pslld xmm6, 25 574 por xmm6, xmm8 575 movdqa xmm8, xmm7 576 psrld xmm8, 7 577 pslld xmm7, 25 578 por xmm7, xmm8 579 movdqa xmm8, xmm4 580 psrld xmm8, 7 581 pslld xmm4, 25 582 por xmm4, xmm8 583 paddd xmm0, xmmword ptr [rsp+0x30] 584 paddd xmm1, xmmword ptr [rsp+0xA0] 585 paddd xmm2, xmmword ptr [rsp+0xD0] 586 paddd xmm3, xmmword ptr [rsp+0x70] 587 paddd xmm0, xmm4 588 paddd xmm1, xmm5 589 paddd xmm2, xmm6 590 paddd xmm3, xmm7 591 pxor xmm12, xmm0 592 pxor xmm13, xmm1 593 pxor xmm14, xmm2 594 pxor xmm15, xmm3 595 pshuflw xmm12, xmm12, 0xB1 596 pshufhw xmm12, xmm12, 0xB1 597 pshuflw xmm13, xmm13, 0xB1 598 pshufhw xmm13, xmm13, 0xB1 599 pshuflw xmm14, xmm14, 0xB1 600 pshufhw xmm14, xmm14, 0xB1 601 pshuflw xmm15, xmm15, 0xB1 602 pshufhw xmm15, xmm15, 0xB1 603 movdqa xmm8, xmmword ptr [rsp+0x100] 604 paddd xmm8, xmm12 605 paddd xmm9, xmm13 606 paddd xmm10, xmm14 607 paddd xmm11, xmm15 608 pxor xmm4, xmm8 609 pxor xmm5, xmm9 610 pxor xmm6, xmm10 611 pxor xmm7, xmm11 612 movdqa xmmword ptr [rsp+0x100], xmm8 613 movdqa xmm8, xmm4 614 psrld xmm8, 12 615 pslld xmm4, 20 616 por xmm4, xmm8 617 movdqa xmm8, xmm5 618 psrld xmm8, 12 619 pslld xmm5, 20 620 por xmm5, xmm8 621 movdqa xmm8, xmm6 622 psrld xmm8, 12 623 pslld xmm6, 20 624 por xmm6, xmm8 625 movdqa xmm8, xmm7 626 psrld xmm8, 12 627 pslld xmm7, 20 628 por xmm7, xmm8 629 paddd xmm0, xmmword ptr [rsp+0x40] 630 paddd xmm1, xmmword ptr [rsp+0xC0] 631 paddd xmm2, xmmword ptr [rsp+0x20] 632 paddd xmm3, xmmword ptr [rsp+0xE0] 633 paddd xmm0, xmm4 634 paddd xmm1, xmm5 635 paddd xmm2, xmm6 636 paddd xmm3, xmm7 637 pxor xmm12, xmm0 
638 pxor xmm13, xmm1 639 pxor xmm14, xmm2 640 pxor xmm15, xmm3 641 movdqa xmm8, xmm12 642 psrld xmm12, 8 643 pslld xmm8, 24 644 pxor xmm12, xmm8 645 movdqa xmm8, xmm13 646 psrld xmm13, 8 647 pslld xmm8, 24 648 pxor xmm13, xmm8 649 movdqa xmm8, xmm14 650 psrld xmm14, 8 651 pslld xmm8, 24 652 pxor xmm14, xmm8 653 movdqa xmm8, xmm15 654 psrld xmm15, 8 655 pslld xmm8, 24 656 pxor xmm15, xmm8 657 movdqa xmm8, xmmword ptr [rsp+0x100] 658 paddd xmm8, xmm12 659 paddd xmm9, xmm13 660 paddd xmm10, xmm14 661 paddd xmm11, xmm15 662 pxor xmm4, xmm8 663 pxor xmm5, xmm9 664 pxor xmm6, xmm10 665 pxor xmm7, xmm11 666 movdqa xmmword ptr [rsp+0x100], xmm8 667 movdqa xmm8, xmm4 668 psrld xmm8, 7 669 pslld xmm4, 25 670 por xmm4, xmm8 671 movdqa xmm8, xmm5 672 psrld xmm8, 7 673 pslld xmm5, 25 674 por xmm5, xmm8 675 movdqa xmm8, xmm6 676 psrld xmm8, 7 677 pslld xmm6, 25 678 por xmm6, xmm8 679 movdqa xmm8, xmm7 680 psrld xmm8, 7 681 pslld xmm7, 25 682 por xmm7, xmm8 683 paddd xmm0, xmmword ptr [rsp+0x60] 684 paddd xmm1, xmmword ptr [rsp+0x90] 685 paddd xmm2, xmmword ptr [rsp+0xB0] 686 paddd xmm3, xmmword ptr [rsp+0x80] 687 paddd xmm0, xmm5 688 paddd xmm1, xmm6 689 paddd xmm2, xmm7 690 paddd xmm3, xmm4 691 pxor xmm15, xmm0 692 pxor xmm12, xmm1 693 pxor xmm13, xmm2 694 pxor xmm14, xmm3 695 pshuflw xmm15, xmm15, 0xB1 696 pshufhw xmm15, xmm15, 0xB1 697 pshuflw xmm12, xmm12, 0xB1 698 pshufhw xmm12, xmm12, 0xB1 699 pshuflw xmm13, xmm13, 0xB1 700 pshufhw xmm13, xmm13, 0xB1 701 pshuflw xmm14, xmm14, 0xB1 702 pshufhw xmm14, xmm14, 0xB1 703 paddd xmm10, xmm15 704 paddd xmm11, xmm12 705 movdqa xmm8, xmmword ptr [rsp+0x100] 706 paddd xmm8, xmm13 707 paddd xmm9, xmm14 708 pxor xmm5, xmm10 709 pxor xmm6, xmm11 710 pxor xmm7, xmm8 711 pxor xmm4, xmm9 712 movdqa xmmword ptr [rsp+0x100], xmm8 713 movdqa xmm8, xmm5 714 psrld xmm8, 12 715 pslld xmm5, 20 716 por xmm5, xmm8 717 movdqa xmm8, xmm6 718 psrld xmm8, 12 719 pslld xmm6, 20 720 por xmm6, xmm8 721 movdqa xmm8, xmm7 722 psrld xmm8, 12 723 pslld xmm7, 
20 724 por xmm7, xmm8 725 movdqa xmm8, xmm4 726 psrld xmm8, 12 727 pslld xmm4, 20 728 por xmm4, xmm8 729 paddd xmm0, xmmword ptr [rsp+0x50] 730 paddd xmm1, xmmword ptr [rsp] 731 paddd xmm2, xmmword ptr [rsp+0xF0] 732 paddd xmm3, xmmword ptr [rsp+0x10] 733 paddd xmm0, xmm5 734 paddd xmm1, xmm6 735 paddd xmm2, xmm7 736 paddd xmm3, xmm4 737 pxor xmm15, xmm0 738 pxor xmm12, xmm1 739 pxor xmm13, xmm2 740 pxor xmm14, xmm3 741 movdqa xmm8, xmm15 742 psrld xmm15, 8 743 pslld xmm8, 24 744 pxor xmm15, xmm8 745 movdqa xmm8, xmm12 746 psrld xmm12, 8 747 pslld xmm8, 24 748 pxor xmm12, xmm8 749 movdqa xmm8, xmm13 750 psrld xmm13, 8 751 pslld xmm8, 24 752 pxor xmm13, xmm8 753 movdqa xmm8, xmm14 754 psrld xmm14, 8 755 pslld xmm8, 24 756 pxor xmm14, xmm8 757 paddd xmm10, xmm15 758 paddd xmm11, xmm12 759 movdqa xmm8, xmmword ptr [rsp+0x100] 760 paddd xmm8, xmm13 761 paddd xmm9, xmm14 762 pxor xmm5, xmm10 763 pxor xmm6, xmm11 764 pxor xmm7, xmm8 765 pxor xmm4, xmm9 766 movdqa xmmword ptr [rsp+0x100], xmm8 767 movdqa xmm8, xmm5 768 psrld xmm8, 7 769 pslld xmm5, 25 770 por xmm5, xmm8 771 movdqa xmm8, xmm6 772 psrld xmm8, 7 773 pslld xmm6, 25 774 por xmm6, xmm8 775 movdqa xmm8, xmm7 776 psrld xmm8, 7 777 pslld xmm7, 25 778 por xmm7, xmm8 779 movdqa xmm8, xmm4 780 psrld xmm8, 7 781 pslld xmm4, 25 782 por xmm4, xmm8 783 paddd xmm0, xmmword ptr [rsp+0xA0] 784 paddd xmm1, xmmword ptr [rsp+0xC0] 785 paddd xmm2, xmmword ptr [rsp+0xE0] 786 paddd xmm3, xmmword ptr [rsp+0xD0] 787 paddd xmm0, xmm4 788 paddd xmm1, xmm5 789 paddd xmm2, xmm6 790 paddd xmm3, xmm7 791 pxor xmm12, xmm0 792 pxor xmm13, xmm1 793 pxor xmm14, xmm2 794 pxor xmm15, xmm3 795 pshuflw xmm12, xmm12, 0xB1 796 pshufhw xmm12, xmm12, 0xB1 797 pshuflw xmm13, xmm13, 0xB1 798 pshufhw xmm13, xmm13, 0xB1 799 pshuflw xmm14, xmm14, 0xB1 800 pshufhw xmm14, xmm14, 0xB1 801 pshuflw xmm15, xmm15, 0xB1 802 pshufhw xmm15, xmm15, 0xB1 803 movdqa xmm8, xmmword ptr [rsp+0x100] 804 paddd xmm8, xmm12 805 paddd xmm9, xmm13 806 paddd xmm10, xmm14 807 
paddd xmm11, xmm15 808 pxor xmm4, xmm8 809 pxor xmm5, xmm9 810 pxor xmm6, xmm10 811 pxor xmm7, xmm11 812 movdqa xmmword ptr [rsp+0x100], xmm8 813 movdqa xmm8, xmm4 814 psrld xmm8, 12 815 pslld xmm4, 20 816 por xmm4, xmm8 817 movdqa xmm8, xmm5 818 psrld xmm8, 12 819 pslld xmm5, 20 820 por xmm5, xmm8 821 movdqa xmm8, xmm6 822 psrld xmm8, 12 823 pslld xmm6, 20 824 por xmm6, xmm8 825 movdqa xmm8, xmm7 826 psrld xmm8, 12 827 pslld xmm7, 20 828 por xmm7, xmm8 829 paddd xmm0, xmmword ptr [rsp+0x70] 830 paddd xmm1, xmmword ptr [rsp+0x90] 831 paddd xmm2, xmmword ptr [rsp+0x30] 832 paddd xmm3, xmmword ptr [rsp+0xF0] 833 paddd xmm0, xmm4 834 paddd xmm1, xmm5 835 paddd xmm2, xmm6 836 paddd xmm3, xmm7 837 pxor xmm12, xmm0 838 pxor xmm13, xmm1 839 pxor xmm14, xmm2 840 pxor xmm15, xmm3 841 movdqa xmm8, xmm12 842 psrld xmm12, 8 843 pslld xmm8, 24 844 pxor xmm12, xmm8 845 movdqa xmm8, xmm13 846 psrld xmm13, 8 847 pslld xmm8, 24 848 pxor xmm13, xmm8 849 movdqa xmm8, xmm14 850 psrld xmm14, 8 851 pslld xmm8, 24 852 pxor xmm14, xmm8 853 movdqa xmm8, xmm15 854 psrld xmm15, 8 855 pslld xmm8, 24 856 pxor xmm15, xmm8 857 movdqa xmm8, xmmword ptr [rsp+0x100] 858 paddd xmm8, xmm12 859 paddd xmm9, xmm13 860 paddd xmm10, xmm14 861 paddd xmm11, xmm15 862 pxor xmm4, xmm8 863 pxor xmm5, xmm9 864 pxor xmm6, xmm10 865 pxor xmm7, xmm11 866 movdqa xmmword ptr [rsp+0x100], xmm8 867 movdqa xmm8, xmm4 868 psrld xmm8, 7 869 pslld xmm4, 25 870 por xmm4, xmm8 871 movdqa xmm8, xmm5 872 psrld xmm8, 7 873 pslld xmm5, 25 874 por xmm5, xmm8 875 movdqa xmm8, xmm6 876 psrld xmm8, 7 877 pslld xmm6, 25 878 por xmm6, xmm8 879 movdqa xmm8, xmm7 880 psrld xmm8, 7 881 pslld xmm7, 25 882 por xmm7, xmm8 883 paddd xmm0, xmmword ptr [rsp+0x40] 884 paddd xmm1, xmmword ptr [rsp+0xB0] 885 paddd xmm2, xmmword ptr [rsp+0x50] 886 paddd xmm3, xmmword ptr [rsp+0x10] 887 paddd xmm0, xmm5 888 paddd xmm1, xmm6 889 paddd xmm2, xmm7 890 paddd xmm3, xmm4 891 pxor xmm15, xmm0 892 pxor xmm12, xmm1 893 pxor xmm13, xmm2 894 pxor xmm14, xmm3 
895 pshuflw xmm15, xmm15, 0xB1 896 pshufhw xmm15, xmm15, 0xB1 897 pshuflw xmm12, xmm12, 0xB1 898 pshufhw xmm12, xmm12, 0xB1 899 pshuflw xmm13, xmm13, 0xB1 900 pshufhw xmm13, xmm13, 0xB1 901 pshuflw xmm14, xmm14, 0xB1 902 pshufhw xmm14, xmm14, 0xB1 903 paddd xmm10, xmm15 904 paddd xmm11, xmm12 905 movdqa xmm8, xmmword ptr [rsp+0x100] 906 paddd xmm8, xmm13 907 paddd xmm9, xmm14 908 pxor xmm5, xmm10 909 pxor xmm6, xmm11 910 pxor xmm7, xmm8 911 pxor xmm4, xmm9 912 movdqa xmmword ptr [rsp+0x100], xmm8 913 movdqa xmm8, xmm5 914 psrld xmm8, 12 915 pslld xmm5, 20 916 por xmm5, xmm8 917 movdqa xmm8, xmm6 918 psrld xmm8, 12 919 pslld xmm6, 20 920 por xmm6, xmm8 921 movdqa xmm8, xmm7 922 psrld xmm8, 12 923 pslld xmm7, 20 924 por xmm7, xmm8 925 movdqa xmm8, xmm4 926 psrld xmm8, 12 927 pslld xmm4, 20 928 por xmm4, xmm8 929 paddd xmm0, xmmword ptr [rsp] 930 paddd xmm1, xmmword ptr [rsp+0x20] 931 paddd xmm2, xmmword ptr [rsp+0x80] 932 paddd xmm3, xmmword ptr [rsp+0x60] 933 paddd xmm0, xmm5 934 paddd xmm1, xmm6 935 paddd xmm2, xmm7 936 paddd xmm3, xmm4 937 pxor xmm15, xmm0 938 pxor xmm12, xmm1 939 pxor xmm13, xmm2 940 pxor xmm14, xmm3 941 movdqa xmm8, xmm15 942 psrld xmm15, 8 943 pslld xmm8, 24 944 pxor xmm15, xmm8 945 movdqa xmm8, xmm12 946 psrld xmm12, 8 947 pslld xmm8, 24 948 pxor xmm12, xmm8 949 movdqa xmm8, xmm13 950 psrld xmm13, 8 951 pslld xmm8, 24 952 pxor xmm13, xmm8 953 movdqa xmm8, xmm14 954 psrld xmm14, 8 955 pslld xmm8, 24 956 pxor xmm14, xmm8 957 paddd xmm10, xmm15 958 paddd xmm11, xmm12 959 movdqa xmm8, xmmword ptr [rsp+0x100] 960 paddd xmm8, xmm13 961 paddd xmm9, xmm14 962 pxor xmm5, xmm10 963 pxor xmm6, xmm11 964 pxor xmm7, xmm8 965 pxor xmm4, xmm9 966 movdqa xmmword ptr [rsp+0x100], xmm8 967 movdqa xmm8, xmm5 968 psrld xmm8, 7 969 pslld xmm5, 25 970 por xmm5, xmm8 971 movdqa xmm8, xmm6 972 psrld xmm8, 7 973 pslld xmm6, 25 974 por xmm6, xmm8 975 movdqa xmm8, xmm7 976 psrld xmm8, 7 977 pslld xmm7, 25 978 por xmm7, xmm8 979 movdqa xmm8, xmm4 980 psrld xmm8, 7 981 
pslld xmm4, 25 982 por xmm4, xmm8 983 paddd xmm0, xmmword ptr [rsp+0xC0] 984 paddd xmm1, xmmword ptr [rsp+0x90] 985 paddd xmm2, xmmword ptr [rsp+0xF0] 986 paddd xmm3, xmmword ptr [rsp+0xE0] 987 paddd xmm0, xmm4 988 paddd xmm1, xmm5 989 paddd xmm2, xmm6 990 paddd xmm3, xmm7 991 pxor xmm12, xmm0 992 pxor xmm13, xmm1 993 pxor xmm14, xmm2 994 pxor xmm15, xmm3 995 pshuflw xmm12, xmm12, 0xB1 996 pshufhw xmm12, xmm12, 0xB1 997 pshuflw xmm13, xmm13, 0xB1 998 pshufhw xmm13, xmm13, 0xB1 999 pshuflw xmm14, xmm14, 0xB1 1000 pshufhw xmm14, xmm14, 0xB1 1001 pshuflw xmm15, xmm15, 0xB1 1002 pshufhw xmm15, xmm15, 0xB1 1003 movdqa xmm8, xmmword ptr [rsp+0x100] 1004 paddd xmm8, xmm12 1005 paddd xmm9, xmm13 1006 paddd xmm10, xmm14 1007 paddd xmm11, xmm15 1008 pxor xmm4, xmm8 1009 pxor xmm5, xmm9 1010 pxor xmm6, xmm10 1011 pxor xmm7, xmm11 1012 movdqa xmmword ptr [rsp+0x100], xmm8 1013 movdqa xmm8, xmm4 1014 psrld xmm8, 12 1015 pslld xmm4, 20 1016 por xmm4, xmm8 1017 movdqa xmm8, xmm5 1018 psrld xmm8, 12 1019 pslld xmm5, 20 1020 por xmm5, xmm8 1021 movdqa xmm8, xmm6 1022 psrld xmm8, 12 1023 pslld xmm6, 20 1024 por xmm6, xmm8 1025 movdqa xmm8, xmm7 1026 psrld xmm8, 12 1027 pslld xmm7, 20 1028 por xmm7, xmm8 1029 paddd xmm0, xmmword ptr [rsp+0xD0] 1030 paddd xmm1, xmmword ptr [rsp+0xB0] 1031 paddd xmm2, xmmword ptr [rsp+0xA0] 1032 paddd xmm3, xmmword ptr [rsp+0x80] 1033 paddd xmm0, xmm4 1034 paddd xmm1, xmm5 1035 paddd xmm2, xmm6 1036 paddd xmm3, xmm7 1037 pxor xmm12, xmm0 1038 pxor xmm13, xmm1 1039 pxor xmm14, xmm2 1040 pxor xmm15, xmm3 1041 movdqa xmm8, xmm12 1042 psrld xmm12, 8 1043 pslld xmm8, 24 1044 pxor xmm12, xmm8 1045 movdqa xmm8, xmm13 1046 psrld xmm13, 8 1047 pslld xmm8, 24 1048 pxor xmm13, xmm8 1049 movdqa xmm8, xmm14 1050 psrld xmm14, 8 1051 pslld xmm8, 24 1052 pxor xmm14, xmm8 1053 movdqa xmm8, xmm15 1054 psrld xmm15, 8 1055 pslld xmm8, 24 1056 pxor xmm15, xmm8 1057 movdqa xmm8, xmmword ptr [rsp+0x100] 1058 paddd xmm8, xmm12 1059 paddd xmm9, xmm13 1060 paddd xmm10, xmm14 
1061 paddd xmm11, xmm15 1062 pxor xmm4, xmm8 1063 pxor xmm5, xmm9 1064 pxor xmm6, xmm10 1065 pxor xmm7, xmm11 1066 movdqa xmmword ptr [rsp+0x100], xmm8 1067 movdqa xmm8, xmm4 1068 psrld xmm8, 7 1069 pslld xmm4, 25 1070 por xmm4, xmm8 1071 movdqa xmm8, xmm5 1072 psrld xmm8, 7 1073 pslld xmm5, 25 1074 por xmm5, xmm8 1075 movdqa xmm8, xmm6 1076 psrld xmm8, 7 1077 pslld xmm6, 25 1078 por xmm6, xmm8 1079 movdqa xmm8, xmm7 1080 psrld xmm8, 7 1081 pslld xmm7, 25 1082 por xmm7, xmm8 1083 paddd xmm0, xmmword ptr [rsp+0x70] 1084 paddd xmm1, xmmword ptr [rsp+0x50] 1085 paddd xmm2, xmmword ptr [rsp] 1086 paddd xmm3, xmmword ptr [rsp+0x60] 1087 paddd xmm0, xmm5 1088 paddd xmm1, xmm6 1089 paddd xmm2, xmm7 1090 paddd xmm3, xmm4 1091 pxor xmm15, xmm0 1092 pxor xmm12, xmm1 1093 pxor xmm13, xmm2 1094 pxor xmm14, xmm3 1095 pshuflw xmm15, xmm15, 0xB1 1096 pshufhw xmm15, xmm15, 0xB1 1097 pshuflw xmm12, xmm12, 0xB1 1098 pshufhw xmm12, xmm12, 0xB1 1099 pshuflw xmm13, xmm13, 0xB1 1100 pshufhw xmm13, xmm13, 0xB1 1101 pshuflw xmm14, xmm14, 0xB1 1102 pshufhw xmm14, xmm14, 0xB1 1103 paddd xmm10, xmm15 1104 paddd xmm11, xmm12 1105 movdqa xmm8, xmmword ptr [rsp+0x100] 1106 paddd xmm8, xmm13 1107 paddd xmm9, xmm14 1108 pxor xmm5, xmm10 1109 pxor xmm6, xmm11 1110 pxor xmm7, xmm8 1111 pxor xmm4, xmm9 1112 movdqa xmmword ptr [rsp+0x100], xmm8 1113 movdqa xmm8, xmm5 1114 psrld xmm8, 12 1115 pslld xmm5, 20 1116 por xmm5, xmm8 1117 movdqa xmm8, xmm6 1118 psrld xmm8, 12 1119 pslld xmm6, 20 1120 por xmm6, xmm8 1121 movdqa xmm8, xmm7 1122 psrld xmm8, 12 1123 pslld xmm7, 20 1124 por xmm7, xmm8 1125 movdqa xmm8, xmm4 1126 psrld xmm8, 12 1127 pslld xmm4, 20 1128 por xmm4, xmm8 1129 paddd xmm0, xmmword ptr [rsp+0x20] 1130 paddd xmm1, xmmword ptr [rsp+0x30] 1131 paddd xmm2, xmmword ptr [rsp+0x10] 1132 paddd xmm3, xmmword ptr [rsp+0x40] 1133 paddd xmm0, xmm5 1134 paddd xmm1, xmm6 1135 paddd xmm2, xmm7 1136 paddd xmm3, xmm4 1137 pxor xmm15, xmm0 1138 pxor xmm12, xmm1 1139 pxor xmm13, xmm2 1140 pxor xmm14, xmm3 
1141 movdqa xmm8, xmm15 1142 psrld xmm15, 8 1143 pslld xmm8, 24 1144 pxor xmm15, xmm8 1145 movdqa xmm8, xmm12 1146 psrld xmm12, 8 1147 pslld xmm8, 24 1148 pxor xmm12, xmm8 1149 movdqa xmm8, xmm13 1150 psrld xmm13, 8 1151 pslld xmm8, 24 1152 pxor xmm13, xmm8 1153 movdqa xmm8, xmm14 1154 psrld xmm14, 8 1155 pslld xmm8, 24 1156 pxor xmm14, xmm8 1157 paddd xmm10, xmm15 1158 paddd xmm11, xmm12 1159 movdqa xmm8, xmmword ptr [rsp+0x100] 1160 paddd xmm8, xmm13 1161 paddd xmm9, xmm14 1162 pxor xmm5, xmm10 1163 pxor xmm6, xmm11 1164 pxor xmm7, xmm8 1165 pxor xmm4, xmm9 1166 movdqa xmmword ptr [rsp+0x100], xmm8 1167 movdqa xmm8, xmm5 1168 psrld xmm8, 7 1169 pslld xmm5, 25 1170 por xmm5, xmm8 1171 movdqa xmm8, xmm6 1172 psrld xmm8, 7 1173 pslld xmm6, 25 1174 por xmm6, xmm8 1175 movdqa xmm8, xmm7 1176 psrld xmm8, 7 1177 pslld xmm7, 25 1178 por xmm7, xmm8 1179 movdqa xmm8, xmm4 1180 psrld xmm8, 7 1181 pslld xmm4, 25 1182 por xmm4, xmm8 1183 paddd xmm0, xmmword ptr [rsp+0x90] 1184 paddd xmm1, xmmword ptr [rsp+0xB0] 1185 paddd xmm2, xmmword ptr [rsp+0x80] 1186 paddd xmm3, xmmword ptr [rsp+0xF0] 1187 paddd xmm0, xmm4 1188 paddd xmm1, xmm5 1189 paddd xmm2, xmm6 1190 paddd xmm3, xmm7 1191 pxor xmm12, xmm0 1192 pxor xmm13, xmm1 1193 pxor xmm14, xmm2 1194 pxor xmm15, xmm3 1195 pshuflw xmm12, xmm12, 0xB1 1196 pshufhw xmm12, xmm12, 0xB1 1197 pshuflw xmm13, xmm13, 0xB1 1198 pshufhw xmm13, xmm13, 0xB1 1199 pshuflw xmm14, xmm14, 0xB1 1200 pshufhw xmm14, xmm14, 0xB1 1201 pshuflw xmm15, xmm15, 0xB1 1202 pshufhw xmm15, xmm15, 0xB1 1203 movdqa xmm8, xmmword ptr [rsp+0x100] 1204 paddd xmm8, xmm12 1205 paddd xmm9, xmm13 1206 paddd xmm10, xmm14 1207 paddd xmm11, xmm15 1208 pxor xmm4, xmm8 1209 pxor xmm5, xmm9 1210 pxor xmm6, xmm10 1211 pxor xmm7, xmm11 1212 movdqa xmmword ptr [rsp+0x100], xmm8 1213 movdqa xmm8, xmm4 1214 psrld xmm8, 12 1215 pslld xmm4, 20 1216 por xmm4, xmm8 1217 movdqa xmm8, xmm5 1218 psrld xmm8, 12 1219 pslld xmm5, 20 1220 por xmm5, xmm8 1221 movdqa xmm8, xmm6 1222 psrld xmm8, 
12 1223 pslld xmm6, 20 1224 por xmm6, xmm8 1225 movdqa xmm8, xmm7 1226 psrld xmm8, 12 1227 pslld xmm7, 20 1228 por xmm7, xmm8 1229 paddd xmm0, xmmword ptr [rsp+0xE0] 1230 paddd xmm1, xmmword ptr [rsp+0x50] 1231 paddd xmm2, xmmword ptr [rsp+0xC0] 1232 paddd xmm3, xmmword ptr [rsp+0x10] 1233 paddd xmm0, xmm4 1234 paddd xmm1, xmm5 1235 paddd xmm2, xmm6 1236 paddd xmm3, xmm7 1237 pxor xmm12, xmm0 1238 pxor xmm13, xmm1 1239 pxor xmm14, xmm2 1240 pxor xmm15, xmm3 1241 movdqa xmm8, xmm12 1242 psrld xmm12, 8 1243 pslld xmm8, 24 1244 pxor xmm12, xmm8 1245 movdqa xmm8, xmm13 1246 psrld xmm13, 8 1247 pslld xmm8, 24 1248 pxor xmm13, xmm8 1249 movdqa xmm8, xmm14 1250 psrld xmm14, 8 1251 pslld xmm8, 24 1252 pxor xmm14, xmm8 1253 movdqa xmm8, xmm15 1254 psrld xmm15, 8 1255 pslld xmm8, 24 1256 pxor xmm15, xmm8 1257 movdqa xmm8, xmmword ptr [rsp+0x100] 1258 paddd xmm8, xmm12 1259 paddd xmm9, xmm13 1260 paddd xmm10, xmm14 1261 paddd xmm11, xmm15 1262 pxor xmm4, xmm8 1263 pxor xmm5, xmm9 1264 pxor xmm6, xmm10 1265 pxor xmm7, xmm11 1266 movdqa xmmword ptr [rsp+0x100], xmm8 1267 movdqa xmm8, xmm4 1268 psrld xmm8, 7 1269 pslld xmm4, 25 1270 por xmm4, xmm8 1271 movdqa xmm8, xmm5 1272 psrld xmm8, 7 1273 pslld xmm5, 25 1274 por xmm5, xmm8 1275 movdqa xmm8, xmm6 1276 psrld xmm8, 7 1277 pslld xmm6, 25 1278 por xmm6, xmm8 1279 movdqa xmm8, xmm7 1280 psrld xmm8, 7 1281 pslld xmm7, 25 1282 por xmm7, xmm8 1283 paddd xmm0, xmmword ptr [rsp+0xD0] 1284 paddd xmm1, xmmword ptr [rsp] 1285 paddd xmm2, xmmword ptr [rsp+0x20] 1286 paddd xmm3, xmmword ptr [rsp+0x40] 1287 paddd xmm0, xmm5 1288 paddd xmm1, xmm6 1289 paddd xmm2, xmm7 1290 paddd xmm3, xmm4 1291 pxor xmm15, xmm0 1292 pxor xmm12, xmm1 1293 pxor xmm13, xmm2 1294 pxor xmm14, xmm3 1295 pshuflw xmm15, xmm15, 0xB1 1296 pshufhw xmm15, xmm15, 0xB1 1297 pshuflw xmm12, xmm12, 0xB1 1298 pshufhw xmm12, xmm12, 0xB1 1299 pshuflw xmm13, xmm13, 0xB1 1300 pshufhw xmm13, xmm13, 0xB1 1301 pshuflw xmm14, xmm14, 0xB1 1302 pshufhw xmm14, xmm14, 0xB1 1303 paddd 
xmm10, xmm15 1304 paddd xmm11, xmm12 1305 movdqa xmm8, xmmword ptr [rsp+0x100] 1306 paddd xmm8, xmm13 1307 paddd xmm9, xmm14 1308 pxor xmm5, xmm10 1309 pxor xmm6, xmm11 1310 pxor xmm7, xmm8 1311 pxor xmm4, xmm9 1312 movdqa xmmword ptr [rsp+0x100], xmm8 1313 movdqa xmm8, xmm5 1314 psrld xmm8, 12 1315 pslld xmm5, 20 1316 por xmm5, xmm8 1317 movdqa xmm8, xmm6 1318 psrld xmm8, 12 1319 pslld xmm6, 20 1320 por xmm6, xmm8 1321 movdqa xmm8, xmm7 1322 psrld xmm8, 12 1323 pslld xmm7, 20 1324 por xmm7, xmm8 1325 movdqa xmm8, xmm4 1326 psrld xmm8, 12 1327 pslld xmm4, 20 1328 por xmm4, xmm8 1329 paddd xmm0, xmmword ptr [rsp+0x30] 1330 paddd xmm1, xmmword ptr [rsp+0xA0] 1331 paddd xmm2, xmmword ptr [rsp+0x60] 1332 paddd xmm3, xmmword ptr [rsp+0x70] 1333 paddd xmm0, xmm5 1334 paddd xmm1, xmm6 1335 paddd xmm2, xmm7 1336 paddd xmm3, xmm4 1337 pxor xmm15, xmm0 1338 pxor xmm12, xmm1 1339 pxor xmm13, xmm2 1340 pxor xmm14, xmm3 1341 movdqa xmm8, xmm15 1342 psrld xmm15, 8 1343 pslld xmm8, 24 1344 pxor xmm15, xmm8 1345 movdqa xmm8, xmm12 1346 psrld xmm12, 8 1347 pslld xmm8, 24 1348 pxor xmm12, xmm8 1349 movdqa xmm8, xmm13 1350 psrld xmm13, 8 1351 pslld xmm8, 24 1352 pxor xmm13, xmm8 1353 movdqa xmm8, xmm14 1354 psrld xmm14, 8 1355 pslld xmm8, 24 1356 pxor xmm14, xmm8 1357 paddd xmm10, xmm15 1358 paddd xmm11, xmm12 1359 movdqa xmm8, xmmword ptr [rsp+0x100] 1360 paddd xmm8, xmm13 1361 paddd xmm9, xmm14 1362 pxor xmm5, xmm10 1363 pxor xmm6, xmm11 1364 pxor xmm7, xmm8 1365 pxor xmm4, xmm9 1366 movdqa xmmword ptr [rsp+0x100], xmm8 1367 movdqa xmm8, xmm5 1368 psrld xmm8, 7 1369 pslld xmm5, 25 1370 por xmm5, xmm8 1371 movdqa xmm8, xmm6 1372 psrld xmm8, 7 1373 pslld xmm6, 25 1374 por xmm6, xmm8 1375 movdqa xmm8, xmm7 1376 psrld xmm8, 7 1377 pslld xmm7, 25 1378 por xmm7, xmm8 1379 movdqa xmm8, xmm4 1380 psrld xmm8, 7 1381 pslld xmm4, 25 1382 por xmm4, xmm8 1383 paddd xmm0, xmmword ptr [rsp+0xB0] 1384 paddd xmm1, xmmword ptr [rsp+0x50] 1385 paddd xmm2, xmmword ptr [rsp+0x10] 1386 paddd xmm3, 
xmmword ptr [rsp+0x80] 1387 paddd xmm0, xmm4 1388 paddd xmm1, xmm5 1389 paddd xmm2, xmm6 1390 paddd xmm3, xmm7 1391 pxor xmm12, xmm0 1392 pxor xmm13, xmm1 1393 pxor xmm14, xmm2 1394 pxor xmm15, xmm3 1395 pshuflw xmm12, xmm12, 0xB1 1396 pshufhw xmm12, xmm12, 0xB1 1397 pshuflw xmm13, xmm13, 0xB1 1398 pshufhw xmm13, xmm13, 0xB1 1399 pshuflw xmm14, xmm14, 0xB1 1400 pshufhw xmm14, xmm14, 0xB1 1401 pshuflw xmm15, xmm15, 0xB1 1402 pshufhw xmm15, xmm15, 0xB1 1403 movdqa xmm8, xmmword ptr [rsp+0x100] 1404 paddd xmm8, xmm12 1405 paddd xmm9, xmm13 1406 paddd xmm10, xmm14 1407 paddd xmm11, xmm15 1408 pxor xmm4, xmm8 1409 pxor xmm5, xmm9 1410 pxor xmm6, xmm10 1411 pxor xmm7, xmm11 1412 movdqa xmmword ptr [rsp+0x100], xmm8 1413 movdqa xmm8, xmm4 1414 psrld xmm8, 12 1415 pslld xmm4, 20 1416 por xmm4, xmm8 1417 movdqa xmm8, xmm5 1418 psrld xmm8, 12 1419 pslld xmm5, 20 1420 por xmm5, xmm8 1421 movdqa xmm8, xmm6 1422 psrld xmm8, 12 1423 pslld xmm6, 20 1424 por xmm6, xmm8 1425 movdqa xmm8, xmm7 1426 psrld xmm8, 12 1427 pslld xmm7, 20 1428 por xmm7, xmm8 1429 paddd xmm0, xmmword ptr [rsp+0xF0] 1430 paddd xmm1, xmmword ptr [rsp] 1431 paddd xmm2, xmmword ptr [rsp+0x90] 1432 paddd xmm3, xmmword ptr [rsp+0x60] 1433 paddd xmm0, xmm4 1434 paddd xmm1, xmm5 1435 paddd xmm2, xmm6 1436 paddd xmm3, xmm7 1437 pxor xmm12, xmm0 1438 pxor xmm13, xmm1 1439 pxor xmm14, xmm2 1440 pxor xmm15, xmm3 1441 movdqa xmm8, xmm12 1442 psrld xmm12, 8 1443 pslld xmm8, 24 1444 pxor xmm12, xmm8 1445 movdqa xmm8, xmm13 1446 psrld xmm13, 8 1447 pslld xmm8, 24 1448 pxor xmm13, xmm8 1449 movdqa xmm8, xmm14 1450 psrld xmm14, 8 1451 pslld xmm8, 24 1452 pxor xmm14, xmm8 1453 movdqa xmm8, xmm15 1454 psrld xmm15, 8 1455 pslld xmm8, 24 1456 pxor xmm15, xmm8 1457 movdqa xmm8, xmmword ptr [rsp+0x100] 1458 paddd xmm8, xmm12 1459 paddd xmm9, xmm13 1460 paddd xmm10, xmm14 1461 paddd xmm11, xmm15 1462 pxor xmm4, xmm8 1463 pxor xmm5, xmm9 1464 pxor xmm6, xmm10 1465 pxor xmm7, xmm11 1466 movdqa xmmword ptr [rsp+0x100], xmm8 1467 
movdqa xmm8, xmm4 1468 psrld xmm8, 7 1469 pslld xmm4, 25 1470 por xmm4, xmm8 1471 movdqa xmm8, xmm5 1472 psrld xmm8, 7 1473 pslld xmm5, 25 1474 por xmm5, xmm8 1475 movdqa xmm8, xmm6 1476 psrld xmm8, 7 1477 pslld xmm6, 25 1478 por xmm6, xmm8 1479 movdqa xmm8, xmm7 1480 psrld xmm8, 7 1481 pslld xmm7, 25 1482 por xmm7, xmm8 1483 paddd xmm0, xmmword ptr [rsp+0xE0] 1484 paddd xmm1, xmmword ptr [rsp+0x20] 1485 paddd xmm2, xmmword ptr [rsp+0x30] 1486 paddd xmm3, xmmword ptr [rsp+0x70] 1487 paddd xmm0, xmm5 1488 paddd xmm1, xmm6 1489 paddd xmm2, xmm7 1490 paddd xmm3, xmm4 1491 pxor xmm15, xmm0 1492 pxor xmm12, xmm1 1493 pxor xmm13, xmm2 1494 pxor xmm14, xmm3 1495 pshuflw xmm15, xmm15, 0xB1 1496 pshufhw xmm15, xmm15, 0xB1 1497 pshuflw xmm12, xmm12, 0xB1 1498 pshufhw xmm12, xmm12, 0xB1 1499 pshuflw xmm13, xmm13, 0xB1 1500 pshufhw xmm13, xmm13, 0xB1 1501 pshuflw xmm14, xmm14, 0xB1 1502 pshufhw xmm14, xmm14, 0xB1 1503 paddd xmm10, xmm15 1504 paddd xmm11, xmm12 1505 movdqa xmm8, xmmword ptr [rsp+0x100] 1506 paddd xmm8, xmm13 1507 paddd xmm9, xmm14 1508 pxor xmm5, xmm10 1509 pxor xmm6, xmm11 1510 pxor xmm7, xmm8 1511 pxor xmm4, xmm9 1512 movdqa xmmword ptr [rsp+0x100], xmm8 1513 movdqa xmm8, xmm5 1514 psrld xmm8, 12 1515 pslld xmm5, 20 1516 por xmm5, xmm8 1517 movdqa xmm8, xmm6 1518 psrld xmm8, 12 1519 pslld xmm6, 20 1520 por xmm6, xmm8 1521 movdqa xmm8, xmm7 1522 psrld xmm8, 12 1523 pslld xmm7, 20 1524 por xmm7, xmm8 1525 movdqa xmm8, xmm4 1526 psrld xmm8, 12 1527 pslld xmm4, 20 1528 por xmm4, xmm8 1529 paddd xmm0, xmmword ptr [rsp+0xA0] 1530 paddd xmm1, xmmword ptr [rsp+0xC0] 1531 paddd xmm2, xmmword ptr [rsp+0x40] 1532 paddd xmm3, xmmword ptr [rsp+0xD0] 1533 paddd xmm0, xmm5 1534 paddd xmm1, xmm6 1535 paddd xmm2, xmm7 1536 paddd xmm3, xmm4 1537 pxor xmm15, xmm0 1538 pxor xmm12, xmm1 1539 pxor xmm13, xmm2 1540 pxor xmm14, xmm3 1541 movdqa xmm8, xmm15 1542 psrld xmm15, 8 1543 pslld xmm8, 24 1544 pxor xmm15, xmm8 1545 movdqa xmm8, xmm12 1546 psrld xmm12, 8 1547 pslld xmm8, 24 
1548 pxor xmm12, xmm8 1549 movdqa xmm8, xmm13 1550 psrld xmm13, 8 1551 pslld xmm8, 24 1552 pxor xmm13, xmm8 1553 movdqa xmm8, xmm14 1554 psrld xmm14, 8 1555 pslld xmm8, 24 1556 pxor xmm14, xmm8 1557 paddd xmm10, xmm15 1558 paddd xmm11, xmm12 1559 movdqa xmm8, xmmword ptr [rsp+0x100] 1560 paddd xmm8, xmm13 1561 paddd xmm9, xmm14 1562 pxor xmm5, xmm10 1563 pxor xmm6, xmm11 1564 pxor xmm7, xmm8 1565 pxor xmm4, xmm9 1566 pxor xmm0, xmm8 1567 pxor xmm1, xmm9 1568 pxor xmm2, xmm10 1569 pxor xmm3, xmm11 1570 movdqa xmm8, xmm5 1571 psrld xmm8, 7 1572 pslld xmm5, 25 1573 por xmm5, xmm8 1574 movdqa xmm8, xmm6 1575 psrld xmm8, 7 1576 pslld xmm6, 25 1577 por xmm6, xmm8 1578 movdqa xmm8, xmm7 1579 psrld xmm8, 7 1580 pslld xmm7, 25 1581 por xmm7, xmm8 1582 movdqa xmm8, xmm4 1583 psrld xmm8, 7 1584 pslld xmm4, 25 1585 por xmm4, xmm8 1586 pxor xmm4, xmm12 1587 pxor xmm5, xmm13 1588 pxor xmm6, xmm14 1589 pxor xmm7, xmm15 1590 mov eax, r13d 1591 jne 9b 1592 movdqa xmm9, xmm0 1593 punpckldq xmm0, xmm1 1594 punpckhdq xmm9, xmm1 1595 movdqa xmm11, xmm2 1596 punpckldq xmm2, xmm3 1597 punpckhdq xmm11, xmm3 1598 movdqa xmm1, xmm0 1599 punpcklqdq xmm0, xmm2 1600 punpckhqdq xmm1, xmm2 1601 movdqa xmm3, xmm9 1602 punpcklqdq xmm9, xmm11 1603 punpckhqdq xmm3, xmm11 1604 movdqu xmmword ptr [rbx], xmm0 1605 movdqu xmmword ptr [rbx+0x20], xmm1 1606 movdqu xmmword ptr [rbx+0x40], xmm9 1607 movdqu xmmword ptr [rbx+0x60], xmm3 1608 movdqa xmm9, xmm4 1609 punpckldq xmm4, xmm5 1610 punpckhdq xmm9, xmm5 1611 movdqa xmm11, xmm6 1612 punpckldq xmm6, xmm7 1613 punpckhdq xmm11, xmm7 1614 movdqa xmm5, xmm4 1615 punpcklqdq xmm4, xmm6 1616 punpckhqdq xmm5, xmm6 1617 movdqa xmm7, xmm9 1618 punpcklqdq xmm9, xmm11 1619 punpckhqdq xmm7, xmm11 1620 movdqu xmmword ptr [rbx+0x10], xmm4 1621 movdqu xmmword ptr [rbx+0x30], xmm5 1622 movdqu xmmword ptr [rbx+0x50], xmm9 1623 movdqu xmmword ptr [rbx+0x70], xmm7 1624 movdqa xmm1, xmmword ptr [rsp+0x110] 1625 movdqa xmm0, xmm1 1626 paddd xmm1, xmmword ptr [rsp+0x150] 1627 
movdqa xmmword ptr [rsp+0x110], xmm1 1628 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] 1629 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] 1630 pcmpgtd xmm0, xmm1 1631 movdqa xmm1, xmmword ptr [rsp+0x120] 1632 psubd xmm1, xmm0 1633 movdqa xmmword ptr [rsp+0x120], xmm1 1634 add rbx, 128 1635 add rdi, 32 1636 sub rsi, 4 1637 cmp rsi, 4 1638 jnc 2b 1639 test rsi, rsi 1640 jnz 3f 1641 4: 1642 mov rsp, rbp 1643 pop rbp 1644 pop rbx 1645 pop r12 1646 pop r13 1647 pop r14 1648 pop r15 1649 ret 1650 .p2align 5 1651 3: 1652 test esi, 0x2 1653 je 3f 1654 movups xmm0, xmmword ptr [rcx] 1655 movups xmm1, xmmword ptr [rcx+0x10] 1656 movaps xmm8, xmm0 1657 movaps xmm9, xmm1 1658 movd xmm13, dword ptr [rsp+0x110] 1659 movd xmm14, dword ptr [rsp+0x120] 1660 punpckldq xmm13, xmm14 1661 movaps xmmword ptr [rsp], xmm13 1662 movd xmm14, dword ptr [rsp+0x114] 1663 movd xmm13, dword ptr [rsp+0x124] 1664 punpckldq xmm14, xmm13 1665 movaps xmmword ptr [rsp+0x10], xmm14 1666 mov r8, qword ptr [rdi] 1667 mov r9, qword ptr [rdi+0x8] 1668 movzx eax, byte ptr [rbp+0x40] 1669 or eax, r13d 1670 xor edx, edx 1671 2: 1672 mov r14d, eax 1673 or eax, r12d 1674 add rdx, 64 1675 cmp rdx, r15 1676 cmovne eax, r14d 1677 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1678 movaps xmm10, xmm2 1679 movups xmm4, xmmword ptr [r8+rdx-0x40] 1680 movups xmm5, xmmword ptr [r8+rdx-0x30] 1681 movaps xmm3, xmm4 1682 shufps xmm4, xmm5, 136 1683 shufps xmm3, xmm5, 221 1684 movaps xmm5, xmm3 1685 movups xmm6, xmmword ptr [r8+rdx-0x20] 1686 movups xmm7, xmmword ptr [r8+rdx-0x10] 1687 movaps xmm3, xmm6 1688 shufps xmm6, xmm7, 136 1689 pshufd xmm6, xmm6, 0x93 1690 shufps xmm3, xmm7, 221 1691 pshufd xmm7, xmm3, 0x93 1692 movups xmm12, xmmword ptr [r9+rdx-0x40] 1693 movups xmm13, xmmword ptr [r9+rdx-0x30] 1694 movaps xmm11, xmm12 1695 shufps xmm12, xmm13, 136 1696 shufps xmm11, xmm13, 221 1697 movaps xmm13, xmm11 1698 movups xmm14, xmmword ptr [r9+rdx-0x20] 1699 movups xmm15, xmmword ptr [r9+rdx-0x10] 1700 movaps xmm11, xmm14 1701 shufps 
xmm14, xmm15, 136 1702 pshufd xmm14, xmm14, 0x93 1703 shufps xmm11, xmm15, 221 1704 pshufd xmm15, xmm11, 0x93 1705 shl rax, 0x20 1706 or rax, 0x40 1707 movd xmm3, rax 1708 movdqa xmmword ptr [rsp+0x20], xmm3 1709 movaps xmm3, xmmword ptr [rsp] 1710 movaps xmm11, xmmword ptr [rsp+0x10] 1711 punpcklqdq xmm3, xmmword ptr [rsp+0x20] 1712 punpcklqdq xmm11, xmmword ptr [rsp+0x20] 1713 mov al, 7 1714 9: 1715 paddd xmm0, xmm4 1716 paddd xmm8, xmm12 1717 movaps xmmword ptr [rsp+0x20], xmm4 1718 movaps xmmword ptr [rsp+0x30], xmm12 1719 paddd xmm0, xmm1 1720 paddd xmm8, xmm9 1721 pxor xmm3, xmm0 1722 pxor xmm11, xmm8 1723 pshuflw xmm3, xmm3, 0xB1 1724 pshufhw xmm3, xmm3, 0xB1 1725 pshuflw xmm11, xmm11, 0xB1 1726 pshufhw xmm11, xmm11, 0xB1 1727 paddd xmm2, xmm3 1728 paddd xmm10, xmm11 1729 pxor xmm1, xmm2 1730 pxor xmm9, xmm10 1731 movdqa xmm4, xmm1 1732 pslld xmm1, 20 1733 psrld xmm4, 12 1734 por xmm1, xmm4 1735 movdqa xmm4, xmm9 1736 pslld xmm9, 20 1737 psrld xmm4, 12 1738 por xmm9, xmm4 1739 paddd xmm0, xmm5 1740 paddd xmm8, xmm13 1741 movaps xmmword ptr [rsp+0x40], xmm5 1742 movaps xmmword ptr [rsp+0x50], xmm13 1743 paddd xmm0, xmm1 1744 paddd xmm8, xmm9 1745 pxor xmm3, xmm0 1746 pxor xmm11, xmm8 1747 movdqa xmm13, xmm3 1748 psrld xmm3, 8 1749 pslld xmm13, 24 1750 pxor xmm3, xmm13 1751 movdqa xmm13, xmm11 1752 psrld xmm11, 8 1753 pslld xmm13, 24 1754 pxor xmm11, xmm13 1755 paddd xmm2, xmm3 1756 paddd xmm10, xmm11 1757 pxor xmm1, xmm2 1758 pxor xmm9, xmm10 1759 movdqa xmm4, xmm1 1760 pslld xmm1, 25 1761 psrld xmm4, 7 1762 por xmm1, xmm4 1763 movdqa xmm4, xmm9 1764 pslld xmm9, 25 1765 psrld xmm4, 7 1766 por xmm9, xmm4 1767 pshufd xmm0, xmm0, 0x93 1768 pshufd xmm8, xmm8, 0x93 1769 pshufd xmm3, xmm3, 0x4E 1770 pshufd xmm11, xmm11, 0x4E 1771 pshufd xmm2, xmm2, 0x39 1772 pshufd xmm10, xmm10, 0x39 1773 paddd xmm0, xmm6 1774 paddd xmm8, xmm14 1775 paddd xmm0, xmm1 1776 paddd xmm8, xmm9 1777 pxor xmm3, xmm0 1778 pxor xmm11, xmm8 1779 pshuflw xmm3, xmm3, 0xB1 1780 pshufhw xmm3, 
xmm3, 0xB1 1781 pshuflw xmm11, xmm11, 0xB1 1782 pshufhw xmm11, xmm11, 0xB1 1783 paddd xmm2, xmm3 1784 paddd xmm10, xmm11 1785 pxor xmm1, xmm2 1786 pxor xmm9, xmm10 1787 movdqa xmm4, xmm1 1788 pslld xmm1, 20 1789 psrld xmm4, 12 1790 por xmm1, xmm4 1791 movdqa xmm4, xmm9 1792 pslld xmm9, 20 1793 psrld xmm4, 12 1794 por xmm9, xmm4 1795 paddd xmm0, xmm7 1796 paddd xmm8, xmm15 1797 paddd xmm0, xmm1 1798 paddd xmm8, xmm9 1799 pxor xmm3, xmm0 1800 pxor xmm11, xmm8 1801 movdqa xmm13, xmm3 1802 psrld xmm3, 8 1803 pslld xmm13, 24 1804 pxor xmm3, xmm13 1805 movdqa xmm13, xmm11 1806 psrld xmm11, 8 1807 pslld xmm13, 24 1808 pxor xmm11, xmm13 1809 paddd xmm2, xmm3 1810 paddd xmm10, xmm11 1811 pxor xmm1, xmm2 1812 pxor xmm9, xmm10 1813 movdqa xmm4, xmm1 1814 pslld xmm1, 25 1815 psrld xmm4, 7 1816 por xmm1, xmm4 1817 movdqa xmm4, xmm9 1818 pslld xmm9, 25 1819 psrld xmm4, 7 1820 por xmm9, xmm4 1821 pshufd xmm0, xmm0, 0x39 1822 pshufd xmm8, xmm8, 0x39 1823 pshufd xmm3, xmm3, 0x4E 1824 pshufd xmm11, xmm11, 0x4E 1825 pshufd xmm2, xmm2, 0x93 1826 pshufd xmm10, xmm10, 0x93 1827 dec al 1828 je 9f 1829 movdqa xmm12, xmmword ptr [rsp+0x20] 1830 movdqa xmm5, xmmword ptr [rsp+0x40] 1831 pshufd xmm13, xmm12, 0x0F 1832 shufps xmm12, xmm5, 214 1833 pshufd xmm4, xmm12, 0x39 1834 movdqa xmm12, xmm6 1835 shufps xmm12, xmm7, 250 1836 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] 1837 pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1838 por xmm13, xmm12 1839 movdqa xmmword ptr [rsp+0x20], xmm13 1840 movdqa xmm12, xmm7 1841 punpcklqdq xmm12, xmm5 1842 movdqa xmm13, xmm6 1843 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1844 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1845 por xmm12, xmm13 1846 pshufd xmm12, xmm12, 0x78 1847 punpckhdq xmm5, xmm7 1848 punpckldq xmm6, xmm5 1849 pshufd xmm7, xmm6, 0x1E 1850 movdqa xmmword ptr [rsp+0x40], xmm12 1851 movdqa xmm5, xmmword ptr [rsp+0x30] 1852 movdqa xmm13, xmmword ptr [rsp+0x50] 1853 pshufd xmm6, xmm5, 0x0F 1854 shufps xmm5, xmm13, 214 1855 pshufd 
xmm12, xmm5, 0x39 1856 movdqa xmm5, xmm14 1857 shufps xmm5, xmm15, 250 1858 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] 1859 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] 1860 por xmm6, xmm5 1861 movdqa xmm5, xmm15 1862 punpcklqdq xmm5, xmm13 1863 movdqa xmmword ptr [rsp+0x30], xmm2 1864 movdqa xmm2, xmm14 1865 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] 1866 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] 1867 por xmm5, xmm2 1868 movdqa xmm2, xmmword ptr [rsp+0x30] 1869 pshufd xmm5, xmm5, 0x78 1870 punpckhdq xmm13, xmm15 1871 punpckldq xmm14, xmm13 1872 pshufd xmm15, xmm14, 0x1E 1873 movdqa xmm13, xmm6 1874 movdqa xmm14, xmm5 1875 movdqa xmm5, xmmword ptr [rsp+0x20] 1876 movdqa xmm6, xmmword ptr [rsp+0x40] 1877 jmp 9b 1878 9: 1879 pxor xmm0, xmm2 1880 pxor xmm1, xmm3 1881 pxor xmm8, xmm10 1882 pxor xmm9, xmm11 1883 mov eax, r13d 1884 cmp rdx, r15 1885 jne 2b 1886 movups xmmword ptr [rbx], xmm0 1887 movups xmmword ptr [rbx+0x10], xmm1 1888 movups xmmword ptr [rbx+0x20], xmm8 1889 movups xmmword ptr [rbx+0x30], xmm9 1890 mov eax, dword ptr [rsp+0x130] 1891 neg eax 1892 mov r10d, dword ptr [rsp+0x110+8*rax] 1893 mov r11d, dword ptr [rsp+0x120+8*rax] 1894 mov dword ptr [rsp+0x110], r10d 1895 mov dword ptr [rsp+0x120], r11d 1896 add rdi, 16 1897 add rbx, 64 1898 sub rsi, 2 1899 3: 1900 test esi, 0x1 1901 je 4b 1902 movups xmm0, xmmword ptr [rcx] 1903 movups xmm1, xmmword ptr [rcx+0x10] 1904 movd xmm13, dword ptr [rsp+0x110] 1905 movd xmm14, dword ptr [rsp+0x120] 1906 punpckldq xmm13, xmm14 1907 mov r8, qword ptr [rdi] 1908 movzx eax, byte ptr [rbp+0x40] 1909 or eax, r13d 1910 xor edx, edx 1911 2: 1912 mov r14d, eax 1913 or eax, r12d 1914 add rdx, 64 1915 cmp rdx, r15 1916 cmovne eax, r14d 1917 movaps xmm2, xmmword ptr [BLAKE3_IV+rip] 1918 shl rax, 32 1919 or rax, 64 1920 movd xmm12, rax 1921 movdqa xmm3, xmm13 1922 punpcklqdq xmm3, xmm12 1923 movups xmm4, xmmword ptr [r8+rdx-0x40] 1924 movups xmm5, xmmword ptr [r8+rdx-0x30] 1925 movaps xmm8, xmm4 1926 shufps 
xmm4, xmm5, 136 1927 shufps xmm8, xmm5, 221 1928 movaps xmm5, xmm8 1929 movups xmm6, xmmword ptr [r8+rdx-0x20] 1930 movups xmm7, xmmword ptr [r8+rdx-0x10] 1931 movaps xmm8, xmm6 1932 shufps xmm6, xmm7, 136 1933 pshufd xmm6, xmm6, 0x93 1934 shufps xmm8, xmm7, 221 1935 pshufd xmm7, xmm8, 0x93 1936 mov al, 7 1937 9: 1938 paddd xmm0, xmm4 1939 paddd xmm0, xmm1 1940 pxor xmm3, xmm0 1941 pshuflw xmm3, xmm3, 0xB1 1942 pshufhw xmm3, xmm3, 0xB1 1943 paddd xmm2, xmm3 1944 pxor xmm1, xmm2 1945 movdqa xmm11, xmm1 1946 pslld xmm1, 20 1947 psrld xmm11, 12 1948 por xmm1, xmm11 1949 paddd xmm0, xmm5 1950 paddd xmm0, xmm1 1951 pxor xmm3, xmm0 1952 movdqa xmm14, xmm3 1953 psrld xmm3, 8 1954 pslld xmm14, 24 1955 pxor xmm3, xmm14 1956 paddd xmm2, xmm3 1957 pxor xmm1, xmm2 1958 movdqa xmm11, xmm1 1959 pslld xmm1, 25 1960 psrld xmm11, 7 1961 por xmm1, xmm11 1962 pshufd xmm0, xmm0, 0x93 1963 pshufd xmm3, xmm3, 0x4E 1964 pshufd xmm2, xmm2, 0x39 1965 paddd xmm0, xmm6 1966 paddd xmm0, xmm1 1967 pxor xmm3, xmm0 1968 pshuflw xmm3, xmm3, 0xB1 1969 pshufhw xmm3, xmm3, 0xB1 1970 paddd xmm2, xmm3 1971 pxor xmm1, xmm2 1972 movdqa xmm11, xmm1 1973 pslld xmm1, 20 1974 psrld xmm11, 12 1975 por xmm1, xmm11 1976 paddd xmm0, xmm7 1977 paddd xmm0, xmm1 1978 pxor xmm3, xmm0 1979 movdqa xmm14, xmm3 1980 psrld xmm3, 8 1981 pslld xmm14, 24 1982 pxor xmm3, xmm14 1983 paddd xmm2, xmm3 1984 pxor xmm1, xmm2 1985 movdqa xmm11, xmm1 1986 pslld xmm1, 25 1987 psrld xmm11, 7 1988 por xmm1, xmm11 1989 pshufd xmm0, xmm0, 0x39 1990 pshufd xmm3, xmm3, 0x4E 1991 pshufd xmm2, xmm2, 0x93 1992 dec al 1993 jz 9f 1994 movdqa xmm8, xmm4 1995 shufps xmm8, xmm5, 214 1996 pshufd xmm9, xmm4, 0x0F 1997 pshufd xmm4, xmm8, 0x39 1998 movdqa xmm8, xmm6 1999 shufps xmm8, xmm7, 250 2000 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] 2001 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] 2002 por xmm9, xmm8 2003 movdqa xmm8, xmm7 2004 punpcklqdq xmm8, xmm5 2005 movdqa xmm10, xmm6 2006 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] 2007 pand 
xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]      /* operands of the pand whose mnemonic ends the previous line */
        /*
         * Tail of the preceding routine (it begins earlier in the file); the
         * instructions are reproduced unchanged.  The first group finishes
         * rearranging the message rows xmm4-xmm7 for the next pass of the
         * round loop.
         */
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2/3 into rows 0/1. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        mov     eax, r13d
        cmp     rdx, r15                /* rdx advances by 64 per block; r15 is the end offset */
        jne     2b
        /* All blocks consumed: store the 32-byte result and leave via the shared epilogue. */
        movups  xmmword ptr [rbx], xmm0
        movups  xmmword ptr [rbx+0x10], xmm1
        jmp     4b

/*
 * blake3_compress_in_place_sse2
 *
 * Runs the 7-round compression loop over one 64-byte block and writes the
 * 32-byte result back over the input state.
 *
 * ABI:  System V AMD64, leaf function, no stack usage.
 * In:   rdi -> 32-byte state; read as two 16-byte rows, overwritten on exit
 *       rsi -> 64-byte message block (unaligned loads are used)
 *       dl  =  byte placed in dword 2 of state row 3
 *              (presumably block_len; confirm against the C prototype)
 *       rcx =  64-bit value placed in dwords 0-1 of state row 3
 *              (presumably the counter; confirm against the C prototype)
 *       r8b =  byte placed in dword 3 of state row 3
 *              (presumably the flags; confirm against the C prototype)
 * Out:  [rdi, rdi+32) = (row0 ^ row2, row1 ^ row3)
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
 *
 * Register roles inside the loop:
 *   xmm0-xmm3   state rows 0-3
 *   xmm4-xmm7   message words, rearranged after every round
 *   xmm8-xmm11, xmm14   scratch
 *   al          rounds remaining
 */
.p2align 6
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]                 /* row 0 */
        movups  xmm1, xmmword ptr [rdi+0x10]            /* row 1 */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row 2 */
        /*
         * Use only the low byte of each 8-bit argument, exactly as
         * blake3_compress_xof_sse2 does.  The previous sequence
         * (shl r8, 32 / add rdx, r8) consumed the full 64-bit registers, so
         * any stray upper bits left by a caller ended up in the state.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                                /* rdx = dl | (r8b << 32) */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                           /* row 3 = { rcx, rdx } */
        /*
         * Load the block and split it into even-indexed dwords (shufps 136)
         * and odd-indexed dwords (shufps 221).  The upper half is additionally
         * lane-rotated with pshufd 0x93.
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                                   /* round counter */
9:
        /* First half of the round. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                        /* swap 16-bit halves of each dword: rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1                             /* rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3                             /* rotate right by 8 */
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1                             /* rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Lane-rotate rows 0, 3 and 2 for the second half; undone below. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half of the round: same mixing with the other two message rows. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                                      /* no rearrangement needed after the last round */
        /*
         * Rearrange the message rows for the next round.  Each pand/pand/por
         * triple merges dwords from two registers through a pair of
         * complementary masks.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2/3 into rows 0/1 and overwrite the caller's state. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret

/*
 * blake3_compress_xof_sse2
 *
 * Same round loop as blake3_compress_in_place_sse2, but the input state is
 * left untouched and a 64-byte result is written to a separate buffer.
 *
 * ABI:  System V AMD64, leaf function, no stack usage.
 * In:   rdi -> 32-byte state (read at entry and again at exit; never written)
 *       rsi -> 64-byte message block (unaligned loads are used)
 *       dl  =  byte placed in dword 2 of state row 3
 *              (presumably block_len; confirm against the C prototype)
 *       rcx =  64-bit value placed in dwords 0-1 of state row 3
 *              (presumably the counter; confirm against the C prototype)
 *       r8b =  byte placed in dword 3 of state row 3
 *              (presumably the flags; confirm against the C prototype)
 *       r9  -> 64-byte output buffer (unaligned stores are used)
 * Out:  [r9,    r9+32) = (row0 ^ row2, row1 ^ row3)
 *       [r9+32, r9+64) = (row2 ^ state[0..15], row3 ^ state[16..31])
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
 */
.p2align 6
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]                 /* row 0 */
        movups  xmm1, xmmword ptr [rdi+0x10]            /* row 1 */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row 2 */
        movzx   eax, r8b                                /* keep only the low byte of each 8-bit argument */
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                                /* rdx = dl | (r8b << 32) */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                           /* row 3 = { rcx, rdx } */
        /* Load the block: even-indexed dwords via shufps 136, odd-indexed via shufps 221. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                                   /* round counter */
9:
        /* First half of the round. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                        /* rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1                             /* rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3                             /* rotate right by 8 */
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1                             /* rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Lane-rotate rows 0, 3 and 2 for the second half; undone below. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half of the round. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                                      /* no rearrangement needed after the last round */
        /* Rearrange the message rows for the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Reload the untouched input state; the message registers are free now. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2                              /* first 32 output bytes: rows 0/1 ^ rows 2/3 */
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4                              /* last 32 output bytes: rows 2/3 ^ input state */
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret


/*
 * Constant tables.
 *
 * Alignment invariant: the section starts on a 64-byte boundary and every
 * table below is exactly 16 bytes, so each label stays 16-byte aligned.
 * The code depends on that: movaps/movdqa loads and SSE memory operands
 * (pand, pxor, ...) fault on unaligned addresses.  Keep every table a
 * multiple of 16 bytes when adding or reordering entries.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align  6
BLAKE3_IV:                              /* loaded as state row 2 by the compress routines */
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
ADD0:                                   /* per-lane offsets 0..3, masked into the lane counters at entry */
        .long  0, 1, 2, 3
ADD1:                                   /* per-lane step of 4, masked at entry and added after each 4-way batch */
        .long  4, 4, 4, 4
/*
 * Broadcast copies of the four IV words and of the constant 64.  They are not
 * referenced in the part of the file visible here; presumably the 4-way
 * hashing path uses them (confirm before removing).
 */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
CMP_MSB_MASK:                           /* xor-ed into both operands before pcmpgtd so the signed compare orders unsigned values */
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Dword-select masks used in pand/pand/por triples.  0x33/0xCC and 0x3F/0xC0
 * are complementary pairs; the names match the pblendw immediates that would
 * select the same lanes.
 */
PBLENDW_0x33_MASK:                      /* dwords 0 and 2 */
        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:                      /* dwords 1 and 3 */
        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:                      /* dwords 0, 1 and 2 */
        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:                      /* dword 3 */
        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF