sha256_avx1.asm (15713B)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
        add     %2, %1
        mov     %1, %2
%endm

; MY_ROR reg, imm
; Rotate reg right by imm bits (shld of a register with itself)
%macro MY_ROR 2
        shld    %1, %1, (32-(%2))
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        VMOVDQ  %1, %2
        vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9
%define XTMP5 xmm11

%define SHUF_00BA       xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00       xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm13

%ifdef LINUX
%define NUM_BLKS rdx    ; 3rd arg
%define CTX      rsi    ; 2nd arg
%define INP      rdi    ; 1st arg

%define SRND     rdi    ; clobbers INP
%define c        ecx
%define d        r8d
%define e        edx
%else
%define NUM_BLKS r8     ; 3rd arg
%define CTX      rdx    ; 2nd arg
%define INP      rcx    ; 1st arg

%define SRND     rcx    ; clobbers INP
%define c        edi
%define d        esi
%define e        r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d


_INP_END_SIZE   equ 8
_INP_SIZE       equ 8
_XFER_SIZE      equ 8
%ifdef LINUX
_XMM_SAVE_SIZE  equ 0
%else
_XMM_SAVE_SIZE  equ 8*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE     equ 8

_INP_END        equ 0
_INP            equ _INP_END  + _INP_END_SIZE
_XFER           equ _INP      + _INP_SIZE
_XMM_SAVE       equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE      equ _XMM_SAVE + _XMM_SAVE_SIZE

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
        ;vmovdqa XTMP0, X3
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        vpalignr XTMP0, X3, X2, 4       ; XTMP0 = W[-7]
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        ;vmovdqa XTMP1, X1
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        vpaddd  XTMP0, XTMP0, X0        ; XTMP0 = W[-7] + W[-16]
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        vpalignr XTMP1, X1, X0, 4       ; XTMP1 = W[-15]
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g


        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH

        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a

        vpsrld  XTMP2, XTMP1, 7

        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c

        vpslld  XTMP3, XTMP1, (32-7)

        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0

        vpor    XTMP3, XTMP3, XTMP2     ; XTMP1 = W[-15] MY_ROR 7

        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a


        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)

        vpsrld  XTMP2, XTMP1, 18

        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g

        vpsrld  XTMP4, XTMP1, 3         ; XTMP4 = W[-15] >> 3

        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)

        vpslld  XTMP1, XTMP1, (32-18)

        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP3, XTMP3, XTMP1

        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

        vpxor   XTMP3, XTMP3, XTMP2     ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a

        vpxor   XTMP1, XTMP3, XTMP4     ; XTMP1 = s0

        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        ;; compute low s1
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        vpaddd  XTMP0, XTMP0, XTMP1     ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {BBAA}

        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)

        ;vmovdqa XTMP4, XTMP2           ; XTMP4 = W[-2] {BBAA}

        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP4, XTMP2, 10        ; XTMP4 = W[-2] >> 10 {BBAA}

        xor     y2, g                   ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor   XTMP2, XTMP2, XTMP3
        add     y2, y0                  ; y2 = S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP4, XTMP4, XTMP2     ; XTMP4 = s1 {xBxA}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpaddd  XTMP0, XTMP0, XTMP4     ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {DDCC}
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        ;vmovdqa XTMP5, XTMP2           ; XTMP5 = W[-2] {DDCC}
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP5, XTMP2, 10        ; XTMP5 = W[-2] >> 10 {DDCC}

        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP2, XTMP2, XTMP3

        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP5, XTMP5, XTMP2     ; XTMP5 = s1 {xDxC}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpaddd  X0, XTMP5, XTMP0        ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e                   ; y2 = (f^g)&e
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        add     y2, y0                  ; y2 = S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
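;;
;; A minimal caller-side sketch (an assumption about usage; the names below
;; are illustrative, not part of this file). The digest array holds the eight
;; 32-bit state words and must be seeded with the standard SHA-256 initial
;; values before the first call; each call then processes num_blks whole
;; 64-byte blocks:
;;
;;   extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
;;
;;   uint32_t state[8] = {
;;       0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;;       0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
;;   };
;;   sha256_avx(blocks, state, nblocks);   /* blocks points at nblocks * 64 bytes */
;;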
section .text
global sha256_avx
align 32
sha256_avx:
        push    rbx
%ifndef LINUX
        push    rsi
        push    rdi
%endif
        push    rbp
        push    r13
        push    r14
        push    r15

        sub     rsp, STACK_SIZE
%ifndef LINUX
        vmovdqa [rsp + _XMM_SAVE + 0*16], xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16], xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16], xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16], xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16], xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16], xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16], xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16], xmm13
%endif

        shl     NUM_BLKS, 6             ; convert to bytes
        jz      done_hash
        add     NUM_BLKS, INP           ; pointer to end of data
        mov     [rsp + _INP_END], NUM_BLKS

        ;; load initial digest
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
        vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
        vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
        lea     TBL, [K256 wrt rip]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

        mov     [rsp + _INP], INP

        ;; schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     SRND, 3
align 16
loop1:
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub     SRND, 1
        jne     loop1

        mov     SRND, 2
loop2:
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vpaddd  XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        add     TBL, 2*16
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vmovdqa X0, X2
        vmovdqa X1, X3

        sub     SRND, 1
        jne     loop2


        addm    [4*0 + CTX], a
        addm    [4*1 + CTX], b
        addm    [4*2 + CTX], c
        addm    [4*3 + CTX], d
        addm    [4*4 + CTX], e
        addm    [4*5 + CTX], f
        addm    [4*6 + CTX], g
        addm    [4*7 + CTX], h

        mov     INP, [rsp + _INP]
        add     INP, 64
        cmp     INP, [rsp + _INP_END]
        jne     loop0

done_hash:
%ifndef LINUX
        vmovdqa xmm6,  [rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7,  [rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8,  [rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9,  [rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10, [rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11, [rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12, [rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13, [rsp + _XMM_SAVE + 7*16]
%endif


        add     rsp, STACK_SIZE

        pop     r15
        pop     r14
        pop     r13
        pop     rbp
%ifndef LINUX
        pop     rdi
        pop     rsi
%endif
        pop     rbx

        ret


section .data
align 64
K256:
        dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
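
;; Usage note: the routine consumes whole 64-byte blocks only, so standard
;; SHA-256 padding of the final partial block is left to the caller. A rough
;; caller-side sketch (an assumption; msg, msg_len and state are illustrative
;; names, and all earlier full blocks are presumed already hashed):
;;
;;   uint8_t  blk[128] = {0};
;;   size_t   rem  = msg_len % 64;                /* leftover tail bytes      */
;;   memcpy(blk, msg + msg_len - rem, rem);
;;   blk[rem] = 0x80;                             /* append the single 1 bit  */
;;   size_t   nblk = (rem < 56) ? 1 : 2;          /* room for the length?     */
;;   uint64_t bits = (uint64_t)msg_len * 8;
;;   for (int i = 0; i < 8; i++)                  /* big-endian bit count     */
;;       blk[nblk * 64 - 1 - i] = (uint8_t)(bits >> (8 * i));
;;   sha256_avx(blk, state, nblk);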