;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
; Linux:    yasm -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 2 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
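;
; For orientation, a C reference sketch of the scalar round this file
; implements (standard FIPS 180-4 SHA-256; added commentary, not part of
; the original Intel source). The S0/S1/CH/MAJ tags in the comments
; below refer to these terms:
;
;   uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
;
;   S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);  /* Sigma1 */
;   CH  = (e & f) ^ (~e & g);                         /* == ((f ^ g) & e) ^ g */
;   t1  = h + S1 + CH + K[i] + W[i];
;   S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);  /* Sigma0 */
;   MAJ = (a & b) ^ (a & c) ^ (b & c);                /* == ((a | c) & b) | (a & c) */
;   h = g; g = f; f = e; e = d + t1;
;   d = c; c = b; b = a; a = t1 + S0 + MAJ;
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;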

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
        add     %2, %1
        mov     %1, %2
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 ymm4
%define X1 ymm5
%define X2 ymm6
%define X3 ymm7

; XMM versions of above
%define XWORD0 xmm4
%define XWORD1 xmm5
%define XWORD2 xmm6
%define XWORD3 xmm7

%define XTMP0 ymm0
%define XTMP1 ymm1
%define XTMP2 ymm2
%define XTMP3 ymm3
%define XTMP4 ymm8
%define XFER  ymm9
%define XTMP5 ymm11

%define SHUF_00BA ymm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 ymm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK ymm13

%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK

%ifdef LINUX
%define NUM_BLKS rdx    ; 3rd arg
%define CTX      rsi    ; 2nd arg
%define INP      rdi    ; 1st arg
%define c        ecx
%define d        r8d
%define e        edx    ; clobbers NUM_BLKS
%define y3       edi    ; clobbers INP
%else
%define NUM_BLKS r8     ; 3rd arg
%define CTX      rdx    ; 2nd arg
%define INP      rcx    ; 1st arg
%define c        edi
%define d        esi
%define e        r8d    ; clobbers NUM_BLKS
%define y3       ecx    ; clobbers INP
%endif

%define TBL  rbp
%define SRND CTX        ; SRND is same register as CTX

%define a eax
%define b ebx
%define f r9d
%define g r10d
%define h r11d
%define old_h r11d

%define T1 r12d
%define y0 r13d
%define y1 r14d
%define y2 r15d

_XFER_SIZE     equ 2*64*4       ; 2 blocks, 64 rounds, 4 bytes/round
%ifdef LINUX
_XMM_SAVE_SIZE equ 0
%else
_XMM_SAVE_SIZE equ 8*16
%endif
_INP_END_SIZE  equ 8
_INP_SIZE      equ 8
_CTX_SIZE      equ 8
_RSP_SIZE      equ 8

_XFER      equ 0
_XMM_SAVE  equ _XFER     + _XFER_SIZE
_INP_END   equ _XMM_SAVE + _XMM_SAVE_SIZE
_INP       equ _INP_END  + _INP_END_SIZE
_CTX       equ _INP      + _INP_SIZE
_RSP       equ _CTX      + _CTX_SIZE
STACK_SIZE equ _RSP      + _RSP_SIZE

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine old_h h
%xdefine TMP_  h
%xdefine h     g
%xdefine g     f
%xdefine f     e
%xdefine e     d
%xdefine d     c
%xdefine c     b
%xdefine b     a
%xdefine a     TMP_
%endm
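
; Note: rather than shuffling state between registers at the end of every
; round, the round bodies below are written once in terms of the symbols
; a..h, and ROTATE_ARGS simply rebinds those symbols; after eight
; rotations the mapping returns to its starting point. rotate_Xs does the
; same for the message-schedule registers X0..X3.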

%macro FOUR_ROUNDS_AND_SCHED 1
%define %%XFER %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        mov     y3, a           ; y3 = a                                ; MAJA
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B

        add     h, dword[%%XFER+0*4]    ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA
        vpalignr XTMP0, X3, X2, 4       ; XTMP0 = W[-7]
        mov     y2, f           ; y2 = f                                ; CH
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        xor     y2, g           ; y2 = f^g                              ; CH
        vpaddd  XTMP0, XTMP0, X0        ; XTMP0 = W[-7] + W[-16]
        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1

        and     y2, e           ; y2 = (f^g)&e                          ; CH
        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        add     d, h            ; d = k + w + h + d                     ; --

        and     y3, b           ; y3 = (a|c)&b                          ; MAJA
        vpalignr XTMP1, X1, X0, 4       ; XTMP1 = W[-15]
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0

        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH
        vpsrld  XTMP2, XTMP1, 7
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     T1, c           ; T1 = a&c                              ; MAJB

        add     y2, y0          ; y2 = S1 + CH                          ; --
        vpslld  XTMP3, XTMP1, (32-7)
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --

        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --
        vpor    XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] ror 7

        vpsrld  XTMP2, XTMP1, 18
        add     h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        add     h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        mov     y3, a           ; y3 = a                                ; MAJA
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        add     h, dword[%%XFER+1*4]    ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA

        vpsrld  XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
        mov     y2, f           ; y2 = f                                ; CH
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        xor     y2, g           ; y2 = f^g                              ; CH

        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        and     y2, e           ; y2 = (f^g)&e                          ; CH
        add     d, h            ; d = k + w + h + d                     ; --

        vpslld  XTMP1, XTMP1, (32-18)
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0

        vpxor   XTMP3, XTMP3, XTMP1
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH

        vpxor   XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     T1, c           ; T1 = a&c                              ; MAJB
        add     y2, y0          ; y2 = S1 + CH                          ; --

        vpxor   XTMP1, XTMP3, XTMP4     ; XTMP1 = s0
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --

        vpaddd  XTMP0, XTMP0, XTMP1     ; XTMP0 = W[-16] + W[-7] + s0
        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --
        add     h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        add     h, y3           ; h = t1 + S0 + MAJ                     ; --

        vpsrld  XTMP4, XTMP2, 10        ; XTMP4 = W[-2] >> 10 {BBAA}

        ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        mov     y3, a           ; y3 = a                                ; MAJA
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        add     h, [%%XFER+2*4] ; h = k + w + h                         ; --

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] ror 19 {xBxA}
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        or      y3, c           ; y3 = a|c                              ; MAJA
        mov     y2, f           ; y2 = f                                ; CH
        xor     y2, g           ; y2 = f^g                              ; CH

        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] ror 17 {xBxA}
        and     y2, e           ; y2 = (f^g)&e                          ; CH

        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        vpxor   XTMP2, XTMP2, XTMP3
        add     d, h            ; d = k + w + h + d                     ; --
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        vpxor   XTMP4, XTMP4, XTMP2     ; XTMP4 = s1 {xBxA}
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH

        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        vpaddd  XTMP0, XTMP0, XTMP4     ; XTMP0 = {..., ..., W[1], W[0]}

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     T1, c           ; T1 = a&c                              ; MAJB
        add     y2, y0          ; y2 = S1 + CH                          ; --
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}

        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --
        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --
        add     h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

        add     h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        mov     y3, a           ; y3 = a                                ; MAJA
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        add     h, dword[%%XFER+3*4]    ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA

        vpsrld  XTMP5, XTMP2, 10        ; XTMP5 = W[-2] >> 10 {DDCC}
        mov     y2, f           ; y2 = f                                ; CH
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        xor     y2, g           ; y2 = f^g                              ; CH

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] ror 19 {xDxC}
        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        and     y2, e           ; y2 = (f^g)&e                          ; CH
        add     d, h            ; d = k + w + h + d                     ; --
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] ror 17 {xDxC}
        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH

        vpxor   XTMP2, XTMP2, XTMP3
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        add     y2, y0          ; y2 = S1 + CH                          ; --

        vpxor   XTMP5, XTMP5, XTMP2     ; XTMP5 = s1 {xDxC}
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --

        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}

        vpaddd  X0, XTMP5, XTMP0        ; X0 = {W[3], W[2], W[1], W[0]}
        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     T1, c           ; T1 = a&c                              ; MAJB
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ

        add     h, y1           ; h = k + w + h + S0                    ; --
        add     h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        add     h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS
        rotate_Xs
%endm
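
; The vector code above computes the standard SHA-256 message schedule,
; four W values at a time for each of the two interleaved blocks. A C
; reference sketch (with ror32 as in the sketch near the top of this
; file; added commentary, not part of the original source):
;
;   uint32_t s0 = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
;   uint32_t s1 = ror32(w[i-2], 17) ^ ror32(w[i-2], 19)  ^ (w[i-2] >> 10);
;   w[i] = w[i-16] + s0 + w[i-7] + s1;
;
; The 32-bit rotates by 17 and 19 in s1 are implemented as 64-bit vpsrlq
; shifts on dwords duplicated into qword lanes, which leaves valid
; results only in the even dword positions ({xBxA}/{xDxC}); s1 is
; therefore computed in two halves and merged via SHUF_00BA and
; SHUF_DC00.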

%macro DO_4ROUNDS 1
%define %%XFER %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

        mov     y2, f           ; y2 = f                                ; CH
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        xor     y2, g           ; y2 = f^g                              ; CH

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        and     y2, e           ; y2 = (f^g)&e                          ; CH

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        mov     y3, a           ; y3 = a                                ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        add     h, dword[%%XFER + 4*0]  ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA
        and     T1, c           ; T1 = a&c                              ; MAJB
        add     y2, y0          ; y2 = S1 + CH                          ; --

        add     d, h            ; d = k + w + h + d                     ; --
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --

        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --

        ;add    h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        ;add    h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     old_h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        mov     y2, f           ; y2 = f                                ; CH
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        xor     y2, g           ; y2 = f^g                              ; CH

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        and     y2, e           ; y2 = (f^g)&e                          ; CH
        add     old_h, y3       ; h = t1 + S0 + MAJ                     ; --

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        mov     y3, a           ; y3 = a                                ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        add     h, dword[%%XFER + 4*1]  ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA
        and     T1, c           ; T1 = a&c                              ; MAJB
        add     y2, y0          ; y2 = S1 + CH                          ; --

        add     d, h            ; d = k + w + h + d                     ; --
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --

        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --

        ;add    h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        ;add    h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     old_h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        mov     y2, f           ; y2 = f                                ; CH
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        xor     y2, g           ; y2 = f^g                              ; CH

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        and     y2, e           ; y2 = (f^g)&e                          ; CH
        add     old_h, y3       ; h = t1 + S0 + MAJ                     ; --

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        mov     y3, a           ; y3 = a                                ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        add     h, dword[%%XFER + 4*2]  ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA
        and     T1, c           ; T1 = a&c                              ; MAJB
        add     y2, y0          ; y2 = S1 + CH                          ; --

        add     d, h            ; d = k + w + h + d                     ; --
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --

        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --

        ;add    h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        ;add    h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     old_h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
        mov     y2, f           ; y2 = f                                ; CH
        rorx    y0, e, 25       ; y0 = e >> 25                          ; S1A
        rorx    y1, e, 11       ; y1 = e >> 11                          ; S1B
        xor     y2, g           ; y2 = f^g                              ; CH

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11)                ; S1
        rorx    y1, e, 6        ; y1 = (e >> 6)                         ; S1
        and     y2, e           ; y2 = (f^g)&e                          ; CH
        add     old_h, y3       ; h = t1 + S0 + MAJ                     ; --

        xor     y0, y1          ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)       ; S1
        rorx    T1, a, 13       ; T1 = a >> 13                          ; S0B
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g                 ; CH
        rorx    y1, a, 22       ; y1 = a >> 22                          ; S0A
        mov     y3, a           ; y3 = a                                ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13)                ; S0
        rorx    T1, a, 2        ; T1 = (a >> 2)                         ; S0
        add     h, dword[%%XFER + 4*3]  ; h = k + w + h                 ; --
        or      y3, c           ; y3 = a|c                              ; MAJA

        xor     y1, T1          ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)       ; S0
        mov     T1, a           ; T1 = a                                ; MAJB
        and     y3, b           ; y3 = (a|c)&b                          ; MAJA
        and     T1, c           ; T1 = a&c                              ; MAJB
        add     y2, y0          ; y2 = S1 + CH                          ; --

        add     d, h            ; d = k + w + h + d                     ; --
        or      y3, T1          ; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
        add     h, y1           ; h = k + w + h + S0                    ; --

        add     d, y2           ; d = k + w + h + d + S1 + CH = d + t1  ; --

        add     h, y2           ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

        add     h, y3           ; h = t1 + S0 + MAJ                     ; --

        ROTATE_ARGS

%endm
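
; Note: the final two additions of each DO_4ROUNDS round ("add h, y2" and
; "add h, y3", left commented out above) are deferred. ROTATE_ARGS leaves
; old_h naming the register that was h (afterwards bound to a), and the
; next round applies the deferred adds to old_h, shortening the
; dependency chain through h. Only the last round of the group performs
; both adds directly.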
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : number of 64-byte blocks to process
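;;
;; Illustrative caller (a sketch; everything except sha256_rorx and the
;; FIPS 180-4 initial hash value is hypothetical). The routine consumes
;; whole 64-byte blocks only and updates digest[] in place, so padding,
;; length encoding, and carrying state across calls are the caller's
;; responsibility:
;;
;;   #include <stdint.h>
;;
;;   extern void sha256_rorx(void *input_data, uint32_t digest[8],
;;                           uint64_t num_blks);
;;
;;   uint32_t digest[8] = {
;;       0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;;       0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
;;   };
;;   /* buf holds len bytes, len a multiple of 64 (already padded) */
;;   sha256_rorx(buf, digest, len / 64);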
section .text
global sha256_rorx
align 32
sha256_rorx:
        push    rbx
%ifndef LINUX
        push    rsi
        push    rdi
%endif
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15

        mov     rax, rsp
        sub     rsp, STACK_SIZE
        and     rsp, -32
        mov     [rsp + _RSP], rax

%ifndef LINUX
        vmovdqa [rsp + _XMM_SAVE + 0*16], xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16], xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16], xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16], xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16], xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16], xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16], xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16], xmm13
%endif

        shl     NUM_BLKS, 6     ; convert to bytes
        jz      done_hash
        lea     NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block
        mov     [rsp + _INP_END], NUM_BLKS

        cmp     INP, NUM_BLKS
        je      only_one_block

        ;; load initial digest
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
        vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
        vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

        mov     [rsp + _CTX], CTX

loop0:
        lea     TBL, [K256 wrt rip]

        ;; Load first 16 dwords from two blocks
        VMOVDQ  XTMP0, [INP + 0*32]
        VMOVDQ  XTMP1, [INP + 1*32]
        VMOVDQ  XTMP2, [INP + 2*32]
        VMOVDQ  XTMP3, [INP + 3*32]

        ;; byte swap data
        vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK
        vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK
        vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK
        vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK

        ;; transpose data into high/low halves
        vperm2i128      X0, XTMP0, XTMP2, 0x20
        vperm2i128      X1, XTMP0, XTMP2, 0x31
        vperm2i128      X2, XTMP1, XTMP3, 0x20
        vperm2i128      X3, XTMP1, XTMP3, 0x31

last_block_enter:
        add     INP, 64
        mov     [rsp + _INP], INP

        ;; schedule 48 input dwords, in 3 iterations of 16 rounds each
        xor     SRND, SRND

align 16
loop1:
        vpaddd  XFER, X0, [TBL + SRND + 0*32]
        vmovdqa [rsp + _XFER + SRND + 0*32], XFER
        FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 0*32

        vpaddd  XFER, X0, [TBL + SRND + 1*32]
        vmovdqa [rsp + _XFER + SRND + 1*32], XFER
        FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 1*32

        vpaddd  XFER, X0, [TBL + SRND + 2*32]
        vmovdqa [rsp + _XFER + SRND + 2*32], XFER
        FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 2*32

        vpaddd  XFER, X0, [TBL + SRND + 3*32]
        vmovdqa [rsp + _XFER + SRND + 3*32], XFER
        FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 3*32

        add     SRND, 4*32
        cmp     SRND, 3 * 4*32
        jb      loop1

loop2:
        ;; Do last 16 rounds with no scheduling
        vpaddd  XFER, X0, [TBL + SRND + 0*32]
        vmovdqa [rsp + _XFER + SRND + 0*32], XFER
        DO_4ROUNDS      rsp + _XFER + SRND + 0*32
        vpaddd  XFER, X1, [TBL + SRND + 1*32]
        vmovdqa [rsp + _XFER + SRND + 1*32], XFER
        DO_4ROUNDS      rsp + _XFER + SRND + 1*32
        add     SRND, 2*32

        vmovdqa X0, X2
        vmovdqa X1, X3

        cmp     SRND, 4 * 4*32
        jb      loop2

        mov     CTX, [rsp + _CTX]
        mov     INP, [rsp + _INP]

        addm    [4*0 + CTX], a
        addm    [4*1 + CTX], b
        addm    [4*2 + CTX], c
        addm    [4*3 + CTX], d
        addm    [4*4 + CTX], e
        addm    [4*5 + CTX], f
        addm    [4*6 + CTX], g
        addm    [4*7 + CTX], h

        cmp     INP, [rsp + _INP_END]
        ja      done_hash

        ;;;; Do second block using previously scheduled results
        xor     SRND, SRND
align 16
loop3:
        DO_4ROUNDS      rsp + _XFER + SRND + 0*32 + 16
        DO_4ROUNDS      rsp + _XFER + SRND + 1*32 + 16
        add     SRND, 2*32
        cmp     SRND, 4 * 4*32
        jb      loop3

        mov     CTX, [rsp + _CTX]
        mov     INP, [rsp + _INP]
        add     INP, 64

        addm    [4*0 + CTX], a
        addm    [4*1 + CTX], b
        addm    [4*2 + CTX], c
        addm    [4*3 + CTX], d
        addm    [4*4 + CTX], e
        addm    [4*5 + CTX], f
        addm    [4*6 + CTX], g
        addm    [4*7 + CTX], h

        cmp     INP, [rsp + _INP_END]
        jb      loop0
        ja      done_hash

do_last_block:
        ;;;; do last block
        lea     TBL, [K256 wrt rip]

        VMOVDQ  XWORD0, [INP + 0*16]
        VMOVDQ  XWORD1, [INP + 1*16]
        VMOVDQ  XWORD2, [INP + 2*16]
        VMOVDQ  XWORD3, [INP + 3*16]

        vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK
        vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK
        vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK
        vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK

        jmp     last_block_enter

only_one_block:

        ;; load initial digest
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
        vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
        vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

        mov     [rsp + _CTX], CTX
        jmp     do_last_block

done_hash:
%ifndef LINUX
        vmovdqa xmm6,  [rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7,  [rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8,  [rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9,  [rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10, [rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11, [rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12, [rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13, [rsp + _XMM_SAVE + 7*16]
%endif

        mov     rsp, [rsp + _RSP]

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
%ifndef LINUX
        pop     rdi
        pop     rsi
%endif
        pop     rbx

        ret
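
; The K256 table in the data section below stores each group of four
; round constants twice, so a single 32-byte load feeds the same four
; constants to both 128-bit lanes of a ymm register, matching the two
; interleaved blocks.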
section .data
align 64
K256:
        dd      0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd      0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd      0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd      0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd      0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd      0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd      0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd      0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd      0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd      0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd      0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd      0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd      0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd      0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd      0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd      0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd      0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd      0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd      0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd      0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd      0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd      0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd      0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd      0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd      0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd      0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd      0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd      0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd      0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd      0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd      0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        dd      0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
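
; SHA-256 treats the message as big-endian dwords; the mask below is used
; with vpshufb to byte-swap each dword of the little-endian input.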
PSHUFFLE_BYTE_FLIP_MASK:
        ddq     0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA:
        ddq     0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00:
        ddq     0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF