damus

nostr ios client
git clone git://jb55.com/damus

sha256_avx2_rorx2.asm (25414B)


      1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      2 ; Copyright (c) 2012, Intel Corporation
      3 ;
      4 ; All rights reserved.
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions are
      8 ; met:
      9 ;
     10 ; * Redistributions of source code must retain the above copyright
     11 ;   notice, this list of conditions and the following disclaimer.
     12 ;
     13 ; * Redistributions in binary form must reproduce the above copyright
     14 ;   notice, this list of conditions and the following disclaimer in the
     15 ;   documentation and/or other materials provided with the
     16 ;   distribution.
     17 ;
     18 ; * Neither the name of the Intel Corporation nor the names of its
     19 ;   contributors may be used to endorse or promote products derived from
     20 ;   this software without specific prior written permission.
     21 ;
     22 ;
     23 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
     24 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     25 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     26 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
     27 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     28 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     29 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     30 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     31 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     32 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     33 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     35 ;
     36 ; Example YASM command lines:
     37 ; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
      38 ; Linux:    yasm -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
     39 ;
     40 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     41 ;
     42 ; This code is described in an Intel White-Paper:
     43 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
     44 ;
     45 ; To find it, surf to http://www.intel.com/p/en_US/embedded
     46 ; and search for that title.
     47 ; The paper is expected to be released roughly at the end of April, 2012
     48 ;
     49 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     50 ; This code schedules 2 blocks at a time, with 4 lanes per block
     51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
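         ; For reference, the S0/S1/CH/MAJ tags in the round comments below correspond
         ; to the standard SHA-256 functions (FIPS 180-4):
         ;   S0(a) = ROR(a,2)  ^ ROR(a,13) ^ ROR(a,22)
         ;   S1(e) = ROR(e,6)  ^ ROR(e,11) ^ ROR(e,25)
         ;   CH    = (e & f) ^ (~e & g)      computed here as ((f^g)&e)^g
         ;   MAJ   = (a&b) ^ (a&c) ^ (b&c)   computed here as ((a|c)&b)|(a&c)
         ; Each round computes
         ;   T1 = h + S1(e) + CH + K[t] + W[t],  T2 = S0(a) + MAJ,  e' = d + T1,  a' = T1 + T2
         ; and the message schedule extends the 16 input dwords of each block with
         ;   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
         ;   s0(x) = ROR(x,7)  ^ ROR(x,18) ^ (x >> 3)
         ;   s1(x) = ROR(x,17) ^ ROR(x,19) ^ (x >> 10)
         ; Note: comments such as "e >> 25" describe rorx results, i.e. rotates, not shifts.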
     52 
     53 %define	VMOVDQ vmovdqu ;; assume buffers not aligned
     54 
     55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
     56 
     57 ; addm [mem], reg
     58 ; Add reg to mem using reg-mem add and store
     59 %macro addm 2
     60 	add	%2, %1
     61 	mov	%1, %2
     62 %endm
     63 
     64 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     65 
     66 %define X0 ymm4
     67 %define X1 ymm5
     68 %define X2 ymm6
     69 %define X3 ymm7
     70 
     71 ; XMM versions of above
     72 %define XWORD0 xmm4
     73 %define XWORD1 xmm5
     74 %define XWORD2 xmm6
     75 %define XWORD3 xmm7
     76 
     77 %define XTMP0 ymm0
     78 %define XTMP1 ymm1
     79 %define XTMP2 ymm2
     80 %define XTMP3 ymm3
     81 %define XTMP4 ymm8
     82 %define XFER  ymm9
     83 %define XTMP5 ymm11
     84 
     85 %define SHUF_00BA	ymm10 ; shuffle xBxA -> 00BA
     86 %define SHUF_DC00	ymm12 ; shuffle xDxC -> DC00
     87 %define BYTE_FLIP_MASK	ymm13
     88 
     89 %define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK
     90 
     91 %ifdef LINUX
     92 %define NUM_BLKS rdx	; 3rd arg
     93 %define CTX	rsi   	; 2nd arg
     94 %define INP	rdi	; 1st arg
     95 %define c 	ecx
     96 %define d 	r8d
     97 %define e       edx	; clobbers NUM_BLKS
     98 %define y3 	edi	; clobbers INP
     99 %else
    100 %define NUM_BLKS r8     ; 3rd arg
    101 %define CTX	rdx 	; 2nd arg
    102 %define INP	rcx 	; 1st arg
    103 %define c 	edi
    104 %define d 	esi
    105 %define e 	r8d	; clobbers NUM_BLKS
    106 %define y3 	ecx	; clobbers INP
    107 
    108 %endif
    109 
    110 
    111 %define TBL	rbp
    112 %define SRND	CTX	; SRND is same register as CTX
    113 
    114 %define a eax
    115 %define b ebx
    116 %define f r9d
    117 %define g r10d
    118 %define h r11d
    119 %define old_h r11d
    120 
    121 %define T1 r12d
    122 %define y0 r13d
    123 %define y1 r14d
    124 %define y2 r15d
    125 
    126 
    127 _XFER_SIZE	equ 2*64*4	; 2 blocks, 64 rounds, 4 bytes/round
    128 %ifdef LINUX
    129 _XMM_SAVE_SIZE	equ 0
    130 %else
    131 _XMM_SAVE_SIZE	equ 8*16
    132 %endif
    133 _INP_END_SIZE	equ 8
    134 _INP_SIZE	equ 8
    135 _CTX_SIZE	equ 8
    136 _RSP_SIZE	equ 8
    137 
    138 _XFER		equ 0
    139 _XMM_SAVE	equ _XFER     + _XFER_SIZE
    140 _INP_END	equ _XMM_SAVE + _XMM_SAVE_SIZE
    141 _INP		equ _INP_END  + _INP_END_SIZE
    142 _CTX		equ _INP      + _INP_SIZE
    143 _RSP		equ _CTX      + _CTX_SIZE
    144 STACK_SIZE	equ _RSP      + _RSP_SIZE
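
         ; Stack frame summary (derived from the equates above): after "and rsp, -32"
         ; the frame holds, from rsp upward:
         ;   _XFER      512 bytes  K[t]+W[t] for both blocks (2 blocks x 64 rounds x 4 bytes)
         ;   _XMM_SAVE  128 bytes  callee-saved xmm6-xmm13 (Windows ABI only, 0 on Linux)
         ;   _INP_END     8 bytes  pointer to the last input block
         ;   _INP         8 bytes  current input pointer
         ;   _CTX         8 bytes  digest pointer
         ;   _RSP         8 bytes  caller rsp, restored on exit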
    145 
    146 ; rotate_Xs
    147 ; Rotate values of symbols X0...X3
    148 %macro rotate_Xs 0
    149 %xdefine X_ X0
    150 %xdefine X0 X1
    151 %xdefine X1 X2
    152 %xdefine X2 X3
    153 %xdefine X3 X_
    154 %endm
    155 
    156 ; ROTATE_ARGS
    157 ; Rotate values of symbols a...h
    158 %macro ROTATE_ARGS 0
    159 %xdefine old_h h
    160 %xdefine TMP_ h
    161 %xdefine h g
    162 %xdefine g f
    163 %xdefine f e
    164 %xdefine e d
    165 %xdefine d c
    166 %xdefine c b
    167 %xdefine b a
    168 %xdefine a TMP_
    169 %endm
    170 
    171 %macro FOUR_ROUNDS_AND_SCHED 1
    172 %define %%XFER %1
    173 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    174 
    175 	mov	y3, a		; y3 = a                                ; MAJA
    176 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    177 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    178 
    179 	add	h, dword[%%XFER+0*4]		; h = k + w + h         ; --
    180 	or	y3, c		; y3 = a|c                              ; MAJA
    181 		vpalignr	XTMP0, X3, X2, 4	; XTMP0 = W[-7]
    182 	mov	y2, f		; y2 = f                                ; CH
    183 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    184 
    185 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    186 	xor	y2, g		; y2 = f^g                              ; CH
     187 		vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
    188 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    189 
    190 	and	y2, e		; y2 = (f^g)&e                          ; CH
    191 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    192 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    193 	add	d, h		; d = k + w + h + d                     ; --
    194 
    195 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    196 		vpalignr	XTMP1, X1, X0, 4	; XTMP1 = W[-15]
    197 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    198 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    199 
    200 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    201 		vpsrld	XTMP2, XTMP1, 7
    202 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    203 	mov	T1, a		; T1 = a                                ; MAJB
    204 	and	T1, c		; T1 = a&c                              ; MAJB
    205 
    206 	add	y2, y0		; y2 = S1 + CH                          ; --
    207 		vpslld	XTMP3, XTMP1, (32-7)
     208 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    209 	add	h, y1		; h = k + w + h + S0                    ; --
    210 
    211 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
     212 		vpor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] ror 7
    213 
    214 		vpsrld	XTMP2, XTMP1,18
    215 	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    216 	add	h, y3		; h = t1 + S0 + MAJ                     ; --
    217 
    218 
    219 ROTATE_ARGS
    220 
    221 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    222 
    223 
    224 	mov	y3, a		; y3 = a                                ; MAJA
    225 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    226 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    227 	add	h, dword[%%XFER+1*4]		; h = k + w + h         ; --
    228 	or	y3, c		; y3 = a|c                              ; MAJA
    229 
    230 
     231 		vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
    232 	mov	y2, f		; y2 = f                                ; CH
    233 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    234 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    235 	xor	y2, g		; y2 = f^g                              ; CH
    236 
    237 
    238 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    239 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    240 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    241 	and	y2, e		; y2 = (f^g)&e                          ; CH
    242 	add	d, h		; d = k + w + h + d                     ; --
    243 
    244 		vpslld	XTMP1, XTMP1, (32-18)
    245 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    246 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    247 
    248 		vpxor	XTMP3, XTMP3, XTMP1
    249 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    250 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    251 
     252 		vpxor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
    253 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    254 	mov	T1, a		; T1 = a                                ; MAJB
    255 	and	T1, c		; T1 = a&c                              ; MAJB
    256 	add	y2, y0		; y2 = S1 + CH                          ; --
    257 
    258 		vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
    259 		vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
     260 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    261 	add	h, y1		; h = k + w + h + S0                    ; --
    262 
    263 		vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
    264 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    265 	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    266 	add	h, y3		; h = t1 + S0 + MAJ                     ; --
    267 
    268 		vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
    269 
    270 
    271 ROTATE_ARGS
    272 
    273 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    274 
    275 	mov	y3, a		; y3 = a                                ; MAJA
    276 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
     277 	add	h, dword[%%XFER+2*4]		; h = k + w + h         ; --
    278 
    279 		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] ror 19 {xBxA}
    280 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    281 	or	y3, c		; y3 = a|c                              ; MAJA
    282 	mov	y2, f		; y2 = f                                ; CH
    283 	xor	y2, g		; y2 = f^g                              ; CH
    284 
    285 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    286 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    287 		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
    288 	and	y2, e		; y2 = (f^g)&e                          ; CH
    289 
    290 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    291 		vpxor	XTMP2, XTMP2, XTMP3
    292 	add	d, h		; d = k + w + h + d                     ; --
    293 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    294 
    295 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    296 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    297 		vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
    298 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    299 
    300 		vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
    301 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    302 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    303 		vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
    304 
    305 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    306 	mov	T1, a		; T1 = a                                ; MAJB
    307 	and	T1, c		; T1 = a&c                              ; MAJB
    308 	add	y2, y0		; y2 = S1 + CH                          ; --
    309 		vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
    310 
     311 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    312 	add	h, y1		; h = k + w + h + S0                    ; --
    313 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    314 	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    315 
    316 	add	h, y3		; h = t1 + S0 + MAJ                     ; --
    317 
    318 
    319 ROTATE_ARGS
    320 
    321 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    322 
    323 	mov	y3, a		; y3 = a                                ; MAJA
    324 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    325 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    326 	add	h, dword[%%XFER+3*4]		; h = k + w + h         ; --
    327 	or	y3, c		; y3 = a|c                              ; MAJA
    328 
    329 
    330 		vpsrld	XTMP5, XTMP2,   10	; XTMP5 = W[-2] >> 10 {DDCC}
    331 	mov	y2, f		; y2 = f                                ; CH
    332 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    333 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    334 	xor	y2, g		; y2 = f^g                              ; CH
    335 
    336 
    337 		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] ror 19 {xDxC}
    338 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    339 	and	y2, e		; y2 = (f^g)&e                          ; CH
    340 	add	d, h		; d = k + w + h + d                     ; --
    341 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    342 
    343 		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
    344 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    345 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    346 
    347 		vpxor	XTMP2, XTMP2, XTMP3
    348 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    349 	add	y2, y0		; y2 = S1 + CH                          ; --
    350 
    351 		vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
    352 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    353 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    354 
    355 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    356 		vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
    357 
    358 		vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
    359 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    360 	mov	T1, a		; T1 = a                                ; MAJB
    361 	and	T1, c		; T1 = a&c                              ; MAJB
     362 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    363 
    364 	add	h, y1		; h = k + w + h + S0                    ; --
    365 	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    366 	add	h, y3		; h = t1 + S0 + MAJ                     ; --
    367 
    368 ROTATE_ARGS
    369 rotate_Xs
    370 %endm
    371 
    372 %macro DO_4ROUNDS 1
    373 %define %%XFER %1
    374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
    375 
    376 	mov	y2, f		; y2 = f                                ; CH
    377 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    378 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    379 	xor	y2, g		; y2 = f^g                              ; CH
    380 
    381 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    382 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    383 	and	y2, e		; y2 = (f^g)&e                          ; CH
    384 
    385 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    386 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    387 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    388 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    389 	mov	y3, a		; y3 = a                                ; MAJA
    390 
    391 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    392 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    393 	add	h, dword[%%XFER + 4*0]		; h = k + w + h ; --
    394 	or	y3, c		; y3 = a|c                              ; MAJA
    395 
    396 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    397 	mov	T1, a		; T1 = a                                ; MAJB
    398 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    399 	and	T1, c		; T1 = a&c                              ; MAJB
    400 	add	y2, y0		; y2 = S1 + CH                          ; --
    401 
    402 
    403 	add	d, h		; d = k + w + h + d                     ; --
     404 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    405 	add	h, y1		; h = k + w + h + S0                    ; --
    406 
    407 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    408 
    409 
    410 	;add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    411 
    412 	;add	h, y3		; h = t1 + S0 + MAJ                     ; --
    413 
    414 	ROTATE_ARGS
    415 
    416 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
    417 
    418 	add	old_h, y2	; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    419 	mov	y2, f		; y2 = f                                ; CH
    420 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    421 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    422 	xor	y2, g		; y2 = f^g                              ; CH
    423 
    424 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    425 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    426 	and	y2, e		; y2 = (f^g)&e                          ; CH
    427 	add	old_h, y3	; h = t1 + S0 + MAJ                     ; --
    428 
    429 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    430 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    431 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    432 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    433 	mov	y3, a		; y3 = a                                ; MAJA
    434 
    435 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    436 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    437 	add	h, dword[%%XFER + 4*1]		; h = k + w + h ; --
    438 	or	y3, c		; y3 = a|c                              ; MAJA
    439 
    440 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    441 	mov	T1, a		; T1 = a                                ; MAJB
    442 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    443 	and	T1, c		; T1 = a&c                              ; MAJB
    444 	add	y2, y0		; y2 = S1 + CH                          ; --
    445 
    446 
    447 	add	d, h		; d = k + w + h + d                     ; --
     448 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    449 	add	h, y1		; h = k + w + h + S0                    ; --
    450 
    451 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    452 
    453 
    454 	;add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    455 
    456 	;add	h, y3		; h = t1 + S0 + MAJ                     ; --
    457 
    458 	ROTATE_ARGS
    459 
    460 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    461 
    462 	add	old_h, y2	; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    463 	mov	y2, f		; y2 = f                                ; CH
    464 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    465 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    466 	xor	y2, g		; y2 = f^g                              ; CH
    467 
    468 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    469 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    470 	and	y2, e		; y2 = (f^g)&e                          ; CH
    471 	add	old_h, y3	; h = t1 + S0 + MAJ                     ; --
    472 
    473 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    474 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    475 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    476 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    477 	mov	y3, a		; y3 = a                                ; MAJA
    478 
    479 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    480 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    481 	add	h, dword[%%XFER + 4*2]		; h = k + w + h ; --
    482 	or	y3, c		; y3 = a|c                              ; MAJA
    483 
    484 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    485 	mov	T1, a		; T1 = a                                ; MAJB
    486 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    487 	and	T1, c		; T1 = a&c                              ; MAJB
    488 	add	y2, y0		; y2 = S1 + CH                          ; --
    489 
    490 
    491 	add	d, h		; d = k + w + h + d                     ; --
     492 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    493 	add	h, y1		; h = k + w + h + S0                    ; --
    494 
    495 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    496 
    497 
    498 	;add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    499 
    500 	;add	h, y3		; h = t1 + S0 + MAJ                     ; --
    501 
    502 	ROTATE_ARGS
    503 
    504 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
    505 
    506 	add	old_h, y2	; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    507 	mov	y2, f		; y2 = f                                ; CH
    508 	rorx	y0, e, 25	; y0 = e >> 25				; S1A
    509 	rorx	y1, e, 11	; y1 = e >> 11				; S1B
    510 	xor	y2, g		; y2 = f^g                              ; CH
    511 
    512 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)		; S1
    513 	rorx	y1, e, 6	; y1 = (e >> 6)				; S1
    514 	and	y2, e		; y2 = (f^g)&e                          ; CH
    515 	add	old_h, y3	; h = t1 + S0 + MAJ                     ; --
    516 
    517 	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6)	; S1
    518 	rorx	T1, a, 13	; T1 = a >> 13				; S0B
    519 	xor	y2, g		; y2 = CH = ((f^g)&e)^g                 ; CH
    520 	rorx	y1, a, 22	; y1 = a >> 22				; S0A
    521 	mov	y3, a		; y3 = a                                ; MAJA
    522 
    523 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)		; S0
    524 	rorx	T1, a, 2	; T1 = (a >> 2)				; S0
    525 	add	h, dword[%%XFER + 4*3]		; h = k + w + h ; --
    526 	or	y3, c		; y3 = a|c                              ; MAJA
    527 
    528 	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2)	; S0
    529 	mov	T1, a		; T1 = a                                ; MAJB
    530 	and	y3, b		; y3 = (a|c)&b                          ; MAJA
    531 	and	T1, c		; T1 = a&c                              ; MAJB
    532 	add	y2, y0		; y2 = S1 + CH                          ; --
    533 
    534 
    535 	add	d, h		; d = k + w + h + d                     ; --
     536 	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)            ; MAJ
    537 	add	h, y1		; h = k + w + h + S0                    ; --
    538 
    539 	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1  ; --
    540 
    541 
    542 	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    543 
    544 	add	h, y3		; h = t1 + S0 + MAJ                     ; --
    545 
    546 	ROTATE_ARGS
    547 
    548 %endm
    549 
    550 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    551 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    552 ;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
    553 ;; arg 1 : pointer to input data
    554 ;; arg 2 : pointer to digest
    555 ;; arg 3 : Num blocks
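         ;;
         ;; Arguments follow the native C calling convention: System V AMD64 (selected
         ;; with -D LINUX) passes them in rdi, rsi, rdx; Microsoft x64 passes them in
         ;; rcx, rdx, r8 (see the INP/CTX/NUM_BLKS defines above). An illustrative C
         ;; declaration for callers (names assumed beyond the prototype comment) would be:
         ;;   extern void sha256_rorx(void *input_data, uint32_t digest[8], uint64_t num_blks);
         ;; The routine only compresses whole 64-byte blocks into an existing digest
         ;; state; initial-value setup and final padding are the caller's responsibility.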
    556 section .text
    557 global sha256_rorx
    558 align 32
    559 sha256_rorx:
    560 	push	rbx
    561 %ifndef LINUX
    562 	push	rsi
    563 	push	rdi
    564 %endif
    565 	push	rbp
    566 	push	r12
    567 	push	r13
    568 	push	r14
    569 	push	r15
    570 
    571 	mov	rax, rsp
    572 	sub	rsp,STACK_SIZE
    573 	and	rsp, -32
    574 	mov	[rsp + _RSP], rax
    575 
    576 %ifndef LINUX
    577 	vmovdqa	[rsp + _XMM_SAVE + 0*16],xmm6
    578 	vmovdqa	[rsp + _XMM_SAVE + 1*16],xmm7
    579 	vmovdqa	[rsp + _XMM_SAVE + 2*16],xmm8
    580 	vmovdqa	[rsp + _XMM_SAVE + 3*16],xmm9
    581 	vmovdqa	[rsp + _XMM_SAVE + 4*16],xmm10
    582 	vmovdqa	[rsp + _XMM_SAVE + 5*16],xmm11
    583 	vmovdqa	[rsp + _XMM_SAVE + 6*16],xmm12
    584 	vmovdqa	[rsp + _XMM_SAVE + 7*16],xmm13
    585 %endif
    586 
    587 	shl	NUM_BLKS, 6	; convert to bytes
    588 	jz	done_hash
    589 	lea	NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block
    590 	mov	[rsp + _INP_END], NUM_BLKS
    591 
    592 	cmp	INP, NUM_BLKS
    593 	je	only_one_block
    594 
    595 	;; load initial digest
    596 	mov	a,[4*0 + CTX]
    597 	mov	b,[4*1 + CTX]
    598 	mov	c,[4*2 + CTX]
    599 	mov	d,[4*3 + CTX]
    600 	mov	e,[4*4 + CTX]
    601 	mov	f,[4*5 + CTX]
    602 	mov	g,[4*6 + CTX]
    603 	mov	h,[4*7 + CTX]
    604 
    605 	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    606 	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
    607 	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
    608 
    609 	mov	[rsp + _CTX], CTX
    610 
    611 loop0:
    612 	lea	TBL,[K256 wrt rip]
    613 
    614 	;; Load first 16 dwords from two blocks
    615 	VMOVDQ	XTMP0, [INP + 0*32]
    616 	VMOVDQ	XTMP1, [INP + 1*32]
    617 	VMOVDQ	XTMP2, [INP + 2*32]
    618 	VMOVDQ	XTMP3, [INP + 3*32]
    619 
    620 	;; byte swap data
    621 	vpshufb	XTMP0, XTMP0, BYTE_FLIP_MASK
    622 	vpshufb	XTMP1, XTMP1, BYTE_FLIP_MASK
    623 	vpshufb	XTMP2, XTMP2, BYTE_FLIP_MASK
    624 	vpshufb	XTMP3, XTMP3, BYTE_FLIP_MASK
    625 
    626 	;; transpose data into high/low halves
    627 	vperm2i128	X0, XTMP0, XTMP2, 0x20
    628 	vperm2i128	X1, XTMP0, XTMP2, 0x31
    629 	vperm2i128	X2, XTMP1, XTMP3, 0x20
    630 	vperm2i128	X3, XTMP1, XTMP3, 0x31
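         ; After the transpose, each of X0..X3 holds four message dwords of the first
         ; block in its low 128 bits and the matching four dwords of the second block
         ; in its high 128 bits, so one ymm operation schedules both blocks at once.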
    631 
    632 last_block_enter:
    633 	add	INP, 64
    634 	mov	[rsp + _INP], INP
    635 
     636 	;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
    637 	xor	SRND, SRND
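         ; SRND is a byte offset shared by the K256 table and the _XFER save area:
         ; each 32-byte step covers 4 rounds (16 bytes of W+K per block, two blocks
         ; per row). loop1 below runs 3 times (SRND = 0, 128, 256), scheduling and
         ; processing rounds 0-47; loop2 handles the final 16 rounds without scheduling.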
    638 
    639 align 16
    640 loop1:
    641 	vpaddd	XFER, X0, [TBL + SRND + 0*32]
    642 	vmovdqa [rsp + _XFER + SRND + 0*32], XFER
    643 	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 0*32
    644 
    645 	vpaddd	XFER, X0, [TBL + SRND + 1*32]
    646 	vmovdqa [rsp + _XFER + SRND + 1*32], XFER
    647 	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 1*32
    648 
    649 	vpaddd	XFER, X0, [TBL + SRND + 2*32]
    650 	vmovdqa [rsp + _XFER + SRND + 2*32], XFER
    651 	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 2*32
    652 
    653 	vpaddd	XFER, X0, [TBL + SRND + 3*32]
    654 	vmovdqa [rsp + _XFER + SRND + 3*32], XFER
    655 	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 3*32
    656 
    657 	add	SRND, 4*32
    658 	cmp	SRND, 3 * 4*32
    659 	jb	loop1
    660 
    661 loop2:
    662 	;; Do last 16 rounds with no scheduling
    663 	vpaddd	XFER, X0, [TBL + SRND + 0*32]
    664 	vmovdqa [rsp + _XFER + SRND + 0*32], XFER
    665 	DO_4ROUNDS	rsp + _XFER + SRND + 0*32
    666 	vpaddd	XFER, X1, [TBL + SRND + 1*32]
    667 	vmovdqa [rsp + _XFER + SRND + 1*32], XFER
    668 	DO_4ROUNDS	rsp + _XFER + SRND + 1*32
    669 	add	SRND, 2*32
    670 
    671 	vmovdqa	X0, X2
    672 	vmovdqa	X1, X3
    673 
    674 	cmp	SRND, 4 * 4*32
    675 	jb	loop2
    676 
    677 	mov	CTX, [rsp + _CTX]
    678 	mov	INP, [rsp + _INP]
    679 
    680 	addm	[4*0 + CTX],a
    681 	addm	[4*1 + CTX],b
    682 	addm	[4*2 + CTX],c
    683 	addm	[4*3 + CTX],d
    684 	addm	[4*4 + CTX],e
    685 	addm	[4*5 + CTX],f
    686 	addm	[4*6 + CTX],g
    687 	addm	[4*7 + CTX],h
    688 
    689 	cmp	INP, [rsp + _INP_END]
    690 	ja	done_hash
    691 
    692 	;;;; Do second block using previously scheduled results
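         ; Each 32-byte _XFER row stores W+K for the first block in its low 16 bytes and
         ; for the second block in its high 16 bytes, so these rounds simply replay the
         ; saved values at offset +16 with no further message scheduling.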
    693 	xor	SRND, SRND
    694 align 16
    695 loop3:
    696 	DO_4ROUNDS	rsp + _XFER + SRND + 0*32 + 16
    697 	DO_4ROUNDS	rsp + _XFER + SRND + 1*32 + 16
    698 	add	SRND, 2*32
    699 	cmp	SRND, 4 * 4*32
    700 	jb loop3
    701 
    702 	mov	CTX, [rsp + _CTX]
    703 	mov	INP, [rsp + _INP]
    704 	add	INP, 64
    705 
    706 	addm	[4*0 + CTX],a
    707 	addm	[4*1 + CTX],b
    708 	addm	[4*2 + CTX],c
    709 	addm	[4*3 + CTX],d
    710 	addm	[4*4 + CTX],e
    711 	addm	[4*5 + CTX],f
    712 	addm	[4*6 + CTX],g
    713 	addm	[4*7 + CTX],h
    714 
    715 	cmp	INP, [rsp + _INP_END]
    716 	jb	loop0
    717 	ja	done_hash
    718 
    719 do_last_block:
    720 	;;;; do last block
    721 	lea	TBL,[K256 wrt rip]
    722 
    723 	VMOVDQ	XWORD0, [INP + 0*16]
    724 	VMOVDQ	XWORD1, [INP + 1*16]
    725 	VMOVDQ	XWORD2, [INP + 2*16]
    726 	VMOVDQ	XWORD3, [INP + 3*16]
    727 
    728 	vpshufb	XWORD0, XWORD0, X_BYTE_FLIP_MASK
    729 	vpshufb	XWORD1, XWORD1, X_BYTE_FLIP_MASK
    730 	vpshufb	XWORD2, XWORD2, X_BYTE_FLIP_MASK
    731 	vpshufb	XWORD3, XWORD3, X_BYTE_FLIP_MASK
    732 
    733 	jmp	last_block_enter
    734 
    735 only_one_block:
    736 
    737 	;; load initial digest
    738 	mov	a,[4*0 + CTX]
    739 	mov	b,[4*1 + CTX]
    740 	mov	c,[4*2 + CTX]
    741 	mov	d,[4*3 + CTX]
    742 	mov	e,[4*4 + CTX]
    743 	mov	f,[4*5 + CTX]
    744 	mov	g,[4*6 + CTX]
    745 	mov	h,[4*7 + CTX]
    746 
    747 	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    748 	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
    749 	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
    750 
    751 	mov	[rsp + _CTX], CTX
    752 	jmp	do_last_block
    753 
    754 done_hash:
    755 %ifndef LINUX
    756 	vmovdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
    757 	vmovdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
    758 	vmovdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
    759 	vmovdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
    760 	vmovdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
    761 	vmovdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
    762 	vmovdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
    763 	vmovdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
    764 %endif
    765 
    766 	mov	rsp, [rsp + _RSP]
    767 
    768 	pop	r15
    769 	pop	r14
    770 	pop	r13
    771 	pop	r12
    772 	pop	rbp
    773 %ifndef LINUX
    774 	pop	rdi
    775 	pop	rsi
    776 %endif
    777 	pop	rbx
    778 
    779 	ret
    780 
    781 section .data
    782 align 64
    783 K256:
    784 	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    785 	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    786 	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    787 	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    788 	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    789 	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    790 	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    791 	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    792 	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    793 	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    794 	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    795 	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    796 	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    797 	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    798 	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    799 	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    800 	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    801 	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    802 	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    803 	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    804 	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    805 	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    806 	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    807 	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    808 	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    809 	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    810 	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    811 	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    812 	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    813 	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    814 	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    815 	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    816 
    817 PSHUFFLE_BYTE_FLIP_MASK:
    818 	ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
    819 
    820 ; shuffle xBxA -> 00BA
    821 _SHUF_00BA:
    822 	ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
    823 
    824 ; shuffle xDxC -> DC00
    825 _SHUF_DC00:
    826 	ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF