damus

nostr ios client
git clone git://jb55.com/damus

sha256_avx1.asm (15713B)


      1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      2 ; Copyright (c) 2012, Intel Corporation
      3 ;
      4 ; All rights reserved.
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions are
      8 ; met:
      9 ;
     10 ; * Redistributions of source code must retain the above copyright
     11 ;   notice, this list of conditions and the following disclaimer.
     12 ;
     13 ; * Redistributions in binary form must reproduce the above copyright
     14 ;   notice, this list of conditions and the following disclaimer in the
     15 ;   documentation and/or other materials provided with the
     16 ;   distribution.
     17 ;
     18 ; * Neither the name of the Intel Corporation nor the names of its
     19 ;   contributors may be used to endorse or promote products derived from
     20 ;   this software without specific prior written permission.
     21 ;
     22 ;
     23 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
     24 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     25 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     26 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
     27 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     28 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     29 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     30 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     31 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     32 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     33 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     35 ;
     36 ; Example YASM command lines:
     37 ; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
     38 ; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
     39 ;
     40 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     41 ;
     42 ; This code is described in an Intel White-Paper:
     43 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
     44 ;
     45 ; To find it, surf to http://www.intel.com/p/en_US/embedded
     46 ; and search for that title.
     47 ; The paper is expected to be released roughly at the end of April, 2012
     48 ;
     49 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     50 ; This code schedules 1 block at a time, with 4 lanes per block
     51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     52 
     53 %define	VMOVDQ vmovdqu ;; assume buffers not aligned
     54 
     55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
     56 
     57 ; addm [mem], reg
     58 ; Add reg to mem using reg-mem add and store
     59 %macro addm 2
     60 	add	%2, %1
     61 	mov	%1, %2
     62 %endm
     63 
     64 %macro MY_ROR 2
     65 	shld	%1,%1,(32-(%2))
     66 %endm
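        ; shld %1,%1,(32-%2) with identical source and destination is a rotate
        ; left by (32-%2), i.e. a rotate right of the 32-bit register by %2,
        ; so MY_ROR behaves like "ror %1, %2".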
     67 
     68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     69 
     70 ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
     71 ; Load xmm with mem and byte swap each dword
     72 %macro COPY_XMM_AND_BSWAP 3
     73 	VMOVDQ %1, %2
     74 	vpshufb %1, %1, %3
     75 %endmacro
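        ; SHA-256 treats the message as big-endian dwords, so each 16-byte chunk
        ; is byte-swapped via vpshufb with BYTE_FLIP_MASK right after the load.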
     76 
     77 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     78 
     79 %define X0 xmm4
     80 %define X1 xmm5
     81 %define X2 xmm6
     82 %define X3 xmm7
     83 
     84 %define XTMP0 xmm0
     85 %define XTMP1 xmm1
     86 %define XTMP2 xmm2
     87 %define XTMP3 xmm3
     88 %define XTMP4 xmm8
     89 %define XFER  xmm9
     90 %define XTMP5 xmm11
     91 
     92 %define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
     93 %define SHUF_DC00	xmm12 ; shuffle xDxC -> DC00
     94 %define BYTE_FLIP_MASK	xmm13
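        ; s1 of the message schedule is produced two dwords at a time in the odd
        ; lanes; SHUF_00BA and SHUF_DC00 repack those {xBxA}/{xDxC} results so the
        ; two halves can be merged into one vector of four new W values.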
     95 
     96 %ifdef LINUX
     97 %define NUM_BLKS rdx	; 3rd arg
     98 %define CTX	rsi	; 2nd arg
     99 %define INP	rdi	; 1st arg
    100 
    101 %define SRND	rdi	; clobbers INP
    102 %define c	ecx
    103 %define d 	r8d
    104 %define e 	edx
    105 %else
    106 %define NUM_BLKS r8	; 3rd arg
    107 %define CTX	rdx 	; 2nd arg
    108 %define INP	rcx 	; 1st arg
    109 
    110 %define SRND	rcx	; clobbers INP
    111 %define c 	edi
    112 %define d	esi
    113 %define e 	r8d
    114 
    115 %endif
    116 %define TBL	rbp
    117 %define a eax
    118 %define b ebx
    119 
    120 %define f r9d
    121 %define g r10d
    122 %define h r11d
    123 
    124 %define y0 r13d
    125 %define y1 r14d
    126 %define y2 r15d
    127 
    128 
    129 _INP_END_SIZE	equ 8
    130 _INP_SIZE	equ 8
    131 _XFER_SIZE	equ 8
    132 %ifdef LINUX
    133 _XMM_SAVE_SIZE	equ 0
    134 %else
    135 _XMM_SAVE_SIZE	equ 8*16
    136 %endif
    137 ; STACK_SIZE plus pushes must be an odd multiple of 8
    138 _ALIGN_SIZE	equ 8
    139 
    140 _INP_END	equ 0
    141 _INP		equ _INP_END  + _INP_END_SIZE
    142 _XFER		equ _INP      + _INP_SIZE
    143 _XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
    144 STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE
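        ; For reference: on Linux STACK_SIZE = 32 and the prologue pushes 5
        ; registers (40 bytes); on Windows STACK_SIZE = 160 with 7 pushes (56
        ; bytes).  Both sums are odd multiples of 8, so together with the 8-byte
        ; return address rsp ends up 16-byte aligned and the vmovdqa saves at
        ; _XMM_SAVE (offset 32) stay aligned.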
    145 
    146 ; rotate_Xs
    147 ; Rotate values of symbols X0...X3
    148 %macro rotate_Xs 0
    149 %xdefine X_ X0
    150 %xdefine X0 X1
    151 %xdefine X1 X2
    152 %xdefine X2 X3
    153 %xdefine X3 X_
    154 %endm
    155 
    156 ; ROTATE_ARGS
    157 ; Rotate values of symbols a...h
    158 %macro ROTATE_ARGS 0
    159 %xdefine TMP_ h
    160 %xdefine h g
    161 %xdefine g f
    162 %xdefine f e
    163 %xdefine e d
    164 %xdefine d c
    165 %xdefine c b
    166 %xdefine b a
    167 %xdefine a TMP_
    168 %endm
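        ; Rotating the symbol names instead of moving data lets every round body
        ; be written in terms of a..h while the values never leave their registers.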
    169 
    170 %macro FOUR_ROUNDS_AND_SCHED 0
    171 		;; compute s0 four at a time and s1 two at a time
    172 		;; compute W[-16] + W[-7] 4 at a time
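        		;; schedule recurrence being computed (FIPS 180-4):
        		;;   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
        		;;   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
        		;;   W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])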
    173 		;vmovdqa	XTMP0, X3
    174 	mov	y0, e		; y0 = e
    175 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
    176 	mov	y1, a		; y1 = a
    177 		vpalignr	XTMP0, X3, X2, 4	; XTMP0 = W[-7]
    178 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
    179 	xor	y0, e		; y0 = e ^ (e >> (25-11))
    180 	mov	y2, f		; y2 = f
    181 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
    182 		;vmovdqa	XTMP1, X1
    183 	xor	y1, a		; y1 = a ^ (a >> (22-13))
    184 	xor	y2, g		; y2 = f^g
    185 		vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
    186 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    187 	and	y2, e		; y2 = (f^g)&e
    188 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
    189 		;; compute s0
    190 		vpalignr	XTMP1, X1, X0, 4	; XTMP1 = W[-15]
    191 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    192 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    193 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
    194 
    195 
    196 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    197 	add	y2, y0		; y2 = S1 + CH
    198 	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
    199 
    200 	mov	y0, a		; y0 = a
    201 	add	h, y2		; h = h + S1 + CH + k + w
    202 	mov	y2, a		; y2 = a
    203 
    204 		vpsrld	XTMP2, XTMP1, 7
    205 
    206 	or	y0, c		; y0 = a|c
    207 	add	d, h		; d = d + h + S1 + CH + k + w
    208 	and	y2, c		; y2 = a&c
    209 
    210 		vpslld	XTMP3, XTMP1, (32-7)
    211 
    212 	and	y0, b		; y0 = (a|c)&b
    213 	add	h, y1		; h = h + S1 + CH + k + w + S0
    214 
    215 		vpor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7
    216 
    217 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
    218 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
    219 
    220 ROTATE_ARGS
    221 
    222 	mov	y0, e		; y0 = e
    223 	mov	y1, a		; y1 = a
    224 
    225 
    226 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
    227 	xor	y0, e		; y0 = e ^ (e >> (25-11))
    228 	mov	y2, f		; y2 = f
    229 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
    230 
    231 		vpsrld	XTMP2, XTMP1,18
    232 
    233 	xor	y1, a		; y1 = a ^ (a >> (22-13))
    234 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
    235 	xor	y2, g		; y2 = f^g
    236 
    237 		vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
    238 
    239 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
    240 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    241 	and	y2, e		; y2 = (f^g)&e
    242 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    243 
    244 		vpslld	XTMP1, XTMP1, (32-18)
    245 
    246 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    247 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
    248 
    249 		vpxor	XTMP3, XTMP3, XTMP1
    250 
    251 	add	y2, y0		; y2 = S1 + CH
    252 	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
    253 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    254 
    255 		vpxor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
    256 
    257 	mov	y0, a		; y0 = a
    258 	add	h, y2		; h = h + S1 + CH + k + w
    259 	mov	y2, a		; y2 = a
    260 
    261 		vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
    262 
    263 	or	y0, c		; y0 = a|c
    264 	add	d, h		; d = d + h + S1 + CH + k + w
    265 	and	y2, c		; y2 = a&c
    266 		;; compute low s1
    267 		vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
    268 	and	y0, b		; y0 = (a|c)&b
    269 	add	h, y1		; h = h + S1 + CH + k + w + S0
    270 		vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
    271 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
    272 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
    273 
    274 ROTATE_ARGS
    275 		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
    276 
    277 	mov	y0, e		; y0 = e
    278 	mov	y1, a		; y1 = a
    279 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
    280 
    281 		;vmovdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
    282 
    283 	xor	y0, e		; y0 = e ^ (e >> (25-11))
    284 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
    285 	mov	y2, f		; y2 = f
    286 	xor	y1, a		; y1 = a ^ (a >> (22-13))
    287 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
    288 
    289 		vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
    290 
    291 	xor	y2, g		; y2 = f^g
    292 
    293 		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xBxA}
    294 
    295 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    296 	and	y2, e		; y2 = (f^g)&e
    297 
    298 		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xBxA}
    299 
    300 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
    301 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    302 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
    303 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    304 		vpxor	XTMP2, XTMP2, XTMP3
    305 	add	y2, y0		; y2 = S1 + CH
    306 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    307 	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
    308 		vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
    309 	mov	y0, a		; y0 = a
    310 	add	h, y2		; h = h + S1 + CH + k + w
    311 	mov	y2, a		; y2 = a
    312 		vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
    313 	or	y0, c		; y0 = a|c
    314 	add	d, h		; d = d + h + S1 + CH + k + w
    315 	and	y2, c		; y2 = a&c
    316 		vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
    317 	and	y0, b		; y0 = (a|c)&b
    318 	add	h, y1		; h = h + S1 + CH + k + w + S0
    319 		;; compute high s1
    320 		vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
    321 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
    322 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
    323 
    324 ROTATE_ARGS
    325 		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
    326 	mov	y0, e		; y0 = e
    327 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
    328 	mov	y1, a		; y1 = a
    329 		;vmovdqa	XTMP5,    XTMP2	; XTMP5    = W[-2] {DDCC}
    330 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
    331 	xor	y0, e		; y0 = e ^ (e >> (25-11))
    332 	mov	y2, f		; y2 = f
    333 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
    334 
    335 		vpsrld	XTMP5, XTMP2,   10	; XTMP5 = W[-2] >> 10 {DDCC}
    336 
    337 	xor	y1, a		; y1 = a ^ (a >> (22-13))
    338 	xor	y2, g		; y2 = f^g
    339 
    340 		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xDxC}
    341 
    342 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    343 	and	y2, e		; y2 = (f^g)&e
    344 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
    345 
    346 		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xDxC}
    347 
    348 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    349 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    350 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
    351 
    352 		vpxor	XTMP2, XTMP2, XTMP3
    353 
    354 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    355 	add	y2, y0		; y2 = S1 + CH
    356 	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
    357 		vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
    358 	mov	y0, a		; y0 = a
    359 	add	h, y2		; h = h + S1 + CH + k + w
    360 	mov	y2, a		; y2 = a
    361 		vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
    362 	or	y0, c		; y0 = a|c
    363 	add	d, h		; d = d + h + S1 + CH + k + w
    364 	and	y2, c		; y2 = a&c
    365 		vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
    366 	and	y0, b		; y0 = (a|c)&b
    367 	add	h, y1		; h = h + S1 + CH + k + w + S0
    368 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
    369 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
    370 
    371 ROTATE_ARGS
    372 rotate_Xs
    373 %endm
    374 
    375 ;; input is [rsp + _XFER + %1 * 4]
    376 %macro DO_ROUND 1
    377 	mov	y0, e		; y0 = e
    378 	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
    379 	mov	y1, a		; y1 = a
    380 	xor	y0, e		; y0 = e ^ (e >> (25-11))
    381 	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
    382 	mov	y2, f		; y2 = f
    383 	xor	y1, a		; y1 = a ^ (a >> (22-13))
    384 	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
    385 	xor	y2, g		; y2 = f^g
    386 	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    387 	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
    388 	and	y2, e		; y2 = (f^g)&e
    389 	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    390 	MY_ROR	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    391 	xor	y2, g		; y2 = CH = ((f^g)&e)^g
    392 	add	y2, y0		; y2 = S1 + CH
    393 	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    394 	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
    395 	mov	y0, a		; y0 = a
    396 	add	h, y2		; h = h + S1 + CH + k + w
    397 	mov	y2, a		; y2 = a
    398 	or	y0, c		; y0 = a|c
    399 	add	d, h		; d = d + h + S1 + CH + k + w
    400 	and	y2, c		; y2 = a&c
    401 	and	y0, b		; y0 = (a|c)&b
    402 	add	h, y1		; h = h + S1 + CH + k + w + S0
    403 	or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
    404 	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
    405 	ROTATE_ARGS
    406 %endm
    407 
    408 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    409 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    410 ;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
    411 ;; arg 1 : pointer to input data
    412 ;; arg 2 : pointer to digest
    413 ;; arg 3 : Num blocks
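        ;; Note: num_blks counts whole 64-byte blocks (no padding is done here) and
        ;; a count of 0 returns immediately; digest[8] supplies the initial/current
        ;; SHA-256 state as UINT32 words and is updated in place.
        ;; (an illustrative C call sketch follows the end of this listing)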
    414 section .text
    415 global sha256_avx
    416 align 32
    417 sha256_avx:
    418 	push	rbx
    419 %ifndef LINUX
    420 	push	rsi
    421 	push	rdi
    422 %endif
    423 	push	rbp
    424 	push	r13
    425 	push	r14
    426 	push	r15
    427 
    428 	sub	rsp,STACK_SIZE
    429 %ifndef LINUX
    430 	vmovdqa	[rsp + _XMM_SAVE + 0*16],xmm6
    431 	vmovdqa	[rsp + _XMM_SAVE + 1*16],xmm7
    432 	vmovdqa	[rsp + _XMM_SAVE + 2*16],xmm8
    433 	vmovdqa	[rsp + _XMM_SAVE + 3*16],xmm9
    434 	vmovdqa	[rsp + _XMM_SAVE + 4*16],xmm10
    435 	vmovdqa	[rsp + _XMM_SAVE + 5*16],xmm11
    436 	vmovdqa	[rsp + _XMM_SAVE + 6*16],xmm12
    437 	vmovdqa	[rsp + _XMM_SAVE + 7*16],xmm13
    438 %endif
    439 
    440 	shl	NUM_BLKS, 6	; convert to bytes
    441 	jz	done_hash
    442 	add	NUM_BLKS, INP	; pointer to end of data
    443 	mov	[rsp + _INP_END], NUM_BLKS
    444 
    445 	;; load initial digest
    446 	mov	a,[4*0 + CTX]
    447 	mov	b,[4*1 + CTX]
    448 	mov	c,[4*2 + CTX]
    449 	mov	d,[4*3 + CTX]
    450 	mov	e,[4*4 + CTX]
    451 	mov	f,[4*5 + CTX]
    452 	mov	g,[4*6 + CTX]
    453 	mov	h,[4*7 + CTX]
    454 
    455 	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    456 	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
    457 	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]
    458 
    459 loop0:
    460 	lea	TBL,[K256 wrt rip]
    461 
    462 	;; byte swap first 16 dwords
    463 	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
    464 	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
    465 	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
    466 	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
    467 
    468 	mov	[rsp + _INP], INP
    469 
    470 	;; schedule 48 input dwords, by doing 3 passes of 16 rounds each
    471 	mov	SRND, 3
    472 align 16
    473 loop1:
    474 	vpaddd	XFER, X0, [TBL + 0*16]
    475 	vmovdqa	[rsp + _XFER], XFER
    476 	FOUR_ROUNDS_AND_SCHED
    477 
    478 	vpaddd	XFER, X0, [TBL + 1*16]
    479 	vmovdqa	[rsp + _XFER], XFER
    480 	FOUR_ROUNDS_AND_SCHED
    481 
    482 	vpaddd	XFER, X0, [TBL + 2*16]
    483 	vmovdqa	[rsp + _XFER], XFER
    484 	FOUR_ROUNDS_AND_SCHED
    485 
    486 	vpaddd	XFER, X0, [TBL + 3*16]
    487 	vmovdqa	[rsp + _XFER], XFER
    488 	add	TBL, 4*16
    489 	FOUR_ROUNDS_AND_SCHED
    490 
    491 	sub	SRND, 1
    492 	jne	loop1
    493 
    494 	mov	SRND, 2
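        	;; final 16 rounds (48..63): the W values are already in X0..X3, so no
        	;; further scheduling is needed; each pass of loop2 performs 8 rounds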
    495 loop2:
    496 	vpaddd	XFER, X0, [TBL + 0*16]
    497 	vmovdqa	[rsp + _XFER], XFER
    498 	DO_ROUND	0
    499 	DO_ROUND	1
    500 	DO_ROUND	2
    501 	DO_ROUND	3
    502 
    503 	vpaddd	XFER, X1, [TBL + 1*16]
    504 	vmovdqa	[rsp + _XFER], XFER
    505 	add	TBL, 2*16
    506 	DO_ROUND	0
    507 	DO_ROUND	1
    508 	DO_ROUND	2
    509 	DO_ROUND	3
    510 
    511 	vmovdqa	X0, X2
    512 	vmovdqa	X1, X3
    513 
    514 	sub	SRND, 1
    515 	jne	loop2
    516 
    517 
    518 	addm	[4*0 + CTX],a
    519 	addm	[4*1 + CTX],b
    520 	addm	[4*2 + CTX],c
    521 	addm	[4*3 + CTX],d
    522 	addm	[4*4 + CTX],e
    523 	addm	[4*5 + CTX],f
    524 	addm	[4*6 + CTX],g
    525 	addm	[4*7 + CTX],h
    526 
    527 	mov	INP, [rsp + _INP]
    528 	add	INP, 64
    529 	cmp	INP, [rsp + _INP_END]
    530 	jne	loop0
    531 
    532 done_hash:
    533 %ifndef LINUX
    534 	vmovdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
    535 	vmovdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
    536 	vmovdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
    537 	vmovdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
    538 	vmovdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
    539 	vmovdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
    540 	vmovdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
    541 	vmovdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
    542 %endif
    543 
    544 
    545 	add	rsp, STACK_SIZE
    546 
    547 	pop	r15
    548 	pop	r14
    549 	pop	r13
    550 	pop	rbp
    551 %ifndef LINUX
    552 	pop	rdi
    553 	pop	rsi
    554 %endif
    555 	pop	rbx
    556 
    557 	ret
    558 
    559 
    560 section .data
    561 align 64
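        ; SHA-256 round constants K[0..63] (FIPS 180-4); one 16-byte row per 4 rounds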
    562 K256:
    563 	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    564 	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    565 	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    566 	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    567 	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    568 	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    569 	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    570 	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    571 	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    572 	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    573 	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    574 	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    575 	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    576 	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    577 	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    578 	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    579 
    580 PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
    581 
    582 ; shuffle xBxA -> 00BA
    583 _SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
    584 
    585 ; shuffle xDxC -> DC00
    586 _SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
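
For orientation, the sketch below shows one plausible way to drive sha256_avx from C, following the prototype comment above. It is a minimal, untested illustration: the helper name sha256_oneshot, the H0 table, and the manual padding and serialization are assumptions of this sketch, not part of the assembly file, which only consumes whole 64-byte blocks.

#include <stdint.h>
#include <string.h>

/* Assumed prototype, matching the comment block above sha256_avx. */
extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);

/* SHA-256 initial state (FIPS 180-4). */
static const uint32_t H0[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};

/* Illustrative one-shot hash of a buffer: the asm routine only consumes whole
 * 64-byte blocks, so the caller pads the final block(s) itself. */
static void sha256_oneshot(const uint8_t *msg, size_t len, uint8_t out[32])
{
	uint32_t state[8];
	uint8_t tail[128] = {0};
	size_t full = len / 64;          /* whole blocks already in msg */
	size_t rem  = len % 64;

	memcpy(state, H0, sizeof(state));
	if (full)
		sha256_avx((void *)msg, state, full);

	/* Standard SHA-256 padding: 0x80, zeros, 64-bit big-endian bit length. */
	memcpy(tail, msg + full * 64, rem);
	tail[rem] = 0x80;
	size_t tail_blks = (rem + 1 + 8 > 64) ? 2 : 1;
	uint64_t bits = (uint64_t)len * 8;
	for (int i = 0; i < 8; i++)
		tail[tail_blks * 64 - 1 - i] = (uint8_t)(bits >> (8 * i));
	sha256_avx(tail, state, tail_blks);

	/* The state words are native uint32; serialize big-endian for the digest. */
	for (int i = 0; i < 8; i++) {
		out[4 * i + 0] = (uint8_t)(state[i] >> 24);
		out[4 * i + 1] = (uint8_t)(state[i] >> 16);
		out[4 * i + 2] = (uint8_t)(state[i] >> 8);
		out[4 * i + 3] = (uint8_t)(state[i]);
	}
}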