;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux:   yasm -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
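
; For reference, FOUR_ROUNDS_AND_SCHED below vectorizes the standard
; FIPS 180-4 message-schedule recurrence, four dwords per invocation;
; a scalar C sketch (not part of this file) of what the XTMP registers
; compute, where ror32 is a 32-bit rotate right:
;
;   s0   = ror32(W[t-15],  7) ^ ror32(W[t-15], 18) ^ (W[t-15] >>  3);
;   s1   = ror32(W[t- 2], 17) ^ ror32(W[t- 2], 19) ^ (W[t- 2] >> 10);
;   W[t] = W[t-16] + s0 + W[t-7] + s1;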
%define MOVDQ movdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; Add reg to mem using reg-mem add and store

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
	MOVDQ %1, %2
	pshufb %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12
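
; Note: pshufb with BYTE_FLIP_MASK reverses the bytes within each dword,
; converting the big-endian message words of the input block into the
; host byte order the rounds operate on.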
%ifdef LINUX
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%endif
_XMM_SAVE_SIZE equ 7*16

; STACK_SIZE plus pushes must be an odd multiple of 8
_INP      equ _INP_END + _INP_END_SIZE
_XFER     equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
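
; Resulting frame layout (low to high address): _INP_END (saved
; end-of-data pointer), _INP (saved input pointer), _XFER (16-byte
; K+W spill slot read by the rounds), _XMM_SAVE (callee-saved xmm6-xmm12)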
; Rotate values of symbols X0...X3
; Rotate values of symbols a...h
%macro FOUR_ROUNDS_AND_SCHED 0
	;; compute s0 four at a time and s1 two at a time
	;; compute W[-16] + W[-7] 4 at a time
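	;; For reference: each group of scalar instructions below computes
	;; one FIPS 180-4 round; a C sketch (not part of this file), noting
	;; that CH and MAJ appear in the logically equivalent forms used here:
	;;   S1  = ror32(e,6) ^ ror32(e,11) ^ ror32(e,25);
	;;   CH  = ((f ^ g) & e) ^ g;           /* == (e&f) ^ (~e&g) */
	;;   S0  = ror32(a,2) ^ ror32(a,13) ^ ror32(a,22);
	;;   MAJ = ((a | c) & b) | (a & c);     /* == (a&b)^(a&c)^(b&c) */
	;;   T1  = h + S1 + CH + K[t] + W[t];  d += T1;  h = T1 + S0 + MAJ;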
	ror y0, (25-11) ; y0 = e >> (25-11)
	palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor y1, a ; y1 = a ^ (a >> (22-13))
	paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
	movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
	add h, y2 ; h = h + S1 + CH + k + w
	add d, h ; d = d + h + S1 + CH + k + w
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
	or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
	movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
	movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
	ror y0, (25-11) ; y0 = e >> (25-11)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y1, a ; y1 = a ^ (a >> (22-13))
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	add h, y2 ; h = h + S1 + CH + k + w
	pxor XTMP1, XTMP4 ; XTMP1 = s0
	add d, h ; d = d + h + S1 + CH + k + w
	pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
	or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
	movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
	ror y0, (25-11) ; y0 = e >> (25-11)
	movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y1, a ; y1 = a ^ (a >> (22-13))
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
	psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor XTMP2, XTMP3 ; XTMP2 = W[-2] ror 17 ^ W[-2] ror 19
	add y2, y0 ; y2 = S1 + CH
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
	pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
	add h, y2 ; h = h + S1 + CH + k + w
	pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
	add d, h ; d = d + h + S1 + CH + k + w
	paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
	or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
	movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
	ror y0, (25-11) ; y0 = e >> (25-11)
	movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
	xor y1, a ; y1 = a ^ (a >> (22-13))
	psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and y2, e ; y2 = (f^g)&e
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	pxor XTMP2, XTMP3 ; XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, y0 ; y2 = S1 + CH
	add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
	pxor X0, XTMP2 ; X0 = s1 {xDxC}
	add h, y2 ; h = h + S1 + CH + k + w
	pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
	add d, h ; d = d + h + S1 + CH + k + w
	paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; input is [rsp + _XFER + %1 * 4]
	ror y0, (25-11) ; y0 = e >> (25-11)
	xor y0, e ; y0 = e ^ (e >> (25-11))
	ror y1, (22-13) ; y1 = a >> (22-13)
	xor y1, a ; y1 = a ^ (a >> (22-13))
	ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
	and y2, e ; y2 = (f^g)&e
	xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor y2, g ; y2 = CH = ((f^g)&e)^g
	add y2, y0 ; y2 = S1 + CH
	ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
	add h, y2 ; h = h + S1 + CH + k + w
	add d, h ; d = d + h + S1 + CH + k + w
	and y0, b ; y0 = (a|c)&b
	add h, y1 ; h = h + S1 + CH + k + w + S0
	or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
	add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : number of 64-byte blocks to process
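;;
;; Illustrative C-side usage (a sketch, not part of the original source;
;; it assumes the caller preloads digest[] with the SHA-256 initial hash
;; values and handles message padding, since only whole blocks are hashed):
;;   extern void sha256_sse4(void *input_data, uint32_t digest[8],
;;                           uint64_t num_blks);
;;   sha256_sse4(buf, digest, buf_len / 64);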
	movdqa [rsp + _XMM_SAVE + 0*16],xmm6
	movdqa [rsp + _XMM_SAVE + 1*16],xmm7
	movdqa [rsp + _XMM_SAVE + 2*16],xmm8
	movdqa [rsp + _XMM_SAVE + 3*16],xmm9
	movdqa [rsp + _XMM_SAVE + 4*16],xmm10
	movdqa [rsp + _XMM_SAVE + 5*16],xmm11
	movdqa [rsp + _XMM_SAVE + 6*16],xmm12

	shl NUM_BLKS, 6 ; convert to bytes (64 bytes per block)
	add NUM_BLKS, INP ; pointer to end of data
	mov [rsp + _INP_END], NUM_BLKS
	;; load initial digest

	movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
	movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
	movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

	lea TBL, [K256 wrt rip]
	;; byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

	mov [rsp + _INP], INP
	;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	;; (X0 always names the next schedule vector, since
	;; FOUR_ROUNDS_AND_SCHED rotates the symbols X0...X3)

	movdqa XFER, [TBL + 0*16]
	paddd XFER, X0 ; XFER = K + W for the next four rounds
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa XFER, [TBL + 1*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa XFER, [TBL + 2*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	movdqa XFER, [TBL + 3*16]
	paddd XFER, X0
	movdqa [rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED
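
	;; after the 48 scheduled rounds, X0..X3 still hold W[48..63];
	;; the final 16 rounds below consume them directly, with no
	;; further scheduling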
	paddd X0, [TBL + 0*16]
	movdqa [rsp + _XFER], X0

	paddd X1, [TBL + 1*16]
	movdqa [rsp + _XFER], X1
	mov INP, [rsp + _INP]
	add INP, 64 ; advance to the next 64-byte block
	cmp INP, [rsp + _INP_END]
	movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
	movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
	movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
	movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
	movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
	movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
	movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
	dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
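
; (the 64 K256 round constants above are the first 32 bits of the
; fractional parts of the cube roots of the first 64 primes, per FIPS 180-4)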
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF