;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
; Linux:    yasm -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
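
; For reference, the message schedule and round function implemented below
; are the standard FIPS 180-4 definitions (all quantities 32-bit dwords,
; "ror" = rotate right):
;
;   s0   = (W[t-15] ror 7)  ^ (W[t-15] ror 18) ^ (W[t-15] >> 3)
;   s1   = (W[t-2]  ror 17) ^ (W[t-2]  ror 19) ^ (W[t-2]  >> 10)
;   W[t] = W[t-16] + s0 + W[t-7] + s1
;
;   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;   CH  = (e & f) ^ (~e & g)             ; computed below as ((f ^ g) & e) ^ g
;   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;   MAJ = (a & b) ^ (a & c) ^ (b & c)    ; computed below as ((a | c) & b) | (a & c)
;   T1  = h + S1 + CH + K[t] + W[t]
;   h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + S0 + MAJ
;
; Each FOUR_ROUNDS_AND_SCHED invocation below performs four rounds and
; schedules the next four W dwords in one xmm register (the four "lanes").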
%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
        add     %2, %1
        mov     %1, %2
%endmacro

; MY_ROR reg, imm
; Rotate a 32-bit register right by imm (shld is faster than ror on Sandy Bridge)
%macro MY_ROR 2
        shld    %1, %1, (32-(%2))
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        VMOVDQ  %1, %2
        vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define SHUF_00BA       xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00       xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm13
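
; s1 is computed on qword lanes: with W[-2] duplicated within each qword
; ({BBAA}, then {DDCC}), a vpsrlq by 17 or 19 leaves a correct dword rotate
; in the low dword of each qword, so each xmm holds only two valid results
; at a time ({xBxA}, then {xDxC}).  SHUF_00BA and SHUF_DC00 move those
; valid dwords into the {00BA} and {DC00} positions so they can be added
; into the scheduled words with vpaddd.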
%ifdef LINUX ; Linux (System V AMD64 ABI) register definitions
%define NUM_BLKS rdx ; 3rd arg
%define CTX      rsi ; 2nd arg
%define INP      rdi ; 1st arg

%define SRND     rdi ; clobbers INP
%else        ; Windows x64 ABI register definitions
%define NUM_BLKS r8  ; 3rd arg
%define CTX      rdx ; 2nd arg
%define INP      rcx ; 1st arg

%define SRND     rcx ; clobbers INP
%endif
_XMM_SAVE_SIZE  equ 8*16

; STACK_SIZE plus pushes must be an odd multiple of 8
_INP            equ _INP_END + _INP_END_SIZE
_XFER           equ _INP + _INP_SIZE
_XMM_SAVE       equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE      equ _XMM_SAVE + _XMM_SAVE_SIZE
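
; Resulting frame layout (offsets from rsp, low to high):
;   _INP_END   saved pointer to the end of the input data
;   _INP       saved pointer to the current input block
;   _XFER      staging area for the W[t]+K[t] dwords of each round group
;   _XMM_SAVE  save area for callee-saved xmm6-xmm13 (Windows ABI)
; An odd multiple of 8, together with the 8-byte return address, keeps
; rsp 16-byte aligned, as the aligned vmovdqa accesses below require.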
; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endmacro

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endmacro
%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
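        ;; The vector schedule ops below are interleaved with the four
        ;; scalar rounds so the SIMD and integer units execute in parallel.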
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        vpalignr XTMP0, X3, X2, 4       ; XTMP0 = W[-7]
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        vpaddd  XTMP0, XTMP0, X0        ; XTMP0 = W[-7] + W[-16]
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpalignr XTMP1, X1, X0, 4       ; XTMP1 = W[-15]
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpsrld  XTMP2, XTMP1, 7         ; XTMP2 = W[-15] >> 7
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpslld  XTMP3, XTMP1, (32-7)    ; XTMP3 = W[-15] << (32-7)
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        vpor    XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] MY_ROR 7
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        vpsrld  XTMP2, XTMP1, 18        ; XTMP2 = W[-15] >> 18
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        vpsrld  XTMP4, XTMP1, 3         ; XTMP4 = W[-15] >> 3
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpslld  XTMP1, XTMP1, (32-18)   ; XTMP1 = W[-15] << (32-18)
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        vpxor   XTMP3, XTMP3, XTMP1     ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        vpxor   XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpxor   XTMP1, XTMP3, XTMP4     ; XTMP1 = s0
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        vpaddd  XTMP0, XTMP0, XTMP1     ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {BBAA}
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        ;vmovdqa XTMP4, XTMP2           ; XTMP4 = W[-2] {BBAA}
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        vpsrld  XTMP4, XTMP2, 10        ; XTMP4 = W[-2] >> 10 {BBAA}
        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor   XTMP2, XTMP2, XTMP3     ; XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
        add     y2, y0                  ; y2 = S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP4, XTMP4, XTMP2     ; XTMP4 = s1 {xBxA}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpaddd  XTMP0, XTMP0, XTMP4     ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {DDCC}
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        ;vmovdqa XTMP5, XTMP2           ; XTMP5 = W[-2] {DDCC}
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld  XTMP5, XTMP2, 10        ; XTMP5 = W[-2] >> 10 {DDCC}
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        vpxor   XTMP2, XTMP2, XTMP3     ; XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP5, XTMP5, XTMP2     ; XTMP5 = s1 {xDxC}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpaddd  X0, XTMP5, XTMP0        ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endmacro
;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e                   ; y2 = (f^g)&e
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        add     y2, y0                  ; y2 = S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
ROTATE_ARGS
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
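;;
;; A minimal C caller sketch (illustration only; the names "blocks" and
;; "nblocks" are hypothetical): digest must be pre-loaded with the SHA-256
;; initial hash values and the input already padded to whole 64-byte blocks.
;;
;;   extern void sha256_avx(void *input_data, uint32_t digest[8],
;;                          uint64_t num_blks);
;;
;;   uint32_t digest[8] = {
;;       0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;;       0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
;;   sha256_avx(blocks, digest, nblocks); /* nblocks 64-byte blocks */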
%ifndef LINUX ; Windows x64 ABI: xmm6-xmm13 are callee-saved
        vmovdqa [rsp + _XMM_SAVE + 0*16], xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16], xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16], xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16], xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16], xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16], xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16], xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16], xmm13
%endif
        shl     NUM_BLKS, 6             ; convert to bytes (64 bytes per block)
        add     NUM_BLKS, INP           ; pointer to end of data
        mov     [rsp + _INP_END], NUM_BLKS
        ;; load initial digest

        vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
        vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
        vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

        lea     TBL, [K256 wrt rip]
        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

        mov     [rsp + _INP], INP
        ;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED
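
        ;; In each group above, the round constants are added to the four
        ;; oldest scheduled dwords (always X0, thanks to rotate_Xs), the
        ;; W[t]+K[t] results are parked in _XFER on the stack, and the four
        ;; scalar rounds consume them while scheduling the next four W.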
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vpaddd  XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3
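
        ;; The last 16 rounds consume the dwords already scheduled in
        ;; X0..X3 via DO_ROUND alone; no further message scheduling is
        ;; needed.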
        mov     INP, [rsp + _INP]
        add     INP, 64
        cmp     INP, [rsp + _INP_END]
%ifndef LINUX ; restore callee-saved xmm6-xmm13 (Windows x64 ABI)
        vmovdqa xmm6,  [rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7,  [rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8,  [rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9,  [rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10, [rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11, [rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12, [rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13, [rsp + _XMM_SAVE + 7*16]
%endif
section .data
align 64
; K256: the SHA-256 round constants (first 32 bits of the fractional parts
; of the cube roots of the first 64 primes, FIPS 180-4)
K256:
        dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
; shuffle mask: byte swap each dword
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
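
; A vpshufb mask byte with its high bit set (0xFF) zeroes the destination
; byte, while bytes 0x00-0x0f select the corresponding source byte.
; _SHUF_00BA therefore keeps the low two dwords and zeroes the high two;
; _SHUF_DC00 moves the low two dwords into the high half and zeroes the
; low half.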