;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
; Linux:    yasm -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released around the end of April 2012.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 2 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
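;
; Note: "4 lanes per block" means each ymm register carries four message
; dwords of block 1 in its low 128 bits and the matching four dwords of
; block 2 in its high 128 bits; the K256 constants below are duplicated
; row-by-row to match, and the second block's rounds replay the saved
; XFER words at offset +16.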
%define	VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; Add reg to mem using reg-mem add and store

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; XMM versions of above

%define SHUF_00BA	ymm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00	ymm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK	ymm13

%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK
%ifdef LINUX
%define NUM_BLKS rdx	; 3rd arg
%define CTX      rsi	; 2nd arg
%define INP      rdi	; 1st arg

%define e        edx	; clobbers NUM_BLKS
%define y3       edi	; clobbers INP
%else
%define NUM_BLKS r8	; 3rd arg
%define CTX      rdx	; 2nd arg
%define INP      rcx	; 1st arg

%define e        r8d	; clobbers NUM_BLKS
%define y3       ecx	; clobbers INP
%endif
%define SRND CTX	; SRND is same register as CTX

_XFER_SIZE	equ 2*64*4	; 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	equ 8*16
_INP_END_SIZE	equ 8		; assumed: one saved 8-byte value each
_INP_SIZE	equ 8
_CTX_SIZE	equ 8
_RSP_SIZE	equ 8

_XFER		equ 0		; assumed: XFER area starts the frame
_XMM_SAVE	equ _XFER     + _XFER_SIZE
_INP_END	equ _XMM_SAVE + _XMM_SAVE_SIZE
_INP		equ _INP_END  + _INP_END_SIZE
_CTX		equ _INP      + _INP_SIZE
_RSP		equ _CTX      + _CTX_SIZE
STACK_SIZE	equ _RSP      + _RSP_SIZE
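
; Frame layout, low to high (from the equates above): the XFER slots for
; the pre-added K+W round words, the xmm save area (callee-saved under
; the Win64 ABI), then four 8-byte save slots (_INP_END, _INP, _CTX, _RSP).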
; Rotate values of symbols X0...X3

; Rotate values of symbols a...h
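
; For reference, the scalar round the macros below implement (FIPS
; 180-4), written as C; the rotate distances match the rorx immediates
; used throughout (6/11/25 for e, 2/13/22 for a):
;
;   // ror(x,n) = 32-bit rotate right
;   uint32_t S1  = ror(e,6) ^ ror(e,11) ^ ror(e,25);
;   uint32_t ch  = (e & f) ^ (~e & g);           // computed as ((f^g)&e)^g
;   uint32_t t1  = h + S1 + ch + k + w;
;   uint32_t S0  = ror(a,2) ^ ror(a,13) ^ ror(a,22);
;   uint32_t maj = (a & b) ^ (a & c) ^ (b & c);  // computed as ((a|c)&b)|(a&c)
;   h = g; g = f; f = e; e = d + t1;
;   d = c; c = b; b = a; a = t1 + S0 + maj;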
%macro FOUR_ROUNDS_AND_SCHED 1

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	mov	y3, a		; y3 = a                          ; MAJA
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B

	add	h, dword[%%XFER+0*4]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA
	vpalignr XTMP0, X3, X2, 4	; XTMP0 = W[-7]
	mov	y2, f		; y2 = f                          ; CH
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	xor	y2, g		; y2 = f^g                        ; CH
	vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1

	and	y2, e		; y2 = (f^g)&e                    ; CH
	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	add	d, h		; d = k + w + h + d               ; --

	and	y3, b		; y3 = (a|c)&b                    ; MAJA
	vpalignr XTMP1, X1, X0, 4	; XTMP1 = W[-15]
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0

	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH
	vpsrld	XTMP2, XTMP1, 7
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	T1, c		; T1 = a&c                        ; MAJB

	add	y2, y0		; y2 = S1 + CH                    ; --
	vpslld	XTMP3, XTMP1, (32-7)
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --

	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --
	vpor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] ror 7

	vpsrld	XTMP2, XTMP1, 18
	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
	add	h, y3		; h = t1 + S0 + MAJ               ; --
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	mov	y3, a		; y3 = a                          ; MAJA
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	add	h, dword[%%XFER+1*4]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA

	vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
	mov	y2, f		; y2 = f                          ; CH
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	xor	y2, g		; y2 = f^g                        ; CH

	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	and	y2, e		; y2 = (f^g)&e                    ; CH
	add	d, h		; d = k + w + h + d               ; --

	vpslld	XTMP1, XTMP1, (32-18)
	and	y3, b		; y3 = (a|c)&b                    ; MAJA
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0

	vpxor	XTMP3, XTMP3, XTMP1
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH

	vpxor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	T1, c		; T1 = a&c                        ; MAJB
	add	y2, y0		; y2 = S1 + CH                    ; --

	vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
	vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --

	vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --
	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
	add	h, y3		; h = t1 + S0 + MAJ               ; --

	vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	mov	y3, a		; y3 = a                          ; MAJA
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	add	h, dword[%%XFER+2*4]	; h = k + w + h           ; --

	vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] ror 19 {xBxA}
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	or	y3, c		; y3 = a|c                        ; MAJA
	mov	y2, f		; y2 = f                          ; CH
	xor	y2, g		; y2 = f^g                        ; CH

	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] ror 17 {xBxA}
	and	y2, e		; y2 = (f^g)&e                    ; CH

	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	vpxor	XTMP2, XTMP2, XTMP3
	add	d, h		; d = k + w + h + d               ; --
	and	y3, b		; y3 = (a|c)&b                    ; MAJA

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH

	vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	T1, c		; T1 = a&c                        ; MAJB
	add	y2, y0		; y2 = S1 + CH                    ; --
	vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}

	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --
	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --
	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --

	add	h, y3		; h = t1 + S0 + MAJ               ; --
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	mov	y3, a		; y3 = a                          ; MAJA
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	add	h, dword[%%XFER+3*4]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA

	vpsrld	XTMP5, XTMP2, 10	; XTMP5 = W[-2] >> 10 {DDCC}
	mov	y2, f		; y2 = f                          ; CH
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	xor	y2, g		; y2 = f^g                        ; CH

	vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] ror 19 {xDxC}
	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	and	y2, e		; y2 = (f^g)&e                    ; CH
	add	d, h		; d = k + w + h + d               ; --
	and	y3, b		; y3 = (a|c)&b                    ; MAJA

	vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] ror 17 {xDxC}
	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH

	vpxor	XTMP2, XTMP2, XTMP3
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	add	y2, y0		; y2 = S1 + CH                    ; --

	vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --

	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}

	vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	T1, c		; T1 = a&c                        ; MAJB
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ

	add	h, y1		; h = k + w + h + S0              ; --
	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
	add	h, y3		; h = t1 + S0 + MAJ               ; --
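
; For reference, the message-schedule step the vector code above computes
; (FIPS 180-4), as scalar C with ror as before; vpsrlq only rotates within
; 64-bit lanes, so s1 of W[-2] is done two dwords at a time via the
; {BBAA}/{DDCC} shuffles and recombined with SHUF_00BA/SHUF_DC00:
;
;   uint32_t s0 = ror(w[t-15],7)  ^ ror(w[t-15],18) ^ (w[t-15] >> 3);
;   uint32_t s1 = ror(w[t-2],17)  ^ ror(w[t-2],19)  ^ (w[t-2] >> 10);
;   w[t] = w[t-16] + s0 + w[t-7] + s1;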
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	mov	y2, f		; y2 = f                          ; CH
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	xor	y2, g		; y2 = f^g                        ; CH

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	and	y2, e		; y2 = (f^g)&e                    ; CH

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	mov	y3, a		; y3 = a                          ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	add	h, dword[%%XFER + 4*0]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	y3, b		; y3 = (a|c)&b                    ; MAJA
	and	T1, c		; T1 = a&c                        ; MAJB
	add	y2, y0		; y2 = S1 + CH                    ; --

	add	d, h		; d = k + w + h + d               ; --
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --

	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --

	;add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --

	;add	h, y3		; h = t1 + S0 + MAJ               ; --
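
	; Note: the two trailing adds to h are left commented out above; they
	; are issued at the top of the next round as "add old_h, ...", so the
	; final sums of one round overlap the start of the following round.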
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	old_h, y2	; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
	mov	y2, f		; y2 = f                          ; CH
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	xor	y2, g		; y2 = f^g                        ; CH

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	and	y2, e		; y2 = (f^g)&e                    ; CH
	add	old_h, y3	; h = t1 + S0 + MAJ               ; --

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	mov	y3, a		; y3 = a                          ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	add	h, dword[%%XFER + 4*1]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	y3, b		; y3 = (a|c)&b                    ; MAJA
	and	T1, c		; T1 = a&c                        ; MAJB
	add	y2, y0		; y2 = S1 + CH                    ; --

	add	d, h		; d = k + w + h + d               ; --
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --

	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --

	;add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --

	;add	h, y3		; h = t1 + S0 + MAJ               ; --
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	old_h, y2	; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
	mov	y2, f		; y2 = f                          ; CH
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	xor	y2, g		; y2 = f^g                        ; CH

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	and	y2, e		; y2 = (f^g)&e                    ; CH
	add	old_h, y3	; h = t1 + S0 + MAJ               ; --

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	mov	y3, a		; y3 = a                          ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	add	h, dword[%%XFER + 4*2]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	y3, b		; y3 = (a|c)&b                    ; MAJA
	and	T1, c		; T1 = a&c                        ; MAJB
	add	y2, y0		; y2 = S1 + CH                    ; --

	add	d, h		; d = k + w + h + d               ; --
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --

	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --

	;add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --

	;add	h, y3		; h = t1 + S0 + MAJ               ; --
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	old_h, y2	; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
	mov	y2, f		; y2 = f                          ; CH
	rorx	y0, e, 25	; y0 = e >> 25                    ; S1A
	rorx	y1, e, 11	; y1 = e >> 11                    ; S1B
	xor	y2, g		; y2 = f^g                        ; CH

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11)          ; S1
	rorx	y1, e, 6	; y1 = (e >> 6)                   ; S1
	and	y2, e		; y2 = (f^g)&e                    ; CH
	add	old_h, y3	; h = t1 + S0 + MAJ               ; --

	xor	y0, y1		; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
	rorx	T1, a, 13	; T1 = a >> 13                    ; S0B
	xor	y2, g		; y2 = CH = ((f^g)&e)^g           ; CH
	rorx	y1, a, 22	; y1 = a >> 22                    ; S0A
	mov	y3, a		; y3 = a                          ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13)          ; S0
	rorx	T1, a, 2	; T1 = (a >> 2)                   ; S0
	add	h, dword[%%XFER + 4*3]	; h = k + w + h           ; --
	or	y3, c		; y3 = a|c                        ; MAJA

	xor	y1, T1		; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
	mov	T1, a		; T1 = a                          ; MAJB
	and	y3, b		; y3 = (a|c)&b                    ; MAJA
	and	T1, c		; T1 = a&c                        ; MAJB
	add	y2, y0		; y2 = S1 + CH                    ; --

	add	d, h		; d = k + w + h + d               ; --
	or	y3, T1		; y3 = MAJ = ((a|c)&b)|(a&c)      ; MAJ
	add	h, y1		; h = k + w + h + S0              ; --

	add	d, y2		; d = k + w + h + d + S1 + CH = d + t1 ; --

	add	h, y2		; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --

	add	h, y3		; h = t1 + S0 + MAJ               ; --
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
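;;
;; A minimal caller sketch in C (hypothetical wrapper; the IV below is the
;; standard SHA-256 H0..H7, and the caller is responsible for padding and
;; for passing a whole number of 64-byte blocks):
;;
;;   extern void sha256_rorx(void *input_data, uint32_t digest[8],
;;                           uint64_t num_blks);
;;
;;   uint32_t digest[8] = {
;;       0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;;       0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
;;   sha256_rorx(data, digest, num_bytes / 64);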
	mov	[rsp + _RSP], rax

	vmovdqa	[rsp + _XMM_SAVE + 0*16], xmm6
	vmovdqa	[rsp + _XMM_SAVE + 1*16], xmm7
	vmovdqa	[rsp + _XMM_SAVE + 2*16], xmm8
	vmovdqa	[rsp + _XMM_SAVE + 3*16], xmm9
	vmovdqa	[rsp + _XMM_SAVE + 4*16], xmm10
	vmovdqa	[rsp + _XMM_SAVE + 5*16], xmm11
	vmovdqa	[rsp + _XMM_SAVE + 6*16], xmm12
	vmovdqa	[rsp + _XMM_SAVE + 7*16], xmm13

	shl	NUM_BLKS, 6	; convert to bytes
	lea	NUM_BLKS, [NUM_BLKS + INP - 64]	; pointer to last block
	mov	[rsp + _INP_END], NUM_BLKS
	;; load initial digest

	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]

	mov	[rsp + _CTX], CTX

	lea	TBL, [K256 wrt rip]
	;; Load first 16 dwords from two blocks
	VMOVDQ	XTMP0, [INP + 0*32]
	VMOVDQ	XTMP1, [INP + 1*32]
	VMOVDQ	XTMP2, [INP + 2*32]
	VMOVDQ	XTMP3, [INP + 3*32]

	vpshufb	XTMP0, XTMP0, BYTE_FLIP_MASK
	vpshufb	XTMP1, XTMP1, BYTE_FLIP_MASK
	vpshufb	XTMP2, XTMP2, BYTE_FLIP_MASK
	vpshufb	XTMP3, XTMP3, BYTE_FLIP_MASK

	;; transpose data into high/low halves
	vperm2i128	X0, XTMP0, XTMP2, 0x20
	vperm2i128	X1, XTMP0, XTMP2, 0x31
	vperm2i128	X2, XTMP1, XTMP3, 0x20
	vperm2i128	X3, XTMP1, XTMP3, 0x31
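
	; After the transpose each ymm pairs the same schedule positions from
	; both blocks: X0 holds block 1 w0..w3 in its low lane and block 2
	; w0..w3 in its high lane, and likewise X1..X3 for dwords 4-7, 8-11
	; and 12-15.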
	mov	[rsp + _INP], INP

	;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	vpaddd	XFER, X0, [TBL + SRND + 0*32]
	vmovdqa	[rsp + _XFER + SRND + 0*32], XFER
	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 0*32

	vpaddd	XFER, X0, [TBL + SRND + 1*32]
	vmovdqa	[rsp + _XFER + SRND + 1*32], XFER
	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 1*32

	vpaddd	XFER, X0, [TBL + SRND + 2*32]
	vmovdqa	[rsp + _XFER + SRND + 2*32], XFER
	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 2*32

	vpaddd	XFER, X0, [TBL + SRND + 3*32]
	vmovdqa	[rsp + _XFER + SRND + 3*32], XFER
	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 3*32
	;; Do last 16 rounds with no scheduling
	vpaddd	XFER, X0, [TBL + SRND + 0*32]
	vmovdqa	[rsp + _XFER + SRND + 0*32], XFER
	DO_4ROUNDS	rsp + _XFER + SRND + 0*32
	vpaddd	XFER, X1, [TBL + SRND + 1*32]
	vmovdqa	[rsp + _XFER + SRND + 1*32], XFER
	DO_4ROUNDS	rsp + _XFER + SRND + 1*32
	mov	CTX, [rsp + _CTX]
	mov	INP, [rsp + _INP]

	cmp	INP, [rsp + _INP_END]

	;;;; Do second block using previously scheduled results
	DO_4ROUNDS	rsp + _XFER + SRND + 0*32 + 16
	DO_4ROUNDS	rsp + _XFER + SRND + 1*32 + 16

	mov	CTX, [rsp + _CTX]
	mov	INP, [rsp + _INP]

	cmp	INP, [rsp + _INP_END]
	lea	TBL, [K256 wrt rip]

	VMOVDQ	XWORD0, [INP + 0*16]
	VMOVDQ	XWORD1, [INP + 1*16]
	VMOVDQ	XWORD2, [INP + 2*16]
	VMOVDQ	XWORD3, [INP + 3*16]

	vpshufb	XWORD0, XWORD0, X_BYTE_FLIP_MASK
	vpshufb	XWORD1, XWORD1, X_BYTE_FLIP_MASK
	vpshufb	XWORD2, XWORD2, X_BYTE_FLIP_MASK
	vpshufb	XWORD3, XWORD3, X_BYTE_FLIP_MASK
	;; load initial digest

	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]

	mov	[rsp + _CTX], CTX
	vmovdqa	xmm6,  [rsp + _XMM_SAVE + 0*16]
	vmovdqa	xmm7,  [rsp + _XMM_SAVE + 1*16]
	vmovdqa	xmm8,  [rsp + _XMM_SAVE + 2*16]
	vmovdqa	xmm9,  [rsp + _XMM_SAVE + 3*16]
	vmovdqa	xmm10, [rsp + _XMM_SAVE + 4*16]
	vmovdqa	xmm11, [rsp + _XMM_SAVE + 5*16]
	vmovdqa	xmm12, [rsp + _XMM_SAVE + 6*16]
	vmovdqa	xmm13, [rsp + _XMM_SAVE + 7*16]

	mov	rsp, [rsp + _RSP]
K256:
	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
PSHUFFLE_BYTE_FLIP_MASK:
	ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
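; (flips the byte order within each dword: message words are big-endian,
; vector lanes are little-endian)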
; shuffle xBxA -> 00BA
_SHUF_00BA:
	ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
; shuffle xDxC -> DC00
_SHUF_DC00:
	ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF