1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright (c) 2012, Intel Corporation
6 ; Redistribution and use in source and binary forms, with or without
7 ; modification, are permitted provided that the following conditions are
10 ; * Redistributions of source code must retain the above copyright
11 ; notice, this list of conditions and the following disclaimer.
13 ; * Redistributions in binary form must reproduce the above copyright
14 ; notice, this list of conditions and the following disclaimer in the
15 ; documentation and/or other materials provided with the
18 ; * Neither the name of the Intel Corporation nor the names of its
19 ; contributors may be used to endorse or promote products derived from
20 ; this software without specific prior written permission.
23 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
24 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
27 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
36 ; Example YASM command lines:
37 ; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx8.obj -g cv8 sha256_avx2_rorx8.asm
38 ; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx8.o sha256_avx2_rorx8.asm
40 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
42 ; This code is described in an Intel White-Paper:
43 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
45 ; To find it, surf to http://www.intel.com/p/en_US/embedded
46 ; and search for that title.
47 ; The paper is expected to be released roughly at the end of April, 2012
49 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
50 ; This code schedules 8 blocks at a time, with 1 lane per block
51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
54 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
61 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Argument-register aliases.
; NOTE(review): the first group matches the SysV AMD64 (Linux) ABI
; (rdi/rsi/rdx) and the second the Microsoft x64 ABI (rcx/rdx/r8); the
; %ifdef LINUX / %else lines selecting between them are elided from this
; listing — confirm against the full file.
; In each ABI, GPRs 'e' and 'z3' (round-state / scratch dwords) are chosen
; to alias the dword views of NUM_BLKS and INP, which are dead by the time
; the rounds run.
78 %define INP rdi ; 1st arg (SysV)
79 %define CTX rsi ; 2nd arg (SysV)
80 %define NUM_BLKS rdx ; 3rd arg (SysV)
83 %define e edx ; dword version of NUM_BLKS
84 %define z3 edi ; dword version of INP
86 %define INP rcx ; 1st arg (Win64)
87 %define CTX rdx ; 2nd arg (Win64)
88 %define NUM_BLKS r8 ; 3rd arg (Win64)
91 %define e r8d ; dword version of NUM_BLKS
92 %define z3 ecx ; dword version of INP
; Stack-frame layout: each _X symbol below is the byte offset of field X
; from rsp, and _X_SIZE its size in bytes. (Several *_SIZE definitions,
; e.g. _EXTRA_SIZE and _TMSG_SIZE, are elided from this listing.)
113 _KTMSG_SIZE equ 16*32 ; Second 3/4 of KTMSG overlaps TMSG
118 _XMM_SAVE_SIZE equ 7*16
124 _IDX_LIMIT_SIZE equ 8
127 ;; KTMSG must overlap TMSG such that the second 3/4 of KTMSG overlaps the
128 ;; first 3/4 of TMSG. (We only need 16 words of TMSG at any time.)
129 _KTMSG equ _EXTRA_SIZE
130 _TMSG equ _KTMSG + _KTMSG_SIZE
131 _XMM_SAVE equ _TMSG + _TMSG_SIZE
132 _INP_END equ _XMM_SAVE + _XMM_SAVE_SIZE
133 _INP equ _INP_END + _INP_END_SIZE
134 _RND equ _INP + _INP_SIZE
135 _CTX equ _RND + _RND_SIZE
136 _IDX_LIMIT equ _CTX + _CTX_SIZE
137 _RSP equ _IDX_LIMIT + _IDX_LIMIT_SIZE
138 STACK_SIZE equ _RSP + _RSP_SIZE
153 ; PRORD reg, imm, tmp
; Packed rotate-right of each dword lane: reg = reg ror imm.
; AVX2 has no packed-rotate instruction, so emulate with two shifts + or.
; Clobbers tmp. (The %macro/%endmacro lines are elided from this listing.)
158 vpslld %%tmp, %%reg, (32-(%%imm)) ; tmp = reg << (32-imm)
159 vpsrld %%reg, %%reg, %%imm ; reg = reg >> imm
160 vpor %%reg, %%reg, %%tmp ; reg = reg ror imm
164 ; PRORD_nd reg, imm, tmp, src
; Non-destructive packed rotate-right: reg = src ror imm (src preserved).
; Clobbers tmp. (The %macro/%endmacro lines are elided from this listing.)
170 vpslld %%tmp, %%src, (32-(%%imm)) ; tmp = src << (32-imm)
171 vpsrld %%reg, %%src, %%imm ; reg = src >> imm
172 vpor %%reg, %%reg, %%tmp ; reg = src ror imm
180 ; PRORD_nd dst, src, amt
; Convenience wrapper: dst = src ror amt, using TTMP5 as scratch.
182 PRORD_nd %1, %3, TTMP5, %2
185 ; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
186 ; "transpose" data in {r0...r7} using temps {t0...t1}
187 ; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
188 ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
189 ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
190 ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
191 ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
192 ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
193 ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
194 ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
195 ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
197 ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
198 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
199 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
200 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
201 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
202 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
203 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
204 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
205 ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
218 ; process top half (r0..r3) {a...d}
219 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
220 vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
221 vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
222 vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
223 vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
224 vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
225 vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
226 vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
228 ; use r2 in place of t0
229 ; process bottom half (r4..r7) {e...h}
230 vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
231 vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
232 vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
233 vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
234 vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
235 vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
236 vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
237 vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
; Merge 128-bit lanes across the two half-transposes.
; vperm2f128 imm semantics: imm[1:0] selects the dst low lane, imm[5:4] the
; dst high lane (0=src1.lo, 1=src1.hi, 2=src2.lo, 3=src2.hi). So:
;   0x13 = {src2.hi, src1.hi} — combines the upper halves,
;   0x02 = {src2.lo, src1.lo} — combines the lower halves.
239 vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
240 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
241 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
242 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
243 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
244 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
245 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
246 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
249 %macro SHA256_X8MS_8RNDS 0
251 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
255 vmovdqa TT0, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT0 = Load W[i-15]
256 vmovdqa TTMP2,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP2 = Load W[i-2]
257 mov z2, f ; z2 = f ; CH
258 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
259 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
260 xor z2, g ; z2 = f^g ; CH
262 PRORD_nd TTMP1,TT0,7 ;; TTMP1 = W[i-15] ror 7
263 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
264 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
265 and z2, e ; z2 = (f^g)&e ; CH
267 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
268 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
269 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
270 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
271 mov z3, a ; z3 = a ; MAJA
273 vpsrld TT0,TT0,3 ;; TT0 = W[i-15] shr 3
274 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
275 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
276 add h, dword[rsp + _KTMSG + IDX + 32*0] ; h = k + w + h
279 or z3, c ; z3 = a|c ; MAJA
281 vpxor TT0,TT0,TTMP1 ;; TT0 = (W[i-15] ror 7) xor (W[i-15] shr 3)
282 PRORD TTMP1,18-7 ;; TTMP1 = W[i-15] ror 18
283 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
284 mov T1, a ; T1 = a ; MAJB
285 and z3, b ; z3 = (a|c)&b ; MAJA
286 and T1, c ; T1 = a&c ; MAJB
287 add z2, z0 ; z2 = S1 + CH ; --
290 vpxor TT0,TTMP1,TT0 ;; TT0 = s0
291 PRORD_nd TTMP1,TTMP2,17 ;; TTMP1 = W[i-2] ror 17
292 vpsrld TTMP2,TTMP2,10 ;; TTMP2 = W[i-2] shr 10
293 vpxor TTMP2,TTMP1,TTMP2 ;; TTMP2 = (W[i-2] ror 17) xor (W[i-2] shr 10)
294 add d, h ; d = k + w + h + d ; --
295 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
296 add h, z1 ; h = k + w + h + S0 ; --
298 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
301 PRORD TTMP1,19-17 ;; TTMP1 = W[i-2] ror 19
302 vpxor TTMP1,TTMP1,TTMP2 ;; TTMP1 = s1
303 vpaddd TT0,TT0,TTMP1 ;; TT0 = s0 + s1
304 vpaddd TT0,TT0,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
305 vpaddd TT0,TT0,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
306 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT0 ;; Save TT0 to stack
307 vpaddd TT0, TT0, [TBL + IDX + (i+16)*32]
308 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT0 ;; Save TT0 to stack
310 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
312 ;add h, z3 ; h = t1 + S0 + MAJ ; --
316 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
320 vmovdqa TT1, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT1 = Load W[i-15]
321 vmovdqa TTMP4,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP4 = Load W[i-2]
323 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
324 mov z2, f ; z2 = f ; CH
325 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
326 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
327 xor z2, g ; z2 = f^g ; CH
329 PRORD_nd TTMP3,TT1,7 ;; TTMP3 = W[i-15] ror 7
330 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
331 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
332 and z2, e ; z2 = (f^g)&e ; CH
333 add old_h, z3 ; h = t1 + S0 + MAJ ; --
336 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
337 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
338 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
339 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
340 mov z3, a ; z3 = a ; MAJA
342 vpsrld TT1,TT1,3 ;; TT1 = W[i-15] shr 3
343 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
344 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
345 add h, dword[rsp + _KTMSG + IDX + 32*1] ; h = k + w + h
346 or z3, c ; z3 = a|c ; MAJA
348 vpxor TT1,TT1,TTMP3 ;; TT1 = (W[i-15] ror 7) xor (W[i-15] shr 3)
349 PRORD TTMP3,18-7 ;; TTMP3 = W[i-15] ror 18
350 vpxor TT1,TTMP3,TT1 ;; TT1 = s0
351 PRORD_nd TTMP3,TTMP4,17 ;; TTMP3 = W[i-2] ror 17
352 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
353 mov T1, a ; T1 = a ; MAJB
354 and z3, b ; z3 = (a|c)&b ; MAJA
355 and T1, c ; T1 = a&c ; MAJB
356 add z2, z0 ; z2 = S1 + CH ; --
359 vpsrld TTMP4,TTMP4,10 ;; TTMP4 = W[i-2] shr 10
360 vpxor TTMP4,TTMP3,TTMP4 ;; TTMP4 = (W[i-2] ror 17) xor (W[i-2] shr 10)
361 PRORD TTMP3,19-17 ;; TTMP3 = W[i-2] ror 19
362 vpxor TTMP3,TTMP3,TTMP4 ;; TTMP3 = s1
363 add d, h ; d = k + w + h + d ; --
364 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
365 add h, z1 ; h = k + w + h + S0 ; --
367 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
369 vpaddd TT1,TT1,TTMP3 ;; TT1 = s0 + s1
370 vpaddd TT1,TT1,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
371 vpaddd TT1,TT1,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
372 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT1 ;; Save TT1 to stack
373 vpaddd TT1, TT1, [TBL + IDX + (i+16)*32]
374 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT1 ;; Save TT1 to stack
376 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
378 ;add h, z3 ; h = t1 + S0 + MAJ ; --
381 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385 vmovdqa TT2, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT2 = Load W[i-15]
386 vmovdqa TTMP2,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP2 = Load W[i-2]
387 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
388 mov z2, f ; z2 = f ; CH
389 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
390 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
391 xor z2, g ; z2 = f^g ; CH
393 PRORD_nd TTMP1,TT2,7 ;; TTMP1 = W[i-15] ror 7
394 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
395 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
396 and z2, e ; z2 = (f^g)&e ; CH
397 add old_h, z3 ; h = t1 + S0 + MAJ ; --
399 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
400 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
401 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
402 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
403 mov z3, a ; z3 = a ; MAJA
405 vpsrld TT2,TT2,3 ;; TT2 = W[i-15] shr 3
406 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
407 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
408 add h, dword[rsp + _KTMSG + IDX + 32*2] ; h = k + w + h
409 or z3, c ; z3 = a|c ; MAJA
411 vpxor TT2,TT2,TTMP1 ;; TT2 = (W[i-15] ror 7) xor (W[i-15] shr 3)
412 PRORD TTMP1,18-7 ;; TTMP1 = W[i-15] ror 18
413 vpxor TT2,TTMP1,TT2 ;; TT2 = s0
414 PRORD_nd TTMP1,TTMP2,17 ;; TTMP1 = W[i-2] ror 17
415 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
416 mov T1, a ; T1 = a ; MAJB
417 and z3, b ; z3 = (a|c)&b ; MAJA
418 and T1, c ; T1 = a&c ; MAJB
419 add z2, z0 ; z2 = S1 + CH ; --
422 vpsrld TTMP2,TTMP2,10 ;; TTMP2 = W[i-2] shr 10
423 vpxor TTMP2,TTMP1,TTMP2 ;; TTMP2 = (W[i-2] ror 17) xor (W[i-2] shr 10)
424 PRORD TTMP1,19-17 ;; TTMP1 = W[i-2] ror 19
425 vpxor TTMP1,TTMP1,TTMP2 ;; TTMP1 = s1
426 add d, h ; d = k + w + h + d ; --
427 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
428 add h, z1 ; h = k + w + h + S0 ; --
430 vpaddd TT2,TT2,TTMP1 ;; TT2 = s0 + s1
431 vpaddd TT2,TT2,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
432 vpaddd TT2,TT2,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
433 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT2 ;; Save TT2 to stack
434 vpaddd TT2, TT2, [TBL + IDX + (i+16)*32]
435 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT2 ;; Save TT2 to stack
436 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
439 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
441 ;add h, z3 ; h = t1 + S0 + MAJ ; --
446 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
450 vmovdqa TT3, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT3 = Load W[i-15]
451 vmovdqa TTMP4,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP4 = Load W[i-2]
452 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
453 mov z2, f ; z2 = f ; CH
454 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
455 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
456 xor z2, g ; z2 = f^g ; CH
458 PRORD_nd TTMP3,TT3,7 ;; TTMP3 = W[i-15] ror 7
459 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
460 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
461 and z2, e ; z2 = (f^g)&e ; CH
462 add old_h, z3 ; h = t1 + S0 + MAJ ; --
465 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
466 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
467 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
468 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
469 mov z3, a ; z3 = a ; MAJA
471 vpsrld TT3,TT3,3 ;; TT3 = W[i-15] shr 3
472 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
473 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
474 add h, dword[rsp + _KTMSG + IDX + 32*3] ; h = k + w + h
475 or z3, c ; z3 = a|c ; MAJA
477 vpxor TT3,TT3,TTMP3 ;; TT3 = (W[i-15] ror 7) xor (W[i-15] shr 3)
478 PRORD TTMP3,18-7 ;; TTMP3 = W[i-15] ror 18
479 vpxor TT3,TTMP3,TT3 ;; TT3 = s0
480 PRORD_nd TTMP3,TTMP4,17 ;; TTMP3 = W[i-2] ror 17
481 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
482 mov T1, a ; T1 = a ; MAJB
483 and z3, b ; z3 = (a|c)&b ; MAJA
484 and T1, c ; T1 = a&c ; MAJB
485 add z2, z0 ; z2 = S1 + CH ; --
488 vpsrld TTMP4,TTMP4,10 ;; TTMP4 = W[i-2] shr 10
489 vpxor TTMP4,TTMP3,TTMP4 ;; TTMP4 = (W[i-2] ror 17) xor (W[i-2] shr 10)
490 PRORD TTMP3,19-17 ;; TTMP3 = W[i-2] ror 19
491 vpxor TTMP3,TTMP3,TTMP4 ;; TTMP3 = s1
492 add d, h ; d = k + w + h + d ; --
493 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
494 add h, z1 ; h = k + w + h + S0 ; --
496 vpaddd TT3,TT3,TTMP3 ;; TT3 = s0 + s1
497 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
500 add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
502 vpaddd TT3,TT3,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
503 vpaddd TT3,TT3,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
504 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT3 ;; Save TT3 to stack
505 vpaddd TT3, TT3, [TBL + IDX + (i+16)*32]
506 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT3 ;; Save TT3 to stack
508 add h, z3 ; h = t1 + S0 + MAJ ; --
513 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
517 vmovdqa TT4, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT4 = Load W[i-15]
518 vmovdqa TTMP2,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP2 = Load W[i-2]
519 mov z2, f ; z2 = f ; CH
520 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
521 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
522 xor z2, g ; z2 = f^g ; CH
524 PRORD_nd TTMP1,TT4,7 ;; TTMP1 = W[i-15] ror 7
525 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
526 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
527 and z2, e ; z2 = (f^g)&e ; CH
529 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
530 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
531 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
532 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
533 mov z3, a ; z3 = a ; MAJA
535 vpsrld TT4,TT4,3 ;; TT4 = W[i-15] shr 3
536 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
537 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
538 add h, dword[rsp + _KTMSG + IDX + 32*4] ; h = k + w + h
539 or z3, c ; z3 = a|c ; MAJA
541 vpxor TT4,TT4,TTMP1 ;; TT4 = (W[i-15] ror 7) xor (W[i-15] shr 3)
542 PRORD TTMP1,18-7 ;; TTMP1 = W[i-15] ror 18
543 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
544 mov T1, a ; T1 = a ; MAJB
545 and z3, b ; z3 = (a|c)&b ; MAJA
546 and T1, c ; T1 = a&c ; MAJB
547 add z2, z0 ; z2 = S1 + CH ; --
550 vpxor TT4,TTMP1,TT4 ;; TT4 = s0
551 PRORD_nd TTMP1,TTMP2,17 ;; TTMP1 = W[i-2] ror 17
552 vpsrld TTMP2,TTMP2,10 ;; TTMP2 = W[i-2] shr 10
553 vpxor TTMP2,TTMP1,TTMP2 ;; TTMP2 = (W[i-2] ror 17) xor (W[i-2] shr 10)
554 add d, h ; d = k + w + h + d ; --
555 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
556 add h, z1 ; h = k + w + h + S0 ; --
558 PRORD TTMP1,19-17 ;; TTMP1 = W[i-2] ror 19
559 vpxor TTMP1,TTMP1,TTMP2 ;; TTMP1 = s1
560 vpaddd TT4,TT4,TTMP1 ;; TT4 = s0 + s1
561 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
564 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
566 ;add h, z3 ; h = t1 + S0 + MAJ ; --
568 vpaddd TT4,TT4,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
569 vpaddd TT4,TT4,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
570 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT4 ;; Save TT4 to stack
571 vpaddd TT4, TT4, [TBL + IDX + (i+16)*32]
572 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT4 ;; Save TT4 to stack
575 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 5 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
579 vmovdqa TT5, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT5 = Load W[i-15]
580 vmovdqa TTMP4,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP4 = Load W[i-2]
581 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
582 mov z2, f ; z2 = f ; CH
583 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
584 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
585 xor z2, g ; z2 = f^g ; CH
587 PRORD_nd TTMP3,TT5,7 ;; TTMP3 = W[i-15] ror 7
588 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
589 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
590 and z2, e ; z2 = (f^g)&e ; CH
591 add old_h, z3 ; h = t1 + S0 + MAJ ; --
594 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
595 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
596 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
597 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
598 mov z3, a ; z3 = a ; MAJA
600 vpsrld TT5,TT5,3 ;; TT5 = W[i-15] shr 3
601 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
602 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
603 add h, dword[rsp + _KTMSG + IDX + 32*5] ; h = k + w + h
604 or z3, c ; z3 = a|c ; MAJA
606 vpxor TT5,TT5,TTMP3 ;; TT5 = (W[i-15] ror 7) xor (W[i-15] shr 3)
607 PRORD TTMP3,18-7 ;; TTMP3 = W[i-15] ror 18
608 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
609 mov T1, a ; T1 = a ; MAJB
610 and z3, b ; z3 = (a|c)&b ; MAJA
611 and T1, c ; T1 = a&c ; MAJB
612 add z2, z0 ; z2 = S1 + CH ; --
615 vpxor TT5,TTMP3,TT5 ;; TT5 = s0
616 PRORD_nd TTMP3,TTMP4,17 ;; TTMP3 = W[i-2] ror 17
617 vpsrld TTMP4,TTMP4,10 ;; TTMP4 = W[i-2] shr 10
618 vpxor TTMP4,TTMP3,TTMP4 ;; TTMP4 = (W[i-2] ror 17) xor (W[i-2] shr 10)
619 add d, h ; d = k + w + h + d ; --
620 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
621 add h, z1 ; h = k + w + h + S0 ; --
623 PRORD TTMP3,19-17 ;; TTMP3 = W[i-2] ror 19
624 vpxor TTMP3,TTMP3,TTMP4 ;; TTMP3 = s1
625 vpaddd TT5,TT5,TTMP3 ;; TT5 = s0 + s1
626 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
629 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
631 ;add h, z3 ; h = t1 + S0 + MAJ ; --
633 vpaddd TT5,TT5,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
634 vpaddd TT5,TT5,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
635 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT5 ;; Save TT5 to stack
636 vpaddd TT5, TT5, [TBL + IDX + (i+16)*32]
637 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT5 ;; Save TT5 to stack
641 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
645 vmovdqa TT6, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT6 = Load W[i-15]
646 vmovdqa TTMP2,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP2 = Load W[i-2]
648 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
649 mov z2, f ; z2 = f ; CH
650 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
651 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
652 xor z2, g ; z2 = f^g ; CH
654 PRORD_nd TTMP1,TT6,7 ;; TTMP1 = W[i-15] ror 7
655 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
656 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
657 and z2, e ; z2 = (f^g)&e ; CH
658 add old_h, z3 ; h = t1 + S0 + MAJ ; --
660 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
661 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
662 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
663 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
664 mov z3, a ; z3 = a ; MAJA
666 vpsrld TT6,TT6,3 ;; TT6 = W[i-15] shr 3
667 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
668 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
669 add h, dword[rsp + _KTMSG + IDX + 32*6] ; h = k + w + h
670 or z3, c ; z3 = a|c ; MAJA
672 vpxor TT6,TT6,TTMP1 ;; TT6 = (W[i-15] ror 7) xor (W[i-15] shr 3)
673 PRORD TTMP1,18-7 ;; TTMP1 = W[i-15] ror 18
674 vpxor TT6,TTMP1,TT6 ;; TT6 = s0
675 PRORD_nd TTMP1,TTMP2,17 ;; TTMP1 = W[i-2] ror 17
676 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
677 mov T1, a ; T1 = a ; MAJB
678 and z3, b ; z3 = (a|c)&b ; MAJA
679 and T1, c ; T1 = a&c ; MAJB
680 add z2, z0 ; z2 = S1 + CH ; --
683 vpsrld TTMP2,TTMP2,10 ;; TTMP2 = W[i-2] shr 10
684 vpxor TTMP2,TTMP1,TTMP2 ;; TTMP2 = (W[i-2] ror 17) xor (W[i-2] shr 10)
685 PRORD TTMP1,19-17 ;; TTMP1 = W[i-2] ror 19
686 vpxor TTMP1,TTMP1,TTMP2 ;; TTMP1 = s1
687 add d, h ; d = k + w + h + d ; --
688 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
689 add h, z1 ; h = k + w + h + S0 ; --
691 vpaddd TT6,TT6,TTMP1 ;; TT6 = s0 + s1
692 vpaddd TT6,TT6,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
693 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
696 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
698 ;add h, z3 ; h = t1 + S0 + MAJ ; --
700 vpaddd TT6,TT6,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
701 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT6 ;; Save TT6 to stack
702 vpaddd TT6, TT6, [TBL + IDX + (i+16)*32]
703 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT6 ;; Save TT6 to stack
706 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 7 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
710 vmovdqa TT7, [rsp + _TMSG + IDX + 32*(i+1)] ;; TT7 = Load W[i-15]
711 vmovdqa TTMP4,[rsp + _TMSG + IDX + 32*(i+14)] ;; TTMP4 = Load W[i-2]
713 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
714 mov z2, f ; z2 = f ; CH
715 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
716 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
717 xor z2, g ; z2 = f^g ; CH
719 PRORD_nd TTMP3,TT7,7 ;; TTMP3 = W[i-15] ror 7
720 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
721 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
722 and z2, e ; z2 = (f^g)&e ; CH
723 add old_h, z3 ; h = t1 + S0 + MAJ ; --
726 vpsrld TT7,TT7,3 ;; TT7 = W[i-15] shr 3
727 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
728 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
729 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
730 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
731 mov z3, a ; z3 = a ; MAJA
733 vpxor TT7,TT7,TTMP3 ;; TT7 = (W[i-15] ror 7) xor (W[i-15] shr 3)
734 PRORD TTMP3,18-7 ;; TTMP3 = W[i-15] ror 18
735 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
736 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
737 add h, dword[rsp + _KTMSG + IDX + 32*7] ; h = k + w + h
738 or z3, c ; z3 = a|c ; MAJA
740 vpxor TT7,TTMP3,TT7 ;; TT7 = s0
741 PRORD_nd TTMP3,TTMP4,17 ;; TTMP3 = W[i-2] ror 17
742 vpsrld TTMP4,TTMP4,10 ;; TTMP4 = W[i-2] shr 10
743 vpxor TTMP4,TTMP3,TTMP4 ;; TTMP4 = (W[i-2] ror 17) xor (W[i-2] shr 10)
744 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
745 mov T1, a ; T1 = a ; MAJB
746 and z3, b ; z3 = (a|c)&b ; MAJA
747 and T1, c ; T1 = a&c ; MAJB
748 add z2, z0 ; z2 = S1 + CH ; --
751 PRORD TTMP3,19-17 ;; TTMP3 = W[i-2] ror 19
752 vpxor TTMP3,TTMP3,TTMP4 ;; TTMP3 = s1
753 add d, h ; d = k + w + h + d ; --
754 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
755 add h, z1 ; h = k + w + h + S0 ; --
757 vpaddd TT7,TT7,TTMP3 ;; TT7 = s0 + s1
758 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
761 vpaddd TT7,TT7,[rsp + _TMSG + IDX + 32*(i+9)] ;; add W[i-7]
762 add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
764 vpaddd TT7,TT7,[rsp + _TMSG + IDX + 32*(i+0)] ;; add W[i-16]
765 add h, z3 ; h = t1 + S0 + MAJ ; --
767 vmovdqa [rsp + _TMSG + IDX + 16*32 + i*32], TT7 ;; Save TT7 to stack
768 vpaddd TT7, TT7, [TBL + IDX + (i+16)*32]
769 vmovdqa [rsp + _KTMSG + IDX + 16*32 + i*32], TT7 ;; Save TT7 to stack
777 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
779 mov z2, f ; z2 = f ; CH
780 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
781 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
782 xor z2, g ; z2 = f^g ; CH
784 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
785 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
786 and z2, e ; z2 = (f^g)&e ; CH
788 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
789 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
790 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
791 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
792 mov z3, a ; z3 = a ; MAJA
795 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
796 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
797 add h, dword[rsp + _KTMSG + IDX + 32*0] ; h = k + w + h
799 or z3, c ; z3 = a|c ; MAJA
801 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
802 mov T1, a ; T1 = a ; MAJB
803 and z3, b ; z3 = (a|c)&b ; MAJA
804 and T1, c ; T1 = a&c ; MAJB
805 add z2, z0 ; z2 = S1 + CH ; --
808 add d, h ; d = k + w + h + d ; --
809 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
810 add h, z1 ; h = k + w + h + S0 ; --
812 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
815 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
817 ;add h, z3 ; h = t1 + S0 + MAJ ; --
821 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
823 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
824 mov z2, f ; z2 = f ; CH
825 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
826 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
827 xor z2, g ; z2 = f^g ; CH
829 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
830 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
831 and z2, e ; z2 = (f^g)&e ; CH
832 add old_h, z3 ; h = t1 + S0 + MAJ ; --
834 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
835 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
836 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
837 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
838 mov z3, a ; z3 = a ; MAJA
840 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
841 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
842 add h, dword[rsp + _KTMSG + IDX + 32*1] ; h = k + w + h
843 or z3, c ; z3 = a|c ; MAJA
845 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
846 mov T1, a ; T1 = a ; MAJB
847 and z3, b ; z3 = (a|c)&b ; MAJA
848 and T1, c ; T1 = a&c ; MAJB
849 add z2, z0 ; z2 = S1 + CH ; --
852 add d, h ; d = k + w + h + d ; --
853 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
854 add h, z1 ; h = k + w + h + S0 ; --
856 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
859 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
861 ;add h, z3 ; h = t1 + S0 + MAJ ; --
864 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
866 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
867 mov z2, f ; z2 = f ; CH
868 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
869 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
870 xor z2, g ; z2 = f^g ; CH
872 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
873 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
874 and z2, e ; z2 = (f^g)&e ; CH
875 add old_h, z3 ; h = t1 + S0 + MAJ ; --
877 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
878 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
879 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
880 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
881 mov z3, a ; z3 = a ; MAJA
883 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
884 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
885 add h, dword[rsp + _KTMSG + IDX + 32*2] ; h = k + w + h
886 or z3, c ; z3 = a|c ; MAJA
888 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
889 mov T1, a ; T1 = a ; MAJB
890 and z3, b ; z3 = (a|c)&b ; MAJA
891 and T1, c ; T1 = a&c ; MAJB
892 add z2, z0 ; z2 = S1 + CH ; --
895 add d, h ; d = k + w + h + d ; --
896 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
897 add h, z1 ; h = k + w + h + S0 ; --
899 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
902 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
904 ;add h, z3 ; h = t1 + S0 + MAJ ; --
908 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
910 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
911 mov z2, f ; z2 = f ; CH
912 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
913 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
914 xor z2, g ; z2 = f^g ; CH
916 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
917 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
918 and z2, e ; z2 = (f^g)&e ; CH
919 add old_h, z3 ; h = t1 + S0 + MAJ ; --
922 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
923 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
924 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
925 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
926 mov z3, a ; z3 = a ; MAJA
928 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
929 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
930 add h, dword[rsp + _KTMSG + IDX + 32*3] ; h = k + w + h
931 or z3, c ; z3 = a|c ; MAJA
933 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
934 mov T1, a ; T1 = a ; MAJB
935 and z3, b ; z3 = (a|c)&b ; MAJA
936 and T1, c ; T1 = a&c ; MAJB
937 add z2, z0 ; z2 = S1 + CH ; --
940 add d, h ; d = k + w + h + d ; --
941 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
942 add h, z1 ; h = k + w + h + S0 ; --
944 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
947 add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
949 add h, z3 ; h = t1 + S0 + MAJ ; --
954 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
956 mov z2, f ; z2 = f ; CH
957 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
958 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
959 xor z2, g ; z2 = f^g ; CH
961 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
962 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
963 and z2, e ; z2 = (f^g)&e ; CH
965 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
966 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
967 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
968 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
969 mov z3, a ; z3 = a ; MAJA
971 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
972 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
973 add h, dword[rsp + _KTMSG + IDX + 32*4] ; h = k + w + h
974 or z3, c ; z3 = a|c ; MAJA
976 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
977 mov T1, a ; T1 = a ; MAJB
978 and z3, b ; z3 = (a|c)&b ; MAJA
979 and T1, c ; T1 = a&c ; MAJB
980 add z2, z0 ; z2 = S1 + CH ; --
983 add d, h ; d = k + w + h + d ; --
984 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
985 add h, z1 ; h = k + w + h + S0 ; --
987 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
990 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
992 ;add h, z3 ; h = t1 + S0 + MAJ ; --
995 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 5 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fragment of the round macro (interior lines elided). The first two
;; old_h adds complete the PREVIOUS round's h update (S1+CH in z2, MAJ in z3),
;; interleaved here to hide latency; the rest computes this round's
;; S1(e), CH(e,f,g), S0(a), MAJ(a,b,c) exactly as in the round above.
997 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- (finish previous round)
998 mov z2, f ; z2 = f ; CH
999 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
1000 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
1001 xor z2, g ; z2 = f^g ; CH
1003 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
1004 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
1005 and z2, e ; z2 = (f^g)&e ; CH
1006 add old_h, z3 ; h = t1 + S0 + MAJ ; -- (finish previous round)
1009 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
1010 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
1011 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
1012 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
1013 mov z3, a ; z3 = a ; MAJA
1015 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
1016 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
1017 add h, dword[rsp + _KTMSG + IDX + 32*5] ; h = k + w + h (pre-added K+W, lane 5 of 8)
1018 or z3, c ; z3 = a|c ; MAJA
1020 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
1021 mov T1, a ; T1 = a ; MAJB
1022 and z3, b ; z3 = (a|c)&b ; MAJA
1023 and T1, c ; T1 = a&c ; MAJB
1024 add z2, z0 ; z2 = S1 + CH ; --
1027 add d, h ; d = k + w + h + d ; --
1028 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
1029 add h, z1 ; h = k + w + h + S0 ; --
1031 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
1034 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- (deferred to next round's old_h)
1036 ;add h, z3 ; h = t1 + S0 + MAJ ; -- (deferred to next round's old_h)
1040 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fragment (interior lines elided). Same interleaved structure as the
;; previous rounds: finish round N+5's h via old_h, then compute this
;; round's S1/CH/S0/MAJ and defer the final two h adds to round N+7.
1042 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- (finish previous round)
1043 mov z2, f ; z2 = f ; CH
1044 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
1045 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
1046 xor z2, g ; z2 = f^g ; CH
1048 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
1049 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
1050 and z2, e ; z2 = (f^g)&e ; CH
1051 add old_h, z3 ; h = t1 + S0 + MAJ ; -- (finish previous round)
1054 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
1055 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
1056 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
1057 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
1058 mov z3, a ; z3 = a ; MAJA
1060 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
1061 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
1062 add h, dword[rsp + _KTMSG + IDX + 32*6] ; h = k + w + h (pre-added K+W, lane 6 of 8)
1063 or z3, c ; z3 = a|c ; MAJA
1065 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
1066 mov T1, a ; T1 = a ; MAJB
1067 and z3, b ; z3 = (a|c)&b ; MAJA
1068 and T1, c ; T1 = a&c ; MAJB
1069 add z2, z0 ; z2 = S1 + CH ; --
1072 add d, h ; d = k + w + h + d ; --
1073 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
1074 add h, z1 ; h = k + w + h + S0 ; --
1076 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
1079 ;add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- (deferred to next round's old_h)
1081 ;add h, z3 ; h = t1 + S0 + MAJ ; -- (deferred to next round's old_h)
1084 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 7 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fragment (interior lines elided). Last round of this 8-round group:
;; unlike rounds N+4..N+6, the final two h adds are executed inline at the
;; bottom (not commented out) because there is no following round in the
;; group to absorb them via old_h.
1086 add old_h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- (finish previous round)
1087 mov z2, f ; z2 = f ; CH
1088 rorx z0, e, 25 ; z0 = e >> 25 ; S1A
1089 rorx z1, e, 11 ; z1 = e >> 11 ; S1B
1090 xor z2, g ; z2 = f^g ; CH
1092 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ; S1
1093 rorx z1, e, 6 ; z1 = (e >> 6) ; S1
1094 and z2, e ; z2 = (f^g)&e ; CH
1095 add old_h, z3 ; h = t1 + S0 + MAJ ; -- (finish previous round)
1098 xor z0, z1 ; z0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
1099 rorx T1, a, 13 ; T1 = a >> 13 ; S0B
1100 xor z2, g ; z2 = CH = ((f^g)&e)^g ; CH
1101 rorx z1, a, 22 ; z1 = a >> 22 ; S0A
1102 mov z3, a ; z3 = a ; MAJA
1104 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ; S0
1105 rorx T1, a, 2 ; T1 = (a >> 2) ; S0
1106 add h, dword[rsp + _KTMSG + IDX + 32*7] ; h = k + w + h (pre-added K+W, lane 7 of 8)
1107 or z3, c ; z3 = a|c ; MAJA
1109 xor z1, T1 ; z1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
1110 mov T1, a ; T1 = a ; MAJB
1111 and z3, b ; z3 = (a|c)&b ; MAJA
1112 and T1, c ; T1 = a&c ; MAJB
1113 add z2, z0 ; z2 = S1 + CH ; --
1116 add d, h ; d = k + w + h + d ; --
1117 or z3, T1 ; z3 = MAJ = ((a|c)&b)|(a&c) ; MAJ
1118 add h, z1 ; h = k + w + h + S0 ; --
1120 add d, z2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
1123 add h, z2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- (done inline: last round of the group)
1125 add h, z3 ; h = t1 + S0 + MAJ ; -- (done inline: last round of the group)
1132 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1133 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1134 ;; void sha256_rorx_x8ms(void *input_data, UINT32 digest[8], UINT64 num_blks)
1135 ;; arg 1 : pointer to input data
1136 ;; arg 2 : pointer to digest
1137 ;; arg 3 : Num blocks
;; NOTE(review): this is a non-contiguous fragment of the function body --
;; the embedded original line numbers jump, so prologue pieces (stack
;; allocation, label definitions such as eight_blocks_loop / less_than_8_blocks,
;; and the %assign/%rep loop that defines "i") are elided from view.
;; Comments below describe only the visible instructions.
1139 global sha256_rorx_x8ms
;; Save caller rsp (loaded into rax by elided prologue code) for the epilogue.
1156 mov [rsp + _RSP], rax
;; IDX limit for a full set: 8 blocks * 4 bytes per transposed dword lane = 32.
1158 mov qword [rsp + _IDX_LIMIT], 32
;; Save xmm6-xmm12: callee-saved under the Microsoft x64 ABI
;; (presumably inside a Windows-only %ifdef in the elided lines -- confirm).
1161 vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
1162 vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
1163 vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
1164 vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
1165 vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
1166 vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
1167 vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
1170 shl NUM_BLKS, 6 ; convert to bytes (1 block = 64 bytes)
;; _INP_END = INP + total_bytes - 8*64 = base address of the LAST full
;; 8-block group; a negative-vs-INP compare drives the tail handling below.
1172 lea NUM_BLKS, [NUM_BLKS + INP - 8*64]
1173 mov [rsp + _INP_END], NUM_BLKS
1174 mov [rsp + _CTX], CTX
;; Fewer than 8 blocks total (flags set by elided compare above) -> tail path.
1177 jb less_than_8_blocks
1179 ;; load initial digest
1194 lea TBL,[K256_SIMD wrt rip]
1196 vmovdqa TTMP3, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1197 ;; Load 8 blocks of message and transpose and save to stack
;; "i" is the (elided) %rep/%assign iteration index; each iteration loads one
;; 32-byte slice from each of the 8 blocks and byte-swaps dwords to big-endian.
1200 vmovups TT0,[INP+0*64+i*32]
1201 vpshufb TT0, TT0, TTMP3
1202 vmovups TT1,[INP+1*64+i*32]
1203 vpshufb TT1, TT1, TTMP3
1204 vmovups TT2,[INP+2*64+i*32]
1205 vpshufb TT2, TT2, TTMP3
1206 vmovups TT3,[INP+3*64+i*32]
1207 vpshufb TT3, TT3, TTMP3
1208 vmovups TT4,[INP+4*64+i*32]
1209 vpshufb TT4, TT4, TTMP3
1210 vmovups TT5,[INP+5*64+i*32]
1211 vpshufb TT5, TT5, TTMP3
1212 vmovups TT6,[INP+6*64+i*32]
1213 vpshufb TT6, TT6, TTMP3
1214 vmovups TT7,[INP+7*64+i*32]
1215 vpshufb TT7, TT7, TTMP3
;; 8x8 dword transpose: after this, each TTn holds the same message-word
;; position from all 8 blocks (one block per 32-bit lane).
1217 TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TTMP1, TTMP2
;; Store transposed words to _TMSG (raw W) and _KTMSG (W + K, K table is
;; replicated 8x per ymm) so the rounds can use a single memory add per round.
1219 vmovdqa [rsp + _TMSG + 0*32 + i*8*32], TT0
1220 vpaddd TT0, TT0, [TBL + 0*32 + i*8*32]
1221 vmovdqa [rsp + _KTMSG + 0*32 + i*8*32], TT0
1223 vmovdqa [rsp + _TMSG + 1*32 + i*8*32], TT1
1224 vpaddd TT1, TT1, [TBL + 1*32 + i*8*32]
1225 vmovdqa [rsp + _KTMSG + 1*32 + i*8*32], TT1
1227 vmovdqa [rsp + _TMSG + 2*32 + i*8*32], TT2
1228 vpaddd TT2, TT2, [TBL + 2*32 + i*8*32]
1229 vmovdqa [rsp + _KTMSG + 2*32 + i*8*32], TT2
1231 vmovdqa [rsp + _TMSG + 3*32 + i*8*32], TT3
1232 vpaddd TT3, TT3, [TBL + 3*32 + i*8*32]
1233 vmovdqa [rsp + _KTMSG + 3*32 + i*8*32], TT3
1235 vmovdqa [rsp + _TMSG + 4*32 + i*8*32], TT4
1236 vpaddd TT4, TT4, [TBL + 4*32 + i*8*32]
1237 vmovdqa [rsp + _KTMSG + 4*32 + i*8*32], TT4
1239 vmovdqa [rsp + _TMSG + 5*32 + i*8*32], TT5
1240 vpaddd TT5, TT5, [TBL + 5*32 + i*8*32]
1241 vmovdqa [rsp + _KTMSG + 5*32 + i*8*32], TT5
1243 vmovdqa [rsp + _TMSG + 6*32 + i*8*32], TT6
1244 vpaddd TT6, TT6, [TBL + 6*32 + i*8*32]
1245 vmovdqa [rsp + _KTMSG + 6*32 + i*8*32], TT6
1247 vmovdqa [rsp + _TMSG + 7*32 + i*8*32], TT7
1248 vpaddd TT7, TT7, [TBL + 7*32 + i*8*32]
1249 vmovdqa [rsp + _KTMSG + 7*32 + i*8*32], TT7
1257 ;; Save Input Msg pointer to stack
1259 mov [rsp + _INP], INP
1261 ;; Initialize Msg Index to Zero
1264 sha256_x8ms_8rnds_loop:
1266 ;; Perform Message Scheduling of the next 8 rounds (from round 17 to 64)
1267 ;; Also perform compress function for first block from round 1 to 16.
1271 ;; Check how many rounds have been performed
1274 jne sha256_x8ms_8rnds_loop
1276 mov CTX, [rsp + _CTX]
1278 compress_block_loop:
1280 ;; Perform 8 rounds of compression
1285 jb compress_block_loop
1287 ;; Update the State when block compression has been completed
;; Rewind IDX past this block's 64 round-groups (8 * 8*32 bytes) and advance
;; by 4 bytes to select the next block's 32-bit lane in the transposed layout.
1297 sub IDX, (8 * 8*32) - 4
1300 ;; Check if the 8th block has been compressed
1301 cmp IDX, [rsp + _IDX_LIMIT]
1302 jne compress_block_loop
1304 ;; Check if the last set of 8 blocks has been processed
1305 mov INP, [rsp + _INP]
1306 cmp INP, [rsp + _INP_END]
1307 jbe eight_blocks_loop
;; Tail: fewer than 8 blocks remain. z1q = INP_END - INP (computed with the
;; elided subtract below the load), a negative byte count.
1310 mov z1q, [rsp + _INP_END]
1312 ; z1q is minus number of NULL blocks left out of 8
1316 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1318 ; z1q is -1...-7 (*64) meaning we need to process 7...1 more blocks
1321 lea TBL,[K256_SIMD wrt rip]
;; -blocks*64 >> 4 = -blocks*4: a negative lane-byte offset used to bias the
;; unaligned stores below so real data lands at the start of _TMSG/_KTMSG.
1322 sar z1q, 4 ; convert to blks * 4
1324 vmovdqa TTMP3, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1325 ;; Load 8 blocks of message and transpose and save to stack
;; Same load/byte-swap pattern as the full-set path; some of these loads read
;; past the real input (NULL blocks) -- guarded by the page-cross check below.
1328 vmovups TT0,[INP+0*64+i*32]
1329 vpshufb TT0, TT0, TTMP3
1330 vmovups TT1,[INP+1*64+i*32]
1331 vpshufb TT1, TT1, TTMP3
1332 vmovups TT2,[INP+2*64+i*32]
1333 vpshufb TT2, TT2, TTMP3
1334 vmovups TT3,[INP+3*64+i*32]
1335 vpshufb TT3, TT3, TTMP3
1336 vmovups TT4,[INP+4*64+i*32]
1337 vpshufb TT4, TT4, TTMP3
1338 vmovups TT5,[INP+5*64+i*32]
1339 vpshufb TT5, TT5, TTMP3
1340 vmovups TT6,[INP+6*64+i*32]
1341 vpshufb TT6, TT6, TTMP3
1342 vmovups TT7,[INP+7*64+i*32]
1343 vpshufb TT7, TT7, TTMP3
1345 TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TTMP1, TTMP2
;; vmovdqu (not vmovdqa): the z1q bias makes these stores unaligned.
1347 vmovdqu [rsp + z1q + _TMSG + 0*32 + i*8*32], TT0
1348 vpaddd TT0, TT0, [TBL + 0*32 + i*8*32]
1349 vmovdqu [rsp + z1q + _KTMSG + 0*32 + i*8*32], TT0
1351 vmovdqu [rsp + z1q + _TMSG + 1*32 + i*8*32], TT1
1352 vpaddd TT1, TT1, [TBL + 1*32 + i*8*32]
1353 vmovdqu [rsp + z1q + _KTMSG + 1*32 + i*8*32], TT1
1355 vmovdqu [rsp + z1q + _TMSG + 2*32 + i*8*32], TT2
1356 vpaddd TT2, TT2, [TBL + 2*32 + i*8*32]
1357 vmovdqu [rsp + z1q + _KTMSG + 2*32 + i*8*32], TT2
1359 vmovdqu [rsp + z1q + _TMSG + 3*32 + i*8*32], TT3
1360 vpaddd TT3, TT3, [TBL + 3*32 + i*8*32]
1361 vmovdqu [rsp + z1q + _KTMSG + 3*32 + i*8*32], TT3
1363 vmovdqu [rsp + z1q + _TMSG + 4*32 + i*8*32], TT4
1364 vpaddd TT4, TT4, [TBL + 4*32 + i*8*32]
1365 vmovdqu [rsp + z1q + _KTMSG + 4*32 + i*8*32], TT4
1367 vmovdqu [rsp + z1q + _TMSG + 5*32 + i*8*32], TT5
1368 vpaddd TT5, TT5, [TBL + 5*32 + i*8*32]
1369 vmovdqu [rsp + z1q + _KTMSG + 5*32 + i*8*32], TT5
1371 vmovdqu [rsp + z1q + _TMSG + 6*32 + i*8*32], TT6
1372 vpaddd TT6, TT6, [TBL + 6*32 + i*8*32]
1373 vmovdqu [rsp + z1q + _KTMSG + 6*32 + i*8*32], TT6
1375 vmovdqu [rsp + z1q + _TMSG + 7*32 + i*8*32], TT7
1376 vpaddd TT7, TT7, [TBL + 7*32 + i*8*32]
1377 vmovdqu [rsp + z1q + _KTMSG + 7*32 + i*8*32], TT7
;; -4*missing + 32 = 4*present: lane-byte limit for the compress loop.
1382 add z1q, 4*8 ; z1q = 4 * (number of blocks to proc)
1383 mov [rsp + _IDX_LIMIT], z1q
1389 ;; load initial digest
;; Page-cross check for the over-reading loads above: only safe to read a
;; full 8*64 bytes if it does not run onto an unmapped page.
1400 and z1q, 4095 ; offset into page
1401 cmp z1q, 4096 - (8*64)
1405 mov z1q, [rsp + _INP_END]
1407 sar z1q, 4 ; convert to blks * 4
1408 add z1q, 4*8 ; z1q = 4 * (number of blocks to proc)
1409 mov [rsp + _IDX_LIMIT], z1q
1410 jmp eight_blocks_loop
;; Epilogue: restore Win64 callee-saved xmm6-xmm12 and the caller's rsp.
1414 vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1415 vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1416 vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1417 vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1418 vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1419 vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1420 vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1423 mov rsp, [rsp + _RSP]
;; SHA-256 round constants K[0..63] (FIPS 180-4), one 32-byte row per
;; constant with the dword replicated 8x, matching the 8-lane transposed
;; message layout so one vpaddd adds K to the same word of all 8 blocks.
;; (The K256_SIMD label itself is defined above this fragment.)
1441 ddq 0x428a2f98428a2f98428a2f98428a2f98,0x428a2f98428a2f98428a2f98428a2f98
1442 ddq 0x71374491713744917137449171374491,0x71374491713744917137449171374491
1443 ddq 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf,0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
1444 ddq 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5,0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
1445 ddq 0x3956c25b3956c25b3956c25b3956c25b,0x3956c25b3956c25b3956c25b3956c25b
1446 ddq 0x59f111f159f111f159f111f159f111f1,0x59f111f159f111f159f111f159f111f1
1447 ddq 0x923f82a4923f82a4923f82a4923f82a4,0x923f82a4923f82a4923f82a4923f82a4
1448 ddq 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5,0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
1449 ddq 0xd807aa98d807aa98d807aa98d807aa98,0xd807aa98d807aa98d807aa98d807aa98
1450 ddq 0x12835b0112835b0112835b0112835b01,0x12835b0112835b0112835b0112835b01
1451 ddq 0x243185be243185be243185be243185be,0x243185be243185be243185be243185be
1452 ddq 0x550c7dc3550c7dc3550c7dc3550c7dc3,0x550c7dc3550c7dc3550c7dc3550c7dc3
1453 ddq 0x72be5d7472be5d7472be5d7472be5d74,0x72be5d7472be5d7472be5d7472be5d74
1454 ddq 0x80deb1fe80deb1fe80deb1fe80deb1fe,0x80deb1fe80deb1fe80deb1fe80deb1fe
1455 ddq 0x9bdc06a79bdc06a79bdc06a79bdc06a7,0x9bdc06a79bdc06a79bdc06a79bdc06a7
1456 ddq 0xc19bf174c19bf174c19bf174c19bf174,0xc19bf174c19bf174c19bf174c19bf174
1457 ddq 0xe49b69c1e49b69c1e49b69c1e49b69c1,0xe49b69c1e49b69c1e49b69c1e49b69c1
1458 ddq 0xefbe4786efbe4786efbe4786efbe4786,0xefbe4786efbe4786efbe4786efbe4786
1459 ddq 0x0fc19dc60fc19dc60fc19dc60fc19dc6,0x0fc19dc60fc19dc60fc19dc60fc19dc6
1460 ddq 0x240ca1cc240ca1cc240ca1cc240ca1cc,0x240ca1cc240ca1cc240ca1cc240ca1cc
1461 ddq 0x2de92c6f2de92c6f2de92c6f2de92c6f,0x2de92c6f2de92c6f2de92c6f2de92c6f
1462 ddq 0x4a7484aa4a7484aa4a7484aa4a7484aa,0x4a7484aa4a7484aa4a7484aa4a7484aa
1463 ddq 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc,0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
1464 ddq 0x76f988da76f988da76f988da76f988da,0x76f988da76f988da76f988da76f988da
1465 ddq 0x983e5152983e5152983e5152983e5152,0x983e5152983e5152983e5152983e5152
1466 ddq 0xa831c66da831c66da831c66da831c66d,0xa831c66da831c66da831c66da831c66d
1467 ddq 0xb00327c8b00327c8b00327c8b00327c8,0xb00327c8b00327c8b00327c8b00327c8
1468 ddq 0xbf597fc7bf597fc7bf597fc7bf597fc7,0xbf597fc7bf597fc7bf597fc7bf597fc7
1469 ddq 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3,0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
1470 ddq 0xd5a79147d5a79147d5a79147d5a79147,0xd5a79147d5a79147d5a79147d5a79147
1471 ddq 0x06ca635106ca635106ca635106ca6351,0x06ca635106ca635106ca635106ca6351
1472 ddq 0x14292967142929671429296714292967,0x14292967142929671429296714292967
1473 ddq 0x27b70a8527b70a8527b70a8527b70a85,0x27b70a8527b70a8527b70a8527b70a85
1474 ddq 0x2e1b21382e1b21382e1b21382e1b2138,0x2e1b21382e1b21382e1b21382e1b2138
1475 ddq 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc,0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
1476 ddq 0x53380d1353380d1353380d1353380d13,0x53380d1353380d1353380d1353380d13
1477 ddq 0x650a7354650a7354650a7354650a7354,0x650a7354650a7354650a7354650a7354
1478 ddq 0x766a0abb766a0abb766a0abb766a0abb,0x766a0abb766a0abb766a0abb766a0abb
1479 ddq 0x81c2c92e81c2c92e81c2c92e81c2c92e,0x81c2c92e81c2c92e81c2c92e81c2c92e
1480 ddq 0x92722c8592722c8592722c8592722c85,0x92722c8592722c8592722c8592722c85
1481 ddq 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1,0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
1482 ddq 0xa81a664ba81a664ba81a664ba81a664b,0xa81a664ba81a664ba81a664ba81a664b
1483 ddq 0xc24b8b70c24b8b70c24b8b70c24b8b70,0xc24b8b70c24b8b70c24b8b70c24b8b70
1484 ddq 0xc76c51a3c76c51a3c76c51a3c76c51a3,0xc76c51a3c76c51a3c76c51a3c76c51a3
1485 ddq 0xd192e819d192e819d192e819d192e819,0xd192e819d192e819d192e819d192e819
1486 ddq 0xd6990624d6990624d6990624d6990624,0xd6990624d6990624d6990624d6990624
1487 ddq 0xf40e3585f40e3585f40e3585f40e3585,0xf40e3585f40e3585f40e3585f40e3585
1488 ddq 0x106aa070106aa070106aa070106aa070,0x106aa070106aa070106aa070106aa070
1489 ddq 0x19a4c11619a4c11619a4c11619a4c116,0x19a4c11619a4c11619a4c11619a4c116
1490 ddq 0x1e376c081e376c081e376c081e376c08,0x1e376c081e376c081e376c081e376c08
1491 ddq 0x2748774c2748774c2748774c2748774c,0x2748774c2748774c2748774c2748774c
1492 ddq 0x34b0bcb534b0bcb534b0bcb534b0bcb5,0x34b0bcb534b0bcb534b0bcb534b0bcb5
1493 ddq 0x391c0cb3391c0cb3391c0cb3391c0cb3,0x391c0cb3391c0cb3391c0cb3391c0cb3
1494 ddq 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a,0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
1495 ddq 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f,0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
1496 ddq 0x682e6ff3682e6ff3682e6ff3682e6ff3,0x682e6ff3682e6ff3682e6ff3682e6ff3
1497 ddq 0x748f82ee748f82ee748f82ee748f82ee,0x748f82ee748f82ee748f82ee748f82ee
1498 ddq 0x78a5636f78a5636f78a5636f78a5636f,0x78a5636f78a5636f78a5636f78a5636f
1499 ddq 0x84c8781484c8781484c8781484c87814,0x84c8781484c8781484c8781484c87814
1500 ddq 0x8cc702088cc702088cc702088cc70208,0x8cc702088cc702088cc702088cc70208
1501 ddq 0x90befffa90befffa90befffa90befffa,0x90befffa90befffa90befffa90befffa
1502 ddq 0xa4506ceba4506ceba4506ceba4506ceb,0xa4506ceba4506ceba4506ceba4506ceb
1503 ddq 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7,0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
1504 ddq 0xc67178f2c67178f2c67178f2c67178f2,0xc67178f2c67178f2c67178f2c67178f2
;; vpshufb mask that reverses the bytes within each dword (bytes 3,2,1,0 of
;; every 4-byte lane), converting little-endian input words to the big-endian
;; order SHA-256 requires; repeated for both 128-bit halves of a ymm.
1506 PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1507 ddq 0x0c0d0e0f08090a0b0405060700010203