author		Tim Chen <tim.c.chen@linux.intel.com>		2013-03-26 16:59:55 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2013-04-25 09:00:58 -0400
commit		5663535b69eef3940dcdb3110f95651304fe41af (patch)
tree		71f31ac0c52ca2bc942bc3d08fc66646adfa54ff /arch/x86/crypto
parent		e01d69cb01956e97b6880c1952e264b19473e7f3 (diff)
crypto: sha512 - Optimized SHA512 x86_64 assembly routine using AVX2 RORX instruction.
Provides a SHA512 x86_64 assembly routine optimized with SSE, AVX and
AVX2's RORX instructions. A speedup of 70% or more has been measured
over the generic implementation.
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--	arch/x86/crypto/sha512-avx2-asm.S	743
1 file changed, 743 insertions, 0 deletions
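Aside (not part of the patch): the RORX instruction named in the title is BMI2's
non-destructive rotate, and it is the reason each Sigma term below can be computed
without extra register copies or flag dependencies. A minimal illustration in the
same GAS syntax, using hypothetical registers:

	# Traditional rotate: two-operand, destructive, and it updates EFLAGS,
	# so keeping the input around costs an extra mov.
	mov	%rdx, %r13
	ror	$14, %r13		# %r13 = e ror 14, %rdx clobbered via the copy

	# BMI2 RORX: three-operand rotate that leaves both the source register
	# and EFLAGS untouched, so the three rotations of Sigma1(e) can be
	# issued independently from the same source.
	rorx	$14, %rdx, %r13		# %r13 = e ror 14, %rdx unchanged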
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
new file mode 100644
index 000000000000..568b96105f5c
--- /dev/null
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -0,0 +1,743 @@
########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
# James Guilford <james.guilford@intel.com>
# Kirk Yap <kirk.s.yap@intel.com>
# David Cote <david.m.cote@intel.com>
# Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
51 | |||
52 | #ifdef CONFIG_AS_AVX2 | ||
53 | #include <linux/linkage.h> | ||
54 | |||
55 | .text | ||
56 | |||
57 | # Virtual Registers | ||
58 | Y_0 = %ymm4 | ||
59 | Y_1 = %ymm5 | ||
60 | Y_2 = %ymm6 | ||
61 | Y_3 = %ymm7 | ||
62 | |||
63 | YTMP0 = %ymm0 | ||
64 | YTMP1 = %ymm1 | ||
65 | YTMP2 = %ymm2 | ||
66 | YTMP3 = %ymm3 | ||
67 | YTMP4 = %ymm8 | ||
68 | XFER = YTMP0 | ||
69 | |||
70 | BYTE_FLIP_MASK = %ymm9 | ||
71 | |||
72 | # 1st arg | ||
73 | INP = %rdi | ||
74 | # 2nd arg | ||
75 | CTX = %rsi | ||
76 | # 3rd arg | ||
77 | NUM_BLKS = %rdx | ||
78 | |||
79 | c = %rcx | ||
80 | d = %r8 | ||
81 | e = %rdx | ||
82 | y3 = %rdi | ||
83 | |||
84 | TBL = %rbp | ||
85 | |||
86 | a = %rax | ||
87 | b = %rbx | ||
88 | |||
89 | f = %r9 | ||
90 | g = %r10 | ||
91 | h = %r11 | ||
92 | old_h = %r11 | ||
93 | |||
94 | T1 = %r12 | ||
95 | y0 = %r13 | ||
96 | y1 = %r14 | ||
97 | y2 = %r15 | ||
98 | |||
99 | y4 = %r12 | ||
100 | |||
101 | # Local variables (stack frame) | ||
102 | XFER_SIZE = 4*8 | ||
103 | SRND_SIZE = 1*8 | ||
104 | INP_SIZE = 1*8 | ||
105 | INPEND_SIZE = 1*8 | ||
106 | RSPSAVE_SIZE = 1*8 | ||
107 | GPRSAVE_SIZE = 6*8 | ||
108 | |||
109 | frame_XFER = 0 | ||
110 | frame_SRND = frame_XFER + XFER_SIZE | ||
111 | frame_INP = frame_SRND + SRND_SIZE | ||
112 | frame_INPEND = frame_INP + INP_SIZE | ||
113 | frame_RSPSAVE = frame_INPEND + INPEND_SIZE | ||
114 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
115 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
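# For reference, the layout that follows from the sizes above, relative to
# the 32-byte-aligned %rsp (offsets in bytes):
#   frame_XFER     0..31    4 qwords of round constants pre-added to W
#   frame_SRND    32..39    round-group counter
#   frame_INP     40..47    saved input pointer
#   frame_INPEND  48..55    pointer to end of input data
#   frame_RSPSAVE 56..63    original (unaligned) %rsp
#   frame_GPRSAVE 64..111   saved %rbp, %rbx, %r12-%r15
#   frame_size = 112 bytes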
116 | |||
117 | ## assume buffers not aligned | ||
118 | #define VMOVDQ vmovdqu | ||
119 | |||
120 | # addm [mem], reg | ||
121 | # Add reg to mem using reg-mem add and store | ||
122 | .macro addm p1 p2 | ||
123 | add \p1, \p2 | ||
124 | mov \p2, \p1 | ||
125 | .endm | ||
126 | |||
127 | |||
128 | # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask | ||
129 | # Load ymm with mem and byte swap each dword | ||
130 | .macro COPY_YMM_AND_BSWAP p1 p2 p3 | ||
131 | VMOVDQ \p2, \p1 | ||
132 | vpshufb \p3, \p1, \p1 | ||
133 | .endm | ||
134 | # rotate_Ys | ||
135 | # Rotate values of symbols Y0...Y3 | ||
136 | .macro rotate_Ys | ||
137 | Y_ = Y_0 | ||
138 | Y_0 = Y_1 | ||
139 | Y_1 = Y_2 | ||
140 | Y_2 = Y_3 | ||
141 | Y_3 = Y_ | ||
142 | .endm | ||
143 | |||
144 | # RotateState | ||
145 | .macro RotateState | ||
146 | # Rotate symbols a..h right | ||
147 | old_h = h | ||
148 | TMP_ = h | ||
149 | h = g | ||
150 | g = f | ||
151 | f = e | ||
152 | e = d | ||
153 | d = c | ||
154 | c = b | ||
155 | b = a | ||
156 | a = TMP_ | ||
157 | .endm | ||
158 | |||
159 | # macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL | ||
160 | # YDST = {YSRC1, YSRC2} >> RVAL*8 | ||
161 | .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL | ||
162 | vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} | ||
163 | vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 | ||
164 | .endm | ||
165 | |||
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

# Extract w[t-7]
MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
# Calculate w[t-16] + w[t-7]
vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
# Extract w[t-15]
MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]

# Calculate sigma0

# Calculate w[t-15] ror 1
vpsrlq $1, YTMP1, YTMP2
vpsllq $(64-1), YTMP1, YTMP3
vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
# Calculate w[t-15] shr 7
vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7

mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
add frame_XFER(%rsp),h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B

xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
xor g, y2 # y2 = f^g # CH
rorx $14, e, y1 # y1 = (e >> 14) # S1

and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $39, a, y1 # y1 = a >> 39 # S0A
add h, d # d = k + w + h + d # --

and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0

xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB

add y0, y2 # y2 = S1 + CH # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --

add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --

RotateState

################################### RND N + 1 #########################################

# Calculate w[t-15] ror 8
vpsrlq $8, YTMP1, YTMP2
vpsllq $(64-8), YTMP1, YTMP1
vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
# XOR the three components
vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0


# Add three components, w[t-16], w[t-7] and sigma0
vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
# Move to appropriate lanes for calculating w[16] and w[17]
vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
# Move to appropriate lanes for calculating w[18] and w[19]
vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}

# Calculate w[16] and w[17] in both 128 bit lanes

# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}


mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA


mov f, y2 # y2 = f # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
xor g, y2 # y2 = f^g # CH


rorx $14, e, y1 # y1 = (e >> 14) # S1
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $39, a, y1 # y1 = a >> 39 # S0A
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --

and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0

rorx $28, a, T1 # T1 = (a >> 28) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --

or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --

add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --

RotateState

286 | |||
287 | ################################### RND N + 2 ######################################### | ||
288 | |||
289 | vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} | ||
290 | vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} | ||
291 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} | ||
292 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} | ||
293 | vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} | ||
294 | vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} | ||
295 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} | ||
296 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ | ||
297 | # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} | ||
298 | |||
299 | # Add sigma1 to the other compunents to get w[16] and w[17] | ||
300 | vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} | ||
301 | |||
302 | # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane | ||
303 | vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} | ||
304 | |||
305 | mov a, y3 # y3 = a # MAJA | ||
306 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
307 | add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
308 | |||
309 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
310 | or c, y3 # y3 = a|c # MAJA | ||
311 | mov f, y2 # y2 = f # CH | ||
312 | xor g, y2 # y2 = f^g # CH | ||
313 | |||
314 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
315 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
316 | and e, y2 # y2 = (f^g)&e # CH | ||
317 | |||
318 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
319 | add h, d # d = k + w + h + d # -- | ||
320 | and b, y3 # y3 = (a|c)&b # MAJA | ||
321 | |||
322 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
323 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
324 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
325 | |||
326 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
327 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
328 | |||
329 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
330 | mov a, T1 # T1 = a # MAJB | ||
331 | and c, T1 # T1 = a&c # MAJB | ||
332 | add y0, y2 # y2 = S1 + CH # -- | ||
333 | |||
334 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
335 | add y1, h # h = k + w + h + S0 # -- | ||
336 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
337 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
338 | |||
339 | add y3, h # h = t1 + S0 + MAJ # -- | ||
340 | |||
341 | RotateState | ||
342 | |||
################################### RND N + 3 #########################################

vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
# (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
# to newly calculated sigma1 to get w[18] and w[19]
vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}

# Form w[19], w[18], w[17], w[16]
vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}

mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA


mov f, y2 # y2 = f # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
xor g, y2 # y2 = f^g # CH


rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA

xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH

rorx $39, a, y1 # y1 = a >> 39 # S0A
add y0, y2 # y2 = S1 + CH # --

xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

rorx $28, a, T1 # T1 = (a >> 28) # S0

xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ

add y1, h # h = k + w + h + S0 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --

RotateState

rotate_Ys
.endm
404 | |||
405 | .macro DO_4ROUNDS | ||
406 | |||
407 | ################################### RND N + 0 ######################################### | ||
408 | |||
409 | mov f, y2 # y2 = f # CH | ||
410 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
411 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
412 | xor g, y2 # y2 = f^g # CH | ||
413 | |||
414 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
415 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
416 | and e, y2 # y2 = (f^g)&e # CH | ||
417 | |||
418 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
419 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
420 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
421 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
422 | mov a, y3 # y3 = a # MAJA | ||
423 | |||
424 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
425 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
426 | add frame_XFER(%rsp), h # h = k + w + h # -- | ||
427 | or c, y3 # y3 = a|c # MAJA | ||
428 | |||
429 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
430 | mov a, T1 # T1 = a # MAJB | ||
431 | and b, y3 # y3 = (a|c)&b # MAJA | ||
432 | and c, T1 # T1 = a&c # MAJB | ||
433 | add y0, y2 # y2 = S1 + CH # -- | ||
434 | |||
435 | add h, d # d = k + w + h + d # -- | ||
436 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
437 | add y1, h # h = k + w + h + S0 # -- | ||
438 | |||
439 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
440 | |||
441 | RotateState | ||
442 | |||
443 | ################################### RND N + 1 ######################################### | ||
444 | |||
445 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
446 | mov f, y2 # y2 = f # CH | ||
447 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
448 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
449 | xor g, y2 # y2 = f^g # CH | ||
450 | |||
451 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
452 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
453 | and e, y2 # y2 = (f^g)&e # CH | ||
454 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
455 | |||
456 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
457 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
458 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
459 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
460 | mov a, y3 # y3 = a # MAJA | ||
461 | |||
462 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
463 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
464 | add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- | ||
465 | or c, y3 # y3 = a|c # MAJA | ||
466 | |||
467 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
468 | mov a, T1 # T1 = a # MAJB | ||
469 | and b, y3 # y3 = (a|c)&b # MAJA | ||
470 | and c, T1 # T1 = a&c # MAJB | ||
471 | add y0, y2 # y2 = S1 + CH # -- | ||
472 | |||
473 | add h, d # d = k + w + h + d # -- | ||
474 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
475 | add y1, h # h = k + w + h + S0 # -- | ||
476 | |||
477 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
478 | |||
479 | RotateState | ||
480 | |||
481 | ################################### RND N + 2 ######################################### | ||
482 | |||
483 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
484 | mov f, y2 # y2 = f # CH | ||
485 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
486 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
487 | xor g, y2 # y2 = f^g # CH | ||
488 | |||
489 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
490 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
491 | and e, y2 # y2 = (f^g)&e # CH | ||
492 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
493 | |||
494 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
495 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
496 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
497 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
498 | mov a, y3 # y3 = a # MAJA | ||
499 | |||
500 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
501 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
502 | add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- | ||
503 | or c, y3 # y3 = a|c # MAJA | ||
504 | |||
505 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
506 | mov a, T1 # T1 = a # MAJB | ||
507 | and b, y3 # y3 = (a|c)&b # MAJA | ||
508 | and c, T1 # T1 = a&c # MAJB | ||
509 | add y0, y2 # y2 = S1 + CH # -- | ||
510 | |||
511 | add h, d # d = k + w + h + d # -- | ||
512 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
513 | add y1, h # h = k + w + h + S0 # -- | ||
514 | |||
515 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
516 | |||
517 | RotateState | ||
518 | |||
519 | ################################### RND N + 3 ######################################### | ||
520 | |||
521 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
522 | mov f, y2 # y2 = f # CH | ||
523 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
524 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
525 | xor g, y2 # y2 = f^g # CH | ||
526 | |||
527 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
528 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
529 | and e, y2 # y2 = (f^g)&e # CH | ||
530 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
531 | |||
532 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
533 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
534 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
535 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
536 | mov a, y3 # y3 = a # MAJA | ||
537 | |||
538 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
539 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
540 | add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- | ||
541 | or c, y3 # y3 = a|c # MAJA | ||
542 | |||
543 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
544 | mov a, T1 # T1 = a # MAJB | ||
545 | and b, y3 # y3 = (a|c)&b # MAJA | ||
546 | and c, T1 # T1 = a&c # MAJB | ||
547 | add y0, y2 # y2 = S1 + CH # -- | ||
548 | |||
549 | |||
550 | add h, d # d = k + w + h + d # -- | ||
551 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
552 | add y1, h # h = k + w + h + S0 # -- | ||
553 | |||
554 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
555 | |||
556 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
557 | |||
558 | add y3, h # h = t1 + S0 + MAJ # -- | ||
559 | |||
560 | RotateState | ||
561 | |||
562 | .endm | ||
563 | |||
564 | ######################################################################## | ||
565 | # void sha512_transform_rorx(const void* M, void* D, uint64_t L)# | ||
566 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
567 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
568 | # message blocks. | ||
569 | # L is the message length in SHA512 blocks | ||
570 | ######################################################################## | ||
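# Per the x86_64 SysV calling convention the arguments arrive as
# M in %rdi (INP), D in %rsi (CTX) and L in %rdx (NUM_BLKS), matching
# the symbolic register names defined at the top of this file.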
ENTRY(sha512_transform_rorx)
# Allocate Stack Space
mov %rsp, %rax
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
mov %rax, frame_RSPSAVE(%rsp)

# Save GPRs
mov %rbp, frame_GPRSAVE(%rsp)
mov %rbx, 8*1+frame_GPRSAVE(%rsp)
mov %r12, 8*2+frame_GPRSAVE(%rsp)
mov %r13, 8*3+frame_GPRSAVE(%rsp)
mov %r14, 8*4+frame_GPRSAVE(%rsp)
mov %r15, 8*5+frame_GPRSAVE(%rsp)

shl $7, NUM_BLKS # convert to bytes
jz done_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, frame_INPEND(%rsp)

## load initial digest
mov 8*0(CTX),a
mov 8*1(CTX),b
mov 8*2(CTX),c
mov 8*3(CTX),d
mov 8*4(CTX),e
mov 8*5(CTX),f
mov 8*6(CTX),g
mov 8*7(CTX),h

vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

loop0:
lea K512(%rip), TBL

## byte swap first 16 qwords
COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK

mov INP, frame_INP(%rsp)

## schedule 64 input qwords, by doing 16 rounds of 4 each
movq $4, frame_SRND(%rsp)

.align 16
loop1:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

vpaddq 1*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

vpaddq 2*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED

vpaddq 3*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
add $(4*32), TBL
FOUR_ROUNDS_AND_SCHED

subq $1, frame_SRND(%rsp)
jne loop1

movq $2, frame_SRND(%rsp)
loop2:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
vpaddq 1*32(TBL), Y_1, XFER
vmovdqa XFER, frame_XFER(%rsp)
add $(2*32), TBL
DO_4ROUNDS

vmovdqa Y_2, Y_0
vmovdqa Y_3, Y_1

subq $1, frame_SRND(%rsp)
jne loop2

addm 8*0(CTX),a
addm 8*1(CTX),b
addm 8*2(CTX),c
addm 8*3(CTX),d
addm 8*4(CTX),e
addm 8*5(CTX),f
addm 8*6(CTX),g
addm 8*7(CTX),h

mov frame_INP(%rsp), INP
add $128, INP
cmp frame_INPEND(%rsp), INP
jne loop0

done_hash:

# Restore GPRs
mov frame_GPRSAVE(%rsp) ,%rbp
mov 8*1+frame_GPRSAVE(%rsp) ,%rbx
mov 8*2+frame_GPRSAVE(%rsp) ,%r12
mov 8*3+frame_GPRSAVE(%rsp) ,%r13
mov 8*4+frame_GPRSAVE(%rsp) ,%r14
mov 8*5+frame_GPRSAVE(%rsp) ,%r15

# Restore Stack Pointer
mov frame_RSPSAVE(%rsp), %rsp
ret
ENDPROC(sha512_transform_rorx)
683 | |||
684 | ######################################################################## | ||
685 | ### Binary Data | ||
686 | |||
687 | .data | ||
688 | |||
689 | .align 64 | ||
690 | # K[t] used in SHA512 hashing | ||
691 | K512: | ||
692 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
693 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
694 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
695 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
696 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
697 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
698 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
699 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
700 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
701 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
702 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
703 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
704 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
705 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
706 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
707 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
708 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
709 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
710 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
711 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
712 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
713 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
714 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
715 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
716 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
717 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
718 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
719 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
720 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
721 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
722 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
723 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
724 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
725 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
726 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
727 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
728 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
729 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
730 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
731 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
732 | |||
733 | .align 32 | ||
734 | |||
735 | # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. | ||
736 | PSHUFFLE_BYTE_FLIP_MASK: | ||
737 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
738 | .octa 0x18191a1b1c1d1e1f1011121314151617 | ||
739 | |||
740 | MASK_YMM_LO: | ||
741 | .octa 0x00000000000000000000000000000000 | ||
742 | .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF | ||
743 | #endif | ||