diff options
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/crypto/Makefile | 8 | ||||
-rw-r--r-- | arch/x86/crypto/sha1_ssse3_asm.S | 558 | ||||
-rw-r--r-- | arch/x86/crypto/sha1_ssse3_glue.c | 240 | ||||
-rw-r--r-- | arch/x86/include/asm/cpufeature.h | 3 |
4 files changed, 809 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index c04f1b7a9139..57c7f7b4436d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile | |||
@@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o | |||
13 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o | 13 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o |
14 | 14 | ||
15 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o | 15 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o |
16 | obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o | ||
16 | 17 | ||
17 | aes-i586-y := aes-i586-asm_32.o aes_glue.o | 18 | aes-i586-y := aes-i586-asm_32.o aes_glue.o |
18 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o | 19 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o |
@@ -25,3 +26,10 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o | |||
25 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o | 26 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o |
26 | 27 | ||
27 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o | 28 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
29 | |||
30 | # enable AVX support only when $(AS) can actually assemble the instructions | ||
31 | ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) | ||
32 | AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT | ||
33 | CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT | ||
34 | endif | ||
35 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o | ||
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S new file mode 100644 index 000000000000..b2c2f57d70e8 --- /dev/null +++ b/arch/x86/crypto/sha1_ssse3_asm.S | |||
@@ -0,0 +1,558 @@ | |||
1 | /* | ||
2 | * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental | ||
3 | * SSE3 instruction set extensions introduced in Intel Core Microarchitecture | ||
4 | * processors. CPUs supporting Intel(R) AVX extensions will get an additional | ||
5 | * boost. | ||
6 | * | ||
7 | * This work was inspired by the vectorized implementation of Dean Gaudet. | ||
8 | * Additional information on it can be found at: | ||
9 | * http://www.arctic.org/~dean/crypto/sha1.html | ||
10 | * | ||
11 | * It was improved upon with more efficient vectorization of the message | ||
12 | * scheduling. This implementation has also been optimized for all current and | ||
13 | * several future generations of Intel CPUs. | ||
14 | * | ||
15 | * See this article for more information about the implementation details: | ||
16 | * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ | ||
17 | * | ||
18 | * Copyright (C) 2010, Intel Corp. | ||
19 | * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> | ||
20 | * Ronen Zohar <ronen.zohar@intel.com> | ||
21 | * | ||
22 | * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: | ||
23 | * Author: Mathias Krause <minipli@googlemail.com> | ||
24 | * | ||
25 | * This program is free software; you can redistribute it and/or modify | ||
26 | * it under the terms of the GNU General Public License as published by | ||
27 | * the Free Software Foundation; either version 2 of the License, or | ||
28 | * (at your option) any later version. | ||
29 | */ | ||
30 | |||
31 | #define CTX %rdi // arg1 | ||
32 | #define BUF %rsi // arg2 | ||
33 | #define CNT %rdx // arg3 | ||
34 | |||
35 | #define REG_A %ecx | ||
36 | #define REG_B %esi | ||
37 | #define REG_C %edi | ||
38 | #define REG_D %ebp | ||
39 | #define REG_E %edx | ||
40 | |||
41 | #define REG_T1 %eax | ||
42 | #define REG_T2 %ebx | ||
43 | |||
44 | #define K_BASE %r8 | ||
45 | #define HASH_PTR %r9 | ||
46 | #define BUFFER_PTR %r10 | ||
47 | #define BUFFER_END %r11 | ||
48 | |||
49 | #define W_TMP1 %xmm0 | ||
50 | #define W_TMP2 %xmm9 | ||
51 | |||
52 | #define W0 %xmm1 | ||
53 | #define W4 %xmm2 | ||
54 | #define W8 %xmm3 | ||
55 | #define W12 %xmm4 | ||
56 | #define W16 %xmm5 | ||
57 | #define W20 %xmm6 | ||
58 | #define W24 %xmm7 | ||
59 | #define W28 %xmm8 | ||
60 | |||
61 | #define XMM_SHUFB_BSWAP %xmm10 | ||
62 | |||
63 | /* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ | ||
64 | #define WK(t) (((t) & 15) * 4)(%rsp) | ||
65 | #define W_PRECALC_AHEAD 16 | ||
66 | |||
67 | /* | ||
68 | * This macro implements the SHA-1 function's body for single 64-byte block | ||
69 | * param: function's name | ||
70 | */ | ||
71 | .macro SHA1_VECTOR_ASM name | ||
72 | .global \name | ||
73 | .type \name, @function | ||
74 | .align 32 | ||
75 | \name: | ||
76 | push %rbx | ||
77 | push %rbp | ||
78 | push %r12 | ||
79 | |||
80 | mov %rsp, %r12 | ||
81 | sub $64, %rsp # allocate workspace | ||
82 | and $~15, %rsp # align stack | ||
83 | |||
84 | mov CTX, HASH_PTR | ||
85 | mov BUF, BUFFER_PTR | ||
86 | |||
87 | shl $6, CNT # multiply by 64 | ||
88 | add BUF, CNT | ||
89 | mov CNT, BUFFER_END | ||
90 | |||
91 | lea K_XMM_AR(%rip), K_BASE | ||
92 | xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP | ||
93 | |||
94 | SHA1_PIPELINED_MAIN_BODY | ||
95 | |||
96 | # cleanup workspace | ||
97 | mov $8, %ecx | ||
98 | mov %rsp, %rdi | ||
99 | xor %rax, %rax | ||
100 | rep stosq | ||
101 | |||
102 | mov %r12, %rsp # deallocate workspace | ||
103 | |||
104 | pop %r12 | ||
105 | pop %rbp | ||
106 | pop %rbx | ||
107 | ret | ||
108 | |||
109 | .size \name, .-\name | ||
110 | .endm | ||
111 | |||
112 | /* | ||
113 | * This macro implements 80 rounds of SHA-1 for one 64-byte block | ||
114 | */ | ||
115 | .macro SHA1_PIPELINED_MAIN_BODY | ||
116 | INIT_REGALLOC | ||
117 | |||
118 | mov (HASH_PTR), A | ||
119 | mov 4(HASH_PTR), B | ||
120 | mov 8(HASH_PTR), C | ||
121 | mov 12(HASH_PTR), D | ||
122 | mov 16(HASH_PTR), E | ||
123 | |||
124 | .set i, 0 | ||
125 | .rept W_PRECALC_AHEAD | ||
126 | W_PRECALC i | ||
127 | .set i, (i+1) | ||
128 | .endr | ||
129 | |||
130 | .align 4 | ||
131 | 1: | ||
132 | RR F1,A,B,C,D,E,0 | ||
133 | RR F1,D,E,A,B,C,2 | ||
134 | RR F1,B,C,D,E,A,4 | ||
135 | RR F1,E,A,B,C,D,6 | ||
136 | RR F1,C,D,E,A,B,8 | ||
137 | |||
138 | RR F1,A,B,C,D,E,10 | ||
139 | RR F1,D,E,A,B,C,12 | ||
140 | RR F1,B,C,D,E,A,14 | ||
141 | RR F1,E,A,B,C,D,16 | ||
142 | RR F1,C,D,E,A,B,18 | ||
143 | |||
144 | RR F2,A,B,C,D,E,20 | ||
145 | RR F2,D,E,A,B,C,22 | ||
146 | RR F2,B,C,D,E,A,24 | ||
147 | RR F2,E,A,B,C,D,26 | ||
148 | RR F2,C,D,E,A,B,28 | ||
149 | |||
150 | RR F2,A,B,C,D,E,30 | ||
151 | RR F2,D,E,A,B,C,32 | ||
152 | RR F2,B,C,D,E,A,34 | ||
153 | RR F2,E,A,B,C,D,36 | ||
154 | RR F2,C,D,E,A,B,38 | ||
155 | |||
156 | RR F3,A,B,C,D,E,40 | ||
157 | RR F3,D,E,A,B,C,42 | ||
158 | RR F3,B,C,D,E,A,44 | ||
159 | RR F3,E,A,B,C,D,46 | ||
160 | RR F3,C,D,E,A,B,48 | ||
161 | |||
162 | RR F3,A,B,C,D,E,50 | ||
163 | RR F3,D,E,A,B,C,52 | ||
164 | RR F3,B,C,D,E,A,54 | ||
165 | RR F3,E,A,B,C,D,56 | ||
166 | RR F3,C,D,E,A,B,58 | ||
167 | |||
168 | add $64, BUFFER_PTR # move to the next 64-byte block | ||
169 | cmp BUFFER_END, BUFFER_PTR # if the current is the last one use | ||
170 | cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun | ||
171 | |||
172 | RR F4,A,B,C,D,E,60 | ||
173 | RR F4,D,E,A,B,C,62 | ||
174 | RR F4,B,C,D,E,A,64 | ||
175 | RR F4,E,A,B,C,D,66 | ||
176 | RR F4,C,D,E,A,B,68 | ||
177 | |||
178 | RR F4,A,B,C,D,E,70 | ||
179 | RR F4,D,E,A,B,C,72 | ||
180 | RR F4,B,C,D,E,A,74 | ||
181 | RR F4,E,A,B,C,D,76 | ||
182 | RR F4,C,D,E,A,B,78 | ||
183 | |||
184 | UPDATE_HASH (HASH_PTR), A | ||
185 | UPDATE_HASH 4(HASH_PTR), B | ||
186 | UPDATE_HASH 8(HASH_PTR), C | ||
187 | UPDATE_HASH 12(HASH_PTR), D | ||
188 | UPDATE_HASH 16(HASH_PTR), E | ||
189 | |||
190 | RESTORE_RENAMED_REGS | ||
191 | cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end | ||
192 | jne 1b | ||
193 | .endm | ||
194 | |||
195 | .macro INIT_REGALLOC | ||
196 | .set A, REG_A | ||
197 | .set B, REG_B | ||
198 | .set C, REG_C | ||
199 | .set D, REG_D | ||
200 | .set E, REG_E | ||
201 | .set T1, REG_T1 | ||
202 | .set T2, REG_T2 | ||
203 | .endm | ||
204 | |||
205 | .macro RESTORE_RENAMED_REGS | ||
206 | # order is important (REG_C is where it should be) | ||
207 | mov B, REG_B | ||
208 | mov D, REG_D | ||
209 | mov A, REG_A | ||
210 | mov E, REG_E | ||
211 | .endm | ||
212 | |||
213 | .macro SWAP_REG_NAMES a, b | ||
214 | .set _T, \a | ||
215 | .set \a, \b | ||
216 | .set \b, _T | ||
217 | .endm | ||
218 | |||
219 | .macro F1 b, c, d | ||
220 | mov \c, T1 | ||
221 | SWAP_REG_NAMES \c, T1 | ||
222 | xor \d, T1 | ||
223 | and \b, T1 | ||
224 | xor \d, T1 | ||
225 | .endm | ||
226 | |||
227 | .macro F2 b, c, d | ||
228 | mov \d, T1 | ||
229 | SWAP_REG_NAMES \d, T1 | ||
230 | xor \c, T1 | ||
231 | xor \b, T1 | ||
232 | .endm | ||
233 | |||
234 | .macro F3 b, c ,d | ||
235 | mov \c, T1 | ||
236 | SWAP_REG_NAMES \c, T1 | ||
237 | mov \b, T2 | ||
238 | or \b, T1 | ||
239 | and \c, T2 | ||
240 | and \d, T1 | ||
241 | or T2, T1 | ||
242 | .endm | ||
243 | |||
244 | .macro F4 b, c, d | ||
245 | F2 \b, \c, \d | ||
246 | .endm | ||
247 | |||
248 | .macro UPDATE_HASH hash, val | ||
249 | add \hash, \val | ||
250 | mov \val, \hash | ||
251 | .endm | ||
252 | |||
253 | /* | ||
254 | * RR does two rounds of SHA-1 back to back with W[] pre-calc | ||
255 | * t1 = F(b, c, d); e += w(i) | ||
256 | * e += t1; b <<= 30; d += w(i+1); | ||
257 | * t1 = F(a, b, c); | ||
258 | * d += t1; a <<= 5; | ||
259 | * e += a; | ||
260 | * t1 = e; a >>= 7; | ||
261 | * t1 <<= 5; | ||
262 | * d += t1; | ||
263 | */ | ||
264 | .macro RR F, a, b, c, d, e, round | ||
265 | add WK(\round), \e | ||
266 | \F \b, \c, \d # t1 = F(b, c, d); | ||
267 | W_PRECALC (\round + W_PRECALC_AHEAD) | ||
268 | rol $30, \b | ||
269 | add T1, \e | ||
270 | add WK(\round + 1), \d | ||
271 | |||
272 | \F \a, \b, \c | ||
273 | W_PRECALC (\round + W_PRECALC_AHEAD + 1) | ||
274 | rol $5, \a | ||
275 | add \a, \e | ||
276 | add T1, \d | ||
277 | ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) | ||
278 | |||
279 | mov \e, T1 | ||
280 | SWAP_REG_NAMES \e, T1 | ||
281 | |||
282 | rol $5, T1 | ||
283 | add T1, \d | ||
284 | |||
285 | # write: \a, \b | ||
286 | # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c | ||
287 | .endm | ||
288 | |||
289 | .macro W_PRECALC r | ||
290 | .set i, \r | ||
291 | |||
292 | .if (i < 20) | ||
293 | .set K_XMM, 0 | ||
294 | .elseif (i < 40) | ||
295 | .set K_XMM, 16 | ||
296 | .elseif (i < 60) | ||
297 | .set K_XMM, 32 | ||
298 | .elseif (i < 80) | ||
299 | .set K_XMM, 48 | ||
300 | .endif | ||
301 | |||
302 | .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) | ||
303 | .set i, ((\r) % 80) # pre-compute for the next iteration | ||
304 | .if (i == 0) | ||
305 | W_PRECALC_RESET | ||
306 | .endif | ||
307 | W_PRECALC_00_15 | ||
308 | .elseif (i<32) | ||
309 | W_PRECALC_16_31 | ||
310 | .elseif (i < 80) // rounds 32-79 | ||
311 | W_PRECALC_32_79 | ||
312 | .endif | ||
313 | .endm | ||
314 | |||
315 | .macro W_PRECALC_RESET | ||
316 | .set W, W0 | ||
317 | .set W_minus_04, W4 | ||
318 | .set W_minus_08, W8 | ||
319 | .set W_minus_12, W12 | ||
320 | .set W_minus_16, W16 | ||
321 | .set W_minus_20, W20 | ||
322 | .set W_minus_24, W24 | ||
323 | .set W_minus_28, W28 | ||
324 | .set W_minus_32, W | ||
325 | .endm | ||
326 | |||
327 | .macro W_PRECALC_ROTATE | ||
328 | .set W_minus_32, W_minus_28 | ||
329 | .set W_minus_28, W_minus_24 | ||
330 | .set W_minus_24, W_minus_20 | ||
331 | .set W_minus_20, W_minus_16 | ||
332 | .set W_minus_16, W_minus_12 | ||
333 | .set W_minus_12, W_minus_08 | ||
334 | .set W_minus_08, W_minus_04 | ||
335 | .set W_minus_04, W | ||
336 | .set W, W_minus_32 | ||
337 | .endm | ||
338 | |||
339 | .macro W_PRECALC_SSSE3 | ||
340 | |||
341 | .macro W_PRECALC_00_15 | ||
342 | W_PRECALC_00_15_SSSE3 | ||
343 | .endm | ||
344 | .macro W_PRECALC_16_31 | ||
345 | W_PRECALC_16_31_SSSE3 | ||
346 | .endm | ||
347 | .macro W_PRECALC_32_79 | ||
348 | W_PRECALC_32_79_SSSE3 | ||
349 | .endm | ||
350 | |||
351 | /* message scheduling pre-compute for rounds 0-15 */ | ||
352 | .macro W_PRECALC_00_15_SSSE3 | ||
353 | .if ((i & 3) == 0) | ||
354 | movdqu (i*4)(BUFFER_PTR), W_TMP1 | ||
355 | .elseif ((i & 3) == 1) | ||
356 | pshufb XMM_SHUFB_BSWAP, W_TMP1 | ||
357 | movdqa W_TMP1, W | ||
358 | .elseif ((i & 3) == 2) | ||
359 | paddd (K_BASE), W_TMP1 | ||
360 | .elseif ((i & 3) == 3) | ||
361 | movdqa W_TMP1, WK(i&~3) | ||
362 | W_PRECALC_ROTATE | ||
363 | .endif | ||
364 | .endm | ||
365 | |||
366 | /* message scheduling pre-compute for rounds 16-31 | ||
367 | * | ||
368 | * - calculating last 32 w[i] values in 8 XMM registers | ||
369 | * - pre-calculate K+w[i] values and store to mem, for later load by ALU add | ||
370 | * instruction | ||
371 | * | ||
372 | * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] | ||
373 | * dependency, but improves for 32-79 | ||
374 | */ | ||
375 | .macro W_PRECALC_16_31_SSSE3 | ||
376 | # blended scheduling of vector and scalar instruction streams, one 4-wide | ||
377 | # vector iteration / 4 scalar rounds | ||
378 | .if ((i & 3) == 0) | ||
379 | movdqa W_minus_12, W | ||
380 | palignr $8, W_minus_16, W # w[i-14] | ||
381 | movdqa W_minus_04, W_TMP1 | ||
382 | psrldq $4, W_TMP1 # w[i-3] | ||
383 | pxor W_minus_08, W | ||
384 | .elseif ((i & 3) == 1) | ||
385 | pxor W_minus_16, W_TMP1 | ||
386 | pxor W_TMP1, W | ||
387 | movdqa W, W_TMP2 | ||
388 | movdqa W, W_TMP1 | ||
389 | pslldq $12, W_TMP2 | ||
390 | .elseif ((i & 3) == 2) | ||
391 | psrld $31, W | ||
392 | pslld $1, W_TMP1 | ||
393 | por W, W_TMP1 | ||
394 | movdqa W_TMP2, W | ||
395 | psrld $30, W_TMP2 | ||
396 | pslld $2, W | ||
397 | .elseif ((i & 3) == 3) | ||
398 | pxor W, W_TMP1 | ||
399 | pxor W_TMP2, W_TMP1 | ||
400 | movdqa W_TMP1, W | ||
401 | paddd K_XMM(K_BASE), W_TMP1 | ||
402 | movdqa W_TMP1, WK(i&~3) | ||
403 | W_PRECALC_ROTATE | ||
404 | .endif | ||
405 | .endm | ||
406 | |||
407 | /* message scheduling pre-compute for rounds 32-79 | ||
408 | * | ||
409 | * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 | ||
410 | * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 | ||
411 | * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken | ||
412 | */ | ||
413 | .macro W_PRECALC_32_79_SSSE3 | ||
414 | .if ((i & 3) == 0) | ||
415 | movdqa W_minus_04, W_TMP1 | ||
416 | pxor W_minus_28, W # W is W_minus_32 before xor | ||
417 | palignr $8, W_minus_08, W_TMP1 | ||
418 | .elseif ((i & 3) == 1) | ||
419 | pxor W_minus_16, W | ||
420 | pxor W_TMP1, W | ||
421 | movdqa W, W_TMP1 | ||
422 | .elseif ((i & 3) == 2) | ||
423 | psrld $30, W | ||
424 | pslld $2, W_TMP1 | ||
425 | por W, W_TMP1 | ||
426 | .elseif ((i & 3) == 3) | ||
427 | movdqa W_TMP1, W | ||
428 | paddd K_XMM(K_BASE), W_TMP1 | ||
429 | movdqa W_TMP1, WK(i&~3) | ||
430 | W_PRECALC_ROTATE | ||
431 | .endif | ||
432 | .endm | ||
433 | |||
434 | .endm // W_PRECALC_SSSE3 | ||
435 | |||
436 | |||
437 | #define K1 0x5a827999 | ||
438 | #define K2 0x6ed9eba1 | ||
439 | #define K3 0x8f1bbcdc | ||
440 | #define K4 0xca62c1d6 | ||
441 | |||
442 | .section .rodata | ||
443 | .align 16 | ||
444 | |||
445 | K_XMM_AR: | ||
446 | .long K1, K1, K1, K1 | ||
447 | .long K2, K2, K2, K2 | ||
448 | .long K3, K3, K3, K3 | ||
449 | .long K4, K4, K4, K4 | ||
450 | |||
451 | BSWAP_SHUFB_CTL: | ||
452 | .long 0x00010203 | ||
453 | .long 0x04050607 | ||
454 | .long 0x08090a0b | ||
455 | .long 0x0c0d0e0f | ||
456 | |||
457 | |||
458 | .section .text | ||
459 | |||
460 | W_PRECALC_SSSE3 | ||
461 | .macro xmm_mov a, b | ||
462 | movdqu \a,\b | ||
463 | .endm | ||
464 | |||
465 | /* SSSE3 optimized implementation: | ||
466 | * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, | ||
467 | * unsigned int rounds); | ||
468 | */ | ||
469 | SHA1_VECTOR_ASM sha1_transform_ssse3 | ||
470 | |||
471 | #ifdef SHA1_ENABLE_AVX_SUPPORT | ||
472 | |||
473 | .macro W_PRECALC_AVX | ||
474 | |||
475 | .purgem W_PRECALC_00_15 | ||
476 | .macro W_PRECALC_00_15 | ||
477 | W_PRECALC_00_15_AVX | ||
478 | .endm | ||
479 | .purgem W_PRECALC_16_31 | ||
480 | .macro W_PRECALC_16_31 | ||
481 | W_PRECALC_16_31_AVX | ||
482 | .endm | ||
483 | .purgem W_PRECALC_32_79 | ||
484 | .macro W_PRECALC_32_79 | ||
485 | W_PRECALC_32_79_AVX | ||
486 | .endm | ||
487 | |||
488 | .macro W_PRECALC_00_15_AVX | ||
489 | .if ((i & 3) == 0) | ||
490 | vmovdqu (i*4)(BUFFER_PTR), W_TMP1 | ||
491 | .elseif ((i & 3) == 1) | ||
492 | vpshufb XMM_SHUFB_BSWAP, W_TMP1, W | ||
493 | .elseif ((i & 3) == 2) | ||
494 | vpaddd (K_BASE), W, W_TMP1 | ||
495 | .elseif ((i & 3) == 3) | ||
496 | vmovdqa W_TMP1, WK(i&~3) | ||
497 | W_PRECALC_ROTATE | ||
498 | .endif | ||
499 | .endm | ||
500 | |||
501 | .macro W_PRECALC_16_31_AVX | ||
502 | .if ((i & 3) == 0) | ||
503 | vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] | ||
504 | vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] | ||
505 | vpxor W_minus_08, W, W | ||
506 | vpxor W_minus_16, W_TMP1, W_TMP1 | ||
507 | .elseif ((i & 3) == 1) | ||
508 | vpxor W_TMP1, W, W | ||
509 | vpslldq $12, W, W_TMP2 | ||
510 | vpslld $1, W, W_TMP1 | ||
511 | .elseif ((i & 3) == 2) | ||
512 | vpsrld $31, W, W | ||
513 | vpor W, W_TMP1, W_TMP1 | ||
514 | vpslld $2, W_TMP2, W | ||
515 | vpsrld $30, W_TMP2, W_TMP2 | ||
516 | .elseif ((i & 3) == 3) | ||
517 | vpxor W, W_TMP1, W_TMP1 | ||
518 | vpxor W_TMP2, W_TMP1, W | ||
519 | vpaddd K_XMM(K_BASE), W, W_TMP1 | ||
520 | vmovdqu W_TMP1, WK(i&~3) | ||
521 | W_PRECALC_ROTATE | ||
522 | .endif | ||
523 | .endm | ||
524 | |||
525 | .macro W_PRECALC_32_79_AVX | ||
526 | .if ((i & 3) == 0) | ||
527 | vpalignr $8, W_minus_08, W_minus_04, W_TMP1 | ||
528 | vpxor W_minus_28, W, W # W is W_minus_32 before xor | ||
529 | .elseif ((i & 3) == 1) | ||
530 | vpxor W_minus_16, W_TMP1, W_TMP1 | ||
531 | vpxor W_TMP1, W, W | ||
532 | .elseif ((i & 3) == 2) | ||
533 | vpslld $2, W, W_TMP1 | ||
534 | vpsrld $30, W, W | ||
535 | vpor W, W_TMP1, W | ||
536 | .elseif ((i & 3) == 3) | ||
537 | vpaddd K_XMM(K_BASE), W, W_TMP1 | ||
538 | vmovdqu W_TMP1, WK(i&~3) | ||
539 | W_PRECALC_ROTATE | ||
540 | .endif | ||
541 | .endm | ||
542 | |||
543 | .endm // W_PRECALC_AVX | ||
544 | |||
545 | W_PRECALC_AVX | ||
546 | .purgem xmm_mov | ||
547 | .macro xmm_mov a, b | ||
548 | vmovdqu \a,\b | ||
549 | .endm | ||
550 | |||
551 | |||
552 | /* AVX optimized implementation: | ||
553 | * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, | ||
554 | * unsigned int rounds); | ||
555 | */ | ||
556 | SHA1_VECTOR_ASM sha1_transform_avx | ||
557 | |||
558 | #endif | ||
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c new file mode 100644 index 000000000000..f916499d0abe --- /dev/null +++ b/arch/x86/crypto/sha1_ssse3_glue.c | |||
@@ -0,0 +1,240 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using | ||
5 | * Supplemental SSE3 instructions. | ||
6 | * | ||
7 | * This file is based on sha1_generic.c | ||
8 | * | ||
9 | * Copyright (c) Alan Smithee. | ||
10 | * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> | ||
11 | * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> | ||
12 | * Copyright (c) Mathias Krause <minipli@googlemail.com> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License as published by the Free | ||
16 | * Software Foundation; either version 2 of the License, or (at your option) | ||
17 | * any later version. | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
22 | |||
23 | #include <crypto/internal/hash.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/cryptohash.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <crypto/sha.h> | ||
30 | #include <asm/byteorder.h> | ||
31 | #include <asm/i387.h> | ||
32 | #include <asm/xcr.h> | ||
33 | #include <asm/xsave.h> | ||
34 | |||
35 | |||
36 | asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, | ||
37 | unsigned int rounds); | ||
38 | #ifdef SHA1_ENABLE_AVX_SUPPORT | ||
39 | asmlinkage void sha1_transform_avx(u32 *digest, const char *data, | ||
40 | unsigned int rounds); | ||
41 | #endif | ||
42 | |||
43 | static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); | ||
44 | |||
45 | |||
46 | static int sha1_ssse3_init(struct shash_desc *desc) | ||
47 | { | ||
48 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
49 | |||
50 | *sctx = (struct sha1_state){ | ||
51 | .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, | ||
52 | }; | ||
53 | |||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
58 | unsigned int len, unsigned int partial) | ||
59 | { | ||
60 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
61 | unsigned int done = 0; | ||
62 | |||
63 | sctx->count += len; | ||
64 | |||
65 | if (partial) { | ||
66 | done = SHA1_BLOCK_SIZE - partial; | ||
67 | memcpy(sctx->buffer + partial, data, done); | ||
68 | sha1_transform_asm(sctx->state, sctx->buffer, 1); | ||
69 | } | ||
70 | |||
71 | if (len - done >= SHA1_BLOCK_SIZE) { | ||
72 | const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; | ||
73 | |||
74 | sha1_transform_asm(sctx->state, data + done, rounds); | ||
75 | done += rounds * SHA1_BLOCK_SIZE; | ||
76 | } | ||
77 | |||
78 | memcpy(sctx->buffer, data + done, len - done); | ||
79 | |||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
84 | unsigned int len) | ||
85 | { | ||
86 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
87 | unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; | ||
88 | int res; | ||
89 | |||
90 | /* Handle the fast case right here */ | ||
91 | if (partial + len < SHA1_BLOCK_SIZE) { | ||
92 | sctx->count += len; | ||
93 | memcpy(sctx->buffer + partial, data, len); | ||
94 | |||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | if (!irq_fpu_usable()) { | ||
99 | res = crypto_sha1_update(desc, data, len); | ||
100 | } else { | ||
101 | kernel_fpu_begin(); | ||
102 | res = __sha1_ssse3_update(desc, data, len, partial); | ||
103 | kernel_fpu_end(); | ||
104 | } | ||
105 | |||
106 | return res; | ||
107 | } | ||
108 | |||
109 | |||
110 | /* Add padding and return the message digest. */ | ||
111 | static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) | ||
112 | { | ||
113 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
114 | unsigned int i, index, padlen; | ||
115 | __be32 *dst = (__be32 *)out; | ||
116 | __be64 bits; | ||
117 | static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; | ||
118 | |||
119 | bits = cpu_to_be64(sctx->count << 3); | ||
120 | |||
121 | /* Pad out to 56 mod 64 and append length */ | ||
122 | index = sctx->count % SHA1_BLOCK_SIZE; | ||
123 | padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); | ||
124 | if (!irq_fpu_usable()) { | ||
125 | crypto_sha1_update(desc, padding, padlen); | ||
126 | crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
127 | } else { | ||
128 | kernel_fpu_begin(); | ||
129 | /* We need to fill a whole block for __sha1_ssse3_update() */ | ||
130 | if (padlen <= 56) { | ||
131 | sctx->count += padlen; | ||
132 | memcpy(sctx->buffer + index, padding, padlen); | ||
133 | } else { | ||
134 | __sha1_ssse3_update(desc, padding, padlen, index); | ||
135 | } | ||
136 | __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); | ||
137 | kernel_fpu_end(); | ||
138 | } | ||
139 | |||
140 | /* Store state in digest */ | ||
141 | for (i = 0; i < 5; i++) | ||
142 | dst[i] = cpu_to_be32(sctx->state[i]); | ||
143 | |||
144 | /* Wipe context */ | ||
145 | memset(sctx, 0, sizeof(*sctx)); | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | static int sha1_ssse3_export(struct shash_desc *desc, void *out) | ||
151 | { | ||
152 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
153 | |||
154 | memcpy(out, sctx, sizeof(*sctx)); | ||
155 | |||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static int sha1_ssse3_import(struct shash_desc *desc, const void *in) | ||
160 | { | ||
161 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
162 | |||
163 | memcpy(sctx, in, sizeof(*sctx)); | ||
164 | |||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | static struct shash_alg alg = { | ||
169 | .digestsize = SHA1_DIGEST_SIZE, | ||
170 | .init = sha1_ssse3_init, | ||
171 | .update = sha1_ssse3_update, | ||
172 | .final = sha1_ssse3_final, | ||
173 | .export = sha1_ssse3_export, | ||
174 | .import = sha1_ssse3_import, | ||
175 | .descsize = sizeof(struct sha1_state), | ||
176 | .statesize = sizeof(struct sha1_state), | ||
177 | .base = { | ||
178 | .cra_name = "sha1", | ||
179 | .cra_driver_name= "sha1-ssse3", | ||
180 | .cra_priority = 150, | ||
181 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
182 | .cra_blocksize = SHA1_BLOCK_SIZE, | ||
183 | .cra_module = THIS_MODULE, | ||
184 | } | ||
185 | }; | ||
186 | |||
187 | #ifdef SHA1_ENABLE_AVX_SUPPORT | ||
188 | static bool __init avx_usable(void) | ||
189 | { | ||
190 | u64 xcr0; | ||
191 | |||
192 | if (!cpu_has_avx || !cpu_has_osxsave) | ||
193 | return false; | ||
194 | |||
195 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
196 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
197 | pr_info("AVX detected but unusable.\n"); | ||
198 | |||
199 | return false; | ||
200 | } | ||
201 | |||
202 | return true; | ||
203 | } | ||
204 | #endif | ||
205 | |||
206 | static int __init sha1_ssse3_mod_init(void) | ||
207 | { | ||
208 | /* test for SSSE3 first */ | ||
209 | if (cpu_has_ssse3) | ||
210 | sha1_transform_asm = sha1_transform_ssse3; | ||
211 | |||
212 | #ifdef SHA1_ENABLE_AVX_SUPPORT | ||
213 | /* allow AVX to override SSSE3, it's a little faster */ | ||
214 | if (avx_usable()) | ||
215 | sha1_transform_asm = sha1_transform_avx; | ||
216 | #endif | ||
217 | |||
218 | if (sha1_transform_asm) { | ||
219 | pr_info("Using %s optimized SHA-1 implementation\n", | ||
220 | sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" | ||
221 | : "AVX"); | ||
222 | return crypto_register_shash(&alg); | ||
223 | } | ||
224 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | ||
225 | |||
226 | return -ENODEV; | ||
227 | } | ||
228 | |||
229 | static void __exit sha1_ssse3_mod_fini(void) | ||
230 | { | ||
231 | crypto_unregister_shash(&alg); | ||
232 | } | ||
233 | |||
234 | module_init(sha1_ssse3_mod_init); | ||
235 | module_exit(sha1_ssse3_mod_fini); | ||
236 | |||
237 | MODULE_LICENSE("GPL"); | ||
238 | MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | ||
239 | |||
240 | MODULE_ALIAS("sha1"); | ||
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4258aac99a6e..48a93ef5c84b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -257,7 +257,9 @@ extern const char * const x86_power_flags[32]; | |||
257 | #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) | 257 | #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) |
258 | #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) | 258 | #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) |
259 | #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) | 259 | #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) |
260 | #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) | ||
260 | #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) | 261 | #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) |
262 | #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) | ||
261 | #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) | 263 | #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) |
262 | #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) | 264 | #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) |
263 | #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) | 265 | #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) |
@@ -285,6 +287,7 @@ extern const char * const x86_power_flags[32]; | |||
285 | #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) | 287 | #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) |
286 | #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) | 288 | #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) |
287 | #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) | 289 | #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) |
290 | #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) | ||
288 | #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) | 291 | #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) |
289 | #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) | 292 | #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) |
290 | #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) | 293 | #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) |