aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorMathias Krause <minipli@googlemail.com>2011-08-04 14:19:25 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2011-08-10 07:00:29 -0400
commit66be895158886a6cd816aa1eaa18965a5c522d8f (patch)
tree2eee685d2249cd5973e15303b8101df7c956e607 /arch/x86
parent7c390170b49337477985be7a624015160ffeb056 (diff)
crypto: sha1 - SSSE3 based SHA1 implementation for x86-64
This is an assembler implementation of the SHA1 algorithm using the Supplemental SSE3 (SSSE3) instructions or, when available, the Advanced Vector Extensions (AVX). Testing with the tcrypt module shows the raw hash performance is up to 2.3 times faster than the C implementation, using 8k data blocks on a Core 2 Duo T5500. For the smalest data set (16 byte) it is still 25% faster. Since this implementation uses SSE/YMM registers it cannot safely be used in every situation, e.g. while an IRQ interrupts a kernel thread. The implementation falls back to the generic SHA1 variant, if using the SSE/YMM registers is not possible. With this algorithm I was able to increase the throughput of a single IPsec link from 344 Mbit/s to 464 Mbit/s on a Core 2 Quad CPU using the SSSE3 variant -- a speedup of +34.8%. Saving and restoring SSE/YMM state might make the actual throughput fluctuate when there are FPU intensive userland applications running. For example, meassuring the performance using iperf2 directly on the machine under test gives wobbling numbers because iperf2 uses the FPU for each packet to check if the reporting interval has expired (in the above test I got min/max/avg: 402/484/464 MBit/s). Using this algorithm on a IPsec gateway gives much more reasonable and stable numbers, albeit not as high as in the directly connected case. Here is the result from an RFC 2544 test run with a EXFO Packet Blazer FTB-8510: frame size sha1-generic sha1-ssse3 delta 64 byte 37.5 MBit/s 37.5 MBit/s 0.0% 128 byte 56.3 MBit/s 62.5 MBit/s +11.0% 256 byte 87.5 MBit/s 100.0 MBit/s +14.3% 512 byte 131.3 MBit/s 150.0 MBit/s +14.2% 1024 byte 162.5 MBit/s 193.8 MBit/s +19.3% 1280 byte 175.0 MBit/s 212.5 MBit/s +21.4% 1420 byte 175.0 MBit/s 218.7 MBit/s +25.0% 1518 byte 150.0 MBit/s 181.2 MBit/s +20.8% The throughput for the largest frame size is lower than for the previous size because the IP packets need to be fragmented in this case to make there way through the IPsec tunnel. Signed-off-by: Mathias Krause <minipli@googlemail.com> Cc: Maxim Locktyukhin <maxim.locktyukhin@intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/crypto/Makefile8
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S558
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c240
-rw-r--r--arch/x86/include/asm/cpufeature.h3
4 files changed, 809 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index c04f1b7a9139..57c7f7b4436d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
13obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 13obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
14 14
15obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o 15obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
16obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
16 17
17aes-i586-y := aes-i586-asm_32.o aes_glue.o 18aes-i586-y := aes-i586-asm_32.o aes_glue.o
18twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 19twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -25,3 +26,10 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
25aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o 26aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
26 27
27ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 28ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
29
30# enable AVX support only when $(AS) can actually assemble the instructions
31ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
32AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
33CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
34endif
35sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
new file mode 100644
index 000000000000..b2c2f57d70e8
--- /dev/null
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -0,0 +1,558 @@
1/*
2 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
3 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
4 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
5 * boost.
6 *
7 * This work was inspired by the vectorized implementation of Dean Gaudet.
8 * Additional information on it can be found at:
9 * http://www.arctic.org/~dean/crypto/sha1.html
10 *
11 * It was improved upon with more efficient vectorization of the message
12 * scheduling. This implementation has also been optimized for all current and
13 * several future generations of Intel CPUs.
14 *
15 * See this article for more information about the implementation details:
16 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
17 *
18 * Copyright (C) 2010, Intel Corp.
19 * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
20 * Ronen Zohar <ronen.zohar@intel.com>
21 *
22 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
23 * Author: Mathias Krause <minipli@googlemail.com>
24 *
25 * This program is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License as published by
27 * the Free Software Foundation; either version 2 of the License, or
28 * (at your option) any later version.
29 */
30
31#define CTX %rdi // arg1
32#define BUF %rsi // arg2
33#define CNT %rdx // arg3
34
35#define REG_A %ecx
36#define REG_B %esi
37#define REG_C %edi
38#define REG_D %ebp
39#define REG_E %edx
40
41#define REG_T1 %eax
42#define REG_T2 %ebx
43
44#define K_BASE %r8
45#define HASH_PTR %r9
46#define BUFFER_PTR %r10
47#define BUFFER_END %r11
48
49#define W_TMP1 %xmm0
50#define W_TMP2 %xmm9
51
52#define W0 %xmm1
53#define W4 %xmm2
54#define W8 %xmm3
55#define W12 %xmm4
56#define W16 %xmm5
57#define W20 %xmm6
58#define W24 %xmm7
59#define W28 %xmm8
60
61#define XMM_SHUFB_BSWAP %xmm10
62
63/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
64#define WK(t) (((t) & 15) * 4)(%rsp)
65#define W_PRECALC_AHEAD 16
66
67/*
68 * This macro implements the SHA-1 function's body for single 64-byte block
69 * param: function's name
70 */
71.macro SHA1_VECTOR_ASM name
72 .global \name
73 .type \name, @function
74 .align 32
75\name:
76 push %rbx
77 push %rbp
78 push %r12
79
80 mov %rsp, %r12
81 sub $64, %rsp # allocate workspace
82 and $~15, %rsp # align stack
83
84 mov CTX, HASH_PTR
85 mov BUF, BUFFER_PTR
86
87 shl $6, CNT # multiply by 64
88 add BUF, CNT
89 mov CNT, BUFFER_END
90
91 lea K_XMM_AR(%rip), K_BASE
92 xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
93
94 SHA1_PIPELINED_MAIN_BODY
95
96 # cleanup workspace
97 mov $8, %ecx
98 mov %rsp, %rdi
99 xor %rax, %rax
100 rep stosq
101
102 mov %r12, %rsp # deallocate workspace
103
104 pop %r12
105 pop %rbp
106 pop %rbx
107 ret
108
109 .size \name, .-\name
110.endm
111
112/*
113 * This macro implements 80 rounds of SHA-1 for one 64-byte block
114 */
115.macro SHA1_PIPELINED_MAIN_BODY
116 INIT_REGALLOC
117
118 mov (HASH_PTR), A
119 mov 4(HASH_PTR), B
120 mov 8(HASH_PTR), C
121 mov 12(HASH_PTR), D
122 mov 16(HASH_PTR), E
123
124 .set i, 0
125 .rept W_PRECALC_AHEAD
126 W_PRECALC i
127 .set i, (i+1)
128 .endr
129
130.align 4
1311:
132 RR F1,A,B,C,D,E,0
133 RR F1,D,E,A,B,C,2
134 RR F1,B,C,D,E,A,4
135 RR F1,E,A,B,C,D,6
136 RR F1,C,D,E,A,B,8
137
138 RR F1,A,B,C,D,E,10
139 RR F1,D,E,A,B,C,12
140 RR F1,B,C,D,E,A,14
141 RR F1,E,A,B,C,D,16
142 RR F1,C,D,E,A,B,18
143
144 RR F2,A,B,C,D,E,20
145 RR F2,D,E,A,B,C,22
146 RR F2,B,C,D,E,A,24
147 RR F2,E,A,B,C,D,26
148 RR F2,C,D,E,A,B,28
149
150 RR F2,A,B,C,D,E,30
151 RR F2,D,E,A,B,C,32
152 RR F2,B,C,D,E,A,34
153 RR F2,E,A,B,C,D,36
154 RR F2,C,D,E,A,B,38
155
156 RR F3,A,B,C,D,E,40
157 RR F3,D,E,A,B,C,42
158 RR F3,B,C,D,E,A,44
159 RR F3,E,A,B,C,D,46
160 RR F3,C,D,E,A,B,48
161
162 RR F3,A,B,C,D,E,50
163 RR F3,D,E,A,B,C,52
164 RR F3,B,C,D,E,A,54
165 RR F3,E,A,B,C,D,56
166 RR F3,C,D,E,A,B,58
167
168 add $64, BUFFER_PTR # move to the next 64-byte block
169 cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
170 cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
171
172 RR F4,A,B,C,D,E,60
173 RR F4,D,E,A,B,C,62
174 RR F4,B,C,D,E,A,64
175 RR F4,E,A,B,C,D,66
176 RR F4,C,D,E,A,B,68
177
178 RR F4,A,B,C,D,E,70
179 RR F4,D,E,A,B,C,72
180 RR F4,B,C,D,E,A,74
181 RR F4,E,A,B,C,D,76
182 RR F4,C,D,E,A,B,78
183
184 UPDATE_HASH (HASH_PTR), A
185 UPDATE_HASH 4(HASH_PTR), B
186 UPDATE_HASH 8(HASH_PTR), C
187 UPDATE_HASH 12(HASH_PTR), D
188 UPDATE_HASH 16(HASH_PTR), E
189
190 RESTORE_RENAMED_REGS
191 cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
192 jne 1b
193.endm
194
195.macro INIT_REGALLOC
196 .set A, REG_A
197 .set B, REG_B
198 .set C, REG_C
199 .set D, REG_D
200 .set E, REG_E
201 .set T1, REG_T1
202 .set T2, REG_T2
203.endm
204
205.macro RESTORE_RENAMED_REGS
206 # order is important (REG_C is where it should be)
207 mov B, REG_B
208 mov D, REG_D
209 mov A, REG_A
210 mov E, REG_E
211.endm
212
213.macro SWAP_REG_NAMES a, b
214 .set _T, \a
215 .set \a, \b
216 .set \b, _T
217.endm
218
219.macro F1 b, c, d
220 mov \c, T1
221 SWAP_REG_NAMES \c, T1
222 xor \d, T1
223 and \b, T1
224 xor \d, T1
225.endm
226
227.macro F2 b, c, d
228 mov \d, T1
229 SWAP_REG_NAMES \d, T1
230 xor \c, T1
231 xor \b, T1
232.endm
233
234.macro F3 b, c ,d
235 mov \c, T1
236 SWAP_REG_NAMES \c, T1
237 mov \b, T2
238 or \b, T1
239 and \c, T2
240 and \d, T1
241 or T2, T1
242.endm
243
244.macro F4 b, c, d
245 F2 \b, \c, \d
246.endm
247
248.macro UPDATE_HASH hash, val
249 add \hash, \val
250 mov \val, \hash
251.endm
252
253/*
254 * RR does two rounds of SHA-1 back to back with W[] pre-calc
255 * t1 = F(b, c, d); e += w(i)
256 * e += t1; b <<= 30; d += w(i+1);
257 * t1 = F(a, b, c);
258 * d += t1; a <<= 5;
259 * e += a;
260 * t1 = e; a >>= 7;
261 * t1 <<= 5;
262 * d += t1;
263 */
264.macro RR F, a, b, c, d, e, round
265 add WK(\round), \e
266 \F \b, \c, \d # t1 = F(b, c, d);
267 W_PRECALC (\round + W_PRECALC_AHEAD)
268 rol $30, \b
269 add T1, \e
270 add WK(\round + 1), \d
271
272 \F \a, \b, \c
273 W_PRECALC (\round + W_PRECALC_AHEAD + 1)
274 rol $5, \a
275 add \a, \e
276 add T1, \d
277 ror $7, \a # (a <<r 5) >>r 7) => a <<r 30)
278
279 mov \e, T1
280 SWAP_REG_NAMES \e, T1
281
282 rol $5, T1
283 add T1, \d
284
285 # write: \a, \b
286 # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
287.endm
288
289.macro W_PRECALC r
290 .set i, \r
291
292 .if (i < 20)
293 .set K_XMM, 0
294 .elseif (i < 40)
295 .set K_XMM, 16
296 .elseif (i < 60)
297 .set K_XMM, 32
298 .elseif (i < 80)
299 .set K_XMM, 48
300 .endif
301
302 .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
303 .set i, ((\r) % 80) # pre-compute for the next iteration
304 .if (i == 0)
305 W_PRECALC_RESET
306 .endif
307 W_PRECALC_00_15
308 .elseif (i<32)
309 W_PRECALC_16_31
310 .elseif (i < 80) // rounds 32-79
311 W_PRECALC_32_79
312 .endif
313.endm
314
315.macro W_PRECALC_RESET
316 .set W, W0
317 .set W_minus_04, W4
318 .set W_minus_08, W8
319 .set W_minus_12, W12
320 .set W_minus_16, W16
321 .set W_minus_20, W20
322 .set W_minus_24, W24
323 .set W_minus_28, W28
324 .set W_minus_32, W
325.endm
326
327.macro W_PRECALC_ROTATE
328 .set W_minus_32, W_minus_28
329 .set W_minus_28, W_minus_24
330 .set W_minus_24, W_minus_20
331 .set W_minus_20, W_minus_16
332 .set W_minus_16, W_minus_12
333 .set W_minus_12, W_minus_08
334 .set W_minus_08, W_minus_04
335 .set W_minus_04, W
336 .set W, W_minus_32
337.endm
338
339.macro W_PRECALC_SSSE3
340
341.macro W_PRECALC_00_15
342 W_PRECALC_00_15_SSSE3
343.endm
344.macro W_PRECALC_16_31
345 W_PRECALC_16_31_SSSE3
346.endm
347.macro W_PRECALC_32_79
348 W_PRECALC_32_79_SSSE3
349.endm
350
351/* message scheduling pre-compute for rounds 0-15 */
352.macro W_PRECALC_00_15_SSSE3
353 .if ((i & 3) == 0)
354 movdqu (i*4)(BUFFER_PTR), W_TMP1
355 .elseif ((i & 3) == 1)
356 pshufb XMM_SHUFB_BSWAP, W_TMP1
357 movdqa W_TMP1, W
358 .elseif ((i & 3) == 2)
359 paddd (K_BASE), W_TMP1
360 .elseif ((i & 3) == 3)
361 movdqa W_TMP1, WK(i&~3)
362 W_PRECALC_ROTATE
363 .endif
364.endm
365
366/* message scheduling pre-compute for rounds 16-31
367 *
368 * - calculating last 32 w[i] values in 8 XMM registers
369 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
370 * instruction
371 *
372 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
373 * dependency, but improves for 32-79
374 */
375.macro W_PRECALC_16_31_SSSE3
376 # blended scheduling of vector and scalar instruction streams, one 4-wide
377 # vector iteration / 4 scalar rounds
378 .if ((i & 3) == 0)
379 movdqa W_minus_12, W
380 palignr $8, W_minus_16, W # w[i-14]
381 movdqa W_minus_04, W_TMP1
382 psrldq $4, W_TMP1 # w[i-3]
383 pxor W_minus_08, W
384 .elseif ((i & 3) == 1)
385 pxor W_minus_16, W_TMP1
386 pxor W_TMP1, W
387 movdqa W, W_TMP2
388 movdqa W, W_TMP1
389 pslldq $12, W_TMP2
390 .elseif ((i & 3) == 2)
391 psrld $31, W
392 pslld $1, W_TMP1
393 por W, W_TMP1
394 movdqa W_TMP2, W
395 psrld $30, W_TMP2
396 pslld $2, W
397 .elseif ((i & 3) == 3)
398 pxor W, W_TMP1
399 pxor W_TMP2, W_TMP1
400 movdqa W_TMP1, W
401 paddd K_XMM(K_BASE), W_TMP1
402 movdqa W_TMP1, WK(i&~3)
403 W_PRECALC_ROTATE
404 .endif
405.endm
406
407/* message scheduling pre-compute for rounds 32-79
408 *
409 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
410 * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
411 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
412 */
413.macro W_PRECALC_32_79_SSSE3
414 .if ((i & 3) == 0)
415 movdqa W_minus_04, W_TMP1
416 pxor W_minus_28, W # W is W_minus_32 before xor
417 palignr $8, W_minus_08, W_TMP1
418 .elseif ((i & 3) == 1)
419 pxor W_minus_16, W
420 pxor W_TMP1, W
421 movdqa W, W_TMP1
422 .elseif ((i & 3) == 2)
423 psrld $30, W
424 pslld $2, W_TMP1
425 por W, W_TMP1
426 .elseif ((i & 3) == 3)
427 movdqa W_TMP1, W
428 paddd K_XMM(K_BASE), W_TMP1
429 movdqa W_TMP1, WK(i&~3)
430 W_PRECALC_ROTATE
431 .endif
432.endm
433
434.endm // W_PRECALC_SSSE3
435
436
437#define K1 0x5a827999
438#define K2 0x6ed9eba1
439#define K3 0x8f1bbcdc
440#define K4 0xca62c1d6
441
442.section .rodata
443.align 16
444
445K_XMM_AR:
446 .long K1, K1, K1, K1
447 .long K2, K2, K2, K2
448 .long K3, K3, K3, K3
449 .long K4, K4, K4, K4
450
451BSWAP_SHUFB_CTL:
452 .long 0x00010203
453 .long 0x04050607
454 .long 0x08090a0b
455 .long 0x0c0d0e0f
456
457
458.section .text
459
460W_PRECALC_SSSE3
461.macro xmm_mov a, b
462 movdqu \a,\b
463.endm
464
465/* SSSE3 optimized implementation:
466 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
467 * unsigned int rounds);
468 */
469SHA1_VECTOR_ASM sha1_transform_ssse3
470
471#ifdef SHA1_ENABLE_AVX_SUPPORT
472
473.macro W_PRECALC_AVX
474
475.purgem W_PRECALC_00_15
476.macro W_PRECALC_00_15
477 W_PRECALC_00_15_AVX
478.endm
479.purgem W_PRECALC_16_31
480.macro W_PRECALC_16_31
481 W_PRECALC_16_31_AVX
482.endm
483.purgem W_PRECALC_32_79
484.macro W_PRECALC_32_79
485 W_PRECALC_32_79_AVX
486.endm
487
488.macro W_PRECALC_00_15_AVX
489 .if ((i & 3) == 0)
490 vmovdqu (i*4)(BUFFER_PTR), W_TMP1
491 .elseif ((i & 3) == 1)
492 vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
493 .elseif ((i & 3) == 2)
494 vpaddd (K_BASE), W, W_TMP1
495 .elseif ((i & 3) == 3)
496 vmovdqa W_TMP1, WK(i&~3)
497 W_PRECALC_ROTATE
498 .endif
499.endm
500
501.macro W_PRECALC_16_31_AVX
502 .if ((i & 3) == 0)
503 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
504 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
505 vpxor W_minus_08, W, W
506 vpxor W_minus_16, W_TMP1, W_TMP1
507 .elseif ((i & 3) == 1)
508 vpxor W_TMP1, W, W
509 vpslldq $12, W, W_TMP2
510 vpslld $1, W, W_TMP1
511 .elseif ((i & 3) == 2)
512 vpsrld $31, W, W
513 vpor W, W_TMP1, W_TMP1
514 vpslld $2, W_TMP2, W
515 vpsrld $30, W_TMP2, W_TMP2
516 .elseif ((i & 3) == 3)
517 vpxor W, W_TMP1, W_TMP1
518 vpxor W_TMP2, W_TMP1, W
519 vpaddd K_XMM(K_BASE), W, W_TMP1
520 vmovdqu W_TMP1, WK(i&~3)
521 W_PRECALC_ROTATE
522 .endif
523.endm
524
525.macro W_PRECALC_32_79_AVX
526 .if ((i & 3) == 0)
527 vpalignr $8, W_minus_08, W_minus_04, W_TMP1
528 vpxor W_minus_28, W, W # W is W_minus_32 before xor
529 .elseif ((i & 3) == 1)
530 vpxor W_minus_16, W_TMP1, W_TMP1
531 vpxor W_TMP1, W, W
532 .elseif ((i & 3) == 2)
533 vpslld $2, W, W_TMP1
534 vpsrld $30, W, W
535 vpor W, W_TMP1, W
536 .elseif ((i & 3) == 3)
537 vpaddd K_XMM(K_BASE), W, W_TMP1
538 vmovdqu W_TMP1, WK(i&~3)
539 W_PRECALC_ROTATE
540 .endif
541.endm
542
543.endm // W_PRECALC_AVX
544
545W_PRECALC_AVX
546.purgem xmm_mov
547.macro xmm_mov a, b
548 vmovdqu \a,\b
549.endm
550
551
552/* AVX optimized implementation:
553 * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
554 * unsigned int rounds);
555 */
556SHA1_VECTOR_ASM sha1_transform_avx
557
558#endif
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
new file mode 100644
index 000000000000..f916499d0abe
--- /dev/null
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -0,0 +1,240 @@
1/*
2 * Cryptographic API.
3 *
4 * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
5 * Supplemental SSE3 instructions.
6 *
7 * This file is based on sha1_generic.c
8 *
9 * Copyright (c) Alan Smithee.
10 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
11 * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
12 * Copyright (c) Mathias Krause <minipli@googlemail.com>
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <crypto/internal/hash.h>
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/mm.h>
27#include <linux/cryptohash.h>
28#include <linux/types.h>
29#include <crypto/sha.h>
30#include <asm/byteorder.h>
31#include <asm/i387.h>
32#include <asm/xcr.h>
33#include <asm/xsave.h>
34
35
36asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
37 unsigned int rounds);
38#ifdef SHA1_ENABLE_AVX_SUPPORT
39asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
40 unsigned int rounds);
41#endif
42
43static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
44
45
46static int sha1_ssse3_init(struct shash_desc *desc)
47{
48 struct sha1_state *sctx = shash_desc_ctx(desc);
49
50 *sctx = (struct sha1_state){
51 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
52 };
53
54 return 0;
55}
56
57static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
58 unsigned int len, unsigned int partial)
59{
60 struct sha1_state *sctx = shash_desc_ctx(desc);
61 unsigned int done = 0;
62
63 sctx->count += len;
64
65 if (partial) {
66 done = SHA1_BLOCK_SIZE - partial;
67 memcpy(sctx->buffer + partial, data, done);
68 sha1_transform_asm(sctx->state, sctx->buffer, 1);
69 }
70
71 if (len - done >= SHA1_BLOCK_SIZE) {
72 const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
73
74 sha1_transform_asm(sctx->state, data + done, rounds);
75 done += rounds * SHA1_BLOCK_SIZE;
76 }
77
78 memcpy(sctx->buffer, data + done, len - done);
79
80 return 0;
81}
82
83static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
84 unsigned int len)
85{
86 struct sha1_state *sctx = shash_desc_ctx(desc);
87 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
88 int res;
89
90 /* Handle the fast case right here */
91 if (partial + len < SHA1_BLOCK_SIZE) {
92 sctx->count += len;
93 memcpy(sctx->buffer + partial, data, len);
94
95 return 0;
96 }
97
98 if (!irq_fpu_usable()) {
99 res = crypto_sha1_update(desc, data, len);
100 } else {
101 kernel_fpu_begin();
102 res = __sha1_ssse3_update(desc, data, len, partial);
103 kernel_fpu_end();
104 }
105
106 return res;
107}
108
109
110/* Add padding and return the message digest. */
111static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
112{
113 struct sha1_state *sctx = shash_desc_ctx(desc);
114 unsigned int i, index, padlen;
115 __be32 *dst = (__be32 *)out;
116 __be64 bits;
117 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
118
119 bits = cpu_to_be64(sctx->count << 3);
120
121 /* Pad out to 56 mod 64 and append length */
122 index = sctx->count % SHA1_BLOCK_SIZE;
123 padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
124 if (!irq_fpu_usable()) {
125 crypto_sha1_update(desc, padding, padlen);
126 crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
127 } else {
128 kernel_fpu_begin();
129 /* We need to fill a whole block for __sha1_ssse3_update() */
130 if (padlen <= 56) {
131 sctx->count += padlen;
132 memcpy(sctx->buffer + index, padding, padlen);
133 } else {
134 __sha1_ssse3_update(desc, padding, padlen, index);
135 }
136 __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
137 kernel_fpu_end();
138 }
139
140 /* Store state in digest */
141 for (i = 0; i < 5; i++)
142 dst[i] = cpu_to_be32(sctx->state[i]);
143
144 /* Wipe context */
145 memset(sctx, 0, sizeof(*sctx));
146
147 return 0;
148}
149
150static int sha1_ssse3_export(struct shash_desc *desc, void *out)
151{
152 struct sha1_state *sctx = shash_desc_ctx(desc);
153
154 memcpy(out, sctx, sizeof(*sctx));
155
156 return 0;
157}
158
159static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
160{
161 struct sha1_state *sctx = shash_desc_ctx(desc);
162
163 memcpy(sctx, in, sizeof(*sctx));
164
165 return 0;
166}
167
168static struct shash_alg alg = {
169 .digestsize = SHA1_DIGEST_SIZE,
170 .init = sha1_ssse3_init,
171 .update = sha1_ssse3_update,
172 .final = sha1_ssse3_final,
173 .export = sha1_ssse3_export,
174 .import = sha1_ssse3_import,
175 .descsize = sizeof(struct sha1_state),
176 .statesize = sizeof(struct sha1_state),
177 .base = {
178 .cra_name = "sha1",
179 .cra_driver_name= "sha1-ssse3",
180 .cra_priority = 150,
181 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
182 .cra_blocksize = SHA1_BLOCK_SIZE,
183 .cra_module = THIS_MODULE,
184 }
185};
186
187#ifdef SHA1_ENABLE_AVX_SUPPORT
188static bool __init avx_usable(void)
189{
190 u64 xcr0;
191
192 if (!cpu_has_avx || !cpu_has_osxsave)
193 return false;
194
195 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
196 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
197 pr_info("AVX detected but unusable.\n");
198
199 return false;
200 }
201
202 return true;
203}
204#endif
205
206static int __init sha1_ssse3_mod_init(void)
207{
208 /* test for SSSE3 first */
209 if (cpu_has_ssse3)
210 sha1_transform_asm = sha1_transform_ssse3;
211
212#ifdef SHA1_ENABLE_AVX_SUPPORT
213 /* allow AVX to override SSSE3, it's a little faster */
214 if (avx_usable())
215 sha1_transform_asm = sha1_transform_avx;
216#endif
217
218 if (sha1_transform_asm) {
219 pr_info("Using %s optimized SHA-1 implementation\n",
220 sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3"
221 : "AVX");
222 return crypto_register_shash(&alg);
223 }
224 pr_info("Neither AVX nor SSSE3 is available/usable.\n");
225
226 return -ENODEV;
227}
228
229static void __exit sha1_ssse3_mod_fini(void)
230{
231 crypto_unregister_shash(&alg);
232}
233
234module_init(sha1_ssse3_mod_init);
235module_exit(sha1_ssse3_mod_fini);
236
237MODULE_LICENSE("GPL");
238MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
239
240MODULE_ALIAS("sha1");
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4258aac99a6e..48a93ef5c84b 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -257,7 +257,9 @@ extern const char * const x86_power_flags[32];
257#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) 257#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
258#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) 258#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
259#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) 259#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
260#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
260#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) 261#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
262#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
261#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) 263#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
262#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) 264#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
263#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) 265#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
@@ -285,6 +287,7 @@ extern const char * const x86_power_flags[32];
285#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) 287#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
286#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 288#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
287#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 289#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
290#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
288#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 291#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
289#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 292#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
290#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) 293#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)