author    Martin Willi <martin@strongswan.org>      2015-07-16 13:14:03 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>  2015-07-17 09:20:25 -0400
commit    3d1e93cdf16cfe6f315167c65dc504e467e4681a
tree      500aae710ce50604b90858b4204e6fb54bfbe541
parent    274f938e0a01286f465d84d5a3f1565225f4ec4b
crypto: chacha20 - Add an eight block AVX2 variant for x86_64
Extend the x86_64 ChaCha20 implementation with a function that processes eight
ChaCha20 blocks in parallel using AVX2.

For large messages, throughput increases by ~55-70% compared to the four-block
SSSE3 code. The first benchmark run below is the existing SSSE3-only code, the
second is with the new AVX2 path enabled:
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds (675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds (2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds (8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds (11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds (11868250112 bytes)
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 41999675 operations in 10 seconds (671994800 bytes)
test 1 (256 bit key, 64 byte blocks): 45805908 operations in 10 seconds (2931578112 bytes)
test 2 (256 bit key, 256 byte blocks): 32814947 operations in 10 seconds (8400626432 bytes)
test 3 (256 bit key, 1024 byte blocks): 19777167 operations in 10 seconds (20251819008 bytes)
test 4 (256 bit key, 8192 byte blocks): 2279321 operations in 10 seconds (18672197632 bytes)
Benchmark results from a Core i5-4670T.
Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
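As a quick cross-check of the quoted ~55-70% figure (editor's arithmetic, not part of the patch), dividing the bytes processed in ten seconds between the two runs for the large-block tests gives roughly 1.71x for 1024-byte blocks and 1.57x for 8192-byte blocks:

#include <stdio.h>

/* Speedup cross-check from the benchmark numbers quoted above:
 * bytes processed in 10 seconds, SSSE3-only run vs. run with AVX2 enabled.
 */
int main(void)
{
	const double ssse3[] = { 11846409216.0, 11868250112.0 }; /* tests 3, 4 */
	const double avx2[]  = { 20251819008.0, 18672197632.0 };

	for (int i = 0; i < 2; i++)
		printf("test %d: %.2fx throughput (%.0f%% faster)\n", i + 3,
		       avx2[i] / ssse3[i], (avx2[i] / ssse3[i] - 1.0) * 100.0);
	return 0;
}

The smaller test sizes are unchanged, as expected: the eight-block path only engages once a request spans at least 512 bytes.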
-rw-r--r--	arch/x86/crypto/Makefile               |   1
-rw-r--r--	arch/x86/crypto/chacha20-avx2-x86_64.S | 443
-rw-r--r--	arch/x86/crypto/chacha20_glue.c        |  19
-rw-r--r--	crypto/Kconfig                         |   2
4 files changed, 464 insertions(+), 1 deletion(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b09e9a4cea3e..ce39b3c872b1 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -77,6 +77,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+	chacha20-x86_64-y += chacha20-avx2-x86_64.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 endif
 
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
new file mode 100644
index 000000000000..16694e625f77
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -0,0 +1,443 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 32
+
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+	.octa 0x0e0d0c0f0a09080b0605040702010003
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+	.octa 0x0d0c0f0e09080b0a0504070601000302
+CTRINC:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: 8 data blocks output, o
+	# %rdx: 8 data blocks input, i
+
+	# This function encrypts eight consecutive ChaCha20 blocks by loading
+	# the state matrix in AVX registers eight times. As we need some
+	# scratch registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For final XORing step
+	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
+	# words, which allows us to do XOR in AVX registers. 8/16-bit word
+	# rotation is done with the slightly better performing byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	vzeroupper
+	# 4 * 32 byte stack, 32-byte aligned
+	mov %rsp, %r8
+	and $~31, %rsp
+	sub $0x80, %rsp
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd 0x00(%rdi),%ymm0
+	vpbroadcastd 0x04(%rdi),%ymm1
+	vpbroadcastd 0x08(%rdi),%ymm2
+	vpbroadcastd 0x0c(%rdi),%ymm3
+	vpbroadcastd 0x10(%rdi),%ymm4
+	vpbroadcastd 0x14(%rdi),%ymm5
+	vpbroadcastd 0x18(%rdi),%ymm6
+	vpbroadcastd 0x1c(%rdi),%ymm7
+	vpbroadcastd 0x20(%rdi),%ymm8
+	vpbroadcastd 0x24(%rdi),%ymm9
+	vpbroadcastd 0x28(%rdi),%ymm10
+	vpbroadcastd 0x2c(%rdi),%ymm11
+	vpbroadcastd 0x30(%rdi),%ymm12
+	vpbroadcastd 0x34(%rdi),%ymm13
+	vpbroadcastd 0x38(%rdi),%ymm14
+	vpbroadcastd 0x3c(%rdi),%ymm15
+	# x0..3 on stack
+	vmovdqa %ymm0,0x00(%rsp)
+	vmovdqa %ymm1,0x20(%rsp)
+	vmovdqa %ymm2,0x40(%rsp)
+	vmovdqa %ymm3,0x60(%rsp)
+
+	vmovdqa CTRINC(%rip),%ymm1
+	vmovdqa ROT8(%rip),%ymm2
+	vmovdqa ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-7
+	vpaddd %ymm1,%ymm12,%ymm12
+
+	mov $10,%ecx
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd 0x00(%rsp),%ymm4,%ymm0
+	vmovdqa %ymm0,0x00(%rsp)
+	vpxor %ymm0,%ymm12,%ymm12
+	vpshufb %ymm3,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd 0x20(%rsp),%ymm5,%ymm0
+	vmovdqa %ymm0,0x20(%rsp)
+	vpxor %ymm0,%ymm13,%ymm13
+	vpshufb %ymm3,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd 0x40(%rsp),%ymm6,%ymm0
+	vmovdqa %ymm0,0x40(%rsp)
+	vpxor %ymm0,%ymm14,%ymm14
+	vpshufb %ymm3,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd 0x60(%rsp),%ymm7,%ymm0
+	vmovdqa %ymm0,0x60(%rsp)
+	vpxor %ymm0,%ymm15,%ymm15
+	vpshufb %ymm3,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd %ymm12,%ymm8,%ymm8
+	vpxor %ymm8,%ymm4,%ymm4
+	vpslld $12,%ymm4,%ymm0
+	vpsrld $20,%ymm4,%ymm4
+	vpor %ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd %ymm13,%ymm9,%ymm9
+	vpxor %ymm9,%ymm5,%ymm5
+	vpslld $12,%ymm5,%ymm0
+	vpsrld $20,%ymm5,%ymm5
+	vpor %ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd %ymm14,%ymm10,%ymm10
+	vpxor %ymm10,%ymm6,%ymm6
+	vpslld $12,%ymm6,%ymm0
+	vpsrld $20,%ymm6,%ymm6
+	vpor %ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd %ymm15,%ymm11,%ymm11
+	vpxor %ymm11,%ymm7,%ymm7
+	vpslld $12,%ymm7,%ymm0
+	vpsrld $20,%ymm7,%ymm7
+	vpor %ymm0,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd 0x00(%rsp),%ymm4,%ymm0
+	vmovdqa %ymm0,0x00(%rsp)
+	vpxor %ymm0,%ymm12,%ymm12
+	vpshufb %ymm2,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd 0x20(%rsp),%ymm5,%ymm0
+	vmovdqa %ymm0,0x20(%rsp)
+	vpxor %ymm0,%ymm13,%ymm13
+	vpshufb %ymm2,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd 0x40(%rsp),%ymm6,%ymm0
+	vmovdqa %ymm0,0x40(%rsp)
+	vpxor %ymm0,%ymm14,%ymm14
+	vpshufb %ymm2,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd 0x60(%rsp),%ymm7,%ymm0
+	vmovdqa %ymm0,0x60(%rsp)
+	vpxor %ymm0,%ymm15,%ymm15
+	vpshufb %ymm2,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd %ymm12,%ymm8,%ymm8
+	vpxor %ymm8,%ymm4,%ymm4
+	vpslld $7,%ymm4,%ymm0
+	vpsrld $25,%ymm4,%ymm4
+	vpor %ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd %ymm13,%ymm9,%ymm9
+	vpxor %ymm9,%ymm5,%ymm5
+	vpslld $7,%ymm5,%ymm0
+	vpsrld $25,%ymm5,%ymm5
+	vpor %ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd %ymm14,%ymm10,%ymm10
+	vpxor %ymm10,%ymm6,%ymm6
+	vpslld $7,%ymm6,%ymm0
+	vpsrld $25,%ymm6,%ymm6
+	vpor %ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd %ymm15,%ymm11,%ymm11
+	vpxor %ymm11,%ymm7,%ymm7
+	vpslld $7,%ymm7,%ymm0
+	vpsrld $25,%ymm7,%ymm7
+	vpor %ymm0,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd 0x00(%rsp),%ymm5,%ymm0
+	vmovdqa %ymm0,0x00(%rsp)
+	vpxor %ymm0,%ymm15,%ymm15
+	vpshufb %ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	vpaddd 0x20(%rsp),%ymm6,%ymm0
+	vmovdqa %ymm0,0x20(%rsp)
+	vpxor %ymm0,%ymm12,%ymm12
+	vpshufb %ymm3,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd 0x40(%rsp),%ymm7,%ymm0
+	vmovdqa %ymm0,0x40(%rsp)
+	vpxor %ymm0,%ymm13,%ymm13
+	vpshufb %ymm3,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd 0x60(%rsp),%ymm4,%ymm0
+	vmovdqa %ymm0,0x60(%rsp)
+	vpxor %ymm0,%ymm14,%ymm14
+	vpshufb %ymm3,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd %ymm15,%ymm10,%ymm10
+	vpxor %ymm10,%ymm5,%ymm5
+	vpslld $12,%ymm5,%ymm0
+	vpsrld $20,%ymm5,%ymm5
+	vpor %ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd %ymm12,%ymm11,%ymm11
+	vpxor %ymm11,%ymm6,%ymm6
+	vpslld $12,%ymm6,%ymm0
+	vpsrld $20,%ymm6,%ymm6
+	vpor %ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd %ymm13,%ymm8,%ymm8
+	vpxor %ymm8,%ymm7,%ymm7
+	vpslld $12,%ymm7,%ymm0
+	vpsrld $20,%ymm7,%ymm7
+	vpor %ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd %ymm14,%ymm9,%ymm9
+	vpxor %ymm9,%ymm4,%ymm4
+	vpslld $12,%ymm4,%ymm0
+	vpsrld $20,%ymm4,%ymm4
+	vpor %ymm0,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd 0x00(%rsp),%ymm5,%ymm0
+	vmovdqa %ymm0,0x00(%rsp)
+	vpxor %ymm0,%ymm15,%ymm15
+	vpshufb %ymm2,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd 0x20(%rsp),%ymm6,%ymm0
+	vmovdqa %ymm0,0x20(%rsp)
+	vpxor %ymm0,%ymm12,%ymm12
+	vpshufb %ymm2,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd 0x40(%rsp),%ymm7,%ymm0
+	vmovdqa %ymm0,0x40(%rsp)
+	vpxor %ymm0,%ymm13,%ymm13
+	vpshufb %ymm2,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd 0x60(%rsp),%ymm4,%ymm0
+	vmovdqa %ymm0,0x60(%rsp)
+	vpxor %ymm0,%ymm14,%ymm14
+	vpshufb %ymm2,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd %ymm15,%ymm10,%ymm10
+	vpxor %ymm10,%ymm5,%ymm5
+	vpslld $7,%ymm5,%ymm0
+	vpsrld $25,%ymm5,%ymm5
+	vpor %ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd %ymm12,%ymm11,%ymm11
+	vpxor %ymm11,%ymm6,%ymm6
+	vpslld $7,%ymm6,%ymm0
+	vpsrld $25,%ymm6,%ymm6
+	vpor %ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd %ymm13,%ymm8,%ymm8
+	vpxor %ymm8,%ymm7,%ymm7
+	vpslld $7,%ymm7,%ymm0
+	vpsrld $25,%ymm7,%ymm7
+	vpor %ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd %ymm14,%ymm9,%ymm9
+	vpxor %ymm9,%ymm4,%ymm4
+	vpslld $7,%ymm4,%ymm0
+	vpsrld $25,%ymm4,%ymm4
+	vpor %ymm0,%ymm4,%ymm4
+
+	dec %ecx
+	jnz .Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+	vpbroadcastd 0x00(%rdi),%ymm0
+	vpaddd 0x00(%rsp),%ymm0,%ymm0
+	vmovdqa %ymm0,0x00(%rsp)
+	vpbroadcastd 0x04(%rdi),%ymm0
+	vpaddd 0x20(%rsp),%ymm0,%ymm0
+	vmovdqa %ymm0,0x20(%rsp)
+	vpbroadcastd 0x08(%rdi),%ymm0
+	vpaddd 0x40(%rsp),%ymm0,%ymm0
+	vmovdqa %ymm0,0x40(%rsp)
+	vpbroadcastd 0x0c(%rdi),%ymm0
+	vpaddd 0x60(%rsp),%ymm0,%ymm0
+	vmovdqa %ymm0,0x60(%rsp)
+	vpbroadcastd 0x10(%rdi),%ymm0
+	vpaddd %ymm0,%ymm4,%ymm4
+	vpbroadcastd 0x14(%rdi),%ymm0
+	vpaddd %ymm0,%ymm5,%ymm5
+	vpbroadcastd 0x18(%rdi),%ymm0
+	vpaddd %ymm0,%ymm6,%ymm6
+	vpbroadcastd 0x1c(%rdi),%ymm0
+	vpaddd %ymm0,%ymm7,%ymm7
+	vpbroadcastd 0x20(%rdi),%ymm0
+	vpaddd %ymm0,%ymm8,%ymm8
+	vpbroadcastd 0x24(%rdi),%ymm0
+	vpaddd %ymm0,%ymm9,%ymm9
+	vpbroadcastd 0x28(%rdi),%ymm0
+	vpaddd %ymm0,%ymm10,%ymm10
+	vpbroadcastd 0x2c(%rdi),%ymm0
+	vpaddd %ymm0,%ymm11,%ymm11
+	vpbroadcastd 0x30(%rdi),%ymm0
+	vpaddd %ymm0,%ymm12,%ymm12
+	vpbroadcastd 0x34(%rdi),%ymm0
+	vpaddd %ymm0,%ymm13,%ymm13
+	vpbroadcastd 0x38(%rdi),%ymm0
+	vpaddd %ymm0,%ymm14,%ymm14
+	vpbroadcastd 0x3c(%rdi),%ymm0
+	vpaddd %ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-7
+	vpaddd %ymm1,%ymm12,%ymm12
+
+	# interleave 32-bit words in state n, n+1
+	vmovdqa 0x00(%rsp),%ymm0
+	vmovdqa 0x20(%rsp),%ymm1
+	vpunpckldq %ymm1,%ymm0,%ymm2
+	vpunpckhdq %ymm1,%ymm0,%ymm1
+	vmovdqa %ymm2,0x00(%rsp)
+	vmovdqa %ymm1,0x20(%rsp)
+	vmovdqa 0x40(%rsp),%ymm0
+	vmovdqa 0x60(%rsp),%ymm1
+	vpunpckldq %ymm1,%ymm0,%ymm2
+	vpunpckhdq %ymm1,%ymm0,%ymm1
+	vmovdqa %ymm2,0x40(%rsp)
+	vmovdqa %ymm1,0x60(%rsp)
+	vmovdqa %ymm4,%ymm0
+	vpunpckldq %ymm5,%ymm0,%ymm4
+	vpunpckhdq %ymm5,%ymm0,%ymm5
+	vmovdqa %ymm6,%ymm0
+	vpunpckldq %ymm7,%ymm0,%ymm6
+	vpunpckhdq %ymm7,%ymm0,%ymm7
+	vmovdqa %ymm8,%ymm0
+	vpunpckldq %ymm9,%ymm0,%ymm8
+	vpunpckhdq %ymm9,%ymm0,%ymm9
+	vmovdqa %ymm10,%ymm0
+	vpunpckldq %ymm11,%ymm0,%ymm10
+	vpunpckhdq %ymm11,%ymm0,%ymm11
+	vmovdqa %ymm12,%ymm0
+	vpunpckldq %ymm13,%ymm0,%ymm12
+	vpunpckhdq %ymm13,%ymm0,%ymm13
+	vmovdqa %ymm14,%ymm0
+	vpunpckldq %ymm15,%ymm0,%ymm14
+	vpunpckhdq %ymm15,%ymm0,%ymm15
+
+	# interleave 64-bit words in state n, n+2
+	vmovdqa 0x00(%rsp),%ymm0
+	vmovdqa 0x40(%rsp),%ymm2
+	vpunpcklqdq %ymm2,%ymm0,%ymm1
+	vpunpckhqdq %ymm2,%ymm0,%ymm2
+	vmovdqa %ymm1,0x00(%rsp)
+	vmovdqa %ymm2,0x40(%rsp)
+	vmovdqa 0x20(%rsp),%ymm0
+	vmovdqa 0x60(%rsp),%ymm2
+	vpunpcklqdq %ymm2,%ymm0,%ymm1
+	vpunpckhqdq %ymm2,%ymm0,%ymm2
+	vmovdqa %ymm1,0x20(%rsp)
+	vmovdqa %ymm2,0x60(%rsp)
+	vmovdqa %ymm4,%ymm0
+	vpunpcklqdq %ymm6,%ymm0,%ymm4
+	vpunpckhqdq %ymm6,%ymm0,%ymm6
+	vmovdqa %ymm5,%ymm0
+	vpunpcklqdq %ymm7,%ymm0,%ymm5
+	vpunpckhqdq %ymm7,%ymm0,%ymm7
+	vmovdqa %ymm8,%ymm0
+	vpunpcklqdq %ymm10,%ymm0,%ymm8
+	vpunpckhqdq %ymm10,%ymm0,%ymm10
+	vmovdqa %ymm9,%ymm0
+	vpunpcklqdq %ymm11,%ymm0,%ymm9
+	vpunpckhqdq %ymm11,%ymm0,%ymm11
+	vmovdqa %ymm12,%ymm0
+	vpunpcklqdq %ymm14,%ymm0,%ymm12
+	vpunpckhqdq %ymm14,%ymm0,%ymm14
+	vmovdqa %ymm13,%ymm0
+	vpunpcklqdq %ymm15,%ymm0,%ymm13
+	vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	vmovdqa 0x00(%rsp),%ymm0
+	vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
+	vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+	vmovdqa %ymm1,0x00(%rsp)
+	vmovdqa 0x20(%rsp),%ymm0
+	vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
+	vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
+	vmovdqa %ymm1,0x20(%rsp)
+	vmovdqa 0x40(%rsp),%ymm0
+	vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
+	vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
+	vmovdqa %ymm1,0x40(%rsp)
+	vmovdqa 0x60(%rsp),%ymm0
+	vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
+	vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
+	vmovdqa %ymm1,0x60(%rsp)
+	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+	vmovdqa %ymm0,%ymm8
+	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+	vmovdqa %ymm0,%ymm9
+	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+	vmovdqa %ymm0,%ymm10
+	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+	vmovdqa %ymm0,%ymm11
+
+	# xor with corresponding input, write to output
+	vmovdqa 0x00(%rsp),%ymm0
+	vpxor 0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu %ymm0,0x0000(%rsi)
+	vmovdqa 0x20(%rsp),%ymm0
+	vpxor 0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu %ymm0,0x0080(%rsi)
+	vmovdqa 0x40(%rsp),%ymm0
+	vpxor 0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu %ymm0,0x0040(%rsi)
+	vmovdqa 0x60(%rsp),%ymm0
+	vpxor 0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu %ymm0,0x00c0(%rsi)
+	vpxor 0x0100(%rdx),%ymm4,%ymm4
+	vmovdqu %ymm4,0x0100(%rsi)
+	vpxor 0x0180(%rdx),%ymm5,%ymm5
+	vmovdqu %ymm5,0x00180(%rsi)
+	vpxor 0x0140(%rdx),%ymm6,%ymm6
+	vmovdqu %ymm6,0x0140(%rsi)
+	vpxor 0x01c0(%rdx),%ymm7,%ymm7
+	vmovdqu %ymm7,0x01c0(%rsi)
+	vpxor 0x0020(%rdx),%ymm8,%ymm8
+	vmovdqu %ymm8,0x0020(%rsi)
+	vpxor 0x00a0(%rdx),%ymm9,%ymm9
+	vmovdqu %ymm9,0x00a0(%rsi)
+	vpxor 0x0060(%rdx),%ymm10,%ymm10
+	vmovdqu %ymm10,0x0060(%rsi)
+	vpxor 0x00e0(%rdx),%ymm11,%ymm11
+	vmovdqu %ymm11,0x00e0(%rsi)
+	vpxor 0x0120(%rdx),%ymm12,%ymm12
+	vmovdqu %ymm12,0x0120(%rsi)
+	vpxor 0x01a0(%rdx),%ymm13,%ymm13
+	vmovdqu %ymm13,0x01a0(%rsi)
+	vpxor 0x0160(%rdx),%ymm14,%ymm14
+	vmovdqu %ymm14,0x0160(%rsi)
+	vpxor 0x01e0(%rdx),%ymm15,%ymm15
+	vmovdqu %ymm15,0x01e0(%rsi)
+
+	vzeroupper
+	mov %r8,%rsp
+	ret
+ENDPROC(chacha20_8block_xor_avx2)
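For reference, the scalar operations that the vector code above performs eight blocks at a time can be sketched in plain C (a reader's aid based on the in-function comments and RFC 7539, not code from this patch). Each ymm register holds the same state word from eight independent blocks, so one vpaddd/vpxor/vpshufb group in .Ldoubleround8 corresponds to one line of the quarter round below applied to all eight blocks at once, and the mov $10,%ecx loop runs ten such double rounds for ChaCha20's 20 rounds:

#include <stdint.h>

/* Scalar reference for the work done eight-wide in .Ldoubleround8 above. */
uint32_t rotl32(uint32_t v, int n)
{
	/* 7/12-bit rotates map to vpslld/vpsrld/vpor, 8/16-bit to vpshufb */
	return (v << n) | (v >> (32 - n));
}

void quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

void doubleround(uint32_t x[16])
{
	/* column round: first half of .Ldoubleround8 (x0..x3 live on the stack) */
	quarterround(x, 0, 4,  8, 12);
	quarterround(x, 1, 5,  9, 13);
	quarterround(x, 2, 6, 10, 14);
	quarterround(x, 3, 7, 11, 15);
	/* diagonal round: second half of .Ldoubleround8 */
	quarterround(x, 0, 5, 10, 15);
	quarterround(x, 1, 6, 11, 12);
	quarterround(x, 2, 7,  8, 13);
	quarterround(x, 3, 4,  9, 14);
}

The interleave sections after the rounds transpose the word-sliced layout back into eight contiguous 64-byte blocks, so the keystream can be XORed against the input with full-width vector loads and stores.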
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 4d677c3eb7bd..effe2160b7c5 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -21,12 +21,27 @@
 
 asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
+static bool chacha20_use_avx2;
+#endif
 
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
 	u8 buf[CHACHA20_BLOCK_SIZE];
 
+#ifdef CONFIG_AS_AVX2
+	if (chacha20_use_avx2) {
+		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+			chacha20_8block_xor_avx2(state, dst, src);
+			bytes -= CHACHA20_BLOCK_SIZE * 8;
+			src += CHACHA20_BLOCK_SIZE * 8;
+			dst += CHACHA20_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+	}
+#endif
 	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
 		chacha20_4block_xor_ssse3(state, dst, src);
 		bytes -= CHACHA20_BLOCK_SIZE * 4;
@@ -113,6 +128,10 @@ static int __init chacha20_simd_mod_init(void)
 	if (!cpu_has_ssse3)
 		return -ENODEV;
 
+#ifdef CONFIG_AS_AVX2
+	chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
+			    cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+#endif
 	return crypto_register_alg(&alg);
 }
 
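A rough illustration of how chacha20_dosimd() carves up a request after this change (editor's sketch, not kernel code): the new eight-block AVX2 loop runs first and bumps the block counter state[12] by 8 per call, then the existing four-block SSSE3 loop takes over; the single-block and partial-block handling that follows is part of the pre-existing glue code outside the hunk shown above.

#include <stdio.h>

#define CHACHA20_BLOCK_SIZE 64

int main(void)
{
	unsigned int bytes = 1000;		/* example request size */
	unsigned int n8 = 0, n4 = 0, n1 = 0;

	/* mirrors the loop ordering in chacha20_dosimd() after the patch */
	while (bytes >= CHACHA20_BLOCK_SIZE * 8) { n8++; bytes -= CHACHA20_BLOCK_SIZE * 8; }
	while (bytes >= CHACHA20_BLOCK_SIZE * 4) { n4++; bytes -= CHACHA20_BLOCK_SIZE * 4; }
	while (bytes >= CHACHA20_BLOCK_SIZE)     { n1++; bytes -= CHACHA20_BLOCK_SIZE; }

	printf("8-block calls: %u, 4-block: %u, single-block: %u, tail bytes: %u\n",
	       n8, n4, n1, bytes);
	return 0;
}

For the 1000-byte example this gives one eight-block call, one four-block call, three single-block calls and a 40-byte tail. The AVX2 path is only entered when chacha20_use_avx2 was set at module init, i.e. the CPU has AVX and AVX2 and the OS has enabled YMM state.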
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 8f24185ee0a7..82caab0e8256 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1214,7 +1214,7 @@ config CRYPTO_CHACHA20
 	  <http://cr.yp.to/chacha/chacha-20080128.pdf>
 
 config CRYPTO_CHACHA20_X86_64
-	tristate "ChaCha20 cipher algorithm (x86_64/SSSE3)"
+	tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20