author		Martin Willi <martin@strongswan.org>	2015-07-16 13:14:02 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2015-07-17 09:20:25 -0400
commit		274f938e0a01286f465d84d5a3f1565225f4ec4b (patch)
tree		57dd7a99156fe50ce0f4c43c32494de0dcf9fd2b
parent		c9320b6dcb89658a5e53b4f8e31f4c2ee810ec2d (diff)
crypto: chacha20 - Add a four block SSSE3 variant for x86_64
Extends the x86_64 SSSE3 ChaCha20 implementation with a function that
processes four ChaCha20 blocks in parallel. This avoids the word shuffling
needed in the single-block variant, further increasing throughput.
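
For orientation, here is a minimal scalar sketch in C of the quarter-round
this patch vectorizes (illustrative reference code, not part of the patch;
the names rotl32 and chacha20_quarterround are chosen here):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha20 quarter-round on plain 32-bit words. In the four-block
 * SSSE3 variant each XMM register holds the same state word from four
 * independent blocks, so every add/xor/rotate below advances all four
 * blocks at once. */
static void chacha20_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}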
For large messages, throughput increases by ~110% compared to single block
SSSE3:
Before (single-block SSSE3):
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 43141886 operations in 10 seconds (690270176 bytes)
test 1 (256 bit key, 64 byte blocks): 46845874 operations in 10 seconds (2998135936 bytes)
test 2 (256 bit key, 256 byte blocks): 18458512 operations in 10 seconds (4725379072 bytes)
test 3 (256 bit key, 1024 byte blocks): 5360533 operations in 10 seconds (5489185792 bytes)
test 4 (256 bit key, 8192 byte blocks): 692846 operations in 10 seconds (5675794432 bytes)
After (four-block SSSE3):
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds (675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds (2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds (8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds (11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds (11868250112 bytes)
Benchmark results from a Core i5-4670T. (At 8192-byte blocks this is
11868250112 vs. 5675794432 bytes in 10 seconds, a ~109% increase.)
Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--	arch/x86/crypto/chacha20-ssse3-x86_64.S	483
-rw-r--r--	arch/x86/crypto/chacha20_glue.c	8
2 files changed, 491 insertions(+), 0 deletions(-)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 1b97ad074cef..712b13047b41 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -16,6 +16,7 @@
 
 ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
 ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+CTRINC:	.octa 0x00000003000000020000000100000000
 
 .text
 
@@ -140,3 +141,485 @@ ENTRY(chacha20_block_xor_ssse3)
 
 	ret
 ENDPROC(chacha20_block_xor_ssse3)
+
+ENTRY(chacha20_4block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: 4 data blocks output, o
+	# %rdx: 4 data blocks input, i
+
+	# This function encrypts four consecutive ChaCha20 blocks by loading
+	# the state matrix into SSE registers four times. As we need some
+	# scratch registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For the final XOR
+	# step we transpose the matrix by interleaving 32- and then 64-bit
+	# words, which allows us to do the XOR in SSE registers. 8/16-bit word
+	# rotation is done with the slightly better performing SSSE3 byte
+	# shuffling; 7/12-bit word rotation uses traditional shift+OR.
+
+	sub	$0x40,%rsp
+
+	# x0..15[0-3] = s0..3[0..3]
+	movq	0x00(%rdi),%xmm1
+	pshufd	$0x00,%xmm1,%xmm0
+	pshufd	$0x55,%xmm1,%xmm1
+	movq	0x08(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	movq	0x10(%rdi),%xmm5
+	pshufd	$0x00,%xmm5,%xmm4
+	pshufd	$0x55,%xmm5,%xmm5
+	movq	0x18(%rdi),%xmm7
+	pshufd	$0x00,%xmm7,%xmm6
+	pshufd	$0x55,%xmm7,%xmm7
+	movq	0x20(%rdi),%xmm9
+	pshufd	$0x00,%xmm9,%xmm8
+	pshufd	$0x55,%xmm9,%xmm9
+	movq	0x28(%rdi),%xmm11
+	pshufd	$0x00,%xmm11,%xmm10
+	pshufd	$0x55,%xmm11,%xmm11
+	movq	0x30(%rdi),%xmm13
+	pshufd	$0x00,%xmm13,%xmm12
+	pshufd	$0x55,%xmm13,%xmm13
+	movq	0x38(%rdi),%xmm15
+	pshufd	$0x00,%xmm15,%xmm14
+	pshufd	$0x55,%xmm15,%xmm15
+	# x0..3 on stack
+	movdqa	%xmm0,0x00(%rsp)
+	movdqa	%xmm1,0x10(%rsp)
+	movdqa	%xmm2,0x20(%rsp)
+	movdqa	%xmm3,0x30(%rsp)
+
+	movdqa	CTRINC(%rip),%xmm1
+	movdqa	ROT8(%rip),%xmm2
+	movdqa	ROT16(%rip),%xmm3
+
+	# x12 += counter values 0-3
+	paddd	%xmm1,%xmm12
+
+	mov	$10,%ecx
+
+.Ldoubleround4:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	movdqa	0x00(%rsp),%xmm0
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm0,0x00(%rsp)
+	pxor	%xmm0,%xmm12
+	pshufb	%xmm3,%xmm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	movdqa	0x10(%rsp),%xmm0
+	paddd	%xmm5,%xmm0
+	movdqa	%xmm0,0x10(%rsp)
+	pxor	%xmm0,%xmm13
+	pshufb	%xmm3,%xmm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	movdqa	0x20(%rsp),%xmm0
+	paddd	%xmm6,%xmm0
+	movdqa	%xmm0,0x20(%rsp)
+	pxor	%xmm0,%xmm14
+	pshufb	%xmm3,%xmm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	movdqa	0x30(%rsp),%xmm0
+	paddd	%xmm7,%xmm0
+	movdqa	%xmm0,0x30(%rsp)
+	pxor	%xmm0,%xmm15
+	pshufb	%xmm3,%xmm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm4
+	por	%xmm0,%xmm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm5
+	por	%xmm0,%xmm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm6
+	por	%xmm0,%xmm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm7
+	por	%xmm0,%xmm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	movdqa	0x00(%rsp),%xmm0
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm0,0x00(%rsp)
+	pxor	%xmm0,%xmm12
+	pshufb	%xmm2,%xmm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	movdqa	0x10(%rsp),%xmm0
+	paddd	%xmm5,%xmm0
+	movdqa	%xmm0,0x10(%rsp)
+	pxor	%xmm0,%xmm13
+	pshufb	%xmm2,%xmm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	movdqa	0x20(%rsp),%xmm0
+	paddd	%xmm6,%xmm0
+	movdqa	%xmm0,0x20(%rsp)
+	pxor	%xmm0,%xmm14
+	pshufb	%xmm2,%xmm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	movdqa	0x30(%rsp),%xmm0
+	paddd	%xmm7,%xmm0
+	movdqa	%xmm0,0x30(%rsp)
+	pxor	%xmm0,%xmm15
+	pshufb	%xmm2,%xmm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm4
+	por	%xmm0,%xmm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm5
+	por	%xmm0,%xmm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm6
+	por	%xmm0,%xmm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm7
+	por	%xmm0,%xmm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	movdqa	0x00(%rsp),%xmm0
+	paddd	%xmm5,%xmm0
+	movdqa	%xmm0,0x00(%rsp)
+	pxor	%xmm0,%xmm15
+	pshufb	%xmm3,%xmm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	movdqa	0x10(%rsp),%xmm0
+	paddd	%xmm6,%xmm0
+	movdqa	%xmm0,0x10(%rsp)
+	pxor	%xmm0,%xmm12
+	pshufb	%xmm3,%xmm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	movdqa	0x20(%rsp),%xmm0
+	paddd	%xmm7,%xmm0
+	movdqa	%xmm0,0x20(%rsp)
+	pxor	%xmm0,%xmm13
+	pshufb	%xmm3,%xmm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	movdqa	0x30(%rsp),%xmm0
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm0,0x30(%rsp)
+	pxor	%xmm0,%xmm14
+	pshufb	%xmm3,%xmm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	paddd	%xmm15,%xmm10
+	pxor	%xmm10,%xmm5
+	movdqa	%xmm5,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm5
+	por	%xmm0,%xmm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	paddd	%xmm12,%xmm11
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm6,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm6
+	por	%xmm0,%xmm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	paddd	%xmm13,%xmm8
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm7,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm7
+	por	%xmm0,%xmm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	paddd	%xmm14,%xmm9
+	pxor	%xmm9,%xmm4
+	movdqa	%xmm4,%xmm0
+	pslld	$12,%xmm0
+	psrld	$20,%xmm4
+	por	%xmm0,%xmm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	movdqa	0x00(%rsp),%xmm0
+	paddd	%xmm5,%xmm0
+	movdqa	%xmm0,0x00(%rsp)
+	pxor	%xmm0,%xmm15
+	pshufb	%xmm2,%xmm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	movdqa	0x10(%rsp),%xmm0
+	paddd	%xmm6,%xmm0
+	movdqa	%xmm0,0x10(%rsp)
+	pxor	%xmm0,%xmm12
+	pshufb	%xmm2,%xmm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	movdqa	0x20(%rsp),%xmm0
+	paddd	%xmm7,%xmm0
+	movdqa	%xmm0,0x20(%rsp)
+	pxor	%xmm0,%xmm13
+	pshufb	%xmm2,%xmm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	movdqa	0x30(%rsp),%xmm0
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm0,0x30(%rsp)
+	pxor	%xmm0,%xmm14
+	pshufb	%xmm2,%xmm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	paddd	%xmm15,%xmm10
+	pxor	%xmm10,%xmm5
+	movdqa	%xmm5,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm5
+	por	%xmm0,%xmm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	paddd	%xmm12,%xmm11
+	pxor	%xmm11,%xmm6
+	movdqa	%xmm6,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm6
+	por	%xmm0,%xmm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	paddd	%xmm13,%xmm8
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm7,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm7
+	por	%xmm0,%xmm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	paddd	%xmm14,%xmm9
+	pxor	%xmm9,%xmm4
+	movdqa	%xmm4,%xmm0
+	pslld	$7,%xmm0
+	psrld	$25,%xmm4
+	por	%xmm0,%xmm4
+
+	dec	%ecx
+	jnz	.Ldoubleround4
+
+	# x0[0-3] += s0[0]
+	# x1[0-3] += s0[1]
+	movq	0x00(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	0x00(%rsp),%xmm2
+	movdqa	%xmm2,0x00(%rsp)
+	paddd	0x10(%rsp),%xmm3
+	movdqa	%xmm3,0x10(%rsp)
+	# x2[0-3] += s0[2]
+	# x3[0-3] += s0[3]
+	movq	0x08(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	0x20(%rsp),%xmm2
+	movdqa	%xmm2,0x20(%rsp)
+	paddd	0x30(%rsp),%xmm3
+	movdqa	%xmm3,0x30(%rsp)
+
+	# x4[0-3] += s1[0]
+	# x5[0-3] += s1[1]
+	movq	0x10(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	# x6[0-3] += s1[2]
+	# x7[0-3] += s1[3]
+	movq	0x18(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+
+	# x8[0-3] += s2[0]
+	# x9[0-3] += s2[1]
+	movq	0x20(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	%xmm2,%xmm8
+	paddd	%xmm3,%xmm9
+	# x10[0-3] += s2[2]
+	# x11[0-3] += s2[3]
+	movq	0x28(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	%xmm2,%xmm10
+	paddd	%xmm3,%xmm11
+
+	# x12[0-3] += s3[0]
+	# x13[0-3] += s3[1]
+	movq	0x30(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	%xmm2,%xmm12
+	paddd	%xmm3,%xmm13
+	# x14[0-3] += s3[2]
+	# x15[0-3] += s3[3]
+	movq	0x38(%rdi),%xmm3
+	pshufd	$0x00,%xmm3,%xmm2
+	pshufd	$0x55,%xmm3,%xmm3
+	paddd	%xmm2,%xmm14
+	paddd	%xmm3,%xmm15
+
+	# x12 += counter values 0-3
+	paddd	%xmm1,%xmm12
+
+	# interleave 32-bit words in state n, n+1
+	movdqa	0x00(%rsp),%xmm0
+	movdqa	0x10(%rsp),%xmm1
+	movdqa	%xmm0,%xmm2
+	punpckldq	%xmm1,%xmm2
+	punpckhdq	%xmm1,%xmm0
+	movdqa	%xmm2,0x00(%rsp)
+	movdqa	%xmm0,0x10(%rsp)
+	movdqa	0x20(%rsp),%xmm0
+	movdqa	0x30(%rsp),%xmm1
+	movdqa	%xmm0,%xmm2
+	punpckldq	%xmm1,%xmm2
+	punpckhdq	%xmm1,%xmm0
+	movdqa	%xmm2,0x20(%rsp)
+	movdqa	%xmm0,0x30(%rsp)
+	movdqa	%xmm4,%xmm0
+	punpckldq	%xmm5,%xmm4
+	punpckhdq	%xmm5,%xmm0
+	movdqa	%xmm0,%xmm5
+	movdqa	%xmm6,%xmm0
+	punpckldq	%xmm7,%xmm6
+	punpckhdq	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	movdqa	%xmm8,%xmm0
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm9,%xmm0
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm10,%xmm0
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm11,%xmm0
+	movdqa	%xmm0,%xmm11
+	movdqa	%xmm12,%xmm0
+	punpckldq	%xmm13,%xmm12
+	punpckhdq	%xmm13,%xmm0
+	movdqa	%xmm0,%xmm13
+	movdqa	%xmm14,%xmm0
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm15,%xmm0
+	movdqa	%xmm0,%xmm15
+
+	# interleave 64-bit words in state n, n+2
+	movdqa	0x00(%rsp),%xmm0
+	movdqa	0x20(%rsp),%xmm1
+	movdqa	%xmm0,%xmm2
+	punpcklqdq	%xmm1,%xmm2
+	punpckhqdq	%xmm1,%xmm0
+	movdqa	%xmm2,0x00(%rsp)
+	movdqa	%xmm0,0x20(%rsp)
+	movdqa	0x10(%rsp),%xmm0
+	movdqa	0x30(%rsp),%xmm1
+	movdqa	%xmm0,%xmm2
+	punpcklqdq	%xmm1,%xmm2
+	punpckhqdq	%xmm1,%xmm0
+	movdqa	%xmm2,0x10(%rsp)
+	movdqa	%xmm0,0x30(%rsp)
+	movdqa	%xmm4,%xmm0
+	punpcklqdq	%xmm6,%xmm4
+	punpckhqdq	%xmm6,%xmm0
+	movdqa	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm0
+	punpcklqdq	%xmm7,%xmm5
+	punpckhqdq	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	movdqa	%xmm8,%xmm0
+	punpcklqdq	%xmm10,%xmm8
+	punpckhqdq	%xmm10,%xmm0
+	movdqa	%xmm0,%xmm10
+	movdqa	%xmm9,%xmm0
+	punpcklqdq	%xmm11,%xmm9
+	punpckhqdq	%xmm11,%xmm0
+	movdqa	%xmm0,%xmm11
+	movdqa	%xmm12,%xmm0
+	punpcklqdq	%xmm14,%xmm12
+	punpckhqdq	%xmm14,%xmm0
+	movdqa	%xmm0,%xmm14
+	movdqa	%xmm13,%xmm0
+	punpcklqdq	%xmm15,%xmm13
+	punpckhqdq	%xmm15,%xmm0
+	movdqa	%xmm0,%xmm15
+
+	# xor with corresponding input, write to output
+	movdqa	0x00(%rsp),%xmm0
+	movdqu	0x00(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x00(%rsi)
+	movdqa	0x10(%rsp),%xmm0
+	movdqu	0x80(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x80(%rsi)
+	movdqa	0x20(%rsp),%xmm0
+	movdqu	0x40(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0x40(%rsi)
+	movdqa	0x30(%rsp),%xmm0
+	movdqu	0xc0(%rdx),%xmm1
+	pxor	%xmm1,%xmm0
+	movdqu	%xmm0,0xc0(%rsi)
+	movdqu	0x10(%rdx),%xmm1
+	pxor	%xmm1,%xmm4
+	movdqu	%xmm4,0x10(%rsi)
+	movdqu	0x90(%rdx),%xmm1
+	pxor	%xmm1,%xmm5
+	movdqu	%xmm5,0x90(%rsi)
+	movdqu	0x50(%rdx),%xmm1
+	pxor	%xmm1,%xmm6
+	movdqu	%xmm6,0x50(%rsi)
+	movdqu	0xd0(%rdx),%xmm1
+	pxor	%xmm1,%xmm7
+	movdqu	%xmm7,0xd0(%rsi)
+	movdqu	0x20(%rdx),%xmm1
+	pxor	%xmm1,%xmm8
+	movdqu	%xmm8,0x20(%rsi)
+	movdqu	0xa0(%rdx),%xmm1
+	pxor	%xmm1,%xmm9
+	movdqu	%xmm9,0xa0(%rsi)
+	movdqu	0x60(%rdx),%xmm1
+	pxor	%xmm1,%xmm10
+	movdqu	%xmm10,0x60(%rsi)
+	movdqu	0xe0(%rdx),%xmm1
+	pxor	%xmm1,%xmm11
+	movdqu	%xmm11,0xe0(%rsi)
+	movdqu	0x30(%rdx),%xmm1
+	pxor	%xmm1,%xmm12
+	movdqu	%xmm12,0x30(%rsi)
+	movdqu	0xb0(%rdx),%xmm1
+	pxor	%xmm1,%xmm13
+	movdqu	%xmm13,0xb0(%rsi)
+	movdqu	0x70(%rdx),%xmm1
+	pxor	%xmm1,%xmm14
+	movdqu	%xmm14,0x70(%rsi)
+	movdqu	0xf0(%rdx),%xmm1
+	pxor	%xmm1,%xmm15
+	movdqu	%xmm15,0xf0(%rsi)
+
+	add	$0x40,%rsp
+	ret
+ENDPROC(chacha20_4block_xor_ssse3)
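
For readers more comfortable with intrinsics than AT&T assembly, the rotation
and transpose tricks above translate roughly to the following C sketch
(a sketch assuming SSSE3 support; the function names are illustrative, not
kernel APIs):

#include <tmmintrin.h>	/* SSSE3: _mm_shuffle_epi8 */

/* rotl32 by 16 on all four lanes via byte shuffle (the ROT16/pshufb trick;
 * a rotate by 8 works the same way with the ROT8 mask). */
static __m128i rotl32x4_16(__m128i v)
{
	const __m128i rot16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10,
					   5, 4, 7, 6, 1, 0, 3, 2);
	return _mm_shuffle_epi8(v, rot16);
}

/* rotl32 by 12 on all four lanes via shift+OR (the pslld/psrld/por pattern;
 * rotates by 7 use shift counts 7 and 25 instead). */
static __m128i rotl32x4_12(__m128i v)
{
	return _mm_or_si128(_mm_slli_epi32(v, 12), _mm_srli_epi32(v, 20));
}

/* 4x4 transpose of 32-bit words: interleave 32-bit, then 64-bit words,
 * mirroring the punpck{l,h}dq / punpck{l,h}qdq sequence above. */
static void transpose4x4(__m128i r[4])
{
	__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);
	__m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);
	__m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);
	__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);

	r[0] = _mm_unpacklo_epi64(t0, t2);
	r[1] = _mm_unpackhi_epi64(t0, t2);
	r[2] = _mm_unpacklo_epi64(t1, t3);
	r[3] = _mm_unpackhi_epi64(t1, t3);
}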
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 250de401d28f..4d677c3eb7bd 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -20,12 +20,20 @@
 #define CHACHA20_STATE_ALIGN 16
 
 asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
 
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
 	u8 buf[CHACHA20_BLOCK_SIZE];
 
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_ssse3(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
 	while (bytes >= CHACHA20_BLOCK_SIZE) {
 		chacha20_block_xor_ssse3(state, dst, src);
 		bytes -= CHACHA20_BLOCK_SIZE;
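
A note on the counter handling (my reading of the assembly, not stated
explicitly in the patch): chacha20_4block_xor_ssse3 adds the CTRINC offsets
0..3 to the counter word internally but never writes the updated state matrix
back to memory, which is why the caller advances state[12] by 4 after every
256-byte chunk before the single-block loop handles any remainder.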