aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Willi <martin@strongswan.org>2015-07-16 13:14:02 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2015-07-17 09:20:25 -0400
commit274f938e0a01286f465d84d5a3f1565225f4ec4b (patch)
tree57dd7a99156fe50ce0f4c43c32494de0dcf9fd2b
parentc9320b6dcb89658a5e53b4f8e31f4c2ee810ec2d (diff)
crypto: chacha20 - Add a four block SSSE3 variant for x86_64
Extends the x86_64 SSSE3 ChaCha20 implementation by a function processing four ChaCha20 blocks in parallel. This avoids the word shuffling needed in the single block variant, further increasing throughput.

For large messages, throughput increases by ~110% compared to single block SSSE3.

Before (single block SSSE3 only):
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 43141886 operations in 10 seconds (690270176 bytes)
test 1 (256 bit key, 64 byte blocks): 46845874 operations in 10 seconds (2998135936 bytes)
test 2 (256 bit key, 256 byte blocks): 18458512 operations in 10 seconds (4725379072 bytes)
test 3 (256 bit key, 1024 byte blocks): 5360533 operations in 10 seconds (5489185792 bytes)
test 4 (256 bit key, 8192 byte blocks): 692846 operations in 10 seconds (5675794432 bytes)

After (with the four block SSSE3 variant):
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds (675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds (2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds (8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds (11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds (11868250112 bytes)

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--arch/x86/crypto/chacha20-ssse3-x86_64.S483
-rw-r--r--arch/x86/crypto/chacha20_glue.c8
2 files changed, 491 insertions, 0 deletions
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 1b97ad074cef..712b13047b41 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -16,6 +16,7 @@
16 16
17ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 17ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
18ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 18ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
19CTRINC: .octa 0x00000003000000020000000100000000
19 20
20.text 21.text
21 22
@@ -140,3 +141,485 @@ ENTRY(chacha20_block_xor_ssse3)
140 141
141 ret 142 ret
142ENDPROC(chacha20_block_xor_ssse3) 143ENDPROC(chacha20_block_xor_ssse3)
144
ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i
	#
	# Clobbers: %xmm0-%xmm15, %ecx, %r11, flags.
	# The state matrix at %rdi is only read, never written.
	# Input and output are accessed with movdqu, so they need no
	# particular alignment.

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For final XORing step
	# we transpose the matrix by interleaving 32- and then 64-bit words,
	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
	# done with the slightly better performing SSSE3 byte shuffling,
	# 7/12-bit word rotation uses traditional shift+OR.

	# The four spill slots below are accessed with movdqa and with paddd
	# memory operands, both of which fault on addresses that are not
	# 16-byte aligned. The incoming %rsp carries no such guarantee, so
	# keep the original value in %r11, reserve 0x40 bytes and round %rsp
	# down to a 64-byte boundary. Rounding down after the subtraction
	# keeps the whole 0x40 byte area below the original stack pointer.
	mov		%rsp,%r11
	sub		$0x40,%rsp
	and		$~63,%rsp

	# x0..15[0-3] = s0..3[0..3]
	# Each movq fetches two adjacent state words; pshufd $0x00 and $0x55
	# then broadcast the first and the second word to all four lanes.
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	# From here on: %xmm0 is scratch, %xmm1 holds CTRINC, %xmm2 the ROT8
	# shuffle mask and %xmm3 the ROT16 shuffle mask.
	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# 10 double rounds
	mov		$10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	dec		%ecx
	jnz		.Ldoubleround4

	# The rotation masks are no longer needed, so %xmm2 and %xmm3 serve
	# as scratch for re-broadcasting the original state words below.
	# %xmm1 still holds CTRINC until the counter is re-added.

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
	# After the two interleave passes each register or stack slot holds
	# 16 bytes of one output block; the offsets below place them at
	# block n * 0x40 + row * 0x10.
	movdqa		0x00(%rsp),%xmm0
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	movdqa		0x10(%rsp),%xmm0
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)
	movdqa		0x20(%rsp),%xmm0
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)
	movdqa		0x30(%rsp),%xmm0
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm4
	movdqu		%xmm4,0x10(%rsi)
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm5
	movdqu		%xmm5,0x90(%rsi)
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm6
	movdqu		%xmm6,0x50(%rsi)
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm7
	movdqu		%xmm7,0xd0(%rsi)
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm8
	movdqu		%xmm8,0x20(%rsi)
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm9
	movdqu		%xmm9,0xa0(%rsi)
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm10
	movdqu		%xmm10,0x60(%rsi)
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm11
	movdqu		%xmm11,0xe0(%rsi)
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm12
	movdqu		%xmm12,0x30(%rsi)
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm13
	movdqu		%xmm13,0xb0(%rsi)
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm14
	movdqu		%xmm14,0x70(%rsi)
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm15
	movdqu		%xmm15,0xf0(%rsi)

	# Restore the stack pointer saved before the alignment adjustment.
	mov		%r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 250de401d28f..4d677c3eb7bd 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -20,12 +20,20 @@
20#define CHACHA20_STATE_ALIGN 16 20#define CHACHA20_STATE_ALIGN 16
21 21
22asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); 22asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
23asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
23 24
24static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, 25static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
25 unsigned int bytes) 26 unsigned int bytes)
26{ 27{
27 u8 buf[CHACHA20_BLOCK_SIZE]; 28 u8 buf[CHACHA20_BLOCK_SIZE];
28 29
30 while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
31 chacha20_4block_xor_ssse3(state, dst, src);
32 bytes -= CHACHA20_BLOCK_SIZE * 4;
33 src += CHACHA20_BLOCK_SIZE * 4;
34 dst += CHACHA20_BLOCK_SIZE * 4;
35 state[12] += 4;
36 }
29 while (bytes >= CHACHA20_BLOCK_SIZE) { 37 while (bytes >= CHACHA20_BLOCK_SIZE) {
30 chacha20_block_xor_ssse3(state, dst, src); 38 chacha20_block_xor_ssse3(state, dst, src);
31 bytes -= CHACHA20_BLOCK_SIZE; 39 bytes -= CHACHA20_BLOCK_SIZE;