about summary refs log tree commit diff stats
path: root/arch/x86/crypto/aesni-intel_asm.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r-- arch/x86/crypto/aesni-intel_asm.S 598
1 files changed, 519 insertions, 79 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index d528fde219d2..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -204,9 +204,9 @@ enc: .octa 0x2
204* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified 204* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
205*/ 205*/
206 206
207.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
208XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
209 207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD 210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen 211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11 212 mov %r12, %r11
@@ -228,19 +228,25 @@ _get_AAD_loop2\num_initial_blocks\operation:
228 cmp %r11, %r12 228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation 229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation: 230_get_AAD_loop2_done\num_initial_blocks\operation:
231 pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data 231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
232 xor %r11, %r11 # initialise the data pointer offset as zero 234 xor %r11, %r11 # initialise the data pointer offset as zero
233 235
234 # start AES for num_initial_blocks blocks 236 # start AES for num_initial_blocks blocks
235 237
236 mov %arg5, %rax # %rax = *Y0 238 mov %arg5, %rax # %rax = *Y0
237 movdqu (%rax), \XMM0 # XMM0 = Y0 239 movdqu (%rax), \XMM0 # XMM0 = Y0
238 pshufb SHUF_MASK(%rip), \XMM0 240 movdqa SHUF_MASK(%rip), %xmm14
239.if \i_seq != 0 241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
240.irpc index, \i_seq 244.irpc index, \i_seq
241 paddd ONE(%rip), \XMM0 # INCR Y0 245 paddd ONE(%rip), \XMM0 # INCR Y0
242 movdqa \XMM0, %xmm\index 246 movdqa \XMM0, %xmm\index
243 pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap 247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
244.endr 250.endr
245.irpc index, \i_seq 251.irpc index, \i_seq
246 pxor 16*0(%arg1), %xmm\index 252 pxor 16*0(%arg1), %xmm\index
@@ -291,10 +297,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
291 movdqu %xmm\index, (%arg2 , %r11, 1) 297 movdqu %xmm\index, (%arg2 , %r11, 1)
292 # write back plaintext/ciphertext for num_initial_blocks 298 # write back plaintext/ciphertext for num_initial_blocks
293 add $16, %r11 299 add $16, %r11
294.if \operation == dec 300
295 movdqa \TMP1, %xmm\index 301 movdqa \TMP1, %xmm\index
296.endif 302 movdqa SHUF_MASK(%rip), %xmm14
297 pshufb SHUF_MASK(%rip), %xmm\index 303 PSHUFB_XMM %xmm14, %xmm\index
304
298 # prepare plaintext/ciphertext for GHASH computation 305 # prepare plaintext/ciphertext for GHASH computation
299.endr 306.endr
300.endif 307.endif
@@ -327,16 +334,24 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
327*/ 334*/
328 paddd ONE(%rip), \XMM0 # INCR Y0 335 paddd ONE(%rip), \XMM0 # INCR Y0
329 movdqa \XMM0, \XMM1 336 movdqa \XMM0, \XMM1
330 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap 337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
331 paddd ONE(%rip), \XMM0 # INCR Y0 340 paddd ONE(%rip), \XMM0 # INCR Y0
332 movdqa \XMM0, \XMM2 341 movdqa \XMM0, \XMM2
333 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap 342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
334 paddd ONE(%rip), \XMM0 # INCR Y0 345 paddd ONE(%rip), \XMM0 # INCR Y0
335 movdqa \XMM0, \XMM3 346 movdqa \XMM0, \XMM3
336 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap 347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
337 paddd ONE(%rip), \XMM0 # INCR Y0 350 paddd ONE(%rip), \XMM0 # INCR Y0
338 movdqa \XMM0, \XMM4 351 movdqa \XMM0, \XMM4
339 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap 352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
340 pxor 16*0(%arg1), \XMM1 355 pxor 16*0(%arg1), \XMM1
341 pxor 16*0(%arg1), \XMM2 356 pxor 16*0(%arg1), \XMM2
342 pxor 16*0(%arg1), \XMM3 357 pxor 16*0(%arg1), \XMM3
@@ -385,41 +400,268 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
385 AESENCLAST \TMP2, \XMM4 400 AESENCLAST \TMP2, \XMM4
386 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
387 pxor \TMP1, \XMM1 402 pxor \TMP1, \XMM1
388.if \operation == dec
389 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
390 movdqa \TMP1, \XMM1 404 movdqa \TMP1, \XMM1
391.endif
392 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
393 pxor \TMP1, \XMM2 406 pxor \TMP1, \XMM2
394.if \operation == dec
395 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
396 movdqa \TMP1, \XMM2 408 movdqa \TMP1, \XMM2
397.endif
398 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
399 pxor \TMP1, \XMM3 410 pxor \TMP1, \XMM3
400.if \operation == dec
401 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
402 movdqa \TMP1, \XMM3 412 movdqa \TMP1, \XMM3
403.endif
404 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
405 pxor \TMP1, \XMM4 414 pxor \TMP1, \XMM4
406.if \operation == dec
407 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM4 416 movdqa \TMP1, \XMM4
409.else 417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered; %xmm14 is also overwritten (it is loaded with SHUF_MASK)
442* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11 # %r11 = aadLen, kept while %r12 counts down
451 pxor %xmm\i, %xmm\i # clear the register that accumulates the AAD
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1 # load the next 4 bytes of AAD
454 pslldq $12, \TMP1 # move them to the top dword
455 psrldq $4, %xmm\i # make room: shift the accumulated AAD down by 4 bytes
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11 # a 16-byte AAD already fills the register
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i # shift down 4 bytes per iteration until %r12 == aadLen
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1 # NOTE(review): %rdi here, %arg1 in every other round - presumably the same register; confirm
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502 AESENC \TMP1, %xmm\index # Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506 AESENC \TMP1, %xmm\index # Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510 AESENC \TMP1, %xmm\index # Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514 AESENC \TMP1, %xmm\index # Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518 AESENC \TMP1, %xmm\index # Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522 AESENC \TMP1, %xmm\index # Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526 AESENC \TMP1, %xmm\index # Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* HashKey_i_k holds the XOR of the low and high parts of HashKey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
410 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
411 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
412 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
413 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
414.endif 650
415 add $64, %r11 651 add $64, %r11
416 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap 652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
417 pxor \XMMDst, \XMM1 654 pxor \XMMDst, \XMM1
418# combine GHASHed value with the corresponding ciphertext 655# combine GHASHed value with the corresponding ciphertext
419 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap 656 movdqa SHUF_MASK(%rip), %xmm14
420 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap 657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
421 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap 658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
422_initial_blocks_done\num_initial_blocks\operation: 663_initial_blocks_done\num_initial_blocks\operation:
664
423.endm 665.endm
424 666
425/* 667/*
@@ -428,7 +670,199 @@ _initial_blocks_done\num_initial_blocks\operation:
428* %arg1, %arg2, %arg3 are used as pointers only, not modified 670* %arg1, %arg2, %arg3 are used as pointers only, not modified
429* %r11 is the data offset value 671* %r11 is the data offset value
430*/ 672*/
431.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \ 673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675 # Interleaves the AES rounds for four new counter blocks with the GHASH of the four blocks passed in XMM1..XMM4
676 movdqa \XMM1, \XMM5 # keep the incoming XMM1..XMM4 for GHASH; XMM1..XMM4 are reloaded with counter blocks below
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15 # %xmm15 = byte-swap mask, used by every PSHUFB_XMM in this macro
682 # multiply XMM5 * HashKey_4 using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply XMM7 * HashKey_2 using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed left shift << 31
833 pslld $30, \TMP3 # packed left shift << 30
834 pslld $25, \TMP4 # packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847 psrld $1, \TMP2 # packed right shift >> 1
848 psrld $2, \TMP3 # packed right shift >> 2
849 psrld $7, \TMP4 # packed right shift >> 7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854 pxor \TMP1, \XMM5 # result is in XMM5
855
856 pxor \XMM5, \XMM1 # fold the reduced GHASH value into the first freshly byte-swapped output block
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* %arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
432TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
433 867
434 movdqa \XMM1, \XMM5 868 movdqa \XMM1, \XMM5
@@ -436,6 +870,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
436 movdqa \XMM3, \XMM7 870 movdqa \XMM3, \XMM7
437 movdqa \XMM4, \XMM8 871 movdqa \XMM4, \XMM8
438 872
873 movdqa SHUF_MASK(%rip), %xmm15
439 # multiply TMP5 * HashKey using karatsuba 874 # multiply TMP5 * HashKey using karatsuba
440 875
441 movdqa \XMM5, \TMP4 876 movdqa \XMM5, \TMP4
@@ -451,11 +886,12 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
451 movdqa \XMM0, \XMM3 886 movdqa \XMM0, \XMM3
452 paddd ONE(%rip), \XMM0 # INCR CNT 887 paddd ONE(%rip), \XMM0 # INCR CNT
453 movdqa \XMM0, \XMM4 888 movdqa \XMM0, \XMM4
454 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap 889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
455 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
456 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap 891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
457 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap 892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
458 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap 893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
459 pxor (%arg1), \XMM1 895 pxor (%arg1), \XMM1
460 pxor (%arg1), \XMM2 896 pxor (%arg1), \XMM2
461 pxor (%arg1), \XMM3 897 pxor (%arg1), \XMM3
@@ -553,37 +989,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
553 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
554 movdqu (%arg3,%r11,1), \TMP3 990 movdqu (%arg3,%r11,1), \TMP3
555 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
556.if \operation == dec
557 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
558 movdqa \TMP3, \XMM1 993 movdqa \TMP3, \XMM1
559.endif
560 movdqu 16(%arg3,%r11,1), \TMP3 994 movdqu 16(%arg3,%r11,1), \TMP3
561 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
562.if \operation == dec
563 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
564 movdqa \TMP3, \XMM2 997 movdqa \TMP3, \XMM2
565.endif
566 movdqu 32(%arg3,%r11,1), \TMP3 998 movdqu 32(%arg3,%r11,1), \TMP3
567 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
568.if \operation == dec
569 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
570 movdqa \TMP3, \XMM3 1001 movdqa \TMP3, \XMM3
571.endif
572 movdqu 48(%arg3,%r11,1), \TMP3 1002 movdqu 48(%arg3,%r11,1), \TMP3
573 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
574.if \operation == dec
575 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
576 movdqa \TMP3, \XMM4 1005 movdqa \TMP3, \XMM4
577.else 1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
578 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
579 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
580 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
581 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
582.endif
583 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
584 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
585 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
586 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte sway
587 1010
588 pxor \TMP4, \TMP1 1011 pxor \TMP4, \TMP1
589 pxor \XMM8, \XMM5 1012 pxor \XMM8, \XMM5
@@ -853,7 +1276,9 @@ ENTRY(aesni_gcm_dec)
853 and $~63, %rsp # align rsp to 64 bytes 1276 and $~63, %rsp # align rsp to 64 bytes
854 mov %arg6, %r12 1277 mov %arg6, %r12
855 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
856 pshufb SHUF_MASK(%rip), %xmm13 1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
857 1282
858# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
859 1284
@@ -885,22 +1310,22 @@ ENTRY(aesni_gcm_dec)
885 jb _initial_num_blocks_is_1_decrypt 1310 jb _initial_num_blocks_is_1_decrypt
886 je _initial_num_blocks_is_2_decrypt 1311 je _initial_num_blocks_is_2_decrypt
887_initial_num_blocks_is_3_decrypt: 1312_initial_num_blocks_is_3_decrypt:
888 INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
889%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
890 sub $48, %r13 1315 sub $48, %r13
891 jmp _initial_blocks_decrypted 1316 jmp _initial_blocks_decrypted
892_initial_num_blocks_is_2_decrypt: 1317_initial_num_blocks_is_2_decrypt:
893 INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
894%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
895 sub $32, %r13 1320 sub $32, %r13
896 jmp _initial_blocks_decrypted 1321 jmp _initial_blocks_decrypted
897_initial_num_blocks_is_1_decrypt: 1322_initial_num_blocks_is_1_decrypt:
898 INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
899%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
900 sub $16, %r13 1325 sub $16, %r13
901 jmp _initial_blocks_decrypted 1326 jmp _initial_blocks_decrypted
902_initial_num_blocks_is_0_decrypt: 1327_initial_num_blocks_is_0_decrypt:
903 INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
904%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
905_initial_blocks_decrypted: 1330_initial_blocks_decrypted:
906 cmp $0, %r13 1331 cmp $0, %r13
@@ -908,7 +1333,7 @@ _initial_blocks_decrypted:
908 sub $64, %r13 1333 sub $64, %r13
909 je _four_cipher_left_decrypt 1334 je _four_cipher_left_decrypt
910_decrypt_by_4: 1335_decrypt_by_4:
911 GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
912%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
913 add $64, %r11 1338 add $64, %r11
914 sub $64, %r13 1339 sub $64, %r13
@@ -924,7 +1349,9 @@ _zero_cipher_left_decrypt:
924 # Handle the last <16 byte block separately 1349 # Handle the last <16 byte block separately
925 1350
926 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
927 pshufb SHUF_MASK(%rip), %xmm0 1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
928 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
929 sub $16, %r11 1356 sub $16, %r11
930 add %r13, %r11 1357 add %r13, %r11
@@ -934,14 +1361,17 @@ _zero_cipher_left_decrypt:
934# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
935# (%r13 is the number of bytes in plaintext mod 16) 1362# (%r13 is the number of bytes in plaintext mod 16)
936 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
937 pshufb %xmm2, %xmm1 # right shift 16-%r13 butes 1364 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes
1365
938 movdqa %xmm1, %xmm2 1366 movdqa %xmm1, %xmm2
939 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
940 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
941 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
942 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
943 pand %xmm1, %xmm2 1371 pand %xmm1, %xmm2
944 pshufb SHUF_MASK(%rip),%xmm2 1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
945 pxor %xmm2, %xmm8 1375 pxor %xmm2, %xmm8
946 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
947 # GHASH computation for the last <16 byte block 1377 # GHASH computation for the last <16 byte block
@@ -949,13 +1379,13 @@ _zero_cipher_left_decrypt:
949 add $16, %r11 1379 add $16, %r11
950 1380
951 # output %r13 bytes 1381 # output %r13 bytes
952 movq %xmm0, %rax 1382 MOVQ_R64_XMM %xmm0, %rax
953 cmp $8, %r13 1383 cmp $8, %r13
954 jle _less_than_8_bytes_left_decrypt 1384 jle _less_than_8_bytes_left_decrypt
955 mov %rax, (%arg2 , %r11, 1) 1385 mov %rax, (%arg2 , %r11, 1)
956 add $8, %r11 1386 add $8, %r11
957 psrldq $8, %xmm0 1387 psrldq $8, %xmm0
958 movq %xmm0, %rax 1388 MOVQ_R64_XMM %xmm0, %rax
959 sub $8, %r13 1389 sub $8, %r13
960_less_than_8_bytes_left_decrypt: 1390_less_than_8_bytes_left_decrypt:
961 mov %al, (%arg2, %r11, 1) 1391 mov %al, (%arg2, %r11, 1)
@@ -968,13 +1398,15 @@ _multiple_of_16_bytes_decrypt:
968 shl $3, %r12 # convert into number of bits 1398 shl $3, %r12 # convert into number of bits
969 movd %r12d, %xmm15 # len(A) in %xmm15 1399 movd %r12d, %xmm15 # len(A) in %xmm15
970 shl $3, %arg4 # len(C) in bits (*128) 1400 shl $3, %arg4 # len(C) in bits (*128)
971 movq %arg4, %xmm1 1401 MOVQ_R64_XMM %arg4, %xmm1
972 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
973 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
974 pxor %xmm15, %xmm8 1404 pxor %xmm15, %xmm8
975 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
976 # final GHASH computation 1406 # final GHASH computation
977 pshufb SHUF_MASK(%rip), %xmm8 1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
978 mov %arg5, %rax # %rax = *Y0 1410 mov %arg5, %rax # %rax = *Y0
979 movdqu (%rax), %xmm0 # %xmm0 = Y0 1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
980 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
@@ -987,11 +1419,11 @@ _return_T_decrypt:
987 cmp $12, %r11 1419 cmp $12, %r11
988 je _T_12_decrypt 1420 je _T_12_decrypt
989_T_8_decrypt: 1421_T_8_decrypt:
990 movq %xmm0, %rax 1422 MOVQ_R64_XMM %xmm0, %rax
991 mov %rax, (%r10) 1423 mov %rax, (%r10)
992 jmp _return_T_done_decrypt 1424 jmp _return_T_done_decrypt
993_T_12_decrypt: 1425_T_12_decrypt:
994 movq %xmm0, %rax 1426 MOVQ_R64_XMM %xmm0, %rax
995 mov %rax, (%r10) 1427 mov %rax, (%r10)
996 psrldq $8, %xmm0 1428 psrldq $8, %xmm0
997 movd %xmm0, %eax 1429 movd %xmm0, %eax
@@ -1103,7 +1535,9 @@ ENTRY(aesni_gcm_enc)
1103 and $~63, %rsp 1535 and $~63, %rsp
1104 mov %arg6, %r12 1536 mov %arg6, %r12
1105 movdqu (%r12), %xmm13 1537 movdqu (%r12), %xmm13
1106 pshufb SHUF_MASK(%rip), %xmm13 1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1107 1541
1108# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1109 1543
@@ -1134,22 +1568,22 @@ ENTRY(aesni_gcm_enc)
1134 jb _initial_num_blocks_is_1_encrypt 1568 jb _initial_num_blocks_is_1_encrypt
1135 je _initial_num_blocks_is_2_encrypt 1569 je _initial_num_blocks_is_2_encrypt
1136_initial_num_blocks_is_3_encrypt: 1570_initial_num_blocks_is_3_encrypt:
1137 INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1138%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1139 sub $48, %r13 1573 sub $48, %r13
1140 jmp _initial_blocks_encrypted 1574 jmp _initial_blocks_encrypted
1141_initial_num_blocks_is_2_encrypt: 1575_initial_num_blocks_is_2_encrypt:
1142 INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1143%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1144 sub $32, %r13 1578 sub $32, %r13
1145 jmp _initial_blocks_encrypted 1579 jmp _initial_blocks_encrypted
1146_initial_num_blocks_is_1_encrypt: 1580_initial_num_blocks_is_1_encrypt:
1147 INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1148%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1149 sub $16, %r13 1583 sub $16, %r13
1150 jmp _initial_blocks_encrypted 1584 jmp _initial_blocks_encrypted
1151_initial_num_blocks_is_0_encrypt: 1585_initial_num_blocks_is_0_encrypt:
1152 INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1153%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1154_initial_blocks_encrypted: 1588_initial_blocks_encrypted:
1155 1589
@@ -1160,7 +1594,7 @@ _initial_blocks_encrypted:
1160 sub $64, %r13 1594 sub $64, %r13
1161 je _four_cipher_left_encrypt 1595 je _four_cipher_left_encrypt
1162_encrypt_by_4_encrypt: 1596_encrypt_by_4_encrypt:
1163 GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1164%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1165 add $64, %r11 1599 add $64, %r11
1166 sub $64, %r13 1600 sub $64, %r13
@@ -1175,7 +1609,9 @@ _zero_cipher_left_encrypt:
1175 1609
1176 # Handle the last <16 Byte block separately 1610 # Handle the last <16 Byte block separately
1177 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1178 pshufb SHUF_MASK(%rip), %xmm0 1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1179 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1615 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1180 sub $16, %r11 1616 sub $16, %r11
1181 add %r13, %r11 1617 add %r13, %r11
@@ -1185,29 +1621,31 @@ _zero_cipher_left_encrypt:
1185 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 1621 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1186 # (%r13 is the number of bytes in plaintext mod 16) 1622 # (%r13 is the number of bytes in plaintext mod 16)
1187 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1623 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1188 pshufb %xmm2, %xmm1 # shift right 16-r13 byte 1624 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
1189 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1625 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1190 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1626 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1191 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1627 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1192 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1628 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1629 movdqa SHUF_MASK(%rip), %xmm10
1630 PSHUFB_XMM %xmm10,%xmm0
1193 1631
1194 pshufb SHUF_MASK(%rip),%xmm0
1195 pxor %xmm0, %xmm8 1632 pxor %xmm0, %xmm8
1196 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1633 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1197 # GHASH computation for the last <16 byte block 1634 # GHASH computation for the last <16 byte block
1198 sub %r13, %r11 1635 sub %r13, %r11
1199 add $16, %r11 1636 add $16, %r11
1200 pshufb SHUF_MASK(%rip), %xmm0 1637 PSHUFB_XMM %xmm10, %xmm1
1638
1201 # shuffle xmm0 back to output as ciphertext 1639 # shuffle xmm0 back to output as ciphertext
1202 1640
1203 # Output %r13 bytes 1641 # Output %r13 bytes
1204 movq %xmm0, %rax 1642 MOVQ_R64_XMM %xmm0, %rax
1205 cmp $8, %r13 1643 cmp $8, %r13
1206 jle _less_than_8_bytes_left_encrypt 1644 jle _less_than_8_bytes_left_encrypt
1207 mov %rax, (%arg2 , %r11, 1) 1645 mov %rax, (%arg2 , %r11, 1)
1208 add $8, %r11 1646 add $8, %r11
1209 psrldq $8, %xmm0 1647 psrldq $8, %xmm0
1210 movq %xmm0, %rax 1648 MOVQ_R64_XMM %xmm0, %rax
1211 sub $8, %r13 1649 sub $8, %r13
1212_less_than_8_bytes_left_encrypt: 1650_less_than_8_bytes_left_encrypt:
1213 mov %al, (%arg2, %r11, 1) 1651 mov %al, (%arg2, %r11, 1)
@@ -1220,14 +1658,15 @@ _multiple_of_16_bytes_encrypt:
1220 shl $3, %r12 1658 shl $3, %r12
1221 movd %r12d, %xmm15 # len(A) in %xmm15 1659 movd %r12d, %xmm15 # len(A) in %xmm15
1222 shl $3, %arg4 # len(C) in bits (shift by 3 = *8) 1660 shl $3, %arg4 # len(C) in bits (shift by 3 = *8)
1223 movq %arg4, %xmm1 1661 MOVQ_R64_XMM %arg4, %xmm1
1224 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1662 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1225 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1663 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1226 pxor %xmm15, %xmm8 1664 pxor %xmm15, %xmm8
1227 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1665 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1228 # final GHASH computation 1666 # final GHASH computation
1667 movdqa SHUF_MASK(%rip), %xmm10
1668 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1229 1669
1230 pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
1231 mov %arg5, %rax # %rax = *Y0 1670 mov %arg5, %rax # %rax = *Y0
1232 movdqu (%rax), %xmm0 # %xmm0 = Y0 1671 movdqu (%rax), %xmm0 # %xmm0 = Y0
1233 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1672 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
@@ -1240,11 +1679,11 @@ _return_T_encrypt:
1240 cmp $12, %r11 1679 cmp $12, %r11
1241 je _T_12_encrypt 1680 je _T_12_encrypt
1242_T_8_encrypt: 1681_T_8_encrypt:
1243 movq %xmm0, %rax 1682 MOVQ_R64_XMM %xmm0, %rax
1244 mov %rax, (%r10) 1683 mov %rax, (%r10)
1245 jmp _return_T_done_encrypt 1684 jmp _return_T_done_encrypt
1246_T_12_encrypt: 1685_T_12_encrypt:
1247 movq %xmm0, %rax 1686 MOVQ_R64_XMM %xmm0, %rax
1248 mov %rax, (%r10) 1687 mov %rax, (%r10)
1249 psrldq $8, %xmm0 1688 psrldq $8, %xmm0
1250 movd %xmm0, %eax 1689 movd %xmm0, %eax
@@ -1258,6 +1697,7 @@ _return_T_done_encrypt:
1258 pop %r13 1697 pop %r13
1259 pop %r12 1698 pop %r12
1260 ret 1699 ret
1700
1261#endif 1701#endif
1262 1702
1263 1703