Diffstat (limited to 'arch/x86/crypto/serpent-avx-x86_64-asm_64.S')
 arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166
 1 file changed, 108 insertions(+), 58 deletions(-)
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 504106bf04a2..02b0e9fe997c 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,7 +24,16 @@
  *
  */

+#include "glue_helper-asm-avx.S"
+
 .file "serpent-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
 .text

 #define CTX %rdi
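
The new .Lbswap128_mask table is a vpshufb control mask: destination byte i receives source byte 15-i, so the shuffle reverses the byte order of a 128-bit lane. load_ctr_8way in the newly included glue_helper-asm-avx.S takes this mask as a parameter (see the CTR routine at the end of the patch), presumably so the counter can be incremented with cheap little-endian arithmetic and then byte-swapped into the big-endian form that is actually encrypted. A minimal illustration of the mask in use, assuming %xmm0 holds a counter value and %xmm1 is free (not part of the patch):

	/* Illustration only: byte-reverse one 128-bit lane.  vpshufb   */
	/* fills each destination byte i with the source byte selected */
	/* by mask byte i; with the mask 15, 14, ..., 0 this swaps the */
	/* lane between little- and big-endian byte order.             */
	vmovdqa .Lbswap128_mask, %xmm1;
	vpshufb %xmm1, %xmm0, %xmm0;
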
@@ -550,51 +559,27 @@
 	vpunpcklqdq		x3, t2, x2; \
 	vpunpckhqdq		x3, t2, x3;

-#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
-	vmovdqu (2*4*4)(in),	x2; \
-	vmovdqu (3*4*4)(in),	x3; \
-	\
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vmovdqu x0,		(0*4*4)(out); \
-	vmovdqu x1,		(1*4*4)(out); \
-	vmovdqu x2,		(2*4*4)(out); \
-	vmovdqu x3,		(3*4*4)(out);
-
-#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor (0*4*4)(out),	x0, x0; \
-	vmovdqu x0,		(0*4*4)(out); \
-	vpxor (1*4*4)(out),	x1, x1; \
-	vmovdqu x1,		(1*4*4)(out); \
-	vpxor (2*4*4)(out),	x2, x2; \
-	vmovdqu x2,		(2*4*4)(out); \
-	vpxor (3*4*4)(out),	x3, x3; \
-	vmovdqu x3,		(3*4*4)(out);
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

 .align 8
-.global __serpent_enc_blk_8way_avx
-.type   __serpent_enc_blk_8way_avx,@function;
+.type   __serpent_enc_blk8_avx,@function;

-__serpent_enc_blk_8way_avx:
+__serpent_enc_blk8_avx:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */

 	vpcmpeqd RNOT, RNOT, RNOT;

-	leaq (4*4*4)(%rdx), %rax;
-	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

 	K2(RA, RB, RC, RD, RE, 0);
 	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
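
With the loads and stores gone, read_blocks and write_blocks reduce to the bitslicing transpose, and all memory traffic moves into the shared 8-way helpers from glue_helper-asm-avx.S. That header is not part of this diff; judging from the removed read_blocks body and the call sites below, load_8way is expected to be little more than one unaligned 16-byte move per 128-bit block, eight blocks per call, with store_8way as its mirror image:

/* Assumed shape of the shared load helper (glue_helper-asm-avx.S is
 * not shown in this diff); store_8way would mirror it with
 * "vmovdqu x0, (0*16)(dst);" and so on. */
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*16)(src), x0; \
	vmovdqu (1*16)(src), x1; \
	vmovdqu (2*16)(src), x2; \
	vmovdqu (3*16)(src), x3; \
	vmovdqu (4*16)(src), x4; \
	vmovdqu (5*16)(src), x5; \
	vmovdqu (6*16)(src), x6; \
	vmovdqu (7*16)(src), x7;
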
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
 	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
 	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

-	leaq (4*4*4)(%rsi), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
-
-	ret;
-
-__enc_xor8:
-	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

 	ret;

 .align 8
-.global serpent_dec_blk_8way_avx
-.type   serpent_dec_blk_8way_avx,@function;
+.type   __serpent_dec_blk8_avx,@function;

-serpent_dec_blk_8way_avx:
+__serpent_dec_blk8_avx:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
 	 */

 	vpcmpeqd RNOT, RNOT, RNOT;

-	leaq (4*4*4)(%rdx), %rax;
-	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

 	K2(RA, RB, RC, RD, RE, 32);
 	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
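
The bool "xor output" parameter, the __enc_xor8 label and the xor_blocks macro are all gone: whether a result is stored plainly or XORed into other data is now expressed by the caller's choice of store helper. For the CBC path below, store_cbc_8way presumably chains each decrypted block with the preceding ciphertext block, which is still available at src, leaving only the first block's XOR with the IV to the C glue code. A sketch under those assumptions (the real macro lives in glue_helper-asm-avx.S, outside this diff):

/* Hypothetical store_cbc_8way: block 0 is stored as-is (its IV xor is
 * assumed to be done by the C glue code); blocks 1..7 are XORed with
 * ciphertext blocks 0..6 re-read from src before being stored. */
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x1, x1; \
	vpxor (1*16)(src), x2, x2; \
	vpxor (2*16)(src), x3, x3; \
	vpxor (3*16)(src), x4, x4; \
	vpxor (4*16)(src), x5, x5; \
	vpxor (5*16)(src), x6, x6; \
	vpxor (6*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
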
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
 	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
 	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

-	leaq (4*4*4)(%rsi), %rax;
-	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
-	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_ecb_enc_8way_avx
+.type   serpent_ecb_enc_8way_avx,@function;
+
+serpent_ecb_enc_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk8_avx;
+
+	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global serpent_ecb_dec_8way_avx
+.type   serpent_ecb_dec_8way_avx,@function;
+
+serpent_ecb_dec_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk8_avx;
+
+	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+
+.align 8
+.global serpent_cbc_dec_8way_avx
+.type   serpent_cbc_dec_8way_avx,@function;
+
+serpent_cbc_dec_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk8_avx;
+
+	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+
+.align 8
+.global serpent_ctr_8way_avx
+.type   serpent_ctr_8way_avx,@function;
+
+serpent_ctr_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RK0, RK1, RK2);
+
+	call __serpent_enc_blk8_avx;
+
+	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

 	ret;
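
In the CTR routine, the two remaining helpers pair up around the encryption call: load_ctr_8way expands the 128-bit IV into eight consecutive counter blocks (byte-swapping each one via .Lbswap128_mask and using RK0-RK2 as temporaries), and store_ctr_8way turns the encrypted counters into ciphertext by XORing them with the plaintext at src. Assuming that division of labor, store_ctr_8way would look roughly like this:

/* Hypothetical store_ctr_8way: XOR the keystream (the eight encrypted
 * counter blocks) with the plaintext read from src, storing the
 * resulting ciphertext to dst. */
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x0, x0; \
	vpxor (1*16)(src), x1, x1; \
	vpxor (2*16)(src), x2, x2; \
	vpxor (3*16)(src), x3, x3; \
	vpxor (4*16)(src), x4, x4; \
	vpxor (5*16)(src), x5, x5; \
	vpxor (6*16)(src), x6, x6; \
	vpxor (7*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);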