diff options
Diffstat (limited to 'arch/x86/crypto/serpent-avx-x86_64-asm_64.S')
-rw-r--r-- | arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 |
1 files changed, 108 insertions, 58 deletions
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a2..02b0e9fe997c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |||
@@ -24,7 +24,16 @@ | |||
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include "glue_helper-asm-avx.S" | ||
28 | |||
27 | .file "serpent-avx-x86_64-asm_64.S" | 29 | .file "serpent-avx-x86_64-asm_64.S" |
30 | |||
31 | .data | ||
32 | .align 16 | ||
33 | |||
34 | .Lbswap128_mask: | ||
35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
36 | |||
28 | .text | 37 | .text |
29 | 38 | ||
30 | #define CTX %rdi | 39 | #define CTX %rdi |
@@ -550,51 +559,27 @@ | |||
550 | vpunpcklqdq x3, t2, x2; \ | 559 | vpunpcklqdq x3, t2, x2; \ |
551 | vpunpckhqdq x3, t2, x3; | 560 | vpunpckhqdq x3, t2, x3; |
552 | 561 | ||
553 | #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ | 562 | #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
554 | vmovdqu (0*4*4)(in), x0; \ | ||
555 | vmovdqu (1*4*4)(in), x1; \ | ||
556 | vmovdqu (2*4*4)(in), x2; \ | ||
557 | vmovdqu (3*4*4)(in), x3; \ | ||
558 | \ | ||
559 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | 563 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
560 | 564 | ||
561 | #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | 565 | #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
562 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | 566 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
563 | \ | ||
564 | vmovdqu x0, (0*4*4)(out); \ | ||
565 | vmovdqu x1, (1*4*4)(out); \ | ||
566 | vmovdqu x2, (2*4*4)(out); \ | ||
567 | vmovdqu x3, (3*4*4)(out); | ||
568 | |||
569 | #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | ||
570 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
571 | \ | ||
572 | vpxor (0*4*4)(out), x0, x0; \ | ||
573 | vmovdqu x0, (0*4*4)(out); \ | ||
574 | vpxor (1*4*4)(out), x1, x1; \ | ||
575 | vmovdqu x1, (1*4*4)(out); \ | ||
576 | vpxor (2*4*4)(out), x2, x2; \ | ||
577 | vmovdqu x2, (2*4*4)(out); \ | ||
578 | vpxor (3*4*4)(out), x3, x3; \ | ||
579 | vmovdqu x3, (3*4*4)(out); | ||
580 | 567 | ||
581 | .align 8 | 568 | .align 8 |
582 | .global __serpent_enc_blk_8way_avx | 569 | .type __serpent_enc_blk8_avx,@function; |
583 | .type __serpent_enc_blk_8way_avx,@function; | ||
584 | 570 | ||
585 | __serpent_enc_blk_8way_avx: | 571 | __serpent_enc_blk8_avx: |
586 | /* input: | 572 | /* input: |
587 | * %rdi: ctx, CTX | 573 | * %rdi: ctx, CTX |
588 | * %rsi: dst | 574 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks |
589 | * %rdx: src | 575 | * output: |
590 | * %rcx: bool, if true: xor output | 576 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
591 | */ | 577 | */ |
592 | 578 | ||
593 | vpcmpeqd RNOT, RNOT, RNOT; | 579 | vpcmpeqd RNOT, RNOT, RNOT; |
594 | 580 | ||
595 | leaq (4*4*4)(%rdx), %rax; | 581 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
596 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 582 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
597 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
598 | 583 | ||
599 | K2(RA, RB, RC, RD, RE, 0); | 584 | K2(RA, RB, RC, RD, RE, 0); |
600 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); | 585 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); |
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx: | |||
630 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); | 615 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); |
631 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); | 616 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); |
632 | 617 | ||
633 | leaq (4*4*4)(%rsi), %rax; | 618 | write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
634 | 619 | write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | |
635 | testb %cl, %cl; | ||
636 | jnz __enc_xor8; | ||
637 | |||
638 | write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
639 | write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
640 | |||
641 | ret; | ||
642 | |||
643 | __enc_xor8: | ||
644 | xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
645 | xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
646 | 620 | ||
647 | ret; | 621 | ret; |
648 | 622 | ||
649 | .align 8 | 623 | .align 8 |
650 | .global serpent_dec_blk_8way_avx | 624 | .type __serpent_dec_blk8_avx,@function; |
651 | .type serpent_dec_blk_8way_avx,@function; | ||
652 | 625 | ||
653 | serpent_dec_blk_8way_avx: | 626 | __serpent_dec_blk8_avx: |
654 | /* input: | 627 | /* input: |
655 | * %rdi: ctx, CTX | 628 | * %rdi: ctx, CTX |
656 | * %rsi: dst | 629 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
657 | * %rdx: src | 630 | * output: |
631 | * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks | ||
658 | */ | 632 | */ |
659 | 633 | ||
660 | vpcmpeqd RNOT, RNOT, RNOT; | 634 | vpcmpeqd RNOT, RNOT, RNOT; |
661 | 635 | ||
662 | leaq (4*4*4)(%rdx), %rax; | 636 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
663 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 637 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
664 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
665 | 638 | ||
666 | K2(RA, RB, RC, RD, RE, 32); | 639 | K2(RA, RB, RC, RD, RE, 32); |
667 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); | 640 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); |
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx: | |||
697 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); | 670 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); |
698 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); | 671 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); |
699 | 672 | ||
700 | leaq (4*4*4)(%rsi), %rax; | 673 | write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); |
701 | write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); | 674 | write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); |
702 | write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); | 675 | |
676 | ret; | ||
677 | |||
678 | .align 8 | ||
679 | .global serpent_ecb_enc_8way_avx | ||
680 | .type serpent_ecb_enc_8way_avx,@function; | ||
681 | |||
682 | serpent_ecb_enc_8way_avx: | ||
683 | /* input: | ||
684 | * %rdi: ctx, CTX | ||
685 | * %rsi: dst | ||
686 | * %rdx: src | ||
687 | */ | ||
688 | |||
689 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
690 | |||
691 | call __serpent_enc_blk8_avx; | ||
692 | |||
693 | store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
694 | |||
695 | ret; | ||
696 | |||
697 | .align 8 | ||
698 | .global serpent_ecb_dec_8way_avx | ||
699 | .type serpent_ecb_dec_8way_avx,@function; | ||
700 | |||
701 | serpent_ecb_dec_8way_avx: | ||
702 | /* input: | ||
703 | * %rdi: ctx, CTX | ||
704 | * %rsi: dst | ||
705 | * %rdx: src | ||
706 | */ | ||
707 | |||
708 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
709 | |||
710 | call __serpent_dec_blk8_avx; | ||
711 | |||
712 | store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
713 | |||
714 | ret; | ||
715 | |||
716 | .align 8 | ||
717 | .global serpent_cbc_dec_8way_avx | ||
718 | .type serpent_cbc_dec_8way_avx,@function; | ||
719 | |||
720 | serpent_cbc_dec_8way_avx: | ||
721 | /* input: | ||
722 | * %rdi: ctx, CTX | ||
723 | * %rsi: dst | ||
724 | * %rdx: src | ||
725 | */ | ||
726 | |||
727 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
728 | |||
729 | call __serpent_dec_blk8_avx; | ||
730 | |||
731 | store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
732 | |||
733 | ret; | ||
734 | |||
735 | .align 8 | ||
736 | .global serpent_ctr_8way_avx | ||
737 | .type serpent_ctr_8way_avx,@function; | ||
738 | |||
739 | serpent_ctr_8way_avx: | ||
740 | /* input: | ||
741 | * %rdi: ctx, CTX | ||
742 | * %rsi: dst | ||
743 | * %rdx: src | ||
744 | * %rcx: iv (little endian, 128bit) | ||
745 | */ | ||
746 | |||
747 | load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
748 | RD2, RK0, RK1, RK2); | ||
749 | |||
750 | call __serpent_enc_blk8_avx; | ||
751 | |||
752 | store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
703 | 753 | ||
704 | ret; | 754 | ret; |