diff options
Diffstat (limited to 'arch/x86/crypto/serpent-avx-x86_64-asm_64.S')
| -rw-r--r-- | arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 |
1 files changed, 108 insertions, 58 deletions
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a2..02b0e9fe997c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |||
| @@ -24,7 +24,16 @@ | |||
| 24 | * | 24 | * |
| 25 | */ | 25 | */ |
| 26 | 26 | ||
| 27 | #include "glue_helper-asm-avx.S" | ||
| 28 | |||
| 27 | .file "serpent-avx-x86_64-asm_64.S" | 29 | .file "serpent-avx-x86_64-asm_64.S" |
| 30 | |||
| 31 | .data | ||
| 32 | .align 16 | ||
| 33 | |||
| 34 | .Lbswap128_mask: | ||
| 35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
| 36 | |||
| 28 | .text | 37 | .text |
| 29 | 38 | ||
| 30 | #define CTX %rdi | 39 | #define CTX %rdi |
| @@ -550,51 +559,27 @@ | |||
| 550 | vpunpcklqdq x3, t2, x2; \ | 559 | vpunpcklqdq x3, t2, x2; \ |
| 551 | vpunpckhqdq x3, t2, x3; | 560 | vpunpckhqdq x3, t2, x3; |
| 552 | 561 | ||
| 553 | #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ | 562 | #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
| 554 | vmovdqu (0*4*4)(in), x0; \ | ||
| 555 | vmovdqu (1*4*4)(in), x1; \ | ||
| 556 | vmovdqu (2*4*4)(in), x2; \ | ||
| 557 | vmovdqu (3*4*4)(in), x3; \ | ||
| 558 | \ | ||
| 559 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | 563 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
| 560 | 564 | ||
| 561 | #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | 565 | #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
| 562 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | 566 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
| 563 | \ | ||
| 564 | vmovdqu x0, (0*4*4)(out); \ | ||
| 565 | vmovdqu x1, (1*4*4)(out); \ | ||
| 566 | vmovdqu x2, (2*4*4)(out); \ | ||
| 567 | vmovdqu x3, (3*4*4)(out); | ||
| 568 | |||
| 569 | #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | ||
| 570 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
| 571 | \ | ||
| 572 | vpxor (0*4*4)(out), x0, x0; \ | ||
| 573 | vmovdqu x0, (0*4*4)(out); \ | ||
| 574 | vpxor (1*4*4)(out), x1, x1; \ | ||
| 575 | vmovdqu x1, (1*4*4)(out); \ | ||
| 576 | vpxor (2*4*4)(out), x2, x2; \ | ||
| 577 | vmovdqu x2, (2*4*4)(out); \ | ||
| 578 | vpxor (3*4*4)(out), x3, x3; \ | ||
| 579 | vmovdqu x3, (3*4*4)(out); | ||
| 580 | 567 | ||
| 581 | .align 8 | 568 | .align 8 |
| 582 | .global __serpent_enc_blk_8way_avx | 569 | .type __serpent_enc_blk8_avx,@function; |
| 583 | .type __serpent_enc_blk_8way_avx,@function; | ||
| 584 | 570 | ||
| 585 | __serpent_enc_blk_8way_avx: | 571 | __serpent_enc_blk8_avx: |
| 586 | /* input: | 572 | /* input: |
| 587 | * %rdi: ctx, CTX | 573 | * %rdi: ctx, CTX |
| 588 | * %rsi: dst | 574 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks |
| 589 | * %rdx: src | 575 | * output: |
| 590 | * %rcx: bool, if true: xor output | 576 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
| 591 | */ | 577 | */ |
| 592 | 578 | ||
| 593 | vpcmpeqd RNOT, RNOT, RNOT; | 579 | vpcmpeqd RNOT, RNOT, RNOT; |
| 594 | 580 | ||
| 595 | leaq (4*4*4)(%rdx), %rax; | 581 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
| 596 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 582 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
| 597 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
| 598 | 583 | ||
| 599 | K2(RA, RB, RC, RD, RE, 0); | 584 | K2(RA, RB, RC, RD, RE, 0); |
| 600 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); | 585 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); |
| @@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx: | |||
| 630 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); | 615 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); |
| 631 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); | 616 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); |
| 632 | 617 | ||
| 633 | leaq (4*4*4)(%rsi), %rax; | 618 | write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
| 634 | 619 | write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | |
| 635 | testb %cl, %cl; | ||
| 636 | jnz __enc_xor8; | ||
| 637 | |||
| 638 | write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
| 639 | write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
| 640 | |||
| 641 | ret; | ||
| 642 | |||
| 643 | __enc_xor8: | ||
| 644 | xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
| 645 | xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
| 646 | 620 | ||
| 647 | ret; | 621 | ret; |
| 648 | 622 | ||
| 649 | .align 8 | 623 | .align 8 |
| 650 | .global serpent_dec_blk_8way_avx | 624 | .type __serpent_dec_blk8_avx,@function; |
| 651 | .type serpent_dec_blk_8way_avx,@function; | ||
| 652 | 625 | ||
| 653 | serpent_dec_blk_8way_avx: | 626 | __serpent_dec_blk8_avx: |
| 654 | /* input: | 627 | /* input: |
| 655 | * %rdi: ctx, CTX | 628 | * %rdi: ctx, CTX |
| 656 | * %rsi: dst | 629 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
| 657 | * %rdx: src | 630 | * output: |
| 631 | * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks | ||
| 658 | */ | 632 | */ |
| 659 | 633 | ||
| 660 | vpcmpeqd RNOT, RNOT, RNOT; | 634 | vpcmpeqd RNOT, RNOT, RNOT; |
| 661 | 635 | ||
| 662 | leaq (4*4*4)(%rdx), %rax; | 636 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
| 663 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 637 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
| 664 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
| 665 | 638 | ||
| 666 | K2(RA, RB, RC, RD, RE, 32); | 639 | K2(RA, RB, RC, RD, RE, 32); |
| 667 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); | 640 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); |
| @@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx: | |||
| 697 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); | 670 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); |
| 698 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); | 671 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); |
| 699 | 672 | ||
| 700 | leaq (4*4*4)(%rsi), %rax; | 673 | write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); |
| 701 | write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); | 674 | write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); |
| 702 | write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); | 675 | |
| 676 | ret; | ||
| 677 | |||
| 678 | .align 8 | ||
| 679 | .global serpent_ecb_enc_8way_avx | ||
| 680 | .type serpent_ecb_enc_8way_avx,@function; | ||
| 681 | |||
| 682 | serpent_ecb_enc_8way_avx: | ||
| 683 | /* input: | ||
| 684 | * %rdi: ctx, CTX | ||
| 685 | * %rsi: dst | ||
| 686 | * %rdx: src | ||
| 687 | */ | ||
| 688 | |||
| 689 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
| 690 | |||
| 691 | call __serpent_enc_blk8_avx; | ||
| 692 | |||
| 693 | store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
| 694 | |||
| 695 | ret; | ||
| 696 | |||
| 697 | .align 8 | ||
| 698 | .global serpent_ecb_dec_8way_avx | ||
| 699 | .type serpent_ecb_dec_8way_avx,@function; | ||
| 700 | |||
| 701 | serpent_ecb_dec_8way_avx: | ||
| 702 | /* input: | ||
| 703 | * %rdi: ctx, CTX | ||
| 704 | * %rsi: dst | ||
| 705 | * %rdx: src | ||
| 706 | */ | ||
| 707 | |||
| 708 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
| 709 | |||
| 710 | call __serpent_dec_blk8_avx; | ||
| 711 | |||
| 712 | store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
| 713 | |||
| 714 | ret; | ||
| 715 | |||
| 716 | .align 8 | ||
| 717 | .global serpent_cbc_dec_8way_avx | ||
| 718 | .type serpent_cbc_dec_8way_avx,@function; | ||
| 719 | |||
| 720 | serpent_cbc_dec_8way_avx: | ||
| 721 | /* input: | ||
| 722 | * %rdi: ctx, CTX | ||
| 723 | * %rsi: dst | ||
| 724 | * %rdx: src | ||
| 725 | */ | ||
| 726 | |||
| 727 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
| 728 | |||
| 729 | call __serpent_dec_blk8_avx; | ||
| 730 | |||
| 731 | store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
| 732 | |||
| 733 | ret; | ||
| 734 | |||
| 735 | .align 8 | ||
| 736 | .global serpent_ctr_8way_avx | ||
| 737 | .type serpent_ctr_8way_avx,@function; | ||
| 738 | |||
| 739 | serpent_ctr_8way_avx: | ||
| 740 | /* input: | ||
| 741 | * %rdi: ctx, CTX | ||
| 742 | * %rsi: dst | ||
| 743 | * %rdx: src | ||
| 744 | * %rcx: iv (little endian, 128bit) | ||
| 745 | */ | ||
| 746 | |||
| 747 | load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
| 748 | RD2, RK0, RK1, RK2); | ||
| 749 | |||
| 750 | call __serpent_enc_blk8_avx; | ||
| 751 | |||
| 752 | store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
| 703 | 753 | ||
| 704 | ret; | 754 | ret; |
