[CRYPTO] twofish: x86-64 assembly version

The patch passed the tcrypt tests and automated filesystem tests. This rewrite resulted in some nice performance increase over my last patch. Short summary of the tcrypt benchmarks: Twofish Assembler vs. Twofish C (256bit 8kb block CBC) encrypt: -27% Cycles decrypt: -23% Cycles Twofish Assembler vs. AES Assembler (128bit 8kb block CBC) encrypt: +18% Cycles decrypt: +15% Cycles Twofish Assembler vs. AES Assembler (256bit 8kb block CBC) encrypt: -9% Cycles decrypt: -8% Cycles Full Output: http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-c-x86_64.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-asm-x86_64.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-aes-asm-x86_64.txt Here is another bonnie++ benchmark with encrypted filesystems. Most runs maxed out the hd. It should give some idea what the module can do for encrypted filesystem performance even though you can't see the full numbers. http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060610_130806_x86_64.html Signed-off-by: Joachim Fritschi <jfritschi@freenet.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Joachim Fritschi <jfritschi@freenet.de> 2006-06-20 07:12:02 -0400
committer: Herbert Xu <herbert@gondor.apana.org.au> 2006-09-20 21:16:29 -0400
commit: eaf44088ff467410dd15a033fef118888002ffe6 (patch)
tree: 72b225b910342ae74e1b0915ceff61b4ead97883 /arch
parent: b9f535ffe38f7eb61ac2219d32d97c377b69f70d (diff)
3 files changed, 424 insertions, 0 deletions
diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile
index 426d20f4b72e..15b538a8b7f7 100644
--- a/arch/x86_64/crypto/Makefile
+++ b/arch/x86_64/crypto/Makefile
@@ -5,5 +5,8 @@
 # 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o twofish.o
diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S
new file mode 100644
index 000000000000..35974a586615
--- /dev/null
+++ b/arch/x86_64/crypto/twofish-x86_64-asm.S
@@ -0,0 +1,324 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+.file "twofish-x86_64-asm.S"
+.text
+#include <asm/asm-offsets.h>
+#define a_offset        0       /* byte offset of word a inside a 16 byte group */
+#define b_offset        4       /* byte offset of word b */
+#define c_offset        8       /* byte offset of word c */
+#define d_offset        12      /* byte offset of word d */
+/* Byte offsets of the fields inside the crypto context that the round
+ * macros address through %r11. S0 itself is addressed with a bare
+ * (%r11,...) operand, i.e. offset 0.
+ * NOTE(review): these offsets have to match the C-side context layout (the
+ * glue code sizes the context as struct twofish_ctx) - confirm there. */
+#define s0      0       /* S0 array, 256 32-bit words (1024 bytes) */
+#define s1      1024    /* S1 array */
+#define s2      2048    /* S2 array */
+#define s3      3072    /* S3 array */
+#define w       4096    /* 8 whitening key words (32 bytes) */
+#define k       4128    /* round key words 1-32, two consecutive words per round */
+/* Register aliases to allow macro substitution: the round macros paste a
+ * D (32 bit), B (low byte) or H (second byte) suffix onto the alias name
+ * with ## to select the matching sub-register of the same register. */
+#define R0     %rax
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+#define R1     %rbx
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+#define R2     %rcx
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+#define R3     %rdx
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+/* performs input whitening: xors src with the whitening key material found
+ * at byte offset w+offset inside context. The callers pass a 64 bit register,
+ * so two consecutive 32 bit key words are applied in one instruction. */
+#define input_whitening(src,context,offset)\
+        xor     w+offset(context),      src;
+/* performs output whitening: same as input_whitening but uses the key
+ * material 16 bytes further on (w+16+offset), i.e. the second half of the
+ * whitening key area. */
+#define output_whitening(src,context,offset)\
+        xor     w+16+offset(context),   src;
+/*
+ * One encryption round.
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * round byte offset of this round's key pair relative to k
+ * operations on a and b are interleaved to increase performance
+ *
+ * %r11 has to hold the context address; %edi, %r8d and %r9d are scratch.
+ * %r9d collects the four table lookups indexed by the bytes of a, %r8d
+ * the four lookups indexed by the bytes of b. The two values are then
+ * cross-added (r9 += r8, r8 += r9), the round keys at k+round and
+ * k+4+round are added and the results are xored into c and d.
+ *
+ * On exit a has lost its 16 bit rotation (one ror $16), b is rotated left
+ * by one (ror $16 followed by ror $15), c has been xored and then rotated
+ * left by 15 and d has only been xored. The callers alternate (a,b) and
+ * (c,d) between consecutive rounds without any instructions in between.
+ */
+#define encrypt_round(a,b,c,d,round)\
+        movzx   b ## B,         %edi;\
+        mov     s1(%r11,%rdi,4),%r8d;\
+        movzx   a ## B,         %edi;\
+        mov     s2(%r11,%rdi,4),%r9d;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     s2(%r11,%rdi,4),%r8d;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s3(%r11,%rdi,4),%r9d;\
+        movzx   b ## B,         %edi;\
+        xor     s3(%r11,%rdi,4),%r8d;\
+        movzx   a ## B,         %edi;\
+        xor     (%r11,%rdi,4),  %r9d;\
+        movzx   b ## H,         %edi;\
+        ror     $15,            b ## D;\
+        xor     (%r11,%rdi,4),  %r8d;\
+        movzx   a ## H,         %edi;\
+        xor     s1(%r11,%rdi,4),%r9d;\
+        add     %r8d,           %r9d;\
+        add     %r9d,           %r8d;\
+        add     k+round(%r11),  %r9d;\
+        xor     %r9d,           c ## D;\
+        rol     $15,            c ## D;\
+        add     k+4+round(%r11),%r8d;\
+        xor     %r8d,           d ## D;
+/*
+ * Final encryption round.
+ * a input register containing a(rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * round byte offset of this round's key pair relative to k
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ *
+ * %r11 has to hold the context address; %edi, %r8d and %r9d are scratch
+ * and %r10 receives the packed result. b is copied into %r10 and shifted
+ * into the upper half before it is rotated; a is xored into %r10 after its
+ * ror $16, so %r10 ends up as (b << 32) | a. That 64 bit xor relies on the
+ * upper half of register a being zero.
+ * c is xored and then rotated right by one, d is only xored. b is left
+ * rotated by 16 on exit. Unlike the other round macros the expansion does
+ * not end in a semicolon.
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+        mov     b ## D,         %r10d;\
+        shl     $32,            %r10;\
+        movzx   b ## B,         %edi;\
+        mov     s1(%r11,%rdi,4),%r8d;\
+        movzx   a ## B,         %edi;\
+        mov     s2(%r11,%rdi,4),%r9d;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     s2(%r11,%rdi,4),%r8d;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s3(%r11,%rdi,4),%r9d;\
+        movzx   b ## B,         %edi;\
+        xor     s3(%r11,%rdi,4),%r8d;\
+        movzx   a ## B,         %edi;\
+        xor     (%r11,%rdi,4),  %r9d;\
+        xor     a,              %r10;\
+        movzx   b ## H,         %edi;\
+        xor     (%r11,%rdi,4),  %r8d;\
+        movzx   a ## H,         %edi;\
+        xor     s1(%r11,%rdi,4),%r9d;\
+        add     %r8d,           %r9d;\
+        add     %r9d,           %r8d;\
+        add     k+round(%r11),  %r9d;\
+        xor     %r9d,           c ## D;\
+        ror     $1,             c ## D;\
+        add     k+4+round(%r11),%r8d;\
+        xor     %r8d,           d ## D
+/*
+ * One decryption round.
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * round byte offset of this round's key pair relative to k
+ * operations on a and b are interleaved to increase performance
+ *
+ * %r11 has to hold the context address; %edi, %r8d and %r9d are scratch.
+ * %r9d collects the four table lookups indexed by the bytes of a, %r8d
+ * the four lookups indexed by the bytes of b. The two values are then
+ * cross-added (r9 += r8, r8 += r9), the round keys at k+round and
+ * k+4+round are added and the results are xored into c and d.
+ *
+ * On exit a is rotated left by one (ror $16 followed by ror $15), b has
+ * lost its 16 bit rotation (one ror $16), c has only been xored and d has
+ * been xored and then rotated left by 15. The callers alternate (a,b) and
+ * (c,d) between consecutive rounds without any instructions in between.
+ */
+#define decrypt_round(a,b,c,d,round)\
+        movzx   a ## B,         %edi;\
+        mov     (%r11,%rdi,4),  %r9d;\
+        movzx   b ## B,         %edi;\
+        mov     s3(%r11,%rdi,4),%r8d;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s1(%r11,%rdi,4),%r9d;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     (%r11,%rdi,4),  %r8d;\
+        movzx   a ## B,         %edi;\
+        xor     s2(%r11,%rdi,4),%r9d;\
+        movzx   b ## B,         %edi;\
+        xor     s1(%r11,%rdi,4),%r8d;\
+        movzx   a ## H,         %edi;\
+        ror     $15,            a ## D;\
+        xor     s3(%r11,%rdi,4),%r9d;\
+        movzx   b ## H,         %edi;\
+        xor     s2(%r11,%rdi,4),%r8d;\
+        add     %r8d,           %r9d;\
+        add     %r9d,           %r8d;\
+        add     k+round(%r11),  %r9d;\
+        xor     %r9d,           c ## D;\
+        add     k+4+round(%r11),%r8d;\
+        xor     %r8d,           d ## D;\
+        rol     $15,            d ## D;
+/*
+ * Final decryption round.
+ * a input register containing a
+ * b input register containing b (rotated 16, as in decrypt_round: the
+ *   table lookups on b are done in the same order and the caller passes
+ *   the d register of the preceding decrypt_round)
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * round byte offset of this round's key pair relative to k
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ *
+ * %r11 has to hold the context address; %edi, %r8d and %r9d are scratch
+ * and %r10 receives the packed result. After its ror $16, b is copied into
+ * %r10 and shifted into the upper half; a is xored in before a itself is
+ * rotated, so %r10 ends up as (b << 32) | a. That 64 bit xor relies on the
+ * upper half of register a being zero.
+ * c is only xored, d is xored and then rotated right by one. a is left
+ * rotated by 16 on exit.
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+        movzx   a ## B,         %edi;\
+        mov     (%r11,%rdi,4),  %r9d;\
+        movzx   b ## B,         %edi;\
+        mov     s3(%r11,%rdi,4),%r8d;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     (%r11,%rdi,4),  %r8d;\
+        movzx   a ## H,         %edi;\
+        mov     b ## D,         %r10d;\
+        shl     $32,            %r10;\
+        xor     a,              %r10;\
+        ror     $16,            a ## D;\
+        xor     s1(%r11,%rdi,4),%r9d;\
+        movzx   b ## B,         %edi;\
+        xor     s1(%r11,%rdi,4),%r8d;\
+        movzx   a ## B,         %edi;\
+        xor     s2(%r11,%rdi,4),%r9d;\
+        movzx   b ## H,         %edi;\
+        xor     s2(%r11,%rdi,4),%r8d;\
+        movzx   a ## H,         %edi;\
+        xor     s3(%r11,%rdi,4),%r9d;\
+        add     %r8d,           %r9d;\
+        add     %r9d,           %r8d;\
+        add     k+round(%r11),  %r9d;\
+        xor     %r9d,           c ## D;\
+        add     k+4+round(%r11),%r8d;\
+        xor     %r8d,           d ## D;\
+        ror     $1,             d ## D;
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+twofish_enc_blk:
+        pushq    R1
+        /* Encrypts one 16 byte block: 16 bytes are read from the input and
+         * 16 bytes are written to the output. R1 (%rbx) is saved above and
+         * restored before ret.
+         * %rdi contains the crypto tfm address */
+        /* %rsi contains the output address */
+        /* %rdx contains the input address */
+        add     $crypto_tfm_ctx_offset, %rdi    /* set ctx address */
+        /* ctx address is moved to free one non-rex register
+        as target for the 8bit high operations */
+        mov     %rdi,           %r11
+        movq    (R3),   R1              /* first 8 input bytes */
+        movq    8(R3),  R3              /* last 8 input bytes, replaces the input pointer */
+        input_whitening(R1,%r11,a_offset)
+        input_whitening(R3,%r11,c_offset)
+        mov     R1D,    R0D             /* R0 = a */
+        rol     $16,    R0D             /* a rotated 16 as encrypt_round expects */
+        shr     $32,    R1              /* R1 = b */
+        mov     R3D,    R2D             /* R2 = c */
+        shr     $32,    R3              /* R3 = d */
+        rol     $1,     R3D             /* d already rol $1 as encrypt_round expects */
+        encrypt_round(R0,R1,R2,R3,0);
+        encrypt_round(R2,R3,R0,R1,8);
+        encrypt_round(R0,R1,R2,R3,2*8);
+        encrypt_round(R2,R3,R0,R1,3*8);
+        encrypt_round(R0,R1,R2,R3,4*8);
+        encrypt_round(R2,R3,R0,R1,5*8);
+        encrypt_round(R0,R1,R2,R3,6*8);
+        encrypt_round(R2,R3,R0,R1,7*8);
+        encrypt_round(R0,R1,R2,R3,8*8);
+        encrypt_round(R2,R3,R0,R1,9*8);
+        encrypt_round(R0,R1,R2,R3,10*8);
+        encrypt_round(R2,R3,R0,R1,11*8);
+        encrypt_round(R0,R1,R2,R3,12*8);
+        encrypt_round(R2,R3,R0,R1,13*8);
+        encrypt_round(R0,R1,R2,R3,14*8);
+        encrypt_last_round(R2,R3,R0,R1,15*8);
+        output_whitening(%r10,%r11,a_offset)    /* %r10 was packed by the last round */
+        movq    %r10,   (%rsi)
+        shl     $32,    R1
+        xor     R0,     R1              /* R1 = (R1 << 32) | R0 */
+        output_whitening(R1,%r11,c_offset)
+        movq    R1,     8(%rsi)
+        popq    R1
+        movq    $1,%rax                 /* NOTE(review): returns 1, the C prototype in the glue code is void */
+        ret
+twofish_dec_blk:
+        pushq    R1
+        /* Decrypts one 16 byte block: 16 bytes are read from the input and
+         * 16 bytes are written to the output. R1 (%rbx) is saved above and
+         * restored before ret. The whitening steps run in the opposite
+         * order to twofish_enc_blk: output_whitening first, input_whitening
+         * last.
+         * %rdi contains the crypto tfm address */
+        /* %rsi contains the output address */
+        /* %rdx contains the input address */
+        add     $crypto_tfm_ctx_offset, %rdi    /* set ctx address */
+        /* ctx address is moved to free one non-rex register
+        as target for the 8bit high operations */
+        mov     %rdi,           %r11
+        movq    (R3),   R1              /* first 8 input bytes */
+        movq    8(R3),  R3              /* last 8 input bytes, replaces the input pointer */
+        output_whitening(R1,%r11,a_offset)
+        output_whitening(R3,%r11,c_offset)
+        mov     R1D,    R0D             /* R0 = a */
+        shr     $32,    R1              /* R1 = b */
+        rol     $16,    R1D             /* b rotated 16 as decrypt_round expects */
+        mov     R3D,    R2D             /* R2 = c */
+        shr     $32,    R3              /* R3 = d */
+        rol     $1,     R2D             /* c already rol $1 as decrypt_round expects */
+        decrypt_round(R0,R1,R2,R3,15*8);
+        decrypt_round(R2,R3,R0,R1,14*8);
+        decrypt_round(R0,R1,R2,R3,13*8);
+        decrypt_round(R2,R3,R0,R1,12*8);
+        decrypt_round(R0,R1,R2,R3,11*8);
+        decrypt_round(R2,R3,R0,R1,10*8);
+        decrypt_round(R0,R1,R2,R3,9*8);
+        decrypt_round(R2,R3,R0,R1,8*8);
+        decrypt_round(R0,R1,R2,R3,7*8);
+        decrypt_round(R2,R3,R0,R1,6*8);
+        decrypt_round(R0,R1,R2,R3,5*8);
+        decrypt_round(R2,R3,R0,R1,4*8);
+        decrypt_round(R0,R1,R2,R3,3*8);
+        decrypt_round(R2,R3,R0,R1,2*8);
+        decrypt_round(R0,R1,R2,R3,1*8);
+        decrypt_last_round(R2,R3,R0,R1,0);
+        input_whitening(%r10,%r11,a_offset)     /* %r10 was packed by the last round */
+        movq    %r10,   (%rsi)
+        shl     $32,    R1
+        xor     R0,     R1              /* R1 = (R1 << 32) | R0 */
+        input_whitening(R1,%r11,c_offset)
+        movq    R1,     8(%rsi)
+        popq    R1
+        movq    $1,%rax                 /* NOTE(review): returns 1, the C prototype in the glue code is void */
+        ret
diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c
new file mode 100644
index 000000000000..182d91d5cfb9
--- /dev/null
+++ b/arch/x86_64/crypto/twofish.c
@@ -0,0 +1,97 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{       /* installed as .cia_encrypt in alg; forwards its arguments unchanged to the assembler routine */
+        twofish_enc_blk(tfm, dst, src);
+}
+static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{       /* installed as .cia_decrypt in alg; forwards its arguments unchanged to the assembler routine */
+        twofish_dec_blk(tfm, dst, src);
+}
+static struct crypto_alg alg = {       /* descriptor registered in init() and unregistered in fini() */
+        .cra_name               =       "twofish",
+        .cra_driver_name        =       "twofish-x86_64",
+        .cra_priority           =       200,
+        .cra_flags              =       CRYPTO_ALG_TYPE_CIPHER,
+        .cra_blocksize          =       TF_BLOCK_SIZE,
+        .cra_ctxsize            =       sizeof(struct twofish_ctx),    /* context the assembler code indexes into */
+        .cra_alignmask          =       3,     /* presumably asks for 4 byte aligned buffers - confirm */
+        .cra_module             =       THIS_MODULE,
+        .cra_list               =       LIST_HEAD_INIT(alg.cra_list),
+        .cra_u                  =       {
+                .cipher = {
+                        .cia_min_keysize        =       TF_MIN_KEY_SIZE,
+                        .cia_max_keysize        =       TF_MAX_KEY_SIZE,
+                        .cia_setkey             =       twofish_setkey, /* not defined in this file; presumably declared by crypto/twofish.h - confirm */
+                        .cia_encrypt            =       twofish_encrypt,        /* wrapper around twofish_enc_blk */
+                        .cia_decrypt            =       twofish_decrypt         /* wrapper around twofish_dec_blk */
+                }
+        }
+};
+static int __init init(void)
+{       /* module entry point (see module_init): registers alg and returns the registration result */
+        return crypto_register_alg(&alg);
+}
+static void __exit fini(void)
+{       /* module exit point (see module_exit): unregisters alg again */
+        crypto_unregister_alg(&alg);
+}
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
+MODULE_ALIAS("twofish");
author	Joachim Fritschi <jfritschi@freenet.de>	2006-06-20 07:12:02 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2006-09-20 21:16:29 -0400
commit	eaf44088ff467410dd15a033fef118888002ffe6 (patch)
tree	72b225b910342ae74e1b0915ceff61b4ead97883 /arch
parent	b9f535ffe38f7eb61ac2219d32d97c377b69f70d (diff)

diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile index 426d20f4b72e..15b538a8b7f7 100644 --- a/arch/x86_64/crypto/Makefile +++ b/arch/x86_64/crypto/Makefile
@@ -5,5 +5,8 @@
5	#	5	#
6		6
7	obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o	7	obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
		8	obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
8		9
9	aes-x86_64-y := aes-x86_64-asm.o aes.o	10	aes-x86_64-y := aes-x86_64-asm.o aes.o
		11	twofish-x86_64-y := twofish-x86_64-asm.o twofish.o
		12


diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S new file mode 100644 index 000000000000..35974a586615 --- /dev/null +++ b/arch/x86_64/crypto/twofish-x86_64-asm.S
@@ -0,0 +1,324 @@
		1	/***************************************************************************
		2	* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
		3	* *
		4	* This program is free software; you can redistribute it and/or modify *
		5	* it under the terms of the GNU General Public License as published by *
		6	* the Free Software Foundation; either version 2 of the License, or *
		7	* (at your option) any later version. *
		8	* *
		9	* This program is distributed in the hope that it will be useful, *
		10	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
		11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
		12	* GNU General Public License for more details. *
		13	* *
		14	* You should have received a copy of the GNU General Public License *
		15	* along with this program; if not, write to the *
		16	* Free Software Foundation, Inc., *
		17	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
		18	***************************************************************************/
		19
		20	.file "twofish-x86_64-asm.S"
		21	.text
		22
		23	#include <asm/asm-offsets.h>
		24
		25	#define a_offset 0
		26	#define b_offset 4
		27	#define c_offset 8
		28	#define d_offset 12
		29
		30	/* Structure of the crypto context struct*/
		31
		32	#define s0 0 /* S0 Array 256 Words each */
		33	#define s1 1024 /* S1 Array */
		34	#define s2 2048 /* S2 Array */
		35	#define s3 3072 /* S3 Array */
		36	#define w 4096 /* 8 whitening keys (word) */
		37	#define k 4128 /* key 1-32 ( word ) */
		38
		39	/* define a few register aliases to allow macro substitution */
		40
		41	#define R0 %rax
		42	#define R0D %eax
		43	#define R0B %al
		44	#define R0H %ah
		45
		46	#define R1 %rbx
		47	#define R1D %ebx
		48	#define R1B %bl
		49	#define R1H %bh
		50
		51	#define R2 %rcx
		52	#define R2D %ecx
		53	#define R2B %cl
		54	#define R2H %ch
		55
		56	#define R3 %rdx
		57	#define R3D %edx
		58	#define R3B %dl
		59	#define R3H %dh
		60
		61
		62	/* performs input whitening */
		63	#define input_whitening(src,context,offset)\
		64	xor w+offset(context), src;
		65
		66	/* performs output whitening */
		67	#define output_whitening(src,context,offset)\
		68	xor w+16+offset(context), src;
		69
		70
		71	/*
		72	* a input register containing a (rotated 16)
		73	* b input register containing b
		74	* c input register containing c
		75	* d input register containing d (already rol $1)
		76	* operations on a and b are interleaved to increase performance
		77	*/
		78	#define encrypt_round(a,b,c,d,round)\
		79	movzx b ## B, %edi;\
		80	mov s1(%r11,%rdi,4),%r8d;\
		81	movzx a ## B, %edi;\
		82	mov s2(%r11,%rdi,4),%r9d;\
		83	movzx b ## H, %edi;\
		84	ror $16, b ## D;\
		85	xor s2(%r11,%rdi,4),%r8d;\
		86	movzx a ## H, %edi;\
		87	ror $16, a ## D;\
		88	xor s3(%r11,%rdi,4),%r9d;\
		89	movzx b ## B, %edi;\
		90	xor s3(%r11,%rdi,4),%r8d;\
		91	movzx a ## B, %edi;\
		92	xor (%r11,%rdi,4), %r9d;\
		93	movzx b ## H, %edi;\
		94	ror $15, b ## D;\
		95	xor (%r11,%rdi,4), %r8d;\
		96	movzx a ## H, %edi;\
		97	xor s1(%r11,%rdi,4),%r9d;\
		98	add %r8d, %r9d;\
		99	add %r9d, %r8d;\
		100	add k+round(%r11), %r9d;\
		101	xor %r9d, c ## D;\
		102	rol $15, c ## D;\
		103	add k+4+round(%r11),%r8d;\
		104	xor %r8d, d ## D;
		105
		106	/*
		107	* a input register containing a(rotated 16)
		108	* b input register containing b
		109	* c input register containing c
		110	* d input register containing d (already rol $1)
		111	* operations on a and b are interleaved to increase performance
		112	* during the round a and b are prepared for the output whitening
		113	*/
		114	#define encrypt_last_round(a,b,c,d,round)\
		115	mov b ## D, %r10d;\
		116	shl $32, %r10;\
		117	movzx b ## B, %edi;\
		118	mov s1(%r11,%rdi,4),%r8d;\
		119	movzx a ## B, %edi;\
		120	mov s2(%r11,%rdi,4),%r9d;\
		121	movzx b ## H, %edi;\
		122	ror $16, b ## D;\
		123	xor s2(%r11,%rdi,4),%r8d;\
		124	movzx a ## H, %edi;\
		125	ror $16, a ## D;\
		126	xor s3(%r11,%rdi,4),%r9d;\
		127	movzx b ## B, %edi;\
		128	xor s3(%r11,%rdi,4),%r8d;\
		129	movzx a ## B, %edi;\
		130	xor (%r11,%rdi,4), %r9d;\
		131	xor a, %r10;\
		132	movzx b ## H, %edi;\
		133	xor (%r11,%rdi,4), %r8d;\
		134	movzx a ## H, %edi;\
		135	xor s1(%r11,%rdi,4),%r9d;\
		136	add %r8d, %r9d;\
		137	add %r9d, %r8d;\
		138	add k+round(%r11), %r9d;\
		139	xor %r9d, c ## D;\
		140	ror $1, c ## D;\
		141	add k+4+round(%r11),%r8d;\
		142	xor %r8d, d ## D
		143
		144	/*
		145	* a input register containing a
		146	* b input register containing b (rotated 16)
		147	* c input register containing c (already rol $1)
		148	* d input register containing d
		149	* operations on a and b are interleaved to increase performance
		150	*/
		151	#define decrypt_round(a,b,c,d,round)\
		152	movzx a ## B, %edi;\
		153	mov (%r11,%rdi,4), %r9d;\
		154	movzx b ## B, %edi;\
		155	mov s3(%r11,%rdi,4),%r8d;\
		156	movzx a ## H, %edi;\
		157	ror $16, a ## D;\
		158	xor s1(%r11,%rdi,4),%r9d;\
		159	movzx b ## H, %edi;\
		160	ror $16, b ## D;\
		161	xor (%r11,%rdi,4), %r8d;\
		162	movzx a ## B, %edi;\
		163	xor s2(%r11,%rdi,4),%r9d;\
		164	movzx b ## B, %edi;\
		165	xor s1(%r11,%rdi,4),%r8d;\
		166	movzx a ## H, %edi;\
		167	ror $15, a ## D;\
		168	xor s3(%r11,%rdi,4),%r9d;\
		169	movzx b ## H, %edi;\
		170	xor s2(%r11,%rdi,4),%r8d;\
		171	add %r8d, %r9d;\
		172	add %r9d, %r8d;\
		173	add k+round(%r11), %r9d;\
		174	xor %r9d, c ## D;\
		175	add k+4+round(%r11),%r8d;\
		176	xor %r8d, d ## D;\
		177	rol $15, d ## D;
		178
		179	/*
		180	* a input register containing a
		181	* b input register containing b
		182	* c input register containing c (already rol $1)
		183	* d input register containing d
		184	* operations on a and b are interleaved to increase performance
		185	* during the round a and b are prepared for the output whitening
		186	*/
		187	#define decrypt_last_round(a,b,c,d,round)\
		188	movzx a ## B, %edi;\
		189	mov (%r11,%rdi,4), %r9d;\
		190	movzx b ## B, %edi;\
		191	mov s3(%r11,%rdi,4),%r8d;\
		192	movzx b ## H, %edi;\
		193	ror $16, b ## D;\
		194	xor (%r11,%rdi,4), %r8d;\
		195	movzx a ## H, %edi;\
		196	mov b ## D, %r10d;\
		197	shl $32, %r10;\
		198	xor a, %r10;\
		199	ror $16, a ## D;\
		200	xor s1(%r11,%rdi,4),%r9d;\
		201	movzx b ## B, %edi;\
		202	xor s1(%r11,%rdi,4),%r8d;\
		203	movzx a ## B, %edi;\
		204	xor s2(%r11,%rdi,4),%r9d;\
		205	movzx b ## H, %edi;\
		206	xor s2(%r11,%rdi,4),%r8d;\
		207	movzx a ## H, %edi;\
		208	xor s3(%r11,%rdi,4),%r9d;\
		209	add %r8d, %r9d;\
		210	add %r9d, %r8d;\
		211	add k+round(%r11), %r9d;\
		212	xor %r9d, c ## D;\
		213	add k+4+round(%r11),%r8d;\
		214	xor %r8d, d ## D;\
		215	ror $1, d ## D;
		216
		217	.align 8
		218	.global twofish_enc_blk
		219	.global twofish_dec_blk
		220
		221	twofish_enc_blk:
		222	pushq R1
		223
		224	/* %rdi contains the crypto tfm address */
		225	/* %rsi contains the output address */
		226	/* %rdx contains the input address */
		227	add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
		228	/* ctx address is moved to free one non-rex register
		229	as target for the 8bit high operations */
		230	mov %rdi, %r11
		231
		232	movq (R3), R1
		233	movq 8(R3), R3
		234	input_whitening(R1,%r11,a_offset)
		235	input_whitening(R3,%r11,c_offset)
		236	mov R1D, R0D
		237	rol $16, R0D
		238	shr $32, R1
		239	mov R3D, R2D
		240	shr $32, R3
		241	rol $1, R3D
		242
		243	encrypt_round(R0,R1,R2,R3,0);
		244	encrypt_round(R2,R3,R0,R1,8);
		245	encrypt_round(R0,R1,R2,R3,2*8);
		246	encrypt_round(R2,R3,R0,R1,3*8);
		247	encrypt_round(R0,R1,R2,R3,4*8);
		248	encrypt_round(R2,R3,R0,R1,5*8);
		249	encrypt_round(R0,R1,R2,R3,6*8);
		250	encrypt_round(R2,R3,R0,R1,7*8);
		251	encrypt_round(R0,R1,R2,R3,8*8);
		252	encrypt_round(R2,R3,R0,R1,9*8);
		253	encrypt_round(R0,R1,R2,R3,10*8);
		254	encrypt_round(R2,R3,R0,R1,11*8);
		255	encrypt_round(R0,R1,R2,R3,12*8);
		256	encrypt_round(R2,R3,R0,R1,13*8);
		257	encrypt_round(R0,R1,R2,R3,14*8);
		258	encrypt_last_round(R2,R3,R0,R1,15*8);
		259
		260
		261	output_whitening(%r10,%r11,a_offset)
		262	movq %r10, (%rsi)
		263
		264	shl $32, R1
		265	xor R0, R1
		266
		267	output_whitening(R1,%r11,c_offset)
		268	movq R1, 8(%rsi)
		269
		270	popq R1
		271	movq $1,%rax
		272	ret
		273
		274	twofish_dec_blk:
		275	pushq R1
		276
		277	/* %rdi contains the crypto tfm address */
		278	/* %rsi contains the output address */
		279	/* %rdx contains the input address */
		280	add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
		281	/* ctx address is moved to free one non-rex register
		282	as target for the 8bit high operations */
		283	mov %rdi, %r11
		284
		285	movq (R3), R1
		286	movq 8(R3), R3
		287	output_whitening(R1,%r11,a_offset)
		288	output_whitening(R3,%r11,c_offset)
		289	mov R1D, R0D
		290	shr $32, R1
		291	rol $16, R1D
		292	mov R3D, R2D
		293	shr $32, R3
		294	rol $1, R2D
		295
		296	decrypt_round(R0,R1,R2,R3,15*8);
		297	decrypt_round(R2,R3,R0,R1,14*8);
		298	decrypt_round(R0,R1,R2,R3,13*8);
		299	decrypt_round(R2,R3,R0,R1,12*8);
		300	decrypt_round(R0,R1,R2,R3,11*8);
		301	decrypt_round(R2,R3,R0,R1,10*8);
		302	decrypt_round(R0,R1,R2,R3,9*8);
		303	decrypt_round(R2,R3,R0,R1,8*8);
		304	decrypt_round(R0,R1,R2,R3,7*8);
		305	decrypt_round(R2,R3,R0,R1,6*8);
		306	decrypt_round(R0,R1,R2,R3,5*8);
		307	decrypt_round(R2,R3,R0,R1,4*8);
		308	decrypt_round(R0,R1,R2,R3,3*8);
		309	decrypt_round(R2,R3,R0,R1,2*8);
		310	decrypt_round(R0,R1,R2,R3,1*8);
		311	decrypt_last_round(R2,R3,R0,R1,0);
		312
		313	input_whitening(%r10,%r11,a_offset)
		314	movq %r10, (%rsi)
		315
		316	shl $32, R1
		317	xor R0, R1
		318
		319	input_whitening(R1,%r11,c_offset)
		320	movq R1, 8(%rsi)
		321
		322	popq R1
		323	movq $1,%rax
		324	ret


diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c new file mode 100644 index 000000000000..182d91d5cfb9 --- /dev/null +++ b/arch/x86_64/crypto/twofish.c
@@ -0,0 +1,97 @@
		1	/*
		2	* Glue Code for optimized x86_64 assembler version of TWOFISH
		3	*
		4	* Originally Twofish for GPG
		5	* By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
		6	* 256-bit key length added March 20, 1999
		7	* Some modifications to reduce the text size by Werner Koch, April, 1998
		8	* Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
		9	* Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
		10	*
		11	* The original author has disclaimed all copyright interest in this
		12	* code and thus put it in the public domain. The subsequent authors
		13	* have put this under the GNU General Public License.
		14	*
		15	* This program is free software; you can redistribute it and/or modify
		16	* it under the terms of the GNU General Public License as published by
		17	* the Free Software Foundation; either version 2 of the License, or
		18	* (at your option) any later version.
		19	*
		20	* This program is distributed in the hope that it will be useful,
		21	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		22	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		23	* GNU General Public License for more details.
		24	*
		25	* You should have received a copy of the GNU General Public License
		26	* along with this program; if not, write to the Free Software
		27	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
		28	* USA
		29	*
		30	* This code is a "clean room" implementation, written from the paper
		31	* _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
		32	* Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
		33	* through http://www.counterpane.com/twofish.html
		34	*
		35	* For background information on multiplication in finite fields, used for
		36	* the matrix operations in the key schedule, see the book _Contemporary
		37	* Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
		38	* Third Edition.
		39	*/
		40
		41	#include <crypto/twofish.h>
		42	#include <linux/crypto.h>
		43	#include <linux/init.h>
		44	#include <linux/kernel.h>
		45	#include <linux/module.h>
		46	#include <linux/types.h>
		47
		48	asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
		49	asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
		50
		51	static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
		52	{
		53	twofish_enc_blk(tfm, dst, src);
		54	}
		55
		56	static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
		57	{
		58	twofish_dec_blk(tfm, dst, src);
		59	}
		60
		61	static struct crypto_alg alg = {
		62	.cra_name = "twofish",
		63	.cra_driver_name = "twofish-x86_64",
		64	.cra_priority = 200,
		65	.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
		66	.cra_blocksize = TF_BLOCK_SIZE,
		67	.cra_ctxsize = sizeof(struct twofish_ctx),
		68	.cra_alignmask = 3,
		69	.cra_module = THIS_MODULE,
		70	.cra_list = LIST_HEAD_INIT(alg.cra_list),
		71	.cra_u = {
		72	.cipher = {
		73	.cia_min_keysize = TF_MIN_KEY_SIZE,
		74	.cia_max_keysize = TF_MAX_KEY_SIZE,
		75	.cia_setkey = twofish_setkey,
		76	.cia_encrypt = twofish_encrypt,
		77	.cia_decrypt = twofish_decrypt
		78	}
		79	}
		80	};
		81
		82	static int __init init(void)
		83	{
		84	return crypto_register_alg(&alg);
		85	}
		86
		87	static void __exit fini(void)
		88	{
		89	crypto_unregister_alg(&alg);
		90	}
		91
		92	module_init(init);
		93	module_exit(fini);
		94
		95	MODULE_LICENSE("GPL");
		96	MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
		97	MODULE_ALIAS("twofish");