[CRYPTO] twofish: i586 assembly version

The patch passed the tcrypt tests and automated filesystem tests. This rewrite resulted in some nice performance increase over my last patch. Short summary of the tcrypt benchmarks: Twofish Assembler vs. Twofish C (256bit 8kb block CBC) encrypt: -33% Cycles decrypt: -45% Cycles Twofish Assembler vs. AES Assembler (128bit 8kb block CBC) encrypt: +3% Cycles decrypt: -22% Cycles Twofish Assembler vs. AES Assembler (256bit 8kb block CBC) encrypt: -20% Cycles decrypt: -36% Cycles Full Output: http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-asm-i586.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-c-i586.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-aes-asm-i586.txt Here is another bonnie++ benchmark with encrypted filesystems. All runs with the twofish assembler modules max out the drive speed. It should give some idea what the module can do for encrypted filesystem performance even though you can't see the full numbers. http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060611_205432_x86.html Signed-off-by: Joachim Fritschi <jfritschi@freenet.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Joachim Fritschi <jfritschi@freenet.de> 2006-06-20 06:59:16 -0400
committer: Herbert Xu <herbert@gondor.apana.org.au> 2006-09-20 21:16:28 -0400
commit: b9f535ffe38f7eb61ac2219d32d97c377b69f70d (patch)
tree: 57e09481226ab5a25f3938963f8299c9f0cd8439 /arch/i386/crypto/twofish-i586-asm.S
parent: 758f570ea785a5fbcdca026dfab2e9e1a3f89726 (diff)
1 files changed, 335 insertions, 0 deletions
diff --git a/arch/i386/crypto/twofish-i586-asm.S b/arch/i386/crypto/twofish-i586-asm.S
new file mode 100644
index 000000000000..39b98ed2c1b9
--- /dev/null
+++ b/arch/i386/crypto/twofish-i586-asm.S
@@ -0,0 +1,335 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+.file "twofish-i586-asm.S"
+.text
+#include <asm/asm-offsets.h>
+/* stack offsets of the arguments relative to %esp at function entry; return address at 0 */
+#define in_blk    12  /* input byte array address parameter */
+#define out_blk   8  /* output byte array address parameter */
+#define tfm       4  /* Twofish context structure */
+#define a_offset        0
+#define b_offset        4
+#define c_offset        8
+#define d_offset        12
+/* Structure of the crypto context struct: byte offsets from the ctx address held in %ebp */
+#define s0      0       /* S0 Array 256 Words each (the round macros address it as plain (%ebp,...)) */
+#define s1      1024    /* S1 Array */
+#define s2      2048    /* S2 Array */
+#define s3      3072    /* S3 Array */
+#define w       4096    /* 8 whitening keys (word) */
+#define k       4128    /* key 1-32 ( word ), directly after the 8 whitening words */
+/* define a few register aliases to allow macro substitution: R<n>D/B/H are the dword, low byte and second byte of one register, selected by pasting a suffix onto R0..R3 */
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+/* performs input whitening: xors src with the whitening word at w+offset inside context (first four whitening words) */
+#define input_whitening(src,context,offset)\
+        xor     w+offset(context),      src;
+/* performs output whitening: xors src with the whitening word at w+16+offset inside context (second four whitening words) */
+#define output_whitening(src,context,offset)\
+        xor     w+16+offset(context),   src;
+/*
+ * a input register containing a (rotated 16); back to its unrotated value on exit (one ror $16)
+ * b input register containing b; left rotated left by 1 on exit (ror $16 followed by ror $15)
+ * c input register containing c; xored with the k+round result, then rol $15 (i.e. ror $1 plus a rotation by 16)
+ * d input register containing d (already rol $1); pushed, reused as accumulator, old value xored back in at the end
+ * operations on a and b are interleaved to increase performance; %esi and %edi are scratch, %ebp is the ctx base, round is the byte offset into k
+ */
+#define encrypt_round(a,b,c,d,round)\
+        push    d ## D;\
+        movzx   b ## B,         %edi;\
+        mov     s1(%ebp,%edi,4),d ## D;\
+        movzx   a ## B,         %edi;\
+        mov     s2(%ebp,%edi,4),%esi;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     s2(%ebp,%edi,4),d ## D;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s3(%ebp,%edi,4),%esi;\
+        movzx   b ## B,         %edi;\
+        xor     s3(%ebp,%edi,4),d ## D;\
+        movzx   a ## B,         %edi;\
+        xor     (%ebp,%edi,4),  %esi;\
+        movzx   b ## H,         %edi;\
+        ror     $15,            b ## D;\
+        xor     (%ebp,%edi,4),  d ## D;\
+        movzx   a ## H,         %edi;\
+        xor     s1(%ebp,%edi,4),%esi;\
+        pop     %edi;\
+        add     d ## D,         %esi;\
+        add     %esi,           d ## D;\
+        add     k+round(%ebp),  %esi;\
+        xor     %esi,           c ## D;\
+        rol     $15,            c ## D;\
+        add     k+4+round(%ebp),d ## D;\
+        xor     %edi,           d ## D;
+/*
+ * a input register containing a (rotated 16); back to its unrotated value on exit (one ror $16)
+ * b input register containing b; unrotated again on exit (ror $16 twice)
+ * c input register containing c; xored with the k+round result, then only ror $1
+ * d input register containing d (already rol $1); pushed, reused as accumulator, old value xored back in at the end
+ * operations on a and b are interleaved to increase performance; %esi and %edi are scratch, %ebp is the ctx base, round is the byte offset into k
+ * last round has different rotations for the output preparation: no register is left pre-rotated for a following round
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+        push    d ## D;\
+        movzx   b ## B,         %edi;\
+        mov     s1(%ebp,%edi,4),d ## D;\
+        movzx   a ## B,         %edi;\
+        mov     s2(%ebp,%edi,4),%esi;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     s2(%ebp,%edi,4),d ## D;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s3(%ebp,%edi,4),%esi;\
+        movzx   b ## B,         %edi;\
+        xor     s3(%ebp,%edi,4),d ## D;\
+        movzx   a ## B,         %edi;\
+        xor     (%ebp,%edi,4),  %esi;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     (%ebp,%edi,4),  d ## D;\
+        movzx   a ## H,         %edi;\
+        xor     s1(%ebp,%edi,4),%esi;\
+        pop     %edi;\
+        add     d ## D,         %esi;\
+        add     %esi,           d ## D;\
+        add     k+round(%ebp),  %esi;\
+        xor     %esi,           c ## D;\
+        ror     $1,             c ## D;\
+        add     k+4+round(%ebp),d ## D;\
+        xor     %edi,           d ## D;
+/*
+ * a input register containing a; left rotated left by 1 on exit (ror $16 followed by ror $15)
+ * b input register containing b (rotated 16); back to its unrotated value on exit (one ror $16)
+ * c input register containing c (already rol $1); pushed, reused as accumulator, old value xored back in after adding k+round
+ * d input register containing d; xored with the k+4+round result, then rol $15 (i.e. ror $1 plus a rotation by 16)
+ * operations on a and b are interleaved to increase performance; %esi and %edi are scratch, %ebp is the ctx base, round is the byte offset into k
+ */
+#define decrypt_round(a,b,c,d,round)\
+        push    c ## D;\
+        movzx   a ## B,         %edi;\
+        mov     (%ebp,%edi,4),  c ## D;\
+        movzx   b ## B,         %edi;\
+        mov     s3(%ebp,%edi,4),%esi;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s1(%ebp,%edi,4),c ## D;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     (%ebp,%edi,4),  %esi;\
+        movzx   a ## B,         %edi;\
+        xor     s2(%ebp,%edi,4),c ## D;\
+        movzx   b ## B,         %edi;\
+        xor     s1(%ebp,%edi,4),%esi;\
+        movzx   a ## H,         %edi;\
+        ror     $15,            a ## D;\
+        xor     s3(%ebp,%edi,4),c ## D;\
+        movzx   b ## H,         %edi;\
+        xor     s2(%ebp,%edi,4),%esi;\
+        pop     %edi;\
+        add     %esi,           c ## D;\
+        add     c ## D,         %esi;\
+        add     k+round(%ebp),  c ## D;\
+        xor     %edi,           c ## D;\
+        add     k+4+round(%ebp),%esi;\
+        xor     %esi,           d ## D;\
+        rol     $15,            d ## D;
+/*
+ * a input register containing a; unrotated again on exit (ror $16 twice)
+ * b input register containing b (rotated 16); back to its unrotated value on exit (one ror $16)
+ * c input register containing c (already rol $1); pushed, reused as accumulator, old value xored back in after adding k+round
+ * d input register containing d; xored with the k+4+round result, then only ror $1
+ * operations on a and b are interleaved to increase performance; %esi and %edi are scratch, %ebp is the ctx base, round is the byte offset into k
+ * last round has different rotations for the output preparation: no register is left pre-rotated for a following round
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+        push    c ## D;\
+        movzx   a ## B,         %edi;\
+        mov     (%ebp,%edi,4),  c ## D;\
+        movzx   b ## B,         %edi;\
+        mov     s3(%ebp,%edi,4),%esi;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s1(%ebp,%edi,4),c ## D;\
+        movzx   b ## H,         %edi;\
+        ror     $16,            b ## D;\
+        xor     (%ebp,%edi,4),  %esi;\
+        movzx   a ## B,         %edi;\
+        xor     s2(%ebp,%edi,4),c ## D;\
+        movzx   b ## B,         %edi;\
+        xor     s1(%ebp,%edi,4),%esi;\
+        movzx   a ## H,         %edi;\
+        ror     $16,            a ## D;\
+        xor     s3(%ebp,%edi,4),c ## D;\
+        movzx   b ## H,         %edi;\
+        xor     s2(%ebp,%edi,4),%esi;\
+        pop     %edi;\
+        add     %esi,           c ## D;\
+        add     c ## D,         %esi;\
+        add     k+round(%ebp),  c ## D;\
+        xor     %edi,           c ## D;\
+        add     k+4+round(%ebp),%esi;\
+        xor     %esi,           d ## D;\
+        ror     $1,             d ## D;
+.align 4
+.global twofish_enc_blk
+.global twofish_dec_blk
+twofish_enc_blk:
+        push    %ebp                    /* save registers according to calling convention */
+        push    %ebx
+        push    %esi
+        push    %edi
+        mov     tfm + 16(%esp), %ebp    /* abuse the base pointer: set new base pointer to the crypto tfm; +16 skips the four registers pushed above */
+        add     $crypto_tfm_ctx_offset, %ebp    /* ctx address: base for the s1-s3, w and k offsets */
+        mov     in_blk+16(%esp),%edi    /* input address in edi */
+        mov     (%edi),         %eax    /* eax = a, ebx = b, ecx = c, edx = d (the four words of the input block) */
+        mov     b_offset(%edi), %ebx
+        mov     c_offset(%edi), %ecx
+        mov     d_offset(%edi), %edx
+        input_whitening(%eax,%ebp,a_offset)
+        ror     $16,    %eax            /* encrypt_round expects a rotated by 16 */
+        input_whitening(%ebx,%ebp,b_offset)
+        input_whitening(%ecx,%ebp,c_offset)
+        input_whitening(%edx,%ebp,d_offset)
+        rol     $1,     %edx            /* encrypt_round expects d already rotated left by 1 */
+        encrypt_round(R0,R1,R2,R3,0);   /* 16 unrolled rounds: the (a,b) and (c,d) register pairs swap roles every round, last argument is the byte offset of the subkey pair in k */
+        encrypt_round(R2,R3,R0,R1,8);
+        encrypt_round(R0,R1,R2,R3,2*8);
+        encrypt_round(R2,R3,R0,R1,3*8);
+        encrypt_round(R0,R1,R2,R3,4*8);
+        encrypt_round(R2,R3,R0,R1,5*8);
+        encrypt_round(R0,R1,R2,R3,6*8);
+        encrypt_round(R2,R3,R0,R1,7*8);
+        encrypt_round(R0,R1,R2,R3,8*8);
+        encrypt_round(R2,R3,R0,R1,9*8);
+        encrypt_round(R0,R1,R2,R3,10*8);
+        encrypt_round(R2,R3,R0,R1,11*8);
+        encrypt_round(R0,R1,R2,R3,12*8);
+        encrypt_round(R2,R3,R0,R1,13*8);
+        encrypt_round(R0,R1,R2,R3,14*8);
+        encrypt_last_round(R2,R3,R0,R1,15*8);   /* last-round variant leaves all four registers unrotated */
+        output_whitening(%eax,%ebp,c_offset)    /* eax/ebx are written out as words c/d and ecx/edx as words a/b, so they are whitened with the matching offsets */
+        output_whitening(%ebx,%ebp,d_offset)
+        output_whitening(%ecx,%ebp,a_offset)
+        output_whitening(%edx,%ebp,b_offset)
+        mov     out_blk+16(%esp),%edi;  /* output address in edi */
+        mov     %eax,           c_offset(%edi)
+        mov     %ebx,           d_offset(%edi)
+        mov     %ecx,           (%edi)
+        mov     %edx,           b_offset(%edi)
+        pop     %edi                    /* restore the saved registers in reverse push order */
+        pop     %esi
+        pop     %ebx
+        pop     %ebp
+        mov     $1,     %eax            /* eax = 1 on return */
+        ret
+twofish_dec_blk:
+        push    %ebp                    /* save registers according to calling convention */
+        push    %ebx
+        push    %esi
+        push    %edi
+        mov     tfm + 16(%esp), %ebp    /* abuse the base pointer: set new base pointer to the crypto tfm; +16 skips the four registers pushed above */
+        add     $crypto_tfm_ctx_offset, %ebp    /* ctx address: base for the s1-s3, w and k offsets */
+        mov     in_blk+16(%esp),%edi    /* input address in edi */
+        mov     (%edi),         %eax    /* eax = a, ebx = b, ecx = c, edx = d (the four words of the input block) */
+        mov     b_offset(%edi), %ebx
+        mov     c_offset(%edi), %ecx
+        mov     d_offset(%edi), %edx
+        output_whitening(%eax,%ebp,a_offset)    /* decryption starts by undoing the output whitening (words at w+16) */
+        output_whitening(%ebx,%ebp,b_offset)
+        ror     $16,    %ebx            /* decrypt_round expects b rotated by 16 */
+        output_whitening(%ecx,%ebp,c_offset)
+        output_whitening(%edx,%ebp,d_offset)
+        rol     $1,     %ecx            /* decrypt_round expects c already rotated left by 1 */
+        decrypt_round(R0,R1,R2,R3,15*8);        /* 16 unrolled rounds with the subkey offsets in descending order; register pairs swap roles every round */
+        decrypt_round(R2,R3,R0,R1,14*8);
+        decrypt_round(R0,R1,R2,R3,13*8);
+        decrypt_round(R2,R3,R0,R1,12*8);
+        decrypt_round(R0,R1,R2,R3,11*8);
+        decrypt_round(R2,R3,R0,R1,10*8);
+        decrypt_round(R0,R1,R2,R3,9*8);
+        decrypt_round(R2,R3,R0,R1,8*8);
+        decrypt_round(R0,R1,R2,R3,7*8);
+        decrypt_round(R2,R3,R0,R1,6*8);
+        decrypt_round(R0,R1,R2,R3,5*8);
+        decrypt_round(R2,R3,R0,R1,4*8);
+        decrypt_round(R0,R1,R2,R3,3*8);
+        decrypt_round(R2,R3,R0,R1,2*8);
+        decrypt_round(R0,R1,R2,R3,1*8);
+        decrypt_last_round(R2,R3,R0,R1,0);      /* last-round variant leaves all four registers unrotated */
+        input_whitening(%eax,%ebp,c_offset)     /* eax/ebx are written out as words c/d and ecx/edx as words a/b, so they are whitened with the matching offsets */
+        input_whitening(%ebx,%ebp,d_offset)
+        input_whitening(%ecx,%ebp,a_offset)
+        input_whitening(%edx,%ebp,b_offset)
+        mov     out_blk+16(%esp),%edi;  /* output address in edi */
+        mov     %eax,           c_offset(%edi)
+        mov     %ebx,           d_offset(%edi)
+        mov     %ecx,           (%edi)
+        mov     %edx,           b_offset(%edi)
+        pop     %edi                    /* restore the saved registers in reverse push order */
+        pop     %esi
+        pop     %ebx
+        pop     %ebp
+        mov     $1,     %eax            /* eax = 1 on return */
+        ret
author	Joachim Fritschi <jfritschi@freenet.de>	2006-06-20 06:59:16 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2006-09-20 21:16:28 -0400
commit	b9f535ffe38f7eb61ac2219d32d97c377b69f70d (patch)
tree	57e09481226ab5a25f3938963f8299c9f0cd8439 /arch/i386/crypto/twofish-i586-asm.S
parent	758f570ea785a5fbcdca026dfab2e9e1a3f89726 (diff)

diff --git a/arch/i386/crypto/twofish-i586-asm.S b/arch/i386/crypto/twofish-i586-asm.S new file mode 100644 index 000000000000..39b98ed2c1b9 --- /dev/null +++ b/arch/i386/crypto/twofish-i586-asm.S
@@ -0,0 +1,335 @@
	1	/***************************************************************************
	2	* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
	3	* *
	4	* This program is free software; you can redistribute it and/or modify *
	5	* it under the terms of the GNU General Public License as published by *
	6	* the Free Software Foundation; either version 2 of the License, or *
	7	* (at your option) any later version. *
	8	* *
	9	* This program is distributed in the hope that it will be useful, *
	10	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
	11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
	12	* GNU General Public License for more details. *
	13	* *
	14	* You should have received a copy of the GNU General Public License *
	15	* along with this program; if not, write to the *
	16	* Free Software Foundation, Inc., *
	17	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
	18	***************************************************************************/
	19
	20	.file "twofish-i586-asm.S"
	21	.text
	22
	23	#include <asm/asm-offsets.h>
	24
	25	/* return address at 0 */
	26
	27	#define in_blk 12 /* input byte array address parameter*/
	28	#define out_blk 8 /* output byte array address parameter*/
	29	#define tfm 4 /* Twofish context structure */
	30
	31	#define a_offset 0
	32	#define b_offset 4
	33	#define c_offset 8
	34	#define d_offset 12
	35
	36	/* Structure of the crypto context struct*/
	37
	38	#define s0 0 /* S0 Array 256 Words each */
	39	#define s1 1024 /* S1 Array */
	40	#define s2 2048 /* S2 Array */
	41	#define s3 3072 /* S3 Array */
	42	#define w 4096 /* 8 whitening keys (word) */
	43	#define k 4128 /* key 1-32 ( word ) */
	44
	45	/* define a few register aliases to allow macro substitution */
	46
	47	#define R0D %eax
	48	#define R0B %al
	49	#define R0H %ah
	50
	51	#define R1D %ebx
	52	#define R1B %bl
	53	#define R1H %bh
	54
	55	#define R2D %ecx
	56	#define R2B %cl
	57	#define R2H %ch
	58
	59	#define R3D %edx
	60	#define R3B %dl
	61	#define R3H %dh
	62
	63
	64	/* performs input whitening */
	65	#define input_whitening(src,context,offset)\
	66	xor w+offset(context), src;
	67
	68	/* performs output whitening */
	69	#define output_whitening(src,context,offset)\
	70	xor w+16+offset(context), src;
	71
	72	/*
	73	* a input register containing a (rotated 16)
	74	* b input register containing b
	75	* c input register containing c
	76	* d input register containing d (already rol $1)
	77	* operations on a and b are interleaved to increase performance
	78	*/
	79	#define encrypt_round(a,b,c,d,round)\
	80	push d ## D;\
	81	movzx b ## B, %edi;\
	82	mov s1(%ebp,%edi,4),d ## D;\
	83	movzx a ## B, %edi;\
	84	mov s2(%ebp,%edi,4),%esi;\
	85	movzx b ## H, %edi;\
	86	ror $16, b ## D;\
	87	xor s2(%ebp,%edi,4),d ## D;\
	88	movzx a ## H, %edi;\
	89	ror $16, a ## D;\
	90	xor s3(%ebp,%edi,4),%esi;\
	91	movzx b ## B, %edi;\
	92	xor s3(%ebp,%edi,4),d ## D;\
	93	movzx a ## B, %edi;\
	94	xor (%ebp,%edi,4), %esi;\
	95	movzx b ## H, %edi;\
	96	ror $15, b ## D;\
	97	xor (%ebp,%edi,4), d ## D;\
	98	movzx a ## H, %edi;\
	99	xor s1(%ebp,%edi,4),%esi;\
	100	pop %edi;\
	101	add d ## D, %esi;\
	102	add %esi, d ## D;\
	103	add k+round(%ebp), %esi;\
	104	xor %esi, c ## D;\
	105	rol $15, c ## D;\
	106	add k+4+round(%ebp),d ## D;\
	107	xor %edi, d ## D;
	108
	109	/*
	110	* a input register containing a (rotated 16)
	111	* b input register containing b
	112	* c input register containing c
	113	* d input register containing d (already rol $1)
	114	* operations on a and b are interleaved to increase performance
	115	* last round has different rotations for the output preparation
	116	*/
	117	#define encrypt_last_round(a,b,c,d,round)\
	118	push d ## D;\
	119	movzx b ## B, %edi;\
	120	mov s1(%ebp,%edi,4),d ## D;\
	121	movzx a ## B, %edi;\
	122	mov s2(%ebp,%edi,4),%esi;\
	123	movzx b ## H, %edi;\
	124	ror $16, b ## D;\
	125	xor s2(%ebp,%edi,4),d ## D;\
	126	movzx a ## H, %edi;\
	127	ror $16, a ## D;\
	128	xor s3(%ebp,%edi,4),%esi;\
	129	movzx b ## B, %edi;\
	130	xor s3(%ebp,%edi,4),d ## D;\
	131	movzx a ## B, %edi;\
	132	xor (%ebp,%edi,4), %esi;\
	133	movzx b ## H, %edi;\
	134	ror $16, b ## D;\
	135	xor (%ebp,%edi,4), d ## D;\
	136	movzx a ## H, %edi;\
	137	xor s1(%ebp,%edi,4),%esi;\
	138	pop %edi;\
	139	add d ## D, %esi;\
	140	add %esi, d ## D;\
	141	add k+round(%ebp), %esi;\
	142	xor %esi, c ## D;\
	143	ror $1, c ## D;\
	144	add k+4+round(%ebp),d ## D;\
	145	xor %edi, d ## D;
	146
	147	/*
	148	* a input register containing a
	149	* b input register containing b (rotated 16)
	150	* c input register containing c (already rol $1)
	151	* d input register containing d
	152	* operations on a and b are interleaved to increase performance
	153	*/
	154	#define decrypt_round(a,b,c,d,round)\
	155	push c ## D;\
	156	movzx a ## B, %edi;\
	157	mov (%ebp,%edi,4), c ## D;\
	158	movzx b ## B, %edi;\
	159	mov s3(%ebp,%edi,4),%esi;\
	160	movzx a ## H, %edi;\
	161	ror $16, a ## D;\
	162	xor s1(%ebp,%edi,4),c ## D;\
	163	movzx b ## H, %edi;\
	164	ror $16, b ## D;\
	165	xor (%ebp,%edi,4), %esi;\
	166	movzx a ## B, %edi;\
	167	xor s2(%ebp,%edi,4),c ## D;\
	168	movzx b ## B, %edi;\
	169	xor s1(%ebp,%edi,4),%esi;\
	170	movzx a ## H, %edi;\
	171	ror $15, a ## D;\
	172	xor s3(%ebp,%edi,4),c ## D;\
	173	movzx b ## H, %edi;\
	174	xor s2(%ebp,%edi,4),%esi;\
	175	pop %edi;\
	176	add %esi, c ## D;\
	177	add c ## D, %esi;\
	178	add k+round(%ebp), c ## D;\
	179	xor %edi, c ## D;\
	180	add k+4+round(%ebp),%esi;\
	181	xor %esi, d ## D;\
	182	rol $15, d ## D;
	183
	184	/*
	185	* a input register containing a
	186	* b input register containing b (rotated 16)
	187	* c input register containing c (already rol $1)
	188	* d input register containing d
	189	* operations on a and b are interleaved to increase performance
	190	* last round has different rotations for the output preparation
	191	*/
	192	#define decrypt_last_round(a,b,c,d,round)\
	193	push c ## D;\
	194	movzx a ## B, %edi;\
	195	mov (%ebp,%edi,4), c ## D;\
	196	movzx b ## B, %edi;\
	197	mov s3(%ebp,%edi,4),%esi;\
	198	movzx a ## H, %edi;\
	199	ror $16, a ## D;\
	200	xor s1(%ebp,%edi,4),c ## D;\
	201	movzx b ## H, %edi;\
	202	ror $16, b ## D;\
	203	xor (%ebp,%edi,4), %esi;\
	204	movzx a ## B, %edi;\
	205	xor s2(%ebp,%edi,4),c ## D;\
	206	movzx b ## B, %edi;\
	207	xor s1(%ebp,%edi,4),%esi;\
	208	movzx a ## H, %edi;\
	209	ror $16, a ## D;\
	210	xor s3(%ebp,%edi,4),c ## D;\
	211	movzx b ## H, %edi;\
	212	xor s2(%ebp,%edi,4),%esi;\
	213	pop %edi;\
	214	add %esi, c ## D;\
	215	add c ## D, %esi;\
	216	add k+round(%ebp), c ## D;\
	217	xor %edi, c ## D;\
	218	add k+4+round(%ebp),%esi;\
	219	xor %esi, d ## D;\
	220	ror $1, d ## D;
	221
	222	.align 4
	223	.global twofish_enc_blk
	224	.global twofish_dec_blk
	225
	226	twofish_enc_blk:
	227	push %ebp /* save registers according to calling convention*/
	228	push %ebx
	229	push %esi
	230	push %edi
	231
	232	mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
	233	add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
	234	mov in_blk+16(%esp),%edi /* input adress in edi */
	235
	236	mov (%edi), %eax
	237	mov b_offset(%edi), %ebx
	238	mov c_offset(%edi), %ecx
	239	mov d_offset(%edi), %edx
	240	input_whitening(%eax,%ebp,a_offset)
	241	ror $16, %eax
	242	input_whitening(%ebx,%ebp,b_offset)
	243	input_whitening(%ecx,%ebp,c_offset)
	244	input_whitening(%edx,%ebp,d_offset)
	245	rol $1, %edx
	246
	247	encrypt_round(R0,R1,R2,R3,0);
	248	encrypt_round(R2,R3,R0,R1,8);
	249	encrypt_round(R0,R1,R2,R3,2*8);
	250	encrypt_round(R2,R3,R0,R1,3*8);
	251	encrypt_round(R0,R1,R2,R3,4*8);
	252	encrypt_round(R2,R3,R0,R1,5*8);
	253	encrypt_round(R0,R1,R2,R3,6*8);
	254	encrypt_round(R2,R3,R0,R1,7*8);
	255	encrypt_round(R0,R1,R2,R3,8*8);
	256	encrypt_round(R2,R3,R0,R1,9*8);
	257	encrypt_round(R0,R1,R2,R3,10*8);
	258	encrypt_round(R2,R3,R0,R1,11*8);
	259	encrypt_round(R0,R1,R2,R3,12*8);
	260	encrypt_round(R2,R3,R0,R1,13*8);
	261	encrypt_round(R0,R1,R2,R3,14*8);
	262	encrypt_last_round(R2,R3,R0,R1,15*8);
	263
	264	output_whitening(%eax,%ebp,c_offset)
	265	output_whitening(%ebx,%ebp,d_offset)
	266	output_whitening(%ecx,%ebp,a_offset)
	267	output_whitening(%edx,%ebp,b_offset)
	268	mov out_blk+16(%esp),%edi;
	269	mov %eax, c_offset(%edi)
	270	mov %ebx, d_offset(%edi)
	271	mov %ecx, (%edi)
	272	mov %edx, b_offset(%edi)
	273
	274	pop %edi
	275	pop %esi
	276	pop %ebx
	277	pop %ebp
	278	mov $1, %eax
	279	ret
	280
	281	twofish_dec_blk:
	282	push %ebp /* save registers according to calling convention*/
	283	push %ebx
	284	push %esi
	285	push %edi
	286
	287
	288	mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
	289	add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
	290	mov in_blk+16(%esp),%edi /* input adress in edi */
	291
	292	mov (%edi), %eax
	293	mov b_offset(%edi), %ebx
	294	mov c_offset(%edi), %ecx
	295	mov d_offset(%edi), %edx
	296	output_whitening(%eax,%ebp,a_offset)
	297	output_whitening(%ebx,%ebp,b_offset)
	298	ror $16, %ebx
	299	output_whitening(%ecx,%ebp,c_offset)
	300	output_whitening(%edx,%ebp,d_offset)
	301	rol $1, %ecx
	302
	303	decrypt_round(R0,R1,R2,R3,15*8);
	304	decrypt_round(R2,R3,R0,R1,14*8);
	305	decrypt_round(R0,R1,R2,R3,13*8);
	306	decrypt_round(R2,R3,R0,R1,12*8);
	307	decrypt_round(R0,R1,R2,R3,11*8);
	308	decrypt_round(R2,R3,R0,R1,10*8);
	309	decrypt_round(R0,R1,R2,R3,9*8);
	310	decrypt_round(R2,R3,R0,R1,8*8);
	311	decrypt_round(R0,R1,R2,R3,7*8);
	312	decrypt_round(R2,R3,R0,R1,6*8);
	313	decrypt_round(R0,R1,R2,R3,5*8);
	314	decrypt_round(R2,R3,R0,R1,4*8);
	315	decrypt_round(R0,R1,R2,R3,3*8);
	316	decrypt_round(R2,R3,R0,R1,2*8);
	317	decrypt_round(R0,R1,R2,R3,1*8);
	318	decrypt_last_round(R2,R3,R0,R1,0);
	319
	320	input_whitening(%eax,%ebp,c_offset)
	321	input_whitening(%ebx,%ebp,d_offset)
	322	input_whitening(%ecx,%ebp,a_offset)
	323	input_whitening(%edx,%ebp,b_offset)
	324	mov out_blk+16(%esp),%edi;
	325	mov %eax, c_offset(%edi)
	326	mov %ebx, d_offset(%edi)
	327	mov %ecx, (%edi)
	328	mov %edx, b_offset(%edi)
	329
	330	pop %edi
	331	pop %esi
	332	pop %ebx
	333	pop %ebp
	334	mov $1, %eax
	335	ret