diff options
Diffstat (limited to 'arch/x86_64/crypto/twofish-x86_64-asm.S')
-rw-r--r-- | arch/x86_64/crypto/twofish-x86_64-asm.S | 324 |
1 files changed, 324 insertions, 0 deletions
diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S new file mode 100644 index 000000000000..35974a586615 --- /dev/null +++ b/arch/x86_64/crypto/twofish-x86_64-asm.S | |||
@@ -0,0 +1,324 @@ | |||
1 | /*************************************************************************** | ||
2 | * Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> * | ||
3 | * * | ||
4 | * This program is free software; you can redistribute it and/or modify * | ||
5 | * it under the terms of the GNU General Public License as published by * | ||
6 | * the Free Software Foundation; either version 2 of the License, or * | ||
7 | * (at your option) any later version. * | ||
8 | * * | ||
9 | * This program is distributed in the hope that it will be useful, * | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of * | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * | ||
12 | * GNU General Public License for more details. * | ||
13 | * * | ||
14 | * You should have received a copy of the GNU General Public License * | ||
15 | * along with this program; if not, write to the * | ||
16 | * Free Software Foundation, Inc., * | ||
17 | * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * | ||
18 | ***************************************************************************/ | ||
19 | |||
20 | .file "twofish-x86_64-asm.S" | ||
21 | .text | ||
22 | |||
23 | #include <asm/asm-offsets.h> | ||
24 | |||
/* byte offsets of the four 32-bit words a..d within a 16-byte block */
25 | #define a_offset 0 | ||
26 | #define b_offset 4 | ||
27 | #define c_offset 8 | ||
28 | #define d_offset 12 | ||
29 | |||
30 | /* Structure of the crypto context struct */ | ||
31 | |||
32 | #define s0 0 /* S0 Array 256 Words each */ | ||
33 | #define s1 1024 /* S1 Array */ | ||
34 | #define s2 2048 /* S2 Array */ | ||
35 | #define s3 3072 /* S3 Array */ | ||
36 | #define w 4096 /* 8 whitening keys (word) */ | ||
37 | #define k 4128 /* key 1-32 ( word ) */ | ||
38 | |||
39 | /* define a few register aliases to allow macro substitution */ | ||
/* Rn = 64-bit view, RnD = 32-bit, RnB = low byte, RnH = high byte.
   Only %rax..%rdx qualify: the round macros index the s-boxes via the
   high-byte views (R0H..R3H), which exist only for these legacy
   registers (and cannot be encoded together with a REX prefix). */
40 | |||
41 | #define R0 %rax | ||
42 | #define R0D %eax | ||
43 | #define R0B %al | ||
44 | #define R0H %ah | ||
45 | |||
46 | #define R1 %rbx | ||
47 | #define R1D %ebx | ||
48 | #define R1B %bl | ||
49 | #define R1H %bh | ||
50 | |||
51 | #define R2 %rcx | ||
52 | #define R2D %ecx | ||
53 | #define R2B %cl | ||
54 | #define R2H %ch | ||
55 | |||
56 | #define R3 %rdx | ||
57 | #define R3D %edx | ||
58 | #define R3B %dl | ||
59 | #define R3H %dh | ||
60 | |||
61 | |||
62 | /* performs input whitening: XOR the data with whitening keys w[0..3] */ | ||
63 | #define input_whitening(src,context,offset)\ | ||
64 | xor w+offset(context), src; | ||
65 | |||
66 | /* performs output whitening: XOR the data with whitening keys w[4..7] (w+16) */ | ||
67 | #define output_whitening(src,context,offset)\ | ||
68 | xor w+16+offset(context), src; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * a input register containing a (rotated 16) | ||
73 | * b input register containing b | ||
74 | * c input register containing c | ||
75 | * d input register containing d (already rol $1) | ||
76 | * operations on a and b are interleaved to increase performance | ||
 * round: byte offset of this round's subkey pair within k[]
 *        (round number * 8)
 * %r11 must hold the ctx base address (s-boxes + keys)
 * clobbers %edi, %r8d, %r9d and the flags
 * NOTE(review): the $15/$16 rotate counts appear to fold Twofish's
 * rol/ror $1 pre/post-rotations into neighbouring rotates - confirm
 * against the Twofish reference implementation
77 | */ | ||
78 | #define encrypt_round(a,b,c,d,round)\ | ||
79 | movzx b ## B, %edi;\ | ||
80 | mov s1(%r11,%rdi,4),%r8d;\ | ||
81 | movzx a ## B, %edi;\ | ||
82 | mov s2(%r11,%rdi,4),%r9d;\ | ||
83 | movzx b ## H, %edi;\ | ||
84 | ror $16, b ## D;\ | ||
85 | xor s2(%r11,%rdi,4),%r8d;\ | ||
86 | movzx a ## H, %edi;\ | ||
87 | ror $16, a ## D;\ | ||
88 | xor s3(%r11,%rdi,4),%r9d;\ | ||
89 | movzx b ## B, %edi;\ | ||
90 | xor s3(%r11,%rdi,4),%r8d;\ | ||
91 | movzx a ## B, %edi;\ | ||
92 | xor (%r11,%rdi,4), %r9d;\ | ||
93 | movzx b ## H, %edi;\ | ||
94 | ror $15, b ## D;\ | ||
95 | xor (%r11,%rdi,4), %r8d;\ | ||
96 | movzx a ## H, %edi;\ | ||
97 | xor s1(%r11,%rdi,4),%r9d;\ | ||
98 | add %r8d, %r9d;\ | ||
99 | add %r9d, %r8d;\ | ||
100 | add k+round(%r11), %r9d;\ | ||
101 | xor %r9d, c ## D;\ | ||
102 | rol $15, c ## D;\ | ||
103 | add k+4+round(%r11),%r8d;\ | ||
104 | xor %r8d, d ## D; | ||
105 | |||
106 | /* | ||
107 | * a input register containing a(rotated 16) | ||
108 | * b input register containing b | ||
109 | * c input register containing c | ||
110 | * d input register containing d (already rol $1) | ||
111 | * operations on a and b are interleaved to increase performance | ||
112 | * during the round a and b are prepared for the output whitening | ||
 * (the 64-bit pair b:a is assembled in %r10 for the caller)
 * round: byte offset of this round's subkey pair within k[]
 * %r11 must hold the ctx base address (s-boxes + keys)
 * clobbers %edi, %r8d, %r9d, %r10 and the flags
113 | */ | ||
114 | #define encrypt_last_round(a,b,c,d,round)\ | ||
115 | mov b ## D, %r10d;\ | ||
116 | shl $32, %r10;\ | ||
117 | movzx b ## B, %edi;\ | ||
118 | mov s1(%r11,%rdi,4),%r8d;\ | ||
119 | movzx a ## B, %edi;\ | ||
120 | mov s2(%r11,%rdi,4),%r9d;\ | ||
121 | movzx b ## H, %edi;\ | ||
122 | ror $16, b ## D;\ | ||
123 | xor s2(%r11,%rdi,4),%r8d;\ | ||
124 | movzx a ## H, %edi;\ | ||
125 | ror $16, a ## D;\ | ||
126 | xor s3(%r11,%rdi,4),%r9d;\ | ||
127 | movzx b ## B, %edi;\ | ||
128 | xor s3(%r11,%rdi,4),%r8d;\ | ||
129 | movzx a ## B, %edi;\ | ||
130 | xor (%r11,%rdi,4), %r9d;\ | ||
131 | xor a, %r10;\ | ||
132 | movzx b ## H, %edi;\ | ||
133 | xor (%r11,%rdi,4), %r8d;\ | ||
134 | movzx a ## H, %edi;\ | ||
135 | xor s1(%r11,%rdi,4),%r9d;\ | ||
136 | add %r8d, %r9d;\ | ||
137 | add %r9d, %r8d;\ | ||
138 | add k+round(%r11), %r9d;\ | ||
139 | xor %r9d, c ## D;\ | ||
140 | ror $1, c ## D;\ | ||
141 | add k+4+round(%r11),%r8d;\ | ||
142 | xor %r8d, d ## D | ||
143 | |||
144 | /* | ||
145 | * a input register containing a | ||
146 | * b input register containing b (rotated 16) | ||
147 | * c input register containing c (already rol $1) | ||
148 | * d input register containing d | ||
149 | * operations on a and b are interleaved to increase performance | ||
 * round: byte offset of this round's subkey pair within k[]
 *        (round number * 8)
 * %r11 must hold the ctx base address (s-boxes + keys)
 * clobbers %edi, %r8d, %r9d and the flags
 * NOTE(review): the $15/$16 rotate counts appear to fold Twofish's
 * rol/ror $1 steps into neighbouring rotates - confirm against the
 * Twofish reference implementation
150 | */ | ||
151 | #define decrypt_round(a,b,c,d,round)\ | ||
152 | movzx a ## B, %edi;\ | ||
153 | mov (%r11,%rdi,4), %r9d;\ | ||
154 | movzx b ## B, %edi;\ | ||
155 | mov s3(%r11,%rdi,4),%r8d;\ | ||
156 | movzx a ## H, %edi;\ | ||
157 | ror $16, a ## D;\ | ||
158 | xor s1(%r11,%rdi,4),%r9d;\ | ||
159 | movzx b ## H, %edi;\ | ||
160 | ror $16, b ## D;\ | ||
161 | xor (%r11,%rdi,4), %r8d;\ | ||
162 | movzx a ## B, %edi;\ | ||
163 | xor s2(%r11,%rdi,4),%r9d;\ | ||
164 | movzx b ## B, %edi;\ | ||
165 | xor s1(%r11,%rdi,4),%r8d;\ | ||
166 | movzx a ## H, %edi;\ | ||
167 | ror $15, a ## D;\ | ||
168 | xor s3(%r11,%rdi,4),%r9d;\ | ||
169 | movzx b ## H, %edi;\ | ||
170 | xor s2(%r11,%rdi,4),%r8d;\ | ||
171 | add %r8d, %r9d;\ | ||
172 | add %r9d, %r8d;\ | ||
173 | add k+round(%r11), %r9d;\ | ||
174 | xor %r9d, c ## D;\ | ||
175 | add k+4+round(%r11),%r8d;\ | ||
176 | xor %r8d, d ## D;\ | ||
177 | rol $15, d ## D; | ||
178 | |||
179 | /* | ||
180 | * a input register containing a | ||
181 | * b input register containing b | ||
182 | * c input register containing c (already rol $1) | ||
183 | * d input register containing d | ||
184 | * operations on a and b are interleaved to increase performance | ||
185 | * during the round a and b are prepared for the output whitening | ||
 * (the 64-bit pair b:a is assembled in %r10 for the caller)
 * round: byte offset of this round's subkey pair within k[]
 * %r11 must hold the ctx base address (s-boxes + keys)
 * clobbers %edi, %r8d, %r9d, %r10 and the flags
186 | */ | ||
187 | #define decrypt_last_round(a,b,c,d,round)\ | ||
188 | movzx a ## B, %edi;\ | ||
189 | mov (%r11,%rdi,4), %r9d;\ | ||
190 | movzx b ## B, %edi;\ | ||
191 | mov s3(%r11,%rdi,4),%r8d;\ | ||
192 | movzx b ## H, %edi;\ | ||
193 | ror $16, b ## D;\ | ||
194 | xor (%r11,%rdi,4), %r8d;\ | ||
195 | movzx a ## H, %edi;\ | ||
196 | mov b ## D, %r10d;\ | ||
197 | shl $32, %r10;\ | ||
198 | xor a, %r10;\ | ||
199 | ror $16, a ## D;\ | ||
200 | xor s1(%r11,%rdi,4),%r9d;\ | ||
201 | movzx b ## B, %edi;\ | ||
202 | xor s1(%r11,%rdi,4),%r8d;\ | ||
203 | movzx a ## B, %edi;\ | ||
204 | xor s2(%r11,%rdi,4),%r9d;\ | ||
205 | movzx b ## H, %edi;\ | ||
206 | xor s2(%r11,%rdi,4),%r8d;\ | ||
207 | movzx a ## H, %edi;\ | ||
208 | xor s3(%r11,%rdi,4),%r9d;\ | ||
209 | add %r8d, %r9d;\ | ||
210 | add %r9d, %r8d;\ | ||
211 | add k+round(%r11), %r9d;\ | ||
212 | xor %r9d, c ## D;\ | ||
213 | add k+4+round(%r11),%r8d;\ | ||
214 | xor %r8d, d ## D;\ | ||
215 | ror $1, d ## D; | ||
216 | |||
217 | .align 8 | ||
218 | .global twofish_enc_blk | ||
219 | .global twofish_dec_blk | ||
220 | |||
/*
 * twofish_enc_blk - encrypt one 16-byte block from %rdx to %rsi
 * In:  %rdi = crypto tfm address, %rsi = output block, %rdx = input block
 * Out: %rax = 1 (unconditional)
 * R1 (%rbx) is callee-saved, hence the push/pop around the body.
 */
221 | twofish_enc_blk: | ||
222 | pushq R1 | ||
223 | |||
224 | /* %rdi contains the crypto tfm address */ | ||
225 | /* %rsi contains the output address */ | ||
226 | /* %rdx contains the input address */ | ||
227 | add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ | ||
228 | /* ctx address is moved to free one non-rex register | ||
229 | as target for the 8bit high operations */ | ||
230 | mov %rdi, %r11 | ||
231 | |||
232 | movq (R3), R1 | ||
233 | movq 8(R3), R3 | ||
234 | input_whitening(R1,%r11,a_offset) | ||
235 | input_whitening(R3,%r11,c_offset) | ||
/* unpack the four 32-bit words; apply rol $16 to a and rol $1 to d,
   the pre-rotations the encrypt round macros expect */
236 | mov R1D, R0D | ||
237 | rol $16, R0D | ||
238 | shr $32, R1 | ||
239 | mov R3D, R2D | ||
240 | shr $32, R3 | ||
241 | rol $1, R3D | ||
242 | |||
243 | encrypt_round(R0,R1,R2,R3,0); | ||
244 | encrypt_round(R2,R3,R0,R1,8); | ||
245 | encrypt_round(R0,R1,R2,R3,2*8); | ||
246 | encrypt_round(R2,R3,R0,R1,3*8); | ||
247 | encrypt_round(R0,R1,R2,R3,4*8); | ||
248 | encrypt_round(R2,R3,R0,R1,5*8); | ||
249 | encrypt_round(R0,R1,R2,R3,6*8); | ||
250 | encrypt_round(R2,R3,R0,R1,7*8); | ||
251 | encrypt_round(R0,R1,R2,R3,8*8); | ||
252 | encrypt_round(R2,R3,R0,R1,9*8); | ||
253 | encrypt_round(R0,R1,R2,R3,10*8); | ||
254 | encrypt_round(R2,R3,R0,R1,11*8); | ||
255 | encrypt_round(R0,R1,R2,R3,12*8); | ||
256 | encrypt_round(R2,R3,R0,R1,13*8); | ||
257 | encrypt_round(R0,R1,R2,R3,14*8); | ||
258 | encrypt_last_round(R2,R3,R0,R1,15*8); | ||
259 | |||
260 | |||
/* %r10 holds the 64-bit pair assembled by encrypt_last_round */
261 | output_whitening(%r10,%r11,a_offset) | ||
262 | movq %r10, (%rsi) | ||
263 | |||
/* pack the remaining two words (R1:R0) into one qword */
264 | shl $32, R1 | ||
265 | xor R0, R1 | ||
266 | |||
267 | output_whitening(R1,%r11,c_offset) | ||
268 | movq R1, 8(%rsi) | ||
269 | |||
270 | popq R1 | ||
271 | movq $1,%rax | ||
272 | ret | ||
273 | |||
/*
 * twofish_dec_blk - decrypt one 16-byte block from %rdx to %rsi
 * In:  %rdi = crypto tfm address, %rsi = output block, %rdx = input block
 * Out: %rax = 1 (unconditional)
 * Whitening is applied in reverse order: output-whitening keys are
 * removed on entry, input-whitening keys on exit.
 * R1 (%rbx) is callee-saved, hence the push/pop around the body.
 */
274 | twofish_dec_blk: | ||
275 | pushq R1 | ||
276 | |||
277 | /* %rdi contains the crypto tfm address */ | ||
278 | /* %rsi contains the output address */ | ||
279 | /* %rdx contains the input address */ | ||
280 | add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ | ||
281 | /* ctx address is moved to free one non-rex register | ||
282 | as target for the 8bit high operations */ | ||
283 | mov %rdi, %r11 | ||
284 | |||
285 | movq (R3), R1 | ||
286 | movq 8(R3), R3 | ||
287 | output_whitening(R1,%r11,a_offset) | ||
288 | output_whitening(R3,%r11,c_offset) | ||
/* unpack the four 32-bit words; apply rol $16 to b and rol $1 to c,
   the pre-rotations the decrypt round macros expect */
289 | mov R1D, R0D | ||
290 | shr $32, R1 | ||
291 | rol $16, R1D | ||
292 | mov R3D, R2D | ||
293 | shr $32, R3 | ||
294 | rol $1, R2D | ||
295 | |||
296 | decrypt_round(R0,R1,R2,R3,15*8); | ||
297 | decrypt_round(R2,R3,R0,R1,14*8); | ||
298 | decrypt_round(R0,R1,R2,R3,13*8); | ||
299 | decrypt_round(R2,R3,R0,R1,12*8); | ||
300 | decrypt_round(R0,R1,R2,R3,11*8); | ||
301 | decrypt_round(R2,R3,R0,R1,10*8); | ||
302 | decrypt_round(R0,R1,R2,R3,9*8); | ||
303 | decrypt_round(R2,R3,R0,R1,8*8); | ||
304 | decrypt_round(R0,R1,R2,R3,7*8); | ||
305 | decrypt_round(R2,R3,R0,R1,6*8); | ||
306 | decrypt_round(R0,R1,R2,R3,5*8); | ||
307 | decrypt_round(R2,R3,R0,R1,4*8); | ||
308 | decrypt_round(R0,R1,R2,R3,3*8); | ||
309 | decrypt_round(R2,R3,R0,R1,2*8); | ||
310 | decrypt_round(R0,R1,R2,R3,1*8); | ||
311 | decrypt_last_round(R2,R3,R0,R1,0); | ||
312 | |||
/* %r10 holds the 64-bit pair assembled by decrypt_last_round */
313 | input_whitening(%r10,%r11,a_offset) | ||
314 | movq %r10, (%rsi) | ||
315 | |||
/* pack the remaining two words (R1:R0) into one qword */
316 | shl $32, R1 | ||
317 | xor R0, R1 | ||
318 | |||
319 | input_whitening(R1,%r11,c_offset) | ||
320 | movq R1, 8(%rsi) | ||
321 | |||
322 | popq R1 | ||
323 | movq $1,%rax | ||
324 | ret | ||