author    Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>    2012-07-11 13:37:37 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>    2012-08-01 05:47:30 -0400
commit    4d6d6a2c850f89bc9283d02519cb536baba72032 (patch)
tree      8433747260d88000d79849bcd4db0e56b86aa6e4 /arch/x86/crypto/cast5-avx-x86_64-asm_64.S
parent    a2c5826095562983bf316e3a7eb137ef04a71a24 (diff)
crypto: cast5 - add x86_64/avx assembler implementation
This patch adds an x86_64/AVX assembler implementation of the Cast5 block cipher. The implementation processes sixteen blocks in parallel (four 4-block chunks of AVX operations). The table lookups are done in general-purpose registers. For small block sizes the functions from the generic module are called. A good performance increase is provided for block sizes greater than or equal to 128B.

The patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast5-avx-x86_64 vs. cast5-generic

64bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.02x   1.01x
64B     1.00x   1.00x   0.98x   1.00x   1.01x   1.02x
256B    2.03x   2.01x   0.95x   2.11x   2.12x   2.13x
1024B   2.30x   2.24x   0.95x   2.29x   2.35x   2.35x
8192B   2.31x   2.27x   0.95x   2.31x   2.39x   2.39x

128bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.01x   1.01x
64B     1.00x   1.00x   0.98x   1.01x   1.02x   1.01x
256B    2.17x   2.13x   0.96x   2.19x   2.19x   2.19x
1024B   2.29x   2.32x   0.95x   2.34x   2.37x   2.38x
8192B   2.35x   2.32x   0.95x   2.35x   2.39x   2.39x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
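[Editor's note] The fallback to the generic module described above lives in the C glue code, not in the assembler file added below. As a rough, non-authoritative sketch of how such a dispatch can look (CAST5_PARALLEL_BLOCKS, cast5_encrypt_one() and the helper function are illustrative assumptions; only __cast5_enc_blk_16way is defined by this patch):

#include <stdbool.h>

typedef unsigned char u8;          /* stand-in for <linux/types.h> in this sketch */

#define CAST5_BLOCK_SIZE      8    /* Cast5 is a 64-bit block cipher */
#define CAST5_PARALLEL_BLOCKS 16   /* the AVX path handles 16 blocks per call */

struct cast5_ctx;                  /* key schedule: km, kr and the rr flag */

/* 16-way AVX entry point provided by cast5-avx-x86_64-asm_64.S */
void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
                           bool xor_output);

/* single-block routine from the generic cast5 module (name assumed here) */
void cast5_encrypt_one(struct cast5_ctx *ctx, u8 *dst, const u8 *src);

/* ECB-style walk: full 16-block chunks go to the AVX code, the remainder
 * falls back to the generic single-block implementation. */
static void cast5_ecb_encrypt_sketch(struct cast5_ctx *ctx, u8 *dst,
                                     const u8 *src, unsigned int nbytes)
{
        while (nbytes >= CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE) {
                __cast5_enc_blk_16way(ctx, dst, src, false);
                src    += CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE;
                dst    += CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE;
                nbytes -= CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE;
        }

        while (nbytes >= CAST5_BLOCK_SIZE) {
                cast5_encrypt_one(ctx, dst, src);
                src    += CAST5_BLOCK_SIZE;
                dst    += CAST5_BLOCK_SIZE;
                nbytes -= CAST5_BLOCK_SIZE;
        }
}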
Diffstat (limited to 'arch/x86/crypto/cast5-avx-x86_64-asm_64.S')
-rw-r--r--    arch/x86/crypto/cast5-avx-x86_64-asm_64.S    322
1 file changed, 322 insertions, 0 deletions
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..94693c877e3b
--- /dev/null
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -0,0 +1,322 @@
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "cast5-avx-x86_64-asm_64.S"
.text

.extern cast5_s1
.extern cast5_s2
.extern cast5_s3
.extern cast5_s4

/* structure of crypto context */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)
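/*
 * Note: the offsets above are assumed to mirror the key-schedule layout of
 * the generic cast5 module: sixteen 32-bit masking keys (km) at offset 0,
 * sixteen rotation keys (kr) at offset 16*4, and the reduced-rounds flag
 * (rr) at offset (16*4)+16.  rr is non-zero when only 12 of the 16 rounds
 * are used; the structure itself is defined outside this file.
 */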

/* s-boxes */
#define s1	cast5_s1
#define s2	cast5_s2
#define s3	cast5_s3
#define s4	cast5_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define CTX %rdi

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKRF %xmm10
#define RKRR %xmm11

#define RTMP  %xmm12
#define RMASK %xmm13
#define R32   %xmm14

#define RID1  %rax
#define RID1b %al
#define RID2  %rbx
#define RID2b %bl

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


#define lookup_32bit(src, dst, op1, op2, op3) \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	movl s1(, RID1, 4), dst ## d; \
	op1  s2(, RID2, 4), dst ## d; \
	shrq $16, src; \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	op2  s3(, RID1, 4), dst ## d; \
	op3  s4(, RID2, 4), dst ## d;

#define F(a, x, op0, op1, op2, op3) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vpshufb RMASK, x, x; \
	vmovq x, RGI1; \
	vpsrldq $8, x, x; \
	vmovq x, RGI2; \
	\
	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
	shrq $16, RGI1; \
	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	\
	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
	shrq $16, RGI2; \
	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
	shlq $32, RFS3; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
#define F2(b, x) F(b, x, vpxor,  subl, addl, xorl)
#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)

#define subround(a, b, x, n, f) \
	F ## f(b, x); \
	vpxor a, x, a;

#define round(l, r, n, f) \
	vbroadcastss (km+(4*n))(CTX), RKM; \
	vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	subround(l ## 1, r ## 1, RX, n, f); \
	subround(l ## 2, r ## 2, RX, n, f); \
	subround(l ## 3, r ## 3, RX, n, f); \
	subround(l ## 4, r ## 4, RX, n, f);


#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t1; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1;

#define inpack_blocks(in, x0, x1, t0, t1) \
	vmovdqu (0*4*4)(in), x0; \
	vmovdqu (1*4*4)(in), x1; \
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	\
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(out, x0, x1, t0, t1) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vmovdqu x0, (0*4*4)(out); \
	vmovdqu x1, (1*4*4)(out);

#define outunpack_xor_blocks(out, x0, x1, t0, t1) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out);

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.L32_mask:
	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

.align 16
.global __cast5_enc_blk_16way
.type __cast5_enc_blk_16way,@function;

__cast5_enc_blk_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbx;
	pushq %rcx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
	leaq (2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL4, RR4, RTMP, RX);

	xorq RID1, RID1;
	xorq RID2, RID2;

	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);

	movb rr(CTX), %al;
	testb %al, %al;
	jnz __skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

__skip_enc:
	popq %rcx;
	popq %rbx;

	testb %cl, %cl;
	jnz __enc_xor16;

	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
	leaq (2*4*4)(%rsi), %rax;
	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);

	ret;

__enc_xor16:
	outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX);
	leaq (2*4*4)(%rsi), %rax;
	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX);

	ret;

.align 16
.global cast5_dec_blk_16way
.type cast5_dec_blk_16way,@function;

cast5_dec_blk_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
	leaq (2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL4, RR4, RTMP, RX);

	xorq RID1, RID1;
	xorq RID2, RID2;

	movb rr(CTX), %al;
	testb %al, %al;
	jnz __skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

__skip_dec:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);

	popq %rbx;

	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
	leaq (2*4*4)(%rsi), %rax;
	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);

	ret;
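
[Editor's note] For reference, the two entry points exported by this file would be declared on the C side roughly as below. Only the symbol names and the register comments come from the file; the parameter types and the struct name cast5_ctx are assumptions for illustration. The final flag of the encryption routine selects the __enc_xor16 path, which xors the result into the destination buffer instead of overwriting it.

#include <stdbool.h>

typedef unsigned char u8;   /* stand-in for the kernel's u8 in this sketch */

struct cast5_ctx;           /* key schedule: km, kr and the rr flag (see offsets above) */

/* encrypts 16 blocks; if xor_output is true, dst ^= result instead of dst = result */
void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
			   bool xor_output);

/* decrypts 16 blocks; no xor variant is provided */
void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src);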