author     Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>  2012-07-11 13:38:57 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>  2012-08-01 05:47:30 -0400
commit     4ea1277d301eb776e321684cd4ea95116b4e8847
tree       675ef40d239946bc3232861cdf5a84259da09dc6
parent     9b8b04051d0df1e2c7c31206caff05673a2c685f
crypto: cast6 - add x86_64/avx assembler implementation
This patch adds an x86_64/avx assembler implementation of the Cast6 block
cipher. The implementation processes eight blocks in parallel (two 4-block
chunk AVX operations). The table-lookups are done in general-purpose
registers. For small block sizes the functions from the generic module are
called. A good performance increase is provided for block sizes greater
than or equal to 128B.

The patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast6-avx-x86_64 vs. cast6-generic
128bit key:                                         (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   1.00x   1.01x   1.01x   0.99x   0.97x   0.98x   1.01x   0.96x   0.98x
64B     0.98x   0.99x   1.02x   1.01x   0.99x   1.00x   1.01x   0.99x   1.00x   0.99x
256B    1.77x   1.84x   0.99x   1.85x   1.77x   1.77x   1.70x   1.74x   1.69x   1.72x
1024B   1.93x   1.95x   0.99x   1.96x   1.93x   1.93x   1.84x   1.85x   1.89x   1.87x
8192B   1.91x   1.95x   0.99x   1.97x   1.95x   1.91x   1.86x   1.87x   1.93x   1.90x

256bit key:                                         (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   0.99x   1.02x   1.01x   0.98x   0.99x   1.00x   1.00x   0.98x   0.98x
64B     0.98x   0.99x   1.01x   1.00x   1.00x   1.00x   1.01x   1.01x   0.97x   1.00x
256B    1.77x   1.83x   1.00x   1.86x   1.79x   1.78x   1.70x   1.76x   1.71x   1.69x
1024B   1.92x   1.95x   0.99x   1.96x   1.93x   1.93x   1.83x   1.86x   1.89x   1.87x
8192B   1.94x   1.95x   0.99x   1.97x   1.95x   1.95x   1.87x   1.87x   1.93x   1.91x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
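To illustrate the dispatch described above, here is a minimal C sketch; it is
not taken from this patch, and the helper names and the 8-block constant are
hypothetical stand-ins for the glue and generic code:

	/*
	 * Minimal sketch of the 8-blocks-at-a-time dispatch; all names
	 * here are hypothetical stand-ins for the real glue/generic code.
	 */
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	#define CAST6_BLOCK_SIZE 16
	#define CAST6_PARALLEL_BLOCKS 8

	void __cast6_enc_blk_8way(void *ctx, uint8_t *dst, const uint8_t *src,
				  bool xor_out);          /* this patch */
	void cast6_encrypt_one(void *ctx, uint8_t *dst,
			       const uint8_t *src);       /* generic module */

	static void cast6_ecb_encrypt_sketch(void *ctx, uint8_t *dst,
					     const uint8_t *src, size_t nbytes)
	{
		/* fast path: two 4-block AVX chunks per call */
		while (nbytes >= CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE) {
			__cast6_enc_blk_8way(ctx, dst, src, false);
			src += CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE;
			dst += CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE;
			nbytes -= CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE;
		}
		/* small sizes: fall back to the generic one-block function */
		while (nbytes >= CAST6_BLOCK_SIZE) {
			cast6_encrypt_one(ctx, dst, src);
			src += CAST6_BLOCK_SIZE;
			dst += CAST6_BLOCK_SIZE;
			nbytes -= CAST6_BLOCK_SIZE;
		}
	}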
Diffstat (limited to 'arch/x86/crypto/cast6-avx-x86_64-asm_64.S')
-rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S  335
1 file changed, 335 insertions, 0 deletions
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
new file mode 100644
index 00000000000..d258ce0d2e0
--- /dev/null
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -0,0 +1,335 @@
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

.file "cast6-avx-x86_64-asm_64.S"
.text

.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4

/* structure of crypto context */
#define km 0
#define kr (12*4*4)
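/*
 * The context is assumed to match the generic cast6 key schedule:
 * twelve quad-rounds x four 32-bit masking keys Km at offset 0
 * (hence kr = 12*4*4 = 192 bytes), followed by the rotation keys Kr
 * stored as one byte per subkey.
 */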

/* s-boxes */
#define s1 cast6_s1
#define s2 cast6_s2
#define s3 cast6_s3
#define s4 cast6_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKRF %xmm10
#define RKRR %xmm11

#define RTMP  %xmm12
#define RMASK %xmm13
#define R32   %xmm14

#define RID1  %rax
#define RID1b %al
#define RID2  %rbx
#define RID2b %bl

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


#define lookup_32bit(src, dst, op1, op2, op3) \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	shrq $16, src; \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;
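
/*
 * lookup_32bit consumes the low 32 bits of src two bytes at a time
 * (shifting src right by 16 halfway through) and folds the four
 * s-box entries into dst:
 *
 *	dst = ((s1[b0] op1 s2[b1]) op2 s3[b2]) op3 s4[b3]
 *
 * where b0..b3 are the four bytes of src's low dword.
 */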

#define F(a, x, op0, op1, op2, op3) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vpshufb RMASK, x, x; \
	vmovq x, RGI1; \
	vpsrldq $8, x, x; \
	vmovq x, RGI2; \
	\
	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
	shrq $16, RGI1; \
	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	\
	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
	shrq $16, RGI2; \
	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
	shlq $32, RFS3; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
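
/*
 * F vectorizes the three CAST-256 round functions of RFC 2612 across
 * one 4-block chunk: I = rotl32(Km op0 b, Kr), built from the
 * vpslld/vpsrld/vpor triple (RKRR holds 32 - Kr), followed by the
 * per-dword s-box folding done by lookup_32bit:
 *
 *	F1: I = rotl32(Km + b, Kr); x = ((s1 ^ s2) - s3) + s4
 *	F2: I = rotl32(Km ^ b, Kr); x = ((s1 - s2) + s3) ^ s4
 *	F3: I = rotl32(Km - b, Kr); x = ((s1 + s2) ^ s3) - s4
 */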

#define qop(in, out, x, f) \
	F ## f(in ## 1, x); \
	vpxor out ## 1, x, out ## 1; \
	F ## f(in ## 2, x); \
	vpxor out ## 2, x, out ## 2; \

#define Q(n) \
	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RD, RC, RX, 1); \
	\
	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RC, RB, RX, 2); \
	\
	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RB, RA, RX, 3); \
	\
	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RA, RD, RX, 1);

#define QBAR(n) \
	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RA, RD, RX, 1); \
	\
	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RB, RA, RX, 3); \
	\
	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RC, RB, RX, 2); \
	\
	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RD, RC, RX, 1);
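
/*
 * Q(n) is quad-round n of RFC 2612 applied to both 4-block chunks,
 * broadcasting the masking key Km[n][i] into all lanes of RKM and
 * inserting the matching rotation byte Kr[n][i] into RKRF before
 * each step:
 *
 *	C ^= f1(D, Km[n][0], Kr[n][0]);
 *	B ^= f2(C, Km[n][1], Kr[n][1]);
 *	A ^= f3(B, Km[n][2], Kr[n][2]);
 *	D ^= f1(A, Km[n][3], Kr[n][3]);
 *
 * QBAR(n) is the inverse quad-round: the same four steps in reverse
 * order.
 */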


#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;
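
/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit words and
 * transposes it with two rounds of unpacks. On input each register
 * holds one 16-byte block; on output x0..x3 hold the first..fourth
 * word (A, B, C, D) of all four blocks, so each round step touches
 * four blocks with a single vector instruction.
 */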

#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	vmovdqu (0*4*4)(in), x0; \
	vmovdqu (1*4*4)(in), x1; \
	vmovdqu (2*4*4)(in), x2; \
	vmovdqu (3*4*4)(in), x3; \
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	vmovdqu x0, (0*4*4)(out); \
	vmovdqu x1, (1*4*4)(out); \
	vmovdqu x2, (2*4*4)(out); \
	vmovdqu x3, (3*4*4)(out);

#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out); \
	vpxor (2*4*4)(out), x2, x2; \
	vmovdqu x2, (2*4*4)(out); \
	vpxor (3*4*4)(out), x3, x3; \
	vmovdqu x3, (3*4*4)(out);
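
/*
 * CAST-256 operates on big-endian 32-bit words, so the vpshufb with
 * .Lbswap_mask byte-swaps every word on load (inpack_blocks) and
 * again on store (outunpack_blocks). outunpack_xor_blocks instead
 * xors the result into the data already at out, presumably so the
 * caller can build xor-based modes such as CTR on top of it.
 */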

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.L32_mask:
	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
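
/*
 * .Lbswap_mask is the vpshufb control for the per-word byte swap;
 * .L32_mask puts the constant 32 in R32's low quadword. Since RKRF
 * is zeroed before vpinsrb drops the Kr byte into lane 0, vpsubq
 * yields RKRR = 32 - Kr, the right-shift count that complements the
 * left shift by Kr in F's rotate.
 */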

.align 16
.global __cast6_enc_blk_8way
.type __cast6_enc_blk_8way,@function;

__cast6_enc_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbx;
	pushq %rcx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	xorq RID1, RID1;
	xorq RID2, RID2;

	Q(0);
	Q(1);
	Q(2);
	Q(3);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rcx;
	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz __enc_xor8;

	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;

__enc_xor8:
	outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;

.align 16
.global cast6_dec_blk_8way
.type cast6_dec_blk_8way,@function;

cast6_dec_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	xorq RID1, RID1;
	xorq RID2, RID2;

	Q(11);
	Q(10);
	Q(9);
	Q(8);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;
	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;
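
From the register comments above and the x86_64 SysV calling convention
(%rdi, %rsi, %rdx, %rcx), the C-side declarations the glue code would use
look roughly like this; the struct tag and header choices are assumptions
for illustration, not part of this diff:

	/* Sketch of matching C prototypes; cast6_ctx is defined by the
	 * generic cast6 module, and these declarations are inferred from
	 * the input comments in the assembler above. */
	#include <linux/linkage.h>
	#include <linux/types.h>

	struct cast6_ctx;

	/* encrypt 8 blocks at dst from src; if xor is true, xor the
	 * ciphertext into the bytes already at dst */
	asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
					     const u8 *src, bool xor);

	/* decrypt 8 blocks */
	asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
					   const u8 *src);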