author		Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>	2012-07-11 13:38:57 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2012-08-01 05:47:30 -0400
commit		4ea1277d301eb776e321684cd4ea95116b4e8847 (patch)
tree		675ef40d239946bc3232861cdf5a84259da09dc6 /arch/x86/crypto
parent		9b8b04051d0df1e2c7c31206caff05673a2c685f (diff)
crypto: cast6 - add x86_64/avx assembler implementation
This patch adds a x86_64/avx assembler implementation of the Cast6 block
cipher. The implementation processes eight blocks in parallel (two 4 block
chunk AVX operations). The table-lookups are done in general-purpose
registers. For small blocksizes the functions from the generic module are
called. A good performance increase is provided for blocksizes greater or
equal to 128B.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast6-avx-x86_64 vs. cast6-generic
128bit key:                                             (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   1.00x   1.01x   1.01x   0.99x   0.97x   0.98x   1.01x   0.96x   0.98x
64B     0.98x   0.99x   1.02x   1.01x   0.99x   1.00x   1.01x   0.99x   1.00x   0.99x
256B    1.77x   1.84x   0.99x   1.85x   1.77x   1.77x   1.70x   1.74x   1.69x   1.72x
1024B   1.93x   1.95x   0.99x   1.96x   1.93x   1.93x   1.84x   1.85x   1.89x   1.87x
8192B   1.91x   1.95x   0.99x   1.97x   1.95x   1.91x   1.86x   1.87x   1.93x   1.90x

256bit key:                                             (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   0.99x   1.02x   1.01x   0.98x   0.99x   1.00x   1.00x   0.98x   0.98x
64B     0.98x   0.99x   1.01x   1.00x   1.00x   1.00x   1.01x   1.01x   0.97x   1.00x
256B    1.77x   1.83x   1.00x   1.86x   1.79x   1.78x   1.70x   1.76x   1.71x   1.69x
1024B   1.92x   1.95x   0.99x   1.96x   1.93x   1.93x   1.83x   1.86x   1.89x   1.87x
8192B   1.94x   1.95x   0.99x   1.97x   1.95x   1.95x   1.87x   1.87x   1.93x   1.91x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
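The generic fallback mentioned above is handled by the glue code in cast6_avx_glue.c (added below): a buffer is passed to the 8-way AVX routine only when it holds a full batch of eight 16-byte blocks; anything smaller is processed block by block by the generic C implementation. A minimal sketch of that dispatch (cast6_crypt_chunk is a hypothetical name used for illustration only; the other identifiers are introduced by this patch):

static void cast6_crypt_chunk(struct cast6_ctx *ctx, u8 *buf, unsigned int nbytes)
{
	const unsigned int bsize = CAST6_BLOCK_SIZE;	/* 16 bytes */

	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
		/* full batch of eight blocks: one call into the AVX
		 * assembler, which works on two 4-block chunks */
		cast6_enc_blk_xway(ctx, buf, buf);
		return;
	}

	/* partial batch: fall back to the generic module, block by block */
	for (; nbytes >= bsize; nbytes -= bsize, buf += bsize)
		__cast6_encrypt(ctx, buf, buf);
}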
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--	arch/x86/crypto/Makefile	2
-rw-r--r--	arch/x86/crypto/cast6-avx-x86_64-asm_64.S	335
-rw-r--r--	arch/x86/crypto/cast6_avx_glue.c	648
3 files changed, 985 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 565e82b00142..5bacb4a226ac 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
@@ -34,6 +35,7 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..d258ce0d2e06
--- /dev/null
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -0,0 +1,335 @@
+/*
+ * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "cast6-avx-x86_64-asm_64.S"
+.text
+
+.extern cast6_s1
+.extern cast6_s2
+.extern cast6_s3
+.extern cast6_s4
+
+/* structure of crypto context */
+#define km	0
+#define kr	(12*4*4)
+
+/* s-boxes */
+#define s1	cast6_s1
+#define s2	cast6_s2
+#define s3	cast6_s3
+#define s4	cast6_s4
+
+/**********************************************************************
+  8-way AVX cast6
+ **********************************************************************/
+#define CTX %rdi
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKRF %xmm10
+#define RKRR %xmm11
+
+#define RTMP %xmm12
+#define RMASK %xmm13
+#define R32 %xmm14
+
+#define RID1 %rax
+#define RID1b %al
+#define RID2 %rbx
+#define RID2b %bl
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3) \
+	movb src ## bl, RID1b; \
+	movb src ## bh, RID2b; \
+	movl s1(, RID1, 4), dst ## d; \
+	op1 s2(, RID2, 4), dst ## d; \
+	shrq $16, src; \
+	movb src ## bl, RID1b; \
+	movb src ## bh, RID2b; \
+	op2 s3(, RID1, 4), dst ## d; \
+	op3 s4(, RID2, 4), dst ## d;
+
+#define F(a, x, op0, op1, op2, op3) \
+	op0 a, RKM, x; \
+	vpslld RKRF, x, RTMP; \
+	vpsrld RKRR, x, x; \
+	vpor RTMP, x, x; \
+	\
+	vpshufb RMASK, x, x; \
+	vmovq x, RGI1; \
+	vpsrldq $8, x, x; \
+	vmovq x, RGI2; \
+	\
+	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
+	shrq $16, RGI1; \
+	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
+	shlq $32, RFS2; \
+	orq RFS1, RFS2; \
+	\
+	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
+	shrq $16, RGI2; \
+	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
+	shlq $32, RFS3; \
+	orq RFS1, RFS3; \
+	\
+	vmovq RFS2, x; \
+	vpinsrq $1, RFS3, x, x;
+
+#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
+#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
+#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
+
+#define qop(in, out, x, f) \
+	F ## f(in ## 1, x); \
+	vpxor out ## 1, x, out ## 1; \
+	F ## f(in ## 2, x); \
+	vpxor out ## 2, x, out ## 2; \
+
+#define Q(n) \
+	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RD, RC, RX, 1); \
+	\
+	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RC, RB, RX, 2); \
+	\
+	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RB, RA, RX, 3); \
+	\
+	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RA, RD, RX, 1);
+
+#define QBAR(n) \
+	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RA, RD, RX, 1); \
+	\
+	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RB, RA, RX, 3); \
+	\
+	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RC, RB, RX, 2); \
+	\
+	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
+	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	qop(RD, RC, RX, 1);
+
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq x1, x0, t0; \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x3; \
+	\
+	vpunpcklqdq t1, t0, x0; \
+	vpunpckhqdq t1, t0, x1; \
+	vpunpcklqdq x3, t2, x2; \
+	vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	vmovdqu (0*4*4)(in), x0; \
+	vmovdqu (1*4*4)(in), x1; \
+	vmovdqu (2*4*4)(in), x2; \
+	vmovdqu (3*4*4)(in), x3; \
+	vpshufb RMASK, x0, x0; \
+	vpshufb RMASK, x1, x1; \
+	vpshufb RMASK, x2, x2; \
+	vpshufb RMASK, x3, x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vpshufb RMASK, x0, x0; \
+	vpshufb RMASK, x1, x1; \
+	vpshufb RMASK, x2, x2; \
+	vpshufb RMASK, x3, x3; \
+	vmovdqu x0, (0*4*4)(out); \
+	vmovdqu x1, (1*4*4)(out); \
+	vmovdqu x2, (2*4*4)(out); \
+	vmovdqu x3, (3*4*4)(out);
+
+#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vpshufb RMASK, x0, x0; \
+	vpshufb RMASK, x1, x1; \
+	vpshufb RMASK, x2, x2; \
+	vpshufb RMASK, x3, x3; \
+	vpxor (0*4*4)(out), x0, x0; \
+	vmovdqu x0, (0*4*4)(out); \
+	vpxor (1*4*4)(out), x1, x1; \
+	vmovdqu x1, (1*4*4)(out); \
+	vpxor (2*4*4)(out), x2, x2; \
+	vmovdqu x2, (2*4*4)(out); \
+	vpxor (3*4*4)(out), x3, x3; \
+	vmovdqu x3, (3*4*4)(out);
+
+.align 16
+.Lbswap_mask:
+	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.L32_mask:
+	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0
+
+.align 16
+.global __cast6_enc_blk_8way
+.type __cast6_enc_blk_8way,@function;
+
+__cast6_enc_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool, if true: xor output
+	 */
+
+	pushq %rbx;
+	pushq %rcx;
+
+	vmovdqu .Lbswap_mask, RMASK;
+	vmovdqu .L32_mask, R32;
+	vpxor RKRF, RKRF, RKRF;
+
+	leaq (4*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+
+	xorq RID1, RID1;
+	xorq RID2, RID2;
+
+	Q(0);
+	Q(1);
+	Q(2);
+	Q(3);
+	Q(4);
+	Q(5);
+	QBAR(6);
+	QBAR(7);
+	QBAR(8);
+	QBAR(9);
+	QBAR(10);
+	QBAR(11);
+
+	popq %rcx;
+	popq %rbx;
+
+	leaq (4*4*4)(%rsi), %rax;
+
+	testb %cl, %cl;
+	jnz __enc_xor8;
+
+	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+
+	ret;
+
+__enc_xor8:
+	outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
+	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+
+	ret;
+
+.align 16
+.global cast6_dec_blk_8way
+.type cast6_dec_blk_8way,@function;
+
+cast6_dec_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %rbx;
+
+	vmovdqu .Lbswap_mask, RMASK;
+	vmovdqu .L32_mask, R32;
+	vpxor RKRF, RKRF, RKRF;
+
+	leaq (4*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+
+	xorq RID1, RID1;
+	xorq RID2, RID2;
+
+	Q(11);
+	Q(10);
+	Q(9);
+	Q(8);
+	Q(7);
+	Q(6);
+	QBAR(5);
+	QBAR(4);
+	QBAR(3);
+	QBAR(2);
+	QBAR(1);
+	QBAR(0);
+
+	popq %rbx;
+
+	leaq (4*4*4)(%rsi), %rax;
+	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+
+	ret;
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
new file mode 100644
index 000000000000..15e5f85a5011
--- /dev/null
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -0,0 +1,648 @@
+/*
+ * Glue Code for the AVX assembler implemention of the Cast6 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/cast6.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAST6_PARALLEL_BLOCKS 8
+
+asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
+				     const u8 *src, bool xor);
+asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+
+static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	__cast6_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
+					  const u8 *src)
+{
+	__cast6_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	cast6_dec_blk_8way(ctx, dst, src);
+}
+
+
+static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+{
+	u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
+	unsigned int j;
+
+	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
+		ivs[j] = src[j];
+
+	cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
+		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+{
+	be128 ctrblk;
+
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
+
+	__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+	u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+				 u128 *iv)
+{
+	be128 ctrblks[CAST6_PARALLEL_BLOCKS];
+	unsigned int i;
+
+	for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
+		if (dst != src)
+			dst[i] = src[i];
+
+		u128_to_be128(&ctrblks[i], iv);
+		u128_inc(iv);
+	}
+
+	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx cast6_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx cast6_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx cast6_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx cast6_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&cast6_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&cast6_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__cast6_encrypt), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&cast6_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&cast6_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool cast6_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS,
+			      NULL, fpu_enabled, nbytes);
+}
+
+static inline void cast6_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct cast6_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAST6_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__cast6_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAST6_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
+		cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__cast6_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+struct cast6_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct cast6_ctx cast6_ctx;
+};
+
+static int lrw_cast6_setkey(struct crypto_tfm *tfm, const u8 *key,
+			    unsigned int keylen)
+{
+	struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __cast6_setkey(&ctx->cast6_ctx, key, keylen - CAST6_BLOCK_SIZE,
+			     &tfm->crt_flags);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table, key + keylen - CAST6_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAST6_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->cast6_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	cast6_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAST6_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->cast6_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	cast6_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+struct cast6_xts_ctx {
+	struct cast6_ctx tweak_ctx;
+	struct cast6_ctx crypt_ctx;
+};
+
+static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key,
+			    unsigned int keylen)
+{
+	struct cast6_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+			      flags);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAST6_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	cast6_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAST6_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	cast6_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg cast6_algs[10] = { {
+	.cra_name = "__ecb-cast6-avx",
+	.cra_driver_name = "__driver-ecb-cast6-avx",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cast6_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE,
+			.setkey = cast6_setkey,
+			.encrypt = ecb_encrypt,
+			.decrypt = ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name = "__cbc-cast6-avx",
+	.cra_driver_name = "__driver-cbc-cast6-avx",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cast6_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE,
+			.setkey = cast6_setkey,
+			.encrypt = cbc_encrypt,
+			.decrypt = cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name = "__ctr-cast6-avx",
+	.cra_driver_name = "__driver-ctr-cast6-avx",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct cast6_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = cast6_setkey,
+			.encrypt = ctr_crypt,
+			.decrypt = ctr_crypt,
+		},
+	},
+}, {
+	.cra_name = "__lrw-cast6-avx",
+	.cra_driver_name = "__driver-lrw-cast6-avx",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cast6_lrw_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_exit = lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE +
+				       CAST6_BLOCK_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE +
+				       CAST6_BLOCK_SIZE,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = lrw_cast6_setkey,
+			.encrypt = lrw_encrypt,
+			.decrypt = lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name = "__xts-cast6-avx",
+	.cra_driver_name = "__driver-xts-cast6-avx",
+	.cra_priority = 0,
+	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cast6_xts_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_blkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE * 2,
+			.max_keysize = CAST6_MAX_KEY_SIZE * 2,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = xts_cast6_setkey,
+			.encrypt = xts_encrypt,
+			.decrypt = xts_decrypt,
+		},
+	},
+}, {
+	.cra_name = "ecb(cast6)",
+	.cra_driver_name = "ecb-cast6-avx",
+	.cra_priority = 200,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name = "cbc(cast6)",
+	.cra_driver_name = "cbc-cast6-avx",
+	.cra_priority = 200,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = __ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name = "ctr(cast6)",
+	.cra_driver_name = "ctr-cast6-avx",
+	.cra_priority = 200,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_encrypt,
+			.geniv = "chainiv",
+		},
+	},
+}, {
+	.cra_name = "lrw(cast6)",
+	.cra_driver_name = "lrw-cast6-avx",
+	.cra_priority = 200,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE +
+				       CAST6_BLOCK_SIZE,
+			.max_keysize = CAST6_MAX_KEY_SIZE +
+				       CAST6_BLOCK_SIZE,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name = "xts(cast6)",
+	.cra_driver_name = "xts-cast6-avx",
+	.cra_priority = 200,
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = CAST6_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct async_helper_ctx),
+	.cra_alignmask = 0,
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_module = THIS_MODULE,
+	.cra_init = ablk_init,
+	.cra_exit = ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = CAST6_MIN_KEY_SIZE * 2,
+			.max_keysize = CAST6_MAX_KEY_SIZE * 2,
+			.ivsize = CAST6_BLOCK_SIZE,
+			.setkey = ablk_set_key,
+			.encrypt = ablk_encrypt,
+			.decrypt = ablk_decrypt,
+		},
+	},
+} };
+
+static int __init cast6_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave) {
+		pr_info("AVX instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(cast6_algs, ARRAY_SIZE(cast6_algs));
+}
+
+static void __exit cast6_exit(void)
+{
+	crypto_unregister_algs(cast6_algs, ARRAY_SIZE(cast6_algs));
+}
+
+module_init(cast6_init);
+module_exit(cast6_exit);
+
+MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("cast6");