author     Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>  2012-07-11 13:37:37 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>  2012-08-01 05:47:30 -0400
commit     4d6d6a2c850f89bc9283d02519cb536baba72032 (patch)
tree       8433747260d88000d79849bcd4db0e56b86aa6e4 /arch
parent     a2c5826095562983bf316e3a7eb137ef04a71a24 (diff)
crypto: cast5 - add x86_64/avx assembler implementation
This patch adds an x86_64/AVX assembler implementation of the Cast5 block
cipher. The implementation processes sixteen blocks in parallel (four 4-block
chunk AVX operations). The table lookups are done in general-purpose registers.
For small block sizes the functions from the generic module are called. A good
performance increase is provided for block sizes greater than or equal to 128B.

The patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast5-avx-x86_64 vs. cast5-generic
64bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.02x   1.01x
64B     1.00x   1.00x   0.98x   1.00x   1.01x   1.02x
256B    2.03x   2.01x   0.95x   2.11x   2.12x   2.13x
1024B   2.30x   2.24x   0.95x   2.29x   2.35x   2.35x
8192B   2.31x   2.27x   0.95x   2.31x   2.39x   2.39x

128bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.01x   1.01x
64B     1.00x   1.00x   0.98x   1.01x   1.02x   1.01x
256B    2.17x   2.13x   0.96x   2.19x   2.19x   2.19x
1024B   2.29x   2.32x   0.95x   2.34x   2.37x   2.38x
8192B   2.35x   2.32x   0.95x   2.35x   2.39x   2.39x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
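Because the glue code below registers "ecb(cast5)", "cbc(cast5)" and "ctr(cast5)" at priority 200, in-kernel users that allocate the cipher by name pick up the AVX driver transparently once the module is loaded on a capable CPU. A minimal caller sketch (kernel context assumed; the helper name is illustrative and not part of this patch):

#include <linux/err.h>
#include <linux/crypto.h>

/* Illustrative only: allocate cbc(cast5); with this module loaded on an
 * AVX-capable CPU the priority-200 "cbc-cast5-avx" driver is selected
 * over the generic cast5 implementation.
 */
static int cast5_cbc_example(void)
{
        struct crypto_ablkcipher *tfm;

        tfm = crypto_alloc_ablkcipher("cbc(cast5)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* ... crypto_ablkcipher_setkey() and ablkcipher requests go here ... */

        crypto_free_ablkcipher(tfm);
        return 0;
}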
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/crypto/Makefile                   |   2
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S  | 322
-rw-r--r--  arch/x86/crypto/cast5_avx_glue.c           | 530
3 files changed, 854 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e908e5de82d..565e82b0014 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
@@ -32,6 +33,7 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
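The new objects build only when CONFIG_CRYPTO_CAST5_AVX_X86_64 is set; the Kconfig entry defining that symbol lives under crypto/ and is therefore outside the 'arch' diffstat shown above. A minimal sketch of enabling the driver as a module, assuming that Kconfig entry is present:

CONFIG_CRYPTO_CAST5_AVX_X86_64=m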
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
new file mode 100644
index 00000000000..94693c877e3
--- /dev/null
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -0,0 +1,322 @@
+/*
+ * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "cast5-avx-x86_64-asm_64.S"
+.text
+
+.extern cast5_s1
+.extern cast5_s2
+.extern cast5_s3
+.extern cast5_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (16*4)
+#define rr ((16*4)+16)
+
+/* s-boxes */
+#define s1 cast5_s1
+#define s2 cast5_s2
+#define s3 cast5_s3
+#define s4 cast5_s4
+
+/**********************************************************************
+  16-way AVX cast5
+ **********************************************************************/
+#define CTX %rdi
+
+#define RL1 %xmm0
+#define RR1 %xmm1
+#define RL2 %xmm2
+#define RR2 %xmm3
+#define RL3 %xmm4
+#define RR3 %xmm5
+#define RL4 %xmm6
+#define RR4 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKRF %xmm10
+#define RKRR %xmm11
+
+#define RTMP %xmm12
+#define RMASK %xmm13
+#define R32 %xmm14
+
+#define RID1 %rax
+#define RID1b %al
+#define RID2 %rbx
+#define RID2b %bl
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3) \
+        movb src ## bl, RID1b; \
+        movb src ## bh, RID2b; \
+        movl s1(, RID1, 4), dst ## d; \
+        op1 s2(, RID2, 4), dst ## d; \
+        shrq $16, src; \
+        movb src ## bl, RID1b; \
+        movb src ## bh, RID2b; \
+        op2 s3(, RID1, 4), dst ## d; \
+        op3 s4(, RID2, 4), dst ## d;
+
+#define F(a, x, op0, op1, op2, op3) \
+        op0 a, RKM, x; \
+        vpslld RKRF, x, RTMP; \
+        vpsrld RKRR, x, x; \
+        vpor RTMP, x, x; \
+        \
+        vpshufb RMASK, x, x; \
+        vmovq x, RGI1; \
+        vpsrldq $8, x, x; \
+        vmovq x, RGI2; \
+        \
+        lookup_32bit(RGI1, RFS1, op1, op2, op3); \
+        shrq $16, RGI1; \
+        lookup_32bit(RGI1, RFS2, op1, op2, op3); \
+        shlq $32, RFS2; \
+        orq RFS1, RFS2; \
+        \
+        lookup_32bit(RGI2, RFS1, op1, op2, op3); \
+        shrq $16, RGI2; \
+        lookup_32bit(RGI2, RFS3, op1, op2, op3); \
+        shlq $32, RFS3; \
+        orq RFS1, RFS3; \
+        \
+        vmovq RFS2, x; \
+        vpinsrq $1, RFS3, x, x;
+
+#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
+#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
+#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
+
+#define subround(a, b, x, n, f) \
+        F ## f(b, x); \
+        vpxor a, x, a;
+
+#define round(l, r, n, f) \
+        vbroadcastss (km+(4*n))(CTX), RKM; \
+        vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \
+        vpsubq RKRF, R32, RKRR; \
+        subround(l ## 1, r ## 1, RX, n, f); \
+        subround(l ## 2, r ## 2, RX, n, f); \
+        subround(l ## 3, r ## 3, RX, n, f); \
+        subround(l ## 4, r ## 4, RX, n, f);
+
+
+#define transpose_2x4(x0, x1, t0, t1) \
+        vpunpckldq x1, x0, t0; \
+        vpunpckhdq x1, x0, t1; \
+        \
+        vpunpcklqdq t1, t0, x0; \
+        vpunpckhqdq t1, t0, x1;
+
+#define inpack_blocks(in, x0, x1, t0, t1) \
+        vmovdqu (0*4*4)(in), x0; \
+        vmovdqu (1*4*4)(in), x1; \
+        vpshufb RMASK, x0, x0; \
+        vpshufb RMASK, x1, x1; \
+        \
+        transpose_2x4(x0, x1, t0, t1)
+
+#define outunpack_blocks(out, x0, x1, t0, t1) \
+        transpose_2x4(x0, x1, t0, t1) \
+        \
+        vpshufb RMASK, x0, x0; \
+        vpshufb RMASK, x1, x1; \
+        vmovdqu x0, (0*4*4)(out); \
+        vmovdqu x1, (1*4*4)(out);
+
+#define outunpack_xor_blocks(out, x0, x1, t0, t1) \
+        transpose_2x4(x0, x1, t0, t1) \
+        \
+        vpshufb RMASK, x0, x0; \
+        vpshufb RMASK, x1, x1; \
+        vpxor (0*4*4)(out), x0, x0; \
+        vmovdqu x0, (0*4*4)(out); \
+        vpxor (1*4*4)(out), x1, x1; \
+        vmovdqu x1, (1*4*4)(out);
+
+.align 16
+.Lbswap_mask:
+        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.L32_mask:
+        .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+.align 16
+.global __cast5_enc_blk_16way
+.type __cast5_enc_blk_16way,@function;
+
+__cast5_enc_blk_16way:
+        /* input:
+         *      %rdi: ctx, CTX
+         *      %rsi: dst
+         *      %rdx: src
+         *      %rcx: bool, if true: xor output
+         */
+
+        pushq %rbx;
+        pushq %rcx;
+
+        vmovdqu .Lbswap_mask, RMASK;
+        vmovdqu .L32_mask, R32;
+        vpxor RKRF, RKRF, RKRF;
+
+        inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
+        leaq (2*4*4)(%rdx), %rax;
+        inpack_blocks(%rax, RL2, RR2, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        inpack_blocks(%rax, RL3, RR3, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+
+        xorq RID1, RID1;
+        xorq RID2, RID2;
+
+        round(RL, RR, 0, 1);
+        round(RR, RL, 1, 2);
+        round(RL, RR, 2, 3);
+        round(RR, RL, 3, 1);
+        round(RL, RR, 4, 2);
+        round(RR, RL, 5, 3);
+        round(RL, RR, 6, 1);
+        round(RR, RL, 7, 2);
+        round(RL, RR, 8, 3);
+        round(RR, RL, 9, 1);
+        round(RL, RR, 10, 2);
+        round(RR, RL, 11, 3);
+
+        movb rr(CTX), %al;
+        testb %al, %al;
+        jnz __skip_enc;
+
+        round(RL, RR, 12, 1);
+        round(RR, RL, 13, 2);
+        round(RL, RR, 14, 3);
+        round(RR, RL, 15, 1);
+
+__skip_enc:
+        popq %rcx;
+        popq %rbx;
+
+        testb %cl, %cl;
+        jnz __enc_xor16;
+
+        outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
+        leaq (2*4*4)(%rsi), %rax;
+        outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+
+        ret;
+
+__enc_xor16:
+        outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX);
+        leaq (2*4*4)(%rsi), %rax;
+        outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX);
+
+        ret;
+
+.align 16
+.global cast5_dec_blk_16way
+.type cast5_dec_blk_16way,@function;
+
+cast5_dec_blk_16way:
+        /* input:
+         *      %rdi: ctx, CTX
+         *      %rsi: dst
+         *      %rdx: src
+         */
+
+        pushq %rbx;
+
+        vmovdqu .Lbswap_mask, RMASK;
+        vmovdqu .L32_mask, R32;
+        vpxor RKRF, RKRF, RKRF;
+
+        inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
+        leaq (2*4*4)(%rdx), %rax;
+        inpack_blocks(%rax, RL2, RR2, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        inpack_blocks(%rax, RL3, RR3, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+
+        xorq RID1, RID1;
+        xorq RID2, RID2;
+
+        movb rr(CTX), %al;
+        testb %al, %al;
+        jnz __skip_dec;
+
+        round(RL, RR, 15, 1);
+        round(RR, RL, 14, 3);
+        round(RL, RR, 13, 2);
+        round(RR, RL, 12, 1);
+
+__skip_dec:
+        round(RL, RR, 11, 3);
+        round(RR, RL, 10, 2);
+        round(RL, RR, 9, 1);
+        round(RR, RL, 8, 3);
+        round(RL, RR, 7, 2);
+        round(RR, RL, 6, 1);
+        round(RL, RR, 5, 3);
+        round(RR, RL, 4, 2);
+        round(RL, RR, 3, 1);
+        round(RR, RL, 2, 3);
+        round(RL, RR, 1, 2);
+        round(RR, RL, 0, 1);
+
+        popq %rbx;
+
+        outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
+        leaq (2*4*4)(%rsi), %rax;
+        outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
+        leaq (2*4*4)(%rax), %rax;
+        outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+
+        ret;
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
new file mode 100644
index 00000000000..445aab06387
--- /dev/null
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -0,0 +1,530 @@
+/*
+ * Glue Code for the AVX assembler implementation of the Cast5 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/cast5.h>
+#include <crypto/cryptd.h>
+#include <crypto/ctr.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAST5_PARALLEL_BLOCKS 16
+
+asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+                                      const u8 *src, bool xor);
+asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+                                    const u8 *src);
+
+static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
+                                      const u8 *src)
+{
+        __cast5_enc_blk_16way(ctx, dst, src, false);
+}
+
+static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
+                                          const u8 *src)
+{
+        __cast5_enc_blk_16way(ctx, dst, src, true);
+}
+
+static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
+                                      const u8 *src)
+{
+        cast5_dec_blk_16way(ctx, dst, src);
+}
+
+
+static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+        return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
+                              NULL, fpu_enabled, nbytes);
+}
+
+static inline void cast5_fpu_end(bool fpu_enabled)
+{
+        return glue_fpu_end(fpu_enabled);
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+                     bool enc)
+{
+        bool fpu_enabled = false;
+        struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+        const unsigned int bsize = CAST5_BLOCK_SIZE;
+        unsigned int nbytes;
+        int err;
+
+        err = blkcipher_walk_virt(desc, walk);
+        desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+        while ((nbytes = walk->nbytes)) {
+                u8 *wsrc = walk->src.virt.addr;
+                u8 *wdst = walk->dst.virt.addr;
+
+                fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+
+                /* Process multi-block batch */
+                if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+                        do {
+                                if (enc)
+                                        cast5_enc_blk_xway(ctx, wdst, wsrc);
+                                else
+                                        cast5_dec_blk_xway(ctx, wdst, wsrc);
+
+                                wsrc += bsize * CAST5_PARALLEL_BLOCKS;
+                                wdst += bsize * CAST5_PARALLEL_BLOCKS;
+                                nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
+                        } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+                        if (nbytes < bsize)
+                                goto done;
+                }
+
+                /* Handle leftovers */
+                do {
+                        if (enc)
+                                __cast5_encrypt(ctx, wdst, wsrc);
+                        else
+                                __cast5_decrypt(ctx, wdst, wsrc);
+
+                        wsrc += bsize;
+                        wdst += bsize;
+                        nbytes -= bsize;
+                } while (nbytes >= bsize);
+
+done:
+                err = blkcipher_walk_done(desc, walk, nbytes);
+        }
+
+        cast5_fpu_end(fpu_enabled);
+        return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+                       struct scatterlist *src, unsigned int nbytes)
+{
+        struct blkcipher_walk walk;
+
+        blkcipher_walk_init(&walk, dst, src, nbytes);
+        return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+                       struct scatterlist *src, unsigned int nbytes)
+{
+        struct blkcipher_walk walk;
+
+        blkcipher_walk_init(&walk, dst, src, nbytes);
+        return ecb_crypt(desc, &walk, false);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+                                  struct blkcipher_walk *walk)
+{
+        struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+        const unsigned int bsize = CAST5_BLOCK_SIZE;
+        unsigned int nbytes = walk->nbytes;
+        u64 *src = (u64 *)walk->src.virt.addr;
+        u64 *dst = (u64 *)walk->dst.virt.addr;
+        u64 *iv = (u64 *)walk->iv;
+
+        do {
+                *dst = *src ^ *iv;
+                __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+                iv = dst;
+
+                src += 1;
+                dst += 1;
+                nbytes -= bsize;
+        } while (nbytes >= bsize);
+
+        *(u64 *)walk->iv = *iv;
+        return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+                       struct scatterlist *src, unsigned int nbytes)
+{
+        struct blkcipher_walk walk;
+        int err;
+
+        blkcipher_walk_init(&walk, dst, src, nbytes);
+        err = blkcipher_walk_virt(desc, &walk);
+
+        while ((nbytes = walk.nbytes)) {
+                nbytes = __cbc_encrypt(desc, &walk);
+                err = blkcipher_walk_done(desc, &walk, nbytes);
+        }
+
+        return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+                                  struct blkcipher_walk *walk)
+{
+        struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+        const unsigned int bsize = CAST5_BLOCK_SIZE;
+        unsigned int nbytes = walk->nbytes;
+        u64 *src = (u64 *)walk->src.virt.addr;
+        u64 *dst = (u64 *)walk->dst.virt.addr;
+        u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
+        u64 last_iv;
+        int i;
+
+        /* Start of the last block. */
+        src += nbytes / bsize - 1;
+        dst += nbytes / bsize - 1;
+
+        last_iv = *src;
+
+        /* Process multi-block batch */
+        if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+                do {
+                        nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
+                        src -= CAST5_PARALLEL_BLOCKS - 1;
+                        dst -= CAST5_PARALLEL_BLOCKS - 1;
+
+                        for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
+                                ivs[i] = src[i];
+
+                        cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+                        for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
+                                *(dst + (i + 1)) ^= *(ivs + i);
+
+                        nbytes -= bsize;
+                        if (nbytes < bsize)
+                                goto done;
+
+                        *dst ^= *(src - 1);
+                        src -= 1;
+                        dst -= 1;
+                } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+                if (nbytes < bsize)
+                        goto done;
+        }
+
+        /* Handle leftovers */
+        for (;;) {
+                __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+                nbytes -= bsize;
+                if (nbytes < bsize)
+                        break;
+
+                *dst ^= *(src - 1);
+                src -= 1;
+                dst -= 1;
+        }
+
+done:
+        *dst ^= *(u64 *)walk->iv;
+        *(u64 *)walk->iv = last_iv;
+
+        return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+                       struct scatterlist *src, unsigned int nbytes)
+{
+        bool fpu_enabled = false;
+        struct blkcipher_walk walk;
+        int err;
+
+        blkcipher_walk_init(&walk, dst, src, nbytes);
+        err = blkcipher_walk_virt(desc, &walk);
+        desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+        while ((nbytes = walk.nbytes)) {
+                fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+                nbytes = __cbc_decrypt(desc, &walk);
+                err = blkcipher_walk_done(desc, &walk, nbytes);
+        }
+
+        cast5_fpu_end(fpu_enabled);
+        return err;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+                            struct blkcipher_walk *walk)
+{
+        struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+        u8 *ctrblk = walk->iv;
+        u8 keystream[CAST5_BLOCK_SIZE];
+        u8 *src = walk->src.virt.addr;
+        u8 *dst = walk->dst.virt.addr;
+        unsigned int nbytes = walk->nbytes;
+
+        __cast5_encrypt(ctx, keystream, ctrblk);
+        crypto_xor(keystream, src, nbytes);
+        memcpy(dst, keystream, nbytes);
+
+        crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+                                struct blkcipher_walk *walk)
+{
+        struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+        const unsigned int bsize = CAST5_BLOCK_SIZE;
+        unsigned int nbytes = walk->nbytes;
+        u64 *src = (u64 *)walk->src.virt.addr;
+        u64 *dst = (u64 *)walk->dst.virt.addr;
+        u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+        __be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
+        int i;
+
+        /* Process multi-block batch */
+        if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+                do {
+                        /* create ctrblks for parallel encrypt */
+                        for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
+                                if (dst != src)
+                                        dst[i] = src[i];
+
+                                ctrblocks[i] = cpu_to_be64(ctrblk++);
+                        }
+
+                        cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
+                                               (u8 *)ctrblocks);
+
+                        src += CAST5_PARALLEL_BLOCKS;
+                        dst += CAST5_PARALLEL_BLOCKS;
+                        nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
+                } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+                if (nbytes < bsize)
+                        goto done;
+        }
+
+        /* Handle leftovers */
+        do {
+                if (dst != src)
+                        *dst = *src;
+
+                ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+                __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+                *dst ^= ctrblocks[0];
+
+                src += 1;
+                dst += 1;
+                nbytes -= bsize;
+        } while (nbytes >= bsize);
+
+done:
+        *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+        return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+                     struct scatterlist *src, unsigned int nbytes)
+{
+        bool fpu_enabled = false;
+        struct blkcipher_walk walk;
+        int err;
+
+        blkcipher_walk_init(&walk, dst, src, nbytes);
+        err = blkcipher_walk_virt_block(desc, &walk, CAST5_BLOCK_SIZE);
+        desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+        while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
+                fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+                nbytes = __ctr_crypt(desc, &walk);
+                err = blkcipher_walk_done(desc, &walk, nbytes);
+        }
+
+        cast5_fpu_end(fpu_enabled);
+
+        if (walk.nbytes) {
+                ctr_crypt_final(desc, &walk);
+                err = blkcipher_walk_done(desc, &walk, 0);
+        }
+
+        return err;
+}
+
+
+static struct crypto_alg cast5_algs[6] = { {
+        .cra_name = "__ecb-cast5-avx",
+        .cra_driver_name = "__driver-ecb-cast5-avx",
+        .cra_priority = 0,
+        .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+        .cra_blocksize = CAST5_BLOCK_SIZE,
+        .cra_ctxsize = sizeof(struct cast5_ctx),
+        .cra_alignmask = 0,
+        .cra_type = &crypto_blkcipher_type,
+        .cra_module = THIS_MODULE,
+        .cra_u = {
+                .blkcipher = {
+                        .min_keysize = CAST5_MIN_KEY_SIZE,
+                        .max_keysize = CAST5_MAX_KEY_SIZE,
+                        .setkey = cast5_setkey,
+                        .encrypt = ecb_encrypt,
+                        .decrypt = ecb_decrypt,
+                },
+        },
+}, {
+        .cra_name = "__cbc-cast5-avx",
+        .cra_driver_name = "__driver-cbc-cast5-avx",
+        .cra_priority = 0,
+        .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+        .cra_blocksize = CAST5_BLOCK_SIZE,
+        .cra_ctxsize = sizeof(struct cast5_ctx),
+        .cra_alignmask = 0,
+        .cra_type = &crypto_blkcipher_type,
+        .cra_module = THIS_MODULE,
+        .cra_u = {
+                .blkcipher = {
+                        .min_keysize = CAST5_MIN_KEY_SIZE,
+                        .max_keysize = CAST5_MAX_KEY_SIZE,
+                        .setkey = cast5_setkey,
+                        .encrypt = cbc_encrypt,
+                        .decrypt = cbc_decrypt,
+                },
+        },
+}, {
+        .cra_name = "__ctr-cast5-avx",
+        .cra_driver_name = "__driver-ctr-cast5-avx",
+        .cra_priority = 0,
+        .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+        .cra_blocksize = 1,
+        .cra_ctxsize = sizeof(struct cast5_ctx),
+        .cra_alignmask = 0,
+        .cra_type = &crypto_blkcipher_type,
+        .cra_module = THIS_MODULE,
+        .cra_u = {
+                .blkcipher = {
+                        .min_keysize = CAST5_MIN_KEY_SIZE,
+                        .max_keysize = CAST5_MAX_KEY_SIZE,
+                        .ivsize = CAST5_BLOCK_SIZE,
+                        .setkey = cast5_setkey,
+                        .encrypt = ctr_crypt,
+                        .decrypt = ctr_crypt,
+                },
+        },
+}, {
+        .cra_name = "ecb(cast5)",
+        .cra_driver_name = "ecb-cast5-avx",
+        .cra_priority = 200,
+        .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+        .cra_blocksize = CAST5_BLOCK_SIZE,
+        .cra_ctxsize = sizeof(struct async_helper_ctx),
+        .cra_alignmask = 0,
+        .cra_type = &crypto_ablkcipher_type,
+        .cra_module = THIS_MODULE,
+        .cra_init = ablk_init,
+        .cra_exit = ablk_exit,
+        .cra_u = {
+                .ablkcipher = {
+                        .min_keysize = CAST5_MIN_KEY_SIZE,
+                        .max_keysize = CAST5_MAX_KEY_SIZE,
+                        .setkey = ablk_set_key,
+                        .encrypt = ablk_encrypt,
+                        .decrypt = ablk_decrypt,
+                },
+        },
+}, {
+        .cra_name = "cbc(cast5)",
+        .cra_driver_name = "cbc-cast5-avx",
+        .cra_priority = 200,
+        .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+        .cra_blocksize = CAST5_BLOCK_SIZE,
+        .cra_ctxsize = sizeof(struct async_helper_ctx),
+        .cra_alignmask = 0,
+        .cra_type = &crypto_ablkcipher_type,
+        .cra_module = THIS_MODULE,
+        .cra_init = ablk_init,
+        .cra_exit = ablk_exit,
+        .cra_u = {
+                .ablkcipher = {
+                        .min_keysize = CAST5_MIN_KEY_SIZE,
+                        .max_keysize = CAST5_MAX_KEY_SIZE,
+                        .ivsize = CAST5_BLOCK_SIZE,
+                        .setkey = ablk_set_key,
+                        .encrypt = __ablk_encrypt,
+                        .decrypt = ablk_decrypt,
+                },
+        },
+}, {
+        .cra_name = "ctr(cast5)",
+        .cra_driver_name = "ctr-cast5-avx",
+        .cra_priority = 200,
+        .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+        .cra_blocksize = 1,
+        .cra_ctxsize = sizeof(struct async_helper_ctx),
+        .cra_alignmask = 0,
+        .cra_type = &crypto_ablkcipher_type,
+        .cra_module = THIS_MODULE,
+        .cra_init = ablk_init,
+        .cra_exit = ablk_exit,
+        .cra_u = {
+                .ablkcipher = {
+                        .min_keysize = CAST5_MIN_KEY_SIZE,
+                        .max_keysize = CAST5_MAX_KEY_SIZE,
+                        .ivsize = CAST5_BLOCK_SIZE,
+                        .setkey = ablk_set_key,
+                        .encrypt = ablk_encrypt,
+                        .decrypt = ablk_encrypt,
+                        .geniv = "chainiv",
+                },
+        },
+} };
+
+static int __init cast5_init(void)
+{
+        u64 xcr0;
+
+        if (!cpu_has_avx || !cpu_has_osxsave) {
+                pr_info("AVX instructions are not detected.\n");
+                return -ENODEV;
+        }
+
+        xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+        if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+                pr_info("AVX detected but unusable.\n");
+                return -ENODEV;
+        }
+
+        return crypto_register_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
+}
+
+static void __exit cast5_exit(void)
+{
+        crypto_unregister_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
+}
+
+module_init(cast5_init);
+module_exit(cast5_exit);
+
+MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("cast5");