path: root/arch/x86/crypto
author     Jussi Kivilinna <jussi.kivilinna@mbnet.fi>  2012-10-20 08:06:46 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>    2012-10-24 09:10:55 -0400
commit     8435a3c3003c00c43f1b267368bbe1d8dada35d1
tree       0cc1f6dbf5f379f0e3e1efdd796ed91b3f3d6304 /arch/x86/crypto
parent     cba1cce05498d55f363c28cd2512368e95605518
crypto: twofish/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid using temporary stack buffers in the glue code. This also allows the use of vector instructions for XORing the output in CTR and CBC modes and for constructing the IVs in CTR mode.

ECB mode sees a ~0.2% decrease in speed because one extra function call is added. CBC mode decryption and CTR mode benefit from the vector operations and gain ~3%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S  208
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c            73
2 files changed, 152 insertions(+), 129 deletions(-)
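The "temporary stack buffers" being removed are the ivs[] and ctrblks[] arrays in the old glue helpers (see the removed twofish_dec_blk_cbc_xway and twofish_enc_blk_ctr_xway in the twofish_avx_glue.c diff). For CBC decryption the copy was needed because the 8-way decrypt runs in place and would overwrite the previous ciphertext block that the chaining XOR still needs; the new code keeps the decrypted blocks in xmm registers until that XOR, so no stack copy of the ciphertext is required. Below is a minimal standalone sketch of that chaining, with a toy XOR "cipher" standing in for Twofish; it is an illustration, not kernel code.

/*
 * Toy 8-way CBC decryption: P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV.
 * When decryption is done in place, C[i-1] must be saved before it is
 * overwritten -- the old glue code did this with a temporary array on
 * the stack; the new assembler keeps the ciphertexts in xmm registers.
 */
#include <stdint.h>
#include <stdio.h>

#define NBLOCKS 8
#define KEY     0xa5a5a5a5a5a5a5a5ULL   /* toy cipher: E(x) = D(x) = x ^ KEY */

static uint64_t toy_decrypt(uint64_t c) { return c ^ KEY; }

static void cbc_dec_8way(uint64_t *blk /* in: C[0..7], out: P[0..7] */,
			 uint64_t iv)
{
	uint64_t prev[NBLOCKS];               /* saved ciphertexts (the "ivs") */

	prev[0] = iv;
	for (int i = 1; i < NBLOCKS; i++)
		prev[i] = blk[i - 1];

	for (int i = 0; i < NBLOCKS; i++)     /* in-place decrypt ... */
		blk[i] = toy_decrypt(blk[i]);

	for (int i = 0; i < NBLOCKS; i++)     /* ... then the chaining XOR */
		blk[i] ^= prev[i];
}

int main(void)
{
	uint64_t iv = 0x0123456789abcdefULL;
	uint64_t blk[NBLOCKS];
	uint64_t prev = iv;

	/* build a known ciphertext: C[i] = E(P[i] ^ C[i-1]), with P[i] = i */
	for (int i = 0; i < NBLOCKS; i++) {
		blk[i] = ((uint64_t)i ^ prev) ^ KEY;
		prev = blk[i];
	}

	cbc_dec_8way(blk, iv);

	for (int i = 0; i < NBLOCKS; i++)     /* should print 0..7 */
		printf("%llu ", (unsigned long long)blk[i]);
	printf("\n");
	return 0;
}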
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 1585abb13dde..ebac16bfa830 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "twofish-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
 .text
 
 /* structure of crypto context */
@@ -217,69 +226,45 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	vpxor (0*4*4)(in), wkey, x0; \
-	vpxor (1*4*4)(in), wkey, x1; \
-	vpxor (2*4*4)(in), wkey, x2; \
-	vpxor (3*4*4)(in), wkey, x3; \
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+	vpxor x0, wkey, x0; \
+	vpxor x1, wkey, x1; \
+	vpxor x2, wkey, x2; \
+	vpxor x3, wkey, x3; \
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpxor x0, wkey, x0; \
-	vmovdqu x0, (0*4*4)(out); \
 	vpxor x1, wkey, x1; \
-	vmovdqu x1, (1*4*4)(out); \
 	vpxor x2, wkey, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor x3, wkey, x3; \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor x0, wkey, x0; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor x1, wkey, x1; \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor x2, wkey, x2; \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor x3, wkey, x3; \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+	vpxor x3, wkey, x3;
 
 .align 8
-.global __twofish_enc_blk_8way
-.type __twofish_enc_blk_8way,@function;
+.type __twofish_enc_blk8,@function;
 
-__twofish_enc_blk_8way:
+__twofish_enc_blk8:
 	/* input:
 	 * %rdi: ctx, CTX
-	 * %rsi: dst
-	 * %rdx: src
-	 * %rcx: bool, if true: xor output
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
 	 */
 
+	vmovdqu w(CTX), RK1;
+
 	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;
 
-	vmovdqu w(CTX), RK1;
-
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
 	preload_rgi(RA1);
 	rotate_1l(RD1);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 	rotate_1l(RD2);
 
-	movq %rsi, %r11;
-
 	encrypt_cycle(0);
 	encrypt_cycle(1);
 	encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
 	popq %rbx;
 	popq %rbp;
 
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
-
-	ret;
-
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
 	ret;
 
 .align 8
-.global twofish_dec_blk_8way
-.type twofish_dec_blk_8way,@function;
+.type __twofish_dec_blk8,@function;
 
-twofish_dec_blk_8way:
+__twofish_dec_blk8:
 	/* input:
 	 * %rdi: ctx, CTX
-	 * %rsi: dst
-	 * %rdx: src
+	 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+	 * output:
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
+	vmovdqu (w+4*4)(CTX), RK1;
+
 	pushq %rbp;
 	pushq %rbx;
 
-	vmovdqu (w+4*4)(CTX), RK1;
-
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
 	preload_rgi(RC1);
 	rotate_1l(RA1);
-	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 	rotate_1l(RA2);
 
-	movq %rsi, %r11;
-
 	decrypt_cycle(7);
 	decrypt_cycle(6);
 	decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
 	popq %rbx;
 	popq %rbp;
 
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+	ret;
+
+.align 8
+.global twofish_ecb_enc_8way
+.type twofish_ecb_enc_8way,@function;
+
+twofish_ecb_enc_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __twofish_enc_blk8;
+
+	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	ret;
+
+.align 8
+.global twofish_ecb_dec_8way
+.type twofish_ecb_dec_8way,@function;
+
+twofish_ecb_dec_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	call __twofish_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global twofish_cbc_dec_8way
+.type twofish_cbc_dec_8way,@function;
+
+twofish_cbc_dec_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	call __twofish_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global twofish_ctr_8way
+.type twofish_ctr_8way,@function;
+
+twofish_ctr_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 * %rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX0, RX1, RY0);
+
+	call __twofish_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	popq %r12;
 
 	ret;
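For the CTR path above, the IV handed to twofish_ctr_8way is a 128-bit little-endian counter; load_ctr_8way byte-reverses it through .Lbswap128_mask into the big-endian counter blocks the cipher encrypts, incrementing between blocks, where the old glue code filled a be128 ctrblks[] array on the stack (the removed twofish_enc_blk_ctr_xway helper in the glue diff below). The following standalone sketch shows that counter-block construction only; the u128_le type and helpers are illustrative stand-ins, not the kernel's le128/be128 API.

/*
 * Sketch (not kernel code) of the counter-block handling that
 * load_ctr_8way performs with the .Lbswap128_mask shuffle: the IV is
 * kept as a little-endian 128-bit integer so it can be incremented with
 * plain 64-bit adds, and each of the 8 parallel CTR blocks is its
 * byte-reversed (big-endian) image.
 */
#include <stdint.h>
#include <stdio.h>

struct u128_le {                /* 128-bit counter, value = hi * 2^64 + lo */
	uint64_t lo, hi;
};

/* increment the 128-bit counter (what le128_inc does in the old glue code) */
static void u128_le_inc(struct u128_le *c)
{
	if (++c->lo == 0)
		c->hi++;
}

/* write the counter as a 16-byte big-endian block (the bswap128 step) */
static void u128_le_to_be_bytes(uint8_t out[16], const struct u128_le *c)
{
	for (int i = 0; i < 8; i++) {
		out[i]     = (uint8_t)(c->hi >> (56 - 8 * i));
		out[8 + i] = (uint8_t)(c->lo >> (56 - 8 * i));
	}
}

int main(void)
{
	/* start near a 64-bit boundary to show the carry into the high half */
	struct u128_le iv = { .lo = 0xfffffffffffffffeULL, .hi = 0 };
	uint8_t ctrblk[8][16];

	for (int i = 0; i < 8; i++) {       /* one counter block per lane */
		u128_le_to_be_bytes(ctrblk[i], &iv);
		u128_le_inc(&iv);
	}

	for (int i = 0; i < 8; i++) {       /* dump the big-endian blocks */
		for (int j = 0; j < 16; j++)
			printf("%02x", ctrblk[i][j]);
		printf("\n");
	}
	return 0;
}

In the new assembly these eight counter blocks are built and encrypted without leaving the xmm registers, and store_ctr_8way then XORs them into the data, which is what lets the whole CTR path avoid the temporary stack buffer.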
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 810e45d51186..94ac91d26e47 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@
 
 #define TWOFISH_PARALLEL_BLOCKS 8
 
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
 /* 8-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
-				       const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
 
-static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
-					    const u8 *src)
-{
-	__twofish_enc_blk_8way(ctx, dst, src, true);
-}
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+				 const u8 *src, le128 *iv);
 
-static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
 {
-	twofish_dec_blk_8way(ctx, dst, src);
-}
-
-static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
-static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				     le128 *iv)
-{
-	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
 
 static const struct common_glue_ctx twofish_enc = {
 	.num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
 
	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-		twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-		twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 