about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- arch/x86/crypto/twofish-avx-x86_64-asm_64.S 208
-rw-r--r-- arch/x86/crypto/twofish_avx_glue.c 73
2 files changed, 152 insertions, 129 deletions
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 1585abb13dde..ebac16bfa830 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
23 * 23 *
24 */ 24 */
25 25
26#include "glue_helper-asm-avx.S"
27
26.file "twofish-avx-x86_64-asm_64.S" 28.file "twofish-avx-x86_64-asm_64.S"
29
30.data
31.align 16
32
33.Lbswap128_mask:
34 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
35
27.text 36.text
28 37
29/* structure of crypto context */ 38/* structure of crypto context */
@@ -217,69 +226,45 @@
217 vpunpcklqdq x3, t2, x2; \ 226 vpunpcklqdq x3, t2, x2; \
218 vpunpckhqdq x3, t2, x3; 227 vpunpckhqdq x3, t2, x3;
219 228
220#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ 229#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
221 vpxor (0*4*4)(in), wkey, x0; \ 230 vpxor x0, wkey, x0; \
222 vpxor (1*4*4)(in), wkey, x1; \ 231 vpxor x1, wkey, x1; \
223 vpxor (2*4*4)(in), wkey, x2; \ 232 vpxor x2, wkey, x2; \
224 vpxor (3*4*4)(in), wkey, x3; \ 233 vpxor x3, wkey, x3; \
225 \ 234 \
226 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 235 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
227 236
228#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ 237#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
229 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
230 \
231 vpxor x0, wkey, x0; \
232 vmovdqu x0, (0*4*4)(out); \
233 vpxor x1, wkey, x1; \
234 vmovdqu x1, (1*4*4)(out); \
235 vpxor x2, wkey, x2; \
236 vmovdqu x2, (2*4*4)(out); \
237 vpxor x3, wkey, x3; \
238 vmovdqu x3, (3*4*4)(out);
239
240#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
241 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 238 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
242 \ 239 \
243 vpxor x0, wkey, x0; \ 240 vpxor x0, wkey, x0; \
244 vpxor (0*4*4)(out), x0, x0; \ 241 vpxor x1, wkey, x1; \
245 vmovdqu x0, (0*4*4)(out); \ 242 vpxor x2, wkey, x2; \
246 vpxor x1, wkey, x1; \ 243 vpxor x3, wkey, x3;
247 vpxor (1*4*4)(out), x1, x1; \
248 vmovdqu x1, (1*4*4)(out); \
249 vpxor x2, wkey, x2; \
250 vpxor (2*4*4)(out), x2, x2; \
251 vmovdqu x2, (2*4*4)(out); \
252 vpxor x3, wkey, x3; \
253 vpxor (3*4*4)(out), x3, x3; \
254 vmovdqu x3, (3*4*4)(out);
255 244
256.align 8 245.align 8
257.global __twofish_enc_blk_8way 246.type __twofish_enc_blk8,@function;
258.type __twofish_enc_blk_8way,@function;
259 247
260__twofish_enc_blk_8way: 248__twofish_enc_blk8:
261 /* input: 249 /* input:
262 * %rdi: ctx, CTX 250 * %rdi: ctx, CTX
263 * %rsi: dst 251 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
264 * %rdx: src 252 * output:
265 * %rcx: bool, if true: xor output 253 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
266 */ 254 */
267 255
256 vmovdqu w(CTX), RK1;
257
268 pushq %rbp; 258 pushq %rbp;
269 pushq %rbx; 259 pushq %rbx;
270 pushq %rcx; 260 pushq %rcx;
271 261
272 vmovdqu w(CTX), RK1; 262 inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
273
274 leaq (4*4*4)(%rdx), %rax;
275 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
276 preload_rgi(RA1); 263 preload_rgi(RA1);
277 rotate_1l(RD1); 264 rotate_1l(RD1);
278 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); 265 inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
279 rotate_1l(RD2); 266 rotate_1l(RD2);
280 267
281 movq %rsi, %r11;
282
283 encrypt_cycle(0); 268 encrypt_cycle(0);
284 encrypt_cycle(1); 269 encrypt_cycle(1);
285 encrypt_cycle(2); 270 encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
295 popq %rbx; 280 popq %rbx;
296 popq %rbp; 281 popq %rbp;
297 282
298 leaq (4*4*4)(%r11), %rax; 283 outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
299 284 outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
300 testb %cl, %cl;
301 jnz __enc_xor8;
302
303 outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
304 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
305
306 ret;
307
308__enc_xor8:
309 outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
310 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
311 285
312 ret; 286 ret;
313 287
314.align 8 288.align 8
315.global twofish_dec_blk_8way 289.type __twofish_dec_blk8,@function;
316.type twofish_dec_blk_8way,@function;
317 290
318twofish_dec_blk_8way: 291__twofish_dec_blk8:
319 /* input: 292 /* input:
320 * %rdi: ctx, CTX 293 * %rdi: ctx, CTX
321 * %rsi: dst 294 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
322 * %rdx: src 295 * output:
296 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
323 */ 297 */
324 298
299 vmovdqu (w+4*4)(CTX), RK1;
300
325 pushq %rbp; 301 pushq %rbp;
326 pushq %rbx; 302 pushq %rbx;
327 303
328 vmovdqu (w+4*4)(CTX), RK1; 304 inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
329
330 leaq (4*4*4)(%rdx), %rax;
331 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
332 preload_rgi(RC1); 305 preload_rgi(RC1);
333 rotate_1l(RA1); 306 rotate_1l(RA1);
334 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); 307 inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
335 rotate_1l(RA2); 308 rotate_1l(RA2);
336 309
337 movq %rsi, %r11;
338
339 decrypt_cycle(7); 310 decrypt_cycle(7);
340 decrypt_cycle(6); 311 decrypt_cycle(6);
341 decrypt_cycle(5); 312 decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
350 popq %rbx; 321 popq %rbx;
351 popq %rbp; 322 popq %rbp;
352 323
353 leaq (4*4*4)(%r11), %rax; 324 outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
354 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); 325 outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
355 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); 326
327 ret;
328
329.align 8
330.global twofish_ecb_enc_8way
331.type twofish_ecb_enc_8way,@function;
332
333twofish_ecb_enc_8way:
334 /* input:
335 * %rdi: ctx, CTX
336 * %rsi: dst
337 * %rdx: src
338 */
339
340 movq %rsi, %r11;
341
342 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
343
344 call __twofish_enc_blk8;
345
346 store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
347
348 ret;
349
350.align 8
351.global twofish_ecb_dec_8way
352.type twofish_ecb_dec_8way,@function;
353
354twofish_ecb_dec_8way:
355 /* input:
356 * %rdi: ctx, CTX
357 * %rsi: dst
358 * %rdx: src
359 */
360
361 movq %rsi, %r11;
362
363 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
364
365 call __twofish_dec_blk8;
366
367 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
368
369 ret;
370
371.align 8
372.global twofish_cbc_dec_8way
373.type twofish_cbc_dec_8way,@function;
374
375twofish_cbc_dec_8way:
376 /* input:
377 * %rdi: ctx, CTX
378 * %rsi: dst
379 * %rdx: src
380 */
381
382 pushq %r12;
383
384 movq %rsi, %r11;
385 movq %rdx, %r12;
386
387 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
388
389 call __twofish_dec_blk8;
390
391 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
392
393 popq %r12;
394
395 ret;
396
397.align 8
398.global twofish_ctr_8way
399.type twofish_ctr_8way,@function;
400
401twofish_ctr_8way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 * %rcx: iv (little endian, 128bit)
407 */
408
409 pushq %r12;
410
411 movq %rsi, %r11;
412 movq %rdx, %r12;
413
414 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
415 RD2, RX0, RX1, RY0);
416
417 call __twofish_enc_blk8;
418
419 store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
420
421 popq %r12;
356 422
357 ret; 423 ret;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 810e45d51186..94ac91d26e47 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@
45 45
46#define TWOFISH_PARALLEL_BLOCKS 8 46#define TWOFISH_PARALLEL_BLOCKS 8
47 47
48static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src)
50{
51 __twofish_enc_blk_3way(ctx, dst, src, false);
52}
53
54/* 8-way parallel cipher functions */ 48/* 8-way parallel cipher functions */
55asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, 49asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
56 const u8 *src, bool xor); 50 const u8 *src);
57asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, 51asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
58 const u8 *src); 52 const u8 *src);
59 53
60static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, 54asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
61 const u8 *src) 55 const u8 *src);
62{ 56asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
63 __twofish_enc_blk_8way(ctx, dst, src, false); 57 const u8 *src, le128 *iv);
64}
65
66static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
67 const u8 *src)
68{
69 __twofish_enc_blk_8way(ctx, dst, src, true);
70}
71 58
72static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, 59static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
73 const u8 *src) 60 const u8 *src)
74{ 61{
75 twofish_dec_blk_8way(ctx, dst, src); 62 __twofish_enc_blk_3way(ctx, dst, src, false);
76}
77
78static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
79{
80 u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
81 unsigned int j;
82
83 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
84 ivs[j] = src[j];
85
86 twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
87
88 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
89 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
90} 63}
91 64
92static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 le128 *iv)
94{
95 be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 le128_to_be128(&ctrblks[i], iv);
103 le128_inc(iv);
104 }
105
106 twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108 65
109static const struct common_glue_ctx twofish_enc = { 66static const struct common_glue_ctx twofish_enc = {
110 .num_funcs = 3, 67 .num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
112 69
113 .funcs = { { 70 .funcs = { {
114 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 71 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } 72 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
116 }, { 73 }, {
117 .num_blocks = 3, 74 .num_blocks = 3,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } 75 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
128 85
129 .funcs = { { 86 .funcs = { {
130 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 87 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } 88 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
132 }, { 89 }, {
133 .num_blocks = 3, 90 .num_blocks = 3,
134 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } 91 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
144 101
145 .funcs = { { 102 .funcs = { {
146 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 103 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
147 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } 104 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
148 }, { 105 }, {
149 .num_blocks = 3, 106 .num_blocks = 3,
150 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } 107 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
160 117
161 .funcs = { { 118 .funcs = { {
162 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 119 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
163 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } 120 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
164 }, { 121 }, {
165 .num_blocks = 3, 122 .num_blocks = 3,
166 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } 123 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
227 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 184 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
228 185
229 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { 186 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
230 twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); 187 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
231 return; 188 return;
232 } 189 }
233 190
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
249 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 206 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
250 207
251 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { 208 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
252 twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); 209 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
253 return; 210 return;
254 } 211 }
255 212