Diffstat (limited to 'arch/x86/crypto/cast6-avx-x86_64-asm_64.S')
-rw-r--r--	arch/x86/crypto/cast6-avx-x86_64-asm_64.S	206
1 file changed, 131 insertions(+), 75 deletions(-)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..2569d0da841f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,22 +23,24 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"
 
-.extern cast6_s1
-.extern cast6_s2
-.extern cast6_s3
-.extern cast6_s4
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
 
 /* structure of crypto context */
 #define km	0
 #define kr	(12*4*4)
 
 /* s-boxes */
-#define s1	cast6_s1
-#define s2	cast6_s2
-#define s3	cast6_s3
-#define s4	cast6_s4
+#define s1	cast_s1
+#define s2	cast_s2
+#define s3	cast_s3
+#define s4	cast_s4
 
 /**********************************************************************
   8-way AVX cast6
@@ -205,11 +207,7 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in), x0; \
-	vmovdqu (1*4*4)(in), x1; \
-	vmovdqu (2*4*4)(in), x2; \
-	vmovdqu (3*4*4)(in), x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
@@ -217,39 +215,21 @@
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vmovdqu x0, (0*4*4)(out); \
-	vmovdqu x1, (1*4*4)(out); \
-	vmovdqu x2, (2*4*4)(out); \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+	vpshufb rmask, x3, x3;
 
 .data
 
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
 
 .text
 
-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;
 
-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(0, dummy, none);
 	Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);
 
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;
 
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;
 
-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
 
 	ret;
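
For reference, the register usage documented in the comments of the new entry points (%rdi: ctx, %rsi: dst, %rdx: src, %rcx: iv for CTR) corresponds, under the x86-64 SysV calling convention, to C prototypes along the following lines. This is only a hedged sketch of how caller-side glue code might declare these asm routines; it is not part of this patch, and the struct and typedef names here are assumptions for illustration.

	/* Hypothetical C-side declarations matching the register comments above. */
	#include <linux/linkage.h>	/* asmlinkage */
	#include <linux/types.h>	/* u8, u64 */

	struct cast6_ctx;	/* key schedule; layout assumed to match km/kr offsets */

	/* Stand-in type for the 128-bit little-endian counter passed in %rcx. */
	struct le128_iv_sketch { u64 b, a; };

	asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
	asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
	asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
	asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
				       struct le128_iv_sketch *iv);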