Diffstat (limited to 'arch/x86/crypto/cast6-avx-x86_64-asm_64.S')
 -rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 206
 1 file changed, 131 insertions(+), 75 deletions(-)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..2569d0da841f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,22 +23,24 @@
  *
  */

+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"

-.extern cast6_s1
-.extern cast6_s2
-.extern cast6_s3
-.extern cast6_s4
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4

 /* structure of crypto context */
 #define km 0
 #define kr (12*4*4)

 /* s-boxes */
-#define s1 cast6_s1
-#define s2 cast6_s2
-#define s3 cast6_s3
-#define s4 cast6_s4
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4

 /**********************************************************************
   8-way AVX cast6
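The rename above reflects that CAST-128 (cast5) and CAST-256 (cast6) use the same four round-function s-boxes, so the tables lose their cast6_-private prefix in favor of a shared cast_ one. A minimal sketch of the C-side declarations these .extern directives would pair with, assuming the tables are defined once in common CAST code (the exact header is not shown in this diff):

#include <linux/types.h>

/* Shared CAST s-boxes, assumed to live in common CAST code and to be
 * referenced by both the cast5 and cast6 implementations. */
extern const u32 cast_s1[256];
extern const u32 cast_s2[256];
extern const u32 cast_s3[256];
extern const u32 cast_s4[256];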
@@ -205,11 +207,7 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;

-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in), x0; \
-	vmovdqu (1*4*4)(in), x1; \
-	vmovdqu (2*4*4)(in), x2; \
-	vmovdqu (3*4*4)(in), x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
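The inpack_blocks change drops the `in` pointer and the four vmovdqu loads: the macro now only byte-swaps (and then transposes) data that is already in registers, while memory loads move into the callers via the load_8way macro from glue_helper-asm-avx.S. A C/intrinsics sketch of that split, with illustrative names and the transpose step omitted:

#include <immintrin.h>

/* Old shape: the macro both loaded and byte-swapped its inputs. */
static inline void inpack_blocks_old(const void *in, __m128i x[4],
                                     __m128i rmask)
{
    for (int i = 0; i < 4; i++) {
        x[i] = _mm_loadu_si128((const __m128i *)in + i); /* vmovdqu */
        x[i] = _mm_shuffle_epi8(x[i], rmask);            /* vpshufb */
    }
}

/* New shape: the caller loads (load_8way); the macro only transforms
 * values that are already in registers. */
static inline void inpack_blocks_new(__m128i x[4], __m128i rmask)
{
    for (int i = 0; i < 4; i++)
        x[i] = _mm_shuffle_epi8(x[i], rmask);            /* vpshufb */
}

Separating the memory traffic from the transform is what lets one register-based core serve the ECB, CBC and CTR entry points added at the end of this diff.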
@@ -217,39 +215,21 @@
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vmovdqu x0, (0*4*4)(out); \
-	vmovdqu x1, (1*4*4)(out); \
-	vmovdqu x2, (2*4*4)(out); \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+	vpshufb rmask, x3, x3;

 .data

 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
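This hunk retires outunpack_xor_blocks, whose xor-into-destination stores served the old CTR path, and adds .Lbswap128_mask. Shuffling a vector through the byte pattern 15, 14, ..., 0 reverses all 16 bytes, i.e. performs a full 128-bit endianness swap, which the new CTR entry point needs for its little-endian counter. A plain-C sketch of the mask's effect:

#include <stdint.h>

/* Effect of vpshufb with .Lbswap128_mask (bytes 15, 14, ..., 0):
 * a full 16-byte reversal, i.e. a 128-bit endianness swap. */
static inline void bswap128(uint8_t out[16], const uint8_t in[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = in[15 - i];
}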
@@ -269,31 +249,26 @@

 .text

-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;

-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
 	/* input:
 	 * %rdi: ctx, CTX
-	 * %rsi: dst
-	 * %rdx: src
-	 * %rcx: bool, if true: xor output
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */

 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;

 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;

-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

 	preload_rkr(0, dummy, none);
 	Q(0);
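From this hunk on, the core routine no longer takes dst/src pointers or an xor flag: __cast6_enc_blk8 receives and returns the eight blocks in xmm registers, and exported wrappers (added at the end of this diff) do the memory traffic. A hedged C model of that structure, with blocks8_t standing in for the eight xmm registers and a stub in place of the real rounds:

#include <string.h>

/* Stand-in for eight 16-byte blocks held in xmm registers. */
typedef struct { unsigned char b[8][16]; } blocks8_t;

/* Core transform: operates purely on "registers", no loads/stores.
 * The body is a placeholder for the 12 quad-rounds in the asm. */
static void cast6_enc_blk8(const void *ctx, blocks8_t *blk)
{
    (void)ctx;
    (void)blk;
}

/* Exported wrapper: load, transform, store -- mirroring the
 * load_8way / call __cast6_enc_blk8 / store_8way sequence. */
static void cast6_ecb_enc_8way_model(const void *ctx, unsigned char *dst,
                                     const unsigned char *src)
{
    blocks8_t blk;

    memcpy(&blk, src, sizeof(blk));  /* load_8way(%rdx, ...) */
    cast6_enc_blk8(ctx, &blk);       /* call __cast6_enc_blk8 */
    memcpy(dst, &blk, sizeof(blk));  /* store_8way(%r11, ...) */
}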
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);

-	popq %rcx;
 	popq %rbx;
 	popq %rbp;

 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;

-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

 	ret;

-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;

-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 * %rdi: ctx, CTX
-	 * %rsi: dst
-	 * %rdx: src
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */

 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;

-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;

 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 * %rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;

 	ret;
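All the new entry points follow the same convention (ctx in %rdi, dst in %rsi, src in %rdx, plus iv in %rcx for CTR) and delegate their loads and stores to macros from glue_helper-asm-avx.S, which this diff does not include. A hedged C model of what the two mode-specific store macros are expected to do; the function shapes are illustrative, and the assumption that the first CBC block's IV xor is finished by the C glue code is mine:

#include <stdint.h>
#include <string.h>

#define NBLKS 8
#define BSIZE 16 /* CAST6 block size in bytes */

/* CBC decrypt store: xor each decrypted block with the previous
 * ciphertext block; block 0 (the IV xor) is assumed to be handled
 * by the C glue code. */
static void store_cbc_8way_model(const uint8_t *src, uint8_t *dst,
                                 uint8_t blk[NBLKS][BSIZE])
{
    for (int i = 1; i < NBLKS; i++)
        for (int j = 0; j < BSIZE; j++)
            blk[i][j] ^= src[(i - 1) * BSIZE + j];
    memcpy(dst, blk, NBLKS * BSIZE);
}

/* CTR store: the encrypted counters in ks[] are the keystream; xor
 * them with the source to produce the destination. */
static void store_ctr_8way_model(const uint8_t *src, uint8_t *dst,
                                 uint8_t ks[NBLKS][BSIZE])
{
    for (int i = 0; i < NBLKS; i++)
        for (int j = 0; j < BSIZE; j++)
            dst[i * BSIZE + j] = src[i * BSIZE + j] ^ ks[i][j];
}

Because both CBC decryption and CTR only need the block cipher's forward or inverse core plus an xor against memory, the one register-based __cast6_enc_blk8/__cast6_dec_blk8 pair can back all four exported functions.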