author    Jussi Kivilinna <jussi.kivilinna@mbnet.fi>    2012-10-20 08:06:41 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>      2012-10-24 09:10:54 -0400
commit    cba1cce05498d55f363c28cd2512368e95605518
tree      a8728af2175a89a598b865c89dc2e3f08313c813 /arch/x86
parent    58990986f1cba40c23c0c10592ace08616de3ffa
crypto: cast6/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid the use of temporary stack buffers in the glue code. This also allows the use of vector instructions for xoring the output in CTR and CBC modes and for constructing the IVs in CTR mode. ECB mode sees a ~0.5% decrease in speed because of the one added function call. CBC mode decryption and CTR mode benefit from the vector operations and gain ~2%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
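As an illustration of what the new CTR path does, the rough C sketch below mirrors the data flow: the 128-bit little-endian counter is byte-swapped into eight consecutive big-endian counter blocks, the blocks are run through the cipher, and the resulting keystream is xored into the output. The names here (ctr_8way, le128_ctr, encrypt8) are placeholders for illustration only; the kernel code performs all of this in xmm registers via the new assembler macros rather than through the arrays shown.

#include <stdint.h>

/* 128-bit little-endian counter, split into two 64-bit halves. */
struct le128_ctr { uint64_t lo, hi; };

/* Increment the counter, carrying from the low half into the high half. */
static void le128_ctr_inc(struct le128_ctr *iv)
{
	if (++iv->lo == 0)
		iv->hi++;
}

/*
 * 8-way CTR sketch: build eight big-endian counter blocks from *iv,
 * encrypt them with the 8-block cipher core (encrypt8 is a placeholder,
 * not a kernel function), and xor the keystream into dst.
 */
static void ctr_8way(const void *ctx, uint8_t dst[8][16], const uint8_t src[8][16],
		     struct le128_ctr *iv,
		     void (*encrypt8)(const void *ctx, uint8_t blocks[8][16]))
{
	uint8_t ctrblks[8][16];
	unsigned int i, j;

	for (i = 0; i < 8; i++) {
		/* byte-swap the little-endian counter into a big-endian block */
		for (j = 0; j < 8; j++) {
			ctrblks[i][7 - j]  = (uint8_t)(iv->hi >> (8 * j));
			ctrblks[i][15 - j] = (uint8_t)(iv->lo >> (8 * j));
		}
		le128_ctr_inc(iv);
	}

	encrypt8(ctx, ctrblks);		/* ctrblks now holds the keystream */

	for (i = 0; i < 8; i++)
		for (j = 0; j < 16; j++)
			dst[i][j] = src[i][j] ^ ctrblks[i][j];
}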
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/crypto/cast6-avx-x86_64-asm_64.S	190
-rw-r--r--	arch/x86/crypto/cast6_avx_glue.c	71
-rw-r--r--	arch/x86/crypto/glue_helper-asm-avx.S	91
3 files changed, 227 insertions, 125 deletions
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..83a53818f0a5 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,6 +23,8 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"
 
 .extern cast6_s1
@@ -205,11 +207,7 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in), x0; \
-	vmovdqu (1*4*4)(in), x1; \
-	vmovdqu (2*4*4)(in), x2; \
-	vmovdqu (3*4*4)(in), x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
@@ -217,39 +215,21 @@
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vmovdqu x0, (0*4*4)(out); \
-	vmovdqu x1, (1*4*4)(out); \
-	vmovdqu x2, (2*4*4)(out); \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+	vpshufb rmask, x3, x3;
 
 .data
 
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
 
 .text
 
-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;
 
-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(0, dummy, none);
 	Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);
 
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;
 
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;
 
-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
 
 	ret;
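For orientation, the new cast6_cbc_dec_8way entry point only xors blocks 1..7 with the preceding ciphertext block (via store_cbc_8way, added below in glue_helper-asm-avx.S); the xor of block 0 with the chaining IV is left to the generic glue code. A rough C model of that per-8-block step follows, with placeholder names (cbc_dec_8way here is illustrative, decrypt8 is not a kernel function); the assembler version keeps the saved ciphertext in xmm registers instead of the temporary buffer used in this sketch.

#include <stdint.h>
#include <string.h>

/*
 * Rough model of the 8-way CBC decryption step: decrypt eight blocks,
 * then xor blocks 1..7 with the previous ciphertext block.  Block 0 is
 * left untouched so the caller can xor it with the chaining IV.
 * decrypt8() stands in for the 8-block cipher core.
 */
static void cbc_dec_8way(const void *ctx, uint8_t dst[8][16], const uint8_t src[8][16],
			 void (*decrypt8)(const void *ctx, uint8_t blocks[8][16]))
{
	uint8_t prev[7][16];	/* ciphertext kept for chaining; src may alias dst */
	unsigned int i, j;

	memcpy(prev, src, sizeof(prev));

	if ((const void *)dst != (const void *)src)
		memcpy(dst, src, 8 * 16);
	decrypt8(ctx, dst);

	for (i = 1; i < 8; i++)
		for (j = 0; j < 16; j++)
			dst[i][j] ^= prev[i - 1][j];
}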
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 1dfd33b5b4fb..92f7ca24790a 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,43 +40,15 @@
 
 #define CAST6_PARALLEL_BLOCKS 8
 
-asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
-				     const u8 *src, bool xor);
-asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
+asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
 				   const u8 *src);
 
-static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast6_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast6_enc_blk_8way(ctx, dst, src, true);
-}
-
-static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast6_dec_blk_8way(ctx, dst, src);
-}
-
-
-static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
+asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
+			       le128 *iv);
 
 static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
@@ -89,30 +61,13 @@ static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
-static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				 le128 *iv)
-{
-	be128 ctrblks[CAST6_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
 static const struct common_glue_ctx cast6_enc = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
-		cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
-		cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 000000000000..f7b6ea2ddfdb
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,91 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX assembler macros
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu (0*16)(src), x0; \
+	vmovdqu (1*16)(src), x1; \
+	vmovdqu (2*16)(src), x2; \
+	vmovdqu (3*16)(src), x3; \
+	vmovdqu (4*16)(src), x4; \
+	vmovdqu (5*16)(src), x5; \
+	vmovdqu (6*16)(src), x6; \
+	vmovdqu (7*16)(src), x7;
+
+#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu x0, (0*16)(dst); \
+	vmovdqu x1, (1*16)(dst); \
+	vmovdqu x2, (2*16)(dst); \
+	vmovdqu x3, (3*16)(dst); \
+	vmovdqu x4, (4*16)(dst); \
+	vmovdqu x5, (5*16)(dst); \
+	vmovdqu x6, (6*16)(dst); \
+	vmovdqu x7, (7*16)(dst);
+
+#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*16)(src), x1, x1; \
+	vpxor (1*16)(src), x2, x2; \
+	vpxor (2*16)(src), x3, x3; \
+	vpxor (3*16)(src), x4, x4; \
+	vpxor (4*16)(src), x5, x5; \
+	vpxor (5*16)(src), x6, x6; \
+	vpxor (6*16)(src), x7, x7; \
+	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
+	vpcmpeqd t0, t0, t0; \
+	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
+	vmovdqa bswap, t1; \
+	\
+	/* load IV and byteswap */ \
+	vmovdqu (iv), x7; \
+	vpshufb t1, x7, x0; \
+	\
+	/* construct IVs */ \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x1; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x2; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x3; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x4; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x5; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x6; \
+	inc_le128(x7, t0, t2); \
+	vmovdqa x7, t2; \
+	vpshufb t1, x7, x7; \
+	inc_le128(t2, t0, t1); \
+	vmovdqu t2, (iv);
+
+#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*16)(src), x0, x0; \
+	vpxor (1*16)(src), x1, x1; \
+	vpxor (2*16)(src), x2, x2; \
+	vpxor (3*16)(src), x3, x3; \
+	vpxor (4*16)(src), x4, x4; \
+	vpxor (5*16)(src), x5, x5; \
+	vpxor (6*16)(src), x6, x6; \
+	vpxor (7*16)(src), x7, x7; \
+	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
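As a reading aid for the inc_le128 macro above: the xmm register holds a 128-bit little-endian counter as two 64-bit lanes, and the minus_one operand has -1 in its low lane only. The compare detects a low lane that is about to wrap, vpsubq of -1 adds one to the low lane, and the byte-shifted compare result then carries into the high lane. A scalar C model of the same operation (illustrative only, not kernel code):

#include <stdint.h>

/* Two 64-bit lanes of an xmm register holding a little-endian 128-bit counter. */
struct ctr128 {
	uint64_t lo;	/* low lane  */
	uint64_t hi;	/* high lane */
};

static void inc_le128_model(struct ctr128 *x)
{
	/* vpcmpeqq: all-ones when the low lane equals -1, i.e. it will wrap */
	uint64_t carry = (x->lo == UINT64_MAX) ? UINT64_MAX : 0;

	x->lo += 1;		/* vpsubq minus_one, x, x: x - (-1) == x + 1 */
	x->hi -= carry;		/* vpsubq tmp, x, x: subtracting -1 adds the carry */
}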