-rw-r--r--	arch/x86/crypto/camellia-aesni-avx2-asm_64.S	160
1 file changed, 89 insertions(+), 71 deletions(-)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 91a1878fcc3e..0e0b8863a34b 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -51,16 +51,6 @@
 #define ymm14_x	xmm14
 #define ymm15_x	xmm15
 
-/*
- * AES-NI instructions do not support ymmX registers, so we need splitting and
- * merging.
- */
-#define vaesenclast256(zero, yreg, tmp) \
-	vextracti128 $1, yreg, tmp##_x; \
-	vaesenclast zero##_x, yreg##_x, yreg##_x; \
-	vaesenclast zero##_x, tmp##_x, tmp##_x; \
-	vinserti128 $1, tmp##_x, yreg, yreg;
-
 /**********************************************************************
   32-way camellia
  **********************************************************************/
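
The helper removed above is the split/encrypt/merge workaround the deleted comment describes: vaesenclast only accepts 128-bit registers, so each 256-bit value is handled as two xmm halves and re-merged. The rest of the patch open-codes those extract/encrypt/insert steps inside the round macro and interleaves them across the eight state registers. Below is a minimal C intrinsics sketch of the same idea, for illustration only (the function name is invented; build with -mavx2 -maes):

#include <immintrin.h>

/*
 * Sketch of the split/merge pattern behind the removed vaesenclast256
 * macro.  Passing an all-zero "round key" makes AESENCLAST perform only
 * ShiftRows + SubBytes, which is what the Camellia S-function needs.
 */
static inline __m256i aesenclast_256(__m256i state, __m128i zero_key)
{
	__m128i lo = _mm256_castsi256_si128(state);      /* low 128 bits    */
	__m128i hi = _mm256_extracti128_si256(state, 1); /* vextracti128 $1 */

	lo = _mm_aesenclast_si128(lo, zero_key);         /* vaesenclast     */
	hi = _mm_aesenclast_si128(hi, zero_key);

	/* vinserti128 $1: merge the processed high half back */
	return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
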
@@ -79,46 +69,70 @@
 	 * S-function with AES subbytes \
 	 */ \
 	vbroadcasti128 .Linv_shift_row, t4; \
-	vpbroadcastb .L0f0f0f0f, t7; \
-	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
-	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	vpbroadcastd .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
 	vpshufb t4, x7, x7; \
-	vpshufb t4, x1, x1; \
-	vpshufb t4, x4, x4; \
-	vpshufb t4, x2, x2; \
-	vpshufb t4, x5, x5; \
 	vpshufb t4, x3, x3; \
 	vpshufb t4, x6, x6; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
 	\
 	/* prefilter sboxes 1, 2 and 3 */ \
-	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
-	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
-	filter_8bit(x0, t0, t1, t7, t6); \
-	filter_8bit(x7, t0, t1, t7, t6); \
-	filter_8bit(x1, t0, t1, t7, t6); \
-	filter_8bit(x4, t0, t1, t7, t6); \
-	filter_8bit(x2, t0, t1, t7, t6); \
-	filter_8bit(x5, t0, t1, t7, t6); \
-	\
 	/* prefilter sbox 4 */ \
+	filter_8bit(x0, t5, t6, t7, t4); \
+	filter_8bit(x7, t5, t6, t7, t4); \
+	vextracti128 $1, x0, t0##_x; \
+	vextracti128 $1, x7, t1##_x; \
+	filter_8bit(x3, t2, t3, t7, t4); \
+	filter_8bit(x6, t2, t3, t7, t4); \
+	vextracti128 $1, x3, t3##_x; \
+	vextracti128 $1, x6, t2##_x; \
+	filter_8bit(x2, t5, t6, t7, t4); \
+	filter_8bit(x5, t5, t6, t7, t4); \
+	filter_8bit(x1, t5, t6, t7, t4); \
+	filter_8bit(x4, t5, t6, t7, t4); \
+	\
 	vpxor t4##_x, t4##_x, t4##_x; \
-	filter_8bit(x3, t2, t3, t7, t6); \
-	filter_8bit(x6, t2, t3, t7, t6); \
 	\
 	/* AES subbytes + AES shift rows */ \
+	vextracti128 $1, x2, t6##_x; \
+	vextracti128 $1, x5, t5##_x; \
+	vaesenclast t4##_x, x0##_x, x0##_x; \
+	vaesenclast t4##_x, t0##_x, t0##_x; \
+	vinserti128 $1, t0##_x, x0, x0; \
+	vaesenclast t4##_x, x7##_x, x7##_x; \
+	vaesenclast t4##_x, t1##_x, t1##_x; \
+	vinserti128 $1, t1##_x, x7, x7; \
+	vaesenclast t4##_x, x3##_x, x3##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x3, x3; \
+	vaesenclast t4##_x, x6##_x, x6##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x6, x6; \
+	vextracti128 $1, x1, t3##_x; \
+	vextracti128 $1, x4, t2##_x; \
 	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
 	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
-	vaesenclast256(t4, x0, t5); \
-	vaesenclast256(t4, x7, t5); \
-	vaesenclast256(t4, x1, t5); \
-	vaesenclast256(t4, x4, t5); \
-	vaesenclast256(t4, x2, t5); \
-	vaesenclast256(t4, x5, t5); \
-	vaesenclast256(t4, x3, t5); \
-	vaesenclast256(t4, x6, t5); \
+	vaesenclast t4##_x, x2##_x, x2##_x; \
+	vaesenclast t4##_x, t6##_x, t6##_x; \
+	vinserti128 $1, t6##_x, x2, x2; \
+	vaesenclast t4##_x, x5##_x, x5##_x; \
+	vaesenclast t4##_x, t5##_x, t5##_x; \
+	vinserti128 $1, t5##_x, x5, x5; \
+	vaesenclast t4##_x, x1##_x, x1##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x1, x1; \
+	vaesenclast t4##_x, x4##_x, x4##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
 	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
@@ -139,22 +153,12 @@
 	/* postfilter sbox 2 */ \
 	filter_8bit(x1, t4, t5, t7, t2); \
 	filter_8bit(x4, t4, t5, t7, t2); \
+	vpxor t7, t7, t7; \
 	\
 	vpsrldq $1, t0, t1; \
 	vpsrldq $2, t0, t2; \
+	vpshufb t7, t1, t1; \
 	vpsrldq $3, t0, t3; \
-	vpsrldq $4, t0, t4; \
-	vpsrldq $5, t0, t5; \
-	vpsrldq $6, t0, t6; \
-	vpsrldq $7, t0, t7; \
-	vpbroadcastb t0##_x, t0; \
-	vpbroadcastb t1##_x, t1; \
-	vpbroadcastb t2##_x, t2; \
-	vpbroadcastb t3##_x, t3; \
-	vpbroadcastb t4##_x, t4; \
-	vpbroadcastb t6##_x, t6; \
-	vpbroadcastb t5##_x, t5; \
-	vpbroadcastb t7##_x, t7; \
 	\
 	/* P-function */ \
 	vpxor x5, x0, x0; \
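
The block replaced above splats each byte of the key-material register t0 across a full ymm register. The old sequence shifted with vpsrldq and then used vpbroadcastb on the xmm half; the new one zeroes t7 once and uses vpshufb with that all-zero index vector, spreading the remaining shifts and shuffles between the P-function XORs in the following hunks. A hedged intrinsics sketch of the two byte-splat variants, for illustration only (it assumes both 128-bit lanes of the key register already hold the same data, since vpshufb shuffles each lane independently):

#include <immintrin.h>

/* Old scheme: vpbroadcastb ymm, xmm replicates byte 0 of the low lane. */
static inline __m256i splat_byte0_broadcastb(__m256i key)
{
	return _mm256_broadcastb_epi8(_mm256_castsi256_si128(key));
}

/* New scheme: vpshufb with an all-zero index copies byte 0 of each
 * 128-bit lane to every byte position in that lane. */
static inline __m256i splat_byte0_shufb(__m256i key)
{
	return _mm256_shuffle_epi8(key, _mm256_setzero_si256());
}

/* Byte k is reached the same way as in the asm: vpsrldq by k first. */
static inline __m256i splat_byte3_shufb(__m256i key)
{
	return _mm256_shuffle_epi8(_mm256_bsrli_epi128(key, 3),
				   _mm256_setzero_si256());
}
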
@@ -162,11 +166,21 @@
 	vpxor x7, x2, x2; \
 	vpxor x4, x3, x3; \
 	\
+	vpshufb t7, t2, t2; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t7, t3, t3; \
+	vpsrldq $5, t0, t5; \
+	vpshufb t7, t4, t4; \
+	\
 	vpxor x2, x4, x4; \
 	vpxor x3, x5, x5; \
 	vpxor x0, x6, x6; \
 	vpxor x1, x7, x7; \
 	\
+	vpsrldq $6, t0, t6; \
+	vpshufb t7, t5, t5; \
+	vpshufb t7, t6, t6; \
+	\
 	vpxor x7, x0, x0; \
 	vpxor x4, x1, x1; \
 	vpxor x5, x2, x2; \
@@ -179,12 +193,16 @@
 	\
 	/* Add key material and result to CD (x becomes new CD) */ \
 	\
-	vpxor t7, x0, x0; \
-	vpxor 4 * 32(mem_cd), x0, x0; \
-	\
 	vpxor t6, x1, x1; \
 	vpxor 5 * 32(mem_cd), x1, x1; \
 	\
+	vpsrldq $7, t0, t6; \
+	vpshufb t7, t0, t0; \
+	vpshufb t7, t6, t7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
 	vpxor t5, x2, x2; \
 	vpxor 6 * 32(mem_cd), x2, x2; \
 	\
@@ -204,7 +222,7 @@
 	vpxor 3 * 32(mem_cd), x7, x7;
 
 /*
- * Size optimization... with inlined roundsm16 binary would be over 5 times
+ * Size optimization... with inlined roundsm32 binary would be over 5 times
  * larger and would only marginally faster.
  */
 .align 8
@@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 */ \
 	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
 	vpxor tt0, tt0, tt0; \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand l0, t0, t0; \
 	vpand l1, t1, t1; \
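
The same substitution is made here for the FL/FL-inverse helper: the 32-bit subkey (kll, and later krr, krl, klr) is broadcast to every dword with vpbroadcastd and then split into its four bytes, each replicated across a ymm register, now via vpshufb against the already-zeroed tt0 instead of vpbroadcastb. The vpbroadcastd lines added in the following hunks simply hoist each next subkey load up among the preceding stores so t0 is ready earlier. A rough C sketch of the unpacking step (names and the helper are invented for illustration):

#include <immintrin.h>
#include <stdint.h>

/*
 * Unpack a 32-bit subkey into four ymm registers, each holding one key
 * byte replicated 32 times, mirroring the vpbroadcastd followed by the
 * alternating vpshufb / vpsrldq $1 sequence in the macro.
 */
static inline void splat_subkey_bytes(uint32_t subkey, __m256i out[4])
{
	const __m256i zero = _mm256_setzero_si256();	/* vpxor tt0, tt0, tt0 */
	__m256i t0 = _mm256_set1_epi32((int)subkey);	/* vpbroadcastd        */

	out[3] = _mm256_shuffle_epi8(t0, zero);		/* byte 0 -> t3 */
	t0 = _mm256_bsrli_epi128(t0, 1);		/* vpsrldq $1   */
	out[2] = _mm256_shuffle_epi8(t0, zero);		/* byte 1 -> t2 */
	t0 = _mm256_bsrli_epi128(t0, 1);
	out[1] = _mm256_shuffle_epi8(t0, zero);		/* byte 2 -> t1 */
	t0 = _mm256_bsrli_epi128(t0, 1);
	out[0] = _mm256_shuffle_epi8(t0, zero);		/* byte 3 -> t0 */
}
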
@@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 	\
 	vpxor l4, t0, l4; \
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
 	vmovdqu l4, 4 * 32(l); \
 	vpxor l5, t1, l5; \
 	vmovdqu l5, 5 * 32(l); \
@@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * rl ^= t2; \
 	 */ \
 	\
-	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor 4 * 32(r), t0, t0; \
 	vpor 5 * 32(r), t1, t1; \
@@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 2 * 32(r), t2, t2; \
 	vpxor 3 * 32(r), t3, t3; \
 	vmovdqu t0, 0 * 32(r); \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 1 * 32(r); \
 	vmovdqu t2, 2 * 32(r); \
 	vmovdqu t3, 3 * 32(r); \
@@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * t2 &= rl; \
 	 * rr ^= rol32(t2, 1); \
 	 */ \
-	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand 0 * 32(r), t0, t0; \
 	vpand 1 * 32(r), t1, t1; \
@@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 6 * 32(r), t2, t2; \
 	vpxor 7 * 32(r), t3, t3; \
 	vmovdqu t0, 4 * 32(r); \
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 5 * 32(r); \
 	vmovdqu t2, 6 * 32(r); \
 	vmovdqu t3, 7 * 32(r); \
@@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * ll ^= t0; \
 	 */ \
 	\
-	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor l4, t0, t0; \
 	vpor l5, t1, t1; \