-rw-r--r--	arch/x86/crypto/camellia-aesni-avx2-asm_64.S	| 160
1 file changed, 89 insertions(+), 71 deletions(-)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 91a1878fcc3e..0e0b8863a34b 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -51,16 +51,6 @@
 #define ymm14_x xmm14
 #define ymm15_x xmm15
 
-/*
- * AES-NI instructions do not support ymmX registers, so we need splitting and
- * merging.
- */
-#define vaesenclast256(zero, yreg, tmp) \
-	vextracti128 $1, yreg, tmp##_x; \
-	vaesenclast zero##_x, yreg##_x, yreg##_x; \
-	vaesenclast zero##_x, tmp##_x, tmp##_x; \
-	vinserti128 $1, tmp##_x, yreg, yreg;
-
 /**********************************************************************
   32-way camellia
  **********************************************************************/
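
Note on the hunk above: the vaesenclast256 helper existed because the AES-NI instructions (here vaesenclast) only operate on 128-bit xmm registers, never on ymm, so each 256-bit value has to be split into halves, encrypted, and merged again. A minimal standalone sketch of what one macro invocation expanded to, with concrete registers chosen purely for illustration and %xmm15 assumed to be already zeroed (a zero round key turns vaesenclast into plain ShiftRows + SubBytes):

	vextracti128	$1, %ymm0, %xmm14		/* split off the high 128 bits */
	vaesenclast	%xmm15, %xmm0, %xmm0		/* low half: ShiftRows + SubBytes */
	vaesenclast	%xmm15, %xmm14, %xmm14		/* high half: ShiftRows + SubBytes */
	vinserti128	$1, %xmm14, %ymm0, %ymm0	/* merge the halves back into ymm0 */

The following hunk open-codes this sequence at each use instead of invoking the macro.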
@@ -79,46 +69,70 @@
 	 * S-function with AES subbytes \
 	 */ \
 	vbroadcasti128 .Linv_shift_row, t4; \
-	vpbroadcastb .L0f0f0f0f, t7; \
-	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
-	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	vpbroadcastd .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
 	vpshufb t4, x7, x7; \
-	vpshufb t4, x1, x1; \
-	vpshufb t4, x4, x4; \
-	vpshufb t4, x2, x2; \
-	vpshufb t4, x5, x5; \
 	vpshufb t4, x3, x3; \
 	vpshufb t4, x6, x6; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
 	\
 	/* prefilter sboxes 1, 2 and 3 */ \
-	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
-	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
-	filter_8bit(x0, t0, t1, t7, t6); \
-	filter_8bit(x7, t0, t1, t7, t6); \
-	filter_8bit(x1, t0, t1, t7, t6); \
-	filter_8bit(x4, t0, t1, t7, t6); \
-	filter_8bit(x2, t0, t1, t7, t6); \
-	filter_8bit(x5, t0, t1, t7, t6); \
-	\
 	/* prefilter sbox 4 */ \
+	filter_8bit(x0, t5, t6, t7, t4); \
+	filter_8bit(x7, t5, t6, t7, t4); \
+	vextracti128 $1, x0, t0##_x; \
+	vextracti128 $1, x7, t1##_x; \
+	filter_8bit(x3, t2, t3, t7, t4); \
+	filter_8bit(x6, t2, t3, t7, t4); \
+	vextracti128 $1, x3, t3##_x; \
+	vextracti128 $1, x6, t2##_x; \
+	filter_8bit(x2, t5, t6, t7, t4); \
+	filter_8bit(x5, t5, t6, t7, t4); \
+	filter_8bit(x1, t5, t6, t7, t4); \
+	filter_8bit(x4, t5, t6, t7, t4); \
+	\
 	vpxor t4##_x, t4##_x, t4##_x; \
-	filter_8bit(x3, t2, t3, t7, t6); \
-	filter_8bit(x6, t2, t3, t7, t6); \
 	\
 	/* AES subbytes + AES shift rows */ \
+	vextracti128 $1, x2, t6##_x; \
+	vextracti128 $1, x5, t5##_x; \
+	vaesenclast t4##_x, x0##_x, x0##_x; \
+	vaesenclast t4##_x, t0##_x, t0##_x; \
+	vinserti128 $1, t0##_x, x0, x0; \
+	vaesenclast t4##_x, x7##_x, x7##_x; \
+	vaesenclast t4##_x, t1##_x, t1##_x; \
+	vinserti128 $1, t1##_x, x7, x7; \
+	vaesenclast t4##_x, x3##_x, x3##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x3, x3; \
+	vaesenclast t4##_x, x6##_x, x6##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x6, x6; \
+	vextracti128 $1, x1, t3##_x; \
+	vextracti128 $1, x4, t2##_x; \
 	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
 	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
-	vaesenclast256(t4, x0, t5); \
-	vaesenclast256(t4, x7, t5); \
-	vaesenclast256(t4, x1, t5); \
-	vaesenclast256(t4, x4, t5); \
-	vaesenclast256(t4, x2, t5); \
-	vaesenclast256(t4, x5, t5); \
-	vaesenclast256(t4, x3, t5); \
-	vaesenclast256(t4, x6, t5); \
+	vaesenclast t4##_x, x2##_x, x2##_x; \
+	vaesenclast t4##_x, t6##_x, t6##_x; \
+	vinserti128 $1, t6##_x, x2, x2; \
+	vaesenclast t4##_x, x5##_x, x5##_x; \
+	vaesenclast t4##_x, t5##_x, t5##_x; \
+	vinserti128 $1, t5##_x, x5, x5; \
+	vaesenclast t4##_x, x1##_x, x1##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x1, x1; \
+	vaesenclast t4##_x, x4##_x, x4##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
 	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
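
The open-coded version above performs the same split / vaesenclast / merge steps as the removed macro, but interleaves independent chains working on different x registers and hoists the post-filter table loads between them, presumably so that operations with no data dependency on each other can overlap. A hedged sketch of that interleaving pattern, again with illustrative registers and %xmm15 assumed zero:

	vextracti128	$1, %ymm0, %xmm8		/* begin splitting block A */
	vextracti128	$1, %ymm1, %xmm9		/* ... and block B */
	vaesenclast	%xmm15, %xmm0, %xmm0		/* block A, low half */
	vaesenclast	%xmm15, %xmm8, %xmm8		/* block A, high half */
	vinserti128	$1, %xmm8, %ymm0, %ymm0		/* merge block A */
	vaesenclast	%xmm15, %xmm1, %xmm1		/* block B, low half */
	vaesenclast	%xmm15, %xmm9, %xmm9		/* block B, high half */
	vinserti128	$1, %xmm9, %ymm1, %ymm1		/* merge block B */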
@@ -139,22 +153,12 @@
 	/* postfilter sbox 2 */ \
 	filter_8bit(x1, t4, t5, t7, t2); \
 	filter_8bit(x4, t4, t5, t7, t2); \
+	vpxor t7, t7, t7; \
 	\
 	vpsrldq $1, t0, t1; \
 	vpsrldq $2, t0, t2; \
+	vpshufb t7, t1, t1; \
 	vpsrldq $3, t0, t3; \
-	vpsrldq $4, t0, t4; \
-	vpsrldq $5, t0, t5; \
-	vpsrldq $6, t0, t6; \
-	vpsrldq $7, t0, t7; \
-	vpbroadcastb t0##_x, t0; \
-	vpbroadcastb t1##_x, t1; \
-	vpbroadcastb t2##_x, t2; \
-	vpbroadcastb t3##_x, t3; \
-	vpbroadcastb t4##_x, t4; \
-	vpbroadcastb t6##_x, t6; \
-	vpbroadcastb t5##_x, t5; \
-	vpbroadcastb t7##_x, t7; \
 	\
 	/* P-function */ \
 	vpxor x5, x0, x0; \
@@ -162,11 +166,21 @@
 	vpxor x7, x2, x2; \
 	vpxor x4, x3, x3; \
 	\
+	vpshufb t7, t2, t2; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t7, t3, t3; \
+	vpsrldq $5, t0, t5; \
+	vpshufb t7, t4, t4; \
+	\
 	vpxor x2, x4, x4; \
 	vpxor x3, x5, x5; \
 	vpxor x0, x6, x6; \
 	vpxor x1, x7, x7; \
 	\
+	vpsrldq $6, t0, t6; \
+	vpshufb t7, t5, t5; \
+	vpshufb t7, t6, t6; \
+	\
 	vpxor x7, x0, x0; \
 	vpxor x4, x1, x1; \
 	vpxor x5, x2, x2; \
@@ -179,12 +193,16 @@
 	\
 	/* Add key material and result to CD (x becomes new CD) */ \
 	\
-	vpxor t7, x0, x0; \
-	vpxor 4 * 32(mem_cd), x0, x0; \
-	\
 	vpxor t6, x1, x1; \
 	vpxor 5 * 32(mem_cd), x1, x1; \
 	\
+	vpsrldq $7, t0, t6; \
+	vpshufb t7, t0, t0; \
+	vpshufb t7, t6, t7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
 	vpxor t5, x2, x2; \
 	vpxor 6 * 32(mem_cd), x2, x2; \
 	\
@@ -204,7 +222,7 @@
 	vpxor 3 * 32(mem_cd), x7, x7;
 
 /*
- * Size optimization... with inlined roundsm16 binary would be over 5 times
+ * Size optimization... with inlined roundsm32 binary would be over 5 times
  * larger and would only marginally faster.
  */
 .align 8
@@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 */ \
 	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
 	vpxor tt0, tt0, tt0; \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand l0, t0, t0; \
 	vpand l1, t1, t1; \
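
Note on the vpbroadcastb -> vpshufb switch in this hunk and the ones that follow: vpbroadcastb replicates byte 0 of its xmm source across the whole ymm destination, while vpshufb with an all-zero index register (tt0, cleared by the existing vpxor tt0, tt0, tt0 above) replicates byte 0 of each 128-bit lane within that lane. The two are equivalent here because t0 was filled with vpbroadcastd kll (per the comment, only the lowest 32 bits are used), so both lanes already hold identical key bytes, and each vpsrldq $1 shifts the lanes identically to step to the next key byte. The key-material section of roundsm32 earlier in this diff applies the same idiom with t7 as the zeroed index register, and the later hunks also hoist each following vpbroadcastd of krr/krl/klr up between unrelated stores, apparently so the load completes before its bytes are needed. A minimal sketch of the idiom, with illustrative registers and a hypothetical key address in %r9:

	vpbroadcastd	(%r9), %ymm0		/* key dword replicated into every lane */
	vpxor		%ymm2, %ymm2, %ymm2	/* all-zero shuffle indices */
	vpshufb		%ymm2, %ymm0, %ymm3	/* key byte 0 broadcast to all 32 bytes */
	vpsrldq		$1, %ymm0, %ymm0	/* next key byte moves into position 0 of each lane */
	vpshufb		%ymm2, %ymm0, %ymm4	/* key byte 1 broadcast to all 32 bytes */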
@@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 	\
 	vpxor l4, t0, l4; \
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
 	vmovdqu l4, 4 * 32(l); \
 	vpxor l5, t1, l5; \
 	vmovdqu l5, 5 * 32(l); \
@@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * rl ^= t2; \
 	 */ \
 	\
-	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor 4 * 32(r), t0, t0; \
 	vpor 5 * 32(r), t1, t1; \
@@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 2 * 32(r), t2, t2; \
 	vpxor 3 * 32(r), t3, t3; \
 	vmovdqu t0, 0 * 32(r); \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 1 * 32(r); \
 	vmovdqu t2, 2 * 32(r); \
 	vmovdqu t3, 3 * 32(r); \
@@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * t2 &= rl; \
 	 * rr ^= rol32(t2, 1); \
 	 */ \
-	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand 0 * 32(r), t0, t0; \
 	vpand 1 * 32(r), t1, t1; \
@@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 6 * 32(r), t2, t2; \
 	vpxor 7 * 32(r), t3, t3; \
 	vmovdqu t0, 4 * 32(r); \
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 5 * 32(r); \
 	vmovdqu t2, 6 * 32(r); \
 	vmovdqu t3, 7 * 32(r); \
@@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * ll ^= t0; \
 	 */ \
 	\
-	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor l4, t0, t0; \
 	vpor l5, t1, t1; \