author		Timothy McCaffrey <timothy.mccaffrey@unisys.com>	2015-01-13 13:16:43 -0500
committer	Herbert Xu <herbert@gondor.apana.org.au>	2015-01-14 05:56:51 -0500
commit		e31ac32d3bc27c33f002e0c9ffd6ae08b65474e6 (patch)
tree		e2f11e810e52f8aa8a1b5e813a64ef56f1c1a6a6 /arch/x86/crypto/aesni-intel_asm.S
parent		d8219f52a72033f84c15cde73294d46578fb2d68 (diff)
crypto: aesni - Add support for 192 & 256 bit keys to AESNI RFC4106
This patch fixes the RFC4106 implementation in the aesni-intel
module so that it supports 192- and 256-bit keys.
Since the AVX support that was added to this module also only
supports 128-bit keys, and this patch only affects the SSE
implementation, changes were also made to use the SSE version
whenever a key size other than 128 bits is specified.
RFC4106 specifies that 192- and 256-bit keys must be supported
(section 8.4).
This should also fix strongSwan issue 341, where the aesni module
needs to be unloaded if 256-bit keys are used:
http://wiki.strongswan.org/issues/341
This patch has been tested on Sandy Bridge and Haswell processors.
With 128-bit keys and input buffers > 512 bytes a slight performance
degradation was noticed (~1%). For input buffers of less than 512
bytes there was no performance impact. Compared to 128-bit keys,
256-bit key size performance is approximately 0.5 cycles per byte
slower on Sandy Bridge, and 0.37 cycles per byte slower on Haswell
(vs. the SSE code).
This patch has also been tested with strongSwan IPsec connections,
where it worked correctly.
I created this diff from a git clone of crypto-2.6.git.
If you have any questions, please feel free to contact me.
Signed-off-by: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r--	arch/x86/crypto/aesni-intel_asm.S	343
1 file changed, 177 insertions(+), 166 deletions(-)
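The diff below replaces the hard-coded ten-round AES sequences with loops whose
trip count is derived from the key size stored in the context. The mapping the
new arithmetic relies on can be sketched in C as follows (a minimal sketch,
assuming the context stores the key length in bytes, i.e. 16, 24 or 32):

    #include <stdio.h>

    int main(void)
    {
            /* key length in bytes as stored in the AES context */
            static const int keysize[] = { 16, 24, 32 };

            for (int i = 0; i < 3; i++) {
                    int ks        = keysize[i];
                    int rounds    = ks / 4 + 6;  /* 10, 12, 14: total AES rounds        */
                    int enc_loops = ks / 4 + 5;  /* 9, 11, 13: AESENC before AESENCLAST */
                    int extra     = ks / 4 - 4;  /* 0, 2, 4: rounds past offset 0x90    */

                    printf("%3d-bit key: %2d rounds, %2d AESENC, %d extra\n",
                           ks * 8, rounds, enc_loops, extra);
            }
            return 0;
    }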
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 477e9d75149b..6bd2c6c95373 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,12 +32,23 @@ | |||
32 | #include <linux/linkage.h> | 32 | #include <linux/linkage.h> |
33 | #include <asm/inst.h> | 33 | #include <asm/inst.h> |
34 | 34 | ||
35 | /* | ||
36 | * The following macros are used to move an (un)aligned 16 byte value to/from | ||
37 | * an XMM register. This can done for either FP or integer values, for FP use | ||
38 | * movaps (move aligned packed single) or integer use movdqa (move double quad | ||
39 | * aligned). It doesn't make a performance difference which instruction is used | ||
40 | * since Nehalem (original Core i7) was released. However, the movaps is a byte | ||
41 | * shorter, so that is the one we'll use for now. (same for unaligned). | ||
42 | */ | ||
43 | #define MOVADQ movaps | ||
44 | #define MOVUDQ movups | ||
45 | |||
35 | #ifdef __x86_64__ | 46 | #ifdef __x86_64__ |
47 | |||
36 | .data | 48 | .data |
37 | .align 16 | 49 | .align 16 |
38 | .Lgf128mul_x_ble_mask: | 50 | .Lgf128mul_x_ble_mask: |
39 | .octa 0x00000000000000010000000000000087 | 51 | .octa 0x00000000000000010000000000000087 |
40 | |||
41 | POLY: .octa 0xC2000000000000000000000000000001 | 52 | POLY: .octa 0xC2000000000000000000000000000001 |
42 | TWOONE: .octa 0x00000001000000000000000000000001 | 53 | TWOONE: .octa 0x00000001000000000000000000000001 |
43 | 54 | ||
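The MOVADQ/MOVUDQ macros added above only pick between aligned and unaligned
16-byte moves. As a rough user-space analogy (not the kernel code itself, and
the helper names are made up for illustration), the same distinction with SSE2
intrinsics looks like this:

    #include <emmintrin.h>

    /* Round keys in the expanded schedule are 16-byte aligned, so the
     * aligned form (movaps/movdqa class) is safe for them; plaintext and
     * ciphertext buffers may not be aligned, so they get the unaligned
     * form (movups/movdqu class). */
    static inline __m128i load_round_key(const void *rk)   /* 16-byte aligned */
    {
            return _mm_load_si128((const __m128i *)rk);
    }

    static inline __m128i load_block(const void *p)         /* any alignment */
    {
            return _mm_loadu_si128((const __m128i *)p);
    }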
@@ -89,6 +100,7 @@ enc: .octa 0x2 | |||
89 | #define arg8 STACK_OFFSET+16(%r14) | 100 | #define arg8 STACK_OFFSET+16(%r14) |
90 | #define arg9 STACK_OFFSET+24(%r14) | 101 | #define arg9 STACK_OFFSET+24(%r14) |
91 | #define arg10 STACK_OFFSET+32(%r14) | 102 | #define arg10 STACK_OFFSET+32(%r14) |
103 | #define keysize 2*15*16(%arg1) | ||
92 | #endif | 104 | #endif |
93 | 105 | ||
94 | 106 | ||
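The new keysize macro reads the key length directly out of the AES context:
the displacement 2*15*16 = 480 skips over the two 15-entry round-key schedules.
A minimal sketch, assuming the context has the struct crypto_aes_ctx layout
(encryption and decryption schedules followed by the key length in bytes):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define AES_MAX_KEYLENGTH_U32   (15 * 4)

    struct crypto_aes_ctx {
            uint32_t key_enc[AES_MAX_KEYLENGTH_U32];
            uint32_t key_dec[AES_MAX_KEYLENGTH_U32];
            uint32_t key_length;
    };

    int main(void)
    {
            /* prints 480, the displacement used by the keysize macro */
            printf("%zu\n", offsetof(struct crypto_aes_ctx, key_length));
            return 0;
    }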
@@ -213,10 +225,12 @@ enc: .octa 0x2 | |||
213 | 225 | ||
214 | .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | 226 | .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ |
215 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | 227 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation |
228 | MOVADQ SHUF_MASK(%rip), %xmm14 | ||
216 | mov arg7, %r10 # %r10 = AAD | 229 | mov arg7, %r10 # %r10 = AAD |
217 | mov arg8, %r12 # %r12 = aadLen | 230 | mov arg8, %r12 # %r12 = aadLen |
218 | mov %r12, %r11 | 231 | mov %r12, %r11 |
219 | pxor %xmm\i, %xmm\i | 232 | pxor %xmm\i, %xmm\i |
233 | |||
220 | _get_AAD_loop\num_initial_blocks\operation: | 234 | _get_AAD_loop\num_initial_blocks\operation: |
221 | movd (%r10), \TMP1 | 235 | movd (%r10), \TMP1 |
222 | pslldq $12, \TMP1 | 236 | pslldq $12, \TMP1 |
@@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation: | |||
225 | add $4, %r10 | 239 | add $4, %r10 |
226 | sub $4, %r12 | 240 | sub $4, %r12 |
227 | jne _get_AAD_loop\num_initial_blocks\operation | 241 | jne _get_AAD_loop\num_initial_blocks\operation |
242 | |||
228 | cmp $16, %r11 | 243 | cmp $16, %r11 |
229 | je _get_AAD_loop2_done\num_initial_blocks\operation | 244 | je _get_AAD_loop2_done\num_initial_blocks\operation |
245 | |||
230 | mov $16, %r12 | 246 | mov $16, %r12 |
231 | _get_AAD_loop2\num_initial_blocks\operation: | 247 | _get_AAD_loop2\num_initial_blocks\operation: |
232 | psrldq $4, %xmm\i | 248 | psrldq $4, %xmm\i |
233 | sub $4, %r12 | 249 | sub $4, %r12 |
234 | cmp %r11, %r12 | 250 | cmp %r11, %r12 |
235 | jne _get_AAD_loop2\num_initial_blocks\operation | 251 | jne _get_AAD_loop2\num_initial_blocks\operation |
252 | |||
236 | _get_AAD_loop2_done\num_initial_blocks\operation: | 253 | _get_AAD_loop2_done\num_initial_blocks\operation: |
237 | movdqa SHUF_MASK(%rip), %xmm14 | ||
238 | PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data | 254 | PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data |
239 | 255 | ||
240 | xor %r11, %r11 # initialise the data pointer offset as zero | 256 | xor %r11, %r11 # initialise the data pointer offset as zero |
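Loading SHUF_MASK into %xmm14 once at the top of the macro and dropping the
repeated movdqa reloads is plain invariant hoisting; the mask itself just
byte-reflects each block for GHASH. A rough intrinsics sketch of the same idea,
using a hand-built reversal mask rather than the table in this file:

    #include <stddef.h>
    #include <tmmintrin.h>          /* SSSE3 _mm_shuffle_epi8 */

    /* Byte-reflect nblocks 16-byte blocks in place, loading the shuffle
     * mask once instead of once per block. */
    static void byte_reflect(unsigned char *buf, size_t nblocks)
    {
            const __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                              8, 9, 10, 11, 12, 13, 14, 15);

            for (size_t i = 0; i < nblocks; i++) {
                    __m128i b = _mm_loadu_si128((__m128i *)(buf + 16 * i));
                    b = _mm_shuffle_epi8(b, mask);
                    _mm_storeu_si128((__m128i *)(buf + 16 * i), b);
            }
    }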
@@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
243 | 259 | ||
244 | mov %arg5, %rax # %rax = *Y0 | 260 | mov %arg5, %rax # %rax = *Y0 |
245 | movdqu (%rax), \XMM0 # XMM0 = Y0 | 261 | movdqu (%rax), \XMM0 # XMM0 = Y0 |
246 | movdqa SHUF_MASK(%rip), %xmm14 | ||
247 | PSHUFB_XMM %xmm14, \XMM0 | 262 | PSHUFB_XMM %xmm14, \XMM0 |
248 | 263 | ||
249 | .if (\i == 5) || (\i == 6) || (\i == 7) | 264 | .if (\i == 5) || (\i == 6) || (\i == 7) |
265 | MOVADQ ONE(%RIP),\TMP1 | ||
266 | MOVADQ (%arg1),\TMP2 | ||
250 | .irpc index, \i_seq | 267 | .irpc index, \i_seq |
251 | paddd ONE(%rip), \XMM0 # INCR Y0 | 268 | paddd \TMP1, \XMM0 # INCR Y0 |
252 | movdqa \XMM0, %xmm\index | 269 | movdqa \XMM0, %xmm\index |
253 | movdqa SHUF_MASK(%rip), %xmm14 | ||
254 | PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap | 270 | PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap |
255 | 271 | pxor \TMP2, %xmm\index | |
256 | .endr | ||
257 | .irpc index, \i_seq | ||
258 | pxor 16*0(%arg1), %xmm\index | ||
259 | .endr | ||
260 | .irpc index, \i_seq | ||
261 | movaps 0x10(%rdi), \TMP1 | ||
262 | AESENC \TMP1, %xmm\index # Round 1 | ||
263 | .endr | ||
264 | .irpc index, \i_seq | ||
265 | movaps 0x20(%arg1), \TMP1 | ||
266 | AESENC \TMP1, %xmm\index # Round 2 | ||
267 | .endr | ||
268 | .irpc index, \i_seq | ||
269 | movaps 0x30(%arg1), \TMP1 | ||
270 | AESENC \TMP1, %xmm\index # Round 2 | ||
271 | .endr | ||
272 | .irpc index, \i_seq | ||
273 | movaps 0x40(%arg1), \TMP1 | ||
274 | AESENC \TMP1, %xmm\index # Round 2 | ||
275 | .endr | ||
276 | .irpc index, \i_seq | ||
277 | movaps 0x50(%arg1), \TMP1 | ||
278 | AESENC \TMP1, %xmm\index # Round 2 | ||
279 | .endr | ||
280 | .irpc index, \i_seq | ||
281 | movaps 0x60(%arg1), \TMP1 | ||
282 | AESENC \TMP1, %xmm\index # Round 2 | ||
283 | .endr | 272 | .endr |
284 | .irpc index, \i_seq | 273 | lea 0x10(%arg1),%r10 |
285 | movaps 0x70(%arg1), \TMP1 | 274 | mov keysize,%eax |
286 | AESENC \TMP1, %xmm\index # Round 2 | 275 | shr $2,%eax # 128->4, 192->6, 256->8 |
287 | .endr | 276 | add $5,%eax # 128->9, 192->11, 256->13 |
288 | .irpc index, \i_seq | 277 | |
289 | movaps 0x80(%arg1), \TMP1 | 278 | aes_loop_initial_dec\num_initial_blocks: |
290 | AESENC \TMP1, %xmm\index # Round 2 | 279 | MOVADQ (%r10),\TMP1 |
291 | .endr | 280 | .irpc index, \i_seq |
292 | .irpc index, \i_seq | 281 | AESENC \TMP1, %xmm\index |
293 | movaps 0x90(%arg1), \TMP1 | ||
294 | AESENC \TMP1, %xmm\index # Round 2 | ||
295 | .endr | 282 | .endr |
283 | add $16,%r10 | ||
284 | sub $1,%eax | ||
285 | jnz aes_loop_initial_dec\num_initial_blocks | ||
286 | |||
287 | MOVADQ (%r10), \TMP1 | ||
296 | .irpc index, \i_seq | 288 | .irpc index, \i_seq |
297 | movaps 0xa0(%arg1), \TMP1 | 289 | AESENCLAST \TMP1, %xmm\index # Last Round |
298 | AESENCLAST \TMP1, %xmm\index # Round 10 | ||
299 | .endr | 290 | .endr |
300 | .irpc index, \i_seq | 291 | .irpc index, \i_seq |
301 | movdqu (%arg3 , %r11, 1), \TMP1 | 292 | movdqu (%arg3 , %r11, 1), \TMP1 |
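The aes_loop_initial_dec loop above is the generic shape of one AES encryption
pass: a whitening XOR with round key 0, keysize/4+5 AESENC rounds, then
AESENCLAST with the final round key. A minimal sketch with AES-NI intrinsics
(compile with -maes); rk and key_length are hypothetical parameters standing in
for the expanded schedule and the stored key size:

    #include <wmmintrin.h>          /* AES-NI intrinsics */

    static __m128i aes_encrypt_block(__m128i block, const __m128i *rk,
                                     unsigned int key_length)
    {
            unsigned int nr = key_length / 4 + 5;   /* 9, 11 or 13 */

            block = _mm_xor_si128(block, rk[0]);    /* round key 0 (whitening) */
            for (unsigned int i = 1; i <= nr; i++)
                    block = _mm_aesenc_si128(block, rk[i]);
            return _mm_aesenclast_si128(block, rk[nr + 1]);
    }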
@@ -305,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
305 | add $16, %r11 | 296 | add $16, %r11 |
306 | 297 | ||
307 | movdqa \TMP1, %xmm\index | 298 | movdqa \TMP1, %xmm\index |
308 | movdqa SHUF_MASK(%rip), %xmm14 | ||
309 | PSHUFB_XMM %xmm14, %xmm\index | 299 | PSHUFB_XMM %xmm14, %xmm\index |
310 | 300 | # prepare plaintext/ciphertext for GHASH computation | |
311 | # prepare plaintext/ciphertext for GHASH computation | ||
312 | .endr | 301 | .endr |
313 | .endif | 302 | .endif |
314 | GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | 303 | GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 |
@@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
338 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | 327 | * Precomputations for HashKey parallel with encryption of first 4 blocks. |
339 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | 328 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i |
340 | */ | 329 | */ |
341 | paddd ONE(%rip), \XMM0 # INCR Y0 | 330 | MOVADQ ONE(%rip), \TMP1 |
342 | movdqa \XMM0, \XMM1 | 331 | paddd \TMP1, \XMM0 # INCR Y0 |
343 | movdqa SHUF_MASK(%rip), %xmm14 | 332 | MOVADQ \XMM0, \XMM1 |
344 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | 333 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap |
345 | 334 | ||
346 | paddd ONE(%rip), \XMM0 # INCR Y0 | 335 | paddd \TMP1, \XMM0 # INCR Y0 |
347 | movdqa \XMM0, \XMM2 | 336 | MOVADQ \XMM0, \XMM2 |
348 | movdqa SHUF_MASK(%rip), %xmm14 | ||
349 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | 337 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap |
350 | 338 | ||
351 | paddd ONE(%rip), \XMM0 # INCR Y0 | 339 | paddd \TMP1, \XMM0 # INCR Y0 |
352 | movdqa \XMM0, \XMM3 | 340 | MOVADQ \XMM0, \XMM3 |
353 | movdqa SHUF_MASK(%rip), %xmm14 | ||
354 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | 341 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap |
355 | 342 | ||
356 | paddd ONE(%rip), \XMM0 # INCR Y0 | 343 | paddd \TMP1, \XMM0 # INCR Y0 |
357 | movdqa \XMM0, \XMM4 | 344 | MOVADQ \XMM0, \XMM4 |
358 | movdqa SHUF_MASK(%rip), %xmm14 | ||
359 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | 345 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap |
360 | 346 | ||
361 | pxor 16*0(%arg1), \XMM1 | 347 | MOVADQ 0(%arg1),\TMP1 |
362 | pxor 16*0(%arg1), \XMM2 | 348 | pxor \TMP1, \XMM1 |
363 | pxor 16*0(%arg1), \XMM3 | 349 | pxor \TMP1, \XMM2 |
364 | pxor 16*0(%arg1), \XMM4 | 350 | pxor \TMP1, \XMM3 |
351 | pxor \TMP1, \XMM4 | ||
365 | movdqa \TMP3, \TMP5 | 352 | movdqa \TMP3, \TMP5 |
366 | pshufd $78, \TMP3, \TMP1 | 353 | pshufd $78, \TMP3, \TMP1 |
367 | pxor \TMP3, \TMP1 | 354 | pxor \TMP3, \TMP1 |
@@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
399 | pshufd $78, \TMP5, \TMP1 | 386 | pshufd $78, \TMP5, \TMP1 |
400 | pxor \TMP5, \TMP1 | 387 | pxor \TMP5, \TMP1 |
401 | movdqa \TMP1, HashKey_4_k(%rsp) | 388 | movdqa \TMP1, HashKey_4_k(%rsp) |
402 | movaps 0xa0(%arg1), \TMP2 | 389 | lea 0xa0(%arg1),%r10 |
390 | mov keysize,%eax | ||
391 | shr $2,%eax # 128->4, 192->6, 256->8 | ||
392 | sub $4,%eax # 128->0, 192->2, 256->4 | ||
393 | jz aes_loop_pre_dec_done\num_initial_blocks | ||
394 | |||
395 | aes_loop_pre_dec\num_initial_blocks: | ||
396 | MOVADQ (%r10),\TMP2 | ||
397 | .irpc index, 1234 | ||
398 | AESENC \TMP2, %xmm\index | ||
399 | .endr | ||
400 | add $16,%r10 | ||
401 | sub $1,%eax | ||
402 | jnz aes_loop_pre_dec\num_initial_blocks | ||
403 | |||
404 | aes_loop_pre_dec_done\num_initial_blocks: | ||
405 | MOVADQ (%r10), \TMP2 | ||
403 | AESENCLAST \TMP2, \XMM1 | 406 | AESENCLAST \TMP2, \XMM1 |
404 | AESENCLAST \TMP2, \XMM2 | 407 | AESENCLAST \TMP2, \XMM2 |
405 | AESENCLAST \TMP2, \XMM3 | 408 | AESENCLAST \TMP2, \XMM3 |
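The aes_loop_pre_dec loop changes only the tail of the cipher: rounds 1-9 stay
unrolled (interleaved with the HashKey precomputation), and just the
keysize-dependent remainder past offset 0xa0-0x10 is looped before the final
AESENCLAST. Sketched with the same hypothetical rk/key_length parameters as
above:

    #include <wmmintrin.h>

    static __m128i finish_rounds(__m128i block, const __m128i *rk,
                                 unsigned int key_length)
    {
            unsigned int extra = key_length / 4 - 4;    /* 0, 2 or 4 */
            const __m128i *k = &rk[10];                 /* first key past 0x90 */

            for (unsigned int i = 0; i < extra; i++)
                    block = _mm_aesenc_si128(block, *k++);
            return _mm_aesenclast_si128(block, *k);     /* last round key */
    }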
@@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
421 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | 424 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) |
422 | movdqa \TMP1, \XMM4 | 425 | movdqa \TMP1, \XMM4 |
423 | add $64, %r11 | 426 | add $64, %r11 |
424 | movdqa SHUF_MASK(%rip), %xmm14 | ||
425 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | 427 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap |
426 | pxor \XMMDst, \XMM1 | 428 | pxor \XMMDst, \XMM1 |
427 | # combine GHASHed value with the corresponding ciphertext | 429 | # combine GHASHed value with the corresponding ciphertext |
428 | movdqa SHUF_MASK(%rip), %xmm14 | ||
429 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | 430 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap |
430 | movdqa SHUF_MASK(%rip), %xmm14 | ||
431 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | 431 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap |
432 | movdqa SHUF_MASK(%rip), %xmm14 | ||
433 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | 432 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap |
434 | 433 | ||
435 | _initial_blocks_done\num_initial_blocks\operation: | 434 | _initial_blocks_done\num_initial_blocks\operation: |
@@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation: | |||
451 | 450 | ||
452 | .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | 451 | .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ |
453 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | 452 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation |
453 | MOVADQ SHUF_MASK(%rip), %xmm14 | ||
454 | mov arg7, %r10 # %r10 = AAD | 454 | mov arg7, %r10 # %r10 = AAD |
455 | mov arg8, %r12 # %r12 = aadLen | 455 | mov arg8, %r12 # %r12 = aadLen |
456 | mov %r12, %r11 | 456 | mov %r12, %r11 |
@@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation: | |||
472 | cmp %r11, %r12 | 472 | cmp %r11, %r12 |
473 | jne _get_AAD_loop2\num_initial_blocks\operation | 473 | jne _get_AAD_loop2\num_initial_blocks\operation |
474 | _get_AAD_loop2_done\num_initial_blocks\operation: | 474 | _get_AAD_loop2_done\num_initial_blocks\operation: |
475 | movdqa SHUF_MASK(%rip), %xmm14 | ||
476 | PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data | 475 | PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data |
477 | 476 | ||
478 | xor %r11, %r11 # initialise the data pointer offset as zero | 477 | xor %r11, %r11 # initialise the data pointer offset as zero |
@@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
481 | 480 | ||
482 | mov %arg5, %rax # %rax = *Y0 | 481 | mov %arg5, %rax # %rax = *Y0 |
483 | movdqu (%rax), \XMM0 # XMM0 = Y0 | 482 | movdqu (%rax), \XMM0 # XMM0 = Y0 |
484 | movdqa SHUF_MASK(%rip), %xmm14 | ||
485 | PSHUFB_XMM %xmm14, \XMM0 | 483 | PSHUFB_XMM %xmm14, \XMM0 |
486 | 484 | ||
487 | .if (\i == 5) || (\i == 6) || (\i == 7) | 485 | .if (\i == 5) || (\i == 6) || (\i == 7) |
488 | .irpc index, \i_seq | ||
489 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
490 | movdqa \XMM0, %xmm\index | ||
491 | movdqa SHUF_MASK(%rip), %xmm14 | ||
492 | PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap | ||
493 | 486 | ||
494 | .endr | 487 | MOVADQ ONE(%RIP),\TMP1 |
495 | .irpc index, \i_seq | 488 | MOVADQ 0(%arg1),\TMP2 |
496 | pxor 16*0(%arg1), %xmm\index | ||
497 | .endr | ||
498 | .irpc index, \i_seq | ||
499 | movaps 0x10(%rdi), \TMP1 | ||
500 | AESENC \TMP1, %xmm\index # Round 1 | ||
501 | .endr | ||
502 | .irpc index, \i_seq | ||
503 | movaps 0x20(%arg1), \TMP1 | ||
504 | AESENC \TMP1, %xmm\index # Round 2 | ||
505 | .endr | ||
506 | .irpc index, \i_seq | 489 | .irpc index, \i_seq |
507 | movaps 0x30(%arg1), \TMP1 | 490 | paddd \TMP1, \XMM0 # INCR Y0 |
508 | AESENC \TMP1, %xmm\index # Round 2 | 491 | MOVADQ \XMM0, %xmm\index |
492 | PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap | ||
493 | pxor \TMP2, %xmm\index | ||
509 | .endr | 494 | .endr |
510 | .irpc index, \i_seq | 495 | lea 0x10(%arg1),%r10 |
511 | movaps 0x40(%arg1), \TMP1 | 496 | mov keysize,%eax |
512 | AESENC \TMP1, %xmm\index # Round 2 | 497 | shr $2,%eax # 128->4, 192->6, 256->8 |
513 | .endr | 498 | add $5,%eax # 128->9, 192->11, 256->13 |
514 | .irpc index, \i_seq | 499 | |
515 | movaps 0x50(%arg1), \TMP1 | 500 | aes_loop_initial_enc\num_initial_blocks: |
516 | AESENC \TMP1, %xmm\index # Round 2 | 501 | MOVADQ (%r10),\TMP1 |
517 | .endr | 502 | .irpc index, \i_seq |
518 | .irpc index, \i_seq | 503 | AESENC \TMP1, %xmm\index |
519 | movaps 0x60(%arg1), \TMP1 | ||
520 | AESENC \TMP1, %xmm\index # Round 2 | ||
521 | .endr | ||
522 | .irpc index, \i_seq | ||
523 | movaps 0x70(%arg1), \TMP1 | ||
524 | AESENC \TMP1, %xmm\index # Round 2 | ||
525 | .endr | ||
526 | .irpc index, \i_seq | ||
527 | movaps 0x80(%arg1), \TMP1 | ||
528 | AESENC \TMP1, %xmm\index # Round 2 | ||
529 | .endr | ||
530 | .irpc index, \i_seq | ||
531 | movaps 0x90(%arg1), \TMP1 | ||
532 | AESENC \TMP1, %xmm\index # Round 2 | ||
533 | .endr | 504 | .endr |
505 | add $16,%r10 | ||
506 | sub $1,%eax | ||
507 | jnz aes_loop_initial_enc\num_initial_blocks | ||
508 | |||
509 | MOVADQ (%r10), \TMP1 | ||
534 | .irpc index, \i_seq | 510 | .irpc index, \i_seq |
535 | movaps 0xa0(%arg1), \TMP1 | 511 | AESENCLAST \TMP1, %xmm\index # Last Round |
536 | AESENCLAST \TMP1, %xmm\index # Round 10 | ||
537 | .endr | 512 | .endr |
538 | .irpc index, \i_seq | 513 | .irpc index, \i_seq |
539 | movdqu (%arg3 , %r11, 1), \TMP1 | 514 | movdqu (%arg3 , %r11, 1), \TMP1 |
@@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
541 | movdqu %xmm\index, (%arg2 , %r11, 1) | 516 | movdqu %xmm\index, (%arg2 , %r11, 1) |
542 | # write back plaintext/ciphertext for num_initial_blocks | 517 | # write back plaintext/ciphertext for num_initial_blocks |
543 | add $16, %r11 | 518 | add $16, %r11 |
544 | |||
545 | movdqa SHUF_MASK(%rip), %xmm14 | ||
546 | PSHUFB_XMM %xmm14, %xmm\index | 519 | PSHUFB_XMM %xmm14, %xmm\index |
547 | 520 | ||
548 | # prepare plaintext/ciphertext for GHASH computation | 521 | # prepare plaintext/ciphertext for GHASH computation |
@@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
575 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | 548 | * Precomputations for HashKey parallel with encryption of first 4 blocks. |
576 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | 549 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i |
577 | */ | 550 | */ |
578 | paddd ONE(%rip), \XMM0 # INCR Y0 | 551 | MOVADQ ONE(%RIP),\TMP1 |
579 | movdqa \XMM0, \XMM1 | 552 | paddd \TMP1, \XMM0 # INCR Y0 |
580 | movdqa SHUF_MASK(%rip), %xmm14 | 553 | MOVADQ \XMM0, \XMM1 |
581 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | 554 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap |
582 | 555 | ||
583 | paddd ONE(%rip), \XMM0 # INCR Y0 | 556 | paddd \TMP1, \XMM0 # INCR Y0 |
584 | movdqa \XMM0, \XMM2 | 557 | MOVADQ \XMM0, \XMM2 |
585 | movdqa SHUF_MASK(%rip), %xmm14 | ||
586 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | 558 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap |
587 | 559 | ||
588 | paddd ONE(%rip), \XMM0 # INCR Y0 | 560 | paddd \TMP1, \XMM0 # INCR Y0 |
589 | movdqa \XMM0, \XMM3 | 561 | MOVADQ \XMM0, \XMM3 |
590 | movdqa SHUF_MASK(%rip), %xmm14 | ||
591 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | 562 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap |
592 | 563 | ||
593 | paddd ONE(%rip), \XMM0 # INCR Y0 | 564 | paddd \TMP1, \XMM0 # INCR Y0 |
594 | movdqa \XMM0, \XMM4 | 565 | MOVADQ \XMM0, \XMM4 |
595 | movdqa SHUF_MASK(%rip), %xmm14 | ||
596 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | 566 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap |
597 | 567 | ||
598 | pxor 16*0(%arg1), \XMM1 | 568 | MOVADQ 0(%arg1),\TMP1 |
599 | pxor 16*0(%arg1), \XMM2 | 569 | pxor \TMP1, \XMM1 |
600 | pxor 16*0(%arg1), \XMM3 | 570 | pxor \TMP1, \XMM2 |
601 | pxor 16*0(%arg1), \XMM4 | 571 | pxor \TMP1, \XMM3 |
572 | pxor \TMP1, \XMM4 | ||
602 | movdqa \TMP3, \TMP5 | 573 | movdqa \TMP3, \TMP5 |
603 | pshufd $78, \TMP3, \TMP1 | 574 | pshufd $78, \TMP3, \TMP1 |
604 | pxor \TMP3, \TMP1 | 575 | pxor \TMP3, \TMP1 |
@@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
636 | pshufd $78, \TMP5, \TMP1 | 607 | pshufd $78, \TMP5, \TMP1 |
637 | pxor \TMP5, \TMP1 | 608 | pxor \TMP5, \TMP1 |
638 | movdqa \TMP1, HashKey_4_k(%rsp) | 609 | movdqa \TMP1, HashKey_4_k(%rsp) |
639 | movaps 0xa0(%arg1), \TMP2 | 610 | lea 0xa0(%arg1),%r10 |
611 | mov keysize,%eax | ||
612 | shr $2,%eax # 128->4, 192->6, 256->8 | ||
613 | sub $4,%eax # 128->0, 192->2, 256->4 | ||
614 | jz aes_loop_pre_enc_done\num_initial_blocks | ||
615 | |||
616 | aes_loop_pre_enc\num_initial_blocks: | ||
617 | MOVADQ (%r10),\TMP2 | ||
618 | .irpc index, 1234 | ||
619 | AESENC \TMP2, %xmm\index | ||
620 | .endr | ||
621 | add $16,%r10 | ||
622 | sub $1,%eax | ||
623 | jnz aes_loop_pre_enc\num_initial_blocks | ||
624 | |||
625 | aes_loop_pre_enc_done\num_initial_blocks: | ||
626 | MOVADQ (%r10), \TMP2 | ||
640 | AESENCLAST \TMP2, \XMM1 | 627 | AESENCLAST \TMP2, \XMM1 |
641 | AESENCLAST \TMP2, \XMM2 | 628 | AESENCLAST \TMP2, \XMM2 |
642 | AESENCLAST \TMP2, \XMM3 | 629 | AESENCLAST \TMP2, \XMM3 |
@@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: | |||
655 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | 642 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) |
656 | 643 | ||
657 | add $64, %r11 | 644 | add $64, %r11 |
658 | movdqa SHUF_MASK(%rip), %xmm14 | ||
659 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | 645 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap |
660 | pxor \XMMDst, \XMM1 | 646 | pxor \XMMDst, \XMM1 |
661 | # combine GHASHed value with the corresponding ciphertext | 647 | # combine GHASHed value with the corresponding ciphertext |
662 | movdqa SHUF_MASK(%rip), %xmm14 | ||
663 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | 648 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap |
664 | movdqa SHUF_MASK(%rip), %xmm14 | ||
665 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | 649 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap |
666 | movdqa SHUF_MASK(%rip), %xmm14 | ||
667 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | 650 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap |
668 | 651 | ||
669 | _initial_blocks_done\num_initial_blocks\operation: | 652 | _initial_blocks_done\num_initial_blocks\operation: |
@@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation | |||
794 | AESENC \TMP3, \XMM3 | 777 | AESENC \TMP3, \XMM3 |
795 | AESENC \TMP3, \XMM4 | 778 | AESENC \TMP3, \XMM4 |
796 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 | 779 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 |
797 | movaps 0xa0(%arg1), \TMP3 | 780 | lea 0xa0(%arg1),%r10 |
781 | mov keysize,%eax | ||
782 | shr $2,%eax # 128->4, 192->6, 256->8 | ||
783 | sub $4,%eax # 128->0, 192->2, 256->4 | ||
784 | jz aes_loop_par_enc_done | ||
785 | |||
786 | aes_loop_par_enc: | ||
787 | MOVADQ (%r10),\TMP3 | ||
788 | .irpc index, 1234 | ||
789 | AESENC \TMP3, %xmm\index | ||
790 | .endr | ||
791 | add $16,%r10 | ||
792 | sub $1,%eax | ||
793 | jnz aes_loop_par_enc | ||
794 | |||
795 | aes_loop_par_enc_done: | ||
796 | MOVADQ (%r10), \TMP3 | ||
798 | AESENCLAST \TMP3, \XMM1 # Round 10 | 797 | AESENCLAST \TMP3, \XMM1 # Round 10 |
799 | AESENCLAST \TMP3, \XMM2 | 798 | AESENCLAST \TMP3, \XMM2 |
800 | AESENCLAST \TMP3, \XMM3 | 799 | AESENCLAST \TMP3, \XMM3 |
@@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation | |||
986 | AESENC \TMP3, \XMM3 | 985 | AESENC \TMP3, \XMM3 |
987 | AESENC \TMP3, \XMM4 | 986 | AESENC \TMP3, \XMM4 |
988 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 | 987 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 |
989 | movaps 0xa0(%arg1), \TMP3 | 988 | lea 0xa0(%arg1),%r10 |
990 | AESENCLAST \TMP3, \XMM1 # Round 10 | 989 | mov keysize,%eax |
990 | shr $2,%eax # 128->4, 192->6, 256->8 | ||
991 | sub $4,%eax # 128->0, 192->2, 256->4 | ||
992 | jz aes_loop_par_dec_done | ||
993 | |||
994 | aes_loop_par_dec: | ||
995 | MOVADQ (%r10),\TMP3 | ||
996 | .irpc index, 1234 | ||
997 | AESENC \TMP3, %xmm\index | ||
998 | .endr | ||
999 | add $16,%r10 | ||
1000 | sub $1,%eax | ||
1001 | jnz aes_loop_par_dec | ||
1002 | |||
1003 | aes_loop_par_dec_done: | ||
1004 | MOVADQ (%r10), \TMP3 | ||
1005 | AESENCLAST \TMP3, \XMM1 # last round | ||
991 | AESENCLAST \TMP3, \XMM2 | 1006 | AESENCLAST \TMP3, \XMM2 |
992 | AESENCLAST \TMP3, \XMM3 | 1007 | AESENCLAST \TMP3, \XMM3 |
993 | AESENCLAST \TMP3, \XMM4 | 1008 | AESENCLAST \TMP3, \XMM4 |
@@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst | |||
1155 | pxor \TMP6, \XMMDst # reduced result is in XMMDst | 1170 | pxor \TMP6, \XMMDst # reduced result is in XMMDst |
1156 | .endm | 1171 | .endm |
1157 | 1172 | ||
1158 | /* Encryption of a single block done*/ | ||
1159 | .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 | ||
1160 | 1173 | ||
1161 | pxor (%arg1), \XMM0 | 1174 | /* Encryption of a single block |
1162 | movaps 16(%arg1), \TMP1 | 1175 | * uses eax & r10 |
1163 | AESENC \TMP1, \XMM0 | 1176 | */ |
1164 | movaps 32(%arg1), \TMP1 | ||
1165 | AESENC \TMP1, \XMM0 | ||
1166 | movaps 48(%arg1), \TMP1 | ||
1167 | AESENC \TMP1, \XMM0 | ||
1168 | movaps 64(%arg1), \TMP1 | ||
1169 | AESENC \TMP1, \XMM0 | ||
1170 | movaps 80(%arg1), \TMP1 | ||
1171 | AESENC \TMP1, \XMM0 | ||
1172 | movaps 96(%arg1), \TMP1 | ||
1173 | AESENC \TMP1, \XMM0 | ||
1174 | movaps 112(%arg1), \TMP1 | ||
1175 | AESENC \TMP1, \XMM0 | ||
1176 | movaps 128(%arg1), \TMP1 | ||
1177 | AESENC \TMP1, \XMM0 | ||
1178 | movaps 144(%arg1), \TMP1 | ||
1179 | AESENC \TMP1, \XMM0 | ||
1180 | movaps 160(%arg1), \TMP1 | ||
1181 | AESENCLAST \TMP1, \XMM0 | ||
1182 | .endm | ||
1183 | 1177 | ||
1178 | .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 | ||
1184 | 1179 | ||
1180 | pxor (%arg1), \XMM0 | ||
1181 | mov keysize,%eax | ||
1182 | shr $2,%eax # 128->4, 192->6, 256->8 | ||
1183 | add $5,%eax # 128->9, 192->11, 256->13 | ||
1184 | lea 16(%arg1), %r10 # get first expanded key address | ||
1185 | |||
1186 | _esb_loop_\@: | ||
1187 | MOVADQ (%r10),\TMP1 | ||
1188 | AESENC \TMP1,\XMM0 | ||
1189 | add $16,%r10 | ||
1190 | sub $1,%eax | ||
1191 | jnz _esb_loop_\@ | ||
1192 | |||
1193 | MOVADQ (%r10),\TMP1 | ||
1194 | AESENCLAST \TMP1,\XMM0 | ||
1195 | .endm | ||
1185 | /***************************************************************************** | 1196 | /***************************************************************************** |
1186 | * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | 1197 | * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. |
1187 | * u8 *out, // Plaintext output. Encrypt in-place is allowed. | 1198 | * u8 *out, // Plaintext output. Encrypt in-place is allowed. |