author     Timothy McCaffrey <timothy.mccaffrey@unisys.com>   2015-01-13 13:16:43 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>           2015-01-14 05:56:51 -0500
commit     e31ac32d3bc27c33f002e0c9ffd6ae08b65474e6 (patch)
tree       e2f11e810e52f8aa8a1b5e813a64ef56f1c1a6a6 /arch/x86/crypto/aesni-intel_asm.S
parent     d8219f52a72033f84c15cde73294d46578fb2d68 (diff)
crypto: aesni - Add support for 192 & 256 bit keys to AESNI RFC4106
These patches fix the RFC4106 implementation in the aesni-intel module so
that it supports 192- and 256-bit keys. Since the AVX support that was added
to this module also only supports 128-bit keys, and this patch only affects
the SSE implementation, changes were also made to use the SSE version if key
sizes other than 128 bits are specified. RFC4106 specifies that 192- and
256-bit keys must be supported (section 8.4).

Also, this should fix Strongswan issue 341, where the aesni module needs to
be unloaded if 256-bit keys are used: http://wiki.strongswan.org/issues/341

This patch has been tested with Sandy Bridge and Haswell processors. With
128-bit keys and input buffers > 512 bytes a slight performance degradation
was noticed (~1%). For input buffers of less than 512 bytes there was no
performance impact. Compared to 128-bit keys, 256-bit key performance is
approx. 0.5 cycles per byte slower on Sandy Bridge, and 0.37 cycles per byte
slower on Haswell (vs. the SSE code).

This patch has also been tested with StrongSwan IPsec connections, where it
worked correctly.

I created this diff from a git clone of crypto-2.6.git. Any questions, please
feel free to contact me.

Signed-off-by: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
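The recurring piece of the patch is the round-count computation: the new
"keysize" define reads the key length in bytes from offset 2*15*16 = 480 into
the AES context (which appears to line up with the key_length field of struct
crypto_aes_ctx), and each loop turns it into the number of AESENC rounds with
a "shr $2" followed by "add $5", i.e. 9, 11 or 13 for AES-128/192/256, with
the final round handled separately by AESENCLAST. A minimal C sketch of that
arithmetic, for illustration only (the helper name below is hypothetical and
not part of the patch):

#include <stdio.h>

/*
 * Illustrative only: mirrors the "shr $2, %eax; add $5, %eax" sequence the
 * patch uses to turn the key length in bytes into a count of full AESENC
 * rounds (the last round is done separately with AESENCLAST).
 */
static int aesni_full_rounds(int key_length_bytes)
{
        int quarters = key_length_bytes >> 2;   /* 16/24/32 -> 4/6/8 */
        return quarters + 5;                    /* -> 9/11/13 rounds */
}

int main(void)
{
        static const int key_bytes[] = { 16, 24, 32 };  /* AES-128/192/256 */

        for (unsigned int i = 0; i < sizeof(key_bytes) / sizeof(key_bytes[0]); i++)
                printf("AES-%d: %d AESENC rounds + 1 AESENCLAST\n",
                       key_bytes[i] * 8, aesni_full_rounds(key_bytes[i]));
        return 0;
}

Walking the expanded key 16 bytes at a time with this count is what lets the
same SSE code path serve all three schedule lengths instead of hard-coding
the ten rounds of AES-128.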
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 343
1 file changed, 177 insertions(+), 166 deletions(-)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 477e9d75149b..6bd2c6c95373 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,12 +32,23 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register. This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned). It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released. However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ movaps
+#define MOVUDQ movups
+
 #ifdef __x86_64__
+
 .data
 .align 16
 .Lgf128mul_x_ble_mask:
         .octa 0x00000000000000010000000000000087
-
 POLY:   .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -89,6 +100,7 @@ enc: .octa 0x2
 #define arg8 STACK_OFFSET+16(%r14)
 #define arg9 STACK_OFFSET+24(%r14)
 #define arg10 STACK_OFFSET+32(%r14)
+#define keysize 2*15*16(%arg1)
 #endif
 
 
@@ -213,10 +225,12 @@ enc: .octa 0x2
 
 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+        MOVADQ SHUF_MASK(%rip), %xmm14
         mov arg7, %r10 # %r10 = AAD
         mov arg8, %r12 # %r12 = aadLen
         mov %r12, %r11
         pxor %xmm\i, %xmm\i
+
 _get_AAD_loop\num_initial_blocks\operation:
         movd (%r10), \TMP1
         pslldq $12, \TMP1
@@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation:
         add $4, %r10
         sub $4, %r12
         jne _get_AAD_loop\num_initial_blocks\operation
+
         cmp $16, %r11
         je _get_AAD_loop2_done\num_initial_blocks\operation
+
         mov $16, %r12
 _get_AAD_loop2\num_initial_blocks\operation:
         psrldq $4, %xmm\i
         sub $4, %r12
         cmp %r11, %r12
         jne _get_AAD_loop2\num_initial_blocks\operation
+
 _get_AAD_loop2_done\num_initial_blocks\operation:
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
 
         xor %r11, %r11 # initialise the data pointer offset as zero
@@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 
         mov %arg5, %rax # %rax = *Y0
         movdqu (%rax), \XMM0 # XMM0 = Y0
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM0
 
 .if (\i == 5) || (\i == 6) || (\i == 7)
+        MOVADQ ONE(%RIP),\TMP1
+        MOVADQ (%arg1),\TMP2
 .irpc index, \i_seq
-        paddd ONE(%rip), \XMM0 # INCR Y0
+        paddd \TMP1, \XMM0 # INCR Y0
         movdqa \XMM0, %xmm\index
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
-
-.endr
-.irpc index, \i_seq
-        pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
-        movaps 0x10(%rdi), \TMP1
-        AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
-        movaps 0x20(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x30(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x40(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x50(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x60(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
+        pxor \TMP2, %xmm\index
 .endr
-.irpc index, \i_seq
-        movaps 0x70(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x80(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x90(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
+        lea 0x10(%arg1),%r10
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_dec\num_initial_blocks:
+        MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+        AESENC \TMP1, %xmm\index
 .endr
+        add $16,%r10
+        sub $1,%eax
+        jnz aes_loop_initial_dec\num_initial_blocks
+
+        MOVADQ (%r10), \TMP1
 .irpc index, \i_seq
-        movaps 0xa0(%arg1), \TMP1
-        AESENCLAST \TMP1, %xmm\index # Round 10
+        AESENCLAST \TMP1, %xmm\index # Last Round
 .endr
 .irpc index, \i_seq
         movdqu (%arg3 , %r11, 1), \TMP1
@@ -305,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
         add $16, %r11
 
         movdqa \TMP1, %xmm\index
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, %xmm\index
-
-                # prepare plaintext/ciphertext for GHASH computation
+        # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
         GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 */
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM1
-        movdqa SHUF_MASK(%rip), %xmm14
+        MOVADQ ONE(%rip), \TMP1
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM1
         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM2
-        movdqa SHUF_MASK(%rip), %xmm14
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM2
         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM3
-        movdqa SHUF_MASK(%rip), %xmm14
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM3
         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM4
-        movdqa SHUF_MASK(%rip), %xmm14
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM4
         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
-        pxor 16*0(%arg1), \XMM1
-        pxor 16*0(%arg1), \XMM2
-        pxor 16*0(%arg1), \XMM3
-        pxor 16*0(%arg1), \XMM4
+        MOVADQ 0(%arg1),\TMP1
+        pxor \TMP1, \XMM1
+        pxor \TMP1, \XMM2
+        pxor \TMP1, \XMM3
+        pxor \TMP1, \XMM4
         movdqa \TMP3, \TMP5
         pshufd $78, \TMP3, \TMP1
         pxor \TMP3, \TMP1
@@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
         pshufd $78, \TMP5, \TMP1
         pxor \TMP5, \TMP1
         movdqa \TMP1, HashKey_4_k(%rsp)
-        movaps 0xa0(%arg1), \TMP2
+        lea 0xa0(%arg1),%r10
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        sub $4,%eax # 128->0, 192->2, 256->4
+        jz aes_loop_pre_dec_done\num_initial_blocks
+
+aes_loop_pre_dec\num_initial_blocks:
+        MOVADQ (%r10),\TMP2
+.irpc index, 1234
+        AESENC \TMP2, %xmm\index
+.endr
+        add $16,%r10
+        sub $1,%eax
+        jnz aes_loop_pre_dec\num_initial_blocks
+
+aes_loop_pre_dec_done\num_initial_blocks:
+        MOVADQ (%r10), \TMP2
         AESENCLAST \TMP2, \XMM1
         AESENCLAST \TMP2, \XMM2
         AESENCLAST \TMP2, \XMM3
@@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
         movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
         movdqa \TMP1, \XMM4
         add $64, %r11
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
         pxor \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
 _initial_blocks_done\num_initial_blocks\operation:
@@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:
 
 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+        MOVADQ SHUF_MASK(%rip), %xmm14
         mov arg7, %r10 # %r10 = AAD
         mov arg8, %r12 # %r12 = aadLen
         mov %r12, %r11
@@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
         cmp %r11, %r12
         jne _get_AAD_loop2\num_initial_blocks\operation
 _get_AAD_loop2_done\num_initial_blocks\operation:
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
 
         xor %r11, %r11 # initialise the data pointer offset as zero
@@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 
         mov %arg5, %rax # %rax = *Y0
         movdqu (%rax), \XMM0 # XMM0 = Y0
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM0
 
 .if (\i == 5) || (\i == 6) || (\i == 7)
-.irpc index, \i_seq
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, %xmm\index
-        movdqa SHUF_MASK(%rip), %xmm14
-        PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
 
-.endr
-.irpc index, \i_seq
-        pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
-        movaps 0x10(%rdi), \TMP1
-        AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
-        movaps 0x20(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
+        MOVADQ ONE(%RIP),\TMP1
+        MOVADQ 0(%arg1),\TMP2
 .irpc index, \i_seq
-        movaps 0x30(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, %xmm\index
+        PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+        pxor \TMP2, %xmm\index
 .endr
-.irpc index, \i_seq
-        movaps 0x40(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x50(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x60(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x70(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x80(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
-        movaps 0x90(%arg1), \TMP1
-        AESENC \TMP1, %xmm\index # Round 2
+        lea 0x10(%arg1),%r10
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_enc\num_initial_blocks:
+        MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+        AESENC \TMP1, %xmm\index
 .endr
+        add $16,%r10
+        sub $1,%eax
+        jnz aes_loop_initial_enc\num_initial_blocks
+
+        MOVADQ (%r10), \TMP1
 .irpc index, \i_seq
-        movaps 0xa0(%arg1), \TMP1
-        AESENCLAST \TMP1, %xmm\index # Round 10
+        AESENCLAST \TMP1, %xmm\index # Last Round
 .endr
 .irpc index, \i_seq
         movdqu (%arg3 , %r11, 1), \TMP1
@@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
         movdqu %xmm\index, (%arg2 , %r11, 1)
         # write back plaintext/ciphertext for num_initial_blocks
         add $16, %r11
-
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, %xmm\index
 
         # prepare plaintext/ciphertext for GHASH computation
@@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 */
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM1
-        movdqa SHUF_MASK(%rip), %xmm14
+        MOVADQ ONE(%RIP),\TMP1
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM1
         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM2
-        movdqa SHUF_MASK(%rip), %xmm14
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM2
         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM3
-        movdqa SHUF_MASK(%rip), %xmm14
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM3
         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 
-        paddd ONE(%rip), \XMM0 # INCR Y0
-        movdqa \XMM0, \XMM4
-        movdqa SHUF_MASK(%rip), %xmm14
+        paddd \TMP1, \XMM0 # INCR Y0
+        MOVADQ \XMM0, \XMM4
         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
-        pxor 16*0(%arg1), \XMM1
-        pxor 16*0(%arg1), \XMM2
-        pxor 16*0(%arg1), \XMM3
-        pxor 16*0(%arg1), \XMM4
+        MOVADQ 0(%arg1),\TMP1
+        pxor \TMP1, \XMM1
+        pxor \TMP1, \XMM2
+        pxor \TMP1, \XMM3
+        pxor \TMP1, \XMM4
         movdqa \TMP3, \TMP5
         pshufd $78, \TMP3, \TMP1
         pxor \TMP3, \TMP1
@@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
         pshufd $78, \TMP5, \TMP1
         pxor \TMP5, \TMP1
         movdqa \TMP1, HashKey_4_k(%rsp)
-        movaps 0xa0(%arg1), \TMP2
+        lea 0xa0(%arg1),%r10
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        sub $4,%eax # 128->0, 192->2, 256->4
+        jz aes_loop_pre_enc_done\num_initial_blocks
+
+aes_loop_pre_enc\num_initial_blocks:
+        MOVADQ (%r10),\TMP2
+.irpc index, 1234
+        AESENC \TMP2, %xmm\index
+.endr
+        add $16,%r10
+        sub $1,%eax
+        jnz aes_loop_pre_enc\num_initial_blocks
+
+aes_loop_pre_enc_done\num_initial_blocks:
+        MOVADQ (%r10), \TMP2
         AESENCLAST \TMP2, \XMM1
         AESENCLAST \TMP2, \XMM2
         AESENCLAST \TMP2, \XMM3
@@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
         movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
 
         add $64, %r11
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
         pxor \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
-        movdqa SHUF_MASK(%rip), %xmm14
         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
 _initial_blocks_done\num_initial_blocks\operation:
@@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
         AESENC \TMP3, \XMM3
         AESENC \TMP3, \XMM4
         PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
-        movaps 0xa0(%arg1), \TMP3
+        lea 0xa0(%arg1),%r10
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        sub $4,%eax # 128->0, 192->2, 256->4
+        jz aes_loop_par_enc_done
+
+aes_loop_par_enc:
+        MOVADQ (%r10),\TMP3
+.irpc index, 1234
+        AESENC \TMP3, %xmm\index
+.endr
+        add $16,%r10
+        sub $1,%eax
+        jnz aes_loop_par_enc
+
+aes_loop_par_enc_done:
+        MOVADQ (%r10), \TMP3
         AESENCLAST \TMP3, \XMM1 # Round 10
         AESENCLAST \TMP3, \XMM2
         AESENCLAST \TMP3, \XMM3
@@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
         AESENC \TMP3, \XMM3
         AESENC \TMP3, \XMM4
         PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
-        movaps 0xa0(%arg1), \TMP3
-        AESENCLAST \TMP3, \XMM1 # Round 10
+        lea 0xa0(%arg1),%r10
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        sub $4,%eax # 128->0, 192->2, 256->4
+        jz aes_loop_par_dec_done
+
+aes_loop_par_dec:
+        MOVADQ (%r10),\TMP3
+.irpc index, 1234
+        AESENC \TMP3, %xmm\index
+.endr
+        add $16,%r10
+        sub $1,%eax
+        jnz aes_loop_par_dec
+
+aes_loop_par_dec_done:
+        MOVADQ (%r10), \TMP3
+        AESENCLAST \TMP3, \XMM1 # last round
         AESENCLAST \TMP3, \XMM2
         AESENCLAST \TMP3, \XMM3
         AESENCLAST \TMP3, \XMM4
@@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
         pxor \TMP6, \XMMDst # reduced result is in XMMDst
 .endm
 
-/* Encryption of a single block done*/
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
 
-        pxor (%arg1), \XMM0
-        movaps 16(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 32(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 48(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 64(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 80(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 96(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 112(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 128(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 144(%arg1), \TMP1
-        AESENC \TMP1, \XMM0
-        movaps 160(%arg1), \TMP1
-        AESENCLAST \TMP1, \XMM0
-.endm
+/* Encryption of a single block
+* uses eax & r10
+*/
 
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
 
+        pxor (%arg1), \XMM0
+        mov keysize,%eax
+        shr $2,%eax # 128->4, 192->6, 256->8
+        add $5,%eax # 128->9, 192->11, 256->13
+        lea 16(%arg1), %r10 # get first expanded key address
+
+_esb_loop_\@:
+        MOVADQ (%r10),\TMP1
+        AESENC \TMP1,\XMM0
+        add $16,%r10
+        sub $1,%eax
+        jnz _esb_loop_\@
+
+        MOVADQ (%r10),\TMP1
+        AESENCLAST \TMP1,\XMM0
+.endm
 /*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
 * u8 *out, // Plaintext output. Encrypt in-place is allowed.