aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm
diff options
context:
space:
mode:
author Ard Biesheuvel <ard.biesheuvel@linaro.org> 2019-09-03 12:43:25 -0400
committer Herbert Xu <herbert@gondor.apana.org.au> 2019-09-09 03:35:28 -0400
commit 1dede02bdd64116b98bdcbc2e2be20e62afd43f5 (patch)
tree cc63d7afa9983e3ff90075e56c7630984498b257 /arch/arm
parent 46a22776bc97aa5ab9d5f9dc4829859219b86365 (diff)
crypto: arm/aes-ce - switch to 4x interleave
When the ARM AES instruction based crypto driver was introduced, there were no known implementations that could benefit from a 4-way interleave, and so a 3-way interleave was used instead. Since we have sufficient space in the SIMD register file, let's switch to a 4-way interleave to align with the 64-bit driver, and to ensure that we can reach optimum performance when running under emulation on high end 64-bit cores.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm')
-rw-r--r-- arch/arm/crypto/aes-ce-core.S 263
1 file changed, 144 insertions, 119 deletions
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S
index 1e0d45183590..a3ca4ac2d7bb 100644
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -44,46 +44,56 @@
44 veor q0, q0, \key3 44 veor q0, q0, \key3
45 .endm 45 .endm
46 46
47 .macro enc_dround_3x, key1, key2 47 .macro enc_dround_4x, key1, key2
48 enc_round q0, \key1 48 enc_round q0, \key1
49 enc_round q1, \key1 49 enc_round q1, \key1
50 enc_round q2, \key1 50 enc_round q2, \key1
51 enc_round q3, \key1
51 enc_round q0, \key2 52 enc_round q0, \key2
52 enc_round q1, \key2 53 enc_round q1, \key2
53 enc_round q2, \key2 54 enc_round q2, \key2
55 enc_round q3, \key2
54 .endm 56 .endm
55 57
56 .macro dec_dround_3x, key1, key2 58 .macro dec_dround_4x, key1, key2
57 dec_round q0, \key1 59 dec_round q0, \key1
58 dec_round q1, \key1 60 dec_round q1, \key1
59 dec_round q2, \key1 61 dec_round q2, \key1
62 dec_round q3, \key1
60 dec_round q0, \key2 63 dec_round q0, \key2
61 dec_round q1, \key2 64 dec_round q1, \key2
62 dec_round q2, \key2 65 dec_round q2, \key2
66 dec_round q3, \key2
63 .endm 67 .endm
64 68
65 .macro enc_fround_3x, key1, key2, key3 69 .macro enc_fround_4x, key1, key2, key3
66 enc_round q0, \key1 70 enc_round q0, \key1
67 enc_round q1, \key1 71 enc_round q1, \key1
68 enc_round q2, \key1 72 enc_round q2, \key1
73 enc_round q3, \key1
69 aese.8 q0, \key2 74 aese.8 q0, \key2
70 aese.8 q1, \key2 75 aese.8 q1, \key2
71 aese.8 q2, \key2 76 aese.8 q2, \key2
77 aese.8 q3, \key2
72 veor q0, q0, \key3 78 veor q0, q0, \key3
73 veor q1, q1, \key3 79 veor q1, q1, \key3
74 veor q2, q2, \key3 80 veor q2, q2, \key3
81 veor q3, q3, \key3
75 .endm 82 .endm
76 83
77 .macro dec_fround_3x, key1, key2, key3 84 .macro dec_fround_4x, key1, key2, key3
78 dec_round q0, \key1 85 dec_round q0, \key1
79 dec_round q1, \key1 86 dec_round q1, \key1
80 dec_round q2, \key1 87 dec_round q2, \key1
88 dec_round q3, \key1
81 aesd.8 q0, \key2 89 aesd.8 q0, \key2
82 aesd.8 q1, \key2 90 aesd.8 q1, \key2
83 aesd.8 q2, \key2 91 aesd.8 q2, \key2
92 aesd.8 q3, \key2
84 veor q0, q0, \key3 93 veor q0, q0, \key3
85 veor q1, q1, \key3 94 veor q1, q1, \key3
86 veor q2, q2, \key3 95 veor q2, q2, \key3
96 veor q3, q3, \key3
87 .endm 97 .endm
88 98
89 .macro do_block, dround, fround 99 .macro do_block, dround, fround
@@ -114,8 +124,9 @@
114 * transforms. These should preserve all registers except q0 - q2 and ip 124 * transforms. These should preserve all registers except q0 - q2 and ip
115 * Arguments: 125 * Arguments:
116 * q0 : first in/output block 126 * q0 : first in/output block
117 * q1 : second in/output block (_3x version only) 127 * q1 : second in/output block (_4x version only)
118 * q2 : third in/output block (_3x version only) 128 * q2 : third in/output block (_4x version only)
129 * q3 : fourth in/output block (_4x version only)
119 * q8 : first round key 130 * q8 : first round key
120 * q9 : secound round key 131 * q9 : secound round key
121 * q14 : final round key 132 * q14 : final round key
@@ -136,16 +147,16 @@ aes_decrypt:
136ENDPROC(aes_decrypt) 147ENDPROC(aes_decrypt)
137 148
138 .align 6 149 .align 6
139aes_encrypt_3x: 150aes_encrypt_4x:
140 add ip, r2, #32 @ 3rd round key 151 add ip, r2, #32 @ 3rd round key
141 do_block enc_dround_3x, enc_fround_3x 152 do_block enc_dround_4x, enc_fround_4x
142ENDPROC(aes_encrypt_3x) 153ENDPROC(aes_encrypt_4x)
143 154
144 .align 6 155 .align 6
145aes_decrypt_3x: 156aes_decrypt_4x:
146 add ip, r2, #32 @ 3rd round key 157 add ip, r2, #32 @ 3rd round key
147 do_block dec_dround_3x, dec_fround_3x 158 do_block dec_dround_4x, dec_fround_4x
148ENDPROC(aes_decrypt_3x) 159ENDPROC(aes_decrypt_4x)
149 160
150 .macro prepare_key, rk, rounds 161 .macro prepare_key, rk, rounds
151 add ip, \rk, \rounds, lsl #4 162 add ip, \rk, \rounds, lsl #4
@@ -163,17 +174,17 @@ ENTRY(ce_aes_ecb_encrypt)
163 push {r4, lr} 174 push {r4, lr}
164 ldr r4, [sp, #8] 175 ldr r4, [sp, #8]
165 prepare_key r2, r3 176 prepare_key r2, r3
166.Lecbencloop3x: 177.Lecbencloop4x:
167 subs r4, r4, #3 178 subs r4, r4, #4
168 bmi .Lecbenc1x 179 bmi .Lecbenc1x
169 vld1.8 {q0-q1}, [r1]! 180 vld1.8 {q0-q1}, [r1]!
170 vld1.8 {q2}, [r1]! 181 vld1.8 {q2-q3}, [r1]!
171 bl aes_encrypt_3x 182 bl aes_encrypt_4x
172 vst1.8 {q0-q1}, [r0]! 183 vst1.8 {q0-q1}, [r0]!
173 vst1.8 {q2}, [r0]! 184 vst1.8 {q2-q3}, [r0]!
174 b .Lecbencloop3x 185 b .Lecbencloop4x
175.Lecbenc1x: 186.Lecbenc1x:
176 adds r4, r4, #3 187 adds r4, r4, #4
177 beq .Lecbencout 188 beq .Lecbencout
178.Lecbencloop: 189.Lecbencloop:
179 vld1.8 {q0}, [r1]! 190 vld1.8 {q0}, [r1]!
@@ -189,17 +200,17 @@ ENTRY(ce_aes_ecb_decrypt)
189 push {r4, lr} 200 push {r4, lr}
190 ldr r4, [sp, #8] 201 ldr r4, [sp, #8]
191 prepare_key r2, r3 202 prepare_key r2, r3
192.Lecbdecloop3x: 203.Lecbdecloop4x:
193 subs r4, r4, #3 204 subs r4, r4, #4
194 bmi .Lecbdec1x 205 bmi .Lecbdec1x
195 vld1.8 {q0-q1}, [r1]! 206 vld1.8 {q0-q1}, [r1]!
196 vld1.8 {q2}, [r1]! 207 vld1.8 {q2-q3}, [r1]!
197 bl aes_decrypt_3x 208 bl aes_decrypt_4x
198 vst1.8 {q0-q1}, [r0]! 209 vst1.8 {q0-q1}, [r0]!
199 vst1.8 {q2}, [r0]! 210 vst1.8 {q2-q3}, [r0]!
200 b .Lecbdecloop3x 211 b .Lecbdecloop4x
201.Lecbdec1x: 212.Lecbdec1x:
202 adds r4, r4, #3 213 adds r4, r4, #4
203 beq .Lecbdecout 214 beq .Lecbdecout
204.Lecbdecloop: 215.Lecbdecloop:
205 vld1.8 {q0}, [r1]! 216 vld1.8 {q0}, [r1]!
@@ -236,38 +247,40 @@ ENDPROC(ce_aes_cbc_encrypt)
236ENTRY(ce_aes_cbc_decrypt) 247ENTRY(ce_aes_cbc_decrypt)
237 push {r4-r6, lr} 248 push {r4-r6, lr}
238 ldrd r4, r5, [sp, #16] 249 ldrd r4, r5, [sp, #16]
239 vld1.8 {q6}, [r5] @ keep iv in q6 250 vld1.8 {q15}, [r5] @ keep iv in q15
240 prepare_key r2, r3 251 prepare_key r2, r3
241.Lcbcdecloop3x: 252.Lcbcdecloop4x:
242 subs r4, r4, #3 253 subs r4, r4, #4
243 bmi .Lcbcdec1x 254 bmi .Lcbcdec1x
244 vld1.8 {q0-q1}, [r1]! 255 vld1.8 {q0-q1}, [r1]!
245 vld1.8 {q2}, [r1]! 256 vld1.8 {q2-q3}, [r1]!
246 vmov q3, q0 257 vmov q4, q0
247 vmov q4, q1 258 vmov q5, q1
248 vmov q5, q2 259 vmov q6, q2
249 bl aes_decrypt_3x 260 vmov q7, q3
250 veor q0, q0, q6 261 bl aes_decrypt_4x
251 veor q1, q1, q3 262 veor q0, q0, q15
252 veor q2, q2, q4 263 veor q1, q1, q4
253 vmov q6, q5 264 veor q2, q2, q5
265 veor q3, q3, q6
266 vmov q15, q7
254 vst1.8 {q0-q1}, [r0]! 267 vst1.8 {q0-q1}, [r0]!
255 vst1.8 {q2}, [r0]! 268 vst1.8 {q2-q3}, [r0]!
256 b .Lcbcdecloop3x 269 b .Lcbcdecloop4x
257.Lcbcdec1x: 270.Lcbcdec1x:
258 adds r4, r4, #3 271 adds r4, r4, #4
259 beq .Lcbcdecout 272 beq .Lcbcdecout
260 vmov q15, q14 @ preserve last round key 273 vmov q6, q14 @ preserve last round key
261.Lcbcdecloop: 274.Lcbcdecloop:
262 vld1.8 {q0}, [r1]! @ get next ct block 275 vld1.8 {q0}, [r1]! @ get next ct block
263 veor q14, q15, q6 @ combine prev ct with last key 276 veor q14, q15, q6 @ combine prev ct with last key
264 vmov q6, q0 277 vmov q15, q0
265 bl aes_decrypt 278 bl aes_decrypt
266 vst1.8 {q0}, [r0]! 279 vst1.8 {q0}, [r0]!
267 subs r4, r4, #1 280 subs r4, r4, #1
268 bne .Lcbcdecloop 281 bne .Lcbcdecloop
269.Lcbcdecout: 282.Lcbcdecout:
270 vst1.8 {q6}, [r5] @ keep iv in q6 283 vst1.8 {q15}, [r5] @ keep iv in q15
271 pop {r4-r6, pc} 284 pop {r4-r6, pc}
272ENDPROC(ce_aes_cbc_decrypt) 285ENDPROC(ce_aes_cbc_decrypt)
273 286
@@ -278,46 +291,52 @@ ENDPROC(ce_aes_cbc_decrypt)
278ENTRY(ce_aes_ctr_encrypt) 291ENTRY(ce_aes_ctr_encrypt)
279 push {r4-r6, lr} 292 push {r4-r6, lr}
280 ldrd r4, r5, [sp, #16] 293 ldrd r4, r5, [sp, #16]
281 vld1.8 {q6}, [r5] @ load ctr 294 vld1.8 {q7}, [r5] @ load ctr
282 prepare_key r2, r3 295 prepare_key r2, r3
283 vmov r6, s27 @ keep swabbed ctr in r6 296 vmov r6, s31 @ keep swabbed ctr in r6
284 rev r6, r6 297 rev r6, r6
285 cmn r6, r4 @ 32 bit overflow? 298 cmn r6, r4 @ 32 bit overflow?
286 bcs .Lctrloop 299 bcs .Lctrloop
287.Lctrloop3x: 300.Lctrloop4x:
288 subs r4, r4, #3 301 subs r4, r4, #4
289 bmi .Lctr1x 302 bmi .Lctr1x
290 add r6, r6, #1 303 add r6, r6, #1
291 vmov q0, q6 304 vmov q0, q7
292 vmov q1, q6 305 vmov q1, q7
293 rev ip, r6 306 rev ip, r6
294 add r6, r6, #1 307 add r6, r6, #1
295 vmov q2, q6 308 vmov q2, q7
296 vmov s7, ip 309 vmov s7, ip
297 rev ip, r6 310 rev ip, r6
298 add r6, r6, #1 311 add r6, r6, #1
312 vmov q3, q7
299 vmov s11, ip 313 vmov s11, ip
300 vld1.8 {q3-q4}, [r1]! 314 rev ip, r6
301 vld1.8 {q5}, [r1]! 315 add r6, r6, #1
302 bl aes_encrypt_3x 316 vmov s15, ip
303 veor q0, q0, q3 317 vld1.8 {q4-q5}, [r1]!
304 veor q1, q1, q4 318 vld1.8 {q6}, [r1]!
305 veor q2, q2, q5 319 vld1.8 {q15}, [r1]!
320 bl aes_encrypt_4x
321 veor q0, q0, q4
322 veor q1, q1, q5
323 veor q2, q2, q6
324 veor q3, q3, q15
306 rev ip, r6 325 rev ip, r6
307 vst1.8 {q0-q1}, [r0]! 326 vst1.8 {q0-q1}, [r0]!
308 vst1.8 {q2}, [r0]! 327 vst1.8 {q2-q3}, [r0]!
309 vmov s27, ip 328 vmov s31, ip
310 b .Lctrloop3x 329 b .Lctrloop4x
311.Lctr1x: 330.Lctr1x:
312 adds r4, r4, #3 331 adds r4, r4, #4
313 beq .Lctrout 332 beq .Lctrout
314.Lctrloop: 333.Lctrloop:
315 vmov q0, q6 334 vmov q0, q7
316 bl aes_encrypt 335 bl aes_encrypt
317 336
318 adds r6, r6, #1 @ increment BE ctr 337 adds r6, r6, #1 @ increment BE ctr
319 rev ip, r6 338 rev ip, r6
320 vmov s27, ip 339 vmov s31, ip
321 bcs .Lctrcarry 340 bcs .Lctrcarry
322 341
323.Lctrcarrydone: 342.Lctrcarrydone:
@@ -329,7 +348,7 @@ ENTRY(ce_aes_ctr_encrypt)
329 bne .Lctrloop 348 bne .Lctrloop
330 349
331.Lctrout: 350.Lctrout:
332 vst1.8 {q6}, [r5] @ return next CTR value 351 vst1.8 {q7}, [r5] @ return next CTR value
333 pop {r4-r6, pc} 352 pop {r4-r6, pc}
334 353
335.Lctrtailblock: 354.Lctrtailblock:
@@ -337,7 +356,7 @@ ENTRY(ce_aes_ctr_encrypt)
337 b .Lctrout 356 b .Lctrout
338 357
339.Lctrcarry: 358.Lctrcarry:
340 .irp sreg, s26, s25, s24 359 .irp sreg, s30, s29, s28
341 vmov ip, \sreg @ load next word of ctr 360 vmov ip, \sreg @ load next word of ctr
342 rev ip, ip @ ... to handle the carry 361 rev ip, ip @ ... to handle the carry
343 adds ip, ip, #1 362 adds ip, ip, #1
@@ -368,8 +387,8 @@ ENDPROC(ce_aes_ctr_encrypt)
368 .quad 1, 0x87 387 .quad 1, 0x87
369 388
370ce_aes_xts_init: 389ce_aes_xts_init:
371 vldr d14, .Lxts_mul_x 390 vldr d30, .Lxts_mul_x
372 vldr d15, .Lxts_mul_x + 8 391 vldr d31, .Lxts_mul_x + 8
373 392
374 ldrd r4, r5, [sp, #16] @ load args 393 ldrd r4, r5, [sp, #16] @ load args
375 ldr r6, [sp, #28] 394 ldr r6, [sp, #28]
@@ -390,48 +409,51 @@ ENTRY(ce_aes_xts_encrypt)
390 409
391 bl ce_aes_xts_init @ run shared prologue 410 bl ce_aes_xts_init @ run shared prologue
392 prepare_key r2, r3 411 prepare_key r2, r3
393 vmov q3, q0 412 vmov q4, q0
394 413
395 teq r6, #0 @ start of a block? 414 teq r6, #0 @ start of a block?
396 bne .Lxtsenc3x 415 bne .Lxtsenc4x
397 416
398.Lxtsencloop3x: 417.Lxtsencloop4x:
399 next_tweak q3, q3, q7, q6 418 next_tweak q4, q4, q15, q10
400.Lxtsenc3x: 419.Lxtsenc4x:
401 subs r4, r4, #3 420 subs r4, r4, #4
402 bmi .Lxtsenc1x 421 bmi .Lxtsenc1x
403 vld1.8 {q0-q1}, [r1]! @ get 3 pt blocks 422 vld1.8 {q0-q1}, [r1]! @ get 4 pt blocks
404 vld1.8 {q2}, [r1]! 423 vld1.8 {q2-q3}, [r1]!
405 next_tweak q4, q3, q7, q6 424 next_tweak q5, q4, q15, q10
406 veor q0, q0, q3 425 veor q0, q0, q4
407 next_tweak q5, q4, q7, q6 426 next_tweak q6, q5, q15, q10
408 veor q1, q1, q4 427 veor q1, q1, q5
409 veor q2, q2, q5 428 next_tweak q7, q6, q15, q10
410 bl aes_encrypt_3x 429 veor q2, q2, q6
411 veor q0, q0, q3 430 veor q3, q3, q7
412 veor q1, q1, q4 431 bl aes_encrypt_4x
413 veor q2, q2, q5 432 veor q0, q0, q4
414 vst1.8 {q0-q1}, [r0]! @ write 3 ct blocks 433 veor q1, q1, q5
415 vst1.8 {q2}, [r0]! 434 veor q2, q2, q6
416 vmov q3, q5 435 veor q3, q3, q7
436 vst1.8 {q0-q1}, [r0]! @ write 4 ct blocks
437 vst1.8 {q2-q3}, [r0]!
438 vmov q4, q7
417 teq r4, #0 439 teq r4, #0
418 beq .Lxtsencout 440 beq .Lxtsencout
419 b .Lxtsencloop3x 441 b .Lxtsencloop4x
420.Lxtsenc1x: 442.Lxtsenc1x:
421 adds r4, r4, #3 443 adds r4, r4, #4
422 beq .Lxtsencout 444 beq .Lxtsencout
423.Lxtsencloop: 445.Lxtsencloop:
424 vld1.8 {q0}, [r1]! 446 vld1.8 {q0}, [r1]!
425 veor q0, q0, q3 447 veor q0, q0, q4
426 bl aes_encrypt 448 bl aes_encrypt
427 veor q0, q0, q3 449 veor q0, q0, q4
428 vst1.8 {q0}, [r0]! 450 vst1.8 {q0}, [r0]!
429 subs r4, r4, #1 451 subs r4, r4, #1
430 beq .Lxtsencout 452 beq .Lxtsencout
431 next_tweak q3, q3, q7, q6 453 next_tweak q4, q4, q15, q6
432 b .Lxtsencloop 454 b .Lxtsencloop
433.Lxtsencout: 455.Lxtsencout:
434 vst1.8 {q3}, [r5] 456 vst1.8 {q4}, [r5]
435 pop {r4-r6, pc} 457 pop {r4-r6, pc}
436ENDPROC(ce_aes_xts_encrypt) 458ENDPROC(ce_aes_xts_encrypt)
437 459
@@ -441,49 +463,52 @@ ENTRY(ce_aes_xts_decrypt)
441 463
442 bl ce_aes_xts_init @ run shared prologue 464 bl ce_aes_xts_init @ run shared prologue
443 prepare_key r2, r3 465 prepare_key r2, r3
444 vmov q3, q0 466 vmov q4, q0
445 467
446 teq r6, #0 @ start of a block? 468 teq r6, #0 @ start of a block?
447 bne .Lxtsdec3x 469 bne .Lxtsdec4x
448 470
449.Lxtsdecloop3x: 471.Lxtsdecloop4x:
450 next_tweak q3, q3, q7, q6 472 next_tweak q4, q4, q15, q10
451.Lxtsdec3x: 473.Lxtsdec4x:
452 subs r4, r4, #3 474 subs r4, r4, #4
453 bmi .Lxtsdec1x 475 bmi .Lxtsdec1x
454 vld1.8 {q0-q1}, [r1]! @ get 3 ct blocks 476 vld1.8 {q0-q1}, [r1]! @ get 4 ct blocks
455 vld1.8 {q2}, [r1]! 477 vld1.8 {q2-q3}, [r1]!
456 next_tweak q4, q3, q7, q6 478 next_tweak q5, q4, q15, q10
457 veor q0, q0, q3 479 veor q0, q0, q4
458 next_tweak q5, q4, q7, q6 480 next_tweak q6, q5, q15, q10
459 veor q1, q1, q4 481 veor q1, q1, q5
460 veor q2, q2, q5 482 next_tweak q7, q6, q15, q10
461 bl aes_decrypt_3x 483 veor q2, q2, q6
462 veor q0, q0, q3 484 veor q3, q3, q7
463 veor q1, q1, q4 485 bl aes_decrypt_4x
464 veor q2, q2, q5 486 veor q0, q0, q4
465 vst1.8 {q0-q1}, [r0]! @ write 3 pt blocks 487 veor q1, q1, q5
466 vst1.8 {q2}, [r0]! 488 veor q2, q2, q6
467 vmov q3, q5 489 veor q3, q3, q7
490 vst1.8 {q0-q1}, [r0]! @ write 4 pt blocks
491 vst1.8 {q2-q3}, [r0]!
492 vmov q4, q7
468 teq r4, #0 493 teq r4, #0
469 beq .Lxtsdecout 494 beq .Lxtsdecout
470 b .Lxtsdecloop3x 495 b .Lxtsdecloop4x
471.Lxtsdec1x: 496.Lxtsdec1x:
472 adds r4, r4, #3 497 adds r4, r4, #4
473 beq .Lxtsdecout 498 beq .Lxtsdecout
474.Lxtsdecloop: 499.Lxtsdecloop:
475 vld1.8 {q0}, [r1]! 500 vld1.8 {q0}, [r1]!
476 veor q0, q0, q3 501 veor q0, q0, q4
477 add ip, r2, #32 @ 3rd round key 502 add ip, r2, #32 @ 3rd round key
478 bl aes_decrypt 503 bl aes_decrypt
479 veor q0, q0, q3 504 veor q0, q0, q4
480 vst1.8 {q0}, [r0]! 505 vst1.8 {q0}, [r0]!
481 subs r4, r4, #1 506 subs r4, r4, #1
482 beq .Lxtsdecout 507 beq .Lxtsdecout
483 next_tweak q3, q3, q7, q6 508 next_tweak q4, q4, q15, q6
484 b .Lxtsdecloop 509 b .Lxtsdecloop
485.Lxtsdecout: 510.Lxtsdecout:
486 vst1.8 {q3}, [r5] 511 vst1.8 {q4}, [r5]
487 pop {r4-r6, pc} 512 pop {r4-r6, pc}
488ENDPROC(ce_aes_xts_decrypt) 513ENDPROC(ce_aes_xts_decrypt)
489 514