diff options
author | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2019-09-03 12:43:25 -0400 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2019-09-09 03:35:28 -0400 |
commit | 1dede02bdd64116b98bdcbc2e2be20e62afd43f5 (patch) | |
tree | cc63d7afa9983e3ff90075e56c7630984498b257 /arch/arm | |
parent | 46a22776bc97aa5ab9d5f9dc4829859219b86365 (diff) |
crypto: arm/aes-ce - switch to 4x interleave

When the ARM AES instruction based crypto driver was introduced, there
were no known implementations that could benefit from a 4-way interleave,
and so a 3-way interleave was used instead. Since we have sufficient
space in the SIMD register file, let's switch to a 4-way interleave to
align with the 64-bit driver, and to ensure that we can reach optimum
performance when running under emulation on high end 64-bit cores.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm')
-rw-r--r-- | arch/arm/crypto/aes-ce-core.S | 263 |
1 files changed, 144 insertions, 119 deletions
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S index 1e0d45183590..a3ca4ac2d7bb 100644 --- a/arch/arm/crypto/aes-ce-core.S +++ b/arch/arm/crypto/aes-ce-core.S | |||
@@ -44,46 +44,56 @@ | |||
44 | veor q0, q0, \key3 | 44 | veor q0, q0, \key3 |
45 | .endm | 45 | .endm |
46 | 46 | ||
47 | .macro enc_dround_3x, key1, key2 | 47 | .macro enc_dround_4x, key1, key2 |
48 | enc_round q0, \key1 | 48 | enc_round q0, \key1 |
49 | enc_round q1, \key1 | 49 | enc_round q1, \key1 |
50 | enc_round q2, \key1 | 50 | enc_round q2, \key1 |
51 | enc_round q3, \key1 | ||
51 | enc_round q0, \key2 | 52 | enc_round q0, \key2 |
52 | enc_round q1, \key2 | 53 | enc_round q1, \key2 |
53 | enc_round q2, \key2 | 54 | enc_round q2, \key2 |
55 | enc_round q3, \key2 | ||
54 | .endm | 56 | .endm |
55 | 57 | ||
56 | .macro dec_dround_3x, key1, key2 | 58 | .macro dec_dround_4x, key1, key2 |
57 | dec_round q0, \key1 | 59 | dec_round q0, \key1 |
58 | dec_round q1, \key1 | 60 | dec_round q1, \key1 |
59 | dec_round q2, \key1 | 61 | dec_round q2, \key1 |
62 | dec_round q3, \key1 | ||
60 | dec_round q0, \key2 | 63 | dec_round q0, \key2 |
61 | dec_round q1, \key2 | 64 | dec_round q1, \key2 |
62 | dec_round q2, \key2 | 65 | dec_round q2, \key2 |
66 | dec_round q3, \key2 | ||
63 | .endm | 67 | .endm |
64 | 68 | ||
65 | .macro enc_fround_3x, key1, key2, key3 | 69 | .macro enc_fround_4x, key1, key2, key3 |
66 | enc_round q0, \key1 | 70 | enc_round q0, \key1 |
67 | enc_round q1, \key1 | 71 | enc_round q1, \key1 |
68 | enc_round q2, \key1 | 72 | enc_round q2, \key1 |
73 | enc_round q3, \key1 | ||
69 | aese.8 q0, \key2 | 74 | aese.8 q0, \key2 |
70 | aese.8 q1, \key2 | 75 | aese.8 q1, \key2 |
71 | aese.8 q2, \key2 | 76 | aese.8 q2, \key2 |
77 | aese.8 q3, \key2 | ||
72 | veor q0, q0, \key3 | 78 | veor q0, q0, \key3 |
73 | veor q1, q1, \key3 | 79 | veor q1, q1, \key3 |
74 | veor q2, q2, \key3 | 80 | veor q2, q2, \key3 |
81 | veor q3, q3, \key3 | ||
75 | .endm | 82 | .endm |
76 | 83 | ||
77 | .macro dec_fround_3x, key1, key2, key3 | 84 | .macro dec_fround_4x, key1, key2, key3 |
78 | dec_round q0, \key1 | 85 | dec_round q0, \key1 |
79 | dec_round q1, \key1 | 86 | dec_round q1, \key1 |
80 | dec_round q2, \key1 | 87 | dec_round q2, \key1 |
88 | dec_round q3, \key1 | ||
81 | aesd.8 q0, \key2 | 89 | aesd.8 q0, \key2 |
82 | aesd.8 q1, \key2 | 90 | aesd.8 q1, \key2 |
83 | aesd.8 q2, \key2 | 91 | aesd.8 q2, \key2 |
92 | aesd.8 q3, \key2 | ||
84 | veor q0, q0, \key3 | 93 | veor q0, q0, \key3 |
85 | veor q1, q1, \key3 | 94 | veor q1, q1, \key3 |
86 | veor q2, q2, \key3 | 95 | veor q2, q2, \key3 |
96 | veor q3, q3, \key3 | ||
87 | .endm | 97 | .endm |
88 | 98 | ||
89 | .macro do_block, dround, fround | 99 | .macro do_block, dround, fround |
@@ -114,8 +124,9 @@ | |||
114 | * transforms. These should preserve all registers except q0 - q2 and ip | 124 | * transforms. These should preserve all registers except q0 - q2 and ip |
115 | * Arguments: | 125 | * Arguments: |
116 | * q0 : first in/output block | 126 | * q0 : first in/output block |
117 | * q1 : second in/output block (_3x version only) | 127 | * q1 : second in/output block (_4x version only) |
118 | * q2 : third in/output block (_3x version only) | 128 | * q2 : third in/output block (_4x version only) |
129 | * q3 : fourth in/output block (_4x version only) | ||
119 | * q8 : first round key | 130 | * q8 : first round key |
120 | * q9 : second round key | 131 | * q9 : second round key |
121 | * q14 : final round key | 132 | * q14 : final round key |
@@ -136,16 +147,16 @@ aes_decrypt: | |||
136 | ENDPROC(aes_decrypt) | 147 | ENDPROC(aes_decrypt) |
137 | 148 | ||
138 | .align 6 | 149 | .align 6 |
139 | aes_encrypt_3x: | 150 | aes_encrypt_4x: |
140 | add ip, r2, #32 @ 3rd round key | 151 | add ip, r2, #32 @ 3rd round key |
141 | do_block enc_dround_3x, enc_fround_3x | 152 | do_block enc_dround_4x, enc_fround_4x |
142 | ENDPROC(aes_encrypt_3x) | 153 | ENDPROC(aes_encrypt_4x) |
143 | 154 | ||
144 | .align 6 | 155 | .align 6 |
145 | aes_decrypt_3x: | 156 | aes_decrypt_4x: |
146 | add ip, r2, #32 @ 3rd round key | 157 | add ip, r2, #32 @ 3rd round key |
147 | do_block dec_dround_3x, dec_fround_3x | 158 | do_block dec_dround_4x, dec_fround_4x |
148 | ENDPROC(aes_decrypt_3x) | 159 | ENDPROC(aes_decrypt_4x) |
149 | 160 | ||
150 | .macro prepare_key, rk, rounds | 161 | .macro prepare_key, rk, rounds |
151 | add ip, \rk, \rounds, lsl #4 | 162 | add ip, \rk, \rounds, lsl #4 |
@@ -163,17 +174,17 @@ ENTRY(ce_aes_ecb_encrypt) | |||
163 | push {r4, lr} | 174 | push {r4, lr} |
164 | ldr r4, [sp, #8] | 175 | ldr r4, [sp, #8] |
165 | prepare_key r2, r3 | 176 | prepare_key r2, r3 |
166 | .Lecbencloop3x: | 177 | .Lecbencloop4x: |
167 | subs r4, r4, #3 | 178 | subs r4, r4, #4 |
168 | bmi .Lecbenc1x | 179 | bmi .Lecbenc1x |
169 | vld1.8 {q0-q1}, [r1]! | 180 | vld1.8 {q0-q1}, [r1]! |
170 | vld1.8 {q2}, [r1]! | 181 | vld1.8 {q2-q3}, [r1]! |
171 | bl aes_encrypt_3x | 182 | bl aes_encrypt_4x |
172 | vst1.8 {q0-q1}, [r0]! | 183 | vst1.8 {q0-q1}, [r0]! |
173 | vst1.8 {q2}, [r0]! | 184 | vst1.8 {q2-q3}, [r0]! |
174 | b .Lecbencloop3x | 185 | b .Lecbencloop4x |
175 | .Lecbenc1x: | 186 | .Lecbenc1x: |
176 | adds r4, r4, #3 | 187 | adds r4, r4, #4 |
177 | beq .Lecbencout | 188 | beq .Lecbencout |
178 | .Lecbencloop: | 189 | .Lecbencloop: |
179 | vld1.8 {q0}, [r1]! | 190 | vld1.8 {q0}, [r1]! |
@@ -189,17 +200,17 @@ ENTRY(ce_aes_ecb_decrypt) | |||
189 | push {r4, lr} | 200 | push {r4, lr} |
190 | ldr r4, [sp, #8] | 201 | ldr r4, [sp, #8] |
191 | prepare_key r2, r3 | 202 | prepare_key r2, r3 |
192 | .Lecbdecloop3x: | 203 | .Lecbdecloop4x: |
193 | subs r4, r4, #3 | 204 | subs r4, r4, #4 |
194 | bmi .Lecbdec1x | 205 | bmi .Lecbdec1x |
195 | vld1.8 {q0-q1}, [r1]! | 206 | vld1.8 {q0-q1}, [r1]! |
196 | vld1.8 {q2}, [r1]! | 207 | vld1.8 {q2-q3}, [r1]! |
197 | bl aes_decrypt_3x | 208 | bl aes_decrypt_4x |
198 | vst1.8 {q0-q1}, [r0]! | 209 | vst1.8 {q0-q1}, [r0]! |
199 | vst1.8 {q2}, [r0]! | 210 | vst1.8 {q2-q3}, [r0]! |
200 | b .Lecbdecloop3x | 211 | b .Lecbdecloop4x |
201 | .Lecbdec1x: | 212 | .Lecbdec1x: |
202 | adds r4, r4, #3 | 213 | adds r4, r4, #4 |
203 | beq .Lecbdecout | 214 | beq .Lecbdecout |
204 | .Lecbdecloop: | 215 | .Lecbdecloop: |
205 | vld1.8 {q0}, [r1]! | 216 | vld1.8 {q0}, [r1]! |
@@ -236,38 +247,40 @@ ENDPROC(ce_aes_cbc_encrypt) | |||
236 | ENTRY(ce_aes_cbc_decrypt) | 247 | ENTRY(ce_aes_cbc_decrypt) |
237 | push {r4-r6, lr} | 248 | push {r4-r6, lr} |
238 | ldrd r4, r5, [sp, #16] | 249 | ldrd r4, r5, [sp, #16] |
239 | vld1.8 {q6}, [r5] @ keep iv in q6 | 250 | vld1.8 {q15}, [r5] @ keep iv in q15 |
240 | prepare_key r2, r3 | 251 | prepare_key r2, r3 |
241 | .Lcbcdecloop3x: | 252 | .Lcbcdecloop4x: |
242 | subs r4, r4, #3 | 253 | subs r4, r4, #4 |
243 | bmi .Lcbcdec1x | 254 | bmi .Lcbcdec1x |
244 | vld1.8 {q0-q1}, [r1]! | 255 | vld1.8 {q0-q1}, [r1]! |
245 | vld1.8 {q2}, [r1]! | 256 | vld1.8 {q2-q3}, [r1]! |
246 | vmov q3, q0 | 257 | vmov q4, q0 |
247 | vmov q4, q1 | 258 | vmov q5, q1 |
248 | vmov q5, q2 | 259 | vmov q6, q2 |
249 | bl aes_decrypt_3x | 260 | vmov q7, q3 |
250 | veor q0, q0, q6 | 261 | bl aes_decrypt_4x |
251 | veor q1, q1, q3 | 262 | veor q0, q0, q15 |
252 | veor q2, q2, q4 | 263 | veor q1, q1, q4 |
253 | vmov q6, q5 | 264 | veor q2, q2, q5 |
265 | veor q3, q3, q6 | ||
266 | vmov q15, q7 | ||
254 | vst1.8 {q0-q1}, [r0]! | 267 | vst1.8 {q0-q1}, [r0]! |
255 | vst1.8 {q2}, [r0]! | 268 | vst1.8 {q2-q3}, [r0]! |
256 | b .Lcbcdecloop3x | 269 | b .Lcbcdecloop4x |
257 | .Lcbcdec1x: | 270 | .Lcbcdec1x: |
258 | adds r4, r4, #3 | 271 | adds r4, r4, #4 |
259 | beq .Lcbcdecout | 272 | beq .Lcbcdecout |
260 | vmov q15, q14 @ preserve last round key | 273 | vmov q6, q14 @ preserve last round key |
261 | .Lcbcdecloop: | 274 | .Lcbcdecloop: |
262 | vld1.8 {q0}, [r1]! @ get next ct block | 275 | vld1.8 {q0}, [r1]! @ get next ct block |
263 | veor q14, q15, q6 @ combine prev ct with last key | 276 | veor q14, q15, q6 @ combine prev ct with last key |
264 | vmov q6, q0 | 277 | vmov q15, q0 |
265 | bl aes_decrypt | 278 | bl aes_decrypt |
266 | vst1.8 {q0}, [r0]! | 279 | vst1.8 {q0}, [r0]! |
267 | subs r4, r4, #1 | 280 | subs r4, r4, #1 |
268 | bne .Lcbcdecloop | 281 | bne .Lcbcdecloop |
269 | .Lcbcdecout: | 282 | .Lcbcdecout: |
270 | vst1.8 {q6}, [r5] @ keep iv in q6 | 283 | vst1.8 {q15}, [r5] @ keep iv in q15 |
271 | pop {r4-r6, pc} | 284 | pop {r4-r6, pc} |
272 | ENDPROC(ce_aes_cbc_decrypt) | 285 | ENDPROC(ce_aes_cbc_decrypt) |
273 | 286 | ||
@@ -278,46 +291,52 @@ ENDPROC(ce_aes_cbc_decrypt) | |||
278 | ENTRY(ce_aes_ctr_encrypt) | 291 | ENTRY(ce_aes_ctr_encrypt) |
279 | push {r4-r6, lr} | 292 | push {r4-r6, lr} |
280 | ldrd r4, r5, [sp, #16] | 293 | ldrd r4, r5, [sp, #16] |
281 | vld1.8 {q6}, [r5] @ load ctr | 294 | vld1.8 {q7}, [r5] @ load ctr |
282 | prepare_key r2, r3 | 295 | prepare_key r2, r3 |
283 | vmov r6, s27 @ keep swabbed ctr in r6 | 296 | vmov r6, s31 @ keep swabbed ctr in r6 |
284 | rev r6, r6 | 297 | rev r6, r6 |
285 | cmn r6, r4 @ 32 bit overflow? | 298 | cmn r6, r4 @ 32 bit overflow? |
286 | bcs .Lctrloop | 299 | bcs .Lctrloop |
287 | .Lctrloop3x: | 300 | .Lctrloop4x: |
288 | subs r4, r4, #3 | 301 | subs r4, r4, #4 |
289 | bmi .Lctr1x | 302 | bmi .Lctr1x |
290 | add r6, r6, #1 | 303 | add r6, r6, #1 |
291 | vmov q0, q6 | 304 | vmov q0, q7 |
292 | vmov q1, q6 | 305 | vmov q1, q7 |
293 | rev ip, r6 | 306 | rev ip, r6 |
294 | add r6, r6, #1 | 307 | add r6, r6, #1 |
295 | vmov q2, q6 | 308 | vmov q2, q7 |
296 | vmov s7, ip | 309 | vmov s7, ip |
297 | rev ip, r6 | 310 | rev ip, r6 |
298 | add r6, r6, #1 | 311 | add r6, r6, #1 |
312 | vmov q3, q7 | ||
299 | vmov s11, ip | 313 | vmov s11, ip |
300 | vld1.8 {q3-q4}, [r1]! | 314 | rev ip, r6 |
301 | vld1.8 {q5}, [r1]! | 315 | add r6, r6, #1 |
302 | bl aes_encrypt_3x | 316 | vmov s15, ip |
303 | veor q0, q0, q3 | 317 | vld1.8 {q4-q5}, [r1]! |
304 | veor q1, q1, q4 | 318 | vld1.8 {q6}, [r1]! |
305 | veor q2, q2, q5 | 319 | vld1.8 {q15}, [r1]! |
320 | bl aes_encrypt_4x | ||
321 | veor q0, q0, q4 | ||
322 | veor q1, q1, q5 | ||
323 | veor q2, q2, q6 | ||
324 | veor q3, q3, q15 | ||
306 | rev ip, r6 | 325 | rev ip, r6 |
307 | vst1.8 {q0-q1}, [r0]! | 326 | vst1.8 {q0-q1}, [r0]! |
308 | vst1.8 {q2}, [r0]! | 327 | vst1.8 {q2-q3}, [r0]! |
309 | vmov s27, ip | 328 | vmov s31, ip |
310 | b .Lctrloop3x | 329 | b .Lctrloop4x |
311 | .Lctr1x: | 330 | .Lctr1x: |
312 | adds r4, r4, #3 | 331 | adds r4, r4, #4 |
313 | beq .Lctrout | 332 | beq .Lctrout |
314 | .Lctrloop: | 333 | .Lctrloop: |
315 | vmov q0, q6 | 334 | vmov q0, q7 |
316 | bl aes_encrypt | 335 | bl aes_encrypt |
317 | 336 | ||
318 | adds r6, r6, #1 @ increment BE ctr | 337 | adds r6, r6, #1 @ increment BE ctr |
319 | rev ip, r6 | 338 | rev ip, r6 |
320 | vmov s27, ip | 339 | vmov s31, ip |
321 | bcs .Lctrcarry | 340 | bcs .Lctrcarry |
322 | 341 | ||
323 | .Lctrcarrydone: | 342 | .Lctrcarrydone: |
@@ -329,7 +348,7 @@ ENTRY(ce_aes_ctr_encrypt) | |||
329 | bne .Lctrloop | 348 | bne .Lctrloop |
330 | 349 | ||
331 | .Lctrout: | 350 | .Lctrout: |
332 | vst1.8 {q6}, [r5] @ return next CTR value | 351 | vst1.8 {q7}, [r5] @ return next CTR value |
333 | pop {r4-r6, pc} | 352 | pop {r4-r6, pc} |
334 | 353 | ||
335 | .Lctrtailblock: | 354 | .Lctrtailblock: |
@@ -337,7 +356,7 @@ ENTRY(ce_aes_ctr_encrypt) | |||
337 | b .Lctrout | 356 | b .Lctrout |
338 | 357 | ||
339 | .Lctrcarry: | 358 | .Lctrcarry: |
340 | .irp sreg, s26, s25, s24 | 359 | .irp sreg, s30, s29, s28 |
341 | vmov ip, \sreg @ load next word of ctr | 360 | vmov ip, \sreg @ load next word of ctr |
342 | rev ip, ip @ ... to handle the carry | 361 | rev ip, ip @ ... to handle the carry |
343 | adds ip, ip, #1 | 362 | adds ip, ip, #1 |
@@ -368,8 +387,8 @@ ENDPROC(ce_aes_ctr_encrypt) | |||
368 | .quad 1, 0x87 | 387 | .quad 1, 0x87 |
369 | 388 | ||
370 | ce_aes_xts_init: | 389 | ce_aes_xts_init: |
371 | vldr d14, .Lxts_mul_x | 390 | vldr d30, .Lxts_mul_x |
372 | vldr d15, .Lxts_mul_x + 8 | 391 | vldr d31, .Lxts_mul_x + 8 |
373 | 392 | ||
374 | ldrd r4, r5, [sp, #16] @ load args | 393 | ldrd r4, r5, [sp, #16] @ load args |
375 | ldr r6, [sp, #28] | 394 | ldr r6, [sp, #28] |
@@ -390,48 +409,51 @@ ENTRY(ce_aes_xts_encrypt) | |||
390 | 409 | ||
391 | bl ce_aes_xts_init @ run shared prologue | 410 | bl ce_aes_xts_init @ run shared prologue |
392 | prepare_key r2, r3 | 411 | prepare_key r2, r3 |
393 | vmov q3, q0 | 412 | vmov q4, q0 |
394 | 413 | ||
395 | teq r6, #0 @ start of a block? | 414 | teq r6, #0 @ start of a block? |
396 | bne .Lxtsenc3x | 415 | bne .Lxtsenc4x |
397 | 416 | ||
398 | .Lxtsencloop3x: | 417 | .Lxtsencloop4x: |
399 | next_tweak q3, q3, q7, q6 | 418 | next_tweak q4, q4, q15, q10 |
400 | .Lxtsenc3x: | 419 | .Lxtsenc4x: |
401 | subs r4, r4, #3 | 420 | subs r4, r4, #4 |
402 | bmi .Lxtsenc1x | 421 | bmi .Lxtsenc1x |
403 | vld1.8 {q0-q1}, [r1]! @ get 3 pt blocks | 422 | vld1.8 {q0-q1}, [r1]! @ get 4 pt blocks |
404 | vld1.8 {q2}, [r1]! | 423 | vld1.8 {q2-q3}, [r1]! |
405 | next_tweak q4, q3, q7, q6 | 424 | next_tweak q5, q4, q15, q10 |
406 | veor q0, q0, q3 | 425 | veor q0, q0, q4 |
407 | next_tweak q5, q4, q7, q6 | 426 | next_tweak q6, q5, q15, q10 |
408 | veor q1, q1, q4 | 427 | veor q1, q1, q5 |
409 | veor q2, q2, q5 | 428 | next_tweak q7, q6, q15, q10 |
410 | bl aes_encrypt_3x | 429 | veor q2, q2, q6 |
411 | veor q0, q0, q3 | 430 | veor q3, q3, q7 |
412 | veor q1, q1, q4 | 431 | bl aes_encrypt_4x |
413 | veor q2, q2, q5 | 432 | veor q0, q0, q4 |
414 | vst1.8 {q0-q1}, [r0]! @ write 3 ct blocks | 433 | veor q1, q1, q5 |
415 | vst1.8 {q2}, [r0]! | 434 | veor q2, q2, q6 |
416 | vmov q3, q5 | 435 | veor q3, q3, q7 |
436 | vst1.8 {q0-q1}, [r0]! @ write 4 ct blocks | ||
437 | vst1.8 {q2-q3}, [r0]! | ||
438 | vmov q4, q7 | ||
417 | teq r4, #0 | 439 | teq r4, #0 |
418 | beq .Lxtsencout | 440 | beq .Lxtsencout |
419 | b .Lxtsencloop3x | 441 | b .Lxtsencloop4x |
420 | .Lxtsenc1x: | 442 | .Lxtsenc1x: |
421 | adds r4, r4, #3 | 443 | adds r4, r4, #4 |
422 | beq .Lxtsencout | 444 | beq .Lxtsencout |
423 | .Lxtsencloop: | 445 | .Lxtsencloop: |
424 | vld1.8 {q0}, [r1]! | 446 | vld1.8 {q0}, [r1]! |
425 | veor q0, q0, q3 | 447 | veor q0, q0, q4 |
426 | bl aes_encrypt | 448 | bl aes_encrypt |
427 | veor q0, q0, q3 | 449 | veor q0, q0, q4 |
428 | vst1.8 {q0}, [r0]! | 450 | vst1.8 {q0}, [r0]! |
429 | subs r4, r4, #1 | 451 | subs r4, r4, #1 |
430 | beq .Lxtsencout | 452 | beq .Lxtsencout |
431 | next_tweak q3, q3, q7, q6 | 453 | next_tweak q4, q4, q15, q6 |
432 | b .Lxtsencloop | 454 | b .Lxtsencloop |
433 | .Lxtsencout: | 455 | .Lxtsencout: |
434 | vst1.8 {q3}, [r5] | 456 | vst1.8 {q4}, [r5] |
435 | pop {r4-r6, pc} | 457 | pop {r4-r6, pc} |
436 | ENDPROC(ce_aes_xts_encrypt) | 458 | ENDPROC(ce_aes_xts_encrypt) |
437 | 459 | ||
@@ -441,49 +463,52 @@ ENTRY(ce_aes_xts_decrypt) | |||
441 | 463 | ||
442 | bl ce_aes_xts_init @ run shared prologue | 464 | bl ce_aes_xts_init @ run shared prologue |
443 | prepare_key r2, r3 | 465 | prepare_key r2, r3 |
444 | vmov q3, q0 | 466 | vmov q4, q0 |
445 | 467 | ||
446 | teq r6, #0 @ start of a block? | 468 | teq r6, #0 @ start of a block? |
447 | bne .Lxtsdec3x | 469 | bne .Lxtsdec4x |
448 | 470 | ||
449 | .Lxtsdecloop3x: | 471 | .Lxtsdecloop4x: |
450 | next_tweak q3, q3, q7, q6 | 472 | next_tweak q4, q4, q15, q10 |
451 | .Lxtsdec3x: | 473 | .Lxtsdec4x: |
452 | subs r4, r4, #3 | 474 | subs r4, r4, #4 |
453 | bmi .Lxtsdec1x | 475 | bmi .Lxtsdec1x |
454 | vld1.8 {q0-q1}, [r1]! @ get 3 ct blocks | 476 | vld1.8 {q0-q1}, [r1]! @ get 4 ct blocks |
455 | vld1.8 {q2}, [r1]! | 477 | vld1.8 {q2-q3}, [r1]! |
456 | next_tweak q4, q3, q7, q6 | 478 | next_tweak q5, q4, q15, q10 |
457 | veor q0, q0, q3 | 479 | veor q0, q0, q4 |
458 | next_tweak q5, q4, q7, q6 | 480 | next_tweak q6, q5, q15, q10 |
459 | veor q1, q1, q4 | 481 | veor q1, q1, q5 |
460 | veor q2, q2, q5 | 482 | next_tweak q7, q6, q15, q10 |
461 | bl aes_decrypt_3x | 483 | veor q2, q2, q6 |
462 | veor q0, q0, q3 | 484 | veor q3, q3, q7 |
463 | veor q1, q1, q4 | 485 | bl aes_decrypt_4x |
464 | veor q2, q2, q5 | 486 | veor q0, q0, q4 |
465 | vst1.8 {q0-q1}, [r0]! @ write 3 pt blocks | 487 | veor q1, q1, q5 |
466 | vst1.8 {q2}, [r0]! | 488 | veor q2, q2, q6 |
467 | vmov q3, q5 | 489 | veor q3, q3, q7 |
490 | vst1.8 {q0-q1}, [r0]! @ write 4 pt blocks | ||
491 | vst1.8 {q2-q3}, [r0]! | ||
492 | vmov q4, q7 | ||
468 | teq r4, #0 | 493 | teq r4, #0 |
469 | beq .Lxtsdecout | 494 | beq .Lxtsdecout |
470 | b .Lxtsdecloop3x | 495 | b .Lxtsdecloop4x |
471 | .Lxtsdec1x: | 496 | .Lxtsdec1x: |
472 | adds r4, r4, #3 | 497 | adds r4, r4, #4 |
473 | beq .Lxtsdecout | 498 | beq .Lxtsdecout |
474 | .Lxtsdecloop: | 499 | .Lxtsdecloop: |
475 | vld1.8 {q0}, [r1]! | 500 | vld1.8 {q0}, [r1]! |
476 | veor q0, q0, q3 | 501 | veor q0, q0, q4 |
477 | add ip, r2, #32 @ 3rd round key | 502 | add ip, r2, #32 @ 3rd round key |
478 | bl aes_decrypt | 503 | bl aes_decrypt |
479 | veor q0, q0, q3 | 504 | veor q0, q0, q4 |
480 | vst1.8 {q0}, [r0]! | 505 | vst1.8 {q0}, [r0]! |
481 | subs r4, r4, #1 | 506 | subs r4, r4, #1 |
482 | beq .Lxtsdecout | 507 | beq .Lxtsdecout |
483 | next_tweak q3, q3, q7, q6 | 508 | next_tweak q4, q4, q15, q6 |
484 | b .Lxtsdecloop | 509 | b .Lxtsdecloop |
485 | .Lxtsdecout: | 510 | .Lxtsdecout: |
486 | vst1.8 {q3}, [r5] | 511 | vst1.8 {q4}, [r5] |
487 | pop {r4-r6, pc} | 512 | pop {r4-r6, pc} |
488 | ENDPROC(ce_aes_xts_decrypt) | 513 | ENDPROC(ce_aes_xts_decrypt) |
489 | 514 | ||