path: root/arch/arm/crypto/bsaes-armv7.pl
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-13 18:51:29 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-13 18:51:29 -0500
commit	f47671e2d861a2093179cd64dda22016664b2015 (patch)
tree	f77cb8e7d875f442e2cf0bdc8fbe478ec8ff8181 /arch/arm/crypto/bsaes-armv7.pl
parent	8ceafbfa91ffbdbb2afaea5c24ccb519ffb8b587 (diff)
parent	42cbe8271ca6562b4ad4b2e6a9895084b16eef5e (diff)
Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm
Pull ARM updates from Russell King:
 "Included in this series are:

   1. BE8 (modern big endian) changes for ARM from Ben Dooks
   2. big.Little support from Nicolas Pitre and Dave Martin
   3. support for LPAE systems with all system memory above 4GB
   4. Perf updates from Will Deacon
   5. Additional prefetching and other performance improvements from Will.
   6. Neon-optimised AES implementation from Ard.
   7. A number of smaller fixes scattered around the place.

  There is a rather horrid merge conflict in tools/perf - I was never
  notified of the conflict because it originally occurred between Will's
  tree and other stuff. Consequently I have a resolution which Will
  forwarded me, which I'll forward on immediately after sending this mail.

  The other notable thing is I'm expecting some build breakage in the
  crypto stuff on ARM only with Ard's AES patches. These were merged into
  a stable git branch which others had already pulled, so there's little
  I can do about this. The problem is caused because these patches have a
  dependency on some code in the crypto git tree - I tried requesting a
  branch I can pull to resolve these, and all I got each time from the
  crypto people was "we'll revert our patches then" which would only make
  things worse since I still don't have the dependent patches. I've no
  idea what's going on there or how to resolve that, and since I can't
  split these patches from the rest of this pull request, I'm rather
  stuck with pushing this as-is or reverting Ard's patches.

  Since it should "come out in the wash" I've left them in - the only
  build problems they seem to cause at the moment are with randconfigs,
  and since it's a new feature anyway. However, if by -rc1 the
  dependencies aren't in, I think it'd be best to revert Ard's patches"

I resolved the perf conflict roughly as per the patch sent by Russell,
but there may be some differences. Any errors are likely mine. Let's see
how the crypto issues work out..

* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (110 commits)
  ARM: 7868/1: arm/arm64: remove atomic_clear_mask() in "include/asm/atomic.h"
  ARM: 7867/1: include: asm: use 'int' instead of 'unsigned long' for 'oldval' in atomic_cmpxchg().
  ARM: 7866/1: include: asm: use 'long long' instead of 'u64' within atomic.h
  ARM: 7871/1: amba: Extend number of IRQS
  ARM: 7887/1: Don't smp_cross_call() on UP devices in arch_irq_work_raise()
  ARM: 7872/1: Support arch_irq_work_raise() via self IPIs
  ARM: 7880/1: Clear the IT state independent of the Thumb-2 mode
  ARM: 7878/1: nommu: Implement dummy early_paging_init()
  ARM: 7876/1: clear Thumb-2 IT state on exception handling
  ARM: 7874/2: bL_switcher: Remove cpu_hotplug_driver_{lock,unlock}()
  ARM: footbridge: fix build warnings for netwinder
  ARM: 7873/1: vfp: clear vfp_current_hw_state for dying cpu
  ARM: fix misplaced arch_virt_to_idmap()
  ARM: 7848/1: mcpm: Implement cpu_kill() to synchronise on powerdown
  ARM: 7847/1: mcpm: Factor out logical-to-physical CPU translation
  ARM: 7869/1: remove unused XSCALE_PMU Kconfig param
  ARM: 7864/1: Handle 64-bit memory in case of 32-bit phys_addr_t
  ARM: 7863/1: Let arm_add_memory() always use 64-bit arguments
  ARM: 7862/1: pcpu: replace __get_cpu_var_uses
  ARM: 7861/1: cacheflush: consolidate single-CPU ARMv7 cache disabling code
  ...
Diffstat (limited to 'arch/arm/crypto/bsaes-armv7.pl')
-rw-r--r--	arch/arm/crypto/bsaes-armv7.pl	2467
1 files changed, 2467 insertions, 0 deletions
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644
index 000000000000..f3d96d932573
--- /dev/null
+++ b/arch/arm/crypto/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8#
9# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
10# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
11# granted.
12# ====================================================================
13
14# Bit-sliced AES for ARM NEON
15#
16# February 2012.
17#
18# This implementation is a direct adaptation of the bsaes-x86_64 module
19# for ARM NEON, except that this module is endian-neutral [in the sense
20# that it can be compiled for either endianness] courtesy of vld1.8's
21# neutrality. The initial version doesn't implement an interface to
22# OpenSSL, only low-level primitives and unsupported entry points, just
23# enough to collect performance results, which for the Cortex-A8 core are:
24#
25# encrypt 19.5 cycles per byte processed with 128-bit key
26# decrypt 22.1 cycles per byte processed with 128-bit key
27# key conv. 440 cycles per 128-bit key/0.18 of 8x block
28#
29# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
30# which is [much] worse than anticipated (for further details see
31# http://www.openssl.org/~appro/Snapdragon-S4.html).
32#
33# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
34# manages in 20.0 cycles].
35#
36# When comparing to x86_64 results keep in mind that NEON unit is
37# [mostly] single-issue and thus can't [fully] benefit from
38# instruction-level parallelism. And when comparing to aes-armv4
39# results keep in mind key schedule conversion overhead (see
40# bsaes-x86_64.pl for further details)...
41#
42# <appro@openssl.org>
43
44# April-August 2013
45#
46# Add CBC, CTR and XTS subroutines, adapt for kernel use.
47#
48# <ard.biesheuvel@linaro.org>
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
54my @XMM=map("q$_",(0..15));
55
56{
57my ($key,$rounds,$const)=("r4","r5","r6");
58
59sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
60sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
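
# &Dlo()/&Dhi() simply map a NEON quadword register name to the d-register
# aliases of its low and high halves, e.g. &Dlo("q1") yields "d2" and
# &Dhi("q1") yields "d3". They are used below because vtbl.8 and vswp
# operate on d registers rather than q registers.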
61
62sub Sbox {
63# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
64# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
65my @b=@_[0..7];
66my @t=@_[8..11];
67my @s=@_[12..15];
68 &InBasisChange (@b);
69 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
70 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
71}
72
73sub InBasisChange {
74# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
75# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
76my @b=@_[0..7];
77$code.=<<___;
78 veor @b[2], @b[2], @b[1]
79 veor @b[5], @b[5], @b[6]
80 veor @b[3], @b[3], @b[0]
81 veor @b[6], @b[6], @b[2]
82 veor @b[5], @b[5], @b[0]
83
84 veor @b[6], @b[6], @b[3]
85 veor @b[3], @b[3], @b[7]
86 veor @b[7], @b[7], @b[5]
87 veor @b[3], @b[3], @b[4]
88 veor @b[4], @b[4], @b[5]
89
90 veor @b[2], @b[2], @b[7]
91 veor @b[3], @b[3], @b[1]
92 veor @b[1], @b[1], @b[5]
93___
94}
95
96sub OutBasisChange {
97# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
98# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
99my @b=@_[0..7];
100$code.=<<___;
101 veor @b[0], @b[0], @b[6]
102 veor @b[1], @b[1], @b[4]
103 veor @b[4], @b[4], @b[6]
104 veor @b[2], @b[2], @b[0]
105 veor @b[6], @b[6], @b[1]
106
107 veor @b[1], @b[1], @b[5]
108 veor @b[5], @b[5], @b[3]
109 veor @b[3], @b[3], @b[7]
110 veor @b[7], @b[7], @b[5]
111 veor @b[2], @b[2], @b[5]
112
113 veor @b[4], @b[4], @b[7]
114___
115}
116
117sub InvSbox {
118# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123 &InvInBasisChange (@b);
124 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
125 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
126}
127
128sub InvInBasisChange { # OutBasisChange in reverse (with twist)
129my @b=@_[5,1,2,6,3,7,0,4];
130$code.=<<___;
131 veor @b[1], @b[1], @b[7]
132 veor @b[4], @b[4], @b[7]
133
134 veor @b[7], @b[7], @b[5]
135 veor @b[1], @b[1], @b[3]
136 veor @b[2], @b[2], @b[5]
137 veor @b[3], @b[3], @b[7]
138
139 veor @b[6], @b[6], @b[1]
140 veor @b[2], @b[2], @b[0]
141 veor @b[5], @b[5], @b[3]
142 veor @b[4], @b[4], @b[6]
143 veor @b[0], @b[0], @b[6]
144 veor @b[1], @b[1], @b[4]
145___
146}
147
148sub InvOutBasisChange { # InBasisChange in reverse
149my @b=@_[2,5,7,3,6,1,0,4];
150$code.=<<___;
151 veor @b[1], @b[1], @b[5]
152 veor @b[2], @b[2], @b[7]
153
154 veor @b[3], @b[3], @b[1]
155 veor @b[4], @b[4], @b[5]
156 veor @b[7], @b[7], @b[5]
157 veor @b[3], @b[3], @b[4]
158 veor @b[5], @b[5], @b[0]
159 veor @b[3], @b[3], @b[7]
160 veor @b[6], @b[6], @b[2]
161 veor @b[2], @b[2], @b[1]
162 veor @b[6], @b[6], @b[3]
163
164 veor @b[3], @b[3], @b[0]
165 veor @b[5], @b[5], @b[6]
166___
167}
168
169sub Mul_GF4 {
170#;*************************************************************
171#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
172#;*************************************************************
173my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
174$code.=<<___;
175 veor $t0, $y0, $y1
176 vand $t0, $t0, $x0
177 veor $x0, $x0, $x1
178 vand $t1, $x1, $y0
179 vand $x0, $x0, $y1
180 veor $x1, $t1, $t0
181 veor $x0, $x0, $t1
182___
183}
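
# A scalar sketch of the sequence above (hypothetical helper, not called by
# the generator): in the bit-sliced representation each variable carries one
# bit position of many GF(2^2) elements in parallel, so '&' and '^' on plain
# integers mirror vand and veor on q registers.
sub Mul_GF4_ref {
	my ($x0,$x1,$y0,$y1)=@_;
	my $t0=($y0^$y1)&$x0;	# veor/vand	t0, y0^y1, x0
	my $t1=$x1&$y0;		# vand		t1, x1, y0
	$x0^=$x1;		# veor		x0, x0, x1
	$x0&=$y1;		# vand		x0, x0, y1
	$x1=$t1^$t0;		# veor		x1, t1, t0
	$x0^=$t1;		# veor		x0, x0, t1
	return ($x0,$x1);
}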
184
185sub Mul_GF4_N { # not used, see next subroutine
186# multiply and scale by N
187my ($x0,$x1,$y0,$y1,$t0)=@_;
188$code.=<<___;
189 veor $t0, $y0, $y1
190 vand $t0, $t0, $x0
191 veor $x0, $x0, $x1
192 vand $x1, $x1, $y0
193 vand $x0, $x0, $y1
194 veor $x1, $x1, $x0
195 veor $x0, $x0, $t0
196___
197}
198
199sub Mul_GF4_N_GF4 {
200# interleaved Mul_GF4_N and Mul_GF4
201my ($x0,$x1,$y0,$y1,$t0,
202 $x2,$x3,$y2,$y3,$t1)=@_;
203$code.=<<___;
204 veor $t0, $y0, $y1
205 veor $t1, $y2, $y3
206 vand $t0, $t0, $x0
207 vand $t1, $t1, $x2
208 veor $x0, $x0, $x1
209 veor $x2, $x2, $x3
210 vand $x1, $x1, $y0
211 vand $x3, $x3, $y2
212 vand $x0, $x0, $y1
213 vand $x2, $x2, $y3
214 veor $x1, $x1, $x0
215 veor $x2, $x2, $x3
216 veor $x0, $x0, $t0
217 veor $x3, $x3, $t1
218___
219}
220sub Mul_GF16_2 {
221my @x=@_[0..7];
222my @y=@_[8..11];
223my @t=@_[12..15];
224$code.=<<___;
225 veor @t[0], @x[0], @x[2]
226 veor @t[1], @x[1], @x[3]
227___
228 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
229$code.=<<___;
230 veor @y[0], @y[0], @y[2]
231 veor @y[1], @y[1], @y[3]
232___
233 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
234 @x[2], @x[3], @y[2], @y[3], @t[2]);
235$code.=<<___;
236 veor @x[0], @x[0], @t[0]
237 veor @x[2], @x[2], @t[0]
238 veor @x[1], @x[1], @t[1]
239 veor @x[3], @x[3], @t[1]
240
241 veor @t[0], @x[4], @x[6]
242 veor @t[1], @x[5], @x[7]
243___
244 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
245 @x[6], @x[7], @y[2], @y[3], @t[2]);
246$code.=<<___;
247 veor @y[0], @y[0], @y[2]
248 veor @y[1], @y[1], @y[3]
249___
250 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
251$code.=<<___;
252 veor @x[4], @x[4], @t[0]
253 veor @x[6], @x[6], @t[0]
254 veor @x[5], @x[5], @t[1]
255 veor @x[7], @x[7], @t[1]
256___
257}
258sub Inv_GF256 {
259#;********************************************************************
260#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
261#;********************************************************************
262my @x=@_[0..7];
263my @t=@_[8..11];
264my @s=@_[12..15];
265# direct optimizations from hardware
266$code.=<<___;
267 veor @t[3], @x[4], @x[6]
268 veor @t[2], @x[5], @x[7]
269 veor @t[1], @x[1], @x[3]
270 veor @s[1], @x[7], @x[6]
271 vmov @t[0], @t[2]
272 veor @s[0], @x[0], @x[2]
273
274 vorr @t[2], @t[2], @t[1]
275 veor @s[3], @t[3], @t[0]
276 vand @s[2], @t[3], @s[0]
277 vorr @t[3], @t[3], @s[0]
278 veor @s[0], @s[0], @t[1]
279 vand @t[0], @t[0], @t[1]
280 veor @t[1], @x[3], @x[2]
281 vand @s[3], @s[3], @s[0]
282 vand @s[1], @s[1], @t[1]
283 veor @t[1], @x[4], @x[5]
284 veor @s[0], @x[1], @x[0]
285 veor @t[3], @t[3], @s[1]
286 veor @t[2], @t[2], @s[1]
287 vand @s[1], @t[1], @s[0]
288 vorr @t[1], @t[1], @s[0]
289 veor @t[3], @t[3], @s[3]
290 veor @t[0], @t[0], @s[1]
291 veor @t[2], @t[2], @s[2]
292 veor @t[1], @t[1], @s[3]
293 veor @t[0], @t[0], @s[2]
294 vand @s[0], @x[7], @x[3]
295 veor @t[1], @t[1], @s[2]
296 vand @s[1], @x[6], @x[2]
297 vand @s[2], @x[5], @x[1]
298 vorr @s[3], @x[4], @x[0]
299 veor @t[3], @t[3], @s[0]
300 veor @t[1], @t[1], @s[2]
301 veor @t[0], @t[0], @s[3]
302 veor @t[2], @t[2], @s[1]
303
304 @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
305
306 @ new smaller inversion
307
308 vand @s[2], @t[3], @t[1]
309 vmov @s[0], @t[0]
310
311 veor @s[1], @t[2], @s[2]
312 veor @s[3], @t[0], @s[2]
313 veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
314
315 vbsl @s[1], @t[1], @t[0]
316 vbsl @s[3], @t[3], @t[2]
317 veor @t[3], @t[3], @t[2]
318
319 vbsl @s[0], @s[1], @s[2]
320 vbsl @t[0], @s[2], @s[1]
321
322 vand @s[2], @s[0], @s[3]
323 veor @t[1], @t[1], @t[0]
324
325 veor @s[2], @s[2], @t[3]
326___
327# output in s3, s2, s1, t1
328
329# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
330
331# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
332 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
333
334### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
335}
336
337# AES linear components
338
339sub ShiftRows {
340my @x=@_[0..7];
341my @t=@_[8..11];
342my $mask=pop;
343$code.=<<___;
344 vldmia $key!, {@t[0]-@t[3]}
345 veor @t[0], @t[0], @x[0]
346 veor @t[1], @t[1], @x[1]
347 vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
348 vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
349 vldmia $key!, {@t[0]}
350 veor @t[2], @t[2], @x[2]
351 vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
352 vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
353 vldmia $key!, {@t[1]}
354 veor @t[3], @t[3], @x[3]
355 vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
356 vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
357 vldmia $key!, {@t[2]}
358 vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
359 vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
360 vldmia $key!, {@t[3]}
361 veor @t[0], @t[0], @x[4]
362 veor @t[1], @t[1], @x[5]
363 vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
364 vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
365 veor @t[2], @t[2], @x[6]
366 vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
367 vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
368 veor @t[3], @t[3], @x[7]
369 vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
370 vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
371 vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
372 vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
373___
374}
375
376sub MixColumns {
377# modified to emit output in order suitable for feeding back to aesenc[last]
378my @x=@_[0..7];
379my @t=@_[8..15];
380my $inv=@_[16]; # optional
381$code.=<<___;
382 vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
383 vext.8 @t[1], @x[1], @x[1], #12
384 veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
385 vext.8 @t[2], @x[2], @x[2], #12
386 veor @x[1], @x[1], @t[1]
387 vext.8 @t[3], @x[3], @x[3], #12
388 veor @x[2], @x[2], @t[2]
389 vext.8 @t[4], @x[4], @x[4], #12
390 veor @x[3], @x[3], @t[3]
391 vext.8 @t[5], @x[5], @x[5], #12
392 veor @x[4], @x[4], @t[4]
393 vext.8 @t[6], @x[6], @x[6], #12
394 veor @x[5], @x[5], @t[5]
395 vext.8 @t[7], @x[7], @x[7], #12
396 veor @x[6], @x[6], @t[6]
397
398 veor @t[1], @t[1], @x[0]
399 veor @x[7], @x[7], @t[7]
400	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64
401 veor @t[2], @t[2], @x[1]
402 veor @t[0], @t[0], @x[7]
403 veor @t[1], @t[1], @x[7]
404 vext.8 @x[1], @x[1], @x[1], #8
405 veor @t[5], @t[5], @x[4]
406 veor @x[0], @x[0], @t[0]
407 veor @t[6], @t[6], @x[5]
408 veor @x[1], @x[1], @t[1]
409 vext.8 @t[0], @x[4], @x[4], #8
410 veor @t[4], @t[4], @x[3]
411 vext.8 @t[1], @x[5], @x[5], #8
412 veor @t[7], @t[7], @x[6]
413 vext.8 @x[4], @x[3], @x[3], #8
414 veor @t[3], @t[3], @x[2]
415 vext.8 @x[5], @x[7], @x[7], #8
416 veor @t[4], @t[4], @x[7]
417 vext.8 @x[3], @x[6], @x[6], #8
418 veor @t[3], @t[3], @x[7]
419 vext.8 @x[6], @x[2], @x[2], #8
420 veor @x[7], @t[1], @t[5]
421___
422$code.=<<___ if (!$inv);
423 veor @x[2], @t[0], @t[4]
424 veor @x[4], @x[4], @t[3]
425 veor @x[5], @x[5], @t[7]
426 veor @x[3], @x[3], @t[6]
427 @ vmov @x[2], @t[0]
428 veor @x[6], @x[6], @t[2]
429 @ vmov @x[7], @t[1]
430___
431$code.=<<___ if ($inv);
432 veor @t[3], @t[3], @x[4]
433 veor @x[5], @x[5], @t[7]
434 veor @x[2], @x[3], @t[6]
435 veor @x[3], @t[0], @t[4]
436 veor @x[4], @x[6], @t[2]
437 vmov @x[6], @t[3]
438 @ vmov @x[7], @t[1]
439___
440}
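
# A byte-level sketch of the rotations used above (hypothetical helper, not
# called by the generator): "vext.8 q,q,q,#12" rotates the 16 bytes of a q
# register, which in this packed representation is the "x <<< 32" step, and
# "#8" gives the "<<< 64" step.
sub rot_bytes_ref {
	my ($q,$n)=@_;		# $q is a 16-byte string standing in for a q reg
	return substr($q,$n).substr($q,0,$n);
}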
441
442sub InvMixColumns_orig {
443my @x=@_[0..7];
444my @t=@_[8..15];
445
446$code.=<<___;
447 @ multiplication by 0x0e
448 vext.8 @t[7], @x[7], @x[7], #12
449 vmov @t[2], @x[2]
450 veor @x[2], @x[2], @x[5] @ 2 5
451 veor @x[7], @x[7], @x[5] @ 7 5
452 vext.8 @t[0], @x[0], @x[0], #12
453 vmov @t[5], @x[5]
454 veor @x[5], @x[5], @x[0] @ 5 0 [1]
455 veor @x[0], @x[0], @x[1] @ 0 1
456 vext.8 @t[1], @x[1], @x[1], #12
457 veor @x[1], @x[1], @x[2] @ 1 25
458 veor @x[0], @x[0], @x[6] @ 01 6 [2]
459 vext.8 @t[3], @x[3], @x[3], #12
460 veor @x[1], @x[1], @x[3] @ 125 3 [4]
461 veor @x[2], @x[2], @x[0] @ 25 016 [3]
462 veor @x[3], @x[3], @x[7] @ 3 75
463 veor @x[7], @x[7], @x[6] @ 75 6 [0]
464 vext.8 @t[6], @x[6], @x[6], #12
465 vmov @t[4], @x[4]
466 veor @x[6], @x[6], @x[4] @ 6 4
467 veor @x[4], @x[4], @x[3] @ 4 375 [6]
468 veor @x[3], @x[3], @x[7] @ 375 756=36
469 veor @x[6], @x[6], @t[5] @ 64 5 [7]
470 veor @x[3], @x[3], @t[2] @ 36 2
471 vext.8 @t[5], @t[5], @t[5], #12
472 veor @x[3], @x[3], @t[4] @ 362 4 [5]
473___
474 my @y = @x[7,5,0,2,1,3,4,6];
475$code.=<<___;
476 @ multiplication by 0x0b
477 veor @y[1], @y[1], @y[0]
478 veor @y[0], @y[0], @t[0]
479 vext.8 @t[2], @t[2], @t[2], #12
480 veor @y[1], @y[1], @t[1]
481 veor @y[0], @y[0], @t[5]
482 vext.8 @t[4], @t[4], @t[4], #12
483 veor @y[1], @y[1], @t[6]
484 veor @y[0], @y[0], @t[7]
485 veor @t[7], @t[7], @t[6] @ clobber t[7]
486
487 veor @y[3], @y[3], @t[0]
488 veor @y[1], @y[1], @y[0]
489 vext.8 @t[0], @t[0], @t[0], #12
490 veor @y[2], @y[2], @t[1]
491 veor @y[4], @y[4], @t[1]
492 vext.8 @t[1], @t[1], @t[1], #12
493 veor @y[2], @y[2], @t[2]
494 veor @y[3], @y[3], @t[2]
495 veor @y[5], @y[5], @t[2]
496 veor @y[2], @y[2], @t[7]
497 vext.8 @t[2], @t[2], @t[2], #12
498 veor @y[3], @y[3], @t[3]
499 veor @y[6], @y[6], @t[3]
500 veor @y[4], @y[4], @t[3]
501 veor @y[7], @y[7], @t[4]
502 vext.8 @t[3], @t[3], @t[3], #12
503 veor @y[5], @y[5], @t[4]
504 veor @y[7], @y[7], @t[7]
505 veor @t[7], @t[7], @t[5] @ clobber t[7] even more
506 veor @y[3], @y[3], @t[5]
507 veor @y[4], @y[4], @t[4]
508
509 veor @y[5], @y[5], @t[7]
510 vext.8 @t[4], @t[4], @t[4], #12
511 veor @y[6], @y[6], @t[7]
512 veor @y[4], @y[4], @t[7]
513
514 veor @t[7], @t[7], @t[5]
515 vext.8 @t[5], @t[5], @t[5], #12
516
517 @ multiplication by 0x0d
518 veor @y[4], @y[4], @y[7]
519 veor @t[7], @t[7], @t[6] @ restore t[7]
520 veor @y[7], @y[7], @t[4]
521 vext.8 @t[6], @t[6], @t[6], #12
522 veor @y[2], @y[2], @t[0]
523 veor @y[7], @y[7], @t[5]
524 vext.8 @t[7], @t[7], @t[7], #12
525 veor @y[2], @y[2], @t[2]
526
527 veor @y[3], @y[3], @y[1]
528 veor @y[1], @y[1], @t[1]
529 veor @y[0], @y[0], @t[0]
530 veor @y[3], @y[3], @t[0]
531 veor @y[1], @y[1], @t[5]
532 veor @y[0], @y[0], @t[5]
533 vext.8 @t[0], @t[0], @t[0], #12
534 veor @y[1], @y[1], @t[7]
535 veor @y[0], @y[0], @t[6]
536 veor @y[3], @y[3], @y[1]
537 veor @y[4], @y[4], @t[1]
538 vext.8 @t[1], @t[1], @t[1], #12
539
540 veor @y[7], @y[7], @t[7]
541 veor @y[4], @y[4], @t[2]
542 veor @y[5], @y[5], @t[2]
543 veor @y[2], @y[2], @t[6]
544 veor @t[6], @t[6], @t[3] @ clobber t[6]
545 vext.8 @t[2], @t[2], @t[2], #12
546 veor @y[4], @y[4], @y[7]
547 veor @y[3], @y[3], @t[6]
548
549 veor @y[6], @y[6], @t[6]
550 veor @y[5], @y[5], @t[5]
551 vext.8 @t[5], @t[5], @t[5], #12
552 veor @y[6], @y[6], @t[4]
553 vext.8 @t[4], @t[4], @t[4], #12
554 veor @y[5], @y[5], @t[6]
555 veor @y[6], @y[6], @t[7]
556 vext.8 @t[7], @t[7], @t[7], #12
557 veor @t[6], @t[6], @t[3] @ restore t[6]
558 vext.8 @t[3], @t[3], @t[3], #12
559
560 @ multiplication by 0x09
561 veor @y[4], @y[4], @y[1]
562 veor @t[1], @t[1], @y[1] @ t[1]=y[1]
563 veor @t[0], @t[0], @t[5] @ clobber t[0]
564 vext.8 @t[6], @t[6], @t[6], #12
565 veor @t[1], @t[1], @t[5]
566 veor @y[3], @y[3], @t[0]
567 veor @t[0], @t[0], @y[0] @ t[0]=y[0]
568 veor @t[1], @t[1], @t[6]
569 veor @t[6], @t[6], @t[7] @ clobber t[6]
570 veor @y[4], @y[4], @t[1]
571 veor @y[7], @y[7], @t[4]
572 veor @y[6], @y[6], @t[3]
573 veor @y[5], @y[5], @t[2]
574 veor @t[4], @t[4], @y[4] @ t[4]=y[4]
575 veor @t[3], @t[3], @y[3] @ t[3]=y[3]
576 veor @t[5], @t[5], @y[5] @ t[5]=y[5]
577 veor @t[2], @t[2], @y[2] @ t[2]=y[2]
578 veor @t[3], @t[3], @t[7]
579 veor @XMM[5], @t[5], @t[6]
580 veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
581 veor @XMM[2], @t[2], @t[6]
582 veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
583
584 vmov @XMM[0], @t[0]
585 vmov @XMM[1], @t[1]
586 @ vmov @XMM[2], @t[2]
587 vmov @XMM[3], @t[3]
588 vmov @XMM[4], @t[4]
589 @ vmov @XMM[5], @t[5]
590 @ vmov @XMM[6], @t[6]
591 @ vmov @XMM[7], @t[7]
592___
593}
594
595sub InvMixColumns {
596my @x=@_[0..7];
597my @t=@_[8..15];
598
599# Thanks to Jussi Kivilinna for providing pointer to
600#
601# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
602# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
603# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
604# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
605
606$code.=<<___;
607 @ multiplication by 0x05-0x00-0x04-0x00
608 vext.8 @t[0], @x[0], @x[0], #8
609 vext.8 @t[6], @x[6], @x[6], #8
610 vext.8 @t[7], @x[7], @x[7], #8
611 veor @t[0], @t[0], @x[0]
612 vext.8 @t[1], @x[1], @x[1], #8
613 veor @t[6], @t[6], @x[6]
614 vext.8 @t[2], @x[2], @x[2], #8
615 veor @t[7], @t[7], @x[7]
616 vext.8 @t[3], @x[3], @x[3], #8
617 veor @t[1], @t[1], @x[1]
618 vext.8 @t[4], @x[4], @x[4], #8
619 veor @t[2], @t[2], @x[2]
620 vext.8 @t[5], @x[5], @x[5], #8
621 veor @t[3], @t[3], @x[3]
622 veor @t[4], @t[4], @x[4]
623 veor @t[5], @t[5], @x[5]
624
625 veor @x[0], @x[0], @t[6]
626 veor @x[1], @x[1], @t[6]
627 veor @x[2], @x[2], @t[0]
628 veor @x[4], @x[4], @t[2]
629 veor @x[3], @x[3], @t[1]
630 veor @x[1], @x[1], @t[7]
631 veor @x[2], @x[2], @t[7]
632 veor @x[4], @x[4], @t[6]
633 veor @x[5], @x[5], @t[3]
634 veor @x[3], @x[3], @t[6]
635 veor @x[6], @x[6], @t[4]
636 veor @x[4], @x[4], @t[7]
637 veor @x[5], @x[5], @t[7]
638 veor @x[7], @x[7], @t[5]
639___
640 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
641}
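
# A self-check sketch for the matrix identity quoted at the top of
# InvMixColumns above (hypothetical helpers, not called by the generator):
# multiply entries of the two circulant matrices in GF(2^8) modulo the AES
# polynomial 0x11b.
sub xtime_ref {
	my $a=shift()<<1;
	return ($a&0x100) ? ($a^0x11b) : $a;
}
sub gmul_ref {
	my ($a,$b)=@_;
	my $r=0;
	for (my $i=0;$i<8;$i++) {
		$r^=$a if ($b>>$i)&1;	# accumulate a*x^i for every set bit of b
		$a=&xtime_ref($a);
	}
	return $r;
}
# e.g. row 0 times column 0:
#   gmul_ref(0x02,0x05)^gmul_ref(0x03,0x00)^gmul_ref(0x01,0x04)^gmul_ref(0x01,0x00)
# evaluates to 0x0e, the top-left entry of the 0e-0b-0d-09 matrix.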
642
643sub swapmove {
644my ($a,$b,$n,$mask,$t)=@_;
645$code.=<<___;
646 vshr.u64 $t, $b, #$n
647 veor $t, $t, $a
648 vand $t, $t, $mask
649 veor $a, $a, $t
650 vshl.u64 $t, $t, #$n
651 veor $b, $b, $t
652___
653}
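
# A scalar sketch of the delta swap above (hypothetical helper, not called by
# the generator): it exchanges the bits of $a selected by $mask with the bits
# of $b sitting $n positions higher; swapmove2x below and the bitslice()/
# bitslice_key() routines are built from this primitive.
sub swapmove_ref {
	my ($a,$b,$n,$mask)=@_;
	my $t=(($b>>$n)^$a)&$mask;
	return ($a^$t, $b^($t<<$n));
}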
654sub swapmove2x {
655my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
656$code.=<<___;
657 vshr.u64 $t0, $b0, #$n
658 vshr.u64 $t1, $b1, #$n
659 veor $t0, $t0, $a0
660 veor $t1, $t1, $a1
661 vand $t0, $t0, $mask
662 vand $t1, $t1, $mask
663 veor $a0, $a0, $t0
664 vshl.u64 $t0, $t0, #$n
665 veor $a1, $a1, $t1
666 vshl.u64 $t1, $t1, #$n
667 veor $b0, $b0, $t0
668 veor $b1, $b1, $t1
669___
670}
671
672sub bitslice {
673my @x=reverse(@_[0..7]);
674my ($t0,$t1,$t2,$t3)=@_[8..11];
675$code.=<<___;
676 vmov.i8 $t0,#0x55 @ compose .LBS0
677 vmov.i8 $t1,#0x33 @ compose .LBS1
678___
679 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
680 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
681$code.=<<___;
682 vmov.i8 $t0,#0x0f @ compose .LBS2
683___
684 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
685 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
686
687 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
688 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
689}
690
691$code.=<<___;
692#ifndef __KERNEL__
693# include "arm_arch.h"
694
695# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
696# define VFP_ABI_POP vldmia sp!,{d8-d15}
697# define VFP_ABI_FRAME 0x40
698#else
699# define VFP_ABI_PUSH
700# define VFP_ABI_POP
701# define VFP_ABI_FRAME 0
702# define BSAES_ASM_EXTENDED_KEY
703# define XTS_CHAIN_TWEAK
704# define __ARM_ARCH__ __LINUX_ARM_ARCH__
705#endif
706
707#ifdef __thumb__
708# define adrl adr
709#endif
710
711#if __ARM_ARCH__>=7
712.text
713.syntax unified @ ARMv7-capable assembler is expected to handle this
714#ifdef __thumb2__
715.thumb
716#else
717.code 32
718#endif
719
720.fpu neon
721
722.type _bsaes_decrypt8,%function
723.align 4
724_bsaes_decrypt8:
725 adr $const,_bsaes_decrypt8
726 vldmia $key!, {@XMM[9]} @ round 0 key
727 add $const,$const,#.LM0ISR-_bsaes_decrypt8
728
729 vldmia $const!, {@XMM[8]} @ .LM0ISR
730 veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
731 veor @XMM[11], @XMM[1], @XMM[9]
732 vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
733 vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
734 veor @XMM[12], @XMM[2], @XMM[9]
735 vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
736 vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
737 veor @XMM[13], @XMM[3], @XMM[9]
738 vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
739 vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
740 veor @XMM[14], @XMM[4], @XMM[9]
741 vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
742 vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
743 veor @XMM[15], @XMM[5], @XMM[9]
744 vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
745 vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
746 veor @XMM[10], @XMM[6], @XMM[9]
747 vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
748 vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
749 veor @XMM[11], @XMM[7], @XMM[9]
750 vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
751 vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
752 vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
753 vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
754___
755 &bitslice (@XMM[0..7, 8..11]);
756$code.=<<___;
757 sub $rounds,$rounds,#1
758 b .Ldec_sbox
759.align 4
760.Ldec_loop:
761___
762 &ShiftRows (@XMM[0..7, 8..12]);
763$code.=".Ldec_sbox:\n";
764 &InvSbox (@XMM[0..7, 8..15]);
765$code.=<<___;
766 subs $rounds,$rounds,#1
767 bcc .Ldec_done
768___
769 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
770$code.=<<___;
771 vldmia $const, {@XMM[12]} @ .LISR
772 ite eq @ Thumb2 thing, sanity check in ARM
773 addeq $const,$const,#0x10
774 bne .Ldec_loop
775 vldmia $const, {@XMM[12]} @ .LISRM0
776 b .Ldec_loop
777.align 4
778.Ldec_done:
779___
780 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
781$code.=<<___;
782 vldmia $key, {@XMM[8]} @ last round key
783 veor @XMM[6], @XMM[6], @XMM[8]
784 veor @XMM[4], @XMM[4], @XMM[8]
785 veor @XMM[2], @XMM[2], @XMM[8]
786 veor @XMM[7], @XMM[7], @XMM[8]
787 veor @XMM[3], @XMM[3], @XMM[8]
788 veor @XMM[5], @XMM[5], @XMM[8]
789 veor @XMM[0], @XMM[0], @XMM[8]
790 veor @XMM[1], @XMM[1], @XMM[8]
791 bx lr
792.size _bsaes_decrypt8,.-_bsaes_decrypt8
793
794.type _bsaes_const,%object
795.align 6
796_bsaes_const:
797.LM0ISR: @ InvShiftRows constants
798 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
799.LISR:
800 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
801.LISRM0:
802 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
803.LM0SR: @ ShiftRows constants
804 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
805.LSR:
806 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
807.LSRM0:
808 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
809.LM0:
810 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
811.LREVM0SR:
812 .quad 0x090d01050c000408, 0x03070b0f060a0e02
813.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
814.align 6
815.size _bsaes_const,.-_bsaes_const
816
817.type _bsaes_encrypt8,%function
818.align 4
819_bsaes_encrypt8:
820 adr $const,_bsaes_encrypt8
821 vldmia $key!, {@XMM[9]} @ round 0 key
822 sub $const,$const,#_bsaes_encrypt8-.LM0SR
823
824 vldmia $const!, {@XMM[8]} @ .LM0SR
825_bsaes_encrypt8_alt:
826 veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
827 veor @XMM[11], @XMM[1], @XMM[9]
828 vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
829 vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
830 veor @XMM[12], @XMM[2], @XMM[9]
831 vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
832 vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
833 veor @XMM[13], @XMM[3], @XMM[9]
834 vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
835 vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
836 veor @XMM[14], @XMM[4], @XMM[9]
837 vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
838 vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
839 veor @XMM[15], @XMM[5], @XMM[9]
840 vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
841 vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
842 veor @XMM[10], @XMM[6], @XMM[9]
843 vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
844 vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
845 veor @XMM[11], @XMM[7], @XMM[9]
846 vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
847 vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
848 vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
849 vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
850_bsaes_encrypt8_bitslice:
851___
852 &bitslice (@XMM[0..7, 8..11]);
853$code.=<<___;
854 sub $rounds,$rounds,#1
855 b .Lenc_sbox
856.align 4
857.Lenc_loop:
858___
859 &ShiftRows (@XMM[0..7, 8..12]);
860$code.=".Lenc_sbox:\n";
861 &Sbox (@XMM[0..7, 8..15]);
862$code.=<<___;
863 subs $rounds,$rounds,#1
864 bcc .Lenc_done
865___
866 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
867$code.=<<___;
868 vldmia $const, {@XMM[12]} @ .LSR
869	ite	eq			@ Thumb2 thing, sanity check in ARM
870 addeq $const,$const,#0x10
871 bne .Lenc_loop
872 vldmia $const, {@XMM[12]} @ .LSRM0
873 b .Lenc_loop
874.align 4
875.Lenc_done:
876___
877 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
878 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
879$code.=<<___;
880 vldmia $key, {@XMM[8]} @ last round key
881 veor @XMM[4], @XMM[4], @XMM[8]
882 veor @XMM[6], @XMM[6], @XMM[8]
883 veor @XMM[3], @XMM[3], @XMM[8]
884 veor @XMM[7], @XMM[7], @XMM[8]
885 veor @XMM[2], @XMM[2], @XMM[8]
886 veor @XMM[5], @XMM[5], @XMM[8]
887 veor @XMM[0], @XMM[0], @XMM[8]
888 veor @XMM[1], @XMM[1], @XMM[8]
889 bx lr
890.size _bsaes_encrypt8,.-_bsaes_encrypt8
891___
892}
893{
894my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
895
896sub bitslice_key {
897my @x=reverse(@_[0..7]);
898my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
899
900 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
901$code.=<<___;
902 @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
903 vmov @x[2], @x[0]
904 vmov @x[3], @x[1]
905___
906 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
907
908 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
909$code.=<<___;
910 @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
911 vmov @x[4], @x[0]
912 vmov @x[6], @x[2]
913 vmov @x[5], @x[1]
914 vmov @x[7], @x[3]
915___
916 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
917 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
918}
919
920$code.=<<___;
921.type _bsaes_key_convert,%function
922.align 4
923_bsaes_key_convert:
924 adr $const,_bsaes_key_convert
925 vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
926 sub $const,$const,#_bsaes_key_convert-.LM0
927 vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
928
929 vmov.i8 @XMM[8], #0x01 @ bit masks
930 vmov.i8 @XMM[9], #0x02
931 vmov.i8 @XMM[10], #0x04
932 vmov.i8 @XMM[11], #0x08
933 vmov.i8 @XMM[12], #0x10
934 vmov.i8 @XMM[13], #0x20
935 vldmia $const, {@XMM[14]} @ .LM0
936
937#ifdef __ARMEL__
938 vrev32.8 @XMM[7], @XMM[7]
939 vrev32.8 @XMM[15], @XMM[15]
940#endif
941 sub $rounds,$rounds,#1
942 vstmia $out!, {@XMM[7]} @ save round 0 key
943 b .Lkey_loop
944
945.align 4
946.Lkey_loop:
947 vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
948 vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
949 vmov.i8 @XMM[6], #0x40
950 vmov.i8 @XMM[15], #0x80
951
952 vtst.8 @XMM[0], @XMM[7], @XMM[8]
953 vtst.8 @XMM[1], @XMM[7], @XMM[9]
954 vtst.8 @XMM[2], @XMM[7], @XMM[10]
955 vtst.8 @XMM[3], @XMM[7], @XMM[11]
956 vtst.8 @XMM[4], @XMM[7], @XMM[12]
957 vtst.8 @XMM[5], @XMM[7], @XMM[13]
958 vtst.8 @XMM[6], @XMM[7], @XMM[6]
959 vtst.8 @XMM[7], @XMM[7], @XMM[15]
960 vld1.8 {@XMM[15]}, [$inp]! @ load next round key
961 vmvn @XMM[0], @XMM[0] @ "pnot"
962 vmvn @XMM[1], @XMM[1]
963 vmvn @XMM[5], @XMM[5]
964 vmvn @XMM[6], @XMM[6]
965#ifdef __ARMEL__
966 vrev32.8 @XMM[15], @XMM[15]
967#endif
968 subs $rounds,$rounds,#1
969 vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
970 bne .Lkey_loop
971
972 vmov.i8 @XMM[7],#0x63 @ compose .L63
973 @ don't save last round key
974 bx lr
975.size _bsaes_key_convert,.-_bsaes_key_convert
976___
977}
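
# Layout note on the schedule emitted by _bsaes_key_convert: round 0 is kept
# as a single 16-byte vector, while every inner round is written out as eight
# bit-sliced q registers (vstmia {@XMM[0]-@XMM[7]}), i.e. 128 bytes per round,
# which is why the CBC/CTR/XTS code below reserves "$rounds, lsl#7" bytes of
# stack for it. The last round key is deliberately not stored here; it is left
# in @XMM[15] (with the 0x63 constant in @XMM[7]) for each caller to fix up
# and store as it needs.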
978
979if (0) { # following four functions are unsupported interface
980 # used for benchmarking...
981$code.=<<___;
982.globl bsaes_enc_key_convert
983.type bsaes_enc_key_convert,%function
984.align 4
985bsaes_enc_key_convert:
986 stmdb sp!,{r4-r6,lr}
987 vstmdb sp!,{d8-d15} @ ABI specification says so
988
989 ldr r5,[$inp,#240] @ pass rounds
990 mov r4,$inp @ pass key
991 mov r12,$out @ pass key schedule
992 bl _bsaes_key_convert
993 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
994 vstmia r12, {@XMM[7]} @ save last round key
995
996 vldmia sp!,{d8-d15}
997 ldmia sp!,{r4-r6,pc}
998.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
999
1000.globl bsaes_encrypt_128
1001.type bsaes_encrypt_128,%function
1002.align 4
1003bsaes_encrypt_128:
1004 stmdb sp!,{r4-r6,lr}
1005 vstmdb sp!,{d8-d15} @ ABI specification says so
1006.Lenc128_loop:
1007 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1008 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1009 mov r4,$key @ pass the key
1010 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1011 mov r5,#10 @ pass rounds
1012 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1013
1014 bl _bsaes_encrypt8
1015
1016 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1017 vst1.8 {@XMM[4]}, [$out]!
1018 vst1.8 {@XMM[6]}, [$out]!
1019 vst1.8 {@XMM[3]}, [$out]!
1020 vst1.8 {@XMM[7]}, [$out]!
1021 vst1.8 {@XMM[2]}, [$out]!
1022 subs $len,$len,#0x80
1023 vst1.8 {@XMM[5]}, [$out]!
1024 bhi .Lenc128_loop
1025
1026 vldmia sp!,{d8-d15}
1027 ldmia sp!,{r4-r6,pc}
1028.size bsaes_encrypt_128,.-bsaes_encrypt_128
1029
1030.globl bsaes_dec_key_convert
1031.type bsaes_dec_key_convert,%function
1032.align 4
1033bsaes_dec_key_convert:
1034 stmdb sp!,{r4-r6,lr}
1035 vstmdb sp!,{d8-d15} @ ABI specification says so
1036
1037 ldr r5,[$inp,#240] @ pass rounds
1038 mov r4,$inp @ pass key
1039 mov r12,$out @ pass key schedule
1040 bl _bsaes_key_convert
1041 vldmia $out, {@XMM[6]}
1042 vstmia r12, {@XMM[15]} @ save last round key
1043 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1044 vstmia $out, {@XMM[7]}
1045
1046 vldmia sp!,{d8-d15}
1047 ldmia sp!,{r4-r6,pc}
1048.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1049
1050.globl bsaes_decrypt_128
1051.type bsaes_decrypt_128,%function
1052.align 4
1053bsaes_decrypt_128:
1054 stmdb sp!,{r4-r6,lr}
1055 vstmdb sp!,{d8-d15} @ ABI specification says so
1056.Ldec128_loop:
1057 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1058 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1059 mov r4,$key @ pass the key
1060 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1061 mov r5,#10 @ pass rounds
1062 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1063
1064 bl _bsaes_decrypt8
1065
1066 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1067 vst1.8 {@XMM[6]}, [$out]!
1068 vst1.8 {@XMM[4]}, [$out]!
1069 vst1.8 {@XMM[2]}, [$out]!
1070 vst1.8 {@XMM[7]}, [$out]!
1071 vst1.8 {@XMM[3]}, [$out]!
1072 subs $len,$len,#0x80
1073 vst1.8 {@XMM[5]}, [$out]!
1074 bhi .Ldec128_loop
1075
1076 vldmia sp!,{d8-d15}
1077 ldmia sp!,{r4-r6,pc}
1078.size bsaes_decrypt_128,.-bsaes_decrypt_128
1079___
1080}
1081{
1082my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
1083my ($keysched)=("sp");
1084
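######################################################################
# void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	size_t length, const AES_KEY *key, unsigned char ivec[16], int enc);
#
# (prototype assumed here, mirroring the bsaes-x86_64 interface: ivec is the
# fifth argument, picked up from the stack below, and only the enc==0 i.e.
# decryption path is implemented, as the comment in the body notes)
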
1085$code.=<<___;
1086.extern AES_cbc_encrypt
1087.extern AES_decrypt
1088
1089.global bsaes_cbc_encrypt
1090.type bsaes_cbc_encrypt,%function
1091.align 5
1092bsaes_cbc_encrypt:
1093#ifndef __KERNEL__
1094 cmp $len, #128
1095#ifndef __thumb__
1096 blo AES_cbc_encrypt
1097#else
1098 bhs 1f
1099 b AES_cbc_encrypt
11001:
1101#endif
1102#endif
1103
1104 @ it is up to the caller to make sure we are called with enc == 0
1105
1106 mov ip, sp
1107 stmdb sp!, {r4-r10, lr}
1108 VFP_ABI_PUSH
1109 ldr $ivp, [ip] @ IV is 1st arg on the stack
1110 mov $len, $len, lsr#4 @ len in 16 byte blocks
1111 sub sp, #0x10 @ scratch space to carry over the IV
1112 mov $fp, sp @ save sp
1113
1114 ldr $rounds, [$key, #240] @ get # of rounds
1115#ifndef BSAES_ASM_EXTENDED_KEY
1116 @ allocate the key schedule on the stack
1117 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1118	add	r12, #`128-32`		@ size of bit-sliced key schedule
1119
1120 @ populate the key schedule
1121 mov r4, $key @ pass key
1122 mov r5, $rounds @ pass # of rounds
1123 mov sp, r12 @ sp is $keysched
1124 bl _bsaes_key_convert
1125 vldmia $keysched, {@XMM[6]}
1126 vstmia r12, {@XMM[15]} @ save last round key
1127 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1128 vstmia $keysched, {@XMM[7]}
1129#else
1130 ldr r12, [$key, #244]
1131 eors r12, #1
1132 beq 0f
1133
1134 @ populate the key schedule
1135 str r12, [$key, #244]
1136 mov r4, $key @ pass key
1137 mov r5, $rounds @ pass # of rounds
1138 add r12, $key, #248 @ pass key schedule
1139 bl _bsaes_key_convert
1140 add r4, $key, #248
1141 vldmia r4, {@XMM[6]}
1142 vstmia r12, {@XMM[15]} @ save last round key
1143 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1144 vstmia r4, {@XMM[7]}
1145
1146.align 2
11470:
1148#endif
1149
1150 vld1.8 {@XMM[15]}, [$ivp] @ load IV
1151 b .Lcbc_dec_loop
1152
1153.align 4
1154.Lcbc_dec_loop:
1155 subs $len, $len, #0x8
1156 bmi .Lcbc_dec_loop_finish
1157
1158 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1159 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1160#ifndef BSAES_ASM_EXTENDED_KEY
1161 mov r4, $keysched @ pass the key
1162#else
1163 add r4, $key, #248
1164#endif
1165 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1166 mov r5, $rounds
1167 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
1168 sub $inp, $inp, #0x60
1169 vstmia $fp, {@XMM[15]} @ put aside IV
1170
1171 bl _bsaes_decrypt8
1172
1173 vldmia $fp, {@XMM[14]} @ reload IV
1174 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1175 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1176 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1177 veor @XMM[1], @XMM[1], @XMM[8]
1178 veor @XMM[6], @XMM[6], @XMM[9]
1179 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1180 veor @XMM[4], @XMM[4], @XMM[10]
1181 veor @XMM[2], @XMM[2], @XMM[11]
1182 vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1183 veor @XMM[7], @XMM[7], @XMM[12]
1184 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1185 veor @XMM[3], @XMM[3], @XMM[13]
1186 vst1.8 {@XMM[6]}, [$out]!
1187 veor @XMM[5], @XMM[5], @XMM[14]
1188 vst1.8 {@XMM[4]}, [$out]!
1189 vst1.8 {@XMM[2]}, [$out]!
1190 vst1.8 {@XMM[7]}, [$out]!
1191 vst1.8 {@XMM[3]}, [$out]!
1192 vst1.8 {@XMM[5]}, [$out]!
1193
1194 b .Lcbc_dec_loop
1195
1196.Lcbc_dec_loop_finish:
1197 adds $len, $len, #8
1198 beq .Lcbc_dec_done
1199
1200 vld1.8 {@XMM[0]}, [$inp]! @ load input
1201 cmp $len, #2
1202 blo .Lcbc_dec_one
1203 vld1.8 {@XMM[1]}, [$inp]!
1204#ifndef BSAES_ASM_EXTENDED_KEY
1205 mov r4, $keysched @ pass the key
1206#else
1207 add r4, $key, #248
1208#endif
1209 mov r5, $rounds
1210 vstmia $fp, {@XMM[15]} @ put aside IV
1211 beq .Lcbc_dec_two
1212 vld1.8 {@XMM[2]}, [$inp]!
1213 cmp $len, #4
1214 blo .Lcbc_dec_three
1215 vld1.8 {@XMM[3]}, [$inp]!
1216 beq .Lcbc_dec_four
1217 vld1.8 {@XMM[4]}, [$inp]!
1218 cmp $len, #6
1219 blo .Lcbc_dec_five
1220 vld1.8 {@XMM[5]}, [$inp]!
1221 beq .Lcbc_dec_six
1222 vld1.8 {@XMM[6]}, [$inp]!
1223 sub $inp, $inp, #0x70
1224
1225 bl _bsaes_decrypt8
1226
1227 vldmia $fp, {@XMM[14]} @ reload IV
1228 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1229 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1230 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1231 veor @XMM[1], @XMM[1], @XMM[8]
1232 veor @XMM[6], @XMM[6], @XMM[9]
1233 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1234 veor @XMM[4], @XMM[4], @XMM[10]
1235 veor @XMM[2], @XMM[2], @XMM[11]
1236 vld1.8 {@XMM[15]}, [$inp]!
1237 veor @XMM[7], @XMM[7], @XMM[12]
1238 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1239 veor @XMM[3], @XMM[3], @XMM[13]
1240 vst1.8 {@XMM[6]}, [$out]!
1241 vst1.8 {@XMM[4]}, [$out]!
1242 vst1.8 {@XMM[2]}, [$out]!
1243 vst1.8 {@XMM[7]}, [$out]!
1244 vst1.8 {@XMM[3]}, [$out]!
1245 b .Lcbc_dec_done
1246.align 4
1247.Lcbc_dec_six:
1248 sub $inp, $inp, #0x60
1249 bl _bsaes_decrypt8
1250 vldmia $fp,{@XMM[14]} @ reload IV
1251 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1252 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1253 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1254 veor @XMM[1], @XMM[1], @XMM[8]
1255 veor @XMM[6], @XMM[6], @XMM[9]
1256 vld1.8 {@XMM[12]}, [$inp]!
1257 veor @XMM[4], @XMM[4], @XMM[10]
1258 veor @XMM[2], @XMM[2], @XMM[11]
1259 vld1.8 {@XMM[15]}, [$inp]!
1260 veor @XMM[7], @XMM[7], @XMM[12]
1261 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1262 vst1.8 {@XMM[6]}, [$out]!
1263 vst1.8 {@XMM[4]}, [$out]!
1264 vst1.8 {@XMM[2]}, [$out]!
1265 vst1.8 {@XMM[7]}, [$out]!
1266 b .Lcbc_dec_done
1267.align 4
1268.Lcbc_dec_five:
1269 sub $inp, $inp, #0x50
1270 bl _bsaes_decrypt8
1271 vldmia $fp, {@XMM[14]} @ reload IV
1272 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1273 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1274 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1275 veor @XMM[1], @XMM[1], @XMM[8]
1276 veor @XMM[6], @XMM[6], @XMM[9]
1277 vld1.8 {@XMM[15]}, [$inp]!
1278 veor @XMM[4], @XMM[4], @XMM[10]
1279 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1280 veor @XMM[2], @XMM[2], @XMM[11]
1281 vst1.8 {@XMM[6]}, [$out]!
1282 vst1.8 {@XMM[4]}, [$out]!
1283 vst1.8 {@XMM[2]}, [$out]!
1284 b .Lcbc_dec_done
1285.align 4
1286.Lcbc_dec_four:
1287 sub $inp, $inp, #0x40
1288 bl _bsaes_decrypt8
1289 vldmia $fp, {@XMM[14]} @ reload IV
1290 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1291 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1292 vld1.8 {@XMM[10]}, [$inp]!
1293 veor @XMM[1], @XMM[1], @XMM[8]
1294 veor @XMM[6], @XMM[6], @XMM[9]
1295 vld1.8 {@XMM[15]}, [$inp]!
1296 veor @XMM[4], @XMM[4], @XMM[10]
1297 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1298 vst1.8 {@XMM[6]}, [$out]!
1299 vst1.8 {@XMM[4]}, [$out]!
1300 b .Lcbc_dec_done
1301.align 4
1302.Lcbc_dec_three:
1303 sub $inp, $inp, #0x30
1304 bl _bsaes_decrypt8
1305 vldmia $fp, {@XMM[14]} @ reload IV
1306 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1307 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1308 vld1.8 {@XMM[15]}, [$inp]!
1309 veor @XMM[1], @XMM[1], @XMM[8]
1310 veor @XMM[6], @XMM[6], @XMM[9]
1311 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1312 vst1.8 {@XMM[6]}, [$out]!
1313 b .Lcbc_dec_done
1314.align 4
1315.Lcbc_dec_two:
1316 sub $inp, $inp, #0x20
1317 bl _bsaes_decrypt8
1318 vldmia $fp, {@XMM[14]} @ reload IV
1319 vld1.8 {@XMM[8]}, [$inp]! @ reload input
1320 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1321 vld1.8 {@XMM[15]}, [$inp]! @ reload input
1322 veor @XMM[1], @XMM[1], @XMM[8]
1323 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1324 b .Lcbc_dec_done
1325.align 4
1326.Lcbc_dec_one:
1327 sub $inp, $inp, #0x10
1328 mov $rounds, $out @ save original out pointer
1329 mov $out, $fp @ use the iv scratch space as out buffer
1330 mov r2, $key
1331 vmov @XMM[4],@XMM[15] @ just in case ensure that IV
1332 vmov @XMM[5],@XMM[0] @ and input are preserved
1333 bl AES_decrypt
1334 vld1.8 {@XMM[0]}, [$fp,:64] @ load result
1335 veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
1336 vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
1337 vst1.8 {@XMM[0]}, [$rounds] @ write output
1338
1339.Lcbc_dec_done:
1340#ifndef BSAES_ASM_EXTENDED_KEY
1341 vmov.i32 q0, #0
1342 vmov.i32 q1, #0
1343.Lcbc_dec_bzero: @ wipe key schedule [if any]
1344 vstmia $keysched!, {q0-q1}
1345 cmp $keysched, $fp
1346 bne .Lcbc_dec_bzero
1347#endif
1348
1349 mov sp, $fp
1350 add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1351 vst1.8 {@XMM[15]}, [$ivp] @ return IV
1352 VFP_ABI_POP
1353 ldmia sp!, {r4-r10, pc}
1354.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1355___
1356}
1357{
1358my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
1359my $const = "r6"; # shared with _bsaes_encrypt8_alt
1360my $keysched = "sp";
1361
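######################################################################
# void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
#	size_t len, const AES_KEY *key, const unsigned char ivec[16]);
#
# (prototype assumed here, mirroring the bsaes-x86_64 interface: len counts
# 16-byte blocks, ivec is the big-endian initial counter block and only its
# low 32 bits are incremented per block)
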
1362$code.=<<___;
1363.extern AES_encrypt
1364.global bsaes_ctr32_encrypt_blocks
1365.type bsaes_ctr32_encrypt_blocks,%function
1366.align 5
1367bsaes_ctr32_encrypt_blocks:
1368 cmp $len, #8 @ use plain AES for
1369 blo .Lctr_enc_short @ small sizes
1370
1371 mov ip, sp
1372 stmdb sp!, {r4-r10, lr}
1373 VFP_ABI_PUSH
1374 ldr $ctr, [ip] @ ctr is 1st arg on the stack
1375 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1376 mov $fp, sp @ save sp
1377
1378 ldr $rounds, [$key, #240] @ get # of rounds
1379#ifndef BSAES_ASM_EXTENDED_KEY
1380 @ allocate the key schedule on the stack
1381 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1382 add r12, #`128-32` @ size of bit-sliced key schedule
1383
1384 @ populate the key schedule
1385 mov r4, $key @ pass key
1386 mov r5, $rounds @ pass # of rounds
1387 mov sp, r12 @ sp is $keysched
1388 bl _bsaes_key_convert
1389 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1390 vstmia r12, {@XMM[7]} @ save last round key
1391
1392 vld1.8 {@XMM[0]}, [$ctr] @ load counter
1393 add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
1394 vldmia $keysched, {@XMM[4]} @ load round0 key
1395#else
1396 ldr r12, [$key, #244]
1397 eors r12, #1
1398 beq 0f
1399
1400 @ populate the key schedule
1401 str r12, [$key, #244]
1402 mov r4, $key @ pass key
1403 mov r5, $rounds @ pass # of rounds
1404 add r12, $key, #248 @ pass key schedule
1405 bl _bsaes_key_convert
1406 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1407 vstmia r12, {@XMM[7]} @ save last round key
1408
1409.align 2
14100: add r12, $key, #248
1411 vld1.8 {@XMM[0]}, [$ctr] @ load counter
1412 adrl $ctr, .LREVM0SR @ borrow $ctr
1413 vldmia r12, {@XMM[4]} @ load round0 key
1414 sub sp, #0x10 @ place for adjusted round0 key
1415#endif
1416
1417 vmov.i32 @XMM[8],#1 @ compose 1<<96
1418 veor @XMM[9],@XMM[9],@XMM[9]
1419 vrev32.8 @XMM[0],@XMM[0]
1420 vext.8 @XMM[8],@XMM[9],@XMM[8],#4
1421 vrev32.8 @XMM[4],@XMM[4]
1422 vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1423 vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
1424 b .Lctr_enc_loop
1425
1426.align 4
1427.Lctr_enc_loop:
1428 vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
1429 vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
1430 vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
1431 vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
1432 vadd.u32 @XMM[4], @XMM[1], @XMM[10]
1433 vadd.u32 @XMM[5], @XMM[2], @XMM[10]
1434 vadd.u32 @XMM[6], @XMM[3], @XMM[10]
1435 vadd.u32 @XMM[7], @XMM[4], @XMM[10]
1436 vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
1437
1438 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1439 @ to flip byte order in 32-bit counter
1440
1441 vldmia $keysched, {@XMM[9]} @ load round0 key
1442#ifndef BSAES_ASM_EXTENDED_KEY
1443 add r4, $keysched, #0x10 @ pass next round key
1444#else
1445 add r4, $key, #`248+16`
1446#endif
1447 vldmia $ctr, {@XMM[8]} @ .LREVM0SR
1448 mov r5, $rounds @ pass rounds
1449 vstmia $fp, {@XMM[10]} @ save next counter
1450 sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
1451
1452 bl _bsaes_encrypt8_alt
1453
1454 subs $len, $len, #8
1455 blo .Lctr_enc_loop_done
1456
1457 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
1458 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1459 veor @XMM[0], @XMM[8]
1460 veor @XMM[1], @XMM[9]
1461 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1462 veor @XMM[4], @XMM[10]
1463 veor @XMM[6], @XMM[11]
1464 vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1465 veor @XMM[3], @XMM[12]
1466 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1467 veor @XMM[7], @XMM[13]
1468 veor @XMM[2], @XMM[14]
1469 vst1.8 {@XMM[4]}, [$out]!
1470 veor @XMM[5], @XMM[15]
1471 vst1.8 {@XMM[6]}, [$out]!
1472 vmov.i32 @XMM[8], #1 @ compose 1<<96
1473 vst1.8 {@XMM[3]}, [$out]!
1474 veor @XMM[9], @XMM[9], @XMM[9]
1475 vst1.8 {@XMM[7]}, [$out]!
1476 vext.8 @XMM[8], @XMM[9], @XMM[8], #4
1477 vst1.8 {@XMM[2]}, [$out]!
1478 vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1479 vst1.8 {@XMM[5]}, [$out]!
1480 vldmia $fp, {@XMM[0]} @ load counter
1481
1482 bne .Lctr_enc_loop
1483 b .Lctr_enc_done
1484
1485.align 4
1486.Lctr_enc_loop_done:
1487 add $len, $len, #8
1488 vld1.8 {@XMM[8]}, [$inp]! @ load input
1489 veor @XMM[0], @XMM[8]
1490 vst1.8 {@XMM[0]}, [$out]! @ write output
1491 cmp $len, #2
1492 blo .Lctr_enc_done
1493 vld1.8 {@XMM[9]}, [$inp]!
1494 veor @XMM[1], @XMM[9]
1495 vst1.8 {@XMM[1]}, [$out]!
1496 beq .Lctr_enc_done
1497 vld1.8 {@XMM[10]}, [$inp]!
1498 veor @XMM[4], @XMM[10]
1499 vst1.8 {@XMM[4]}, [$out]!
1500 cmp $len, #4
1501 blo .Lctr_enc_done
1502 vld1.8 {@XMM[11]}, [$inp]!
1503 veor @XMM[6], @XMM[11]
1504 vst1.8 {@XMM[6]}, [$out]!
1505 beq .Lctr_enc_done
1506 vld1.8 {@XMM[12]}, [$inp]!
1507 veor @XMM[3], @XMM[12]
1508 vst1.8 {@XMM[3]}, [$out]!
1509 cmp $len, #6
1510 blo .Lctr_enc_done
1511 vld1.8 {@XMM[13]}, [$inp]!
1512 veor @XMM[7], @XMM[13]
1513 vst1.8 {@XMM[7]}, [$out]!
1514 beq .Lctr_enc_done
1515 vld1.8 {@XMM[14]}, [$inp]
1516 veor @XMM[2], @XMM[14]
1517 vst1.8 {@XMM[2]}, [$out]!
1518
1519.Lctr_enc_done:
1520 vmov.i32 q0, #0
1521 vmov.i32 q1, #0
1522#ifndef BSAES_ASM_EXTENDED_KEY
1523.Lctr_enc_bzero: @ wipe key schedule [if any]
1524 vstmia $keysched!, {q0-q1}
1525 cmp $keysched, $fp
1526 bne .Lctr_enc_bzero
1527#else
1528 vstmia $keysched, {q0-q1}
1529#endif
1530
1531 mov sp, $fp
1532 add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1533 VFP_ABI_POP
1534 ldmia sp!, {r4-r10, pc} @ return
1535
1536.align 4
1537.Lctr_enc_short:
1538 ldr ip, [sp] @ ctr pointer is passed on stack
1539 stmdb sp!, {r4-r8, lr}
1540
1541 mov r4, $inp @ copy arguments
1542 mov r5, $out
1543 mov r6, $len
1544 mov r7, $key
1545 ldr r8, [ip, #12] @ load counter LSW
1546 vld1.8 {@XMM[1]}, [ip] @ load whole counter value
1547#ifdef __ARMEL__
1548 rev r8, r8
1549#endif
1550 sub sp, sp, #0x10
1551 vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
1552 sub sp, sp, #0x10
1553
1554.Lctr_enc_short_loop:
1555 add r0, sp, #0x10 @ input counter value
1556 mov r1, sp @ output on the stack
1557 mov r2, r7 @ key
1558
1559 bl AES_encrypt
1560
1561 vld1.8 {@XMM[0]}, [r4]! @ load input
1562 vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
1563 add r8, r8, #1
1564#ifdef __ARMEL__
1565 rev r0, r8
1566 str r0, [sp, #0x1c] @ next counter value
1567#else
1568 str r8, [sp, #0x1c] @ next counter value
1569#endif
1570 veor @XMM[0],@XMM[0],@XMM[1]
1571 vst1.8 {@XMM[0]}, [r5]! @ store output
1572 subs r6, r6, #1
1573 bne .Lctr_enc_short_loop
1574
1575 vmov.i32 q0, #0
1576 vmov.i32 q1, #0
1577 vstmia sp!, {q0-q1}
1578
1579 ldmia sp!, {r4-r8, pc}
1580.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1581___
1582}
1583{
1584######################################################################
1585# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1586# const AES_KEY *key1, const AES_KEY *key2,
1587# const unsigned char iv[16]);
1588#
1589my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
1590my $const="r6"; # returned by _bsaes_key_convert
1591my $twmask=@XMM[5];
1592my @T=@XMM[6..7];
1593
1594$code.=<<___;
1595.globl bsaes_xts_encrypt
1596.type bsaes_xts_encrypt,%function
1597.align 4
1598bsaes_xts_encrypt:
1599 mov ip, sp
1600 stmdb sp!, {r4-r10, lr} @ 0x20
1601 VFP_ABI_PUSH
1602 mov r6, sp @ future $fp
1603
1604 mov $inp, r0
1605 mov $out, r1
1606 mov $len, r2
1607 mov $key, r3
1608
1609 sub r0, sp, #0x10 @ 0x10
1610 bic r0, #0xf @ align at 16 bytes
1611 mov sp, r0
1612
1613#ifdef XTS_CHAIN_TWEAK
1614 ldr r0, [ip] @ pointer to input tweak
1615#else
1616 @ generate initial tweak
1617 ldr r0, [ip, #4] @ iv[]
1618 mov r1, sp
1619 ldr r2, [ip, #0] @ key2
1620 bl AES_encrypt
1621 mov r0,sp @ pointer to initial tweak
1622#endif
1623
1624 ldr $rounds, [$key, #240] @ get # of rounds
1625 mov $fp, r6
1626#ifndef BSAES_ASM_EXTENDED_KEY
1627 @ allocate the key schedule on the stack
1628 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1629 @ add r12, #`128-32` @ size of bit-sliced key schedule
1630 sub r12, #`32+16` @ place for tweak[9]
1631
1632 @ populate the key schedule
1633 mov r4, $key @ pass key
1634 mov r5, $rounds @ pass # of rounds
1635 mov sp, r12
1636 add r12, #0x90 @ pass key schedule
1637 bl _bsaes_key_convert
1638 veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1639 vstmia r12, {@XMM[7]} @ save last round key
1640#else
1641 ldr r12, [$key, #244]
1642 eors r12, #1
1643 beq 0f
1644
1645 str r12, [$key, #244]
1646 mov r4, $key @ pass key
1647 mov r5, $rounds @ pass # of rounds
1648 add r12, $key, #248 @ pass key schedule
1649 bl _bsaes_key_convert
1650 veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1651 vstmia r12, {@XMM[7]}
1652
1653.align 2
16540: sub sp, #0x90 @ place for tweak[9]
1655#endif
1656
1657 vld1.8 {@XMM[8]}, [r0] @ initial tweak
1658 adr $magic, .Lxts_magic
1659
1660 subs $len, #0x80
1661 blo .Lxts_enc_short
1662 b .Lxts_enc_loop
1663
1664.align 4
1665.Lxts_enc_loop:
1666 vldmia $magic, {$twmask} @ load XTS magic
1667 vshr.s64 @T[0], @XMM[8], #63
1668 mov r0, sp
1669 vand @T[0], @T[0], $twmask
1670___
1671for($i=9;$i<16;$i++) {
1672$code.=<<___;
1673 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1674 vst1.64 {@XMM[$i-1]}, [r0,:128]!
1675 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1676 vshr.s64 @T[1], @XMM[$i], #63
1677 veor @XMM[$i], @XMM[$i], @T[0]
1678 vand @T[1], @T[1], $twmask
1679___
1680 @T=reverse(@T);
1681
1682$code.=<<___ if ($i>=10);
1683 vld1.8 {@XMM[$i-10]}, [$inp]!
1684___
1685$code.=<<___ if ($i>=11);
1686 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1687___
1688}
1689$code.=<<___;
1690 vadd.u64 @XMM[8], @XMM[15], @XMM[15]
1691 vst1.64 {@XMM[15]}, [r0,:128]!
1692 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1693 veor @XMM[8], @XMM[8], @T[0]
1694 vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1695
1696 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1697 veor @XMM[5], @XMM[5], @XMM[13]
1698#ifndef BSAES_ASM_EXTENDED_KEY
1699 add r4, sp, #0x90 @ pass key schedule
1700#else
1701 add r4, $key, #248 @ pass key schedule
1702#endif
1703 veor @XMM[6], @XMM[6], @XMM[14]
1704 mov r5, $rounds @ pass rounds
1705 veor @XMM[7], @XMM[7], @XMM[15]
1706 mov r0, sp
1707
1708 bl _bsaes_encrypt8
1709
1710 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1711 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1712 veor @XMM[0], @XMM[0], @XMM[ 8]
1713 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1714 veor @XMM[1], @XMM[1], @XMM[ 9]
1715 veor @XMM[8], @XMM[4], @XMM[10]
1716 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1717 veor @XMM[9], @XMM[6], @XMM[11]
1718 vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
1719 veor @XMM[10], @XMM[3], @XMM[12]
1720 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1721 veor @XMM[11], @XMM[7], @XMM[13]
1722 veor @XMM[12], @XMM[2], @XMM[14]
1723 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1724 veor @XMM[13], @XMM[5], @XMM[15]
1725 vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
1726
1727 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1728
1729 subs $len, #0x80
1730 bpl .Lxts_enc_loop
1731
1732.Lxts_enc_short:
1733 adds $len, #0x70
1734 bmi .Lxts_enc_done
1735
1736 vldmia $magic, {$twmask} @ load XTS magic
1737 vshr.s64 @T[0], @XMM[8], #63
1738 mov r0, sp
1739 vand @T[0], @T[0], $twmask
1740___
1741for($i=9;$i<16;$i++) {
1742$code.=<<___;
1743 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1744 vst1.64 {@XMM[$i-1]}, [r0,:128]!
1745 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1746 vshr.s64 @T[1], @XMM[$i], #63
1747 veor @XMM[$i], @XMM[$i], @T[0]
1748 vand @T[1], @T[1], $twmask
1749___
1750 @T=reverse(@T);
1751
1752$code.=<<___ if ($i>=10);
1753 vld1.8 {@XMM[$i-10]}, [$inp]!
1754 subs $len, #0x10
1755 bmi .Lxts_enc_`$i-9`
1756___
1757$code.=<<___ if ($i>=11);
1758 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1759___
1760}
$code.=<<___;
	sub	$len, #0x10
	vst1.64	{@XMM[15]}, [r0,:128]		@ next round tweak

	vld1.8	{@XMM[6]}, [$inp]!
	veor	@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_encrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[4], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[6], @XMM[11]
	vld1.64	{@XMM[14]}, [r0,:128]!
	veor	@XMM[10], @XMM[3], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	veor	@XMM[11], @XMM[7], @XMM[13]
	veor	@XMM[12], @XMM[2], @XMM[14]
	vst1.8	{@XMM[10]-@XMM[11]}, [$out]!
	vst1.8	{@XMM[12]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_6:
	vst1.64	{@XMM[14]}, [r0,:128]		@ next round tweak

	veor	@XMM[4], @XMM[4], @XMM[12]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[5], @XMM[5], @XMM[13]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_encrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[4], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[6], @XMM[11]
	veor	@XMM[10], @XMM[3], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	veor	@XMM[11], @XMM[7], @XMM[13]
	vst1.8	{@XMM[10]-@XMM[11]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done

@ put this in range for both ARM and Thumb mode adr instructions
.align	5
.Lxts_magic:
	.quad	1, 0x87
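	@ one step of the tweak schedule multiplies the 128-bit tweak by x
	@ in GF(2^128): shift left by one and, when a carry falls out of
	@ bit 127, fold it back in by XORing 0x87 (x^7+x^2+x+1) into the
	@ low byte; the vadd/vshr/vand/vswp/veor sequences above implement
	@ exactly that with this constant.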

.align	5
.Lxts_enc_5:
	vst1.64	{@XMM[13]}, [r0,:128]		@ next round tweak

	veor	@XMM[3], @XMM[3], @XMM[11]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[4], @XMM[4], @XMM[12]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_encrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[4], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[6], @XMM[11]
	veor	@XMM[10], @XMM[3], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	vst1.8	{@XMM[10]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_4:
	vst1.64	{@XMM[12]}, [r0,:128]		@ next round tweak

	veor	@XMM[2], @XMM[2], @XMM[10]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[3], @XMM[3], @XMM[11]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_encrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[4], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[6], @XMM[11]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_3:
	vst1.64	{@XMM[11]}, [r0,:128]		@ next round tweak

	veor	@XMM[1], @XMM[1], @XMM[9]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[2], @XMM[2], @XMM[10]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_encrypt8

	vld1.64	{@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64	{@XMM[10]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[4], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	vst1.8	{@XMM[8]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_2:
	vst1.64	{@XMM[10]}, [r0,:128]		@ next round tweak

	veor	@XMM[0], @XMM[0], @XMM[8]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[1], @XMM[1], @XMM[9]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_encrypt8

	vld1.64	{@XMM[8]-@XMM[9]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_enc_done
.align	4
.Lxts_enc_1:
	mov	r0, sp
	veor	@XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp

	bl	AES_encrypt

	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [$out]!
	mov	$fp, r4

	vmov	@XMM[8], @XMM[9]		@ next round tweak

.Lxts_enc_done:
#ifndef	XTS_CHAIN_TWEAK
	adds	$len, #0x10
	beq	.Lxts_enc_ret
	sub	r6, $out, #0x10

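	@ ciphertext stealing: swap the trailing partial plaintext with the
	@ leading bytes of the last full ciphertext block (at r6), then
	@ re-encrypt that block with the final tweak in q8; the displaced
	@ ciphertext bytes become the short final block.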
.Lxts_enc_steal:
	ldrb	r0, [$inp], #1
	ldrb	r1, [$out, #-0x10]
	strb	r0, [$out, #-0x10]
	strb	r1, [$out], #1

	subs	$len, #1
	bhi	.Lxts_enc_steal

	vld1.8	{@XMM[0]}, [r6]
	mov	r0, sp
	veor	@XMM[0], @XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp

	bl	AES_encrypt

	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [r6]
	mov	$fp, r4
#endif

.Lxts_enc_ret:
	bic	r0, $fp, #0xf
	vmov.i32	q0, #0
	vmov.i32	q1, #0
#ifdef	XTS_CHAIN_TWEAK
	ldr	r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
#endif
.Lxts_enc_bzero:				@ wipe key schedule [if any]
	vstmia	sp!, {q0-q1}
	cmp	sp, r0
	bne	.Lxts_enc_bzero

	mov	sp, $fp
#ifdef	XTS_CHAIN_TWEAK
	vst1.8	{@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia	sp!, {r4-r10, pc}	@ return

.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,%function
.align	4
bsaes_xts_decrypt:
	mov	ip, sp
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	VFP_ABI_PUSH
	mov	r6, sp				@ future $fp

	mov	$inp, r0
	mov	$out, r1
	mov	$len, r2
	mov	$key, r3

	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes
	mov	sp, r0

#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
#else
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	mov	r1, sp
	ldr	r2, [ip, #0]			@ key2
	bl	AES_encrypt
	mov	r0, sp				@ pointer to initial tweak
#endif

	ldr	$rounds, [$key, #240]		@ get # of rounds
	mov	$fp, r6
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
	sub	r12, #`32+16`			@ place for tweak[9]

	@ populate the key schedule
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	mov	sp, r12
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	add	r4, sp, #0x90
	vldmia	r4, {@XMM[6]}
	vstmia	r12, {@XMM[15]}		@ save last round key
	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
	vstmia	r4, {@XMM[7]}
#else
	ldr	r12, [$key, #244]
	eors	r12, #1
	beq	0f

	str	r12, [$key, #244]
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	add	r12, $key, #248			@ pass key schedule
	bl	_bsaes_key_convert
	add	r4, $key, #248
	vldmia	r4, {@XMM[6]}
	vstmia	r12, {@XMM[15]}		@ save last round key
	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
	vstmia	r4, {@XMM[7]}

.align	2
0:	sub	sp, #0x90			@ place for tweak[9]
#endif
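	@ either way, sp now points at 0x90 bytes of scratch used to stash
	@ the nine tweak values for one pass, with the bit-sliced key
	@ schedule sitting above it (or inside the key structure when
	@ BSAES_ASM_EXTENDED_KEY is defined).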
	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
	adr	$magic, .Lxts_magic

	tst	$len, #0xf			@ if not multiple of 16
	it	ne				@ Thumb2 thing, sanity check in ARM
	subne	$len, #0x10			@ subtract another 16 bytes
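	@ with a partial trailing block, the last full block is kept out of
	@ the bulk loops so .Lxts_dec_done can apply ciphertext stealing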
	subs	$len, #0x80

	blo	.Lxts_dec_short
	b	.Lxts_dec_loop

.align	4
.Lxts_dec_loop:
	vldmia	$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov	r0, sp
	vand	@T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64	{@XMM[$i-1]}, [r0,:128]!
	vswp	`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor	@XMM[$i], @XMM[$i], @T[0]
	vand	@T[1], @T[1], $twmask
___
	@T=reverse(@T);

$code.=<<___ if ($i>=10);
	vld1.8	{@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor	@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
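# The decryption side mirrors the encryption loop above: same tweak
# schedule and stack layout, but the blocks go through _bsaes_decrypt8
# and come back in a different bit-sliced register order, hence the
# different veor pairing below.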
$code.=<<___;
	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
	vst1.64	{@XMM[15]}, [r0,:128]!
	vswp	`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor	@XMM[8], @XMM[8], @T[0]
	vst1.64	{@XMM[8]}, [r0,:128]		@ next round tweak

	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
	veor	@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	veor	@XMM[7], @XMM[7], @XMM[15]
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[6], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[4], @XMM[11]
	vld1.64	{@XMM[14]-@XMM[15]}, [r0,:128]!
	veor	@XMM[10], @XMM[2], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	veor	@XMM[11], @XMM[7], @XMM[13]
	veor	@XMM[12], @XMM[3], @XMM[14]
	vst1.8	{@XMM[10]-@XMM[11]}, [$out]!
	veor	@XMM[13], @XMM[5], @XMM[15]
	vst1.8	{@XMM[12]-@XMM[13]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak

	subs	$len, #0x80
	bpl	.Lxts_dec_loop

.Lxts_dec_short:
	adds	$len, #0x70
	bmi	.Lxts_dec_done

	vldmia	$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov	r0, sp
	vand	@T[0], @T[0], $twmask
___
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64	{@XMM[$i-1]}, [r0,:128]!
	vswp	`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor	@XMM[$i], @XMM[$i], @T[0]
	vand	@T[1], @T[1], $twmask
___
	@T=reverse(@T);

$code.=<<___ if ($i>=10);
	vld1.8	{@XMM[$i-10]}, [$inp]!
	subs	$len, #0x10
	bmi	.Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor	@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
$code.=<<___;
	sub	$len, #0x10
	vst1.64	{@XMM[15]}, [r0,:128]		@ next round tweak

	vld1.8	{@XMM[6]}, [$inp]!
	veor	@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[6], @XMM[6], @XMM[14]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[6], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[4], @XMM[11]
	vld1.64	{@XMM[14]}, [r0,:128]!
	veor	@XMM[10], @XMM[2], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	veor	@XMM[11], @XMM[7], @XMM[13]
	veor	@XMM[12], @XMM[3], @XMM[14]
	vst1.8	{@XMM[10]-@XMM[11]}, [$out]!
	vst1.8	{@XMM[12]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_6:
	vst1.64	{@XMM[14]}, [r0,:128]		@ next round tweak

	veor	@XMM[4], @XMM[4], @XMM[12]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[5], @XMM[5], @XMM[13]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[6], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[4], @XMM[11]
	veor	@XMM[10], @XMM[2], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	veor	@XMM[11], @XMM[7], @XMM[13]
	vst1.8	{@XMM[10]-@XMM[11]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_5:
	vst1.64	{@XMM[13]}, [r0,:128]		@ next round tweak

	veor	@XMM[3], @XMM[3], @XMM[11]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[4], @XMM[4], @XMM[12]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64	{@XMM[12]}, [r0,:128]!
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[6], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[4], @XMM[11]
	veor	@XMM[10], @XMM[2], @XMM[12]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!
	vst1.8	{@XMM[10]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_4:
	vst1.64	{@XMM[12]}, [r0,:128]		@ next round tweak

	veor	@XMM[2], @XMM[2], @XMM[10]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[3], @XMM[3], @XMM[11]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64	{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[6], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	veor	@XMM[9], @XMM[4], @XMM[11]
	vst1.8	{@XMM[8]-@XMM[9]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_3:
	vst1.64	{@XMM[11]}, [r0,:128]		@ next round tweak

	veor	@XMM[1], @XMM[1], @XMM[9]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[2], @XMM[2], @XMM[10]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64	{@XMM[10]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	veor	@XMM[8], @XMM[6], @XMM[10]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!
	vst1.8	{@XMM[8]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_2:
	vst1.64	{@XMM[10]}, [r0,:128]		@ next round tweak

	veor	@XMM[0], @XMM[0], @XMM[8]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
#else
	add	r4, $key, #248			@ pass key schedule
#endif
	veor	@XMM[1], @XMM[1], @XMM[9]
	mov	r5, $rounds			@ pass rounds
	mov	r0, sp

	bl	_bsaes_decrypt8

	vld1.64	{@XMM[8]-@XMM[9]}, [r0,:128]!
	veor	@XMM[0], @XMM[0], @XMM[ 8]
	veor	@XMM[1], @XMM[1], @XMM[ 9]
	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!

	vld1.64	{@XMM[8]}, [r0,:128]		@ next round tweak
	b	.Lxts_dec_done
.align	4
.Lxts_dec_1:
	mov	r0, sp
	veor	@XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp
	mov	r5, $magic			@ preserve magic

	bl	AES_decrypt

	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [$out]!
	mov	$fp, r4
	mov	$magic, r5

	vmov	@XMM[8], @XMM[9]		@ next round tweak

.Lxts_dec_done:
#ifndef	XTS_CHAIN_TWEAK
	adds	$len, #0x10
	beq	.Lxts_dec_ret

	@ calculate one round of extra tweak for the stolen ciphertext
	vldmia	$magic, {$twmask}
	vshr.s64	@XMM[6], @XMM[8], #63
	vand	@XMM[6], @XMM[6], $twmask
	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
	vswp	`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
	veor	@XMM[9], @XMM[9], @XMM[6]

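	@ XTS decryption with stealing works back to front: the last full
	@ ciphertext block is decrypted with the extra tweak computed above
	@ (q9), the leading bytes of that result are then exchanged with
	@ the trailing partial ciphertext, and the reassembled block is
	@ decrypted once more with the current tweak (q8).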
	@ perform the final decryption with the last tweak value
	vld1.8	{@XMM[0]}, [$inp]!
	mov	r0, sp
	veor	@XMM[0], @XMM[0], @XMM[9]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key
	mov	r4, $fp				@ preserve fp

	bl	AES_decrypt

	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[9]
	vst1.8	{@XMM[0]}, [$out]

	mov	r6, $out
.Lxts_dec_steal:
	ldrb	r1, [$out]
	ldrb	r0, [$inp], #1
	strb	r1, [$out, #0x10]
	strb	r0, [$out], #1

	subs	$len, #1
	bhi	.Lxts_dec_steal

	vld1.8	{@XMM[0]}, [r6]
	mov	r0, sp
	veor	@XMM[0], @XMM[8]
	mov	r1, sp
	vst1.8	{@XMM[0]}, [sp,:128]
	mov	r2, $key

	bl	AES_decrypt

	vld1.8	{@XMM[0]}, [sp,:128]
	veor	@XMM[0], @XMM[0], @XMM[8]
	vst1.8	{@XMM[0]}, [r6]
	mov	$fp, r4
#endif

.Lxts_dec_ret:
	bic	r0, $fp, #0xf
	vmov.i32	q0, #0
	vmov.i32	q1, #0
#ifdef	XTS_CHAIN_TWEAK
	ldr	r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
#endif
.Lxts_dec_bzero:				@ wipe key schedule [if any]
	vstmia	sp!, {q0-q1}
	cmp	sp, r0
	bne	.Lxts_dec_bzero

	mov	sp, $fp
#ifdef	XTS_CHAIN_TWEAK
	vst1.8	{@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia	sp!, {r4-r10, pc}	@ return

.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
#endif
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

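# Re-emit this script's leading comment block (licence and usage notes)
# at the top of the output, turning Perl '#' comments into assembler '@'
# comments, then print the generated code.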
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;

close STDOUT;