diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-13 18:51:29 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-13 18:51:29 -0500 |
commit | f47671e2d861a2093179cd64dda22016664b2015 (patch) | |
tree | f77cb8e7d875f442e2cf0bdc8fbe478ec8ff8181 /arch/arm/crypto/bsaes-armv7.pl | |
parent | 8ceafbfa91ffbdbb2afaea5c24ccb519ffb8b587 (diff) | |
parent | 42cbe8271ca6562b4ad4b2e6a9895084b16eef5e (diff) |
Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm
Pull ARM updates from Russell King:
"Included in this series are:
1. BE8 (modern big endian) changes for ARM from Ben Dooks
2. big.Little support from Nicolas Pitre and Dave Martin
3. support for LPAE systems with all system memory above 4GB
4. Perf updates from Will Deacon
5. Additional prefetching and other performance improvements from Will.
6. Neon-optimised AES implementation from Ard.
7. A number of smaller fixes scattered around the place.
There is a rather horrid merge conflict in tools/perf - I was never
notified of the conflict because it originally occurred between Will's
tree and other stuff. Consequently I have a resolution which Will
forwarded me, which I'll forward on immediately after sending this
mail.
The other notable thing is I'm expecting some build breakage in the
crypto stuff on ARM only with Ard's AES patches. These were merged
into a stable git branch which others had already pulled, so there's
little I can do about this. The problem is caused because these
patches have a dependency on some code in the crypto git tree - I
tried requesting a branch I can pull to resolve these, and all I got
each time from the crypto people was "we'll revert our patches then"
which would only make things worse since I still don't have the
dependent patches. I've no idea what's going on there or how to
resolve that, and since I can't split these patches from the rest of
this pull request, I'm rather stuck with pushing this as-is or
reverting Ard's patches.
Since it should "come out in the wash" I've left them in - the only
build problems they seem to cause at the moment are with randconfigs,
and since it's a new feature anyway. However, if by -rc1 the
dependencies aren't in, I think it'd be best to revert Ard's patches"
I resolved the perf conflict roughly as per the patch sent by Russell,
but there may be some differences. Any errors are likely mine. Let's
see how the crypto issues work out..
* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (110 commits)
ARM: 7868/1: arm/arm64: remove atomic_clear_mask() in "include/asm/atomic.h"
ARM: 7867/1: include: asm: use 'int' instead of 'unsigned long' for 'oldval' in atomic_cmpxchg().
ARM: 7866/1: include: asm: use 'long long' instead of 'u64' within atomic.h
ARM: 7871/1: amba: Extend number of IRQS
ARM: 7887/1: Don't smp_cross_call() on UP devices in arch_irq_work_raise()
ARM: 7872/1: Support arch_irq_work_raise() via self IPIs
ARM: 7880/1: Clear the IT state independent of the Thumb-2 mode
ARM: 7878/1: nommu: Implement dummy early_paging_init()
ARM: 7876/1: clear Thumb-2 IT state on exception handling
ARM: 7874/2: bL_switcher: Remove cpu_hotplug_driver_{lock,unlock}()
ARM: footbridge: fix build warnings for netwinder
ARM: 7873/1: vfp: clear vfp_current_hw_state for dying cpu
ARM: fix misplaced arch_virt_to_idmap()
ARM: 7848/1: mcpm: Implement cpu_kill() to synchronise on powerdown
ARM: 7847/1: mcpm: Factor out logical-to-physical CPU translation
ARM: 7869/1: remove unused XSCALE_PMU Kconfig param
ARM: 7864/1: Handle 64-bit memory in case of 32-bit phys_addr_t
ARM: 7863/1: Let arm_add_memory() always use 64-bit arguments
ARM: 7862/1: pcpu: replace __get_cpu_var_uses
ARM: 7861/1: cacheflush: consolidate single-CPU ARMv7 cache disabling code
...
Diffstat (limited to 'arch/arm/crypto/bsaes-armv7.pl')
-rw-r--r-- | arch/arm/crypto/bsaes-armv7.pl | 2467 |
1 files changed, 2467 insertions, 0 deletions
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl new file mode 100644 index 000000000000..f3d96d932573 --- /dev/null +++ b/arch/arm/crypto/bsaes-armv7.pl | |||
@@ -0,0 +1,2467 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # | ||
9 | # Specific modes and adaptation for Linux kernel by Ard Biesheuvel | ||
10 | # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is | ||
11 | # granted. | ||
12 | # ==================================================================== | ||
13 | |||
14 | # Bit-sliced AES for ARM NEON | ||
15 | # | ||
16 | # February 2012. | ||
17 | # | ||
18 | # This implementation is direct adaptation of bsaes-x86_64 module for | ||
19 | # ARM NEON. Except that this module is endian-neutral [in sense that | ||
20 | # it can be compiled for either endianness] by courtesy of vld1.8's | ||
21 | # neutrality. Initial version doesn't implement interface to OpenSSL, | ||
22 | # only low-level primitives and unsupported entry points, just enough | ||
23 | # to collect performance results, which for Cortex-A8 core are: | ||
24 | # | ||
25 | # encrypt 19.5 cycles per byte processed with 128-bit key | ||
26 | # decrypt 22.1 cycles per byte processed with 128-bit key | ||
27 | # key conv. 440 cycles per 128-bit key/0.18 of 8x block | ||
28 | # | ||
29 | # Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, | ||
30 | # which is [much] worse than anticipated (for further details see | ||
31 | # http://www.openssl.org/~appro/Snapdragon-S4.html). | ||
32 | # | ||
33 | # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code | ||
34 | # manages in 20.0 cycles]. | ||
35 | # | ||
36 | # When comparing to x86_64 results keep in mind that NEON unit is | ||
37 | # [mostly] single-issue and thus can't [fully] benefit from | ||
38 | # instruction-level parallelism. And when comparing to aes-armv4 | ||
39 | # results keep in mind key schedule conversion overhead (see | ||
40 | # bsaes-x86_64.pl for further details)... | ||
41 | # | ||
42 | # <appro@openssl.org> | ||
43 | |||
44 | # April-August 2013 | ||
45 | # | ||
46 | # Add CBC, CTR and XTS subroutines, adapt for kernel use. | ||
47 | # | ||
48 | # <ard.biesheuvel@linaro.org> | ||
49 | |||
# Command-line handling: consume arguments until one looks like a plain
# output filename (word chars/dashes plus an extension); that file
# receives everything the generator prints.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";		# all generated assembly goes to $output

# AAPCS argument registers used by the public entry points.
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
# The 16 NEON quad registers q0-q15, addressable as an array.
my @XMM=map("q$_",(0..15));
55 | |||
56 | { | ||
57 | my ($key,$rounds,$const)=("r4","r5","r6"); | ||
58 | |||
# Map a NEON quad-register name "qN" to the D register holding its low
# half (qN -> d2N) or high half (qN -> d2N+1).  Returns "" when the
# argument does not match the qN pattern.
sub Dlo() {
	my $q = shift;
	return $q =~ m|q([1]?[0-9])| ? "d".($1*2)   : "";
}
sub Dhi() {
	my $q = shift;
	return $q =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
61 | |||
sub Sbox {
# Bit-sliced AES S-box: a basis change into the field used by the shared
# GF(2^8) inverter, the inversion itself, and a basis change back.
# Operates on eight bit-plane registers at once.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];		# the eight bit planes
my @t=@_[8..11];	# scratch registers
my @s=@_[12..15];	# scratch registers
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}
72 | |||
sub InBasisChange {
# Linear (XOR-only) change of basis applied before GF(2^8) inversion.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	veor	@b[2], @b[2], @b[1]
	veor	@b[5], @b[5], @b[6]
	veor	@b[3], @b[3], @b[0]
	veor	@b[6], @b[6], @b[2]
	veor	@b[5], @b[5], @b[0]

	veor	@b[6], @b[6], @b[3]
	veor	@b[3], @b[3], @b[7]
	veor	@b[7], @b[7], @b[5]
	veor	@b[3], @b[3], @b[4]
	veor	@b[4], @b[4], @b[5]

	veor	@b[2], @b[2], @b[7]
	veor	@b[3], @b[3], @b[1]
	veor	@b[1], @b[1], @b[5]
___
}
95 | |||
sub OutBasisChange {
# Linear (XOR-only) change of basis applied after GF(2^8) inversion.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	veor	@b[0], @b[0], @b[6]
	veor	@b[1], @b[1], @b[4]
	veor	@b[4], @b[4], @b[6]
	veor	@b[2], @b[2], @b[0]
	veor	@b[6], @b[6], @b[1]

	veor	@b[1], @b[1], @b[5]
	veor	@b[5], @b[5], @b[3]
	veor	@b[3], @b[3], @b[7]
	veor	@b[7], @b[7], @b[5]
	veor	@b[2], @b[2], @b[5]

	veor	@b[4], @b[4], @b[7]
___
}
116 | |||
sub InvSbox {
# Bit-sliced inverse AES S-box; same structure as Sbox but with the
# inverse basis changes wrapped around the shared GF(2^8) inverter.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];		# the eight bit planes
my @t=@_[8..11];	# scratch registers
my @s=@_[12..15];	# scratch registers
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}
127 | |||
sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
# XOR-only basis change feeding the inverter for decryption.
my @b=@_[5,1,2,6,3,7,0,4];
# NOTE: no ";" after the heredoc introducer below — legal in Perl since
# this is the sub's final statement, but easy to misread.
$code.=<<___
	veor	@b[1], @b[1], @b[7]
	veor	@b[4], @b[4], @b[7]

	veor	@b[7], @b[7], @b[5]
	veor	@b[1], @b[1], @b[3]
	veor	@b[2], @b[2], @b[5]
	veor	@b[3], @b[3], @b[7]

	veor	@b[6], @b[6], @b[1]
	veor	@b[2], @b[2], @b[0]
	veor	@b[5], @b[5], @b[3]
	veor	@b[4], @b[4], @b[6]
	veor	@b[0], @b[0], @b[6]
	veor	@b[1], @b[1], @b[4]
___
}
147 | |||
sub InvOutBasisChange {		# InBasisChange in reverse
# XOR-only basis change applied after inversion on the decrypt path.
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	veor	@b[1], @b[1], @b[5]
	veor	@b[2], @b[2], @b[7]

	veor	@b[3], @b[3], @b[1]
	veor	@b[4], @b[4], @b[5]
	veor	@b[7], @b[7], @b[5]
	veor	@b[3], @b[3], @b[4]
	veor	@b[5], @b[5], @b[0]
	veor	@b[3], @b[3], @b[7]
	veor	@b[6], @b[6], @b[2]
	veor	@b[2], @b[2], @b[1]
	veor	@b[6], @b[6], @b[3]

	veor	@b[3], @b[3], @b[0]
	veor	@b[5], @b[5], @b[6]
___
}
168 | |||
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
# GF(2^2) multiply of (x1,x0) by (y1,y0), result left in x1,x0.
# Uses two scratch registers; y0/y1 are preserved.
my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
$code.=<<___;
	veor	$t0, $y0, $y1
	vand	$t0, $t0, $x0
	veor	$x0, $x0, $x1
	vand	$t1, $x1, $y0
	vand	$x0, $x0, $y1
	veor	$x1, $t1, $t0
	veor	$x0, $x0, $t1
___
}
184 | |||
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# GF(2^2) multiply-and-scale variant; kept for reference, superseded by
# the interleaved Mul_GF4_N_GF4 below.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	veor	$t0, $y0, $y1
	vand	$t0, $t0, $x0
	veor	$x0, $x0, $x1
	vand	$x1, $x1, $y0
	vand	$x0, $x0, $y1
	veor	$x1, $x1, $x0
	veor	$x0, $x0, $t0
___
}
198 | |||
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# Emits the two independent GF(2^2) multiplies interleaved so adjacent
# instructions are independent (better scheduling on in-order NEON).
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	veor	$t0, $y0, $y1
	veor	$t1, $y2, $y3
	vand	$t0, $t0, $x0
	vand	$t1, $t1, $x2
	veor	$x0, $x0, $x1
	veor	$x2, $x2, $x3
	vand	$x1, $x1, $y0
	vand	$x3, $x3, $y2
	vand	$x0, $x0, $y1
	vand	$x2, $x2, $y3
	veor	$x1, $x1, $x0
	veor	$x2, $x2, $x3
	veor	$x0, $x0, $t0
	veor	$x3, $x3, $t1
___
}
sub Mul_GF16_2 {
# Two GF(2^4) multiplications (Karatsuba-style: built from GF(2^2)
# multiplies), applied to the low and high halves of the eight x planes.
# y planes are both input and scratch; t planes are scratch.
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	veor	@t[0], @x[0], @x[2]
	veor	@t[1], @x[1], @x[3]
___
	&Mul_GF4  (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
$code.=<<___;
	veor	@y[0], @y[0], @y[2]
	veor	@y[1], @y[1], @y[3]
___
# (called without "&" — works because the sub is already defined above)
	Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
		       @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	veor	@x[0], @x[0], @t[0]
	veor	@x[2], @x[2], @t[0]
	veor	@x[1], @x[1], @t[1]
	veor	@x[3], @x[3], @t[1]

	veor	@t[0], @x[4], @x[6]
	veor	@t[1], @x[5], @x[7]
___
	&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
			@x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	veor	@y[0], @y[0], @y[2]
	veor	@y[1], @y[1], @y[3]
___
	&Mul_GF4  (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
$code.=<<___;
	veor	@x[4], @x[4], @t[0]
	veor	@x[6], @x[6], @t[0]
	veor	@x[5], @x[5], @t[1]
	veor	@x[7], @x[7], @t[1]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# Bit-sliced GF(2^8) inversion over eight bit planes; shared by the
# forward and inverse S-box (only the surrounding basis changes differ).
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	veor	@t[3], @x[4], @x[6]
	veor	@t[2], @x[5], @x[7]
	veor	@t[1], @x[1], @x[3]
	veor	@s[1], @x[7], @x[6]
	vmov	@t[0], @t[2]
	veor	@s[0], @x[0], @x[2]

	vorr	@t[2], @t[2], @t[1]
	veor	@s[3], @t[3], @t[0]
	vand	@s[2], @t[3], @s[0]
	vorr	@t[3], @t[3], @s[0]
	veor	@s[0], @s[0], @t[1]
	vand	@t[0], @t[0], @t[1]
	veor	@t[1], @x[3], @x[2]
	vand	@s[3], @s[3], @s[0]
	vand	@s[1], @s[1], @t[1]
	veor	@t[1], @x[4], @x[5]
	veor	@s[0], @x[1], @x[0]
	veor	@t[3], @t[3], @s[1]
	veor	@t[2], @t[2], @s[1]
	vand	@s[1], @t[1], @s[0]
	vorr	@t[1], @t[1], @s[0]
	veor	@t[3], @t[3], @s[3]
	veor	@t[0], @t[0], @s[1]
	veor	@t[2], @t[2], @s[2]
	veor	@t[1], @t[1], @s[3]
	veor	@t[0], @t[0], @s[2]
	vand	@s[0], @x[7], @x[3]
	veor	@t[1], @t[1], @s[2]
	vand	@s[1], @x[6], @x[2]
	vand	@s[2], @x[5], @x[1]
	vorr	@s[3], @x[4], @x[0]
	veor	@t[3], @t[3], @s[0]
	veor	@t[1], @t[1], @s[2]
	veor	@t[0], @t[0], @s[3]
	veor	@t[2], @t[2], @s[1]

	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	@ new smaller inversion

	vand	@s[2], @t[3], @t[1]
	vmov	@s[0], @t[0]

	veor	@s[1], @t[2], @s[2]
	veor	@s[3], @t[0], @s[2]
	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]

	vbsl	@s[1], @t[1], @t[0]
	vbsl	@s[3], @t[3], @t[2]
	veor	@t[3], @t[3], @t[2]

	vbsl	@s[0], @s[1], @s[2]
	vbsl	@t[0], @s[2], @s[1]

	vand	@s[2], @s[0], @s[3]
	veor	@t[1], @t[1], @t[0]

	veor	@s[2], @s[2], @t[3]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
336 | |||
337 | # AES linear components | ||
338 | |||
sub ShiftRows {
# Fused AddRoundKey + (Inv)ShiftRows: XORs the next round key (loaded
# from $key, which is advanced by the "!" writeback) into each plane,
# then applies the row-shift byte permutation via vtbl with $mask
# (.LSR/.LISR style constant, supplied as the last argument).
my @x=@_[0..7];
my @t=@_[8..11];
my $mask=pop;		# the permutation constant is the final argument
$code.=<<___;
	vldmia	$key!, {@t[0]-@t[3]}
	veor	@t[0], @t[0], @x[0]
	veor	@t[1], @t[1], @x[1]
	vtbl.8	`&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
	vldmia	$key!, {@t[0]}
	veor	@t[2], @t[2], @x[2]
	vtbl.8	`&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
	vldmia	$key!, {@t[1]}
	veor	@t[3], @t[3], @x[3]
	vtbl.8	`&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
	vldmia	$key!, {@t[2]}
	vtbl.8	`&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
	vldmia	$key!, {@t[3]}
	veor	@t[0], @t[0], @x[4]
	veor	@t[1], @t[1], @x[5]
	vtbl.8	`&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
	veor	@t[2], @t[2], @x[6]
	vtbl.8	`&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
	veor	@t[3], @t[3], @x[7]
	vtbl.8	`&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
	vtbl.8	`&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
	vtbl.8	`&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
___
}
375 | |||
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
# Bit-sliced MixColumns via 32-bit rotations (vext #12 = rotate within
# each 32-bit column, vext #8 = rotate by 64).  With $inv set, the
# output permutation used by the InvMixColumns caller is emitted instead.
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
	vext.8	@t[1], @x[1], @x[1], #12
	veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
	vext.8	@t[2], @x[2], @x[2], #12
	veor	@x[1], @x[1], @t[1]
	vext.8	@t[3], @x[3], @x[3], #12
	veor	@x[2], @x[2], @t[2]
	vext.8	@t[4], @x[4], @x[4], #12
	veor	@x[3], @x[3], @t[3]
	vext.8	@t[5], @x[5], @x[5], #12
	veor	@x[4], @x[4], @t[4]
	vext.8	@t[6], @x[6], @x[6], #12
	veor	@x[5], @x[5], @t[5]
	vext.8	@t[7], @x[7], @x[7], #12
	veor	@x[6], @x[6], @t[6]

	veor	@t[1], @t[1], @x[0]
	veor	@x[7], @x[7], @t[7]
	vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
	veor	@t[2], @t[2], @x[1]
	veor	@t[0], @t[0], @x[7]
	veor	@t[1], @t[1], @x[7]
	vext.8	@x[1], @x[1], @x[1], #8
	veor	@t[5], @t[5], @x[4]
	veor	@x[0], @x[0], @t[0]
	veor	@t[6], @t[6], @x[5]
	veor	@x[1], @x[1], @t[1]
	vext.8	@t[0], @x[4], @x[4], #8
	veor	@t[4], @t[4], @x[3]
	vext.8	@t[1], @x[5], @x[5], #8
	veor	@t[7], @t[7], @x[6]
	vext.8	@x[4], @x[3], @x[3], #8
	veor	@t[3], @t[3], @x[2]
	vext.8	@x[5], @x[7], @x[7], #8
	veor	@t[4], @t[4], @x[7]
	vext.8	@x[3], @x[6], @x[6], #8
	veor	@t[3], @t[3], @x[7]
	vext.8	@x[6], @x[2], @x[2], #8
	veor	@x[7], @t[1], @t[5]
___
$code.=<<___ if (!$inv);
	veor	@x[2], @t[0], @t[4]
	veor	@x[4], @x[4], @t[3]
	veor	@x[5], @x[5], @t[7]
	veor	@x[3], @x[3], @t[6]
	@ vmov	@x[2], @t[0]
	veor	@x[6], @x[6], @t[2]
	@ vmov	@x[7], @t[1]
___
$code.=<<___ if ($inv);
	veor	@t[3], @t[3], @x[4]
	veor	@x[5], @x[5], @t[7]
	veor	@x[2], @x[3], @t[6]
	veor	@x[3], @t[0], @t[4]
	veor	@x[4], @x[6], @t[2]
	vmov	@x[6], @t[3]
	@ vmov	@x[7], @t[1]
___
}
441 | |||
sub InvMixColumns_orig {
# Original (superseded) InvMixColumns: multiplies the bit-sliced state
# directly by 0x0e/0x0b/0x0d/0x09.  Kept for reference; the faster
# factorized version is InvMixColumns below.  Results land in @XMM[0..7].
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	@ multiplication by 0x0e
	vext.8	@t[7], @x[7], @x[7], #12
	vmov	@t[2], @x[2]
	veor	@x[2], @x[2], @x[5]		@ 2 5
	veor	@x[7], @x[7], @x[5]		@ 7 5
	vext.8	@t[0], @x[0], @x[0], #12
	vmov	@t[5], @x[5]
	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
	veor	@x[0], @x[0], @x[1]		@ 0 1
	vext.8	@t[1], @x[1], @x[1], #12
	veor	@x[1], @x[1], @x[2]		@ 1 25
	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
	vext.8	@t[3], @x[3], @x[3], #12
	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
	veor	@x[3], @x[3], @x[7]		@ 3 75
	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
	vext.8	@t[6], @x[6], @x[6], #12
	vmov	@t[4], @x[4]
	veor	@x[6], @x[6], @x[4]		@ 6 4
	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
	veor	@x[3], @x[3], @x[7]		@ 375 756=36
	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
	veor	@x[3], @x[3], @t[2]		@ 36 2
	vext.8	@t[5], @t[5], @t[5], #12
	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
___
# reorder planes for the remaining multiplications
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	@ multiplication by 0x0b
	veor	@y[1], @y[1], @y[0]
	veor	@y[0], @y[0], @t[0]
	vext.8	@t[2], @t[2], @t[2], #12
	veor	@y[1], @y[1], @t[1]
	veor	@y[0], @y[0], @t[5]
	vext.8	@t[4], @t[4], @t[4], #12
	veor	@y[1], @y[1], @t[6]
	veor	@y[0], @y[0], @t[7]
	veor	@t[7], @t[7], @t[6]		@ clobber t[7]

	veor	@y[3], @y[3], @t[0]
	veor	@y[1], @y[1], @y[0]
	vext.8	@t[0], @t[0], @t[0], #12
	veor	@y[2], @y[2], @t[1]
	veor	@y[4], @y[4], @t[1]
	vext.8	@t[1], @t[1], @t[1], #12
	veor	@y[2], @y[2], @t[2]
	veor	@y[3], @y[3], @t[2]
	veor	@y[5], @y[5], @t[2]
	veor	@y[2], @y[2], @t[7]
	vext.8	@t[2], @t[2], @t[2], #12
	veor	@y[3], @y[3], @t[3]
	veor	@y[6], @y[6], @t[3]
	veor	@y[4], @y[4], @t[3]
	veor	@y[7], @y[7], @t[4]
	vext.8	@t[3], @t[3], @t[3], #12
	veor	@y[5], @y[5], @t[4]
	veor	@y[7], @y[7], @t[7]
	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
	veor	@y[3], @y[3], @t[5]
	veor	@y[4], @y[4], @t[4]

	veor	@y[5], @y[5], @t[7]
	vext.8	@t[4], @t[4], @t[4], #12
	veor	@y[6], @y[6], @t[7]
	veor	@y[4], @y[4], @t[7]

	veor	@t[7], @t[7], @t[5]
	vext.8	@t[5], @t[5], @t[5], #12

	@ multiplication by 0x0d
	veor	@y[4], @y[4], @y[7]
	veor	@t[7], @t[7], @t[6]		@ restore t[7]
	veor	@y[7], @y[7], @t[4]
	vext.8	@t[6], @t[6], @t[6], #12
	veor	@y[2], @y[2], @t[0]
	veor	@y[7], @y[7], @t[5]
	vext.8	@t[7], @t[7], @t[7], #12
	veor	@y[2], @y[2], @t[2]

	veor	@y[3], @y[3], @y[1]
	veor	@y[1], @y[1], @t[1]
	veor	@y[0], @y[0], @t[0]
	veor	@y[3], @y[3], @t[0]
	veor	@y[1], @y[1], @t[5]
	veor	@y[0], @y[0], @t[5]
	vext.8	@t[0], @t[0], @t[0], #12
	veor	@y[1], @y[1], @t[7]
	veor	@y[0], @y[0], @t[6]
	veor	@y[3], @y[3], @y[1]
	veor	@y[4], @y[4], @t[1]
	vext.8	@t[1], @t[1], @t[1], #12

	veor	@y[7], @y[7], @t[7]
	veor	@y[4], @y[4], @t[2]
	veor	@y[5], @y[5], @t[2]
	veor	@y[2], @y[2], @t[6]
	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
	vext.8	@t[2], @t[2], @t[2], #12
	veor	@y[4], @y[4], @y[7]
	veor	@y[3], @y[3], @t[6]

	veor	@y[6], @y[6], @t[6]
	veor	@y[5], @y[5], @t[5]
	vext.8	@t[5], @t[5], @t[5], #12
	veor	@y[6], @y[6], @t[4]
	vext.8	@t[4], @t[4], @t[4], #12
	veor	@y[5], @y[5], @t[6]
	veor	@y[6], @y[6], @t[7]
	vext.8	@t[7], @t[7], @t[7], #12
	veor	@t[6], @t[6], @t[3]		@ restore t[6]
	vext.8	@t[3], @t[3], @t[3], #12

	@ multiplication by 0x09
	veor	@y[4], @y[4], @y[1]
	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
	vext.8	@t[6], @t[6], @t[6], #12
	veor	@t[1], @t[1], @t[5]
	veor	@y[3], @y[3], @t[0]
	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
	veor	@t[1], @t[1], @t[6]
	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
	veor	@y[4], @y[4], @t[1]
	veor	@y[7], @y[7], @t[4]
	veor	@y[6], @y[6], @t[3]
	veor	@y[5], @y[5], @t[2]
	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
	veor	@t[3], @t[3], @t[7]
	veor	@XMM[5], @t[5], @t[6]
	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
	veor	@XMM[2], @t[2], @t[6]
	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]

	vmov	@XMM[0], @t[0]
	vmov	@XMM[1], @t[1]
	@ vmov	@XMM[2], @t[2]
	vmov	@XMM[3], @t[3]
	vmov	@XMM[4], @t[4]
	@ vmov	@XMM[5], @t[5]
	@ vmov	@XMM[6], @t[6]
	@ vmov	@XMM[7], @t[7]
___
}
594 | |||
sub InvMixColumns {
# Faster InvMixColumns: factor the inverse matrix through the forward
# MixColumns matrix, so only the 05/00/04/00 correction is computed here
# and MixColumns (with $inv set) finishes the job.
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	@ multiplication by 0x05-0x00-0x04-0x00
	vext.8	@t[0], @x[0], @x[0], #8
	vext.8	@t[6], @x[6], @x[6], #8
	vext.8	@t[7], @x[7], @x[7], #8
	veor	@t[0], @t[0], @x[0]
	vext.8	@t[1], @x[1], @x[1], #8
	veor	@t[6], @t[6], @x[6]
	vext.8	@t[2], @x[2], @x[2], #8
	veor	@t[7], @t[7], @x[7]
	vext.8	@t[3], @x[3], @x[3], #8
	veor	@t[1], @t[1], @x[1]
	vext.8	@t[4], @x[4], @x[4], #8
	veor	@t[2], @t[2], @x[2]
	vext.8	@t[5], @x[5], @x[5], #8
	veor	@t[3], @t[3], @x[3]
	veor	@t[4], @t[4], @x[4]
	veor	@t[5], @t[5], @x[5]

	veor	@x[0], @x[0], @t[6]
	veor	@x[1], @x[1], @t[6]
	veor	@x[2], @x[2], @t[0]
	veor	@x[4], @x[4], @t[2]
	veor	@x[3], @x[3], @t[1]
	veor	@x[1], @x[1], @t[7]
	veor	@x[2], @x[2], @t[7]
	veor	@x[4], @x[4], @t[6]
	veor	@x[5], @x[5], @t[3]
	veor	@x[3], @x[3], @t[6]
	veor	@x[6], @x[6], @t[4]
	veor	@x[4], @x[4], @t[7]
	veor	@x[5], @x[5], @t[7]
	veor	@x[7], @x[7], @t[5]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
642 | |||
# SWAPMOVE bit-slicing primitive: exchange, under $mask, the bits of the
# two registers that sit $shift positions apart, using $tmp as scratch.
sub swapmove {
my ($ra,$rb,$shift,$mask,$tmp)=@_;
$code.=<<___;
	vshr.u64	$tmp, $rb, #$shift
	veor		$tmp, $tmp, $ra
	vand		$tmp, $tmp, $mask
	veor		$ra, $ra, $tmp
	vshl.u64	$tmp, $tmp, #$shift
	veor		$rb, $rb, $tmp
___
}
# Two independent SWAPMOVEs interleaved instruction-by-instruction so
# neighbouring NEON ops have no data dependency on each other.
sub swapmove2x {
my ($ra0,$rb0,$ra1,$rb1,$shift,$mask,$tmp0,$tmp1)=@_;
$code.=<<___;
	vshr.u64	$tmp0, $rb0, #$shift
	vshr.u64	$tmp1, $rb1, #$shift
	veor		$tmp0, $tmp0, $ra0
	veor		$tmp1, $tmp1, $ra1
	vand		$tmp0, $tmp0, $mask
	vand		$tmp1, $tmp1, $mask
	veor		$ra0, $ra0, $tmp0
	vshl.u64	$tmp0, $tmp0, #$shift
	veor		$ra1, $ra1, $tmp1
	vshl.u64	$tmp1, $tmp1, #$shift
	veor		$rb0, $rb0, $tmp0
	veor		$rb1, $rb1, $tmp1
___
}
671 | |||
sub bitslice {
# Convert eight 128-bit blocks to bit-sliced representation via three
# SWAPMOVE passes (masks 0x55/0x33/0x0f swap bits at distance 1, 2, 4).
# The same routine is used on entry and on exit of the round loop, so
# the transform is presumably its own inverse — the emitted code is
# identical both times.
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	vmov.i8	$t0,#0x55			@ compose .LBS0
	vmov.i8	$t1,#0x33			@ compose .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	vmov.i8	$t0,#0x0f			@ compose .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
690 | |||
# ---- emit the output file ------------------------------------------------
# Preamble: userland (OpenSSL) builds pull in arm_arch.h and save/restore
# the VFP callee-saved registers; kernel builds define those away and
# instead enable the extended-key and XTS tweak-chaining code paths.
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"

# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
# define VFP_ABI_FRAME	0x40
#else
# define VFP_ABI_PUSH
# define VFP_ABI_POP
# define VFP_ABI_FRAME	0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__	__LINUX_ARM_ARCH__
#endif

#ifdef __thumb__
# define adrl adr
#endif

#if __ARM_ARCH__>=7
.text
.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
#ifdef __thumb2__
.thumb
#else
.code	32
#endif

.fpu	neon

.type	_bsaes_decrypt8,%function
.align	4
_bsaes_decrypt8:
	adr	$const,_bsaes_decrypt8
	vldmia	$key!, {@XMM[9]}		@ round 0 key
	add	$const,$const,#.LM0ISR-_bsaes_decrypt8

	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
	veor	@XMM[11], @XMM[1], @XMM[9]
	vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
	veor	@XMM[12], @XMM[2], @XMM[9]
	vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
	veor	@XMM[13], @XMM[3], @XMM[9]
	vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
	veor	@XMM[14], @XMM[4], @XMM[9]
	vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
	veor	@XMM[15], @XMM[5], @XMM[9]
	vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
	veor	@XMM[10], @XMM[6], @XMM[9]
	vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
	veor	@XMM[11], @XMM[7], @XMM[9]
	vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
	vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
	vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
___
# transpose the eight input blocks into bit-sliced form
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	sub	$rounds,$rounds,#1
	b	.Ldec_sbox
.align	4
.Ldec_loop:
___
# one decryption round: fused AddRoundKey+InvShiftRows, then InvSbox,
# then (while rounds remain) InvMixColumns
	&ShiftRows	(@XMM[0..7, 8..12]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	subs	$rounds,$rounds,#1
	bcc	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	vldmia	$const, {@XMM[12]}		@ .LISR
	ite	eq				@ Thumb2 thing, sanity check in ARM
	addeq	$const,$const,#0x10
	bne	.Ldec_loop
	vldmia	$const, {@XMM[12]}		@ .LISRM0
	b	.Ldec_loop
.align	4
.Ldec_done:
___
# undo the bit-slicing; register order reflects the permutation the
# S-box output leaves behind
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
781 | $code.=<<___; | ||
782 | vldmia $key, {@XMM[8]} @ last round key | ||
783 | veor @XMM[6], @XMM[6], @XMM[8] | ||
784 | veor @XMM[4], @XMM[4], @XMM[8] | ||
785 | veor @XMM[2], @XMM[2], @XMM[8] | ||
786 | veor @XMM[7], @XMM[7], @XMM[8] | ||
787 | veor @XMM[3], @XMM[3], @XMM[8] | ||
788 | veor @XMM[5], @XMM[5], @XMM[8] | ||
789 | veor @XMM[0], @XMM[0], @XMM[8] | ||
790 | veor @XMM[1], @XMM[1], @XMM[8] | ||
791 | bx lr | ||
792 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
793 | |||
@ Constant pool shared by the bit-sliced AES routines: vtbl.8 permutation
@ masks for the (Inv)ShiftRows steps, the .LM0 input bit-interleave mask
@ used by _bsaes_key_convert, and .LREVM0SR, which the CTR path loads to
@ combine the counter byte-swap with the ShiftRows permutation.
@ .align 6 gives the pool 64-byte alignment.
794 | .type _bsaes_const,%object | ||
795 | .align 6 | ||
796 | _bsaes_const: | ||
797 | .LM0ISR: @ InvShiftRows constants | ||
798 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
799 | .LISR: | ||
800 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
801 | .LISRM0: | ||
802 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
803 | .LM0SR: @ ShiftRows constants | ||
804 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
805 | .LSR: | ||
806 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
807 | .LSRM0: | ||
808 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
809 | .LM0: | ||
810 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
811 | .LREVM0SR: | ||
812 | .quad 0x090d01050c000408, 0x03070b0f060a0e02 | ||
813 | .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
814 | .align 6 | ||
815 | .size _bsaes_const,.-_bsaes_const | ||
816 | |||
@ _bsaes_encrypt8: encrypt eight 128-bit blocks held in q0-q7 in parallel.
@ In:  q0-q7 = input blocks; $key -> bit-sliced key schedule (round-0 key
@      is loaded into q9 with post-increment); $rounds = round count.
@ Out: q0-q7 = encrypted blocks; the final &bitslice call un-interleaves
@      the permuted register order [0,1,4,6,3,7,2,5] noted below, and the
@      last round key at [$key] is XORed in at the end.
@ The _bsaes_encrypt8_alt entry is used by bsaes_ctr32_encrypt_blocks,
@ which sets up $const and loads q8 (.LREVM0SR) itself before calling.
817 | .type _bsaes_encrypt8,%function | ||
818 | .align 4 | ||
819 | _bsaes_encrypt8: | ||
820 | adr $const,_bsaes_encrypt8 | ||
821 | vldmia $key!, {@XMM[9]} @ round 0 key | ||
822 | sub $const,$const,#_bsaes_encrypt8-.LM0SR | ||
823 | |||
824 | vldmia $const!, {@XMM[8]} @ .LM0SR | ||
825 | _bsaes_encrypt8_alt: | ||
826 | veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key | ||
827 | veor @XMM[11], @XMM[1], @XMM[9] | ||
828 | vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` | ||
829 | vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` | ||
830 | veor @XMM[12], @XMM[2], @XMM[9] | ||
831 | vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` | ||
832 | vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` | ||
833 | veor @XMM[13], @XMM[3], @XMM[9] | ||
834 | vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` | ||
835 | vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` | ||
836 | veor @XMM[14], @XMM[4], @XMM[9] | ||
837 | vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` | ||
838 | vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` | ||
839 | veor @XMM[15], @XMM[5], @XMM[9] | ||
840 | vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` | ||
841 | vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` | ||
842 | veor @XMM[10], @XMM[6], @XMM[9] | ||
843 | vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` | ||
844 | vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` | ||
845 | veor @XMM[11], @XMM[7], @XMM[9] | ||
846 | vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` | ||
847 | vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` | ||
848 | vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` | ||
849 | vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` | ||
850 | _bsaes_encrypt8_bitslice: | ||
851 | ___ | ||
852 | &bitslice (@XMM[0..7, 8..11]); | ||
853 | $code.=<<___; | ||
854 | sub $rounds,$rounds,#1 | ||
855 | b .Lenc_sbox | ||
856 | .align 4 | ||
857 | .Lenc_loop: | ||
858 | ___ | ||
859 | &ShiftRows (@XMM[0..7, 8..12]); | ||
860 | $code.=".Lenc_sbox:\n"; | ||
861 | &Sbox (@XMM[0..7, 8..15]); | ||
862 | $code.=<<___; | ||
863 | subs $rounds,$rounds,#1 | ||
864 | bcc .Lenc_done | ||
865 | ___ | ||
866 | &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | ||
867 | $code.=<<___; | ||
868 | vldmia $const, {@XMM[12]} @ .LSR | ||
869 | ite eq @ Thumb2 thing, sanity check in ARM | ||
870 | addeq $const,$const,#0x10 | ||
871 | bne .Lenc_loop | ||
872 | vldmia $const, {@XMM[12]} @ .LSRM0 | ||
873 | b .Lenc_loop | ||
874 | .align 4 | ||
875 | .Lenc_done: | ||
876 | ___ | ||
877 | # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | ||
878 | &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | ||
879 | $code.=<<___; | ||
880 | vldmia $key, {@XMM[8]} @ last round key | ||
881 | veor @XMM[4], @XMM[4], @XMM[8] | ||
882 | veor @XMM[6], @XMM[6], @XMM[8] | ||
883 | veor @XMM[3], @XMM[3], @XMM[8] | ||
884 | veor @XMM[7], @XMM[7], @XMM[8] | ||
885 | veor @XMM[2], @XMM[2], @XMM[8] | ||
886 | veor @XMM[5], @XMM[5], @XMM[8] | ||
887 | veor @XMM[0], @XMM[0], @XMM[8] | ||
888 | veor @XMM[1], @XMM[1], @XMM[8] | ||
889 | bx lr | ||
890 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
891 | ___ | ||
892 | } | ||
# _bsaes_key_convert: convert a standard AES key schedule (read from
# $inp = r4) into bit-sliced format, writing eight NEON vectors per inner
# round key to $out = r12. On return q7 holds the 0x63 constant and q15
# the untouched last round key; callers combine them (veor q7,q7,q15) to
# fix up the first/last round keys themselves — the last key is
# deliberately not stored here (see "don't save last round key" below).
893 | { | ||
894 | my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); | ||
895 | |||
# Perl helper: emit the swapmove interleave sequence that bit-slices one
# round key; key bytes are identical across lanes, so several swapmoves
# collapse into plain vmov copies (see the commented-out calls below).
896 | sub bitslice_key { | ||
897 | my @x=reverse(@_[0..7]); | ||
898 | my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | ||
899 | |||
900 | &swapmove (@x[0,1],1,$bs0,$t2,$t3); | ||
901 | $code.=<<___; | ||
902 | @ &swapmove(@x[2,3],1,$t0,$t2,$t3); | ||
903 | vmov @x[2], @x[0] | ||
904 | vmov @x[3], @x[1] | ||
905 | ___ | ||
906 | #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
907 | |||
908 | &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | ||
909 | $code.=<<___; | ||
910 | @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
911 | vmov @x[4], @x[0] | ||
912 | vmov @x[6], @x[2] | ||
913 | vmov @x[5], @x[1] | ||
914 | vmov @x[7], @x[3] | ||
915 | ___ | ||
916 | &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | ||
917 | &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | ||
918 | } | ||
919 | |||
920 | $code.=<<___; | ||
921 | .type _bsaes_key_convert,%function | ||
922 | .align 4 | ||
923 | _bsaes_key_convert: | ||
924 | adr $const,_bsaes_key_convert | ||
925 | vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key | ||
926 | sub $const,$const,#_bsaes_key_convert-.LM0 | ||
927 | vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key | ||
928 | |||
929 | vmov.i8 @XMM[8], #0x01 @ bit masks | ||
930 | vmov.i8 @XMM[9], #0x02 | ||
931 | vmov.i8 @XMM[10], #0x04 | ||
932 | vmov.i8 @XMM[11], #0x08 | ||
933 | vmov.i8 @XMM[12], #0x10 | ||
934 | vmov.i8 @XMM[13], #0x20 | ||
935 | vldmia $const, {@XMM[14]} @ .LM0 | ||
936 | |||
937 | #ifdef __ARMEL__ | ||
938 | vrev32.8 @XMM[7], @XMM[7] | ||
939 | vrev32.8 @XMM[15], @XMM[15] | ||
940 | #endif | ||
941 | sub $rounds,$rounds,#1 | ||
942 | vstmia $out!, {@XMM[7]} @ save round 0 key | ||
943 | b .Lkey_loop | ||
944 | |||
945 | .align 4 | ||
946 | .Lkey_loop: | ||
947 | vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` | ||
948 | vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` | ||
949 | vmov.i8 @XMM[6], #0x40 | ||
950 | vmov.i8 @XMM[15], #0x80 | ||
951 | |||
952 | vtst.8 @XMM[0], @XMM[7], @XMM[8] | ||
953 | vtst.8 @XMM[1], @XMM[7], @XMM[9] | ||
954 | vtst.8 @XMM[2], @XMM[7], @XMM[10] | ||
955 | vtst.8 @XMM[3], @XMM[7], @XMM[11] | ||
956 | vtst.8 @XMM[4], @XMM[7], @XMM[12] | ||
957 | vtst.8 @XMM[5], @XMM[7], @XMM[13] | ||
958 | vtst.8 @XMM[6], @XMM[7], @XMM[6] | ||
959 | vtst.8 @XMM[7], @XMM[7], @XMM[15] | ||
960 | vld1.8 {@XMM[15]}, [$inp]! @ load next round key | ||
961 | vmvn @XMM[0], @XMM[0] @ "pnot" | ||
962 | vmvn @XMM[1], @XMM[1] | ||
963 | vmvn @XMM[5], @XMM[5] | ||
964 | vmvn @XMM[6], @XMM[6] | ||
965 | #ifdef __ARMEL__ | ||
966 | vrev32.8 @XMM[15], @XMM[15] | ||
967 | #endif | ||
968 | subs $rounds,$rounds,#1 | ||
969 | vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key | ||
970 | bne .Lkey_loop | ||
971 | |||
972 | vmov.i8 @XMM[7],#0x63 @ compose .L63 | ||
973 | @ don't save last round key | ||
974 | bx lr | ||
975 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
976 | ___ | ||
977 | } | ||
978 | |||
# Dead code (guarded by "if (0)"): four unsupported, benchmarking-only
# entry points — ECB-style 128-bit-key bulk encrypt/decrypt and their key
# converters. Never emitted into the generated assembly.
979 | if (0) { # following four functions are unsupported interface | ||
980 | # used for benchmarking... | ||
981 | $code.=<<___; | ||
982 | .globl bsaes_enc_key_convert | ||
983 | .type bsaes_enc_key_convert,%function | ||
984 | .align 4 | ||
985 | bsaes_enc_key_convert: | ||
986 | stmdb sp!,{r4-r6,lr} | ||
987 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
988 | |||
989 | ldr r5,[$inp,#240] @ pass rounds | ||
990 | mov r4,$inp @ pass key | ||
991 | mov r12,$out @ pass key schedule | ||
992 | bl _bsaes_key_convert | ||
993 | veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key | ||
994 | vstmia r12, {@XMM[7]} @ save last round key | ||
995 | |||
996 | vldmia sp!,{d8-d15} | ||
997 | ldmia sp!,{r4-r6,pc} | ||
998 | .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | ||
999 | |||
1000 | .globl bsaes_encrypt_128 | ||
1001 | .type bsaes_encrypt_128,%function | ||
1002 | .align 4 | ||
1003 | bsaes_encrypt_128: | ||
1004 | stmdb sp!,{r4-r6,lr} | ||
1005 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
1006 | .Lenc128_loop: | ||
1007 | vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input | ||
1008 | vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! | ||
1009 | mov r4,$key @ pass the key | ||
1010 | vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! | ||
1011 | mov r5,#10 @ pass rounds | ||
1012 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! | ||
1013 | |||
1014 | bl _bsaes_encrypt8 | ||
1015 | |||
1016 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1017 | vst1.8 {@XMM[4]}, [$out]! | ||
1018 | vst1.8 {@XMM[6]}, [$out]! | ||
1019 | vst1.8 {@XMM[3]}, [$out]! | ||
1020 | vst1.8 {@XMM[7]}, [$out]! | ||
1021 | vst1.8 {@XMM[2]}, [$out]! | ||
1022 | subs $len,$len,#0x80 | ||
1023 | vst1.8 {@XMM[5]}, [$out]! | ||
1024 | bhi .Lenc128_loop | ||
1025 | |||
1026 | vldmia sp!,{d8-d15} | ||
1027 | ldmia sp!,{r4-r6,pc} | ||
1028 | .size bsaes_encrypt_128,.-bsaes_encrypt_128 | ||
1029 | |||
1030 | .globl bsaes_dec_key_convert | ||
1031 | .type bsaes_dec_key_convert,%function | ||
1032 | .align 4 | ||
1033 | bsaes_dec_key_convert: | ||
1034 | stmdb sp!,{r4-r6,lr} | ||
1035 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
1036 | |||
1037 | ldr r5,[$inp,#240] @ pass rounds | ||
1038 | mov r4,$inp @ pass key | ||
1039 | mov r12,$out @ pass key schedule | ||
1040 | bl _bsaes_key_convert | ||
1041 | vldmia $out, {@XMM[6]} | ||
1042 | vstmia r12, {@XMM[15]} @ save last round key | ||
1043 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
1044 | vstmia $out, {@XMM[7]} | ||
1045 | |||
1046 | vldmia sp!,{d8-d15} | ||
1047 | ldmia sp!,{r4-r6,pc} | ||
1048 | .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | ||
1049 | |||
1050 | .globl bsaes_decrypt_128 | ||
1051 | .type bsaes_decrypt_128,%function | ||
1052 | .align 4 | ||
1053 | bsaes_decrypt_128: | ||
1054 | stmdb sp!,{r4-r6,lr} | ||
1055 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
1056 | .Ldec128_loop: | ||
1057 | vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input | ||
1058 | vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! | ||
1059 | mov r4,$key @ pass the key | ||
1060 | vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! | ||
1061 | mov r5,#10 @ pass rounds | ||
1062 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! | ||
1063 | |||
1064 | bl _bsaes_decrypt8 | ||
1065 | |||
1066 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1067 | vst1.8 {@XMM[6]}, [$out]! | ||
1068 | vst1.8 {@XMM[4]}, [$out]! | ||
1069 | vst1.8 {@XMM[2]}, [$out]! | ||
1070 | vst1.8 {@XMM[7]}, [$out]! | ||
1071 | vst1.8 {@XMM[3]}, [$out]! | ||
1072 | subs $len,$len,#0x80 | ||
1073 | vst1.8 {@XMM[5]}, [$out]! | ||
1074 | bhi .Ldec128_loop | ||
1075 | |||
1076 | vldmia sp!,{d8-d15} | ||
1077 | ldmia sp!,{r4-r6,pc} | ||
1078 | .size bsaes_decrypt_128,.-bsaes_decrypt_128 | ||
1079 | ___ | ||
1080 | } | ||
# bsaes_cbc_encrypt — CBC-mode bulk decryption (the caller must pass
# enc == 0, see the comment in the body). Register map from the my()
# below: r0-r3 = inp, out, len, key; r8 = $ivp (IV pointer, 1st stack
# arg), r9 = $fp (16-byte stack scratch that carries the IV across the
# _bsaes_decrypt8 calls), r10 = $rounds. The bit-sliced key schedule
# lives on the stack ($keysched == sp) unless BSAES_ASM_EXTENDED_KEY,
# in which case it is cached inside the AES_KEY at $key+248 and reused.
# Tails of 1..7 blocks fall through to the .Lcbc_dec_one..six paths;
# the single-block case calls scalar AES_decrypt instead.
1081 | { | ||
1082 | my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); | ||
1083 | my ($keysched)=("sp"); | ||
1084 | |||
1085 | $code.=<<___; | ||
1086 | .extern AES_cbc_encrypt | ||
1087 | .extern AES_decrypt | ||
1088 | |||
1089 | .global bsaes_cbc_encrypt | ||
1090 | .type bsaes_cbc_encrypt,%function | ||
1091 | .align 5 | ||
1092 | bsaes_cbc_encrypt: | ||
1093 | #ifndef __KERNEL__ | ||
1094 | cmp $len, #128 | ||
1095 | #ifndef __thumb__ | ||
1096 | blo AES_cbc_encrypt | ||
1097 | #else | ||
1098 | bhs 1f | ||
1099 | b AES_cbc_encrypt | ||
1100 | 1: | ||
1101 | #endif | ||
1102 | #endif | ||
1103 | |||
1104 | @ it is up to the caller to make sure we are called with enc == 0 | ||
1105 | |||
1106 | mov ip, sp | ||
1107 | stmdb sp!, {r4-r10, lr} | ||
1108 | VFP_ABI_PUSH | ||
1109 | ldr $ivp, [ip] @ IV is 1st arg on the stack | ||
1110 | mov $len, $len, lsr#4 @ len in 16 byte blocks | ||
1111 | sub sp, #0x10 @ scratch space to carry over the IV | ||
1112 | mov $fp, sp @ save sp | ||
1113 | |||
1114 | ldr $rounds, [$key, #240] @ get # of rounds | ||
1115 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1116 | @ allocate the key schedule on the stack | ||
1117 | sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key | ||
1118 | add r12, #`128-32` @ size of bit-sliced key schedule | ||
1119 | |||
1120 | @ populate the key schedule | ||
1121 | mov r4, $key @ pass key | ||
1122 | mov r5, $rounds @ pass # of rounds | ||
1123 | mov sp, r12 @ sp is $keysched | ||
1124 | bl _bsaes_key_convert | ||
1125 | vldmia $keysched, {@XMM[6]} | ||
1126 | vstmia r12, {@XMM[15]} @ save last round key | ||
1127 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
1128 | vstmia $keysched, {@XMM[7]} | ||
1129 | #else | ||
1130 | ldr r12, [$key, #244] | ||
1131 | eors r12, #1 | ||
1132 | beq 0f | ||
1133 | |||
1134 | @ populate the key schedule | ||
1135 | str r12, [$key, #244] | ||
1136 | mov r4, $key @ pass key | ||
1137 | mov r5, $rounds @ pass # of rounds | ||
1138 | add r12, $key, #248 @ pass key schedule | ||
1139 | bl _bsaes_key_convert | ||
1140 | add r4, $key, #248 | ||
1141 | vldmia r4, {@XMM[6]} | ||
1142 | vstmia r12, {@XMM[15]} @ save last round key | ||
1143 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
1144 | vstmia r4, {@XMM[7]} | ||
1145 | |||
1146 | .align 2 | ||
1147 | 0: | ||
1148 | #endif | ||
1149 | |||
1150 | vld1.8 {@XMM[15]}, [$ivp] @ load IV | ||
1151 | b .Lcbc_dec_loop | ||
1152 | |||
1153 | .align 4 | ||
1154 | .Lcbc_dec_loop: | ||
1155 | subs $len, $len, #0x8 | ||
1156 | bmi .Lcbc_dec_loop_finish | ||
1157 | |||
1158 | vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input | ||
1159 | vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! | ||
1160 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1161 | mov r4, $keysched @ pass the key | ||
1162 | #else | ||
1163 | add r4, $key, #248 | ||
1164 | #endif | ||
1165 | vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! | ||
1166 | mov r5, $rounds | ||
1167 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp] | ||
1168 | sub $inp, $inp, #0x60 | ||
1169 | vstmia $fp, {@XMM[15]} @ put aside IV | ||
1170 | |||
1171 | bl _bsaes_decrypt8 | ||
1172 | |||
1173 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1174 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1175 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1176 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1177 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1178 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1179 | vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! | ||
1180 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1181 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1182 | vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! | ||
1183 | veor @XMM[7], @XMM[7], @XMM[12] | ||
1184 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1185 | veor @XMM[3], @XMM[3], @XMM[13] | ||
1186 | vst1.8 {@XMM[6]}, [$out]! | ||
1187 | veor @XMM[5], @XMM[5], @XMM[14] | ||
1188 | vst1.8 {@XMM[4]}, [$out]! | ||
1189 | vst1.8 {@XMM[2]}, [$out]! | ||
1190 | vst1.8 {@XMM[7]}, [$out]! | ||
1191 | vst1.8 {@XMM[3]}, [$out]! | ||
1192 | vst1.8 {@XMM[5]}, [$out]! | ||
1193 | |||
1194 | b .Lcbc_dec_loop | ||
1195 | |||
1196 | .Lcbc_dec_loop_finish: | ||
1197 | adds $len, $len, #8 | ||
1198 | beq .Lcbc_dec_done | ||
1199 | |||
1200 | vld1.8 {@XMM[0]}, [$inp]! @ load input | ||
1201 | cmp $len, #2 | ||
1202 | blo .Lcbc_dec_one | ||
1203 | vld1.8 {@XMM[1]}, [$inp]! | ||
1204 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1205 | mov r4, $keysched @ pass the key | ||
1206 | #else | ||
1207 | add r4, $key, #248 | ||
1208 | #endif | ||
1209 | mov r5, $rounds | ||
1210 | vstmia $fp, {@XMM[15]} @ put aside IV | ||
1211 | beq .Lcbc_dec_two | ||
1212 | vld1.8 {@XMM[2]}, [$inp]! | ||
1213 | cmp $len, #4 | ||
1214 | blo .Lcbc_dec_three | ||
1215 | vld1.8 {@XMM[3]}, [$inp]! | ||
1216 | beq .Lcbc_dec_four | ||
1217 | vld1.8 {@XMM[4]}, [$inp]! | ||
1218 | cmp $len, #6 | ||
1219 | blo .Lcbc_dec_five | ||
1220 | vld1.8 {@XMM[5]}, [$inp]! | ||
1221 | beq .Lcbc_dec_six | ||
1222 | vld1.8 {@XMM[6]}, [$inp]! | ||
1223 | sub $inp, $inp, #0x70 | ||
1224 | |||
1225 | bl _bsaes_decrypt8 | ||
1226 | |||
1227 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1228 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1229 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1230 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1231 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1232 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1233 | vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! | ||
1234 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1235 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1236 | vld1.8 {@XMM[15]}, [$inp]! | ||
1237 | veor @XMM[7], @XMM[7], @XMM[12] | ||
1238 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1239 | veor @XMM[3], @XMM[3], @XMM[13] | ||
1240 | vst1.8 {@XMM[6]}, [$out]! | ||
1241 | vst1.8 {@XMM[4]}, [$out]! | ||
1242 | vst1.8 {@XMM[2]}, [$out]! | ||
1243 | vst1.8 {@XMM[7]}, [$out]! | ||
1244 | vst1.8 {@XMM[3]}, [$out]! | ||
1245 | b .Lcbc_dec_done | ||
1246 | .align 4 | ||
1247 | .Lcbc_dec_six: | ||
1248 | sub $inp, $inp, #0x60 | ||
1249 | bl _bsaes_decrypt8 | ||
1250 | vldmia $fp,{@XMM[14]} @ reload IV | ||
1251 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1252 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1253 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1254 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1255 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1256 | vld1.8 {@XMM[12]}, [$inp]! | ||
1257 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1258 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1259 | vld1.8 {@XMM[15]}, [$inp]! | ||
1260 | veor @XMM[7], @XMM[7], @XMM[12] | ||
1261 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1262 | vst1.8 {@XMM[6]}, [$out]! | ||
1263 | vst1.8 {@XMM[4]}, [$out]! | ||
1264 | vst1.8 {@XMM[2]}, [$out]! | ||
1265 | vst1.8 {@XMM[7]}, [$out]! | ||
1266 | b .Lcbc_dec_done | ||
1267 | .align 4 | ||
1268 | .Lcbc_dec_five: | ||
1269 | sub $inp, $inp, #0x50 | ||
1270 | bl _bsaes_decrypt8 | ||
1271 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1272 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1273 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1274 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1275 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1276 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1277 | vld1.8 {@XMM[15]}, [$inp]! | ||
1278 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1279 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1280 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1281 | vst1.8 {@XMM[6]}, [$out]! | ||
1282 | vst1.8 {@XMM[4]}, [$out]! | ||
1283 | vst1.8 {@XMM[2]}, [$out]! | ||
1284 | b .Lcbc_dec_done | ||
1285 | .align 4 | ||
1286 | .Lcbc_dec_four: | ||
1287 | sub $inp, $inp, #0x40 | ||
1288 | bl _bsaes_decrypt8 | ||
1289 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1290 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1291 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1292 | vld1.8 {@XMM[10]}, [$inp]! | ||
1293 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1294 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1295 | vld1.8 {@XMM[15]}, [$inp]! | ||
1296 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1297 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1298 | vst1.8 {@XMM[6]}, [$out]! | ||
1299 | vst1.8 {@XMM[4]}, [$out]! | ||
1300 | b .Lcbc_dec_done | ||
1301 | .align 4 | ||
1302 | .Lcbc_dec_three: | ||
1303 | sub $inp, $inp, #0x30 | ||
1304 | bl _bsaes_decrypt8 | ||
1305 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1306 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1307 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1308 | vld1.8 {@XMM[15]}, [$inp]! | ||
1309 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1310 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1311 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1312 | vst1.8 {@XMM[6]}, [$out]! | ||
1313 | b .Lcbc_dec_done | ||
1314 | .align 4 | ||
1315 | .Lcbc_dec_two: | ||
1316 | sub $inp, $inp, #0x20 | ||
1317 | bl _bsaes_decrypt8 | ||
1318 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1319 | vld1.8 {@XMM[8]}, [$inp]! @ reload input | ||
1320 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1321 | vld1.8 {@XMM[15]}, [$inp]! @ reload input | ||
1322 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1323 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1324 | b .Lcbc_dec_done | ||
1325 | .align 4 | ||
1326 | .Lcbc_dec_one: | ||
1327 | sub $inp, $inp, #0x10 | ||
1328 | mov $rounds, $out @ save original out pointer | ||
1329 | mov $out, $fp @ use the iv scratch space as out buffer | ||
1330 | mov r2, $key | ||
1331 | vmov @XMM[4],@XMM[15] @ just in case ensure that IV | ||
1332 | vmov @XMM[5],@XMM[0] @ and input are preserved | ||
1333 | bl AES_decrypt | ||
1334 | vld1.8 {@XMM[0]}, [$fp,:64] @ load result | ||
1335 | veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV | ||
1336 | vmov @XMM[15], @XMM[5] @ @XMM[5] holds input | ||
1337 | vst1.8 {@XMM[0]}, [$rounds] @ write output | ||
1338 | |||
1339 | .Lcbc_dec_done: | ||
1340 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1341 | vmov.i32 q0, #0 | ||
1342 | vmov.i32 q1, #0 | ||
1343 | .Lcbc_dec_bzero: @ wipe key schedule [if any] | ||
1344 | vstmia $keysched!, {q0-q1} | ||
1345 | cmp $keysched, $fp | ||
1346 | bne .Lcbc_dec_bzero | ||
1347 | #endif | ||
1348 | |||
1349 | mov sp, $fp | ||
1350 | add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb | ||
1351 | vst1.8 {@XMM[15]}, [$ivp] @ return IV | ||
1352 | VFP_ABI_POP | ||
1353 | ldmia sp!, {r4-r10, pc} | ||
1354 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
1355 | ___ | ||
1356 | } | ||
# bsaes_ctr32_encrypt_blocks — CTR mode with a 32-bit big-endian counter.
# Register map: r0-r3 = inp, out, len (in blocks), key; r8 = $ctr
# (counter pointer, 1st stack arg, later borrowed to point at .LREVM0SR);
# r9 = $fp (16-byte scratch carrying the next counter); r10 = $rounds;
# r6 = $const, shared with _bsaes_encrypt8_alt. Eight counter values are
# generated per iteration with vadd.u32 and encrypted via the _alt entry,
# whose prologue permutation doubles as the counter byte-swap (q8 is
# pre-loaded with .LREVM0SR). Inputs below 8 blocks take the scalar
# .Lctr_enc_short path via AES_encrypt.
1357 | { | ||
1358 | my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); | ||
1359 | my $const = "r6"; # shared with _bsaes_encrypt8_alt | ||
1360 | my $keysched = "sp"; | ||
1361 | |||
1362 | $code.=<<___; | ||
1363 | .extern AES_encrypt | ||
1364 | .global bsaes_ctr32_encrypt_blocks | ||
1365 | .type bsaes_ctr32_encrypt_blocks,%function | ||
1366 | .align 5 | ||
1367 | bsaes_ctr32_encrypt_blocks: | ||
1368 | cmp $len, #8 @ use plain AES for | ||
1369 | blo .Lctr_enc_short @ small sizes | ||
1370 | |||
1371 | mov ip, sp | ||
1372 | stmdb sp!, {r4-r10, lr} | ||
1373 | VFP_ABI_PUSH | ||
1374 | ldr $ctr, [ip] @ ctr is 1st arg on the stack | ||
1375 | sub sp, sp, #0x10 @ scratch space to carry over the ctr | ||
1376 | mov $fp, sp @ save sp | ||
1377 | |||
1378 | ldr $rounds, [$key, #240] @ get # of rounds | ||
1379 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1380 | @ allocate the key schedule on the stack | ||
1381 | sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key | ||
1382 | add r12, #`128-32` @ size of bit-sliced key schedule | ||
1383 | |||
1384 | @ populate the key schedule | ||
1385 | mov r4, $key @ pass key | ||
1386 | mov r5, $rounds @ pass # of rounds | ||
1387 | mov sp, r12 @ sp is $keysched | ||
1388 | bl _bsaes_key_convert | ||
1389 | veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key | ||
1390 | vstmia r12, {@XMM[7]} @ save last round key | ||
1391 | |||
1392 | vld1.8 {@XMM[0]}, [$ctr] @ load counter | ||
1393 | add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr | ||
1394 | vldmia $keysched, {@XMM[4]} @ load round0 key | ||
1395 | #else | ||
1396 | ldr r12, [$key, #244] | ||
1397 | eors r12, #1 | ||
1398 | beq 0f | ||
1399 | |||
1400 | @ populate the key schedule | ||
1401 | str r12, [$key, #244] | ||
1402 | mov r4, $key @ pass key | ||
1403 | mov r5, $rounds @ pass # of rounds | ||
1404 | add r12, $key, #248 @ pass key schedule | ||
1405 | bl _bsaes_key_convert | ||
1406 | veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key | ||
1407 | vstmia r12, {@XMM[7]} @ save last round key | ||
1408 | |||
1409 | .align 2 | ||
1410 | 0: add r12, $key, #248 | ||
1411 | vld1.8 {@XMM[0]}, [$ctr] @ load counter | ||
1412 | adrl $ctr, .LREVM0SR @ borrow $ctr | ||
1413 | vldmia r12, {@XMM[4]} @ load round0 key | ||
1414 | sub sp, #0x10 @ place for adjusted round0 key | ||
1415 | #endif | ||
1416 | |||
1417 | vmov.i32 @XMM[8],#1 @ compose 1<<96 | ||
1418 | veor @XMM[9],@XMM[9],@XMM[9] | ||
1419 | vrev32.8 @XMM[0],@XMM[0] | ||
1420 | vext.8 @XMM[8],@XMM[9],@XMM[8],#4 | ||
1421 | vrev32.8 @XMM[4],@XMM[4] | ||
1422 | vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 | ||
1423 | vstmia $keysched, {@XMM[4]} @ save adjusted round0 key | ||
1424 | b .Lctr_enc_loop | ||
1425 | |||
1426 | .align 4 | ||
1427 | .Lctr_enc_loop: | ||
1428 | vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96 | ||
1429 | vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 | ||
1430 | vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 | ||
1431 | vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 | ||
1432 | vadd.u32 @XMM[4], @XMM[1], @XMM[10] | ||
1433 | vadd.u32 @XMM[5], @XMM[2], @XMM[10] | ||
1434 | vadd.u32 @XMM[6], @XMM[3], @XMM[10] | ||
1435 | vadd.u32 @XMM[7], @XMM[4], @XMM[10] | ||
1436 | vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter | ||
1437 | |||
1438 | @ Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
1439 | @ to flip byte order in 32-bit counter | ||
1440 | |||
1441 | vldmia $keysched, {@XMM[9]} @ load round0 key | ||
1442 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1443 | add r4, $keysched, #0x10 @ pass next round key | ||
1444 | #else | ||
1445 | add r4, $key, #`248+16` | ||
1446 | #endif | ||
1447 | vldmia $ctr, {@XMM[8]} @ .LREVM0SR | ||
1448 | mov r5, $rounds @ pass rounds | ||
1449 | vstmia $fp, {@XMM[10]} @ save next counter | ||
1450 | sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants | ||
1451 | |||
1452 | bl _bsaes_encrypt8_alt | ||
1453 | |||
1454 | subs $len, $len, #8 | ||
1455 | blo .Lctr_enc_loop_done | ||
1456 | |||
1457 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input | ||
1458 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1459 | veor @XMM[0], @XMM[8] | ||
1460 | veor @XMM[1], @XMM[9] | ||
1461 | vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! | ||
1462 | veor @XMM[4], @XMM[10] | ||
1463 | veor @XMM[6], @XMM[11] | ||
1464 | vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! | ||
1465 | veor @XMM[3], @XMM[12] | ||
1466 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1467 | veor @XMM[7], @XMM[13] | ||
1468 | veor @XMM[2], @XMM[14] | ||
1469 | vst1.8 {@XMM[4]}, [$out]! | ||
1470 | veor @XMM[5], @XMM[15] | ||
1471 | vst1.8 {@XMM[6]}, [$out]! | ||
1472 | vmov.i32 @XMM[8], #1 @ compose 1<<96 | ||
1473 | vst1.8 {@XMM[3]}, [$out]! | ||
1474 | veor @XMM[9], @XMM[9], @XMM[9] | ||
1475 | vst1.8 {@XMM[7]}, [$out]! | ||
1476 | vext.8 @XMM[8], @XMM[9], @XMM[8], #4 | ||
1477 | vst1.8 {@XMM[2]}, [$out]! | ||
1478 | vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 | ||
1479 | vst1.8 {@XMM[5]}, [$out]! | ||
1480 | vldmia $fp, {@XMM[0]} @ load counter | ||
1481 | |||
1482 | bne .Lctr_enc_loop | ||
1483 | b .Lctr_enc_done | ||
1484 | |||
1485 | .align 4 | ||
1486 | .Lctr_enc_loop_done: | ||
1487 | add $len, $len, #8 | ||
1488 | vld1.8 {@XMM[8]}, [$inp]! @ load input | ||
1489 | veor @XMM[0], @XMM[8] | ||
1490 | vst1.8 {@XMM[0]}, [$out]! @ write output | ||
1491 | cmp $len, #2 | ||
1492 | blo .Lctr_enc_done | ||
1493 | vld1.8 {@XMM[9]}, [$inp]! | ||
1494 | veor @XMM[1], @XMM[9] | ||
1495 | vst1.8 {@XMM[1]}, [$out]! | ||
1496 | beq .Lctr_enc_done | ||
1497 | vld1.8 {@XMM[10]}, [$inp]! | ||
1498 | veor @XMM[4], @XMM[10] | ||
1499 | vst1.8 {@XMM[4]}, [$out]! | ||
1500 | cmp $len, #4 | ||
1501 | blo .Lctr_enc_done | ||
1502 | vld1.8 {@XMM[11]}, [$inp]! | ||
1503 | veor @XMM[6], @XMM[11] | ||
1504 | vst1.8 {@XMM[6]}, [$out]! | ||
1505 | beq .Lctr_enc_done | ||
1506 | vld1.8 {@XMM[12]}, [$inp]! | ||
1507 | veor @XMM[3], @XMM[12] | ||
1508 | vst1.8 {@XMM[3]}, [$out]! | ||
1509 | cmp $len, #6 | ||
1510 | blo .Lctr_enc_done | ||
1511 | vld1.8 {@XMM[13]}, [$inp]! | ||
1512 | veor @XMM[7], @XMM[13] | ||
1513 | vst1.8 {@XMM[7]}, [$out]! | ||
1514 | beq .Lctr_enc_done | ||
1515 | vld1.8 {@XMM[14]}, [$inp] | ||
1516 | veor @XMM[2], @XMM[14] | ||
1517 | vst1.8 {@XMM[2]}, [$out]! | ||
1518 | |||
1519 | .Lctr_enc_done: | ||
1520 | vmov.i32 q0, #0 | ||
1521 | vmov.i32 q1, #0 | ||
1522 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1523 | .Lctr_enc_bzero: @ wipe key schedule [if any] | ||
1524 | vstmia $keysched!, {q0-q1} | ||
1525 | cmp $keysched, $fp | ||
1526 | bne .Lctr_enc_bzero | ||
1527 | #else | ||
1528 | vstmia $keysched, {q0-q1} | ||
1529 | #endif | ||
1530 | |||
1531 | mov sp, $fp | ||
1532 | add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb | ||
1533 | VFP_ABI_POP | ||
1534 | ldmia sp!, {r4-r10, pc} @ return | ||
1535 | |||
1536 | .align 4 | ||
1537 | .Lctr_enc_short: | ||
1538 | ldr ip, [sp] @ ctr pointer is passed on stack | ||
1539 | stmdb sp!, {r4-r8, lr} | ||
1540 | |||
1541 | mov r4, $inp @ copy arguments | ||
1542 | mov r5, $out | ||
1543 | mov r6, $len | ||
1544 | mov r7, $key | ||
1545 | ldr r8, [ip, #12] @ load counter LSW | ||
1546 | vld1.8 {@XMM[1]}, [ip] @ load whole counter value | ||
1547 | #ifdef __ARMEL__ | ||
1548 | rev r8, r8 | ||
1549 | #endif | ||
1550 | sub sp, sp, #0x10 | ||
1551 | vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value | ||
1552 | sub sp, sp, #0x10 | ||
1553 | |||
1554 | .Lctr_enc_short_loop: | ||
1555 | add r0, sp, #0x10 @ input counter value | ||
1556 | mov r1, sp @ output on the stack | ||
1557 | mov r2, r7 @ key | ||
1558 | |||
1559 | bl AES_encrypt | ||
1560 | |||
1561 | vld1.8 {@XMM[0]}, [r4]! @ load input | ||
1562 | vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter | ||
1563 | add r8, r8, #1 | ||
1564 | #ifdef __ARMEL__ | ||
1565 | rev r0, r8 | ||
1566 | str r0, [sp, #0x1c] @ next counter value | ||
1567 | #else | ||
1568 | str r8, [sp, #0x1c] @ next counter value | ||
1569 | #endif | ||
1570 | veor @XMM[0],@XMM[0],@XMM[1] | ||
1571 | vst1.8 {@XMM[0]}, [r5]! @ store output | ||
1572 | subs r6, r6, #1 | ||
1573 | bne .Lctr_enc_short_loop | ||
1574 | |||
1575 | vmov.i32 q0, #0 | ||
1576 | vmov.i32 q1, #0 | ||
1577 | vstmia sp!, {q0-q1} | ||
1578 | |||
1579 | ldmia sp!, {r4-r8, pc} | ||
1580 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
1581 | ___ | ||
1582 | } | ||
1583 | { | ||
1584 | ###################################################################### | ||
1585 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
1586 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1587 | # const unsigned char iv[16]); | ||
1588 | # | ||
1589 | my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3))); | ||
1590 | my $const="r6"; # returned by _bsaes_key_convert | ||
1591 | my $twmask=@XMM[5]; | ||
1592 | my @T=@XMM[6..7]; | ||
1593 | |||
# Emit the bsaes_xts_encrypt prologue: save registers, generate (or, with
# XTS_CHAIN_TWEAK, receive) the initial tweak, build the bit-sliced key
# schedule and branch to the bulk loop.  The heredoc below is emitted
# verbatim into the generated .S file.
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,%function
.align	4
bsaes_xts_encrypt:
	mov	ip, sp
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	VFP_ABI_PUSH
	mov	r6, sp				@ future $fp

	mov	$inp, r0
	mov	$out, r1
	mov	$len, r2
	mov	$key, r3

	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes
	mov	sp, r0

#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
#else
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	mov	r1, sp
	ldr	r2, [ip, #0]			@ key2
	bl	AES_encrypt
	mov	r0,sp				@ pointer to initial tweak
#endif

	ldr	$rounds, [$key, #240]		@ get # of rounds
	mov	$fp, r6
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
	sub	r12, #`32+16`			@ place for tweak[9]

	@ populate the key schedule
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	mov	sp, r12
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
	vstmia	r12, {@XMM[7]}			@ save last round key
#else
	ldr	r12, [$key, #244]
	eors	r12, #1
	beq	0f

	str	r12, [$key, #244]
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	add	r12, $key, #248			@ pass key schedule
	bl	_bsaes_key_convert
	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
	vstmia	r12, {@XMM[7]}

.align	2
0:	sub	sp, #0x90			@ place for tweak[9]
#endif

	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
	adr	$magic, .Lxts_magic

	subs	$len, #0x80
	blo	.Lxts_enc_short
	b	.Lxts_enc_loop

.align	4
.Lxts_enc_loop:
	vldmia		$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov		r0, sp
	vand		@T[0], @T[0], $twmask
___
# Unrolled generation of tweaks 2..8 for the 8-block encrypt path: each
# iteration doubles the previous tweak in GF(2^128) (vadd + conditional
# XOR with the magic constant selected by the sign mask), stores it to the
# tweak[] area at sp, and interleaves loading/whitening of input blocks.
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
	# swap roles of the two scratch registers for the next iteration
	@T=reverse(@T);

# from the 2nd iteration on, load the next input block...
$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
___
# ...and from the 3rd on, XOR the previously loaded block with its tweak
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
# Finish the 8-block encrypt iteration: compute and stash the tweak for the
# next round, whiten the last inputs, run _bsaes_encrypt8, then XOR the
# outputs with the saved tweaks and store them.  Falls through into the
# short-input dispatcher when fewer than 8 blocks remain.
$code.=<<___;
	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
	vst1.64		{@XMM[15]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor		@XMM[8], @XMM[8], @T[0]
	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak

	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov		r5, $rounds			@ pass rounds
	veor		@XMM[7], @XMM[7], @XMM[15]
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[2], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	veor		@XMM[13], @XMM[5], @XMM[15]
	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak

	subs		$len, #0x80
	bpl		.Lxts_enc_loop

.Lxts_enc_short:
	adds		$len, #0x70
	bmi		.Lxts_enc_done

	vldmia		$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov		r0, sp
	vand		@T[0], @T[0], $twmask
___
# Same tweak-doubling unroll as the bulk loop, but for the short path:
# after each block load, the remaining length is checked and control
# branches to the matching .Lxts_enc_<n> tail (n = blocks gathered so far).
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
	# swap the scratch registers for the next chained doubling
	@T=reverse(@T);

$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
	subs		$len, #0x10
	bmi		.Lxts_enc_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
# Emit the 7-block tail, the .Lxts_enc_[1-6] tails (each encrypts the
# gathered blocks via _bsaes_encrypt8, or AES_encrypt for a single block),
# the ciphertext-stealing epilogue and register restore for encryption,
# followed by the bsaes_xts_decrypt prologue up to the start of its bulk
# tweak generation.  .Lxts_magic is placed between the tails so the adr
# in both functions can reach it in ARM and Thumb mode.
$code.=<<___;
	sub		$len, #0x10
	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak

	vld1.8		{@XMM[6]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	vld1.64		{@XMM[14]}, [r0,:128]!
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[2], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	vst1.8		{@XMM[12]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_enc_done
.align	4
.Lxts_enc_6:
	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak

	veor		@XMM[4], @XMM[4], @XMM[12]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[5], @XMM[5], @XMM[13]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_enc_done

@ put this in range for both ARM and Thumb mode adr instructions
.align	5
.Lxts_magic:
	.quad	1, 0x87

.align	5
.Lxts_enc_5:
	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak

	veor		@XMM[3], @XMM[3], @XMM[11]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[4], @XMM[4], @XMM[12]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	veor		@XMM[10], @XMM[3], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	vst1.8		{@XMM[10]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_enc_done
.align	4
.Lxts_enc_4:
	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak

	veor		@XMM[2], @XMM[2], @XMM[10]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[3], @XMM[3], @XMM[11]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[6], @XMM[11]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_enc_done
.align	4
.Lxts_enc_3:
	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak

	veor		@XMM[1], @XMM[1], @XMM[9]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[2], @XMM[2], @XMM[10]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64		{@XMM[10]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[4], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	vst1.8		{@XMM[8]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_enc_done
.align	4
.Lxts_enc_2:
	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak

	veor		@XMM[0], @XMM[0], @XMM[8]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[1], @XMM[1], @XMM[9]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_encrypt8

	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_enc_done
.align	4
.Lxts_enc_1:
	mov		r0, sp
	veor		@XMM[0], @XMM[8]
	mov		r1, sp
	vst1.8		{@XMM[0]}, [sp,:128]
	mov		r2, $key
	mov		r4, $fp				@ preserve fp

	bl		AES_encrypt

	vld1.8		{@XMM[0]}, [sp,:128]
	veor		@XMM[0], @XMM[0], @XMM[8]
	vst1.8		{@XMM[0]}, [$out]!
	mov		$fp, r4

	vmov		@XMM[8], @XMM[9]		@ next round tweak

.Lxts_enc_done:
#ifndef	XTS_CHAIN_TWEAK
	adds		$len, #0x10
	beq		.Lxts_enc_ret
	sub		r6, $out, #0x10

.Lxts_enc_steal:
	ldrb		r0, [$inp], #1
	ldrb		r1, [$out, #-0x10]
	strb		r0, [$out, #-0x10]
	strb		r1, [$out], #1

	subs		$len, #1
	bhi		.Lxts_enc_steal

	vld1.8		{@XMM[0]}, [r6]
	mov		r0, sp
	veor		@XMM[0], @XMM[0], @XMM[8]
	mov		r1, sp
	vst1.8		{@XMM[0]}, [sp,:128]
	mov		r2, $key
	mov		r4, $fp			@ preserve fp

	bl		AES_encrypt

	vld1.8		{@XMM[0]}, [sp,:128]
	veor		@XMM[0], @XMM[0], @XMM[8]
	vst1.8		{@XMM[0]}, [r6]
	mov		$fp, r4
#endif

.Lxts_enc_ret:
	bic		r0, $fp, #0xf
	vmov.i32	q0, #0
	vmov.i32	q1, #0
#ifdef	XTS_CHAIN_TWEAK
	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
#endif
.Lxts_enc_bzero:				@ wipe key schedule [if any]
	vstmia		sp!, {q0-q1}
	cmp		sp, r0
	bne		.Lxts_enc_bzero

	mov		sp, $fp
#ifdef	XTS_CHAIN_TWEAK
	vst1.8		{@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia		sp!, {r4-r10, pc}	@ return

.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,%function
.align	4
bsaes_xts_decrypt:
	mov	ip, sp
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	VFP_ABI_PUSH
	mov	r6, sp				@ future $fp

	mov	$inp, r0
	mov	$out, r1
	mov	$len, r2
	mov	$key, r3

	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes
	mov	sp, r0

#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
#else
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	mov	r1, sp
	ldr	r2, [ip, #0]			@ key2
	bl	AES_encrypt
	mov	r0, sp				@ pointer to initial tweak
#endif

	ldr	$rounds, [$key, #240]		@ get # of rounds
	mov	$fp, r6
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
	sub	r12, #`32+16`			@ place for tweak[9]

	@ populate the key schedule
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	mov	sp, r12
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	add	r4, sp, #0x90
	vldmia	r4, {@XMM[6]}
	vstmia	r12,  {@XMM[15]}		@ save last round key
	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
	vstmia	r4, {@XMM[7]}
#else
	ldr	r12, [$key, #244]
	eors	r12, #1
	beq	0f

	str	r12, [$key, #244]
	mov	r4, $key			@ pass key
	mov	r5, $rounds			@ pass # of rounds
	add	r12, $key, #248			@ pass key schedule
	bl	_bsaes_key_convert
	add	r4, $key, #248
	vldmia	r4, {@XMM[6]}
	vstmia	r12,  {@XMM[15]}		@ save last round key
	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
	vstmia	r4, {@XMM[7]}

.align	2
0:	sub	sp, #0x90			@ place for tweak[9]
#endif
	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
	adr	$magic, .Lxts_magic

	tst	$len, #0xf			@ if not multiple of 16
	it	ne				@ Thumb2 thing, sanity check in ARM
	subne	$len, #0x10			@ subtract another 16 bytes
	subs	$len, #0x80

	blo	.Lxts_dec_short
	b	.Lxts_dec_loop

.align	4
.Lxts_dec_loop:
	vldmia		$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov		r0, sp
	vand		@T[0], @T[0], $twmask
___
# Tweak generation for the 8-block decrypt bulk loop — identical structure
# to the encrypt-side unroll: double the previous tweak in GF(2^128),
# store it, and interleave input block loads and tweak whitening.
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
	# swap roles of the two scratch registers for the next iteration
	@T=reverse(@T);

$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
# Finish the 8-block decrypt iteration: save the next-round tweak, run
# _bsaes_decrypt8, un-whiten the outputs with the saved tweaks (note the
# decrypt output ordering 0,1,6,4,2,7,3,5) and store them; then fall into
# the short-input dispatcher.
$code.=<<___;
	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
	vst1.64		{@XMM[15]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	veor		@XMM[8], @XMM[8], @T[0]
	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak

	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov		r5, $rounds			@ pass rounds
	veor		@XMM[7], @XMM[7], @XMM[15]
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[3], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	veor		@XMM[13], @XMM[5], @XMM[15]
	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]	@ next round tweak

	subs		$len, #0x80
	bpl		.Lxts_dec_loop

.Lxts_dec_short:
	adds		$len, #0x70
	bmi		.Lxts_dec_done

	vldmia		$magic, {$twmask}	@ load XTS magic
	vshr.s64	@T[0], @XMM[8], #63
	mov		r0, sp
	vand		@T[0], @T[0], $twmask
___
# Short-path tweak generation for decrypt: after each block load the
# remaining length is tested and control branches to the matching
# .Lxts_dec_<n> tail for the number of blocks gathered so far.
for($i=9;$i<16;$i++) {
$code.=<<___;
	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
	vst1.64		{@XMM[$i-1]}, [r0,:128]!
	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
	vshr.s64	@T[1], @XMM[$i], #63
	veor		@XMM[$i], @XMM[$i], @T[0]
	vand		@T[1], @T[1], $twmask
___
	# swap the scratch registers for the next chained doubling
	@T=reverse(@T);

$code.=<<___ if ($i>=10);
	vld1.8		{@XMM[$i-10]}, [$inp]!
	subs		$len, #0x10
	bmi		.Lxts_dec_`$i-9`
___
$code.=<<___ if ($i>=11);
	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
___
}
# Emit the 7-block decrypt tail, the .Lxts_dec_[1-6] tails, the
# ciphertext-stealing epilogue (which must compute one extra tweak and
# decrypt the *last full block* with it before the steal loop), and the
# register-restore/bzero exit path.  The closing brace ends the lexical
# scope holding the XTS register-name variables.
$code.=<<___;
	sub		$len, #0x10
	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak

	vld1.8		{@XMM[6]}, [$inp]!
	veor		@XMM[5], @XMM[5], @XMM[13]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[6], @XMM[6], @XMM[14]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	vld1.64		{@XMM[14]}, [r0,:128]!
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	veor		@XMM[12], @XMM[3], @XMM[14]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
	vst1.8		{@XMM[12]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_dec_done
.align	4
.Lxts_dec_6:
	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak

	veor		@XMM[4], @XMM[4], @XMM[12]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[5], @XMM[5], @XMM[13]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	veor		@XMM[11], @XMM[7], @XMM[13]
	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_dec_done
.align	4
.Lxts_dec_5:
	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak

	veor		@XMM[3], @XMM[3], @XMM[11]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[4], @XMM[4], @XMM[12]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	vld1.64		{@XMM[12]}, [r0,:128]!
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	veor		@XMM[10], @XMM[2], @XMM[12]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
	vst1.8		{@XMM[10]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_dec_done
.align	4
.Lxts_dec_4:
	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak

	veor		@XMM[2], @XMM[2], @XMM[10]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[3], @XMM[3], @XMM[11]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	veor		@XMM[9], @XMM[4], @XMM[11]
	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_dec_done
.align	4
.Lxts_dec_3:
	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak

	veor		@XMM[1], @XMM[1], @XMM[9]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[2], @XMM[2], @XMM[10]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	vld1.64		{@XMM[10]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	veor		@XMM[8], @XMM[6], @XMM[10]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
	vst1.8		{@XMM[8]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_dec_done
.align	4
.Lxts_dec_2:
	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak

	veor		@XMM[0], @XMM[0], @XMM[8]
#ifndef	BSAES_ASM_EXTENDED_KEY
	add		r4, sp, #0x90			@ pass key schedule
#else
	add		r4, $key, #248			@ pass key schedule
#endif
	veor		@XMM[1], @XMM[1], @XMM[9]
	mov		r5, $rounds			@ pass rounds
	mov		r0, sp

	bl		_bsaes_decrypt8

	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
	veor		@XMM[0], @XMM[0], @XMM[ 8]
	veor		@XMM[1], @XMM[1], @XMM[ 9]
	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!

	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
	b		.Lxts_dec_done
.align	4
.Lxts_dec_1:
	mov		r0, sp
	veor		@XMM[0], @XMM[8]
	mov		r1, sp
	vst1.8		{@XMM[0]}, [sp,:128]
	mov		r2, $key
	mov		r4, $fp				@ preserve fp
	mov		r5, $magic			@ preserve magic

	bl		AES_decrypt

	vld1.8		{@XMM[0]}, [sp,:128]
	veor		@XMM[0], @XMM[0], @XMM[8]
	vst1.8		{@XMM[0]}, [$out]!
	mov		$fp, r4
	mov		$magic, r5

	vmov		@XMM[8], @XMM[9]		@ next round tweak

.Lxts_dec_done:
#ifndef	XTS_CHAIN_TWEAK
	adds		$len, #0x10
	beq		.Lxts_dec_ret

	@ calculate one round of extra tweak for the stolen ciphertext
	vldmia		$magic, {$twmask}
	vshr.s64	@XMM[6], @XMM[8], #63
	vand		@XMM[6], @XMM[6], $twmask
	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
	veor		@XMM[9], @XMM[9], @XMM[6]

	@ perform the final decryption with the last tweak value
	vld1.8		{@XMM[0]}, [$inp]!
	mov		r0, sp
	veor		@XMM[0], @XMM[0], @XMM[9]
	mov		r1, sp
	vst1.8		{@XMM[0]}, [sp,:128]
	mov		r2, $key
	mov		r4, $fp			@ preserve fp

	bl		AES_decrypt

	vld1.8		{@XMM[0]}, [sp,:128]
	veor		@XMM[0], @XMM[0], @XMM[9]
	vst1.8		{@XMM[0]}, [$out]

	mov		r6, $out
.Lxts_dec_steal:
	ldrb		r1, [$out]
	ldrb		r0, [$inp], #1
	strb		r1, [$out, #0x10]
	strb		r0, [$out], #1

	subs		$len, #1
	bhi		.Lxts_dec_steal

	vld1.8		{@XMM[0]}, [r6]
	mov		r0, sp
	veor		@XMM[0], @XMM[8]
	mov		r1, sp
	vst1.8		{@XMM[0]}, [sp,:128]
	mov		r2, $key

	bl		AES_decrypt

	vld1.8		{@XMM[0]}, [sp,:128]
	veor		@XMM[0], @XMM[0], @XMM[8]
	vst1.8		{@XMM[0]}, [r6]
	mov		$fp, r4
#endif

.Lxts_dec_ret:
	bic		r0, $fp, #0xf
	vmov.i32	q0, #0
	vmov.i32	q1, #0
#ifdef	XTS_CHAIN_TWEAK
	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
#endif
.Lxts_dec_bzero:				@ wipe key schedule [if any]
	vstmia		sp!, {q0-q1}
	cmp		sp, r0
	bne		.Lxts_dec_bzero

	mov		sp, $fp
#ifdef	XTS_CHAIN_TWEAK
	vst1.8		{@XMM[8]}, [r1]
#endif
	VFP_ABI_POP
	ldmia		sp!, {r4-r10, pc}	@ return

.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
# Close the #if __ARM_MAX_ARCH__>=7 guard opened earlier in the file.
$code .= "#endif\n";
2454 | |||
# Post-process the accumulated assembly: evaluate every `...` fragment
# (compile-time arithmetic and the Dhi()/Dlo() register helpers used
# above) and splice the result into the text.  /e evaluates the
# replacement as Perl, /g applies it everywhere, /m is multi-line.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2456 | |||
# Reproduce this script's own leading comment banner at the top of the
# generated file, rewriting Perl '#' comments into the ARM assembler
# comment character '@'.  The shebang line is skipped; copying stops at
# the first line that is neither a '#' comment nor blank.  Note the s///
# rewrites $_ in place before it is printed.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
2464 | |||
# Emit the generated assembly to stdout.  Check the result of closing
# STDOUT: a short write (full disk, closed pipe) would otherwise go
# unnoticed and leave a silently truncated .S file for the build to
# assemble.
print $code;

close STDOUT or die "error closing STDOUT: $!";