author		Russell King <rmk+kernel@arm.linux.org.uk>	2013-11-12 05:58:59 -0500
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2013-11-12 05:58:59 -0500
commit		df762eccbadf87850fbee444d729e0f1b1e946f1 (patch)
tree		1bf47bbbd4ea91e343f983b3b50ec2ec73a739e1
parent		ec1e20a02fe33b767ffcca8920a32211492416d7 (diff)
parent		70d42126877b9faa272d446a6de5917614c28dd9 (diff)
Merge branch 'devel-stable' into for-next
Conflicts:
arch/arm/include/asm/atomic.h
arch/arm/include/asm/hardirq.h
arch/arm/kernel/smp.c
80 files changed, 7571 insertions(+), 245 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index df0c609272e5..1dbb58c1feed 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -52,6 +52,8 @@ config ARM
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_PERF_EVENTS
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UID16
@@ -482,6 +484,7 @@ config ARCH_IXP4XX
 	bool "IXP4xx-based"
 	depends on MMU
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARCH_REQUIRE_GPIOLIB
 	select CLKSRC_MMIO
 	select CPU_XSCALE
@@ -1545,6 +1548,32 @@ config MCPM
 	  for (multi-)cluster based systems, such as big.LITTLE based
 	  systems.
 
+config BIG_LITTLE
+	bool "big.LITTLE support (Experimental)"
+	depends on CPU_V7 && SMP
+	select MCPM
+	help
+	  This option enables support selections for the big.LITTLE
+	  system architecture.
+
+config BL_SWITCHER
+	bool "big.LITTLE switcher support"
+	depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+	select CPU_PM
+	select ARM_CPU_SUSPEND
+	help
+	  The big.LITTLE "switcher" provides the core functionality to
+	  transparently handle transition between a cluster of A15's
+	  and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+	tristate "Simple big.LITTLE switcher user interface"
+	depends on BL_SWITCHER && DEBUG_KERNEL
+	help
+	  This is a simple and dummy char dev interface to control
+	  the big.LITTLE switcher core code. It is meant for
+	  debugging purposes only.
+
 choice
 	prompt "Memory split"
 	default VMSPLIT_3G
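[Editor's note, not part of the patch: as an illustration, a configuration fragment enabling the options added above on a dual-cluster platform might look like the following; BL_SWITCHER_DUMMY_IF is tristate and additionally depends on DEBUG_KERNEL.]

CONFIG_BIG_LITTLE=y
CONFIG_BL_SWITCHER=y
CONFIG_DEBUG_KERNEL=y
CONFIG_BL_SWITCHER_DUMMY_IF=m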
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index db50b626be98..25f45256f098 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -16,6 +16,7 @@ LDFLAGS		:=
 LDFLAGS_vmlinux	:=-p --no-undefined -X
 ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
 LDFLAGS_vmlinux	+= --be8
+LDFLAGS_MODULE	+= --be8
 endif
 
 OBJCOPYFLAGS	:=-O binary -R .comment -S
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 75189f13cf54..066b03480b63 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -135,6 +135,7 @@ start:
 		.word	_edata			@ zImage end address
 THUMB(		.thumb			)
 1:
+ ARM_BE8(	setend	be )			@ go BE8 if compiled for BE8
 		mrs	r9, cpsr
 #ifdef CONFIG_ARM_VIRT_EXT
 		bl	__hyp_stub_install	@ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
 		mrc	p15, 0, r0, c1, c0, 0	@ read control reg
 		orr	r0, r0, #0x5000		@ I-cache enable, RR cache replacement
 		orr	r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		bl	__common_mmu_cache_on
 		mov	r0, #0
 		mcr	p15, 0, r0, c8, c7, 0	@ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
 		orr	r0, r0, #1 << 22	@ U (v6 unaligned access model)
 						@ (needed for ARM1176)
 #ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		mrcne	p15, 0, r6, c2, c0, 2	@ read ttb control reg
 		orrne	r0, r0, #1		@ MMU enabled
 		movne	r1, #0xfffffffd		@ domain 0 = client
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 8c60f473e976..5c8584c4944d 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -17,3 +17,5 @@ obj-$(CONFIG_MCPM)		+= mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 AFLAGS_mcpm_head.o		:= -march=armv7-a
 AFLAGS_vlock.o			:= -march=armv7-a
 obj-$(CONFIG_TI_PRIV_EDMA)	+= edma.o
+obj-$(CONFIG_BL_SWITCHER)	+= bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
new file mode 100644
index 000000000000..5774b6ea7ad5
--- /dev/null
+++ b/arch/arm/common/bL_switcher.c
@@ -0,0 +1,822 @@
+/*
+ * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
+ *
+ * Created by:	Nicolas Pitre, March 2012
+ * Copyright:	(C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp_plat.h>
+#include <asm/cputype.h>
+#include <asm/suspend.h>
+#include <asm/mcpm.h>
+#include <asm/bL_switcher.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power_cpu_migrate.h>
+
+
+/*
+ * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
+ * __attribute_const__ and we don't want the compiler to assume any
+ * constness here as the value _does_ change along some code paths.
+ */
+
+static int read_mpidr(void)
+{
+	unsigned int id;
+	asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
+	return id & MPIDR_HWID_BITMASK;
+}
+
+/*
+ * Get a global nanosecond time stamp for tracing.
+ */
+static s64 get_ns(void)
+{
+	struct timespec ts;
+	getnstimeofday(&ts);
+	return timespec_to_ns(&ts);
+}
+
+/*
+ * bL switcher core code.
+ */
+
+static void bL_do_switch(void *_arg)
+{
+	unsigned ib_mpidr, ib_cpu, ib_cluster;
+	long volatile handshake, **handshake_ptr = _arg;
+
+	pr_debug("%s\n", __func__);
+
+	ib_mpidr = cpu_logical_map(smp_processor_id());
+	ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+	ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+	/* Advertise our handshake location */
+	if (handshake_ptr) {
+		handshake = 0;
+		*handshake_ptr = &handshake;
+	} else
+		handshake = -1;
+
+	/*
+	 * Our state has been saved at this point.  Let's release our
+	 * inbound CPU.
+	 */
+	mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
+	sev();
+
+	/*
+	 * From this point, we must assume that our counterpart CPU might
+	 * have taken over in its parallel world already, as if execution
+	 * just returned from cpu_suspend().  It is therefore important to
+	 * be very careful not to make any change the other guy is not
+	 * expecting.  This is why we need stack isolation.
+	 *
+	 * Fancy under cover tasks could be performed here.  For now
+	 * we have none.
+	 */
+
+	/*
+	 * Let's wait until our inbound is alive.
+	 */
+	while (!handshake) {
+		wfe();
+		smp_mb();
+	}
+
+	/* Let's put ourself down. */
+	mcpm_cpu_power_down();
+
+	/* should never get here */
+	BUG();
+}
+
+/*
+ * Stack isolation.  To ensure 'current' remains valid, we just use another
+ * piece of our thread's stack space which should be fairly lightly used.
+ * The selected area starts just above the thread_info structure located
+ * at the very bottom of the stack, aligned to a cache line, and indexed
+ * with the cluster number.
+ */
+#define STACK_SIZE 512
+extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
+static int bL_switchpoint(unsigned long _arg)
+{
+	unsigned int mpidr = read_mpidr();
+	unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	void *stack = current_thread_info() + 1;
+	stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
+	stack += clusterid * STACK_SIZE + STACK_SIZE;
+	call_with_stack(bL_do_switch, (void *)_arg, stack);
+	BUG();
+}
+
+/*
+ * Generic switcher interface
+ */
+
+static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
+static int bL_switcher_cpu_pairing[NR_CPUS];
+
+/*
+ * bL_switch_to - Switch to a specific cluster for the current CPU
+ * @new_cluster_id: the ID of the cluster to switch to.
+ *
+ * This function must be called on the CPU to be switched.
+ * Returns 0 on success, else a negative status code.
+ */
+static int bL_switch_to(unsigned int new_cluster_id)
+{
+	unsigned int mpidr, this_cpu, that_cpu;
+	unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
+	struct completion inbound_alive;
+	struct tick_device *tdev;
+	enum clock_event_mode tdev_mode;
+	long volatile *handshake_ptr;
+	int ipi_nr, ret;
+
+	this_cpu = smp_processor_id();
+	ob_mpidr = read_mpidr();
+	ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
+	ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
+	BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
+
+	if (new_cluster_id == ob_cluster)
+		return 0;
+
+	that_cpu = bL_switcher_cpu_pairing[this_cpu];
+	ib_mpidr = cpu_logical_map(that_cpu);
+	ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+	ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+	pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
+		 this_cpu, ob_mpidr, ib_mpidr);
+
+	this_cpu = smp_processor_id();
+
+	/* Close the gate for our entry vectors */
+	mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
+	mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
+
+	/* Install our "inbound alive" notifier. */
+	init_completion(&inbound_alive);
+	ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
+	ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
+	mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
+
+	/*
+	 * Let's wake up the inbound CPU now in case it requires some delay
+	 * to come online, but leave it gated in our entry vector code.
+	 */
+	ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
+	if (ret) {
+		pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
+		return ret;
+	}
+
+	/*
+	 * Raise a SGI on the inbound CPU to make sure it doesn't stall
+	 * in a possible WFI, such as in bL_power_down().
+	 */
+	gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
+
+	/*
+	 * Wait for the inbound to come up.  This allows for other
+	 * tasks to be scheduled in the mean time.
+	 */
+	wait_for_completion(&inbound_alive);
+	mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
+
+	/*
+	 * From this point we are entering the switch critical zone
+	 * and can't take any interrupts anymore.
+	 */
+	local_irq_disable();
+	local_fiq_disable();
+	trace_cpu_migrate_begin(get_ns(), ob_mpidr);
+
+	/* redirect GIC's SGIs to our counterpart */
+	gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
+
+	tdev = tick_get_device(this_cpu);
+	if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
+		tdev = NULL;
+	if (tdev) {
+		tdev_mode = tdev->evtdev->mode;
+		clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+	}
+
+	ret = cpu_pm_enter();
+
+	/* we can not tolerate errors at this point */
+	if (ret)
+		panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
+
+	/* Swap the physical CPUs in the logical map for this logical CPU. */
+	cpu_logical_map(this_cpu) = ib_mpidr;
+	cpu_logical_map(that_cpu) = ob_mpidr;
+
+	/* Let's do the actual CPU switch. */
+	ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
+	if (ret > 0)
+		panic("%s: cpu_suspend() returned %d\n", __func__, ret);
+
+	/* We are executing on the inbound CPU at this point */
+	mpidr = read_mpidr();
+	pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
+	BUG_ON(mpidr != ib_mpidr);
+
+	mcpm_cpu_powered_up();
+
+	ret = cpu_pm_exit();
+
+	if (tdev) {
+		clockevents_set_mode(tdev->evtdev, tdev_mode);
+		clockevents_program_event(tdev->evtdev,
+					  tdev->evtdev->next_event, 1);
+	}
+
+	trace_cpu_migrate_finish(get_ns(), ib_mpidr);
+	local_fiq_enable();
+	local_irq_enable();
+
+	*handshake_ptr = 1;
+	dsb_sev();
+
+	if (ret)
+		pr_err("%s exiting with error %d\n", __func__, ret);
+	return ret;
+}
+
+struct bL_thread {
+	spinlock_t lock;
+	struct task_struct *task;
+	wait_queue_head_t wq;
+	int wanted_cluster;
+	struct completion started;
+	bL_switch_completion_handler completer;
+	void *completer_cookie;
+};
+
+static struct bL_thread bL_threads[NR_CPUS];
+
+static int bL_switcher_thread(void *arg)
+{
+	struct bL_thread *t = arg;
+	struct sched_param param = { .sched_priority = 1 };
+	int cluster;
+	bL_switch_completion_handler completer;
+	void *completer_cookie;
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+	complete(&t->started);
+
+	do {
+		if (signal_pending(current))
+			flush_signals(current);
+		wait_event_interruptible(t->wq,
+				t->wanted_cluster != -1 ||
+				kthread_should_stop());
+
+		spin_lock(&t->lock);
+		cluster = t->wanted_cluster;
+		completer = t->completer;
+		completer_cookie = t->completer_cookie;
+		t->wanted_cluster = -1;
+		t->completer = NULL;
+		spin_unlock(&t->lock);
+
+		if (cluster != -1) {
+			bL_switch_to(cluster);
+
+			if (completer)
+				completer(completer_cookie);
+		}
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
+{
+	struct task_struct *task;
+
+	task = kthread_create_on_node(bL_switcher_thread, arg,
+				      cpu_to_node(cpu), "kswitcher_%d", cpu);
+	if (!IS_ERR(task)) {
+		kthread_bind(task, cpu);
+		wake_up_process(task);
+	} else
+		pr_err("%s failed for CPU %d\n", __func__, cpu);
+	return task;
+}
+
+/*
+ * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
+ *      with completion notification via a callback
+ *
+ * @cpu: the CPU to switch
+ * @new_cluster_id: the ID of the cluster to switch to.
+ * @completer: switch completion callback.  if non-NULL,
+ *	@completer(@completer_cookie) will be called on completion of
+ *	the switch, in non-atomic context.
+ * @completer_cookie: opaque context argument for @completer.
+ *
+ * This function causes a cluster switch on the given CPU by waking up
+ * the appropriate switcher thread.  This function may or may not return
+ * before the switch has occurred.
+ *
+ * If a @completer callback function is supplied, it will be called when
+ * the switch is complete.  This can be used to determine asynchronously
+ * when the switch is complete, regardless of when bL_switch_request()
+ * returns.  When @completer is supplied, no new switch request is permitted
+ * for the affected CPU until after the switch is complete, and @completer
+ * has returned.
+ */
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+			 bL_switch_completion_handler completer,
+			 void *completer_cookie)
+{
+	struct bL_thread *t;
+
+	if (cpu >= ARRAY_SIZE(bL_threads)) {
+		pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+		return -EINVAL;
+	}
+
+	t = &bL_threads[cpu];
+
+	if (IS_ERR(t->task))
+		return PTR_ERR(t->task);
+	if (!t->task)
+		return -ESRCH;
+
+	spin_lock(&t->lock);
+	if (t->completer) {
+		spin_unlock(&t->lock);
+		return -EBUSY;
+	}
+	t->completer = completer;
+	t->completer_cookie = completer_cookie;
+	t->wanted_cluster = new_cluster_id;
+	spin_unlock(&t->lock);
+	wake_up(&t->wq);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bL_switch_request_cb);
+
+/*
+ * Activation and configuration code.
+ */
+
+static DEFINE_MUTEX(bL_switcher_activation_lock);
+static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
+static unsigned int bL_switcher_active;
+static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
+static cpumask_t bL_switcher_removed_logical_cpus;
+
+int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
+
+int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
+
+static int bL_activation_notify(unsigned long val)
+{
+	int ret;
+
+	ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
+	if (ret & NOTIFY_STOP_MASK)
+		pr_err("%s: notifier chain failed with status 0x%x\n",
+			__func__, ret);
+	return notifier_to_errno(ret);
+}
+
+static void bL_switcher_restore_cpus(void)
+{
+	int i;
+
+	for_each_cpu(i, &bL_switcher_removed_logical_cpus)
+		cpu_up(i);
+}
+
+static int bL_switcher_halve_cpus(void)
+{
+	int i, j, cluster_0, gic_id, ret;
+	unsigned int cpu, cluster, mask;
+	cpumask_t available_cpus;
+
+	/* First pass to validate what we have */
+	mask = 0;
+	for_each_online_cpu(i) {
+		cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+		if (cluster >= 2) {
+			pr_err("%s: only dual cluster systems are supported\n", __func__);
+			return -EINVAL;
+		}
+		if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
+			return -EINVAL;
+		mask |= (1 << cluster);
+	}
+	if (mask != 3) {
+		pr_err("%s: no CPU pairing possible\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * Now let's do the pairing.  We match each CPU with another CPU
+	 * from a different cluster.  To get a uniform scheduling behavior
+	 * without fiddling with CPU topology and compute capacity data,
+	 * we'll use logical CPUs initially belonging to the same cluster.
+	 */
+	memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
+	cpumask_copy(&available_cpus, cpu_online_mask);
+	cluster_0 = -1;
+	for_each_cpu(i, &available_cpus) {
+		int match = -1;
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+		if (cluster_0 == -1)
+			cluster_0 = cluster;
+		if (cluster != cluster_0)
+			continue;
+		cpumask_clear_cpu(i, &available_cpus);
+		for_each_cpu(j, &available_cpus) {
+			cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
+			/*
+			 * Let's remember the last match to create "odd"
+			 * pairings on purpose in order for other code not
+			 * to assume any relation between physical and
+			 * logical CPU numbers.
+			 */
+			if (cluster != cluster_0)
+				match = j;
+		}
+		if (match != -1) {
+			bL_switcher_cpu_pairing[i] = match;
+			cpumask_clear_cpu(match, &available_cpus);
+			pr_info("CPU%d paired with CPU%d\n", i, match);
+		}
+	}
+
+	/*
+	 * Now we disable the unwanted CPUs i.e. everything that has no
+	 * pairing information (that includes the pairing counterparts).
+	 */
+	cpumask_clear(&bL_switcher_removed_logical_cpus);
+	for_each_online_cpu(i) {
+		cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+
+		/* Let's take note of the GIC ID for this CPU */
+		gic_id = gic_get_cpu_id(i);
+		if (gic_id < 0) {
+			pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
+			bL_switcher_restore_cpus();
+			return -EINVAL;
+		}
+		bL_gic_id[cpu][cluster] = gic_id;
+		pr_info("GIC ID for CPU %u cluster %u is %u\n",
+			cpu, cluster, gic_id);
+
+		if (bL_switcher_cpu_pairing[i] != -1) {
+			bL_switcher_cpu_original_cluster[i] = cluster;
+			continue;
+		}
+
+		ret = cpu_down(i);
+		if (ret) {
+			bL_switcher_restore_cpus();
+			return ret;
+		}
+		cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
+	}
+
+	return 0;
+}
+
+/* Determine the logical CPU a given physical CPU is grouped on. */
+int bL_switcher_get_logical_index(u32 mpidr)
+{
+	int cpu;
+
+	if (!bL_switcher_active)
+		return -EUNATCH;
+
+	mpidr &= MPIDR_HWID_BITMASK;
+	for_each_online_cpu(cpu) {
+		int pairing = bL_switcher_cpu_pairing[cpu];
+		if (pairing == -1)
+			continue;
+		if ((mpidr == cpu_logical_map(cpu)) ||
+		    (mpidr == cpu_logical_map(pairing)))
+			return cpu;
+	}
+	return -EINVAL;
+}
+
+static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
+{
+	trace_cpu_migrate_current(get_ns(), read_mpidr());
+}
+
+int bL_switcher_trace_trigger(void)
+{
+	int ret;
+
+	preempt_disable();
+
+	bL_switcher_trace_trigger_cpu(NULL);
+	ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+
+	preempt_enable();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
+
+static int bL_switcher_enable(void)
+{
+	int cpu, ret;
+
+	mutex_lock(&bL_switcher_activation_lock);
+	lock_device_hotplug();
+	if (bL_switcher_active) {
+		unlock_device_hotplug();
+		mutex_unlock(&bL_switcher_activation_lock);
+		return 0;
+	}
+
+	pr_info("big.LITTLE switcher initializing\n");
+
+	ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
+	if (ret)
+		goto error;
+
+	ret = bL_switcher_halve_cpus();
+	if (ret)
+		goto error;
+
+	bL_switcher_trace_trigger();
+
+	for_each_online_cpu(cpu) {
+		struct bL_thread *t = &bL_threads[cpu];
+		spin_lock_init(&t->lock);
+		init_waitqueue_head(&t->wq);
+		init_completion(&t->started);
+		t->wanted_cluster = -1;
+		t->task = bL_switcher_thread_create(cpu, t);
+	}
+
+	bL_switcher_active = 1;
+	bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+	pr_info("big.LITTLE switcher initialized\n");
+	goto out;
+
+error:
+	pr_warn("big.LITTLE switcher initialization failed\n");
+	bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+	unlock_device_hotplug();
+	mutex_unlock(&bL_switcher_activation_lock);
+	return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+static void bL_switcher_disable(void)
+{
+	unsigned int cpu, cluster;
+	struct bL_thread *t;
+	struct task_struct *task;
+
+	mutex_lock(&bL_switcher_activation_lock);
+	lock_device_hotplug();
+
+	if (!bL_switcher_active)
+		goto out;
+
+	if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
+		bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+		goto out;
+	}
+
+	bL_switcher_active = 0;
+
+	/*
+	 * To deactivate the switcher, we must shut down the switcher
+	 * threads to prevent any other requests from being accepted.
+	 * Then, if the final cluster for given logical CPU is not the
+	 * same as the original one, we'll recreate a switcher thread
+	 * just for the purpose of switching the CPU back without any
+	 * possibility for interference from external requests.
+	 */
+	for_each_online_cpu(cpu) {
+		t = &bL_threads[cpu];
+		task = t->task;
+		t->task = NULL;
+		if (!task || IS_ERR(task))
+			continue;
+		kthread_stop(task);
+		/* no more switch may happen on this CPU at this point */
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+		if (cluster == bL_switcher_cpu_original_cluster[cpu])
+			continue;
+		init_completion(&t->started);
+		t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
+		task = bL_switcher_thread_create(cpu, t);
+		if (!IS_ERR(task)) {
+			wait_for_completion(&t->started);
+			kthread_stop(task);
+			cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+			if (cluster == bL_switcher_cpu_original_cluster[cpu])
+				continue;
+		}
+		/* If execution gets here, we're in trouble. */
+		pr_crit("%s: unable to restore original cluster for CPU %d\n",
+			__func__, cpu);
+		pr_crit("%s: CPU %d can't be restored\n",
+			__func__, bL_switcher_cpu_pairing[cpu]);
+		cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
+				  &bL_switcher_removed_logical_cpus);
+	}
+
+	bL_switcher_restore_cpus();
+	bL_switcher_trace_trigger();
+
+	bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+	unlock_device_hotplug();
+	mutex_unlock(&bL_switcher_activation_lock);
+}
+
+static ssize_t bL_switcher_active_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", bL_switcher_active);
+}
+
+static ssize_t bL_switcher_active_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int ret;
+
+	switch (buf[0]) {
+	case '0':
+		bL_switcher_disable();
+		ret = 0;
+		break;
+	case '1':
+		ret = bL_switcher_enable();
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return (ret >= 0) ? count : ret;
+}
+
+static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int ret = bL_switcher_trace_trigger();
+
+	return ret ? ret : count;
+}
+
+static struct kobj_attribute bL_switcher_active_attr =
+	__ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
+
+static struct kobj_attribute bL_switcher_trace_trigger_attr =
+	__ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
+
+static struct attribute *bL_switcher_attrs[] = {
+	&bL_switcher_active_attr.attr,
+	&bL_switcher_trace_trigger_attr.attr,
+	NULL,
+};
+
+static struct attribute_group bL_switcher_attr_group = {
+	.attrs = bL_switcher_attrs,
+};
+
+static struct kobject *bL_switcher_kobj;
+
+static int __init bL_switcher_sysfs_init(void)
+{
+	int ret;
+
+	bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
+	if (!bL_switcher_kobj)
+		return -ENOMEM;
+	ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
+	if (ret)
+		kobject_put(bL_switcher_kobj);
+	return ret;
+}
+
+#endif	/* CONFIG_SYSFS */
+
+bool bL_switcher_get_enabled(void)
+{
+	mutex_lock(&bL_switcher_activation_lock);
+
+	return bL_switcher_active;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
+
+void bL_switcher_put_enabled(void)
+{
+	mutex_unlock(&bL_switcher_activation_lock);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
+
+/*
+ * Veto any CPU hotplug operation on those CPUs we've removed
+ * while the switcher is active.
+ * We're just not ready to deal with that given the trickery involved.
+ */
+static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	if (bL_switcher_active) {
+		int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
+		switch (action & 0xf) {
+		case CPU_UP_PREPARE:
+		case CPU_DOWN_PREPARE:
+			if (pairing == -1)
+				return NOTIFY_BAD;
+		}
+	}
+	return NOTIFY_DONE;
+}
+
+static bool no_bL_switcher;
+core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
+
+static int __init bL_switcher_init(void)
+{
+	int ret;
+
+	if (MAX_NR_CLUSTERS != 2) {
+		pr_err("%s: only dual cluster systems are supported\n", __func__);
+		return -EINVAL;
+	}
+
+	cpu_notifier(bL_switcher_hotplug_callback, 0);
+
+	if (!no_bL_switcher) {
+		ret = bL_switcher_enable();
+		if (ret)
+			return ret;
+	}
+
+#ifdef CONFIG_SYSFS
+	ret = bL_switcher_sysfs_init();
+	if (ret)
+		pr_err("%s: unable to create sysfs entry\n", __func__);
+#endif
+
+	return 0;
+}
+
+late_initcall(bL_switcher_init);
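[Editor's note, not part of the patch: a minimal sketch of a hypothetical in-kernel caller of the bL_switch_request_cb() API added above. The function name and completer semantics are taken from the code; everything else (the example function, the chosen CPU/cluster numbers) is illustrative only.]

/* Hypothetical caller sketch: ask logical CPU 0 to switch to cluster 1
 * and block until the switch completes.  Per the kerneldoc above, the
 * completer runs in non-atomic context once the switch is done. */
#include <linux/completion.h>
#include <asm/bL_switcher.h>

static void example_switch_done(void *cookie)
{
	complete(cookie);
}

static int example_synchronous_switch(void)
{
	DECLARE_COMPLETION_ONSTACK(done);
	int ret;

	ret = bL_switch_request_cb(0, 1, example_switch_done, &done);
	if (ret)
		return ret;	/* e.g. -EBUSY if a request is already pending */
	wait_for_completion(&done);
	return 0;
}

[The sysfs group created above should also allow runtime control, e.g. writing '0' or '1' to /sys/kernel/bL_switcher/active to disable or enable the switcher.]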
diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c
new file mode 100644
index 000000000000..3f47f1203c6b
--- /dev/null
+++ b/arch/arm/common/bL_switcher_dummy_if.c
@@ -0,0 +1,71 @@
+/*
+ * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
+ *
+ * Created by:	Nicolas Pitre, November 2012
+ * Copyright:	(C) 2012-2013  Linaro Limited
+ *
+ * Dummy interface to user space for debugging purpose only.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <asm/bL_switcher.h>
+
+static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
+			size_t len, loff_t *pos)
+{
+	unsigned char val[3];
+	unsigned int cpu, cluster;
+	int ret;
+
+	pr_debug("%s\n", __func__);
+
+	if (len < 3)
+		return -EINVAL;
+
+	if (copy_from_user(val, buf, 3))
+		return -EFAULT;
+
+	/* format: <cpu#>,<cluster#> */
+	if (val[0] < '0' || val[0] > '9' ||
+	    val[1] != ',' ||
+	    val[2] < '0' || val[2] > '1')
+		return -EINVAL;
+
+	cpu = val[0] - '0';
+	cluster = val[2] - '0';
+	ret = bL_switch_request(cpu, cluster);
+
+	return ret ? : len;
+}
+
+static const struct file_operations bL_switcher_fops = {
+	.write		= bL_switcher_write,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice bL_switcher_device = {
+	MISC_DYNAMIC_MINOR,
+	"b.L_switcher",
+	&bL_switcher_fops
+};
+
+static int __init bL_switcher_dummy_if_init(void)
+{
+	return misc_register(&bL_switcher_device);
+}
+
+static void __exit bL_switcher_dummy_if_exit(void)
+{
+	misc_deregister(&bL_switcher_device);
+}
+
+module_init(bL_switcher_dummy_if_init);
+module_exit(bL_switcher_dummy_if_exit);
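[Editor's note, not part of the patch: the driver above parses writes of the form "<cpu#>,<cluster#>". As an illustrative user-space sketch, assuming udev creates /dev/b.L_switcher for the misc device name registered above:]

/* Request that CPU 0 be switched to cluster 1 via the dummy debug
 * interface.  Device path is an assumption based on the registered
 * misc device name "b.L_switcher". */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/b.L_switcher", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/b.L_switcher");
		return 1;
	}
	/* format checked by the driver: "<cpu#>,<cluster#>" */
	if (write(fd, "0,1", 3) != 3)
		perror("write");
	close(fd);
	return 0;
}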
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 6c03d0152e7f..26020a03f659 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 	sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
 
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+			 unsigned long poke_phys_addr, unsigned long poke_val)
+{
+	unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+	poke[0] = poke_phys_addr;
+	poke[1] = poke_val;
+	__cpuc_flush_dcache_area((void *)poke, 8);
+	outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
 static const struct mcpm_platform_ops *platform_ops;
 
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 39c96df3477a..e02db4b81a66 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -15,6 +15,7 @@
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
+#include <asm/assembler.h>
 
 #include "vlock.h"
 
@@ -47,6 +48,7 @@
 
 ENTRY(mcpm_entry_point)
 
+ ARM_BE8(setend	be)
 THUMB(	adr	r12, BSYM(1f)	)
 THUMB(	bx	r12		)
 THUMB(	.thumb			)
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
	 * position independent way.
	 */
 	adr	r5, 3f
-	ldmia	r5, {r6, r7, r8, r11}
+	ldmia	r5, {r0, r6, r7, r8, r11}
+	add	r0, r5, r0			@ r0 = mcpm_entry_early_pokes
 	add	r6, r5, r6			@ r6 = mcpm_entry_vectors
 	ldr	r7, [r5, r7]			@ r7 = mcpm_power_up_setup_phys
 	add	r8, r5, r8			@ r8 = mcpm_sync
 	add	r11, r5, r11			@ r11 = first_man_locks
 
+	@ Perform an early poke, if any
+	add	r0, r0, r4, lsl #3
+	ldmia	r0, {r0, r1}
+	teq	r0, #0
+	strne	r1, [r0]
+
 	mov	r0, #MCPM_SYNC_CLUSTER_SIZE
 	mla	r8, r0, r10, r8			@ r8 = sync cluster base
 
@@ -195,7 +204,8 @@ mcpm_entry_gated:
 
 	.align	2
 
-3:	.word	mcpm_entry_vectors - .
+3:	.word	mcpm_entry_early_pokes - .
+	.word	mcpm_entry_vectors - 3b
 	.word	mcpm_power_up_setup_phys - 3b
 	.word	mcpm_sync - 3b
 	.word	first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
 ENTRY(mcpm_entry_vectors)
 	.space	4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
 
+.type	mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+	.space	8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
 .type	mcpm_power_up_setup_phys, #object
 ENTRY(mcpm_power_up_setup_phys)
 	.space	4		@ set by mcpm_sync_init()
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644
index 000000000000..6231d36b3635
--- /dev/null
+++ b/arch/arm/crypto/.gitignore
@@ -0,0 +1 @@
+aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90..81cda39860c5 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -3,7 +3,17 @@
 #
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
-sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
+sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index 59f7877ead6a..3003fa1f6fb4 100644
--- a/arch/arm/crypto/aes_glue.c
+++ b/arch/arm/crypto/aes_glue.c
@@ -6,22 +6,12 @@
 #include <linux/crypto.h>
 #include <crypto/aes.h>
 
-#define AES_MAXNR 14
+#include "aes_glue.h"
 
-typedef struct {
-	unsigned int rd_key[4 *(AES_MAXNR + 1)];
-	int rounds;
-} AES_KEY;
-
-struct AES_CTX {
-	AES_KEY enc_key;
-	AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
 		.cipher	= {
 			.cia_min_keysize	= AES_MIN_KEY_SIZE,
 			.cia_max_keysize	= AES_MAX_KEY_SIZE,
-			.cia_setkey			= aes_set_key,
+			.cia_setkey		= aes_set_key,
 			.cia_encrypt		= aes_encrypt,
 			.cia_decrypt		= aes_decrypt
 		}
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644
index 000000000000..cca3e51eb606
--- /dev/null
+++ b/arch/arm/crypto/aes_glue.h
@@ -0,0 +1,19 @@
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+	unsigned int rd_key[4 * (AES_MAXNR + 1)];
+	int rounds;
+};
+
+struct AES_CTX {
+	struct AES_KEY enc_key;
+	struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+					   const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+					   const int bits, struct AES_KEY *key);
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped new file mode 100644 index 000000000000..64205d453260 --- /dev/null +++ b/arch/arm/crypto/aesbs-core.S_shipped | |||
@@ -0,0 +1,2544 @@ | |||
1 | |||
2 | @ ==================================================================== | ||
3 | @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
4 | @ project. The module is, however, dual licensed under OpenSSL and | ||
5 | @ CRYPTOGAMS licenses depending on where you obtain it. For further | ||
6 | @ details see http://www.openssl.org/~appro/cryptogams/. | ||
7 | @ | ||
8 | @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel | ||
9 | @ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is | ||
10 | @ granted. | ||
11 | @ ==================================================================== | ||
12 | |||
13 | @ Bit-sliced AES for ARM NEON | ||
14 | @ | ||
15 | @ February 2012. | ||
16 | @ | ||
17 | @ This implementation is direct adaptation of bsaes-x86_64 module for | ||
18 | @ ARM NEON. Except that this module is endian-neutral [in sense that | ||
19 | @ it can be compiled for either endianness] by courtesy of vld1.8's | ||
20 | @ neutrality. Initial version doesn't implement interface to OpenSSL, | ||
21 | @ only low-level primitives and unsupported entry points, just enough | ||
22 | @ to collect performance results, which for Cortex-A8 core are: | ||
23 | @ | ||
24 | @ encrypt 19.5 cycles per byte processed with 128-bit key | ||
25 | @ decrypt 22.1 cycles per byte processed with 128-bit key | ||
26 | @ key conv. 440 cycles per 128-bit key/0.18 of 8x block | ||
27 | @ | ||
28 | @ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, | ||
29 | @ which is [much] worse than anticipated (for further details see | ||
30 | @ http://www.openssl.org/~appro/Snapdragon-S4.html). | ||
31 | @ | ||
32 | @ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code | ||
33 | @ manages in 20.0 cycles]. | ||
34 | @ | ||
35 | @ When comparing to x86_64 results keep in mind that NEON unit is | ||
36 | @ [mostly] single-issue and thus can't [fully] benefit from | ||
37 | @ instruction-level parallelism. And when comparing to aes-armv4 | ||
38 | @ results keep in mind key schedule conversion overhead (see | ||
39 | @ bsaes-x86_64.pl for further details)... | ||
40 | @ | ||
41 | @ <appro@openssl.org> | ||
42 | |||
43 | @ April-August 2013 | ||
44 | @ | ||
45 | @ Add CBC, CTR and XTS subroutines, adapt for kernel use. | ||
46 | @ | ||
47 | @ <ard.biesheuvel@linaro.org> | ||
48 | |||
49 | #ifndef __KERNEL__ | ||
50 | # include "arm_arch.h" | ||
51 | |||
52 | # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} | ||
53 | # define VFP_ABI_POP vldmia sp!,{d8-d15} | ||
54 | # define VFP_ABI_FRAME 0x40 | ||
55 | #else | ||
56 | # define VFP_ABI_PUSH | ||
57 | # define VFP_ABI_POP | ||
58 | # define VFP_ABI_FRAME 0 | ||
59 | # define BSAES_ASM_EXTENDED_KEY | ||
60 | # define XTS_CHAIN_TWEAK | ||
61 | # define __ARM_ARCH__ __LINUX_ARM_ARCH__ | ||
62 | #endif | ||
63 | |||
64 | #ifdef __thumb__ | ||
65 | # define adrl adr | ||
66 | #endif | ||
67 | |||
68 | #if __ARM_ARCH__>=7 | ||
69 | .text | ||
70 | .syntax unified @ ARMv7-capable assembler is expected to handle this | ||
71 | #ifdef __thumb2__ | ||
72 | .thumb | ||
73 | #else | ||
74 | .code 32 | ||
75 | #endif | ||
76 | |||
77 | .fpu neon | ||
78 | |||
79 | .type _bsaes_decrypt8,%function | ||
80 | .align 4 | ||
81 | _bsaes_decrypt8: | ||
82 | adr r6,_bsaes_decrypt8 | ||
83 | vldmia r4!, {q9} @ round 0 key | ||
84 | add r6,r6,#.LM0ISR-_bsaes_decrypt8 | ||
85 | |||
86 | vldmia r6!, {q8} @ .LM0ISR | ||
87 | veor q10, q0, q9 @ xor with round0 key | ||
88 | veor q11, q1, q9 | ||
89 | vtbl.8 d0, {q10}, d16 | ||
90 | vtbl.8 d1, {q10}, d17 | ||
91 | veor q12, q2, q9 | ||
92 | vtbl.8 d2, {q11}, d16 | ||
93 | vtbl.8 d3, {q11}, d17 | ||
94 | veor q13, q3, q9 | ||
95 | vtbl.8 d4, {q12}, d16 | ||
96 | vtbl.8 d5, {q12}, d17 | ||
97 | veor q14, q4, q9 | ||
98 | vtbl.8 d6, {q13}, d16 | ||
99 | vtbl.8 d7, {q13}, d17 | ||
100 | veor q15, q5, q9 | ||
101 | vtbl.8 d8, {q14}, d16 | ||
102 | vtbl.8 d9, {q14}, d17 | ||
103 | veor q10, q6, q9 | ||
104 | vtbl.8 d10, {q15}, d16 | ||
105 | vtbl.8 d11, {q15}, d17 | ||
106 | veor q11, q7, q9 | ||
107 | vtbl.8 d12, {q10}, d16 | ||
108 | vtbl.8 d13, {q10}, d17 | ||
109 | vtbl.8 d14, {q11}, d16 | ||
110 | vtbl.8 d15, {q11}, d17 | ||
111 | vmov.i8 q8,#0x55 @ compose .LBS0 | ||
112 | vmov.i8 q9,#0x33 @ compose .LBS1 | ||
113 | vshr.u64 q10, q6, #1 | ||
114 | vshr.u64 q11, q4, #1 | ||
115 | veor q10, q10, q7 | ||
116 | veor q11, q11, q5 | ||
117 | vand q10, q10, q8 | ||
118 | vand q11, q11, q8 | ||
119 | veor q7, q7, q10 | ||
120 | vshl.u64 q10, q10, #1 | ||
121 | veor q5, q5, q11 | ||
122 | vshl.u64 q11, q11, #1 | ||
123 | veor q6, q6, q10 | ||
124 | veor q4, q4, q11 | ||
125 | vshr.u64 q10, q2, #1 | ||
126 | vshr.u64 q11, q0, #1 | ||
127 | veor q10, q10, q3 | ||
128 | veor q11, q11, q1 | ||
129 | vand q10, q10, q8 | ||
130 | vand q11, q11, q8 | ||
131 | veor q3, q3, q10 | ||
132 | vshl.u64 q10, q10, #1 | ||
133 | veor q1, q1, q11 | ||
134 | vshl.u64 q11, q11, #1 | ||
135 | veor q2, q2, q10 | ||
136 | veor q0, q0, q11 | ||
137 | vmov.i8 q8,#0x0f @ compose .LBS2 | ||
138 | vshr.u64 q10, q5, #2 | ||
139 | vshr.u64 q11, q4, #2 | ||
140 | veor q10, q10, q7 | ||
141 | veor q11, q11, q6 | ||
142 | vand q10, q10, q9 | ||
143 | vand q11, q11, q9 | ||
144 | veor q7, q7, q10 | ||
145 | vshl.u64 q10, q10, #2 | ||
146 | veor q6, q6, q11 | ||
147 | vshl.u64 q11, q11, #2 | ||
148 | veor q5, q5, q10 | ||
149 | veor q4, q4, q11 | ||
150 | vshr.u64 q10, q1, #2 | ||
151 | vshr.u64 q11, q0, #2 | ||
152 | veor q10, q10, q3 | ||
153 | veor q11, q11, q2 | ||
154 | vand q10, q10, q9 | ||
155 | vand q11, q11, q9 | ||
156 | veor q3, q3, q10 | ||
157 | vshl.u64 q10, q10, #2 | ||
158 | veor q2, q2, q11 | ||
159 | vshl.u64 q11, q11, #2 | ||
160 | veor q1, q1, q10 | ||
161 | veor q0, q0, q11 | ||
162 | vshr.u64 q10, q3, #4 | ||
163 | vshr.u64 q11, q2, #4 | ||
164 | veor q10, q10, q7 | ||
165 | veor q11, q11, q6 | ||
166 | vand q10, q10, q8 | ||
167 | vand q11, q11, q8 | ||
168 | veor q7, q7, q10 | ||
169 | vshl.u64 q10, q10, #4 | ||
170 | veor q6, q6, q11 | ||
171 | vshl.u64 q11, q11, #4 | ||
172 | veor q3, q3, q10 | ||
173 | veor q2, q2, q11 | ||
174 | vshr.u64 q10, q1, #4 | ||
175 | vshr.u64 q11, q0, #4 | ||
176 | veor q10, q10, q5 | ||
177 | veor q11, q11, q4 | ||
178 | vand q10, q10, q8 | ||
179 | vand q11, q11, q8 | ||
180 | veor q5, q5, q10 | ||
181 | vshl.u64 q10, q10, #4 | ||
182 | veor q4, q4, q11 | ||
183 | vshl.u64 q11, q11, #4 | ||
184 | veor q1, q1, q10 | ||
185 | veor q0, q0, q11 | ||
186 | sub r5,r5,#1 | ||
187 | b .Ldec_sbox | ||
188 | .align 4 | ||
189 | .Ldec_loop: | ||
190 | vldmia r4!, {q8-q11} | ||
191 | veor q8, q8, q0 | ||
192 | veor q9, q9, q1 | ||
193 | vtbl.8 d0, {q8}, d24 | ||
194 | vtbl.8 d1, {q8}, d25 | ||
195 | vldmia r4!, {q8} | ||
196 | veor q10, q10, q2 | ||
197 | vtbl.8 d2, {q9}, d24 | ||
198 | vtbl.8 d3, {q9}, d25 | ||
199 | vldmia r4!, {q9} | ||
200 | veor q11, q11, q3 | ||
201 | vtbl.8 d4, {q10}, d24 | ||
202 | vtbl.8 d5, {q10}, d25 | ||
203 | vldmia r4!, {q10} | ||
204 | vtbl.8 d6, {q11}, d24 | ||
205 | vtbl.8 d7, {q11}, d25 | ||
206 | vldmia r4!, {q11} | ||
207 | veor q8, q8, q4 | ||
208 | veor q9, q9, q5 | ||
209 | vtbl.8 d8, {q8}, d24 | ||
210 | vtbl.8 d9, {q8}, d25 | ||
211 | veor q10, q10, q6 | ||
212 | vtbl.8 d10, {q9}, d24 | ||
213 | vtbl.8 d11, {q9}, d25 | ||
214 | veor q11, q11, q7 | ||
215 | vtbl.8 d12, {q10}, d24 | ||
216 | vtbl.8 d13, {q10}, d25 | ||
217 | vtbl.8 d14, {q11}, d24 | ||
218 | vtbl.8 d15, {q11}, d25 | ||
219 | .Ldec_sbox: | ||
220 | veor q1, q1, q4 | ||
221 | veor q3, q3, q4 | ||
222 | |||
223 | veor q4, q4, q7 | ||
224 | veor q1, q1, q6 | ||
225 | veor q2, q2, q7 | ||
226 | veor q6, q6, q4 | ||
227 | |||
228 | veor q0, q0, q1 | ||
229 | veor q2, q2, q5 | ||
230 | veor q7, q7, q6 | ||
231 | veor q3, q3, q0 | ||
232 | veor q5, q5, q0 | ||
233 | veor q1, q1, q3 | ||
234 | veor q11, q3, q0 | ||
235 | veor q10, q7, q4 | ||
236 | veor q9, q1, q6 | ||
237 | veor q13, q4, q0 | ||
238 | vmov q8, q10 | ||
239 | veor q12, q5, q2 | ||
240 | |||
241 | vorr q10, q10, q9 | ||
242 | veor q15, q11, q8 | ||
243 | vand q14, q11, q12 | ||
244 | vorr q11, q11, q12 | ||
245 | veor q12, q12, q9 | ||
246 | vand q8, q8, q9 | ||
247 | veor q9, q6, q2 | ||
248 | vand q15, q15, q12 | ||
249 | vand q13, q13, q9 | ||
250 | veor q9, q3, q7 | ||
251 | veor q12, q1, q5 | ||
252 | veor q11, q11, q13 | ||
253 | veor q10, q10, q13 | ||
254 | vand q13, q9, q12 | ||
255 | vorr q9, q9, q12 | ||
256 | veor q11, q11, q15 | ||
257 | veor q8, q8, q13 | ||
258 | veor q10, q10, q14 | ||
259 | veor q9, q9, q15 | ||
260 | veor q8, q8, q14 | ||
261 | vand q12, q4, q6 | ||
262 | veor q9, q9, q14 | ||
263 | vand q13, q0, q2 | ||
264 | vand q14, q7, q1 | ||
265 | vorr q15, q3, q5 | ||
266 | veor q11, q11, q12 | ||
267 | veor q9, q9, q14 | ||
268 | veor q8, q8, q15 | ||
269 | veor q10, q10, q13 | ||
270 | |||
271 | @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 | ||
272 | |||
273 | @ new smaller inversion | ||
274 | |||
275 | vand q14, q11, q9 | ||
276 | vmov q12, q8 | ||
277 | |||
278 | veor q13, q10, q14 | ||
279 | veor q15, q8, q14 | ||
280 | veor q14, q8, q14 @ q14=q15 | ||
281 | |||
282 | vbsl q13, q9, q8 | ||
283 | vbsl q15, q11, q10 | ||
284 | veor q11, q11, q10 | ||
285 | |||
286 | vbsl q12, q13, q14 | ||
287 | vbsl q8, q14, q13 | ||
288 | |||
289 | vand q14, q12, q15 | ||
290 | veor q9, q9, q8 | ||
291 | |||
292 | veor q14, q14, q11 | ||
293 | veor q12, q5, q2 | ||
294 | veor q8, q1, q6 | ||
295 | veor q10, q15, q14 | ||
296 | vand q10, q10, q5 | ||
297 | veor q5, q5, q1 | ||
298 | vand q11, q1, q15 | ||
299 | vand q5, q5, q14 | ||
300 | veor q1, q11, q10 | ||
301 | veor q5, q5, q11 | ||
302 | veor q15, q15, q13 | ||
303 | veor q14, q14, q9 | ||
304 | veor q11, q15, q14 | ||
305 | veor q10, q13, q9 | ||
306 | vand q11, q11, q12 | ||
307 | vand q10, q10, q2 | ||
308 | veor q12, q12, q8 | ||
309 | veor q2, q2, q6 | ||
310 | vand q8, q8, q15 | ||
311 | vand q6, q6, q13 | ||
312 | vand q12, q12, q14 | ||
313 | vand q2, q2, q9 | ||
314 | veor q8, q8, q12 | ||
315 | veor q2, q2, q6 | ||
316 | veor q12, q12, q11 | ||
317 | veor q6, q6, q10 | ||
318 | veor q5, q5, q12 | ||
319 | veor q2, q2, q12 | ||
320 | veor q1, q1, q8 | ||
321 | veor q6, q6, q8 | ||
322 | |||
323 | veor q12, q3, q0 | ||
324 | veor q8, q7, q4 | ||
325 | veor q11, q15, q14 | ||
326 | veor q10, q13, q9 | ||
327 | vand q11, q11, q12 | ||
328 | vand q10, q10, q0 | ||
329 | veor q12, q12, q8 | ||
330 | veor q0, q0, q4 | ||
331 | vand q8, q8, q15 | ||
332 | vand q4, q4, q13 | ||
333 | vand q12, q12, q14 | ||
334 | vand q0, q0, q9 | ||
335 | veor q8, q8, q12 | ||
336 | veor q0, q0, q4 | ||
337 | veor q12, q12, q11 | ||
338 | veor q4, q4, q10 | ||
339 | veor q15, q15, q13 | ||
340 | veor q14, q14, q9 | ||
341 | veor q10, q15, q14 | ||
342 | vand q10, q10, q3 | ||
343 | veor q3, q3, q7 | ||
344 | vand q11, q7, q15 | ||
345 | vand q3, q3, q14 | ||
346 | veor q7, q11, q10 | ||
347 | veor q3, q3, q11 | ||
348 | veor q3, q3, q12 | ||
349 | veor q0, q0, q12 | ||
350 | veor q7, q7, q8 | ||
351 | veor q4, q4, q8 | ||
352 | veor q1, q1, q7 | ||
353 | veor q6, q6, q5 | ||
354 | |||
355 | veor q4, q4, q1 | ||
356 | veor q2, q2, q7 | ||
357 | veor q5, q5, q7 | ||
358 | veor q4, q4, q2 | ||
359 | veor q7, q7, q0 | ||
360 | veor q4, q4, q5 | ||
361 | veor q3, q3, q6 | ||
362 | veor q6, q6, q1 | ||
363 | veor q3, q3, q4 | ||
364 | |||
365 | veor q4, q4, q0 | ||
366 | veor q7, q7, q3 | ||
367 | subs r5,r5,#1 | ||
368 | bcc .Ldec_done | ||
369 | @ multiplication by 0x05-0x00-0x04-0x00 | ||
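| @ (pre-multiplying the columns by 05-00-04-00 reduces InvMixColumns | ||
| @ to the plain MixColumns rotation pattern used further down) | ||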
370 | vext.8 q8, q0, q0, #8 | ||
371 | vext.8 q14, q3, q3, #8 | ||
372 | vext.8 q15, q5, q5, #8 | ||
373 | veor q8, q8, q0 | ||
374 | vext.8 q9, q1, q1, #8 | ||
375 | veor q14, q14, q3 | ||
376 | vext.8 q10, q6, q6, #8 | ||
377 | veor q15, q15, q5 | ||
378 | vext.8 q11, q4, q4, #8 | ||
379 | veor q9, q9, q1 | ||
380 | vext.8 q12, q2, q2, #8 | ||
381 | veor q10, q10, q6 | ||
382 | vext.8 q13, q7, q7, #8 | ||
383 | veor q11, q11, q4 | ||
384 | veor q12, q12, q2 | ||
385 | veor q13, q13, q7 | ||
386 | |||
387 | veor q0, q0, q14 | ||
388 | veor q1, q1, q14 | ||
389 | veor q6, q6, q8 | ||
390 | veor q2, q2, q10 | ||
391 | veor q4, q4, q9 | ||
392 | veor q1, q1, q15 | ||
393 | veor q6, q6, q15 | ||
394 | veor q2, q2, q14 | ||
395 | veor q7, q7, q11 | ||
396 | veor q4, q4, q14 | ||
397 | veor q3, q3, q12 | ||
398 | veor q2, q2, q15 | ||
399 | veor q7, q7, q15 | ||
400 | veor q5, q5, q13 | ||
401 | vext.8 q8, q0, q0, #12 @ x0 <<< 32 | ||
402 | vext.8 q9, q1, q1, #12 | ||
403 | veor q0, q0, q8 @ x0 ^ (x0 <<< 32) | ||
404 | vext.8 q10, q6, q6, #12 | ||
405 | veor q1, q1, q9 | ||
406 | vext.8 q11, q4, q4, #12 | ||
407 | veor q6, q6, q10 | ||
408 | vext.8 q12, q2, q2, #12 | ||
409 | veor q4, q4, q11 | ||
410 | vext.8 q13, q7, q7, #12 | ||
411 | veor q2, q2, q12 | ||
412 | vext.8 q14, q3, q3, #12 | ||
413 | veor q7, q7, q13 | ||
414 | vext.8 q15, q5, q5, #12 | ||
415 | veor q3, q3, q14 | ||
416 | |||
417 | veor q9, q9, q0 | ||
418 | veor q5, q5, q15 | ||
419 | vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64 | ||
420 | veor q10, q10, q1 | ||
421 | veor q8, q8, q5 | ||
422 | veor q9, q9, q5 | ||
423 | vext.8 q1, q1, q1, #8 | ||
424 | veor q13, q13, q2 | ||
425 | veor q0, q0, q8 | ||
426 | veor q14, q14, q7 | ||
427 | veor q1, q1, q9 | ||
428 | vext.8 q8, q2, q2, #8 | ||
429 | veor q12, q12, q4 | ||
430 | vext.8 q9, q7, q7, #8 | ||
431 | veor q15, q15, q3 | ||
432 | vext.8 q2, q4, q4, #8 | ||
433 | veor q11, q11, q6 | ||
434 | vext.8 q7, q5, q5, #8 | ||
435 | veor q12, q12, q5 | ||
436 | vext.8 q4, q3, q3, #8 | ||
437 | veor q11, q11, q5 | ||
438 | vext.8 q3, q6, q6, #8 | ||
439 | veor q5, q9, q13 | ||
440 | veor q11, q11, q2 | ||
441 | veor q7, q7, q15 | ||
442 | veor q6, q4, q14 | ||
443 | veor q4, q8, q12 | ||
444 | veor q2, q3, q10 | ||
445 | vmov q3, q11 | ||
446 | @ vmov q5, q9 | ||
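| @ intermediate rounds permute with .LISR; the final iteration (eq) | ||
| @ steps r6 to .LISRM0 to fold in the closing InvShiftRows ordering | ||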
447 | vldmia r6, {q12} @ .LISR | ||
448 | ite eq @ Thumb2 thing, sanity check in ARM | ||
449 | addeq r6,r6,#0x10 | ||
450 | bne .Ldec_loop | ||
451 | vldmia r6, {q12} @ .LISRM0 | ||
452 | b .Ldec_loop | ||
453 | .align 4 | ||
454 | .Ldec_done: | ||
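| @ un-bitslice the state: swapmove passes under masks 0x55, 0x33 and | ||
| @ 0x0f exchange bit groups between register pairs, undoing the | ||
| @ transpose performed on entry | ||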
455 | vmov.i8 q8,#0x55 @ compose .LBS0 | ||
456 | vmov.i8 q9,#0x33 @ compose .LBS1 | ||
457 | vshr.u64 q10, q3, #1 | ||
458 | vshr.u64 q11, q2, #1 | ||
459 | veor q10, q10, q5 | ||
460 | veor q11, q11, q7 | ||
461 | vand q10, q10, q8 | ||
462 | vand q11, q11, q8 | ||
463 | veor q5, q5, q10 | ||
464 | vshl.u64 q10, q10, #1 | ||
465 | veor q7, q7, q11 | ||
466 | vshl.u64 q11, q11, #1 | ||
467 | veor q3, q3, q10 | ||
468 | veor q2, q2, q11 | ||
469 | vshr.u64 q10, q6, #1 | ||
470 | vshr.u64 q11, q0, #1 | ||
471 | veor q10, q10, q4 | ||
472 | veor q11, q11, q1 | ||
473 | vand q10, q10, q8 | ||
474 | vand q11, q11, q8 | ||
475 | veor q4, q4, q10 | ||
476 | vshl.u64 q10, q10, #1 | ||
477 | veor q1, q1, q11 | ||
478 | vshl.u64 q11, q11, #1 | ||
479 | veor q6, q6, q10 | ||
480 | veor q0, q0, q11 | ||
481 | vmov.i8 q8,#0x0f @ compose .LBS2 | ||
482 | vshr.u64 q10, q7, #2 | ||
483 | vshr.u64 q11, q2, #2 | ||
484 | veor q10, q10, q5 | ||
485 | veor q11, q11, q3 | ||
486 | vand q10, q10, q9 | ||
487 | vand q11, q11, q9 | ||
488 | veor q5, q5, q10 | ||
489 | vshl.u64 q10, q10, #2 | ||
490 | veor q3, q3, q11 | ||
491 | vshl.u64 q11, q11, #2 | ||
492 | veor q7, q7, q10 | ||
493 | veor q2, q2, q11 | ||
494 | vshr.u64 q10, q1, #2 | ||
495 | vshr.u64 q11, q0, #2 | ||
496 | veor q10, q10, q4 | ||
497 | veor q11, q11, q6 | ||
498 | vand q10, q10, q9 | ||
499 | vand q11, q11, q9 | ||
500 | veor q4, q4, q10 | ||
501 | vshl.u64 q10, q10, #2 | ||
502 | veor q6, q6, q11 | ||
503 | vshl.u64 q11, q11, #2 | ||
504 | veor q1, q1, q10 | ||
505 | veor q0, q0, q11 | ||
506 | vshr.u64 q10, q4, #4 | ||
507 | vshr.u64 q11, q6, #4 | ||
508 | veor q10, q10, q5 | ||
509 | veor q11, q11, q3 | ||
510 | vand q10, q10, q8 | ||
511 | vand q11, q11, q8 | ||
512 | veor q5, q5, q10 | ||
513 | vshl.u64 q10, q10, #4 | ||
514 | veor q3, q3, q11 | ||
515 | vshl.u64 q11, q11, #4 | ||
516 | veor q4, q4, q10 | ||
517 | veor q6, q6, q11 | ||
518 | vshr.u64 q10, q1, #4 | ||
519 | vshr.u64 q11, q0, #4 | ||
520 | veor q10, q10, q7 | ||
521 | veor q11, q11, q2 | ||
522 | vand q10, q10, q8 | ||
523 | vand q11, q11, q8 | ||
524 | veor q7, q7, q10 | ||
525 | vshl.u64 q10, q10, #4 | ||
526 | veor q2, q2, q11 | ||
527 | vshl.u64 q11, q11, #4 | ||
528 | veor q1, q1, q10 | ||
529 | veor q0, q0, q11 | ||
530 | vldmia r4, {q8} @ last round key | ||
531 | veor q6, q6, q8 | ||
532 | veor q4, q4, q8 | ||
533 | veor q2, q2, q8 | ||
534 | veor q7, q7, q8 | ||
535 | veor q3, q3, q8 | ||
536 | veor q5, q5, q8 | ||
537 | veor q0, q0, q8 | ||
538 | veor q1, q1, q8 | ||
539 | bx lr | ||
540 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
541 | |||
542 | .type _bsaes_const,%object | ||
543 | .align 6 | ||
544 | _bsaes_const: | ||
545 | .LM0ISR: @ InvShiftRows constants | ||
546 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
547 | .LISR: | ||
548 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
549 | .LISRM0: | ||
550 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
551 | .LM0SR: @ ShiftRows constants | ||
552 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
553 | .LSR: | ||
554 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
555 | .LSRM0: | ||
556 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
557 | .LM0: | ||
558 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
559 | .LREVM0SR: | ||
560 | .quad 0x090d01050c000408, 0x03070b0f060a0e02 | ||
561 | .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>" | ||
562 | .align 6 | ||
563 | .size _bsaes_const,.-_bsaes_const | ||
564 | |||
565 | .type _bsaes_encrypt8,%function | ||
566 | .align 4 | ||
567 | _bsaes_encrypt8: | ||
568 | adr r6,_bsaes_encrypt8 | ||
569 | vldmia r4!, {q9} @ round 0 key | ||
570 | sub r6,r6,#_bsaes_encrypt8-.LM0SR | ||
571 | |||
572 | vldmia r6!, {q8} @ .LM0SR | ||
573 | _bsaes_encrypt8_alt: | ||
574 | veor q10, q0, q9 @ xor with round0 key | ||
575 | veor q11, q1, q9 | ||
576 | vtbl.8 d0, {q10}, d16 | ||
577 | vtbl.8 d1, {q10}, d17 | ||
578 | veor q12, q2, q9 | ||
579 | vtbl.8 d2, {q11}, d16 | ||
580 | vtbl.8 d3, {q11}, d17 | ||
581 | veor q13, q3, q9 | ||
582 | vtbl.8 d4, {q12}, d16 | ||
583 | vtbl.8 d5, {q12}, d17 | ||
584 | veor q14, q4, q9 | ||
585 | vtbl.8 d6, {q13}, d16 | ||
586 | vtbl.8 d7, {q13}, d17 | ||
587 | veor q15, q5, q9 | ||
588 | vtbl.8 d8, {q14}, d16 | ||
589 | vtbl.8 d9, {q14}, d17 | ||
590 | veor q10, q6, q9 | ||
591 | vtbl.8 d10, {q15}, d16 | ||
592 | vtbl.8 d11, {q15}, d17 | ||
593 | veor q11, q7, q9 | ||
594 | vtbl.8 d12, {q10}, d16 | ||
595 | vtbl.8 d13, {q10}, d17 | ||
596 | vtbl.8 d14, {q11}, d16 | ||
597 | vtbl.8 d15, {q11}, d17 | ||
598 | _bsaes_encrypt8_bitslice: | ||
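| @ bit-slice the eight input blocks: after the swapmove passes below, | ||
| @ register i effectively holds bit i of every state byte | ||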
599 | vmov.i8 q8,#0x55 @ compose .LBS0 | ||
600 | vmov.i8 q9,#0x33 @ compose .LBS1 | ||
601 | vshr.u64 q10, q6, #1 | ||
602 | vshr.u64 q11, q4, #1 | ||
603 | veor q10, q10, q7 | ||
604 | veor q11, q11, q5 | ||
605 | vand q10, q10, q8 | ||
606 | vand q11, q11, q8 | ||
607 | veor q7, q7, q10 | ||
608 | vshl.u64 q10, q10, #1 | ||
609 | veor q5, q5, q11 | ||
610 | vshl.u64 q11, q11, #1 | ||
611 | veor q6, q6, q10 | ||
612 | veor q4, q4, q11 | ||
613 | vshr.u64 q10, q2, #1 | ||
614 | vshr.u64 q11, q0, #1 | ||
615 | veor q10, q10, q3 | ||
616 | veor q11, q11, q1 | ||
617 | vand q10, q10, q8 | ||
618 | vand q11, q11, q8 | ||
619 | veor q3, q3, q10 | ||
620 | vshl.u64 q10, q10, #1 | ||
621 | veor q1, q1, q11 | ||
622 | vshl.u64 q11, q11, #1 | ||
623 | veor q2, q2, q10 | ||
624 | veor q0, q0, q11 | ||
625 | vmov.i8 q8,#0x0f @ compose .LBS2 | ||
626 | vshr.u64 q10, q5, #2 | ||
627 | vshr.u64 q11, q4, #2 | ||
628 | veor q10, q10, q7 | ||
629 | veor q11, q11, q6 | ||
630 | vand q10, q10, q9 | ||
631 | vand q11, q11, q9 | ||
632 | veor q7, q7, q10 | ||
633 | vshl.u64 q10, q10, #2 | ||
634 | veor q6, q6, q11 | ||
635 | vshl.u64 q11, q11, #2 | ||
636 | veor q5, q5, q10 | ||
637 | veor q4, q4, q11 | ||
638 | vshr.u64 q10, q1, #2 | ||
639 | vshr.u64 q11, q0, #2 | ||
640 | veor q10, q10, q3 | ||
641 | veor q11, q11, q2 | ||
642 | vand q10, q10, q9 | ||
643 | vand q11, q11, q9 | ||
644 | veor q3, q3, q10 | ||
645 | vshl.u64 q10, q10, #2 | ||
646 | veor q2, q2, q11 | ||
647 | vshl.u64 q11, q11, #2 | ||
648 | veor q1, q1, q10 | ||
649 | veor q0, q0, q11 | ||
650 | vshr.u64 q10, q3, #4 | ||
651 | vshr.u64 q11, q2, #4 | ||
652 | veor q10, q10, q7 | ||
653 | veor q11, q11, q6 | ||
654 | vand q10, q10, q8 | ||
655 | vand q11, q11, q8 | ||
656 | veor q7, q7, q10 | ||
657 | vshl.u64 q10, q10, #4 | ||
658 | veor q6, q6, q11 | ||
659 | vshl.u64 q11, q11, #4 | ||
660 | veor q3, q3, q10 | ||
661 | veor q2, q2, q11 | ||
662 | vshr.u64 q10, q1, #4 | ||
663 | vshr.u64 q11, q0, #4 | ||
664 | veor q10, q10, q5 | ||
665 | veor q11, q11, q4 | ||
666 | vand q10, q10, q8 | ||
667 | vand q11, q11, q8 | ||
668 | veor q5, q5, q10 | ||
669 | vshl.u64 q10, q10, #4 | ||
670 | veor q4, q4, q11 | ||
671 | vshl.u64 q11, q11, #4 | ||
672 | veor q1, q1, q10 | ||
673 | veor q0, q0, q11 | ||
674 | sub r5,r5,#1 | ||
675 | b .Lenc_sbox | ||
676 | .align 4 | ||
677 | .Lenc_loop: | ||
678 | vldmia r4!, {q8-q11} | ||
679 | veor q8, q8, q0 | ||
680 | veor q9, q9, q1 | ||
681 | vtbl.8 d0, {q8}, d24 | ||
682 | vtbl.8 d1, {q8}, d25 | ||
683 | vldmia r4!, {q8} | ||
684 | veor q10, q10, q2 | ||
685 | vtbl.8 d2, {q9}, d24 | ||
686 | vtbl.8 d3, {q9}, d25 | ||
687 | vldmia r4!, {q9} | ||
688 | veor q11, q11, q3 | ||
689 | vtbl.8 d4, {q10}, d24 | ||
690 | vtbl.8 d5, {q10}, d25 | ||
691 | vldmia r4!, {q10} | ||
692 | vtbl.8 d6, {q11}, d24 | ||
693 | vtbl.8 d7, {q11}, d25 | ||
694 | vldmia r4!, {q11} | ||
695 | veor q8, q8, q4 | ||
696 | veor q9, q9, q5 | ||
697 | vtbl.8 d8, {q8}, d24 | ||
698 | vtbl.8 d9, {q8}, d25 | ||
699 | veor q10, q10, q6 | ||
700 | vtbl.8 d10, {q9}, d24 | ||
701 | vtbl.8 d11, {q9}, d25 | ||
702 | veor q11, q11, q7 | ||
703 | vtbl.8 d12, {q10}, d24 | ||
704 | vtbl.8 d13, {q10}, d25 | ||
705 | vtbl.8 d14, {q11}, d24 | ||
706 | vtbl.8 d15, {q11}, d25 | ||
707 | .Lenc_sbox: | ||
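| @ the forward S-box as a gate network over the eight bit-planes, | ||
| @ sharing the GF(2^4)-tower inversion with .Ldec_sbox | ||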
708 | veor q2, q2, q1 | ||
709 | veor q5, q5, q6 | ||
710 | veor q3, q3, q0 | ||
711 | veor q6, q6, q2 | ||
712 | veor q5, q5, q0 | ||
713 | |||
714 | veor q6, q6, q3 | ||
715 | veor q3, q3, q7 | ||
716 | veor q7, q7, q5 | ||
717 | veor q3, q3, q4 | ||
718 | veor q4, q4, q5 | ||
719 | |||
720 | veor q2, q2, q7 | ||
721 | veor q3, q3, q1 | ||
722 | veor q1, q1, q5 | ||
723 | veor q11, q7, q4 | ||
724 | veor q10, q1, q2 | ||
725 | veor q9, q5, q3 | ||
726 | veor q13, q2, q4 | ||
727 | vmov q8, q10 | ||
728 | veor q12, q6, q0 | ||
729 | |||
730 | vorr q10, q10, q9 | ||
731 | veor q15, q11, q8 | ||
732 | vand q14, q11, q12 | ||
733 | vorr q11, q11, q12 | ||
734 | veor q12, q12, q9 | ||
735 | vand q8, q8, q9 | ||
736 | veor q9, q3, q0 | ||
737 | vand q15, q15, q12 | ||
738 | vand q13, q13, q9 | ||
739 | veor q9, q7, q1 | ||
740 | veor q12, q5, q6 | ||
741 | veor q11, q11, q13 | ||
742 | veor q10, q10, q13 | ||
743 | vand q13, q9, q12 | ||
744 | vorr q9, q9, q12 | ||
745 | veor q11, q11, q15 | ||
746 | veor q8, q8, q13 | ||
747 | veor q10, q10, q14 | ||
748 | veor q9, q9, q15 | ||
749 | veor q8, q8, q14 | ||
750 | vand q12, q2, q3 | ||
751 | veor q9, q9, q14 | ||
752 | vand q13, q4, q0 | ||
753 | vand q14, q1, q5 | ||
754 | vorr q15, q7, q6 | ||
755 | veor q11, q11, q12 | ||
756 | veor q9, q9, q14 | ||
757 | veor q8, q8, q15 | ||
758 | veor q10, q10, q13 | ||
759 | |||
760 | @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 | ||
761 | |||
762 | @ new smaller inversion | ||
763 | |||
764 | vand q14, q11, q9 | ||
765 | vmov q12, q8 | ||
766 | |||
767 | veor q13, q10, q14 | ||
768 | veor q15, q8, q14 | ||
769 | veor q14, q8, q14 @ q14=q15 | ||
770 | |||
771 | vbsl q13, q9, q8 | ||
772 | vbsl q15, q11, q10 | ||
773 | veor q11, q11, q10 | ||
774 | |||
775 | vbsl q12, q13, q14 | ||
776 | vbsl q8, q14, q13 | ||
777 | |||
778 | vand q14, q12, q15 | ||
779 | veor q9, q9, q8 | ||
780 | |||
781 | veor q14, q14, q11 | ||
782 | veor q12, q6, q0 | ||
783 | veor q8, q5, q3 | ||
784 | veor q10, q15, q14 | ||
785 | vand q10, q10, q6 | ||
786 | veor q6, q6, q5 | ||
787 | vand q11, q5, q15 | ||
788 | vand q6, q6, q14 | ||
789 | veor q5, q11, q10 | ||
790 | veor q6, q6, q11 | ||
791 | veor q15, q15, q13 | ||
792 | veor q14, q14, q9 | ||
793 | veor q11, q15, q14 | ||
794 | veor q10, q13, q9 | ||
795 | vand q11, q11, q12 | ||
796 | vand q10, q10, q0 | ||
797 | veor q12, q12, q8 | ||
798 | veor q0, q0, q3 | ||
799 | vand q8, q8, q15 | ||
800 | vand q3, q3, q13 | ||
801 | vand q12, q12, q14 | ||
802 | vand q0, q0, q9 | ||
803 | veor q8, q8, q12 | ||
804 | veor q0, q0, q3 | ||
805 | veor q12, q12, q11 | ||
806 | veor q3, q3, q10 | ||
807 | veor q6, q6, q12 | ||
808 | veor q0, q0, q12 | ||
809 | veor q5, q5, q8 | ||
810 | veor q3, q3, q8 | ||
811 | |||
812 | veor q12, q7, q4 | ||
813 | veor q8, q1, q2 | ||
814 | veor q11, q15, q14 | ||
815 | veor q10, q13, q9 | ||
816 | vand q11, q11, q12 | ||
817 | vand q10, q10, q4 | ||
818 | veor q12, q12, q8 | ||
819 | veor q4, q4, q2 | ||
820 | vand q8, q8, q15 | ||
821 | vand q2, q2, q13 | ||
822 | vand q12, q12, q14 | ||
823 | vand q4, q4, q9 | ||
824 | veor q8, q8, q12 | ||
825 | veor q4, q4, q2 | ||
826 | veor q12, q12, q11 | ||
827 | veor q2, q2, q10 | ||
828 | veor q15, q15, q13 | ||
829 | veor q14, q14, q9 | ||
830 | veor q10, q15, q14 | ||
831 | vand q10, q10, q7 | ||
832 | veor q7, q7, q1 | ||
833 | vand q11, q1, q15 | ||
834 | vand q7, q7, q14 | ||
835 | veor q1, q11, q10 | ||
836 | veor q7, q7, q11 | ||
837 | veor q7, q7, q12 | ||
838 | veor q4, q4, q12 | ||
839 | veor q1, q1, q8 | ||
840 | veor q2, q2, q8 | ||
841 | veor q7, q7, q0 | ||
842 | veor q1, q1, q6 | ||
843 | veor q6, q6, q0 | ||
844 | veor q4, q4, q7 | ||
845 | veor q0, q0, q1 | ||
846 | |||
847 | veor q1, q1, q5 | ||
848 | veor q5, q5, q2 | ||
849 | veor q2, q2, q3 | ||
850 | veor q3, q3, q5 | ||
851 | veor q4, q4, q5 | ||
852 | |||
853 | veor q6, q6, q3 | ||
854 | subs r5,r5,#1 | ||
855 | bcc .Lenc_done | ||
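| @ MixColumns per bit-plane: vext #12 and #8 rotate each 128-bit | ||
| @ plane by 32 and 64 bits, realising the x ^ (x <<< 32) and | ||
| @ (x ^ (x <<< 32)) <<< 64 terms noted in the comments below | ||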
856 | vext.8 q8, q0, q0, #12 @ x0 <<< 32 | ||
857 | vext.8 q9, q1, q1, #12 | ||
858 | veor q0, q0, q8 @ x0 ^ (x0 <<< 32) | ||
859 | vext.8 q10, q4, q4, #12 | ||
860 | veor q1, q1, q9 | ||
861 | vext.8 q11, q6, q6, #12 | ||
862 | veor q4, q4, q10 | ||
863 | vext.8 q12, q3, q3, #12 | ||
864 | veor q6, q6, q11 | ||
865 | vext.8 q13, q7, q7, #12 | ||
866 | veor q3, q3, q12 | ||
867 | vext.8 q14, q2, q2, #12 | ||
868 | veor q7, q7, q13 | ||
869 | vext.8 q15, q5, q5, #12 | ||
870 | veor q2, q2, q14 | ||
871 | |||
872 | veor q9, q9, q0 | ||
873 | veor q5, q5, q15 | ||
874 | vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64 | ||
875 | veor q10, q10, q1 | ||
876 | veor q8, q8, q5 | ||
877 | veor q9, q9, q5 | ||
878 | vext.8 q1, q1, q1, #8 | ||
879 | veor q13, q13, q3 | ||
880 | veor q0, q0, q8 | ||
881 | veor q14, q14, q7 | ||
882 | veor q1, q1, q9 | ||
883 | vext.8 q8, q3, q3, #8 | ||
884 | veor q12, q12, q6 | ||
885 | vext.8 q9, q7, q7, #8 | ||
886 | veor q15, q15, q2 | ||
887 | vext.8 q3, q6, q6, #8 | ||
888 | veor q11, q11, q4 | ||
889 | vext.8 q7, q5, q5, #8 | ||
890 | veor q12, q12, q5 | ||
891 | vext.8 q6, q2, q2, #8 | ||
892 | veor q11, q11, q5 | ||
893 | vext.8 q2, q4, q4, #8 | ||
894 | veor q5, q9, q13 | ||
895 | veor q4, q8, q12 | ||
896 | veor q3, q3, q11 | ||
897 | veor q7, q7, q15 | ||
898 | veor q6, q6, q14 | ||
899 | @ vmov q4, q8 | ||
900 | veor q2, q2, q10 | ||
901 | @ vmov q5, q9 | ||
902 | vldmia r6, {q12} @ .LSR | ||
903 | ite eq @ Thumb2 thing, sanity check in ARM | ||
904 | addeq r6,r6,#0x10 | ||
905 | bne .Lenc_loop | ||
906 | vldmia r6, {q12} @ .LSRM0 | ||
907 | b .Lenc_loop | ||
908 | .align 4 | ||
909 | .Lenc_done: | ||
910 | vmov.i8 q8,#0x55 @ compose .LBS0 | ||
911 | vmov.i8 q9,#0x33 @ compose .LBS1 | ||
912 | vshr.u64 q10, q2, #1 | ||
913 | vshr.u64 q11, q3, #1 | ||
914 | veor q10, q10, q5 | ||
915 | veor q11, q11, q7 | ||
916 | vand q10, q10, q8 | ||
917 | vand q11, q11, q8 | ||
918 | veor q5, q5, q10 | ||
919 | vshl.u64 q10, q10, #1 | ||
920 | veor q7, q7, q11 | ||
921 | vshl.u64 q11, q11, #1 | ||
922 | veor q2, q2, q10 | ||
923 | veor q3, q3, q11 | ||
924 | vshr.u64 q10, q4, #1 | ||
925 | vshr.u64 q11, q0, #1 | ||
926 | veor q10, q10, q6 | ||
927 | veor q11, q11, q1 | ||
928 | vand q10, q10, q8 | ||
929 | vand q11, q11, q8 | ||
930 | veor q6, q6, q10 | ||
931 | vshl.u64 q10, q10, #1 | ||
932 | veor q1, q1, q11 | ||
933 | vshl.u64 q11, q11, #1 | ||
934 | veor q4, q4, q10 | ||
935 | veor q0, q0, q11 | ||
936 | vmov.i8 q8,#0x0f @ compose .LBS2 | ||
937 | vshr.u64 q10, q7, #2 | ||
938 | vshr.u64 q11, q3, #2 | ||
939 | veor q10, q10, q5 | ||
940 | veor q11, q11, q2 | ||
941 | vand q10, q10, q9 | ||
942 | vand q11, q11, q9 | ||
943 | veor q5, q5, q10 | ||
944 | vshl.u64 q10, q10, #2 | ||
945 | veor q2, q2, q11 | ||
946 | vshl.u64 q11, q11, #2 | ||
947 | veor q7, q7, q10 | ||
948 | veor q3, q3, q11 | ||
949 | vshr.u64 q10, q1, #2 | ||
950 | vshr.u64 q11, q0, #2 | ||
951 | veor q10, q10, q6 | ||
952 | veor q11, q11, q4 | ||
953 | vand q10, q10, q9 | ||
954 | vand q11, q11, q9 | ||
955 | veor q6, q6, q10 | ||
956 | vshl.u64 q10, q10, #2 | ||
957 | veor q4, q4, q11 | ||
958 | vshl.u64 q11, q11, #2 | ||
959 | veor q1, q1, q10 | ||
960 | veor q0, q0, q11 | ||
961 | vshr.u64 q10, q6, #4 | ||
962 | vshr.u64 q11, q4, #4 | ||
963 | veor q10, q10, q5 | ||
964 | veor q11, q11, q2 | ||
965 | vand q10, q10, q8 | ||
966 | vand q11, q11, q8 | ||
967 | veor q5, q5, q10 | ||
968 | vshl.u64 q10, q10, #4 | ||
969 | veor q2, q2, q11 | ||
970 | vshl.u64 q11, q11, #4 | ||
971 | veor q6, q6, q10 | ||
972 | veor q4, q4, q11 | ||
973 | vshr.u64 q10, q1, #4 | ||
974 | vshr.u64 q11, q0, #4 | ||
975 | veor q10, q10, q7 | ||
976 | veor q11, q11, q3 | ||
977 | vand q10, q10, q8 | ||
978 | vand q11, q11, q8 | ||
979 | veor q7, q7, q10 | ||
980 | vshl.u64 q10, q10, #4 | ||
981 | veor q3, q3, q11 | ||
982 | vshl.u64 q11, q11, #4 | ||
983 | veor q1, q1, q10 | ||
984 | veor q0, q0, q11 | ||
985 | vldmia r4, {q8} @ last round key | ||
986 | veor q4, q4, q8 | ||
987 | veor q6, q6, q8 | ||
988 | veor q3, q3, q8 | ||
989 | veor q7, q7, q8 | ||
990 | veor q2, q2, q8 | ||
991 | veor q5, q5, q8 | ||
992 | veor q0, q0, q8 | ||
993 | veor q1, q1, q8 | ||
994 | bx lr | ||
995 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
996 | .type _bsaes_key_convert,%function | ||
997 | .align 4 | ||
998 | _bsaes_key_convert: | ||
999 | adr r6,_bsaes_key_convert | ||
1000 | vld1.8 {q7}, [r4]! @ load round 0 key | ||
1001 | sub r6,r6,#_bsaes_key_convert-.LM0 | ||
1002 | vld1.8 {q15}, [r4]! @ load round 1 key | ||
1003 | |||
1004 | vmov.i8 q8, #0x01 @ bit masks | ||
1005 | vmov.i8 q9, #0x02 | ||
1006 | vmov.i8 q10, #0x04 | ||
1007 | vmov.i8 q11, #0x08 | ||
1008 | vmov.i8 q12, #0x10 | ||
1009 | vmov.i8 q13, #0x20 | ||
1010 | vldmia r6, {q14} @ .LM0 | ||
1011 | |||
1012 | #ifdef __ARMEL__ | ||
1013 | vrev32.8 q7, q7 | ||
1014 | vrev32.8 q15, q15 | ||
1015 | #endif | ||
1016 | sub r5,r5,#1 | ||
1017 | vstmia r12!, {q7} @ save round 0 key | ||
1018 | b .Lkey_loop | ||
1019 | |||
1020 | .align 4 | ||
1021 | .Lkey_loop: | ||
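| @ permute the round key with .LM0, then let vtst against the bit | ||
| @ masks 0x01..0x80 spread its bytes into eight one-bit planes q0-q7 | ||
| @ (hence 128 bytes of schedule per round) | ||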
1022 | vtbl.8 d14,{q15},d28 | ||
1023 | vtbl.8 d15,{q15},d29 | ||
1024 | vmov.i8 q6, #0x40 | ||
1025 | vmov.i8 q15, #0x80 | ||
1026 | |||
1027 | vtst.8 q0, q7, q8 | ||
1028 | vtst.8 q1, q7, q9 | ||
1029 | vtst.8 q2, q7, q10 | ||
1030 | vtst.8 q3, q7, q11 | ||
1031 | vtst.8 q4, q7, q12 | ||
1032 | vtst.8 q5, q7, q13 | ||
1033 | vtst.8 q6, q7, q6 | ||
1034 | vtst.8 q7, q7, q15 | ||
1035 | vld1.8 {q15}, [r4]! @ load next round key | ||
1036 | vmvn q0, q0 @ "pnot" | ||
1037 | vmvn q1, q1 | ||
1038 | vmvn q5, q5 | ||
1039 | vmvn q6, q6 | ||
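| @ inverting planes 0, 1, 5 and 6 XORs 0x63 (the S-box affine | ||
| @ constant) into every key byte ahead of time | ||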
1040 | #ifdef __ARMEL__ | ||
1041 | vrev32.8 q15, q15 | ||
1042 | #endif | ||
1043 | subs r5,r5,#1 | ||
1044 | vstmia r12!,{q0-q7} @ write bit-sliced round key | ||
1045 | bne .Lkey_loop | ||
1046 | |||
1047 | vmov.i8 q7,#0x63 @ compose .L63 | ||
1048 | @ don't save last round key | ||
1049 | bx lr | ||
1050 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
1051 | .extern AES_cbc_encrypt | ||
1052 | .extern AES_decrypt | ||
1053 | |||
1054 | .global bsaes_cbc_encrypt | ||
1055 | .type bsaes_cbc_encrypt,%function | ||
1056 | .align 5 | ||
1057 | bsaes_cbc_encrypt: | ||
1058 | #ifndef __KERNEL__ | ||
1059 | cmp r2, #128 | ||
1060 | #ifndef __thumb__ | ||
1061 | blo AES_cbc_encrypt | ||
1062 | #else | ||
1063 | bhs 1f | ||
1064 | b AES_cbc_encrypt | ||
1065 | 1: | ||
1066 | #endif | ||
1067 | #endif | ||
1068 | |||
1069 | @ it is up to the caller to make sure we are called with enc == 0 | ||
1070 | |||
1071 | mov ip, sp | ||
1072 | stmdb sp!, {r4-r10, lr} | ||
1073 | VFP_ABI_PUSH | ||
1074 | ldr r8, [ip] @ IV is 1st arg on the stack | ||
1075 | mov r2, r2, lsr#4 @ len in 16-byte blocks | ||
1076 | sub sp, #0x10 @ scratch space to carry over the IV | ||
1077 | mov r9, sp @ save sp | ||
1078 | |||
1079 | ldr r10, [r3, #240] @ get # of rounds | ||
1080 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1081 | @ allocate the key schedule on the stack | ||
1082 | sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key | ||
1083 | add r12, #96 @ size of bit-sliced key schedule | ||
1084 | |||
1085 | @ populate the key schedule | ||
1086 | mov r4, r3 @ pass key | ||
1087 | mov r5, r10 @ pass # of rounds | ||
1088 | mov sp, r12 @ sp now points at the key schedule | ||
1089 | bl _bsaes_key_convert | ||
1090 | vldmia sp, {q6} | ||
1091 | vstmia r12, {q15} @ save last round key | ||
1092 | veor q7, q7, q6 @ fix up round 0 key | ||
1093 | vstmia sp, {q7} | ||
1094 | #else | ||
1095 | ldr r12, [r3, #244] | ||
1096 | eors r12, #1 | ||
1097 | beq 0f | ||
1098 | |||
1099 | @ populate the key schedule | ||
1100 | str r12, [r3, #244] | ||
1101 | mov r4, r3 @ pass key | ||
1102 | mov r5, r10 @ pass # of rounds | ||
1103 | add r12, r3, #248 @ pass key schedule | ||
1104 | bl _bsaes_key_convert | ||
1105 | add r4, r3, #248 | ||
1106 | vldmia r4, {q6} | ||
1107 | vstmia r12, {q15} @ save last round key | ||
1108 | veor q7, q7, q6 @ fix up round 0 key | ||
1109 | vstmia r4, {q7} | ||
1110 | |||
1111 | .align 2 | ||
1112 | 0: | ||
1113 | #endif | ||
1114 | |||
1115 | vld1.8 {q15}, [r8] @ load IV | ||
1116 | b .Lcbc_dec_loop | ||
1117 | |||
1118 | .align 4 | ||
1119 | .Lcbc_dec_loop: | ||
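| @ eight blocks per iteration: the ciphertext is reloaded after | ||
| @ _bsaes_decrypt8 and XORed in as the chaining values, and the last | ||
| @ ciphertext block is left in q15 as the next IV; the store order | ||
| @ q0,q1,q6,q4,... matches the permuted output of _bsaes_decrypt8 | ||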
1120 | subs r2, r2, #0x8 | ||
1121 | bmi .Lcbc_dec_loop_finish | ||
1122 | |||
1123 | vld1.8 {q0-q1}, [r0]! @ load input | ||
1124 | vld1.8 {q2-q3}, [r0]! | ||
1125 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1126 | mov r4, sp @ pass the key | ||
1127 | #else | ||
1128 | add r4, r3, #248 | ||
1129 | #endif | ||
1130 | vld1.8 {q4-q5}, [r0]! | ||
1131 | mov r5, r10 | ||
1132 | vld1.8 {q6-q7}, [r0] | ||
1133 | sub r0, r0, #0x60 | ||
1134 | vstmia r9, {q15} @ put aside IV | ||
1135 | |||
1136 | bl _bsaes_decrypt8 | ||
1137 | |||
1138 | vldmia r9, {q14} @ reload IV | ||
1139 | vld1.8 {q8-q9}, [r0]! @ reload input | ||
1140 | veor q0, q0, q14 @ ^= IV | ||
1141 | vld1.8 {q10-q11}, [r0]! | ||
1142 | veor q1, q1, q8 | ||
1143 | veor q6, q6, q9 | ||
1144 | vld1.8 {q12-q13}, [r0]! | ||
1145 | veor q4, q4, q10 | ||
1146 | veor q2, q2, q11 | ||
1147 | vld1.8 {q14-q15}, [r0]! | ||
1148 | veor q7, q7, q12 | ||
1149 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1150 | veor q3, q3, q13 | ||
1151 | vst1.8 {q6}, [r1]! | ||
1152 | veor q5, q5, q14 | ||
1153 | vst1.8 {q4}, [r1]! | ||
1154 | vst1.8 {q2}, [r1]! | ||
1155 | vst1.8 {q7}, [r1]! | ||
1156 | vst1.8 {q3}, [r1]! | ||
1157 | vst1.8 {q5}, [r1]! | ||
1158 | |||
1159 | b .Lcbc_dec_loop | ||
1160 | |||
1161 | .Lcbc_dec_loop_finish: | ||
1162 | adds r2, r2, #8 | ||
1163 | beq .Lcbc_dec_done | ||
1164 | |||
1165 | vld1.8 {q0}, [r0]! @ load input | ||
1166 | cmp r2, #2 | ||
1167 | blo .Lcbc_dec_one | ||
1168 | vld1.8 {q1}, [r0]! | ||
1169 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1170 | mov r4, sp @ pass the key | ||
1171 | #else | ||
1172 | add r4, r3, #248 | ||
1173 | #endif | ||
1174 | mov r5, r10 | ||
1175 | vstmia r9, {q15} @ put aside IV | ||
1176 | beq .Lcbc_dec_two | ||
1177 | vld1.8 {q2}, [r0]! | ||
1178 | cmp r2, #4 | ||
1179 | blo .Lcbc_dec_three | ||
1180 | vld1.8 {q3}, [r0]! | ||
1181 | beq .Lcbc_dec_four | ||
1182 | vld1.8 {q4}, [r0]! | ||
1183 | cmp r2, #6 | ||
1184 | blo .Lcbc_dec_five | ||
1185 | vld1.8 {q5}, [r0]! | ||
1186 | beq .Lcbc_dec_six | ||
1187 | vld1.8 {q6}, [r0]! | ||
1188 | sub r0, r0, #0x70 | ||
1189 | |||
1190 | bl _bsaes_decrypt8 | ||
1191 | |||
1192 | vldmia r9, {q14} @ reload IV | ||
1193 | vld1.8 {q8-q9}, [r0]! @ reload input | ||
1194 | veor q0, q0, q14 @ ^= IV | ||
1195 | vld1.8 {q10-q11}, [r0]! | ||
1196 | veor q1, q1, q8 | ||
1197 | veor q6, q6, q9 | ||
1198 | vld1.8 {q12-q13}, [r0]! | ||
1199 | veor q4, q4, q10 | ||
1200 | veor q2, q2, q11 | ||
1201 | vld1.8 {q15}, [r0]! | ||
1202 | veor q7, q7, q12 | ||
1203 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1204 | veor q3, q3, q13 | ||
1205 | vst1.8 {q6}, [r1]! | ||
1206 | vst1.8 {q4}, [r1]! | ||
1207 | vst1.8 {q2}, [r1]! | ||
1208 | vst1.8 {q7}, [r1]! | ||
1209 | vst1.8 {q3}, [r1]! | ||
1210 | b .Lcbc_dec_done | ||
1211 | .align 4 | ||
1212 | .Lcbc_dec_six: | ||
1213 | sub r0, r0, #0x60 | ||
1214 | bl _bsaes_decrypt8 | ||
1215 | vldmia r9,{q14} @ reload IV | ||
1216 | vld1.8 {q8-q9}, [r0]! @ reload input | ||
1217 | veor q0, q0, q14 @ ^= IV | ||
1218 | vld1.8 {q10-q11}, [r0]! | ||
1219 | veor q1, q1, q8 | ||
1220 | veor q6, q6, q9 | ||
1221 | vld1.8 {q12}, [r0]! | ||
1222 | veor q4, q4, q10 | ||
1223 | veor q2, q2, q11 | ||
1224 | vld1.8 {q15}, [r0]! | ||
1225 | veor q7, q7, q12 | ||
1226 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1227 | vst1.8 {q6}, [r1]! | ||
1228 | vst1.8 {q4}, [r1]! | ||
1229 | vst1.8 {q2}, [r1]! | ||
1230 | vst1.8 {q7}, [r1]! | ||
1231 | b .Lcbc_dec_done | ||
1232 | .align 4 | ||
1233 | .Lcbc_dec_five: | ||
1234 | sub r0, r0, #0x50 | ||
1235 | bl _bsaes_decrypt8 | ||
1236 | vldmia r9, {q14} @ reload IV | ||
1237 | vld1.8 {q8-q9}, [r0]! @ reload input | ||
1238 | veor q0, q0, q14 @ ^= IV | ||
1239 | vld1.8 {q10-q11}, [r0]! | ||
1240 | veor q1, q1, q8 | ||
1241 | veor q6, q6, q9 | ||
1242 | vld1.8 {q15}, [r0]! | ||
1243 | veor q4, q4, q10 | ||
1244 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1245 | veor q2, q2, q11 | ||
1246 | vst1.8 {q6}, [r1]! | ||
1247 | vst1.8 {q4}, [r1]! | ||
1248 | vst1.8 {q2}, [r1]! | ||
1249 | b .Lcbc_dec_done | ||
1250 | .align 4 | ||
1251 | .Lcbc_dec_four: | ||
1252 | sub r0, r0, #0x40 | ||
1253 | bl _bsaes_decrypt8 | ||
1254 | vldmia r9, {q14} @ reload IV | ||
1255 | vld1.8 {q8-q9}, [r0]! @ reload input | ||
1256 | veor q0, q0, q14 @ ^= IV | ||
1257 | vld1.8 {q10}, [r0]! | ||
1258 | veor q1, q1, q8 | ||
1259 | veor q6, q6, q9 | ||
1260 | vld1.8 {q15}, [r0]! | ||
1261 | veor q4, q4, q10 | ||
1262 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1263 | vst1.8 {q6}, [r1]! | ||
1264 | vst1.8 {q4}, [r1]! | ||
1265 | b .Lcbc_dec_done | ||
1266 | .align 4 | ||
1267 | .Lcbc_dec_three: | ||
1268 | sub r0, r0, #0x30 | ||
1269 | bl _bsaes_decrypt8 | ||
1270 | vldmia r9, {q14} @ reload IV | ||
1271 | vld1.8 {q8-q9}, [r0]! @ reload input | ||
1272 | veor q0, q0, q14 @ ^= IV | ||
1273 | vld1.8 {q15}, [r0]! | ||
1274 | veor q1, q1, q8 | ||
1275 | veor q6, q6, q9 | ||
1276 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1277 | vst1.8 {q6}, [r1]! | ||
1278 | b .Lcbc_dec_done | ||
1279 | .align 4 | ||
1280 | .Lcbc_dec_two: | ||
1281 | sub r0, r0, #0x20 | ||
1282 | bl _bsaes_decrypt8 | ||
1283 | vldmia r9, {q14} @ reload IV | ||
1284 | vld1.8 {q8}, [r0]! @ reload input | ||
1285 | veor q0, q0, q14 @ ^= IV | ||
1286 | vld1.8 {q15}, [r0]! @ reload input | ||
1287 | veor q1, q1, q8 | ||
1288 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1289 | b .Lcbc_dec_done | ||
1290 | .align 4 | ||
1291 | .Lcbc_dec_one: | ||
1292 | sub r0, r0, #0x10 | ||
1293 | mov r10, r1 @ save original out pointer | ||
1294 | mov r1, r9 @ use the iv scratch space as out buffer | ||
1295 | mov r2, r3 | ||
1296 | vmov q4,q15 @ just in case ensure that IV | ||
1297 | vmov q5,q0 @ and input are preserved | ||
1298 | bl AES_decrypt | ||
1299 | vld1.8 {q0}, [r9,:64] @ load result | ||
1300 | veor q0, q0, q4 @ ^= IV | ||
1301 | vmov q15, q5 @ q5 holds input | ||
1302 | vst1.8 {q0}, [r10] @ write output | ||
1303 | |||
1304 | .Lcbc_dec_done: | ||
1305 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1306 | vmov.i32 q0, #0 | ||
1307 | vmov.i32 q1, #0 | ||
1308 | .Lcbc_dec_bzero: @ wipe key schedule [if any] | ||
1309 | vstmia sp!, {q0-q1} | ||
1310 | cmp sp, r9 | ||
1311 | bne .Lcbc_dec_bzero | ||
1312 | #endif | ||
1313 | |||
1314 | mov sp, r9 | ||
1315 | add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb | ||
1316 | vst1.8 {q15}, [r8] @ return IV | ||
1317 | VFP_ABI_POP | ||
1318 | ldmia sp!, {r4-r10, pc} | ||
1319 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
1320 | .extern AES_encrypt | ||
1321 | .global bsaes_ctr32_encrypt_blocks | ||
1322 | .type bsaes_ctr32_encrypt_blocks,%function | ||
1323 | .align 5 | ||
1324 | bsaes_ctr32_encrypt_blocks: | ||
1325 | cmp r2, #8 @ use plain AES for | ||
1326 | blo .Lctr_enc_short @ small sizes | ||
1327 | |||
1328 | mov ip, sp | ||
1329 | stmdb sp!, {r4-r10, lr} | ||
1330 | VFP_ABI_PUSH | ||
1331 | ldr r8, [ip] @ ctr is 1st arg on the stack | ||
1332 | sub sp, sp, #0x10 @ scratch space to carry over the ctr | ||
1333 | mov r9, sp @ save sp | ||
1334 | |||
1335 | ldr r10, [r3, #240] @ get # of rounds | ||
1336 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1337 | @ allocate the key schedule on the stack | ||
1338 | sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key | ||
1339 | add r12, #96 @ size of bit-sliced key schedule | ||
1340 | |||
1341 | @ populate the key schedule | ||
1342 | mov r4, r3 @ pass key | ||
1343 | mov r5, r10 @ pass # of rounds | ||
1344 | mov sp, r12 @ sp now points at the key schedule | ||
1345 | bl _bsaes_key_convert | ||
1346 | veor q7,q7,q15 @ fix up last round key | ||
1347 | vstmia r12, {q7} @ save last round key | ||
1348 | |||
1349 | vld1.8 {q0}, [r8] @ load counter | ||
1350 | add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 | ||
1351 | vldmia sp, {q4} @ load round0 key | ||
1352 | #else | ||
1353 | ldr r12, [r3, #244] | ||
1354 | eors r12, #1 | ||
1355 | beq 0f | ||
1356 | |||
1357 | @ populate the key schedule | ||
1358 | str r12, [r3, #244] | ||
1359 | mov r4, r3 @ pass key | ||
1360 | mov r5, r10 @ pass # of rounds | ||
1361 | add r12, r3, #248 @ pass key schedule | ||
1362 | bl _bsaes_key_convert | ||
1363 | veor q7,q7,q15 @ fix up last round key | ||
1364 | vstmia r12, {q7} @ save last round key | ||
1365 | |||
1366 | .align 2 | ||
1367 | 0: add r12, r3, #248 | ||
1368 | vld1.8 {q0}, [r8] @ load counter | ||
1369 | adrl r8, .LREVM0SR @ borrow r8 | ||
1370 | vldmia r12, {q4} @ load round0 key | ||
1371 | sub sp, #0x10 @ place for adjusted round0 key | ||
1372 | #endif | ||
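| @ vrev32.8 turns the big-endian counter word little-endian so plain | ||
| @ vadd.u32 can increment it; 1<<96 puts the increment in that lane, | ||
| @ and .LREVM0SR undoes the byte reversal inside _bsaes_encrypt8_alt | ||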
1373 | |||
1374 | vmov.i32 q8,#1 @ compose 1<<96 | ||
1375 | veor q9,q9,q9 | ||
1376 | vrev32.8 q0,q0 | ||
1377 | vext.8 q8,q9,q8,#4 | ||
1378 | vrev32.8 q4,q4 | ||
1379 | vadd.u32 q9,q8,q8 @ compose 2<<96 | ||
1380 | vstmia sp, {q4} @ save adjusted round0 key | ||
1381 | b .Lctr_enc_loop | ||
1382 | |||
1383 | .align 4 | ||
1384 | .Lctr_enc_loop: | ||
1385 | vadd.u32 q10, q8, q9 @ compose 3<<96 | ||
1386 | vadd.u32 q1, q0, q8 @ +1 | ||
1387 | vadd.u32 q2, q0, q9 @ +2 | ||
1388 | vadd.u32 q3, q0, q10 @ +3 | ||
1389 | vadd.u32 q4, q1, q10 | ||
1390 | vadd.u32 q5, q2, q10 | ||
1391 | vadd.u32 q6, q3, q10 | ||
1392 | vadd.u32 q7, q4, q10 | ||
1393 | vadd.u32 q10, q5, q10 @ next counter | ||
1394 | |||
1395 | @ Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
1396 | @ to flip byte order in 32-bit counter | ||
1397 | |||
1398 | vldmia sp, {q9} @ load round0 key | ||
1399 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1400 | add r4, sp, #0x10 @ pass next round key | ||
1401 | #else | ||
1402 | add r4, r3, #264 | ||
1403 | #endif | ||
1404 | vldmia r8, {q8} @ .LREVM0SR | ||
1405 | mov r5, r10 @ pass rounds | ||
1406 | vstmia r9, {q10} @ save next counter | ||
1407 | sub r6, r8, #.LREVM0SR-.LSR @ pass constants | ||
1408 | |||
1409 | bl _bsaes_encrypt8_alt | ||
1410 | |||
1411 | subs r2, r2, #8 | ||
1412 | blo .Lctr_enc_loop_done | ||
1413 | |||
1414 | vld1.8 {q8-q9}, [r0]! @ load input | ||
1415 | vld1.8 {q10-q11}, [r0]! | ||
1416 | veor q0, q8 | ||
1417 | veor q1, q9 | ||
1418 | vld1.8 {q12-q13}, [r0]! | ||
1419 | veor q4, q10 | ||
1420 | veor q6, q11 | ||
1421 | vld1.8 {q14-q15}, [r0]! | ||
1422 | veor q3, q12 | ||
1423 | vst1.8 {q0-q1}, [r1]! @ write output | ||
1424 | veor q7, q13 | ||
1425 | veor q2, q14 | ||
1426 | vst1.8 {q4}, [r1]! | ||
1427 | veor q5, q15 | ||
1428 | vst1.8 {q6}, [r1]! | ||
1429 | vmov.i32 q8, #1 @ compose 1<<96 | ||
1430 | vst1.8 {q3}, [r1]! | ||
1431 | veor q9, q9, q9 | ||
1432 | vst1.8 {q7}, [r1]! | ||
1433 | vext.8 q8, q9, q8, #4 | ||
1434 | vst1.8 {q2}, [r1]! | ||
1435 | vadd.u32 q9,q8,q8 @ compose 2<<96 | ||
1436 | vst1.8 {q5}, [r1]! | ||
1437 | vldmia r9, {q0} @ load counter | ||
1438 | |||
1439 | bne .Lctr_enc_loop | ||
1440 | b .Lctr_enc_done | ||
1441 | |||
1442 | .align 4 | ||
1443 | .Lctr_enc_loop_done: | ||
1444 | add r2, r2, #8 | ||
1445 | vld1.8 {q8}, [r0]! @ load input | ||
1446 | veor q0, q8 | ||
1447 | vst1.8 {q0}, [r1]! @ write output | ||
1448 | cmp r2, #2 | ||
1449 | blo .Lctr_enc_done | ||
1450 | vld1.8 {q9}, [r0]! | ||
1451 | veor q1, q9 | ||
1452 | vst1.8 {q1}, [r1]! | ||
1453 | beq .Lctr_enc_done | ||
1454 | vld1.8 {q10}, [r0]! | ||
1455 | veor q4, q10 | ||
1456 | vst1.8 {q4}, [r1]! | ||
1457 | cmp r2, #4 | ||
1458 | blo .Lctr_enc_done | ||
1459 | vld1.8 {q11}, [r0]! | ||
1460 | veor q6, q11 | ||
1461 | vst1.8 {q6}, [r1]! | ||
1462 | beq .Lctr_enc_done | ||
1463 | vld1.8 {q12}, [r0]! | ||
1464 | veor q3, q12 | ||
1465 | vst1.8 {q3}, [r1]! | ||
1466 | cmp r2, #6 | ||
1467 | blo .Lctr_enc_done | ||
1468 | vld1.8 {q13}, [r0]! | ||
1469 | veor q7, q13 | ||
1470 | vst1.8 {q7}, [r1]! | ||
1471 | beq .Lctr_enc_done | ||
1472 | vld1.8 {q14}, [r0] | ||
1473 | veor q2, q14 | ||
1474 | vst1.8 {q2}, [r1]! | ||
1475 | |||
1476 | .Lctr_enc_done: | ||
1477 | vmov.i32 q0, #0 | ||
1478 | vmov.i32 q1, #0 | ||
1479 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1480 | .Lctr_enc_bzero: @ wipe key schedule [if any] | ||
1481 | vstmia sp!, {q0-q1} | ||
1482 | cmp sp, r9 | ||
1483 | bne .Lctr_enc_bzero | ||
1484 | #else | ||
1485 | vstmia sp, {q0-q1} | ||
1486 | #endif | ||
1487 | |||
1488 | mov sp, r9 | ||
1489 | add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb | ||
1490 | VFP_ABI_POP | ||
1491 | ldmia sp!, {r4-r10, pc} @ return | ||
1492 | |||
1493 | .align 4 | ||
1494 | .Lctr_enc_short: | ||
1495 | ldr ip, [sp] @ ctr pointer is passed on stack | ||
1496 | stmdb sp!, {r4-r8, lr} | ||
1497 | |||
1498 | mov r4, r0 @ copy arguments | ||
1499 | mov r5, r1 | ||
1500 | mov r6, r2 | ||
1501 | mov r7, r3 | ||
1502 | ldr r8, [ip, #12] @ load counter LSW | ||
1503 | vld1.8 {q1}, [ip] @ load whole counter value | ||
1504 | #ifdef __ARMEL__ | ||
1505 | rev r8, r8 | ||
1506 | #endif | ||
1507 | sub sp, sp, #0x10 | ||
1508 | vst1.8 {q1}, [sp,:64] @ copy counter value | ||
1509 | sub sp, sp, #0x10 | ||
1510 | |||
1511 | .Lctr_enc_short_loop: | ||
1512 | add r0, sp, #0x10 @ input counter value | ||
1513 | mov r1, sp @ output on the stack | ||
1514 | mov r2, r7 @ key | ||
1515 | |||
1516 | bl AES_encrypt | ||
1517 | |||
1518 | vld1.8 {q0}, [r4]! @ load input | ||
1519 | vld1.8 {q1}, [sp,:64] @ load encrypted counter | ||
1520 | add r8, r8, #1 | ||
1521 | #ifdef __ARMEL__ | ||
1522 | rev r0, r8 | ||
1523 | str r0, [sp, #0x1c] @ next counter value | ||
1524 | #else | ||
1525 | str r8, [sp, #0x1c] @ next counter value | ||
1526 | #endif | ||
1527 | veor q0,q0,q1 | ||
1528 | vst1.8 {q0}, [r5]! @ store output | ||
1529 | subs r6, r6, #1 | ||
1530 | bne .Lctr_enc_short_loop | ||
1531 | |||
1532 | vmov.i32 q0, #0 | ||
1533 | vmov.i32 q1, #0 | ||
1534 | vstmia sp!, {q0-q1} | ||
1535 | |||
1536 | ldmia sp!, {r4-r8, pc} | ||
1537 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
1538 | .globl bsaes_xts_encrypt | ||
1539 | .type bsaes_xts_encrypt,%function | ||
1540 | .align 4 | ||
1541 | bsaes_xts_encrypt: | ||
1542 | mov ip, sp | ||
1543 | stmdb sp!, {r4-r10, lr} @ 0x20 | ||
1544 | VFP_ABI_PUSH | ||
1545 | mov r6, sp @ future r3 | ||
1546 | |||
1547 | mov r7, r0 | ||
1548 | mov r8, r1 | ||
1549 | mov r9, r2 | ||
1550 | mov r10, r3 | ||
1551 | |||
1552 | sub r0, sp, #0x10 @ 0x10 | ||
1553 | bic r0, #0xf @ align at 16 bytes | ||
1554 | mov sp, r0 | ||
1555 | |||
1556 | #ifdef XTS_CHAIN_TWEAK | ||
1557 | ldr r0, [ip] @ pointer to input tweak | ||
1558 | #else | ||
1559 | @ generate initial tweak | ||
1560 | ldr r0, [ip, #4] @ iv[] | ||
1561 | mov r1, sp | ||
1562 | ldr r2, [ip, #0] @ key2 | ||
1563 | bl AES_encrypt | ||
1564 | mov r0,sp @ pointer to initial tweak | ||
1565 | #endif | ||
1566 | |||
1567 | ldr r1, [r10, #240] @ get # of rounds | ||
1568 | mov r3, r6 | ||
1569 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1570 | @ allocate the key schedule on the stack | ||
1571 | sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key | ||
1572 | @ add r12, #96 @ size of bit-sliced key schedule | ||
1573 | sub r12, #48 @ place for tweak[9] | ||
1574 | |||
1575 | @ populate the key schedule | ||
1576 | mov r4, r10 @ pass key | ||
1577 | mov r5, r1 @ pass # of rounds | ||
1578 | mov sp, r12 | ||
1579 | add r12, #0x90 @ pass key schedule | ||
1580 | bl _bsaes_key_convert | ||
1581 | veor q7, q7, q15 @ fix up last round key | ||
1582 | vstmia r12, {q7} @ save last round key | ||
1583 | #else | ||
1584 | ldr r12, [r10, #244] | ||
1585 | eors r12, #1 | ||
1586 | beq 0f | ||
1587 | |||
1588 | str r12, [r10, #244] | ||
1589 | mov r4, r10 @ pass key | ||
1590 | mov r5, r1 @ pass # of rounds | ||
1591 | add r12, r10, #248 @ pass key schedule | ||
1592 | bl _bsaes_key_convert | ||
1593 | veor q7, q7, q15 @ fix up last round key | ||
1594 | vstmia r12, {q7} | ||
1595 | |||
1596 | .align 2 | ||
1597 | 0: sub sp, #0x90 @ place for tweak[9] | ||
1598 | #endif | ||
1599 | |||
1600 | vld1.8 {q8}, [r0] @ initial tweak | ||
1601 | adr r2, .Lxts_magic | ||
1602 | |||
1603 | subs r9, #0x80 | ||
1604 | blo .Lxts_enc_short | ||
1605 | b .Lxts_enc_loop | ||
1606 | |||
1607 | .align 4 | ||
1608 | .Lxts_enc_loop: | ||
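| @ compute tweaks for 8 blocks: each vshr/vand/vadd/veor/vswp group | ||
| @ doubles the previous tweak in GF(2^128), i.e. shifts it left one | ||
| @ bit and XORs 0x87 into the low byte when bit 127 falls out | ||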
1609 | vldmia r2, {q5} @ load XTS magic | ||
1610 | vshr.s64 q6, q8, #63 | ||
1611 | mov r0, sp | ||
1612 | vand q6, q6, q5 | ||
1613 | vadd.u64 q9, q8, q8 | ||
1614 | vst1.64 {q8}, [r0,:128]! | ||
1615 | vswp d13,d12 | ||
1616 | vshr.s64 q7, q9, #63 | ||
1617 | veor q9, q9, q6 | ||
1618 | vand q7, q7, q5 | ||
1619 | vadd.u64 q10, q9, q9 | ||
1620 | vst1.64 {q9}, [r0,:128]! | ||
1621 | vswp d15,d14 | ||
1622 | vshr.s64 q6, q10, #63 | ||
1623 | veor q10, q10, q7 | ||
1624 | vand q6, q6, q5 | ||
1625 | vld1.8 {q0}, [r7]! | ||
1626 | vadd.u64 q11, q10, q10 | ||
1627 | vst1.64 {q10}, [r0,:128]! | ||
1628 | vswp d13,d12 | ||
1629 | vshr.s64 q7, q11, #63 | ||
1630 | veor q11, q11, q6 | ||
1631 | vand q7, q7, q5 | ||
1632 | vld1.8 {q1}, [r7]! | ||
1633 | veor q0, q0, q8 | ||
1634 | vadd.u64 q12, q11, q11 | ||
1635 | vst1.64 {q11}, [r0,:128]! | ||
1636 | vswp d15,d14 | ||
1637 | vshr.s64 q6, q12, #63 | ||
1638 | veor q12, q12, q7 | ||
1639 | vand q6, q6, q5 | ||
1640 | vld1.8 {q2}, [r7]! | ||
1641 | veor q1, q1, q9 | ||
1642 | vadd.u64 q13, q12, q12 | ||
1643 | vst1.64 {q12}, [r0,:128]! | ||
1644 | vswp d13,d12 | ||
1645 | vshr.s64 q7, q13, #63 | ||
1646 | veor q13, q13, q6 | ||
1647 | vand q7, q7, q5 | ||
1648 | vld1.8 {q3}, [r7]! | ||
1649 | veor q2, q2, q10 | ||
1650 | vadd.u64 q14, q13, q13 | ||
1651 | vst1.64 {q13}, [r0,:128]! | ||
1652 | vswp d15,d14 | ||
1653 | vshr.s64 q6, q14, #63 | ||
1654 | veor q14, q14, q7 | ||
1655 | vand q6, q6, q5 | ||
1656 | vld1.8 {q4}, [r7]! | ||
1657 | veor q3, q3, q11 | ||
1658 | vadd.u64 q15, q14, q14 | ||
1659 | vst1.64 {q14}, [r0,:128]! | ||
1660 | vswp d13,d12 | ||
1661 | vshr.s64 q7, q15, #63 | ||
1662 | veor q15, q15, q6 | ||
1663 | vand q7, q7, q5 | ||
1664 | vld1.8 {q5}, [r7]! | ||
1665 | veor q4, q4, q12 | ||
1666 | vadd.u64 q8, q15, q15 | ||
1667 | vst1.64 {q15}, [r0,:128]! | ||
1668 | vswp d15,d14 | ||
1669 | veor q8, q8, q7 | ||
1670 | vst1.64 {q8}, [r0,:128] @ next round tweak | ||
1671 | |||
1672 | vld1.8 {q6-q7}, [r7]! | ||
1673 | veor q5, q5, q13 | ||
1674 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1675 | add r4, sp, #0x90 @ pass key schedule | ||
1676 | #else | ||
1677 | add r4, r10, #248 @ pass key schedule | ||
1678 | #endif | ||
1679 | veor q6, q6, q14 | ||
1680 | mov r5, r1 @ pass rounds | ||
1681 | veor q7, q7, q15 | ||
1682 | mov r0, sp | ||
1683 | |||
1684 | bl _bsaes_encrypt8 | ||
1685 | |||
1686 | vld1.64 {q8-q9}, [r0,:128]! | ||
1687 | vld1.64 {q10-q11}, [r0,:128]! | ||
1688 | veor q0, q0, q8 | ||
1689 | vld1.64 {q12-q13}, [r0,:128]! | ||
1690 | veor q1, q1, q9 | ||
1691 | veor q8, q4, q10 | ||
1692 | vst1.8 {q0-q1}, [r8]! | ||
1693 | veor q9, q6, q11 | ||
1694 | vld1.64 {q14-q15}, [r0,:128]! | ||
1695 | veor q10, q3, q12 | ||
1696 | vst1.8 {q8-q9}, [r8]! | ||
1697 | veor q11, q7, q13 | ||
1698 | veor q12, q2, q14 | ||
1699 | vst1.8 {q10-q11}, [r8]! | ||
1700 | veor q13, q5, q15 | ||
1701 | vst1.8 {q12-q13}, [r8]! | ||
1702 | |||
1703 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1704 | |||
1705 | subs r9, #0x80 | ||
1706 | bpl .Lxts_enc_loop | ||
1707 | |||
1708 | .Lxts_enc_short: | ||
1709 | adds r9, #0x70 | ||
1710 | bmi .Lxts_enc_done | ||
1711 | |||
1712 | vldmia r2, {q5} @ load XTS magic | ||
1713 | vshr.s64 q7, q8, #63 | ||
1714 | mov r0, sp | ||
1715 | vand q7, q7, q5 | ||
1716 | vadd.u64 q9, q8, q8 | ||
1717 | vst1.64 {q8}, [r0,:128]! | ||
1718 | vswp d15,d14 | ||
1719 | vshr.s64 q6, q9, #63 | ||
1720 | veor q9, q9, q7 | ||
1721 | vand q6, q6, q5 | ||
1722 | vadd.u64 q10, q9, q9 | ||
1723 | vst1.64 {q9}, [r0,:128]! | ||
1724 | vswp d13,d12 | ||
1725 | vshr.s64 q7, q10, #63 | ||
1726 | veor q10, q10, q6 | ||
1727 | vand q7, q7, q5 | ||
1728 | vld1.8 {q0}, [r7]! | ||
1729 | subs r9, #0x10 | ||
1730 | bmi .Lxts_enc_1 | ||
1731 | vadd.u64 q11, q10, q10 | ||
1732 | vst1.64 {q10}, [r0,:128]! | ||
1733 | vswp d15,d14 | ||
1734 | vshr.s64 q6, q11, #63 | ||
1735 | veor q11, q11, q7 | ||
1736 | vand q6, q6, q5 | ||
1737 | vld1.8 {q1}, [r7]! | ||
1738 | subs r9, #0x10 | ||
1739 | bmi .Lxts_enc_2 | ||
1740 | veor q0, q0, q8 | ||
1741 | vadd.u64 q12, q11, q11 | ||
1742 | vst1.64 {q11}, [r0,:128]! | ||
1743 | vswp d13,d12 | ||
1744 | vshr.s64 q7, q12, #63 | ||
1745 | veor q12, q12, q6 | ||
1746 | vand q7, q7, q5 | ||
1747 | vld1.8 {q2}, [r7]! | ||
1748 | subs r9, #0x10 | ||
1749 | bmi .Lxts_enc_3 | ||
1750 | veor q1, q1, q9 | ||
1751 | vadd.u64 q13, q12, q12 | ||
1752 | vst1.64 {q12}, [r0,:128]! | ||
1753 | vswp d15,d14 | ||
1754 | vshr.s64 q6, q13, #63 | ||
1755 | veor q13, q13, q7 | ||
1756 | vand q6, q6, q5 | ||
1757 | vld1.8 {q3}, [r7]! | ||
1758 | subs r9, #0x10 | ||
1759 | bmi .Lxts_enc_4 | ||
1760 | veor q2, q2, q10 | ||
1761 | vadd.u64 q14, q13, q13 | ||
1762 | vst1.64 {q13}, [r0,:128]! | ||
1763 | vswp d13,d12 | ||
1764 | vshr.s64 q7, q14, #63 | ||
1765 | veor q14, q14, q6 | ||
1766 | vand q7, q7, q5 | ||
1767 | vld1.8 {q4}, [r7]! | ||
1768 | subs r9, #0x10 | ||
1769 | bmi .Lxts_enc_5 | ||
1770 | veor q3, q3, q11 | ||
1771 | vadd.u64 q15, q14, q14 | ||
1772 | vst1.64 {q14}, [r0,:128]! | ||
1773 | vswp d15,d14 | ||
1774 | vshr.s64 q6, q15, #63 | ||
1775 | veor q15, q15, q7 | ||
1776 | vand q6, q6, q5 | ||
1777 | vld1.8 {q5}, [r7]! | ||
1778 | subs r9, #0x10 | ||
1779 | bmi .Lxts_enc_6 | ||
1780 | veor q4, q4, q12 | ||
1781 | sub r9, #0x10 | ||
1782 | vst1.64 {q15}, [r0,:128] @ next round tweak | ||
1783 | |||
1784 | vld1.8 {q6}, [r7]! | ||
1785 | veor q5, q5, q13 | ||
1786 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1787 | add r4, sp, #0x90 @ pass key schedule | ||
1788 | #else | ||
1789 | add r4, r10, #248 @ pass key schedule | ||
1790 | #endif | ||
1791 | veor q6, q6, q14 | ||
1792 | mov r5, r1 @ pass rounds | ||
1793 | mov r0, sp | ||
1794 | |||
1795 | bl _bsaes_encrypt8 | ||
1796 | |||
1797 | vld1.64 {q8-q9}, [r0,:128]! | ||
1798 | vld1.64 {q10-q11}, [r0,:128]! | ||
1799 | veor q0, q0, q8 | ||
1800 | vld1.64 {q12-q13}, [r0,:128]! | ||
1801 | veor q1, q1, q9 | ||
1802 | veor q8, q4, q10 | ||
1803 | vst1.8 {q0-q1}, [r8]! | ||
1804 | veor q9, q6, q11 | ||
1805 | vld1.64 {q14}, [r0,:128]! | ||
1806 | veor q10, q3, q12 | ||
1807 | vst1.8 {q8-q9}, [r8]! | ||
1808 | veor q11, q7, q13 | ||
1809 | veor q12, q2, q14 | ||
1810 | vst1.8 {q10-q11}, [r8]! | ||
1811 | vst1.8 {q12}, [r8]! | ||
1812 | |||
1813 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1814 | b .Lxts_enc_done | ||
1815 | .align 4 | ||
1816 | .Lxts_enc_6: | ||
1817 | vst1.64 {q14}, [r0,:128] @ next round tweak | ||
1818 | |||
1819 | veor q4, q4, q12 | ||
1820 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1821 | add r4, sp, #0x90 @ pass key schedule | ||
1822 | #else | ||
1823 | add r4, r10, #248 @ pass key schedule | ||
1824 | #endif | ||
1825 | veor q5, q5, q13 | ||
1826 | mov r5, r1 @ pass rounds | ||
1827 | mov r0, sp | ||
1828 | |||
1829 | bl _bsaes_encrypt8 | ||
1830 | |||
1831 | vld1.64 {q8-q9}, [r0,:128]! | ||
1832 | vld1.64 {q10-q11}, [r0,:128]! | ||
1833 | veor q0, q0, q8 | ||
1834 | vld1.64 {q12-q13}, [r0,:128]! | ||
1835 | veor q1, q1, q9 | ||
1836 | veor q8, q4, q10 | ||
1837 | vst1.8 {q0-q1}, [r8]! | ||
1838 | veor q9, q6, q11 | ||
1839 | veor q10, q3, q12 | ||
1840 | vst1.8 {q8-q9}, [r8]! | ||
1841 | veor q11, q7, q13 | ||
1842 | vst1.8 {q10-q11}, [r8]! | ||
1843 | |||
1844 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1845 | b .Lxts_enc_done | ||
1846 | |||
1847 | @ put this in range for both ARM and Thumb mode adr instructions | ||
1848 | .align 5 | ||
1849 | .Lxts_magic: | ||
1850 | .quad 1, 0x87 | ||
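| @ doubling masks: 1 carries bit 63 into the high qword, 0x87 is the | ||
| @ reduction polynomial x^7+x^2+x+1 applied when bit 127 overflows | ||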
1851 | |||
1852 | .align 5 | ||
1853 | .Lxts_enc_5: | ||
1854 | vst1.64 {q13}, [r0,:128] @ next round tweak | ||
1855 | |||
1856 | veor q3, q3, q11 | ||
1857 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1858 | add r4, sp, #0x90 @ pass key schedule | ||
1859 | #else | ||
1860 | add r4, r10, #248 @ pass key schedule | ||
1861 | #endif | ||
1862 | veor q4, q4, q12 | ||
1863 | mov r5, r1 @ pass rounds | ||
1864 | mov r0, sp | ||
1865 | |||
1866 | bl _bsaes_encrypt8 | ||
1867 | |||
1868 | vld1.64 {q8-q9}, [r0,:128]! | ||
1869 | vld1.64 {q10-q11}, [r0,:128]! | ||
1870 | veor q0, q0, q8 | ||
1871 | vld1.64 {q12}, [r0,:128]! | ||
1872 | veor q1, q1, q9 | ||
1873 | veor q8, q4, q10 | ||
1874 | vst1.8 {q0-q1}, [r8]! | ||
1875 | veor q9, q6, q11 | ||
1876 | veor q10, q3, q12 | ||
1877 | vst1.8 {q8-q9}, [r8]! | ||
1878 | vst1.8 {q10}, [r8]! | ||
1879 | |||
1880 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1881 | b .Lxts_enc_done | ||
1882 | .align 4 | ||
1883 | .Lxts_enc_4: | ||
1884 | vst1.64 {q12}, [r0,:128] @ next round tweak | ||
1885 | |||
1886 | veor q2, q2, q10 | ||
1887 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1888 | add r4, sp, #0x90 @ pass key schedule | ||
1889 | #else | ||
1890 | add r4, r10, #248 @ pass key schedule | ||
1891 | #endif | ||
1892 | veor q3, q3, q11 | ||
1893 | mov r5, r1 @ pass rounds | ||
1894 | mov r0, sp | ||
1895 | |||
1896 | bl _bsaes_encrypt8 | ||
1897 | |||
1898 | vld1.64 {q8-q9}, [r0,:128]! | ||
1899 | vld1.64 {q10-q11}, [r0,:128]! | ||
1900 | veor q0, q0, q8 | ||
1901 | veor q1, q1, q9 | ||
1902 | veor q8, q4, q10 | ||
1903 | vst1.8 {q0-q1}, [r8]! | ||
1904 | veor q9, q6, q11 | ||
1905 | vst1.8 {q8-q9}, [r8]! | ||
1906 | |||
1907 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1908 | b .Lxts_enc_done | ||
1909 | .align 4 | ||
1910 | .Lxts_enc_3: | ||
1911 | vst1.64 {q11}, [r0,:128] @ next round tweak | ||
1912 | |||
1913 | veor q1, q1, q9 | ||
1914 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1915 | add r4, sp, #0x90 @ pass key schedule | ||
1916 | #else | ||
1917 | add r4, r10, #248 @ pass key schedule | ||
1918 | #endif | ||
1919 | veor q2, q2, q10 | ||
1920 | mov r5, r1 @ pass rounds | ||
1921 | mov r0, sp | ||
1922 | |||
1923 | bl _bsaes_encrypt8 | ||
1924 | |||
1925 | vld1.64 {q8-q9}, [r0,:128]! | ||
1926 | vld1.64 {q10}, [r0,:128]! | ||
1927 | veor q0, q0, q8 | ||
1928 | veor q1, q1, q9 | ||
1929 | veor q8, q4, q10 | ||
1930 | vst1.8 {q0-q1}, [r8]! | ||
1931 | vst1.8 {q8}, [r8]! | ||
1932 | |||
1933 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1934 | b .Lxts_enc_done | ||
1935 | .align 4 | ||
1936 | .Lxts_enc_2: | ||
1937 | vst1.64 {q10}, [r0,:128] @ next round tweak | ||
1938 | |||
1939 | veor q0, q0, q8 | ||
1940 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1941 | add r4, sp, #0x90 @ pass key schedule | ||
1942 | #else | ||
1943 | add r4, r10, #248 @ pass key schedule | ||
1944 | #endif | ||
1945 | veor q1, q1, q9 | ||
1946 | mov r5, r1 @ pass rounds | ||
1947 | mov r0, sp | ||
1948 | |||
1949 | bl _bsaes_encrypt8 | ||
1950 | |||
1951 | vld1.64 {q8-q9}, [r0,:128]! | ||
1952 | veor q0, q0, q8 | ||
1953 | veor q1, q1, q9 | ||
1954 | vst1.8 {q0-q1}, [r8]! | ||
1955 | |||
1956 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
1957 | b .Lxts_enc_done | ||
1958 | .align 4 | ||
1959 | .Lxts_enc_1: | ||
1960 | mov r0, sp | ||
1961 | veor q0, q8 | ||
1962 | mov r1, sp | ||
1963 | vst1.8 {q0}, [sp,:128] | ||
1964 | mov r2, r10 | ||
1965 | mov r4, r3 @ preserve fp | ||
1966 | |||
1967 | bl AES_encrypt | ||
1968 | |||
1969 | vld1.8 {q0}, [sp,:128] | ||
1970 | veor q0, q0, q8 | ||
1971 | vst1.8 {q0}, [r8]! | ||
1972 | mov r3, r4 | ||
1973 | |||
1974 | vmov q8, q9 @ next round tweak | ||
1975 | |||
1976 | .Lxts_enc_done: | ||
1977 | #ifndef XTS_CHAIN_TWEAK | ||
1978 | adds r9, #0x10 | ||
1979 | beq .Lxts_enc_ret | ||
1980 | sub r6, r8, #0x10 | ||
1981 | |||
1982 | .Lxts_enc_steal: | ||
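| @ ciphertext stealing: swap the remaining plaintext bytes with the | ||
| @ tail of the previous ciphertext block, then re-encrypt that block | ||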
1983 | ldrb r0, [r7], #1 | ||
1984 | ldrb r1, [r8, #-0x10] | ||
1985 | strb r0, [r8, #-0x10] | ||
1986 | strb r1, [r8], #1 | ||
1987 | |||
1988 | subs r9, #1 | ||
1989 | bhi .Lxts_enc_steal | ||
1990 | |||
1991 | vld1.8 {q0}, [r6] | ||
1992 | mov r0, sp | ||
1993 | veor q0, q0, q8 | ||
1994 | mov r1, sp | ||
1995 | vst1.8 {q0}, [sp,:128] | ||
1996 | mov r2, r10 | ||
1997 | mov r4, r3 @ preserve fp | ||
1998 | |||
1999 | bl AES_encrypt | ||
2000 | |||
2001 | vld1.8 {q0}, [sp,:128] | ||
2002 | veor q0, q0, q8 | ||
2003 | vst1.8 {q0}, [r6] | ||
2004 | mov r3, r4 | ||
2005 | #endif | ||
2006 | |||
2007 | .Lxts_enc_ret: | ||
2008 | bic r0, r3, #0xf | ||
2009 | vmov.i32 q0, #0 | ||
2010 | vmov.i32 q1, #0 | ||
2011 | #ifdef XTS_CHAIN_TWEAK | ||
2012 | ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak | ||
2013 | #endif | ||
2014 | .Lxts_enc_bzero: @ wipe key schedule [if any] | ||
2015 | vstmia sp!, {q0-q1} | ||
2016 | cmp sp, r0 | ||
2017 | bne .Lxts_enc_bzero | ||
2018 | |||
2019 | mov sp, r3 | ||
2020 | #ifdef XTS_CHAIN_TWEAK | ||
2021 | vst1.8 {q8}, [r1] | ||
2022 | #endif | ||
2023 | VFP_ABI_POP | ||
2024 | ldmia sp!, {r4-r10, pc} @ return | ||
2025 | |||
2026 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
2027 | |||
2028 | .globl bsaes_xts_decrypt | ||
2029 | .type bsaes_xts_decrypt,%function | ||
2030 | .align 4 | ||
2031 | bsaes_xts_decrypt: | ||
2032 | mov ip, sp | ||
2033 | stmdb sp!, {r4-r10, lr} @ 0x20 | ||
2034 | VFP_ABI_PUSH | ||
2035 | mov r6, sp @ future r3 | ||
2036 | |||
2037 | mov r7, r0 | ||
2038 | mov r8, r1 | ||
2039 | mov r9, r2 | ||
2040 | mov r10, r3 | ||
2041 | |||
2042 | sub r0, sp, #0x10 @ 0x10 | ||
2043 | bic r0, #0xf @ align at 16 bytes | ||
2044 | mov sp, r0 | ||
2045 | |||
2046 | #ifdef XTS_CHAIN_TWEAK | ||
2047 | ldr r0, [ip] @ pointer to input tweak | ||
2048 | #else | ||
2049 | @ generate initial tweak | ||
2050 | ldr r0, [ip, #4] @ iv[] | ||
2051 | mov r1, sp | ||
2052 | ldr r2, [ip, #0] @ key2 | ||
2053 | bl AES_encrypt | ||
2054 | mov r0, sp @ pointer to initial tweak | ||
2055 | #endif | ||
2056 | |||
2057 | ldr r1, [r10, #240] @ get # of rounds | ||
2058 | mov r3, r6 | ||
2059 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2060 | @ allocate the key schedule on the stack | ||
2061 | sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key | ||
2062 | @ add r12, #96 @ size of bit-sliced key schedule | ||
2063 | sub r12, #48 @ place for tweak[9] | ||
2064 | |||
2065 | @ populate the key schedule | ||
2066 | mov r4, r10 @ pass key | ||
2067 | mov r5, r1 @ pass # of rounds | ||
2068 | mov sp, r12 | ||
2069 | add r12, #0x90 @ pass key schedule | ||
2070 | bl _bsaes_key_convert | ||
2071 | add r4, sp, #0x90 | ||
2072 | vldmia r4, {q6} | ||
2073 | vstmia r12, {q15} @ save last round key | ||
2074 | veor q7, q7, q6 @ fix up round 0 key | ||
2075 | vstmia r4, {q7} | ||
2076 | #else | ||
2077 | ldr r12, [r10, #244] | ||
2078 | eors r12, #1 | ||
2079 | beq 0f | ||
2080 | |||
2081 | str r12, [r10, #244] | ||
2082 | mov r4, r10 @ pass key | ||
2083 | mov r5, r1 @ pass # of rounds | ||
2084 | add r12, r10, #248 @ pass key schedule | ||
2085 | bl _bsaes_key_convert | ||
2086 | add r4, r10, #248 | ||
2087 | vldmia r4, {q6} | ||
2088 | vstmia r12, {q15} @ save last round key | ||
2089 | veor q7, q7, q6 @ fix up round 0 key | ||
2090 | vstmia r4, {q7} | ||
2091 | |||
2092 | .align 2 | ||
2093 | 0: sub sp, #0x90 @ place for tweak[9] | ||
2094 | #endif | ||
2095 | vld1.8 {q8}, [r0] @ initial tweak | ||
2096 | adr r2, .Lxts_magic | ||
2097 | |||
2098 | tst r9, #0xf @ if not multiple of 16 | ||
2099 | it ne @ Thumb2 thing, sanity check in ARM | ||
2100 | subne r9, #0x10 @ subtract another 16 bytes | ||
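| @ (hold back one full block so the tail can be handled by | ||
| @ ciphertext stealing) | ||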
2101 | subs r9, #0x80 | ||
2102 | |||
2103 | blo .Lxts_dec_short | ||
2104 | b .Lxts_dec_loop | ||
2105 | |||
2106 | .align 4 | ||
2107 | .Lxts_dec_loop: | ||
2108 | vldmia r2, {q5} @ load XTS magic | ||
2109 | vshr.s64 q6, q8, #63 | ||
2110 | mov r0, sp | ||
2111 | vand q6, q6, q5 | ||
2112 | vadd.u64 q9, q8, q8 | ||
2113 | vst1.64 {q8}, [r0,:128]! | ||
2114 | vswp d13,d12 | ||
2115 | vshr.s64 q7, q9, #63 | ||
2116 | veor q9, q9, q6 | ||
2117 | vand q7, q7, q5 | ||
2118 | vadd.u64 q10, q9, q9 | ||
2119 | vst1.64 {q9}, [r0,:128]! | ||
2120 | vswp d15,d14 | ||
2121 | vshr.s64 q6, q10, #63 | ||
2122 | veor q10, q10, q7 | ||
2123 | vand q6, q6, q5 | ||
2124 | vld1.8 {q0}, [r7]! | ||
2125 | vadd.u64 q11, q10, q10 | ||
2126 | vst1.64 {q10}, [r0,:128]! | ||
2127 | vswp d13,d12 | ||
2128 | vshr.s64 q7, q11, #63 | ||
2129 | veor q11, q11, q6 | ||
2130 | vand q7, q7, q5 | ||
2131 | vld1.8 {q1}, [r7]! | ||
2132 | veor q0, q0, q8 | ||
2133 | vadd.u64 q12, q11, q11 | ||
2134 | vst1.64 {q11}, [r0,:128]! | ||
2135 | vswp d15,d14 | ||
2136 | vshr.s64 q6, q12, #63 | ||
2137 | veor q12, q12, q7 | ||
2138 | vand q6, q6, q5 | ||
2139 | vld1.8 {q2}, [r7]! | ||
2140 | veor q1, q1, q9 | ||
2141 | vadd.u64 q13, q12, q12 | ||
2142 | vst1.64 {q12}, [r0,:128]! | ||
2143 | vswp d13,d12 | ||
2144 | vshr.s64 q7, q13, #63 | ||
2145 | veor q13, q13, q6 | ||
2146 | vand q7, q7, q5 | ||
2147 | vld1.8 {q3}, [r7]! | ||
2148 | veor q2, q2, q10 | ||
2149 | vadd.u64 q14, q13, q13 | ||
2150 | vst1.64 {q13}, [r0,:128]! | ||
2151 | vswp d15,d14 | ||
2152 | vshr.s64 q6, q14, #63 | ||
2153 | veor q14, q14, q7 | ||
2154 | vand q6, q6, q5 | ||
2155 | vld1.8 {q4}, [r7]! | ||
2156 | veor q3, q3, q11 | ||
2157 | vadd.u64 q15, q14, q14 | ||
2158 | vst1.64 {q14}, [r0,:128]! | ||
2159 | vswp d13,d12 | ||
2160 | vshr.s64 q7, q15, #63 | ||
2161 | veor q15, q15, q6 | ||
2162 | vand q7, q7, q5 | ||
2163 | vld1.8 {q5}, [r7]! | ||
2164 | veor q4, q4, q12 | ||
2165 | vadd.u64 q8, q15, q15 | ||
2166 | vst1.64 {q15}, [r0,:128]! | ||
2167 | vswp d15,d14 | ||
2168 | veor q8, q8, q7 | ||
2169 | vst1.64 {q8}, [r0,:128] @ next round tweak | ||
2170 | |||
2171 | vld1.8 {q6-q7}, [r7]! | ||
2172 | veor q5, q5, q13 | ||
2173 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2174 | add r4, sp, #0x90 @ pass key schedule | ||
2175 | #else | ||
2176 | add r4, r10, #248 @ pass key schedule | ||
2177 | #endif | ||
2178 | veor q6, q6, q14 | ||
2179 | mov r5, r1 @ pass rounds | ||
2180 | veor q7, q7, q15 | ||
2181 | mov r0, sp | ||
2182 | |||
2183 | bl _bsaes_decrypt8 | ||
2184 | |||
2185 | vld1.64 {q8-q9}, [r0,:128]! | ||
2186 | vld1.64 {q10-q11}, [r0,:128]! | ||
2187 | veor q0, q0, q8 | ||
2188 | vld1.64 {q12-q13}, [r0,:128]! | ||
2189 | veor q1, q1, q9 | ||
2190 | veor q8, q6, q10 | ||
2191 | vst1.8 {q0-q1}, [r8]! | ||
2192 | veor q9, q4, q11 | ||
2193 | vld1.64 {q14-q15}, [r0,:128]! | ||
2194 | veor q10, q2, q12 | ||
2195 | vst1.8 {q8-q9}, [r8]! | ||
2196 | veor q11, q7, q13 | ||
2197 | veor q12, q3, q14 | ||
2198 | vst1.8 {q10-q11}, [r8]! | ||
2199 | veor q13, q5, q15 | ||
2200 | vst1.8 {q12-q13}, [r8]! | ||
2201 | |||
2202 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2203 | |||
2204 | subs r9, #0x80 | ||
2205 | bpl .Lxts_dec_loop | ||
2206 | |||
2207 | .Lxts_dec_short: | ||
2208 | adds r9, #0x70 | ||
2209 | bmi .Lxts_dec_done | ||
2210 | |||
2211 | vldmia r2, {q5} @ load XTS magic | ||
2212 | vshr.s64 q7, q8, #63 | ||
2213 | mov r0, sp | ||
2214 | vand q7, q7, q5 | ||
2215 | vadd.u64 q9, q8, q8 | ||
2216 | vst1.64 {q8}, [r0,:128]! | ||
2217 | vswp d15,d14 | ||
2218 | vshr.s64 q6, q9, #63 | ||
2219 | veor q9, q9, q7 | ||
2220 | vand q6, q6, q5 | ||
2221 | vadd.u64 q10, q9, q9 | ||
2222 | vst1.64 {q9}, [r0,:128]! | ||
2223 | vswp d13,d12 | ||
2224 | vshr.s64 q7, q10, #63 | ||
2225 | veor q10, q10, q6 | ||
2226 | vand q7, q7, q5 | ||
2227 | vld1.8 {q0}, [r7]! | ||
2228 | subs r9, #0x10 | ||
2229 | bmi .Lxts_dec_1 | ||
2230 | vadd.u64 q11, q10, q10 | ||
2231 | vst1.64 {q10}, [r0,:128]! | ||
2232 | vswp d15,d14 | ||
2233 | vshr.s64 q6, q11, #63 | ||
2234 | veor q11, q11, q7 | ||
2235 | vand q6, q6, q5 | ||
2236 | vld1.8 {q1}, [r7]! | ||
2237 | subs r9, #0x10 | ||
2238 | bmi .Lxts_dec_2 | ||
2239 | veor q0, q0, q8 | ||
2240 | vadd.u64 q12, q11, q11 | ||
2241 | vst1.64 {q11}, [r0,:128]! | ||
2242 | vswp d13,d12 | ||
2243 | vshr.s64 q7, q12, #63 | ||
2244 | veor q12, q12, q6 | ||
2245 | vand q7, q7, q5 | ||
2246 | vld1.8 {q2}, [r7]! | ||
2247 | subs r9, #0x10 | ||
2248 | bmi .Lxts_dec_3 | ||
2249 | veor q1, q1, q9 | ||
2250 | vadd.u64 q13, q12, q12 | ||
2251 | vst1.64 {q12}, [r0,:128]! | ||
2252 | vswp d15,d14 | ||
2253 | vshr.s64 q6, q13, #63 | ||
2254 | veor q13, q13, q7 | ||
2255 | vand q6, q6, q5 | ||
2256 | vld1.8 {q3}, [r7]! | ||
2257 | subs r9, #0x10 | ||
2258 | bmi .Lxts_dec_4 | ||
2259 | veor q2, q2, q10 | ||
2260 | vadd.u64 q14, q13, q13 | ||
2261 | vst1.64 {q13}, [r0,:128]! | ||
2262 | vswp d13,d12 | ||
2263 | vshr.s64 q7, q14, #63 | ||
2264 | veor q14, q14, q6 | ||
2265 | vand q7, q7, q5 | ||
2266 | vld1.8 {q4}, [r7]! | ||
2267 | subs r9, #0x10 | ||
2268 | bmi .Lxts_dec_5 | ||
2269 | veor q3, q3, q11 | ||
2270 | vadd.u64 q15, q14, q14 | ||
2271 | vst1.64 {q14}, [r0,:128]! | ||
2272 | vswp d15,d14 | ||
2273 | vshr.s64 q6, q15, #63 | ||
2274 | veor q15, q15, q7 | ||
2275 | vand q6, q6, q5 | ||
2276 | vld1.8 {q5}, [r7]! | ||
2277 | subs r9, #0x10 | ||
2278 | bmi .Lxts_dec_6 | ||
2279 | veor q4, q4, q12 | ||
2280 | sub r9, #0x10 | ||
2281 | vst1.64 {q15}, [r0,:128] @ next round tweak | ||
2282 | |||
2283 | vld1.8 {q6}, [r7]! | ||
2284 | veor q5, q5, q13 | ||
2285 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2286 | add r4, sp, #0x90 @ pass key schedule | ||
2287 | #else | ||
2288 | add r4, r10, #248 @ pass key schedule | ||
2289 | #endif | ||
2290 | veor q6, q6, q14 | ||
2291 | mov r5, r1 @ pass rounds | ||
2292 | mov r0, sp | ||
2293 | |||
2294 | bl _bsaes_decrypt8 | ||
2295 | |||
2296 | vld1.64 {q8-q9}, [r0,:128]! | ||
2297 | vld1.64 {q10-q11}, [r0,:128]! | ||
2298 | veor q0, q0, q8 | ||
2299 | vld1.64 {q12-q13}, [r0,:128]! | ||
2300 | veor q1, q1, q9 | ||
2301 | veor q8, q6, q10 | ||
2302 | vst1.8 {q0-q1}, [r8]! | ||
2303 | veor q9, q4, q11 | ||
2304 | vld1.64 {q14}, [r0,:128]! | ||
2305 | veor q10, q2, q12 | ||
2306 | vst1.8 {q8-q9}, [r8]! | ||
2307 | veor q11, q7, q13 | ||
2308 | veor q12, q3, q14 | ||
2309 | vst1.8 {q10-q11}, [r8]! | ||
2310 | vst1.8 {q12}, [r8]! | ||
2311 | |||
2312 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2313 | b .Lxts_dec_done | ||
2314 | .align 4 | ||
2315 | .Lxts_dec_6: | ||
2316 | vst1.64 {q14}, [r0,:128] @ next round tweak | ||
2317 | |||
2318 | veor q4, q4, q12 | ||
2319 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2320 | add r4, sp, #0x90 @ pass key schedule | ||
2321 | #else | ||
2322 | add r4, r10, #248 @ pass key schedule | ||
2323 | #endif | ||
2324 | veor q5, q5, q13 | ||
2325 | mov r5, r1 @ pass rounds | ||
2326 | mov r0, sp | ||
2327 | |||
2328 | bl _bsaes_decrypt8 | ||
2329 | |||
2330 | vld1.64 {q8-q9}, [r0,:128]! | ||
2331 | vld1.64 {q10-q11}, [r0,:128]! | ||
2332 | veor q0, q0, q8 | ||
2333 | vld1.64 {q12-q13}, [r0,:128]! | ||
2334 | veor q1, q1, q9 | ||
2335 | veor q8, q6, q10 | ||
2336 | vst1.8 {q0-q1}, [r8]! | ||
2337 | veor q9, q4, q11 | ||
2338 | veor q10, q2, q12 | ||
2339 | vst1.8 {q8-q9}, [r8]! | ||
2340 | veor q11, q7, q13 | ||
2341 | vst1.8 {q10-q11}, [r8]! | ||
2342 | |||
2343 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2344 | b .Lxts_dec_done | ||
2345 | .align 4 | ||
2346 | .Lxts_dec_5: | ||
2347 | vst1.64 {q13}, [r0,:128] @ next round tweak | ||
2348 | |||
2349 | veor q3, q3, q11 | ||
2350 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2351 | add r4, sp, #0x90 @ pass key schedule | ||
2352 | #else | ||
2353 | add r4, r10, #248 @ pass key schedule | ||
2354 | #endif | ||
2355 | veor q4, q4, q12 | ||
2356 | mov r5, r1 @ pass rounds | ||
2357 | mov r0, sp | ||
2358 | |||
2359 | bl _bsaes_decrypt8 | ||
2360 | |||
2361 | vld1.64 {q8-q9}, [r0,:128]! | ||
2362 | vld1.64 {q10-q11}, [r0,:128]! | ||
2363 | veor q0, q0, q8 | ||
2364 | vld1.64 {q12}, [r0,:128]! | ||
2365 | veor q1, q1, q9 | ||
2366 | veor q8, q6, q10 | ||
2367 | vst1.8 {q0-q1}, [r8]! | ||
2368 | veor q9, q4, q11 | ||
2369 | veor q10, q2, q12 | ||
2370 | vst1.8 {q8-q9}, [r8]! | ||
2371 | vst1.8 {q10}, [r8]! | ||
2372 | |||
2373 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2374 | b .Lxts_dec_done | ||
2375 | .align 4 | ||
2376 | .Lxts_dec_4: | ||
2377 | vst1.64 {q12}, [r0,:128] @ next round tweak | ||
2378 | |||
2379 | veor q2, q2, q10 | ||
2380 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2381 | add r4, sp, #0x90 @ pass key schedule | ||
2382 | #else | ||
2383 | add r4, r10, #248 @ pass key schedule | ||
2384 | #endif | ||
2385 | veor q3, q3, q11 | ||
2386 | mov r5, r1 @ pass rounds | ||
2387 | mov r0, sp | ||
2388 | |||
2389 | bl _bsaes_decrypt8 | ||
2390 | |||
2391 | vld1.64 {q8-q9}, [r0,:128]! | ||
2392 | vld1.64 {q10-q11}, [r0,:128]! | ||
2393 | veor q0, q0, q8 | ||
2394 | veor q1, q1, q9 | ||
2395 | veor q8, q6, q10 | ||
2396 | vst1.8 {q0-q1}, [r8]! | ||
2397 | veor q9, q4, q11 | ||
2398 | vst1.8 {q8-q9}, [r8]! | ||
2399 | |||
2400 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2401 | b .Lxts_dec_done | ||
2402 | .align 4 | ||
2403 | .Lxts_dec_3: | ||
2404 | vst1.64 {q11}, [r0,:128] @ next round tweak | ||
2405 | |||
2406 | veor q1, q1, q9 | ||
2407 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2408 | add r4, sp, #0x90 @ pass key schedule | ||
2409 | #else | ||
2410 | add r4, r10, #248 @ pass key schedule | ||
2411 | #endif | ||
2412 | veor q2, q2, q10 | ||
2413 | mov r5, r1 @ pass rounds | ||
2414 | mov r0, sp | ||
2415 | |||
2416 | bl _bsaes_decrypt8 | ||
2417 | |||
2418 | vld1.64 {q8-q9}, [r0,:128]! | ||
2419 | vld1.64 {q10}, [r0,:128]! | ||
2420 | veor q0, q0, q8 | ||
2421 | veor q1, q1, q9 | ||
2422 | veor q8, q6, q10 | ||
2423 | vst1.8 {q0-q1}, [r8]! | ||
2424 | vst1.8 {q8}, [r8]! | ||
2425 | |||
2426 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2427 | b .Lxts_dec_done | ||
2428 | .align 4 | ||
2429 | .Lxts_dec_2: | ||
2430 | vst1.64 {q10}, [r0,:128] @ next round tweak | ||
2431 | |||
2432 | veor q0, q0, q8 | ||
2433 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2434 | add r4, sp, #0x90 @ pass key schedule | ||
2435 | #else | ||
2436 | add r4, r10, #248 @ pass key schedule | ||
2437 | #endif | ||
2438 | veor q1, q1, q9 | ||
2439 | mov r5, r1 @ pass rounds | ||
2440 | mov r0, sp | ||
2441 | |||
2442 | bl _bsaes_decrypt8 | ||
2443 | |||
2444 | vld1.64 {q8-q9}, [r0,:128]! | ||
2445 | veor q0, q0, q8 | ||
2446 | veor q1, q1, q9 | ||
2447 | vst1.8 {q0-q1}, [r8]! | ||
2448 | |||
2449 | vld1.64 {q8}, [r0,:128] @ next round tweak | ||
2450 | b .Lxts_dec_done | ||
2451 | .align 4 | ||
2452 | .Lxts_dec_1: | ||
2453 | mov r0, sp | ||
2454 | veor q0, q8 | ||
2455 | mov r1, sp | ||
2456 | vst1.8 {q0}, [sp,:128] | ||
2457 | mov r2, r10 | ||
2458 | mov r4, r3 @ preserve fp | ||
2459 | mov r5, r2 @ preserve magic | ||
2460 | |||
2461 | bl AES_decrypt | ||
2462 | |||
2463 | vld1.8 {q0}, [sp,:128] | ||
2464 | veor q0, q0, q8 | ||
2465 | vst1.8 {q0}, [r8]! | ||
2466 | mov r3, r4 | ||
2467 | mov r2, r5 | ||
2468 | |||
2469 | vmov q8, q9 @ next round tweak | ||
2470 | |||
2471 | .Lxts_dec_done: | ||
2472 | #ifndef XTS_CHAIN_TWEAK | ||
2473 | adds r9, #0x10 | ||
2474 | beq .Lxts_dec_ret | ||
2475 | |||
2476 | @ calculate one round of extra tweak for the stolen ciphertext | ||
2477 | vldmia r2, {q5} | ||
2478 | vshr.s64 q6, q8, #63 | ||
2479 | vand q6, q6, q5 | ||
2480 | vadd.u64 q9, q8, q8 | ||
2481 | vswp d13,d12 | ||
2482 | veor q9, q9, q6 | ||
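@ The sequence above computes one doubling of the XTS tweak, i.e.
@ multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1
@ (a hedged reading: it assumes .Lxts_magic, defined earlier in this
@ file, holds the quadword pair {1, 0x87}, so the vswp routes each
@ 64-bit lane's carry-out to the opposite lane). In C terms:
@	c_lo = t[0] >> 63;  c_hi = t[1] >> 63;
@	t[0] = (t[0] << 1) ^ (c_hi * 0x87);	@ wrap: reduce by x^7+x^2+x+1
@	t[1] = (t[1] << 1) ^ c_lo;		@ carry low lane into high lane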
2483 | |||
2484 | @ perform the final decryption with the last tweak value | ||
2485 | vld1.8 {q0}, [r7]! | ||
2486 | mov r0, sp | ||
2487 | veor q0, q0, q9 | ||
2488 | mov r1, sp | ||
2489 | vst1.8 {q0}, [sp,:128] | ||
2490 | mov r2, r10 | ||
2491 | mov r4, r3 @ preserve fp | ||
2492 | |||
2493 | bl AES_decrypt | ||
2494 | |||
2495 | vld1.8 {q0}, [sp,:128] | ||
2496 | veor q0, q0, q9 | ||
2497 | vst1.8 {q0}, [r8] | ||
2498 | |||
2499 | mov r6, r8 | ||
2500 | .Lxts_dec_steal: | ||
2501 | ldrb r1, [r8] | ||
2502 | ldrb r0, [r7], #1 | ||
2503 | strb r1, [r8, #0x10] | ||
2504 | strb r0, [r8], #1 | ||
2505 | |||
2506 | subs r9, #1 | ||
2507 | bhi .Lxts_dec_steal | ||
2508 | |||
2509 | vld1.8 {q0}, [r6] | ||
2510 | mov r0, sp | ||
2511 | veor q0, q8 | ||
2512 | mov r1, sp | ||
2513 | vst1.8 {q0}, [sp,:128] | ||
2514 | mov r2, r10 | ||
2515 | |||
2516 | bl AES_decrypt | ||
2517 | |||
2518 | vld1.8 {q0}, [sp,:128] | ||
2519 | veor q0, q0, q8 | ||
2520 | vst1.8 {q0}, [r6] | ||
2521 | mov r3, r4 | ||
2522 | #endif | ||
2523 | |||
2524 | .Lxts_dec_ret: | ||
2525 | bic r0, r3, #0xf | ||
2526 | vmov.i32 q0, #0 | ||
2527 | vmov.i32 q1, #0 | ||
2528 | #ifdef XTS_CHAIN_TWEAK | ||
2529 | ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak | ||
2530 | #endif | ||
2531 | .Lxts_dec_bzero: @ wipe key schedule [if any] | ||
2532 | vstmia sp!, {q0-q1} | ||
2533 | cmp sp, r0 | ||
2534 | bne .Lxts_dec_bzero | ||
2535 | |||
2536 | mov sp, r3 | ||
2537 | #ifdef XTS_CHAIN_TWEAK | ||
2538 | vst1.8 {q8}, [r1] | ||
2539 | #endif | ||
2540 | VFP_ABI_POP | ||
2541 | ldmia sp!, {r4-r10, pc} @ return | ||
2542 | |||
2543 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
2544 | #endif | ||
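Before the glue code, one note on the .Lxts_dec_steal loop above: it implements XTS ciphertext stealing for messages that are not a multiple of 16 bytes. A minimal C sketch of the byte shuffle (a sketch only; out stands for r8/r6, in for r7, tail for the residual byte count in r9):

    for (unsigned int i = 0; i < tail; i++) {
        out[16 + i] = out[i];   /* decrypted tail becomes the final partial block */
        out[i]      = in[i];    /* the stolen ciphertext bytes are spliced back in */
    }
    /* out[0..15] is a complete ciphertext block again; the code then decrypts
     * it once more with the previously saved tweak (q8) to recover the last
     * full plaintext block. */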
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c new file mode 100644 index 000000000000..4522366da759 --- /dev/null +++ b/arch/arm/crypto/aesbs-glue.c | |||
@@ -0,0 +1,434 @@ | |||
1 | /* | ||
2 | * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <crypto/aes.h> | ||
13 | #include <crypto/ablk_helper.h> | ||
14 | #include <crypto/algapi.h> | ||
15 | #include <linux/module.h> | ||
16 | |||
17 | #include "aes_glue.h" | ||
18 | |||
19 | #define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE) | ||
20 | |||
21 | struct BS_KEY { | ||
22 | struct AES_KEY rk; | ||
23 | int converted; | ||
24 | u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE]; | ||
25 | } __aligned(8); | ||
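For reference, the size works out as follows: with AES_MAXNR = 14 (the AES-256 round count from <crypto/aes.h>), BIT_SLICED_KEY_MAXSIZE is 128 * 13 + 2 * 16 = 1696 bytes: 128 bytes for each of the 13 bit-sliced inner round keys, plus one 16-byte block each for the round-0 and final round keys. As a hedged compile-time check, one could add inside any function here:

    BUILD_BUG_ON(BIT_SLICED_KEY_MAXSIZE != 128 * 13 + 2 * 16);  /* 1696 bytes */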
26 | |||
27 | asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in); | ||
28 | asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in); | ||
29 | |||
30 | asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes, | ||
31 | struct BS_KEY *key, u8 iv[]); | ||
32 | |||
33 | asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks, | ||
34 | struct BS_KEY *key, u8 const iv[]); | ||
35 | |||
36 | asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes, | ||
37 | struct BS_KEY *key, u8 tweak[]); | ||
38 | |||
39 | asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes, | ||
40 | struct BS_KEY *key, u8 tweak[]); | ||
41 | |||
42 | struct aesbs_cbc_ctx { | ||
43 | struct AES_KEY enc; | ||
44 | struct BS_KEY dec; | ||
45 | }; | ||
46 | |||
47 | struct aesbs_ctr_ctx { | ||
48 | struct BS_KEY enc; | ||
49 | }; | ||
50 | |||
51 | struct aesbs_xts_ctx { | ||
52 | struct BS_KEY enc; | ||
53 | struct BS_KEY dec; | ||
54 | struct AES_KEY twkey; | ||
55 | }; | ||
56 | |||
57 | static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
58 | unsigned int key_len) | ||
59 | { | ||
60 | struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm); | ||
61 | int bits = key_len * 8; | ||
62 | |||
63 | if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) { | ||
64 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
65 | return -EINVAL; | ||
66 | } | ||
67 | ctx->dec.rk = ctx->enc; | ||
68 | private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk); | ||
69 | ctx->dec.converted = 0; | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
74 | unsigned int key_len) | ||
75 | { | ||
76 | struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm); | ||
77 | int bits = key_len * 8; | ||
78 | |||
79 | if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) { | ||
80 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
81 | return -EINVAL; | ||
82 | } | ||
83 | ctx->enc.converted = 0; | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
88 | unsigned int key_len) | ||
89 | { | ||
90 | struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm); | ||
91 | int bits = key_len * 4; | ||
92 | |||
93 | if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) { | ||
94 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
95 | return -EINVAL; | ||
96 | } | ||
97 | ctx->dec.rk = ctx->enc.rk; | ||
98 | private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk); | ||
99 | private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey); | ||
100 | ctx->enc.converted = ctx->dec.converted = 0; | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static int aesbs_cbc_encrypt(struct blkcipher_desc *desc, | ||
105 | struct scatterlist *dst, | ||
106 | struct scatterlist *src, unsigned int nbytes) | ||
107 | { | ||
108 | struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
109 | struct blkcipher_walk walk; | ||
110 | int err; | ||
111 | |||
112 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
113 | err = blkcipher_walk_virt(desc, &walk); | ||
114 | |||
115 | while (walk.nbytes) { | ||
116 | u32 blocks = walk.nbytes / AES_BLOCK_SIZE; | ||
117 | u8 *src = walk.src.virt.addr; | ||
118 | |||
119 | if (walk.dst.virt.addr == walk.src.virt.addr) { | ||
120 | u8 *iv = walk.iv; | ||
121 | |||
122 | do { | ||
123 | crypto_xor(src, iv, AES_BLOCK_SIZE); | ||
124 | AES_encrypt(src, src, &ctx->enc); | ||
125 | iv = src; | ||
126 | src += AES_BLOCK_SIZE; | ||
127 | } while (--blocks); | ||
128 | memcpy(walk.iv, iv, AES_BLOCK_SIZE); | ||
129 | } else { | ||
130 | u8 *dst = walk.dst.virt.addr; | ||
131 | |||
132 | do { | ||
133 | crypto_xor(walk.iv, src, AES_BLOCK_SIZE); | ||
134 | AES_encrypt(walk.iv, dst, &ctx->enc); | ||
135 | memcpy(walk.iv, dst, AES_BLOCK_SIZE); | ||
136 | src += AES_BLOCK_SIZE; | ||
137 | dst += AES_BLOCK_SIZE; | ||
138 | } while (--blocks); | ||
139 | } | ||
140 | err = blkcipher_walk_done(desc, &walk, 0); | ||
141 | } | ||
142 | return err; | ||
143 | } | ||
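Note that this encrypt path never calls into the NEON code (there is no kernel_neon_begin() here): CBC encryption is inherently serial, so the eight-way bit-sliced kernel would have nothing to parallelize, and scalar AES_encrypt is used throughout. The recurrence that forces this:

    /* C[0] = E_K(P[0] ^ IV)
     * C[i] = E_K(P[i] ^ C[i-1])   for i > 0
     * -- every block needs the previous ciphertext before it can start */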
144 | |||
145 | static int aesbs_cbc_decrypt(struct blkcipher_desc *desc, | ||
146 | struct scatterlist *dst, | ||
147 | struct scatterlist *src, unsigned int nbytes) | ||
148 | { | ||
149 | struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
150 | struct blkcipher_walk walk; | ||
151 | int err; | ||
152 | |||
153 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
154 | err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); | ||
155 | |||
156 | while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) { | ||
157 | kernel_neon_begin(); | ||
158 | bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr, | ||
159 | walk.nbytes, &ctx->dec, walk.iv); | ||
160 | kernel_neon_end(); | ||
161 | err = blkcipher_walk_done(desc, &walk, 0); | ||
162 | } | ||
163 | while (walk.nbytes) { | ||
164 | u32 blocks = walk.nbytes / AES_BLOCK_SIZE; | ||
165 | u8 *dst = walk.dst.virt.addr; | ||
166 | u8 *src = walk.src.virt.addr; | ||
167 | u8 bk[2][AES_BLOCK_SIZE]; | ||
168 | u8 *iv = walk.iv; | ||
169 | |||
170 | do { | ||
171 | if (walk.dst.virt.addr == walk.src.virt.addr) | ||
172 | memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE); | ||
173 | |||
174 | AES_decrypt(src, dst, &ctx->dec.rk); | ||
175 | crypto_xor(dst, iv, AES_BLOCK_SIZE); | ||
176 | |||
177 | if (walk.dst.virt.addr == walk.src.virt.addr) | ||
178 | iv = bk[blocks & 1]; | ||
179 | else | ||
180 | iv = src; | ||
181 | |||
182 | dst += AES_BLOCK_SIZE; | ||
183 | src += AES_BLOCK_SIZE; | ||
184 | } while (--blocks); | ||
185 | err = blkcipher_walk_done(desc, &walk, 0); | ||
186 | } | ||
187 | return err; | ||
188 | } | ||
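A word on the two-slot bk[] buffer in the scalar tail above: when decrypting in place, writing a plaintext block destroys the ciphertext block, yet that ciphertext is still needed as the XOR input for the following block. The slot index alternates with the block parity, so the copy that iv still points at from the previous iteration is never the one being overwritten:

    /* iteration i:   save C[i] in bk[i & 1]; P[i] = D_K(C[i]) ^ iv; iv = bk[i & 1]
     * iteration i+1: saves C[i+1] in bk[(i+1) & 1] -- the other slot --
     *                leaving the block that iv references intact */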
189 | |||
190 | static void inc_be128_ctr(__be32 ctr[], u32 addend) | ||
191 | { | ||
192 | int i; | ||
193 | |||
194 | for (i = 3; i >= 0; i--, addend = 1) { | ||
195 | u32 n = be32_to_cpu(ctr[i]) + addend; | ||
196 | |||
197 | ctr[i] = cpu_to_be32(n); | ||
198 | if (n >= addend) | ||
199 | break; | ||
200 | } | ||
201 | } | ||
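A worked example of the carry handling, with hypothetical values chosen so that the low word is about to wrap:

    __be32 ctr[4] = { cpu_to_be32(0), cpu_to_be32(0),
                      cpu_to_be32(0), cpu_to_be32(0xffffffff) };

    inc_be128_ctr(ctr, 2);
    /* ctr[3] wraps to 1 (n = 1 < addend = 2, so the loop continues with
     * addend = 1) and a carry lands in ctr[2]: the result is { 0, 0, 1, 1 },
     * i.e. the 128-bit big-endian counter grew by exactly 2 */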
202 | |||
203 | static int aesbs_ctr_encrypt(struct blkcipher_desc *desc, | ||
204 | struct scatterlist *dst, struct scatterlist *src, | ||
205 | unsigned int nbytes) | ||
206 | { | ||
207 | struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
208 | struct blkcipher_walk walk; | ||
209 | u32 blocks; | ||
210 | int err; | ||
211 | |||
212 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
213 | err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); | ||
214 | |||
215 | while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) { | ||
216 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
217 | __be32 *ctr = (__be32 *)walk.iv; | ||
218 | u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]); | ||
219 | |||
220 | /* avoid 32 bit counter overflow in the NEON code */ | ||
221 | if (unlikely(headroom < blocks)) { | ||
222 | blocks = headroom + 1; | ||
223 | tail = walk.nbytes - blocks * AES_BLOCK_SIZE; | ||
224 | } | ||
225 | kernel_neon_begin(); | ||
226 | bsaes_ctr32_encrypt_blocks(walk.src.virt.addr, | ||
227 | walk.dst.virt.addr, blocks, | ||
228 | &ctx->enc, walk.iv); | ||
229 | kernel_neon_end(); | ||
230 | inc_be128_ctr(ctr, blocks); | ||
231 | |||
232 | nbytes -= blocks * AES_BLOCK_SIZE; | ||
233 | if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE) | ||
234 | break; | ||
235 | |||
236 | err = blkcipher_walk_done(desc, &walk, tail); | ||
237 | } | ||
238 | if (walk.nbytes) { | ||
239 | u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; | ||
240 | u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; | ||
241 | u8 ks[AES_BLOCK_SIZE]; | ||
242 | |||
243 | AES_encrypt(walk.iv, ks, &ctx->enc.rk); | ||
244 | if (tdst != tsrc) | ||
245 | memcpy(tdst, tsrc, nbytes); | ||
246 | crypto_xor(tdst, ks, nbytes); | ||
247 | err = blkcipher_walk_done(desc, &walk, 0); | ||
248 | } | ||
249 | return err; | ||
250 | } | ||
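The headroom clamp deserves a worked example, since the NEON routine only ever increments the low 32 bits of the counter. Suppose the IV's low word is 0xfffffffe while 4 blocks are pending:

    /* headroom = UINT_MAX - 0xfffffffe = 1, so blocks is clamped to
     * headroom + 1 = 2 (counter values ...fffe and ...ffff); the NEON call
     * stops just short of the 32-bit wrap, inc_be128_ctr() then carries
     * into ctr[2], and the remaining two blocks go out on the next pass */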
251 | |||
252 | static int aesbs_xts_encrypt(struct blkcipher_desc *desc, | ||
253 | struct scatterlist *dst, | ||
254 | struct scatterlist *src, unsigned int nbytes) | ||
255 | { | ||
256 | struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
257 | struct blkcipher_walk walk; | ||
258 | int err; | ||
259 | |||
260 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
261 | err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); | ||
262 | |||
263 | /* generate the initial tweak */ | ||
264 | AES_encrypt(walk.iv, walk.iv, &ctx->twkey); | ||
265 | |||
266 | while (walk.nbytes) { | ||
267 | kernel_neon_begin(); | ||
268 | bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr, | ||
269 | walk.nbytes, &ctx->enc, walk.iv); | ||
270 | kernel_neon_end(); | ||
271 | err = blkcipher_walk_done(desc, &walk, 0); | ||
272 | } | ||
273 | return err; | ||
274 | } | ||
275 | |||
276 | static int aesbs_xts_decrypt(struct blkcipher_desc *desc, | ||
277 | struct scatterlist *dst, | ||
278 | struct scatterlist *src, unsigned int nbytes) | ||
279 | { | ||
280 | struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
281 | struct blkcipher_walk walk; | ||
282 | int err; | ||
283 | |||
284 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
285 | err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); | ||
286 | |||
287 | /* generate the initial tweak */ | ||
288 | AES_encrypt(walk.iv, walk.iv, &ctx->twkey); | ||
289 | |||
290 | while (walk.nbytes) { | ||
291 | kernel_neon_begin(); | ||
292 | bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr, | ||
293 | walk.nbytes, &ctx->dec, walk.iv); | ||
294 | kernel_neon_end(); | ||
295 | err = blkcipher_walk_done(desc, &walk, 0); | ||
296 | } | ||
297 | return err; | ||
298 | } | ||
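Both XTS paths encrypt the IV in place with the tweak key before walking the data; this is the standard XTS tweak construction:

    /* T = E_K2(IV) per IEEE P1619, with K2 = ctx->twkey (the second half of
     * the supplied key); block j of the sector then uses T * x^j in
     * GF(2^128) -- the tweak doubling shown in the assembler above */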
299 | |||
300 | static struct crypto_alg aesbs_algs[] = { { | ||
301 | .cra_name = "__cbc-aes-neonbs", | ||
302 | .cra_driver_name = "__driver-cbc-aes-neonbs", | ||
303 | .cra_priority = 0, | ||
304 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
305 | .cra_blocksize = AES_BLOCK_SIZE, | ||
306 | .cra_ctxsize = sizeof(struct aesbs_cbc_ctx), | ||
307 | .cra_alignmask = 7, | ||
308 | .cra_type = &crypto_blkcipher_type, | ||
309 | .cra_module = THIS_MODULE, | ||
310 | .cra_blkcipher = { | ||
311 | .min_keysize = AES_MIN_KEY_SIZE, | ||
312 | .max_keysize = AES_MAX_KEY_SIZE, | ||
313 | .ivsize = AES_BLOCK_SIZE, | ||
314 | .setkey = aesbs_cbc_set_key, | ||
315 | .encrypt = aesbs_cbc_encrypt, | ||
316 | .decrypt = aesbs_cbc_decrypt, | ||
317 | }, | ||
318 | }, { | ||
319 | .cra_name = "__ctr-aes-neonbs", | ||
320 | .cra_driver_name = "__driver-ctr-aes-neonbs", | ||
321 | .cra_priority = 0, | ||
322 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
323 | .cra_blocksize = 1, | ||
324 | .cra_ctxsize = sizeof(struct aesbs_ctr_ctx), | ||
325 | .cra_alignmask = 7, | ||
326 | .cra_type = &crypto_blkcipher_type, | ||
327 | .cra_module = THIS_MODULE, | ||
328 | .cra_blkcipher = { | ||
329 | .min_keysize = AES_MIN_KEY_SIZE, | ||
330 | .max_keysize = AES_MAX_KEY_SIZE, | ||
331 | .ivsize = AES_BLOCK_SIZE, | ||
332 | .setkey = aesbs_ctr_set_key, | ||
333 | .encrypt = aesbs_ctr_encrypt, | ||
334 | .decrypt = aesbs_ctr_encrypt, | ||
335 | }, | ||
336 | }, { | ||
337 | .cra_name = "__xts-aes-neonbs", | ||
338 | .cra_driver_name = "__driver-xts-aes-neonbs", | ||
339 | .cra_priority = 0, | ||
340 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
341 | .cra_blocksize = AES_BLOCK_SIZE, | ||
342 | .cra_ctxsize = sizeof(struct aesbs_xts_ctx), | ||
343 | .cra_alignmask = 7, | ||
344 | .cra_type = &crypto_blkcipher_type, | ||
345 | .cra_module = THIS_MODULE, | ||
346 | .cra_blkcipher = { | ||
347 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
348 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
349 | .ivsize = AES_BLOCK_SIZE, | ||
350 | .setkey = aesbs_xts_set_key, | ||
351 | .encrypt = aesbs_xts_encrypt, | ||
352 | .decrypt = aesbs_xts_decrypt, | ||
353 | }, | ||
354 | }, { | ||
355 | .cra_name = "cbc(aes)", | ||
356 | .cra_driver_name = "cbc-aes-neonbs", | ||
357 | .cra_priority = 300, | ||
358 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
359 | .cra_blocksize = AES_BLOCK_SIZE, | ||
360 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
361 | .cra_alignmask = 7, | ||
362 | .cra_type = &crypto_ablkcipher_type, | ||
363 | .cra_module = THIS_MODULE, | ||
364 | .cra_init = ablk_init, | ||
365 | .cra_exit = ablk_exit, | ||
366 | .cra_ablkcipher = { | ||
367 | .min_keysize = AES_MIN_KEY_SIZE, | ||
368 | .max_keysize = AES_MAX_KEY_SIZE, | ||
369 | .ivsize = AES_BLOCK_SIZE, | ||
370 | .setkey = ablk_set_key, | ||
371 | .encrypt = __ablk_encrypt, | ||
372 | .decrypt = ablk_decrypt, | ||
373 | } | ||
374 | }, { | ||
375 | .cra_name = "ctr(aes)", | ||
376 | .cra_driver_name = "ctr-aes-neonbs", | ||
377 | .cra_priority = 300, | ||
378 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
379 | .cra_blocksize = 1, | ||
380 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
381 | .cra_alignmask = 7, | ||
382 | .cra_type = &crypto_ablkcipher_type, | ||
383 | .cra_module = THIS_MODULE, | ||
384 | .cra_init = ablk_init, | ||
385 | .cra_exit = ablk_exit, | ||
386 | .cra_ablkcipher = { | ||
387 | .min_keysize = AES_MIN_KEY_SIZE, | ||
388 | .max_keysize = AES_MAX_KEY_SIZE, | ||
389 | .ivsize = AES_BLOCK_SIZE, | ||
390 | .setkey = ablk_set_key, | ||
391 | .encrypt = ablk_encrypt, | ||
392 | .decrypt = ablk_decrypt, | ||
393 | } | ||
394 | }, { | ||
395 | .cra_name = "xts(aes)", | ||
396 | .cra_driver_name = "xts-aes-neonbs", | ||
397 | .cra_priority = 300, | ||
398 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
399 | .cra_blocksize = AES_BLOCK_SIZE, | ||
400 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
401 | .cra_alignmask = 7, | ||
402 | .cra_type = &crypto_ablkcipher_type, | ||
403 | .cra_module = THIS_MODULE, | ||
404 | .cra_init = ablk_init, | ||
405 | .cra_exit = ablk_exit, | ||
406 | .cra_ablkcipher = { | ||
407 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
408 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
409 | .ivsize = AES_BLOCK_SIZE, | ||
410 | .setkey = ablk_set_key, | ||
411 | .encrypt = ablk_encrypt, | ||
412 | .decrypt = ablk_decrypt, | ||
413 | } | ||
414 | } }; | ||
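On the shape of this table: the three priority-0, "__"-prefixed entries are internal synchronous ciphers that name lookups will not normally select, while the three priority-300 entries are the user-visible async wrappers, which ablk_helper routes through cryptd whenever the NEON unit cannot be used in the calling context. A minimal usage sketch (hypothetical caller; priority 300 should win over template implementations built on aes-generic):

    struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("xts(aes)", 0, 0);

    if (IS_ERR(tfm))
            return PTR_ERR(tfm);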
415 | |||
416 | static int __init aesbs_mod_init(void) | ||
417 | { | ||
418 | if (!cpu_has_neon()) | ||
419 | return -ENODEV; | ||
420 | |||
421 | return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs)); | ||
422 | } | ||
423 | |||
424 | static void __exit aesbs_mod_exit(void) | ||
425 | { | ||
426 | crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs)); | ||
427 | } | ||
428 | |||
429 | module_init(aesbs_mod_init); | ||
430 | module_exit(aesbs_mod_exit); | ||
431 | |||
432 | MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON"); | ||
433 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
434 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl new file mode 100644 index 000000000000..f3d96d932573 --- /dev/null +++ b/arch/arm/crypto/bsaes-armv7.pl | |||
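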
@@ -0,0 +1,2467 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # | ||
9 | # Specific modes and adaptation for Linux kernel by Ard Biesheuvel | ||
10 | # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is | ||
11 | # granted. | ||
12 | # ==================================================================== | ||
13 | |||
14 | # Bit-sliced AES for ARM NEON | ||
15 | # | ||
16 | # February 2012. | ||
17 | # | ||
18 | # This implementation is a direct adaptation of the bsaes-x86_64 module | ||
19 | # for ARM NEON, except that this module is endian-neutral [in the sense | ||
20 | # that it can be compiled for either endianness], courtesy of vld1.8's | ||
21 | # neutrality. The initial version doesn't implement an interface to | ||
22 | # OpenSSL, only low-level primitives and unsupported entry points, just | ||
23 | # enough to collect performance results, which for the Cortex-A8 core are: | ||
24 | # | ||
25 | # encrypt 19.5 cycles per byte processed with 128-bit key | ||
26 | # decrypt 22.1 cycles per byte processed with 128-bit key | ||
27 | # key conv. 440 cycles per 128-bit key/0.18 of 8x block | ||
28 | # | ||
29 | # Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7, | ||
30 | # which is [much] worse than anticipated (for further details see | ||
31 | # http://www.openssl.org/~appro/Snapdragon-S4.html). | ||
32 | # | ||
33 | # Cortex-A15 manages 14.2/16.1 cycles [where the integer-only code | ||
34 | # manages 20.0 cycles]. | ||
35 | # | ||
36 | # When comparing to x86_64 results, keep in mind that the NEON unit is | ||
37 | # [mostly] single-issue and thus can't [fully] benefit from | ||
38 | # instruction-level parallelism. And when comparing to aes-armv4 | ||
39 | # results, keep in mind the key schedule conversion overhead (see | ||
40 | # bsaes-x86_64.pl for further details)... | ||
41 | # | ||
42 | # <appro@openssl.org> | ||
43 | |||
44 | # April-August 2013 | ||
45 | # | ||
46 | # Add CBC, CTR and XTS subroutines, adapt for kernel use. | ||
47 | # | ||
48 | # <ard.biesheuvel@linaro.org> | ||
49 | |||
50 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
51 | open STDOUT,">$output"; | ||
52 | |||
53 | my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); | ||
54 | my @XMM=map("q$_",(0..15)); | ||
55 | |||
56 | { | ||
57 | my ($key,$rounds,$const)=("r4","r5","r6"); | ||
58 | |||
59 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
60 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
61 | |||
62 | sub Sbox { | ||
63 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
64 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
65 | my @b=@_[0..7]; | ||
66 | my @t=@_[8..11]; | ||
67 | my @s=@_[12..15]; | ||
68 | &InBasisChange (@b); | ||
69 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
70 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
71 | } | ||
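For orientation: this is the standard bit-sliced decomposition of the AES S-box, S(x) = Aff(x^-1) over GF(2^8) -- a change of basis into a tower-field representation (InBasisChange), a shared inversion circuit (Inv_GF256, below), and a change of basis back that absorbs the affine transform (OutBasisChange). Because the S-box is a fixed boolean circuit rather than a table lookup, it runs in constant time, which is the main side-channel argument for this implementation.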
72 | |||
73 | sub InBasisChange { | ||
74 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
75 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
76 | my @b=@_[0..7]; | ||
77 | $code.=<<___; | ||
78 | veor @b[2], @b[2], @b[1] | ||
79 | veor @b[5], @b[5], @b[6] | ||
80 | veor @b[3], @b[3], @b[0] | ||
81 | veor @b[6], @b[6], @b[2] | ||
82 | veor @b[5], @b[5], @b[0] | ||
83 | |||
84 | veor @b[6], @b[6], @b[3] | ||
85 | veor @b[3], @b[3], @b[7] | ||
86 | veor @b[7], @b[7], @b[5] | ||
87 | veor @b[3], @b[3], @b[4] | ||
88 | veor @b[4], @b[4], @b[5] | ||
89 | |||
90 | veor @b[2], @b[2], @b[7] | ||
91 | veor @b[3], @b[3], @b[1] | ||
92 | veor @b[1], @b[1], @b[5] | ||
93 | ___ | ||
94 | } | ||
95 | |||
96 | sub OutBasisChange { | ||
97 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
98 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
99 | my @b=@_[0..7]; | ||
100 | $code.=<<___; | ||
101 | veor @b[0], @b[0], @b[6] | ||
102 | veor @b[1], @b[1], @b[4] | ||
103 | veor @b[4], @b[4], @b[6] | ||
104 | veor @b[2], @b[2], @b[0] | ||
105 | veor @b[6], @b[6], @b[1] | ||
106 | |||
107 | veor @b[1], @b[1], @b[5] | ||
108 | veor @b[5], @b[5], @b[3] | ||
109 | veor @b[3], @b[3], @b[7] | ||
110 | veor @b[7], @b[7], @b[5] | ||
111 | veor @b[2], @b[2], @b[5] | ||
112 | |||
113 | veor @b[4], @b[4], @b[7] | ||
114 | ___ | ||
115 | } | ||
116 | |||
117 | sub InvSbox { | ||
118 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
119 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
120 | my @b=@_[0..7]; | ||
121 | my @t=@_[8..11]; | ||
122 | my @s=@_[12..15]; | ||
123 | &InvInBasisChange (@b); | ||
124 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
125 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
126 | } | ||
127 | |||
128 | sub InvInBasisChange { # OutBasisChange in reverse (with twist) | ||
129 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
130 | $code.=<<___ | ||
131 | veor @b[1], @b[1], @b[7] | ||
132 | veor @b[4], @b[4], @b[7] | ||
133 | |||
134 | veor @b[7], @b[7], @b[5] | ||
135 | veor @b[1], @b[1], @b[3] | ||
136 | veor @b[2], @b[2], @b[5] | ||
137 | veor @b[3], @b[3], @b[7] | ||
138 | |||
139 | veor @b[6], @b[6], @b[1] | ||
140 | veor @b[2], @b[2], @b[0] | ||
141 | veor @b[5], @b[5], @b[3] | ||
142 | veor @b[4], @b[4], @b[6] | ||
143 | veor @b[0], @b[0], @b[6] | ||
144 | veor @b[1], @b[1], @b[4] | ||
145 | ___ | ||
146 | } | ||
147 | |||
148 | sub InvOutBasisChange { # InBasisChange in reverse | ||
149 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
150 | $code.=<<___; | ||
151 | veor @b[1], @b[1], @b[5] | ||
152 | veor @b[2], @b[2], @b[7] | ||
153 | |||
154 | veor @b[3], @b[3], @b[1] | ||
155 | veor @b[4], @b[4], @b[5] | ||
156 | veor @b[7], @b[7], @b[5] | ||
157 | veor @b[3], @b[3], @b[4] | ||
158 | veor @b[5], @b[5], @b[0] | ||
159 | veor @b[3], @b[3], @b[7] | ||
160 | veor @b[6], @b[6], @b[2] | ||
161 | veor @b[2], @b[2], @b[1] | ||
162 | veor @b[6], @b[6], @b[3] | ||
163 | |||
164 | veor @b[3], @b[3], @b[0] | ||
165 | veor @b[5], @b[5], @b[6] | ||
166 | ___ | ||
167 | } | ||
168 | |||
169 | sub Mul_GF4 { | ||
170 | #;************************************************************* | ||
171 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
172 | #;************************************************************* | ||
173 | my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; | ||
174 | $code.=<<___; | ||
175 | veor $t0, $y0, $y1 | ||
176 | vand $t0, $t0, $x0 | ||
177 | veor $x0, $x0, $x1 | ||
178 | vand $t1, $x1, $y0 | ||
179 | vand $x0, $x0, $y1 | ||
180 | veor $x1, $t1, $t0 | ||
181 | veor $x0, $x0, $t1 | ||
182 | ___ | ||
183 | } | ||
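A direct C transcription of the seven NEON operations above, as a sketch (hypothetical helper; each u64 stands in for one bit-sliced register lane):

    /* GF(2^2) multiply in normal basis, 7 gates, mirroring Mul_GF4 */
    static inline void mul_gf4(u64 *x0, u64 *x1, u64 y0, u64 y1)
    {
            u64 t0 = (y0 ^ y1) & *x0;       /* veor t0,y0,y1; vand t0,t0,x0 */
            u64 t1 = *x1 & y0;              /* vand t1,x1,y0 */
            u64 a  = (*x0 ^ *x1) & y1;      /* veor x0,x0,x1; vand x0,x0,y1 */

            *x1 = t1 ^ t0;                  /* veor x1,t1,t0 */
            *x0 = a ^ t1;                   /* veor x0,x0,t1 */
    }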
184 | |||
185 | sub Mul_GF4_N { # not used, see next subroutine | ||
186 | # multiply and scale by N | ||
187 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
188 | $code.=<<___; | ||
189 | veor $t0, $y0, $y1 | ||
190 | vand $t0, $t0, $x0 | ||
191 | veor $x0, $x0, $x1 | ||
192 | vand $x1, $x1, $y0 | ||
193 | vand $x0, $x0, $y1 | ||
194 | veor $x1, $x1, $x0 | ||
195 | veor $x0, $x0, $t0 | ||
196 | ___ | ||
197 | } | ||
198 | |||
199 | sub Mul_GF4_N_GF4 { | ||
200 | # interleaved Mul_GF4_N and Mul_GF4 | ||
201 | my ($x0,$x1,$y0,$y1,$t0, | ||
202 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
203 | $code.=<<___; | ||
204 | veor $t0, $y0, $y1 | ||
205 | veor $t1, $y2, $y3 | ||
206 | vand $t0, $t0, $x0 | ||
207 | vand $t1, $t1, $x2 | ||
208 | veor $x0, $x0, $x1 | ||
209 | veor $x2, $x2, $x3 | ||
210 | vand $x1, $x1, $y0 | ||
211 | vand $x3, $x3, $y2 | ||
212 | vand $x0, $x0, $y1 | ||
213 | vand $x2, $x2, $y3 | ||
214 | veor $x1, $x1, $x0 | ||
215 | veor $x2, $x2, $x3 | ||
216 | veor $x0, $x0, $t0 | ||
217 | veor $x3, $x3, $t1 | ||
218 | ___ | ||
219 | } | ||
220 | sub Mul_GF16_2 { | ||
221 | my @x=@_[0..7]; | ||
222 | my @y=@_[8..11]; | ||
223 | my @t=@_[12..15]; | ||
224 | $code.=<<___; | ||
225 | veor @t[0], @x[0], @x[2] | ||
226 | veor @t[1], @x[1], @x[3] | ||
227 | ___ | ||
228 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); | ||
229 | $code.=<<___; | ||
230 | veor @y[0], @y[0], @y[2] | ||
231 | veor @y[1], @y[1], @y[3] | ||
232 | ___ | ||
233 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
234 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
235 | $code.=<<___; | ||
236 | veor @x[0], @x[0], @t[0] | ||
237 | veor @x[2], @x[2], @t[0] | ||
238 | veor @x[1], @x[1], @t[1] | ||
239 | veor @x[3], @x[3], @t[1] | ||
240 | |||
241 | veor @t[0], @x[4], @x[6] | ||
242 | veor @t[1], @x[5], @x[7] | ||
243 | ___ | ||
244 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
245 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
246 | $code.=<<___; | ||
247 | veor @y[0], @y[0], @y[2] | ||
248 | veor @y[1], @y[1], @y[3] | ||
249 | ___ | ||
250 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); | ||
251 | $code.=<<___; | ||
252 | veor @x[4], @x[4], @t[0] | ||
253 | veor @x[6], @x[6], @t[0] | ||
254 | veor @x[5], @x[5], @t[1] | ||
255 | veor @x[7], @x[7], @t[1] | ||
256 | ___ | ||
257 | } | ||
258 | sub Inv_GF256 { | ||
259 | #;******************************************************************** | ||
260 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
261 | #;******************************************************************** | ||
262 | my @x=@_[0..7]; | ||
263 | my @t=@_[8..11]; | ||
264 | my @s=@_[12..15]; | ||
265 | # direct optimizations from hardware | ||
266 | $code.=<<___; | ||
267 | veor @t[3], @x[4], @x[6] | ||
268 | veor @t[2], @x[5], @x[7] | ||
269 | veor @t[1], @x[1], @x[3] | ||
270 | veor @s[1], @x[7], @x[6] | ||
271 | vmov @t[0], @t[2] | ||
272 | veor @s[0], @x[0], @x[2] | ||
273 | |||
274 | vorr @t[2], @t[2], @t[1] | ||
275 | veor @s[3], @t[3], @t[0] | ||
276 | vand @s[2], @t[3], @s[0] | ||
277 | vorr @t[3], @t[3], @s[0] | ||
278 | veor @s[0], @s[0], @t[1] | ||
279 | vand @t[0], @t[0], @t[1] | ||
280 | veor @t[1], @x[3], @x[2] | ||
281 | vand @s[3], @s[3], @s[0] | ||
282 | vand @s[1], @s[1], @t[1] | ||
283 | veor @t[1], @x[4], @x[5] | ||
284 | veor @s[0], @x[1], @x[0] | ||
285 | veor @t[3], @t[3], @s[1] | ||
286 | veor @t[2], @t[2], @s[1] | ||
287 | vand @s[1], @t[1], @s[0] | ||
288 | vorr @t[1], @t[1], @s[0] | ||
289 | veor @t[3], @t[3], @s[3] | ||
290 | veor @t[0], @t[0], @s[1] | ||
291 | veor @t[2], @t[2], @s[2] | ||
292 | veor @t[1], @t[1], @s[3] | ||
293 | veor @t[0], @t[0], @s[2] | ||
294 | vand @s[0], @x[7], @x[3] | ||
295 | veor @t[1], @t[1], @s[2] | ||
296 | vand @s[1], @x[6], @x[2] | ||
297 | vand @s[2], @x[5], @x[1] | ||
298 | vorr @s[3], @x[4], @x[0] | ||
299 | veor @t[3], @t[3], @s[0] | ||
300 | veor @t[1], @t[1], @s[2] | ||
301 | veor @t[0], @t[0], @s[3] | ||
302 | veor @t[2], @t[2], @s[1] | ||
303 | |||
304 | @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
305 | |||
306 | @ new smaller inversion | ||
307 | |||
308 | vand @s[2], @t[3], @t[1] | ||
309 | vmov @s[0], @t[0] | ||
310 | |||
311 | veor @s[1], @t[2], @s[2] | ||
312 | veor @s[3], @t[0], @s[2] | ||
313 | veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] | ||
314 | |||
315 | vbsl @s[1], @t[1], @t[0] | ||
316 | vbsl @s[3], @t[3], @t[2] | ||
317 | veor @t[3], @t[3], @t[2] | ||
318 | |||
319 | vbsl @s[0], @s[1], @s[2] | ||
320 | vbsl @t[0], @s[2], @s[1] | ||
321 | |||
322 | vand @s[2], @s[0], @s[3] | ||
323 | veor @t[1], @t[1], @t[0] | ||
324 | |||
325 | veor @s[2], @s[2], @t[3] | ||
326 | ___ | ||
327 | # output in s3, s2, s1, t1 | ||
328 | |||
329 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
330 | |||
331 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
332 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
333 | |||
334 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
335 | } | ||
336 | |||
337 | # AES linear components | ||
338 | |||
339 | sub ShiftRows { | ||
340 | my @x=@_[0..7]; | ||
341 | my @t=@_[8..11]; | ||
342 | my $mask=pop; | ||
343 | $code.=<<___; | ||
344 | vldmia $key!, {@t[0]-@t[3]} | ||
345 | veor @t[0], @t[0], @x[0] | ||
346 | veor @t[1], @t[1], @x[1] | ||
347 | vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` | ||
348 | vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` | ||
349 | vldmia $key!, {@t[0]} | ||
350 | veor @t[2], @t[2], @x[2] | ||
351 | vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` | ||
352 | vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` | ||
353 | vldmia $key!, {@t[1]} | ||
354 | veor @t[3], @t[3], @x[3] | ||
355 | vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` | ||
356 | vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` | ||
357 | vldmia $key!, {@t[2]} | ||
358 | vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` | ||
359 | vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` | ||
360 | vldmia $key!, {@t[3]} | ||
361 | veor @t[0], @t[0], @x[4] | ||
362 | veor @t[1], @t[1], @x[5] | ||
363 | vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` | ||
364 | vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` | ||
365 | veor @t[2], @t[2], @x[6] | ||
366 | vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` | ||
367 | vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` | ||
368 | veor @t[3], @t[3], @x[7] | ||
369 | vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` | ||
370 | vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` | ||
371 | vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` | ||
372 | vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` | ||
373 | ___ | ||
374 | } | ||
375 | |||
376 | sub MixColumns { | ||
377 | # modified to emit output in an order suitable for feeding back to aesenc[last] | ||
378 | my @x=@_[0..7]; | ||
379 | my @t=@_[8..15]; | ||
380 | my $inv=@_[16]; # optional | ||
381 | $code.=<<___; | ||
382 | vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 | ||
383 | vext.8 @t[1], @x[1], @x[1], #12 | ||
384 | veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) | ||
385 | vext.8 @t[2], @x[2], @x[2], #12 | ||
386 | veor @x[1], @x[1], @t[1] | ||
387 | vext.8 @t[3], @x[3], @x[3], #12 | ||
388 | veor @x[2], @x[2], @t[2] | ||
389 | vext.8 @t[4], @x[4], @x[4], #12 | ||
390 | veor @x[3], @x[3], @t[3] | ||
391 | vext.8 @t[5], @x[5], @x[5], #12 | ||
392 | veor @x[4], @x[4], @t[4] | ||
393 | vext.8 @t[6], @x[6], @x[6], #12 | ||
394 | veor @x[5], @x[5], @t[5] | ||
395 | vext.8 @t[7], @x[7], @x[7], #12 | ||
396 | veor @x[6], @x[6], @t[6] | ||
397 | |||
398 | veor @t[1], @t[1], @x[0] | ||
399 | veor @x[7], @x[7], @t[7] | ||
400 | vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) | ||
401 | veor @t[2], @t[2], @x[1] | ||
402 | veor @t[0], @t[0], @x[7] | ||
403 | veor @t[1], @t[1], @x[7] | ||
404 | vext.8 @x[1], @x[1], @x[1], #8 | ||
405 | veor @t[5], @t[5], @x[4] | ||
406 | veor @x[0], @x[0], @t[0] | ||
407 | veor @t[6], @t[6], @x[5] | ||
408 | veor @x[1], @x[1], @t[1] | ||
409 | vext.8 @t[0], @x[4], @x[4], #8 | ||
410 | veor @t[4], @t[4], @x[3] | ||
411 | vext.8 @t[1], @x[5], @x[5], #8 | ||
412 | veor @t[7], @t[7], @x[6] | ||
413 | vext.8 @x[4], @x[3], @x[3], #8 | ||
414 | veor @t[3], @t[3], @x[2] | ||
415 | vext.8 @x[5], @x[7], @x[7], #8 | ||
416 | veor @t[4], @t[4], @x[7] | ||
417 | vext.8 @x[3], @x[6], @x[6], #8 | ||
418 | veor @t[3], @t[3], @x[7] | ||
419 | vext.8 @x[6], @x[2], @x[2], #8 | ||
420 | veor @x[7], @t[1], @t[5] | ||
421 | ___ | ||
422 | $code.=<<___ if (!$inv); | ||
423 | veor @x[2], @t[0], @t[4] | ||
424 | veor @x[4], @x[4], @t[3] | ||
425 | veor @x[5], @x[5], @t[7] | ||
426 | veor @x[3], @x[3], @t[6] | ||
427 | @ vmov @x[2], @t[0] | ||
428 | veor @x[6], @x[6], @t[2] | ||
429 | @ vmov @x[7], @t[1] | ||
430 | ___ | ||
431 | $code.=<<___ if ($inv); | ||
432 | veor @t[3], @t[3], @x[4] | ||
433 | veor @x[5], @x[5], @t[7] | ||
434 | veor @x[2], @x[3], @t[6] | ||
435 | veor @x[3], @t[0], @t[4] | ||
436 | veor @x[4], @x[6], @t[2] | ||
437 | vmov @x[6], @t[3] | ||
438 | @ vmov @x[7], @t[1] | ||
439 | ___ | ||
440 | } | ||
441 | |||
442 | sub InvMixColumns_orig { | ||
443 | my @x=@_[0..7]; | ||
444 | my @t=@_[8..15]; | ||
445 | |||
446 | $code.=<<___; | ||
447 | @ multiplication by 0x0e | ||
448 | vext.8 @t[7], @x[7], @x[7], #12 | ||
449 | vmov @t[2], @x[2] | ||
450 | veor @x[2], @x[2], @x[5] @ 2 5 | ||
451 | veor @x[7], @x[7], @x[5] @ 7 5 | ||
452 | vext.8 @t[0], @x[0], @x[0], #12 | ||
453 | vmov @t[5], @x[5] | ||
454 | veor @x[5], @x[5], @x[0] @ 5 0 [1] | ||
455 | veor @x[0], @x[0], @x[1] @ 0 1 | ||
456 | vext.8 @t[1], @x[1], @x[1], #12 | ||
457 | veor @x[1], @x[1], @x[2] @ 1 25 | ||
458 | veor @x[0], @x[0], @x[6] @ 01 6 [2] | ||
459 | vext.8 @t[3], @x[3], @x[3], #12 | ||
460 | veor @x[1], @x[1], @x[3] @ 125 3 [4] | ||
461 | veor @x[2], @x[2], @x[0] @ 25 016 [3] | ||
462 | veor @x[3], @x[3], @x[7] @ 3 75 | ||
463 | veor @x[7], @x[7], @x[6] @ 75 6 [0] | ||
464 | vext.8 @t[6], @x[6], @x[6], #12 | ||
465 | vmov @t[4], @x[4] | ||
466 | veor @x[6], @x[6], @x[4] @ 6 4 | ||
467 | veor @x[4], @x[4], @x[3] @ 4 375 [6] | ||
468 | veor @x[3], @x[3], @x[7] @ 375 756=36 | ||
469 | veor @x[6], @x[6], @t[5] @ 64 5 [7] | ||
470 | veor @x[3], @x[3], @t[2] @ 36 2 | ||
471 | vext.8 @t[5], @t[5], @t[5], #12 | ||
472 | veor @x[3], @x[3], @t[4] @ 362 4 [5] | ||
473 | ___ | ||
474 | my @y = @x[7,5,0,2,1,3,4,6]; | ||
475 | $code.=<<___; | ||
476 | @ multiplication by 0x0b | ||
477 | veor @y[1], @y[1], @y[0] | ||
478 | veor @y[0], @y[0], @t[0] | ||
479 | vext.8 @t[2], @t[2], @t[2], #12 | ||
480 | veor @y[1], @y[1], @t[1] | ||
481 | veor @y[0], @y[0], @t[5] | ||
482 | vext.8 @t[4], @t[4], @t[4], #12 | ||
483 | veor @y[1], @y[1], @t[6] | ||
484 | veor @y[0], @y[0], @t[7] | ||
485 | veor @t[7], @t[7], @t[6] @ clobber t[7] | ||
486 | |||
487 | veor @y[3], @y[3], @t[0] | ||
488 | veor @y[1], @y[1], @y[0] | ||
489 | vext.8 @t[0], @t[0], @t[0], #12 | ||
490 | veor @y[2], @y[2], @t[1] | ||
491 | veor @y[4], @y[4], @t[1] | ||
492 | vext.8 @t[1], @t[1], @t[1], #12 | ||
493 | veor @y[2], @y[2], @t[2] | ||
494 | veor @y[3], @y[3], @t[2] | ||
495 | veor @y[5], @y[5], @t[2] | ||
496 | veor @y[2], @y[2], @t[7] | ||
497 | vext.8 @t[2], @t[2], @t[2], #12 | ||
498 | veor @y[3], @y[3], @t[3] | ||
499 | veor @y[6], @y[6], @t[3] | ||
500 | veor @y[4], @y[4], @t[3] | ||
501 | veor @y[7], @y[7], @t[4] | ||
502 | vext.8 @t[3], @t[3], @t[3], #12 | ||
503 | veor @y[5], @y[5], @t[4] | ||
504 | veor @y[7], @y[7], @t[7] | ||
505 | veor @t[7], @t[7], @t[5] @ clobber t[7] even more | ||
506 | veor @y[3], @y[3], @t[5] | ||
507 | veor @y[4], @y[4], @t[4] | ||
508 | |||
509 | veor @y[5], @y[5], @t[7] | ||
510 | vext.8 @t[4], @t[4], @t[4], #12 | ||
511 | veor @y[6], @y[6], @t[7] | ||
512 | veor @y[4], @y[4], @t[7] | ||
513 | |||
514 | veor @t[7], @t[7], @t[5] | ||
515 | vext.8 @t[5], @t[5], @t[5], #12 | ||
516 | |||
517 | @ multiplication by 0x0d | ||
518 | veor @y[4], @y[4], @y[7] | ||
519 | veor @t[7], @t[7], @t[6] @ restore t[7] | ||
520 | veor @y[7], @y[7], @t[4] | ||
521 | vext.8 @t[6], @t[6], @t[6], #12 | ||
522 | veor @y[2], @y[2], @t[0] | ||
523 | veor @y[7], @y[7], @t[5] | ||
524 | vext.8 @t[7], @t[7], @t[7], #12 | ||
525 | veor @y[2], @y[2], @t[2] | ||
526 | |||
527 | veor @y[3], @y[3], @y[1] | ||
528 | veor @y[1], @y[1], @t[1] | ||
529 | veor @y[0], @y[0], @t[0] | ||
530 | veor @y[3], @y[3], @t[0] | ||
531 | veor @y[1], @y[1], @t[5] | ||
532 | veor @y[0], @y[0], @t[5] | ||
533 | vext.8 @t[0], @t[0], @t[0], #12 | ||
534 | veor @y[1], @y[1], @t[7] | ||
535 | veor @y[0], @y[0], @t[6] | ||
536 | veor @y[3], @y[3], @y[1] | ||
537 | veor @y[4], @y[4], @t[1] | ||
538 | vext.8 @t[1], @t[1], @t[1], #12 | ||
539 | |||
540 | veor @y[7], @y[7], @t[7] | ||
541 | veor @y[4], @y[4], @t[2] | ||
542 | veor @y[5], @y[5], @t[2] | ||
543 | veor @y[2], @y[2], @t[6] | ||
544 | veor @t[6], @t[6], @t[3] @ clobber t[6] | ||
545 | vext.8 @t[2], @t[2], @t[2], #12 | ||
546 | veor @y[4], @y[4], @y[7] | ||
547 | veor @y[3], @y[3], @t[6] | ||
548 | |||
549 | veor @y[6], @y[6], @t[6] | ||
550 | veor @y[5], @y[5], @t[5] | ||
551 | vext.8 @t[5], @t[5], @t[5], #12 | ||
552 | veor @y[6], @y[6], @t[4] | ||
553 | vext.8 @t[4], @t[4], @t[4], #12 | ||
554 | veor @y[5], @y[5], @t[6] | ||
555 | veor @y[6], @y[6], @t[7] | ||
556 | vext.8 @t[7], @t[7], @t[7], #12 | ||
557 | veor @t[6], @t[6], @t[3] @ restore t[6] | ||
558 | vext.8 @t[3], @t[3], @t[3], #12 | ||
559 | |||
560 | @ multiplication by 0x09 | ||
561 | veor @y[4], @y[4], @y[1] | ||
562 | veor @t[1], @t[1], @y[1] @ t[1]=y[1] | ||
563 | veor @t[0], @t[0], @t[5] @ clobber t[0] | ||
564 | vext.8 @t[6], @t[6], @t[6], #12 | ||
565 | veor @t[1], @t[1], @t[5] | ||
566 | veor @y[3], @y[3], @t[0] | ||
567 | veor @t[0], @t[0], @y[0] @ t[0]=y[0] | ||
568 | veor @t[1], @t[1], @t[6] | ||
569 | veor @t[6], @t[6], @t[7] @ clobber t[6] | ||
570 | veor @y[4], @y[4], @t[1] | ||
571 | veor @y[7], @y[7], @t[4] | ||
572 | veor @y[6], @y[6], @t[3] | ||
573 | veor @y[5], @y[5], @t[2] | ||
574 | veor @t[4], @t[4], @y[4] @ t[4]=y[4] | ||
575 | veor @t[3], @t[3], @y[3] @ t[3]=y[3] | ||
576 | veor @t[5], @t[5], @y[5] @ t[5]=y[5] | ||
577 | veor @t[2], @t[2], @y[2] @ t[2]=y[2] | ||
578 | veor @t[3], @t[3], @t[7] | ||
579 | veor @XMM[5], @t[5], @t[6] | ||
580 | veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] | ||
581 | veor @XMM[2], @t[2], @t[6] | ||
582 | veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] | ||
583 | |||
584 | vmov @XMM[0], @t[0] | ||
585 | vmov @XMM[1], @t[1] | ||
586 | @ vmov @XMM[2], @t[2] | ||
587 | vmov @XMM[3], @t[3] | ||
588 | vmov @XMM[4], @t[4] | ||
589 | @ vmov @XMM[5], @t[5] | ||
590 | @ vmov @XMM[6], @t[6] | ||
591 | @ vmov @XMM[7], @t[7] | ||
592 | ___ | ||
593 | } | ||
594 | |||
595 | sub InvMixColumns { | ||
596 | my @x=@_[0..7]; | ||
597 | my @t=@_[8..15]; | ||
598 | |||
599 | # Thanks to Jussi Kivilinna for providing a pointer to | ||
600 | # | ||
601 | # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | | ||
602 | # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | | ||
603 | # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | | ||
604 | # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | | ||
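# A spot check of this factorization (all arithmetic in GF(2^8) modulo
# x^8 + x^4 + x^3 + x + 1): entry (1,1) of the right-hand product is
# 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0x0a ^ 0x04 = 0x0e, and entry (1,2) is
# 02*00 ^ 03*05 ^ 01*00 ^ 01*04 = 0x0f ^ 0x04 = 0x0b, matching the first
# row of the 0e/0b/0d/09 matrix. This is what lets the code below do the
# cheap 05/00/04/00 multiplication first and then reuse the MixColumns
# circuit (the 02/03/01/01 matrix) instead of a dedicated inverse.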
605 | |||
606 | $code.=<<___; | ||
607 | @ multiplication by 0x05-0x00-0x04-0x00 | ||
608 | vext.8 @t[0], @x[0], @x[0], #8 | ||
609 | vext.8 @t[6], @x[6], @x[6], #8 | ||
610 | vext.8 @t[7], @x[7], @x[7], #8 | ||
611 | veor @t[0], @t[0], @x[0] | ||
612 | vext.8 @t[1], @x[1], @x[1], #8 | ||
613 | veor @t[6], @t[6], @x[6] | ||
614 | vext.8 @t[2], @x[2], @x[2], #8 | ||
615 | veor @t[7], @t[7], @x[7] | ||
616 | vext.8 @t[3], @x[3], @x[3], #8 | ||
617 | veor @t[1], @t[1], @x[1] | ||
618 | vext.8 @t[4], @x[4], @x[4], #8 | ||
619 | veor @t[2], @t[2], @x[2] | ||
620 | vext.8 @t[5], @x[5], @x[5], #8 | ||
621 | veor @t[3], @t[3], @x[3] | ||
622 | veor @t[4], @t[4], @x[4] | ||
623 | veor @t[5], @t[5], @x[5] | ||
624 | |||
625 | veor @x[0], @x[0], @t[6] | ||
626 | veor @x[1], @x[1], @t[6] | ||
627 | veor @x[2], @x[2], @t[0] | ||
628 | veor @x[4], @x[4], @t[2] | ||
629 | veor @x[3], @x[3], @t[1] | ||
630 | veor @x[1], @x[1], @t[7] | ||
631 | veor @x[2], @x[2], @t[7] | ||
632 | veor @x[4], @x[4], @t[6] | ||
633 | veor @x[5], @x[5], @t[3] | ||
634 | veor @x[3], @x[3], @t[6] | ||
635 | veor @x[6], @x[6], @t[4] | ||
636 | veor @x[4], @x[4], @t[7] | ||
637 | veor @x[5], @x[5], @t[7] | ||
638 | veor @x[7], @x[7], @t[5] | ||
639 | ___ | ||
640 | &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 | ||
641 | } | ||
642 | |||
643 | sub swapmove { | ||
644 | my ($a,$b,$n,$mask,$t)=@_; | ||
645 | $code.=<<___; | ||
646 | vshr.u64 $t, $b, #$n | ||
647 | veor $t, $t, $a | ||
648 | vand $t, $t, $mask | ||
649 | veor $a, $a, $t | ||
650 | vshl.u64 $t, $t, #$n | ||
651 | veor $b, $b, $t | ||
652 | ___ | ||
653 | } | ||
654 | sub swapmove2x { | ||
655 | my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | ||
656 | $code.=<<___; | ||
657 | vshr.u64 $t0, $b0, #$n | ||
658 | vshr.u64 $t1, $b1, #$n | ||
659 | veor $t0, $t0, $a0 | ||
660 | veor $t1, $t1, $a1 | ||
661 | vand $t0, $t0, $mask | ||
662 | vand $t1, $t1, $mask | ||
663 | veor $a0, $a0, $t0 | ||
664 | vshl.u64 $t0, $t0, #$n | ||
665 | veor $a1, $a1, $t1 | ||
666 | vshl.u64 $t1, $t1, #$n | ||
667 | veor $b0, $b0, $t0 | ||
668 | veor $b1, $b1, $t1 | ||
669 | ___ | ||
670 | } | ||
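swapmove is the classic SWAPMOVE primitive from the bit-slicing literature; as a C sketch over 64-bit lanes (hypothetical helper, mirroring the generated d-register code):

    /* exchange the bits of a selected by mask with the bits of b
     * sitting n positions to their left */
    static inline void swapmove(u64 *a, u64 *b, int n, u64 mask)
    {
            u64 t = ((*b >> n) ^ *a) & mask;

            *a ^= t;
            *b ^= t << n;
    }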
671 | |||
672 | sub bitslice { | ||
673 | my @x=reverse(@_[0..7]); | ||
674 | my ($t0,$t1,$t2,$t3)=@_[8..11]; | ||
675 | $code.=<<___; | ||
676 | vmov.i8 $t0,#0x55 @ compose .LBS0 | ||
677 | vmov.i8 $t1,#0x33 @ compose .LBS1 | ||
678 | ___ | ||
679 | &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | ||
680 | &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
681 | $code.=<<___; | ||
682 | vmov.i8 $t0,#0x0f @ compose .LBS2 | ||
683 | ___ | ||
684 | &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | ||
685 | &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
686 | |||
687 | &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | ||
688 | &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | ||
689 | } | ||
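Taken together, the three rounds of swapmove2x with masks 0x55/0x33/0x0f and shifts 1/2/4 are the three stages of an 8x8 bit-matrix transpose: after bitslice(), register i holds bit i of every byte of all eight AES states at once, which is exactly the representation the S-box circuit above consumes.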
690 | |||
691 | $code.=<<___; | ||
692 | #ifndef __KERNEL__ | ||
693 | # include "arm_arch.h" | ||
694 | |||
695 | # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} | ||
696 | # define VFP_ABI_POP vldmia sp!,{d8-d15} | ||
697 | # define VFP_ABI_FRAME 0x40 | ||
698 | #else | ||
699 | # define VFP_ABI_PUSH | ||
700 | # define VFP_ABI_POP | ||
701 | # define VFP_ABI_FRAME 0 | ||
702 | # define BSAES_ASM_EXTENDED_KEY | ||
703 | # define XTS_CHAIN_TWEAK | ||
704 | # define __ARM_ARCH__ __LINUX_ARM_ARCH__ | ||
705 | #endif | ||
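These kernel-build definitions account for the two #ifdef flavours seen throughout the assembler: with BSAES_ASM_EXTENDED_KEY the converted bit-sliced key schedule lives in caller-provided storage (the bs[] member of struct BS_KEY in aesbs-glue.c, guarded by its converted flag) instead of being rebuilt on the stack on every call, and XTS_CHAIN_TWEAK makes the XTS entry points write the final tweak back to the caller so that sector processing can be chained. VFP_ABI_PUSH/POP collapse to nothing because the kernel caller already saves the NEON register file via kernel_neon_begin()/kernel_neon_end().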
706 | |||
707 | #ifdef __thumb__ | ||
708 | # define adrl adr | ||
709 | #endif | ||
710 | |||
711 | #if __ARM_ARCH__>=7 | ||
712 | .text | ||
713 | .syntax unified @ ARMv7-capable assembler is expected to handle this | ||
714 | #ifdef __thumb2__ | ||
715 | .thumb | ||
716 | #else | ||
717 | .code 32 | ||
718 | #endif | ||
719 | |||
720 | .fpu neon | ||
721 | |||
722 | .type _bsaes_decrypt8,%function | ||
723 | .align 4 | ||
724 | _bsaes_decrypt8: | ||
725 | adr $const,_bsaes_decrypt8 | ||
726 | vldmia $key!, {@XMM[9]} @ round 0 key | ||
727 | add $const,$const,#.LM0ISR-_bsaes_decrypt8 | ||
728 | |||
729 | vldmia $const!, {@XMM[8]} @ .LM0ISR | ||
730 | veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key | ||
731 | veor @XMM[11], @XMM[1], @XMM[9] | ||
732 | vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` | ||
733 | vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` | ||
734 | veor @XMM[12], @XMM[2], @XMM[9] | ||
735 | vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` | ||
736 | vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` | ||
737 | veor @XMM[13], @XMM[3], @XMM[9] | ||
738 | vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` | ||
739 | vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` | ||
740 | veor @XMM[14], @XMM[4], @XMM[9] | ||
741 | vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` | ||
742 | vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` | ||
743 | veor @XMM[15], @XMM[5], @XMM[9] | ||
744 | vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` | ||
745 | vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` | ||
746 | veor @XMM[10], @XMM[6], @XMM[9] | ||
747 | vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` | ||
748 | vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` | ||
749 | veor @XMM[11], @XMM[7], @XMM[9] | ||
750 | vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` | ||
751 | vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` | ||
752 | vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` | ||
753 | vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` | ||
754 | ___ | ||
755 | &bitslice (@XMM[0..7, 8..11]); | ||
756 | $code.=<<___; | ||
757 | sub $rounds,$rounds,#1 | ||
758 | b .Ldec_sbox | ||
759 | .align 4 | ||
760 | .Ldec_loop: | ||
761 | ___ | ||
762 | &ShiftRows (@XMM[0..7, 8..12]); | ||
763 | $code.=".Ldec_sbox:\n"; | ||
764 | &InvSbox (@XMM[0..7, 8..15]); | ||
765 | $code.=<<___; | ||
766 | subs $rounds,$rounds,#1 | ||
767 | bcc .Ldec_done | ||
768 | ___ | ||
769 | &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); | ||
770 | $code.=<<___; | ||
771 | vldmia $const, {@XMM[12]} @ .LISR | ||
772 | ite eq @ Thumb2 thing, sanity check in ARM | ||
773 | addeq $const,$const,#0x10 | ||
774 | bne .Ldec_loop | ||
775 | vldmia $const, {@XMM[12]} @ .LISRM0 | ||
776 | b .Ldec_loop | ||
777 | .align 4 | ||
778 | .Ldec_done: | ||
779 | ___ | ||
780 | &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); | ||
781 | $code.=<<___; | ||
782 | vldmia $key, {@XMM[8]} @ last round key | ||
783 | veor @XMM[6], @XMM[6], @XMM[8] | ||
784 | veor @XMM[4], @XMM[4], @XMM[8] | ||
785 | veor @XMM[2], @XMM[2], @XMM[8] | ||
786 | veor @XMM[7], @XMM[7], @XMM[8] | ||
787 | veor @XMM[3], @XMM[3], @XMM[8] | ||
788 | veor @XMM[5], @XMM[5], @XMM[8] | ||
789 | veor @XMM[0], @XMM[0], @XMM[8] | ||
790 | veor @XMM[1], @XMM[1], @XMM[8] | ||
791 | bx lr | ||
792 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
793 | |||
794 | .type _bsaes_const,%object | ||
795 | .align 6 | ||
796 | _bsaes_const: | ||
797 | .LM0ISR: @ InvShiftRows constants | ||
798 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
799 | .LISR: | ||
800 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
801 | .LISRM0: | ||
802 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
803 | .LM0SR: @ ShiftRows constants | ||
804 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
805 | .LSR: | ||
806 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
807 | .LSRM0: | ||
808 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
809 | .LM0: | ||
810 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
811 | .LREVM0SR: | ||
812 | .quad 0x090d01050c000408, 0x03070b0f060a0e02 | ||
813 | .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
814 | .align 6 | ||
815 | .size _bsaes_const,.-_bsaes_const | ||
816 | |||
817 | .type _bsaes_encrypt8,%function | ||
818 | .align 4 | ||
819 | _bsaes_encrypt8: | ||
820 | adr $const,_bsaes_encrypt8 | ||
821 | vldmia $key!, {@XMM[9]} @ round 0 key | ||
822 | sub $const,$const,#_bsaes_encrypt8-.LM0SR | ||
823 | |||
824 | vldmia $const!, {@XMM[8]} @ .LM0SR | ||
825 | _bsaes_encrypt8_alt: | ||
826 | veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key | ||
827 | veor @XMM[11], @XMM[1], @XMM[9] | ||
828 | vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` | ||
829 | vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` | ||
830 | veor @XMM[12], @XMM[2], @XMM[9] | ||
831 | vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` | ||
832 | vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` | ||
833 | veor @XMM[13], @XMM[3], @XMM[9] | ||
834 | vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` | ||
835 | vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` | ||
836 | veor @XMM[14], @XMM[4], @XMM[9] | ||
837 | vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` | ||
838 | vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` | ||
839 | veor @XMM[15], @XMM[5], @XMM[9] | ||
840 | vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` | ||
841 | vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` | ||
842 | veor @XMM[10], @XMM[6], @XMM[9] | ||
843 | vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` | ||
844 | vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` | ||
845 | veor @XMM[11], @XMM[7], @XMM[9] | ||
846 | vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` | ||
847 | vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` | ||
848 | vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` | ||
849 | vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` | ||
850 | _bsaes_encrypt8_bitslice: | ||
851 | ___ | ||
852 | &bitslice (@XMM[0..7, 8..11]); | ||
853 | $code.=<<___; | ||
854 | sub $rounds,$rounds,#1 | ||
855 | b .Lenc_sbox | ||
856 | .align 4 | ||
857 | .Lenc_loop: | ||
858 | ___ | ||
859 | &ShiftRows (@XMM[0..7, 8..12]); | ||
860 | $code.=".Lenc_sbox:\n"; | ||
861 | &Sbox (@XMM[0..7, 8..15]); | ||
862 | $code.=<<___; | ||
863 | subs $rounds,$rounds,#1 | ||
864 | bcc .Lenc_done | ||
865 | ___ | ||
866 | &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | ||
867 | $code.=<<___; | ||
868 | vldmia $const, {@XMM[12]} @ .LSR | ||
869 | ite eq @ Thumb2 thing, sanity check in ARM | ||
870 | addeq $const,$const,#0x10 | ||
871 | bne .Lenc_loop | ||
872 | vldmia $const, {@XMM[12]} @ .LSRM0 | ||
873 | b .Lenc_loop | ||
874 | .align 4 | ||
875 | .Lenc_done: | ||
876 | ___ | ||
877 | # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | ||
878 | &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | ||
879 | $code.=<<___; | ||
880 | vldmia $key, {@XMM[8]} @ last round key | ||
881 | veor @XMM[4], @XMM[4], @XMM[8] | ||
882 | veor @XMM[6], @XMM[6], @XMM[8] | ||
883 | veor @XMM[3], @XMM[3], @XMM[8] | ||
884 | veor @XMM[7], @XMM[7], @XMM[8] | ||
885 | veor @XMM[2], @XMM[2], @XMM[8] | ||
886 | veor @XMM[5], @XMM[5], @XMM[8] | ||
887 | veor @XMM[0], @XMM[0], @XMM[8] | ||
888 | veor @XMM[1], @XMM[1], @XMM[8] | ||
889 | bx lr | ||
890 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
891 | ___ | ||
892 | } | ||
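The .LM0/.LSR/.LISR constant pairs in _bsaes_const above are index vectors for vtbl.8, which is nothing more than a byte-wise table lookup over the source q-register. A scalar model makes the .quad values readable as permutations (the helper name is mine, not from the source; out-of-range indexes yield zero, as VTBL specifies):

#include <stdint.h>

/* Scalar model of NEON vtbl.8 with a single 16-byte table:
 * out[i] = tbl[idx[i]], or 0 when idx[i] >= 16. */
static void vtbl16(uint8_t out[16], const uint8_t tbl[16],
                   const uint8_t idx[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = (idx[i] < 16) ? tbl[idx[i]] : 0;
}

Read through this model, .LM0SR for example is the byte permutation the _bsaes_encrypt8 prologue applies (fused with the round-0 key XOR) before the state is bit-sliced.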
893 | { | ||
894 | my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); | ||
895 | |||
896 | sub bitslice_key { | ||
897 | my @x=reverse(@_[0..7]); | ||
898 | my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | ||
899 | |||
900 | &swapmove (@x[0,1],1,$bs0,$t2,$t3); | ||
901 | $code.=<<___; | ||
902 | @ &swapmove(@x[2,3],1,$t0,$t2,$t3); | ||
903 | vmov @x[2], @x[0] | ||
904 | vmov @x[3], @x[1] | ||
905 | ___ | ||
906 | #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
907 | |||
908 | &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | ||
909 | $code.=<<___; | ||
910 | @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
911 | vmov @x[4], @x[0] | ||
912 | vmov @x[6], @x[2] | ||
913 | vmov @x[5], @x[1] | ||
914 | vmov @x[7], @x[3] | ||
915 | ___ | ||
916 | &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | ||
917 | &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | ||
918 | } | ||
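The &swapmove/&swapmove2x helpers that bitslice_key() calls above emit veor/vand/vshr triples implementing the classic "swapmove" bit interchange from the bit-slicing literature. In scalar form (a sketch; the width and naming are mine):

#include <stdint.h>

/* Exchange the bits of *b selected by mask with the bits of *a
 * selected by (mask << n): t marks the positions where they
 * differ, then both words are toggled there. */
static inline void swapmove(uint64_t *a, uint64_t *b,
                            unsigned n, uint64_t mask)
{
    uint64_t t = ((*a >> n) ^ *b) & mask;

    *b ^= t;
    *a ^= t << n;
}

Applied with n = 1, 2, 4 and the matching masks ($bs0..$bs2 above), this transposes an 8x8 bit matrix, which is how eight AES states end up with one bit position per NEON register.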
919 | |||
920 | $code.=<<___; | ||
921 | .type _bsaes_key_convert,%function | ||
922 | .align 4 | ||
923 | _bsaes_key_convert: | ||
924 | adr $const,_bsaes_key_convert | ||
925 | vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key | ||
926 | sub $const,$const,#_bsaes_key_convert-.LM0 | ||
927 | vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key | ||
928 | |||
929 | vmov.i8 @XMM[8], #0x01 @ bit masks | ||
930 | vmov.i8 @XMM[9], #0x02 | ||
931 | vmov.i8 @XMM[10], #0x04 | ||
932 | vmov.i8 @XMM[11], #0x08 | ||
933 | vmov.i8 @XMM[12], #0x10 | ||
934 | vmov.i8 @XMM[13], #0x20 | ||
935 | vldmia $const, {@XMM[14]} @ .LM0 | ||
936 | |||
937 | #ifdef __ARMEL__ | ||
938 | vrev32.8 @XMM[7], @XMM[7] | ||
939 | vrev32.8 @XMM[15], @XMM[15] | ||
940 | #endif | ||
941 | sub $rounds,$rounds,#1 | ||
942 | vstmia $out!, {@XMM[7]} @ save round 0 key | ||
943 | b .Lkey_loop | ||
944 | |||
945 | .align 4 | ||
946 | .Lkey_loop: | ||
947 | vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` | ||
948 | vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` | ||
949 | vmov.i8 @XMM[6], #0x40 | ||
950 | vmov.i8 @XMM[15], #0x80 | ||
951 | |||
952 | vtst.8 @XMM[0], @XMM[7], @XMM[8] | ||
953 | vtst.8 @XMM[1], @XMM[7], @XMM[9] | ||
954 | vtst.8 @XMM[2], @XMM[7], @XMM[10] | ||
955 | vtst.8 @XMM[3], @XMM[7], @XMM[11] | ||
956 | vtst.8 @XMM[4], @XMM[7], @XMM[12] | ||
957 | vtst.8 @XMM[5], @XMM[7], @XMM[13] | ||
958 | vtst.8 @XMM[6], @XMM[7], @XMM[6] | ||
959 | vtst.8 @XMM[7], @XMM[7], @XMM[15] | ||
960 | vld1.8 {@XMM[15]}, [$inp]! @ load next round key | ||
961 | vmvn @XMM[0], @XMM[0] @ "pnot" | ||
962 | vmvn @XMM[1], @XMM[1] | ||
963 | vmvn @XMM[5], @XMM[5] | ||
964 | vmvn @XMM[6], @XMM[6] | ||
965 | #ifdef __ARMEL__ | ||
966 | vrev32.8 @XMM[15], @XMM[15] | ||
967 | #endif | ||
968 | subs $rounds,$rounds,#1 | ||
969 | vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key | ||
970 | bne .Lkey_loop | ||
971 | |||
972 | vmov.i8 @XMM[7],#0x63 @ compose .L63 | ||
973 | @ don't save last round key | ||
974 | bx lr | ||
975 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
976 | ___ | ||
977 | } | ||
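_bsaes_key_convert expands each byte of the (vtbl-permuted) round key into eight full-byte masks via vtst.8 against the 0x01..0x80 constants, so that AddRoundKey in the bit-sliced domain is a plain veor per slice; slices 0, 1, 5 and 6 are then complemented with vmvn because 0x63, the AES S-box affine constant, has exactly those bits set, which lets the Sbox code skip that correction. A scalar sketch of the expansion (an assumed helper, not the kernel's interface):

#include <stdint.h>

/* slice[b][i] is 0xff iff bit b of key byte i is set, mirroring
 * the vtst.8 of the key against the 0x01..0x80 mask registers. */
static void expand_round_key(const uint8_t key[16], uint8_t slice[8][16])
{
    for (int b = 0; b < 8; b++)
        for (int i = 0; i < 16; i++)
            slice[b][i] = ((key[i] >> b) & 1) ? 0xff : 0x00;
}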
978 | |||
979 | if (0) { # the following four functions are an unsupported interface | ||
980 | # used only for benchmarking... | ||
981 | $code.=<<___; | ||
982 | .globl bsaes_enc_key_convert | ||
983 | .type bsaes_enc_key_convert,%function | ||
984 | .align 4 | ||
985 | bsaes_enc_key_convert: | ||
986 | stmdb sp!,{r4-r6,lr} | ||
987 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
988 | |||
989 | ldr r5,[$inp,#240] @ pass rounds | ||
990 | mov r4,$inp @ pass key | ||
991 | mov r12,$out @ pass key schedule | ||
992 | bl _bsaes_key_convert | ||
993 | veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key | ||
994 | vstmia r12, {@XMM[7]} @ save last round key | ||
995 | |||
996 | vldmia sp!,{d8-d15} | ||
997 | ldmia sp!,{r4-r6,pc} | ||
998 | .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | ||
999 | |||
1000 | .globl bsaes_encrypt_128 | ||
1001 | .type bsaes_encrypt_128,%function | ||
1002 | .align 4 | ||
1003 | bsaes_encrypt_128: | ||
1004 | stmdb sp!,{r4-r6,lr} | ||
1005 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
1006 | .Lenc128_loop: | ||
1007 | vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input | ||
1008 | vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! | ||
1009 | mov r4,$key @ pass the key | ||
1010 | vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! | ||
1011 | mov r5,#10 @ pass rounds | ||
1012 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! | ||
1013 | |||
1014 | bl _bsaes_encrypt8 | ||
1015 | |||
1016 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1017 | vst1.8 {@XMM[4]}, [$out]! | ||
1018 | vst1.8 {@XMM[6]}, [$out]! | ||
1019 | vst1.8 {@XMM[3]}, [$out]! | ||
1020 | vst1.8 {@XMM[7]}, [$out]! | ||
1021 | vst1.8 {@XMM[2]}, [$out]! | ||
1022 | subs $len,$len,#0x80 | ||
1023 | vst1.8 {@XMM[5]}, [$out]! | ||
1024 | bhi .Lenc128_loop | ||
1025 | |||
1026 | vldmia sp!,{d8-d15} | ||
1027 | ldmia sp!,{r4-r6,pc} | ||
1028 | .size bsaes_encrypt_128,.-bsaes_encrypt_128 | ||
1029 | |||
1030 | .globl bsaes_dec_key_convert | ||
1031 | .type bsaes_dec_key_convert,%function | ||
1032 | .align 4 | ||
1033 | bsaes_dec_key_convert: | ||
1034 | stmdb sp!,{r4-r6,lr} | ||
1035 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
1036 | |||
1037 | ldr r5,[$inp,#240] @ pass rounds | ||
1038 | mov r4,$inp @ pass key | ||
1039 | mov r12,$out @ pass key schedule | ||
1040 | bl _bsaes_key_convert | ||
1041 | vldmia $out, {@XMM[6]} | ||
1042 | vstmia r12, {@XMM[15]} @ save last round key | ||
1043 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
1044 | vstmia $out, {@XMM[7]} | ||
1045 | |||
1046 | vldmia sp!,{d8-d15} | ||
1047 | ldmia sp!,{r4-r6,pc} | ||
1048 | .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | ||
1049 | |||
1050 | .globl bsaes_decrypt_128 | ||
1051 | .type bsaes_decrypt_128,%function | ||
1052 | .align 4 | ||
1053 | bsaes_decrypt_128: | ||
1054 | stmdb sp!,{r4-r6,lr} | ||
1055 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
1056 | .Ldec128_loop: | ||
1057 | vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input | ||
1058 | vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! | ||
1059 | mov r4,$key @ pass the key | ||
1060 | vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! | ||
1061 | mov r5,#10 @ pass rounds | ||
1062 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! | ||
1063 | |||
1064 | bl _bsaes_decrypt8 | ||
1065 | |||
1066 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1067 | vst1.8 {@XMM[6]}, [$out]! | ||
1068 | vst1.8 {@XMM[4]}, [$out]! | ||
1069 | vst1.8 {@XMM[2]}, [$out]! | ||
1070 | vst1.8 {@XMM[7]}, [$out]! | ||
1071 | vst1.8 {@XMM[3]}, [$out]! | ||
1072 | subs $len,$len,#0x80 | ||
1073 | vst1.8 {@XMM[5]}, [$out]! | ||
1074 | bhi .Ldec128_loop | ||
1075 | |||
1076 | vldmia sp!,{d8-d15} | ||
1077 | ldmia sp!,{r4-r6,pc} | ||
1078 | .size bsaes_decrypt_128,.-bsaes_decrypt_128 | ||
1079 | ___ | ||
1080 | } | ||
1081 | { | ||
1082 | my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); | ||
1083 | my ($keysched)=("sp"); | ||
1084 | |||
1085 | $code.=<<___; | ||
1086 | .extern AES_cbc_encrypt | ||
1087 | .extern AES_decrypt | ||
1088 | |||
1089 | .global bsaes_cbc_encrypt | ||
1090 | .type bsaes_cbc_encrypt,%function | ||
1091 | .align 5 | ||
1092 | bsaes_cbc_encrypt: | ||
1093 | #ifndef __KERNEL__ | ||
1094 | cmp $len, #128 | ||
1095 | #ifndef __thumb__ | ||
1096 | blo AES_cbc_encrypt | ||
1097 | #else | ||
1098 | bhs 1f | ||
1099 | b AES_cbc_encrypt | ||
1100 | 1: | ||
1101 | #endif | ||
1102 | #endif | ||
1103 | |||
1104 | @ it is up to the caller to make sure we are called with enc == 0 | ||
1105 | |||
1106 | mov ip, sp | ||
1107 | stmdb sp!, {r4-r10, lr} | ||
1108 | VFP_ABI_PUSH | ||
1109 | ldr $ivp, [ip] @ IV is 1st arg on the stack | ||
1110 | mov $len, $len, lsr#4 @ len in 16 byte blocks | ||
1111 | sub sp, #0x10 @ scratch space to carry over the IV | ||
1112 | mov $fp, sp @ save sp | ||
1113 | |||
1114 | ldr $rounds, [$key, #240] @ get # of rounds | ||
1115 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1116 | @ allocate the key schedule on the stack | ||
1117 | sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key | ||
1118 | add r12, #`128-32` @ size of bit-sliced key schedule | ||
1119 | |||
1120 | @ populate the key schedule | ||
1121 | mov r4, $key @ pass key | ||
1122 | mov r5, $rounds @ pass # of rounds | ||
1123 | mov sp, r12 @ sp is $keysched | ||
1124 | bl _bsaes_key_convert | ||
1125 | vldmia $keysched, {@XMM[6]} | ||
1126 | vstmia r12, {@XMM[15]} @ save last round key | ||
1127 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
1128 | vstmia $keysched, {@XMM[7]} | ||
1129 | #else | ||
1130 | ldr r12, [$key, #244] | ||
1131 | eors r12, #1 | ||
1132 | beq 0f | ||
1133 | |||
1134 | @ populate the key schedule | ||
1135 | str r12, [$key, #244] | ||
1136 | mov r4, $key @ pass key | ||
1137 | mov r5, $rounds @ pass # of rounds | ||
1138 | add r12, $key, #248 @ pass key schedule | ||
1139 | bl _bsaes_key_convert | ||
1140 | add r4, $key, #248 | ||
1141 | vldmia r4, {@XMM[6]} | ||
1142 | vstmia r12, {@XMM[15]} @ save last round key | ||
1143 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
1144 | vstmia r4, {@XMM[7]} | ||
1145 | |||
1146 | .align 2 | ||
1147 | 0: | ||
1148 | #endif | ||
1149 | |||
1150 | vld1.8 {@XMM[15]}, [$ivp] @ load IV | ||
1151 | b .Lcbc_dec_loop | ||
1152 | |||
1153 | .align 4 | ||
1154 | .Lcbc_dec_loop: | ||
1155 | subs $len, $len, #0x8 | ||
1156 | bmi .Lcbc_dec_loop_finish | ||
1157 | |||
1158 | vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input | ||
1159 | vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! | ||
1160 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1161 | mov r4, $keysched @ pass the key | ||
1162 | #else | ||
1163 | add r4, $key, #248 | ||
1164 | #endif | ||
1165 | vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! | ||
1166 | mov r5, $rounds | ||
1167 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp] | ||
1168 | sub $inp, $inp, #0x60 | ||
1169 | vstmia $fp, {@XMM[15]} @ put aside IV | ||
1170 | |||
1171 | bl _bsaes_decrypt8 | ||
1172 | |||
1173 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1174 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1175 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1176 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1177 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1178 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1179 | vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! | ||
1180 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1181 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1182 | vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! | ||
1183 | veor @XMM[7], @XMM[7], @XMM[12] | ||
1184 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1185 | veor @XMM[3], @XMM[3], @XMM[13] | ||
1186 | vst1.8 {@XMM[6]}, [$out]! | ||
1187 | veor @XMM[5], @XMM[5], @XMM[14] | ||
1188 | vst1.8 {@XMM[4]}, [$out]! | ||
1189 | vst1.8 {@XMM[2]}, [$out]! | ||
1190 | vst1.8 {@XMM[7]}, [$out]! | ||
1191 | vst1.8 {@XMM[3]}, [$out]! | ||
1192 | vst1.8 {@XMM[5]}, [$out]! | ||
1193 | |||
1194 | b .Lcbc_dec_loop | ||
1195 | |||
1196 | .Lcbc_dec_loop_finish: | ||
1197 | adds $len, $len, #8 | ||
1198 | beq .Lcbc_dec_done | ||
1199 | |||
1200 | vld1.8 {@XMM[0]}, [$inp]! @ load input | ||
1201 | cmp $len, #2 | ||
1202 | blo .Lcbc_dec_one | ||
1203 | vld1.8 {@XMM[1]}, [$inp]! | ||
1204 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1205 | mov r4, $keysched @ pass the key | ||
1206 | #else | ||
1207 | add r4, $key, #248 | ||
1208 | #endif | ||
1209 | mov r5, $rounds | ||
1210 | vstmia $fp, {@XMM[15]} @ put aside IV | ||
1211 | beq .Lcbc_dec_two | ||
1212 | vld1.8 {@XMM[2]}, [$inp]! | ||
1213 | cmp $len, #4 | ||
1214 | blo .Lcbc_dec_three | ||
1215 | vld1.8 {@XMM[3]}, [$inp]! | ||
1216 | beq .Lcbc_dec_four | ||
1217 | vld1.8 {@XMM[4]}, [$inp]! | ||
1218 | cmp $len, #6 | ||
1219 | blo .Lcbc_dec_five | ||
1220 | vld1.8 {@XMM[5]}, [$inp]! | ||
1221 | beq .Lcbc_dec_six | ||
1222 | vld1.8 {@XMM[6]}, [$inp]! | ||
1223 | sub $inp, $inp, #0x70 | ||
1224 | |||
1225 | bl _bsaes_decrypt8 | ||
1226 | |||
1227 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1228 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1229 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1230 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1231 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1232 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1233 | vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! | ||
1234 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1235 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1236 | vld1.8 {@XMM[15]}, [$inp]! | ||
1237 | veor @XMM[7], @XMM[7], @XMM[12] | ||
1238 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1239 | veor @XMM[3], @XMM[3], @XMM[13] | ||
1240 | vst1.8 {@XMM[6]}, [$out]! | ||
1241 | vst1.8 {@XMM[4]}, [$out]! | ||
1242 | vst1.8 {@XMM[2]}, [$out]! | ||
1243 | vst1.8 {@XMM[7]}, [$out]! | ||
1244 | vst1.8 {@XMM[3]}, [$out]! | ||
1245 | b .Lcbc_dec_done | ||
1246 | .align 4 | ||
1247 | .Lcbc_dec_six: | ||
1248 | sub $inp, $inp, #0x60 | ||
1249 | bl _bsaes_decrypt8 | ||
1250 | vldmia $fp,{@XMM[14]} @ reload IV | ||
1251 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1252 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1253 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1254 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1255 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1256 | vld1.8 {@XMM[12]}, [$inp]! | ||
1257 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1258 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1259 | vld1.8 {@XMM[15]}, [$inp]! | ||
1260 | veor @XMM[7], @XMM[7], @XMM[12] | ||
1261 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1262 | vst1.8 {@XMM[6]}, [$out]! | ||
1263 | vst1.8 {@XMM[4]}, [$out]! | ||
1264 | vst1.8 {@XMM[2]}, [$out]! | ||
1265 | vst1.8 {@XMM[7]}, [$out]! | ||
1266 | b .Lcbc_dec_done | ||
1267 | .align 4 | ||
1268 | .Lcbc_dec_five: | ||
1269 | sub $inp, $inp, #0x50 | ||
1270 | bl _bsaes_decrypt8 | ||
1271 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1272 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1273 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1274 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1275 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1276 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1277 | vld1.8 {@XMM[15]}, [$inp]! | ||
1278 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1279 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1280 | veor @XMM[2], @XMM[2], @XMM[11] | ||
1281 | vst1.8 {@XMM[6]}, [$out]! | ||
1282 | vst1.8 {@XMM[4]}, [$out]! | ||
1283 | vst1.8 {@XMM[2]}, [$out]! | ||
1284 | b .Lcbc_dec_done | ||
1285 | .align 4 | ||
1286 | .Lcbc_dec_four: | ||
1287 | sub $inp, $inp, #0x40 | ||
1288 | bl _bsaes_decrypt8 | ||
1289 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1290 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1291 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1292 | vld1.8 {@XMM[10]}, [$inp]! | ||
1293 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1294 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1295 | vld1.8 {@XMM[15]}, [$inp]! | ||
1296 | veor @XMM[4], @XMM[4], @XMM[10] | ||
1297 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1298 | vst1.8 {@XMM[6]}, [$out]! | ||
1299 | vst1.8 {@XMM[4]}, [$out]! | ||
1300 | b .Lcbc_dec_done | ||
1301 | .align 4 | ||
1302 | .Lcbc_dec_three: | ||
1303 | sub $inp, $inp, #0x30 | ||
1304 | bl _bsaes_decrypt8 | ||
1305 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1306 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input | ||
1307 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1308 | vld1.8 {@XMM[15]}, [$inp]! | ||
1309 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1310 | veor @XMM[6], @XMM[6], @XMM[9] | ||
1311 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1312 | vst1.8 {@XMM[6]}, [$out]! | ||
1313 | b .Lcbc_dec_done | ||
1314 | .align 4 | ||
1315 | .Lcbc_dec_two: | ||
1316 | sub $inp, $inp, #0x20 | ||
1317 | bl _bsaes_decrypt8 | ||
1318 | vldmia $fp, {@XMM[14]} @ reload IV | ||
1319 | vld1.8 {@XMM[8]}, [$inp]! @ reload input | ||
1320 | veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV | ||
1321 | vld1.8 {@XMM[15]}, [$inp]! @ reload input | ||
1322 | veor @XMM[1], @XMM[1], @XMM[8] | ||
1323 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1324 | b .Lcbc_dec_done | ||
1325 | .align 4 | ||
1326 | .Lcbc_dec_one: | ||
1327 | sub $inp, $inp, #0x10 | ||
1328 | mov $rounds, $out @ save original out pointer | ||
1329 | mov $out, $fp @ use the iv scratch space as out buffer | ||
1330 | mov r2, $key | ||
1331 | vmov @XMM[4],@XMM[15] @ just in case ensure that IV | ||
1332 | vmov @XMM[5],@XMM[0] @ and input are preserved | ||
1333 | bl AES_decrypt | ||
1334 | vld1.8 {@XMM[0]}, [$fp,:64] @ load result | ||
1335 | veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV | ||
1336 | vmov @XMM[15], @XMM[5] @ @XMM[5] holds input | ||
1337 | vst1.8 {@XMM[0]}, [$rounds] @ write output | ||
1338 | |||
1339 | .Lcbc_dec_done: | ||
1340 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1341 | vmov.i32 q0, #0 | ||
1342 | vmov.i32 q1, #0 | ||
1343 | .Lcbc_dec_bzero: @ wipe key schedule [if any] | ||
1344 | vstmia $keysched!, {q0-q1} | ||
1345 | cmp $keysched, $fp | ||
1346 | bne .Lcbc_dec_bzero | ||
1347 | #endif | ||
1348 | |||
1349 | mov sp, $fp | ||
1350 | add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb | ||
1351 | vst1.8 {@XMM[15]}, [$ivp] @ return IV | ||
1352 | VFP_ABI_POP | ||
1353 | ldmia sp!, {r4-r10, pc} | ||
1354 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
1355 | ___ | ||
1356 | } | ||
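bsaes_cbc_encrypt above implements only the decrypt direction (hence the enc == 0 remark): it batches eight blocks through _bsaes_decrypt8, XORs each plaintext with the preceding ciphertext, and stores the final ciphertext block back as the next IV. A scalar sketch of that dataflow, with block_decrypt() standing in for the NEON core or the AES_decrypt fallback:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* CBC decryption: P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV, and
 * the last ciphertext block handed back as the new IV. Sketch
 * only; the real code does eight D() calls at once. */
static void cbc_decrypt(const uint8_t *in, uint8_t *out, size_t nblocks,
                        uint8_t iv[16],
                        void (*block_decrypt)(const uint8_t in[16],
                                              uint8_t out[16]))
{
    uint8_t prev[16], cur[16];

    memcpy(prev, iv, 16);
    for (size_t n = 0; n < nblocks; n++, in += 16, out += 16) {
        memcpy(cur, in, 16);            /* keep C[i] for the chain */
        block_decrypt(cur, out);
        for (int i = 0; i < 16; i++)
            out[i] ^= prev[i];          /* ^= C[i-1] (or the IV) */
        memcpy(prev, cur, 16);
    }
    memcpy(iv, prev, 16);               /* IV carry-over, as the asm
                                           returns @XMM[15] via $ivp */
}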
1357 | { | ||
1358 | my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); | ||
1359 | my $const = "r6"; # shared with _bsaes_encrypt8_alt | ||
1360 | my $keysched = "sp"; | ||
1361 | |||
1362 | $code.=<<___; | ||
1363 | .extern AES_encrypt | ||
1364 | .global bsaes_ctr32_encrypt_blocks | ||
1365 | .type bsaes_ctr32_encrypt_blocks,%function | ||
1366 | .align 5 | ||
1367 | bsaes_ctr32_encrypt_blocks: | ||
1368 | cmp $len, #8 @ use plain AES for | ||
1369 | blo .Lctr_enc_short @ small sizes | ||
1370 | |||
1371 | mov ip, sp | ||
1372 | stmdb sp!, {r4-r10, lr} | ||
1373 | VFP_ABI_PUSH | ||
1374 | ldr $ctr, [ip] @ ctr is 1st arg on the stack | ||
1375 | sub sp, sp, #0x10 @ scratch space to carry over the ctr | ||
1376 | mov $fp, sp @ save sp | ||
1377 | |||
1378 | ldr $rounds, [$key, #240] @ get # of rounds | ||
1379 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1380 | @ allocate the key schedule on the stack | ||
1381 | sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key | ||
1382 | add r12, #`128-32` @ size of bit-sliced key schedule | ||
1383 | |||
1384 | @ populate the key schedule | ||
1385 | mov r4, $key @ pass key | ||
1386 | mov r5, $rounds @ pass # of rounds | ||
1387 | mov sp, r12 @ sp is $keysched | ||
1388 | bl _bsaes_key_convert | ||
1389 | veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key | ||
1390 | vstmia r12, {@XMM[7]} @ save last round key | ||
1391 | |||
1392 | vld1.8 {@XMM[0]}, [$ctr] @ load counter | ||
1393 | add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr | ||
1394 | vldmia $keysched, {@XMM[4]} @ load round0 key | ||
1395 | #else | ||
1396 | ldr r12, [$key, #244] | ||
1397 | eors r12, #1 | ||
1398 | beq 0f | ||
1399 | |||
1400 | @ populate the key schedule | ||
1401 | str r12, [$key, #244] | ||
1402 | mov r4, $key @ pass key | ||
1403 | mov r5, $rounds @ pass # of rounds | ||
1404 | add r12, $key, #248 @ pass key schedule | ||
1405 | bl _bsaes_key_convert | ||
1406 | veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key | ||
1407 | vstmia r12, {@XMM[7]} @ save last round key | ||
1408 | |||
1409 | .align 2 | ||
1410 | 0: add r12, $key, #248 | ||
1411 | vld1.8 {@XMM[0]}, [$ctr] @ load counter | ||
1412 | adrl $ctr, .LREVM0SR @ borrow $ctr | ||
1413 | vldmia r12, {@XMM[4]} @ load round0 key | ||
1414 | sub sp, #0x10 @ place for adjusted round0 key | ||
1415 | #endif | ||
1416 | |||
1417 | vmov.i32 @XMM[8],#1 @ compose 1<<96 | ||
1418 | veor @XMM[9],@XMM[9],@XMM[9] | ||
1419 | vrev32.8 @XMM[0],@XMM[0] | ||
1420 | vext.8 @XMM[8],@XMM[9],@XMM[8],#4 | ||
1421 | vrev32.8 @XMM[4],@XMM[4] | ||
1422 | vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 | ||
1423 | vstmia $keysched, {@XMM[4]} @ save adjusted round0 key | ||
1424 | b .Lctr_enc_loop | ||
1425 | |||
1426 | .align 4 | ||
1427 | .Lctr_enc_loop: | ||
1428 | vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96 | ||
1429 | vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 | ||
1430 | vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 | ||
1431 | vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 | ||
1432 | vadd.u32 @XMM[4], @XMM[1], @XMM[10] | ||
1433 | vadd.u32 @XMM[5], @XMM[2], @XMM[10] | ||
1434 | vadd.u32 @XMM[6], @XMM[3], @XMM[10] | ||
1435 | vadd.u32 @XMM[7], @XMM[4], @XMM[10] | ||
1436 | vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter | ||
1437 | |||
1438 | @ Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
1439 | @ to flip byte order in 32-bit counter | ||
1440 | |||
1441 | vldmia $keysched, {@XMM[9]} @ load round0 key | ||
1442 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1443 | add r4, $keysched, #0x10 @ pass next round key | ||
1444 | #else | ||
1445 | add r4, $key, #`248+16` | ||
1446 | #endif | ||
1447 | vldmia $ctr, {@XMM[8]} @ .LREVM0SR | ||
1448 | mov r5, $rounds @ pass rounds | ||
1449 | vstmia $fp, {@XMM[10]} @ save next counter | ||
1450 | sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants | ||
1451 | |||
1452 | bl _bsaes_encrypt8_alt | ||
1453 | |||
1454 | subs $len, $len, #8 | ||
1455 | blo .Lctr_enc_loop_done | ||
1456 | |||
1457 | vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input | ||
1458 | vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! | ||
1459 | veor @XMM[0], @XMM[8] | ||
1460 | veor @XMM[1], @XMM[9] | ||
1461 | vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! | ||
1462 | veor @XMM[4], @XMM[10] | ||
1463 | veor @XMM[6], @XMM[11] | ||
1464 | vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! | ||
1465 | veor @XMM[3], @XMM[12] | ||
1466 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output | ||
1467 | veor @XMM[7], @XMM[13] | ||
1468 | veor @XMM[2], @XMM[14] | ||
1469 | vst1.8 {@XMM[4]}, [$out]! | ||
1470 | veor @XMM[5], @XMM[15] | ||
1471 | vst1.8 {@XMM[6]}, [$out]! | ||
1472 | vmov.i32 @XMM[8], #1 @ compose 1<<96 | ||
1473 | vst1.8 {@XMM[3]}, [$out]! | ||
1474 | veor @XMM[9], @XMM[9], @XMM[9] | ||
1475 | vst1.8 {@XMM[7]}, [$out]! | ||
1476 | vext.8 @XMM[8], @XMM[9], @XMM[8], #4 | ||
1477 | vst1.8 {@XMM[2]}, [$out]! | ||
1478 | vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 | ||
1479 | vst1.8 {@XMM[5]}, [$out]! | ||
1480 | vldmia $fp, {@XMM[0]} @ load counter | ||
1481 | |||
1482 | bne .Lctr_enc_loop | ||
1483 | b .Lctr_enc_done | ||
1484 | |||
1485 | .align 4 | ||
1486 | .Lctr_enc_loop_done: | ||
1487 | add $len, $len, #8 | ||
1488 | vld1.8 {@XMM[8]}, [$inp]! @ load input | ||
1489 | veor @XMM[0], @XMM[8] | ||
1490 | vst1.8 {@XMM[0]}, [$out]! @ write output | ||
1491 | cmp $len, #2 | ||
1492 | blo .Lctr_enc_done | ||
1493 | vld1.8 {@XMM[9]}, [$inp]! | ||
1494 | veor @XMM[1], @XMM[9] | ||
1495 | vst1.8 {@XMM[1]}, [$out]! | ||
1496 | beq .Lctr_enc_done | ||
1497 | vld1.8 {@XMM[10]}, [$inp]! | ||
1498 | veor @XMM[4], @XMM[10] | ||
1499 | vst1.8 {@XMM[4]}, [$out]! | ||
1500 | cmp $len, #4 | ||
1501 | blo .Lctr_enc_done | ||
1502 | vld1.8 {@XMM[11]}, [$inp]! | ||
1503 | veor @XMM[6], @XMM[11] | ||
1504 | vst1.8 {@XMM[6]}, [$out]! | ||
1505 | beq .Lctr_enc_done | ||
1506 | vld1.8 {@XMM[12]}, [$inp]! | ||
1507 | veor @XMM[3], @XMM[12] | ||
1508 | vst1.8 {@XMM[3]}, [$out]! | ||
1509 | cmp $len, #6 | ||
1510 | blo .Lctr_enc_done | ||
1511 | vld1.8 {@XMM[13]}, [$inp]! | ||
1512 | veor @XMM[7], @XMM[13] | ||
1513 | vst1.8 {@XMM[7]}, [$out]! | ||
1514 | beq .Lctr_enc_done | ||
1515 | vld1.8 {@XMM[14]}, [$inp] | ||
1516 | veor @XMM[2], @XMM[14] | ||
1517 | vst1.8 {@XMM[2]}, [$out]! | ||
1518 | |||
1519 | .Lctr_enc_done: | ||
1520 | vmov.i32 q0, #0 | ||
1521 | vmov.i32 q1, #0 | ||
1522 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1523 | .Lctr_enc_bzero: @ wipe key schedule [if any] | ||
1524 | vstmia $keysched!, {q0-q1} | ||
1525 | cmp $keysched, $fp | ||
1526 | bne .Lctr_enc_bzero | ||
1527 | #else | ||
1528 | vstmia $keysched, {q0-q1} | ||
1529 | #endif | ||
1530 | |||
1531 | mov sp, $fp | ||
1532 | add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb | ||
1533 | VFP_ABI_POP | ||
1534 | ldmia sp!, {r4-r10, pc} @ return | ||
1535 | |||
1536 | .align 4 | ||
1537 | .Lctr_enc_short: | ||
1538 | ldr ip, [sp] @ ctr pointer is passed on stack | ||
1539 | stmdb sp!, {r4-r8, lr} | ||
1540 | |||
1541 | mov r4, $inp @ copy arguments | ||
1542 | mov r5, $out | ||
1543 | mov r6, $len | ||
1544 | mov r7, $key | ||
1545 | ldr r8, [ip, #12] @ load counter LSW | ||
1546 | vld1.8 {@XMM[1]}, [ip] @ load whole counter value | ||
1547 | #ifdef __ARMEL__ | ||
1548 | rev r8, r8 | ||
1549 | #endif | ||
1550 | sub sp, sp, #0x10 | ||
1551 | vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value | ||
1552 | sub sp, sp, #0x10 | ||
1553 | |||
1554 | .Lctr_enc_short_loop: | ||
1555 | add r0, sp, #0x10 @ input counter value | ||
1556 | mov r1, sp @ output on the stack | ||
1557 | mov r2, r7 @ key | ||
1558 | |||
1559 | bl AES_encrypt | ||
1560 | |||
1561 | vld1.8 {@XMM[0]}, [r4]! @ load input | ||
1562 | vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter | ||
1563 | add r8, r8, #1 | ||
1564 | #ifdef __ARMEL__ | ||
1565 | rev r0, r8 | ||
1566 | str r0, [sp, #0x1c] @ next counter value | ||
1567 | #else | ||
1568 | str r8, [sp, #0x1c] @ next counter value | ||
1569 | #endif | ||
1570 | veor @XMM[0],@XMM[0],@XMM[1] | ||
1571 | vst1.8 {@XMM[0]}, [r5]! @ store output | ||
1572 | subs r6, r6, #1 | ||
1573 | bne .Lctr_enc_short_loop | ||
1574 | |||
1575 | vmov.i32 q0, #0 | ||
1576 | vmov.i32 q1, #0 | ||
1577 | vstmia sp!, {q0-q1} | ||
1578 | |||
1579 | ldmia sp!, {r4-r8, pc} | ||
1580 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
1581 | ___ | ||
1582 | } | ||
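bsaes_ctr32_encrypt_blocks keeps the counter block byte-reversed (vrev32.8) so that vadd.u32 with the composed "1<<96" vector increments just the low 32 bits; that is the ctr32 convention: only the last four bytes of the counter are a big-endian integer, and no carry ever propagates into the nonce portion. A scalar sketch of one increment (helper name is mine):

#include <stdint.h>

/* Bump the big-endian 32-bit counter in bytes 12..15 of the block;
 * bytes 0..11 (the nonce/IV part) are deliberately left alone. */
static void ctr32_inc(uint8_t ctr[16])
{
    uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
                 ((uint32_t)ctr[14] <<  8) |  (uint32_t)ctr[15];

    c++;
    ctr[12] = (uint8_t)(c >> 24);
    ctr[13] = (uint8_t)(c >> 16);
    ctr[14] = (uint8_t)(c >>  8);
    ctr[15] = (uint8_t)c;
}

The .Lctr_enc_short path does the same thing one block at a time around AES_encrypt, keeping the counter LSW in r8 and byte-swapping it with rev on little-endian.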
1583 | { | ||
1584 | ###################################################################### | ||
1585 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
1586 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1587 | # const unsigned char iv[16]); | ||
1588 | # | ||
1589 | my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3))); | ||
1590 | my $const="r6"; # returned by _bsaes_key_convert | ||
1591 | my $twmask=@XMM[5]; | ||
1592 | my @T=@XMM[6..7]; | ||
1593 | |||
1594 | $code.=<<___; | ||
1595 | .globl bsaes_xts_encrypt | ||
1596 | .type bsaes_xts_encrypt,%function | ||
1597 | .align 4 | ||
1598 | bsaes_xts_encrypt: | ||
1599 | mov ip, sp | ||
1600 | stmdb sp!, {r4-r10, lr} @ 0x20 | ||
1601 | VFP_ABI_PUSH | ||
1602 | mov r6, sp @ future $fp | ||
1603 | |||
1604 | mov $inp, r0 | ||
1605 | mov $out, r1 | ||
1606 | mov $len, r2 | ||
1607 | mov $key, r3 | ||
1608 | |||
1609 | sub r0, sp, #0x10 @ 0x10 | ||
1610 | bic r0, #0xf @ align at 16 bytes | ||
1611 | mov sp, r0 | ||
1612 | |||
1613 | #ifdef XTS_CHAIN_TWEAK | ||
1614 | ldr r0, [ip] @ pointer to input tweak | ||
1615 | #else | ||
1616 | @ generate initial tweak | ||
1617 | ldr r0, [ip, #4] @ iv[] | ||
1618 | mov r1, sp | ||
1619 | ldr r2, [ip, #0] @ key2 | ||
1620 | bl AES_encrypt | ||
1621 | mov r0,sp @ pointer to initial tweak | ||
1622 | #endif | ||
1623 | |||
1624 | ldr $rounds, [$key, #240] @ get # of rounds | ||
1625 | mov $fp, r6 | ||
1626 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1627 | @ allocate the key schedule on the stack | ||
1628 | sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key | ||
1629 | @ add r12, #`128-32` @ size of bit-sliced key schedule | ||
1630 | sub r12, #`32+16` @ place for tweak[9] | ||
1631 | |||
1632 | @ populate the key schedule | ||
1633 | mov r4, $key @ pass key | ||
1634 | mov r5, $rounds @ pass # of rounds | ||
1635 | mov sp, r12 | ||
1636 | add r12, #0x90 @ pass key schedule | ||
1637 | bl _bsaes_key_convert | ||
1638 | veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key | ||
1639 | vstmia r12, {@XMM[7]} @ save last round key | ||
1640 | #else | ||
1641 | ldr r12, [$key, #244] | ||
1642 | eors r12, #1 | ||
1643 | beq 0f | ||
1644 | |||
1645 | str r12, [$key, #244] | ||
1646 | mov r4, $key @ pass key | ||
1647 | mov r5, $rounds @ pass # of rounds | ||
1648 | add r12, $key, #248 @ pass key schedule | ||
1649 | bl _bsaes_key_convert | ||
1650 | veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key | ||
1651 | vstmia r12, {@XMM[7]} | ||
1652 | |||
1653 | .align 2 | ||
1654 | 0: sub sp, #0x90 @ place for tweak[9] | ||
1655 | #endif | ||
1656 | |||
1657 | vld1.8 {@XMM[8]}, [r0] @ initial tweak | ||
1658 | adr $magic, .Lxts_magic | ||
1659 | |||
1660 | subs $len, #0x80 | ||
1661 | blo .Lxts_enc_short | ||
1662 | b .Lxts_enc_loop | ||
1663 | |||
1664 | .align 4 | ||
1665 | .Lxts_enc_loop: | ||
1666 | vldmia $magic, {$twmask} @ load XTS magic | ||
1667 | vshr.s64 @T[0], @XMM[8], #63 | ||
1668 | mov r0, sp | ||
1669 | vand @T[0], @T[0], $twmask | ||
1670 | ___ | ||
1671 | for($i=9;$i<16;$i++) { | ||
1672 | $code.=<<___; | ||
1673 | vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] | ||
1674 | vst1.64 {@XMM[$i-1]}, [r0,:128]! | ||
1675 | vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` | ||
1676 | vshr.s64 @T[1], @XMM[$i], #63 | ||
1677 | veor @XMM[$i], @XMM[$i], @T[0] | ||
1678 | vand @T[1], @T[1], $twmask | ||
1679 | ___ | ||
1680 | @T=reverse(@T); | ||
1681 | |||
1682 | $code.=<<___ if ($i>=10); | ||
1683 | vld1.8 {@XMM[$i-10]}, [$inp]! | ||
1684 | ___ | ||
1685 | $code.=<<___ if ($i>=11); | ||
1686 | veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] | ||
1687 | ___ | ||
1688 | } | ||
1689 | $code.=<<___; | ||
1690 | vadd.u64 @XMM[8], @XMM[15], @XMM[15] | ||
1691 | vst1.64 {@XMM[15]}, [r0,:128]! | ||
1692 | vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` | ||
1693 | veor @XMM[8], @XMM[8], @T[0] | ||
1694 | vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1695 | |||
1696 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! | ||
1697 | veor @XMM[5], @XMM[5], @XMM[13] | ||
1698 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1699 | add r4, sp, #0x90 @ pass key schedule | ||
1700 | #else | ||
1701 | add r4, $key, #248 @ pass key schedule | ||
1702 | #endif | ||
1703 | veor @XMM[6], @XMM[6], @XMM[14] | ||
1704 | mov r5, $rounds @ pass rounds | ||
1705 | veor @XMM[7], @XMM[7], @XMM[15] | ||
1706 | mov r0, sp | ||
1707 | |||
1708 | bl _bsaes_encrypt8 | ||
1709 | |||
1710 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
1711 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
1712 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1713 | vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! | ||
1714 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1715 | veor @XMM[8], @XMM[4], @XMM[10] | ||
1716 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1717 | veor @XMM[9], @XMM[6], @XMM[11] | ||
1718 | vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! | ||
1719 | veor @XMM[10], @XMM[3], @XMM[12] | ||
1720 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
1721 | veor @XMM[11], @XMM[7], @XMM[13] | ||
1722 | veor @XMM[12], @XMM[2], @XMM[14] | ||
1723 | vst1.8 {@XMM[10]-@XMM[11]}, [$out]! | ||
1724 | veor @XMM[13], @XMM[5], @XMM[15] | ||
1725 | vst1.8 {@XMM[12]-@XMM[13]}, [$out]! | ||
1726 | |||
1727 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1728 | |||
1729 | subs $len, #0x80 | ||
1730 | bpl .Lxts_enc_loop | ||
1731 | |||
1732 | .Lxts_enc_short: | ||
1733 | adds $len, #0x70 | ||
1734 | bmi .Lxts_enc_done | ||
1735 | |||
1736 | vldmia $magic, {$twmask} @ load XTS magic | ||
1737 | vshr.s64 @T[0], @XMM[8], #63 | ||
1738 | mov r0, sp | ||
1739 | vand @T[0], @T[0], $twmask | ||
1740 | ___ | ||
1741 | for($i=9;$i<16;$i++) { | ||
1742 | $code.=<<___; | ||
1743 | vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] | ||
1744 | vst1.64 {@XMM[$i-1]}, [r0,:128]! | ||
1745 | vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` | ||
1746 | vshr.s64 @T[1], @XMM[$i], #63 | ||
1747 | veor @XMM[$i], @XMM[$i], @T[0] | ||
1748 | vand @T[1], @T[1], $twmask | ||
1749 | ___ | ||
1750 | @T=reverse(@T); | ||
1751 | |||
1752 | $code.=<<___ if ($i>=10); | ||
1753 | vld1.8 {@XMM[$i-10]}, [$inp]! | ||
1754 | subs $len, #0x10 | ||
1755 | bmi .Lxts_enc_`$i-9` | ||
1756 | ___ | ||
1757 | $code.=<<___ if ($i>=11); | ||
1758 | veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] | ||
1759 | ___ | ||
1760 | } | ||
1761 | $code.=<<___; | ||
1762 | sub $len, #0x10 | ||
1763 | vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak | ||
1764 | |||
1765 | vld1.8 {@XMM[6]}, [$inp]! | ||
1766 | veor @XMM[5], @XMM[5], @XMM[13] | ||
1767 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1768 | add r4, sp, #0x90 @ pass key schedule | ||
1769 | #else | ||
1770 | add r4, $key, #248 @ pass key schedule | ||
1771 | #endif | ||
1772 | veor @XMM[6], @XMM[6], @XMM[14] | ||
1773 | mov r5, $rounds @ pass rounds | ||
1774 | mov r0, sp | ||
1775 | |||
1776 | bl _bsaes_encrypt8 | ||
1777 | |||
1778 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
1779 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
1780 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1781 | vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! | ||
1782 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1783 | veor @XMM[8], @XMM[4], @XMM[10] | ||
1784 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1785 | veor @XMM[9], @XMM[6], @XMM[11] | ||
1786 | vld1.64 {@XMM[14]}, [r0,:128]! | ||
1787 | veor @XMM[10], @XMM[3], @XMM[12] | ||
1788 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
1789 | veor @XMM[11], @XMM[7], @XMM[13] | ||
1790 | veor @XMM[12], @XMM[2], @XMM[14] | ||
1791 | vst1.8 {@XMM[10]-@XMM[11]}, [$out]! | ||
1792 | vst1.8 {@XMM[12]}, [$out]! | ||
1793 | |||
1794 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1795 | b .Lxts_enc_done | ||
1796 | .align 4 | ||
1797 | .Lxts_enc_6: | ||
1798 | vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak | ||
1799 | |||
1800 | veor @XMM[4], @XMM[4], @XMM[12] | ||
1801 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1802 | add r4, sp, #0x90 @ pass key schedule | ||
1803 | #else | ||
1804 | add r4, $key, #248 @ pass key schedule | ||
1805 | #endif | ||
1806 | veor @XMM[5], @XMM[5], @XMM[13] | ||
1807 | mov r5, $rounds @ pass rounds | ||
1808 | mov r0, sp | ||
1809 | |||
1810 | bl _bsaes_encrypt8 | ||
1811 | |||
1812 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
1813 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
1814 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1815 | vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! | ||
1816 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1817 | veor @XMM[8], @XMM[4], @XMM[10] | ||
1818 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1819 | veor @XMM[9], @XMM[6], @XMM[11] | ||
1820 | veor @XMM[10], @XMM[3], @XMM[12] | ||
1821 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
1822 | veor @XMM[11], @XMM[7], @XMM[13] | ||
1823 | vst1.8 {@XMM[10]-@XMM[11]}, [$out]! | ||
1824 | |||
1825 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1826 | b .Lxts_enc_done | ||
1827 | |||
1828 | @ put this in range for both ARM and Thumb mode adr instructions | ||
1829 | .align 5 | ||
1830 | .Lxts_magic: | ||
1831 | .quad 1, 0x87 | ||
1832 | |||
1833 | .align 5 | ||
1834 | .Lxts_enc_5: | ||
1835 | vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak | ||
1836 | |||
1837 | veor @XMM[3], @XMM[3], @XMM[11] | ||
1838 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1839 | add r4, sp, #0x90 @ pass key schedule | ||
1840 | #else | ||
1841 | add r4, $key, #248 @ pass key schedule | ||
1842 | #endif | ||
1843 | veor @XMM[4], @XMM[4], @XMM[12] | ||
1844 | mov r5, $rounds @ pass rounds | ||
1845 | mov r0, sp | ||
1846 | |||
1847 | bl _bsaes_encrypt8 | ||
1848 | |||
1849 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
1850 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
1851 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1852 | vld1.64 {@XMM[12]}, [r0,:128]! | ||
1853 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1854 | veor @XMM[8], @XMM[4], @XMM[10] | ||
1855 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1856 | veor @XMM[9], @XMM[6], @XMM[11] | ||
1857 | veor @XMM[10], @XMM[3], @XMM[12] | ||
1858 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
1859 | vst1.8 {@XMM[10]}, [$out]! | ||
1860 | |||
1861 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1862 | b .Lxts_enc_done | ||
1863 | .align 4 | ||
1864 | .Lxts_enc_4: | ||
1865 | vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak | ||
1866 | |||
1867 | veor @XMM[2], @XMM[2], @XMM[10] | ||
1868 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1869 | add r4, sp, #0x90 @ pass key schedule | ||
1870 | #else | ||
1871 | add r4, $key, #248 @ pass key schedule | ||
1872 | #endif | ||
1873 | veor @XMM[3], @XMM[3], @XMM[11] | ||
1874 | mov r5, $rounds @ pass rounds | ||
1875 | mov r0, sp | ||
1876 | |||
1877 | bl _bsaes_encrypt8 | ||
1878 | |||
1879 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
1880 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
1881 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1882 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1883 | veor @XMM[8], @XMM[4], @XMM[10] | ||
1884 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1885 | veor @XMM[9], @XMM[6], @XMM[11] | ||
1886 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
1887 | |||
1888 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1889 | b .Lxts_enc_done | ||
1890 | .align 4 | ||
1891 | .Lxts_enc_3: | ||
1892 | vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak | ||
1893 | |||
1894 | veor @XMM[1], @XMM[1], @XMM[9] | ||
1895 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1896 | add r4, sp, #0x90 @ pass key schedule | ||
1897 | #else | ||
1898 | add r4, $key, #248 @ pass key schedule | ||
1899 | #endif | ||
1900 | veor @XMM[2], @XMM[2], @XMM[10] | ||
1901 | mov r5, $rounds @ pass rounds | ||
1902 | mov r0, sp | ||
1903 | |||
1904 | bl _bsaes_encrypt8 | ||
1905 | |||
1906 | vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! | ||
1907 | vld1.64 {@XMM[10]}, [r0,:128]! | ||
1908 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1909 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1910 | veor @XMM[8], @XMM[4], @XMM[10] | ||
1911 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1912 | vst1.8 {@XMM[8]}, [$out]! | ||
1913 | |||
1914 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1915 | b .Lxts_enc_done | ||
1916 | .align 4 | ||
1917 | .Lxts_enc_2: | ||
1918 | vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak | ||
1919 | |||
1920 | veor @XMM[0], @XMM[0], @XMM[8] | ||
1921 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
1922 | add r4, sp, #0x90 @ pass key schedule | ||
1923 | #else | ||
1924 | add r4, $key, #248 @ pass key schedule | ||
1925 | #endif | ||
1926 | veor @XMM[1], @XMM[1], @XMM[9] | ||
1927 | mov r5, $rounds @ pass rounds | ||
1928 | mov r0, sp | ||
1929 | |||
1930 | bl _bsaes_encrypt8 | ||
1931 | |||
1932 | vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! | ||
1933 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
1934 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
1935 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
1936 | |||
1937 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
1938 | b .Lxts_enc_done | ||
1939 | .align 4 | ||
1940 | .Lxts_enc_1: | ||
1941 | mov r0, sp | ||
1942 | veor @XMM[0], @XMM[8] | ||
1943 | mov r1, sp | ||
1944 | vst1.8 {@XMM[0]}, [sp,:128] | ||
1945 | mov r2, $key | ||
1946 | mov r4, $fp @ preserve fp | ||
1947 | |||
1948 | bl AES_encrypt | ||
1949 | |||
1950 | vld1.8 {@XMM[0]}, [sp,:128] | ||
1951 | veor @XMM[0], @XMM[0], @XMM[8] | ||
1952 | vst1.8 {@XMM[0]}, [$out]! | ||
1953 | mov $fp, r4 | ||
1954 | |||
1955 | vmov @XMM[8], @XMM[9] @ next round tweak | ||
1956 | |||
1957 | .Lxts_enc_done: | ||
1958 | #ifndef XTS_CHAIN_TWEAK | ||
1959 | adds $len, #0x10 | ||
1960 | beq .Lxts_enc_ret | ||
1961 | sub r6, $out, #0x10 | ||
1962 | |||
1963 | .Lxts_enc_steal: | ||
1964 | ldrb r0, [$inp], #1 | ||
1965 | ldrb r1, [$out, #-0x10] | ||
1966 | strb r0, [$out, #-0x10] | ||
1967 | strb r1, [$out], #1 | ||
1968 | |||
1969 | subs $len, #1 | ||
1970 | bhi .Lxts_enc_steal | ||
1971 | |||
1972 | vld1.8 {@XMM[0]}, [r6] | ||
1973 | mov r0, sp | ||
1974 | veor @XMM[0], @XMM[0], @XMM[8] | ||
1975 | mov r1, sp | ||
1976 | vst1.8 {@XMM[0]}, [sp,:128] | ||
1977 | mov r2, $key | ||
1978 | mov r4, $fp @ preserve fp | ||
1979 | |||
1980 | bl AES_encrypt | ||
1981 | |||
1982 | vld1.8 {@XMM[0]}, [sp,:128] | ||
1983 | veor @XMM[0], @XMM[0], @XMM[8] | ||
1984 | vst1.8 {@XMM[0]}, [r6] | ||
1985 | mov $fp, r4 | ||
1986 | #endif | ||
1987 | |||
1988 | .Lxts_enc_ret: | ||
1989 | bic r0, $fp, #0xf | ||
1990 | vmov.i32 q0, #0 | ||
1991 | vmov.i32 q1, #0 | ||
1992 | #ifdef XTS_CHAIN_TWEAK | ||
1993 | ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak | ||
1994 | #endif | ||
1995 | .Lxts_enc_bzero: @ wipe key schedule [if any] | ||
1996 | vstmia sp!, {q0-q1} | ||
1997 | cmp sp, r0 | ||
1998 | bne .Lxts_enc_bzero | ||
1999 | |||
2000 | mov sp, $fp | ||
2001 | #ifdef XTS_CHAIN_TWEAK | ||
2002 | vst1.8 {@XMM[8]}, [r1] | ||
2003 | #endif | ||
2004 | VFP_ABI_POP | ||
2005 | ldmia sp!, {r4-r10, pc} @ return | ||
2006 | |||
2007 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
2008 | |||
2009 | .globl bsaes_xts_decrypt | ||
2010 | .type bsaes_xts_decrypt,%function | ||
2011 | .align 4 | ||
2012 | bsaes_xts_decrypt: | ||
2013 | mov ip, sp | ||
2014 | stmdb sp!, {r4-r10, lr} @ 0x20 | ||
2015 | VFP_ABI_PUSH | ||
2016 | mov r6, sp @ future $fp | ||
2017 | |||
2018 | mov $inp, r0 | ||
2019 | mov $out, r1 | ||
2020 | mov $len, r2 | ||
2021 | mov $key, r3 | ||
2022 | |||
2023 | sub r0, sp, #0x10 @ 0x10 | ||
2024 | bic r0, #0xf @ align at 16 bytes | ||
2025 | mov sp, r0 | ||
2026 | |||
2027 | #ifdef XTS_CHAIN_TWEAK | ||
2028 | ldr r0, [ip] @ pointer to input tweak | ||
2029 | #else | ||
2030 | @ generate initial tweak | ||
2031 | ldr r0, [ip, #4] @ iv[] | ||
2032 | mov r1, sp | ||
2033 | ldr r2, [ip, #0] @ key2 | ||
2034 | bl AES_encrypt | ||
2035 | mov r0, sp @ pointer to initial tweak | ||
2036 | #endif | ||
2037 | |||
2038 | ldr $rounds, [$key, #240] @ get # of rounds | ||
2039 | mov $fp, r6 | ||
2040 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2041 | @ allocate the key schedule on the stack | ||
2042 | sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key | ||
2043 | @ add r12, #`128-32` @ size of bit-sliced key schedule | ||
2044 | sub r12, #`32+16` @ place for tweak[9] | ||
2045 | |||
2046 | @ populate the key schedule | ||
2047 | mov r4, $key @ pass key | ||
2048 | mov r5, $rounds @ pass # of rounds | ||
2049 | mov sp, r12 | ||
2050 | add r12, #0x90 @ pass key schedule | ||
2051 | bl _bsaes_key_convert | ||
2052 | add r4, sp, #0x90 | ||
2053 | vldmia r4, {@XMM[6]} | ||
2054 | vstmia r12, {@XMM[15]} @ save last round key | ||
2055 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
2056 | vstmia r4, {@XMM[7]} | ||
2057 | #else | ||
2058 | ldr r12, [$key, #244] | ||
2059 | eors r12, #1 | ||
2060 | beq 0f | ||
2061 | |||
2062 | str r12, [$key, #244] | ||
2063 | mov r4, $key @ pass key | ||
2064 | mov r5, $rounds @ pass # of rounds | ||
2065 | add r12, $key, #248 @ pass key schedule | ||
2066 | bl _bsaes_key_convert | ||
2067 | add r4, $key, #248 | ||
2068 | vldmia r4, {@XMM[6]} | ||
2069 | vstmia r12, {@XMM[15]} @ save last round key | ||
2070 | veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key | ||
2071 | vstmia r4, {@XMM[7]} | ||
2072 | |||
2073 | .align 2 | ||
2074 | 0: sub sp, #0x90 @ place for tweak[9] | ||
2075 | #endif | ||
2076 | vld1.8 {@XMM[8]}, [r0] @ initial tweak | ||
2077 | adr $magic, .Lxts_magic | ||
2078 | |||
2079 | tst $len, #0xf @ if not multiple of 16 | ||
2080 | it ne @ Thumb2 thing, sanity check in ARM | ||
2081 | subne $len, #0x10 @ subtract another 16 bytes | ||
2082 | subs $len, #0x80 | ||
2083 | |||
2084 | blo .Lxts_dec_short | ||
2085 | b .Lxts_dec_loop | ||
2086 | |||
2087 | .align 4 | ||
2088 | .Lxts_dec_loop: | ||
2089 | vldmia $magic, {$twmask} @ load XTS magic | ||
2090 | vshr.s64 @T[0], @XMM[8], #63 | ||
2091 | mov r0, sp | ||
2092 | vand @T[0], @T[0], $twmask | ||
2093 | ___ | ||
2094 | for($i=9;$i<16;$i++) { | ||
2095 | $code.=<<___; | ||
2096 | vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] | ||
2097 | vst1.64 {@XMM[$i-1]}, [r0,:128]! | ||
2098 | vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` | ||
2099 | vshr.s64 @T[1], @XMM[$i], #63 | ||
2100 | veor @XMM[$i], @XMM[$i], @T[0] | ||
2101 | vand @T[1], @T[1], $twmask | ||
2102 | ___ | ||
2103 | @T=reverse(@T); | ||
2104 | |||
2105 | $code.=<<___ if ($i>=10); | ||
2106 | vld1.8 {@XMM[$i-10]}, [$inp]! | ||
2107 | ___ | ||
2108 | $code.=<<___ if ($i>=11); | ||
2109 | veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] | ||
2110 | ___ | ||
2111 | } | ||
2112 | $code.=<<___; | ||
2113 | vadd.u64 @XMM[8], @XMM[15], @XMM[15] | ||
2114 | vst1.64 {@XMM[15]}, [r0,:128]! | ||
2115 | vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` | ||
2116 | veor @XMM[8], @XMM[8], @T[0] | ||
2117 | vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2118 | |||
2119 | vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! | ||
2120 | veor @XMM[5], @XMM[5], @XMM[13] | ||
2121 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2122 | add r4, sp, #0x90 @ pass key schedule | ||
2123 | #else | ||
2124 | add r4, $key, #248 @ pass key schedule | ||
2125 | #endif | ||
2126 | veor @XMM[6], @XMM[6], @XMM[14] | ||
2127 | mov r5, $rounds @ pass rounds | ||
2128 | veor @XMM[7], @XMM[7], @XMM[15] | ||
2129 | mov r0, sp | ||
2130 | |||
2131 | bl _bsaes_decrypt8 | ||
2132 | |||
2133 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
2134 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
2135 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2136 | vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! | ||
2137 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2138 | veor @XMM[8], @XMM[6], @XMM[10] | ||
2139 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2140 | veor @XMM[9], @XMM[4], @XMM[11] | ||
2141 | vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! | ||
2142 | veor @XMM[10], @XMM[2], @XMM[12] | ||
2143 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
2144 | veor @XMM[11], @XMM[7], @XMM[13] | ||
2145 | veor @XMM[12], @XMM[3], @XMM[14] | ||
2146 | vst1.8 {@XMM[10]-@XMM[11]}, [$out]! | ||
2147 | veor @XMM[13], @XMM[5], @XMM[15] | ||
2148 | vst1.8 {@XMM[12]-@XMM[13]}, [$out]! | ||
2149 | |||
2150 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2151 | |||
2152 | subs $len, #0x80 | ||
2153 | bpl .Lxts_dec_loop | ||
2154 | |||
2155 | .Lxts_dec_short: | ||
2156 | adds $len, #0x70 | ||
2157 | bmi .Lxts_dec_done | ||
2158 | |||
2159 | vldmia $magic, {$twmask} @ load XTS magic | ||
2160 | vshr.s64 @T[0], @XMM[8], #63 | ||
2161 | mov r0, sp | ||
2162 | vand @T[0], @T[0], $twmask | ||
2163 | ___ | ||
2164 | for($i=9;$i<16;$i++) { | ||
2165 | $code.=<<___; | ||
2166 | vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] | ||
2167 | vst1.64 {@XMM[$i-1]}, [r0,:128]! | ||
2168 | vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` | ||
2169 | vshr.s64 @T[1], @XMM[$i], #63 | ||
2170 | veor @XMM[$i], @XMM[$i], @T[0] | ||
2171 | vand @T[1], @T[1], $twmask | ||
2172 | ___ | ||
2173 | @T=reverse(@T); | ||
2174 | |||
2175 | $code.=<<___ if ($i>=10); | ||
2176 | vld1.8 {@XMM[$i-10]}, [$inp]! | ||
2177 | subs $len, #0x10 | ||
2178 | bmi .Lxts_dec_`$i-9` | ||
2179 | ___ | ||
2180 | $code.=<<___ if ($i>=11); | ||
2181 | veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] | ||
2182 | ___ | ||
2183 | } | ||
2184 | $code.=<<___; | ||
2185 | sub $len, #0x10 | ||
2186 | vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak | ||
2187 | |||
2188 | vld1.8 {@XMM[6]}, [$inp]! | ||
2189 | veor @XMM[5], @XMM[5], @XMM[13] | ||
2190 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2191 | add r4, sp, #0x90 @ pass key schedule | ||
2192 | #else | ||
2193 | add r4, $key, #248 @ pass key schedule | ||
2194 | #endif | ||
2195 | veor @XMM[6], @XMM[6], @XMM[14] | ||
2196 | mov r5, $rounds @ pass rounds | ||
2197 | mov r0, sp | ||
2198 | |||
2199 | bl _bsaes_decrypt8 | ||
2200 | |||
2201 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
2202 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
2203 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2204 | vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! | ||
2205 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2206 | veor @XMM[8], @XMM[6], @XMM[10] | ||
2207 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2208 | veor @XMM[9], @XMM[4], @XMM[11] | ||
2209 | vld1.64 {@XMM[14]}, [r0,:128]! | ||
2210 | veor @XMM[10], @XMM[2], @XMM[12] | ||
2211 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
2212 | veor @XMM[11], @XMM[7], @XMM[13] | ||
2213 | veor @XMM[12], @XMM[3], @XMM[14] | ||
2214 | vst1.8 {@XMM[10]-@XMM[11]}, [$out]! | ||
2215 | vst1.8 {@XMM[12]}, [$out]! | ||
2216 | |||
2217 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2218 | b .Lxts_dec_done | ||
2219 | .align 4 | ||
2220 | .Lxts_dec_6: | ||
2221 | vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak | ||
2222 | |||
2223 | veor @XMM[4], @XMM[4], @XMM[12] | ||
2224 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2225 | add r4, sp, #0x90 @ pass key schedule | ||
2226 | #else | ||
2227 | add r4, $key, #248 @ pass key schedule | ||
2228 | #endif | ||
2229 | veor @XMM[5], @XMM[5], @XMM[13] | ||
2230 | mov r5, $rounds @ pass rounds | ||
2231 | mov r0, sp | ||
2232 | |||
2233 | bl _bsaes_decrypt8 | ||
2234 | |||
2235 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
2236 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
2237 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2238 | vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! | ||
2239 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2240 | veor @XMM[8], @XMM[6], @XMM[10] | ||
2241 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2242 | veor @XMM[9], @XMM[4], @XMM[11] | ||
2243 | veor @XMM[10], @XMM[2], @XMM[12] | ||
2244 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
2245 | veor @XMM[11], @XMM[7], @XMM[13] | ||
2246 | vst1.8 {@XMM[10]-@XMM[11]}, [$out]! | ||
2247 | |||
2248 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2249 | b .Lxts_dec_done | ||
2250 | .align 4 | ||
2251 | .Lxts_dec_5: | ||
2252 | vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak | ||
2253 | |||
2254 | veor @XMM[3], @XMM[3], @XMM[11] | ||
2255 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2256 | add r4, sp, #0x90 @ pass key schedule | ||
2257 | #else | ||
2258 | add r4, $key, #248 @ pass key schedule | ||
2259 | #endif | ||
2260 | veor @XMM[4], @XMM[4], @XMM[12] | ||
2261 | mov r5, $rounds @ pass rounds | ||
2262 | mov r0, sp | ||
2263 | |||
2264 | bl _bsaes_decrypt8 | ||
2265 | |||
2266 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
2267 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
2268 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2269 | vld1.64 {@XMM[12]}, [r0,:128]! | ||
2270 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2271 | veor @XMM[8], @XMM[6], @XMM[10] | ||
2272 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2273 | veor @XMM[9], @XMM[4], @XMM[11] | ||
2274 | veor @XMM[10], @XMM[2], @XMM[12] | ||
2275 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
2276 | vst1.8 {@XMM[10]}, [$out]! | ||
2277 | |||
2278 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2279 | b .Lxts_dec_done | ||
2280 | .align 4 | ||
2281 | .Lxts_dec_4: | ||
2282 | vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak | ||
2283 | |||
2284 | veor @XMM[2], @XMM[2], @XMM[10] | ||
2285 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2286 | add r4, sp, #0x90 @ pass key schedule | ||
2287 | #else | ||
2288 | add r4, $key, #248 @ pass key schedule | ||
2289 | #endif | ||
2290 | veor @XMM[3], @XMM[3], @XMM[11] | ||
2291 | mov r5, $rounds @ pass rounds | ||
2292 | mov r0, sp | ||
2293 | |||
2294 | bl _bsaes_decrypt8 | ||
2295 | |||
2296 | vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! | ||
2297 | vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! | ||
2298 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2299 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2300 | veor @XMM[8], @XMM[6], @XMM[10] | ||
2301 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2302 | veor @XMM[9], @XMM[4], @XMM[11] | ||
2303 | vst1.8 {@XMM[8]-@XMM[9]}, [$out]! | ||
2304 | |||
2305 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2306 | b .Lxts_dec_done | ||
2307 | .align 4 | ||
2308 | .Lxts_dec_3: | ||
2309 | vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak | ||
2310 | |||
2311 | veor @XMM[1], @XMM[1], @XMM[9] | ||
2312 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2313 | add r4, sp, #0x90 @ pass key schedule | ||
2314 | #else | ||
2315 | add r4, $key, #248 @ pass key schedule | ||
2316 | #endif | ||
2317 | veor @XMM[2], @XMM[2], @XMM[10] | ||
2318 | mov r5, $rounds @ pass rounds | ||
2319 | mov r0, sp | ||
2320 | |||
2321 | bl _bsaes_decrypt8 | ||
2322 | |||
2323 | vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! | ||
2324 | vld1.64 {@XMM[10]}, [r0,:128]! | ||
2325 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2326 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2327 | veor @XMM[8], @XMM[6], @XMM[10] | ||
2328 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2329 | vst1.8 {@XMM[8]}, [$out]! | ||
2330 | |||
2331 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2332 | b .Lxts_dec_done | ||
2333 | .align 4 | ||
2334 | .Lxts_dec_2: | ||
2335 | vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak | ||
2336 | |||
2337 | veor @XMM[0], @XMM[0], @XMM[8] | ||
2338 | #ifndef BSAES_ASM_EXTENDED_KEY | ||
2339 | add r4, sp, #0x90 @ pass key schedule | ||
2340 | #else | ||
2341 | add r4, $key, #248 @ pass key schedule | ||
2342 | #endif | ||
2343 | veor @XMM[1], @XMM[1], @XMM[9] | ||
2344 | mov r5, $rounds @ pass rounds | ||
2345 | mov r0, sp | ||
2346 | |||
2347 | bl _bsaes_decrypt8 | ||
2348 | |||
2349 | vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! | ||
2350 | veor @XMM[0], @XMM[0], @XMM[ 8] | ||
2351 | veor @XMM[1], @XMM[1], @XMM[ 9] | ||
2352 | vst1.8 {@XMM[0]-@XMM[1]}, [$out]! | ||
2353 | |||
2354 | vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak | ||
2355 | b .Lxts_dec_done | ||
2356 | .align 4 | ||
2357 | .Lxts_dec_1: | ||
2358 | mov r0, sp | ||
2359 | veor @XMM[0], @XMM[8] | ||
2360 | mov r1, sp | ||
2361 | vst1.8 {@XMM[0]}, [sp,:128] | ||
2362 | mov r2, $key | ||
2363 | mov r4, $fp @ preserve fp | ||
2364 | mov r5, $magic @ preserve magic | ||
2365 | |||
2366 | bl AES_decrypt | ||
2367 | |||
2368 | vld1.8 {@XMM[0]}, [sp,:128] | ||
2369 | veor @XMM[0], @XMM[0], @XMM[8] | ||
2370 | vst1.8 {@XMM[0]}, [$out]! | ||
2371 | mov $fp, r4 | ||
2372 | mov $magic, r5 | ||
2373 | |||
2374 | vmov @XMM[8], @XMM[9] @ next round tweak | ||
2375 | |||
2376 | .Lxts_dec_done: | ||
2377 | #ifndef XTS_CHAIN_TWEAK | ||
2378 | adds $len, #0x10 | ||
2379 | beq .Lxts_dec_ret | ||
2380 | |||
2381 | @ calculate one round of extra tweak for the stolen ciphertext | ||
2382 | vldmia $magic, {$twmask} | ||
2383 | vshr.s64 @XMM[6], @XMM[8], #63 | ||
2384 | vand @XMM[6], @XMM[6], $twmask | ||
2385 | vadd.u64 @XMM[9], @XMM[8], @XMM[8] | ||
2386 | vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")` | ||
2387 | veor @XMM[9], @XMM[9], @XMM[6] | ||
2388 | |||
2389 | @ perform the final decryption with the last tweak value | ||
2390 | vld1.8 {@XMM[0]}, [$inp]! | ||
2391 | mov r0, sp | ||
2392 | veor @XMM[0], @XMM[0], @XMM[9] | ||
2393 | mov r1, sp | ||
2394 | vst1.8 {@XMM[0]}, [sp,:128] | ||
2395 | mov r2, $key | ||
2396 | mov r4, $fp @ preserve fp | ||
2397 | |||
2398 | bl AES_decrypt | ||
2399 | |||
2400 | vld1.8 {@XMM[0]}, [sp,:128] | ||
2401 | veor @XMM[0], @XMM[0], @XMM[9] | ||
2402 | vst1.8 {@XMM[0]}, [$out] | ||
2403 | |||
2404 | mov r6, $out | ||
2405 | .Lxts_dec_steal: | ||
2406 | ldrb r1, [$out] | ||
2407 | ldrb r0, [$inp], #1 | ||
2408 | strb r1, [$out, #0x10] | ||
2409 | strb r0, [$out], #1 | ||
2410 | |||
2411 | subs $len, #1 | ||
2412 | bhi .Lxts_dec_steal | ||
2413 | |||
2414 | vld1.8 {@XMM[0]}, [r6] | ||
2415 | mov r0, sp | ||
2416 | veor @XMM[0], @XMM[8] | ||
2417 | mov r1, sp | ||
2418 | vst1.8 {@XMM[0]}, [sp,:128] | ||
2419 | mov r2, $key | ||
2420 | |||
2421 | bl AES_decrypt | ||
2422 | |||
2423 | vld1.8 {@XMM[0]}, [sp,:128] | ||
2424 | veor @XMM[0], @XMM[0], @XMM[8] | ||
2425 | vst1.8 {@XMM[0]}, [r6] | ||
2426 | mov $fp, r4 | ||
2427 | #endif | ||
2428 | |||
2429 | .Lxts_dec_ret: | ||
2430 | bic r0, $fp, #0xf | ||
2431 | vmov.i32 q0, #0 | ||
2432 | vmov.i32 q1, #0 | ||
2433 | #ifdef XTS_CHAIN_TWEAK | ||
2434 | ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak | ||
2435 | #endif | ||
2436 | .Lxts_dec_bzero: @ wipe key schedule [if any] | ||
2437 | vstmia sp!, {q0-q1} | ||
2438 | cmp sp, r0 | ||
2439 | bne .Lxts_dec_bzero | ||
2440 | |||
2441 | mov sp, $fp | ||
2442 | #ifdef XTS_CHAIN_TWEAK | ||
2443 | vst1.8 {@XMM[8]}, [r1] | ||
2444 | #endif | ||
2445 | VFP_ABI_POP | ||
2446 | ldmia sp!, {r4-r10, pc} @ return | ||
2447 | |||
2448 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
2449 | ___ | ||
2450 | } | ||
2451 | $code.=<<___; | ||
2452 | #endif | ||
2453 | ___ | ||
2454 | |||
2455 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
2456 | |||
2457 | open SELF,$0; | ||
2458 | while(<SELF>) { | ||
2459 | next if (/^#!/); | ||
2460 | last if (!s/^#/@/ and !/^$/); | ||
2461 | print; | ||
2462 | } | ||
2463 | close SELF; | ||
2464 | |||
2465 | print $code; | ||
2466 | |||
2467 | close STDOUT; | ||
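The .Lxts_dec_steal loop above implements XTS ciphertext stealing for a trailing partial block: each remaining input byte displaces a byte of the last full output block, and the displaced byte moves 16 bytes forward into the partial block. A plain-C model of that byte shuffle (an illustration, not the kernel's code; the function name is hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    static void xts_dec_steal_model(uint8_t *out, const uint8_t *in, size_t tail)
    {
        size_t i;

        for (i = 0; i < tail; i++) {
            out[i + 16] = out[i];   /* strb r1, [$out, #0x10] */
            out[i] = in[i];         /* ldrb r0, [$inp], #1    */
        }
        /* out[0..15] is then run through AES_decrypt once more
         * with tweak @XMM[8], as in the final call above. */
    }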
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 59ceae8f3c95..a6395c027715 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild | |||
@@ -24,6 +24,7 @@ generic-y += sembuf.h | |||
24 | generic-y += serial.h | 24 | generic-y += serial.h |
25 | generic-y += shmbuf.h | 25 | generic-y += shmbuf.h |
26 | generic-y += siginfo.h | 26 | generic-y += siginfo.h |
27 | generic-y += simd.h | ||
27 | generic-y += sizes.h | 28 | generic-y += sizes.h |
28 | generic-y += socket.h | 29 | generic-y += socket.h |
29 | generic-y += sockios.h | 30 | generic-y += sockios.h |
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index fcc1b5bf6979..5c2285160575 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h | |||
@@ -53,6 +53,13 @@ | |||
53 | #define put_byte_3 lsl #0 | 53 | #define put_byte_3 lsl #0 |
54 | #endif | 54 | #endif |
55 | 55 | ||
56 | /* Select code for any configuration running in BE8 mode */ | ||
57 | #ifdef CONFIG_CPU_ENDIAN_BE8 | ||
58 | #define ARM_BE8(code...) code | ||
59 | #else | ||
60 | #define ARM_BE8(code...) | ||
61 | #endif | ||
62 | |||
56 | /* | 63 | /* |
57 | * Data preload for architectures that support it | 64 | * Data preload for architectures that support it |
58 | */ | 65 | */ |
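The new ARM_BE8() helper emits its argument only on CONFIG_CPU_ENDIAN_BE8 kernels; several hunks below use it to rev words that arrive in little-endian order (instructions, device registers). A C-side analogue of the same pattern, sketched as a hypothetical helper for data read raw from a little-endian register (real drivers should normally use readl_relaxed() instead, as the coresight.h hunk below does):

    #include <linux/types.h>
    #include <linux/swab.h>

    static inline u32 le_raw_to_cpu(u32 v)
    {
    #ifdef CONFIG_CPU_ENDIAN_BE8
        return swab32(v);   /* mirrors ARM_BE8( rev rd, rd ) */
    #else
        return v;           /* little-endian kernel: no-op   */
    #endif
    }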
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index f8a4336ed8fc..62d2cb53b069 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h | |||
@@ -12,6 +12,7 @@ | |||
12 | #define __ASM_ARM_ATOMIC_H | 12 | #define __ASM_ARM_ATOMIC_H |
13 | 13 | ||
14 | #include <linux/compiler.h> | 14 | #include <linux/compiler.h> |
15 | #include <linux/prefetch.h> | ||
15 | #include <linux/types.h> | 16 | #include <linux/types.h> |
16 | #include <linux/irqflags.h> | 17 | #include <linux/irqflags.h> |
17 | #include <asm/barrier.h> | 18 | #include <asm/barrier.h> |
@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v) | |||
41 | unsigned long tmp; | 42 | unsigned long tmp; |
42 | int result; | 43 | int result; |
43 | 44 | ||
45 | prefetchw(&v->counter); | ||
44 | __asm__ __volatile__("@ atomic_add\n" | 46 | __asm__ __volatile__("@ atomic_add\n" |
45 | "1: ldrex %0, [%3]\n" | 47 | "1: ldrex %0, [%3]\n" |
46 | " add %0, %0, %4\n" | 48 | " add %0, %0, %4\n" |
@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v) | |||
79 | unsigned long tmp; | 81 | unsigned long tmp; |
80 | int result; | 82 | int result; |
81 | 83 | ||
84 | prefetchw(&v->counter); | ||
82 | __asm__ __volatile__("@ atomic_sub\n" | 85 | __asm__ __volatile__("@ atomic_sub\n" |
83 | "1: ldrex %0, [%3]\n" | 86 | "1: ldrex %0, [%3]\n" |
84 | " sub %0, %0, %4\n" | 87 | " sub %0, %0, %4\n" |
@@ -260,6 +263,7 @@ static inline void atomic64_set(atomic64_t *v, long long i) | |||
260 | { | 263 | { |
261 | long long tmp; | 264 | long long tmp; |
262 | 265 | ||
266 | prefetchw(&v->counter); | ||
263 | __asm__ __volatile__("@ atomic64_set\n" | 267 | __asm__ __volatile__("@ atomic64_set\n" |
264 | "1: ldrexd %0, %H0, [%2]\n" | 268 | "1: ldrexd %0, %H0, [%2]\n" |
265 | " strexd %0, %3, %H3, [%2]\n" | 269 | " strexd %0, %3, %H3, [%2]\n" |
@@ -276,10 +280,11 @@ static inline void atomic64_add(long long i, atomic64_t *v) | |||
276 | long long result; | 280 | long long result; |
277 | unsigned long tmp; | 281 | unsigned long tmp; |
278 | 282 | ||
283 | prefetchw(&v->counter); | ||
279 | __asm__ __volatile__("@ atomic64_add\n" | 284 | __asm__ __volatile__("@ atomic64_add\n" |
280 | "1: ldrexd %0, %H0, [%3]\n" | 285 | "1: ldrexd %0, %H0, [%3]\n" |
281 | " adds %0, %0, %4\n" | 286 | " adds %Q0, %Q0, %Q4\n" |
282 | " adc %H0, %H0, %H4\n" | 287 | " adc %R0, %R0, %R4\n" |
283 | " strexd %1, %0, %H0, [%3]\n" | 288 | " strexd %1, %0, %H0, [%3]\n" |
284 | " teq %1, #0\n" | 289 | " teq %1, #0\n" |
285 | " bne 1b" | 290 | " bne 1b" |
@@ -297,8 +302,8 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) | |||
297 | 302 | ||
298 | __asm__ __volatile__("@ atomic64_add_return\n" | 303 | __asm__ __volatile__("@ atomic64_add_return\n" |
299 | "1: ldrexd %0, %H0, [%3]\n" | 304 | "1: ldrexd %0, %H0, [%3]\n" |
300 | " adds %0, %0, %4\n" | 305 | " adds %Q0, %Q0, %Q4\n" |
301 | " adc %H0, %H0, %H4\n" | 306 | " adc %R0, %R0, %R4\n" |
302 | " strexd %1, %0, %H0, [%3]\n" | 307 | " strexd %1, %0, %H0, [%3]\n" |
303 | " teq %1, #0\n" | 308 | " teq %1, #0\n" |
304 | " bne 1b" | 309 | " bne 1b" |
@@ -316,10 +321,11 @@ static inline void atomic64_sub(long long i, atomic64_t *v) | |||
316 | long long result; | 321 | long long result; |
317 | unsigned long tmp; | 322 | unsigned long tmp; |
318 | 323 | ||
324 | prefetchw(&v->counter); | ||
319 | __asm__ __volatile__("@ atomic64_sub\n" | 325 | __asm__ __volatile__("@ atomic64_sub\n" |
320 | "1: ldrexd %0, %H0, [%3]\n" | 326 | "1: ldrexd %0, %H0, [%3]\n" |
321 | " subs %0, %0, %4\n" | 327 | " subs %Q0, %Q0, %Q4\n" |
322 | " sbc %H0, %H0, %H4\n" | 328 | " sbc %R0, %R0, %R4\n" |
323 | " strexd %1, %0, %H0, [%3]\n" | 329 | " strexd %1, %0, %H0, [%3]\n" |
324 | " teq %1, #0\n" | 330 | " teq %1, #0\n" |
325 | " bne 1b" | 331 | " bne 1b" |
@@ -337,8 +343,8 @@ static inline long long atomic64_sub_return(long long i, atomic64_t *v) | |||
337 | 343 | ||
338 | __asm__ __volatile__("@ atomic64_sub_return\n" | 344 | __asm__ __volatile__("@ atomic64_sub_return\n" |
339 | "1: ldrexd %0, %H0, [%3]\n" | 345 | "1: ldrexd %0, %H0, [%3]\n" |
340 | " subs %0, %0, %4\n" | 346 | " subs %Q0, %Q0, %Q4\n" |
341 | " sbc %H0, %H0, %H4\n" | 347 | " sbc %R0, %R0, %R4\n" |
342 | " strexd %1, %0, %H0, [%3]\n" | 348 | " strexd %1, %0, %H0, [%3]\n" |
343 | " teq %1, #0\n" | 349 | " teq %1, #0\n" |
344 | " bne 1b" | 350 | " bne 1b" |
@@ -406,9 +412,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) | |||
406 | 412 | ||
407 | __asm__ __volatile__("@ atomic64_dec_if_positive\n" | 413 | __asm__ __volatile__("@ atomic64_dec_if_positive\n" |
408 | "1: ldrexd %0, %H0, [%3]\n" | 414 | "1: ldrexd %0, %H0, [%3]\n" |
409 | " subs %0, %0, #1\n" | 415 | " subs %Q0, %Q0, #1\n" |
410 | " sbc %H0, %H0, #0\n" | 416 | " sbc %R0, %R0, #0\n" |
411 | " teq %H0, #0\n" | 417 | " teq %R0, #0\n" |
412 | " bmi 2f\n" | 418 | " bmi 2f\n" |
413 | " strexd %1, %0, %H0, [%3]\n" | 419 | " strexd %1, %0, %H0, [%3]\n" |
414 | " teq %1, #0\n" | 420 | " teq %1, #0\n" |
@@ -437,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) | |||
437 | " teqeq %H0, %H5\n" | 443 | " teqeq %H0, %H5\n" |
438 | " moveq %1, #0\n" | 444 | " moveq %1, #0\n" |
439 | " beq 2f\n" | 445 | " beq 2f\n" |
440 | " adds %0, %0, %6\n" | 446 | " adds %Q0, %Q0, %Q6\n" |
441 | " adc %H0, %H0, %H6\n" | 447 | " adc %R0, %R0, %R6\n" |
442 | " strexd %2, %0, %H0, [%4]\n" | 448 | " strexd %2, %0, %H0, [%4]\n" |
443 | " teq %2, #0\n" | 449 | " teq %2, #0\n" |
444 | " bne 1b\n" | 450 | " bne 1b\n" |
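Two independent changes run through these atomic.h hunks. prefetchw() warms the counter's cache line before the exclusive load (on ARMv7 SMP it becomes pldw, a fetch for writing, which cuts strexd retries), and the arithmetic switches from %0/%H0 to %Q0/%R0: %Q and %R name the low and high 32-bit halves of a 64-bit operand by value, whereas %H names the higher-numbered register of the pair, which is the wrong half on big-endian. The ldrexd/strexd lines keep %0/%H0 because there the register-pair order is what matters. A self-contained sketch of the resulting pattern (an illustrative helper, not the kernel's own):

    #include <linux/prefetch.h>

    static inline void sketch_atomic64_add(long long i, long long *counter)
    {
        long long result;
        unsigned long tmp;

        prefetchw(counter);             /* warm line for the strexd   */
        __asm__ __volatile__("@ sketch_atomic64_add\n"
    "1: ldrexd  %0, %H0, [%3]\n"        /* pair order: even reg first */
    "   adds    %Q0, %Q0, %Q4\n"        /* low word, sets carry       */
    "   adc     %R0, %R0, %R4\n"        /* high word, consumes carry  */
    "   strexd  %1, %0, %H0, [%3]\n"
    "   teq     %1, #0\n"
    "   bne     1b"
        : "=&r" (result), "=&r" (tmp), "+Qo" (*counter)
        : "r" (counter), "r" (i)
        : "cc");
    }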
diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h new file mode 100644 index 000000000000..1714800fa113 --- /dev/null +++ b/arch/arm/include/asm/bL_switcher.h | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * arch/arm/include/asm/bL_switcher.h | ||
3 | * | ||
4 | * Created by: Nicolas Pitre, April 2012 | ||
5 | * Copyright: (C) 2012-2013 Linaro Limited | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #ifndef ASM_BL_SWITCHER_H | ||
13 | #define ASM_BL_SWITCHER_H | ||
14 | |||
15 | #include <linux/compiler.h> | ||
16 | #include <linux/types.h> | ||
17 | |||
18 | typedef void (*bL_switch_completion_handler)(void *cookie); | ||
19 | |||
20 | int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id, | ||
21 | bL_switch_completion_handler completer, | ||
22 | void *completer_cookie); | ||
23 | static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) | ||
24 | { | ||
25 | return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL); | ||
26 | } | ||
27 | |||
28 | /* | ||
29 | * Register here to be notified about runtime enabling/disabling of | ||
30 | * the switcher. | ||
31 | * | ||
32 | * The notifier chain is called with the switcher activation lock held: | ||
33 | * the switcher will not be enabled or disabled during callbacks. | ||
34 | * Callbacks must not call bL_switcher_{get,put}_enabled(). | ||
35 | */ | ||
36 | #define BL_NOTIFY_PRE_ENABLE 0 | ||
37 | #define BL_NOTIFY_POST_ENABLE 1 | ||
38 | #define BL_NOTIFY_PRE_DISABLE 2 | ||
39 | #define BL_NOTIFY_POST_DISABLE 3 | ||
40 | |||
41 | #ifdef CONFIG_BL_SWITCHER | ||
42 | |||
43 | int bL_switcher_register_notifier(struct notifier_block *nb); | ||
44 | int bL_switcher_unregister_notifier(struct notifier_block *nb); | ||
45 | |||
46 | /* | ||
47 | * Use these functions to temporarily prevent enabling/disabling of | ||
48 | * the switcher. | ||
49 | * bL_switcher_get_enabled() returns true if the switcher is currently | ||
50 | * enabled. Each call to bL_switcher_get_enabled() must be followed | ||
51 | * by a call to bL_switcher_put_enabled(). These functions are not | ||
52 | * recursive. | ||
53 | */ | ||
54 | bool bL_switcher_get_enabled(void); | ||
55 | void bL_switcher_put_enabled(void); | ||
56 | |||
57 | int bL_switcher_trace_trigger(void); | ||
58 | int bL_switcher_get_logical_index(u32 mpidr); | ||
59 | |||
60 | #else | ||
61 | static inline int bL_switcher_register_notifier(struct notifier_block *nb) | ||
62 | { | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | static inline int bL_switcher_unregister_notifier(struct notifier_block *nb) | ||
67 | { | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static inline bool bL_switcher_get_enabled(void) { return false; } | ||
72 | static inline void bL_switcher_put_enabled(void) { } | ||
73 | static inline int bL_switcher_trace_trigger(void) { return 0; } | ||
74 | static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; } | ||
75 | #endif /* CONFIG_BL_SWITCHER */ | ||
76 | |||
77 | #endif | ||
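The get/put pair above pins the switcher's state for the caller; note the contract that every bL_switcher_get_enabled() is balanced by a bL_switcher_put_enabled(), whatever get returned. A hypothetical caller written to that contract:

    #include <asm/bL_switcher.h>

    static bool my_driver_uses_switcher(void)   /* hypothetical name */
    {
        bool active = bL_switcher_get_enabled();

        if (active) {
            /* the switcher can be neither disabled nor re-enabled
             * until the put below */
        }
        bL_switcher_put_enabled();  /* required after every get */
        return active;
    }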
diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index 7af5c6c3653a..b274bde24905 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h | |||
@@ -2,6 +2,8 @@ | |||
2 | #define _ASMARM_BUG_H | 2 | #define _ASMARM_BUG_H |
3 | 3 | ||
4 | #include <linux/linkage.h> | 4 | #include <linux/linkage.h> |
5 | #include <linux/types.h> | ||
6 | #include <asm/opcodes.h> | ||
5 | 7 | ||
6 | #ifdef CONFIG_BUG | 8 | #ifdef CONFIG_BUG |
7 | 9 | ||
@@ -12,10 +14,10 @@ | |||
12 | */ | 14 | */ |
13 | #ifdef CONFIG_THUMB2_KERNEL | 15 | #ifdef CONFIG_THUMB2_KERNEL |
14 | #define BUG_INSTR_VALUE 0xde02 | 16 | #define BUG_INSTR_VALUE 0xde02 |
15 | #define BUG_INSTR_TYPE ".hword " | 17 | #define BUG_INSTR(__value) __inst_thumb16(__value) |
16 | #else | 18 | #else |
17 | #define BUG_INSTR_VALUE 0xe7f001f2 | 19 | #define BUG_INSTR_VALUE 0xe7f001f2 |
18 | #define BUG_INSTR_TYPE ".word " | 20 | #define BUG_INSTR(__value) __inst_arm(__value) |
19 | #endif | 21 | #endif |
20 | 22 | ||
21 | 23 | ||
@@ -33,7 +35,7 @@ | |||
33 | 35 | ||
34 | #define __BUG(__file, __line, __value) \ | 36 | #define __BUG(__file, __line, __value) \ |
35 | do { \ | 37 | do { \ |
36 | asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n" \ | 38 | asm volatile("1:\t" BUG_INSTR(__value) "\n" \ |
37 | ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \ | 39 | ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \ |
38 | "2:\t.asciz " #__file "\n" \ | 40 | "2:\t.asciz " #__file "\n" \ |
39 | ".popsection\n" \ | 41 | ".popsection\n" \ |
@@ -48,7 +50,7 @@ do { \ | |||
48 | 50 | ||
49 | #define __BUG(__file, __line, __value) \ | 51 | #define __BUG(__file, __line, __value) \ |
50 | do { \ | 52 | do { \ |
51 | asm volatile(BUG_INSTR_TYPE #__value); \ | 53 | asm volatile(BUG_INSTR(__value) "\n"); \ |
52 | unreachable(); \ | 54 | unreachable(); \ |
53 | } while (0) | 55 | } while (0) |
54 | #endif /* CONFIG_DEBUG_BUGVERBOSE */ | 56 | #endif /* CONFIG_DEBUG_BUGVERBOSE */ |
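Rather than pasting the trap opcode through a raw .hword/.word directive, bug.h now routes it through __inst_thumb16()/__inst_arm(), so the bytes land in instruction order even when data endianness differs (BE8); the kgdb.h hunk below makes the same conversion. A minimal sketch of the pattern with a hypothetical opcode value:

    #include <asm/opcodes.h>

    #define MY_TRAP_INSTR   0xe7f001f2  /* hypothetical undefined instr */

    static inline void my_trap(void)
    {
        asm volatile(__inst_arm(MY_TRAP_INSTR));
    }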
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index 3d7351c844aa..fe3ea776dc34 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h | |||
@@ -5,7 +5,7 @@ | |||
5 | #include <linux/threads.h> | 5 | #include <linux/threads.h> |
6 | #include <asm/irq.h> | 6 | #include <asm/irq.h> |
7 | 7 | ||
8 | #define NR_IPI 7 | 8 | #define NR_IPI 8 |
9 | 9 | ||
10 | typedef struct { | 10 | typedef struct { |
11 | unsigned int __softirq_pending; | 11 | unsigned int __softirq_pending; |
diff --git a/arch/arm/include/asm/hardware/coresight.h b/arch/arm/include/asm/hardware/coresight.h index 0cf7a6b842ff..ad774f37c47c 100644 --- a/arch/arm/include/asm/hardware/coresight.h +++ b/arch/arm/include/asm/hardware/coresight.h | |||
@@ -24,8 +24,8 @@ | |||
24 | #define TRACER_TIMEOUT 10000 | 24 | #define TRACER_TIMEOUT 10000 |
25 | 25 | ||
26 | #define etm_writel(t, v, x) \ | 26 | #define etm_writel(t, v, x) \ |
27 | (__raw_writel((v), (t)->etm_regs + (x))) | 27 | (writel_relaxed((v), (t)->etm_regs + (x))) |
28 | #define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x))) | 28 | #define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x))) |
29 | 29 | ||
30 | /* CoreSight Management Registers */ | 30 | /* CoreSight Management Registers */ |
31 | #define CSMR_LOCKACCESS 0xfb0 | 31 | #define CSMR_LOCKACCESS 0xfb0 |
@@ -142,8 +142,8 @@ | |||
142 | #define ETBFF_TRIGFL BIT(10) | 142 | #define ETBFF_TRIGFL BIT(10) |
143 | 143 | ||
144 | #define etb_writel(t, v, x) \ | 144 | #define etb_writel(t, v, x) \ |
145 | (__raw_writel((v), (t)->etb_regs + (x))) | 145 | (writel_relaxed((v), (t)->etb_regs + (x))) |
146 | #define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x))) | 146 | #define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x))) |
147 | 147 | ||
148 | #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0) | 148 | #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0) |
149 | #define etm_unlock(t) \ | 149 | #define etm_unlock(t) \ |
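Replacing __raw_writel()/__raw_readl() with the _relaxed accessors keeps the ETM/ETB register helpers endian-correct: readl_relaxed() folds in the little-endian-to-CPU conversion (without adding barriers), while the __raw forms hand back bus bytes unconverted and so read swapped on a BE8 kernel. A sketch under that assumption (the register offset is hypothetical):

    #include <linux/io.h>

    #define MY_STATUS_REG   0x0fc   /* hypothetical register offset */

    static u32 read_status(void __iomem *regs)
    {
        /* __raw_readl(regs + MY_STATUS_REG) would return raw bus
         * bytes, byte-swapped from the CPU's view on BE8 */
        return readl_relaxed(regs + MY_STATUS_REG);
    }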
diff --git a/arch/arm/include/asm/kgdb.h b/arch/arm/include/asm/kgdb.h index 48066ce9ea34..0a9d5dd93294 100644 --- a/arch/arm/include/asm/kgdb.h +++ b/arch/arm/include/asm/kgdb.h | |||
@@ -11,6 +11,7 @@ | |||
11 | #define __ARM_KGDB_H__ | 11 | #define __ARM_KGDB_H__ |
12 | 12 | ||
13 | #include <linux/ptrace.h> | 13 | #include <linux/ptrace.h> |
14 | #include <asm/opcodes.h> | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * GDB assumes that we're a user process being debugged, so | 17 | * GDB assumes that we're a user process being debugged, so |
@@ -41,7 +42,7 @@ | |||
41 | 42 | ||
42 | static inline void arch_kgdb_breakpoint(void) | 43 | static inline void arch_kgdb_breakpoint(void) |
43 | { | 44 | { |
44 | asm(".word 0xe7ffdeff"); | 45 | asm(__inst_arm(0xe7ffdeff)); |
45 | } | 46 | } |
46 | 47 | ||
47 | extern void kgdb_handle_bus_error(void); | 48 | extern void kgdb_handle_bus_error(void); |
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index 402a2bc6aa68..17a3fa2979e8 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h | |||
@@ -49,6 +49,7 @@ struct machine_desc { | |||
49 | bool (*smp_init)(void); | 49 | bool (*smp_init)(void); |
50 | void (*fixup)(struct tag *, char **, | 50 | void (*fixup)(struct tag *, char **, |
51 | struct meminfo *); | 51 | struct meminfo *); |
52 | void (*init_meminfo)(void); | ||
52 | void (*reserve)(void);/* reserve mem blocks */ | 53 | void (*reserve)(void);/* reserve mem blocks */ |
53 | void (*map_io)(void);/* IO mapping function */ | 54 | void (*map_io)(void);/* IO mapping function */ |
54 | void (*init_early)(void); | 55 | void (*init_early)(void); |
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index 1cf26010a6f3..608516ebabfe 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h | |||
@@ -42,6 +42,14 @@ extern void mcpm_entry_point(void); | |||
42 | void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); | 42 | void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * This sets an early poke, i.e. a value to be poked into some address | ||
46 | * from very early assembly code before the CPU is ungated. The | ||
47 | * address must be physical, and if 0 then nothing will happen. | ||
48 | */ | ||
49 | void mcpm_set_early_poke(unsigned cpu, unsigned cluster, | ||
50 | unsigned long poke_phys_addr, unsigned long poke_val); | ||
51 | |||
52 | /* | ||
45 | * CPU/cluster power operations API for higher subsystems to use. | 53 | * CPU/cluster power operations API for higher subsystems to use. |
46 | */ | 54 | */ |
47 | 55 | ||
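mcpm_set_early_poke() above arms a write that the early assembly entry performs before the incoming CPU is ungated; per the comment, the address must be physical and 0 disarms the poke. A hypothetical call site (the scratch address and magic value are illustrative only):

    #include <asm/mcpm.h>

    static void arm_boot_poke(unsigned int cpu, unsigned int cluster,
                              unsigned long scratch_phys)
    {
        mcpm_set_early_poke(cpu, cluster, scratch_phys, 0x600db007);
    }

    static void disarm_boot_poke(unsigned int cpu, unsigned int cluster)
    {
        mcpm_set_early_poke(cpu, cluster, 0, 0);    /* address 0: no poke */
    }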
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index e750a938fd3c..4dd21457ef9d 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h | |||
@@ -172,8 +172,13 @@ | |||
172 | * so that all we need to do is modify the 8-bit constant field. | 172 | * so that all we need to do is modify the 8-bit constant field. |
173 | */ | 173 | */ |
174 | #define __PV_BITS_31_24 0x81000000 | 174 | #define __PV_BITS_31_24 0x81000000 |
175 | #define __PV_BITS_7_0 0x81 | ||
176 | |||
177 | extern u64 __pv_phys_offset; | ||
178 | extern u64 __pv_offset; | ||
179 | extern void fixup_pv_table(const void *, unsigned long); | ||
180 | extern const void *__pv_table_begin, *__pv_table_end; | ||
175 | 181 | ||
176 | extern unsigned long __pv_phys_offset; | ||
177 | #define PHYS_OFFSET __pv_phys_offset | 182 | #define PHYS_OFFSET __pv_phys_offset |
178 | 183 | ||
179 | #define __pv_stub(from,to,instr,type) \ | 184 | #define __pv_stub(from,to,instr,type) \ |
@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset; | |||
185 | : "=r" (to) \ | 190 | : "=r" (to) \ |
186 | : "r" (from), "I" (type)) | 191 | : "r" (from), "I" (type)) |
187 | 192 | ||
188 | static inline unsigned long __virt_to_phys(unsigned long x) | 193 | #define __pv_stub_mov_hi(t) \ |
194 | __asm__ volatile("@ __pv_stub_mov\n" \ | ||
195 | "1: mov %R0, %1\n" \ | ||
196 | " .pushsection .pv_table,\"a\"\n" \ | ||
197 | " .long 1b\n" \ | ||
198 | " .popsection\n" \ | ||
199 | : "=r" (t) \ | ||
200 | : "I" (__PV_BITS_7_0)) | ||
201 | |||
202 | #define __pv_add_carry_stub(x, y) \ | ||
203 | __asm__ volatile("@ __pv_add_carry_stub\n" \ | ||
204 | "1: adds %Q0, %1, %2\n" \ | ||
205 | " adc %R0, %R0, #0\n" \ | ||
206 | " .pushsection .pv_table,\"a\"\n" \ | ||
207 | " .long 1b\n" \ | ||
208 | " .popsection\n" \ | ||
209 | : "+r" (y) \ | ||
210 | : "r" (x), "I" (__PV_BITS_31_24) \ | ||
211 | : "cc") | ||
212 | |||
213 | static inline phys_addr_t __virt_to_phys(unsigned long x) | ||
189 | { | 214 | { |
190 | unsigned long t; | 215 | phys_addr_t t; |
191 | __pv_stub(x, t, "add", __PV_BITS_31_24); | 216 | |
217 | if (sizeof(phys_addr_t) == 4) { | ||
218 | __pv_stub(x, t, "add", __PV_BITS_31_24); | ||
219 | } else { | ||
220 | __pv_stub_mov_hi(t); | ||
221 | __pv_add_carry_stub(x, t); | ||
222 | } | ||
192 | return t; | 223 | return t; |
193 | } | 224 | } |
194 | 225 | ||
195 | static inline unsigned long __phys_to_virt(unsigned long x) | 226 | static inline unsigned long __phys_to_virt(phys_addr_t x) |
196 | { | 227 | { |
197 | unsigned long t; | 228 | unsigned long t; |
198 | __pv_stub(x, t, "sub", __PV_BITS_31_24); | 229 | __pv_stub(x, t, "sub", __PV_BITS_31_24); |
199 | return t; | 230 | return t; |
200 | } | 231 | } |
232 | |||
201 | #else | 233 | #else |
202 | #define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET) | 234 | |
203 | #define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) | 235 | static inline phys_addr_t __virt_to_phys(unsigned long x) |
236 | { | ||
237 | return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET; | ||
238 | } | ||
239 | |||
240 | static inline unsigned long __phys_to_virt(phys_addr_t x) | ||
241 | { | ||
242 | return x - PHYS_OFFSET + PAGE_OFFSET; | ||
243 | } | ||
244 | |||
204 | #endif | 245 | #endif |
205 | #endif | 246 | #endif |
206 | #endif /* __ASSEMBLY__ */ | 247 | #endif /* __ASSEMBLY__ */ |
@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x) | |||
238 | 279 | ||
239 | static inline void *phys_to_virt(phys_addr_t x) | 280 | static inline void *phys_to_virt(phys_addr_t x) |
240 | { | 281 | { |
241 | return (void *)(__phys_to_virt((unsigned long)(x))); | 282 | return (void *)__phys_to_virt(x); |
242 | } | 283 | } |
243 | 284 | ||
244 | /* | 285 | /* |
245 | * Drivers should NOT use these either. | 286 | * Drivers should NOT use these either. |
246 | */ | 287 | */ |
247 | #define __pa(x) __virt_to_phys((unsigned long)(x)) | 288 | #define __pa(x) __virt_to_phys((unsigned long)(x)) |
248 | #define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) | 289 | #define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x))) |
249 | #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) | 290 | #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) |
250 | 291 | ||
292 | extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x); | ||
293 | |||
294 | /* | ||
295 | * These are for systems that have a hardware interconnect supported alias of | ||
296 | * physical memory for idmap purposes. Most cases should leave these | ||
297 | * untouched. | ||
298 | */ | ||
299 | static inline phys_addr_t __virt_to_idmap(unsigned long x) | ||
300 | { | ||
301 | if (arch_virt_to_idmap) | ||
302 | return arch_virt_to_idmap(x); | ||
303 | else | ||
304 | return __virt_to_phys(x); | ||
305 | } | ||
306 | |||
307 | #define virt_to_idmap(x) __virt_to_idmap((unsigned long)(x)) | ||
308 | |||
251 | /* | 309 | /* |
252 | * Virtual <-> DMA view memory address translations | 310 | * Virtual <-> DMA view memory address translations |
253 | * Again, these are *only* valid on the kernel direct mapped RAM | 311 | * Again, these are *only* valid on the kernel direct mapped RAM |
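With a 64-bit phys_addr_t, __virt_to_phys() now emits three patchable instructions instead of one: a mov of the offset's high word (rewritten to mvn when __pv_offset is negative) plus an adds/adc pair that adds the low word and carries into the high word. A plain-C model of what the patched sequence computes once boot code has fixed up __pv_offset (a model only, LPAE case assumed):

    #include <linux/types.h>

    static inline u64 model_virt_to_phys(unsigned long x, u64 pv_offset)
    {
        u32 lo = (u32)pv_offset;         /* top byte feeds the adds imm */
        u32 hi = (u32)(pv_offset >> 32); /* patched into the mov/mvn    */
        u64 t;

        t  = (u64)hi << 32;              /* __pv_stub_mov_hi()          */
        t += (u64)x + lo;                /* __pv_add_carry_stub(): adds */
                                         /* low words, adc carries up   */
        return t;
    }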
diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 6f18da09668b..64fd15159b7d 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h | |||
@@ -16,7 +16,7 @@ typedef struct { | |||
16 | #ifdef CONFIG_CPU_HAS_ASID | 16 | #ifdef CONFIG_CPU_HAS_ASID |
17 | #define ASID_BITS 8 | 17 | #define ASID_BITS 8 |
18 | #define ASID_MASK ((~0ULL) << ASID_BITS) | 18 | #define ASID_MASK ((~0ULL) << ASID_BITS) |
19 | #define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) | 19 | #define ASID(mm) ((unsigned int)((mm)->context.id.counter & ~ASID_MASK)) |
20 | #else | 20 | #else |
21 | #define ASID(mm) (0) | 21 | #define ASID(mm) (0) |
22 | #endif | 22 | #endif |
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 413f3876341c..c3d5fc124a05 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/hw_breakpoint.h> | 22 | #include <asm/hw_breakpoint.h> |
23 | #include <asm/ptrace.h> | 23 | #include <asm/ptrace.h> |
24 | #include <asm/types.h> | 24 | #include <asm/types.h> |
25 | #include <asm/unified.h> | ||
25 | 26 | ||
26 | #ifdef __KERNEL__ | 27 | #ifdef __KERNEL__ |
27 | #define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \ | 28 | #define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \ |
@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p); | |||
87 | #define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc | 88 | #define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc |
88 | #define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp | 89 | #define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp |
89 | 90 | ||
91 | #ifdef CONFIG_SMP | ||
92 | #define __ALT_SMP_ASM(smp, up) \ | ||
93 | "9998: " smp "\n" \ | ||
94 | " .pushsection \".alt.smp.init\", \"a\"\n" \ | ||
95 | " .long 9998b\n" \ | ||
96 | " " up "\n" \ | ||
97 | " .popsection\n" | ||
98 | #else | ||
99 | #define __ALT_SMP_ASM(smp, up) up | ||
100 | #endif | ||
101 | |||
90 | /* | 102 | /* |
91 | * Prefetching support - only ARMv5. | 103 | * Prefetching support - only ARMv5. |
92 | */ | 104 | */ |
@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr) | |||
97 | { | 109 | { |
98 | __asm__ __volatile__( | 110 | __asm__ __volatile__( |
99 | "pld\t%a0" | 111 | "pld\t%a0" |
100 | : | 112 | :: "p" (ptr)); |
101 | : "p" (ptr) | ||
102 | : "cc"); | ||
103 | } | 113 | } |
104 | 114 | ||
115 | #if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) | ||
105 | #define ARCH_HAS_PREFETCHW | 116 | #define ARCH_HAS_PREFETCHW |
106 | #define prefetchw(ptr) prefetch(ptr) | 117 | static inline void prefetchw(const void *ptr) |
107 | 118 | { | |
108 | #define ARCH_HAS_SPINLOCK_PREFETCH | 119 | __asm__ __volatile__( |
109 | #define spin_lock_prefetch(x) do { } while (0) | 120 | ".arch_extension mp\n" |
110 | 121 | __ALT_SMP_ASM( | |
122 | WASM(pldw) "\t%a0", | ||
123 | WASM(pld) "\t%a0" | ||
124 | ) | ||
125 | :: "p" (ptr)); | ||
126 | } | ||
127 | #endif | ||
111 | #endif | 128 | #endif |
112 | 129 | ||
113 | #define HAVE_ARCH_PICK_MMAP_LAYOUT | 130 | #define HAVE_ARCH_PICK_MMAP_LAYOUT |
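__ALT_SMP_ASM() above generalizes the SMP/UP alternatives mechanism that spinlock.h used to keep private: the SMP instruction is emitted inline and its address recorded in .alt.smp.init, so early boot can patch in the UP variant on uniprocessor machines; the new prefetchw() rides on it to issue pldw (an MP extension that fetches the line for writing) on SMP and a plain pld otherwise. A sketch of defining a new alternative with it (the macro and function names are hypothetical; WASM() comes from the unified.h hunk below):

    #define MY_SEV  __ALT_SMP_ASM(WASM(sev), WASM(nop))

    static inline void my_wakeup(void)
    {
        /* dsb so prior stores are visible before waking waiters;
         * MY_SEV degrades to a nop once patched for UP */
        __asm__ __volatile__("dsb\n" MY_SEV);
    }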
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h index a8cae71caceb..22a3b9b5d4a1 100644 --- a/arch/arm/include/asm/smp.h +++ b/arch/arm/include/asm/smp.h | |||
@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu); | |||
84 | extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); | 84 | extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); |
85 | extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); | 85 | extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); |
86 | 86 | ||
87 | extern int register_ipi_completion(struct completion *completion, int cpu); | ||
88 | |||
87 | struct smp_operations { | 89 | struct smp_operations { |
88 | #ifdef CONFIG_SMP | 90 | #ifdef CONFIG_SMP |
89 | /* | 91 | /* |
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index ed6c22919e47..ef3c6072aa45 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h | |||
@@ -5,21 +5,13 @@ | |||
5 | #error SMP not supported on pre-ARMv6 CPUs | 5 | #error SMP not supported on pre-ARMv6 CPUs |
6 | #endif | 6 | #endif |
7 | 7 | ||
8 | #include <asm/processor.h> | 8 | #include <linux/prefetch.h> |
9 | 9 | ||
10 | /* | 10 | /* |
11 | * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K | 11 | * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K |
12 | * extensions, so when running on UP, we have to patch these instructions away. | 12 | * extensions, so when running on UP, we have to patch these instructions away. |
13 | */ | 13 | */ |
14 | #define ALT_SMP(smp, up) \ | ||
15 | "9998: " smp "\n" \ | ||
16 | " .pushsection \".alt.smp.init\", \"a\"\n" \ | ||
17 | " .long 9998b\n" \ | ||
18 | " " up "\n" \ | ||
19 | " .popsection\n" | ||
20 | |||
21 | #ifdef CONFIG_THUMB2_KERNEL | 14 | #ifdef CONFIG_THUMB2_KERNEL |
22 | #define SEV ALT_SMP("sev.w", "nop.w") | ||
23 | /* | 15 | /* |
24 | * For Thumb-2, special care is needed to ensure that the conditional WFE | 16 | * For Thumb-2, special care is needed to ensure that the conditional WFE |
25 | * instruction really does assemble to exactly 4 bytes (as required by | 17 | * instruction really does assemble to exactly 4 bytes (as required by |
@@ -31,17 +23,18 @@ | |||
31 | * the assembler won't change IT instructions which are explicitly present | 23 | * the assembler won't change IT instructions which are explicitly present |
32 | * in the input. | 24 | * in the input. |
33 | */ | 25 | */ |
34 | #define WFE(cond) ALT_SMP( \ | 26 | #define WFE(cond) __ALT_SMP_ASM( \ |
35 | "it " cond "\n\t" \ | 27 | "it " cond "\n\t" \ |
36 | "wfe" cond ".n", \ | 28 | "wfe" cond ".n", \ |
37 | \ | 29 | \ |
38 | "nop.w" \ | 30 | "nop.w" \ |
39 | ) | 31 | ) |
40 | #else | 32 | #else |
41 | #define SEV ALT_SMP("sev", "nop") | 33 | #define WFE(cond) __ALT_SMP_ASM("wfe" cond, "nop") |
42 | #define WFE(cond) ALT_SMP("wfe" cond, "nop") | ||
43 | #endif | 34 | #endif |
44 | 35 | ||
36 | #define SEV __ALT_SMP_ASM(WASM(sev), WASM(nop)) | ||
37 | |||
45 | static inline void dsb_sev(void) | 38 | static inline void dsb_sev(void) |
46 | { | 39 | { |
47 | #if __LINUX_ARM_ARCH__ >= 7 | 40 | #if __LINUX_ARM_ARCH__ >= 7 |
@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) | |||
77 | u32 newval; | 70 | u32 newval; |
78 | arch_spinlock_t lockval; | 71 | arch_spinlock_t lockval; |
79 | 72 | ||
73 | prefetchw(&lock->slock); | ||
80 | __asm__ __volatile__( | 74 | __asm__ __volatile__( |
81 | "1: ldrex %0, [%3]\n" | 75 | "1: ldrex %0, [%3]\n" |
82 | " add %1, %0, %4\n" | 76 | " add %1, %0, %4\n" |
@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) | |||
100 | unsigned long contended, res; | 94 | unsigned long contended, res; |
101 | u32 slock; | 95 | u32 slock; |
102 | 96 | ||
97 | prefetchw(&lock->slock); | ||
103 | do { | 98 | do { |
104 | __asm__ __volatile__( | 99 | __asm__ __volatile__( |
105 | " ldrex %0, [%3]\n" | 100 | " ldrex %0, [%3]\n" |
@@ -156,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw) | |||
156 | { | 151 | { |
157 | unsigned long tmp; | 152 | unsigned long tmp; |
158 | 153 | ||
154 | prefetchw(&rw->lock); | ||
159 | __asm__ __volatile__( | 155 | __asm__ __volatile__( |
160 | "1: ldrex %0, [%1]\n" | 156 | "1: ldrex %0, [%1]\n" |
161 | " teq %0, #0\n" | 157 | " teq %0, #0\n" |
@@ -174,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) | |||
174 | { | 170 | { |
175 | unsigned long contended, res; | 171 | unsigned long contended, res; |
176 | 172 | ||
173 | prefetchw(&rw->lock); | ||
177 | do { | 174 | do { |
178 | __asm__ __volatile__( | 175 | __asm__ __volatile__( |
179 | " ldrex %0, [%2]\n" | 176 | " ldrex %0, [%2]\n" |
@@ -207,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) | |||
207 | } | 204 | } |
208 | 205 | ||
209 | /* write_can_lock - would write_trylock() succeed? */ | 206 | /* write_can_lock - would write_trylock() succeed? */ |
210 | #define arch_write_can_lock(x) ((x)->lock == 0) | 207 | #define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0) |
211 | 208 | ||
212 | /* | 209 | /* |
213 | * Read locks are a bit more hairy: | 210 | * Read locks are a bit more hairy: |
@@ -225,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw) | |||
225 | { | 222 | { |
226 | unsigned long tmp, tmp2; | 223 | unsigned long tmp, tmp2; |
227 | 224 | ||
225 | prefetchw(&rw->lock); | ||
228 | __asm__ __volatile__( | 226 | __asm__ __volatile__( |
229 | "1: ldrex %0, [%2]\n" | 227 | "1: ldrex %0, [%2]\n" |
230 | " adds %0, %0, #1\n" | 228 | " adds %0, %0, #1\n" |
@@ -245,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) | |||
245 | 243 | ||
246 | smp_mb(); | 244 | smp_mb(); |
247 | 245 | ||
246 | prefetchw(&rw->lock); | ||
248 | __asm__ __volatile__( | 247 | __asm__ __volatile__( |
249 | "1: ldrex %0, [%2]\n" | 248 | "1: ldrex %0, [%2]\n" |
250 | " sub %0, %0, #1\n" | 249 | " sub %0, %0, #1\n" |
@@ -263,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) | |||
263 | { | 262 | { |
264 | unsigned long contended, res; | 263 | unsigned long contended, res; |
265 | 264 | ||
265 | prefetchw(&rw->lock); | ||
266 | do { | 266 | do { |
267 | __asm__ __volatile__( | 267 | __asm__ __volatile__( |
268 | " ldrex %0, [%2]\n" | 268 | " ldrex %0, [%2]\n" |
@@ -284,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) | |||
284 | } | 284 | } |
285 | 285 | ||
286 | /* read_can_lock - would read_trylock() succeed? */ | 286 | /* read_can_lock - would read_trylock() succeed? */ |
287 | #define arch_read_can_lock(x) ((x)->lock < 0x80000000) | 287 | #define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000) |
288 | 288 | ||
289 | #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) | 289 | #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) |
290 | #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) | 290 | #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) |
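The ACCESS_ONCE() additions pair with the spinlock_types.h hunk just below: once the rwlock word is a plain u32 rather than volatile, nothing stops the compiler from caching it across iterations of a polling loop, so the can_lock predicates force a fresh load per evaluation. A kernel-internal sketch of the failure mode being closed (the poll loop is hypothetical):

    static void spin_until_write_lockable(arch_rwlock_t *rw)
    {
        /* without ACCESS_ONCE() the load of rw->lock could be
         * hoisted out of the loop, spinning on a stale value */
        while (ACCESS_ONCE(rw->lock) != 0)
            cpu_relax();
    }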
diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index b262d2f8b478..47663fcb10ad 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h | |||
@@ -25,7 +25,7 @@ typedef struct { | |||
25 | #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } | 25 | #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } |
26 | 26 | ||
27 | typedef struct { | 27 | typedef struct { |
28 | volatile unsigned int lock; | 28 | u32 lock; |
29 | } arch_rwlock_t; | 29 | } arch_rwlock_t; |
30 | 30 | ||
31 | #define __ARCH_RW_LOCK_UNLOCKED { 0 } | 31 | #define __ARCH_RW_LOCK_UNLOCKED { 0 } |
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h index f5989f46b4d2..b88beaba6b4a 100644 --- a/arch/arm/include/asm/unified.h +++ b/arch/arm/include/asm/unified.h | |||
@@ -38,6 +38,8 @@ | |||
38 | #ifdef __ASSEMBLY__ | 38 | #ifdef __ASSEMBLY__ |
39 | #define W(instr) instr.w | 39 | #define W(instr) instr.w |
40 | #define BSYM(sym) sym + 1 | 40 | #define BSYM(sym) sym + 1 |
41 | #else | ||
42 | #define WASM(instr) #instr ".w" | ||
41 | #endif | 43 | #endif |
42 | 44 | ||
43 | #else /* !CONFIG_THUMB2_KERNEL */ | 45 | #else /* !CONFIG_THUMB2_KERNEL */ |
@@ -50,6 +52,8 @@ | |||
50 | #ifdef __ASSEMBLY__ | 52 | #ifdef __ASSEMBLY__ |
51 | #define W(instr) instr | 53 | #define W(instr) instr |
52 | #define BSYM(sym) sym | 54 | #define BSYM(sym) sym |
55 | #else | ||
56 | #define WASM(instr) #instr | ||
53 | #endif | 57 | #endif |
54 | 58 | ||
55 | #endif /* CONFIG_THUMB2_KERNEL */ | 59 | #endif /* CONFIG_THUMB2_KERNEL */ |
diff --git a/arch/arm/include/debug/pl01x.S b/arch/arm/include/debug/pl01x.S index 37c6895b87e6..92ef808a2337 100644 --- a/arch/arm/include/debug/pl01x.S +++ b/arch/arm/include/debug/pl01x.S | |||
@@ -25,12 +25,14 @@ | |||
25 | 25 | ||
26 | .macro waituart,rd,rx | 26 | .macro waituart,rd,rx |
27 | 1001: ldr \rd, [\rx, #UART01x_FR] | 27 | 1001: ldr \rd, [\rx, #UART01x_FR] |
28 | ARM_BE8( rev \rd, \rd ) | ||
28 | tst \rd, #UART01x_FR_TXFF | 29 | tst \rd, #UART01x_FR_TXFF |
29 | bne 1001b | 30 | bne 1001b |
30 | .endm | 31 | .endm |
31 | 32 | ||
32 | .macro busyuart,rd,rx | 33 | .macro busyuart,rd,rx |
33 | 1001: ldr \rd, [\rx, #UART01x_FR] | 34 | 1001: ldr \rd, [\rx, #UART01x_FR] |
35 | ARM_BE8( rev \rd, \rd ) | ||
34 | tst \rd, #UART01x_FR_BUSY | 36 | tst \rd, #UART01x_FR_BUSY |
35 | bne 1001b | 37 | bne 1001b |
36 | .endm | 38 | .endm |
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index 18d76fd5a2af..70a1c9da30ca 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild | |||
@@ -7,6 +7,7 @@ header-y += hwcap.h | |||
7 | header-y += ioctls.h | 7 | header-y += ioctls.h |
8 | header-y += kvm_para.h | 8 | header-y += kvm_para.h |
9 | header-y += mman.h | 9 | header-y += mman.h |
10 | header-y += perf_regs.h | ||
10 | header-y += posix_types.h | 11 | header-y += posix_types.h |
11 | header-y += ptrace.h | 12 | header-y += ptrace.h |
12 | header-y += setup.h | 13 | header-y += setup.h |
diff --git a/arch/arm/include/uapi/asm/perf_regs.h b/arch/arm/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000000..ce59448458b2 --- /dev/null +++ b/arch/arm/include/uapi/asm/perf_regs.h | |||
@@ -0,0 +1,23 @@ | |||
1 | #ifndef _ASM_ARM_PERF_REGS_H | ||
2 | #define _ASM_ARM_PERF_REGS_H | ||
3 | |||
4 | enum perf_event_arm_regs { | ||
5 | PERF_REG_ARM_R0, | ||
6 | PERF_REG_ARM_R1, | ||
7 | PERF_REG_ARM_R2, | ||
8 | PERF_REG_ARM_R3, | ||
9 | PERF_REG_ARM_R4, | ||
10 | PERF_REG_ARM_R5, | ||
11 | PERF_REG_ARM_R6, | ||
12 | PERF_REG_ARM_R7, | ||
13 | PERF_REG_ARM_R8, | ||
14 | PERF_REG_ARM_R9, | ||
15 | PERF_REG_ARM_R10, | ||
16 | PERF_REG_ARM_FP, | ||
17 | PERF_REG_ARM_IP, | ||
18 | PERF_REG_ARM_SP, | ||
19 | PERF_REG_ARM_LR, | ||
20 | PERF_REG_ARM_PC, | ||
21 | PERF_REG_ARM_MAX, | ||
22 | }; | ||
23 | #endif /* _ASM_ARM_PERF_REGS_H */ | ||
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 5140df5f23aa..a30fc9be9e9e 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile | |||
@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg | |||
17 | 17 | ||
18 | obj-y := elf.o entry-common.o irq.o opcodes.o \ | 18 | obj-y := elf.o entry-common.o irq.o opcodes.o \ |
19 | process.o ptrace.o return_address.o \ | 19 | process.o ptrace.o return_address.o \ |
20 | setup.o signal.o stacktrace.o sys_arm.o time.o traps.o | 20 | setup.o signal.o sigreturn_codes.o \ |
21 | stacktrace.o sys_arm.o time.o traps.o | ||
21 | 22 | ||
22 | obj-$(CONFIG_ATAGS) += atags_parse.o | 23 | obj-$(CONFIG_ATAGS) += atags_parse.o |
23 | obj-$(CONFIG_ATAGS_PROC) += atags_proc.o | 24 | obj-$(CONFIG_ATAGS_PROC) += atags_proc.o |
@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o | |||
78 | obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o | 79 | obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o |
79 | obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o | 80 | obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o |
80 | obj-$(CONFIG_IWMMXT) += iwmmxt.o | 81 | obj-$(CONFIG_IWMMXT) += iwmmxt.o |
82 | obj-$(CONFIG_PERF_EVENTS) += perf_regs.o | ||
81 | obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o | 83 | obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o |
82 | AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt | 84 | AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt |
83 | obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o | 85 | obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o |
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 60d3b738d420..1f031ddd0667 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c | |||
@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc); | |||
155 | 155 | ||
156 | #ifdef CONFIG_ARM_PATCH_PHYS_VIRT | 156 | #ifdef CONFIG_ARM_PATCH_PHYS_VIRT |
157 | EXPORT_SYMBOL(__pv_phys_offset); | 157 | EXPORT_SYMBOL(__pv_phys_offset); |
158 | EXPORT_SYMBOL(__pv_offset); | ||
158 | #endif | 159 | #endif |
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9cbe70c8b0ef..55090fbb81a2 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S | |||
@@ -416,9 +416,8 @@ __und_usr: | |||
416 | bne __und_usr_thumb | 416 | bne __und_usr_thumb |
417 | sub r4, r2, #4 @ ARM instr at LR - 4 | 417 | sub r4, r2, #4 @ ARM instr at LR - 4 |
418 | 1: ldrt r0, [r4] | 418 | 1: ldrt r0, [r4] |
419 | #ifdef CONFIG_CPU_ENDIAN_BE8 | 419 | ARM_BE8(rev r0, r0) @ little endian instruction |
420 | rev r0, r0 @ little endian instruction | 420 | |
421 | #endif | ||
422 | @ r0 = 32-bit ARM instruction which caused the exception | 421 | @ r0 = 32-bit ARM instruction which caused the exception |
423 | @ r2 = PC value for the following instruction (:= regs->ARM_pc) | 422 | @ r2 = PC value for the following instruction (:= regs->ARM_pc) |
424 | @ r4 = PC value for the faulting instruction | 423 | @ r4 = PC value for the faulting instruction |
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index bc6bd9683ba4..a2dcafdf1bc8 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S | |||
@@ -393,9 +393,7 @@ ENTRY(vector_swi) | |||
393 | #else | 393 | #else |
394 | USER( ldr r10, [lr, #-4] ) @ get SWI instruction | 394 | USER( ldr r10, [lr, #-4] ) @ get SWI instruction |
395 | #endif | 395 | #endif |
396 | #ifdef CONFIG_CPU_ENDIAN_BE8 | 396 | ARM_BE8(rev r10, r10) @ little endian instruction |
397 | rev r10, r10 @ little endian instruction | ||
398 | #endif | ||
399 | 397 | ||
400 | #elif defined(CONFIG_AEABI) | 398 | #elif defined(CONFIG_AEABI) |
401 | 399 | ||
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 476de57dcef2..7801866e626a 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S | |||
@@ -77,6 +77,7 @@ | |||
77 | 77 | ||
78 | __HEAD | 78 | __HEAD |
79 | ENTRY(stext) | 79 | ENTRY(stext) |
80 | ARM_BE8(setend be ) @ ensure we are in BE8 mode | ||
80 | 81 | ||
81 | THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM. | 82 | THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM. |
82 | THUMB( bx r9 ) @ If this is a Thumb-2 kernel, | 83 | THUMB( bx r9 ) @ If this is a Thumb-2 kernel, |
@@ -352,6 +353,9 @@ ENTRY(secondary_startup) | |||
352 | * the processor type - there is no need to check the machine type | 353 | * the processor type - there is no need to check the machine type |
353 | * as it has already been validated by the primary processor. | 354 | * as it has already been validated by the primary processor. |
354 | */ | 355 | */ |
356 | |||
357 | ARM_BE8(setend be) @ ensure we are in BE8 mode | ||
358 | |||
355 | #ifdef CONFIG_ARM_VIRT_EXT | 359 | #ifdef CONFIG_ARM_VIRT_EXT |
356 | bl __hyp_stub_install_secondary | 360 | bl __hyp_stub_install_secondary |
357 | #endif | 361 | #endif |
@@ -555,6 +559,14 @@ ENTRY(fixup_smp) | |||
555 | ldmfd sp!, {r4 - r6, pc} | 559 | ldmfd sp!, {r4 - r6, pc} |
556 | ENDPROC(fixup_smp) | 560 | ENDPROC(fixup_smp) |
557 | 561 | ||
562 | #ifdef __ARMEB__ | ||
563 | #define LOW_OFFSET 0x4 | ||
564 | #define HIGH_OFFSET 0x0 | ||
565 | #else | ||
566 | #define LOW_OFFSET 0x0 | ||
567 | #define HIGH_OFFSET 0x4 | ||
568 | #endif | ||
569 | |||
558 | #ifdef CONFIG_ARM_PATCH_PHYS_VIRT | 570 | #ifdef CONFIG_ARM_PATCH_PHYS_VIRT |
559 | 571 | ||
560 | /* __fixup_pv_table - patch the stub instructions with the delta between | 572 | /* __fixup_pv_table - patch the stub instructions with the delta between |
@@ -565,17 +577,20 @@ ENDPROC(fixup_smp) | |||
565 | __HEAD | 577 | __HEAD |
566 | __fixup_pv_table: | 578 | __fixup_pv_table: |
567 | adr r0, 1f | 579 | adr r0, 1f |
568 | ldmia r0, {r3-r5, r7} | 580 | ldmia r0, {r3-r7} |
569 | sub r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET | 581 | mvn ip, #0 |
582 | subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET | ||
570 | add r4, r4, r3 @ adjust table start address | 583 | add r4, r4, r3 @ adjust table start address |
571 | add r5, r5, r3 @ adjust table end address | 584 | add r5, r5, r3 @ adjust table end address |
572 | add r7, r7, r3 @ adjust __pv_phys_offset address | 585 | add r6, r6, r3 @ adjust __pv_phys_offset address |
573 | str r8, [r7] @ save computed PHYS_OFFSET to __pv_phys_offset | 586 | add r7, r7, r3 @ adjust __pv_offset address |
587 | str r8, [r6, #LOW_OFFSET] @ save computed PHYS_OFFSET to __pv_phys_offset | ||
588 | strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits | ||
574 | mov r6, r3, lsr #24 @ constant for add/sub instructions | 589 | mov r6, r3, lsr #24 @ constant for add/sub instructions |
575 | teq r3, r6, lsl #24 @ must be 16MiB aligned | 590 | teq r3, r6, lsl #24 @ must be 16MiB aligned |
576 | THUMB( it ne @ cross section branch ) | 591 | THUMB( it ne @ cross section branch ) |
577 | bne __error | 592 | bne __error |
578 | str r6, [r7, #4] @ save to __pv_offset | 593 | str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits |
579 | b __fixup_a_pv_table | 594 | b __fixup_a_pv_table |
580 | ENDPROC(__fixup_pv_table) | 595 | ENDPROC(__fixup_pv_table) |
581 | 596 | ||
@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table) | |||
584 | .long __pv_table_begin | 599 | .long __pv_table_begin |
585 | .long __pv_table_end | 600 | .long __pv_table_end |
586 | 2: .long __pv_phys_offset | 601 | 2: .long __pv_phys_offset |
602 | .long __pv_offset | ||
587 | 603 | ||
588 | .text | 604 | .text |
589 | __fixup_a_pv_table: | 605 | __fixup_a_pv_table: |
606 | adr r0, 3f | ||
607 | ldr r6, [r0] | ||
608 | add r6, r6, r3 | ||
609 | ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word | ||
610 | ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word | ||
611 | mov r6, r6, lsr #24 | ||
612 | cmn r0, #1 | ||
590 | #ifdef CONFIG_THUMB2_KERNEL | 613 | #ifdef CONFIG_THUMB2_KERNEL |
614 | moveq r0, #0x200000 @ set bit 21, mov to mvn instruction | ||
591 | lsls r6, #24 | 615 | lsls r6, #24 |
592 | beq 2f | 616 | beq 2f |
593 | clz r7, r6 | 617 | clz r7, r6 |
@@ -601,18 +625,42 @@ __fixup_a_pv_table: | |||
601 | b 2f | 625 | b 2f |
602 | 1: add r7, r3 | 626 | 1: add r7, r3 |
603 | ldrh ip, [r7, #2] | 627 | ldrh ip, [r7, #2] |
604 | and ip, 0x8f00 | 628 | ARM_BE8(rev16 ip, ip) |
605 | orr ip, r6 @ mask in offset bits 31-24 | 629 | tst ip, #0x4000 |
630 | and ip, #0x8f00 | ||
631 | orrne ip, r6 @ mask in offset bits 31-24 | ||
632 | orreq ip, r0 @ mask in offset bits 7-0 | ||
633 | ARM_BE8(rev16 ip, ip) | ||
606 | strh ip, [r7, #2] | 634 | strh ip, [r7, #2] |
635 | bne 2f | ||
636 | ldrh ip, [r7] | ||
637 | ARM_BE8(rev16 ip, ip) | ||
638 | bic ip, #0x20 | ||
639 | orr ip, ip, r0, lsr #16 | ||
640 | ARM_BE8(rev16 ip, ip) | ||
641 | strh ip, [r7] | ||
607 | 2: cmp r4, r5 | 642 | 2: cmp r4, r5 |
608 | ldrcc r7, [r4], #4 @ use branch for delay slot | 643 | ldrcc r7, [r4], #4 @ use branch for delay slot |
609 | bcc 1b | 644 | bcc 1b |
610 | bx lr | 645 | bx lr |
611 | #else | 646 | #else |
647 | moveq r0, #0x400000 @ set bit 22, mov to mvn instruction | ||
612 | b 2f | 648 | b 2f |
613 | 1: ldr ip, [r7, r3] | 649 | 1: ldr ip, [r7, r3] |
650 | #ifdef CONFIG_CPU_ENDIAN_BE8 | ||
651 | @ in BE8, we load data in BE, but instructions still in LE | ||
652 | bic ip, ip, #0xff000000 | ||
653 | tst ip, #0x000f0000 @ check the rotation field | ||
654 | orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24 | ||
655 | biceq ip, ip, #0x00004000 @ clear bit 22 | ||
656 | orreq ip, ip, r0, lsl #24 @ mask in offset bits 7-0 | ||
657 | #else | ||
614 | bic ip, ip, #0x000000ff | 658 | bic ip, ip, #0x000000ff |
615 | orr ip, ip, r6 @ mask in offset bits 31-24 | 659 | tst ip, #0xf00 @ check the rotation field |
660 | orrne ip, ip, r6 @ mask in offset bits 31-24 | ||
661 | biceq ip, ip, #0x400000 @ clear bit 22 | ||
662 | orreq ip, ip, r0 @ mask in offset bits 7-0 | ||
663 | #endif | ||
616 | str ip, [r7, r3] | 664 | str ip, [r7, r3] |
617 | 2: cmp r4, r5 | 665 | 2: cmp r4, r5 |
618 | ldrcc r7, [r4], #4 @ use branch for delay slot | 666 | ldrcc r7, [r4], #4 @ use branch for delay slot |
@@ -621,28 +669,30 @@ __fixup_a_pv_table: | |||
621 | #endif | 669 | #endif |
622 | ENDPROC(__fixup_a_pv_table) | 670 | ENDPROC(__fixup_a_pv_table) |
623 | 671 | ||
672 | .align | ||
673 | 3: .long __pv_offset | ||
674 | |||
624 | ENTRY(fixup_pv_table) | 675 | ENTRY(fixup_pv_table) |
625 | stmfd sp!, {r4 - r7, lr} | 676 | stmfd sp!, {r4 - r7, lr} |
626 | ldr r2, 2f @ get address of __pv_phys_offset | ||
627 | mov r3, #0 @ no offset | 677 | mov r3, #0 @ no offset |
628 | mov r4, r0 @ r0 = table start | 678 | mov r4, r0 @ r0 = table start |
629 | add r5, r0, r1 @ r1 = table size | 679 | add r5, r0, r1 @ r1 = table size |
630 | ldr r6, [r2, #4] @ get __pv_offset | ||
631 | bl __fixup_a_pv_table | 680 | bl __fixup_a_pv_table |
632 | ldmfd sp!, {r4 - r7, pc} | 681 | ldmfd sp!, {r4 - r7, pc} |
633 | ENDPROC(fixup_pv_table) | 682 | ENDPROC(fixup_pv_table) |
634 | 683 | ||
635 | .align | ||
636 | 2: .long __pv_phys_offset | ||
637 | |||
638 | .data | 684 | .data |
639 | .globl __pv_phys_offset | 685 | .globl __pv_phys_offset |
640 | .type __pv_phys_offset, %object | 686 | .type __pv_phys_offset, %object |
641 | __pv_phys_offset: | 687 | __pv_phys_offset: |
642 | .long 0 | 688 | .quad 0 |
643 | .size __pv_phys_offset, . - __pv_phys_offset | 689 | .size __pv_phys_offset, . -__pv_phys_offset |
690 | |||
691 | .globl __pv_offset | ||
692 | .type __pv_offset, %object | ||
644 | __pv_offset: | 693 | __pv_offset: |
645 | .long 0 | 694 | .quad 0 |
695 | .size __pv_offset, . -__pv_offset | ||
646 | #endif | 696 | #endif |
647 | 697 | ||
648 | #include "head-common.S" | 698 | #include "head-common.S" |
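The widened fixup loop now patches two instruction flavours per pv_table site: stubs whose immediate has a non-zero rotation field (the add/sub of __PV_BITS_31_24) receive the offset's top byte, while the rotation-free mov of __PV_BITS_7_0 receives the offset's high word, rewritten to mvn (bit 22, the 0x400000 loaded into r0) when that word is 0xffffffff, i.e. a negative offset. A loose C model of the ARM-mode branch (LE kernel assumed; the BE8 branch additionally byte-swaps, and the Thumb-2 branch splits the work across halfwords):

    #include <linux/types.h>

    static void model_fixup_pv_insn(u32 *insn, u32 off_top_byte, u32 mov_bits)
    {
        u32 ip = *insn & ~0x000000ffu;  /* drop the old immediate     */

        if (ip & 0xf00) {               /* rotation set: add/sub      */
            ip |= off_top_byte;         /*   offset bits 31-24        */
        } else {                        /* rotation clear: mov/mvn    */
            ip &= ~0x400000u;           /*   reset to mov             */
            ip |= mov_bits;             /*   high-word byte, or the   */
        }                               /*   mvn bit if negative      */
        *insn = ip;
    }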
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 084dc8896986..5fdb4038f969 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/sections.h> | 24 | #include <asm/sections.h> |
25 | #include <asm/smp_plat.h> | 25 | #include <asm/smp_plat.h> |
26 | #include <asm/unwind.h> | 26 | #include <asm/unwind.h> |
27 | #include <asm/opcodes.h> | ||
27 | 28 | ||
28 | #ifdef CONFIG_XIP_KERNEL | 29 | #ifdef CONFIG_XIP_KERNEL |
29 | /* | 30 | /* |
@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
60 | Elf32_Sym *sym; | 61 | Elf32_Sym *sym; |
61 | const char *symname; | 62 | const char *symname; |
62 | s32 offset; | 63 | s32 offset; |
64 | u32 tmp; | ||
63 | #ifdef CONFIG_THUMB2_KERNEL | 65 | #ifdef CONFIG_THUMB2_KERNEL |
64 | u32 upper, lower, sign, j1, j2; | 66 | u32 upper, lower, sign, j1, j2; |
65 | #endif | 67 | #endif |
@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
95 | case R_ARM_PC24: | 97 | case R_ARM_PC24: |
96 | case R_ARM_CALL: | 98 | case R_ARM_CALL: |
97 | case R_ARM_JUMP24: | 99 | case R_ARM_JUMP24: |
98 | offset = (*(u32 *)loc & 0x00ffffff) << 2; | 100 | offset = __mem_to_opcode_arm(*(u32 *)loc); |
101 | offset = (offset & 0x00ffffff) << 2; | ||
99 | if (offset & 0x02000000) | 102 | if (offset & 0x02000000) |
100 | offset -= 0x04000000; | 103 | offset -= 0x04000000; |
101 | 104 | ||
@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
111 | } | 114 | } |
112 | 115 | ||
113 | offset >>= 2; | 116 | offset >>= 2; |
117 | offset &= 0x00ffffff; | ||
114 | 118 | ||
115 | *(u32 *)loc &= 0xff000000; | 119 | *(u32 *)loc &= __opcode_to_mem_arm(0xff000000); |
116 | *(u32 *)loc |= offset & 0x00ffffff; | 120 | *(u32 *)loc |= __opcode_to_mem_arm(offset); |
117 | break; | 121 | break; |
118 | 122 | ||
119 | case R_ARM_V4BX: | 123 | case R_ARM_V4BX: |
@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
121 | * other bits to re-code instruction as | 125 | * other bits to re-code instruction as |
122 | * MOV PC,Rm. | 126 | * MOV PC,Rm. |
123 | */ | 127 | */ |
124 | *(u32 *)loc &= 0xf000000f; | 128 | *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f); |
125 | *(u32 *)loc |= 0x01a0f000; | 129 | *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000); |
126 | break; | 130 | break; |
127 | 131 | ||
128 | case R_ARM_PREL31: | 132 | case R_ARM_PREL31: |
@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
132 | 136 | ||
133 | case R_ARM_MOVW_ABS_NC: | 137 | case R_ARM_MOVW_ABS_NC: |
134 | case R_ARM_MOVT_ABS: | 138 | case R_ARM_MOVT_ABS: |
135 | offset = *(u32 *)loc; | 139 | offset = tmp = __mem_to_opcode_arm(*(u32 *)loc); |
136 | offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); | 140 | offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); |
137 | offset = (offset ^ 0x8000) - 0x8000; | 141 | offset = (offset ^ 0x8000) - 0x8000; |
138 | 142 | ||
@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
140 | if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) | 144 | if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) |
141 | offset >>= 16; | 145 | offset >>= 16; |
142 | 146 | ||
143 | *(u32 *)loc &= 0xfff0f000; | 147 | tmp &= 0xfff0f000; |
144 | *(u32 *)loc |= ((offset & 0xf000) << 4) | | 148 | tmp |= ((offset & 0xf000) << 4) | |
145 | (offset & 0x0fff); | 149 | (offset & 0x0fff); |
150 | |||
151 | *(u32 *)loc = __opcode_to_mem_arm(tmp); | ||
146 | break; | 152 | break; |
147 | 153 | ||
148 | #ifdef CONFIG_THUMB2_KERNEL | 154 | #ifdef CONFIG_THUMB2_KERNEL |
149 | case R_ARM_THM_CALL: | 155 | case R_ARM_THM_CALL: |
150 | case R_ARM_THM_JUMP24: | 156 | case R_ARM_THM_JUMP24: |
151 | upper = *(u16 *)loc; | 157 | upper = __mem_to_opcode_thumb16(*(u16 *)loc); |
152 | lower = *(u16 *)(loc + 2); | 158 | lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2)); |
153 | 159 | ||
154 | /* | 160 | /* |
155 | * 25 bit signed address range (Thumb-2 BL and B.W | 161 | * 25 bit signed address range (Thumb-2 BL and B.W |
@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
198 | sign = (offset >> 24) & 1; | 204 | sign = (offset >> 24) & 1; |
199 | j1 = sign ^ (~(offset >> 23) & 1); | 205 | j1 = sign ^ (~(offset >> 23) & 1); |
200 | j2 = sign ^ (~(offset >> 22) & 1); | 206 | j2 = sign ^ (~(offset >> 22) & 1); |
201 | *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) | | 207 | upper = (u16)((upper & 0xf800) | (sign << 10) | |
202 | ((offset >> 12) & 0x03ff)); | 208 | ((offset >> 12) & 0x03ff)); |
203 | *(u16 *)(loc + 2) = (u16)((lower & 0xd000) | | 209 | lower = (u16)((lower & 0xd000) | |
204 | (j1 << 13) | (j2 << 11) | | 210 | (j1 << 13) | (j2 << 11) | |
205 | ((offset >> 1) & 0x07ff)); | 211 | ((offset >> 1) & 0x07ff)); |
212 | |||
213 | *(u16 *)loc = __opcode_to_mem_thumb16(upper); | ||
214 | *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower); | ||
206 | break; | 215 | break; |
207 | 216 | ||
208 | case R_ARM_THM_MOVW_ABS_NC: | 217 | case R_ARM_THM_MOVW_ABS_NC: |
209 | case R_ARM_THM_MOVT_ABS: | 218 | case R_ARM_THM_MOVT_ABS: |
210 | upper = *(u16 *)loc; | 219 | upper = __mem_to_opcode_thumb16(*(u16 *)loc); |
211 | lower = *(u16 *)(loc + 2); | 220 | lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2)); |
212 | 221 | ||
213 | /* | 222 | /* |
214 | * MOVT/MOVW instructions encoding in Thumb-2: | 223 | * MOVT/MOVW instructions encoding in Thumb-2: |
@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, | |||
229 | if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) | 238 | if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) |
230 | offset >>= 16; | 239 | offset >>= 16; |
231 | 240 | ||
232 | *(u16 *)loc = (u16)((upper & 0xfbf0) | | 241 | upper = (u16)((upper & 0xfbf0) | |
233 | ((offset & 0xf000) >> 12) | | 242 | ((offset & 0xf000) >> 12) | |
234 | ((offset & 0x0800) >> 1)); | 243 | ((offset & 0x0800) >> 1)); |
235 | *(u16 *)(loc + 2) = (u16)((lower & 0x8f00) | | 244 | lower = (u16)((lower & 0x8f00) | |
236 | ((offset & 0x0700) << 4) | | 245 | ((offset & 0x0700) << 4) | |
237 | (offset & 0x00ff)); | 246 | (offset & 0x00ff)); |
247 | *(u16 *)loc = __opcode_to_mem_thumb16(upper); | ||
248 | *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower); | ||
238 | break; | 249 | break; |
239 | #endif | 250 | #endif |
240 | 251 | ||
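Every relocation case now follows one discipline: convert the in-memory word to canonical opcode form with __mem_to_opcode_arm()/__mem_to_opcode_thumb16(), do the bit surgery on that value, and store it back through the matching __opcode_to_mem_*() helper, so a BE8 kernel patches instruction bytes rather than byte-swapped data. A minimal sketch of the round trip (the helper is hypothetical):

    #include <linux/types.h>
    #include <asm/opcodes.h>

    static void patch_imm24(u32 *loc, s32 byte_offset)
    {
        u32 insn = __mem_to_opcode_arm(*loc);   /* memory -> opcode */

        insn &= 0xff000000;                     /* keep cond + op   */
        insn |= (byte_offset >> 2) & 0x00ffffff;/* 24-bit word offs */

        *loc = __opcode_to_mem_arm(insn);       /* opcode -> memory */
    }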
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index e186ee1e63f6..bc3f2efa0d86 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c | |||
@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events, | |||
256 | struct perf_event *event) | 256 | struct perf_event *event) |
257 | { | 257 | { |
258 | struct arm_pmu *armpmu = to_arm_pmu(event->pmu); | 258 | struct arm_pmu *armpmu = to_arm_pmu(event->pmu); |
259 | struct pmu *leader_pmu = event->group_leader->pmu; | ||
260 | 259 | ||
261 | if (is_software_event(event)) | 260 | if (is_software_event(event)) |
262 | return 1; | 261 | return 1; |
263 | 262 | ||
264 | if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) | 263 | if (event->state < PERF_EVENT_STATE_OFF) |
265 | return 1; | 264 | return 1; |
266 | 265 | ||
267 | if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) | 266 | if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) |
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c new file mode 100644 index 000000000000..6e4379c67cbc --- /dev/null +++ b/arch/arm/kernel/perf_regs.c | |||
@@ -0,0 +1,30 @@ | |||
1 | |||
2 | #include <linux/errno.h> | ||
3 | #include <linux/kernel.h> | ||
4 | #include <linux/perf_event.h> | ||
5 | #include <linux/bug.h> | ||
6 | #include <asm/perf_regs.h> | ||
7 | #include <asm/ptrace.h> | ||
8 | |||
9 | u64 perf_reg_value(struct pt_regs *regs, int idx) | ||
10 | { | ||
11 | if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX)) | ||
12 | return 0; | ||
13 | |||
14 | return regs->uregs[idx]; | ||
15 | } | ||
16 | |||
17 | #define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1)) | ||
18 | |||
19 | int perf_reg_validate(u64 mask) | ||
20 | { | ||
21 | if (!mask || mask & REG_RESERVED) | ||
22 | return -EINVAL; | ||
23 | |||
24 | return 0; | ||
25 | } | ||
26 | |||
27 | u64 perf_reg_abi(struct task_struct *task) | ||
28 | { | ||
29 | return PERF_SAMPLE_REGS_ABI_32; | ||
30 | } | ||
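These two hooks are what the new HAVE_PERF_REGS / HAVE_PERF_USER_STACK_DUMP selections plug into: perf_reg_validate() vets the user-supplied register mask and perf_reg_value() feeds each sample. A hedged userspace sketch of the consumer side (assuming PERF_REG_ARM_MAX is 16, i.e. r0..pc):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Request the full ARM register file with every sample; the mask
     * must pass perf_reg_validate() above (non-zero, no reserved bits). */
    static int open_cycles_with_user_regs(void)
    {
            struct perf_event_attr attr = {
                    .size             = sizeof(attr),
                    .type             = PERF_TYPE_HARDWARE,
                    .config           = PERF_COUNT_HW_CPU_CYCLES,
                    .sample_period    = 100000,
                    .sample_type      = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER,
                    .sample_regs_user = (1ULL << 16) - 1, /* r0..pc */
            };
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }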
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 53c3901f7ee3..f52150d2ec00 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c | |||
@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup); | |||
73 | #endif | 73 | #endif |
74 | 74 | ||
75 | extern void paging_init(const struct machine_desc *desc); | 75 | extern void paging_init(const struct machine_desc *desc); |
76 | extern void early_paging_init(const struct machine_desc *, | ||
77 | struct proc_info_list *); | ||
76 | extern void sanity_check_meminfo(void); | 78 | extern void sanity_check_meminfo(void); |
77 | extern enum reboot_mode reboot_mode; | 79 | extern enum reboot_mode reboot_mode; |
78 | extern void setup_dma_zone(const struct machine_desc *desc); | 80 | extern void setup_dma_zone(const struct machine_desc *desc); |
@@ -888,6 +890,8 @@ void __init setup_arch(char **cmdline_p) | |||
888 | parse_early_param(); | 890 | parse_early_param(); |
889 | 891 | ||
890 | sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); | 892 | sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); |
893 | |||
894 | early_paging_init(mdesc, lookup_processor_type(read_cpuid_id())); | ||
891 | sanity_check_meminfo(); | 895 | sanity_check_meminfo(); |
892 | arm_memblock_init(&meminfo, mdesc); | 896 | arm_memblock_init(&meminfo, mdesc); |
893 | 897 | ||
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ab3304225272..64845fc4152a 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c | |||
@@ -21,29 +21,7 @@ | |||
21 | #include <asm/unistd.h> | 21 | #include <asm/unistd.h> |
22 | #include <asm/vfp.h> | 22 | #include <asm/vfp.h> |
23 | 23 | ||
24 | /* | 24 | extern const unsigned long sigreturn_codes[7]; |
25 | * For ARM syscalls, we encode the syscall number into the instruction. | ||
26 | */ | ||
27 | #define SWI_SYS_SIGRETURN (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)) | ||
28 | #define SWI_SYS_RT_SIGRETURN (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)) | ||
29 | |||
30 | /* | ||
31 | * With EABI, the syscall number has to be loaded into r7. | ||
32 | */ | ||
33 | #define MOV_R7_NR_SIGRETURN (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE)) | ||
34 | #define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE)) | ||
35 | |||
36 | /* | ||
37 | * For Thumb syscalls, we pass the syscall number via r7. We therefore | ||
38 | * need two 16-bit instructions. | ||
39 | */ | ||
40 | #define SWI_THUMB_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE)) | ||
41 | #define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE)) | ||
42 | |||
43 | static const unsigned long sigreturn_codes[7] = { | ||
44 | MOV_R7_NR_SIGRETURN, SWI_SYS_SIGRETURN, SWI_THUMB_SIGRETURN, | ||
45 | MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN, | ||
46 | }; | ||
47 | 25 | ||
48 | static unsigned long signal_return_offset; | 26 | static unsigned long signal_return_offset; |
49 | 27 | ||
diff --git a/arch/arm/kernel/sigreturn_codes.S b/arch/arm/kernel/sigreturn_codes.S new file mode 100644 index 000000000000..3c5d0f2170fd --- /dev/null +++ b/arch/arm/kernel/sigreturn_codes.S | |||
@@ -0,0 +1,80 @@ | |||
1 | /* | ||
2 | * sigreturn_codes.S - code snippets for sigreturn syscalls | ||
3 | * | ||
4 | * Created by: Victor Kamensky, 2013-08-13 | ||
5 | * Copyright: (C) 2013 Linaro Limited | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | */ | ||
16 | |||
17 | #include <asm/unistd.h> | ||
18 | |||
19 | /* | ||
20 | * For ARM syscalls, we encode the syscall number into the instruction. | ||
21 | * With EABI, the syscall number has to be loaded into r7. As a result, | ||
22 | * the ARM syscall sequence snippet has a mov and an svc in .arm encoding. | ||
23 | * | ||
24 | * For Thumb syscalls, we pass the syscall number via r7. We therefore | ||
25 | * need two 16-bit instructions in .thumb encoding. | ||
26 | * | ||
27 | * Please note these sigreturn_codes snippets are not executed in place. | ||
28 | * Instead they are copied by the kernel into the appropriate places. Code | ||
29 | * inside arch/arm/kernel/signal.c is very sensitive to the layout of | ||
30 | * these code snippets. | ||
31 | */ | ||
32 | |||
33 | #if __LINUX_ARM_ARCH__ <= 4 | ||
34 | /* | ||
35 | * Note we manually set the minimal arch that supports the | ||
36 | * required thumb opcodes for early arch versions. It is OK | ||
37 | * for this file to be used in combination with other | ||
38 | * lower arch variants, since these code snippets are only | ||
39 | * used as input data. | ||
40 | */ | ||
41 | .arch armv4t | ||
42 | #endif | ||
43 | |||
44 | .section .rodata | ||
45 | .global sigreturn_codes | ||
46 | .type sigreturn_codes, #object | ||
47 | |||
48 | .arm | ||
49 | |||
50 | sigreturn_codes: | ||
51 | |||
52 | /* ARM sigreturn syscall code snippet */ | ||
53 | mov r7, #(__NR_sigreturn - __NR_SYSCALL_BASE) | ||
54 | swi #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE) | ||
55 | |||
56 | /* Thumb sigreturn syscall code snippet */ | ||
57 | .thumb | ||
58 | movs r7, #(__NR_sigreturn - __NR_SYSCALL_BASE) | ||
59 | swi #0 | ||
60 | |||
61 | /* ARM sigreturn_rt syscall code snippet */ | ||
62 | .arm | ||
63 | mov r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE) | ||
64 | swi #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE) | ||
65 | |||
66 | /* Thumb sigreturn_rt syscall code snippet */ | ||
67 | .thumb | ||
68 | movs r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE) | ||
69 | swi #0 | ||
70 | |||
71 | /* | ||
72 | * Note on additional space: the setup_return algorithm in | ||
73 | * signal.c copies two words regardless of whether it is the | ||
74 | * thumb case or not, so we need an additional word after the | ||
75 | * real last entry. | ||
76 | */ | ||
77 | .arm | ||
78 | .space 4 | ||
79 | |||
80 | .size sigreturn_codes, . - sigreturn_codes | ||
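On the consumer side, setup_return() in signal.c picks a snippet by Thumb-ness and by plain vs rt sigreturn, then copies two words unconditionally, which is what the trailing .space pays for. A hedged sketch of that selection (names approximate):

    #include <linux/string.h>

    extern const unsigned long sigreturn_codes[7];

    static void copy_sigreturn_snippet(unsigned long *retcode_slot,
                                       int thumb, int rt_signal)
    {
            unsigned int idx = thumb << 1;  /* 0 = ARM, 2 = Thumb */

            if (rt_signal)
                    idx += 3;               /* skip to the rt variants */
            /* always a two-word copy; the padding keeps this in bounds */
            memcpy(retcode_slot, sigreturn_codes + idx,
                   2 * sizeof(unsigned long));
    }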
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S index db1536b8b30b..b907d9b790ab 100644 --- a/arch/arm/kernel/sleep.S +++ b/arch/arm/kernel/sleep.S | |||
@@ -55,6 +55,7 @@ | |||
55 | * specific registers and some other data for resume. | 55 | * specific registers and some other data for resume. |
56 | * r0 = suspend function arg0 | 56 | * r0 = suspend function arg0 |
57 | * r1 = suspend function | 57 | * r1 = suspend function |
58 | * r2 = MPIDR value the resuming CPU will use | ||
58 | */ | 59 | */ |
59 | ENTRY(__cpu_suspend) | 60 | ENTRY(__cpu_suspend) |
60 | stmfd sp!, {r4 - r11, lr} | 61 | stmfd sp!, {r4 - r11, lr} |
@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend) | |||
67 | mov r5, sp @ current virtual SP | 68 | mov r5, sp @ current virtual SP |
68 | add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn | 69 | add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn |
69 | sub sp, sp, r4 @ allocate CPU state on stack | 70 | sub sp, sp, r4 @ allocate CPU state on stack |
70 | stmfd sp!, {r0, r1} @ save suspend func arg and pointer | ||
71 | add r0, sp, #8 @ save pointer to save block | ||
72 | mov r1, r4 @ size of save block | ||
73 | mov r2, r5 @ virtual SP | ||
74 | ldr r3, =sleep_save_sp | 71 | ldr r3, =sleep_save_sp |
72 | stmfd sp!, {r0, r1} @ save suspend func arg and pointer | ||
75 | ldr r3, [r3, #SLEEP_SAVE_SP_VIRT] | 73 | ldr r3, [r3, #SLEEP_SAVE_SP_VIRT] |
76 | ALT_SMP(mrc p15, 0, r9, c0, c0, 5) | 74 | ALT_SMP(ldr r0, =mpidr_hash) |
77 | ALT_UP_B(1f) | 75 | ALT_UP_B(1f) |
78 | ldr r8, =mpidr_hash | 76 | /* This ldmia relies on the memory layout of the mpidr_hash struct */ |
79 | /* | 77 | ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts |
80 | * This ldmia relies on the memory layout of the mpidr_hash | 78 | compute_mpidr_hash r0, r6, r7, r8, r2, r1 |
81 | * struct mpidr_hash. | 79 | add r3, r3, r0, lsl #2 |
82 | */ | 80 | 1: mov r2, r5 @ virtual SP |
83 | ldmia r8, {r4-r7} @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts | 81 | mov r1, r4 @ size of save block |
84 | compute_mpidr_hash lr, r5, r6, r7, r9, r4 | 82 | add r0, sp, #8 @ pointer to save block |
85 | add r3, r3, lr, lsl #2 | ||
86 | 1: | ||
87 | bl __cpu_suspend_save | 83 | bl __cpu_suspend_save |
88 | adr lr, BSYM(cpu_suspend_abort) | 84 | adr lr, BSYM(cpu_suspend_abort) |
89 | ldmfd sp!, {r0, pc} @ call suspend fn | 85 | ldmfd sp!, {r0, pc} @ call suspend fn |
@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu) | |||
130 | .data | 126 | .data |
131 | .align | 127 | .align |
132 | ENTRY(cpu_resume) | 128 | ENTRY(cpu_resume) |
129 | ARM_BE8(setend be) @ ensure we are in BE mode | ||
133 | mov r1, #0 | 130 | mov r1, #0 |
134 | ALT_SMP(mrc p15, 0, r0, c0, c0, 5) | 131 | ALT_SMP(mrc p15, 0, r0, c0, c0, 5) |
135 | ALT_UP_B(1f) | 132 | ALT_UP_B(1f) |
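The single ldmia above fetches four consecutive words, so it is only correct while the C-side structure keeps its fields in this order (see asm/smp_plat.h):

    /* Layout the ldmia depends on: mask lands in r1 and the three
     * affinity-level shifts in r6-r8; the trailing 'bits' field is
     * not fetched by the assembly. */
    struct mpidr_hash {
            u32 mask;
            u32 shift_aff[3];
            u32 bits;
    };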
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e115cbb0d25a..dc894ab3622b 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c | |||
@@ -68,6 +68,7 @@ enum ipi_msg_type { | |||
68 | IPI_CALL_FUNC_SINGLE, | 68 | IPI_CALL_FUNC_SINGLE, |
69 | IPI_CPU_STOP, | 69 | IPI_CPU_STOP, |
70 | IPI_IRQ_WORK, | 70 | IPI_IRQ_WORK, |
71 | IPI_COMPLETION, | ||
71 | }; | 72 | }; |
72 | 73 | ||
73 | static DECLARE_COMPLETION(cpu_running); | 74 | static DECLARE_COMPLETION(cpu_running); |
@@ -82,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops) | |||
82 | 83 | ||
83 | static unsigned long get_arch_pgd(pgd_t *pgd) | 84 | static unsigned long get_arch_pgd(pgd_t *pgd) |
84 | { | 85 | { |
85 | phys_addr_t pgdir = virt_to_phys(pgd); | 86 | phys_addr_t pgdir = virt_to_idmap(pgd); |
86 | BUG_ON(pgdir & ARCH_PGD_MASK); | 87 | BUG_ON(pgdir & ARCH_PGD_MASK); |
87 | return pgdir >> ARCH_PGD_SHIFT; | 88 | return pgdir >> ARCH_PGD_SHIFT; |
88 | } | 89 | } |
@@ -467,6 +468,7 @@ static const char *ipi_types[NR_IPI] = { | |||
467 | S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), | 468 | S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), |
468 | S(IPI_CPU_STOP, "CPU stop interrupts"), | 469 | S(IPI_CPU_STOP, "CPU stop interrupts"), |
469 | S(IPI_IRQ_WORK, "IRQ work interrupts"), | 470 | S(IPI_IRQ_WORK, "IRQ work interrupts"), |
471 | S(IPI_COMPLETION, "completion interrupts"), | ||
470 | }; | 472 | }; |
471 | 473 | ||
472 | void show_ipi_list(struct seq_file *p, int prec) | 474 | void show_ipi_list(struct seq_file *p, int prec) |
@@ -526,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu) | |||
526 | cpu_relax(); | 528 | cpu_relax(); |
527 | } | 529 | } |
528 | 530 | ||
531 | static DEFINE_PER_CPU(struct completion *, cpu_completion); | ||
532 | |||
533 | int register_ipi_completion(struct completion *completion, int cpu) | ||
534 | { | ||
535 | per_cpu(cpu_completion, cpu) = completion; | ||
536 | return IPI_COMPLETION; | ||
537 | } | ||
538 | |||
539 | static void ipi_complete(unsigned int cpu) | ||
540 | { | ||
541 | complete(per_cpu(cpu_completion, cpu)); | ||
542 | } | ||
543 | |||
529 | /* | 544 | /* |
530 | * Main handler for inter-processor interrupts | 545 | * Main handler for inter-processor interrupts |
531 | */ | 546 | */ |
@@ -584,6 +599,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs) | |||
584 | break; | 599 | break; |
585 | #endif | 600 | #endif |
586 | 601 | ||
602 | case IPI_COMPLETION: | ||
603 | irq_enter(); | ||
604 | ipi_complete(cpu); | ||
605 | irq_exit(); | ||
606 | break; | ||
607 | |||
587 | default: | 608 | default: |
588 | printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", | 609 | printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", |
589 | cpu, ipinr); | 610 | cpu, ipinr); |
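register_ipi_completion() gives callers such as the bL switcher a cross-CPU rendezvous without a dedicated IPI slot per user. A hedged usage sketch (the peer-notification step is an assumption):

    #include <linux/completion.h>
    #include <linux/smp.h>

    static void wait_for_peer_signal(void)
    {
            struct completion done;
            int ipi, cpu = get_cpu();

            init_completion(&done);
            ipi = register_ipi_completion(&done, cpu);
            put_cpu();
            /* ... arrange for another CPU to raise IPI number 'ipi'
             *     at 'cpu', e.g. via gic_send_sgi() ... */
            wait_for_completion(&done);
    }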
diff --git a/arch/arm/kernel/smp_scu.c b/arch/arm/kernel/smp_scu.c index 5bc1a63284e3..1aafa0d785eb 100644 --- a/arch/arm/kernel/smp_scu.c +++ b/arch/arm/kernel/smp_scu.c | |||
@@ -28,7 +28,7 @@ | |||
28 | */ | 28 | */ |
29 | unsigned int __init scu_get_core_count(void __iomem *scu_base) | 29 | unsigned int __init scu_get_core_count(void __iomem *scu_base) |
30 | { | 30 | { |
31 | unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG); | 31 | unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG); |
32 | return (ncores & 0x03) + 1; | 32 | return (ncores & 0x03) + 1; |
33 | } | 33 | } |
34 | 34 | ||
@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base) | |||
42 | #ifdef CONFIG_ARM_ERRATA_764369 | 42 | #ifdef CONFIG_ARM_ERRATA_764369 |
43 | /* Cortex-A9 only */ | 43 | /* Cortex-A9 only */ |
44 | if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) { | 44 | if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) { |
45 | scu_ctrl = __raw_readl(scu_base + 0x30); | 45 | scu_ctrl = readl_relaxed(scu_base + 0x30); |
46 | if (!(scu_ctrl & 1)) | 46 | if (!(scu_ctrl & 1)) |
47 | __raw_writel(scu_ctrl | 0x1, scu_base + 0x30); | 47 | writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30); |
48 | } | 48 | } |
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | scu_ctrl = __raw_readl(scu_base + SCU_CTRL); | 51 | scu_ctrl = readl_relaxed(scu_base + SCU_CTRL); |
52 | /* already enabled? */ | 52 | /* already enabled? */ |
53 | if (scu_ctrl & 1) | 53 | if (scu_ctrl & 1) |
54 | return; | 54 | return; |
55 | 55 | ||
56 | scu_ctrl |= 1; | 56 | scu_ctrl |= 1; |
57 | __raw_writel(scu_ctrl, scu_base + SCU_CTRL); | 57 | writel_relaxed(scu_ctrl, scu_base + SCU_CTRL); |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Ensure that the data accessed by CPU0 before the SCU was | 60 | * Ensure that the data accessed by CPU0 before the SCU was |
@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode) | |||
80 | if (mode > 3 || mode == 1 || cpu > 3) | 80 | if (mode > 3 || mode == 1 || cpu > 3) |
81 | return -EINVAL; | 81 | return -EINVAL; |
82 | 82 | ||
83 | val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; | 83 | val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; |
84 | val |= mode; | 84 | val |= mode; |
85 | __raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu); | 85 | writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu); |
86 | 86 | ||
87 | return 0; | 87 | return 0; |
88 | } | 88 | } |
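The move away from the __raw accessors is what makes the SCU code endian-clean: MMIO registers stay little-endian, and the relaxed accessors fold in the byte swap that the __raw forms omit. In sketch form (hedged; the real asm/io.h definitions add casts and sparse annotations):

    #define readl_relaxed(addr)        le32_to_cpu(__raw_readl(addr))
    #define writel_relaxed(val, addr)  __raw_writel(cpu_to_le32(val), (addr))

On a little-endian kernel both forms compile to the same access, so nothing changes for existing users.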
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index 2985c9f0905d..6591e26fc13f 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c | |||
@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode, | |||
45 | case CLOCK_EVT_MODE_PERIODIC: | 45 | case CLOCK_EVT_MODE_PERIODIC: |
46 | ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE | 46 | ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE |
47 | | TWD_TIMER_CONTROL_PERIODIC; | 47 | | TWD_TIMER_CONTROL_PERIODIC; |
48 | __raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), | 48 | writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), |
49 | twd_base + TWD_TIMER_LOAD); | 49 | twd_base + TWD_TIMER_LOAD); |
50 | break; | 50 | break; |
51 | case CLOCK_EVT_MODE_ONESHOT: | 51 | case CLOCK_EVT_MODE_ONESHOT: |
@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode, | |||
58 | ctrl = 0; | 58 | ctrl = 0; |
59 | } | 59 | } |
60 | 60 | ||
61 | __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); | 61 | writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL); |
62 | } | 62 | } |
63 | 63 | ||
64 | static int twd_set_next_event(unsigned long evt, | 64 | static int twd_set_next_event(unsigned long evt, |
65 | struct clock_event_device *unused) | 65 | struct clock_event_device *unused) |
66 | { | 66 | { |
67 | unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); | 67 | unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL); |
68 | 68 | ||
69 | ctrl |= TWD_TIMER_CONTROL_ENABLE; | 69 | ctrl |= TWD_TIMER_CONTROL_ENABLE; |
70 | 70 | ||
71 | __raw_writel(evt, twd_base + TWD_TIMER_COUNTER); | 71 | writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER); |
72 | __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); | 72 | writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL); |
73 | 73 | ||
74 | return 0; | 74 | return 0; |
75 | } | 75 | } |
@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt, | |||
82 | */ | 82 | */ |
83 | static int twd_timer_ack(void) | 83 | static int twd_timer_ack(void) |
84 | { | 84 | { |
85 | if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { | 85 | if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) { |
86 | __raw_writel(1, twd_base + TWD_TIMER_INTSTAT); | 86 | writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT); |
87 | return 1; | 87 | return 1; |
88 | } | 88 | } |
89 | 89 | ||
@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void) | |||
211 | waitjiffies += 5; | 211 | waitjiffies += 5; |
212 | 212 | ||
213 | /* enable, no interrupt or reload */ | 213 | /* enable, no interrupt or reload */ |
214 | __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); | 214 | writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL); |
215 | 215 | ||
216 | /* maximum value */ | 216 | /* maximum value */ |
217 | __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); | 217 | writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); |
218 | 218 | ||
219 | while (get_jiffies_64() < waitjiffies) | 219 | while (get_jiffies_64() < waitjiffies) |
220 | udelay(10); | 220 | udelay(10); |
221 | 221 | ||
222 | count = __raw_readl(twd_base + TWD_TIMER_COUNTER); | 222 | count = readl_relaxed(twd_base + TWD_TIMER_COUNTER); |
223 | 223 | ||
224 | twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5); | 224 | twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5); |
225 | 225 | ||
@@ -277,7 +277,7 @@ static void twd_timer_setup(void) | |||
277 | * bother with the below. | 277 | * bother with the below. |
278 | */ | 278 | */ |
279 | if (per_cpu(percpu_setup_called, cpu)) { | 279 | if (per_cpu(percpu_setup_called, cpu)) { |
280 | __raw_writel(0, twd_base + TWD_TIMER_CONTROL); | 280 | writel_relaxed(0, twd_base + TWD_TIMER_CONTROL); |
281 | clockevents_register_device(clk); | 281 | clockevents_register_device(clk); |
282 | enable_percpu_irq(clk->irq, 0); | 282 | enable_percpu_irq(clk->irq, 0); |
283 | return; | 283 | return; |
@@ -290,7 +290,7 @@ static void twd_timer_setup(void) | |||
290 | * The following is done once per CPU the first time .setup() is | 290 | * The following is done once per CPU the first time .setup() is |
291 | * called. | 291 | * called. |
292 | */ | 292 | */ |
293 | __raw_writel(0, twd_base + TWD_TIMER_CONTROL); | 293 | writel_relaxed(0, twd_base + TWD_TIMER_CONTROL); |
294 | 294 | ||
295 | clk->name = "local_timer"; | 295 | clk->name = "local_timer"; |
296 | clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | | 296 | clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | |
diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index 41cf3cbf756d..2835d35234ca 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c | |||
@@ -10,7 +10,7 @@ | |||
10 | #include <asm/suspend.h> | 10 | #include <asm/suspend.h> |
11 | #include <asm/tlbflush.h> | 11 | #include <asm/tlbflush.h> |
12 | 12 | ||
13 | extern int __cpu_suspend(unsigned long, int (*)(unsigned long)); | 13 | extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid); |
14 | extern void cpu_resume_mmu(void); | 14 | extern void cpu_resume_mmu(void); |
15 | 15 | ||
16 | #ifdef CONFIG_MMU | 16 | #ifdef CONFIG_MMU |
@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void); | |||
21 | int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) | 21 | int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) |
22 | { | 22 | { |
23 | struct mm_struct *mm = current->active_mm; | 23 | struct mm_struct *mm = current->active_mm; |
24 | u32 __mpidr = cpu_logical_map(smp_processor_id()); | ||
24 | int ret; | 25 | int ret; |
25 | 26 | ||
26 | if (!idmap_pgd) | 27 | if (!idmap_pgd) |
@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) | |||
32 | * resume (indicated by a zero return code), we need to switch | 33 | * resume (indicated by a zero return code), we need to switch |
33 | * back to the correct page tables. | 34 | * back to the correct page tables. |
34 | */ | 35 | */ |
35 | ret = __cpu_suspend(arg, fn); | 36 | ret = __cpu_suspend(arg, fn, __mpidr); |
36 | if (ret == 0) { | 37 | if (ret == 0) { |
37 | cpu_switch_mm(mm->pgd, mm); | 38 | cpu_switch_mm(mm->pgd, mm); |
38 | local_flush_bp_all(); | 39 | local_flush_bp_all(); |
@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) | |||
44 | #else | 45 | #else |
45 | int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) | 46 | int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) |
46 | { | 47 | { |
47 | return __cpu_suspend(arg, fn); | 48 | u32 __mpidr = cpu_logical_map(smp_processor_id()); |
49 | return __cpu_suspend(arg, fn, __mpidr); | ||
48 | } | 50 | } |
49 | #define idmap_pgd NULL | 51 | #define idmap_pgd NULL |
50 | #endif | 52 | #endif |
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 8fcda140358d..6125f259b7b5 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <asm/unwind.h> | 34 | #include <asm/unwind.h> |
35 | #include <asm/tls.h> | 35 | #include <asm/tls.h> |
36 | #include <asm/system_misc.h> | 36 | #include <asm/system_misc.h> |
37 | #include <asm/opcodes.h> | ||
37 | 38 | ||
38 | static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; | 39 | static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; |
39 | 40 | ||
@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs, | |||
341 | int is_valid_bugaddr(unsigned long pc) | 342 | int is_valid_bugaddr(unsigned long pc) |
342 | { | 343 | { |
343 | #ifdef CONFIG_THUMB2_KERNEL | 344 | #ifdef CONFIG_THUMB2_KERNEL |
344 | unsigned short bkpt; | 345 | u16 bkpt; |
346 | u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE); | ||
345 | #else | 347 | #else |
346 | unsigned long bkpt; | 348 | u32 bkpt; |
349 | u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE); | ||
347 | #endif | 350 | #endif |
348 | 351 | ||
349 | if (probe_kernel_address((unsigned *)pc, bkpt)) | 352 | if (probe_kernel_address((unsigned *)pc, bkpt)) |
350 | return 0; | 353 | return 0; |
351 | 354 | ||
352 | return bkpt == BUG_INSTR_VALUE; | 355 | return bkpt == insn; |
353 | } | 356 | } |
354 | 357 | ||
355 | #endif | 358 | #endif |
@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) | |||
402 | if (processor_mode(regs) == SVC_MODE) { | 405 | if (processor_mode(regs) == SVC_MODE) { |
403 | #ifdef CONFIG_THUMB2_KERNEL | 406 | #ifdef CONFIG_THUMB2_KERNEL |
404 | if (thumb_mode(regs)) { | 407 | if (thumb_mode(regs)) { |
405 | instr = ((u16 *)pc)[0]; | 408 | instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]); |
406 | if (is_wide_instruction(instr)) { | 409 | if (is_wide_instruction(instr)) { |
407 | instr <<= 16; | 410 | u16 inst2; |
408 | instr |= ((u16 *)pc)[1]; | 411 | inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]); |
412 | instr = __opcode_thumb32_compose(instr, inst2); | ||
409 | } | 413 | } |
410 | } else | 414 | } else |
411 | #endif | 415 | #endif |
412 | instr = *(u32 *) pc; | 416 | instr = __mem_to_opcode_arm(*(u32 *) pc); |
413 | } else if (thumb_mode(regs)) { | 417 | } else if (thumb_mode(regs)) { |
414 | if (get_user(instr, (u16 __user *)pc)) | 418 | if (get_user(instr, (u16 __user *)pc)) |
415 | goto die_sig; | 419 | goto die_sig; |
420 | instr = __mem_to_opcode_thumb16(instr); | ||
416 | if (is_wide_instruction(instr)) { | 421 | if (is_wide_instruction(instr)) { |
417 | unsigned int instr2; | 422 | unsigned int instr2; |
418 | if (get_user(instr2, (u16 __user *)pc+1)) | 423 | if (get_user(instr2, (u16 __user *)pc+1)) |
419 | goto die_sig; | 424 | goto die_sig; |
420 | instr <<= 16; | 425 | instr2 = __mem_to_opcode_thumb16(instr2); |
421 | instr |= instr2; | 426 | instr = __opcode_thumb32_compose(instr, instr2); |
422 | } | 427 | } |
423 | } else if (get_user(instr, (u32 __user *)pc)) { | 428 | } else if (get_user(instr, (u32 __user *)pc)) { |
429 | instr = __mem_to_opcode_arm(instr); | ||
424 | goto die_sig; | 430 | goto die_sig; |
425 | } | 431 | } |
426 | 432 | ||
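The open-coded shift-and-or is gone in favour of a named helper; per asm/opcodes.h, composing a 32-bit Thumb-2 opcode amounts to:

    /* A 32-bit Thumb-2 instruction is two halfwords, with the first
     * (most significant) halfword at the lower address. */
    #define __opcode_thumb32_compose(first, second) \
            (((u32)(first) << 16) | (u32)(second))

Routing every fetch through __mem_to_opcode_*() first means the comparison and composition operate on canonical opcode values regardless of kernel endianness.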
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index d6408d1ee543..e0c68d5bb7dc 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h | |||
@@ -10,6 +10,11 @@ UNWIND( .fnstart ) | |||
10 | and r3, r0, #31 @ Get bit offset | 10 | and r3, r0, #31 @ Get bit offset |
11 | mov r0, r0, lsr #5 | 11 | mov r0, r0, lsr #5 |
12 | add r1, r1, r0, lsl #2 @ Get word offset | 12 | add r1, r1, r0, lsl #2 @ Get word offset |
13 | #if __LINUX_ARM_ARCH__ >= 7 | ||
14 | .arch_extension mp | ||
15 | ALT_SMP(W(pldw) [r1]) | ||
16 | ALT_UP(W(nop)) | ||
17 | #endif | ||
13 | mov r3, r2, lsl r3 | 18 | mov r3, r2, lsl r3 |
14 | 1: ldrex r2, [r1] | 19 | 1: ldrex r2, [r1] |
15 | \instr r2, r2, r3 | 20 | \instr r2, r2, r3 |
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig index 8e8437dea3ce..3c3bff715b47 100644 --- a/arch/arm/mach-highbank/Kconfig +++ b/arch/arm/mach-highbank/Kconfig | |||
@@ -4,6 +4,7 @@ config ARCH_HIGHBANK | |||
4 | select ARCH_HAS_CPUFREQ | 4 | select ARCH_HAS_CPUFREQ |
5 | select ARCH_HAS_HOLES_MEMORYMODEL | 5 | select ARCH_HAS_HOLES_MEMORYMODEL |
6 | select ARCH_HAS_OPP | 6 | select ARCH_HAS_OPP |
7 | select ARCH_SUPPORTS_BIG_ENDIAN | ||
7 | select ARCH_WANT_OPTIONAL_GPIOLIB | 8 | select ARCH_WANT_OPTIONAL_GPIOLIB |
8 | select ARM_AMBA | 9 | select ARM_AMBA |
9 | select ARM_ERRATA_764369 | 10 | select ARM_ERRATA_764369 |
diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig index 30e1ebe3a891..c342dc4e8a45 100644 --- a/arch/arm/mach-ixp4xx/Kconfig +++ b/arch/arm/mach-ixp4xx/Kconfig | |||
@@ -1,9 +1,5 @@ | |||
1 | if ARCH_IXP4XX | 1 | if ARCH_IXP4XX |
2 | 2 | ||
3 | config ARCH_SUPPORTS_BIG_ENDIAN | ||
4 | bool | ||
5 | default y | ||
6 | |||
7 | menu "Intel IXP4xx Implementation Options" | 3 | menu "Intel IXP4xx Implementation Options" |
8 | 4 | ||
9 | comment "IXP4xx Platforms" | 5 | comment "IXP4xx Platforms" |
diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig index 9eb63d724602..5e269d7263ce 100644 --- a/arch/arm/mach-mvebu/Kconfig +++ b/arch/arm/mach-mvebu/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config ARCH_MVEBU | 1 | config ARCH_MVEBU |
2 | bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7 | 2 | bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7 |
3 | select ARCH_SUPPORTS_BIG_ENDIAN | ||
3 | select CLKSRC_MMIO | 4 | select CLKSRC_MMIO |
4 | select COMMON_CLK | 5 | select COMMON_CLK |
5 | select GENERIC_CLOCKEVENTS | 6 | select GENERIC_CLOCKEVENTS |
diff --git a/arch/arm/mach-mvebu/coherency_ll.S b/arch/arm/mach-mvebu/coherency_ll.S index 5476669ba905..ee7598fe75db 100644 --- a/arch/arm/mach-mvebu/coherency_ll.S +++ b/arch/arm/mach-mvebu/coherency_ll.S | |||
@@ -20,6 +20,8 @@ | |||
20 | #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0 | 20 | #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0 |
21 | #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4 | 21 | #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4 |
22 | 22 | ||
23 | #include <asm/assembler.h> | ||
24 | |||
23 | .text | 25 | .text |
24 | /* | 26 | /* |
25 | * r0: Coherency fabric base register address | 27 | * r0: Coherency fabric base register address |
@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent) | |||
29 | /* Create bit by cpu index */ | 31 | /* Create bit by cpu index */ |
30 | mov r3, #(1 << 24) | 32 | mov r3, #(1 << 24) |
31 | lsl r1, r3, r1 | 33 | lsl r1, r3, r1 |
34 | ARM_BE8(rev r1, r1) | ||
32 | 35 | ||
33 | /* Add CPU to SMP group - Atomic */ | 36 | /* Add CPU to SMP group - Atomic */ |
34 | add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET | 37 | add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET |
diff --git a/arch/arm/mach-mvebu/headsmp.S b/arch/arm/mach-mvebu/headsmp.S index 8a1b0c96e9ec..3dd80df428f7 100644 --- a/arch/arm/mach-mvebu/headsmp.S +++ b/arch/arm/mach-mvebu/headsmp.S | |||
@@ -21,12 +21,16 @@ | |||
21 | #include <linux/linkage.h> | 21 | #include <linux/linkage.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | 23 | ||
24 | #include <asm/assembler.h> | ||
25 | |||
24 | /* | 26 | /* |
25 | * Armada XP specific entry point for secondary CPUs. | 27 | * Armada XP specific entry point for secondary CPUs. |
26 | * We add the CPU to the coherency fabric and then jump to secondary | 28 | * We add the CPU to the coherency fabric and then jump to secondary |
27 | * startup | 29 | * startup |
28 | */ | 30 | */ |
29 | ENTRY(armada_xp_secondary_startup) | 31 | ENTRY(armada_xp_secondary_startup) |
32 | ARM_BE8(setend be ) @ go BE8 if entered LE | ||
33 | |||
30 | /* Get coherency fabric base physical address */ | 34 | /* Get coherency fabric base physical address */ |
31 | adr r0, 1f | 35 | adr r0, 1f |
32 | ldr r1, [r0] | 36 | ldr r1, [r0] |
diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig index 365795447804..4fe8ebe5b2d4 100644 --- a/arch/arm/mach-vexpress/Kconfig +++ b/arch/arm/mach-vexpress/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config ARCH_VEXPRESS | 1 | config ARCH_VEXPRESS |
2 | bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7 | 2 | bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7 |
3 | select ARCH_REQUIRE_GPIOLIB | 3 | select ARCH_REQUIRE_GPIOLIB |
4 | select ARCH_SUPPORTS_BIG_ENDIAN | ||
4 | select ARM_AMBA | 5 | select ARM_AMBA |
5 | select ARM_GIC | 6 | select ARM_GIC |
6 | select ARM_TIMER_SP804 | 7 | select ARM_TIMER_SP804 |
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index cd2c88e7a8f7..1f8fed94c2a4 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig | |||
@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS | |||
952 | help | 952 | help |
953 | This option allows the use of custom mandatory barriers | 953 | This option allows the use of custom mandatory barriers |
954 | included via the mach/barriers.h file. | 954 | included via the mach/barriers.h file. |
955 | |||
956 | config ARCH_SUPPORTS_BIG_ENDIAN | ||
957 | bool | ||
958 | help | ||
959 | This option specifies that the architecture can support big | ||
960 | endian operation. | ||
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S index 80741992a9fc..3815a8262af0 100644 --- a/arch/arm/mm/abort-ev6.S +++ b/arch/arm/mm/abort-ev6.S | |||
@@ -38,9 +38,8 @@ ENTRY(v6_early_abort) | |||
38 | bne do_DataAbort | 38 | bne do_DataAbort |
39 | bic r1, r1, #1 << 11 @ clear bit 11 of FSR | 39 | bic r1, r1, #1 << 11 @ clear bit 11 of FSR |
40 | ldr r3, [r4] @ read aborted ARM instruction | 40 | ldr r3, [r4] @ read aborted ARM instruction |
41 | #ifdef CONFIG_CPU_ENDIAN_BE8 | 41 | ARM_BE8(rev r3, r3) |
42 | rev r3, r3 | 42 | |
43 | #endif | ||
44 | do_ldrd_abort tmp=ip, insn=r3 | 43 | do_ldrd_abort tmp=ip, insn=r3 |
45 | tst r3, #1 << 20 @ L = 0 -> write | 44 | tst r3, #1 << 20 @ L = 0 -> write |
46 | orreq r1, r1, #1 << 11 @ yes. | 45 | orreq r1, r1, #1 << 11 @ yes. |
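ARM_BE8() is the idiom this series uses to replace #ifdef CONFIG_CPU_ENDIAN_BE8 blocks in assembly; per asm/assembler.h it simply emits or swallows its argument:

    #ifdef CONFIG_CPU_ENDIAN_BE8
    #define ARM_BE8(code...) code
    #else
    #define ARM_BE8(code...)
    #endif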
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 6f4585b89078..924036473b16 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <asm/cp15.h> | 25 | #include <asm/cp15.h> |
26 | #include <asm/system_info.h> | 26 | #include <asm/system_info.h> |
27 | #include <asm/unaligned.h> | 27 | #include <asm/unaligned.h> |
28 | #include <asm/opcodes.h> | ||
28 | 29 | ||
29 | #include "fault.h" | 30 | #include "fault.h" |
30 | 31 | ||
@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
762 | if (thumb_mode(regs)) { | 763 | if (thumb_mode(regs)) { |
763 | u16 *ptr = (u16 *)(instrptr & ~1); | 764 | u16 *ptr = (u16 *)(instrptr & ~1); |
764 | fault = probe_kernel_address(ptr, tinstr); | 765 | fault = probe_kernel_address(ptr, tinstr); |
766 | tinstr = __mem_to_opcode_thumb16(tinstr); | ||
765 | if (!fault) { | 767 | if (!fault) { |
766 | if (cpu_architecture() >= CPU_ARCH_ARMv7 && | 768 | if (cpu_architecture() >= CPU_ARCH_ARMv7 && |
767 | IS_T32(tinstr)) { | 769 | IS_T32(tinstr)) { |
768 | /* Thumb-2 32-bit */ | 770 | /* Thumb-2 32-bit */ |
769 | u16 tinst2 = 0; | 771 | u16 tinst2 = 0; |
770 | fault = probe_kernel_address(ptr + 1, tinst2); | 772 | fault = probe_kernel_address(ptr + 1, tinst2); |
771 | instr = (tinstr << 16) | tinst2; | 773 | tinst2 = __mem_to_opcode_thumb16(tinst2); |
774 | instr = __opcode_thumb32_compose(tinstr, tinst2); | ||
772 | thumb2_32b = 1; | 775 | thumb2_32b = 1; |
773 | } else { | 776 | } else { |
774 | isize = 2; | 777 | isize = 2; |
775 | instr = thumb2arm(tinstr); | 778 | instr = thumb2arm(tinstr); |
776 | } | 779 | } |
777 | } | 780 | } |
778 | } else | 781 | } else { |
779 | fault = probe_kernel_address(instrptr, instr); | 782 | fault = probe_kernel_address(instrptr, instr); |
783 | instr = __mem_to_opcode_arm(instr); | ||
784 | } | ||
780 | 785 | ||
781 | if (fault) { | 786 | if (fault) { |
782 | type = TYPE_FAULT; | 787 | type = TYPE_FAULT; |
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index 83cb3ac27095..8e0e52eb76b5 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <asm/system_info.h> | 10 | #include <asm/system_info.h> |
11 | 11 | ||
12 | pgd_t *idmap_pgd; | 12 | pgd_t *idmap_pgd; |
13 | phys_addr_t (*arch_virt_to_idmap) (unsigned long x); | ||
13 | 14 | ||
14 | #ifdef CONFIG_ARM_LPAE | 15 | #ifdef CONFIG_ARM_LPAE |
15 | static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, | 16 | static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, |
@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start, | |||
67 | unsigned long addr, end; | 68 | unsigned long addr, end; |
68 | unsigned long next; | 69 | unsigned long next; |
69 | 70 | ||
70 | addr = virt_to_phys(text_start); | 71 | addr = virt_to_idmap(text_start); |
71 | end = virt_to_phys(text_end); | 72 | end = virt_to_idmap(text_end); |
73 | pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end); | ||
72 | 74 | ||
73 | prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; | 75 | prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; |
74 | 76 | ||
@@ -90,8 +92,6 @@ static int __init init_static_idmap(void) | |||
90 | if (!idmap_pgd) | 92 | if (!idmap_pgd) |
91 | return -ENOMEM; | 93 | return -ENOMEM; |
92 | 94 | ||
93 | pr_info("Setting up static identity map for 0x%p - 0x%p\n", | ||
94 | __idmap_text_start, __idmap_text_end); | ||
95 | identity_mapping_add(idmap_pgd, __idmap_text_start, | 95 | identity_mapping_add(idmap_pgd, __idmap_text_start, |
96 | __idmap_text_end, 0); | 96 | __idmap_text_end, 0); |
97 | 97 | ||
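The new arch_virt_to_idmap pointer lets a platform redirect identity-map translations to a different physical alias. A hedged sketch of the likely companion helper on the asm/memory.h side (the exact signature there is an assumption):

    static inline phys_addr_t virt_to_idmap(unsigned long x)
    {
            if (arch_virt_to_idmap)
                    return arch_virt_to_idmap(x);
            return virt_to_phys((void *)x);
    }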
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index b1d17eeb59b8..78eeeca78f5a 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include <asm/highmem.h> | 28 | #include <asm/highmem.h> |
29 | #include <asm/system_info.h> | 29 | #include <asm/system_info.h> |
30 | #include <asm/traps.h> | 30 | #include <asm/traps.h> |
31 | #include <asm/procinfo.h> | ||
32 | #include <asm/memory.h> | ||
31 | 33 | ||
32 | #include <asm/mach/arch.h> | 34 | #include <asm/mach/arch.h> |
33 | #include <asm/mach/map.h> | 35 | #include <asm/mach/map.h> |
@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void) | |||
1315 | } | 1317 | } |
1316 | } | 1318 | } |
1317 | 1319 | ||
1320 | #ifdef CONFIG_ARM_LPAE | ||
1321 | /* | ||
1322 | * early_paging_init() recreates boot time page table setup, allowing machines | ||
1323 | * to switch over to a high (>4G) address space on LPAE systems | ||
1324 | */ | ||
1325 | void __init early_paging_init(const struct machine_desc *mdesc, | ||
1326 | struct proc_info_list *procinfo) | ||
1327 | { | ||
1328 | pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags; | ||
1329 | unsigned long map_start, map_end; | ||
1330 | pgd_t *pgd0, *pgdk; | ||
1331 | pud_t *pud0, *pudk, *pud_start; | ||
1332 | pmd_t *pmd0, *pmdk; | ||
1333 | phys_addr_t phys; | ||
1334 | int i; | ||
1335 | |||
1336 | if (!(mdesc->init_meminfo)) | ||
1337 | return; | ||
1338 | |||
1339 | /* remap kernel code and data */ | ||
1340 | map_start = init_mm.start_code; | ||
1341 | map_end = init_mm.brk; | ||
1342 | |||
1343 | /* get a handle on things... */ | ||
1344 | pgd0 = pgd_offset_k(0); | ||
1345 | pud_start = pud0 = pud_offset(pgd0, 0); | ||
1346 | pmd0 = pmd_offset(pud0, 0); | ||
1347 | |||
1348 | pgdk = pgd_offset_k(map_start); | ||
1349 | pudk = pud_offset(pgdk, map_start); | ||
1350 | pmdk = pmd_offset(pudk, map_start); | ||
1351 | |||
1352 | mdesc->init_meminfo(); | ||
1353 | |||
1354 | /* Run the patch stub to update the constants */ | ||
1355 | fixup_pv_table(&__pv_table_begin, | ||
1356 | (&__pv_table_end - &__pv_table_begin) << 2); | ||
1357 | |||
1358 | /* | ||
1359 | * Cache cleaning operations for self-modifying code | ||
1360 | * We should clean the entries by MVA but running a | ||
1361 | * for loop over every pv_table entry pointer would | ||
1362 | * just complicate the code. | ||
1363 | */ | ||
1364 | flush_cache_louis(); | ||
1365 | dsb(); | ||
1366 | isb(); | ||
1367 | |||
1368 | /* remap level 1 table */ | ||
1369 | for (i = 0; i < PTRS_PER_PGD; pud0++, i++) { | ||
1370 | set_pud(pud0, | ||
1371 | __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER)); | ||
1372 | pmd0 += PTRS_PER_PMD; | ||
1373 | } | ||
1374 | |||
1375 | /* remap pmds for kernel mapping */ | ||
1376 | phys = __pa(map_start) & PMD_MASK; | ||
1377 | do { | ||
1378 | *pmdk++ = __pmd(phys | pmdprot); | ||
1379 | phys += PMD_SIZE; | ||
1380 | } while (phys < map_end); | ||
1381 | |||
1382 | flush_cache_all(); | ||
1383 | cpu_switch_mm(pgd0, &init_mm); | ||
1384 | cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET); | ||
1385 | local_flush_bp_all(); | ||
1386 | local_flush_tlb_all(); | ||
1387 | } | ||
1388 | |||
1389 | #else | ||
1390 | |||
1391 | void __init early_paging_init(const struct machine_desc *mdesc, | ||
1392 | struct proc_info_list *procinfo) | ||
1393 | { | ||
1394 | if (mdesc->init_meminfo) | ||
1395 | mdesc->init_meminfo(); | ||
1396 | } | ||
1397 | |||
1398 | #endif | ||
1399 | |||
1318 | /* | 1400 | /* |
1319 | * paging_init() sets up the page tables, initialises the zone memory | 1401 | * paging_init() sets up the page tables, initialises the zone memory |
1320 | * maps, and sets up the zero page, bad page and bad page tables. | 1402 | * maps, and sets up the zero page, bad page and bad page tables. |
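The machine hook this services is mdesc->init_meminfo. An illustrative pairing with the new arch_virt_to_idmap pointer, for a board whose RAM is aliased both below and above 4GB (every EXAMPLE_* name is an assumption, not from this patch):

    static phys_addr_t example_virt_to_idmap(unsigned long x)
    {
            /* idmap code must still run from the low, 32-bit reachable alias */
            return (phys_addr_t)x - PAGE_OFFSET + EXAMPLE_LOW_PHYS_START;
    }

    static void __init example_init_meminfo(void)
    {
            arch_virt_to_idmap = example_virt_to_idmap;
            /* ... shift the meminfo banks to the >4GB alias here ... */
    }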
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 34d4ab217bab..5c668b7a31f9 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c | |||
@@ -296,6 +296,15 @@ void __init sanity_check_meminfo(void) | |||
296 | } | 296 | } |
297 | 297 | ||
298 | /* | 298 | /* |
299 | * early_paging_init() recreates boot time page table setup, allowing machines | ||
300 | * to switch over to a high (>4G) address space on LPAE systems | ||
301 | */ | ||
302 | void __init early_paging_init(const struct machine_desc *mdesc, | ||
303 | struct proc_info_list *procinfo) | ||
304 | { | ||
305 | } | ||
306 | |||
307 | /* | ||
299 | * paging_init() sets up the page tables, initialises the zone memory | 308 | * paging_init() sets up the page tables, initialises the zone memory |
300 | * maps, and sets up the zero page, bad page and bad page tables. | 309 | * maps, and sets up the zero page, bad page and bad page tables. |
301 | */ | 310 | */ |
diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S index 1128064fddcb..45dc29f85d56 100644 --- a/arch/arm/mm/proc-v6.S +++ b/arch/arm/mm/proc-v6.S | |||
@@ -220,9 +220,7 @@ __v6_setup: | |||
220 | #endif /* CONFIG_MMU */ | 220 | #endif /* CONFIG_MMU */ |
221 | adr r5, v6_crval | 221 | adr r5, v6_crval |
222 | ldmia r5, {r5, r6} | 222 | ldmia r5, {r5, r6} |
223 | #ifdef CONFIG_CPU_ENDIAN_BE8 | 223 | ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables |
224 | orr r6, r6, #1 << 25 @ big-endian page tables | ||
225 | #endif | ||
226 | mrc p15, 0, r0, c1, c0, 0 @ read control register | 224 | mrc p15, 0, r0, c1, c0, 0 @ read control register |
227 | bic r0, r0, r5 @ clear those bits | 225 | bic r0, r0, r5 @ clear those bits |
228 | orr r0, r0, r6 @ set them | 226 | orr r0, r0, r6 @ set them |
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index c63d9bdee51e..60920f62fdf5 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S | |||
@@ -367,9 +367,7 @@ __v7_setup: | |||
367 | #endif | 367 | #endif |
368 | adr r5, v7_crval | 368 | adr r5, v7_crval |
369 | ldmia r5, {r5, r6} | 369 | ldmia r5, {r5, r6} |
370 | #ifdef CONFIG_CPU_ENDIAN_BE8 | 370 | ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables |
371 | orr r6, r6, #1 << 25 @ big-endian page tables | ||
372 | #endif | ||
373 | #ifdef CONFIG_SWP_EMULATE | 371 | #ifdef CONFIG_SWP_EMULATE |
374 | orr r5, r5, #(1 << 10) @ set SW bit in "clear" | 372 | orr r5, r5, #(1 << 10) @ set SW bit in "clear" |
375 | bic r6, r6, #(1 << 10) @ clear it in "mmuset" | 373 | bic r6, r6, #(1 << 10) @ clear it in "mmuset" |
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 99b44e0e8d86..9ed155ad0f97 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/if_vlan.h> | 19 | #include <linux/if_vlan.h> |
20 | #include <asm/cacheflush.h> | 20 | #include <asm/cacheflush.h> |
21 | #include <asm/hwcap.h> | 21 | #include <asm/hwcap.h> |
22 | #include <asm/opcodes.h> | ||
22 | 23 | ||
23 | #include "bpf_jit_32.h" | 24 | #include "bpf_jit_32.h" |
24 | 25 | ||
@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor) | |||
113 | 114 | ||
114 | static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) | 115 | static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) |
115 | { | 116 | { |
117 | inst |= (cond << 28); | ||
118 | inst = __opcode_to_mem_arm(inst); | ||
119 | |||
116 | if (ctx->target != NULL) | 120 | if (ctx->target != NULL) |
117 | ctx->target[ctx->idx] = inst | (cond << 28); | 121 | ctx->target[ctx->idx] = inst; |
118 | 122 | ||
119 | ctx->idx++; | 123 | ctx->idx++; |
120 | } | 124 | } |
diff --git a/arch/arm/plat-versatile/headsmp.S b/arch/arm/plat-versatile/headsmp.S index 2677bc3762d7..40f27e52de75 100644 --- a/arch/arm/plat-versatile/headsmp.S +++ b/arch/arm/plat-versatile/headsmp.S | |||
@@ -10,6 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | #include <linux/linkage.h> | 11 | #include <linux/linkage.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <asm/assembler.h> | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Realview/Versatile Express specific entry point for secondary CPUs. | 16 | * Realview/Versatile Express specific entry point for secondary CPUs. |
@@ -17,6 +18,7 @@ | |||
17 | * until we're ready for them to initialise. | 18 | * until we're ready for them to initialise. |
18 | */ | 19 | */ |
19 | ENTRY(versatile_secondary_startup) | 20 | ENTRY(versatile_secondary_startup) |
21 | ARM_BE8(setend be) | ||
20 | mrc p15, 0, r0, c0, c0, 5 | 22 | mrc p15, 0, r0, c0, c0, 5 |
21 | bic r0, #0xff000000 | 23 | bic r0, #0xff000000 |
22 | adr r4, 1f | 24 | adr r4, 1f |
diff --git a/crypto/Kconfig b/crypto/Kconfig index 69ce573f1224..71f337aefa39 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig | |||
@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM | |||
776 | 776 | ||
777 | See <http://csrc.nist.gov/encryption/aes/> for more information. | 777 | See <http://csrc.nist.gov/encryption/aes/> for more information. |
778 | 778 | ||
779 | config CRYPTO_AES_ARM_BS | ||
780 | tristate "Bit sliced AES using NEON instructions" | ||
781 | depends on ARM && KERNEL_MODE_NEON | ||
782 | select CRYPTO_ALGAPI | ||
783 | select CRYPTO_AES_ARM | ||
784 | select CRYPTO_ABLK_HELPER | ||
785 | help | ||
786 | Use a faster and more secure NEON based implementation of AES in CBC, | ||
787 | CTR and XTS modes. | ||
788 | |||
789 | Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode | ||
790 | and for XTS mode encryption; CBC and XTS mode decryption speedup is | ||
791 | around 25%. (CBC encryption speed is not affected by this driver.) | ||
792 | This implementation does not rely on any lookup tables so it is | ||
793 | believed to be invulnerable to cache timing attacks. | ||
794 | |||
779 | config CRYPTO_ANUBIS | 795 | config CRYPTO_ANUBIS |
780 | tristate "Anubis cipher algorithm" | 796 | tristate "Anubis cipher algorithm" |
781 | select CRYPTO_ALGAPI | 797 | select CRYPTO_ALGAPI |
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c index 200926699778..2e6c275322f1 100644 --- a/drivers/bus/arm-cci.c +++ b/drivers/bus/arm-cci.c | |||
@@ -280,7 +280,7 @@ asmlinkage void __naked cci_enable_port_for_self(void) | |||
280 | 280 | ||
281 | /* Enable the CCI port */ | 281 | /* Enable the CCI port */ |
282 | " ldr r0, [r0, %[offsetof_port_phys]] \n" | 282 | " ldr r0, [r0, %[offsetof_port_phys]] \n" |
283 | " mov r3, #"__stringify(CCI_ENABLE_REQ)" \n" | 283 | " mov r3, %[cci_enable_req]\n" |
284 | " str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n" | 284 | " str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n" |
285 | 285 | ||
286 | /* poll the status reg for completion */ | 286 | /* poll the status reg for completion */ |
@@ -288,7 +288,7 @@ asmlinkage void __naked cci_enable_port_for_self(void) | |||
288 | " ldr r0, [r1] \n" | 288 | " ldr r0, [r1] \n" |
289 | " ldr r0, [r0, r1] @ cci_ctrl_base \n" | 289 | " ldr r0, [r0, r1] @ cci_ctrl_base \n" |
290 | "4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n" | 290 | "4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n" |
291 | " tst r1, #1 \n" | 291 | " tst r1, %[cci_control_status_bits] \n" |
292 | " bne 4b \n" | 292 | " bne 4b \n" |
293 | 293 | ||
294 | " mov r0, #0 \n" | 294 | " mov r0, #0 \n" |
@@ -301,6 +301,8 @@ asmlinkage void __naked cci_enable_port_for_self(void) | |||
301 | "7: .word cci_ctrl_phys - . \n" | 301 | "7: .word cci_ctrl_phys - . \n" |
302 | : : | 302 | : : |
303 | [sizeof_cpu_port] "i" (sizeof(cpu_port)), | 303 | [sizeof_cpu_port] "i" (sizeof(cpu_port)), |
304 | [cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ), | ||
305 | [cci_control_status_bits] "i" cpu_to_le32(1), | ||
304 | #ifndef __ARMEB__ | 306 | #ifndef __ARMEB__ |
305 | [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)), | 307 | [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)), |
306 | #else | 308 | #else |
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index d0e948084eaf..9031171c141b 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c | |||
@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, | |||
253 | if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) | 253 | if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) |
254 | return -EINVAL; | 254 | return -EINVAL; |
255 | 255 | ||
256 | raw_spin_lock(&irq_controller_lock); | ||
256 | mask = 0xff << shift; | 257 | mask = 0xff << shift; |
257 | bit = gic_cpu_map[cpu] << shift; | 258 | bit = gic_cpu_map[cpu] << shift; |
258 | |||
259 | raw_spin_lock(&irq_controller_lock); | ||
260 | val = readl_relaxed(reg) & ~mask; | 259 | val = readl_relaxed(reg) & ~mask; |
261 | writel_relaxed(val | bit, reg); | 260 | writel_relaxed(val | bit, reg); |
262 | raw_spin_unlock(&irq_controller_lock); | 261 | raw_spin_unlock(&irq_controller_lock); |
@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic) | |||
652 | void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) | 651 | void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) |
653 | { | 652 | { |
654 | int cpu; | 653 | int cpu; |
655 | unsigned long map = 0; | 654 | unsigned long flags, map = 0; |
655 | |||
656 | raw_spin_lock_irqsave(&irq_controller_lock, flags); | ||
656 | 657 | ||
657 | /* Convert our logical CPU mask into a physical one. */ | 658 | /* Convert our logical CPU mask into a physical one. */ |
658 | for_each_cpu(cpu, mask) | 659 | for_each_cpu(cpu, mask) |
@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) | |||
666 | 667 | ||
667 | /* this always happens on GIC0 */ | 668 | /* this always happens on GIC0 */ |
668 | writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); | 669 | writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); |
670 | |||
671 | raw_spin_unlock_irqrestore(&irq_controller_lock, flags); | ||
672 | } | ||
673 | #endif | ||
674 | |||
675 | #ifdef CONFIG_BL_SWITCHER | ||
676 | /* | ||
677 | * gic_send_sgi - send an SGI directly to a given CPU interface number | ||
678 | * | ||
679 | * cpu_id: the ID for the destination CPU interface | ||
680 | * irq: the IPI number to send a SGI for | ||
681 | */ | ||
682 | void gic_send_sgi(unsigned int cpu_id, unsigned int irq) | ||
683 | { | ||
684 | BUG_ON(cpu_id >= NR_GIC_CPU_IF); | ||
685 | cpu_id = 1 << cpu_id; | ||
686 | /* this always happens on GIC0 */ | ||
687 | writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); | ||
688 | } | ||
689 | |||
690 | /* | ||
691 | * gic_get_cpu_id - get the CPU interface ID for the specified CPU | ||
692 | * | ||
693 | * @cpu: the logical CPU number to get the GIC ID for. | ||
694 | * | ||
695 | * Return the CPU interface ID for the given logical CPU number, | ||
696 | * or -1 if the CPU number is too large or the interface ID is | ||
697 | * unknown (more than one bit set). | ||
698 | */ | ||
699 | int gic_get_cpu_id(unsigned int cpu) | ||
700 | { | ||
701 | unsigned int cpu_bit; | ||
702 | |||
703 | if (cpu >= NR_GIC_CPU_IF) | ||
704 | return -1; | ||
705 | cpu_bit = gic_cpu_map[cpu]; | ||
706 | if (cpu_bit & (cpu_bit - 1)) | ||
707 | return -1; | ||
708 | return __ffs(cpu_bit); | ||
669 | } | 709 | } |
710 | |||
711 | /* | ||
712 | * gic_migrate_target - migrate IRQs to another CPU interface | ||
713 | * | ||
714 | * @new_cpu_id: the CPU target ID to migrate IRQs to | ||
715 | * | ||
716 | * Migrate all peripheral interrupts with a target matching the current CPU | ||
717 | * to the interface corresponding to @new_cpu_id. The CPU interface mapping | ||
718 | * is also updated. Targets to other CPU interfaces are unchanged. | ||
719 | * This must be called with IRQs locally disabled. | ||
720 | */ | ||
721 | void gic_migrate_target(unsigned int new_cpu_id) | ||
722 | { | ||
723 | unsigned int cur_cpu_id, gic_irqs, gic_nr = 0; | ||
724 | void __iomem *dist_base; | ||
725 | int i, ror_val, cpu = smp_processor_id(); | ||
726 | u32 val, cur_target_mask, active_mask; | ||
727 | |||
728 | if (gic_nr >= MAX_GIC_NR) | ||
729 | BUG(); | ||
730 | |||
731 | dist_base = gic_data_dist_base(&gic_data[gic_nr]); | ||
732 | if (!dist_base) | ||
733 | return; | ||
734 | gic_irqs = gic_data[gic_nr].gic_irqs; | ||
735 | |||
736 | cur_cpu_id = __ffs(gic_cpu_map[cpu]); | ||
737 | cur_target_mask = 0x01010101 << cur_cpu_id; | ||
738 | ror_val = (cur_cpu_id - new_cpu_id) & 31; | ||
739 | |||
740 | raw_spin_lock(&irq_controller_lock); | ||
741 | |||
742 | /* Update the target interface for this logical CPU */ | ||
743 | gic_cpu_map[cpu] = 1 << new_cpu_id; | ||
744 | |||
745 | /* | ||
746 | * Find all the peripheral interrupts targeting the current | ||
747 | * CPU interface and migrate them to the new CPU interface. | ||
748 | * We skip DIST_TARGET 0 to 7 as they are read-only. | ||
749 | */ | ||
750 | for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) { | ||
751 | val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4); | ||
752 | active_mask = val & cur_target_mask; | ||
753 | if (active_mask) { | ||
754 | val &= ~active_mask; | ||
755 | val |= ror32(active_mask, ror_val); | ||
756 | writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4); | ||
757 | } | ||
758 | } | ||
759 | |||
760 | raw_spin_unlock(&irq_controller_lock); | ||
761 | |||
762 | /* | ||
763 | * Now let's migrate and clear any potential SGIs that might be | ||
764 | * pending for us (cur_cpu_id). Since GIC_DIST_SGI_PENDING_SET | ||
765 | * is a banked register, we can only forward the SGI using | ||
766 | * GIC_DIST_SOFTINT. The original SGI source is lost but Linux | ||
767 | * doesn't use that information anyway. | ||
768 | * | ||
769 | * For the same reason we do not adjust the SGI source information | ||
770 | * for SGIs we previously sent to other CPUs either. | ||
771 | */ | ||
772 | for (i = 0; i < 16; i += 4) { | ||
773 | int j; | ||
774 | val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i); | ||
775 | if (!val) | ||
776 | continue; | ||
777 | writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i); | ||
778 | for (j = i; j < i + 4; j++) { | ||
779 | if (val & 0xff) | ||
780 | writel_relaxed((1 << (new_cpu_id + 16)) | j, | ||
781 | dist_base + GIC_DIST_SOFTINT); | ||
782 | val >>= 8; | ||
783 | } | ||
784 | } | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * gic_get_sgir_physaddr - get the physical address for the SGI register | ||
789 | * | ||
790 | * Return the physical address of the SGI register to be used | ||
791 | * by some early assembly code when the kernel is not yet available. | ||
792 | */ | ||
793 | static unsigned long gic_dist_physaddr; | ||
794 | |||
795 | unsigned long gic_get_sgir_physaddr(void) | ||
796 | { | ||
797 | if (!gic_dist_physaddr) | ||
798 | return 0; | ||
799 | return gic_dist_physaddr + GIC_DIST_SOFTINT; | ||
800 | } | ||
801 | |||
802 | void __init gic_init_physaddr(struct device_node *node) | ||
803 | { | ||
804 | struct resource res; | ||
805 | if (of_address_to_resource(node, 0, &res) == 0) { | ||
806 | gic_dist_physaddr = res.start; | ||
807 | pr_info("GIC physical location is %#lx\n", gic_dist_physaddr); | ||
808 | } | ||
809 | } | ||
810 | |||
811 | #else | ||
812 | #define gic_init_physaddr(node) do { } while (0) | ||
670 | #endif | 813 | #endif |
671 | 814 | ||
672 | static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, | 815 | static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, |
@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) | |||
850 | percpu_offset = 0; | 993 | percpu_offset = 0; |
851 | 994 | ||
852 | gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); | 995 | gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); |
996 | if (!gic_cnt) | ||
997 | gic_init_physaddr(node); | ||
853 | 998 | ||
854 | if (parent) { | 999 | if (parent) { |
855 | irq = irq_of_parse_and_map(node, 0); | 1000 | irq = irq_of_parse_and_map(node, 0); |
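The ror32() step in gic_migrate_target() deserves unpacking: GIC_DIST_TARGET packs four byte-wide CPU masks per word, and because 0x01010101 << id is periodic every 8 bits, a single 32-bit rotate retargets all four lanes at once. A worked example:

    #include <linux/bitops.h>  /* ror32() */
    #include <linux/bug.h>

    static void ror_lane_demo(void)
    {
            /* cur_cpu_id = 0, new_cpu_id = 1: rotating right by
             * (0 - 1) & 31 == 31 is a rotate left by one, moving bit 0
             * of every byte lane to bit 1 in one operation. */
            u32 active_mask = 0x01010101;  /* four IRQs on interface 0 */
            u32 retargeted  = ror32(active_mask, (0 - 1) & 31);

            BUG_ON(retargeted != 0x02020202);
    }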
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 0e5d9ecdb2b6..cac496b1e279 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h | |||
@@ -31,6 +31,8 @@ | |||
31 | #define GIC_DIST_TARGET 0x800 | 31 | #define GIC_DIST_TARGET 0x800 |
32 | #define GIC_DIST_CONFIG 0xc00 | 32 | #define GIC_DIST_CONFIG 0xc00 |
33 | #define GIC_DIST_SOFTINT 0xf00 | 33 | #define GIC_DIST_SOFTINT 0xf00 |
34 | #define GIC_DIST_SGI_PENDING_CLEAR 0xf10 | ||
35 | #define GIC_DIST_SGI_PENDING_SET 0xf20 | ||
34 | 36 | ||
35 | #define GICH_HCR 0x0 | 37 | #define GICH_HCR 0x0 |
36 | #define GICH_VTR 0x4 | 38 | #define GICH_VTR 0x4 |
@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start, | |||
74 | gic_init_bases(nr, start, dist, cpu, 0, NULL); | 76 | gic_init_bases(nr, start, dist, cpu, 0, NULL); |
75 | } | 77 | } |
76 | 78 | ||
79 | void gic_send_sgi(unsigned int cpu_id, unsigned int irq); | ||
80 | int gic_get_cpu_id(unsigned int cpu); | ||
81 | void gic_migrate_target(unsigned int new_cpu_id); | ||
82 | unsigned long gic_get_sgir_physaddr(void); | ||
83 | |||
77 | #endif /* __ASSEMBLY */ | 84 | #endif /* __ASSEMBLY */ |
78 | 85 | ||
79 | #endif | 86 | #endif |
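
The four new prototypes give other kernel code (the big.LITTLE switcher in particular) a small SGI/migration API. A hedged sketch of how a caller might combine them; the wrapper function and its error handling are illustrative, not part of this patch:

	/* Illustrative wrapper around the new GIC hooks. */
	static int migrate_irqs_to(unsigned int new_cpu)
	{
		int gic_id = gic_get_cpu_id(new_cpu);	/* physical GIC id */

		if (gic_id < 0)
			return -EINVAL;
		gic_migrate_target(gic_id);	/* retarget IRQs, replay SGIs */
		return 0;
	}
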
diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h new file mode 100644 index 000000000000..f76dd4de625e --- /dev/null +++ b/include/trace/events/power_cpu_migrate.h | |||
@@ -0,0 +1,67 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM power | ||
3 | |||
4 | #if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_POWER_CPU_MIGRATE_H | ||
6 | |||
7 | #include <linux/tracepoint.h> | ||
8 | |||
9 | #define __cpu_migrate_proto \ | ||
10 | TP_PROTO(u64 timestamp, \ | ||
11 | u32 cpu_hwid) | ||
12 | #define __cpu_migrate_args \ | ||
13 | TP_ARGS(timestamp, \ | ||
14 | cpu_hwid) | ||
15 | |||
16 | DECLARE_EVENT_CLASS(cpu_migrate, | ||
17 | |||
18 | __cpu_migrate_proto, | ||
19 | __cpu_migrate_args, | ||
20 | |||
21 | TP_STRUCT__entry( | ||
22 | __field(u64, timestamp ) | ||
23 | __field(u32, cpu_hwid ) | ||
24 | ), | ||
25 | |||
26 | TP_fast_assign( | ||
27 | __entry->timestamp = timestamp; | ||
28 | __entry->cpu_hwid = cpu_hwid; | ||
29 | ), | ||
30 | |||
31 | TP_printk("timestamp=%llu cpu_hwid=0x%08lX", | ||
32 | (unsigned long long)__entry->timestamp, | ||
33 | (unsigned long)__entry->cpu_hwid | ||
34 | ) | ||
35 | ); | ||
36 | |||
37 | #define __define_cpu_migrate_event(name) \ | ||
38 | DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \ | ||
39 | __cpu_migrate_proto, \ | ||
40 | __cpu_migrate_args \ | ||
41 | ) | ||
42 | |||
43 | __define_cpu_migrate_event(begin); | ||
44 | __define_cpu_migrate_event(finish); | ||
45 | __define_cpu_migrate_event(current); | ||
46 | |||
47 | #undef __define_cpu_migrate_event | ||
48 | #undef __cpu_migrate_proto | ||
49 | #undef __cpu_migrate_args | ||
50 | |||
51 | /* This file can get included multiple times; see TRACE_HEADER_MULTI_READ at the top */ | ||
52 | #ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING | ||
53 | #define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING | ||
54 | |||
55 | /* | ||
56 | * Set cpu_hwid to CPU_MIGRATE_ALL_CPUS to indicate | ||
57 | * a whole-cluster migration: | ||
58 | */ | ||
59 | #define CPU_MIGRATE_ALL_CPUS 0x80000000U | ||
60 | #endif | ||
61 | |||
62 | #endif /* _TRACE_POWER_CPU_MIGRATE_H */ | ||
63 | |||
64 | /* This part must be outside protection */ | ||
65 | #undef TRACE_INCLUDE_FILE | ||
66 | #define TRACE_INCLUDE_FILE power_cpu_migrate | ||
67 | #include <trace/define_trace.h> | ||
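
With CREATE_TRACE_POINTS defined in exactly one compilation unit, the event class above yields trace_cpu_migrate_begin/finish/current() calls. A minimal sketch of emitting a begin/finish pair; using local_clock() as the timestamp source is an assumption, not something this header mandates:

	#define CREATE_TRACE_POINTS
	#include <trace/events/power_cpu_migrate.h>

	static void trace_one_migration(u32 cpu_hwid)
	{
		/* timestamp source is the caller's choice */
		trace_cpu_migrate_begin(local_clock(), cpu_hwid);
		/* ... perform the actual switch ... */
		trace_cpu_migrate_finish(local_clock(), cpu_hwid);
	}
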
diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile index 15130b50dfe3..fe9b61e322a5 100644 --- a/tools/perf/arch/arm/Makefile +++ b/tools/perf/arch/arm/Makefile | |||
@@ -2,3 +2,6 @@ ifndef NO_DWARF | |||
2 | PERF_HAVE_DWARF_REGS := 1 | 2 | PERF_HAVE_DWARF_REGS := 1 |
3 | LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o | 3 | LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o |
4 | endif | 4 | endif |
5 | ifndef NO_LIBUNWIND | ||
6 | LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o | ||
7 | endif | ||
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h new file mode 100644 index 000000000000..2a1cfde66b69 --- /dev/null +++ b/tools/perf/arch/arm/include/perf_regs.h | |||
@@ -0,0 +1,54 @@ | |||
1 | #ifndef ARCH_PERF_REGS_H | ||
2 | #define ARCH_PERF_REGS_H | ||
3 | |||
4 | #include <stdlib.h> | ||
5 | #include "../../util/types.h" | ||
6 | #include <asm/perf_regs.h> | ||
7 | |||
8 | #define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1) | ||
9 | #define PERF_REG_IP PERF_REG_ARM_PC | ||
10 | #define PERF_REG_SP PERF_REG_ARM_SP | ||
11 | |||
12 | static inline const char *perf_reg_name(int id) | ||
13 | { | ||
14 | switch (id) { | ||
15 | case PERF_REG_ARM_R0: | ||
16 | return "r0"; | ||
17 | case PERF_REG_ARM_R1: | ||
18 | return "r1"; | ||
19 | case PERF_REG_ARM_R2: | ||
20 | return "r2"; | ||
21 | case PERF_REG_ARM_R3: | ||
22 | return "r3"; | ||
23 | case PERF_REG_ARM_R4: | ||
24 | return "r4"; | ||
25 | case PERF_REG_ARM_R5: | ||
26 | return "r5"; | ||
27 | case PERF_REG_ARM_R6: | ||
28 | return "r6"; | ||
29 | case PERF_REG_ARM_R7: | ||
30 | return "r7"; | ||
31 | case PERF_REG_ARM_R8: | ||
32 | return "r8"; | ||
33 | case PERF_REG_ARM_R9: | ||
34 | return "r9"; | ||
35 | case PERF_REG_ARM_R10: | ||
36 | return "r10"; | ||
37 | case PERF_REG_ARM_FP: | ||
38 | return "fp"; | ||
39 | case PERF_REG_ARM_IP: | ||
40 | return "ip"; | ||
41 | case PERF_REG_ARM_SP: | ||
42 | return "sp"; | ||
43 | case PERF_REG_ARM_LR: | ||
44 | return "lr"; | ||
45 | case PERF_REG_ARM_PC: | ||
46 | return "pc"; | ||
47 | default: | ||
48 | return NULL; | ||
49 | } | ||
50 | |||
51 | return NULL; | ||
52 | } | ||
53 | |||
54 | #endif /* ARCH_PERF_REGS_H */ | ||
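
PERF_REGS_MASK and perf_reg_name() only matter once samples actually carry user registers. A sketch of the perf_event_attr setup that requests them (PERF_SAMPLE_REGS_USER and sample_regs_user are standard perf ABI fields; the event choice here is arbitrary):

	#include <linux/perf_event.h>

	struct perf_event_attr attr = {
		.type		  = PERF_TYPE_HARDWARE,
		.config		  = PERF_COUNT_HW_CPU_CYCLES,
		/* ask the kernel to dump user regs with each sample */
		.sample_type	  = PERF_SAMPLE_REGS_USER,
		.sample_regs_user = PERF_REGS_MASK,	/* all 16 ARM core regs */
	};
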
diff --git a/tools/perf/arch/arm/util/unwind.c b/tools/perf/arch/arm/util/unwind.c new file mode 100644 index 000000000000..da3dc950550c --- /dev/null +++ b/tools/perf/arch/arm/util/unwind.c | |||
@@ -0,0 +1,48 @@ | |||
1 | |||
2 | #include <errno.h> | ||
3 | #include <libunwind.h> | ||
4 | #include "perf_regs.h" | ||
5 | #include "../../util/unwind.h" | ||
6 | |||
7 | int unwind__arch_reg_id(int regnum) | ||
8 | { | ||
9 | switch (regnum) { | ||
10 | case UNW_ARM_R0: | ||
11 | return PERF_REG_ARM_R0; | ||
12 | case UNW_ARM_R1: | ||
13 | return PERF_REG_ARM_R1; | ||
14 | case UNW_ARM_R2: | ||
15 | return PERF_REG_ARM_R2; | ||
16 | case UNW_ARM_R3: | ||
17 | return PERF_REG_ARM_R3; | ||
18 | case UNW_ARM_R4: | ||
19 | return PERF_REG_ARM_R4; | ||
20 | case UNW_ARM_R5: | ||
21 | return PERF_REG_ARM_R5; | ||
22 | case UNW_ARM_R6: | ||
23 | return PERF_REG_ARM_R6; | ||
24 | case UNW_ARM_R7: | ||
25 | return PERF_REG_ARM_R7; | ||
26 | case UNW_ARM_R8: | ||
27 | return PERF_REG_ARM_R8; | ||
28 | case UNW_ARM_R9: | ||
29 | return PERF_REG_ARM_R9; | ||
30 | case UNW_ARM_R10: | ||
31 | return PERF_REG_ARM_R10; | ||
32 | case UNW_ARM_R11: | ||
33 | return PERF_REG_ARM_FP; | ||
34 | case UNW_ARM_R12: | ||
35 | return PERF_REG_ARM_IP; | ||
36 | case UNW_ARM_R13: | ||
37 | return PERF_REG_ARM_SP; | ||
38 | case UNW_ARM_R14: | ||
39 | return PERF_REG_ARM_LR; | ||
40 | case UNW_ARM_R15: | ||
41 | return PERF_REG_ARM_PC; | ||
42 | default: | ||
43 | pr_err("unwind: invalid reg id %d\n", regnum); | ||
44 | return -EINVAL; | ||
45 | } | ||
46 | |||
47 | return -EINVAL; | ||
48 | } | ||
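
unwind__arch_reg_id() is the per-arch hook the generic unwinder calls when libunwind asks for a register value. A simplified sketch of that call site; the function shape and field names here are abbreviated from tools/perf/util/unwind.c rather than copied verbatim:

	/* Simplified: map libunwind's register number to the index of
	 * the corresponding value in the sampled user-register dump. */
	static int access_reg(unw_regnum_t regnum, unw_word_t *valp,
			      const u64 *sampled_regs)
	{
		int id = unwind__arch_reg_id(regnum);

		if (id < 0)
			return -EINVAL;
		*valp = sampled_regs[id];
		return 0;
	}
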
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 5f6f9b3271bb..75b93d7f7860 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile | |||
@@ -29,6 +29,10 @@ ifeq ($(ARCH),x86_64) | |||
29 | NO_PERF_REGS := 0 | 29 | NO_PERF_REGS := 0 |
30 | LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 | 30 | LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 |
31 | endif | 31 | endif |
32 | ifeq ($(ARCH),arm) | ||
33 | NO_PERF_REGS := 0 | ||
34 | LIBUNWIND_LIBS = -lunwind -lunwind-arm | ||
35 | endif | ||
32 | 36 | ||
33 | ifeq ($(NO_PERF_REGS),0) | 37 | ifeq ($(NO_PERF_REGS),0) |
34 | CFLAGS += -DHAVE_PERF_REGS | 38 | CFLAGS += -DHAVE_PERF_REGS |
@@ -208,8 +212,7 @@ ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y) | |||
208 | endif # try-cc | 212 | endif # try-cc |
209 | endif # NO_LIBELF | 213 | endif # NO_LIBELF |
210 | 214 | ||
211 | # There's only x86 (both 32 and 64) support for CFI unwind so far | 215 | ifeq ($(LIBUNWIND_LIBS),) |
212 | ifneq ($(ARCH),x86) | ||
213 | NO_LIBUNWIND := 1 | 216 | NO_LIBUNWIND := 1 |
214 | endif | 217 | endif |
215 | 218 | ||
@@ -223,9 +226,13 @@ endif | |||
223 | 226 | ||
224 | FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS) | 227 | FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS) |
225 | ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y) | 228 | ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y) |
226 | msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99); | 229 | msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1); |
227 | NO_LIBUNWIND := 1 | 230 | NO_LIBUNWIND := 1 |
228 | endif # Libunwind support | 231 | endif # Libunwind support |
232 | ifneq ($(call try-cc,$(SOURCE_LIBUNWIND_DEBUG_FRAME),$(FLAGS_UNWIND),libunwind debug_frame),y) | ||
233 | msg := $(warning No debug_frame support found in libunwind); | ||
234 | CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME | ||
235 | endif # debug_frame support in libunwind | ||
229 | endif # NO_LIBUNWIND | 236 | endif # NO_LIBUNWIND |
230 | 237 | ||
231 | ifndef NO_LIBUNWIND | 238 | ifndef NO_LIBUNWIND |
diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak index f79305739ecc..028fe997d5eb 100644 --- a/tools/perf/config/feature-tests.mak +++ b/tools/perf/config/feature-tests.mak | |||
@@ -185,7 +185,6 @@ extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, | |||
185 | unw_proc_info_t *pi, | 185 | unw_proc_info_t *pi, |
186 | int need_unwind_info, void *arg); | 186 | int need_unwind_info, void *arg); |
187 | 187 | ||
188 | |||
189 | #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) | 188 | #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) |
190 | 189 | ||
191 | int main(void) | 190 | int main(void) |
@@ -197,6 +196,26 @@ int main(void) | |||
197 | return 0; | 196 | return 0; |
198 | } | 197 | } |
199 | endef | 198 | endef |
199 | |||
200 | define SOURCE_LIBUNWIND_DEBUG_FRAME | ||
201 | #include <libunwind.h> | ||
202 | #include <stdlib.h> | ||
203 | |||
204 | extern int | ||
205 | UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, | ||
206 | unw_word_t ip, unw_word_t segbase, | ||
207 | const char *obj_name, unw_word_t start, | ||
208 | unw_word_t end); | ||
209 | |||
210 | #define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) | ||
211 | |||
212 | int main(void) | ||
213 | { | ||
214 | dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); | ||
215 | return 0; | ||
216 | } | ||
217 | endef | ||
218 | |||
200 | endif | 219 | endif |
201 | 220 | ||
202 | ifndef NO_BACKTRACE | 221 | ifndef NO_BACKTRACE |
diff --git a/tools/perf/util/unwind.c b/tools/perf/util/unwind.c index 2f891f7e70bf..5390d0b8862a 100644 --- a/tools/perf/util/unwind.c +++ b/tools/perf/util/unwind.c | |||
@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, | |||
39 | 39 | ||
40 | #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) | 40 | #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) |
41 | 41 | ||
42 | extern int | ||
43 | UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, | ||
44 | unw_word_t ip, | ||
45 | unw_word_t segbase, | ||
46 | const char *obj_name, unw_word_t start, | ||
47 | unw_word_t end); | ||
48 | |||
49 | #define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) | ||
50 | |||
42 | #define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */ | 51 | #define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */ |
43 | #define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */ | 52 | #define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */ |
44 | 53 | ||
@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine, | |||
245 | return 0; | 254 | return 0; |
246 | } | 255 | } |
247 | 256 | ||
248 | static int read_unwind_spec(struct dso *dso, struct machine *machine, | 257 | static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine, |
249 | u64 *table_data, u64 *segbase, u64 *fde_count) | 258 | u64 *table_data, u64 *segbase, |
259 | u64 *fde_count) | ||
250 | { | 260 | { |
251 | int ret = -EINVAL, fd; | 261 | int ret = -EINVAL, fd; |
252 | u64 offset; | 262 | u64 offset; |
@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine, | |||
255 | if (fd < 0) | 265 | if (fd < 0) |
256 | return -EINVAL; | 266 | return -EINVAL; |
257 | 267 | ||
268 | /* Check the .eh_frame_hdr section for unwinding info */ | ||
258 | offset = elf_section_offset(fd, ".eh_frame_hdr"); | 269 | offset = elf_section_offset(fd, ".eh_frame_hdr"); |
259 | close(fd); | 270 | close(fd); |
260 | 271 | ||
@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine, | |||
263 | table_data, segbase, | 274 | table_data, segbase, |
264 | fde_count); | 275 | fde_count); |
265 | 276 | ||
266 | /* TODO .debug_frame check if eh_frame_hdr fails */ | ||
267 | return ret; | 277 | return ret; |
268 | } | 278 | } |
269 | 279 | ||
280 | #ifndef NO_LIBUNWIND_DEBUG_FRAME | ||
281 | static int read_unwind_spec_debug_frame(struct dso *dso, | ||
282 | struct machine *machine, u64 *offset) | ||
283 | { | ||
284 | int fd = dso__data_fd(dso, machine); | ||
285 | |||
286 | if (fd < 0) | ||
287 | return -EINVAL; | ||
288 | |||
289 | /* Check the .debug_frame section for unwinding info */ | ||
290 | *offset = elf_section_offset(fd, ".debug_frame"); | ||
291 | close(fd); | ||
292 | |||
293 | if (*offset) | ||
294 | return 0; | ||
295 | |||
296 | return -EINVAL; | ||
297 | } | ||
298 | #endif | ||
299 | |||
270 | static struct map *find_map(unw_word_t ip, struct unwind_info *ui) | 300 | static struct map *find_map(unw_word_t ip, struct unwind_info *ui) |
271 | { | 301 | { |
272 | struct addr_location al; | 302 | struct addr_location al; |
@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, | |||
291 | 321 | ||
292 | pr_debug("unwind: find_proc_info dso %s\n", map->dso->name); | 322 | pr_debug("unwind: find_proc_info dso %s\n", map->dso->name); |
293 | 323 | ||
294 | if (read_unwind_spec(map->dso, ui->machine, | 324 | /* Check the .eh_frame section for unwinding info */ |
295 | &table_data, &segbase, &fde_count)) | 325 | if (!read_unwind_spec_eh_frame(map->dso, ui->machine, |
296 | return -EINVAL; | 326 | &table_data, &segbase, &fde_count)) { |
327 | memset(&di, 0, sizeof(di)); | ||
328 | di.format = UNW_INFO_FORMAT_REMOTE_TABLE; | ||
329 | di.start_ip = map->start; | ||
330 | di.end_ip = map->end; | ||
331 | di.u.rti.segbase = map->start + segbase; | ||
332 | di.u.rti.table_data = map->start + table_data; | ||
333 | di.u.rti.table_len = fde_count * sizeof(struct table_entry) | ||
334 | / sizeof(unw_word_t); | ||
335 | return dwarf_search_unwind_table(as, ip, &di, pi, | ||
336 | need_unwind_info, arg); | ||
337 | } | ||
338 | |||
339 | #ifndef NO_LIBUNWIND_DEBUG_FRAME | ||
340 | /* Check the .debug_frame section for unwinding info */ | ||
341 | if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) { | ||
342 | memset(&di, 0, sizeof(di)); | ||
343 | dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name, | ||
344 | map->start, map->end); | ||
345 | return dwarf_search_unwind_table(as, ip, &di, pi, | ||
346 | need_unwind_info, arg); | ||
347 | } | ||
348 | #endif | ||
297 | 349 | ||
298 | memset(&di, 0, sizeof(di)); | 350 | return -EINVAL; |
299 | di.format = UNW_INFO_FORMAT_REMOTE_TABLE; | ||
300 | di.start_ip = map->start; | ||
301 | di.end_ip = map->end; | ||
302 | di.u.rti.segbase = map->start + segbase; | ||
303 | di.u.rti.table_data = map->start + table_data; | ||
304 | di.u.rti.table_len = fde_count * sizeof(struct table_entry) | ||
305 | / sizeof(unw_word_t); | ||
306 | return dwarf_search_unwind_table(as, ip, &di, pi, | ||
307 | need_unwind_info, arg); | ||
308 | } | 351 | } |
309 | 352 | ||
310 | static int access_fpreg(unw_addr_space_t __maybe_unused as, | 353 | static int access_fpreg(unw_addr_space_t __maybe_unused as, |