author     Russell King <rmk+kernel@arm.linux.org.uk>	2013-11-12 05:58:59 -0500
committer  Russell King <rmk+kernel@arm.linux.org.uk>	2013-11-12 05:58:59 -0500
commit     df762eccbadf87850fbee444d729e0f1b1e946f1 (patch)
tree       1bf47bbbd4ea91e343f983b3b50ec2ec73a739e1
parent     ec1e20a02fe33b767ffcca8920a32211492416d7 (diff)
parent     70d42126877b9faa272d446a6de5917614c28dd9 (diff)

Merge branch 'devel-stable' into for-next

Conflicts:
	arch/arm/include/asm/atomic.h
	arch/arm/include/asm/hardirq.h
	arch/arm/kernel/smp.c
-rw-r--r--  arch/arm/Kconfig | 29
-rw-r--r--  arch/arm/Makefile | 1
-rw-r--r--  arch/arm/boot/compressed/head.S | 9
-rw-r--r--  arch/arm/common/Makefile | 2
-rw-r--r--  arch/arm/common/bL_switcher.c | 822
-rw-r--r--  arch/arm/common/bL_switcher_dummy_if.c | 71
-rw-r--r--  arch/arm/common/mcpm_entry.c | 12
-rw-r--r--  arch/arm/common/mcpm_head.S | 18
-rw-r--r--  arch/arm/crypto/.gitignore | 1
-rw-r--r--  arch/arm/crypto/Makefile | 14
-rw-r--r--  arch/arm/crypto/aes_glue.c | 22
-rw-r--r--  arch/arm/crypto/aes_glue.h | 19
-rw-r--r--  arch/arm/crypto/aesbs-core.S_shipped | 2544
-rw-r--r--  arch/arm/crypto/aesbs-glue.c | 434
-rw-r--r--  arch/arm/crypto/bsaes-armv7.pl | 2467
-rw-r--r--  arch/arm/include/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/asm/assembler.h | 7
-rw-r--r--  arch/arm/include/asm/atomic.h | 32
-rw-r--r--  arch/arm/include/asm/bL_switcher.h | 77
-rw-r--r--  arch/arm/include/asm/bug.h | 10
-rw-r--r--  arch/arm/include/asm/hardirq.h | 2
-rw-r--r--  arch/arm/include/asm/hardware/coresight.h | 8
-rw-r--r--  arch/arm/include/asm/kgdb.h | 3
-rw-r--r--  arch/arm/include/asm/mach/arch.h | 1
-rw-r--r--  arch/arm/include/asm/mcpm.h | 8
-rw-r--r--  arch/arm/include/asm/memory.h | 76
-rw-r--r--  arch/arm/include/asm/mmu.h | 2
-rw-r--r--  arch/arm/include/asm/processor.h | 33
-rw-r--r--  arch/arm/include/asm/smp.h | 2
-rw-r--r--  arch/arm/include/asm/spinlock.h | 28
-rw-r--r--  arch/arm/include/asm/spinlock_types.h | 2
-rw-r--r--  arch/arm/include/asm/unified.h | 4
-rw-r--r--  arch/arm/include/debug/pl01x.S | 2
-rw-r--r--  arch/arm/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/uapi/asm/perf_regs.h | 23
-rw-r--r--  arch/arm/kernel/Makefile | 4
-rw-r--r--  arch/arm/kernel/armksyms.c | 1
-rw-r--r--  arch/arm/kernel/entry-armv.S | 5
-rw-r--r--  arch/arm/kernel/entry-common.S | 4
-rw-r--r--  arch/arm/kernel/head.S | 82
-rw-r--r--  arch/arm/kernel/module.c | 57
-rw-r--r--  arch/arm/kernel/perf_event.c | 3
-rw-r--r--  arch/arm/kernel/perf_regs.c | 30
-rw-r--r--  arch/arm/kernel/setup.c | 4
-rw-r--r--  arch/arm/kernel/signal.c | 24
-rw-r--r--  arch/arm/kernel/sigreturn_codes.S | 80
-rw-r--r--  arch/arm/kernel/sleep.S | 27
-rw-r--r--  arch/arm/kernel/smp.c | 23
-rw-r--r--  arch/arm/kernel/smp_scu.c | 14
-rw-r--r--  arch/arm/kernel/smp_twd.c | 24
-rw-r--r--  arch/arm/kernel/suspend.c | 8
-rw-r--r--  arch/arm/kernel/traps.c | 24
-rw-r--r--  arch/arm/lib/bitops.h | 5
-rw-r--r--  arch/arm/mach-highbank/Kconfig | 1
-rw-r--r--  arch/arm/mach-ixp4xx/Kconfig | 4
-rw-r--r--  arch/arm/mach-mvebu/Kconfig | 1
-rw-r--r--  arch/arm/mach-mvebu/coherency_ll.S | 3
-rw-r--r--  arch/arm/mach-mvebu/headsmp.S | 4
-rw-r--r--  arch/arm/mach-vexpress/Kconfig | 1
-rw-r--r--  arch/arm/mm/Kconfig | 6
-rw-r--r--  arch/arm/mm/abort-ev6.S | 5
-rw-r--r--  arch/arm/mm/alignment.c | 9
-rw-r--r--  arch/arm/mm/idmap.c | 8
-rw-r--r--  arch/arm/mm/mmu.c | 82
-rw-r--r--  arch/arm/mm/nommu.c | 9
-rw-r--r--  arch/arm/mm/proc-v6.S | 4
-rw-r--r--  arch/arm/mm/proc-v7.S | 4
-rw-r--r--  arch/arm/net/bpf_jit_32.c | 6
-rw-r--r--  arch/arm/plat-versatile/headsmp.S | 2
-rw-r--r--  crypto/Kconfig | 16
-rw-r--r--  drivers/bus/arm-cci.c | 6
-rw-r--r--  drivers/irqchip/irq-gic.c | 151
-rw-r--r--  include/linux/irqchip/arm-gic.h | 7
-rw-r--r--  include/trace/events/power_cpu_migrate.h | 67
-rw-r--r--  tools/perf/arch/arm/Makefile | 3
-rw-r--r--  tools/perf/arch/arm/include/perf_regs.h | 54
-rw-r--r--  tools/perf/arch/arm/util/unwind.c | 48
-rw-r--r--  tools/perf/config/Makefile | 13
-rw-r--r--  tools/perf/config/feature-tests.mak | 21
-rw-r--r--  tools/perf/util/unwind.c | 75
80 files changed, 7571 insertions(+), 245 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index df0c609272e5..1dbb58c1feed 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -52,6 +52,8 @@ config ARM
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_PERF_EVENTS
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UID16
@@ -482,6 +484,7 @@ config ARCH_IXP4XX
 	bool "IXP4xx-based"
 	depends on MMU
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARCH_REQUIRE_GPIOLIB
 	select CLKSRC_MMIO
 	select CPU_XSCALE
@@ -1545,6 +1548,32 @@ config MCPM
 	  for (multi-)cluster based systems, such as big.LITTLE based
 	  systems.
 
+config BIG_LITTLE
+	bool "big.LITTLE support (Experimental)"
+	depends on CPU_V7 && SMP
+	select MCPM
+	help
+	  This option enables support selections for the big.LITTLE
+	  system architecture.
+
+config BL_SWITCHER
+	bool "big.LITTLE switcher support"
+	depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+	select CPU_PM
+	select ARM_CPU_SUSPEND
+	help
+	  The big.LITTLE "switcher" provides the core functionality to
+	  transparently handle transition between a cluster of A15's
+	  and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+	tristate "Simple big.LITTLE switcher user interface"
+	depends on BL_SWITCHER && DEBUG_KERNEL
+	help
+	  This is a simple and dummy char dev interface to control
+	  the big.LITTLE switcher core code. It is meant for
+	  debugging purposes only.
+
 choice
 	prompt "Memory split"
 	default VMSPLIT_3G
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index db50b626be98..25f45256f098 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -16,6 +16,7 @@ LDFLAGS		:=
 LDFLAGS_vmlinux	:=-p --no-undefined -X
 ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
 LDFLAGS_vmlinux	+= --be8
+LDFLAGS_MODULE	+= --be8
 endif
 
 OBJCOPYFLAGS	:=-O binary -R .comment -S
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 75189f13cf54..066b03480b63 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -135,6 +135,7 @@ start:
 		.word	_edata			@ zImage end address
  THUMB(		.thumb			)
 1:
+ ARM_BE8(	setend	be )			@ go BE8 if compiled for BE8
 		mrs	r9, cpsr
 #ifdef CONFIG_ARM_VIRT_EXT
 		bl	__hyp_stub_install	@ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
 		mrc	p15, 0, r0, c1, c0, 0	@ read control reg
 		orr	r0, r0, #0x5000		@ I-cache enable, RR cache replacement
 		orr	r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		bl	__common_mmu_cache_on
 		mov	r0, #0
 		mcr	p15, 0, r0, c8, c7, 0	@ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
 		orr	r0, r0, #1 << 22	@ U (v6 unaligned access model)
 						@ (needed for ARM1176)
 #ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		mrcne	p15, 0, r6, c2, c0, 2	@ read ttb control reg
 		orrne	r0, r0, #1		@ MMU enabled
 		movne	r1, #0xfffffffd		@ domain 0 = client
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 8c60f473e976..5c8584c4944d 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -17,3 +17,5 @@ obj-$(CONFIG_MCPM)	+= mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 AFLAGS_mcpm_head.o	:= -march=armv7-a
 AFLAGS_vlock.o		:= -march=armv7-a
 obj-$(CONFIG_TI_PRIV_EDMA)	+= edma.o
+obj-$(CONFIG_BL_SWITCHER)	+= bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
new file mode 100644
index 000000000000..5774b6ea7ad5
--- /dev/null
+++ b/arch/arm/common/bL_switcher.c
@@ -0,0 +1,822 @@
1/*
2 * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
3 *
4 * Created by: Nicolas Pitre, March 2012
5 * Copyright: (C) 2012-2013 Linaro Limited
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/atomic.h>
13#include <linux/init.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/sched.h>
17#include <linux/interrupt.h>
18#include <linux/cpu_pm.h>
19#include <linux/cpu.h>
20#include <linux/cpumask.h>
21#include <linux/kthread.h>
22#include <linux/wait.h>
23#include <linux/time.h>
24#include <linux/clockchips.h>
25#include <linux/hrtimer.h>
26#include <linux/tick.h>
27#include <linux/notifier.h>
28#include <linux/mm.h>
29#include <linux/mutex.h>
30#include <linux/smp.h>
31#include <linux/spinlock.h>
32#include <linux/string.h>
33#include <linux/sysfs.h>
34#include <linux/irqchip/arm-gic.h>
35#include <linux/moduleparam.h>
36
37#include <asm/smp_plat.h>
38#include <asm/cputype.h>
39#include <asm/suspend.h>
40#include <asm/mcpm.h>
41#include <asm/bL_switcher.h>
42
43#define CREATE_TRACE_POINTS
44#include <trace/events/power_cpu_migrate.h>
45
46
47/*
48 * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
49 * __attribute_const__ and we don't want the compiler to assume any
50 * constness here as the value _does_ change along some code paths.
51 */
52
53static int read_mpidr(void)
54{
55 unsigned int id;
56 asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
57 return id & MPIDR_HWID_BITMASK;
58}
59
60/*
61 * Get a global nanosecond time stamp for tracing.
62 */
63static s64 get_ns(void)
64{
65 struct timespec ts;
66 getnstimeofday(&ts);
67 return timespec_to_ns(&ts);
68}
69
70/*
71 * bL switcher core code.
72 */
73
74static void bL_do_switch(void *_arg)
75{
76 unsigned ib_mpidr, ib_cpu, ib_cluster;
77 long volatile handshake, **handshake_ptr = _arg;
78
79 pr_debug("%s\n", __func__);
80
81 ib_mpidr = cpu_logical_map(smp_processor_id());
82 ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
83 ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
84
85 /* Advertise our handshake location */
86 if (handshake_ptr) {
87 handshake = 0;
88 *handshake_ptr = &handshake;
89 } else
90 handshake = -1;
91
92 /*
93 * Our state has been saved at this point. Let's release our
94 * inbound CPU.
95 */
96 mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
97 sev();
98
99 /*
100 * From this point, we must assume that our counterpart CPU might
101 * have taken over in its parallel world already, as if execution
102 * just returned from cpu_suspend(). It is therefore important to
103 * be very careful not to make any change the other guy is not
104 * expecting. This is why we need stack isolation.
105 *
106 * Fancy under cover tasks could be performed here. For now
107 * we have none.
108 */
109
110 /*
111 * Let's wait until our inbound is alive.
112 */
113 while (!handshake) {
114 wfe();
115 smp_mb();
116 }
117
118 /* Let's put ourself down. */
119 mcpm_cpu_power_down();
120
121 /* should never get here */
122 BUG();
123}
124
125/*
126 * Stack isolation. To ensure 'current' remains valid, we just use another
127 * piece of our thread's stack space which should be fairly lightly used.
128 * The selected area starts just above the thread_info structure located
129 * at the very bottom of the stack, aligned to a cache line, and indexed
130 * with the cluster number.
131 */
132#define STACK_SIZE 512
133extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
134static int bL_switchpoint(unsigned long _arg)
135{
136 unsigned int mpidr = read_mpidr();
137 unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
138 void *stack = current_thread_info() + 1;
139 stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
140 stack += clusterid * STACK_SIZE + STACK_SIZE;
141 call_with_stack(bL_do_switch, (void *)_arg, stack);
142 BUG();
143}
144
145/*
146 * Generic switcher interface
147 */
148
149static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
150static int bL_switcher_cpu_pairing[NR_CPUS];
151
152/*
153 * bL_switch_to - Switch to a specific cluster for the current CPU
154 * @new_cluster_id: the ID of the cluster to switch to.
155 *
156 * This function must be called on the CPU to be switched.
157 * Returns 0 on success, else a negative status code.
158 */
159static int bL_switch_to(unsigned int new_cluster_id)
160{
161 unsigned int mpidr, this_cpu, that_cpu;
162 unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
163 struct completion inbound_alive;
164 struct tick_device *tdev;
165 enum clock_event_mode tdev_mode;
166 long volatile *handshake_ptr;
167 int ipi_nr, ret;
168
169 this_cpu = smp_processor_id();
170 ob_mpidr = read_mpidr();
171 ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
172 ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
173 BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
174
175 if (new_cluster_id == ob_cluster)
176 return 0;
177
178 that_cpu = bL_switcher_cpu_pairing[this_cpu];
179 ib_mpidr = cpu_logical_map(that_cpu);
180 ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
181 ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
182
183 pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
184 this_cpu, ob_mpidr, ib_mpidr);
185
186 this_cpu = smp_processor_id();
187
188 /* Close the gate for our entry vectors */
189 mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
190 mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
191
192 /* Install our "inbound alive" notifier. */
193 init_completion(&inbound_alive);
194 ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
195 ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
196 mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
197
198 /*
199 * Let's wake up the inbound CPU now in case it requires some delay
200 * to come online, but leave it gated in our entry vector code.
201 */
202 ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
203 if (ret) {
204 pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
205 return ret;
206 }
207
208 /*
209 * Raise a SGI on the inbound CPU to make sure it doesn't stall
210 * in a possible WFI, such as in bL_power_down().
211 */
212 gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
213
214 /*
215 * Wait for the inbound to come up. This allows for other
216 * tasks to be scheduled in the mean time.
217 */
218 wait_for_completion(&inbound_alive);
219 mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
220
221 /*
222 * From this point we are entering the switch critical zone
223 * and can't take any interrupts anymore.
224 */
225 local_irq_disable();
226 local_fiq_disable();
227 trace_cpu_migrate_begin(get_ns(), ob_mpidr);
228
229 /* redirect GIC's SGIs to our counterpart */
230 gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
231
232 tdev = tick_get_device(this_cpu);
233 if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
234 tdev = NULL;
235 if (tdev) {
236 tdev_mode = tdev->evtdev->mode;
237 clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
238 }
239
240 ret = cpu_pm_enter();
241
242 /* we can not tolerate errors at this point */
243 if (ret)
244 panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
245
246 /* Swap the physical CPUs in the logical map for this logical CPU. */
247 cpu_logical_map(this_cpu) = ib_mpidr;
248 cpu_logical_map(that_cpu) = ob_mpidr;
249
250 /* Let's do the actual CPU switch. */
251 ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
252 if (ret > 0)
253 panic("%s: cpu_suspend() returned %d\n", __func__, ret);
254
255 /* We are executing on the inbound CPU at this point */
256 mpidr = read_mpidr();
257 pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
258 BUG_ON(mpidr != ib_mpidr);
259
260 mcpm_cpu_powered_up();
261
262 ret = cpu_pm_exit();
263
264 if (tdev) {
265 clockevents_set_mode(tdev->evtdev, tdev_mode);
266 clockevents_program_event(tdev->evtdev,
267 tdev->evtdev->next_event, 1);
268 }
269
270 trace_cpu_migrate_finish(get_ns(), ib_mpidr);
271 local_fiq_enable();
272 local_irq_enable();
273
274 *handshake_ptr = 1;
275 dsb_sev();
276
277 if (ret)
278 pr_err("%s exiting with error %d\n", __func__, ret);
279 return ret;
280}
281
282struct bL_thread {
283 spinlock_t lock;
284 struct task_struct *task;
285 wait_queue_head_t wq;
286 int wanted_cluster;
287 struct completion started;
288 bL_switch_completion_handler completer;
289 void *completer_cookie;
290};
291
292static struct bL_thread bL_threads[NR_CPUS];
293
294static int bL_switcher_thread(void *arg)
295{
296 struct bL_thread *t = arg;
297 struct sched_param param = { .sched_priority = 1 };
298 int cluster;
299 bL_switch_completion_handler completer;
300 void *completer_cookie;
301
302 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
303 complete(&t->started);
304
305 do {
306 if (signal_pending(current))
307 flush_signals(current);
308 wait_event_interruptible(t->wq,
309 t->wanted_cluster != -1 ||
310 kthread_should_stop());
311
312 spin_lock(&t->lock);
313 cluster = t->wanted_cluster;
314 completer = t->completer;
315 completer_cookie = t->completer_cookie;
316 t->wanted_cluster = -1;
317 t->completer = NULL;
318 spin_unlock(&t->lock);
319
320 if (cluster != -1) {
321 bL_switch_to(cluster);
322
323 if (completer)
324 completer(completer_cookie);
325 }
326 } while (!kthread_should_stop());
327
328 return 0;
329}
330
331static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
332{
333 struct task_struct *task;
334
335 task = kthread_create_on_node(bL_switcher_thread, arg,
336 cpu_to_node(cpu), "kswitcher_%d", cpu);
337 if (!IS_ERR(task)) {
338 kthread_bind(task, cpu);
339 wake_up_process(task);
340 } else
341 pr_err("%s failed for CPU %d\n", __func__, cpu);
342 return task;
343}
344
345/*
346 * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
347 * with completion notification via a callback
348 *
349 * @cpu: the CPU to switch
350 * @new_cluster_id: the ID of the cluster to switch to.
351 * @completer: switch completion callback. if non-NULL,
352 * @completer(@completer_cookie) will be called on completion of
353 * the switch, in non-atomic context.
354 * @completer_cookie: opaque context argument for @completer.
355 *
356 * This function causes a cluster switch on the given CPU by waking up
357 * the appropriate switcher thread. This function may or may not return
358 * before the switch has occurred.
359 *
360 * If a @completer callback function is supplied, it will be called when
361 * the switch is complete. This can be used to determine asynchronously
362 * when the switch is complete, regardless of when bL_switch_request()
363 * returns. When @completer is supplied, no new switch request is permitted
364 * for the affected CPU until after the switch is complete, and @completer
365 * has returned.
366 */
367int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
368 bL_switch_completion_handler completer,
369 void *completer_cookie)
370{
371 struct bL_thread *t;
372
373 if (cpu >= ARRAY_SIZE(bL_threads)) {
374 pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
375 return -EINVAL;
376 }
377
378 t = &bL_threads[cpu];
379
380 if (IS_ERR(t->task))
381 return PTR_ERR(t->task);
382 if (!t->task)
383 return -ESRCH;
384
385 spin_lock(&t->lock);
386 if (t->completer) {
387 spin_unlock(&t->lock);
388 return -EBUSY;
389 }
390 t->completer = completer;
391 t->completer_cookie = completer_cookie;
392 t->wanted_cluster = new_cluster_id;
393 spin_unlock(&t->lock);
394 wake_up(&t->wq);
395 return 0;
396}
397EXPORT_SYMBOL_GPL(bL_switch_request_cb);
398
399/*
400 * Activation and configuration code.
401 */
402
403static DEFINE_MUTEX(bL_switcher_activation_lock);
404static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
405static unsigned int bL_switcher_active;
406static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
407static cpumask_t bL_switcher_removed_logical_cpus;
408
409int bL_switcher_register_notifier(struct notifier_block *nb)
410{
411 return blocking_notifier_chain_register(&bL_activation_notifier, nb);
412}
413EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
414
415int bL_switcher_unregister_notifier(struct notifier_block *nb)
416{
417 return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
418}
419EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
420
421static int bL_activation_notify(unsigned long val)
422{
423 int ret;
424
425 ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
426 if (ret & NOTIFY_STOP_MASK)
427 pr_err("%s: notifier chain failed with status 0x%x\n",
428 __func__, ret);
429 return notifier_to_errno(ret);
430}
431
432static void bL_switcher_restore_cpus(void)
433{
434 int i;
435
436 for_each_cpu(i, &bL_switcher_removed_logical_cpus)
437 cpu_up(i);
438}
439
440static int bL_switcher_halve_cpus(void)
441{
442 int i, j, cluster_0, gic_id, ret;
443 unsigned int cpu, cluster, mask;
444 cpumask_t available_cpus;
445
446 /* First pass to validate what we have */
447 mask = 0;
448 for_each_online_cpu(i) {
449 cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
450 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
451 if (cluster >= 2) {
452 pr_err("%s: only dual cluster systems are supported\n", __func__);
453 return -EINVAL;
454 }
455 if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
456 return -EINVAL;
457 mask |= (1 << cluster);
458 }
459 if (mask != 3) {
460 pr_err("%s: no CPU pairing possible\n", __func__);
461 return -EINVAL;
462 }
463
464 /*
465 * Now let's do the pairing. We match each CPU with another CPU
466 * from a different cluster. To get a uniform scheduling behavior
467 * without fiddling with CPU topology and compute capacity data,
468 * we'll use logical CPUs initially belonging to the same cluster.
469 */
470 memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
471 cpumask_copy(&available_cpus, cpu_online_mask);
472 cluster_0 = -1;
473 for_each_cpu(i, &available_cpus) {
474 int match = -1;
475 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
476 if (cluster_0 == -1)
477 cluster_0 = cluster;
478 if (cluster != cluster_0)
479 continue;
480 cpumask_clear_cpu(i, &available_cpus);
481 for_each_cpu(j, &available_cpus) {
482 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
483 /*
484 * Let's remember the last match to create "odd"
485 * pairings on purpose in order for other code not
486 * to assume any relation between physical and
487 * logical CPU numbers.
488 */
489 if (cluster != cluster_0)
490 match = j;
491 }
492 if (match != -1) {
493 bL_switcher_cpu_pairing[i] = match;
494 cpumask_clear_cpu(match, &available_cpus);
495 pr_info("CPU%d paired with CPU%d\n", i, match);
496 }
497 }
498
499 /*
500 * Now we disable the unwanted CPUs i.e. everything that has no
501 * pairing information (that includes the pairing counterparts).
502 */
503 cpumask_clear(&bL_switcher_removed_logical_cpus);
504 for_each_online_cpu(i) {
505 cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
506 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
507
508 /* Let's take note of the GIC ID for this CPU */
509 gic_id = gic_get_cpu_id(i);
510 if (gic_id < 0) {
511 pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
512 bL_switcher_restore_cpus();
513 return -EINVAL;
514 }
515 bL_gic_id[cpu][cluster] = gic_id;
516 pr_info("GIC ID for CPU %u cluster %u is %u\n",
517 cpu, cluster, gic_id);
518
519 if (bL_switcher_cpu_pairing[i] != -1) {
520 bL_switcher_cpu_original_cluster[i] = cluster;
521 continue;
522 }
523
524 ret = cpu_down(i);
525 if (ret) {
526 bL_switcher_restore_cpus();
527 return ret;
528 }
529 cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
530 }
531
532 return 0;
533}
534
535/* Determine the logical CPU a given physical CPU is grouped on. */
536int bL_switcher_get_logical_index(u32 mpidr)
537{
538 int cpu;
539
540 if (!bL_switcher_active)
541 return -EUNATCH;
542
543 mpidr &= MPIDR_HWID_BITMASK;
544 for_each_online_cpu(cpu) {
545 int pairing = bL_switcher_cpu_pairing[cpu];
546 if (pairing == -1)
547 continue;
548 if ((mpidr == cpu_logical_map(cpu)) ||
549 (mpidr == cpu_logical_map(pairing)))
550 return cpu;
551 }
552 return -EINVAL;
553}
554
555static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
556{
557 trace_cpu_migrate_current(get_ns(), read_mpidr());
558}
559
560int bL_switcher_trace_trigger(void)
561{
562 int ret;
563
564 preempt_disable();
565
566 bL_switcher_trace_trigger_cpu(NULL);
567 ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
568
569 preempt_enable();
570
571 return ret;
572}
573EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
574
575static int bL_switcher_enable(void)
576{
577 int cpu, ret;
578
579 mutex_lock(&bL_switcher_activation_lock);
580 lock_device_hotplug();
581 if (bL_switcher_active) {
582 unlock_device_hotplug();
583 mutex_unlock(&bL_switcher_activation_lock);
584 return 0;
585 }
586
587 pr_info("big.LITTLE switcher initializing\n");
588
589 ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
590 if (ret)
591 goto error;
592
593 ret = bL_switcher_halve_cpus();
594 if (ret)
595 goto error;
596
597 bL_switcher_trace_trigger();
598
599 for_each_online_cpu(cpu) {
600 struct bL_thread *t = &bL_threads[cpu];
601 spin_lock_init(&t->lock);
602 init_waitqueue_head(&t->wq);
603 init_completion(&t->started);
604 t->wanted_cluster = -1;
605 t->task = bL_switcher_thread_create(cpu, t);
606 }
607
608 bL_switcher_active = 1;
609 bL_activation_notify(BL_NOTIFY_POST_ENABLE);
610 pr_info("big.LITTLE switcher initialized\n");
611 goto out;
612
613error:
614 pr_warn("big.LITTLE switcher initialization failed\n");
615 bL_activation_notify(BL_NOTIFY_POST_DISABLE);
616
617out:
618 unlock_device_hotplug();
619 mutex_unlock(&bL_switcher_activation_lock);
620 return ret;
621}
622
623#ifdef CONFIG_SYSFS
624
625static void bL_switcher_disable(void)
626{
627 unsigned int cpu, cluster;
628 struct bL_thread *t;
629 struct task_struct *task;
630
631 mutex_lock(&bL_switcher_activation_lock);
632 lock_device_hotplug();
633
634 if (!bL_switcher_active)
635 goto out;
636
637 if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
638 bL_activation_notify(BL_NOTIFY_POST_ENABLE);
639 goto out;
640 }
641
642 bL_switcher_active = 0;
643
644 /*
645 * To deactivate the switcher, we must shut down the switcher
646 * threads to prevent any other requests from being accepted.
647 * Then, if the final cluster for given logical CPU is not the
648 * same as the original one, we'll recreate a switcher thread
649 * just for the purpose of switching the CPU back without any
650 * possibility for interference from external requests.
651 */
652 for_each_online_cpu(cpu) {
653 t = &bL_threads[cpu];
654 task = t->task;
655 t->task = NULL;
656 if (!task || IS_ERR(task))
657 continue;
658 kthread_stop(task);
659 /* no more switch may happen on this CPU at this point */
660 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
661 if (cluster == bL_switcher_cpu_original_cluster[cpu])
662 continue;
663 init_completion(&t->started);
664 t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
665 task = bL_switcher_thread_create(cpu, t);
666 if (!IS_ERR(task)) {
667 wait_for_completion(&t->started);
668 kthread_stop(task);
669 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
670 if (cluster == bL_switcher_cpu_original_cluster[cpu])
671 continue;
672 }
673 /* If execution gets here, we're in trouble. */
674 pr_crit("%s: unable to restore original cluster for CPU %d\n",
675 __func__, cpu);
676 pr_crit("%s: CPU %d can't be restored\n",
677 __func__, bL_switcher_cpu_pairing[cpu]);
678 cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
679 &bL_switcher_removed_logical_cpus);
680 }
681
682 bL_switcher_restore_cpus();
683 bL_switcher_trace_trigger();
684
685 bL_activation_notify(BL_NOTIFY_POST_DISABLE);
686
687out:
688 unlock_device_hotplug();
689 mutex_unlock(&bL_switcher_activation_lock);
690}
691
692static ssize_t bL_switcher_active_show(struct kobject *kobj,
693 struct kobj_attribute *attr, char *buf)
694{
695 return sprintf(buf, "%u\n", bL_switcher_active);
696}
697
698static ssize_t bL_switcher_active_store(struct kobject *kobj,
699 struct kobj_attribute *attr, const char *buf, size_t count)
700{
701 int ret;
702
703 switch (buf[0]) {
704 case '0':
705 bL_switcher_disable();
706 ret = 0;
707 break;
708 case '1':
709 ret = bL_switcher_enable();
710 break;
711 default:
712 ret = -EINVAL;
713 }
714
715 return (ret >= 0) ? count : ret;
716}
717
718static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
719 struct kobj_attribute *attr, const char *buf, size_t count)
720{
721 int ret = bL_switcher_trace_trigger();
722
723 return ret ? ret : count;
724}
725
726static struct kobj_attribute bL_switcher_active_attr =
727 __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
728
729static struct kobj_attribute bL_switcher_trace_trigger_attr =
730 __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
731
732static struct attribute *bL_switcher_attrs[] = {
733 &bL_switcher_active_attr.attr,
734 &bL_switcher_trace_trigger_attr.attr,
735 NULL,
736};
737
738static struct attribute_group bL_switcher_attr_group = {
739 .attrs = bL_switcher_attrs,
740};
741
742static struct kobject *bL_switcher_kobj;
743
744static int __init bL_switcher_sysfs_init(void)
745{
746 int ret;
747
748 bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
749 if (!bL_switcher_kobj)
750 return -ENOMEM;
751 ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
752 if (ret)
753 kobject_put(bL_switcher_kobj);
754 return ret;
755}
756
757#endif /* CONFIG_SYSFS */
758
759bool bL_switcher_get_enabled(void)
760{
761 mutex_lock(&bL_switcher_activation_lock);
762
763 return bL_switcher_active;
764}
765EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
766
767void bL_switcher_put_enabled(void)
768{
769 mutex_unlock(&bL_switcher_activation_lock);
770}
771EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
772
773/*
774 * Veto any CPU hotplug operation on those CPUs we've removed
775 * while the switcher is active.
776 * We're just not ready to deal with that given the trickery involved.
777 */
778static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
779 unsigned long action, void *hcpu)
780{
781 if (bL_switcher_active) {
782 int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
783 switch (action & 0xf) {
784 case CPU_UP_PREPARE:
785 case CPU_DOWN_PREPARE:
786 if (pairing == -1)
787 return NOTIFY_BAD;
788 }
789 }
790 return NOTIFY_DONE;
791}
792
793static bool no_bL_switcher;
794core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
795
796static int __init bL_switcher_init(void)
797{
798 int ret;
799
800 if (MAX_NR_CLUSTERS != 2) {
801 pr_err("%s: only dual cluster systems are supported\n", __func__);
802 return -EINVAL;
803 }
804
805 cpu_notifier(bL_switcher_hotplug_callback, 0);
806
807 if (!no_bL_switcher) {
808 ret = bL_switcher_enable();
809 if (ret)
810 return ret;
811 }
812
813#ifdef CONFIG_SYSFS
814 ret = bL_switcher_sysfs_init();
815 if (ret)
816 pr_err("%s: unable to create sysfs entry\n", __func__);
817#endif
818
819 return 0;
820}
821
822late_initcall(bL_switcher_init);
diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c
new file mode 100644
index 000000000000..3f47f1203c6b
--- /dev/null
+++ b/arch/arm/common/bL_switcher_dummy_if.c
@@ -0,0 +1,71 @@
1/*
2 * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
3 *
4 * Created by: Nicolas Pitre, November 2012
5 * Copyright: (C) 2012-2013 Linaro Limited
6 *
7 * Dummy interface to user space for debugging purpose only.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/init.h>
15#include <linux/module.h>
16#include <linux/fs.h>
17#include <linux/miscdevice.h>
18#include <asm/uaccess.h>
19#include <asm/bL_switcher.h>
20
21static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
22 size_t len, loff_t *pos)
23{
24 unsigned char val[3];
25 unsigned int cpu, cluster;
26 int ret;
27
28 pr_debug("%s\n", __func__);
29
30 if (len < 3)
31 return -EINVAL;
32
33 if (copy_from_user(val, buf, 3))
34 return -EFAULT;
35
36 /* format: <cpu#>,<cluster#> */
37 if (val[0] < '0' || val[0] > '9' ||
38 val[1] != ',' ||
39 val[2] < '0' || val[2] > '1')
40 return -EINVAL;
41
42 cpu = val[0] - '0';
43 cluster = val[2] - '0';
44 ret = bL_switch_request(cpu, cluster);
45
46 return ret ? : len;
47}
48
49static const struct file_operations bL_switcher_fops = {
50 .write = bL_switcher_write,
51 .owner = THIS_MODULE,
52};
53
54static struct miscdevice bL_switcher_device = {
55 MISC_DYNAMIC_MINOR,
56 "b.L_switcher",
57 &bL_switcher_fops
58};
59
60static int __init bL_switcher_dummy_if_init(void)
61{
62 return misc_register(&bL_switcher_device);
63}
64
65static void __exit bL_switcher_dummy_if_exit(void)
66{
67 misc_deregister(&bL_switcher_device);
68}
69
70module_init(bL_switcher_dummy_if_init);
71module_exit(bL_switcher_dummy_if_exit);
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 6c03d0152e7f..26020a03f659 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 	sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
 
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+			 unsigned long poke_phys_addr, unsigned long poke_val)
+{
+	unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+	poke[0] = poke_phys_addr;
+	poke[1] = poke_val;
+	__cpuc_flush_dcache_area((void *)poke, 8);
+	outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
 static const struct mcpm_platform_ops *platform_ops;
 
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 39c96df3477a..e02db4b81a66 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -15,6 +15,7 @@
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
+#include <asm/assembler.h>
 
 #include "vlock.h"
 
@@ -47,6 +48,7 @@
 
 ENTRY(mcpm_entry_point)
 
+ARM_BE8(setend	be)
 THUMB(	adr	r12, BSYM(1f)	)
 THUMB(	bx	r12		)
 THUMB(	.thumb			)
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
 	 * position independent way.
 	 */
 	adr	r5, 3f
-	ldmia	r5, {r6, r7, r8, r11}
+	ldmia	r5, {r0, r6, r7, r8, r11}
+	add	r0, r5, r0			@ r0 = mcpm_entry_early_pokes
 	add	r6, r5, r6			@ r6 = mcpm_entry_vectors
 	ldr	r7, [r5, r7]			@ r7 = mcpm_power_up_setup_phys
 	add	r8, r5, r8			@ r8 = mcpm_sync
 	add	r11, r5, r11			@ r11 = first_man_locks
 
+	@ Perform an early poke, if any
+	add	r0, r0, r4, lsl #3
+	ldmia	r0, {r0, r1}
+	teq	r0, #0
+	strne	r1, [r0]
+
 	mov	r0, #MCPM_SYNC_CLUSTER_SIZE
 	mla	r8, r0, r10, r8			@ r8 = sync cluster base
 
@@ -195,7 +204,8 @@ mcpm_entry_gated:
 
 	.align	2
 
-3:	.word	mcpm_entry_vectors - .
+3:	.word	mcpm_entry_early_pokes - .
+	.word	mcpm_entry_vectors - 3b
 	.word	mcpm_power_up_setup_phys - 3b
 	.word	mcpm_sync - 3b
 	.word	first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
 ENTRY(mcpm_entry_vectors)
 	.space	4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
 
+	.type	mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+	.space	8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
 	.type	mcpm_power_up_setup_phys, #object
 ENTRY(mcpm_power_up_setup_phys)
 	.space	4		@ set by mcpm_sync_init()
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644
index 000000000000..6231d36b3635
--- /dev/null
+++ b/arch/arm/crypto/.gitignore
@@ -0,0 +1 @@
aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90..81cda39860c5 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -3,7 +3,17 @@
 #
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
+aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index 59f7877ead6a..3003fa1f6fb4 100644
--- a/arch/arm/crypto/aes_glue.c
+++ b/arch/arm/crypto/aes_glue.c
@@ -6,22 +6,12 @@
 #include <linux/crypto.h>
 #include <crypto/aes.h>
 
-#define AES_MAXNR 14
+#include "aes_glue.h"
 
-typedef struct {
-	unsigned int rd_key[4 *(AES_MAXNR + 1)];
-	int rounds;
-} AES_KEY;
-
-struct AES_CTX {
-	AES_KEY enc_key;
-	AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
 	.cipher	= {
 		.cia_min_keysize	= AES_MIN_KEY_SIZE,
 		.cia_max_keysize	= AES_MAX_KEY_SIZE,
-		.cia_setkey		= aes_set_key,
+		.cia_setkey		= aes_set_key,
 		.cia_encrypt		= aes_encrypt,
 		.cia_decrypt		= aes_decrypt
 	}
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644
index 000000000000..cca3e51eb606
--- /dev/null
+++ b/arch/arm/crypto/aes_glue.h
@@ -0,0 +1,19 @@
1
2#define AES_MAXNR 14
3
4struct AES_KEY {
5 unsigned int rd_key[4 * (AES_MAXNR + 1)];
6 int rounds;
7};
8
9struct AES_CTX {
10 struct AES_KEY enc_key;
11 struct AES_KEY dec_key;
12};
13
14asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
15asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
16asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
17 const int bits, struct AES_KEY *key);
18asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
19 const int bits, struct AES_KEY *key);
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
new file mode 100644
index 000000000000..64205d453260
--- /dev/null
+++ b/arch/arm/crypto/aesbs-core.S_shipped
@@ -0,0 +1,2544 @@
1
2@ ====================================================================
3@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
4@ project. The module is, however, dual licensed under OpenSSL and
5@ CRYPTOGAMS licenses depending on where you obtain it. For further
6@ details see http://www.openssl.org/~appro/cryptogams/.
7@
8@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
9@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
10@ granted.
11@ ====================================================================
12
13@ Bit-sliced AES for ARM NEON
14@
15@ February 2012.
16@
17@ This implementation is direct adaptation of bsaes-x86_64 module for
18@ ARM NEON. Except that this module is endian-neutral [in sense that
19@ it can be compiled for either endianness] by courtesy of vld1.8's
20@ neutrality. Initial version doesn't implement interface to OpenSSL,
21@ only low-level primitives and unsupported entry points, just enough
22@ to collect performance results, which for Cortex-A8 core are:
23@
24@ encrypt 19.5 cycles per byte processed with 128-bit key
25@ decrypt 22.1 cycles per byte processed with 128-bit key
26@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
27@
28@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
29@ which is [much] worse than anticipated (for further details see
30@ http://www.openssl.org/~appro/Snapdragon-S4.html).
31@
32@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
33@ manages in 20.0 cycles].
34@
35@ When comparing to x86_64 results keep in mind that NEON unit is
36@ [mostly] single-issue and thus can't [fully] benefit from
37@ instruction-level parallelism. And when comparing to aes-armv4
38@ results keep in mind key schedule conversion overhead (see
39@ bsaes-x86_64.pl for further details)...
40@
41@ <appro@openssl.org>
42
43@ April-August 2013
44@
45@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
46@
47@ <ard.biesheuvel@linaro.org>
48
49#ifndef __KERNEL__
50# include "arm_arch.h"
51
52# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
53# define VFP_ABI_POP vldmia sp!,{d8-d15}
54# define VFP_ABI_FRAME 0x40
55#else
56# define VFP_ABI_PUSH
57# define VFP_ABI_POP
58# define VFP_ABI_FRAME 0
59# define BSAES_ASM_EXTENDED_KEY
60# define XTS_CHAIN_TWEAK
61# define __ARM_ARCH__ __LINUX_ARM_ARCH__
62#endif
63
64#ifdef __thumb__
65# define adrl adr
66#endif
67
68#if __ARM_ARCH__>=7
69.text
70.syntax unified @ ARMv7-capable assembler is expected to handle this
71#ifdef __thumb2__
72.thumb
73#else
74.code 32
75#endif
76
77.fpu neon
78
79.type _bsaes_decrypt8,%function
80.align 4
81_bsaes_decrypt8:
82 adr r6,_bsaes_decrypt8
83 vldmia r4!, {q9} @ round 0 key
84 add r6,r6,#.LM0ISR-_bsaes_decrypt8
85
86 vldmia r6!, {q8} @ .LM0ISR
87 veor q10, q0, q9 @ xor with round0 key
88 veor q11, q1, q9
89 vtbl.8 d0, {q10}, d16
90 vtbl.8 d1, {q10}, d17
91 veor q12, q2, q9
92 vtbl.8 d2, {q11}, d16
93 vtbl.8 d3, {q11}, d17
94 veor q13, q3, q9
95 vtbl.8 d4, {q12}, d16
96 vtbl.8 d5, {q12}, d17
97 veor q14, q4, q9
98 vtbl.8 d6, {q13}, d16
99 vtbl.8 d7, {q13}, d17
100 veor q15, q5, q9
101 vtbl.8 d8, {q14}, d16
102 vtbl.8 d9, {q14}, d17
103 veor q10, q6, q9
104 vtbl.8 d10, {q15}, d16
105 vtbl.8 d11, {q15}, d17
106 veor q11, q7, q9
107 vtbl.8 d12, {q10}, d16
108 vtbl.8 d13, {q10}, d17
109 vtbl.8 d14, {q11}, d16
110 vtbl.8 d15, {q11}, d17
111 vmov.i8 q8,#0x55 @ compose .LBS0
112 vmov.i8 q9,#0x33 @ compose .LBS1
113 vshr.u64 q10, q6, #1
114 vshr.u64 q11, q4, #1
115 veor q10, q10, q7
116 veor q11, q11, q5
117 vand q10, q10, q8
118 vand q11, q11, q8
119 veor q7, q7, q10
120 vshl.u64 q10, q10, #1
121 veor q5, q5, q11
122 vshl.u64 q11, q11, #1
123 veor q6, q6, q10
124 veor q4, q4, q11
125 vshr.u64 q10, q2, #1
126 vshr.u64 q11, q0, #1
127 veor q10, q10, q3
128 veor q11, q11, q1
129 vand q10, q10, q8
130 vand q11, q11, q8
131 veor q3, q3, q10
132 vshl.u64 q10, q10, #1
133 veor q1, q1, q11
134 vshl.u64 q11, q11, #1
135 veor q2, q2, q10
136 veor q0, q0, q11
137 vmov.i8 q8,#0x0f @ compose .LBS2
138 vshr.u64 q10, q5, #2
139 vshr.u64 q11, q4, #2
140 veor q10, q10, q7
141 veor q11, q11, q6
142 vand q10, q10, q9
143 vand q11, q11, q9
144 veor q7, q7, q10
145 vshl.u64 q10, q10, #2
146 veor q6, q6, q11
147 vshl.u64 q11, q11, #2
148 veor q5, q5, q10
149 veor q4, q4, q11
150 vshr.u64 q10, q1, #2
151 vshr.u64 q11, q0, #2
152 veor q10, q10, q3
153 veor q11, q11, q2
154 vand q10, q10, q9
155 vand q11, q11, q9
156 veor q3, q3, q10
157 vshl.u64 q10, q10, #2
158 veor q2, q2, q11
159 vshl.u64 q11, q11, #2
160 veor q1, q1, q10
161 veor q0, q0, q11
162 vshr.u64 q10, q3, #4
163 vshr.u64 q11, q2, #4
164 veor q10, q10, q7
165 veor q11, q11, q6
166 vand q10, q10, q8
167 vand q11, q11, q8
168 veor q7, q7, q10
169 vshl.u64 q10, q10, #4
170 veor q6, q6, q11
171 vshl.u64 q11, q11, #4
172 veor q3, q3, q10
173 veor q2, q2, q11
174 vshr.u64 q10, q1, #4
175 vshr.u64 q11, q0, #4
176 veor q10, q10, q5
177 veor q11, q11, q4
178 vand q10, q10, q8
179 vand q11, q11, q8
180 veor q5, q5, q10
181 vshl.u64 q10, q10, #4
182 veor q4, q4, q11
183 vshl.u64 q11, q11, #4
184 veor q1, q1, q10
185 veor q0, q0, q11
186 sub r5,r5,#1
187 b .Ldec_sbox
188.align 4
189.Ldec_loop:
190 vldmia r4!, {q8-q11}
191 veor q8, q8, q0
192 veor q9, q9, q1
193 vtbl.8 d0, {q8}, d24
194 vtbl.8 d1, {q8}, d25
195 vldmia r4!, {q8}
196 veor q10, q10, q2
197 vtbl.8 d2, {q9}, d24
198 vtbl.8 d3, {q9}, d25
199 vldmia r4!, {q9}
200 veor q11, q11, q3
201 vtbl.8 d4, {q10}, d24
202 vtbl.8 d5, {q10}, d25
203 vldmia r4!, {q10}
204 vtbl.8 d6, {q11}, d24
205 vtbl.8 d7, {q11}, d25
206 vldmia r4!, {q11}
207 veor q8, q8, q4
208 veor q9, q9, q5
209 vtbl.8 d8, {q8}, d24
210 vtbl.8 d9, {q8}, d25
211 veor q10, q10, q6
212 vtbl.8 d10, {q9}, d24
213 vtbl.8 d11, {q9}, d25
214 veor q11, q11, q7
215 vtbl.8 d12, {q10}, d24
216 vtbl.8 d13, {q10}, d25
217 vtbl.8 d14, {q11}, d24
218 vtbl.8 d15, {q11}, d25
219.Ldec_sbox:
220 veor q1, q1, q4
221 veor q3, q3, q4
222
223 veor q4, q4, q7
224 veor q1, q1, q6
225 veor q2, q2, q7
226 veor q6, q6, q4
227
228 veor q0, q0, q1
229 veor q2, q2, q5
230 veor q7, q7, q6
231 veor q3, q3, q0
232 veor q5, q5, q0
233 veor q1, q1, q3
234 veor q11, q3, q0
235 veor q10, q7, q4
236 veor q9, q1, q6
237 veor q13, q4, q0
238 vmov q8, q10
239 veor q12, q5, q2
240
241 vorr q10, q10, q9
242 veor q15, q11, q8
243 vand q14, q11, q12
244 vorr q11, q11, q12
245 veor q12, q12, q9
246 vand q8, q8, q9
247 veor q9, q6, q2
248 vand q15, q15, q12
249 vand q13, q13, q9
250 veor q9, q3, q7
251 veor q12, q1, q5
252 veor q11, q11, q13
253 veor q10, q10, q13
254 vand q13, q9, q12
255 vorr q9, q9, q12
256 veor q11, q11, q15
257 veor q8, q8, q13
258 veor q10, q10, q14
259 veor q9, q9, q15
260 veor q8, q8, q14
261 vand q12, q4, q6
262 veor q9, q9, q14
263 vand q13, q0, q2
264 vand q14, q7, q1
265 vorr q15, q3, q5
266 veor q11, q11, q12
267 veor q9, q9, q14
268 veor q8, q8, q15
269 veor q10, q10, q13
270
271 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
272
273 @ new smaller inversion
274
275 vand q14, q11, q9
276 vmov q12, q8
277
278 veor q13, q10, q14
279 veor q15, q8, q14
280 veor q14, q8, q14 @ q14=q15
281
282 vbsl q13, q9, q8
283 vbsl q15, q11, q10
284 veor q11, q11, q10
285
286 vbsl q12, q13, q14
287 vbsl q8, q14, q13
288
289 vand q14, q12, q15
290 veor q9, q9, q8
291
292 veor q14, q14, q11
293 veor q12, q5, q2
294 veor q8, q1, q6
295 veor q10, q15, q14
296 vand q10, q10, q5
297 veor q5, q5, q1
298 vand q11, q1, q15
299 vand q5, q5, q14
300 veor q1, q11, q10
301 veor q5, q5, q11
302 veor q15, q15, q13
303 veor q14, q14, q9
304 veor q11, q15, q14
305 veor q10, q13, q9
306 vand q11, q11, q12
307 vand q10, q10, q2
308 veor q12, q12, q8
309 veor q2, q2, q6
310 vand q8, q8, q15
311 vand q6, q6, q13
312 vand q12, q12, q14
313 vand q2, q2, q9
314 veor q8, q8, q12
315 veor q2, q2, q6
316 veor q12, q12, q11
317 veor q6, q6, q10
318 veor q5, q5, q12
319 veor q2, q2, q12
320 veor q1, q1, q8
321 veor q6, q6, q8
322
323 veor q12, q3, q0
324 veor q8, q7, q4
325 veor q11, q15, q14
326 veor q10, q13, q9
327 vand q11, q11, q12
328 vand q10, q10, q0
329 veor q12, q12, q8
330 veor q0, q0, q4
331 vand q8, q8, q15
332 vand q4, q4, q13
333 vand q12, q12, q14
334 vand q0, q0, q9
335 veor q8, q8, q12
336 veor q0, q0, q4
337 veor q12, q12, q11
338 veor q4, q4, q10
339 veor q15, q15, q13
340 veor q14, q14, q9
341 veor q10, q15, q14
342 vand q10, q10, q3
343 veor q3, q3, q7
344 vand q11, q7, q15
345 vand q3, q3, q14
346 veor q7, q11, q10
347 veor q3, q3, q11
348 veor q3, q3, q12
349 veor q0, q0, q12
350 veor q7, q7, q8
351 veor q4, q4, q8
352 veor q1, q1, q7
353 veor q6, q6, q5
354
355 veor q4, q4, q1
356 veor q2, q2, q7
357 veor q5, q5, q7
358 veor q4, q4, q2
359 veor q7, q7, q0
360 veor q4, q4, q5
361 veor q3, q3, q6
362 veor q6, q6, q1
363 veor q3, q3, q4
364
365 veor q4, q4, q0
366 veor q7, q7, q3
367 subs r5,r5,#1
368 bcc .Ldec_done
369 @ multiplication by 0x05-0x00-0x04-0x00
370 vext.8 q8, q0, q0, #8
371 vext.8 q14, q3, q3, #8
372 vext.8 q15, q5, q5, #8
373 veor q8, q8, q0
374 vext.8 q9, q1, q1, #8
375 veor q14, q14, q3
376 vext.8 q10, q6, q6, #8
377 veor q15, q15, q5
378 vext.8 q11, q4, q4, #8
379 veor q9, q9, q1
380 vext.8 q12, q2, q2, #8
381 veor q10, q10, q6
382 vext.8 q13, q7, q7, #8
383 veor q11, q11, q4
384 veor q12, q12, q2
385 veor q13, q13, q7
386
387 veor q0, q0, q14
388 veor q1, q1, q14
389 veor q6, q6, q8
390 veor q2, q2, q10
391 veor q4, q4, q9
392 veor q1, q1, q15
393 veor q6, q6, q15
394 veor q2, q2, q14
395 veor q7, q7, q11
396 veor q4, q4, q14
397 veor q3, q3, q12
398 veor q2, q2, q15
399 veor q7, q7, q15
400 veor q5, q5, q13
401 vext.8 q8, q0, q0, #12 @ x0 <<< 32
402 vext.8 q9, q1, q1, #12
403 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
404 vext.8 q10, q6, q6, #12
405 veor q1, q1, q9
406 vext.8 q11, q4, q4, #12
407 veor q6, q6, q10
408 vext.8 q12, q2, q2, #12
409 veor q4, q4, q11
410 vext.8 q13, q7, q7, #12
411 veor q2, q2, q12
412 vext.8 q14, q3, q3, #12
413 veor q7, q7, q13
414 vext.8 q15, q5, q5, #12
415 veor q3, q3, q14
416
417 veor q9, q9, q0
418 veor q5, q5, q15
419 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
420 veor q10, q10, q1
421 veor q8, q8, q5
422 veor q9, q9, q5
423 vext.8 q1, q1, q1, #8
424 veor q13, q13, q2
425 veor q0, q0, q8
426 veor q14, q14, q7
427 veor q1, q1, q9
428 vext.8 q8, q2, q2, #8
429 veor q12, q12, q4
430 vext.8 q9, q7, q7, #8
431 veor q15, q15, q3
432 vext.8 q2, q4, q4, #8
433 veor q11, q11, q6
434 vext.8 q7, q5, q5, #8
435 veor q12, q12, q5
436 vext.8 q4, q3, q3, #8
437 veor q11, q11, q5
438 vext.8 q3, q6, q6, #8
439 veor q5, q9, q13
440 veor q11, q11, q2
441 veor q7, q7, q15
442 veor q6, q4, q14
443 veor q4, q8, q12
444 veor q2, q3, q10
445 vmov q3, q11
446 @ vmov q5, q9
447 vldmia r6, {q12} @ .LISR
448 ite eq @ Thumb2 thing, sanity check in ARM
449 addeq r6,r6,#0x10
450 bne .Ldec_loop
451 vldmia r6, {q12} @ .LISRM0
452 b .Ldec_loop
453.align 4
454.Ldec_done:
455 vmov.i8 q8,#0x55 @ compose .LBS0
456 vmov.i8 q9,#0x33 @ compose .LBS1
457 vshr.u64 q10, q3, #1
458 vshr.u64 q11, q2, #1
459 veor q10, q10, q5
460 veor q11, q11, q7
461 vand q10, q10, q8
462 vand q11, q11, q8
463 veor q5, q5, q10
464 vshl.u64 q10, q10, #1
465 veor q7, q7, q11
466 vshl.u64 q11, q11, #1
467 veor q3, q3, q10
468 veor q2, q2, q11
469 vshr.u64 q10, q6, #1
470 vshr.u64 q11, q0, #1
471 veor q10, q10, q4
472 veor q11, q11, q1
473 vand q10, q10, q8
474 vand q11, q11, q8
475 veor q4, q4, q10
476 vshl.u64 q10, q10, #1
477 veor q1, q1, q11
478 vshl.u64 q11, q11, #1
479 veor q6, q6, q10
480 veor q0, q0, q11
481 vmov.i8 q8,#0x0f @ compose .LBS2
482 vshr.u64 q10, q7, #2
483 vshr.u64 q11, q2, #2
484 veor q10, q10, q5
485 veor q11, q11, q3
486 vand q10, q10, q9
487 vand q11, q11, q9
488 veor q5, q5, q10
489 vshl.u64 q10, q10, #2
490 veor q3, q3, q11
491 vshl.u64 q11, q11, #2
492 veor q7, q7, q10
493 veor q2, q2, q11
494 vshr.u64 q10, q1, #2
495 vshr.u64 q11, q0, #2
496 veor q10, q10, q4
497 veor q11, q11, q6
498 vand q10, q10, q9
499 vand q11, q11, q9
500 veor q4, q4, q10
501 vshl.u64 q10, q10, #2
502 veor q6, q6, q11
503 vshl.u64 q11, q11, #2
504 veor q1, q1, q10
505 veor q0, q0, q11
506 vshr.u64 q10, q4, #4
507 vshr.u64 q11, q6, #4
508 veor q10, q10, q5
509 veor q11, q11, q3
510 vand q10, q10, q8
511 vand q11, q11, q8
512 veor q5, q5, q10
513 vshl.u64 q10, q10, #4
514 veor q3, q3, q11
515 vshl.u64 q11, q11, #4
516 veor q4, q4, q10
517 veor q6, q6, q11
518 vshr.u64 q10, q1, #4
519 vshr.u64 q11, q0, #4
520 veor q10, q10, q7
521 veor q11, q11, q2
522 vand q10, q10, q8
523 vand q11, q11, q8
524 veor q7, q7, q10
525 vshl.u64 q10, q10, #4
526 veor q2, q2, q11
527 vshl.u64 q11, q11, #4
528 veor q1, q1, q10
529 veor q0, q0, q11
530 vldmia r4, {q8} @ last round key
531 veor q6, q6, q8
532 veor q4, q4, q8
533 veor q2, q2, q8
534 veor q7, q7, q8
535 veor q3, q3, q8
536 veor q5, q5, q8
537 veor q0, q0, q8
538 veor q1, q1, q8
539 bx lr
540.size _bsaes_decrypt8,.-_bsaes_decrypt8
541
542.type _bsaes_const,%object
543.align 6
544_bsaes_const:
545.LM0ISR: @ InvShiftRows constants
546 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
547.LISR:
548 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
549.LISRM0:
550 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
551.LM0SR: @ ShiftRows constants
552 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
553.LSR:
554 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
555.LSRM0:
556 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
557.LM0:
558 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
559.LREVM0SR:
560 .quad 0x090d01050c000408, 0x03070b0f060a0e02
561.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
562.align 6
563.size _bsaes_const,.-_bsaes_const
564
565.type _bsaes_encrypt8,%function
566.align 4
567_bsaes_encrypt8:
568 adr r6,_bsaes_encrypt8
569 vldmia r4!, {q9} @ round 0 key
570 sub r6,r6,#_bsaes_encrypt8-.LM0SR
571
572 vldmia r6!, {q8} @ .LM0SR
573_bsaes_encrypt8_alt:
574 veor q10, q0, q9 @ xor with round0 key
575 veor q11, q1, q9
576 vtbl.8 d0, {q10}, d16
577 vtbl.8 d1, {q10}, d17
578 veor q12, q2, q9
579 vtbl.8 d2, {q11}, d16
580 vtbl.8 d3, {q11}, d17
581 veor q13, q3, q9
582 vtbl.8 d4, {q12}, d16
583 vtbl.8 d5, {q12}, d17
584 veor q14, q4, q9
585 vtbl.8 d6, {q13}, d16
586 vtbl.8 d7, {q13}, d17
587 veor q15, q5, q9
588 vtbl.8 d8, {q14}, d16
589 vtbl.8 d9, {q14}, d17
590 veor q10, q6, q9
591 vtbl.8 d10, {q15}, d16
592 vtbl.8 d11, {q15}, d17
593 veor q11, q7, q9
594 vtbl.8 d12, {q10}, d16
595 vtbl.8 d13, {q10}, d17
596 vtbl.8 d14, {q11}, d16
597 vtbl.8 d15, {q11}, d17
598_bsaes_encrypt8_bitslice:
599 vmov.i8 q8,#0x55 @ compose .LBS0
600 vmov.i8 q9,#0x33 @ compose .LBS1
601 vshr.u64 q10, q6, #1
602 vshr.u64 q11, q4, #1
603 veor q10, q10, q7
604 veor q11, q11, q5
605 vand q10, q10, q8
606 vand q11, q11, q8
607 veor q7, q7, q10
608 vshl.u64 q10, q10, #1
609 veor q5, q5, q11
610 vshl.u64 q11, q11, #1
611 veor q6, q6, q10
612 veor q4, q4, q11
613 vshr.u64 q10, q2, #1
614 vshr.u64 q11, q0, #1
615 veor q10, q10, q3
616 veor q11, q11, q1
617 vand q10, q10, q8
618 vand q11, q11, q8
619 veor q3, q3, q10
620 vshl.u64 q10, q10, #1
621 veor q1, q1, q11
622 vshl.u64 q11, q11, #1
623 veor q2, q2, q10
624 veor q0, q0, q11
625 vmov.i8 q8,#0x0f @ compose .LBS2
626 vshr.u64 q10, q5, #2
627 vshr.u64 q11, q4, #2
628 veor q10, q10, q7
629 veor q11, q11, q6
630 vand q10, q10, q9
631 vand q11, q11, q9
632 veor q7, q7, q10
633 vshl.u64 q10, q10, #2
634 veor q6, q6, q11
635 vshl.u64 q11, q11, #2
636 veor q5, q5, q10
637 veor q4, q4, q11
638 vshr.u64 q10, q1, #2
639 vshr.u64 q11, q0, #2
640 veor q10, q10, q3
641 veor q11, q11, q2
642 vand q10, q10, q9
643 vand q11, q11, q9
644 veor q3, q3, q10
645 vshl.u64 q10, q10, #2
646 veor q2, q2, q11
647 vshl.u64 q11, q11, #2
648 veor q1, q1, q10
649 veor q0, q0, q11
650 vshr.u64 q10, q3, #4
651 vshr.u64 q11, q2, #4
652 veor q10, q10, q7
653 veor q11, q11, q6
654 vand q10, q10, q8
655 vand q11, q11, q8
656 veor q7, q7, q10
657 vshl.u64 q10, q10, #4
658 veor q6, q6, q11
659 vshl.u64 q11, q11, #4
660 veor q3, q3, q10
661 veor q2, q2, q11
662 vshr.u64 q10, q1, #4
663 vshr.u64 q11, q0, #4
664 veor q10, q10, q5
665 veor q11, q11, q4
666 vand q10, q10, q8
667 vand q11, q11, q8
668 veor q5, q5, q10
669 vshl.u64 q10, q10, #4
670 veor q4, q4, q11
671 vshl.u64 q11, q11, #4
672 veor q1, q1, q10
673 veor q0, q0, q11
674 sub r5,r5,#1
675 b .Lenc_sbox
676.align 4
677.Lenc_loop:
678 vldmia r4!, {q8-q11}
679 veor q8, q8, q0
680 veor q9, q9, q1
681 vtbl.8 d0, {q8}, d24
682 vtbl.8 d1, {q8}, d25
683 vldmia r4!, {q8}
684 veor q10, q10, q2
685 vtbl.8 d2, {q9}, d24
686 vtbl.8 d3, {q9}, d25
687 vldmia r4!, {q9}
688 veor q11, q11, q3
689 vtbl.8 d4, {q10}, d24
690 vtbl.8 d5, {q10}, d25
691 vldmia r4!, {q10}
692 vtbl.8 d6, {q11}, d24
693 vtbl.8 d7, {q11}, d25
694 vldmia r4!, {q11}
695 veor q8, q8, q4
696 veor q9, q9, q5
697 vtbl.8 d8, {q8}, d24
698 vtbl.8 d9, {q8}, d25
699 veor q10, q10, q6
700 vtbl.8 d10, {q9}, d24
701 vtbl.8 d11, {q9}, d25
702 veor q11, q11, q7
703 vtbl.8 d12, {q10}, d24
704 vtbl.8 d13, {q10}, d25
705 vtbl.8 d14, {q11}, d24
706 vtbl.8 d15, {q11}, d25
707.Lenc_sbox:
708 veor q2, q2, q1
709 veor q5, q5, q6
710 veor q3, q3, q0
711 veor q6, q6, q2
712 veor q5, q5, q0
713
714 veor q6, q6, q3
715 veor q3, q3, q7
716 veor q7, q7, q5
717 veor q3, q3, q4
718 veor q4, q4, q5
719
720 veor q2, q2, q7
721 veor q3, q3, q1
722 veor q1, q1, q5
723 veor q11, q7, q4
724 veor q10, q1, q2
725 veor q9, q5, q3
726 veor q13, q2, q4
727 vmov q8, q10
728 veor q12, q6, q0
729
730 vorr q10, q10, q9
731 veor q15, q11, q8
732 vand q14, q11, q12
733 vorr q11, q11, q12
734 veor q12, q12, q9
735 vand q8, q8, q9
736 veor q9, q3, q0
737 vand q15, q15, q12
738 vand q13, q13, q9
739 veor q9, q7, q1
740 veor q12, q5, q6
741 veor q11, q11, q13
742 veor q10, q10, q13
743 vand q13, q9, q12
744 vorr q9, q9, q12
745 veor q11, q11, q15
746 veor q8, q8, q13
747 veor q10, q10, q14
748 veor q9, q9, q15
749 veor q8, q8, q14
750 vand q12, q2, q3
751 veor q9, q9, q14
752 vand q13, q4, q0
753 vand q14, q1, q5
754 vorr q15, q7, q6
755 veor q11, q11, q12
756 veor q9, q9, q14
757 veor q8, q8, q15
758 veor q10, q10, q13
759
760 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
761
762 @ new smaller inversion
763
764 vand q14, q11, q9
765 vmov q12, q8
766
767 veor q13, q10, q14
768 veor q15, q8, q14
769 veor q14, q8, q14 @ q14=q15
770
771 vbsl q13, q9, q8
772 vbsl q15, q11, q10
773 veor q11, q11, q10
774
775 vbsl q12, q13, q14
776 vbsl q8, q14, q13
777
778 vand q14, q12, q15
779 veor q9, q9, q8
780
781 veor q14, q14, q11
782 veor q12, q6, q0
783 veor q8, q5, q3
784 veor q10, q15, q14
785 vand q10, q10, q6
786 veor q6, q6, q5
787 vand q11, q5, q15
788 vand q6, q6, q14
789 veor q5, q11, q10
790 veor q6, q6, q11
791 veor q15, q15, q13
792 veor q14, q14, q9
793 veor q11, q15, q14
794 veor q10, q13, q9
795 vand q11, q11, q12
796 vand q10, q10, q0
797 veor q12, q12, q8
798 veor q0, q0, q3
799 vand q8, q8, q15
800 vand q3, q3, q13
801 vand q12, q12, q14
802 vand q0, q0, q9
803 veor q8, q8, q12
804 veor q0, q0, q3
805 veor q12, q12, q11
806 veor q3, q3, q10
807 veor q6, q6, q12
808 veor q0, q0, q12
809 veor q5, q5, q8
810 veor q3, q3, q8
811
812 veor q12, q7, q4
813 veor q8, q1, q2
814 veor q11, q15, q14
815 veor q10, q13, q9
816 vand q11, q11, q12
817 vand q10, q10, q4
818 veor q12, q12, q8
819 veor q4, q4, q2
820 vand q8, q8, q15
821 vand q2, q2, q13
822 vand q12, q12, q14
823 vand q4, q4, q9
824 veor q8, q8, q12
825 veor q4, q4, q2
826 veor q12, q12, q11
827 veor q2, q2, q10
828 veor q15, q15, q13
829 veor q14, q14, q9
830 veor q10, q15, q14
831 vand q10, q10, q7
832 veor q7, q7, q1
833 vand q11, q1, q15
834 vand q7, q7, q14
835 veor q1, q11, q10
836 veor q7, q7, q11
837 veor q7, q7, q12
838 veor q4, q4, q12
839 veor q1, q1, q8
840 veor q2, q2, q8
841 veor q7, q7, q0
842 veor q1, q1, q6
843 veor q6, q6, q0
844 veor q4, q4, q7
845 veor q0, q0, q1
846
847 veor q1, q1, q5
848 veor q5, q5, q2
849 veor q2, q2, q3
850 veor q3, q3, q5
851 veor q4, q4, q5
852
853 veor q6, q6, q3
854 subs r5,r5,#1
855 bcc .Lenc_done
856 vext.8 q8, q0, q0, #12 @ x0 <<< 32
857 vext.8 q9, q1, q1, #12
858 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
859 vext.8 q10, q4, q4, #12
860 veor q1, q1, q9
861 vext.8 q11, q6, q6, #12
862 veor q4, q4, q10
863 vext.8 q12, q3, q3, #12
864 veor q6, q6, q11
865 vext.8 q13, q7, q7, #12
866 veor q3, q3, q12
867 vext.8 q14, q2, q2, #12
868 veor q7, q7, q13
869 vext.8 q15, q5, q5, #12
870 veor q2, q2, q14
871
872 veor q9, q9, q0
873 veor q5, q5, q15
 874	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
875 veor q10, q10, q1
876 veor q8, q8, q5
877 veor q9, q9, q5
878 vext.8 q1, q1, q1, #8
879 veor q13, q13, q3
880 veor q0, q0, q8
881 veor q14, q14, q7
882 veor q1, q1, q9
883 vext.8 q8, q3, q3, #8
884 veor q12, q12, q6
885 vext.8 q9, q7, q7, #8
886 veor q15, q15, q2
887 vext.8 q3, q6, q6, #8
888 veor q11, q11, q4
889 vext.8 q7, q5, q5, #8
890 veor q12, q12, q5
891 vext.8 q6, q2, q2, #8
892 veor q11, q11, q5
893 vext.8 q2, q4, q4, #8
894 veor q5, q9, q13
895 veor q4, q8, q12
896 veor q3, q3, q11
897 veor q7, q7, q15
898 veor q6, q6, q14
899 @ vmov q4, q8
900 veor q2, q2, q10
901 @ vmov q5, q9
902 vldmia r6, {q12} @ .LSR
 903	ite	eq				@ Thumb2 thing, sanity check in ARM
904 addeq r6,r6,#0x10
905 bne .Lenc_loop
906 vldmia r6, {q12} @ .LSRM0
907 b .Lenc_loop
908.align 4
909.Lenc_done:
910 vmov.i8 q8,#0x55 @ compose .LBS0
911 vmov.i8 q9,#0x33 @ compose .LBS1
912 vshr.u64 q10, q2, #1
913 vshr.u64 q11, q3, #1
914 veor q10, q10, q5
915 veor q11, q11, q7
916 vand q10, q10, q8
917 vand q11, q11, q8
918 veor q5, q5, q10
919 vshl.u64 q10, q10, #1
920 veor q7, q7, q11
921 vshl.u64 q11, q11, #1
922 veor q2, q2, q10
923 veor q3, q3, q11
924 vshr.u64 q10, q4, #1
925 vshr.u64 q11, q0, #1
926 veor q10, q10, q6
927 veor q11, q11, q1
928 vand q10, q10, q8
929 vand q11, q11, q8
930 veor q6, q6, q10
931 vshl.u64 q10, q10, #1
932 veor q1, q1, q11
933 vshl.u64 q11, q11, #1
934 veor q4, q4, q10
935 veor q0, q0, q11
936 vmov.i8 q8,#0x0f @ compose .LBS2
937 vshr.u64 q10, q7, #2
938 vshr.u64 q11, q3, #2
939 veor q10, q10, q5
940 veor q11, q11, q2
941 vand q10, q10, q9
942 vand q11, q11, q9
943 veor q5, q5, q10
944 vshl.u64 q10, q10, #2
945 veor q2, q2, q11
946 vshl.u64 q11, q11, #2
947 veor q7, q7, q10
948 veor q3, q3, q11
949 vshr.u64 q10, q1, #2
950 vshr.u64 q11, q0, #2
951 veor q10, q10, q6
952 veor q11, q11, q4
953 vand q10, q10, q9
954 vand q11, q11, q9
955 veor q6, q6, q10
956 vshl.u64 q10, q10, #2
957 veor q4, q4, q11
958 vshl.u64 q11, q11, #2
959 veor q1, q1, q10
960 veor q0, q0, q11
961 vshr.u64 q10, q6, #4
962 vshr.u64 q11, q4, #4
963 veor q10, q10, q5
964 veor q11, q11, q2
965 vand q10, q10, q8
966 vand q11, q11, q8
967 veor q5, q5, q10
968 vshl.u64 q10, q10, #4
969 veor q2, q2, q11
970 vshl.u64 q11, q11, #4
971 veor q6, q6, q10
972 veor q4, q4, q11
973 vshr.u64 q10, q1, #4
974 vshr.u64 q11, q0, #4
975 veor q10, q10, q7
976 veor q11, q11, q3
977 vand q10, q10, q8
978 vand q11, q11, q8
979 veor q7, q7, q10
980 vshl.u64 q10, q10, #4
981 veor q3, q3, q11
982 vshl.u64 q11, q11, #4
983 veor q1, q1, q10
984 veor q0, q0, q11
985 vldmia r4, {q8} @ last round key
986 veor q4, q4, q8
987 veor q6, q6, q8
988 veor q3, q3, q8
989 veor q7, q7, q8
990 veor q2, q2, q8
991 veor q5, q5, q8
992 veor q0, q0, q8
993 veor q1, q1, q8
994 bx lr
995.size _bsaes_encrypt8,.-_bsaes_encrypt8
996.type _bsaes_key_convert,%function
997.align 4
998_bsaes_key_convert:
999 adr r6,_bsaes_key_convert
1000 vld1.8 {q7}, [r4]! @ load round 0 key
1001 sub r6,r6,#_bsaes_key_convert-.LM0
1002 vld1.8 {q15}, [r4]! @ load round 1 key
1003
1004 vmov.i8 q8, #0x01 @ bit masks
1005 vmov.i8 q9, #0x02
1006 vmov.i8 q10, #0x04
1007 vmov.i8 q11, #0x08
1008 vmov.i8 q12, #0x10
1009 vmov.i8 q13, #0x20
1010 vldmia r6, {q14} @ .LM0
1011
1012#ifdef __ARMEL__
1013 vrev32.8 q7, q7
1014 vrev32.8 q15, q15
1015#endif
1016 sub r5,r5,#1
1017 vstmia r12!, {q7} @ save round 0 key
1018 b .Lkey_loop
1019
1020.align 4
1021.Lkey_loop:
1022 vtbl.8 d14,{q15},d28
1023 vtbl.8 d15,{q15},d29
1024 vmov.i8 q6, #0x40
1025 vmov.i8 q15, #0x80
1026
1027 vtst.8 q0, q7, q8
1028 vtst.8 q1, q7, q9
1029 vtst.8 q2, q7, q10
1030 vtst.8 q3, q7, q11
1031 vtst.8 q4, q7, q12
1032 vtst.8 q5, q7, q13
1033 vtst.8 q6, q7, q6
1034 vtst.8 q7, q7, q15
1035 vld1.8 {q15}, [r4]! @ load next round key
1036 vmvn q0, q0 @ "pnot"
1037 vmvn q1, q1
1038 vmvn q5, q5
1039 vmvn q6, q6
1040#ifdef __ARMEL__
1041 vrev32.8 q15, q15
1042#endif
1043 subs r5,r5,#1
1044 vstmia r12!,{q0-q7} @ write bit-sliced round key
1045 bne .Lkey_loop
1046
1047 vmov.i8 q7,#0x63 @ compose .L63
1048 @ don't save last round key
1049 bx lr
1050.size _bsaes_key_convert,.-_bsaes_key_convert
1051.extern AES_cbc_encrypt
1052.extern AES_decrypt
1053
1054.global bsaes_cbc_encrypt
1055.type bsaes_cbc_encrypt,%function
1056.align 5
1057bsaes_cbc_encrypt:
1058#ifndef __KERNEL__
1059 cmp r2, #128
1060#ifndef __thumb__
1061 blo AES_cbc_encrypt
1062#else
1063 bhs 1f
1064 b AES_cbc_encrypt
10651:
1066#endif
1067#endif
1068
1069 @ it is up to the caller to make sure we are called with enc == 0
1070
1071 mov ip, sp
1072 stmdb sp!, {r4-r10, lr}
1073 VFP_ABI_PUSH
1074 ldr r8, [ip] @ IV is 1st arg on the stack
1075 mov r2, r2, lsr#4 @ len in 16 byte blocks
1076 sub sp, #0x10 @ scratch space to carry over the IV
1077 mov r9, sp @ save sp
1078
1079 ldr r10, [r3, #240] @ get # of rounds
1080#ifndef BSAES_ASM_EXTENDED_KEY
1081 @ allocate the key schedule on the stack
1082 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
 1083	add	r12, #96			@ size of bit-sliced key schedule
1084
1085 @ populate the key schedule
1086 mov r4, r3 @ pass key
1087 mov r5, r10 @ pass # of rounds
1088 mov sp, r12 @ sp is sp
1089 bl _bsaes_key_convert
1090 vldmia sp, {q6}
1091 vstmia r12, {q15} @ save last round key
1092 veor q7, q7, q6 @ fix up round 0 key
1093 vstmia sp, {q7}
1094#else
1095 ldr r12, [r3, #244]
1096 eors r12, #1
1097 beq 0f
1098
1099 @ populate the key schedule
1100 str r12, [r3, #244]
1101 mov r4, r3 @ pass key
1102 mov r5, r10 @ pass # of rounds
1103 add r12, r3, #248 @ pass key schedule
1104 bl _bsaes_key_convert
1105 add r4, r3, #248
1106 vldmia r4, {q6}
1107 vstmia r12, {q15} @ save last round key
1108 veor q7, q7, q6 @ fix up round 0 key
1109 vstmia r4, {q7}
1110
1111.align 2
11120:
1113#endif
1114
1115 vld1.8 {q15}, [r8] @ load IV
1116 b .Lcbc_dec_loop
1117
1118.align 4
1119.Lcbc_dec_loop:
1120 subs r2, r2, #0x8
1121 bmi .Lcbc_dec_loop_finish
1122
1123 vld1.8 {q0-q1}, [r0]! @ load input
1124 vld1.8 {q2-q3}, [r0]!
1125#ifndef BSAES_ASM_EXTENDED_KEY
1126 mov r4, sp @ pass the key
1127#else
1128 add r4, r3, #248
1129#endif
1130 vld1.8 {q4-q5}, [r0]!
1131 mov r5, r10
1132 vld1.8 {q6-q7}, [r0]
1133 sub r0, r0, #0x60
1134 vstmia r9, {q15} @ put aside IV
1135
1136 bl _bsaes_decrypt8
1137
1138 vldmia r9, {q14} @ reload IV
1139 vld1.8 {q8-q9}, [r0]! @ reload input
1140 veor q0, q0, q14 @ ^= IV
1141 vld1.8 {q10-q11}, [r0]!
1142 veor q1, q1, q8
1143 veor q6, q6, q9
1144 vld1.8 {q12-q13}, [r0]!
1145 veor q4, q4, q10
1146 veor q2, q2, q11
1147 vld1.8 {q14-q15}, [r0]!
1148 veor q7, q7, q12
1149 vst1.8 {q0-q1}, [r1]! @ write output
1150 veor q3, q3, q13
1151 vst1.8 {q6}, [r1]!
1152 veor q5, q5, q14
1153 vst1.8 {q4}, [r1]!
1154 vst1.8 {q2}, [r1]!
1155 vst1.8 {q7}, [r1]!
1156 vst1.8 {q3}, [r1]!
1157 vst1.8 {q5}, [r1]!
1158
1159 b .Lcbc_dec_loop
1160
1161.Lcbc_dec_loop_finish:
1162 adds r2, r2, #8
1163 beq .Lcbc_dec_done
1164
1165 vld1.8 {q0}, [r0]! @ load input
1166 cmp r2, #2
1167 blo .Lcbc_dec_one
1168 vld1.8 {q1}, [r0]!
1169#ifndef BSAES_ASM_EXTENDED_KEY
1170 mov r4, sp @ pass the key
1171#else
1172 add r4, r3, #248
1173#endif
1174 mov r5, r10
1175 vstmia r9, {q15} @ put aside IV
1176 beq .Lcbc_dec_two
1177 vld1.8 {q2}, [r0]!
1178 cmp r2, #4
1179 blo .Lcbc_dec_three
1180 vld1.8 {q3}, [r0]!
1181 beq .Lcbc_dec_four
1182 vld1.8 {q4}, [r0]!
1183 cmp r2, #6
1184 blo .Lcbc_dec_five
1185 vld1.8 {q5}, [r0]!
1186 beq .Lcbc_dec_six
1187 vld1.8 {q6}, [r0]!
1188 sub r0, r0, #0x70
1189
1190 bl _bsaes_decrypt8
1191
1192 vldmia r9, {q14} @ reload IV
1193 vld1.8 {q8-q9}, [r0]! @ reload input
1194 veor q0, q0, q14 @ ^= IV
1195 vld1.8 {q10-q11}, [r0]!
1196 veor q1, q1, q8
1197 veor q6, q6, q9
1198 vld1.8 {q12-q13}, [r0]!
1199 veor q4, q4, q10
1200 veor q2, q2, q11
1201 vld1.8 {q15}, [r0]!
1202 veor q7, q7, q12
1203 vst1.8 {q0-q1}, [r1]! @ write output
1204 veor q3, q3, q13
1205 vst1.8 {q6}, [r1]!
1206 vst1.8 {q4}, [r1]!
1207 vst1.8 {q2}, [r1]!
1208 vst1.8 {q7}, [r1]!
1209 vst1.8 {q3}, [r1]!
1210 b .Lcbc_dec_done
1211.align 4
1212.Lcbc_dec_six:
1213 sub r0, r0, #0x60
1214 bl _bsaes_decrypt8
1215 vldmia r9,{q14} @ reload IV
1216 vld1.8 {q8-q9}, [r0]! @ reload input
1217 veor q0, q0, q14 @ ^= IV
1218 vld1.8 {q10-q11}, [r0]!
1219 veor q1, q1, q8
1220 veor q6, q6, q9
1221 vld1.8 {q12}, [r0]!
1222 veor q4, q4, q10
1223 veor q2, q2, q11
1224 vld1.8 {q15}, [r0]!
1225 veor q7, q7, q12
1226 vst1.8 {q0-q1}, [r1]! @ write output
1227 vst1.8 {q6}, [r1]!
1228 vst1.8 {q4}, [r1]!
1229 vst1.8 {q2}, [r1]!
1230 vst1.8 {q7}, [r1]!
1231 b .Lcbc_dec_done
1232.align 4
1233.Lcbc_dec_five:
1234 sub r0, r0, #0x50
1235 bl _bsaes_decrypt8
1236 vldmia r9, {q14} @ reload IV
1237 vld1.8 {q8-q9}, [r0]! @ reload input
1238 veor q0, q0, q14 @ ^= IV
1239 vld1.8 {q10-q11}, [r0]!
1240 veor q1, q1, q8
1241 veor q6, q6, q9
1242 vld1.8 {q15}, [r0]!
1243 veor q4, q4, q10
1244 vst1.8 {q0-q1}, [r1]! @ write output
1245 veor q2, q2, q11
1246 vst1.8 {q6}, [r1]!
1247 vst1.8 {q4}, [r1]!
1248 vst1.8 {q2}, [r1]!
1249 b .Lcbc_dec_done
1250.align 4
1251.Lcbc_dec_four:
1252 sub r0, r0, #0x40
1253 bl _bsaes_decrypt8
1254 vldmia r9, {q14} @ reload IV
1255 vld1.8 {q8-q9}, [r0]! @ reload input
1256 veor q0, q0, q14 @ ^= IV
1257 vld1.8 {q10}, [r0]!
1258 veor q1, q1, q8
1259 veor q6, q6, q9
1260 vld1.8 {q15}, [r0]!
1261 veor q4, q4, q10
1262 vst1.8 {q0-q1}, [r1]! @ write output
1263 vst1.8 {q6}, [r1]!
1264 vst1.8 {q4}, [r1]!
1265 b .Lcbc_dec_done
1266.align 4
1267.Lcbc_dec_three:
1268 sub r0, r0, #0x30
1269 bl _bsaes_decrypt8
1270 vldmia r9, {q14} @ reload IV
1271 vld1.8 {q8-q9}, [r0]! @ reload input
1272 veor q0, q0, q14 @ ^= IV
1273 vld1.8 {q15}, [r0]!
1274 veor q1, q1, q8
1275 veor q6, q6, q9
1276 vst1.8 {q0-q1}, [r1]! @ write output
1277 vst1.8 {q6}, [r1]!
1278 b .Lcbc_dec_done
1279.align 4
1280.Lcbc_dec_two:
1281 sub r0, r0, #0x20
1282 bl _bsaes_decrypt8
1283 vldmia r9, {q14} @ reload IV
1284 vld1.8 {q8}, [r0]! @ reload input
1285 veor q0, q0, q14 @ ^= IV
1286 vld1.8 {q15}, [r0]! @ reload input
1287 veor q1, q1, q8
1288 vst1.8 {q0-q1}, [r1]! @ write output
1289 b .Lcbc_dec_done
1290.align 4
1291.Lcbc_dec_one:
1292 sub r0, r0, #0x10
1293 mov r10, r1 @ save original out pointer
1294 mov r1, r9 @ use the iv scratch space as out buffer
1295 mov r2, r3
1296 vmov q4,q15 @ just in case ensure that IV
1297 vmov q5,q0 @ and input are preserved
1298 bl AES_decrypt
1299 vld1.8 {q0}, [r9,:64] @ load result
1300 veor q0, q0, q4 @ ^= IV
1301 vmov q15, q5 @ q5 holds input
1302 vst1.8 {q0}, [r10] @ write output
1303
1304.Lcbc_dec_done:
1305#ifndef BSAES_ASM_EXTENDED_KEY
1306 vmov.i32 q0, #0
1307 vmov.i32 q1, #0
1308.Lcbc_dec_bzero: @ wipe key schedule [if any]
1309 vstmia sp!, {q0-q1}
1310 cmp sp, r9
1311 bne .Lcbc_dec_bzero
1312#endif
1313
1314 mov sp, r9
1315 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1316 vst1.8 {q15}, [r8] @ return IV
1317 VFP_ABI_POP
1318 ldmia sp!, {r4-r10, pc}
1319.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1320.extern AES_encrypt
1321.global bsaes_ctr32_encrypt_blocks
1322.type bsaes_ctr32_encrypt_blocks,%function
1323.align 5
1324bsaes_ctr32_encrypt_blocks:
1325 cmp r2, #8 @ use plain AES for
1326 blo .Lctr_enc_short @ small sizes
1327
1328 mov ip, sp
1329 stmdb sp!, {r4-r10, lr}
1330 VFP_ABI_PUSH
1331 ldr r8, [ip] @ ctr is 1st arg on the stack
1332 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1333 mov r9, sp @ save sp
1334
1335 ldr r10, [r3, #240] @ get # of rounds
1336#ifndef BSAES_ASM_EXTENDED_KEY
1337 @ allocate the key schedule on the stack
1338 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1339 add r12, #96 @ size of bit-sliced key schedule
1340
1341 @ populate the key schedule
1342 mov r4, r3 @ pass key
1343 mov r5, r10 @ pass # of rounds
1344 mov sp, r12 @ sp is sp
1345 bl _bsaes_key_convert
1346 veor q7,q7,q15 @ fix up last round key
1347 vstmia r12, {q7} @ save last round key
1348
1349 vld1.8 {q0}, [r8] @ load counter
1350 add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
1351 vldmia sp, {q4} @ load round0 key
1352#else
1353 ldr r12, [r3, #244]
1354 eors r12, #1
1355 beq 0f
1356
1357 @ populate the key schedule
1358 str r12, [r3, #244]
1359 mov r4, r3 @ pass key
1360 mov r5, r10 @ pass # of rounds
1361 add r12, r3, #248 @ pass key schedule
1362 bl _bsaes_key_convert
1363 veor q7,q7,q15 @ fix up last round key
1364 vstmia r12, {q7} @ save last round key
1365
1366.align 2
13670: add r12, r3, #248
1368 vld1.8 {q0}, [r8] @ load counter
1369 adrl r8, .LREVM0SR @ borrow r8
1370 vldmia r12, {q4} @ load round0 key
1371 sub sp, #0x10 @ place for adjusted round0 key
1372#endif
1373
1374 vmov.i32 q8,#1 @ compose 1<<96
1375 veor q9,q9,q9
1376 vrev32.8 q0,q0
1377 vext.8 q8,q9,q8,#4
1378 vrev32.8 q4,q4
1379 vadd.u32 q9,q8,q8 @ compose 2<<96
1380 vstmia sp, {q4} @ save adjusted round0 key
1381 b .Lctr_enc_loop
1382
1383.align 4
1384.Lctr_enc_loop:
1385 vadd.u32 q10, q8, q9 @ compose 3<<96
1386 vadd.u32 q1, q0, q8 @ +1
1387 vadd.u32 q2, q0, q9 @ +2
1388 vadd.u32 q3, q0, q10 @ +3
1389 vadd.u32 q4, q1, q10
1390 vadd.u32 q5, q2, q10
1391 vadd.u32 q6, q3, q10
1392 vadd.u32 q7, q4, q10
1393 vadd.u32 q10, q5, q10 @ next counter
1394
1395 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1396 @ to flip byte order in 32-bit counter
1397
1398 vldmia sp, {q9} @ load round0 key
1399#ifndef BSAES_ASM_EXTENDED_KEY
1400 add r4, sp, #0x10 @ pass next round key
1401#else
1402 add r4, r3, #264
1403#endif
1404 vldmia r8, {q8} @ .LREVM0SR
1405 mov r5, r10 @ pass rounds
1406 vstmia r9, {q10} @ save next counter
1407 sub r6, r8, #.LREVM0SR-.LSR @ pass constants
1408
1409 bl _bsaes_encrypt8_alt
1410
1411 subs r2, r2, #8
1412 blo .Lctr_enc_loop_done
1413
1414 vld1.8 {q8-q9}, [r0]! @ load input
1415 vld1.8 {q10-q11}, [r0]!
1416 veor q0, q8
1417 veor q1, q9
1418 vld1.8 {q12-q13}, [r0]!
1419 veor q4, q10
1420 veor q6, q11
1421 vld1.8 {q14-q15}, [r0]!
1422 veor q3, q12
1423 vst1.8 {q0-q1}, [r1]! @ write output
1424 veor q7, q13
1425 veor q2, q14
1426 vst1.8 {q4}, [r1]!
1427 veor q5, q15
1428 vst1.8 {q6}, [r1]!
1429 vmov.i32 q8, #1 @ compose 1<<96
1430 vst1.8 {q3}, [r1]!
1431 veor q9, q9, q9
1432 vst1.8 {q7}, [r1]!
1433 vext.8 q8, q9, q8, #4
1434 vst1.8 {q2}, [r1]!
1435 vadd.u32 q9,q8,q8 @ compose 2<<96
1436 vst1.8 {q5}, [r1]!
1437 vldmia r9, {q0} @ load counter
1438
1439 bne .Lctr_enc_loop
1440 b .Lctr_enc_done
1441
1442.align 4
1443.Lctr_enc_loop_done:
1444 add r2, r2, #8
1445 vld1.8 {q8}, [r0]! @ load input
1446 veor q0, q8
1447 vst1.8 {q0}, [r1]! @ write output
1448 cmp r2, #2
1449 blo .Lctr_enc_done
1450 vld1.8 {q9}, [r0]!
1451 veor q1, q9
1452 vst1.8 {q1}, [r1]!
1453 beq .Lctr_enc_done
1454 vld1.8 {q10}, [r0]!
1455 veor q4, q10
1456 vst1.8 {q4}, [r1]!
1457 cmp r2, #4
1458 blo .Lctr_enc_done
1459 vld1.8 {q11}, [r0]!
1460 veor q6, q11
1461 vst1.8 {q6}, [r1]!
1462 beq .Lctr_enc_done
1463 vld1.8 {q12}, [r0]!
1464 veor q3, q12
1465 vst1.8 {q3}, [r1]!
1466 cmp r2, #6
1467 blo .Lctr_enc_done
1468 vld1.8 {q13}, [r0]!
1469 veor q7, q13
1470 vst1.8 {q7}, [r1]!
1471 beq .Lctr_enc_done
1472 vld1.8 {q14}, [r0]
1473 veor q2, q14
1474 vst1.8 {q2}, [r1]!
1475
1476.Lctr_enc_done:
1477 vmov.i32 q0, #0
1478 vmov.i32 q1, #0
1479#ifndef BSAES_ASM_EXTENDED_KEY
1480.Lctr_enc_bzero: @ wipe key schedule [if any]
1481 vstmia sp!, {q0-q1}
1482 cmp sp, r9
1483 bne .Lctr_enc_bzero
1484#else
1485 vstmia sp, {q0-q1}
1486#endif
1487
1488 mov sp, r9
1489 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1490 VFP_ABI_POP
1491 ldmia sp!, {r4-r10, pc} @ return
1492
1493.align 4
1494.Lctr_enc_short:
1495 ldr ip, [sp] @ ctr pointer is passed on stack
1496 stmdb sp!, {r4-r8, lr}
1497
1498 mov r4, r0 @ copy arguments
1499 mov r5, r1
1500 mov r6, r2
1501 mov r7, r3
1502 ldr r8, [ip, #12] @ load counter LSW
1503 vld1.8 {q1}, [ip] @ load whole counter value
1504#ifdef __ARMEL__
1505 rev r8, r8
1506#endif
1507 sub sp, sp, #0x10
1508 vst1.8 {q1}, [sp,:64] @ copy counter value
1509 sub sp, sp, #0x10
1510
1511.Lctr_enc_short_loop:
1512 add r0, sp, #0x10 @ input counter value
1513 mov r1, sp @ output on the stack
1514 mov r2, r7 @ key
1515
1516 bl AES_encrypt
1517
1518 vld1.8 {q0}, [r4]! @ load input
1519 vld1.8 {q1}, [sp,:64] @ load encrypted counter
1520 add r8, r8, #1
1521#ifdef __ARMEL__
1522 rev r0, r8
1523 str r0, [sp, #0x1c] @ next counter value
1524#else
1525 str r8, [sp, #0x1c] @ next counter value
1526#endif
1527 veor q0,q0,q1
1528 vst1.8 {q0}, [r5]! @ store output
1529 subs r6, r6, #1
1530 bne .Lctr_enc_short_loop
1531
1532 vmov.i32 q0, #0
1533 vmov.i32 q1, #0
1534 vstmia sp!, {q0-q1}
1535
1536 ldmia sp!, {r4-r8, pc}
1537.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1538.globl bsaes_xts_encrypt
1539.type bsaes_xts_encrypt,%function
1540.align 4
1541bsaes_xts_encrypt:
1542 mov ip, sp
1543 stmdb sp!, {r4-r10, lr} @ 0x20
1544 VFP_ABI_PUSH
1545 mov r6, sp @ future r3
1546
1547 mov r7, r0
1548 mov r8, r1
1549 mov r9, r2
1550 mov r10, r3
1551
1552 sub r0, sp, #0x10 @ 0x10
1553 bic r0, #0xf @ align at 16 bytes
1554 mov sp, r0
1555
1556#ifdef XTS_CHAIN_TWEAK
1557 ldr r0, [ip] @ pointer to input tweak
1558#else
1559 @ generate initial tweak
1560 ldr r0, [ip, #4] @ iv[]
1561 mov r1, sp
1562 ldr r2, [ip, #0] @ key2
1563 bl AES_encrypt
1564 mov r0,sp @ pointer to initial tweak
1565#endif
1566
1567 ldr r1, [r10, #240] @ get # of rounds
1568 mov r3, r6
1569#ifndef BSAES_ASM_EXTENDED_KEY
1570 @ allocate the key schedule on the stack
1571 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
1572 @ add r12, #96 @ size of bit-sliced key schedule
1573 sub r12, #48 @ place for tweak[9]
1574
1575 @ populate the key schedule
1576 mov r4, r10 @ pass key
1577 mov r5, r1 @ pass # of rounds
1578 mov sp, r12
1579 add r12, #0x90 @ pass key schedule
1580 bl _bsaes_key_convert
1581 veor q7, q7, q15 @ fix up last round key
1582 vstmia r12, {q7} @ save last round key
1583#else
1584 ldr r12, [r10, #244]
1585 eors r12, #1
1586 beq 0f
1587
1588 str r12, [r10, #244]
1589 mov r4, r10 @ pass key
1590 mov r5, r1 @ pass # of rounds
1591 add r12, r10, #248 @ pass key schedule
1592 bl _bsaes_key_convert
1593 veor q7, q7, q15 @ fix up last round key
1594 vstmia r12, {q7}
1595
1596.align 2
15970: sub sp, #0x90 @ place for tweak[9]
1598#endif
1599
1600 vld1.8 {q8}, [r0] @ initial tweak
1601 adr r2, .Lxts_magic
1602
1603 subs r9, #0x80
1604 blo .Lxts_enc_short
1605 b .Lxts_enc_loop
1606
1607.align 4
1608.Lxts_enc_loop:
1609 vldmia r2, {q5} @ load XTS magic
1610 vshr.s64 q6, q8, #63
1611 mov r0, sp
1612 vand q6, q6, q5
1613 vadd.u64 q9, q8, q8
1614 vst1.64 {q8}, [r0,:128]!
1615 vswp d13,d12
1616 vshr.s64 q7, q9, #63
1617 veor q9, q9, q6
1618 vand q7, q7, q5
1619 vadd.u64 q10, q9, q9
1620 vst1.64 {q9}, [r0,:128]!
1621 vswp d15,d14
1622 vshr.s64 q6, q10, #63
1623 veor q10, q10, q7
1624 vand q6, q6, q5
1625 vld1.8 {q0}, [r7]!
1626 vadd.u64 q11, q10, q10
1627 vst1.64 {q10}, [r0,:128]!
1628 vswp d13,d12
1629 vshr.s64 q7, q11, #63
1630 veor q11, q11, q6
1631 vand q7, q7, q5
1632 vld1.8 {q1}, [r7]!
1633 veor q0, q0, q8
1634 vadd.u64 q12, q11, q11
1635 vst1.64 {q11}, [r0,:128]!
1636 vswp d15,d14
1637 vshr.s64 q6, q12, #63
1638 veor q12, q12, q7
1639 vand q6, q6, q5
1640 vld1.8 {q2}, [r7]!
1641 veor q1, q1, q9
1642 vadd.u64 q13, q12, q12
1643 vst1.64 {q12}, [r0,:128]!
1644 vswp d13,d12
1645 vshr.s64 q7, q13, #63
1646 veor q13, q13, q6
1647 vand q7, q7, q5
1648 vld1.8 {q3}, [r7]!
1649 veor q2, q2, q10
1650 vadd.u64 q14, q13, q13
1651 vst1.64 {q13}, [r0,:128]!
1652 vswp d15,d14
1653 vshr.s64 q6, q14, #63
1654 veor q14, q14, q7
1655 vand q6, q6, q5
1656 vld1.8 {q4}, [r7]!
1657 veor q3, q3, q11
1658 vadd.u64 q15, q14, q14
1659 vst1.64 {q14}, [r0,:128]!
1660 vswp d13,d12
1661 vshr.s64 q7, q15, #63
1662 veor q15, q15, q6
1663 vand q7, q7, q5
1664 vld1.8 {q5}, [r7]!
1665 veor q4, q4, q12
1666 vadd.u64 q8, q15, q15
1667 vst1.64 {q15}, [r0,:128]!
1668 vswp d15,d14
1669 veor q8, q8, q7
1670 vst1.64 {q8}, [r0,:128] @ next round tweak
1671
1672 vld1.8 {q6-q7}, [r7]!
1673 veor q5, q5, q13
1674#ifndef BSAES_ASM_EXTENDED_KEY
1675 add r4, sp, #0x90 @ pass key schedule
1676#else
1677 add r4, r10, #248 @ pass key schedule
1678#endif
1679 veor q6, q6, q14
1680 mov r5, r1 @ pass rounds
1681 veor q7, q7, q15
1682 mov r0, sp
1683
1684 bl _bsaes_encrypt8
1685
1686 vld1.64 {q8-q9}, [r0,:128]!
1687 vld1.64 {q10-q11}, [r0,:128]!
1688 veor q0, q0, q8
1689 vld1.64 {q12-q13}, [r0,:128]!
1690 veor q1, q1, q9
1691 veor q8, q4, q10
1692 vst1.8 {q0-q1}, [r8]!
1693 veor q9, q6, q11
1694 vld1.64 {q14-q15}, [r0,:128]!
1695 veor q10, q3, q12
1696 vst1.8 {q8-q9}, [r8]!
1697 veor q11, q7, q13
1698 veor q12, q2, q14
1699 vst1.8 {q10-q11}, [r8]!
1700 veor q13, q5, q15
1701 vst1.8 {q12-q13}, [r8]!
1702
1703 vld1.64 {q8}, [r0,:128] @ next round tweak
1704
1705 subs r9, #0x80
1706 bpl .Lxts_enc_loop
1707
1708.Lxts_enc_short:
1709 adds r9, #0x70
1710 bmi .Lxts_enc_done
1711
1712 vldmia r2, {q5} @ load XTS magic
1713 vshr.s64 q7, q8, #63
1714 mov r0, sp
1715 vand q7, q7, q5
1716 vadd.u64 q9, q8, q8
1717 vst1.64 {q8}, [r0,:128]!
1718 vswp d15,d14
1719 vshr.s64 q6, q9, #63
1720 veor q9, q9, q7
1721 vand q6, q6, q5
1722 vadd.u64 q10, q9, q9
1723 vst1.64 {q9}, [r0,:128]!
1724 vswp d13,d12
1725 vshr.s64 q7, q10, #63
1726 veor q10, q10, q6
1727 vand q7, q7, q5
1728 vld1.8 {q0}, [r7]!
1729 subs r9, #0x10
1730 bmi .Lxts_enc_1
1731 vadd.u64 q11, q10, q10
1732 vst1.64 {q10}, [r0,:128]!
1733 vswp d15,d14
1734 vshr.s64 q6, q11, #63
1735 veor q11, q11, q7
1736 vand q6, q6, q5
1737 vld1.8 {q1}, [r7]!
1738 subs r9, #0x10
1739 bmi .Lxts_enc_2
1740 veor q0, q0, q8
1741 vadd.u64 q12, q11, q11
1742 vst1.64 {q11}, [r0,:128]!
1743 vswp d13,d12
1744 vshr.s64 q7, q12, #63
1745 veor q12, q12, q6
1746 vand q7, q7, q5
1747 vld1.8 {q2}, [r7]!
1748 subs r9, #0x10
1749 bmi .Lxts_enc_3
1750 veor q1, q1, q9
1751 vadd.u64 q13, q12, q12
1752 vst1.64 {q12}, [r0,:128]!
1753 vswp d15,d14
1754 vshr.s64 q6, q13, #63
1755 veor q13, q13, q7
1756 vand q6, q6, q5
1757 vld1.8 {q3}, [r7]!
1758 subs r9, #0x10
1759 bmi .Lxts_enc_4
1760 veor q2, q2, q10
1761 vadd.u64 q14, q13, q13
1762 vst1.64 {q13}, [r0,:128]!
1763 vswp d13,d12
1764 vshr.s64 q7, q14, #63
1765 veor q14, q14, q6
1766 vand q7, q7, q5
1767 vld1.8 {q4}, [r7]!
1768 subs r9, #0x10
1769 bmi .Lxts_enc_5
1770 veor q3, q3, q11
1771 vadd.u64 q15, q14, q14
1772 vst1.64 {q14}, [r0,:128]!
1773 vswp d15,d14
1774 vshr.s64 q6, q15, #63
1775 veor q15, q15, q7
1776 vand q6, q6, q5
1777 vld1.8 {q5}, [r7]!
1778 subs r9, #0x10
1779 bmi .Lxts_enc_6
1780 veor q4, q4, q12
1781 sub r9, #0x10
1782 vst1.64 {q15}, [r0,:128] @ next round tweak
1783
1784 vld1.8 {q6}, [r7]!
1785 veor q5, q5, q13
1786#ifndef BSAES_ASM_EXTENDED_KEY
1787 add r4, sp, #0x90 @ pass key schedule
1788#else
1789 add r4, r10, #248 @ pass key schedule
1790#endif
1791 veor q6, q6, q14
1792 mov r5, r1 @ pass rounds
1793 mov r0, sp
1794
1795 bl _bsaes_encrypt8
1796
1797 vld1.64 {q8-q9}, [r0,:128]!
1798 vld1.64 {q10-q11}, [r0,:128]!
1799 veor q0, q0, q8
1800 vld1.64 {q12-q13}, [r0,:128]!
1801 veor q1, q1, q9
1802 veor q8, q4, q10
1803 vst1.8 {q0-q1}, [r8]!
1804 veor q9, q6, q11
1805 vld1.64 {q14}, [r0,:128]!
1806 veor q10, q3, q12
1807 vst1.8 {q8-q9}, [r8]!
1808 veor q11, q7, q13
1809 veor q12, q2, q14
1810 vst1.8 {q10-q11}, [r8]!
1811 vst1.8 {q12}, [r8]!
1812
1813 vld1.64 {q8}, [r0,:128] @ next round tweak
1814 b .Lxts_enc_done
1815.align 4
1816.Lxts_enc_6:
1817 vst1.64 {q14}, [r0,:128] @ next round tweak
1818
1819 veor q4, q4, q12
1820#ifndef BSAES_ASM_EXTENDED_KEY
1821 add r4, sp, #0x90 @ pass key schedule
1822#else
1823 add r4, r10, #248 @ pass key schedule
1824#endif
1825 veor q5, q5, q13
1826 mov r5, r1 @ pass rounds
1827 mov r0, sp
1828
1829 bl _bsaes_encrypt8
1830
1831 vld1.64 {q8-q9}, [r0,:128]!
1832 vld1.64 {q10-q11}, [r0,:128]!
1833 veor q0, q0, q8
1834 vld1.64 {q12-q13}, [r0,:128]!
1835 veor q1, q1, q9
1836 veor q8, q4, q10
1837 vst1.8 {q0-q1}, [r8]!
1838 veor q9, q6, q11
1839 veor q10, q3, q12
1840 vst1.8 {q8-q9}, [r8]!
1841 veor q11, q7, q13
1842 vst1.8 {q10-q11}, [r8]!
1843
1844 vld1.64 {q8}, [r0,:128] @ next round tweak
1845 b .Lxts_enc_done
1846
1847@ put this in range for both ARM and Thumb mode adr instructions
1848.align 5
1849.Lxts_magic:
1850 .quad 1, 0x87
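@ {1, 0x87}: carry masks for doubling the 128-bit tweak in GF(2^128).
@ In the XTS loops the vswp after the vand routes the bit-127 carry
@ (0x87, i.e. reduction mod x^128 + x^7 + x^2 + x + 1) into the low
@ half and the bit-63 carry (1) into the high half.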
1851
1852.align 5
1853.Lxts_enc_5:
1854 vst1.64 {q13}, [r0,:128] @ next round tweak
1855
1856 veor q3, q3, q11
1857#ifndef BSAES_ASM_EXTENDED_KEY
1858 add r4, sp, #0x90 @ pass key schedule
1859#else
1860 add r4, r10, #248 @ pass key schedule
1861#endif
1862 veor q4, q4, q12
1863 mov r5, r1 @ pass rounds
1864 mov r0, sp
1865
1866 bl _bsaes_encrypt8
1867
1868 vld1.64 {q8-q9}, [r0,:128]!
1869 vld1.64 {q10-q11}, [r0,:128]!
1870 veor q0, q0, q8
1871 vld1.64 {q12}, [r0,:128]!
1872 veor q1, q1, q9
1873 veor q8, q4, q10
1874 vst1.8 {q0-q1}, [r8]!
1875 veor q9, q6, q11
1876 veor q10, q3, q12
1877 vst1.8 {q8-q9}, [r8]!
1878 vst1.8 {q10}, [r8]!
1879
1880 vld1.64 {q8}, [r0,:128] @ next round tweak
1881 b .Lxts_enc_done
1882.align 4
1883.Lxts_enc_4:
1884 vst1.64 {q12}, [r0,:128] @ next round tweak
1885
1886 veor q2, q2, q10
1887#ifndef BSAES_ASM_EXTENDED_KEY
1888 add r4, sp, #0x90 @ pass key schedule
1889#else
1890 add r4, r10, #248 @ pass key schedule
1891#endif
1892 veor q3, q3, q11
1893 mov r5, r1 @ pass rounds
1894 mov r0, sp
1895
1896 bl _bsaes_encrypt8
1897
1898 vld1.64 {q8-q9}, [r0,:128]!
1899 vld1.64 {q10-q11}, [r0,:128]!
1900 veor q0, q0, q8
1901 veor q1, q1, q9
1902 veor q8, q4, q10
1903 vst1.8 {q0-q1}, [r8]!
1904 veor q9, q6, q11
1905 vst1.8 {q8-q9}, [r8]!
1906
1907 vld1.64 {q8}, [r0,:128] @ next round tweak
1908 b .Lxts_enc_done
1909.align 4
1910.Lxts_enc_3:
1911 vst1.64 {q11}, [r0,:128] @ next round tweak
1912
1913 veor q1, q1, q9
1914#ifndef BSAES_ASM_EXTENDED_KEY
1915 add r4, sp, #0x90 @ pass key schedule
1916#else
1917 add r4, r10, #248 @ pass key schedule
1918#endif
1919 veor q2, q2, q10
1920 mov r5, r1 @ pass rounds
1921 mov r0, sp
1922
1923 bl _bsaes_encrypt8
1924
1925 vld1.64 {q8-q9}, [r0,:128]!
1926 vld1.64 {q10}, [r0,:128]!
1927 veor q0, q0, q8
1928 veor q1, q1, q9
1929 veor q8, q4, q10
1930 vst1.8 {q0-q1}, [r8]!
1931 vst1.8 {q8}, [r8]!
1932
1933 vld1.64 {q8}, [r0,:128] @ next round tweak
1934 b .Lxts_enc_done
1935.align 4
1936.Lxts_enc_2:
1937 vst1.64 {q10}, [r0,:128] @ next round tweak
1938
1939 veor q0, q0, q8
1940#ifndef BSAES_ASM_EXTENDED_KEY
1941 add r4, sp, #0x90 @ pass key schedule
1942#else
1943 add r4, r10, #248 @ pass key schedule
1944#endif
1945 veor q1, q1, q9
1946 mov r5, r1 @ pass rounds
1947 mov r0, sp
1948
1949 bl _bsaes_encrypt8
1950
1951 vld1.64 {q8-q9}, [r0,:128]!
1952 veor q0, q0, q8
1953 veor q1, q1, q9
1954 vst1.8 {q0-q1}, [r8]!
1955
1956 vld1.64 {q8}, [r0,:128] @ next round tweak
1957 b .Lxts_enc_done
1958.align 4
1959.Lxts_enc_1:
1960 mov r0, sp
1961 veor q0, q8
1962 mov r1, sp
1963 vst1.8 {q0}, [sp,:128]
1964 mov r2, r10
1965 mov r4, r3 @ preserve fp
1966
1967 bl AES_encrypt
1968
1969 vld1.8 {q0}, [sp,:128]
1970 veor q0, q0, q8
1971 vst1.8 {q0}, [r8]!
1972 mov r3, r4
1973
1974 vmov q8, q9 @ next round tweak
1975
1976.Lxts_enc_done:
1977#ifndef XTS_CHAIN_TWEAK
1978 adds r9, #0x10
1979 beq .Lxts_enc_ret
1980 sub r6, r8, #0x10
1981
1982.Lxts_enc_steal:
1983 ldrb r0, [r7], #1
1984 ldrb r1, [r8, #-0x10]
1985 strb r0, [r8, #-0x10]
1986 strb r1, [r8], #1
1987
1988 subs r9, #1
1989 bhi .Lxts_enc_steal
1990
1991 vld1.8 {q0}, [r6]
1992 mov r0, sp
1993 veor q0, q0, q8
1994 mov r1, sp
1995 vst1.8 {q0}, [sp,:128]
1996 mov r2, r10
1997 mov r4, r3 @ preserve fp
1998
1999 bl AES_encrypt
2000
2001 vld1.8 {q0}, [sp,:128]
2002 veor q0, q0, q8
2003 vst1.8 {q0}, [r6]
2004 mov r3, r4
2005#endif
2006
2007.Lxts_enc_ret:
2008 bic r0, r3, #0xf
2009 vmov.i32 q0, #0
2010 vmov.i32 q1, #0
2011#ifdef XTS_CHAIN_TWEAK
2012 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2013#endif
2014.Lxts_enc_bzero: @ wipe key schedule [if any]
2015 vstmia sp!, {q0-q1}
2016 cmp sp, r0
2017 bne .Lxts_enc_bzero
2018
2019 mov sp, r3
2020#ifdef XTS_CHAIN_TWEAK
2021 vst1.8 {q8}, [r1]
2022#endif
2023 VFP_ABI_POP
2024 ldmia sp!, {r4-r10, pc} @ return
2025
2026.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2027
2028.globl bsaes_xts_decrypt
2029.type bsaes_xts_decrypt,%function
2030.align 4
2031bsaes_xts_decrypt:
2032 mov ip, sp
2033 stmdb sp!, {r4-r10, lr} @ 0x20
2034 VFP_ABI_PUSH
2035 mov r6, sp @ future r3
2036
2037 mov r7, r0
2038 mov r8, r1
2039 mov r9, r2
2040 mov r10, r3
2041
2042 sub r0, sp, #0x10 @ 0x10
2043 bic r0, #0xf @ align at 16 bytes
2044 mov sp, r0
2045
2046#ifdef XTS_CHAIN_TWEAK
2047 ldr r0, [ip] @ pointer to input tweak
2048#else
2049 @ generate initial tweak
2050 ldr r0, [ip, #4] @ iv[]
2051 mov r1, sp
2052 ldr r2, [ip, #0] @ key2
2053 bl AES_encrypt
2054 mov r0, sp @ pointer to initial tweak
2055#endif
2056
2057 ldr r1, [r10, #240] @ get # of rounds
2058 mov r3, r6
2059#ifndef BSAES_ASM_EXTENDED_KEY
2060 @ allocate the key schedule on the stack
2061 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
2062 @ add r12, #96 @ size of bit-sliced key schedule
2063 sub r12, #48 @ place for tweak[9]
2064
2065 @ populate the key schedule
2066 mov r4, r10 @ pass key
2067 mov r5, r1 @ pass # of rounds
2068 mov sp, r12
2069 add r12, #0x90 @ pass key schedule
2070 bl _bsaes_key_convert
2071 add r4, sp, #0x90
2072 vldmia r4, {q6}
2073 vstmia r12, {q15} @ save last round key
2074 veor q7, q7, q6 @ fix up round 0 key
2075 vstmia r4, {q7}
2076#else
2077 ldr r12, [r10, #244]
2078 eors r12, #1
2079 beq 0f
2080
2081 str r12, [r10, #244]
2082 mov r4, r10 @ pass key
2083 mov r5, r1 @ pass # of rounds
2084 add r12, r10, #248 @ pass key schedule
2085 bl _bsaes_key_convert
2086 add r4, r10, #248
2087 vldmia r4, {q6}
2088 vstmia r12, {q15} @ save last round key
2089 veor q7, q7, q6 @ fix up round 0 key
2090 vstmia r4, {q7}
2091
2092.align 2
20930: sub sp, #0x90 @ place for tweak[9]
2094#endif
2095 vld1.8 {q8}, [r0] @ initial tweak
2096 adr r2, .Lxts_magic
2097
2098 tst r9, #0xf @ if not multiple of 16
2099 it ne @ Thumb2 thing, sanity check in ARM
2100 subne r9, #0x10 @ subtract another 16 bytes
2101 subs r9, #0x80
2102
2103 blo .Lxts_dec_short
2104 b .Lxts_dec_loop
2105
2106.align 4
2107.Lxts_dec_loop:
2108 vldmia r2, {q5} @ load XTS magic
2109 vshr.s64 q6, q8, #63
2110 mov r0, sp
2111 vand q6, q6, q5
2112 vadd.u64 q9, q8, q8
2113 vst1.64 {q8}, [r0,:128]!
2114 vswp d13,d12
2115 vshr.s64 q7, q9, #63
2116 veor q9, q9, q6
2117 vand q7, q7, q5
2118 vadd.u64 q10, q9, q9
2119 vst1.64 {q9}, [r0,:128]!
2120 vswp d15,d14
2121 vshr.s64 q6, q10, #63
2122 veor q10, q10, q7
2123 vand q6, q6, q5
2124 vld1.8 {q0}, [r7]!
2125 vadd.u64 q11, q10, q10
2126 vst1.64 {q10}, [r0,:128]!
2127 vswp d13,d12
2128 vshr.s64 q7, q11, #63
2129 veor q11, q11, q6
2130 vand q7, q7, q5
2131 vld1.8 {q1}, [r7]!
2132 veor q0, q0, q8
2133 vadd.u64 q12, q11, q11
2134 vst1.64 {q11}, [r0,:128]!
2135 vswp d15,d14
2136 vshr.s64 q6, q12, #63
2137 veor q12, q12, q7
2138 vand q6, q6, q5
2139 vld1.8 {q2}, [r7]!
2140 veor q1, q1, q9
2141 vadd.u64 q13, q12, q12
2142 vst1.64 {q12}, [r0,:128]!
2143 vswp d13,d12
2144 vshr.s64 q7, q13, #63
2145 veor q13, q13, q6
2146 vand q7, q7, q5
2147 vld1.8 {q3}, [r7]!
2148 veor q2, q2, q10
2149 vadd.u64 q14, q13, q13
2150 vst1.64 {q13}, [r0,:128]!
2151 vswp d15,d14
2152 vshr.s64 q6, q14, #63
2153 veor q14, q14, q7
2154 vand q6, q6, q5
2155 vld1.8 {q4}, [r7]!
2156 veor q3, q3, q11
2157 vadd.u64 q15, q14, q14
2158 vst1.64 {q14}, [r0,:128]!
2159 vswp d13,d12
2160 vshr.s64 q7, q15, #63
2161 veor q15, q15, q6
2162 vand q7, q7, q5
2163 vld1.8 {q5}, [r7]!
2164 veor q4, q4, q12
2165 vadd.u64 q8, q15, q15
2166 vst1.64 {q15}, [r0,:128]!
2167 vswp d15,d14
2168 veor q8, q8, q7
2169 vst1.64 {q8}, [r0,:128] @ next round tweak
2170
2171 vld1.8 {q6-q7}, [r7]!
2172 veor q5, q5, q13
2173#ifndef BSAES_ASM_EXTENDED_KEY
2174 add r4, sp, #0x90 @ pass key schedule
2175#else
2176 add r4, r10, #248 @ pass key schedule
2177#endif
2178 veor q6, q6, q14
2179 mov r5, r1 @ pass rounds
2180 veor q7, q7, q15
2181 mov r0, sp
2182
2183 bl _bsaes_decrypt8
2184
2185 vld1.64 {q8-q9}, [r0,:128]!
2186 vld1.64 {q10-q11}, [r0,:128]!
2187 veor q0, q0, q8
2188 vld1.64 {q12-q13}, [r0,:128]!
2189 veor q1, q1, q9
2190 veor q8, q6, q10
2191 vst1.8 {q0-q1}, [r8]!
2192 veor q9, q4, q11
2193 vld1.64 {q14-q15}, [r0,:128]!
2194 veor q10, q2, q12
2195 vst1.8 {q8-q9}, [r8]!
2196 veor q11, q7, q13
2197 veor q12, q3, q14
2198 vst1.8 {q10-q11}, [r8]!
2199 veor q13, q5, q15
2200 vst1.8 {q12-q13}, [r8]!
2201
2202 vld1.64 {q8}, [r0,:128] @ next round tweak
2203
2204 subs r9, #0x80
2205 bpl .Lxts_dec_loop
2206
2207.Lxts_dec_short:
2208 adds r9, #0x70
2209 bmi .Lxts_dec_done
2210
2211 vldmia r2, {q5} @ load XTS magic
2212 vshr.s64 q7, q8, #63
2213 mov r0, sp
2214 vand q7, q7, q5
2215 vadd.u64 q9, q8, q8
2216 vst1.64 {q8}, [r0,:128]!
2217 vswp d15,d14
2218 vshr.s64 q6, q9, #63
2219 veor q9, q9, q7
2220 vand q6, q6, q5
2221 vadd.u64 q10, q9, q9
2222 vst1.64 {q9}, [r0,:128]!
2223 vswp d13,d12
2224 vshr.s64 q7, q10, #63
2225 veor q10, q10, q6
2226 vand q7, q7, q5
2227 vld1.8 {q0}, [r7]!
2228 subs r9, #0x10
2229 bmi .Lxts_dec_1
2230 vadd.u64 q11, q10, q10
2231 vst1.64 {q10}, [r0,:128]!
2232 vswp d15,d14
2233 vshr.s64 q6, q11, #63
2234 veor q11, q11, q7
2235 vand q6, q6, q5
2236 vld1.8 {q1}, [r7]!
2237 subs r9, #0x10
2238 bmi .Lxts_dec_2
2239 veor q0, q0, q8
2240 vadd.u64 q12, q11, q11
2241 vst1.64 {q11}, [r0,:128]!
2242 vswp d13,d12
2243 vshr.s64 q7, q12, #63
2244 veor q12, q12, q6
2245 vand q7, q7, q5
2246 vld1.8 {q2}, [r7]!
2247 subs r9, #0x10
2248 bmi .Lxts_dec_3
2249 veor q1, q1, q9
2250 vadd.u64 q13, q12, q12
2251 vst1.64 {q12}, [r0,:128]!
2252 vswp d15,d14
2253 vshr.s64 q6, q13, #63
2254 veor q13, q13, q7
2255 vand q6, q6, q5
2256 vld1.8 {q3}, [r7]!
2257 subs r9, #0x10
2258 bmi .Lxts_dec_4
2259 veor q2, q2, q10
2260 vadd.u64 q14, q13, q13
2261 vst1.64 {q13}, [r0,:128]!
2262 vswp d13,d12
2263 vshr.s64 q7, q14, #63
2264 veor q14, q14, q6
2265 vand q7, q7, q5
2266 vld1.8 {q4}, [r7]!
2267 subs r9, #0x10
2268 bmi .Lxts_dec_5
2269 veor q3, q3, q11
2270 vadd.u64 q15, q14, q14
2271 vst1.64 {q14}, [r0,:128]!
2272 vswp d15,d14
2273 vshr.s64 q6, q15, #63
2274 veor q15, q15, q7
2275 vand q6, q6, q5
2276 vld1.8 {q5}, [r7]!
2277 subs r9, #0x10
2278 bmi .Lxts_dec_6
2279 veor q4, q4, q12
2280 sub r9, #0x10
2281 vst1.64 {q15}, [r0,:128] @ next round tweak
2282
2283 vld1.8 {q6}, [r7]!
2284 veor q5, q5, q13
2285#ifndef BSAES_ASM_EXTENDED_KEY
2286 add r4, sp, #0x90 @ pass key schedule
2287#else
2288 add r4, r10, #248 @ pass key schedule
2289#endif
2290 veor q6, q6, q14
2291 mov r5, r1 @ pass rounds
2292 mov r0, sp
2293
2294 bl _bsaes_decrypt8
2295
2296 vld1.64 {q8-q9}, [r0,:128]!
2297 vld1.64 {q10-q11}, [r0,:128]!
2298 veor q0, q0, q8
2299 vld1.64 {q12-q13}, [r0,:128]!
2300 veor q1, q1, q9
2301 veor q8, q6, q10
2302 vst1.8 {q0-q1}, [r8]!
2303 veor q9, q4, q11
2304 vld1.64 {q14}, [r0,:128]!
2305 veor q10, q2, q12
2306 vst1.8 {q8-q9}, [r8]!
2307 veor q11, q7, q13
2308 veor q12, q3, q14
2309 vst1.8 {q10-q11}, [r8]!
2310 vst1.8 {q12}, [r8]!
2311
2312 vld1.64 {q8}, [r0,:128] @ next round tweak
2313 b .Lxts_dec_done
2314.align 4
2315.Lxts_dec_6:
2316 vst1.64 {q14}, [r0,:128] @ next round tweak
2317
2318 veor q4, q4, q12
2319#ifndef BSAES_ASM_EXTENDED_KEY
2320 add r4, sp, #0x90 @ pass key schedule
2321#else
2322 add r4, r10, #248 @ pass key schedule
2323#endif
2324 veor q5, q5, q13
2325 mov r5, r1 @ pass rounds
2326 mov r0, sp
2327
2328 bl _bsaes_decrypt8
2329
2330 vld1.64 {q8-q9}, [r0,:128]!
2331 vld1.64 {q10-q11}, [r0,:128]!
2332 veor q0, q0, q8
2333 vld1.64 {q12-q13}, [r0,:128]!
2334 veor q1, q1, q9
2335 veor q8, q6, q10
2336 vst1.8 {q0-q1}, [r8]!
2337 veor q9, q4, q11
2338 veor q10, q2, q12
2339 vst1.8 {q8-q9}, [r8]!
2340 veor q11, q7, q13
2341 vst1.8 {q10-q11}, [r8]!
2342
2343 vld1.64 {q8}, [r0,:128] @ next round tweak
2344 b .Lxts_dec_done
2345.align 4
2346.Lxts_dec_5:
2347 vst1.64 {q13}, [r0,:128] @ next round tweak
2348
2349 veor q3, q3, q11
2350#ifndef BSAES_ASM_EXTENDED_KEY
2351 add r4, sp, #0x90 @ pass key schedule
2352#else
2353 add r4, r10, #248 @ pass key schedule
2354#endif
2355 veor q4, q4, q12
2356 mov r5, r1 @ pass rounds
2357 mov r0, sp
2358
2359 bl _bsaes_decrypt8
2360
2361 vld1.64 {q8-q9}, [r0,:128]!
2362 vld1.64 {q10-q11}, [r0,:128]!
2363 veor q0, q0, q8
2364 vld1.64 {q12}, [r0,:128]!
2365 veor q1, q1, q9
2366 veor q8, q6, q10
2367 vst1.8 {q0-q1}, [r8]!
2368 veor q9, q4, q11
2369 veor q10, q2, q12
2370 vst1.8 {q8-q9}, [r8]!
2371 vst1.8 {q10}, [r8]!
2372
2373 vld1.64 {q8}, [r0,:128] @ next round tweak
2374 b .Lxts_dec_done
2375.align 4
2376.Lxts_dec_4:
2377 vst1.64 {q12}, [r0,:128] @ next round tweak
2378
2379 veor q2, q2, q10
2380#ifndef BSAES_ASM_EXTENDED_KEY
2381 add r4, sp, #0x90 @ pass key schedule
2382#else
2383 add r4, r10, #248 @ pass key schedule
2384#endif
2385 veor q3, q3, q11
2386 mov r5, r1 @ pass rounds
2387 mov r0, sp
2388
2389 bl _bsaes_decrypt8
2390
2391 vld1.64 {q8-q9}, [r0,:128]!
2392 vld1.64 {q10-q11}, [r0,:128]!
2393 veor q0, q0, q8
2394 veor q1, q1, q9
2395 veor q8, q6, q10
2396 vst1.8 {q0-q1}, [r8]!
2397 veor q9, q4, q11
2398 vst1.8 {q8-q9}, [r8]!
2399
2400 vld1.64 {q8}, [r0,:128] @ next round tweak
2401 b .Lxts_dec_done
2402.align 4
2403.Lxts_dec_3:
2404 vst1.64 {q11}, [r0,:128] @ next round tweak
2405
2406 veor q1, q1, q9
2407#ifndef BSAES_ASM_EXTENDED_KEY
2408 add r4, sp, #0x90 @ pass key schedule
2409#else
2410 add r4, r10, #248 @ pass key schedule
2411#endif
2412 veor q2, q2, q10
2413 mov r5, r1 @ pass rounds
2414 mov r0, sp
2415
2416 bl _bsaes_decrypt8
2417
2418 vld1.64 {q8-q9}, [r0,:128]!
2419 vld1.64 {q10}, [r0,:128]!
2420 veor q0, q0, q8
2421 veor q1, q1, q9
2422 veor q8, q6, q10
2423 vst1.8 {q0-q1}, [r8]!
2424 vst1.8 {q8}, [r8]!
2425
2426 vld1.64 {q8}, [r0,:128] @ next round tweak
2427 b .Lxts_dec_done
2428.align 4
2429.Lxts_dec_2:
2430 vst1.64 {q10}, [r0,:128] @ next round tweak
2431
2432 veor q0, q0, q8
2433#ifndef BSAES_ASM_EXTENDED_KEY
2434 add r4, sp, #0x90 @ pass key schedule
2435#else
2436 add r4, r10, #248 @ pass key schedule
2437#endif
2438 veor q1, q1, q9
2439 mov r5, r1 @ pass rounds
2440 mov r0, sp
2441
2442 bl _bsaes_decrypt8
2443
2444 vld1.64 {q8-q9}, [r0,:128]!
2445 veor q0, q0, q8
2446 veor q1, q1, q9
2447 vst1.8 {q0-q1}, [r8]!
2448
2449 vld1.64 {q8}, [r0,:128] @ next round tweak
2450 b .Lxts_dec_done
2451.align 4
2452.Lxts_dec_1:
2453 mov r0, sp
2454 veor q0, q8
2455 mov r1, sp
2456 vst1.8 {q0}, [sp,:128]
2457 mov r2, r10
2458 mov r4, r3 @ preserve fp
2459 mov r5, r2 @ preserve magic
2460
2461 bl AES_decrypt
2462
2463 vld1.8 {q0}, [sp,:128]
2464 veor q0, q0, q8
2465 vst1.8 {q0}, [r8]!
2466 mov r3, r4
2467 mov r2, r5
2468
2469 vmov q8, q9 @ next round tweak
2470
2471.Lxts_dec_done:
2472#ifndef XTS_CHAIN_TWEAK
2473 adds r9, #0x10
2474 beq .Lxts_dec_ret
2475
2476 @ calculate one round of extra tweak for the stolen ciphertext
2477 vldmia r2, {q5}
2478 vshr.s64 q6, q8, #63
2479 vand q6, q6, q5
2480 vadd.u64 q9, q8, q8
2481 vswp d13,d12
2482 veor q9, q9, q6
2483
2484 @ perform the final decryption with the last tweak value
2485 vld1.8 {q0}, [r7]!
2486 mov r0, sp
2487 veor q0, q0, q9
2488 mov r1, sp
2489 vst1.8 {q0}, [sp,:128]
2490 mov r2, r10
2491 mov r4, r3 @ preserve fp
2492
2493 bl AES_decrypt
2494
2495 vld1.8 {q0}, [sp,:128]
2496 veor q0, q0, q9
2497 vst1.8 {q0}, [r8]
2498
2499 mov r6, r8
2500.Lxts_dec_steal:
2501 ldrb r1, [r8]
2502 ldrb r0, [r7], #1
2503 strb r1, [r8, #0x10]
2504 strb r0, [r8], #1
2505
2506 subs r9, #1
2507 bhi .Lxts_dec_steal
2508
2509 vld1.8 {q0}, [r6]
2510 mov r0, sp
2511 veor q0, q8
2512 mov r1, sp
2513 vst1.8 {q0}, [sp,:128]
2514 mov r2, r10
2515
2516 bl AES_decrypt
2517
2518 vld1.8 {q0}, [sp,:128]
2519 veor q0, q0, q8
2520 vst1.8 {q0}, [r6]
2521 mov r3, r4
2522#endif
2523
2524.Lxts_dec_ret:
2525 bic r0, r3, #0xf
2526 vmov.i32 q0, #0
2527 vmov.i32 q1, #0
2528#ifdef XTS_CHAIN_TWEAK
2529 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2530#endif
2531.Lxts_dec_bzero: @ wipe key schedule [if any]
2532 vstmia sp!, {q0-q1}
2533 cmp sp, r0
2534 bne .Lxts_dec_bzero
2535
2536 mov sp, r3
2537#ifdef XTS_CHAIN_TWEAK
2538 vst1.8 {q8}, [r1]
2539#endif
2540 VFP_ABI_POP
2541 ldmia sp!, {r4-r10, pc} @ return
2542
2543.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2544#endif
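For reference, the tweak-doubling step that the .Lxts_magic masks {1, 0x87} drive above with vshr/vand/vswp/vadd/veor can be written in plain C as below. This is only a sketch, assuming the 128-bit tweak is kept little-endian in two 64-bit halves; the helper name is made up and it is not part of the patch.

#include <linux/types.h>

/* Sketch of the per-block XTS tweak update: t' = 2*t in GF(2^128) with the
 * reduction polynomial x^128 + x^7 + x^2 + x + 1 (hence the 0x87). */
static void xts_tweak_double(u64 t[2])
{
	/* 0x87 iff bit 127 of the old tweak was set; the arithmetic shift
	 * replicates the sign bit, mirroring the vshr.s64 #63 + vand above. */
	u64 carry = (u64)((s64)t[1] >> 63) & 0x87;

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* bit 63 carries into the high half */
	t[0] = (t[0] << 1) ^ carry;		/* fold bit 127 back in */
}

Each pass of .Lxts_enc_loop/.Lxts_dec_loop precomputes eight such tweaks on the stack so that _bsaes_encrypt8/_bsaes_decrypt8 can process eight blocks per call.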
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
new file mode 100644
index 000000000000..4522366da759
--- /dev/null
+++ b/arch/arm/crypto/aesbs-glue.c
@@ -0,0 +1,434 @@
1/*
2 * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <crypto/aes.h>
13#include <crypto/ablk_helper.h>
14#include <crypto/algapi.h>
15#include <linux/module.h>
16
17#include "aes_glue.h"
18
19#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
20
21struct BS_KEY {
22 struct AES_KEY rk;
23 int converted;
24 u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
25} __aligned(8);
26
27asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
28asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
29
30asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
31 struct BS_KEY *key, u8 iv[]);
32
33asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
34 struct BS_KEY *key, u8 const iv[]);
35
36asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
37 struct BS_KEY *key, u8 tweak[]);
38
39asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
40 struct BS_KEY *key, u8 tweak[]);
41
42struct aesbs_cbc_ctx {
43 struct AES_KEY enc;
44 struct BS_KEY dec;
45};
46
47struct aesbs_ctr_ctx {
48 struct BS_KEY enc;
49};
50
51struct aesbs_xts_ctx {
52 struct BS_KEY enc;
53 struct BS_KEY dec;
54 struct AES_KEY twkey;
55};
56
57static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
58 unsigned int key_len)
59{
60 struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
61 int bits = key_len * 8;
62
63 if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
64 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
65 return -EINVAL;
66 }
67 ctx->dec.rk = ctx->enc;
68 private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
69 ctx->dec.converted = 0;
70 return 0;
71}
72
73static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
74 unsigned int key_len)
75{
76 struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
77 int bits = key_len * 8;
78
79 if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
80 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
81 return -EINVAL;
82 }
83 ctx->enc.converted = 0;
84 return 0;
85}
86
87static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
88 unsigned int key_len)
89{
90 struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
91 int bits = key_len * 4;
92
93 if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
94 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
95 return -EINVAL;
96 }
97 ctx->dec.rk = ctx->enc.rk;
98 private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
99 private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
100 ctx->enc.converted = ctx->dec.converted = 0;
101 return 0;
102}
103
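As a quick worked example of the arithmetic in aesbs_xts_set_key() above (illustrative only, not part of the patch): xts(aes) is handed the data key and the tweak key concatenated, so each half is key_len / 2 bytes and the AES key size of each half is key_len * 4 bits.

/*
 * Illustration of aesbs_xts_set_key() sizing (hypothetical key lengths):
 *
 *   key_len (bytes)     bits = key_len * 4     tweak key = in_key + key_len / 2
 *   32 = 2 x AES-128    128                    in_key + 16
 *   64 = 2 x AES-256    256                    in_key + 32
 */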
104static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
105 struct scatterlist *dst,
106 struct scatterlist *src, unsigned int nbytes)
107{
108 struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
109 struct blkcipher_walk walk;
110 int err;
111
112 blkcipher_walk_init(&walk, dst, src, nbytes);
113 err = blkcipher_walk_virt(desc, &walk);
114
115 while (walk.nbytes) {
116 u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
117 u8 *src = walk.src.virt.addr;
118
119 if (walk.dst.virt.addr == walk.src.virt.addr) {
120 u8 *iv = walk.iv;
121
122 do {
123 crypto_xor(src, iv, AES_BLOCK_SIZE);
124 AES_encrypt(src, src, &ctx->enc);
125 iv = src;
126 src += AES_BLOCK_SIZE;
127 } while (--blocks);
128 memcpy(walk.iv, iv, AES_BLOCK_SIZE);
129 } else {
130 u8 *dst = walk.dst.virt.addr;
131
132 do {
133 crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
134 AES_encrypt(walk.iv, dst, &ctx->enc);
135 memcpy(walk.iv, dst, AES_BLOCK_SIZE);
136 src += AES_BLOCK_SIZE;
137 dst += AES_BLOCK_SIZE;
138 } while (--blocks);
139 }
140 err = blkcipher_walk_done(desc, &walk, 0);
141 }
142 return err;
143}
144
145static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
146 struct scatterlist *dst,
147 struct scatterlist *src, unsigned int nbytes)
148{
149 struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
150 struct blkcipher_walk walk;
151 int err;
152
153 blkcipher_walk_init(&walk, dst, src, nbytes);
154 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
155
156 while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
157 kernel_neon_begin();
158 bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
159 walk.nbytes, &ctx->dec, walk.iv);
160 kernel_neon_end();
161 err = blkcipher_walk_done(desc, &walk, 0);
162 }
163 while (walk.nbytes) {
164 u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
165 u8 *dst = walk.dst.virt.addr;
166 u8 *src = walk.src.virt.addr;
167 u8 bk[2][AES_BLOCK_SIZE];
168 u8 *iv = walk.iv;
169
170 do {
171 if (walk.dst.virt.addr == walk.src.virt.addr)
172 memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
173
174 AES_decrypt(src, dst, &ctx->dec.rk);
175 crypto_xor(dst, iv, AES_BLOCK_SIZE);
176
177 if (walk.dst.virt.addr == walk.src.virt.addr)
178 iv = bk[blocks & 1];
179 else
180 iv = src;
181
182 dst += AES_BLOCK_SIZE;
183 src += AES_BLOCK_SIZE;
184 } while (--blocks);
185 err = blkcipher_walk_done(desc, &walk, 0);
186 }
187 return err;
188}
189
190static void inc_be128_ctr(__be32 ctr[], u32 addend)
191{
192 int i;
193
194 for (i = 3; i >= 0; i--, addend = 1) {
195 u32 n = be32_to_cpu(ctr[i]) + addend;
196
197 ctr[i] = cpu_to_be32(n);
198 if (n >= addend)
199 break;
200 }
201}
202
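A minimal, purely illustrative exercise of the carry path in inc_be128_ctr() above; the values and the helper name are made up and it is not part of the patch.

static void inc_be128_ctr_example(void)
{
	__be32 ctr[4] = {
		cpu_to_be32(0x0), cpu_to_be32(0x0),
		cpu_to_be32(0x1), cpu_to_be32(0xffffffff),
	};

	/* word 3 (the big-endian least significant word) wraps to 0 and
	 * the carry propagates into word 2 */
	inc_be128_ctr(ctr, 1);		/* ctr ends up as { 0, 0, 2, 0 } */
}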
203static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
204 struct scatterlist *dst, struct scatterlist *src,
205 unsigned int nbytes)
206{
207 struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
208 struct blkcipher_walk walk;
209 u32 blocks;
210 int err;
211
212 blkcipher_walk_init(&walk, dst, src, nbytes);
213 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
214
215 while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
216 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
217 __be32 *ctr = (__be32 *)walk.iv;
218 u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
219
220 /* avoid 32 bit counter overflow in the NEON code */
221 if (unlikely(headroom < blocks)) {
222 blocks = headroom + 1;
223 tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
224 }
225 kernel_neon_begin();
226 bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
227 walk.dst.virt.addr, blocks,
228 &ctx->enc, walk.iv);
229 kernel_neon_end();
230 inc_be128_ctr(ctr, blocks);
231
232 nbytes -= blocks * AES_BLOCK_SIZE;
233 if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
234 break;
235
236 err = blkcipher_walk_done(desc, &walk, tail);
237 }
238 if (walk.nbytes) {
239 u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
240 u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
241 u8 ks[AES_BLOCK_SIZE];
242
243 AES_encrypt(walk.iv, ks, &ctx->enc.rk);
244 if (tdst != tsrc)
245 memcpy(tdst, tsrc, nbytes);
246 crypto_xor(tdst, ks, nbytes);
247 err = blkcipher_walk_done(desc, &walk, 0);
248 }
249 return err;
250}
251
252static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
253 struct scatterlist *dst,
254 struct scatterlist *src, unsigned int nbytes)
255{
256 struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
257 struct blkcipher_walk walk;
258 int err;
259
260 blkcipher_walk_init(&walk, dst, src, nbytes);
261 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
262
263 /* generate the initial tweak */
264 AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
265
266 while (walk.nbytes) {
267 kernel_neon_begin();
268 bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
269 walk.nbytes, &ctx->enc, walk.iv);
270 kernel_neon_end();
271 err = blkcipher_walk_done(desc, &walk, 0);
272 }
273 return err;
274}
275
276static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
277 struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
281 struct blkcipher_walk walk;
282 int err;
283
284 blkcipher_walk_init(&walk, dst, src, nbytes);
285 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
286
287 /* generate the initial tweak */
288 AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
289
290 while (walk.nbytes) {
291 kernel_neon_begin();
292 bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
293 walk.nbytes, &ctx->dec, walk.iv);
294 kernel_neon_end();
295 err = blkcipher_walk_done(desc, &walk, 0);
296 }
297 return err;
298}
299
300static struct crypto_alg aesbs_algs[] = { {
301 .cra_name = "__cbc-aes-neonbs",
302 .cra_driver_name = "__driver-cbc-aes-neonbs",
303 .cra_priority = 0,
304 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
305 .cra_blocksize = AES_BLOCK_SIZE,
306 .cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
307 .cra_alignmask = 7,
308 .cra_type = &crypto_blkcipher_type,
309 .cra_module = THIS_MODULE,
310 .cra_blkcipher = {
311 .min_keysize = AES_MIN_KEY_SIZE,
312 .max_keysize = AES_MAX_KEY_SIZE,
313 .ivsize = AES_BLOCK_SIZE,
314 .setkey = aesbs_cbc_set_key,
315 .encrypt = aesbs_cbc_encrypt,
316 .decrypt = aesbs_cbc_decrypt,
317 },
318}, {
319 .cra_name = "__ctr-aes-neonbs",
320 .cra_driver_name = "__driver-ctr-aes-neonbs",
321 .cra_priority = 0,
322 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
323 .cra_blocksize = 1,
324 .cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
325 .cra_alignmask = 7,
326 .cra_type = &crypto_blkcipher_type,
327 .cra_module = THIS_MODULE,
328 .cra_blkcipher = {
329 .min_keysize = AES_MIN_KEY_SIZE,
330 .max_keysize = AES_MAX_KEY_SIZE,
331 .ivsize = AES_BLOCK_SIZE,
332 .setkey = aesbs_ctr_set_key,
333 .encrypt = aesbs_ctr_encrypt,
334 .decrypt = aesbs_ctr_encrypt,
335 },
336}, {
337 .cra_name = "__xts-aes-neonbs",
338 .cra_driver_name = "__driver-xts-aes-neonbs",
339 .cra_priority = 0,
340 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
341 .cra_blocksize = AES_BLOCK_SIZE,
342 .cra_ctxsize = sizeof(struct aesbs_xts_ctx),
343 .cra_alignmask = 7,
344 .cra_type = &crypto_blkcipher_type,
345 .cra_module = THIS_MODULE,
346 .cra_blkcipher = {
347 .min_keysize = 2 * AES_MIN_KEY_SIZE,
348 .max_keysize = 2 * AES_MAX_KEY_SIZE,
349 .ivsize = AES_BLOCK_SIZE,
350 .setkey = aesbs_xts_set_key,
351 .encrypt = aesbs_xts_encrypt,
352 .decrypt = aesbs_xts_decrypt,
353 },
354}, {
355 .cra_name = "cbc(aes)",
356 .cra_driver_name = "cbc-aes-neonbs",
357 .cra_priority = 300,
358 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
359 .cra_blocksize = AES_BLOCK_SIZE,
360 .cra_ctxsize = sizeof(struct async_helper_ctx),
361 .cra_alignmask = 7,
362 .cra_type = &crypto_ablkcipher_type,
363 .cra_module = THIS_MODULE,
364 .cra_init = ablk_init,
365 .cra_exit = ablk_exit,
366 .cra_ablkcipher = {
367 .min_keysize = AES_MIN_KEY_SIZE,
368 .max_keysize = AES_MAX_KEY_SIZE,
369 .ivsize = AES_BLOCK_SIZE,
370 .setkey = ablk_set_key,
371 .encrypt = __ablk_encrypt,
372 .decrypt = ablk_decrypt,
373 }
374}, {
375 .cra_name = "ctr(aes)",
376 .cra_driver_name = "ctr-aes-neonbs",
377 .cra_priority = 300,
378 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
379 .cra_blocksize = 1,
380 .cra_ctxsize = sizeof(struct async_helper_ctx),
381 .cra_alignmask = 7,
382 .cra_type = &crypto_ablkcipher_type,
383 .cra_module = THIS_MODULE,
384 .cra_init = ablk_init,
385 .cra_exit = ablk_exit,
386 .cra_ablkcipher = {
387 .min_keysize = AES_MIN_KEY_SIZE,
388 .max_keysize = AES_MAX_KEY_SIZE,
389 .ivsize = AES_BLOCK_SIZE,
390 .setkey = ablk_set_key,
391 .encrypt = ablk_encrypt,
392 .decrypt = ablk_decrypt,
393 }
394}, {
395 .cra_name = "xts(aes)",
396 .cra_driver_name = "xts-aes-neonbs",
397 .cra_priority = 300,
398 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
399 .cra_blocksize = AES_BLOCK_SIZE,
400 .cra_ctxsize = sizeof(struct async_helper_ctx),
401 .cra_alignmask = 7,
402 .cra_type = &crypto_ablkcipher_type,
403 .cra_module = THIS_MODULE,
404 .cra_init = ablk_init,
405 .cra_exit = ablk_exit,
406 .cra_ablkcipher = {
407 .min_keysize = 2 * AES_MIN_KEY_SIZE,
408 .max_keysize = 2 * AES_MAX_KEY_SIZE,
409 .ivsize = AES_BLOCK_SIZE,
410 .setkey = ablk_set_key,
411 .encrypt = ablk_encrypt,
412 .decrypt = ablk_decrypt,
413 }
414} };
415
416static int __init aesbs_mod_init(void)
417{
418 if (!cpu_has_neon())
419 return -ENODEV;
420
421 return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
422}
423
424static void __exit aesbs_mod_exit(void)
425{
426 crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
427}
428
429module_init(aesbs_mod_init);
430module_exit(aesbs_mod_exit);
431
432MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
433MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
434MODULE_LICENSE("GPL");
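
As a rough sketch of how the algorithms registered above can be reached from elsewhere in the kernel: the "__*-neonbs" entries are internal (priority 0) and are consumed by the CRYPTO_ALG_ASYNC ablk_helper wrappers, so a caller would normally request the generic name such as "xts(aes)". The fragment below is only illustrative; the function name and the zeroed key/data buffers are placeholders, error handling is abbreviated, and a real caller would also set a completion callback and wait when the request returns -EINPROGRESS.

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <crypto/aes.h>

static int neonbs_xts_demo(void)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	struct scatterlist sg;
	u8 key[2 * AES_KEYSIZE_256] = { 0 };	/* XTS takes two concatenated keys */
	u8 iv[AES_BLOCK_SIZE] = { 0 };
	u8 buf[AES_BLOCK_SIZE] = { 0 };		/* real users would pass heap/page memory */
	int err;

	/* picks the highest-priority "xts(aes)" provider, e.g. xts-aes-neonbs */
	tfm = crypto_alloc_ablkcipher("xts(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ablkcipher_setkey(tfm, key, sizeof(key));
	if (err)
		goto out_free_tfm;

	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, sizeof(buf));
	ablkcipher_request_set_crypt(req, &sg, &sg, sizeof(buf), iv);

	/* may return -EINPROGRESS; asynchronous completion is not handled here */
	err = crypto_ablkcipher_encrypt(req);

	ablkcipher_request_free(req);
out_free_tfm:
	crypto_free_ablkcipher(tfm);
	return err;
}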
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644
index 000000000000..f3d96d932573
--- /dev/null
+++ b/arch/arm/crypto/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8#
9# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
10# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
11# granted.
12# ====================================================================
13
14# Bit-sliced AES for ARM NEON
15#
16# February 2012.
17#
18# This implementation is a direct adaptation of the bsaes-x86_64 module
19# for ARM NEON, except that this module is endian-neutral [in the sense
20# that it can be compiled for either endianness] courtesy of vld1.8's
21# neutrality. The initial version doesn't implement an interface to OpenSSL,
22# only low-level primitives and unsupported entry points, just enough
23# to collect performance results, which for Cortex-A8 core are:
24#
25# encrypt 19.5 cycles per byte processed with 128-bit key
26# decrypt 22.1 cycles per byte processed with 128-bit key
27# key conv. 440 cycles per 128-bit key/0.18 of 8x block
28#
29# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
30# which is [much] worse than anticipated (for further details see
31# http://www.openssl.org/~appro/Snapdragon-S4.html).
32#
33# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
34# manages in 20.0 cycles].
35#
36# When comparing to x86_64 results, keep in mind that the NEON unit is
37# [mostly] single-issue and thus can't [fully] benefit from
38# instruction-level parallelism. And when comparing to aes-armv4
39# results keep in mind key schedule conversion overhead (see
40# bsaes-x86_64.pl for further details)...
41#
42# <appro@openssl.org>
43
44# April-August 2013
45#
46# Add CBC, CTR and XTS subroutines, adapt for kernel use.
47#
48# <ard.biesheuvel@linaro.org>
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
54my @XMM=map("q$_",(0..15));
55
56{
57my ($key,$rounds,$const)=("r4","r5","r6");
58
59sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
60sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
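# (each NEON q<N> register aliases the d<2N>/d<2N+1> pair; Dlo/Dhi pick the
#  halves so that vtbl.8, which only takes d-register operands, can be
#  applied to q-sized data)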
61
62sub Sbox {
63# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
64# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
65my @b=@_[0..7];
66my @t=@_[8..11];
67my @s=@_[12..15];
68 &InBasisChange (@b);
69 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
70 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
71}
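# The S-box above is evaluated arithmetically rather than via lookup tables:
# the byte inverse in GF(2^8) is computed through a change of basis into a
# tower-field representation (InBasisChange/Inv_GF256/OutBasisChange), using
# only bitwise NEON operations on the bit-sliced state, which keeps the code
# free of data-dependent memory accesses.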
72
73sub InBasisChange {
74# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
75# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
76my @b=@_[0..7];
77$code.=<<___;
78 veor @b[2], @b[2], @b[1]
79 veor @b[5], @b[5], @b[6]
80 veor @b[3], @b[3], @b[0]
81 veor @b[6], @b[6], @b[2]
82 veor @b[5], @b[5], @b[0]
83
84 veor @b[6], @b[6], @b[3]
85 veor @b[3], @b[3], @b[7]
86 veor @b[7], @b[7], @b[5]
87 veor @b[3], @b[3], @b[4]
88 veor @b[4], @b[4], @b[5]
89
90 veor @b[2], @b[2], @b[7]
91 veor @b[3], @b[3], @b[1]
92 veor @b[1], @b[1], @b[5]
93___
94}
95
96sub OutBasisChange {
97# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
98# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
99my @b=@_[0..7];
100$code.=<<___;
101 veor @b[0], @b[0], @b[6]
102 veor @b[1], @b[1], @b[4]
103 veor @b[4], @b[4], @b[6]
104 veor @b[2], @b[2], @b[0]
105 veor @b[6], @b[6], @b[1]
106
107 veor @b[1], @b[1], @b[5]
108 veor @b[5], @b[5], @b[3]
109 veor @b[3], @b[3], @b[7]
110 veor @b[7], @b[7], @b[5]
111 veor @b[2], @b[2], @b[5]
112
113 veor @b[4], @b[4], @b[7]
114___
115}
116
117sub InvSbox {
118# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123 &InvInBasisChange (@b);
124 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
125 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
126}
127
128sub InvInBasisChange { # OutBasisChange in reverse (with twist)
129my @b=@_[5,1,2,6,3,7,0,4];
130$code.=<<___
131 veor @b[1], @b[1], @b[7]
132 veor @b[4], @b[4], @b[7]
133
134 veor @b[7], @b[7], @b[5]
135 veor @b[1], @b[1], @b[3]
136 veor @b[2], @b[2], @b[5]
137 veor @b[3], @b[3], @b[7]
138
139 veor @b[6], @b[6], @b[1]
140 veor @b[2], @b[2], @b[0]
141 veor @b[5], @b[5], @b[3]
142 veor @b[4], @b[4], @b[6]
143 veor @b[0], @b[0], @b[6]
144 veor @b[1], @b[1], @b[4]
145___
146}
147
148sub InvOutBasisChange { # InBasisChange in reverse
149my @b=@_[2,5,7,3,6,1,0,4];
150$code.=<<___;
151 veor @b[1], @b[1], @b[5]
152 veor @b[2], @b[2], @b[7]
153
154 veor @b[3], @b[3], @b[1]
155 veor @b[4], @b[4], @b[5]
156 veor @b[7], @b[7], @b[5]
157 veor @b[3], @b[3], @b[4]
158 veor @b[5], @b[5], @b[0]
159 veor @b[3], @b[3], @b[7]
160 veor @b[6], @b[6], @b[2]
161 veor @b[2], @b[2], @b[1]
162 veor @b[6], @b[6], @b[3]
163
164 veor @b[3], @b[3], @b[0]
165 veor @b[5], @b[5], @b[6]
166___
167}
168
169sub Mul_GF4 {
170#;*************************************************************
171#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
172#;*************************************************************
173my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
174$code.=<<___;
175 veor $t0, $y0, $y1
176 vand $t0, $t0, $x0
177 veor $x0, $x0, $x1
178 vand $t1, $x1, $y0
179 vand $x0, $x0, $y1
180 veor $x1, $t1, $t0
181 veor $x0, $x0, $t1
182___
183}
184
185sub Mul_GF4_N { # not used, see next subroutine
186# multiply and scale by N
187my ($x0,$x1,$y0,$y1,$t0)=@_;
188$code.=<<___;
189 veor $t0, $y0, $y1
190 vand $t0, $t0, $x0
191 veor $x0, $x0, $x1
192 vand $x1, $x1, $y0
193 vand $x0, $x0, $y1
194 veor $x1, $x1, $x0
195 veor $x0, $x0, $t0
196___
197}
198
199sub Mul_GF4_N_GF4 {
200# interleaved Mul_GF4_N and Mul_GF4
201my ($x0,$x1,$y0,$y1,$t0,
202 $x2,$x3,$y2,$y3,$t1)=@_;
203$code.=<<___;
204 veor $t0, $y0, $y1
205 veor $t1, $y2, $y3
206 vand $t0, $t0, $x0
207 vand $t1, $t1, $x2
208 veor $x0, $x0, $x1
209 veor $x2, $x2, $x3
210 vand $x1, $x1, $y0
211 vand $x3, $x3, $y2
212 vand $x0, $x0, $y1
213 vand $x2, $x2, $y3
214 veor $x1, $x1, $x0
215 veor $x2, $x2, $x3
216 veor $x0, $x0, $t0
217 veor $x3, $x3, $t1
218___
219}
220sub Mul_GF16_2 {
221my @x=@_[0..7];
222my @y=@_[8..11];
223my @t=@_[12..15];
224$code.=<<___;
225 veor @t[0], @x[0], @x[2]
226 veor @t[1], @x[1], @x[3]
227___
228 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
229$code.=<<___;
230 veor @y[0], @y[0], @y[2]
231 veor @y[1], @y[1], @y[3]
232___
233 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
234 @x[2], @x[3], @y[2], @y[3], @t[2]);
235$code.=<<___;
236 veor @x[0], @x[0], @t[0]
237 veor @x[2], @x[2], @t[0]
238 veor @x[1], @x[1], @t[1]
239 veor @x[3], @x[3], @t[1]
240
241 veor @t[0], @x[4], @x[6]
242 veor @t[1], @x[5], @x[7]
243___
244 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
245 @x[6], @x[7], @y[2], @y[3], @t[2]);
246$code.=<<___;
247 veor @y[0], @y[0], @y[2]
248 veor @y[1], @y[1], @y[3]
249___
250 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
251$code.=<<___;
252 veor @x[4], @x[4], @t[0]
253 veor @x[6], @x[6], @t[0]
254 veor @x[5], @x[5], @t[1]
255 veor @x[7], @x[7], @t[1]
256___
257}
258sub Inv_GF256 {
259#;********************************************************************
260#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
261#;********************************************************************
262my @x=@_[0..7];
263my @t=@_[8..11];
264my @s=@_[12..15];
265# direct optimizations from hardware
266$code.=<<___;
267 veor @t[3], @x[4], @x[6]
268 veor @t[2], @x[5], @x[7]
269 veor @t[1], @x[1], @x[3]
270 veor @s[1], @x[7], @x[6]
271 vmov @t[0], @t[2]
272 veor @s[0], @x[0], @x[2]
273
274 vorr @t[2], @t[2], @t[1]
275 veor @s[3], @t[3], @t[0]
276 vand @s[2], @t[3], @s[0]
277 vorr @t[3], @t[3], @s[0]
278 veor @s[0], @s[0], @t[1]
279 vand @t[0], @t[0], @t[1]
280 veor @t[1], @x[3], @x[2]
281 vand @s[3], @s[3], @s[0]
282 vand @s[1], @s[1], @t[1]
283 veor @t[1], @x[4], @x[5]
284 veor @s[0], @x[1], @x[0]
285 veor @t[3], @t[3], @s[1]
286 veor @t[2], @t[2], @s[1]
287 vand @s[1], @t[1], @s[0]
288 vorr @t[1], @t[1], @s[0]
289 veor @t[3], @t[3], @s[3]
290 veor @t[0], @t[0], @s[1]
291 veor @t[2], @t[2], @s[2]
292 veor @t[1], @t[1], @s[3]
293 veor @t[0], @t[0], @s[2]
294 vand @s[0], @x[7], @x[3]
295 veor @t[1], @t[1], @s[2]
296 vand @s[1], @x[6], @x[2]
297 vand @s[2], @x[5], @x[1]
298 vorr @s[3], @x[4], @x[0]
299 veor @t[3], @t[3], @s[0]
300 veor @t[1], @t[1], @s[2]
301 veor @t[0], @t[0], @s[3]
302 veor @t[2], @t[2], @s[1]
303
304 @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
305
306 @ new smaller inversion
307
308 vand @s[2], @t[3], @t[1]
309 vmov @s[0], @t[0]
310
311 veor @s[1], @t[2], @s[2]
312 veor @s[3], @t[0], @s[2]
313 veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
314
315 vbsl @s[1], @t[1], @t[0]
316 vbsl @s[3], @t[3], @t[2]
317 veor @t[3], @t[3], @t[2]
318
319 vbsl @s[0], @s[1], @s[2]
320 vbsl @t[0], @s[2], @s[1]
321
322 vand @s[2], @s[0], @s[3]
323 veor @t[1], @t[1], @t[0]
324
325 veor @s[2], @s[2], @t[3]
326___
327# output in s3, s2, s1, t1
328
329# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
330
331# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
332 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
333
334### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
335}
336
337# AES linear components
338
339sub ShiftRows {
340my @x=@_[0..7];
341my @t=@_[8..11];
342my $mask=pop;
343$code.=<<___;
344 vldmia $key!, {@t[0]-@t[3]}
345 veor @t[0], @t[0], @x[0]
346 veor @t[1], @t[1], @x[1]
347 vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
348 vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
349 vldmia $key!, {@t[0]}
350 veor @t[2], @t[2], @x[2]
351 vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
352 vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
353 vldmia $key!, {@t[1]}
354 veor @t[3], @t[3], @x[3]
355 vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
356 vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
357 vldmia $key!, {@t[2]}
358 vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
359 vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
360 vldmia $key!, {@t[3]}
361 veor @t[0], @t[0], @x[4]
362 veor @t[1], @t[1], @x[5]
363 vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
364 vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
365 veor @t[2], @t[2], @x[6]
366 vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
367 vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
368 veor @t[3], @t[3], @x[7]
369 vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
370 vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
371 vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
372 vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
373___
374}
375
376sub MixColumns {
377# modified to emit output in order suitable for feeding back to aesenc[last]
378my @x=@_[0..7];
379my @t=@_[8..15];
380my $inv=@_[16]; # optional
381$code.=<<___;
382 vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
383 vext.8 @t[1], @x[1], @x[1], #12
384 veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
385 vext.8 @t[2], @x[2], @x[2], #12
386 veor @x[1], @x[1], @t[1]
387 vext.8 @t[3], @x[3], @x[3], #12
388 veor @x[2], @x[2], @t[2]
389 vext.8 @t[4], @x[4], @x[4], #12
390 veor @x[3], @x[3], @t[3]
391 vext.8 @t[5], @x[5], @x[5], #12
392 veor @x[4], @x[4], @t[4]
393 vext.8 @t[6], @x[6], @x[6], #12
394 veor @x[5], @x[5], @t[5]
395 vext.8 @t[7], @x[7], @x[7], #12
396 veor @x[6], @x[6], @t[6]
397
398 veor @t[1], @t[1], @x[0]
399 veor @x[7], @x[7], @t[7]
400 vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64
401 veor @t[2], @t[2], @x[1]
402 veor @t[0], @t[0], @x[7]
403 veor @t[1], @t[1], @x[7]
404 vext.8 @x[1], @x[1], @x[1], #8
405 veor @t[5], @t[5], @x[4]
406 veor @x[0], @x[0], @t[0]
407 veor @t[6], @t[6], @x[5]
408 veor @x[1], @x[1], @t[1]
409 vext.8 @t[0], @x[4], @x[4], #8
410 veor @t[4], @t[4], @x[3]
411 vext.8 @t[1], @x[5], @x[5], #8
412 veor @t[7], @t[7], @x[6]
413 vext.8 @x[4], @x[3], @x[3], #8
414 veor @t[3], @t[3], @x[2]
415 vext.8 @x[5], @x[7], @x[7], #8
416 veor @t[4], @t[4], @x[7]
417 vext.8 @x[3], @x[6], @x[6], #8
418 veor @t[3], @t[3], @x[7]
419 vext.8 @x[6], @x[2], @x[2], #8
420 veor @x[7], @t[1], @t[5]
421___
422$code.=<<___ if (!$inv);
423 veor @x[2], @t[0], @t[4]
424 veor @x[4], @x[4], @t[3]
425 veor @x[5], @x[5], @t[7]
426 veor @x[3], @x[3], @t[6]
427 @ vmov @x[2], @t[0]
428 veor @x[6], @x[6], @t[2]
429 @ vmov @x[7], @t[1]
430___
431$code.=<<___ if ($inv);
432 veor @t[3], @t[3], @x[4]
433 veor @x[5], @x[5], @t[7]
434 veor @x[2], @x[3], @t[6]
435 veor @x[3], @t[0], @t[4]
436 veor @x[4], @x[6], @t[2]
437 vmov @x[6], @t[3]
438 @ vmov @x[7], @t[1]
439___
440}
441
442sub InvMixColumns_orig {
443my @x=@_[0..7];
444my @t=@_[8..15];
445
446$code.=<<___;
447 @ multiplication by 0x0e
448 vext.8 @t[7], @x[7], @x[7], #12
449 vmov @t[2], @x[2]
450 veor @x[2], @x[2], @x[5] @ 2 5
451 veor @x[7], @x[7], @x[5] @ 7 5
452 vext.8 @t[0], @x[0], @x[0], #12
453 vmov @t[5], @x[5]
454 veor @x[5], @x[5], @x[0] @ 5 0 [1]
455 veor @x[0], @x[0], @x[1] @ 0 1
456 vext.8 @t[1], @x[1], @x[1], #12
457 veor @x[1], @x[1], @x[2] @ 1 25
458 veor @x[0], @x[0], @x[6] @ 01 6 [2]
459 vext.8 @t[3], @x[3], @x[3], #12
460 veor @x[1], @x[1], @x[3] @ 125 3 [4]
461 veor @x[2], @x[2], @x[0] @ 25 016 [3]
462 veor @x[3], @x[3], @x[7] @ 3 75
463 veor @x[7], @x[7], @x[6] @ 75 6 [0]
464 vext.8 @t[6], @x[6], @x[6], #12
465 vmov @t[4], @x[4]
466 veor @x[6], @x[6], @x[4] @ 6 4
467 veor @x[4], @x[4], @x[3] @ 4 375 [6]
468 veor @x[3], @x[3], @x[7] @ 375 756=36
469 veor @x[6], @x[6], @t[5] @ 64 5 [7]
470 veor @x[3], @x[3], @t[2] @ 36 2
471 vext.8 @t[5], @t[5], @t[5], #12
472 veor @x[3], @x[3], @t[4] @ 362 4 [5]
473___
474 my @y = @x[7,5,0,2,1,3,4,6];
475$code.=<<___;
476 @ multiplication by 0x0b
477 veor @y[1], @y[1], @y[0]
478 veor @y[0], @y[0], @t[0]
479 vext.8 @t[2], @t[2], @t[2], #12
480 veor @y[1], @y[1], @t[1]
481 veor @y[0], @y[0], @t[5]
482 vext.8 @t[4], @t[4], @t[4], #12
483 veor @y[1], @y[1], @t[6]
484 veor @y[0], @y[0], @t[7]
485 veor @t[7], @t[7], @t[6] @ clobber t[7]
486
487 veor @y[3], @y[3], @t[0]
488 veor @y[1], @y[1], @y[0]
489 vext.8 @t[0], @t[0], @t[0], #12
490 veor @y[2], @y[2], @t[1]
491 veor @y[4], @y[4], @t[1]
492 vext.8 @t[1], @t[1], @t[1], #12
493 veor @y[2], @y[2], @t[2]
494 veor @y[3], @y[3], @t[2]
495 veor @y[5], @y[5], @t[2]
496 veor @y[2], @y[2], @t[7]
497 vext.8 @t[2], @t[2], @t[2], #12
498 veor @y[3], @y[3], @t[3]
499 veor @y[6], @y[6], @t[3]
500 veor @y[4], @y[4], @t[3]
501 veor @y[7], @y[7], @t[4]
502 vext.8 @t[3], @t[3], @t[3], #12
503 veor @y[5], @y[5], @t[4]
504 veor @y[7], @y[7], @t[7]
505 veor @t[7], @t[7], @t[5] @ clobber t[7] even more
506 veor @y[3], @y[3], @t[5]
507 veor @y[4], @y[4], @t[4]
508
509 veor @y[5], @y[5], @t[7]
510 vext.8 @t[4], @t[4], @t[4], #12
511 veor @y[6], @y[6], @t[7]
512 veor @y[4], @y[4], @t[7]
513
514 veor @t[7], @t[7], @t[5]
515 vext.8 @t[5], @t[5], @t[5], #12
516
517 @ multiplication by 0x0d
518 veor @y[4], @y[4], @y[7]
519 veor @t[7], @t[7], @t[6] @ restore t[7]
520 veor @y[7], @y[7], @t[4]
521 vext.8 @t[6], @t[6], @t[6], #12
522 veor @y[2], @y[2], @t[0]
523 veor @y[7], @y[7], @t[5]
524 vext.8 @t[7], @t[7], @t[7], #12
525 veor @y[2], @y[2], @t[2]
526
527 veor @y[3], @y[3], @y[1]
528 veor @y[1], @y[1], @t[1]
529 veor @y[0], @y[0], @t[0]
530 veor @y[3], @y[3], @t[0]
531 veor @y[1], @y[1], @t[5]
532 veor @y[0], @y[0], @t[5]
533 vext.8 @t[0], @t[0], @t[0], #12
534 veor @y[1], @y[1], @t[7]
535 veor @y[0], @y[0], @t[6]
536 veor @y[3], @y[3], @y[1]
537 veor @y[4], @y[4], @t[1]
538 vext.8 @t[1], @t[1], @t[1], #12
539
540 veor @y[7], @y[7], @t[7]
541 veor @y[4], @y[4], @t[2]
542 veor @y[5], @y[5], @t[2]
543 veor @y[2], @y[2], @t[6]
544 veor @t[6], @t[6], @t[3] @ clobber t[6]
545 vext.8 @t[2], @t[2], @t[2], #12
546 veor @y[4], @y[4], @y[7]
547 veor @y[3], @y[3], @t[6]
548
549 veor @y[6], @y[6], @t[6]
550 veor @y[5], @y[5], @t[5]
551 vext.8 @t[5], @t[5], @t[5], #12
552 veor @y[6], @y[6], @t[4]
553 vext.8 @t[4], @t[4], @t[4], #12
554 veor @y[5], @y[5], @t[6]
555 veor @y[6], @y[6], @t[7]
556 vext.8 @t[7], @t[7], @t[7], #12
557 veor @t[6], @t[6], @t[3] @ restore t[6]
558 vext.8 @t[3], @t[3], @t[3], #12
559
560 @ multiplication by 0x09
561 veor @y[4], @y[4], @y[1]
562 veor @t[1], @t[1], @y[1] @ t[1]=y[1]
563 veor @t[0], @t[0], @t[5] @ clobber t[0]
564 vext.8 @t[6], @t[6], @t[6], #12
565 veor @t[1], @t[1], @t[5]
566 veor @y[3], @y[3], @t[0]
567 veor @t[0], @t[0], @y[0] @ t[0]=y[0]
568 veor @t[1], @t[1], @t[6]
569 veor @t[6], @t[6], @t[7] @ clobber t[6]
570 veor @y[4], @y[4], @t[1]
571 veor @y[7], @y[7], @t[4]
572 veor @y[6], @y[6], @t[3]
573 veor @y[5], @y[5], @t[2]
574 veor @t[4], @t[4], @y[4] @ t[4]=y[4]
575 veor @t[3], @t[3], @y[3] @ t[3]=y[3]
576 veor @t[5], @t[5], @y[5] @ t[5]=y[5]
577 veor @t[2], @t[2], @y[2] @ t[2]=y[2]
578 veor @t[3], @t[3], @t[7]
579 veor @XMM[5], @t[5], @t[6]
580 veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
581 veor @XMM[2], @t[2], @t[6]
582 veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
583
584 vmov @XMM[0], @t[0]
585 vmov @XMM[1], @t[1]
586 @ vmov @XMM[2], @t[2]
587 vmov @XMM[3], @t[3]
588 vmov @XMM[4], @t[4]
589 @ vmov @XMM[5], @t[5]
590 @ vmov @XMM[6], @t[6]
591 @ vmov @XMM[7], @t[7]
592___
593}
594
595sub InvMixColumns {
596my @x=@_[0..7];
597my @t=@_[8..15];
598
599# Thanks to Jussi Kivilinna for providing a pointer to
600#
601# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
602# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
603# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
604# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
605
606$code.=<<___;
607 @ multiplication by 0x05-0x00-0x04-0x00
608 vext.8 @t[0], @x[0], @x[0], #8
609 vext.8 @t[6], @x[6], @x[6], #8
610 vext.8 @t[7], @x[7], @x[7], #8
611 veor @t[0], @t[0], @x[0]
612 vext.8 @t[1], @x[1], @x[1], #8
613 veor @t[6], @t[6], @x[6]
614 vext.8 @t[2], @x[2], @x[2], #8
615 veor @t[7], @t[7], @x[7]
616 vext.8 @t[3], @x[3], @x[3], #8
617 veor @t[1], @t[1], @x[1]
618 vext.8 @t[4], @x[4], @x[4], #8
619 veor @t[2], @t[2], @x[2]
620 vext.8 @t[5], @x[5], @x[5], #8
621 veor @t[3], @t[3], @x[3]
622 veor @t[4], @t[4], @x[4]
623 veor @t[5], @t[5], @x[5]
624
625 veor @x[0], @x[0], @t[6]
626 veor @x[1], @x[1], @t[6]
627 veor @x[2], @x[2], @t[0]
628 veor @x[4], @x[4], @t[2]
629 veor @x[3], @x[3], @t[1]
630 veor @x[1], @x[1], @t[7]
631 veor @x[2], @x[2], @t[7]
632 veor @x[4], @x[4], @t[6]
633 veor @x[5], @x[5], @t[3]
634 veor @x[3], @x[3], @t[6]
635 veor @x[6], @x[6], @t[4]
636 veor @x[4], @x[4], @t[7]
637 veor @x[5], @x[5], @t[7]
638 veor @x[7], @x[7], @t[5]
639___
640 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
641}
642
643sub swapmove {
644my ($a,$b,$n,$mask,$t)=@_;
645$code.=<<___;
646 vshr.u64 $t, $b, #$n
647 veor $t, $t, $a
648 vand $t, $t, $mask
649 veor $a, $a, $t
650 vshl.u64 $t, $t, #$n
651 veor $b, $b, $t
652___
653}
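# swapmove is the usual "delta swap": t = ((b >> n) ^ a) & mask; a ^= t;
# b ^= t << n. It exchanges the bit groups selected by $mask between $a and
# $b at distance $n, and is the building block of the bit-slice transposes.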
654sub swapmove2x {
655my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
656$code.=<<___;
657 vshr.u64 $t0, $b0, #$n
658 vshr.u64 $t1, $b1, #$n
659 veor $t0, $t0, $a0
660 veor $t1, $t1, $a1
661 vand $t0, $t0, $mask
662 vand $t1, $t1, $mask
663 veor $a0, $a0, $t0
664 vshl.u64 $t0, $t0, #$n
665 veor $a1, $a1, $t1
666 vshl.u64 $t1, $t1, #$n
667 veor $b0, $b0, $t0
668 veor $b1, $b1, $t1
669___
670}
671
672sub bitslice {
673my @x=reverse(@_[0..7]);
674my ($t0,$t1,$t2,$t3)=@_[8..11];
675$code.=<<___;
676 vmov.i8 $t0,#0x55 @ compose .LBS0
677 vmov.i8 $t1,#0x33 @ compose .LBS1
678___
679 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
680 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
681$code.=<<___;
682 vmov.i8 $t0,#0x0f @ compose .LBS2
683___
684 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
685 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
686
687 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
688 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
689}
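# bitslice rearranges the eight q registers so that each one holds a single
# bit plane (the same bit position of every byte of all eight blocks); one
# 128-bit NEON bitwise instruction then acts on 128 S-box inputs at once.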
690
691$code.=<<___;
692#ifndef __KERNEL__
693# include "arm_arch.h"
694
695# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
696# define VFP_ABI_POP vldmia sp!,{d8-d15}
697# define VFP_ABI_FRAME 0x40
698#else
699# define VFP_ABI_PUSH
700# define VFP_ABI_POP
701# define VFP_ABI_FRAME 0
702# define BSAES_ASM_EXTENDED_KEY
703# define XTS_CHAIN_TWEAK
704# define __ARM_ARCH__ __LINUX_ARM_ARCH__
705#endif
706
707#ifdef __thumb__
708# define adrl adr
709#endif
710
711#if __ARM_ARCH__>=7
712.text
713.syntax unified @ ARMv7-capable assembler is expected to handle this
714#ifdef __thumb2__
715.thumb
716#else
717.code 32
718#endif
719
720.fpu neon
721
722.type _bsaes_decrypt8,%function
723.align 4
724_bsaes_decrypt8:
725 adr $const,_bsaes_decrypt8
726 vldmia $key!, {@XMM[9]} @ round 0 key
727 add $const,$const,#.LM0ISR-_bsaes_decrypt8
728
729 vldmia $const!, {@XMM[8]} @ .LM0ISR
730 veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
731 veor @XMM[11], @XMM[1], @XMM[9]
732 vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
733 vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
734 veor @XMM[12], @XMM[2], @XMM[9]
735 vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
736 vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
737 veor @XMM[13], @XMM[3], @XMM[9]
738 vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
739 vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
740 veor @XMM[14], @XMM[4], @XMM[9]
741 vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
742 vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
743 veor @XMM[15], @XMM[5], @XMM[9]
744 vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
745 vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
746 veor @XMM[10], @XMM[6], @XMM[9]
747 vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
748 vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
749 veor @XMM[11], @XMM[7], @XMM[9]
750 vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
751 vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
752 vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
753 vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
754___
755 &bitslice (@XMM[0..7, 8..11]);
756$code.=<<___;
757 sub $rounds,$rounds,#1
758 b .Ldec_sbox
759.align 4
760.Ldec_loop:
761___
762 &ShiftRows (@XMM[0..7, 8..12]);
763$code.=".Ldec_sbox:\n";
764 &InvSbox (@XMM[0..7, 8..15]);
765$code.=<<___;
766 subs $rounds,$rounds,#1
767 bcc .Ldec_done
768___
769 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
770$code.=<<___;
771 vldmia $const, {@XMM[12]} @ .LISR
772 ite eq @ Thumb2 thing, sanity check in ARM
773 addeq $const,$const,#0x10
774 bne .Ldec_loop
775 vldmia $const, {@XMM[12]} @ .LISRM0
776 b .Ldec_loop
777.align 4
778.Ldec_done:
779___
780 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
781$code.=<<___;
782 vldmia $key, {@XMM[8]} @ last round key
783 veor @XMM[6], @XMM[6], @XMM[8]
784 veor @XMM[4], @XMM[4], @XMM[8]
785 veor @XMM[2], @XMM[2], @XMM[8]
786 veor @XMM[7], @XMM[7], @XMM[8]
787 veor @XMM[3], @XMM[3], @XMM[8]
788 veor @XMM[5], @XMM[5], @XMM[8]
789 veor @XMM[0], @XMM[0], @XMM[8]
790 veor @XMM[1], @XMM[1], @XMM[8]
791 bx lr
792.size _bsaes_decrypt8,.-_bsaes_decrypt8
793
794.type _bsaes_const,%object
795.align 6
796_bsaes_const:
797.LM0ISR: @ InvShiftRows constants
798 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
799.LISR:
800 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
801.LISRM0:
802 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
803.LM0SR: @ ShiftRows constants
804 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
805.LSR:
806 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
807.LSRM0:
808 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
809.LM0:
810 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
811.LREVM0SR:
812 .quad 0x090d01050c000408, 0x03070b0f060a0e02
813.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
814.align 6
815.size _bsaes_const,.-_bsaes_const
816
817.type _bsaes_encrypt8,%function
818.align 4
819_bsaes_encrypt8:
820 adr $const,_bsaes_encrypt8
821 vldmia $key!, {@XMM[9]} @ round 0 key
822 sub $const,$const,#_bsaes_encrypt8-.LM0SR
823
824 vldmia $const!, {@XMM[8]} @ .LM0SR
825_bsaes_encrypt8_alt:
826 veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
827 veor @XMM[11], @XMM[1], @XMM[9]
828 vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
829 vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
830 veor @XMM[12], @XMM[2], @XMM[9]
831 vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
832 vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
833 veor @XMM[13], @XMM[3], @XMM[9]
834 vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
835 vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
836 veor @XMM[14], @XMM[4], @XMM[9]
837 vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
838 vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
839 veor @XMM[15], @XMM[5], @XMM[9]
840 vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
841 vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
842 veor @XMM[10], @XMM[6], @XMM[9]
843 vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
844 vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
845 veor @XMM[11], @XMM[7], @XMM[9]
846 vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
847 vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
848 vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
849 vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
850_bsaes_encrypt8_bitslice:
851___
852 &bitslice (@XMM[0..7, 8..11]);
853$code.=<<___;
854 sub $rounds,$rounds,#1
855 b .Lenc_sbox
856.align 4
857.Lenc_loop:
858___
859 &ShiftRows (@XMM[0..7, 8..12]);
860$code.=".Lenc_sbox:\n";
861 &Sbox (@XMM[0..7, 8..15]);
862$code.=<<___;
863 subs $rounds,$rounds,#1
864 bcc .Lenc_done
865___
866 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
867$code.=<<___;
868 vldmia $const, {@XMM[12]} @ .LSR
869 ite eq @ Thumb2 thing, sanity check in ARM
870 addeq $const,$const,#0x10
871 bne .Lenc_loop
872 vldmia $const, {@XMM[12]} @ .LSRM0
873 b .Lenc_loop
874.align 4
875.Lenc_done:
876___
877 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
878 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
879$code.=<<___;
880 vldmia $key, {@XMM[8]} @ last round key
881 veor @XMM[4], @XMM[4], @XMM[8]
882 veor @XMM[6], @XMM[6], @XMM[8]
883 veor @XMM[3], @XMM[3], @XMM[8]
884 veor @XMM[7], @XMM[7], @XMM[8]
885 veor @XMM[2], @XMM[2], @XMM[8]
886 veor @XMM[5], @XMM[5], @XMM[8]
887 veor @XMM[0], @XMM[0], @XMM[8]
888 veor @XMM[1], @XMM[1], @XMM[8]
889 bx lr
890.size _bsaes_encrypt8,.-_bsaes_encrypt8
891___
892}
893{
894my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
895
896sub bitslice_key {
897my @x=reverse(@_[0..7]);
898my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
899
900 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
901$code.=<<___;
902 @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
903 vmov @x[2], @x[0]
904 vmov @x[3], @x[1]
905___
906 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
907
908 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
909$code.=<<___;
910 @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
911 vmov @x[4], @x[0]
912 vmov @x[6], @x[2]
913 vmov @x[5], @x[1]
914 vmov @x[7], @x[3]
915___
916 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
917 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
918}
919
920$code.=<<___;
921.type _bsaes_key_convert,%function
922.align 4
923_bsaes_key_convert:
924 adr $const,_bsaes_key_convert
925 vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
926 sub $const,$const,#_bsaes_key_convert-.LM0
927 vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
928
929 vmov.i8 @XMM[8], #0x01 @ bit masks
930 vmov.i8 @XMM[9], #0x02
931 vmov.i8 @XMM[10], #0x04
932 vmov.i8 @XMM[11], #0x08
933 vmov.i8 @XMM[12], #0x10
934 vmov.i8 @XMM[13], #0x20
935 vldmia $const, {@XMM[14]} @ .LM0
936
937#ifdef __ARMEL__
938 vrev32.8 @XMM[7], @XMM[7]
939 vrev32.8 @XMM[15], @XMM[15]
940#endif
941 sub $rounds,$rounds,#1
942 vstmia $out!, {@XMM[7]} @ save round 0 key
943 b .Lkey_loop
944
945.align 4
946.Lkey_loop:
947 vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
948 vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
949 vmov.i8 @XMM[6], #0x40
950 vmov.i8 @XMM[15], #0x80
951
952 vtst.8 @XMM[0], @XMM[7], @XMM[8]
953 vtst.8 @XMM[1], @XMM[7], @XMM[9]
954 vtst.8 @XMM[2], @XMM[7], @XMM[10]
955 vtst.8 @XMM[3], @XMM[7], @XMM[11]
956 vtst.8 @XMM[4], @XMM[7], @XMM[12]
957 vtst.8 @XMM[5], @XMM[7], @XMM[13]
958 vtst.8 @XMM[6], @XMM[7], @XMM[6]
959 vtst.8 @XMM[7], @XMM[7], @XMM[15]
960 vld1.8 {@XMM[15]}, [$inp]! @ load next round key
961 vmvn @XMM[0], @XMM[0] @ "pnot"
962 vmvn @XMM[1], @XMM[1]
963 vmvn @XMM[5], @XMM[5]
964 vmvn @XMM[6], @XMM[6]
965#ifdef __ARMEL__
966 vrev32.8 @XMM[15], @XMM[15]
967#endif
968 subs $rounds,$rounds,#1
969 vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
970 bne .Lkey_loop
971
972 vmov.i8 @XMM[7],#0x63 @ compose .L63
973 @ don't save last round key
974 bx lr
975.size _bsaes_key_convert,.-_bsaes_key_convert
976___
977}
978
979if (0) { # following four functions are unsupported interface
980 # used for benchmarking...
981$code.=<<___;
982.globl bsaes_enc_key_convert
983.type bsaes_enc_key_convert,%function
984.align 4
985bsaes_enc_key_convert:
986 stmdb sp!,{r4-r6,lr}
987 vstmdb sp!,{d8-d15} @ ABI specification says so
988
989 ldr r5,[$inp,#240] @ pass rounds
990 mov r4,$inp @ pass key
991 mov r12,$out @ pass key schedule
992 bl _bsaes_key_convert
993 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
994 vstmia r12, {@XMM[7]} @ save last round key
995
996 vldmia sp!,{d8-d15}
997 ldmia sp!,{r4-r6,pc}
998.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
999
1000.globl bsaes_encrypt_128
1001.type bsaes_encrypt_128,%function
1002.align 4
1003bsaes_encrypt_128:
1004 stmdb sp!,{r4-r6,lr}
1005 vstmdb sp!,{d8-d15} @ ABI specification says so
1006.Lenc128_loop:
1007 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1008 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1009 mov r4,$key @ pass the key
1010 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1011 mov r5,#10 @ pass rounds
1012 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1013
1014 bl _bsaes_encrypt8
1015
1016 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1017 vst1.8 {@XMM[4]}, [$out]!
1018 vst1.8 {@XMM[6]}, [$out]!
1019 vst1.8 {@XMM[3]}, [$out]!
1020 vst1.8 {@XMM[7]}, [$out]!
1021 vst1.8 {@XMM[2]}, [$out]!
1022 subs $len,$len,#0x80
1023 vst1.8 {@XMM[5]}, [$out]!
1024 bhi .Lenc128_loop
1025
1026 vldmia sp!,{d8-d15}
1027 ldmia sp!,{r4-r6,pc}
1028.size bsaes_encrypt_128,.-bsaes_encrypt_128
1029
1030.globl bsaes_dec_key_convert
1031.type bsaes_dec_key_convert,%function
1032.align 4
1033bsaes_dec_key_convert:
1034 stmdb sp!,{r4-r6,lr}
1035 vstmdb sp!,{d8-d15} @ ABI specification says so
1036
1037 ldr r5,[$inp,#240] @ pass rounds
1038 mov r4,$inp @ pass key
1039 mov r12,$out @ pass key schedule
1040 bl _bsaes_key_convert
1041 vldmia $out, {@XMM[6]}
1042 vstmia r12, {@XMM[15]} @ save last round key
1043 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1044 vstmia $out, {@XMM[7]}
1045
1046 vldmia sp!,{d8-d15}
1047 ldmia sp!,{r4-r6,pc}
1048.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1049
1050.globl bsaes_decrypt_128
1051.type bsaes_decrypt_128,%function
1052.align 4
1053bsaes_decrypt_128:
1054 stmdb sp!,{r4-r6,lr}
1055 vstmdb sp!,{d8-d15} @ ABI specification says so
1056.Ldec128_loop:
1057 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1058 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1059 mov r4,$key @ pass the key
1060 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1061 mov r5,#10 @ pass rounds
1062 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1063
1064 bl _bsaes_decrypt8
1065
1066 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1067 vst1.8 {@XMM[6]}, [$out]!
1068 vst1.8 {@XMM[4]}, [$out]!
1069 vst1.8 {@XMM[2]}, [$out]!
1070 vst1.8 {@XMM[7]}, [$out]!
1071 vst1.8 {@XMM[3]}, [$out]!
1072 subs $len,$len,#0x80
1073 vst1.8 {@XMM[5]}, [$out]!
1074 bhi .Ldec128_loop
1075
1076 vldmia sp!,{d8-d15}
1077 ldmia sp!,{r4-r6,pc}
1078.size bsaes_decrypt_128,.-bsaes_decrypt_128
1079___
1080}
1081{
1082my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
1083my ($keysched)=("sp");
1084
1085$code.=<<___;
1086.extern AES_cbc_encrypt
1087.extern AES_decrypt
1088
1089.global bsaes_cbc_encrypt
1090.type bsaes_cbc_encrypt,%function
1091.align 5
1092bsaes_cbc_encrypt:
1093#ifndef __KERNEL__
1094 cmp $len, #128
1095#ifndef __thumb__
1096 blo AES_cbc_encrypt
1097#else
1098 bhs 1f
1099 b AES_cbc_encrypt
11001:
1101#endif
1102#endif
1103
1104 @ it is up to the caller to make sure we are called with enc == 0
1105
1106 mov ip, sp
1107 stmdb sp!, {r4-r10, lr}
1108 VFP_ABI_PUSH
1109 ldr $ivp, [ip] @ IV is 1st arg on the stack
1110 mov $len, $len, lsr#4 @ len in 16 byte blocks
1111 sub sp, #0x10 @ scratch space to carry over the IV
1112 mov $fp, sp @ save sp
1113
1114 ldr $rounds, [$key, #240] @ get # of rounds
1115#ifndef BSAES_ASM_EXTENDED_KEY
1116 @ allocate the key schedule on the stack
1117 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1118 add r12, #`128-32` @ size of bit-sliced key schedule
1119
1120 @ populate the key schedule
1121 mov r4, $key @ pass key
1122 mov r5, $rounds @ pass # of rounds
1123 mov sp, r12 @ sp is $keysched
1124 bl _bsaes_key_convert
1125 vldmia $keysched, {@XMM[6]}
1126 vstmia r12, {@XMM[15]} @ save last round key
1127 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1128 vstmia $keysched, {@XMM[7]}
1129#else
1130 ldr r12, [$key, #244]
1131 eors r12, #1
1132 beq 0f
1133
1134 @ populate the key schedule
1135 str r12, [$key, #244]
1136 mov r4, $key @ pass key
1137 mov r5, $rounds @ pass # of rounds
1138 add r12, $key, #248 @ pass key schedule
1139 bl _bsaes_key_convert
1140 add r4, $key, #248
1141 vldmia r4, {@XMM[6]}
1142 vstmia r12, {@XMM[15]} @ save last round key
1143 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1144 vstmia r4, {@XMM[7]}
1145
1146.align 2
11470:
1148#endif
1149
1150 vld1.8 {@XMM[15]}, [$ivp] @ load IV
1151 b .Lcbc_dec_loop
1152
1153.align 4
1154.Lcbc_dec_loop:
1155 subs $len, $len, #0x8
1156 bmi .Lcbc_dec_loop_finish
1157
1158 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1159 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1160#ifndef BSAES_ASM_EXTENDED_KEY
1161 mov r4, $keysched @ pass the key
1162#else
1163 add r4, $key, #248
1164#endif
1165 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1166 mov r5, $rounds
1167 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
1168 sub $inp, $inp, #0x60
1169 vstmia $fp, {@XMM[15]} @ put aside IV
1170
1171 bl _bsaes_decrypt8
1172
1173 vldmia $fp, {@XMM[14]} @ reload IV
1174 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1175 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1176 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1177 veor @XMM[1], @XMM[1], @XMM[8]
1178 veor @XMM[6], @XMM[6], @XMM[9]
1179 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1180 veor @XMM[4], @XMM[4], @XMM[10]
1181 veor @XMM[2], @XMM[2], @XMM[11]
1182 vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1183 veor @XMM[7], @XMM[7], @XMM[12]
1184 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1185 veor @XMM[3], @XMM[3], @XMM[13]
1186 vst1.8 {@XMM[6]}, [$out]!
1187 veor @XMM[5], @XMM[5], @XMM[14]
1188 vst1.8 {@XMM[4]}, [$out]!
1189 vst1.8 {@XMM[2]}, [$out]!
1190 vst1.8 {@XMM[7]}, [$out]!
1191 vst1.8 {@XMM[3]}, [$out]!
1192 vst1.8 {@XMM[5]}, [$out]!
1193
1194 b .Lcbc_dec_loop
1195
1196.Lcbc_dec_loop_finish:
1197 adds $len, $len, #8
1198 beq .Lcbc_dec_done
1199
1200 vld1.8 {@XMM[0]}, [$inp]! @ load input
1201 cmp $len, #2
1202 blo .Lcbc_dec_one
1203 vld1.8 {@XMM[1]}, [$inp]!
1204#ifndef BSAES_ASM_EXTENDED_KEY
1205 mov r4, $keysched @ pass the key
1206#else
1207 add r4, $key, #248
1208#endif
1209 mov r5, $rounds
1210 vstmia $fp, {@XMM[15]} @ put aside IV
1211 beq .Lcbc_dec_two
1212 vld1.8 {@XMM[2]}, [$inp]!
1213 cmp $len, #4
1214 blo .Lcbc_dec_three
1215 vld1.8 {@XMM[3]}, [$inp]!
1216 beq .Lcbc_dec_four
1217 vld1.8 {@XMM[4]}, [$inp]!
1218 cmp $len, #6
1219 blo .Lcbc_dec_five
1220 vld1.8 {@XMM[5]}, [$inp]!
1221 beq .Lcbc_dec_six
1222 vld1.8 {@XMM[6]}, [$inp]!
1223 sub $inp, $inp, #0x70
1224
1225 bl _bsaes_decrypt8
1226
1227 vldmia $fp, {@XMM[14]} @ reload IV
1228 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1229 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1230 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1231 veor @XMM[1], @XMM[1], @XMM[8]
1232 veor @XMM[6], @XMM[6], @XMM[9]
1233 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1234 veor @XMM[4], @XMM[4], @XMM[10]
1235 veor @XMM[2], @XMM[2], @XMM[11]
1236 vld1.8 {@XMM[15]}, [$inp]!
1237 veor @XMM[7], @XMM[7], @XMM[12]
1238 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1239 veor @XMM[3], @XMM[3], @XMM[13]
1240 vst1.8 {@XMM[6]}, [$out]!
1241 vst1.8 {@XMM[4]}, [$out]!
1242 vst1.8 {@XMM[2]}, [$out]!
1243 vst1.8 {@XMM[7]}, [$out]!
1244 vst1.8 {@XMM[3]}, [$out]!
1245 b .Lcbc_dec_done
1246.align 4
1247.Lcbc_dec_six:
1248 sub $inp, $inp, #0x60
1249 bl _bsaes_decrypt8
1250 vldmia $fp,{@XMM[14]} @ reload IV
1251 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1252 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1253 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1254 veor @XMM[1], @XMM[1], @XMM[8]
1255 veor @XMM[6], @XMM[6], @XMM[9]
1256 vld1.8 {@XMM[12]}, [$inp]!
1257 veor @XMM[4], @XMM[4], @XMM[10]
1258 veor @XMM[2], @XMM[2], @XMM[11]
1259 vld1.8 {@XMM[15]}, [$inp]!
1260 veor @XMM[7], @XMM[7], @XMM[12]
1261 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1262 vst1.8 {@XMM[6]}, [$out]!
1263 vst1.8 {@XMM[4]}, [$out]!
1264 vst1.8 {@XMM[2]}, [$out]!
1265 vst1.8 {@XMM[7]}, [$out]!
1266 b .Lcbc_dec_done
1267.align 4
1268.Lcbc_dec_five:
1269 sub $inp, $inp, #0x50
1270 bl _bsaes_decrypt8
1271 vldmia $fp, {@XMM[14]} @ reload IV
1272 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1273 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1274 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1275 veor @XMM[1], @XMM[1], @XMM[8]
1276 veor @XMM[6], @XMM[6], @XMM[9]
1277 vld1.8 {@XMM[15]}, [$inp]!
1278 veor @XMM[4], @XMM[4], @XMM[10]
1279 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1280 veor @XMM[2], @XMM[2], @XMM[11]
1281 vst1.8 {@XMM[6]}, [$out]!
1282 vst1.8 {@XMM[4]}, [$out]!
1283 vst1.8 {@XMM[2]}, [$out]!
1284 b .Lcbc_dec_done
1285.align 4
1286.Lcbc_dec_four:
1287 sub $inp, $inp, #0x40
1288 bl _bsaes_decrypt8
1289 vldmia $fp, {@XMM[14]} @ reload IV
1290 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1291 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1292 vld1.8 {@XMM[10]}, [$inp]!
1293 veor @XMM[1], @XMM[1], @XMM[8]
1294 veor @XMM[6], @XMM[6], @XMM[9]
1295 vld1.8 {@XMM[15]}, [$inp]!
1296 veor @XMM[4], @XMM[4], @XMM[10]
1297 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1298 vst1.8 {@XMM[6]}, [$out]!
1299 vst1.8 {@XMM[4]}, [$out]!
1300 b .Lcbc_dec_done
1301.align 4
1302.Lcbc_dec_three:
1303 sub $inp, $inp, #0x30
1304 bl _bsaes_decrypt8
1305 vldmia $fp, {@XMM[14]} @ reload IV
1306 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1307 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1308 vld1.8 {@XMM[15]}, [$inp]!
1309 veor @XMM[1], @XMM[1], @XMM[8]
1310 veor @XMM[6], @XMM[6], @XMM[9]
1311 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1312 vst1.8 {@XMM[6]}, [$out]!
1313 b .Lcbc_dec_done
1314.align 4
1315.Lcbc_dec_two:
1316 sub $inp, $inp, #0x20
1317 bl _bsaes_decrypt8
1318 vldmia $fp, {@XMM[14]} @ reload IV
1319 vld1.8 {@XMM[8]}, [$inp]! @ reload input
1320 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1321 vld1.8 {@XMM[15]}, [$inp]! @ reload input
1322 veor @XMM[1], @XMM[1], @XMM[8]
1323 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1324 b .Lcbc_dec_done
1325.align 4
1326.Lcbc_dec_one:
1327 sub $inp, $inp, #0x10
1328 mov $rounds, $out @ save original out pointer
1329 mov $out, $fp @ use the iv scratch space as out buffer
1330 mov r2, $key
1331 vmov @XMM[4],@XMM[15] @ just in case ensure that IV
1332 vmov @XMM[5],@XMM[0] @ and input are preserved
1333 bl AES_decrypt
1334 vld1.8 {@XMM[0]}, [$fp,:64] @ load result
1335 veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
1336 vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
1337 vst1.8 {@XMM[0]}, [$rounds] @ write output
1338
1339.Lcbc_dec_done:
1340#ifndef BSAES_ASM_EXTENDED_KEY
1341 vmov.i32 q0, #0
1342 vmov.i32 q1, #0
1343.Lcbc_dec_bzero: @ wipe key schedule [if any]
1344 vstmia $keysched!, {q0-q1}
1345 cmp $keysched, $fp
1346 bne .Lcbc_dec_bzero
1347#endif
1348
1349 mov sp, $fp
1350 add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1351 vst1.8 {@XMM[15]}, [$ivp] @ return IV
1352 VFP_ABI_POP
1353 ldmia sp!, {r4-r10, pc}
1354.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1355___
1356}
1357{
1358my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
1359my $const = "r6"; # shared with _bsaes_encrypt8_alt
1360my $keysched = "sp";
1361
1362$code.=<<___;
1363.extern AES_encrypt
1364.global bsaes_ctr32_encrypt_blocks
1365.type bsaes_ctr32_encrypt_blocks,%function
1366.align 5
1367bsaes_ctr32_encrypt_blocks:
1368 cmp $len, #8 @ use plain AES for
1369 blo .Lctr_enc_short @ small sizes
1370
1371 mov ip, sp
1372 stmdb sp!, {r4-r10, lr}
1373 VFP_ABI_PUSH
1374 ldr $ctr, [ip] @ ctr is 1st arg on the stack
1375 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1376 mov $fp, sp @ save sp
1377
1378 ldr $rounds, [$key, #240] @ get # of rounds
1379#ifndef BSAES_ASM_EXTENDED_KEY
1380 @ allocate the key schedule on the stack
1381 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1382 add r12, #`128-32` @ size of bit-sliced key schedule
1383
1384 @ populate the key schedule
1385 mov r4, $key @ pass key
1386 mov r5, $rounds @ pass # of rounds
1387 mov sp, r12 @ sp is $keysched
1388 bl _bsaes_key_convert
1389 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1390 vstmia r12, {@XMM[7]} @ save last round key
1391
1392 vld1.8 {@XMM[0]}, [$ctr] @ load counter
1393 add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
1394 vldmia $keysched, {@XMM[4]} @ load round0 key
1395#else
1396 ldr r12, [$key, #244]
1397 eors r12, #1
1398 beq 0f
1399
1400 @ populate the key schedule
1401 str r12, [$key, #244]
1402 mov r4, $key @ pass key
1403 mov r5, $rounds @ pass # of rounds
1404 add r12, $key, #248 @ pass key schedule
1405 bl _bsaes_key_convert
1406 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1407 vstmia r12, {@XMM[7]} @ save last round key
1408
1409.align 2
14100: add r12, $key, #248
1411 vld1.8 {@XMM[0]}, [$ctr] @ load counter
1412 adrl $ctr, .LREVM0SR @ borrow $ctr
1413 vldmia r12, {@XMM[4]} @ load round0 key
1414 sub sp, #0x10 @ place for adjusted round0 key
1415#endif
1416
1417 vmov.i32 @XMM[8],#1 @ compose 1<<96
1418 veor @XMM[9],@XMM[9],@XMM[9]
1419 vrev32.8 @XMM[0],@XMM[0]
1420 vext.8 @XMM[8],@XMM[9],@XMM[8],#4
1421 vrev32.8 @XMM[4],@XMM[4]
1422 vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1423 vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
1424 b .Lctr_enc_loop
1425
1426.align 4
1427.Lctr_enc_loop:
1428 vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
1429 vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
1430 vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
1431 vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
1432 vadd.u32 @XMM[4], @XMM[1], @XMM[10]
1433 vadd.u32 @XMM[5], @XMM[2], @XMM[10]
1434 vadd.u32 @XMM[6], @XMM[3], @XMM[10]
1435 vadd.u32 @XMM[7], @XMM[4], @XMM[10]
1436 vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
1437
1438 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1439 @ to flip byte order in 32-bit counter
1440
1441 vldmia $keysched, {@XMM[9]} @ load round0 key
1442#ifndef BSAES_ASM_EXTENDED_KEY
1443 add r4, $keysched, #0x10 @ pass next round key
1444#else
1445 add r4, $key, #`248+16`
1446#endif
1447 vldmia $ctr, {@XMM[8]} @ .LREVM0SR
1448 mov r5, $rounds @ pass rounds
1449 vstmia $fp, {@XMM[10]} @ save next counter
1450 sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
1451
1452 bl _bsaes_encrypt8_alt
1453
1454 subs $len, $len, #8
1455 blo .Lctr_enc_loop_done
1456
1457 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
1458 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1459 veor @XMM[0], @XMM[8]
1460 veor @XMM[1], @XMM[9]
1461 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1462 veor @XMM[4], @XMM[10]
1463 veor @XMM[6], @XMM[11]
1464 vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1465 veor @XMM[3], @XMM[12]
1466 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1467 veor @XMM[7], @XMM[13]
1468 veor @XMM[2], @XMM[14]
1469 vst1.8 {@XMM[4]}, [$out]!
1470 veor @XMM[5], @XMM[15]
1471 vst1.8 {@XMM[6]}, [$out]!
1472 vmov.i32 @XMM[8], #1 @ compose 1<<96
1473 vst1.8 {@XMM[3]}, [$out]!
1474 veor @XMM[9], @XMM[9], @XMM[9]
1475 vst1.8 {@XMM[7]}, [$out]!
1476 vext.8 @XMM[8], @XMM[9], @XMM[8], #4
1477 vst1.8 {@XMM[2]}, [$out]!
1478 vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1479 vst1.8 {@XMM[5]}, [$out]!
1480 vldmia $fp, {@XMM[0]} @ load counter
1481
1482 bne .Lctr_enc_loop
1483 b .Lctr_enc_done
1484
1485.align 4
1486.Lctr_enc_loop_done:
1487 add $len, $len, #8
1488 vld1.8 {@XMM[8]}, [$inp]! @ load input
1489 veor @XMM[0], @XMM[8]
1490 vst1.8 {@XMM[0]}, [$out]! @ write output
1491 cmp $len, #2
1492 blo .Lctr_enc_done
1493 vld1.8 {@XMM[9]}, [$inp]!
1494 veor @XMM[1], @XMM[9]
1495 vst1.8 {@XMM[1]}, [$out]!
1496 beq .Lctr_enc_done
1497 vld1.8 {@XMM[10]}, [$inp]!
1498 veor @XMM[4], @XMM[10]
1499 vst1.8 {@XMM[4]}, [$out]!
1500 cmp $len, #4
1501 blo .Lctr_enc_done
1502 vld1.8 {@XMM[11]}, [$inp]!
1503 veor @XMM[6], @XMM[11]
1504 vst1.8 {@XMM[6]}, [$out]!
1505 beq .Lctr_enc_done
1506 vld1.8 {@XMM[12]}, [$inp]!
1507 veor @XMM[3], @XMM[12]
1508 vst1.8 {@XMM[3]}, [$out]!
1509 cmp $len, #6
1510 blo .Lctr_enc_done
1511 vld1.8 {@XMM[13]}, [$inp]!
1512 veor @XMM[7], @XMM[13]
1513 vst1.8 {@XMM[7]}, [$out]!
1514 beq .Lctr_enc_done
1515 vld1.8 {@XMM[14]}, [$inp]
1516 veor @XMM[2], @XMM[14]
1517 vst1.8 {@XMM[2]}, [$out]!
1518
1519.Lctr_enc_done:
1520 vmov.i32 q0, #0
1521 vmov.i32 q1, #0
1522#ifndef BSAES_ASM_EXTENDED_KEY
1523.Lctr_enc_bzero: @ wipe key schedule [if any]
1524 vstmia $keysched!, {q0-q1}
1525 cmp $keysched, $fp
1526 bne .Lctr_enc_bzero
1527#else
1528 vstmia $keysched, {q0-q1}
1529#endif
1530
1531 mov sp, $fp
1532 add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1533 VFP_ABI_POP
1534 ldmia sp!, {r4-r10, pc} @ return
1535
1536.align 4
1537.Lctr_enc_short:
1538 ldr ip, [sp] @ ctr pointer is passed on stack
1539 stmdb sp!, {r4-r8, lr}
1540
1541 mov r4, $inp @ copy arguments
1542 mov r5, $out
1543 mov r6, $len
1544 mov r7, $key
1545 ldr r8, [ip, #12] @ load counter LSW
1546 vld1.8 {@XMM[1]}, [ip] @ load whole counter value
1547#ifdef __ARMEL__
1548 rev r8, r8
1549#endif
1550 sub sp, sp, #0x10
1551 vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
1552 sub sp, sp, #0x10
1553
1554.Lctr_enc_short_loop:
1555 add r0, sp, #0x10 @ input counter value
1556 mov r1, sp @ output on the stack
1557 mov r2, r7 @ key
1558
1559 bl AES_encrypt
1560
1561 vld1.8 {@XMM[0]}, [r4]! @ load input
1562 vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
1563 add r8, r8, #1
1564#ifdef __ARMEL__
1565 rev r0, r8
1566 str r0, [sp, #0x1c] @ next counter value
1567#else
1568 str r8, [sp, #0x1c] @ next counter value
1569#endif
1570 veor @XMM[0],@XMM[0],@XMM[1]
1571 vst1.8 {@XMM[0]}, [r5]! @ store output
1572 subs r6, r6, #1
1573 bne .Lctr_enc_short_loop
1574
1575 vmov.i32 q0, #0
1576 vmov.i32 q1, #0
1577 vstmia sp!, {q0-q1}
1578
1579 ldmia sp!, {r4-r8, pc}
1580.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1581___
1582}
1583{
1584######################################################################
1585# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1586# const AES_KEY *key1, const AES_KEY *key2,
1587# const unsigned char iv[16]);
1588#
1589my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
1590my $const="r6"; # returned by _bsaes_key_convert
1591my $twmask=@XMM[5];
1592my @T=@XMM[6..7];
1593
1594$code.=<<___;
1595.globl bsaes_xts_encrypt
1596.type bsaes_xts_encrypt,%function
1597.align 4
1598bsaes_xts_encrypt:
1599 mov ip, sp
1600 stmdb sp!, {r4-r10, lr} @ 0x20
1601 VFP_ABI_PUSH
1602 mov r6, sp @ future $fp
1603
1604 mov $inp, r0
1605 mov $out, r1
1606 mov $len, r2
1607 mov $key, r3
1608
1609 sub r0, sp, #0x10 @ 0x10
1610 bic r0, #0xf @ align at 16 bytes
1611 mov sp, r0
1612
1613#ifdef XTS_CHAIN_TWEAK
1614 ldr r0, [ip] @ pointer to input tweak
1615#else
1616 @ generate initial tweak
1617 ldr r0, [ip, #4] @ iv[]
1618 mov r1, sp
1619 ldr r2, [ip, #0] @ key2
1620 bl AES_encrypt
1621 mov r0,sp @ pointer to initial tweak
1622#endif
1623
1624 ldr $rounds, [$key, #240] @ get # of rounds
1625 mov $fp, r6
1626#ifndef BSAES_ASM_EXTENDED_KEY
1627 @ allocate the key schedule on the stack
1628 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1629 @ add r12, #`128-32` @ size of bit-sliced key schedule
1630 sub r12, #`32+16` @ place for tweak[9]
1631
1632 @ populate the key schedule
1633 mov r4, $key @ pass key
1634 mov r5, $rounds @ pass # of rounds
1635 mov sp, r12
1636 add r12, #0x90 @ pass key schedule
1637 bl _bsaes_key_convert
1638 veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1639 vstmia r12, {@XMM[7]} @ save last round key
1640#else
1641 ldr r12, [$key, #244]
1642 eors r12, #1
1643 beq 0f
1644
1645 str r12, [$key, #244]
1646 mov r4, $key @ pass key
1647 mov r5, $rounds @ pass # of rounds
1648 add r12, $key, #248 @ pass key schedule
1649 bl _bsaes_key_convert
1650 veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1651 vstmia r12, {@XMM[7]}
1652
1653.align 2
16540: sub sp, #0x90 @ place for tweak[9]
1655#endif
1656
1657 vld1.8 {@XMM[8]}, [r0] @ initial tweak
1658 adr $magic, .Lxts_magic
1659
1660 subs $len, #0x80
1661 blo .Lxts_enc_short
1662 b .Lxts_enc_loop
1663
1664.align 4
1665.Lxts_enc_loop:
1666 vldmia $magic, {$twmask} @ load XTS magic
1667 vshr.s64 @T[0], @XMM[8], #63
1668 mov r0, sp
1669 vand @T[0], @T[0], $twmask
1670___
1671for($i=9;$i<16;$i++) {
1672$code.=<<___;
1673 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1674 vst1.64 {@XMM[$i-1]}, [r0,:128]!
1675 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1676 vshr.s64 @T[1], @XMM[$i], #63
1677 veor @XMM[$i], @XMM[$i], @T[0]
1678 vand @T[1], @T[1], $twmask
1679___
1680 @T=reverse(@T);
1681
1682$code.=<<___ if ($i>=10);
1683 vld1.8 {@XMM[$i-10]}, [$inp]!
1684___
1685$code.=<<___ if ($i>=11);
1686 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1687___
1688}
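# The loop above computes the successive XTS tweaks: each step multiplies the
# previous tweak by x in GF(2^128). vadd.u64 doubles the two 64-bit halves,
# vshr.s64 #63 extracts their top bits as carry masks, and the vand/vswp/veor
# against .Lxts_magic ({1, 0x87}) carries the low half into the high half and
# folds the bit shifted out of position 127 back in as x^7+x^2+x+1 (0x87).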
1689$code.=<<___;
1690 vadd.u64 @XMM[8], @XMM[15], @XMM[15]
1691 vst1.64 {@XMM[15]}, [r0,:128]!
1692 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1693 veor @XMM[8], @XMM[8], @T[0]
1694 vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1695
1696 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1697 veor @XMM[5], @XMM[5], @XMM[13]
1698#ifndef BSAES_ASM_EXTENDED_KEY
1699 add r4, sp, #0x90 @ pass key schedule
1700#else
1701 add r4, $key, #248 @ pass key schedule
1702#endif
1703 veor @XMM[6], @XMM[6], @XMM[14]
1704 mov r5, $rounds @ pass rounds
1705 veor @XMM[7], @XMM[7], @XMM[15]
1706 mov r0, sp
1707
1708 bl _bsaes_encrypt8
1709
1710 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1711 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1712 veor @XMM[0], @XMM[0], @XMM[ 8]
1713 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1714 veor @XMM[1], @XMM[1], @XMM[ 9]
1715 veor @XMM[8], @XMM[4], @XMM[10]
1716 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1717 veor @XMM[9], @XMM[6], @XMM[11]
1718 vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
1719 veor @XMM[10], @XMM[3], @XMM[12]
1720 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1721 veor @XMM[11], @XMM[7], @XMM[13]
1722 veor @XMM[12], @XMM[2], @XMM[14]
1723 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1724 veor @XMM[13], @XMM[5], @XMM[15]
1725 vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
1726
1727 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1728
1729 subs $len, #0x80
1730 bpl .Lxts_enc_loop
1731
1732.Lxts_enc_short:
1733 adds $len, #0x70
1734 bmi .Lxts_enc_done
1735
1736 vldmia $magic, {$twmask} @ load XTS magic
1737 vshr.s64 @T[0], @XMM[8], #63
1738 mov r0, sp
1739 vand @T[0], @T[0], $twmask
1740___
1741for($i=9;$i<16;$i++) {
1742$code.=<<___;
1743 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1744 vst1.64 {@XMM[$i-1]}, [r0,:128]!
1745 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1746 vshr.s64 @T[1], @XMM[$i], #63
1747 veor @XMM[$i], @XMM[$i], @T[0]
1748 vand @T[1], @T[1], $twmask
1749___
1750 @T=reverse(@T);
1751
1752$code.=<<___ if ($i>=10);
1753 vld1.8 {@XMM[$i-10]}, [$inp]!
1754 subs $len, #0x10
1755 bmi .Lxts_enc_`$i-9`
1756___
1757$code.=<<___ if ($i>=11);
1758 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1759___
1760}
1761$code.=<<___;
1762 sub $len, #0x10
1763 vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
1764
1765 vld1.8 {@XMM[6]}, [$inp]!
1766 veor @XMM[5], @XMM[5], @XMM[13]
1767#ifndef BSAES_ASM_EXTENDED_KEY
1768 add r4, sp, #0x90 @ pass key schedule
1769#else
1770 add r4, $key, #248 @ pass key schedule
1771#endif
1772 veor @XMM[6], @XMM[6], @XMM[14]
1773 mov r5, $rounds @ pass rounds
1774 mov r0, sp
1775
1776 bl _bsaes_encrypt8
1777
1778 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1779 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1780 veor @XMM[0], @XMM[0], @XMM[ 8]
1781 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1782 veor @XMM[1], @XMM[1], @XMM[ 9]
1783 veor @XMM[8], @XMM[4], @XMM[10]
1784 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1785 veor @XMM[9], @XMM[6], @XMM[11]
1786 vld1.64 {@XMM[14]}, [r0,:128]!
1787 veor @XMM[10], @XMM[3], @XMM[12]
1788 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1789 veor @XMM[11], @XMM[7], @XMM[13]
1790 veor @XMM[12], @XMM[2], @XMM[14]
1791 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1792 vst1.8 {@XMM[12]}, [$out]!
1793
1794 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1795 b .Lxts_enc_done
1796.align 4
1797.Lxts_enc_6:
1798 vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
1799
1800 veor @XMM[4], @XMM[4], @XMM[12]
1801#ifndef BSAES_ASM_EXTENDED_KEY
1802 add r4, sp, #0x90 @ pass key schedule
1803#else
1804 add r4, $key, #248 @ pass key schedule
1805#endif
1806 veor @XMM[5], @XMM[5], @XMM[13]
1807 mov r5, $rounds @ pass rounds
1808 mov r0, sp
1809
1810 bl _bsaes_encrypt8
1811
1812 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1813 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1814 veor @XMM[0], @XMM[0], @XMM[ 8]
1815 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1816 veor @XMM[1], @XMM[1], @XMM[ 9]
1817 veor @XMM[8], @XMM[4], @XMM[10]
1818 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1819 veor @XMM[9], @XMM[6], @XMM[11]
1820 veor @XMM[10], @XMM[3], @XMM[12]
1821 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1822 veor @XMM[11], @XMM[7], @XMM[13]
1823 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1824
1825 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1826 b .Lxts_enc_done
1827
1828@ put this in range for both ARM and Thumb mode adr instructions
1829.align 5
1830.Lxts_magic:
1831 .quad 1, 0x87
1832
1833.align 5
1834.Lxts_enc_5:
1835 vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
1836
1837 veor @XMM[3], @XMM[3], @XMM[11]
1838#ifndef BSAES_ASM_EXTENDED_KEY
1839 add r4, sp, #0x90 @ pass key schedule
1840#else
1841 add r4, $key, #248 @ pass key schedule
1842#endif
1843 veor @XMM[4], @XMM[4], @XMM[12]
1844 mov r5, $rounds @ pass rounds
1845 mov r0, sp
1846
1847 bl _bsaes_encrypt8
1848
1849 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1850 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1851 veor @XMM[0], @XMM[0], @XMM[ 8]
1852 vld1.64 {@XMM[12]}, [r0,:128]!
1853 veor @XMM[1], @XMM[1], @XMM[ 9]
1854 veor @XMM[8], @XMM[4], @XMM[10]
1855 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1856 veor @XMM[9], @XMM[6], @XMM[11]
1857 veor @XMM[10], @XMM[3], @XMM[12]
1858 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1859 vst1.8 {@XMM[10]}, [$out]!
1860
1861 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1862 b .Lxts_enc_done
1863.align 4
1864.Lxts_enc_4:
1865 vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
1866
1867 veor @XMM[2], @XMM[2], @XMM[10]
1868#ifndef BSAES_ASM_EXTENDED_KEY
1869 add r4, sp, #0x90 @ pass key schedule
1870#else
1871 add r4, $key, #248 @ pass key schedule
1872#endif
1873 veor @XMM[3], @XMM[3], @XMM[11]
1874 mov r5, $rounds @ pass rounds
1875 mov r0, sp
1876
1877 bl _bsaes_encrypt8
1878
1879 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1880 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1881 veor @XMM[0], @XMM[0], @XMM[ 8]
1882 veor @XMM[1], @XMM[1], @XMM[ 9]
1883 veor @XMM[8], @XMM[4], @XMM[10]
1884 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1885 veor @XMM[9], @XMM[6], @XMM[11]
1886 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1887
1888 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1889 b .Lxts_enc_done
1890.align 4
1891.Lxts_enc_3:
1892 vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
1893
1894 veor @XMM[1], @XMM[1], @XMM[9]
1895#ifndef BSAES_ASM_EXTENDED_KEY
1896 add r4, sp, #0x90 @ pass key schedule
1897#else
1898 add r4, $key, #248 @ pass key schedule
1899#endif
1900 veor @XMM[2], @XMM[2], @XMM[10]
1901 mov r5, $rounds @ pass rounds
1902 mov r0, sp
1903
1904 bl _bsaes_encrypt8
1905
1906 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
1907 vld1.64 {@XMM[10]}, [r0,:128]!
1908 veor @XMM[0], @XMM[0], @XMM[ 8]
1909 veor @XMM[1], @XMM[1], @XMM[ 9]
1910 veor @XMM[8], @XMM[4], @XMM[10]
1911 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1912 vst1.8 {@XMM[8]}, [$out]!
1913
1914 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1915 b .Lxts_enc_done
1916.align 4
1917.Lxts_enc_2:
1918 vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
1919
1920 veor @XMM[0], @XMM[0], @XMM[8]
1921#ifndef BSAES_ASM_EXTENDED_KEY
1922 add r4, sp, #0x90 @ pass key schedule
1923#else
1924 add r4, $key, #248 @ pass key schedule
1925#endif
1926 veor @XMM[1], @XMM[1], @XMM[9]
1927 mov r5, $rounds @ pass rounds
1928 mov r0, sp
1929
1930 bl _bsaes_encrypt8
1931
1932 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
1933 veor @XMM[0], @XMM[0], @XMM[ 8]
1934 veor @XMM[1], @XMM[1], @XMM[ 9]
1935 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1936
1937 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1938 b .Lxts_enc_done
1939.align 4
1940.Lxts_enc_1:
1941 mov r0, sp
1942 veor @XMM[0], @XMM[8]
1943 mov r1, sp
1944 vst1.8 {@XMM[0]}, [sp,:128]
1945 mov r2, $key
1946 mov r4, $fp @ preserve fp
1947
1948 bl AES_encrypt
1949
1950 vld1.8 {@XMM[0]}, [sp,:128]
1951 veor @XMM[0], @XMM[0], @XMM[8]
1952 vst1.8 {@XMM[0]}, [$out]!
1953 mov $fp, r4
1954
1955 vmov @XMM[8], @XMM[9] @ next round tweak
1956
1957.Lxts_enc_done:
1958#ifndef XTS_CHAIN_TWEAK
1959 adds $len, #0x10
1960 beq .Lxts_enc_ret
1961 sub r6, $out, #0x10
1962
1963.Lxts_enc_steal:
1964 ldrb r0, [$inp], #1
1965 ldrb r1, [$out, #-0x10]
1966 strb r0, [$out, #-0x10]
1967 strb r1, [$out], #1
1968
1969 subs $len, #1
1970 bhi .Lxts_enc_steal
1971
1972 vld1.8 {@XMM[0]}, [r6]
1973 mov r0, sp
1974 veor @XMM[0], @XMM[0], @XMM[8]
1975 mov r1, sp
1976 vst1.8 {@XMM[0]}, [sp,:128]
1977 mov r2, $key
1978 mov r4, $fp @ preserve fp
1979
1980 bl AES_encrypt
1981
1982 vld1.8 {@XMM[0]}, [sp,:128]
1983 veor @XMM[0], @XMM[0], @XMM[8]
1984 vst1.8 {@XMM[0]}, [r6]
1985 mov $fp, r4
1986#endif
1987
1988.Lxts_enc_ret:
1989 bic r0, $fp, #0xf
1990 vmov.i32 q0, #0
1991 vmov.i32 q1, #0
1992#ifdef XTS_CHAIN_TWEAK
1993 ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
1994#endif
1995.Lxts_enc_bzero: @ wipe key schedule [if any]
1996 vstmia sp!, {q0-q1}
1997 cmp sp, r0
1998 bne .Lxts_enc_bzero
1999
2000 mov sp, $fp
2001#ifdef XTS_CHAIN_TWEAK
2002 vst1.8 {@XMM[8]}, [r1]
2003#endif
2004 VFP_ABI_POP
2005 ldmia sp!, {r4-r10, pc} @ return
2006
2007.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2008
2009.globl bsaes_xts_decrypt
2010.type bsaes_xts_decrypt,%function
2011.align 4
2012bsaes_xts_decrypt:
2013 mov ip, sp
2014 stmdb sp!, {r4-r10, lr} @ 0x20
2015 VFP_ABI_PUSH
2016 mov r6, sp @ future $fp
2017
2018 mov $inp, r0
2019 mov $out, r1
2020 mov $len, r2
2021 mov $key, r3
2022
2023 sub r0, sp, #0x10 @ 0x10
2024 bic r0, #0xf @ align at 16 bytes
2025 mov sp, r0
2026
2027#ifdef XTS_CHAIN_TWEAK
2028 ldr r0, [ip] @ pointer to input tweak
2029#else
2030 @ generate initial tweak
2031 ldr r0, [ip, #4] @ iv[]
2032 mov r1, sp
2033 ldr r2, [ip, #0] @ key2
2034 bl AES_encrypt
2035 mov r0, sp @ pointer to initial tweak
2036#endif
2037
2038 ldr $rounds, [$key, #240] @ get # of rounds
2039 mov $fp, r6
2040#ifndef BSAES_ASM_EXTENDED_KEY
2041 @ allocate the key schedule on the stack
2042 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
2043 @ add r12, #`128-32` @ size of bit-sliced key schedule
2044 sub r12, #`32+16` @ place for tweak[9]
2045
2046 @ populate the key schedule
2047 mov r4, $key @ pass key
2048 mov r5, $rounds @ pass # of rounds
2049 mov sp, r12
2050 add r12, #0x90 @ pass key schedule
2051 bl _bsaes_key_convert
2052 add r4, sp, #0x90
2053 vldmia r4, {@XMM[6]}
2054 vstmia r12, {@XMM[15]} @ save last round key
2055 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
2056 vstmia r4, {@XMM[7]}
2057#else
2058 ldr r12, [$key, #244]
2059 eors r12, #1
2060 beq 0f
2061
2062 str r12, [$key, #244]
2063 mov r4, $key @ pass key
2064 mov r5, $rounds @ pass # of rounds
2065 add r12, $key, #248 @ pass key schedule
2066 bl _bsaes_key_convert
2067 add r4, $key, #248
2068 vldmia r4, {@XMM[6]}
2069 vstmia r12, {@XMM[15]} @ save last round key
2070 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
2071 vstmia r4, {@XMM[7]}
2072
2073.align 2
20740: sub sp, #0x90 @ place for tweak[9]
2075#endif
2076 vld1.8 {@XMM[8]}, [r0] @ initial tweak
2077 adr $magic, .Lxts_magic
2078
2079 tst $len, #0xf @ if not multiple of 16
2080 it ne @ Thumb2 thing, sanity check in ARM
2081 subne $len, #0x10 @ subtract another 16 bytes
2082 subs $len, #0x80
2083
2084 blo .Lxts_dec_short
2085 b .Lxts_dec_loop
2086
2087.align 4
2088.Lxts_dec_loop:
2089 vldmia $magic, {$twmask} @ load XTS magic
2090 vshr.s64 @T[0], @XMM[8], #63
2091 mov r0, sp
2092 vand @T[0], @T[0], $twmask
2093___
2094for($i=9;$i<16;$i++) {
2095$code.=<<___;
2096 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2097 vst1.64 {@XMM[$i-1]}, [r0,:128]!
2098 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2099 vshr.s64 @T[1], @XMM[$i], #63
2100 veor @XMM[$i], @XMM[$i], @T[0]
2101 vand @T[1], @T[1], $twmask
2102___
2103 @T=reverse(@T);
2104
2105$code.=<<___ if ($i>=10);
2106 vld1.8 {@XMM[$i-10]}, [$inp]!
2107___
2108$code.=<<___ if ($i>=11);
2109 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2110___
2111}
2112$code.=<<___;
2113 vadd.u64 @XMM[8], @XMM[15], @XMM[15]
2114 vst1.64 {@XMM[15]}, [r0,:128]!
2115 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2116 veor @XMM[8], @XMM[8], @T[0]
2117 vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2118
2119 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
2120 veor @XMM[5], @XMM[5], @XMM[13]
2121#ifndef BSAES_ASM_EXTENDED_KEY
2122 add r4, sp, #0x90 @ pass key schedule
2123#else
2124 add r4, $key, #248 @ pass key schedule
2125#endif
2126 veor @XMM[6], @XMM[6], @XMM[14]
2127 mov r5, $rounds @ pass rounds
2128 veor @XMM[7], @XMM[7], @XMM[15]
2129 mov r0, sp
2130
2131 bl _bsaes_decrypt8
2132
2133 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2134 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2135 veor @XMM[0], @XMM[0], @XMM[ 8]
2136 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2137 veor @XMM[1], @XMM[1], @XMM[ 9]
2138 veor @XMM[8], @XMM[6], @XMM[10]
2139 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2140 veor @XMM[9], @XMM[4], @XMM[11]
2141 vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
2142 veor @XMM[10], @XMM[2], @XMM[12]
2143 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2144 veor @XMM[11], @XMM[7], @XMM[13]
2145 veor @XMM[12], @XMM[3], @XMM[14]
2146 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2147 veor @XMM[13], @XMM[5], @XMM[15]
2148 vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
2149
2150 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2151
2152 subs $len, #0x80
2153 bpl .Lxts_dec_loop
2154
2155.Lxts_dec_short:
2156 adds $len, #0x70
2157 bmi .Lxts_dec_done
2158
2159 vldmia $magic, {$twmask} @ load XTS magic
2160 vshr.s64 @T[0], @XMM[8], #63
2161 mov r0, sp
2162 vand @T[0], @T[0], $twmask
2163___
2164for($i=9;$i<16;$i++) {
2165$code.=<<___;
2166 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2167 vst1.64 {@XMM[$i-1]}, [r0,:128]!
2168 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2169 vshr.s64 @T[1], @XMM[$i], #63
2170 veor @XMM[$i], @XMM[$i], @T[0]
2171 vand @T[1], @T[1], $twmask
2172___
2173 @T=reverse(@T);
2174
2175$code.=<<___ if ($i>=10);
2176 vld1.8 {@XMM[$i-10]}, [$inp]!
2177 subs $len, #0x10
2178 bmi .Lxts_dec_`$i-9`
2179___
2180$code.=<<___ if ($i>=11);
2181 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2182___
2183}
2184$code.=<<___;
2185 sub $len, #0x10
2186 vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
2187
2188 vld1.8 {@XMM[6]}, [$inp]!
2189 veor @XMM[5], @XMM[5], @XMM[13]
2190#ifndef BSAES_ASM_EXTENDED_KEY
2191 add r4, sp, #0x90 @ pass key schedule
2192#else
2193 add r4, $key, #248 @ pass key schedule
2194#endif
2195 veor @XMM[6], @XMM[6], @XMM[14]
2196 mov r5, $rounds @ pass rounds
2197 mov r0, sp
2198
2199 bl _bsaes_decrypt8
2200
2201 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2202 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2203 veor @XMM[0], @XMM[0], @XMM[ 8]
2204 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2205 veor @XMM[1], @XMM[1], @XMM[ 9]
2206 veor @XMM[8], @XMM[6], @XMM[10]
2207 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2208 veor @XMM[9], @XMM[4], @XMM[11]
2209 vld1.64 {@XMM[14]}, [r0,:128]!
2210 veor @XMM[10], @XMM[2], @XMM[12]
2211 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2212 veor @XMM[11], @XMM[7], @XMM[13]
2213 veor @XMM[12], @XMM[3], @XMM[14]
2214 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2215 vst1.8 {@XMM[12]}, [$out]!
2216
2217 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2218 b .Lxts_dec_done
2219.align 4
2220.Lxts_dec_6:
2221 vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
2222
2223 veor @XMM[4], @XMM[4], @XMM[12]
2224#ifndef BSAES_ASM_EXTENDED_KEY
2225 add r4, sp, #0x90 @ pass key schedule
2226#else
2227 add r4, $key, #248 @ pass key schedule
2228#endif
2229 veor @XMM[5], @XMM[5], @XMM[13]
2230 mov r5, $rounds @ pass rounds
2231 mov r0, sp
2232
2233 bl _bsaes_decrypt8
2234
2235 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2236 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2237 veor @XMM[0], @XMM[0], @XMM[ 8]
2238 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2239 veor @XMM[1], @XMM[1], @XMM[ 9]
2240 veor @XMM[8], @XMM[6], @XMM[10]
2241 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2242 veor @XMM[9], @XMM[4], @XMM[11]
2243 veor @XMM[10], @XMM[2], @XMM[12]
2244 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2245 veor @XMM[11], @XMM[7], @XMM[13]
2246 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2247
2248 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2249 b .Lxts_dec_done
2250.align 4
2251.Lxts_dec_5:
2252 vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
2253
2254 veor @XMM[3], @XMM[3], @XMM[11]
2255#ifndef BSAES_ASM_EXTENDED_KEY
2256 add r4, sp, #0x90 @ pass key schedule
2257#else
2258 add r4, $key, #248 @ pass key schedule
2259#endif
2260 veor @XMM[4], @XMM[4], @XMM[12]
2261 mov r5, $rounds @ pass rounds
2262 mov r0, sp
2263
2264 bl _bsaes_decrypt8
2265
2266 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2267 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2268 veor @XMM[0], @XMM[0], @XMM[ 8]
2269 vld1.64 {@XMM[12]}, [r0,:128]!
2270 veor @XMM[1], @XMM[1], @XMM[ 9]
2271 veor @XMM[8], @XMM[6], @XMM[10]
2272 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2273 veor @XMM[9], @XMM[4], @XMM[11]
2274 veor @XMM[10], @XMM[2], @XMM[12]
2275 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2276 vst1.8 {@XMM[10]}, [$out]!
2277
2278 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2279 b .Lxts_dec_done
2280.align 4
2281.Lxts_dec_4:
2282 vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
2283
2284 veor @XMM[2], @XMM[2], @XMM[10]
2285#ifndef BSAES_ASM_EXTENDED_KEY
2286 add r4, sp, #0x90 @ pass key schedule
2287#else
2288 add r4, $key, #248 @ pass key schedule
2289#endif
2290 veor @XMM[3], @XMM[3], @XMM[11]
2291 mov r5, $rounds @ pass rounds
2292 mov r0, sp
2293
2294 bl _bsaes_decrypt8
2295
2296 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2297 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2298 veor @XMM[0], @XMM[0], @XMM[ 8]
2299 veor @XMM[1], @XMM[1], @XMM[ 9]
2300 veor @XMM[8], @XMM[6], @XMM[10]
2301 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2302 veor @XMM[9], @XMM[4], @XMM[11]
2303 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2304
2305 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2306 b .Lxts_dec_done
2307.align 4
2308.Lxts_dec_3:
2309 vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
2310
2311 veor @XMM[1], @XMM[1], @XMM[9]
2312#ifndef BSAES_ASM_EXTENDED_KEY
2313 add r4, sp, #0x90 @ pass key schedule
2314#else
2315 add r4, $key, #248 @ pass key schedule
2316#endif
2317 veor @XMM[2], @XMM[2], @XMM[10]
2318 mov r5, $rounds @ pass rounds
2319 mov r0, sp
2320
2321 bl _bsaes_decrypt8
2322
2323 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
2324 vld1.64 {@XMM[10]}, [r0,:128]!
2325 veor @XMM[0], @XMM[0], @XMM[ 8]
2326 veor @XMM[1], @XMM[1], @XMM[ 9]
2327 veor @XMM[8], @XMM[6], @XMM[10]
2328 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2329 vst1.8 {@XMM[8]}, [$out]!
2330
2331 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2332 b .Lxts_dec_done
2333.align 4
2334.Lxts_dec_2:
2335 vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
2336
2337 veor @XMM[0], @XMM[0], @XMM[8]
2338#ifndef BSAES_ASM_EXTENDED_KEY
2339 add r4, sp, #0x90 @ pass key schedule
2340#else
2341 add r4, $key, #248 @ pass key schedule
2342#endif
2343 veor @XMM[1], @XMM[1], @XMM[9]
2344 mov r5, $rounds @ pass rounds
2345 mov r0, sp
2346
2347 bl _bsaes_decrypt8
2348
2349 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
2350 veor @XMM[0], @XMM[0], @XMM[ 8]
2351 veor @XMM[1], @XMM[1], @XMM[ 9]
2352 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2353
2354 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2355 b .Lxts_dec_done
2356.align 4
2357.Lxts_dec_1:
2358 mov r0, sp
2359 veor @XMM[0], @XMM[8]
2360 mov r1, sp
2361 vst1.8 {@XMM[0]}, [sp,:128]
2362 mov r2, $key
2363 mov r4, $fp @ preserve fp
2364 mov r5, $magic @ preserve magic
2365
2366 bl AES_decrypt
2367
2368 vld1.8 {@XMM[0]}, [sp,:128]
2369 veor @XMM[0], @XMM[0], @XMM[8]
2370 vst1.8 {@XMM[0]}, [$out]!
2371 mov $fp, r4
2372 mov $magic, r5
2373
2374 vmov @XMM[8], @XMM[9] @ next round tweak
2375
2376.Lxts_dec_done:
2377#ifndef XTS_CHAIN_TWEAK
2378 adds $len, #0x10
2379 beq .Lxts_dec_ret
2380
2381 @ calculate one round of extra tweak for the stolen ciphertext
2382 vldmia $magic, {$twmask}
2383 vshr.s64 @XMM[6], @XMM[8], #63
2384 vand @XMM[6], @XMM[6], $twmask
2385 vadd.u64 @XMM[9], @XMM[8], @XMM[8]
2386 vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
2387 veor @XMM[9], @XMM[9], @XMM[6]
2388
2389 @ perform the final decryption with the last tweak value
2390 vld1.8 {@XMM[0]}, [$inp]!
2391 mov r0, sp
2392 veor @XMM[0], @XMM[0], @XMM[9]
2393 mov r1, sp
2394 vst1.8 {@XMM[0]}, [sp,:128]
2395 mov r2, $key
2396 mov r4, $fp @ preserve fp
2397
2398 bl AES_decrypt
2399
2400 vld1.8 {@XMM[0]}, [sp,:128]
2401 veor @XMM[0], @XMM[0], @XMM[9]
2402 vst1.8 {@XMM[0]}, [$out]
2403
2404 mov r6, $out
2405.Lxts_dec_steal:
2406 ldrb r1, [$out]
2407 ldrb r0, [$inp], #1
2408 strb r1, [$out, #0x10]
2409 strb r0, [$out], #1
2410
2411 subs $len, #1
2412 bhi .Lxts_dec_steal
2413
2414 vld1.8 {@XMM[0]}, [r6]
2415 mov r0, sp
2416 veor @XMM[0], @XMM[8]
2417 mov r1, sp
2418 vst1.8 {@XMM[0]}, [sp,:128]
2419 mov r2, $key
2420
2421 bl AES_decrypt
2422
2423 vld1.8 {@XMM[0]}, [sp,:128]
2424 veor @XMM[0], @XMM[0], @XMM[8]
2425 vst1.8 {@XMM[0]}, [r6]
2426 mov $fp, r4
2427#endif
2428
2429.Lxts_dec_ret:
2430 bic r0, $fp, #0xf
2431 vmov.i32 q0, #0
2432 vmov.i32 q1, #0
2433#ifdef XTS_CHAIN_TWEAK
2434 ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
2435#endif
2436.Lxts_dec_bzero: @ wipe key schedule [if any]
2437 vstmia sp!, {q0-q1}
2438 cmp sp, r0
2439 bne .Lxts_dec_bzero
2440
2441 mov sp, $fp
2442#ifdef XTS_CHAIN_TWEAK
2443 vst1.8 {@XMM[8]}, [r1]
2444#endif
2445 VFP_ABI_POP
2446 ldmia sp!, {r4-r10, pc} @ return
2447
2448.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2449___
2450}
2451$code.=<<___;
2452#endif
2453___
2454
2455$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2456
2457open SELF,$0;
2458while(<SELF>) {
2459 next if (/^#!/);
2460 last if (!s/^#/@/ and !/^$/);
2461 print;
2462}
2463close SELF;
2464
2465print $code;
2466
2467close STDOUT;
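
The XTS code above keeps the per-block tweak in q8 and derives each following tweak by doubling the current one in GF(2^128): the vshr.s64/vand/vswp/veor sequence applies the reduction constant stored at .Lxts_magic (.quad 1, 0x87) whenever a carry falls out of either 64-bit lane. A minimal scalar sketch of that doubling step, assuming little-endian byte order for the 128-bit tweak; the helper name is illustrative and not part of this file:

#include <stdint.h>
#include <string.h>

/* Double a 128-bit XTS tweak in GF(2^128): shift left by one bit and,
 * if the top bit fell off, xor in the reduction polynomial 0x87.
 * Scalar equivalent of the NEON vadd/vshr/vand/vswp/veor sequence
 * built around .Lxts_magic above.  Illustrative helper only.
 */
static void xts_tweak_double(uint8_t t[16])
{
	uint64_t lo, hi, carry;

	memcpy(&lo, t, 8);			/* little-endian low half  */
	memcpy(&hi, t + 8, 8);			/* little-endian high half */

	carry = hi >> 63;			/* bit shifted out of the top */
	hi = (hi << 1) | (lo >> 63);		/* 128-bit left shift by one  */
	lo = (lo << 1) ^ (carry * 0x87);	/* fold the carry back in     */

	memcpy(t, &lo, 8);
	memcpy(t + 8, &hi, 8);
}

The .Lxts_enc_steal / .Lxts_dec_steal loops then perform standard XTS ciphertext stealing for a trailing partial block, swapping bytes between the last full output block and the remaining input before one final single-block AES_encrypt/AES_decrypt call on the patched block.
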
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 59ceae8f3c95..a6395c027715 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += sembuf.h
24generic-y += serial.h 24generic-y += serial.h
25generic-y += shmbuf.h 25generic-y += shmbuf.h
26generic-y += siginfo.h 26generic-y += siginfo.h
27generic-y += simd.h
27generic-y += sizes.h 28generic-y += sizes.h
28generic-y += socket.h 29generic-y += socket.h
29generic-y += sockios.h 30generic-y += sockios.h
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index fcc1b5bf6979..5c2285160575 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -53,6 +53,13 @@
53#define put_byte_3 lsl #0 53#define put_byte_3 lsl #0
54#endif 54#endif
55 55
56/* Select code for any configuration running in BE8 mode */
57#ifdef CONFIG_CPU_ENDIAN_BE8
58#define ARM_BE8(code...) code
59#else
60#define ARM_BE8(code...)
61#endif
62
56/* 63/*
57 * Data preload for architectures that support it 64 * Data preload for architectures that support it
58 */ 65 */
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index f8a4336ed8fc..62d2cb53b069 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -12,6 +12,7 @@
12#define __ASM_ARM_ATOMIC_H 12#define __ASM_ARM_ATOMIC_H
13 13
14#include <linux/compiler.h> 14#include <linux/compiler.h>
15#include <linux/prefetch.h>
15#include <linux/types.h> 16#include <linux/types.h>
16#include <linux/irqflags.h> 17#include <linux/irqflags.h>
17#include <asm/barrier.h> 18#include <asm/barrier.h>
@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v)
41 unsigned long tmp; 42 unsigned long tmp;
42 int result; 43 int result;
43 44
45 prefetchw(&v->counter);
44 __asm__ __volatile__("@ atomic_add\n" 46 __asm__ __volatile__("@ atomic_add\n"
45"1: ldrex %0, [%3]\n" 47"1: ldrex %0, [%3]\n"
46" add %0, %0, %4\n" 48" add %0, %0, %4\n"
@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v)
79 unsigned long tmp; 81 unsigned long tmp;
80 int result; 82 int result;
81 83
84 prefetchw(&v->counter);
82 __asm__ __volatile__("@ atomic_sub\n" 85 __asm__ __volatile__("@ atomic_sub\n"
83"1: ldrex %0, [%3]\n" 86"1: ldrex %0, [%3]\n"
84" sub %0, %0, %4\n" 87" sub %0, %0, %4\n"
@@ -260,6 +263,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
260{ 263{
261 long long tmp; 264 long long tmp;
262 265
266 prefetchw(&v->counter);
263 __asm__ __volatile__("@ atomic64_set\n" 267 __asm__ __volatile__("@ atomic64_set\n"
264"1: ldrexd %0, %H0, [%2]\n" 268"1: ldrexd %0, %H0, [%2]\n"
265" strexd %0, %3, %H3, [%2]\n" 269" strexd %0, %3, %H3, [%2]\n"
@@ -276,10 +280,11 @@ static inline void atomic64_add(long long i, atomic64_t *v)
276 long long result; 280 long long result;
277 unsigned long tmp; 281 unsigned long tmp;
278 282
283 prefetchw(&v->counter);
279 __asm__ __volatile__("@ atomic64_add\n" 284 __asm__ __volatile__("@ atomic64_add\n"
280"1: ldrexd %0, %H0, [%3]\n" 285"1: ldrexd %0, %H0, [%3]\n"
281" adds %0, %0, %4\n" 286" adds %Q0, %Q0, %Q4\n"
282" adc %H0, %H0, %H4\n" 287" adc %R0, %R0, %R4\n"
283" strexd %1, %0, %H0, [%3]\n" 288" strexd %1, %0, %H0, [%3]\n"
284" teq %1, #0\n" 289" teq %1, #0\n"
285" bne 1b" 290" bne 1b"
@@ -297,8 +302,8 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v)
297 302
298 __asm__ __volatile__("@ atomic64_add_return\n" 303 __asm__ __volatile__("@ atomic64_add_return\n"
299"1: ldrexd %0, %H0, [%3]\n" 304"1: ldrexd %0, %H0, [%3]\n"
300" adds %0, %0, %4\n" 305" adds %Q0, %Q0, %Q4\n"
301" adc %H0, %H0, %H4\n" 306" adc %R0, %R0, %R4\n"
302" strexd %1, %0, %H0, [%3]\n" 307" strexd %1, %0, %H0, [%3]\n"
303" teq %1, #0\n" 308" teq %1, #0\n"
304" bne 1b" 309" bne 1b"
@@ -316,10 +321,11 @@ static inline void atomic64_sub(long long i, atomic64_t *v)
316 long long result; 321 long long result;
317 unsigned long tmp; 322 unsigned long tmp;
318 323
324 prefetchw(&v->counter);
319 __asm__ __volatile__("@ atomic64_sub\n" 325 __asm__ __volatile__("@ atomic64_sub\n"
320"1: ldrexd %0, %H0, [%3]\n" 326"1: ldrexd %0, %H0, [%3]\n"
321" subs %0, %0, %4\n" 327" subs %Q0, %Q0, %Q4\n"
322" sbc %H0, %H0, %H4\n" 328" sbc %R0, %R0, %R4\n"
323" strexd %1, %0, %H0, [%3]\n" 329" strexd %1, %0, %H0, [%3]\n"
324" teq %1, #0\n" 330" teq %1, #0\n"
325" bne 1b" 331" bne 1b"
@@ -337,8 +343,8 @@ static inline long long atomic64_sub_return(long long i, atomic64_t *v)
337 343
338 __asm__ __volatile__("@ atomic64_sub_return\n" 344 __asm__ __volatile__("@ atomic64_sub_return\n"
339"1: ldrexd %0, %H0, [%3]\n" 345"1: ldrexd %0, %H0, [%3]\n"
340" subs %0, %0, %4\n" 346" subs %Q0, %Q0, %Q4\n"
341" sbc %H0, %H0, %H4\n" 347" sbc %R0, %R0, %R4\n"
342" strexd %1, %0, %H0, [%3]\n" 348" strexd %1, %0, %H0, [%3]\n"
343" teq %1, #0\n" 349" teq %1, #0\n"
344" bne 1b" 350" bne 1b"
@@ -406,9 +412,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
406 412
407 __asm__ __volatile__("@ atomic64_dec_if_positive\n" 413 __asm__ __volatile__("@ atomic64_dec_if_positive\n"
408"1: ldrexd %0, %H0, [%3]\n" 414"1: ldrexd %0, %H0, [%3]\n"
409" subs %0, %0, #1\n" 415" subs %Q0, %Q0, #1\n"
410" sbc %H0, %H0, #0\n" 416" sbc %R0, %R0, #0\n"
411" teq %H0, #0\n" 417" teq %R0, #0\n"
412" bmi 2f\n" 418" bmi 2f\n"
413" strexd %1, %0, %H0, [%3]\n" 419" strexd %1, %0, %H0, [%3]\n"
414" teq %1, #0\n" 420" teq %1, #0\n"
@@ -437,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
437" teqeq %H0, %H5\n" 443" teqeq %H0, %H5\n"
438" moveq %1, #0\n" 444" moveq %1, #0\n"
439" beq 2f\n" 445" beq 2f\n"
440" adds %0, %0, %6\n" 446" adds %Q0, %Q0, %Q6\n"
441" adc %H0, %H0, %H6\n" 447" adc %R0, %R0, %R6\n"
442" strexd %2, %0, %H0, [%4]\n" 448" strexd %2, %0, %H0, [%4]\n"
443" teq %2, #0\n" 449" teq %2, #0\n"
444" bne 1b\n" 450" bne 1b\n"
diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h
new file mode 100644
index 000000000000..1714800fa113
--- /dev/null
+++ b/arch/arm/include/asm/bL_switcher.h
@@ -0,0 +1,77 @@
1/*
2 * arch/arm/include/asm/bL_switcher.h
3 *
4 * Created by: Nicolas Pitre, April 2012
5 * Copyright: (C) 2012-2013 Linaro Limited
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef ASM_BL_SWITCHER_H
13#define ASM_BL_SWITCHER_H
14
15#include <linux/compiler.h>
16#include <linux/types.h>
17
18typedef void (*bL_switch_completion_handler)(void *cookie);
19
20int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
21 bL_switch_completion_handler completer,
22 void *completer_cookie);
23static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
24{
25 return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
26}
27
28/*
29 * Register here to be notified about runtime enabling/disabling of
30 * the switcher.
31 *
32 * The notifier chain is called with the switcher activation lock held:
33 * the switcher will not be enabled or disabled during callbacks.
34 * Callbacks must not call bL_switcher_{get,put}_enabled().
35 */
36#define BL_NOTIFY_PRE_ENABLE 0
37#define BL_NOTIFY_POST_ENABLE 1
38#define BL_NOTIFY_PRE_DISABLE 2
39#define BL_NOTIFY_POST_DISABLE 3
40
41#ifdef CONFIG_BL_SWITCHER
42
43int bL_switcher_register_notifier(struct notifier_block *nb);
44int bL_switcher_unregister_notifier(struct notifier_block *nb);
45
46/*
47 * Use these functions to temporarily prevent enabling/disabling of
48 * the switcher.
49 * bL_switcher_get_enabled() returns true if the switcher is currently
50 * enabled. Each call to bL_switcher_get_enabled() must be followed
51 * by a call to bL_switcher_put_enabled(). These functions are not
52 * recursive.
53 */
54bool bL_switcher_get_enabled(void);
55void bL_switcher_put_enabled(void);
56
57int bL_switcher_trace_trigger(void);
58int bL_switcher_get_logical_index(u32 mpidr);
59
60#else
61static inline int bL_switcher_register_notifier(struct notifier_block *nb)
62{
63 return 0;
64}
65
66static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
67{
68 return 0;
69}
70
71static inline bool bL_switcher_get_enabled(void) { return false; }
72static inline void bL_switcher_put_enabled(void) { }
73static inline int bL_switcher_trace_trigger(void) { return 0; }
74static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
75#endif /* CONFIG_BL_SWITCHER */
76
77#endif
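
The new header gives clients two cooperation points: a notifier chain for the four BL_NOTIFY_* events, and the bL_switcher_get_enabled()/bL_switcher_put_enabled() pair for temporarily pinning the switcher's enabled/disabled state. A hypothetical client could use them as follows; the callback, notifier block and init function names are invented for illustration:

#include <linux/init.h>
#include <linux/notifier.h>
#include <asm/bL_switcher.h>

/* Hypothetical client of the bL switcher notifier chain.  The events
 * and the get/put pairing follow the comments in bL_switcher.h above;
 * note that callbacks themselves must not call the get/put helpers.
 */
static int my_bl_notify(struct notifier_block *nb, unsigned long event,
			void *data)
{
	switch (event) {
	case BL_NOTIFY_PRE_ENABLE:
	case BL_NOTIFY_PRE_DISABLE:
		/* quiesce anything that must not race with a switch */
		break;
	case BL_NOTIFY_POST_ENABLE:
	case BL_NOTIFY_POST_DISABLE:
		/* resume normal operation */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_bl_nb = {
	.notifier_call = my_bl_notify,
};

static int __init my_client_init(void)
{
	int ret = bL_switcher_register_notifier(&my_bl_nb);
	if (ret)
		return ret;

	if (bL_switcher_get_enabled()) {
		/* switcher is active and cannot be disabled until put */
		bL_switcher_put_enabled();
	}
	return 0;
}
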
diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h
index 7af5c6c3653a..b274bde24905 100644
--- a/arch/arm/include/asm/bug.h
+++ b/arch/arm/include/asm/bug.h
@@ -2,6 +2,8 @@
2#define _ASMARM_BUG_H 2#define _ASMARM_BUG_H
3 3
4#include <linux/linkage.h> 4#include <linux/linkage.h>
5#include <linux/types.h>
6#include <asm/opcodes.h>
5 7
6#ifdef CONFIG_BUG 8#ifdef CONFIG_BUG
7 9
@@ -12,10 +14,10 @@
12 */ 14 */
13#ifdef CONFIG_THUMB2_KERNEL 15#ifdef CONFIG_THUMB2_KERNEL
14#define BUG_INSTR_VALUE 0xde02 16#define BUG_INSTR_VALUE 0xde02
15#define BUG_INSTR_TYPE ".hword " 17#define BUG_INSTR(__value) __inst_thumb16(__value)
16#else 18#else
17#define BUG_INSTR_VALUE 0xe7f001f2 19#define BUG_INSTR_VALUE 0xe7f001f2
18#define BUG_INSTR_TYPE ".word " 20#define BUG_INSTR(__value) __inst_arm(__value)
19#endif 21#endif
20 22
21 23
@@ -33,7 +35,7 @@
33 35
34#define __BUG(__file, __line, __value) \ 36#define __BUG(__file, __line, __value) \
35do { \ 37do { \
36 asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n" \ 38 asm volatile("1:\t" BUG_INSTR(__value) "\n" \
37 ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \ 39 ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
38 "2:\t.asciz " #__file "\n" \ 40 "2:\t.asciz " #__file "\n" \
39 ".popsection\n" \ 41 ".popsection\n" \
@@ -48,7 +50,7 @@ do { \
48 50
49#define __BUG(__file, __line, __value) \ 51#define __BUG(__file, __line, __value) \
50do { \ 52do { \
51 asm volatile(BUG_INSTR_TYPE #__value); \ 53 asm volatile(BUG_INSTR(__value) "\n"); \
52 unreachable(); \ 54 unreachable(); \
53} while (0) 55} while (0)
54#endif /* CONFIG_DEBUG_BUGVERBOSE */ 56#endif /* CONFIG_DEBUG_BUGVERBOSE */
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index 3d7351c844aa..fe3ea776dc34 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -5,7 +5,7 @@
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <asm/irq.h> 6#include <asm/irq.h>
7 7
8#define NR_IPI 7 8#define NR_IPI 8
9 9
10typedef struct { 10typedef struct {
11 unsigned int __softirq_pending; 11 unsigned int __softirq_pending;
diff --git a/arch/arm/include/asm/hardware/coresight.h b/arch/arm/include/asm/hardware/coresight.h
index 0cf7a6b842ff..ad774f37c47c 100644
--- a/arch/arm/include/asm/hardware/coresight.h
+++ b/arch/arm/include/asm/hardware/coresight.h
@@ -24,8 +24,8 @@
24#define TRACER_TIMEOUT 10000 24#define TRACER_TIMEOUT 10000
25 25
26#define etm_writel(t, v, x) \ 26#define etm_writel(t, v, x) \
27 (__raw_writel((v), (t)->etm_regs + (x))) 27 (writel_relaxed((v), (t)->etm_regs + (x)))
28#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x))) 28#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
29 29
30/* CoreSight Management Registers */ 30/* CoreSight Management Registers */
31#define CSMR_LOCKACCESS 0xfb0 31#define CSMR_LOCKACCESS 0xfb0
@@ -142,8 +142,8 @@
142#define ETBFF_TRIGFL BIT(10) 142#define ETBFF_TRIGFL BIT(10)
143 143
144#define etb_writel(t, v, x) \ 144#define etb_writel(t, v, x) \
145 (__raw_writel((v), (t)->etb_regs + (x))) 145 (writel_relaxed((v), (t)->etb_regs + (x)))
146#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x))) 146#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
147 147
148#define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0) 148#define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
149#define etm_unlock(t) \ 149#define etm_unlock(t) \
diff --git a/arch/arm/include/asm/kgdb.h b/arch/arm/include/asm/kgdb.h
index 48066ce9ea34..0a9d5dd93294 100644
--- a/arch/arm/include/asm/kgdb.h
+++ b/arch/arm/include/asm/kgdb.h
@@ -11,6 +11,7 @@
11#define __ARM_KGDB_H__ 11#define __ARM_KGDB_H__
12 12
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <asm/opcodes.h>
14 15
15/* 16/*
16 * GDB assumes that we're a user process being debugged, so 17 * GDB assumes that we're a user process being debugged, so
@@ -41,7 +42,7 @@
41 42
42static inline void arch_kgdb_breakpoint(void) 43static inline void arch_kgdb_breakpoint(void)
43{ 44{
44 asm(".word 0xe7ffdeff"); 45 asm(__inst_arm(0xe7ffdeff));
45} 46}
46 47
47extern void kgdb_handle_bus_error(void); 48extern void kgdb_handle_bus_error(void);
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index 402a2bc6aa68..17a3fa2979e8 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -49,6 +49,7 @@ struct machine_desc {
49 bool (*smp_init)(void); 49 bool (*smp_init)(void);
50 void (*fixup)(struct tag *, char **, 50 void (*fixup)(struct tag *, char **,
51 struct meminfo *); 51 struct meminfo *);
52 void (*init_meminfo)(void);
52 void (*reserve)(void);/* reserve mem blocks */ 53 void (*reserve)(void);/* reserve mem blocks */
53 void (*map_io)(void);/* IO mapping function */ 54 void (*map_io)(void);/* IO mapping function */
54 void (*init_early)(void); 55 void (*init_early)(void);
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h
index 1cf26010a6f3..608516ebabfe 100644
--- a/arch/arm/include/asm/mcpm.h
+++ b/arch/arm/include/asm/mcpm.h
@@ -42,6 +42,14 @@ extern void mcpm_entry_point(void);
42void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); 42void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
43 43
44/* 44/*
45 * This sets an early poke i.e a value to be poked into some address
46 * from very early assembly code before the CPU is ungated. The
47 * address must be physical, and if 0 then nothing will happen.
48 */
49void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
50 unsigned long poke_phys_addr, unsigned long poke_val);
51
52/*
45 * CPU/cluster power operations API for higher subsystems to use. 53 * CPU/cluster power operations API for higher subsystems to use.
46 */ 54 */
47 55
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index e750a938fd3c..4dd21457ef9d 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -172,8 +172,13 @@
172 * so that all we need to do is modify the 8-bit constant field. 172 * so that all we need to do is modify the 8-bit constant field.
173 */ 173 */
174#define __PV_BITS_31_24 0x81000000 174#define __PV_BITS_31_24 0x81000000
175#define __PV_BITS_7_0 0x81
176
177extern u64 __pv_phys_offset;
178extern u64 __pv_offset;
179extern void fixup_pv_table(const void *, unsigned long);
180extern const void *__pv_table_begin, *__pv_table_end;
175 181
176extern unsigned long __pv_phys_offset;
177#define PHYS_OFFSET __pv_phys_offset 182#define PHYS_OFFSET __pv_phys_offset
178 183
179#define __pv_stub(from,to,instr,type) \ 184#define __pv_stub(from,to,instr,type) \
@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset;
185 : "=r" (to) \ 190 : "=r" (to) \
186 : "r" (from), "I" (type)) 191 : "r" (from), "I" (type))
187 192
188static inline unsigned long __virt_to_phys(unsigned long x) 193#define __pv_stub_mov_hi(t) \
194 __asm__ volatile("@ __pv_stub_mov\n" \
195 "1: mov %R0, %1\n" \
196 " .pushsection .pv_table,\"a\"\n" \
197 " .long 1b\n" \
198 " .popsection\n" \
199 : "=r" (t) \
200 : "I" (__PV_BITS_7_0))
201
202#define __pv_add_carry_stub(x, y) \
203 __asm__ volatile("@ __pv_add_carry_stub\n" \
204 "1: adds %Q0, %1, %2\n" \
205 " adc %R0, %R0, #0\n" \
206 " .pushsection .pv_table,\"a\"\n" \
207 " .long 1b\n" \
208 " .popsection\n" \
209 : "+r" (y) \
210 : "r" (x), "I" (__PV_BITS_31_24) \
211 : "cc")
212
213static inline phys_addr_t __virt_to_phys(unsigned long x)
189{ 214{
190 unsigned long t; 215 phys_addr_t t;
191 __pv_stub(x, t, "add", __PV_BITS_31_24); 216
217 if (sizeof(phys_addr_t) == 4) {
218 __pv_stub(x, t, "add", __PV_BITS_31_24);
219 } else {
220 __pv_stub_mov_hi(t);
221 __pv_add_carry_stub(x, t);
222 }
192 return t; 223 return t;
193} 224}
194 225
195static inline unsigned long __phys_to_virt(unsigned long x) 226static inline unsigned long __phys_to_virt(phys_addr_t x)
196{ 227{
197 unsigned long t; 228 unsigned long t;
198 __pv_stub(x, t, "sub", __PV_BITS_31_24); 229 __pv_stub(x, t, "sub", __PV_BITS_31_24);
199 return t; 230 return t;
200} 231}
232
201#else 233#else
202#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET) 234
203#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) 235static inline phys_addr_t __virt_to_phys(unsigned long x)
236{
237 return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
238}
239
240static inline unsigned long __phys_to_virt(phys_addr_t x)
241{
242 return x - PHYS_OFFSET + PAGE_OFFSET;
243}
244
204#endif 245#endif
205#endif 246#endif
206#endif /* __ASSEMBLY__ */ 247#endif /* __ASSEMBLY__ */
@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x)
238 279
239static inline void *phys_to_virt(phys_addr_t x) 280static inline void *phys_to_virt(phys_addr_t x)
240{ 281{
241 return (void *)(__phys_to_virt((unsigned long)(x))); 282 return (void *)__phys_to_virt(x);
242} 283}
243 284
244/* 285/*
245 * Drivers should NOT use these either. 286 * Drivers should NOT use these either.
246 */ 287 */
247#define __pa(x) __virt_to_phys((unsigned long)(x)) 288#define __pa(x) __virt_to_phys((unsigned long)(x))
248#define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) 289#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
249#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) 290#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
250 291
292extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);
293
294/*
295 * These are for systems that have a hardware interconnect supported alias of
296 * physical memory for idmap purposes. Most cases should leave these
297 * untouched.
298 */
299static inline phys_addr_t __virt_to_idmap(unsigned long x)
300{
301 if (arch_virt_to_idmap)
302 return arch_virt_to_idmap(x);
303 else
304 return __virt_to_phys(x);
305}
306
307#define virt_to_idmap(x) __virt_to_idmap((unsigned long)(x))
308
251/* 309/*
252 * Virtual <-> DMA view memory address translations 310 * Virtual <-> DMA view memory address translations
253 * Again, these are *only* valid on the kernel direct mapped RAM 311 * Again, these are *only* valid on the kernel direct mapped RAM
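
Besides widening __virt_to_phys() to phys_addr_t (with a mov-high plus adds/adc stub pair for the LPAE case), the hunk above adds an arch_virt_to_idmap hook for platforms whose interconnect aliases DRAM at a second physical window. A hedged sketch of how a platform might install it, paired with the ->init_meminfo machine hook added earlier in mach/arch.h; the alias offset and function names are invented for illustration:

#include <linux/init.h>
#include <asm/memory.h>

/* Hypothetical platform: its boot/idmap alias of DRAM sits at a fixed
 * offset from the normal physical window.  Real platforms would derive
 * this from their memory map; the value here is made up.
 */
#define EXAMPLE_IDMAP_ALIAS_OFFSET	0x80000000UL

static phys_addr_t example_virt_to_idmap(unsigned long x)
{
	return __virt_to_phys(x) + EXAMPLE_IDMAP_ALIAS_OFFSET;
}

static void __init example_init_meminfo(void)
{
	/* installed early, e.g. from the machine_desc ->init_meminfo
	 * hook added in mach/arch.h above */
	arch_virt_to_idmap = example_virt_to_idmap;
}
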
diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h
index 6f18da09668b..64fd15159b7d 100644
--- a/arch/arm/include/asm/mmu.h
+++ b/arch/arm/include/asm/mmu.h
@@ -16,7 +16,7 @@ typedef struct {
16#ifdef CONFIG_CPU_HAS_ASID 16#ifdef CONFIG_CPU_HAS_ASID
17#define ASID_BITS 8 17#define ASID_BITS 8
18#define ASID_MASK ((~0ULL) << ASID_BITS) 18#define ASID_MASK ((~0ULL) << ASID_BITS)
19#define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) 19#define ASID(mm) ((unsigned int)((mm)->context.id.counter & ~ASID_MASK))
20#else 20#else
21#define ASID(mm) (0) 21#define ASID(mm) (0)
22#endif 22#endif
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 413f3876341c..c3d5fc124a05 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -22,6 +22,7 @@
22#include <asm/hw_breakpoint.h> 22#include <asm/hw_breakpoint.h>
23#include <asm/ptrace.h> 23#include <asm/ptrace.h>
24#include <asm/types.h> 24#include <asm/types.h>
25#include <asm/unified.h>
25 26
26#ifdef __KERNEL__ 27#ifdef __KERNEL__
27#define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \ 28#define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \
@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p);
87#define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc 88#define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc
88#define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp 89#define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp
89 90
91#ifdef CONFIG_SMP
92#define __ALT_SMP_ASM(smp, up) \
93 "9998: " smp "\n" \
94 " .pushsection \".alt.smp.init\", \"a\"\n" \
95 " .long 9998b\n" \
96 " " up "\n" \
97 " .popsection\n"
98#else
99#define __ALT_SMP_ASM(smp, up) up
100#endif
101
90/* 102/*
91 * Prefetching support - only ARMv5. 103 * Prefetching support - only ARMv5.
92 */ 104 */
@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr)
97{ 109{
98 __asm__ __volatile__( 110 __asm__ __volatile__(
99 "pld\t%a0" 111 "pld\t%a0"
100 : 112 :: "p" (ptr));
101 : "p" (ptr)
102 : "cc");
103} 113}
104 114
115#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
105#define ARCH_HAS_PREFETCHW 116#define ARCH_HAS_PREFETCHW
106#define prefetchw(ptr) prefetch(ptr) 117static inline void prefetchw(const void *ptr)
107 118{
108#define ARCH_HAS_SPINLOCK_PREFETCH 119 __asm__ __volatile__(
109#define spin_lock_prefetch(x) do { } while (0) 120 ".arch_extension mp\n"
110 121 __ALT_SMP_ASM(
122 WASM(pldw) "\t%a0",
123 WASM(pld) "\t%a0"
124 )
125 :: "p" (ptr));
126}
127#endif
111#endif 128#endif
112 129
113#define HAVE_ARCH_PICK_MMAP_LAYOUT 130#define HAVE_ARCH_PICK_MMAP_LAYOUT
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index a8cae71caceb..22a3b9b5d4a1 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
84extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); 84extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
85extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); 85extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
86 86
87extern int register_ipi_completion(struct completion *completion, int cpu);
88
87struct smp_operations { 89struct smp_operations {
88#ifdef CONFIG_SMP 90#ifdef CONFIG_SMP
89 /* 91 /*
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index ed6c22919e47..ef3c6072aa45 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -5,21 +5,13 @@
5#error SMP not supported on pre-ARMv6 CPUs 5#error SMP not supported on pre-ARMv6 CPUs
6#endif 6#endif
7 7
8#include <asm/processor.h> 8#include <linux/prefetch.h>
9 9
10/* 10/*
11 * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K 11 * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K
12 * extensions, so when running on UP, we have to patch these instructions away. 12 * extensions, so when running on UP, we have to patch these instructions away.
13 */ 13 */
14#define ALT_SMP(smp, up) \
15 "9998: " smp "\n" \
16 " .pushsection \".alt.smp.init\", \"a\"\n" \
17 " .long 9998b\n" \
18 " " up "\n" \
19 " .popsection\n"
20
21#ifdef CONFIG_THUMB2_KERNEL 14#ifdef CONFIG_THUMB2_KERNEL
22#define SEV ALT_SMP("sev.w", "nop.w")
23/* 15/*
24 * For Thumb-2, special care is needed to ensure that the conditional WFE 16 * For Thumb-2, special care is needed to ensure that the conditional WFE
25 * instruction really does assemble to exactly 4 bytes (as required by 17 * instruction really does assemble to exactly 4 bytes (as required by
@@ -31,17 +23,18 @@
31 * the assembler won't change IT instructions which are explicitly present 23 * the assembler won't change IT instructions which are explicitly present
32 * in the input. 24 * in the input.
33 */ 25 */
34#define WFE(cond) ALT_SMP( \ 26#define WFE(cond) __ALT_SMP_ASM( \
35 "it " cond "\n\t" \ 27 "it " cond "\n\t" \
36 "wfe" cond ".n", \ 28 "wfe" cond ".n", \
37 \ 29 \
38 "nop.w" \ 30 "nop.w" \
39) 31)
40#else 32#else
41#define SEV ALT_SMP("sev", "nop") 33#define WFE(cond) __ALT_SMP_ASM("wfe" cond, "nop")
42#define WFE(cond) ALT_SMP("wfe" cond, "nop")
43#endif 34#endif
44 35
36#define SEV __ALT_SMP_ASM(WASM(sev), WASM(nop))
37
45static inline void dsb_sev(void) 38static inline void dsb_sev(void)
46{ 39{
47#if __LINUX_ARM_ARCH__ >= 7 40#if __LINUX_ARM_ARCH__ >= 7
@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
77 u32 newval; 70 u32 newval;
78 arch_spinlock_t lockval; 71 arch_spinlock_t lockval;
79 72
73 prefetchw(&lock->slock);
80 __asm__ __volatile__( 74 __asm__ __volatile__(
81"1: ldrex %0, [%3]\n" 75"1: ldrex %0, [%3]\n"
82" add %1, %0, %4\n" 76" add %1, %0, %4\n"
@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
100 unsigned long contended, res; 94 unsigned long contended, res;
101 u32 slock; 95 u32 slock;
102 96
97 prefetchw(&lock->slock);
103 do { 98 do {
104 __asm__ __volatile__( 99 __asm__ __volatile__(
105 " ldrex %0, [%3]\n" 100 " ldrex %0, [%3]\n"
@@ -156,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
156{ 151{
157 unsigned long tmp; 152 unsigned long tmp;
158 153
154 prefetchw(&rw->lock);
159 __asm__ __volatile__( 155 __asm__ __volatile__(
160"1: ldrex %0, [%1]\n" 156"1: ldrex %0, [%1]\n"
161" teq %0, #0\n" 157" teq %0, #0\n"
@@ -174,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
174{ 170{
175 unsigned long contended, res; 171 unsigned long contended, res;
176 172
173 prefetchw(&rw->lock);
177 do { 174 do {
178 __asm__ __volatile__( 175 __asm__ __volatile__(
179 " ldrex %0, [%2]\n" 176 " ldrex %0, [%2]\n"
@@ -207,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
207} 204}
208 205
209/* write_can_lock - would write_trylock() succeed? */ 206/* write_can_lock - would write_trylock() succeed? */
210#define arch_write_can_lock(x) ((x)->lock == 0) 207#define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0)
211 208
212/* 209/*
213 * Read locks are a bit more hairy: 210 * Read locks are a bit more hairy:
@@ -225,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
225{ 222{
226 unsigned long tmp, tmp2; 223 unsigned long tmp, tmp2;
227 224
225 prefetchw(&rw->lock);
228 __asm__ __volatile__( 226 __asm__ __volatile__(
229"1: ldrex %0, [%2]\n" 227"1: ldrex %0, [%2]\n"
230" adds %0, %0, #1\n" 228" adds %0, %0, #1\n"
@@ -245,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
245 243
246 smp_mb(); 244 smp_mb();
247 245
246 prefetchw(&rw->lock);
248 __asm__ __volatile__( 247 __asm__ __volatile__(
249"1: ldrex %0, [%2]\n" 248"1: ldrex %0, [%2]\n"
250" sub %0, %0, #1\n" 249" sub %0, %0, #1\n"
@@ -263,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
263{ 262{
264 unsigned long contended, res; 263 unsigned long contended, res;
265 264
265 prefetchw(&rw->lock);
266 do { 266 do {
267 __asm__ __volatile__( 267 __asm__ __volatile__(
268 " ldrex %0, [%2]\n" 268 " ldrex %0, [%2]\n"
@@ -284,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
284} 284}
285 285
286/* read_can_lock - would read_trylock() succeed? */ 286/* read_can_lock - would read_trylock() succeed? */
287#define arch_read_can_lock(x) ((x)->lock < 0x80000000) 287#define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000)
288 288
289#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) 289#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
290#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) 290#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
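
With the rwlock word changed from volatile unsigned int to a plain u32 (see spinlock_types.h below), the can_lock macros above now wrap the load in ACCESS_ONCE() so the compiler still reloads it on every evaluation. A minimal sketch of why that matters, assuming a caller that polls the lock word; the helper is illustrative only:

#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <asm/processor.h>

/* Illustrative poll loop: without ACCESS_ONCE() the compiler would be
 * free to hoist the load of rw->lock out of the loop now that the
 * field is no longer declared volatile.
 */
static inline void example_wait_for_writer(arch_rwlock_t *rw)
{
	while (ACCESS_ONCE(rw->lock) != 0)
		cpu_relax();
}
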
diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
index b262d2f8b478..47663fcb10ad 100644
--- a/arch/arm/include/asm/spinlock_types.h
+++ b/arch/arm/include/asm/spinlock_types.h
@@ -25,7 +25,7 @@ typedef struct {
25#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } 25#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
26 26
27typedef struct { 27typedef struct {
28 volatile unsigned int lock; 28 u32 lock;
29} arch_rwlock_t; 29} arch_rwlock_t;
30 30
31#define __ARCH_RW_LOCK_UNLOCKED { 0 } 31#define __ARCH_RW_LOCK_UNLOCKED { 0 }
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h
index f5989f46b4d2..b88beaba6b4a 100644
--- a/arch/arm/include/asm/unified.h
+++ b/arch/arm/include/asm/unified.h
@@ -38,6 +38,8 @@
38#ifdef __ASSEMBLY__ 38#ifdef __ASSEMBLY__
39#define W(instr) instr.w 39#define W(instr) instr.w
40#define BSYM(sym) sym + 1 40#define BSYM(sym) sym + 1
41#else
42#define WASM(instr) #instr ".w"
41#endif 43#endif
42 44
43#else /* !CONFIG_THUMB2_KERNEL */ 45#else /* !CONFIG_THUMB2_KERNEL */
@@ -50,6 +52,8 @@
50#ifdef __ASSEMBLY__ 52#ifdef __ASSEMBLY__
51#define W(instr) instr 53#define W(instr) instr
52#define BSYM(sym) sym 54#define BSYM(sym) sym
55#else
56#define WASM(instr) #instr
53#endif 57#endif
54 58
55#endif /* CONFIG_THUMB2_KERNEL */ 59#endif /* CONFIG_THUMB2_KERNEL */
diff --git a/arch/arm/include/debug/pl01x.S b/arch/arm/include/debug/pl01x.S
index 37c6895b87e6..92ef808a2337 100644
--- a/arch/arm/include/debug/pl01x.S
+++ b/arch/arm/include/debug/pl01x.S
@@ -25,12 +25,14 @@
25 25
26 .macro waituart,rd,rx 26 .macro waituart,rd,rx
271001: ldr \rd, [\rx, #UART01x_FR] 271001: ldr \rd, [\rx, #UART01x_FR]
28 ARM_BE8( rev \rd, \rd )
28 tst \rd, #UART01x_FR_TXFF 29 tst \rd, #UART01x_FR_TXFF
29 bne 1001b 30 bne 1001b
30 .endm 31 .endm
31 32
32 .macro busyuart,rd,rx 33 .macro busyuart,rd,rx
331001: ldr \rd, [\rx, #UART01x_FR] 341001: ldr \rd, [\rx, #UART01x_FR]
35 ARM_BE8( rev \rd, \rd )
34 tst \rd, #UART01x_FR_BUSY 36 tst \rd, #UART01x_FR_BUSY
35 bne 1001b 37 bne 1001b
36 .endm 38 .endm
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 18d76fd5a2af..70a1c9da30ca 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ header-y += hwcap.h
7header-y += ioctls.h 7header-y += ioctls.h
8header-y += kvm_para.h 8header-y += kvm_para.h
9header-y += mman.h 9header-y += mman.h
10header-y += perf_regs.h
10header-y += posix_types.h 11header-y += posix_types.h
11header-y += ptrace.h 12header-y += ptrace.h
12header-y += setup.h 13header-y += setup.h
diff --git a/arch/arm/include/uapi/asm/perf_regs.h b/arch/arm/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000000..ce59448458b2
--- /dev/null
+++ b/arch/arm/include/uapi/asm/perf_regs.h
@@ -0,0 +1,23 @@
1#ifndef _ASM_ARM_PERF_REGS_H
2#define _ASM_ARM_PERF_REGS_H
3
4enum perf_event_arm_regs {
5 PERF_REG_ARM_R0,
6 PERF_REG_ARM_R1,
7 PERF_REG_ARM_R2,
8 PERF_REG_ARM_R3,
9 PERF_REG_ARM_R4,
10 PERF_REG_ARM_R5,
11 PERF_REG_ARM_R6,
12 PERF_REG_ARM_R7,
13 PERF_REG_ARM_R8,
14 PERF_REG_ARM_R9,
15 PERF_REG_ARM_R10,
16 PERF_REG_ARM_FP,
17 PERF_REG_ARM_IP,
18 PERF_REG_ARM_SP,
19 PERF_REG_ARM_LR,
20 PERF_REG_ARM_PC,
21 PERF_REG_ARM_MAX,
22};
23#endif /* _ASM_ARM_PERF_REGS_H */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 5140df5f23aa..a30fc9be9e9e 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg
17 17
18obj-y := elf.o entry-common.o irq.o opcodes.o \ 18obj-y := elf.o entry-common.o irq.o opcodes.o \
19 process.o ptrace.o return_address.o \ 19 process.o ptrace.o return_address.o \
20 setup.o signal.o stacktrace.o sys_arm.o time.o traps.o 20 setup.o signal.o sigreturn_codes.o \
21 stacktrace.o sys_arm.o time.o traps.o
21 22
22obj-$(CONFIG_ATAGS) += atags_parse.o 23obj-$(CONFIG_ATAGS) += atags_parse.o
23obj-$(CONFIG_ATAGS_PROC) += atags_proc.o 24obj-$(CONFIG_ATAGS_PROC) += atags_proc.o
@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o
78obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o 79obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o
79obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o 80obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o
80obj-$(CONFIG_IWMMXT) += iwmmxt.o 81obj-$(CONFIG_IWMMXT) += iwmmxt.o
82obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
81obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o 83obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o
82AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt 84AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt
83obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o 85obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 60d3b738d420..1f031ddd0667 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
155 155
156#ifdef CONFIG_ARM_PATCH_PHYS_VIRT 156#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
157EXPORT_SYMBOL(__pv_phys_offset); 157EXPORT_SYMBOL(__pv_phys_offset);
158EXPORT_SYMBOL(__pv_offset);
158#endif 159#endif
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9cbe70c8b0ef..55090fbb81a2 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -416,9 +416,8 @@ __und_usr:
416 bne __und_usr_thumb 416 bne __und_usr_thumb
417 sub r4, r2, #4 @ ARM instr at LR - 4 417 sub r4, r2, #4 @ ARM instr at LR - 4
4181: ldrt r0, [r4] 4181: ldrt r0, [r4]
419#ifdef CONFIG_CPU_ENDIAN_BE8 419 ARM_BE8(rev r0, r0) @ little endian instruction
420 rev r0, r0 @ little endian instruction 420
421#endif
422 @ r0 = 32-bit ARM instruction which caused the exception 421 @ r0 = 32-bit ARM instruction which caused the exception
423 @ r2 = PC value for the following instruction (:= regs->ARM_pc) 422 @ r2 = PC value for the following instruction (:= regs->ARM_pc)
424 @ r4 = PC value for the faulting instruction 423 @ r4 = PC value for the faulting instruction
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index bc6bd9683ba4..a2dcafdf1bc8 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -393,9 +393,7 @@ ENTRY(vector_swi)
393#else 393#else
394 USER( ldr r10, [lr, #-4] ) @ get SWI instruction 394 USER( ldr r10, [lr, #-4] ) @ get SWI instruction
395#endif 395#endif
396#ifdef CONFIG_CPU_ENDIAN_BE8 396 ARM_BE8(rev r10, r10) @ little endian instruction
397 rev r10, r10 @ little endian instruction
398#endif
399 397
400#elif defined(CONFIG_AEABI) 398#elif defined(CONFIG_AEABI)
401 399
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 476de57dcef2..7801866e626a 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -77,6 +77,7 @@
77 77
78 __HEAD 78 __HEAD
79ENTRY(stext) 79ENTRY(stext)
80 ARM_BE8(setend be ) @ ensure we are in BE8 mode
80 81
81 THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM. 82 THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM.
82 THUMB( bx r9 ) @ If this is a Thumb-2 kernel, 83 THUMB( bx r9 ) @ If this is a Thumb-2 kernel,
@@ -352,6 +353,9 @@ ENTRY(secondary_startup)
352 * the processor type - there is no need to check the machine type 353 * the processor type - there is no need to check the machine type
353 * as it has already been validated by the primary processor. 354 * as it has already been validated by the primary processor.
354 */ 355 */
356
357 ARM_BE8(setend be) @ ensure we are in BE8 mode
358
355#ifdef CONFIG_ARM_VIRT_EXT 359#ifdef CONFIG_ARM_VIRT_EXT
356 bl __hyp_stub_install_secondary 360 bl __hyp_stub_install_secondary
357#endif 361#endif
@@ -555,6 +559,14 @@ ENTRY(fixup_smp)
555 ldmfd sp!, {r4 - r6, pc} 559 ldmfd sp!, {r4 - r6, pc}
556ENDPROC(fixup_smp) 560ENDPROC(fixup_smp)
557 561
562#ifdef __ARMEB__
563#define LOW_OFFSET 0x4
564#define HIGH_OFFSET 0x0
565#else
566#define LOW_OFFSET 0x0
567#define HIGH_OFFSET 0x4
568#endif
569
558#ifdef CONFIG_ARM_PATCH_PHYS_VIRT 570#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
559 571
560/* __fixup_pv_table - patch the stub instructions with the delta between 572/* __fixup_pv_table - patch the stub instructions with the delta between
@@ -565,17 +577,20 @@ ENDPROC(fixup_smp)
565 __HEAD 577 __HEAD
566__fixup_pv_table: 578__fixup_pv_table:
567 adr r0, 1f 579 adr r0, 1f
568 ldmia r0, {r3-r5, r7} 580 ldmia r0, {r3-r7}
569 sub r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET 581 mvn ip, #0
582 subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET
570 add r4, r4, r3 @ adjust table start address 583 add r4, r4, r3 @ adjust table start address
571 add r5, r5, r3 @ adjust table end address 584 add r5, r5, r3 @ adjust table end address
572 add r7, r7, r3 @ adjust __pv_phys_offset address 585 add r6, r6, r3 @ adjust __pv_phys_offset address
573 str r8, [r7] @ save computed PHYS_OFFSET to __pv_phys_offset 586 add r7, r7, r3 @ adjust __pv_offset address
587 str r8, [r6, #LOW_OFFSET] @ save computed PHYS_OFFSET to __pv_phys_offset
588 strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits
574 mov r6, r3, lsr #24 @ constant for add/sub instructions 589 mov r6, r3, lsr #24 @ constant for add/sub instructions
575 teq r3, r6, lsl #24 @ must be 16MiB aligned 590 teq r3, r6, lsl #24 @ must be 16MiB aligned
576THUMB( it ne @ cross section branch ) 591THUMB( it ne @ cross section branch )
577 bne __error 592 bne __error
578 str r6, [r7, #4] @ save to __pv_offset 593 str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits
579 b __fixup_a_pv_table 594 b __fixup_a_pv_table
580ENDPROC(__fixup_pv_table) 595ENDPROC(__fixup_pv_table)
581 596
@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table)
584 .long __pv_table_begin 599 .long __pv_table_begin
585 .long __pv_table_end 600 .long __pv_table_end
5862: .long __pv_phys_offset 6012: .long __pv_phys_offset
602 .long __pv_offset
587 603
588 .text 604 .text
589__fixup_a_pv_table: 605__fixup_a_pv_table:
606 adr r0, 3f
607 ldr r6, [r0]
608 add r6, r6, r3
609 ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word
610 ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word
611 mov r6, r6, lsr #24
612 cmn r0, #1
590#ifdef CONFIG_THUMB2_KERNEL 613#ifdef CONFIG_THUMB2_KERNEL
614 moveq r0, #0x200000 @ set bit 21, mov to mvn instruction
591 lsls r6, #24 615 lsls r6, #24
592 beq 2f 616 beq 2f
593 clz r7, r6 617 clz r7, r6
@@ -601,18 +625,42 @@ __fixup_a_pv_table:
601 b 2f 625 b 2f
6021: add r7, r3 6261: add r7, r3
603 ldrh ip, [r7, #2] 627 ldrh ip, [r7, #2]
604 and ip, 0x8f00 628ARM_BE8(rev16 ip, ip)
605 orr ip, r6 @ mask in offset bits 31-24 629 tst ip, #0x4000
630 and ip, #0x8f00
631 orrne ip, r6 @ mask in offset bits 31-24
632 orreq ip, r0 @ mask in offset bits 7-0
633ARM_BE8(rev16 ip, ip)
606 strh ip, [r7, #2] 634 strh ip, [r7, #2]
635 bne 2f
636 ldrh ip, [r7]
637ARM_BE8(rev16 ip, ip)
638 bic ip, #0x20
639 orr ip, ip, r0, lsr #16
640ARM_BE8(rev16 ip, ip)
641 strh ip, [r7]
6072: cmp r4, r5 6422: cmp r4, r5
608 ldrcc r7, [r4], #4 @ use branch for delay slot 643 ldrcc r7, [r4], #4 @ use branch for delay slot
609 bcc 1b 644 bcc 1b
610 bx lr 645 bx lr
611#else 646#else
647 moveq r0, #0x400000 @ set bit 22, mov to mvn instruction
612 b 2f 648 b 2f
6131: ldr ip, [r7, r3] 6491: ldr ip, [r7, r3]
650#ifdef CONFIG_CPU_ENDIAN_BE8
651 @ in BE8, we load data in BE, but instructions still in LE
652 bic ip, ip, #0xff000000
653 tst ip, #0x000f0000 @ check the rotation field
654 orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24
655 biceq ip, ip, #0x00004000 @ clear bit 22
656 orreq ip, ip, r0, lsl #24 @ mask in offset bits 7-0
657#else
614 bic ip, ip, #0x000000ff 658 bic ip, ip, #0x000000ff
615 orr ip, ip, r6 @ mask in offset bits 31-24 659 tst ip, #0xf00 @ check the rotation field
660 orrne ip, ip, r6 @ mask in offset bits 31-24
661 biceq ip, ip, #0x400000 @ clear bit 22
662 orreq ip, ip, r0 @ mask in offset bits 7-0
663#endif
616 str ip, [r7, r3] 664 str ip, [r7, r3]
6172: cmp r4, r5 6652: cmp r4, r5
618 ldrcc r7, [r4], #4 @ use branch for delay slot 666 ldrcc r7, [r4], #4 @ use branch for delay slot
@@ -621,28 +669,30 @@ __fixup_a_pv_table:
621#endif 669#endif
622ENDPROC(__fixup_a_pv_table) 670ENDPROC(__fixup_a_pv_table)
623 671
672 .align
6733: .long __pv_offset
674
624ENTRY(fixup_pv_table) 675ENTRY(fixup_pv_table)
625 stmfd sp!, {r4 - r7, lr} 676 stmfd sp!, {r4 - r7, lr}
626 ldr r2, 2f @ get address of __pv_phys_offset
627 mov r3, #0 @ no offset 677 mov r3, #0 @ no offset
628 mov r4, r0 @ r0 = table start 678 mov r4, r0 @ r0 = table start
629 add r5, r0, r1 @ r1 = table size 679 add r5, r0, r1 @ r1 = table size
630 ldr r6, [r2, #4] @ get __pv_offset
631 bl __fixup_a_pv_table 680 bl __fixup_a_pv_table
632 ldmfd sp!, {r4 - r7, pc} 681 ldmfd sp!, {r4 - r7, pc}
633ENDPROC(fixup_pv_table) 682ENDPROC(fixup_pv_table)
634 683
635 .align
6362: .long __pv_phys_offset
637
638 .data 684 .data
639 .globl __pv_phys_offset 685 .globl __pv_phys_offset
640 .type __pv_phys_offset, %object 686 .type __pv_phys_offset, %object
641__pv_phys_offset: 687__pv_phys_offset:
642 .long 0 688 .quad 0
643 .size __pv_phys_offset, . - __pv_phys_offset 689 .size __pv_phys_offset, . -__pv_phys_offset
690
691 .globl __pv_offset
692 .type __pv_offset, %object
644__pv_offset: 693__pv_offset:
645 .long 0 694 .quad 0
695 .size __pv_offset, . -__pv_offset
646#endif 696#endif
647 697
648#include "head-common.S" 698#include "head-common.S"
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 084dc8896986..5fdb4038f969 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -24,6 +24,7 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/smp_plat.h> 25#include <asm/smp_plat.h>
26#include <asm/unwind.h> 26#include <asm/unwind.h>
27#include <asm/opcodes.h>
27 28
28#ifdef CONFIG_XIP_KERNEL 29#ifdef CONFIG_XIP_KERNEL
29/* 30/*
@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
60 Elf32_Sym *sym; 61 Elf32_Sym *sym;
61 const char *symname; 62 const char *symname;
62 s32 offset; 63 s32 offset;
64 u32 tmp;
63#ifdef CONFIG_THUMB2_KERNEL 65#ifdef CONFIG_THUMB2_KERNEL
64 u32 upper, lower, sign, j1, j2; 66 u32 upper, lower, sign, j1, j2;
65#endif 67#endif
@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
95 case R_ARM_PC24: 97 case R_ARM_PC24:
96 case R_ARM_CALL: 98 case R_ARM_CALL:
97 case R_ARM_JUMP24: 99 case R_ARM_JUMP24:
98 offset = (*(u32 *)loc & 0x00ffffff) << 2; 100 offset = __mem_to_opcode_arm(*(u32 *)loc);
101 offset = (offset & 0x00ffffff) << 2;
99 if (offset & 0x02000000) 102 if (offset & 0x02000000)
100 offset -= 0x04000000; 103 offset -= 0x04000000;
101 104
@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
111 } 114 }
112 115
113 offset >>= 2; 116 offset >>= 2;
117 offset &= 0x00ffffff;
114 118
115 *(u32 *)loc &= 0xff000000; 119 *(u32 *)loc &= __opcode_to_mem_arm(0xff000000);
116 *(u32 *)loc |= offset & 0x00ffffff; 120 *(u32 *)loc |= __opcode_to_mem_arm(offset);
117 break; 121 break;
118 122
119 case R_ARM_V4BX: 123 case R_ARM_V4BX:
@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
121 * other bits to re-code instruction as 125 * other bits to re-code instruction as
122 * MOV PC,Rm. 126 * MOV PC,Rm.
123 */ 127 */
124 *(u32 *)loc &= 0xf000000f; 128 *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f);
125 *(u32 *)loc |= 0x01a0f000; 129 *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000);
126 break; 130 break;
127 131
128 case R_ARM_PREL31: 132 case R_ARM_PREL31:
@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
132 136
133 case R_ARM_MOVW_ABS_NC: 137 case R_ARM_MOVW_ABS_NC:
134 case R_ARM_MOVT_ABS: 138 case R_ARM_MOVT_ABS:
135 offset = *(u32 *)loc; 139 offset = tmp = __mem_to_opcode_arm(*(u32 *)loc);
136 offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); 140 offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff);
137 offset = (offset ^ 0x8000) - 0x8000; 141 offset = (offset ^ 0x8000) - 0x8000;
138 142
@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
140 if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) 144 if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS)
141 offset >>= 16; 145 offset >>= 16;
142 146
143 *(u32 *)loc &= 0xfff0f000; 147 tmp &= 0xfff0f000;
144 *(u32 *)loc |= ((offset & 0xf000) << 4) | 148 tmp |= ((offset & 0xf000) << 4) |
145 (offset & 0x0fff); 149 (offset & 0x0fff);
150
151 *(u32 *)loc = __opcode_to_mem_arm(tmp);
146 break; 152 break;
147 153
148#ifdef CONFIG_THUMB2_KERNEL 154#ifdef CONFIG_THUMB2_KERNEL
149 case R_ARM_THM_CALL: 155 case R_ARM_THM_CALL:
150 case R_ARM_THM_JUMP24: 156 case R_ARM_THM_JUMP24:
151 upper = *(u16 *)loc; 157 upper = __mem_to_opcode_thumb16(*(u16 *)loc);
152 lower = *(u16 *)(loc + 2); 158 lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
153 159
154 /* 160 /*
155 * 25 bit signed address range (Thumb-2 BL and B.W 161 * 25 bit signed address range (Thumb-2 BL and B.W
@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
198 sign = (offset >> 24) & 1; 204 sign = (offset >> 24) & 1;
199 j1 = sign ^ (~(offset >> 23) & 1); 205 j1 = sign ^ (~(offset >> 23) & 1);
200 j2 = sign ^ (~(offset >> 22) & 1); 206 j2 = sign ^ (~(offset >> 22) & 1);
201 *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) | 207 upper = (u16)((upper & 0xf800) | (sign << 10) |
202 ((offset >> 12) & 0x03ff)); 208 ((offset >> 12) & 0x03ff));
203 *(u16 *)(loc + 2) = (u16)((lower & 0xd000) | 209 lower = (u16)((lower & 0xd000) |
204 (j1 << 13) | (j2 << 11) | 210 (j1 << 13) | (j2 << 11) |
205 ((offset >> 1) & 0x07ff)); 211 ((offset >> 1) & 0x07ff));
212
213 *(u16 *)loc = __opcode_to_mem_thumb16(upper);
214 *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
206 break; 215 break;
207 216
208 case R_ARM_THM_MOVW_ABS_NC: 217 case R_ARM_THM_MOVW_ABS_NC:
209 case R_ARM_THM_MOVT_ABS: 218 case R_ARM_THM_MOVT_ABS:
210 upper = *(u16 *)loc; 219 upper = __mem_to_opcode_thumb16(*(u16 *)loc);
211 lower = *(u16 *)(loc + 2); 220 lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
212 221
213 /* 222 /*
214 * MOVT/MOVW instructions encoding in Thumb-2: 223 * MOVT/MOVW instructions encoding in Thumb-2:
@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
229 if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) 238 if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS)
230 offset >>= 16; 239 offset >>= 16;
231 240
232 *(u16 *)loc = (u16)((upper & 0xfbf0) | 241 upper = (u16)((upper & 0xfbf0) |
233 ((offset & 0xf000) >> 12) | 242 ((offset & 0xf000) >> 12) |
234 ((offset & 0x0800) >> 1)); 243 ((offset & 0x0800) >> 1));
235 *(u16 *)(loc + 2) = (u16)((lower & 0x8f00) | 244 lower = (u16)((lower & 0x8f00) |
236 ((offset & 0x0700) << 4) | 245 ((offset & 0x0700) << 4) |
237 (offset & 0x00ff)); 246 (offset & 0x00ff));
247 *(u16 *)loc = __opcode_to_mem_thumb16(upper);
248 *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
238 break; 249 break;
239#endif 250#endif
240 251
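
The module.c changes above funnel every instruction read through __mem_to_opcode_arm()
and every write through __opcode_to_mem_arm(), so relocation fix-ups keep working on a
BE8 kernel, where data is big-endian but instructions remain little-endian in memory.
A self-contained sketch of the same idea follows; the macro guarding the swap is an
assumption standing in for CONFIG_CPU_ENDIAN_BE8.

        #include <stdint.h>

        #ifdef ASSUME_BE8_KERNEL                        /* illustrative stand-in */
        # define mem_to_opcode_arm(x)   __builtin_bswap32(x)
        # define opcode_to_mem_arm(x)   __builtin_bswap32(x)
        #else
        # define mem_to_opcode_arm(x)   (x)
        # define opcode_to_mem_arm(x)   (x)
        #endif

        /* Patch the 24-bit branch offset of an ARM B/BL instruction at *loc. */
        static void patch_branch_offset(uint32_t *loc, int32_t byte_offset)
        {
                uint32_t insn = mem_to_opcode_arm(*loc);        /* view as an opcode */

                insn &= 0xff000000;                             /* keep cond + opcode */
                insn |= ((uint32_t)byte_offset >> 2) & 0x00ffffff;
                *loc = opcode_to_mem_arm(insn);                 /* store in memory order */
        }
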
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index e186ee1e63f6..bc3f2efa0d86 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events,
256 struct perf_event *event) 256 struct perf_event *event)
257{ 257{
258 struct arm_pmu *armpmu = to_arm_pmu(event->pmu); 258 struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
259 struct pmu *leader_pmu = event->group_leader->pmu;
260 259
261 if (is_software_event(event)) 260 if (is_software_event(event))
262 return 1; 261 return 1;
263 262
264 if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) 263 if (event->state < PERF_EVENT_STATE_OFF)
265 return 1; 264 return 1;
266 265
267 if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) 266 if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
new file mode 100644
index 000000000000..6e4379c67cbc
--- /dev/null
+++ b/arch/arm/kernel/perf_regs.c
@@ -0,0 +1,30 @@
1
2#include <linux/errno.h>
3#include <linux/kernel.h>
4#include <linux/perf_event.h>
5#include <linux/bug.h>
6#include <asm/perf_regs.h>
7#include <asm/ptrace.h>
8
9u64 perf_reg_value(struct pt_regs *regs, int idx)
10{
11 if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
12 return 0;
13
14 return regs->uregs[idx];
15}
16
17#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
18
19int perf_reg_validate(u64 mask)
20{
21 if (!mask || mask & REG_RESERVED)
22 return -EINVAL;
23
24 return 0;
25}
26
27u64 perf_reg_abi(struct task_struct *task)
28{
29 return PERF_SAMPLE_REGS_ABI_32;
30}
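
With perf_reg_value()/perf_reg_validate() in place, user space can request ARM
registers in samples by setting bits below PERF_REG_ARM_MAX. A small usage sketch,
assuming the PERF_REG_ARM_* constants from the uapi perf_regs.h added in this series
and the standard perf_event_attr fields:

        #include <linux/perf_event.h>
        #include <asm/perf_regs.h>

        static void request_arm_regs(struct perf_event_attr *attr)
        {
                attr->sample_type      |= PERF_SAMPLE_REGS_USER;
                attr->sample_regs_user  = (1ULL << PERF_REG_ARM_PC) |
                                          (1ULL << PERF_REG_ARM_SP) |
                                          (1ULL << PERF_REG_ARM_LR);
                /* any bit at or above PERF_REG_ARM_MAX would make
                 * perf_reg_validate() return -EINVAL */
        }
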
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 53c3901f7ee3..f52150d2ec00 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup);
73#endif 73#endif
74 74
75extern void paging_init(const struct machine_desc *desc); 75extern void paging_init(const struct machine_desc *desc);
76extern void early_paging_init(const struct machine_desc *,
77 struct proc_info_list *);
76extern void sanity_check_meminfo(void); 78extern void sanity_check_meminfo(void);
77extern enum reboot_mode reboot_mode; 79extern enum reboot_mode reboot_mode;
78extern void setup_dma_zone(const struct machine_desc *desc); 80extern void setup_dma_zone(const struct machine_desc *desc);
@@ -888,6 +890,8 @@ void __init setup_arch(char **cmdline_p)
888 parse_early_param(); 890 parse_early_param();
889 891
890 sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); 892 sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
893
894 early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
891 sanity_check_meminfo(); 895 sanity_check_meminfo();
892 arm_memblock_init(&meminfo, mdesc); 896 arm_memblock_init(&meminfo, mdesc);
893 897
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index ab3304225272..64845fc4152a 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -21,29 +21,7 @@
21#include <asm/unistd.h> 21#include <asm/unistd.h>
22#include <asm/vfp.h> 22#include <asm/vfp.h>
23 23
24/* 24extern const unsigned long sigreturn_codes[7];
25 * For ARM syscalls, we encode the syscall number into the instruction.
26 */
27#define SWI_SYS_SIGRETURN (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
28#define SWI_SYS_RT_SIGRETURN (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
29
30/*
31 * With EABI, the syscall number has to be loaded into r7.
32 */
33#define MOV_R7_NR_SIGRETURN (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE))
34#define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
35
36/*
37 * For Thumb syscalls, we pass the syscall number via r7. We therefore
38 * need two 16-bit instructions.
39 */
40#define SWI_THUMB_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE))
41#define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
42
43static const unsigned long sigreturn_codes[7] = {
44 MOV_R7_NR_SIGRETURN, SWI_SYS_SIGRETURN, SWI_THUMB_SIGRETURN,
45 MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN,
46};
47 25
48static unsigned long signal_return_offset; 26static unsigned long signal_return_offset;
49 27
diff --git a/arch/arm/kernel/sigreturn_codes.S b/arch/arm/kernel/sigreturn_codes.S
new file mode 100644
index 000000000000..3c5d0f2170fd
--- /dev/null
+++ b/arch/arm/kernel/sigreturn_codes.S
@@ -0,0 +1,80 @@
1/*
 2 * sigreturn_codes.S - code snippets for sigreturn syscalls
3 *
4 * Created by: Victor Kamensky, 2013-08-13
5 * Copyright: (C) 2013 Linaro Limited
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <asm/unistd.h>
18
19/*
20 * For ARM syscalls, we encode the syscall number into the instruction.
 21 * With EABI, the syscall number has to be loaded into r7. As a result,
 22 * the ARM syscall sequence snippet will have a move and an svc in .arm encoding.
 23 *
 24 * For Thumb syscalls, we pass the syscall number via r7. We therefore
 25 * need two 16-bit instructions in .thumb encoding.
 26 *
 27 * Please note the sigreturn_codes are not executed in place. Instead
 28 * they are just copied by the kernel into appropriate places. Code inside
 29 * arch/arm/kernel/signal.c is very sensitive to the layout of these code
30 * snippets.
31 */
32
33#if __LINUX_ARM_ARCH__ <= 4
34 /*
 35 * Note that we manually set the minimally required arch that supports
 36 * the required Thumb opcodes for early arch versions. It is OK
37 * for this file to be used in combination with other
38 * lower arch variants, since these code snippets are only
39 * used as input data.
40 */
41 .arch armv4t
42#endif
43
44 .section .rodata
45 .global sigreturn_codes
46 .type sigreturn_codes, #object
47
48 .arm
49
50sigreturn_codes:
51
52 /* ARM sigreturn syscall code snippet */
53 mov r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
54 swi #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)
55
56 /* Thumb sigreturn syscall code snippet */
57 .thumb
58 movs r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
59 swi #0
60
61 /* ARM sigreturn_rt syscall code snippet */
62 .arm
63 mov r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
64 swi #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)
65
66 /* Thumb sigreturn_rt syscall code snippet */
67 .thumb
68 movs r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
69 swi #0
70
71 /*
 72 * Note on additional space: the setup_return algorithm in
 73 * signal.c copies two words regardless of whether it is the
 74 * Thumb case or not, so we need an additional word after the
 75 * real last entry.
76 */
77 .arm
78 .space 4
79
80 .size sigreturn_codes, . - sigreturn_codes
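
These snippets are consumed by setup_return() in signal.c, which always copies two
words of the selected flavour to a user-visible location. A hedged sketch of that
indexing is shown below; the helper name and its destination argument are illustrative
rather than the kernel's actual code.

        #include <linux/uaccess.h>

        extern const unsigned long sigreturn_codes[7];

        /* idx: 0 = ARM sigreturn, 2 = Thumb sigreturn,
         *      3 = ARM rt_sigreturn, 5 = Thumb rt_sigreturn
         */
        static unsigned long copy_retcode(unsigned long __user *dst, int rt, int thumb)
        {
                unsigned int idx = (rt ? 3 : 0) + (thumb ? 2 : 0);

                /* two words are copied in either case, hence the trailing .space 4;
                 * returns the number of bytes that could not be copied (0 on success) */
                return __copy_to_user(dst, sigreturn_codes + idx, 8);
        }
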
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
index db1536b8b30b..b907d9b790ab 100644
--- a/arch/arm/kernel/sleep.S
+++ b/arch/arm/kernel/sleep.S
@@ -55,6 +55,7 @@
55 * specific registers and some other data for resume. 55 * specific registers and some other data for resume.
56 * r0 = suspend function arg0 56 * r0 = suspend function arg0
57 * r1 = suspend function 57 * r1 = suspend function
58 * r2 = MPIDR value the resuming CPU will use
58 */ 59 */
59ENTRY(__cpu_suspend) 60ENTRY(__cpu_suspend)
60 stmfd sp!, {r4 - r11, lr} 61 stmfd sp!, {r4 - r11, lr}
@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend)
67 mov r5, sp @ current virtual SP 68 mov r5, sp @ current virtual SP
68 add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn 69 add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn
69 sub sp, sp, r4 @ allocate CPU state on stack 70 sub sp, sp, r4 @ allocate CPU state on stack
70 stmfd sp!, {r0, r1} @ save suspend func arg and pointer
71 add r0, sp, #8 @ save pointer to save block
72 mov r1, r4 @ size of save block
73 mov r2, r5 @ virtual SP
74 ldr r3, =sleep_save_sp 71 ldr r3, =sleep_save_sp
72 stmfd sp!, {r0, r1} @ save suspend func arg and pointer
75 ldr r3, [r3, #SLEEP_SAVE_SP_VIRT] 73 ldr r3, [r3, #SLEEP_SAVE_SP_VIRT]
76 ALT_SMP(mrc p15, 0, r9, c0, c0, 5) 74 ALT_SMP(ldr r0, =mpidr_hash)
77 ALT_UP_B(1f) 75 ALT_UP_B(1f)
78 ldr r8, =mpidr_hash 76 /* This ldmia relies on the memory layout of the mpidr_hash struct */
79 /* 77 ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
80 * This ldmia relies on the memory layout of the mpidr_hash 78 compute_mpidr_hash r0, r6, r7, r8, r2, r1
81 * struct mpidr_hash. 79 add r3, r3, r0, lsl #2
82 */ 801: mov r2, r5 @ virtual SP
83 ldmia r8, {r4-r7} @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts 81 mov r1, r4 @ size of save block
84 compute_mpidr_hash lr, r5, r6, r7, r9, r4 82 add r0, sp, #8 @ pointer to save block
85 add r3, r3, lr, lsl #2
861:
87 bl __cpu_suspend_save 83 bl __cpu_suspend_save
88 adr lr, BSYM(cpu_suspend_abort) 84 adr lr, BSYM(cpu_suspend_abort)
89 ldmfd sp!, {r0, pc} @ call suspend fn 85 ldmfd sp!, {r0, pc} @ call suspend fn
@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu)
130 .data 126 .data
131 .align 127 .align
132ENTRY(cpu_resume) 128ENTRY(cpu_resume)
129ARM_BE8(setend be) @ ensure we are in BE mode
133 mov r1, #0 130 mov r1, #0
134 ALT_SMP(mrc p15, 0, r0, c0, c0, 5) 131 ALT_SMP(mrc p15, 0, r0, c0, c0, 5)
135 ALT_UP_B(1f) 132 ALT_UP_B(1f)
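
compute_mpidr_hash in the suspend path above turns the (masked) MPIDR into a dense
index so each CPU gets its own slot in sleep_save_sp; the slot address is then
sleep_save_sp + 4 * index. An equivalent C sketch of that hash, with parameter names
chosen here rather than taken from struct mpidr_hash:

        static unsigned int mpidr_hash_index(unsigned long mpidr, unsigned long mask,
                                             unsigned int s0, unsigned int s1,
                                             unsigned int s2)
        {
                mpidr &= mask;                          /* keep meaningful affinity bits */
                return ((mpidr & 0x0000ff) >> s0) |     /* Aff0, shifted down */
                       ((mpidr & 0x00ff00) >> s1) |     /* Aff1 */
                       ((mpidr & 0xff0000) >> s2);      /* Aff2 */
        }
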
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index e115cbb0d25a..dc894ab3622b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -68,6 +68,7 @@ enum ipi_msg_type {
68 IPI_CALL_FUNC_SINGLE, 68 IPI_CALL_FUNC_SINGLE,
69 IPI_CPU_STOP, 69 IPI_CPU_STOP,
70 IPI_IRQ_WORK, 70 IPI_IRQ_WORK,
71 IPI_COMPLETION,
71}; 72};
72 73
73static DECLARE_COMPLETION(cpu_running); 74static DECLARE_COMPLETION(cpu_running);
@@ -82,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops)
82 83
83static unsigned long get_arch_pgd(pgd_t *pgd) 84static unsigned long get_arch_pgd(pgd_t *pgd)
84{ 85{
85 phys_addr_t pgdir = virt_to_phys(pgd); 86 phys_addr_t pgdir = virt_to_idmap(pgd);
86 BUG_ON(pgdir & ARCH_PGD_MASK); 87 BUG_ON(pgdir & ARCH_PGD_MASK);
87 return pgdir >> ARCH_PGD_SHIFT; 88 return pgdir >> ARCH_PGD_SHIFT;
88} 89}
@@ -467,6 +468,7 @@ static const char *ipi_types[NR_IPI] = {
467 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), 468 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
468 S(IPI_CPU_STOP, "CPU stop interrupts"), 469 S(IPI_CPU_STOP, "CPU stop interrupts"),
469 S(IPI_IRQ_WORK, "IRQ work interrupts"), 470 S(IPI_IRQ_WORK, "IRQ work interrupts"),
471 S(IPI_COMPLETION, "completion interrupts"),
470}; 472};
471 473
472void show_ipi_list(struct seq_file *p, int prec) 474void show_ipi_list(struct seq_file *p, int prec)
@@ -526,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu)
526 cpu_relax(); 528 cpu_relax();
527} 529}
528 530
531static DEFINE_PER_CPU(struct completion *, cpu_completion);
532
533int register_ipi_completion(struct completion *completion, int cpu)
534{
535 per_cpu(cpu_completion, cpu) = completion;
536 return IPI_COMPLETION;
537}
538
539static void ipi_complete(unsigned int cpu)
540{
541 complete(per_cpu(cpu_completion, cpu));
542}
543
529/* 544/*
530 * Main handler for inter-processor interrupts 545 * Main handler for inter-processor interrupts
531 */ 546 */
@@ -584,6 +599,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
584 break; 599 break;
585#endif 600#endif
586 601
602 case IPI_COMPLETION:
603 irq_enter();
604 ipi_complete(cpu);
605 irq_exit();
606 break;
607
587 default: 608 default:
588 printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", 609 printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
589 cpu, ipinr); 610 cpu, ipinr);
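
The new IPI_COMPLETION message lets one CPU block until another CPU pokes it with a
dedicated IPI (the big.LITTLE switcher is the intended user). Roughly, the calling
pattern looks like the sketch below; how the remote CPU learns which IPI number to
raise is left out, and the wrapper function name is invented here.

        #include <linux/completion.h>
        #include <linux/smp.h>

        static void wait_for_remote_poke(void)
        {
                struct completion done;
                int ipi, me = smp_processor_id();       /* assumes preemption is disabled */

                init_completion(&done);
                ipi = register_ipi_completion(&done, me);

                /* ... hand 'ipi' and 'me' to the other CPU, which eventually
                 *     raises that IPI at us, e.g. via gic_send_sgi() ... */

                wait_for_completion(&done);     /* completed by ipi_complete() in our IPI handler */
        }
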
diff --git a/arch/arm/kernel/smp_scu.c b/arch/arm/kernel/smp_scu.c
index 5bc1a63284e3..1aafa0d785eb 100644
--- a/arch/arm/kernel/smp_scu.c
+++ b/arch/arm/kernel/smp_scu.c
@@ -28,7 +28,7 @@
28 */ 28 */
29unsigned int __init scu_get_core_count(void __iomem *scu_base) 29unsigned int __init scu_get_core_count(void __iomem *scu_base)
30{ 30{
31 unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG); 31 unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG);
32 return (ncores & 0x03) + 1; 32 return (ncores & 0x03) + 1;
33} 33}
34 34
@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base)
42#ifdef CONFIG_ARM_ERRATA_764369 42#ifdef CONFIG_ARM_ERRATA_764369
43 /* Cortex-A9 only */ 43 /* Cortex-A9 only */
44 if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) { 44 if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) {
45 scu_ctrl = __raw_readl(scu_base + 0x30); 45 scu_ctrl = readl_relaxed(scu_base + 0x30);
46 if (!(scu_ctrl & 1)) 46 if (!(scu_ctrl & 1))
47 __raw_writel(scu_ctrl | 0x1, scu_base + 0x30); 47 writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30);
48 } 48 }
49#endif 49#endif
50 50
51 scu_ctrl = __raw_readl(scu_base + SCU_CTRL); 51 scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
52 /* already enabled? */ 52 /* already enabled? */
53 if (scu_ctrl & 1) 53 if (scu_ctrl & 1)
54 return; 54 return;
55 55
56 scu_ctrl |= 1; 56 scu_ctrl |= 1;
57 __raw_writel(scu_ctrl, scu_base + SCU_CTRL); 57 writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
58 58
59 /* 59 /*
60 * Ensure that the data accessed by CPU0 before the SCU was 60 * Ensure that the data accessed by CPU0 before the SCU was
@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode)
80 if (mode > 3 || mode == 1 || cpu > 3) 80 if (mode > 3 || mode == 1 || cpu > 3)
81 return -EINVAL; 81 return -EINVAL;
82 82
83 val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; 83 val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
84 val |= mode; 84 val |= mode;
85 __raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu); 85 writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu);
86 86
87 return 0; 87 return 0;
88} 88}
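
The accessor change in smp_scu.c (and in smp_twd.c below) matters for big-endian
kernels: __raw_readl() returns the bytes unswapped, whereas readl_relaxed() folds in
the little-endian-to-CPU conversion while still omitting the barrier that readl()
adds. A one-function sketch, with the register offset hard-coded for illustration:

        #include <linux/io.h>

        static u32 scu_read_config(void __iomem *scu_base)
        {
                /* correct value even on a BE8 kernel; no implied barrier,
                 * matching the old __raw_readl() behaviour */
                return readl_relaxed(scu_base + 0x04);  /* 0x04 = SCU_CONFIG */
        }
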
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index 2985c9f0905d..6591e26fc13f 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode,
45 case CLOCK_EVT_MODE_PERIODIC: 45 case CLOCK_EVT_MODE_PERIODIC:
46 ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE 46 ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE
47 | TWD_TIMER_CONTROL_PERIODIC; 47 | TWD_TIMER_CONTROL_PERIODIC;
48 __raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), 48 writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
49 twd_base + TWD_TIMER_LOAD); 49 twd_base + TWD_TIMER_LOAD);
50 break; 50 break;
51 case CLOCK_EVT_MODE_ONESHOT: 51 case CLOCK_EVT_MODE_ONESHOT:
@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode,
58 ctrl = 0; 58 ctrl = 0;
59 } 59 }
60 60
61 __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); 61 writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
62} 62}
63 63
64static int twd_set_next_event(unsigned long evt, 64static int twd_set_next_event(unsigned long evt,
65 struct clock_event_device *unused) 65 struct clock_event_device *unused)
66{ 66{
67 unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); 67 unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL);
68 68
69 ctrl |= TWD_TIMER_CONTROL_ENABLE; 69 ctrl |= TWD_TIMER_CONTROL_ENABLE;
70 70
71 __raw_writel(evt, twd_base + TWD_TIMER_COUNTER); 71 writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER);
72 __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); 72 writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
73 73
74 return 0; 74 return 0;
75} 75}
@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt,
82 */ 82 */
83static int twd_timer_ack(void) 83static int twd_timer_ack(void)
84{ 84{
85 if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { 85 if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) {
86 __raw_writel(1, twd_base + TWD_TIMER_INTSTAT); 86 writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT);
87 return 1; 87 return 1;
88 } 88 }
89 89
@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void)
211 waitjiffies += 5; 211 waitjiffies += 5;
212 212
213 /* enable, no interrupt or reload */ 213 /* enable, no interrupt or reload */
214 __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); 214 writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL);
215 215
216 /* maximum value */ 216 /* maximum value */
217 __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); 217 writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
218 218
219 while (get_jiffies_64() < waitjiffies) 219 while (get_jiffies_64() < waitjiffies)
220 udelay(10); 220 udelay(10);
221 221
222 count = __raw_readl(twd_base + TWD_TIMER_COUNTER); 222 count = readl_relaxed(twd_base + TWD_TIMER_COUNTER);
223 223
224 twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5); 224 twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
225 225
@@ -277,7 +277,7 @@ static void twd_timer_setup(void)
277 * bother with the below. 277 * bother with the below.
278 */ 278 */
279 if (per_cpu(percpu_setup_called, cpu)) { 279 if (per_cpu(percpu_setup_called, cpu)) {
280 __raw_writel(0, twd_base + TWD_TIMER_CONTROL); 280 writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
281 clockevents_register_device(clk); 281 clockevents_register_device(clk);
282 enable_percpu_irq(clk->irq, 0); 282 enable_percpu_irq(clk->irq, 0);
283 return; 283 return;
@@ -290,7 +290,7 @@ static void twd_timer_setup(void)
290 * The following is done once per CPU the first time .setup() is 290 * The following is done once per CPU the first time .setup() is
291 * called. 291 * called.
292 */ 292 */
293 __raw_writel(0, twd_base + TWD_TIMER_CONTROL); 293 writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
294 294
295 clk->name = "local_timer"; 295 clk->name = "local_timer";
296 clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | 296 clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c
index 41cf3cbf756d..2835d35234ca 100644
--- a/arch/arm/kernel/suspend.c
+++ b/arch/arm/kernel/suspend.c
@@ -10,7 +10,7 @@
10#include <asm/suspend.h> 10#include <asm/suspend.h>
11#include <asm/tlbflush.h> 11#include <asm/tlbflush.h>
12 12
13extern int __cpu_suspend(unsigned long, int (*)(unsigned long)); 13extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
14extern void cpu_resume_mmu(void); 14extern void cpu_resume_mmu(void);
15 15
16#ifdef CONFIG_MMU 16#ifdef CONFIG_MMU
@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void);
21int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) 21int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
22{ 22{
23 struct mm_struct *mm = current->active_mm; 23 struct mm_struct *mm = current->active_mm;
24 u32 __mpidr = cpu_logical_map(smp_processor_id());
24 int ret; 25 int ret;
25 26
26 if (!idmap_pgd) 27 if (!idmap_pgd)
@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
32 * resume (indicated by a zero return code), we need to switch 33 * resume (indicated by a zero return code), we need to switch
33 * back to the correct page tables. 34 * back to the correct page tables.
34 */ 35 */
35 ret = __cpu_suspend(arg, fn); 36 ret = __cpu_suspend(arg, fn, __mpidr);
36 if (ret == 0) { 37 if (ret == 0) {
37 cpu_switch_mm(mm->pgd, mm); 38 cpu_switch_mm(mm->pgd, mm);
38 local_flush_bp_all(); 39 local_flush_bp_all();
@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
44#else 45#else
45int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) 46int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
46{ 47{
47 return __cpu_suspend(arg, fn); 48 u32 __mpidr = cpu_logical_map(smp_processor_id());
49 return __cpu_suspend(arg, fn, __mpidr);
48} 50}
49#define idmap_pgd NULL 51#define idmap_pgd NULL
50#endif 52#endif
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 8fcda140358d..6125f259b7b5 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -34,6 +34,7 @@
34#include <asm/unwind.h> 34#include <asm/unwind.h>
35#include <asm/tls.h> 35#include <asm/tls.h>
36#include <asm/system_misc.h> 36#include <asm/system_misc.h>
37#include <asm/opcodes.h>
37 38
38static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; 39static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" };
39 40
@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
341int is_valid_bugaddr(unsigned long pc) 342int is_valid_bugaddr(unsigned long pc)
342{ 343{
343#ifdef CONFIG_THUMB2_KERNEL 344#ifdef CONFIG_THUMB2_KERNEL
344 unsigned short bkpt; 345 u16 bkpt;
346 u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE);
345#else 347#else
346 unsigned long bkpt; 348 u32 bkpt;
349 u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE);
347#endif 350#endif
348 351
349 if (probe_kernel_address((unsigned *)pc, bkpt)) 352 if (probe_kernel_address((unsigned *)pc, bkpt))
350 return 0; 353 return 0;
351 354
352 return bkpt == BUG_INSTR_VALUE; 355 return bkpt == insn;
353} 356}
354 357
355#endif 358#endif
@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
402 if (processor_mode(regs) == SVC_MODE) { 405 if (processor_mode(regs) == SVC_MODE) {
403#ifdef CONFIG_THUMB2_KERNEL 406#ifdef CONFIG_THUMB2_KERNEL
404 if (thumb_mode(regs)) { 407 if (thumb_mode(regs)) {
405 instr = ((u16 *)pc)[0]; 408 instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]);
406 if (is_wide_instruction(instr)) { 409 if (is_wide_instruction(instr)) {
407 instr <<= 16; 410 u16 inst2;
408 instr |= ((u16 *)pc)[1]; 411 inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]);
412 instr = __opcode_thumb32_compose(instr, inst2);
409 } 413 }
410 } else 414 } else
411#endif 415#endif
412 instr = *(u32 *) pc; 416 instr = __mem_to_opcode_arm(*(u32 *) pc);
413 } else if (thumb_mode(regs)) { 417 } else if (thumb_mode(regs)) {
414 if (get_user(instr, (u16 __user *)pc)) 418 if (get_user(instr, (u16 __user *)pc))
415 goto die_sig; 419 goto die_sig;
420 instr = __mem_to_opcode_thumb16(instr);
416 if (is_wide_instruction(instr)) { 421 if (is_wide_instruction(instr)) {
417 unsigned int instr2; 422 unsigned int instr2;
418 if (get_user(instr2, (u16 __user *)pc+1)) 423 if (get_user(instr2, (u16 __user *)pc+1))
419 goto die_sig; 424 goto die_sig;
420 instr <<= 16; 425 instr2 = __mem_to_opcode_thumb16(instr2);
421 instr |= instr2; 426 instr = __opcode_thumb32_compose(instr, instr2);
422 } 427 }
423 } else if (get_user(instr, (u32 __user *)pc)) { 428 } else if (get_user(instr, (u32 __user *)pc)) {
429 instr = __mem_to_opcode_arm(instr);
424 goto die_sig; 430 goto die_sig;
425 } 431 }
426 432
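
In the Thumb-2 paths above, each 16-bit halfword is first converted from memory order
and the two halves are then composed into a single 32-bit opcode, first halfword in
the upper 16 bits. A compact sketch using the same <asm/opcodes.h> helpers; the
wrapper function itself is illustrative:

        #include <asm/opcodes.h>
        #include <asm/ptrace.h>         /* is_wide_instruction() */

        static u32 read_thumb_insn(const u16 *pc)
        {
                u32 first = __mem_to_opcode_thumb16(pc[0]);

                if (is_wide_instruction(first)) {
                        u32 second = __mem_to_opcode_thumb16(pc[1]);
                        return __opcode_thumb32_compose(first, second); /* (first << 16) | second */
                }
                return first;           /* narrow instruction */
        }
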
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index d6408d1ee543..e0c68d5bb7dc 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -10,6 +10,11 @@ UNWIND( .fnstart )
10 and r3, r0, #31 @ Get bit offset 10 and r3, r0, #31 @ Get bit offset
11 mov r0, r0, lsr #5 11 mov r0, r0, lsr #5
12 add r1, r1, r0, lsl #2 @ Get word offset 12 add r1, r1, r0, lsl #2 @ Get word offset
13#if __LINUX_ARM_ARCH__ >= 7
14 .arch_extension mp
15 ALT_SMP(W(pldw) [r1])
16 ALT_UP(W(nop))
17#endif
13 mov r3, r2, lsl r3 18 mov r3, r2, lsl r3
141: ldrex r2, [r1] 191: ldrex r2, [r1]
15 \instr r2, r2, r3 20 \instr r2, r2, r3
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index 8e8437dea3ce..3c3bff715b47 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -4,6 +4,7 @@ config ARCH_HIGHBANK
4 select ARCH_HAS_CPUFREQ 4 select ARCH_HAS_CPUFREQ
5 select ARCH_HAS_HOLES_MEMORYMODEL 5 select ARCH_HAS_HOLES_MEMORYMODEL
6 select ARCH_HAS_OPP 6 select ARCH_HAS_OPP
7 select ARCH_SUPPORTS_BIG_ENDIAN
7 select ARCH_WANT_OPTIONAL_GPIOLIB 8 select ARCH_WANT_OPTIONAL_GPIOLIB
8 select ARM_AMBA 9 select ARM_AMBA
9 select ARM_ERRATA_764369 10 select ARM_ERRATA_764369
diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig
index 30e1ebe3a891..c342dc4e8a45 100644
--- a/arch/arm/mach-ixp4xx/Kconfig
+++ b/arch/arm/mach-ixp4xx/Kconfig
@@ -1,9 +1,5 @@
1if ARCH_IXP4XX 1if ARCH_IXP4XX
2 2
3config ARCH_SUPPORTS_BIG_ENDIAN
4 bool
5 default y
6
7menu "Intel IXP4xx Implementation Options" 3menu "Intel IXP4xx Implementation Options"
8 4
9comment "IXP4xx Platforms" 5comment "IXP4xx Platforms"
diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig
index 9eb63d724602..5e269d7263ce 100644
--- a/arch/arm/mach-mvebu/Kconfig
+++ b/arch/arm/mach-mvebu/Kconfig
@@ -1,5 +1,6 @@
1config ARCH_MVEBU 1config ARCH_MVEBU
2 bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7 2 bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7
3 select ARCH_SUPPORTS_BIG_ENDIAN
3 select CLKSRC_MMIO 4 select CLKSRC_MMIO
4 select COMMON_CLK 5 select COMMON_CLK
5 select GENERIC_CLOCKEVENTS 6 select GENERIC_CLOCKEVENTS
diff --git a/arch/arm/mach-mvebu/coherency_ll.S b/arch/arm/mach-mvebu/coherency_ll.S
index 5476669ba905..ee7598fe75db 100644
--- a/arch/arm/mach-mvebu/coherency_ll.S
+++ b/arch/arm/mach-mvebu/coherency_ll.S
@@ -20,6 +20,8 @@
20#define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0 20#define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0
21#define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4 21#define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4
22 22
23#include <asm/assembler.h>
24
23 .text 25 .text
24/* 26/*
25 * r0: Coherency fabric base register address 27 * r0: Coherency fabric base register address
@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent)
29 /* Create bit by cpu index */ 31 /* Create bit by cpu index */
30 mov r3, #(1 << 24) 32 mov r3, #(1 << 24)
31 lsl r1, r3, r1 33 lsl r1, r3, r1
34ARM_BE8(rev r1, r1)
32 35
33 /* Add CPU to SMP group - Atomic */ 36 /* Add CPU to SMP group - Atomic */
34 add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET 37 add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET
diff --git a/arch/arm/mach-mvebu/headsmp.S b/arch/arm/mach-mvebu/headsmp.S
index 8a1b0c96e9ec..3dd80df428f7 100644
--- a/arch/arm/mach-mvebu/headsmp.S
+++ b/arch/arm/mach-mvebu/headsmp.S
@@ -21,12 +21,16 @@
21#include <linux/linkage.h> 21#include <linux/linkage.h>
22#include <linux/init.h> 22#include <linux/init.h>
23 23
24#include <asm/assembler.h>
25
24/* 26/*
25 * Armada XP specific entry point for secondary CPUs. 27 * Armada XP specific entry point for secondary CPUs.
26 * We add the CPU to the coherency fabric and then jump to secondary 28 * We add the CPU to the coherency fabric and then jump to secondary
27 * startup 29 * startup
28 */ 30 */
29ENTRY(armada_xp_secondary_startup) 31ENTRY(armada_xp_secondary_startup)
32 ARM_BE8(setend be ) @ go BE8 if entered LE
33
30 /* Get coherency fabric base physical address */ 34 /* Get coherency fabric base physical address */
31 adr r0, 1f 35 adr r0, 1f
32 ldr r1, [r0] 36 ldr r1, [r0]
diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig
index 365795447804..4fe8ebe5b2d4 100644
--- a/arch/arm/mach-vexpress/Kconfig
+++ b/arch/arm/mach-vexpress/Kconfig
@@ -1,6 +1,7 @@
1config ARCH_VEXPRESS 1config ARCH_VEXPRESS
2 bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7 2 bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7
3 select ARCH_REQUIRE_GPIOLIB 3 select ARCH_REQUIRE_GPIOLIB
4 select ARCH_SUPPORTS_BIG_ENDIAN
4 select ARM_AMBA 5 select ARM_AMBA
5 select ARM_GIC 6 select ARM_GIC
6 select ARM_TIMER_SP804 7 select ARM_TIMER_SP804
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index cd2c88e7a8f7..1f8fed94c2a4 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS
952 help 952 help
953 This option allows the use of custom mandatory barriers 953 This option allows the use of custom mandatory barriers
954 included via the mach/barriers.h file. 954 included via the mach/barriers.h file.
955
956config ARCH_SUPPORTS_BIG_ENDIAN
957 bool
958 help
 959 This option specifies that the architecture can support big endian
960 operation.
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 80741992a9fc..3815a8262af0 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
38 bne do_DataAbort 38 bne do_DataAbort
39 bic r1, r1, #1 << 11 @ clear bit 11 of FSR 39 bic r1, r1, #1 << 11 @ clear bit 11 of FSR
40 ldr r3, [r4] @ read aborted ARM instruction 40 ldr r3, [r4] @ read aborted ARM instruction
41#ifdef CONFIG_CPU_ENDIAN_BE8 41 ARM_BE8(rev r3, r3)
42 rev r3, r3 42
43#endif
44 do_ldrd_abort tmp=ip, insn=r3 43 do_ldrd_abort tmp=ip, insn=r3
45 tst r3, #1 << 20 @ L = 0 -> write 44 tst r3, #1 << 20 @ L = 0 -> write
46 orreq r1, r1, #1 << 11 @ yes. 45 orreq r1, r1, #1 << 11 @ yes.
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 6f4585b89078..924036473b16 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -25,6 +25,7 @@
25#include <asm/cp15.h> 25#include <asm/cp15.h>
26#include <asm/system_info.h> 26#include <asm/system_info.h>
27#include <asm/unaligned.h> 27#include <asm/unaligned.h>
28#include <asm/opcodes.h>
28 29
29#include "fault.h" 30#include "fault.h"
30 31
@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
762 if (thumb_mode(regs)) { 763 if (thumb_mode(regs)) {
763 u16 *ptr = (u16 *)(instrptr & ~1); 764 u16 *ptr = (u16 *)(instrptr & ~1);
764 fault = probe_kernel_address(ptr, tinstr); 765 fault = probe_kernel_address(ptr, tinstr);
766 tinstr = __mem_to_opcode_thumb16(tinstr);
765 if (!fault) { 767 if (!fault) {
766 if (cpu_architecture() >= CPU_ARCH_ARMv7 && 768 if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
767 IS_T32(tinstr)) { 769 IS_T32(tinstr)) {
768 /* Thumb-2 32-bit */ 770 /* Thumb-2 32-bit */
769 u16 tinst2 = 0; 771 u16 tinst2 = 0;
770 fault = probe_kernel_address(ptr + 1, tinst2); 772 fault = probe_kernel_address(ptr + 1, tinst2);
771 instr = (tinstr << 16) | tinst2; 773 tinst2 = __mem_to_opcode_thumb16(tinst2);
774 instr = __opcode_thumb32_compose(tinstr, tinst2);
772 thumb2_32b = 1; 775 thumb2_32b = 1;
773 } else { 776 } else {
774 isize = 2; 777 isize = 2;
775 instr = thumb2arm(tinstr); 778 instr = thumb2arm(tinstr);
776 } 779 }
777 } 780 }
778 } else 781 } else {
779 fault = probe_kernel_address(instrptr, instr); 782 fault = probe_kernel_address(instrptr, instr);
783 instr = __mem_to_opcode_arm(instr);
784 }
780 785
781 if (fault) { 786 if (fault) {
782 type = TYPE_FAULT; 787 type = TYPE_FAULT;
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 83cb3ac27095..8e0e52eb76b5 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -10,6 +10,7 @@
10#include <asm/system_info.h> 10#include <asm/system_info.h>
11 11
12pgd_t *idmap_pgd; 12pgd_t *idmap_pgd;
13phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
13 14
14#ifdef CONFIG_ARM_LPAE 15#ifdef CONFIG_ARM_LPAE
15static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, 16static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
67 unsigned long addr, end; 68 unsigned long addr, end;
68 unsigned long next; 69 unsigned long next;
69 70
70 addr = virt_to_phys(text_start); 71 addr = virt_to_idmap(text_start);
71 end = virt_to_phys(text_end); 72 end = virt_to_idmap(text_end);
73 pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
72 74
73 prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; 75 prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
74 76
@@ -90,8 +92,6 @@ static int __init init_static_idmap(void)
90 if (!idmap_pgd) 92 if (!idmap_pgd)
91 return -ENOMEM; 93 return -ENOMEM;
92 94
93 pr_info("Setting up static identity map for 0x%p - 0x%p\n",
94 __idmap_text_start, __idmap_text_end);
95 identity_mapping_add(idmap_pgd, __idmap_text_start, 95 identity_mapping_add(idmap_pgd, __idmap_text_start,
96 __idmap_text_end, 0); 96 __idmap_text_end, 0);
97 97
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index b1d17eeb59b8..78eeeca78f5a 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -28,6 +28,8 @@
28#include <asm/highmem.h> 28#include <asm/highmem.h>
29#include <asm/system_info.h> 29#include <asm/system_info.h>
30#include <asm/traps.h> 30#include <asm/traps.h>
31#include <asm/procinfo.h>
32#include <asm/memory.h>
31 33
32#include <asm/mach/arch.h> 34#include <asm/mach/arch.h>
33#include <asm/mach/map.h> 35#include <asm/mach/map.h>
@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void)
1315 } 1317 }
1316} 1318}
1317 1319
1320#ifdef CONFIG_ARM_LPAE
1321/*
1322 * early_paging_init() recreates boot time page table setup, allowing machines
1323 * to switch over to a high (>4G) address space on LPAE systems
1324 */
1325void __init early_paging_init(const struct machine_desc *mdesc,
1326 struct proc_info_list *procinfo)
1327{
1328 pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
1329 unsigned long map_start, map_end;
1330 pgd_t *pgd0, *pgdk;
1331 pud_t *pud0, *pudk, *pud_start;
1332 pmd_t *pmd0, *pmdk;
1333 phys_addr_t phys;
1334 int i;
1335
1336 if (!(mdesc->init_meminfo))
1337 return;
1338
1339 /* remap kernel code and data */
1340 map_start = init_mm.start_code;
1341 map_end = init_mm.brk;
1342
1343 /* get a handle on things... */
1344 pgd0 = pgd_offset_k(0);
1345 pud_start = pud0 = pud_offset(pgd0, 0);
1346 pmd0 = pmd_offset(pud0, 0);
1347
1348 pgdk = pgd_offset_k(map_start);
1349 pudk = pud_offset(pgdk, map_start);
1350 pmdk = pmd_offset(pudk, map_start);
1351
1352 mdesc->init_meminfo();
1353
1354 /* Run the patch stub to update the constants */
1355 fixup_pv_table(&__pv_table_begin,
1356 (&__pv_table_end - &__pv_table_begin) << 2);
1357
1358 /*
1359 * Cache cleaning operations for self-modifying code
1360 * We should clean the entries by MVA but running a
1361 * for loop over every pv_table entry pointer would
1362 * just complicate the code.
1363 */
1364 flush_cache_louis();
1365 dsb();
1366 isb();
1367
1368 /* remap level 1 table */
1369 for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
1370 set_pud(pud0,
1371 __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
1372 pmd0 += PTRS_PER_PMD;
1373 }
1374
1375 /* remap pmds for kernel mapping */
1376 phys = __pa(map_start) & PMD_MASK;
1377 do {
1378 *pmdk++ = __pmd(phys | pmdprot);
1379 phys += PMD_SIZE;
1380 } while (phys < map_end);
1381
1382 flush_cache_all();
1383 cpu_switch_mm(pgd0, &init_mm);
1384 cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
1385 local_flush_bp_all();
1386 local_flush_tlb_all();
1387}
1388
1389#else
1390
1391void __init early_paging_init(const struct machine_desc *mdesc,
1392 struct proc_info_list *procinfo)
1393{
1394 if (mdesc->init_meminfo)
1395 mdesc->init_meminfo();
1396}
1397
1398#endif
1399
1318/* 1400/*
1319 * paging_init() sets up the page tables, initialises the zone memory 1401 * paging_init() sets up the page tables, initialises the zone memory
1320 * maps, and sets up the zero page, bad page and bad page tables. 1402 * maps, and sets up the zero page, bad page and bad page tables.
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 34d4ab217bab..5c668b7a31f9 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -296,6 +296,15 @@ void __init sanity_check_meminfo(void)
296} 296}
297 297
298/* 298/*
299 * early_paging_init() recreates boot time page table setup, allowing machines
300 * to switch over to a high (>4G) address space on LPAE systems
301 */
302void __init early_paging_init(const struct machine_desc *mdesc,
303 struct proc_info_list *procinfo)
304{
305}
306
307/*
299 * paging_init() sets up the page tables, initialises the zone memory 308 * paging_init() sets up the page tables, initialises the zone memory
300 * maps, and sets up the zero page, bad page and bad page tables. 309 * maps, and sets up the zero page, bad page and bad page tables.
301 */ 310 */
diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S
index 1128064fddcb..45dc29f85d56 100644
--- a/arch/arm/mm/proc-v6.S
+++ b/arch/arm/mm/proc-v6.S
@@ -220,9 +220,7 @@ __v6_setup:
220#endif /* CONFIG_MMU */ 220#endif /* CONFIG_MMU */
221 adr r5, v6_crval 221 adr r5, v6_crval
222 ldmia r5, {r5, r6} 222 ldmia r5, {r5, r6}
223#ifdef CONFIG_CPU_ENDIAN_BE8 223 ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
224 orr r6, r6, #1 << 25 @ big-endian page tables
225#endif
226 mrc p15, 0, r0, c1, c0, 0 @ read control register 224 mrc p15, 0, r0, c1, c0, 0 @ read control register
 227 bic r0, r0, r5 @ clear the bits 225 bic r0, r0, r5 @ clear the bits
228 orr r0, r0, r6 @ set them 226 orr r0, r0, r6 @ set them
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index c63d9bdee51e..60920f62fdf5 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -367,9 +367,7 @@ __v7_setup:
367#endif 367#endif
368 adr r5, v7_crval 368 adr r5, v7_crval
369 ldmia r5, {r5, r6} 369 ldmia r5, {r5, r6}
370#ifdef CONFIG_CPU_ENDIAN_BE8 370 ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
371 orr r6, r6, #1 << 25 @ big-endian page tables
372#endif
373#ifdef CONFIG_SWP_EMULATE 371#ifdef CONFIG_SWP_EMULATE
374 orr r5, r5, #(1 << 10) @ set SW bit in "clear" 372 orr r5, r5, #(1 << 10) @ set SW bit in "clear"
375 bic r6, r6, #(1 << 10) @ clear it in "mmuset" 373 bic r6, r6, #(1 << 10) @ clear it in "mmuset"
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 99b44e0e8d86..9ed155ad0f97 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -19,6 +19,7 @@
19#include <linux/if_vlan.h> 19#include <linux/if_vlan.h>
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/hwcap.h> 21#include <asm/hwcap.h>
22#include <asm/opcodes.h>
22 23
23#include "bpf_jit_32.h" 24#include "bpf_jit_32.h"
24 25
@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor)
113 114
114static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) 115static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
115{ 116{
117 inst |= (cond << 28);
118 inst = __opcode_to_mem_arm(inst);
119
116 if (ctx->target != NULL) 120 if (ctx->target != NULL)
117 ctx->target[ctx->idx] = inst | (cond << 28); 121 ctx->target[ctx->idx] = inst;
118 122
119 ctx->idx++; 123 ctx->idx++;
120} 124}
diff --git a/arch/arm/plat-versatile/headsmp.S b/arch/arm/plat-versatile/headsmp.S
index 2677bc3762d7..40f27e52de75 100644
--- a/arch/arm/plat-versatile/headsmp.S
+++ b/arch/arm/plat-versatile/headsmp.S
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/linkage.h> 11#include <linux/linkage.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <asm/assembler.h>
13 14
14/* 15/*
15 * Realview/Versatile Express specific entry point for secondary CPUs. 16 * Realview/Versatile Express specific entry point for secondary CPUs.
@@ -17,6 +18,7 @@
17 * until we're ready for them to initialise. 18 * until we're ready for them to initialise.
18 */ 19 */
19ENTRY(versatile_secondary_startup) 20ENTRY(versatile_secondary_startup)
21 ARM_BE8(setend be)
20 mrc p15, 0, r0, c0, c0, 5 22 mrc p15, 0, r0, c0, c0, 5
21 bic r0, #0xff000000 23 bic r0, #0xff000000
22 adr r4, 1f 24 adr r4, 1f
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 69ce573f1224..71f337aefa39 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM
776 776
777 See <http://csrc.nist.gov/encryption/aes/> for more information. 777 See <http://csrc.nist.gov/encryption/aes/> for more information.
778 778
779config CRYPTO_AES_ARM_BS
780 tristate "Bit sliced AES using NEON instructions"
781 depends on ARM && KERNEL_MODE_NEON
782 select CRYPTO_ALGAPI
783 select CRYPTO_AES_ARM
784 select CRYPTO_ABLK_HELPER
785 help
786 Use a faster and more secure NEON based implementation of AES in CBC,
 787 CTR and XTS modes.
788
789 Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode
 790 and for XTS mode encryption; CBC and XTS mode decryption speedup is
791 around 25%. (CBC encryption speed is not affected by this driver.)
792 This implementation does not rely on any lookup tables so it is
793 believed to be invulnerable to cache timing attacks.
794
779config CRYPTO_ANUBIS 795config CRYPTO_ANUBIS
780 tristate "Anubis cipher algorithm" 796 tristate "Anubis cipher algorithm"
781 select CRYPTO_ALGAPI 797 select CRYPTO_ALGAPI
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c
index 200926699778..2e6c275322f1 100644
--- a/drivers/bus/arm-cci.c
+++ b/drivers/bus/arm-cci.c
@@ -280,7 +280,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
280 280
281 /* Enable the CCI port */ 281 /* Enable the CCI port */
282" ldr r0, [r0, %[offsetof_port_phys]] \n" 282" ldr r0, [r0, %[offsetof_port_phys]] \n"
283" mov r3, #"__stringify(CCI_ENABLE_REQ)" \n" 283" mov r3, %[cci_enable_req]\n"
284" str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n" 284" str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n"
285 285
286 /* poll the status reg for completion */ 286 /* poll the status reg for completion */
@@ -288,7 +288,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
288" ldr r0, [r1] \n" 288" ldr r0, [r1] \n"
289" ldr r0, [r0, r1] @ cci_ctrl_base \n" 289" ldr r0, [r0, r1] @ cci_ctrl_base \n"
290"4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n" 290"4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n"
291" tst r1, #1 \n" 291" tst r1, %[cci_control_status_bits] \n"
292" bne 4b \n" 292" bne 4b \n"
293 293
294" mov r0, #0 \n" 294" mov r0, #0 \n"
@@ -301,6 +301,8 @@ asmlinkage void __naked cci_enable_port_for_self(void)
301"7: .word cci_ctrl_phys - . \n" 301"7: .word cci_ctrl_phys - . \n"
302 : : 302 : :
303 [sizeof_cpu_port] "i" (sizeof(cpu_port)), 303 [sizeof_cpu_port] "i" (sizeof(cpu_port)),
304 [cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ),
305 [cci_control_status_bits] "i" cpu_to_le32(1),
304#ifndef __ARMEB__ 306#ifndef __ARMEB__
305 [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)), 307 [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)),
306#else 308#else
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index d0e948084eaf..9031171c141b 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
253 if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) 253 if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
254 return -EINVAL; 254 return -EINVAL;
255 255
256 raw_spin_lock(&irq_controller_lock);
256 mask = 0xff << shift; 257 mask = 0xff << shift;
257 bit = gic_cpu_map[cpu] << shift; 258 bit = gic_cpu_map[cpu] << shift;
258
259 raw_spin_lock(&irq_controller_lock);
260 val = readl_relaxed(reg) & ~mask; 259 val = readl_relaxed(reg) & ~mask;
261 writel_relaxed(val | bit, reg); 260 writel_relaxed(val | bit, reg);
262 raw_spin_unlock(&irq_controller_lock); 261 raw_spin_unlock(&irq_controller_lock);
@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic)
652void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) 651void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
653{ 652{
654 int cpu; 653 int cpu;
655 unsigned long map = 0; 654 unsigned long flags, map = 0;
655
656 raw_spin_lock_irqsave(&irq_controller_lock, flags);
656 657
657 /* Convert our logical CPU mask into a physical one. */ 658 /* Convert our logical CPU mask into a physical one. */
658 for_each_cpu(cpu, mask) 659 for_each_cpu(cpu, mask)
@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
666 667
667 /* this always happens on GIC0 */ 668 /* this always happens on GIC0 */
668 writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); 669 writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
670
671 raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
672}
673#endif
674
675#ifdef CONFIG_BL_SWITCHER
676/*
 677 * gic_send_sgi - send an SGI directly to the given CPU interface number
678 *
679 * cpu_id: the ID for the destination CPU interface
680 * irq: the IPI number to send a SGI for
681 */
682void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
683{
684 BUG_ON(cpu_id >= NR_GIC_CPU_IF);
685 cpu_id = 1 << cpu_id;
686 /* this always happens on GIC0 */
687 writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
688}
689
690/*
691 * gic_get_cpu_id - get the CPU interface ID for the specified CPU
692 *
693 * @cpu: the logical CPU number to get the GIC ID for.
694 *
695 * Return the CPU interface ID for the given logical CPU number,
696 * or -1 if the CPU number is too large or the interface ID is
697 * unknown (more than one bit set).
698 */
699int gic_get_cpu_id(unsigned int cpu)
700{
701 unsigned int cpu_bit;
702
703 if (cpu >= NR_GIC_CPU_IF)
704 return -1;
705 cpu_bit = gic_cpu_map[cpu];
706 if (cpu_bit & (cpu_bit - 1))
707 return -1;
708 return __ffs(cpu_bit);
669} 709}
710
711/*
712 * gic_migrate_target - migrate IRQs to another CPU interface
713 *
714 * @new_cpu_id: the CPU target ID to migrate IRQs to
715 *
716 * Migrate all peripheral interrupts with a target matching the current CPU
717 * to the interface corresponding to @new_cpu_id. The CPU interface mapping
718 * is also updated. Targets to other CPU interfaces are unchanged.
719 * This must be called with IRQs locally disabled.
720 */
721void gic_migrate_target(unsigned int new_cpu_id)
722{
723 unsigned int cur_cpu_id, gic_irqs, gic_nr = 0;
724 void __iomem *dist_base;
725 int i, ror_val, cpu = smp_processor_id();
726 u32 val, cur_target_mask, active_mask;
727
728 if (gic_nr >= MAX_GIC_NR)
729 BUG();
730
731 dist_base = gic_data_dist_base(&gic_data[gic_nr]);
732 if (!dist_base)
733 return;
734 gic_irqs = gic_data[gic_nr].gic_irqs;
735
736 cur_cpu_id = __ffs(gic_cpu_map[cpu]);
737 cur_target_mask = 0x01010101 << cur_cpu_id;
738 ror_val = (cur_cpu_id - new_cpu_id) & 31;
739
740 raw_spin_lock(&irq_controller_lock);
741
742 /* Update the target interface for this logical CPU */
743 gic_cpu_map[cpu] = 1 << new_cpu_id;
744
745 /*
 746 * Find all the peripheral interrupts targeting the current
747 * CPU interface and migrate them to the new CPU interface.
748 * We skip DIST_TARGET 0 to 7 as they are read-only.
749 */
750 for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
751 val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
752 active_mask = val & cur_target_mask;
753 if (active_mask) {
754 val &= ~active_mask;
755 val |= ror32(active_mask, ror_val);
756 writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4);
757 }
758 }
759
760 raw_spin_unlock(&irq_controller_lock);
761
762 /*
763 * Now let's migrate and clear any potential SGIs that might be
764 * pending for us (cur_cpu_id). Since GIC_DIST_SGI_PENDING_SET
765 * is a banked register, we can only forward the SGI using
766 * GIC_DIST_SOFTINT. The original SGI source is lost but Linux
767 * doesn't use that information anyway.
768 *
769 * For the same reason we do not adjust SGI source information
 770 * for SGIs previously sent by us to other CPUs either.
771 */
772 for (i = 0; i < 16; i += 4) {
773 int j;
774 val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
775 if (!val)
776 continue;
777 writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
778 for (j = i; j < i + 4; j++) {
779 if (val & 0xff)
780 writel_relaxed((1 << (new_cpu_id + 16)) | j,
781 dist_base + GIC_DIST_SOFTINT);
782 val >>= 8;
783 }
784 }
785}
786
787/*
788 * gic_get_sgir_physaddr - get the physical address for the SGI register
789 *
 790 * Return the physical address of the SGI register to be used
791 * by some early assembly code when the kernel is not yet available.
792 */
793static unsigned long gic_dist_physaddr;
794
795unsigned long gic_get_sgir_physaddr(void)
796{
797 if (!gic_dist_physaddr)
798 return 0;
799 return gic_dist_physaddr + GIC_DIST_SOFTINT;
800}
801
802void __init gic_init_physaddr(struct device_node *node)
803{
804 struct resource res;
805 if (of_address_to_resource(node, 0, &res) == 0) {
806 gic_dist_physaddr = res.start;
807 pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
808 }
809}
810
811#else
812#define gic_init_physaddr(node) do { } while (0)
670#endif 813#endif
671 814
672static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, 815static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent)
850 percpu_offset = 0; 993 percpu_offset = 0;
851 994
852 gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); 995 gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
996 if (!gic_cnt)
997 gic_init_physaddr(node);
853 998
854 if (parent) { 999 if (parent) {
855 irq = irq_of_parse_and_map(node, 0); 1000 irq = irq_of_parse_and_map(node, 0);
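
gic_migrate_target() above relies on GIC_DIST_TARGET packing four 8-bit target fields
per 32-bit word: 0x01010101 << cur_cpu_id selects the current interface's bit in every
field, and ror32() by (cur - new) & 31 moves that bit to the new interface's position
in all four fields at once. The helper below only restates that bit manipulation; it
is not part of the driver.

        #include <linux/bitops.h>       /* ror32() */

        static u32 retarget_word(u32 val, unsigned int cur, unsigned int new_id)
        {
                u32 cur_mask = 0x01010101U << cur;      /* bit 'cur' in each byte  */
                u32 active   = val & cur_mask;          /* fields aimed at old i/f */

                val &= ~active;                         /* drop the old target bit */
                val |= ror32(active, (cur - new_id) & 31);      /* land on bit 'new_id' */
                return val;
        }
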
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 0e5d9ecdb2b6..cac496b1e279 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -31,6 +31,8 @@
31#define GIC_DIST_TARGET 0x800 31#define GIC_DIST_TARGET 0x800
32#define GIC_DIST_CONFIG 0xc00 32#define GIC_DIST_CONFIG 0xc00
33#define GIC_DIST_SOFTINT 0xf00 33#define GIC_DIST_SOFTINT 0xf00
34#define GIC_DIST_SGI_PENDING_CLEAR 0xf10
35#define GIC_DIST_SGI_PENDING_SET 0xf20
34 36
35#define GICH_HCR 0x0 37#define GICH_HCR 0x0
36#define GICH_VTR 0x4 38#define GICH_VTR 0x4
@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start,
74 gic_init_bases(nr, start, dist, cpu, 0, NULL); 76 gic_init_bases(nr, start, dist, cpu, 0, NULL);
75} 77}
76 78
79void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
80int gic_get_cpu_id(unsigned int cpu);
81void gic_migrate_target(unsigned int new_cpu_id);
82unsigned long gic_get_sgir_physaddr(void);
83
77#endif /* __ASSEMBLY */ 84#endif /* __ASSEMBLY */
78 85
79#endif 86#endif
diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h
new file mode 100644
index 000000000000..f76dd4de625e
--- /dev/null
+++ b/include/trace/events/power_cpu_migrate.h
@@ -0,0 +1,67 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM power
3
4#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_POWER_CPU_MIGRATE_H
6
7#include <linux/tracepoint.h>
8
9#define __cpu_migrate_proto \
10 TP_PROTO(u64 timestamp, \
11 u32 cpu_hwid)
12#define __cpu_migrate_args \
13 TP_ARGS(timestamp, \
14 cpu_hwid)
15
16DECLARE_EVENT_CLASS(cpu_migrate,
17
18 __cpu_migrate_proto,
19 __cpu_migrate_args,
20
21 TP_STRUCT__entry(
22 __field(u64, timestamp )
23 __field(u32, cpu_hwid )
24 ),
25
26 TP_fast_assign(
27 __entry->timestamp = timestamp;
28 __entry->cpu_hwid = cpu_hwid;
29 ),
30
31 TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
32 (unsigned long long)__entry->timestamp,
33 (unsigned long)__entry->cpu_hwid
34 )
35);
36
37#define __define_cpu_migrate_event(name) \
38 DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \
39 __cpu_migrate_proto, \
40 __cpu_migrate_args \
41 )
42
43__define_cpu_migrate_event(begin);
44__define_cpu_migrate_event(finish);
45__define_cpu_migrate_event(current);
46
47#undef __define_cpu_migrate
48#undef __cpu_migrate_proto
49#undef __cpu_migrate_args
50
51/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
52#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
53#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
54
55/*
56 * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
57 * a whole-cluster migration:
58 */
59#define CPU_MIGRATE_ALL_CPUS 0x80000000U
60#endif
61
62#endif /* _TRACE_POWER_CPU_MIGRATE_H */
63
64/* This part must be outside protection */
65#undef TRACE_INCLUDE_FILE
66#define TRACE_INCLUDE_FILE power_cpu_migrate
67#include <trace/define_trace.h>
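
The header above only declares the events; emitting them from C code follows the usual
tracepoint pattern. A hedged sketch is shown below; the real call sites belong to the
bL switcher code, which is not part of this hunk, and the wrapper name is invented.

        #define CREATE_TRACE_POINTS
        #include <trace/events/power_cpu_migrate.h>

        static void note_switch(u64 timestamp, u32 old_hwid, u32 new_hwid)
        {
                trace_cpu_migrate_begin(timestamp, old_hwid);
                /* ... perform the cluster/CPU switch ... */
                trace_cpu_migrate_finish(timestamp, new_hwid);
        }
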
diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile
index 15130b50dfe3..fe9b61e322a5 100644
--- a/tools/perf/arch/arm/Makefile
+++ b/tools/perf/arch/arm/Makefile
@@ -2,3 +2,6 @@ ifndef NO_DWARF
2PERF_HAVE_DWARF_REGS := 1 2PERF_HAVE_DWARF_REGS := 1
3LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o 3LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
4endif 4endif
5ifndef NO_LIBUNWIND
6LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
7endif
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
new file mode 100644
index 000000000000..2a1cfde66b69
--- /dev/null
+++ b/tools/perf/arch/arm/include/perf_regs.h
@@ -0,0 +1,54 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include "../../util/types.h"
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK	((1ULL << PERF_REG_ARM_MAX) - 1)
+#define PERF_REG_IP	PERF_REG_ARM_PC
+#define PERF_REG_SP	PERF_REG_ARM_SP
+
+static inline const char *perf_reg_name(int id)
+{
+	switch (id) {
+	case PERF_REG_ARM_R0:
+		return "r0";
+	case PERF_REG_ARM_R1:
+		return "r1";
+	case PERF_REG_ARM_R2:
+		return "r2";
+	case PERF_REG_ARM_R3:
+		return "r3";
+	case PERF_REG_ARM_R4:
+		return "r4";
+	case PERF_REG_ARM_R5:
+		return "r5";
+	case PERF_REG_ARM_R6:
+		return "r6";
+	case PERF_REG_ARM_R7:
+		return "r7";
+	case PERF_REG_ARM_R8:
+		return "r8";
+	case PERF_REG_ARM_R9:
+		return "r9";
+	case PERF_REG_ARM_R10:
+		return "r10";
+	case PERF_REG_ARM_FP:
+		return "fp";
+	case PERF_REG_ARM_IP:
+		return "ip";
+	case PERF_REG_ARM_SP:
+		return "sp";
+	case PERF_REG_ARM_LR:
+		return "lr";
+	case PERF_REG_ARM_PC:
+		return "pc";
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
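
[Editor's note: a sketch of how the perf tooling consumes these definitions is below: mask off unsupported registers with PERF_REGS_MASK and map each remaining id to its name via perf_reg_name(). The loop is illustrative only and is not part of the patch.]

#include <stdio.h>
#include "perf_regs.h"

/* Sketch only: print the name of every register perf can sample on ARM. */
static void example_dump_arm_regs(void)
{
	int id;

	for (id = 0; id < PERF_REG_ARM_MAX; id++)
		if (PERF_REGS_MASK & (1ULL << id))
			printf("%s\n", perf_reg_name(id));
}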
diff --git a/tools/perf/arch/arm/util/unwind.c b/tools/perf/arch/arm/util/unwind.c
new file mode 100644
index 000000000000..da3dc950550c
--- /dev/null
+++ b/tools/perf/arch/arm/util/unwind.c
@@ -0,0 +1,48 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+int unwind__arch_reg_id(int regnum)
+{
+	switch (regnum) {
+	case UNW_ARM_R0:
+		return PERF_REG_ARM_R0;
+	case UNW_ARM_R1:
+		return PERF_REG_ARM_R1;
+	case UNW_ARM_R2:
+		return PERF_REG_ARM_R2;
+	case UNW_ARM_R3:
+		return PERF_REG_ARM_R3;
+	case UNW_ARM_R4:
+		return PERF_REG_ARM_R4;
+	case UNW_ARM_R5:
+		return PERF_REG_ARM_R5;
+	case UNW_ARM_R6:
+		return PERF_REG_ARM_R6;
+	case UNW_ARM_R7:
+		return PERF_REG_ARM_R7;
+	case UNW_ARM_R8:
+		return PERF_REG_ARM_R8;
+	case UNW_ARM_R9:
+		return PERF_REG_ARM_R9;
+	case UNW_ARM_R10:
+		return PERF_REG_ARM_R10;
+	case UNW_ARM_R11:
+		return PERF_REG_ARM_FP;
+	case UNW_ARM_R12:
+		return PERF_REG_ARM_IP;
+	case UNW_ARM_R13:
+		return PERF_REG_ARM_SP;
+	case UNW_ARM_R14:
+		return PERF_REG_ARM_LR;
+	case UNW_ARM_R15:
+		return PERF_REG_ARM_PC;
+	default:
+		pr_err("unwind: invalid reg id %d\n", regnum);
+		return -EINVAL;
+	}
+
+	return -EINVAL;
+}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 5f6f9b3271bb..75b93d7f7860 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -29,6 +29,10 @@ ifeq ($(ARCH),x86_64)
   NO_PERF_REGS := 0
   LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
 endif
+ifeq ($(ARCH),arm)
+  NO_PERF_REGS := 0
+  LIBUNWIND_LIBS = -lunwind -lunwind-arm
+endif
 
 ifeq ($(NO_PERF_REGS),0)
   CFLAGS += -DHAVE_PERF_REGS
@@ -208,8 +212,7 @@ ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
 endif # try-cc
 endif # NO_LIBELF
 
-# There's only x86 (both 32 and 64) support for CFI unwind so far
-ifneq ($(ARCH),x86)
+ifeq ($(LIBUNWIND_LIBS),)
   NO_LIBUNWIND := 1
 endif
 
@@ -223,9 +226,13 @@ endif
 
 FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS)
 ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y)
-  msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99);
+  msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
   NO_LIBUNWIND := 1
 endif # Libunwind support
+ifneq ($(call try-cc,$(SOURCE_LIBUNWIND_DEBUG_FRAME),$(FLAGS_UNWIND),libunwind debug_frame),y)
+  msg := $(warning No debug_frame support found in libunwind);
+CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+endif # debug_frame support in libunwind
 endif # NO_LIBUNWIND
 
 ifndef NO_LIBUNWIND
diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak
index f79305739ecc..028fe997d5eb 100644
--- a/tools/perf/config/feature-tests.mak
+++ b/tools/perf/config/feature-tests.mak
@@ -185,7 +185,6 @@ extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
 				    unw_proc_info_t *pi,
 				    int need_unwind_info, void *arg);
 
-
 #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
 
 int main(void)
@@ -197,6 +196,26 @@ int main(void)
 	return 0;
 }
 endef
+
+define SOURCE_LIBUNWIND_DEBUG_FRAME
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+				 unw_word_t ip, unw_word_t segbase,
+				 const char *obj_name, unw_word_t start,
+				 unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+int main(void)
+{
+	dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
+	return 0;
+}
+endef
+
 endif
 
 ifndef NO_BACKTRACE
diff --git a/tools/perf/util/unwind.c b/tools/perf/util/unwind.c
index 2f891f7e70bf..5390d0b8862a 100644
--- a/tools/perf/util/unwind.c
+++ b/tools/perf/util/unwind.c
@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
 
 #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
 
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+				 unw_word_t ip,
+				 unw_word_t segbase,
+				 const char *obj_name, unw_word_t start,
+				 unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
 #define DW_EH_PE_FORMAT_MASK	0x0f	/* format of the encoded value */
 #define DW_EH_PE_APPL_MASK	0x70	/* how the value is to be applied */
 
@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
 	return 0;
 }
 
-static int read_unwind_spec(struct dso *dso, struct machine *machine,
-			    u64 *table_data, u64 *segbase, u64 *fde_count)
+static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
+				     u64 *table_data, u64 *segbase,
+				     u64 *fde_count)
 {
 	int ret = -EINVAL, fd;
 	u64 offset;
@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
 	if (fd < 0)
 		return -EINVAL;
 
+	/* Check the .eh_frame section for unwinding info */
 	offset = elf_section_offset(fd, ".eh_frame_hdr");
 	close(fd);
 
@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
 				     table_data, segbase,
 				     fde_count);
 
-	/* TODO .debug_frame check if eh_frame_hdr fails */
 	return ret;
 }
 
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+static int read_unwind_spec_debug_frame(struct dso *dso,
+					struct machine *machine, u64 *offset)
+{
+	int fd = dso__data_fd(dso, machine);
+
+	if (fd < 0)
+		return -EINVAL;
+
+	/* Check the .debug_frame section for unwinding info */
+	*offset = elf_section_offset(fd, ".debug_frame");
+	close(fd);
+
+	if (*offset)
+		return 0;
+
+	return -EINVAL;
+}
+#endif
+
 static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
 {
 	struct addr_location al;
@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
 
 	pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
 
-	if (read_unwind_spec(map->dso, ui->machine,
-			     &table_data, &segbase, &fde_count))
-		return -EINVAL;
+	/* Check the .eh_frame section for unwinding info */
+	if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
+				       &table_data, &segbase, &fde_count)) {
+		memset(&di, 0, sizeof(di));
+		di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
+		di.start_ip = map->start;
+		di.end_ip = map->end;
+		di.u.rti.segbase = map->start + segbase;
+		di.u.rti.table_data = map->start + table_data;
+		di.u.rti.table_len = fde_count * sizeof(struct table_entry)
+				     / sizeof(unw_word_t);
+		return dwarf_search_unwind_table(as, ip, &di, pi,
+						 need_unwind_info, arg);
+	}
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+	/* Check the .debug_frame section for unwinding info */
+	if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
+		memset(&di, 0, sizeof(di));
+		dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name,
+				       map->start, map->end);
+		return dwarf_search_unwind_table(as, ip, &di, pi,
+						 need_unwind_info, arg);
+	}
+#endif
 
-	memset(&di, 0, sizeof(di));
-	di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
-	di.start_ip = map->start;
-	di.end_ip = map->end;
-	di.u.rti.segbase = map->start + segbase;
-	di.u.rti.table_data = map->start + table_data;
-	di.u.rti.table_len = fde_count * sizeof(struct table_entry)
-			     / sizeof(unw_word_t);
-	return dwarf_search_unwind_table(as, ip, &di, pi,
-					 need_unwind_info, arg);
+	return -EINVAL;
 }
 
 static int access_fpreg(unw_addr_space_t __maybe_unused as,