96 files changed, 6391 insertions, 605 deletions
diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index f89515adac60..eb577f4f5f70 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -52,15 +52,7 @@ extern inline void *return_address(unsigned int level) | |||
52 | 52 | ||
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #define HAVE_ARCH_CALLER_ADDR | 55 | #define ftrace_return_addr(n) return_address(n) |
56 | |||
57 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
58 | #define CALLER_ADDR1 ((unsigned long)return_address(1)) | ||
59 | #define CALLER_ADDR2 ((unsigned long)return_address(2)) | ||
60 | #define CALLER_ADDR3 ((unsigned long)return_address(3)) | ||
61 | #define CALLER_ADDR4 ((unsigned long)return_address(4)) | ||
62 | #define CALLER_ADDR5 ((unsigned long)return_address(5)) | ||
63 | #define CALLER_ADDR6 ((unsigned long)return_address(6)) | ||
64 | 56 | ||
65 | #endif /* ifndef __ASSEMBLY__ */ | 57 | #endif /* ifndef __ASSEMBLY__ */ |
66 | 58 | ||
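With the per-level CALLER_ADDRn macros gone, the architecture only has to provide ftrace_return_addr(n), which on ARM maps straight to return_address(n); the common ftrace code can build the higher-level helpers on top of that single hook. A rough consumer-side sketch (kernel context assumed, not part of this patch):

	/* Hypothetical consumer sketch: grab the caller two call frames up
	 * via the new arch hook; on ARM this expands to return_address(2). */
	static unsigned long two_levels_up(void)
	{
		return (unsigned long)ftrace_return_addr(2);
	}
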
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e384ab9b3862..7295419165e1 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -30,12 +30,17 @@ config ARM64 | |||
30 | select HAVE_ARCH_JUMP_LABEL | 30 | select HAVE_ARCH_JUMP_LABEL |
31 | select HAVE_ARCH_KGDB | 31 | select HAVE_ARCH_KGDB |
32 | select HAVE_ARCH_TRACEHOOK | 32 | select HAVE_ARCH_TRACEHOOK |
33 | select HAVE_C_RECORDMCOUNT | ||
33 | select HAVE_DEBUG_BUGVERBOSE | 34 | select HAVE_DEBUG_BUGVERBOSE |
34 | select HAVE_DEBUG_KMEMLEAK | 35 | select HAVE_DEBUG_KMEMLEAK |
35 | select HAVE_DMA_API_DEBUG | 36 | select HAVE_DMA_API_DEBUG |
36 | select HAVE_DMA_ATTRS | 37 | select HAVE_DMA_ATTRS |
37 | select HAVE_DMA_CONTIGUOUS | 38 | select HAVE_DMA_CONTIGUOUS |
39 | select HAVE_DYNAMIC_FTRACE | ||
38 | select HAVE_EFFICIENT_UNALIGNED_ACCESS | 40 | select HAVE_EFFICIENT_UNALIGNED_ACCESS |
41 | select HAVE_FTRACE_MCOUNT_RECORD | ||
42 | select HAVE_FUNCTION_TRACER | ||
43 | select HAVE_FUNCTION_GRAPH_TRACER | ||
39 | select HAVE_GENERIC_DMA_COHERENT | 44 | select HAVE_GENERIC_DMA_COHERENT |
40 | select HAVE_HW_BREAKPOINT if PERF_EVENTS | 45 | select HAVE_HW_BREAKPOINT if PERF_EVENTS |
41 | select HAVE_MEMBLOCK | 46 | select HAVE_MEMBLOCK |
@@ -43,6 +48,7 @@ config ARM64 | |||
43 | select HAVE_PERF_EVENTS | 48 | select HAVE_PERF_EVENTS |
44 | select HAVE_PERF_REGS | 49 | select HAVE_PERF_REGS |
45 | select HAVE_PERF_USER_STACK_DUMP | 50 | select HAVE_PERF_USER_STACK_DUMP |
51 | select HAVE_SYSCALL_TRACEPOINTS | ||
46 | select IRQ_DOMAIN | 52 | select IRQ_DOMAIN |
47 | select MODULES_USE_ELF_RELA | 53 | select MODULES_USE_ELF_RELA |
48 | select NO_BOOTMEM | 54 | select NO_BOOTMEM |
@@ -245,6 +251,9 @@ config ARCH_WANT_HUGE_PMD_SHARE | |||
245 | config HAVE_ARCH_TRANSPARENT_HUGEPAGE | 251 | config HAVE_ARCH_TRANSPARENT_HUGEPAGE |
246 | def_bool y | 252 | def_bool y |
247 | 253 | ||
254 | config ARCH_HAS_CACHE_LINE_SIZE | ||
255 | def_bool y | ||
256 | |||
248 | source "mm/Kconfig" | 257 | source "mm/Kconfig" |
249 | 258 | ||
250 | config XEN_DOM0 | 259 | config XEN_DOM0 |
@@ -359,5 +368,8 @@ source "arch/arm64/Kconfig.debug" | |||
359 | source "security/Kconfig" | 368 | source "security/Kconfig" |
360 | 369 | ||
361 | source "crypto/Kconfig" | 370 | source "crypto/Kconfig" |
371 | if CRYPTO | ||
372 | source "arch/arm64/crypto/Kconfig" | ||
373 | endif | ||
362 | 374 | ||
363 | source "lib/Kconfig" | 375 | source "lib/Kconfig" |
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 2fceb71ac3b7..8185a913c5ed 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS | |||
45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ | 45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ |
46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ | 46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ |
47 | core-$(CONFIG_XEN) += arch/arm64/xen/ | 47 | core-$(CONFIG_XEN) += arch/arm64/xen/ |
48 | core-$(CONFIG_CRYPTO) += arch/arm64/crypto/ | ||
48 | libs-y := arch/arm64/lib/ $(libs-y) | 49 | libs-y := arch/arm64/lib/ $(libs-y) |
49 | libs-y += $(LIBGCC) | 50 | libs-y += $(LIBGCC) |
50 | 51 | ||
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 7959dd0ca5d5..157e1d8d9a47 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1,11 +1,11 @@ | |||
1 | # CONFIG_LOCALVERSION_AUTO is not set | 1 | # CONFIG_LOCALVERSION_AUTO is not set |
2 | # CONFIG_SWAP is not set | ||
3 | CONFIG_SYSVIPC=y | 2 | CONFIG_SYSVIPC=y |
4 | CONFIG_POSIX_MQUEUE=y | 3 | CONFIG_POSIX_MQUEUE=y |
4 | CONFIG_AUDIT=y | ||
5 | CONFIG_NO_HZ_IDLE=y | ||
6 | CONFIG_HIGH_RES_TIMERS=y | ||
5 | CONFIG_BSD_PROCESS_ACCT=y | 7 | CONFIG_BSD_PROCESS_ACCT=y |
6 | CONFIG_BSD_PROCESS_ACCT_V3=y | 8 | CONFIG_BSD_PROCESS_ACCT_V3=y |
7 | CONFIG_NO_HZ=y | ||
8 | CONFIG_HIGH_RES_TIMERS=y | ||
9 | CONFIG_IKCONFIG=y | 9 | CONFIG_IKCONFIG=y |
10 | CONFIG_IKCONFIG_PROC=y | 10 | CONFIG_IKCONFIG_PROC=y |
11 | CONFIG_LOG_BUF_SHIFT=14 | 11 | CONFIG_LOG_BUF_SHIFT=14 |
@@ -27,6 +27,7 @@ CONFIG_ARCH_VEXPRESS=y | |||
27 | CONFIG_ARCH_XGENE=y | 27 | CONFIG_ARCH_XGENE=y |
28 | CONFIG_SMP=y | 28 | CONFIG_SMP=y |
29 | CONFIG_PREEMPT=y | 29 | CONFIG_PREEMPT=y |
30 | CONFIG_TRANSPARENT_HUGEPAGE=y | ||
30 | CONFIG_CMA=y | 31 | CONFIG_CMA=y |
31 | CONFIG_CMDLINE="console=ttyAMA0" | 32 | CONFIG_CMDLINE="console=ttyAMA0" |
32 | # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set | 33 | # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set |
@@ -44,7 +45,7 @@ CONFIG_IP_PNP_BOOTP=y | |||
44 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" | 45 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" |
45 | CONFIG_DEVTMPFS=y | 46 | CONFIG_DEVTMPFS=y |
46 | CONFIG_DMA_CMA=y | 47 | CONFIG_DMA_CMA=y |
47 | CONFIG_SCSI=y | 48 | CONFIG_VIRTIO_BLK=y |
48 | # CONFIG_SCSI_PROC_FS is not set | 49 | # CONFIG_SCSI_PROC_FS is not set |
49 | CONFIG_BLK_DEV_SD=y | 50 | CONFIG_BLK_DEV_SD=y |
50 | # CONFIG_SCSI_LOWLEVEL is not set | 51 | # CONFIG_SCSI_LOWLEVEL is not set |
@@ -56,20 +57,18 @@ CONFIG_SMC91X=y | |||
56 | CONFIG_SMSC911X=y | 57 | CONFIG_SMSC911X=y |
57 | # CONFIG_WLAN is not set | 58 | # CONFIG_WLAN is not set |
58 | CONFIG_INPUT_EVDEV=y | 59 | CONFIG_INPUT_EVDEV=y |
59 | # CONFIG_SERIO_I8042 is not set | ||
60 | # CONFIG_SERIO_SERPORT is not set | 60 | # CONFIG_SERIO_SERPORT is not set |
61 | CONFIG_LEGACY_PTY_COUNT=16 | 61 | CONFIG_LEGACY_PTY_COUNT=16 |
62 | CONFIG_SERIAL_8250=y | 62 | CONFIG_SERIAL_8250=y |
63 | CONFIG_SERIAL_8250_CONSOLE=y | 63 | CONFIG_SERIAL_8250_CONSOLE=y |
64 | CONFIG_SERIAL_OF_PLATFORM=y | ||
65 | CONFIG_SERIAL_AMBA_PL011=y | 64 | CONFIG_SERIAL_AMBA_PL011=y |
66 | CONFIG_SERIAL_AMBA_PL011_CONSOLE=y | 65 | CONFIG_SERIAL_AMBA_PL011_CONSOLE=y |
66 | CONFIG_SERIAL_OF_PLATFORM=y | ||
67 | # CONFIG_HW_RANDOM is not set | 67 | # CONFIG_HW_RANDOM is not set |
68 | # CONFIG_HWMON is not set | 68 | # CONFIG_HWMON is not set |
69 | CONFIG_REGULATOR=y | 69 | CONFIG_REGULATOR=y |
70 | CONFIG_REGULATOR_FIXED_VOLTAGE=y | 70 | CONFIG_REGULATOR_FIXED_VOLTAGE=y |
71 | CONFIG_FB=y | 71 | CONFIG_FB=y |
72 | # CONFIG_VGA_CONSOLE is not set | ||
73 | CONFIG_FRAMEBUFFER_CONSOLE=y | 72 | CONFIG_FRAMEBUFFER_CONSOLE=y |
74 | CONFIG_LOGO=y | 73 | CONFIG_LOGO=y |
75 | # CONFIG_LOGO_LINUX_MONO is not set | 74 | # CONFIG_LOGO_LINUX_MONO is not set |
@@ -79,27 +78,38 @@ CONFIG_USB_ISP1760_HCD=y | |||
79 | CONFIG_USB_STORAGE=y | 78 | CONFIG_USB_STORAGE=y |
80 | CONFIG_MMC=y | 79 | CONFIG_MMC=y |
81 | CONFIG_MMC_ARMMMCI=y | 80 | CONFIG_MMC_ARMMMCI=y |
81 | CONFIG_VIRTIO_MMIO=y | ||
82 | # CONFIG_IOMMU_SUPPORT is not set | 82 | # CONFIG_IOMMU_SUPPORT is not set |
83 | CONFIG_EXT2_FS=y | 83 | CONFIG_EXT2_FS=y |
84 | CONFIG_EXT3_FS=y | 84 | CONFIG_EXT3_FS=y |
85 | CONFIG_EXT4_FS=y | ||
86 | # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set | 85 | # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set |
87 | # CONFIG_EXT3_FS_XATTR is not set | 86 | # CONFIG_EXT3_FS_XATTR is not set |
87 | CONFIG_EXT4_FS=y | ||
88 | CONFIG_FUSE_FS=y | 88 | CONFIG_FUSE_FS=y |
89 | CONFIG_CUSE=y | 89 | CONFIG_CUSE=y |
90 | CONFIG_VFAT_FS=y | 90 | CONFIG_VFAT_FS=y |
91 | CONFIG_TMPFS=y | 91 | CONFIG_TMPFS=y |
92 | CONFIG_HUGETLBFS=y | ||
92 | # CONFIG_MISC_FILESYSTEMS is not set | 93 | # CONFIG_MISC_FILESYSTEMS is not set |
93 | CONFIG_NFS_FS=y | 94 | CONFIG_NFS_FS=y |
94 | CONFIG_ROOT_NFS=y | 95 | CONFIG_ROOT_NFS=y |
95 | CONFIG_NLS_CODEPAGE_437=y | 96 | CONFIG_NLS_CODEPAGE_437=y |
96 | CONFIG_NLS_ISO8859_1=y | 97 | CONFIG_NLS_ISO8859_1=y |
97 | CONFIG_MAGIC_SYSRQ=y | 98 | CONFIG_VIRTUALIZATION=y |
99 | CONFIG_KVM=y | ||
100 | CONFIG_DEBUG_INFO=y | ||
98 | CONFIG_DEBUG_FS=y | 101 | CONFIG_DEBUG_FS=y |
102 | CONFIG_MAGIC_SYSRQ=y | ||
99 | CONFIG_DEBUG_KERNEL=y | 103 | CONFIG_DEBUG_KERNEL=y |
104 | CONFIG_LOCKUP_DETECTOR=y | ||
100 | # CONFIG_SCHED_DEBUG is not set | 105 | # CONFIG_SCHED_DEBUG is not set |
101 | CONFIG_DEBUG_INFO=y | ||
102 | # CONFIG_FTRACE is not set | 106 | # CONFIG_FTRACE is not set |
103 | CONFIG_ATOMIC64_SELFTEST=y | 107 | CONFIG_CRYPTO_ANSI_CPRNG=y |
104 | CONFIG_VIRTIO_MMIO=y | 108 | CONFIG_ARM64_CRYPTO=y |
105 | CONFIG_VIRTIO_BLK=y | 109 | CONFIG_CRYPTO_SHA1_ARM64_CE=y |
110 | CONFIG_CRYPTO_SHA2_ARM64_CE=y | ||
111 | CONFIG_CRYPTO_GHASH_ARM64_CE=y | ||
112 | CONFIG_CRYPTO_AES_ARM64_CE=y | ||
113 | CONFIG_CRYPTO_AES_ARM64_CE_CCM=y | ||
114 | CONFIG_CRYPTO_AES_ARM64_CE_BLK=y | ||
115 | CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y | ||
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
new file mode 100644
index 000000000000..5562652c5316
--- /dev/null
+++ b/arch/arm64/crypto/Kconfig
@@ -0,0 +1,53 @@ | |||
1 | |||
2 | menuconfig ARM64_CRYPTO | ||
3 | bool "ARM64 Accelerated Cryptographic Algorithms" | ||
4 | depends on ARM64 | ||
5 | help | ||
6 | Say Y here to choose from a selection of cryptographic algorithms | ||
7 | implemented using ARM64 specific CPU features or instructions. | ||
8 | |||
9 | if ARM64_CRYPTO | ||
10 | |||
11 | config CRYPTO_SHA1_ARM64_CE | ||
12 | tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" | ||
13 | depends on ARM64 && KERNEL_MODE_NEON | ||
14 | select CRYPTO_HASH | ||
15 | |||
16 | config CRYPTO_SHA2_ARM64_CE | ||
17 | tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" | ||
18 | depends on ARM64 && KERNEL_MODE_NEON | ||
19 | select CRYPTO_HASH | ||
20 | |||
21 | config CRYPTO_GHASH_ARM64_CE | ||
22 | tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" | ||
23 | depends on ARM64 && KERNEL_MODE_NEON | ||
24 | select CRYPTO_HASH | ||
25 | |||
26 | config CRYPTO_AES_ARM64_CE | ||
27 | tristate "AES core cipher using ARMv8 Crypto Extensions" | ||
28 | depends on ARM64 && KERNEL_MODE_NEON | ||
29 | select CRYPTO_ALGAPI | ||
30 | select CRYPTO_AES | ||
31 | |||
32 | config CRYPTO_AES_ARM64_CE_CCM | ||
33 | tristate "AES in CCM mode using ARMv8 Crypto Extensions" | ||
34 | depends on ARM64 && KERNEL_MODE_NEON | ||
35 | select CRYPTO_ALGAPI | ||
36 | select CRYPTO_AES | ||
37 | select CRYPTO_AEAD | ||
38 | |||
39 | config CRYPTO_AES_ARM64_CE_BLK | ||
40 | tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" | ||
41 | depends on ARM64 && KERNEL_MODE_NEON | ||
42 | select CRYPTO_BLKCIPHER | ||
43 | select CRYPTO_AES | ||
44 | select CRYPTO_ABLK_HELPER | ||
45 | |||
46 | config CRYPTO_AES_ARM64_NEON_BLK | ||
47 | tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" | ||
48 | depends on ARM64 && KERNEL_MODE_NEON | ||
49 | select CRYPTO_BLKCIPHER | ||
50 | select CRYPTO_AES | ||
51 | select CRYPTO_ABLK_HELPER | ||
52 | |||
53 | endif | ||
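These options only register additional implementations with the crypto API; consumers keep requesting the generic algorithm names and transparently pick up the accelerated drivers when the hardware supports them. A hedged caller-side sketch (not part of the patch):

	#include <linux/crypto.h>

	/* Usage sketch: requesting "ccm(aes)" resolves to the "ccm-aes-ce"
	 * driver (priority 300) once the module built from
	 * CRYPTO_AES_ARM64_CE_CCM is loaded and the CPU advertises the AES
	 * instructions. Error handling omitted. */
	static struct crypto_aead *get_ccm(void)
	{
		return crypto_alloc_aead("ccm(aes)", 0, 0);
	}
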
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
new file mode 100644
index 000000000000..2070a56ecc46
--- /dev/null
+++ b/arch/arm64/crypto/Makefile
@@ -0,0 +1,38 @@ | |||
1 | # | ||
2 | # linux/arch/arm64/crypto/Makefile | ||
3 | # | ||
4 | # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | # | ||
6 | # This program is free software; you can redistribute it and/or modify | ||
7 | # it under the terms of the GNU General Public License version 2 as | ||
8 | # published by the Free Software Foundation. | ||
9 | # | ||
10 | |||
11 | obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o | ||
12 | sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o | ||
13 | |||
14 | obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o | ||
15 | sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o | ||
16 | |||
17 | obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o | ||
18 | ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o | ||
19 | |||
20 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o | ||
21 | CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto | ||
22 | |||
23 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o | ||
24 | aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o | ||
25 | |||
26 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o | ||
27 | aes-ce-blk-y := aes-glue-ce.o aes-ce.o | ||
28 | |||
29 | obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o | ||
30 | aes-neon-blk-y := aes-glue-neon.o aes-neon.o | ||
31 | |||
32 | AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE | ||
33 | AFLAGS_aes-neon.o := -DINTERLEAVE=4 | ||
34 | |||
35 | CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS | ||
36 | |||
37 | $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE | ||
38 | $(call if_changed_dep,cc_o_c) | ||
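The pattern rule at the end builds aes-glue.c twice from one source file: aes-glue-ce.o is compiled with -DUSE_V8_CRYPTO_EXTENSIONS and binds to the ce_aes_* entry points at priority 300, while aes-glue-neon.o is compiled without the define and falls back to the neon_aes_* routines at priority 200 (see aes-glue.c further down). In effect:

	/* Effect of the dual build of aes-glue.c (both objects from one file): */
	#ifdef USE_V8_CRYPTO_EXTENSIONS		/* set only for aes-glue-ce.o */
	#define aes_ecb_encrypt ce_aes_ecb_encrypt	/* Crypto Extensions backend */
	#else
	#define aes_ecb_encrypt neon_aes_ecb_encrypt	/* generic NEON backend */
	#endif
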
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
new file mode 100644
index 000000000000..432e4841cd81
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -0,0 +1,222 @@ | |||
1 | /* | ||
2 | * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | |||
13 | .text | ||
14 | .arch armv8-a+crypto | ||
15 | |||
16 | /* | ||
17 | * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
18 | * u32 *macp, u8 const rk[], u32 rounds); | ||
19 | */ | ||
20 | ENTRY(ce_aes_ccm_auth_data) | ||
21 | ldr w8, [x3] /* leftover from prev round? */ | ||
22 | ld1 {v0.2d}, [x0] /* load mac */ | ||
23 | cbz w8, 1f | ||
24 | sub w8, w8, #16 | ||
25 | eor v1.16b, v1.16b, v1.16b | ||
26 | 0: ldrb w7, [x1], #1 /* get 1 byte of input */ | ||
27 | subs w2, w2, #1 | ||
28 | add w8, w8, #1 | ||
29 | ins v1.b[0], w7 | ||
30 | ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ | ||
31 | beq 8f /* out of input? */ | ||
32 | cbnz w8, 0b | ||
33 | eor v0.16b, v0.16b, v1.16b | ||
34 | 1: ld1 {v3.2d}, [x4] /* load first round key */ | ||
35 | prfm pldl1strm, [x1] | ||
36 | cmp w5, #12 /* which key size? */ | ||
37 | add x6, x4, #16 | ||
38 | sub w7, w5, #2 /* modified # of rounds */ | ||
39 | bmi 2f | ||
40 | bne 5f | ||
41 | mov v5.16b, v3.16b | ||
42 | b 4f | ||
43 | 2: mov v4.16b, v3.16b | ||
44 | ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ | ||
45 | 3: aese v0.16b, v4.16b | ||
46 | aesmc v0.16b, v0.16b | ||
47 | 4: ld1 {v3.2d}, [x6], #16 /* load next round key */ | ||
48 | aese v0.16b, v5.16b | ||
49 | aesmc v0.16b, v0.16b | ||
50 | 5: ld1 {v4.2d}, [x6], #16 /* load next round key */ | ||
51 | subs w7, w7, #3 | ||
52 | aese v0.16b, v3.16b | ||
53 | aesmc v0.16b, v0.16b | ||
54 | ld1 {v5.2d}, [x6], #16 /* load next round key */ | ||
55 | bpl 3b | ||
56 | aese v0.16b, v4.16b | ||
57 | subs w2, w2, #16 /* last data? */ | ||
58 | eor v0.16b, v0.16b, v5.16b /* final round */ | ||
59 | bmi 6f | ||
60 | ld1 {v1.16b}, [x1], #16 /* load next input block */ | ||
61 | eor v0.16b, v0.16b, v1.16b /* xor with mac */ | ||
62 | bne 1b | ||
63 | 6: st1 {v0.2d}, [x0] /* store mac */ | ||
64 | beq 10f | ||
65 | adds w2, w2, #16 | ||
66 | beq 10f | ||
67 | mov w8, w2 | ||
68 | 7: ldrb w7, [x1], #1 | ||
69 | umov w6, v0.b[0] | ||
70 | eor w6, w6, w7 | ||
71 | strb w6, [x0], #1 | ||
72 | subs w2, w2, #1 | ||
73 | beq 10f | ||
74 | ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ | ||
75 | b 7b | ||
76 | 8: mov w7, w8 | ||
77 | add w8, w8, #16 | ||
78 | 9: ext v1.16b, v1.16b, v1.16b, #1 | ||
79 | adds w7, w7, #1 | ||
80 | bne 9b | ||
81 | eor v0.16b, v0.16b, v1.16b | ||
82 | st1 {v0.2d}, [x0] | ||
83 | 10: str w8, [x3] | ||
84 | ret | ||
85 | ENDPROC(ce_aes_ccm_auth_data) | ||
86 | |||
87 | /* | ||
88 | * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], | ||
89 | * u32 rounds); | ||
90 | */ | ||
91 | ENTRY(ce_aes_ccm_final) | ||
92 | ld1 {v3.2d}, [x2], #16 /* load first round key */ | ||
93 | ld1 {v0.2d}, [x0] /* load mac */ | ||
94 | cmp w3, #12 /* which key size? */ | ||
95 | sub w3, w3, #2 /* modified # of rounds */ | ||
96 | ld1 {v1.2d}, [x1] /* load 1st ctriv */ | ||
97 | bmi 0f | ||
98 | bne 3f | ||
99 | mov v5.16b, v3.16b | ||
100 | b 2f | ||
101 | 0: mov v4.16b, v3.16b | ||
102 | 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ | ||
103 | aese v0.16b, v4.16b | ||
104 | aese v1.16b, v4.16b | ||
105 | aesmc v0.16b, v0.16b | ||
106 | aesmc v1.16b, v1.16b | ||
107 | 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ | ||
108 | aese v0.16b, v5.16b | ||
109 | aese v1.16b, v5.16b | ||
110 | aesmc v0.16b, v0.16b | ||
111 | aesmc v1.16b, v1.16b | ||
112 | 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ | ||
113 | subs w3, w3, #3 | ||
114 | aese v0.16b, v3.16b | ||
115 | aese v1.16b, v3.16b | ||
116 | aesmc v0.16b, v0.16b | ||
117 | aesmc v1.16b, v1.16b | ||
118 | bpl 1b | ||
119 | aese v0.16b, v4.16b | ||
120 | aese v1.16b, v4.16b | ||
121 | /* final round key cancels out */ | ||
122 | eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ | ||
123 | st1 {v0.2d}, [x0] /* store result */ | ||
124 | ret | ||
125 | ENDPROC(ce_aes_ccm_final) | ||
126 | |||
127 | .macro aes_ccm_do_crypt,enc | ||
128 | ldr x8, [x6, #8] /* load lower ctr */ | ||
129 | ld1 {v0.2d}, [x5] /* load mac */ | ||
130 | rev x8, x8 /* keep swabbed ctr in reg */ | ||
131 | 0: /* outer loop */ | ||
132 | ld1 {v1.1d}, [x6] /* load upper ctr */ | ||
133 | prfm pldl1strm, [x1] | ||
134 | add x8, x8, #1 | ||
135 | rev x9, x8 | ||
136 | cmp w4, #12 /* which key size? */ | ||
137 | sub w7, w4, #2 /* get modified # of rounds */ | ||
138 | ins v1.d[1], x9 /* no carry in lower ctr */ | ||
139 | ld1 {v3.2d}, [x3] /* load first round key */ | ||
140 | add x10, x3, #16 | ||
141 | bmi 1f | ||
142 | bne 4f | ||
143 | mov v5.16b, v3.16b | ||
144 | b 3f | ||
145 | 1: mov v4.16b, v3.16b | ||
146 | ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ | ||
147 | 2: /* inner loop: 3 rounds, 2x interleaved */ | ||
148 | aese v0.16b, v4.16b | ||
149 | aese v1.16b, v4.16b | ||
150 | aesmc v0.16b, v0.16b | ||
151 | aesmc v1.16b, v1.16b | ||
152 | 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ | ||
153 | aese v0.16b, v5.16b | ||
154 | aese v1.16b, v5.16b | ||
155 | aesmc v0.16b, v0.16b | ||
156 | aesmc v1.16b, v1.16b | ||
157 | 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ | ||
158 | subs w7, w7, #3 | ||
159 | aese v0.16b, v3.16b | ||
160 | aese v1.16b, v3.16b | ||
161 | aesmc v0.16b, v0.16b | ||
162 | aesmc v1.16b, v1.16b | ||
163 | ld1 {v5.2d}, [x10], #16 /* load next round key */ | ||
164 | bpl 2b | ||
165 | aese v0.16b, v4.16b | ||
166 | aese v1.16b, v4.16b | ||
167 | subs w2, w2, #16 | ||
168 | bmi 6f /* partial block? */ | ||
169 | ld1 {v2.16b}, [x1], #16 /* load next input block */ | ||
170 | .if \enc == 1 | ||
171 | eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ | ||
172 | eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ | ||
173 | .else | ||
174 | eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ | ||
175 | eor v1.16b, v2.16b, v5.16b /* final round enc */ | ||
176 | .endif | ||
177 | eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ | ||
178 | st1 {v1.16b}, [x0], #16 /* write output block */ | ||
179 | bne 0b | ||
180 | rev x8, x8 | ||
181 | st1 {v0.2d}, [x5] /* store mac */ | ||
182 | str x8, [x6, #8] /* store lsb end of ctr (BE) */ | ||
183 | 5: ret | ||
184 | |||
185 | 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ | ||
186 | eor v1.16b, v1.16b, v5.16b /* final round enc */ | ||
187 | st1 {v0.2d}, [x5] /* store mac */ | ||
188 | add w2, w2, #16 /* process partial tail block */ | ||
189 | 7: ldrb w9, [x1], #1 /* get 1 byte of input */ | ||
190 | umov w6, v1.b[0] /* get top crypted ctr byte */ | ||
191 | umov w7, v0.b[0] /* get top mac byte */ | ||
192 | .if \enc == 1 | ||
193 | eor w7, w7, w9 | ||
194 | eor w9, w9, w6 | ||
195 | .else | ||
196 | eor w9, w9, w6 | ||
197 | eor w7, w7, w9 | ||
198 | .endif | ||
199 | strb w9, [x0], #1 /* store out byte */ | ||
200 | strb w7, [x5], #1 /* store mac byte */ | ||
201 | subs w2, w2, #1 | ||
202 | beq 5b | ||
203 | ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ | ||
204 | ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ | ||
205 | b 7b | ||
206 | .endm | ||
207 | |||
208 | /* | ||
209 | * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
210 | * u8 const rk[], u32 rounds, u8 mac[], | ||
211 | * u8 ctr[]); | ||
212 | * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
213 | * u8 const rk[], u32 rounds, u8 mac[], | ||
214 | * u8 ctr[]); | ||
215 | */ | ||
216 | ENTRY(ce_aes_ccm_encrypt) | ||
217 | aes_ccm_do_crypt 1 | ||
218 | ENDPROC(ce_aes_ccm_encrypt) | ||
219 | |||
220 | ENTRY(ce_aes_ccm_decrypt) | ||
221 | aes_ccm_do_crypt 0 | ||
222 | ENDPROC(ce_aes_ccm_decrypt) | ||
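One detail of aes_ccm_do_crypt worth spelling out is the counter handling: the low 64 bits of the big-endian CCM counter block are kept CPU-endian in a register ("rev x8") and bumped once per block ("add x8, x8, #1"), while the upper 8 bytes are reloaded unchanged every iteration; since CCM's length field L is at most 8 bytes, only that lower half ever changes. A hedged C model of the same step (not part of the patch):

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/* Sketch of the per-block counter stepping done in assembly above. */
	static void ccm_step_ctr_model(u8 ctr[16])
	{
		u64 lo = get_unaligned_be64(ctr + 8);

		put_unaligned_be64(lo + 1, ctr + 8);
	}
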
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
new file mode 100644
index 000000000000..9e6cdde9b43d
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -0,0 +1,297 @@ | |||
1 | /* | ||
2 | * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/aes.h> | ||
14 | #include <crypto/algapi.h> | ||
15 | #include <crypto/scatterwalk.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
20 | { | ||
21 | /* | ||
22 | * # of rounds specified by AES: | ||
23 | * 128 bit key 10 rounds | ||
24 | * 192 bit key 12 rounds | ||
25 | * 256 bit key 14 rounds | ||
26 | * => n byte key => 6 + (n/4) rounds | ||
27 | */ | ||
28 | return 6 + ctx->key_length / 4; | ||
29 | } | ||
30 | |||
31 | asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
32 | u32 *macp, u32 const rk[], u32 rounds); | ||
33 | |||
34 | asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
35 | u32 const rk[], u32 rounds, u8 mac[], | ||
36 | u8 ctr[]); | ||
37 | |||
38 | asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
39 | u32 const rk[], u32 rounds, u8 mac[], | ||
40 | u8 ctr[]); | ||
41 | |||
42 | asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], | ||
43 | u32 rounds); | ||
44 | |||
45 | static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, | ||
46 | unsigned int key_len) | ||
47 | { | ||
48 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); | ||
49 | int ret; | ||
50 | |||
51 | ret = crypto_aes_expand_key(ctx, in_key, key_len); | ||
52 | if (!ret) | ||
53 | return 0; | ||
54 | |||
55 | tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
56 | return -EINVAL; | ||
57 | } | ||
58 | |||
59 | static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) | ||
60 | { | ||
61 | if ((authsize & 1) || authsize < 4) | ||
62 | return -EINVAL; | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) | ||
67 | { | ||
68 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
69 | __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; | ||
70 | u32 l = req->iv[0] + 1; | ||
71 | |||
72 | /* verify that CCM dimension 'L' is set correctly in the IV */ | ||
73 | if (l < 2 || l > 8) | ||
74 | return -EINVAL; | ||
75 | |||
76 | /* verify that msglen can in fact be represented in L bytes */ | ||
77 | if (l < 4 && msglen >> (8 * l)) | ||
78 | return -EOVERFLOW; | ||
79 | |||
80 | /* | ||
81 | * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi | ||
82 | * uses a u32 type to represent msglen so the top 4 bytes are always 0. | ||
83 | */ | ||
84 | n[0] = 0; | ||
85 | n[1] = cpu_to_be32(msglen); | ||
86 | |||
87 | memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); | ||
88 | |||
89 | /* | ||
90 | * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) | ||
91 | * - bits 0..2 : max # of bytes required to represent msglen, minus 1 | ||
92 | * (already set by caller) | ||
93 | * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) | ||
94 | * - bit 6 : indicates presence of authenticate-only data | ||
95 | */ | ||
96 | maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; | ||
97 | if (req->assoclen) | ||
98 | maciv[0] |= 0x40; | ||
99 | |||
100 | memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) | ||
105 | { | ||
106 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
107 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
108 | struct __packed { __be16 l; __be32 h; u16 len; } ltag; | ||
109 | struct scatter_walk walk; | ||
110 | u32 len = req->assoclen; | ||
111 | u32 macp = 0; | ||
112 | |||
113 | /* prepend the AAD with a length tag */ | ||
114 | if (len < 0xff00) { | ||
115 | ltag.l = cpu_to_be16(len); | ||
116 | ltag.len = 2; | ||
117 | } else { | ||
118 | ltag.l = cpu_to_be16(0xfffe); | ||
119 | put_unaligned_be32(len, &ltag.h); | ||
120 | ltag.len = 6; | ||
121 | } | ||
122 | |||
123 | ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc, | ||
124 | num_rounds(ctx)); | ||
125 | scatterwalk_start(&walk, req->assoc); | ||
126 | |||
127 | do { | ||
128 | u32 n = scatterwalk_clamp(&walk, len); | ||
129 | u8 *p; | ||
130 | |||
131 | if (!n) { | ||
132 | scatterwalk_start(&walk, sg_next(walk.sg)); | ||
133 | n = scatterwalk_clamp(&walk, len); | ||
134 | } | ||
135 | p = scatterwalk_map(&walk); | ||
136 | ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, | ||
137 | num_rounds(ctx)); | ||
138 | len -= n; | ||
139 | |||
140 | scatterwalk_unmap(p); | ||
141 | scatterwalk_advance(&walk, n); | ||
142 | scatterwalk_done(&walk, 0, len); | ||
143 | } while (len); | ||
144 | } | ||
145 | |||
146 | static int ccm_encrypt(struct aead_request *req) | ||
147 | { | ||
148 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
149 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
150 | struct blkcipher_desc desc = { .info = req->iv }; | ||
151 | struct blkcipher_walk walk; | ||
152 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
153 | u8 buf[AES_BLOCK_SIZE]; | ||
154 | u32 len = req->cryptlen; | ||
155 | int err; | ||
156 | |||
157 | err = ccm_init_mac(req, mac, len); | ||
158 | if (err) | ||
159 | return err; | ||
160 | |||
161 | kernel_neon_begin_partial(6); | ||
162 | |||
163 | if (req->assoclen) | ||
164 | ccm_calculate_auth_mac(req, mac); | ||
165 | |||
166 | /* preserve the original iv for the final round */ | ||
167 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
168 | |||
169 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
170 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
171 | AES_BLOCK_SIZE); | ||
172 | |||
173 | while (walk.nbytes) { | ||
174 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
175 | |||
176 | if (walk.nbytes == len) | ||
177 | tail = 0; | ||
178 | |||
179 | ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
180 | walk.nbytes - tail, ctx->key_enc, | ||
181 | num_rounds(ctx), mac, walk.iv); | ||
182 | |||
183 | len -= walk.nbytes - tail; | ||
184 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
185 | } | ||
186 | if (!err) | ||
187 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
188 | |||
189 | kernel_neon_end(); | ||
190 | |||
191 | if (err) | ||
192 | return err; | ||
193 | |||
194 | /* copy authtag to end of dst */ | ||
195 | scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, | ||
196 | crypto_aead_authsize(aead), 1); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static int ccm_decrypt(struct aead_request *req) | ||
202 | { | ||
203 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
204 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
205 | unsigned int authsize = crypto_aead_authsize(aead); | ||
206 | struct blkcipher_desc desc = { .info = req->iv }; | ||
207 | struct blkcipher_walk walk; | ||
208 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
209 | u8 buf[AES_BLOCK_SIZE]; | ||
210 | u32 len = req->cryptlen - authsize; | ||
211 | int err; | ||
212 | |||
213 | err = ccm_init_mac(req, mac, len); | ||
214 | if (err) | ||
215 | return err; | ||
216 | |||
217 | kernel_neon_begin_partial(6); | ||
218 | |||
219 | if (req->assoclen) | ||
220 | ccm_calculate_auth_mac(req, mac); | ||
221 | |||
222 | /* preserve the original iv for the final round */ | ||
223 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
224 | |||
225 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
226 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
227 | AES_BLOCK_SIZE); | ||
228 | |||
229 | while (walk.nbytes) { | ||
230 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
231 | |||
232 | if (walk.nbytes == len) | ||
233 | tail = 0; | ||
234 | |||
235 | ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
236 | walk.nbytes - tail, ctx->key_enc, | ||
237 | num_rounds(ctx), mac, walk.iv); | ||
238 | |||
239 | len -= walk.nbytes - tail; | ||
240 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
241 | } | ||
242 | if (!err) | ||
243 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
244 | |||
245 | kernel_neon_end(); | ||
246 | |||
247 | if (err) | ||
248 | return err; | ||
249 | |||
250 | /* compare calculated auth tag with the stored one */ | ||
251 | scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, | ||
252 | authsize, 0); | ||
253 | |||
254 | if (memcmp(mac, buf, authsize)) | ||
255 | return -EBADMSG; | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static struct crypto_alg ccm_aes_alg = { | ||
260 | .cra_name = "ccm(aes)", | ||
261 | .cra_driver_name = "ccm-aes-ce", | ||
262 | .cra_priority = 300, | ||
263 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | ||
264 | .cra_blocksize = 1, | ||
265 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
266 | .cra_alignmask = 7, | ||
267 | .cra_type = &crypto_aead_type, | ||
268 | .cra_module = THIS_MODULE, | ||
269 | .cra_aead = { | ||
270 | .ivsize = AES_BLOCK_SIZE, | ||
271 | .maxauthsize = AES_BLOCK_SIZE, | ||
272 | .setkey = ccm_setkey, | ||
273 | .setauthsize = ccm_setauthsize, | ||
274 | .encrypt = ccm_encrypt, | ||
275 | .decrypt = ccm_decrypt, | ||
276 | } | ||
277 | }; | ||
278 | |||
279 | static int __init aes_mod_init(void) | ||
280 | { | ||
281 | if (!(elf_hwcap & HWCAP_AES)) | ||
282 | return -ENODEV; | ||
283 | return crypto_register_alg(&ccm_aes_alg); | ||
284 | } | ||
285 | |||
286 | static void __exit aes_mod_exit(void) | ||
287 | { | ||
288 | crypto_unregister_alg(&ccm_aes_alg); | ||
289 | } | ||
290 | |||
291 | module_init(aes_mod_init); | ||
292 | module_exit(aes_mod_exit); | ||
293 | |||
294 | MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); | ||
295 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
296 | MODULE_LICENSE("GPL v2"); | ||
297 | MODULE_ALIAS("ccm(aes)"); | ||
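Two bits of arithmetic in this glue code are easy to misread, so here they are worked out (informational only, nothing below is part of the patch):

	/* num_rounds():  6 + key_length / 4
	 *   AES-128: 6 + 16/4 = 10   AES-192: 6 + 24/4 = 12   AES-256: 6 + 32/4 = 14
	 *
	 * B_0 flags byte in ccm_init_mac():  (authsize - 2) << 2
	 *   e.g. authsize = 12  ->  (12 - 2) << 2 = 40 = ((12 - 2) / 2) << 3,
	 *   i.e. the standard CCM encoding of (t - 2)/2 in bits 3..5; shifting
	 *   by 2 instead of "divide by 2 then shift by 3" works because
	 *   ccm_setauthsize() rejects odd tag sizes. */
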
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
new file mode 100644
index 000000000000..2075e1acae6b
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <crypto/aes.h> | ||
13 | #include <linux/cpufeature.h> | ||
14 | #include <linux/crypto.h> | ||
15 | #include <linux/module.h> | ||
16 | |||
17 | MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); | ||
18 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
19 | MODULE_LICENSE("GPL v2"); | ||
20 | |||
21 | struct aes_block { | ||
22 | u8 b[AES_BLOCK_SIZE]; | ||
23 | }; | ||
24 | |||
25 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
26 | { | ||
27 | /* | ||
28 | * # of rounds specified by AES: | ||
29 | * 128 bit key 10 rounds | ||
30 | * 192 bit key 12 rounds | ||
31 | * 256 bit key 14 rounds | ||
32 | * => n byte key => 6 + (n/4) rounds | ||
33 | */ | ||
34 | return 6 + ctx->key_length / 4; | ||
35 | } | ||
36 | |||
37 | static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
38 | { | ||
39 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
40 | struct aes_block *out = (struct aes_block *)dst; | ||
41 | struct aes_block const *in = (struct aes_block *)src; | ||
42 | void *dummy0; | ||
43 | int dummy1; | ||
44 | |||
45 | kernel_neon_begin_partial(4); | ||
46 | |||
47 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
48 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
49 | " cmp %w[rounds], #10 ;" | ||
50 | " bmi 0f ;" | ||
51 | " bne 3f ;" | ||
52 | " mov v3.16b, v1.16b ;" | ||
53 | " b 2f ;" | ||
54 | "0: mov v2.16b, v1.16b ;" | ||
55 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
56 | "1: aese v0.16b, v2.16b ;" | ||
57 | " aesmc v0.16b, v0.16b ;" | ||
58 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
59 | " aese v0.16b, v3.16b ;" | ||
60 | " aesmc v0.16b, v0.16b ;" | ||
61 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
62 | " subs %w[rounds], %w[rounds], #3 ;" | ||
63 | " aese v0.16b, v1.16b ;" | ||
64 | " aesmc v0.16b, v0.16b ;" | ||
65 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
66 | " bpl 1b ;" | ||
67 | " aese v0.16b, v2.16b ;" | ||
68 | " eor v0.16b, v0.16b, v3.16b ;" | ||
69 | " st1 {v0.16b}, %[out] ;" | ||
70 | |||
71 | : [out] "=Q"(*out), | ||
72 | [key] "=r"(dummy0), | ||
73 | [rounds] "=r"(dummy1) | ||
74 | : [in] "Q"(*in), | ||
75 | "1"(ctx->key_enc), | ||
76 | "2"(num_rounds(ctx) - 2) | ||
77 | : "cc"); | ||
78 | |||
79 | kernel_neon_end(); | ||
80 | } | ||
81 | |||
82 | static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
83 | { | ||
84 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
85 | struct aes_block *out = (struct aes_block *)dst; | ||
86 | struct aes_block const *in = (struct aes_block *)src; | ||
87 | void *dummy0; | ||
88 | int dummy1; | ||
89 | |||
90 | kernel_neon_begin_partial(4); | ||
91 | |||
92 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
93 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
94 | " cmp %w[rounds], #10 ;" | ||
95 | " bmi 0f ;" | ||
96 | " bne 3f ;" | ||
97 | " mov v3.16b, v1.16b ;" | ||
98 | " b 2f ;" | ||
99 | "0: mov v2.16b, v1.16b ;" | ||
100 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
101 | "1: aesd v0.16b, v2.16b ;" | ||
102 | " aesimc v0.16b, v0.16b ;" | ||
103 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
104 | " aesd v0.16b, v3.16b ;" | ||
105 | " aesimc v0.16b, v0.16b ;" | ||
106 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
107 | " subs %w[rounds], %w[rounds], #3 ;" | ||
108 | " aesd v0.16b, v1.16b ;" | ||
109 | " aesimc v0.16b, v0.16b ;" | ||
110 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
111 | " bpl 1b ;" | ||
112 | " aesd v0.16b, v2.16b ;" | ||
113 | " eor v0.16b, v0.16b, v3.16b ;" | ||
114 | " st1 {v0.16b}, %[out] ;" | ||
115 | |||
116 | : [out] "=Q"(*out), | ||
117 | [key] "=r"(dummy0), | ||
118 | [rounds] "=r"(dummy1) | ||
119 | : [in] "Q"(*in), | ||
120 | "1"(ctx->key_dec), | ||
121 | "2"(num_rounds(ctx) - 2) | ||
122 | : "cc"); | ||
123 | |||
124 | kernel_neon_end(); | ||
125 | } | ||
126 | |||
127 | static struct crypto_alg aes_alg = { | ||
128 | .cra_name = "aes", | ||
129 | .cra_driver_name = "aes-ce", | ||
130 | .cra_priority = 300, | ||
131 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | ||
132 | .cra_blocksize = AES_BLOCK_SIZE, | ||
133 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
134 | .cra_module = THIS_MODULE, | ||
135 | .cra_cipher = { | ||
136 | .cia_min_keysize = AES_MIN_KEY_SIZE, | ||
137 | .cia_max_keysize = AES_MAX_KEY_SIZE, | ||
138 | .cia_setkey = crypto_aes_set_key, | ||
139 | .cia_encrypt = aes_cipher_encrypt, | ||
140 | .cia_decrypt = aes_cipher_decrypt | ||
141 | } | ||
142 | }; | ||
143 | |||
144 | static int __init aes_mod_init(void) | ||
145 | { | ||
146 | return crypto_register_alg(&aes_alg); | ||
147 | } | ||
148 | |||
149 | static void __exit aes_mod_exit(void) | ||
150 | { | ||
151 | crypto_unregister_alg(&aes_alg); | ||
152 | } | ||
153 | |||
154 | module_cpu_feature_match(AES, aes_mod_init); | ||
155 | module_exit(aes_mod_exit); | ||
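The asm blocks above declare dummy outputs ("=r"(dummy0), "=r"(dummy1)) tied back to the key pointer and round count ("1"(ctx->key_enc), "2"(num_rounds(ctx) - 2)) so the compiler knows those registers are consumed by the post-indexed loads and the "subs". The same contract is usually written today with "+r" read-write operands; a tiny self-contained illustration of that constraint style (AArch64 only, not from this patch):

	/* Minimal read-write operand example: x is both input and output of
	 * the asm, which is exactly what the tied dummy operands in
	 * aes_cipher_encrypt()/aes_cipher_decrypt() express. */
	static inline unsigned long incr_asm(unsigned long x)
	{
		asm("add %0, %0, #1" : "+r"(x));
		return x;
	}
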
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..685a18f731eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,133 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with | ||
3 | * Crypto Extensions | ||
4 | * | ||
5 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | |||
14 | #define AES_ENTRY(func) ENTRY(ce_ ## func) | ||
15 | #define AES_ENDPROC(func) ENDPROC(ce_ ## func) | ||
16 | |||
17 | .arch armv8-a+crypto | ||
18 | |||
19 | /* preload all round keys */ | ||
20 | .macro load_round_keys, rounds, rk | ||
21 | cmp \rounds, #12 | ||
22 | blo 2222f /* 128 bits */ | ||
23 | beq 1111f /* 192 bits */ | ||
24 | ld1 {v17.16b-v18.16b}, [\rk], #32 | ||
25 | 1111: ld1 {v19.16b-v20.16b}, [\rk], #32 | ||
26 | 2222: ld1 {v21.16b-v24.16b}, [\rk], #64 | ||
27 | ld1 {v25.16b-v28.16b}, [\rk], #64 | ||
28 | ld1 {v29.16b-v31.16b}, [\rk] | ||
29 | .endm | ||
30 | |||
31 | /* prepare for encryption with key in rk[] */ | ||
32 | .macro enc_prepare, rounds, rk, ignore | ||
33 | load_round_keys \rounds, \rk | ||
34 | .endm | ||
35 | |||
36 | /* prepare for encryption (again) but with new key in rk[] */ | ||
37 | .macro enc_switch_key, rounds, rk, ignore | ||
38 | load_round_keys \rounds, \rk | ||
39 | .endm | ||
40 | |||
41 | /* prepare for decryption with key in rk[] */ | ||
42 | .macro dec_prepare, rounds, rk, ignore | ||
43 | load_round_keys \rounds, \rk | ||
44 | .endm | ||
45 | |||
46 | .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 | ||
47 | aes\de \i0\().16b, \k\().16b | ||
48 | .ifnb \i1 | ||
49 | aes\de \i1\().16b, \k\().16b | ||
50 | .ifnb \i3 | ||
51 | aes\de \i2\().16b, \k\().16b | ||
52 | aes\de \i3\().16b, \k\().16b | ||
53 | .endif | ||
54 | .endif | ||
55 | aes\mc \i0\().16b, \i0\().16b | ||
56 | .ifnb \i1 | ||
57 | aes\mc \i1\().16b, \i1\().16b | ||
58 | .ifnb \i3 | ||
59 | aes\mc \i2\().16b, \i2\().16b | ||
60 | aes\mc \i3\().16b, \i3\().16b | ||
61 | .endif | ||
62 | .endif | ||
63 | .endm | ||
64 | |||
65 | /* up to 4 interleaved encryption rounds with the same round key */ | ||
66 | .macro round_Nx, enc, k, i0, i1, i2, i3 | ||
67 | .ifc \enc, e | ||
68 | do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 | ||
69 | .else | ||
70 | do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 | ||
71 | .endif | ||
72 | .endm | ||
73 | |||
74 | /* up to 4 interleaved final rounds */ | ||
75 | .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 | ||
76 | aes\de \i0\().16b, \k\().16b | ||
77 | .ifnb \i1 | ||
78 | aes\de \i1\().16b, \k\().16b | ||
79 | .ifnb \i3 | ||
80 | aes\de \i2\().16b, \k\().16b | ||
81 | aes\de \i3\().16b, \k\().16b | ||
82 | .endif | ||
83 | .endif | ||
84 | eor \i0\().16b, \i0\().16b, \k2\().16b | ||
85 | .ifnb \i1 | ||
86 | eor \i1\().16b, \i1\().16b, \k2\().16b | ||
87 | .ifnb \i3 | ||
88 | eor \i2\().16b, \i2\().16b, \k2\().16b | ||
89 | eor \i3\().16b, \i3\().16b, \k2\().16b | ||
90 | .endif | ||
91 | .endif | ||
92 | .endm | ||
93 | |||
94 | /* up to 4 interleaved blocks */ | ||
95 | .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 | ||
96 | cmp \rounds, #12 | ||
97 | blo 2222f /* 128 bits */ | ||
98 | beq 1111f /* 192 bits */ | ||
99 | round_Nx \enc, v17, \i0, \i1, \i2, \i3 | ||
100 | round_Nx \enc, v18, \i0, \i1, \i2, \i3 | ||
101 | 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 | ||
102 | round_Nx \enc, v20, \i0, \i1, \i2, \i3 | ||
103 | 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 | ||
104 | round_Nx \enc, \key, \i0, \i1, \i2, \i3 | ||
105 | .endr | ||
106 | fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 | ||
107 | .endm | ||
108 | |||
109 | .macro encrypt_block, in, rounds, t0, t1, t2 | ||
110 | do_block_Nx e, \rounds, \in | ||
111 | .endm | ||
112 | |||
113 | .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
114 | do_block_Nx e, \rounds, \i0, \i1 | ||
115 | .endm | ||
116 | |||
117 | .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
118 | do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 | ||
119 | .endm | ||
120 | |||
121 | .macro decrypt_block, in, rounds, t0, t1, t2 | ||
122 | do_block_Nx d, \rounds, \in | ||
123 | .endm | ||
124 | |||
125 | .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
126 | do_block_Nx d, \rounds, \i0, \i1 | ||
127 | .endm | ||
128 | |||
129 | .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
130 | do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 | ||
131 | .endm | ||
132 | |||
133 | #include "aes-modes.S" | ||
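load_round_keys and do_block_Nx agree on a fixed register assignment for the expanded key, loading only the registers a given key size needs so that the common tail of the schedule is identical for all sizes. The breakdown, derived from the macros above (informational only):

	/* Round-key registers assumed by load_round_keys / do_block_Nx:
	 *   v17-v18  first two round keys   (256-bit keys only)      2 rounds
	 *   v19-v20  next two round keys    (192- and 256-bit keys)  2 rounds
	 *   v21-v29  middle round keys      (all key sizes)          9 rounds
	 *   v30      last full round        v31  final add-round-key
	 * Totals: 9 + 1 = 10 (AES-128), 2 + 9 + 1 = 12 (AES-192),
	 *         2 + 2 + 9 + 1 = 14 (AES-256), matching the rounds argument. */
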
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/hwcap.h> | ||
13 | #include <crypto/aes.h> | ||
14 | #include <crypto/ablk_helper.h> | ||
15 | #include <crypto/algapi.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/cpufeature.h> | ||
18 | |||
19 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
20 | #define MODE "ce" | ||
21 | #define PRIO 300 | ||
22 | #define aes_ecb_encrypt ce_aes_ecb_encrypt | ||
23 | #define aes_ecb_decrypt ce_aes_ecb_decrypt | ||
24 | #define aes_cbc_encrypt ce_aes_cbc_encrypt | ||
25 | #define aes_cbc_decrypt ce_aes_cbc_decrypt | ||
26 | #define aes_ctr_encrypt ce_aes_ctr_encrypt | ||
27 | #define aes_xts_encrypt ce_aes_xts_encrypt | ||
28 | #define aes_xts_decrypt ce_aes_xts_decrypt | ||
29 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); | ||
30 | #else | ||
31 | #define MODE "neon" | ||
32 | #define PRIO 200 | ||
33 | #define aes_ecb_encrypt neon_aes_ecb_encrypt | ||
34 | #define aes_ecb_decrypt neon_aes_ecb_decrypt | ||
35 | #define aes_cbc_encrypt neon_aes_cbc_encrypt | ||
36 | #define aes_cbc_decrypt neon_aes_cbc_decrypt | ||
37 | #define aes_ctr_encrypt neon_aes_ctr_encrypt | ||
38 | #define aes_xts_encrypt neon_aes_xts_encrypt | ||
39 | #define aes_xts_decrypt neon_aes_xts_decrypt | ||
40 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); | ||
41 | MODULE_ALIAS("ecb(aes)"); | ||
42 | MODULE_ALIAS("cbc(aes)"); | ||
43 | MODULE_ALIAS("ctr(aes)"); | ||
44 | MODULE_ALIAS("xts(aes)"); | ||
45 | #endif | ||
46 | |||
47 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
48 | MODULE_LICENSE("GPL v2"); | ||
49 | |||
50 | /* defined in aes-modes.S */ | ||
51 | asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
52 | int rounds, int blocks, int first); | ||
53 | asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
54 | int rounds, int blocks, int first); | ||
55 | |||
56 | asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
57 | int rounds, int blocks, u8 iv[], int first); | ||
58 | asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
59 | int rounds, int blocks, u8 iv[], int first); | ||
60 | |||
61 | asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
62 | int rounds, int blocks, u8 ctr[], int first); | ||
63 | |||
64 | asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
65 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
66 | int first); | ||
67 | asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
68 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
69 | int first); | ||
70 | |||
71 | struct crypto_aes_xts_ctx { | ||
72 | struct crypto_aes_ctx key1; | ||
73 | struct crypto_aes_ctx __aligned(8) key2; | ||
74 | }; | ||
75 | |||
76 | static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
77 | unsigned int key_len) | ||
78 | { | ||
79 | struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); | ||
80 | int ret; | ||
81 | |||
82 | ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); | ||
83 | if (!ret) | ||
84 | ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], | ||
85 | key_len / 2); | ||
86 | if (!ret) | ||
87 | return 0; | ||
88 | |||
89 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
90 | return -EINVAL; | ||
91 | } | ||
92 | |||
93 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
94 | struct scatterlist *src, unsigned int nbytes) | ||
95 | { | ||
96 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
97 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
98 | struct blkcipher_walk walk; | ||
99 | unsigned int blocks; | ||
100 | |||
101 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
102 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
103 | err = blkcipher_walk_virt(desc, &walk); | ||
104 | |||
105 | kernel_neon_begin(); | ||
106 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
107 | aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
108 | (u8 *)ctx->key_enc, rounds, blocks, first); | ||
109 | err = blkcipher_walk_done(desc, &walk, 0); | ||
110 | } | ||
111 | kernel_neon_end(); | ||
112 | return err; | ||
113 | } | ||
114 | |||
115 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
116 | struct scatterlist *src, unsigned int nbytes) | ||
117 | { | ||
118 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
119 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
120 | struct blkcipher_walk walk; | ||
121 | unsigned int blocks; | ||
122 | |||
123 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
124 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
125 | err = blkcipher_walk_virt(desc, &walk); | ||
126 | |||
127 | kernel_neon_begin(); | ||
128 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
129 | aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
130 | (u8 *)ctx->key_dec, rounds, blocks, first); | ||
131 | err = blkcipher_walk_done(desc, &walk, 0); | ||
132 | } | ||
133 | kernel_neon_end(); | ||
134 | return err; | ||
135 | } | ||
136 | |||
137 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
138 | struct scatterlist *src, unsigned int nbytes) | ||
139 | { | ||
140 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
141 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
142 | struct blkcipher_walk walk; | ||
143 | unsigned int blocks; | ||
144 | |||
145 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
146 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
147 | err = blkcipher_walk_virt(desc, &walk); | ||
148 | |||
149 | kernel_neon_begin(); | ||
150 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
151 | aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
152 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
153 | first); | ||
154 | err = blkcipher_walk_done(desc, &walk, 0); | ||
155 | } | ||
156 | kernel_neon_end(); | ||
157 | return err; | ||
158 | } | ||
159 | |||
160 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
164 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
165 | struct blkcipher_walk walk; | ||
166 | unsigned int blocks; | ||
167 | |||
168 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
169 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
170 | err = blkcipher_walk_virt(desc, &walk); | ||
171 | |||
172 | kernel_neon_begin(); | ||
173 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
174 | aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
175 | (u8 *)ctx->key_dec, rounds, blocks, walk.iv, | ||
176 | first); | ||
177 | err = blkcipher_walk_done(desc, &walk, 0); | ||
178 | } | ||
179 | kernel_neon_end(); | ||
180 | return err; | ||
181 | } | ||
182 | |||
183 | static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
184 | struct scatterlist *src, unsigned int nbytes) | ||
185 | { | ||
186 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
187 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
188 | struct blkcipher_walk walk; | ||
189 | int blocks; | ||
190 | |||
191 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
192 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
193 | err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); | ||
194 | |||
195 | first = 1; | ||
196 | kernel_neon_begin(); | ||
197 | while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { | ||
198 | aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
199 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
200 | first); | ||
201 | first = 0; | ||
202 | nbytes -= blocks * AES_BLOCK_SIZE; | ||
203 | if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) | ||
204 | break; | ||
205 | err = blkcipher_walk_done(desc, &walk, | ||
206 | walk.nbytes % AES_BLOCK_SIZE); | ||
207 | } | ||
208 | if (nbytes) { | ||
209 | u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; | ||
210 | u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; | ||
211 | u8 __aligned(8) tail[AES_BLOCK_SIZE]; | ||
212 | |||
213 | /* | ||
214 | * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need | ||
215 | * to tell aes_ctr_encrypt() to only read half a block. | ||
216 | */ | ||
217 | blocks = (nbytes <= 8) ? -1 : 1; | ||
218 | |||
219 | aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, | ||
220 | blocks, walk.iv, first); | ||
221 | memcpy(tdst, tail, nbytes); | ||
222 | err = blkcipher_walk_done(desc, &walk, 0); | ||
223 | } | ||
224 | kernel_neon_end(); | ||
225 | |||
226 | return err; | ||
227 | } | ||
228 | |||
229 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
230 | struct scatterlist *src, unsigned int nbytes) | ||
231 | { | ||
232 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
233 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
234 | struct blkcipher_walk walk; | ||
235 | unsigned int blocks; | ||
236 | |||
237 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
238 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
239 | err = blkcipher_walk_virt(desc, &walk); | ||
240 | |||
241 | kernel_neon_begin(); | ||
242 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
243 | aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
244 | (u8 *)ctx->key1.key_enc, rounds, blocks, | ||
245 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
246 | err = blkcipher_walk_done(desc, &walk, 0); | ||
247 | } | ||
248 | kernel_neon_end(); | ||
249 | |||
250 | return err; | ||
251 | } | ||
252 | |||
253 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
254 | struct scatterlist *src, unsigned int nbytes) | ||
255 | { | ||
256 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
257 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
258 | struct blkcipher_walk walk; | ||
259 | unsigned int blocks; | ||
260 | |||
261 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
262 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
263 | err = blkcipher_walk_virt(desc, &walk); | ||
264 | |||
265 | kernel_neon_begin(); | ||
266 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
267 | aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
268 | (u8 *)ctx->key1.key_dec, rounds, blocks, | ||
269 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
270 | err = blkcipher_walk_done(desc, &walk, 0); | ||
271 | } | ||
272 | kernel_neon_end(); | ||
273 | |||
274 | return err; | ||
275 | } | ||
276 | |||
277 | static struct crypto_alg aes_algs[] = { { | ||
278 | .cra_name = "__ecb-aes-" MODE, | ||
279 | .cra_driver_name = "__driver-ecb-aes-" MODE, | ||
280 | .cra_priority = 0, | ||
281 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
282 | .cra_blocksize = AES_BLOCK_SIZE, | ||
283 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
284 | .cra_alignmask = 7, | ||
285 | .cra_type = &crypto_blkcipher_type, | ||
286 | .cra_module = THIS_MODULE, | ||
287 | .cra_blkcipher = { | ||
288 | .min_keysize = AES_MIN_KEY_SIZE, | ||
289 | .max_keysize = AES_MAX_KEY_SIZE, | ||
290 | .ivsize = AES_BLOCK_SIZE, | ||
291 | .setkey = crypto_aes_set_key, | ||
292 | .encrypt = ecb_encrypt, | ||
293 | .decrypt = ecb_decrypt, | ||
294 | }, | ||
295 | }, { | ||
296 | .cra_name = "__cbc-aes-" MODE, | ||
297 | .cra_driver_name = "__driver-cbc-aes-" MODE, | ||
298 | .cra_priority = 0, | ||
299 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
300 | .cra_blocksize = AES_BLOCK_SIZE, | ||
301 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
302 | .cra_alignmask = 7, | ||
303 | .cra_type = &crypto_blkcipher_type, | ||
304 | .cra_module = THIS_MODULE, | ||
305 | .cra_blkcipher = { | ||
306 | .min_keysize = AES_MIN_KEY_SIZE, | ||
307 | .max_keysize = AES_MAX_KEY_SIZE, | ||
308 | .ivsize = AES_BLOCK_SIZE, | ||
309 | .setkey = crypto_aes_set_key, | ||
310 | .encrypt = cbc_encrypt, | ||
311 | .decrypt = cbc_decrypt, | ||
312 | }, | ||
313 | }, { | ||
314 | .cra_name = "__ctr-aes-" MODE, | ||
315 | .cra_driver_name = "__driver-ctr-aes-" MODE, | ||
316 | .cra_priority = 0, | ||
317 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
318 | .cra_blocksize = 1, | ||
319 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
320 | .cra_alignmask = 7, | ||
321 | .cra_type = &crypto_blkcipher_type, | ||
322 | .cra_module = THIS_MODULE, | ||
323 | .cra_blkcipher = { | ||
324 | .min_keysize = AES_MIN_KEY_SIZE, | ||
325 | .max_keysize = AES_MAX_KEY_SIZE, | ||
326 | .ivsize = AES_BLOCK_SIZE, | ||
327 | .setkey = crypto_aes_set_key, | ||
328 | .encrypt = ctr_encrypt, | ||
329 | .decrypt = ctr_encrypt, | ||
330 | }, | ||
331 | }, { | ||
332 | .cra_name = "__xts-aes-" MODE, | ||
333 | .cra_driver_name = "__driver-xts-aes-" MODE, | ||
334 | .cra_priority = 0, | ||
335 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
336 | .cra_blocksize = AES_BLOCK_SIZE, | ||
337 | .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), | ||
338 | .cra_alignmask = 7, | ||
339 | .cra_type = &crypto_blkcipher_type, | ||
340 | .cra_module = THIS_MODULE, | ||
341 | .cra_blkcipher = { | ||
342 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
343 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
344 | .ivsize = AES_BLOCK_SIZE, | ||
345 | .setkey = xts_set_key, | ||
346 | .encrypt = xts_encrypt, | ||
347 | .decrypt = xts_decrypt, | ||
348 | }, | ||
349 | }, { | ||
350 | .cra_name = "ecb(aes)", | ||
351 | .cra_driver_name = "ecb-aes-" MODE, | ||
352 | .cra_priority = PRIO, | ||
353 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
354 | .cra_blocksize = AES_BLOCK_SIZE, | ||
355 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
356 | .cra_alignmask = 7, | ||
357 | .cra_type = &crypto_ablkcipher_type, | ||
358 | .cra_module = THIS_MODULE, | ||
359 | .cra_init = ablk_init, | ||
360 | .cra_exit = ablk_exit, | ||
361 | .cra_ablkcipher = { | ||
362 | .min_keysize = AES_MIN_KEY_SIZE, | ||
363 | .max_keysize = AES_MAX_KEY_SIZE, | ||
364 | .ivsize = AES_BLOCK_SIZE, | ||
365 | .setkey = ablk_set_key, | ||
366 | .encrypt = ablk_encrypt, | ||
367 | .decrypt = ablk_decrypt, | ||
368 | } | ||
369 | }, { | ||
370 | .cra_name = "cbc(aes)", | ||
371 | .cra_driver_name = "cbc-aes-" MODE, | ||
372 | .cra_priority = PRIO, | ||
373 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
374 | .cra_blocksize = AES_BLOCK_SIZE, | ||
375 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
376 | .cra_alignmask = 7, | ||
377 | .cra_type = &crypto_ablkcipher_type, | ||
378 | .cra_module = THIS_MODULE, | ||
379 | .cra_init = ablk_init, | ||
380 | .cra_exit = ablk_exit, | ||
381 | .cra_ablkcipher = { | ||
382 | .min_keysize = AES_MIN_KEY_SIZE, | ||
383 | .max_keysize = AES_MAX_KEY_SIZE, | ||
384 | .ivsize = AES_BLOCK_SIZE, | ||
385 | .setkey = ablk_set_key, | ||
386 | .encrypt = ablk_encrypt, | ||
387 | .decrypt = ablk_decrypt, | ||
388 | } | ||
389 | }, { | ||
390 | .cra_name = "ctr(aes)", | ||
391 | .cra_driver_name = "ctr-aes-" MODE, | ||
392 | .cra_priority = PRIO, | ||
393 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
394 | .cra_blocksize = 1, | ||
395 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
396 | .cra_alignmask = 7, | ||
397 | .cra_type = &crypto_ablkcipher_type, | ||
398 | .cra_module = THIS_MODULE, | ||
399 | .cra_init = ablk_init, | ||
400 | .cra_exit = ablk_exit, | ||
401 | .cra_ablkcipher = { | ||
402 | .min_keysize = AES_MIN_KEY_SIZE, | ||
403 | .max_keysize = AES_MAX_KEY_SIZE, | ||
404 | .ivsize = AES_BLOCK_SIZE, | ||
405 | .setkey = ablk_set_key, | ||
406 | .encrypt = ablk_encrypt, | ||
407 | .decrypt = ablk_decrypt, | ||
408 | } | ||
409 | }, { | ||
410 | .cra_name = "xts(aes)", | ||
411 | .cra_driver_name = "xts-aes-" MODE, | ||
412 | .cra_priority = PRIO, | ||
413 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
414 | .cra_blocksize = AES_BLOCK_SIZE, | ||
415 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
416 | .cra_alignmask = 7, | ||
417 | .cra_type = &crypto_ablkcipher_type, | ||
418 | .cra_module = THIS_MODULE, | ||
419 | .cra_init = ablk_init, | ||
420 | .cra_exit = ablk_exit, | ||
421 | .cra_ablkcipher = { | ||
422 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
423 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
424 | .ivsize = AES_BLOCK_SIZE, | ||
425 | .setkey = ablk_set_key, | ||
426 | .encrypt = ablk_encrypt, | ||
427 | .decrypt = ablk_decrypt, | ||
428 | } | ||
429 | } }; | ||
430 | |||
431 | static int __init aes_init(void) | ||
432 | { | ||
433 | return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
434 | } | ||
435 | |||
436 | static void __exit aes_exit(void) | ||
437 | { | ||
438 | crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
439 | } | ||
440 | |||
441 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
442 | module_cpu_feature_match(AES, aes_init); | ||
443 | #else | ||
444 | module_init(aes_init); | ||
445 | #endif | ||
446 | module_exit(aes_exit); | ||
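Not part of the patch: a minimal sketch of how kernel code of this generation could drive the "xts(aes)" instance registered above through the ablkcipher API. The function name, key size and buffer handling are illustrative only, and most error handling is omitted; a request may also complete asynchronously (-EINPROGRESS), which the sketch ignores.

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int xts_aes_example(u8 *buf, unsigned int len, const u8 *key)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	struct scatterlist sg;
	u8 iv[16] = { 0 };
	int err;

	tfm = crypto_alloc_ablkcipher("xts(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ablkcipher_setkey(tfm, key, 32);	/* 2 x AES-128 keys */
	if (err)
		goto out;

	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	sg_init_one(&sg, buf, len);
	ablkcipher_request_set_crypt(req, &sg, &sg, len, iv);

	err = crypto_ablkcipher_encrypt(req);	/* synchronous completion assumed */

	ablkcipher_request_free(req);
out:
	crypto_free_ablkcipher(tfm);
	return err;
}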
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 000000000000..f6e372c528eb --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S | |||
@@ -0,0 +1,532 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | /* included by aes-ce.S and aes-neon.S */ | ||
12 | |||
13 | .text | ||
14 | .align 4 | ||
15 | |||
16 | /* | ||
17 | * There are several ways to instantiate this code: | ||
18 | * - no interleave, all inline | ||
19 | * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) | ||
20 | * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) | ||
21 | * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) | ||
22 | * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) | ||
23 | * | ||
24 | * Macros imported by this code: | ||
25 | * - enc_prepare - setup NEON registers for encryption | ||
26 | * - dec_prepare - setup NEON registers for decryption | ||
27 | * - enc_switch_key - change to new key after having prepared for encryption | ||
28 | * - encrypt_block - encrypt a single block | ||
29 | * - decrypt_block - decrypt a single block | ||
30 | * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
31 | * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
32 | * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
33 | * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
34 | */ | ||
35 | |||
36 | #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) | ||
37 | #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp | ||
38 | #define FRAME_POP ldp x29, x30, [sp],#16 | ||
39 | |||
40 | #if INTERLEAVE == 2 | ||
41 | |||
42 | aes_encrypt_block2x: | ||
43 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
44 | ret | ||
45 | ENDPROC(aes_encrypt_block2x) | ||
46 | |||
47 | aes_decrypt_block2x: | ||
48 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
49 | ret | ||
50 | ENDPROC(aes_decrypt_block2x) | ||
51 | |||
52 | #elif INTERLEAVE == 4 | ||
53 | |||
54 | aes_encrypt_block4x: | ||
55 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
56 | ret | ||
57 | ENDPROC(aes_encrypt_block4x) | ||
58 | |||
59 | aes_decrypt_block4x: | ||
60 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
61 | ret | ||
62 | ENDPROC(aes_decrypt_block4x) | ||
63 | |||
64 | #else | ||
65 | #error INTERLEAVE should equal 2 or 4 | ||
66 | #endif | ||
67 | |||
68 | .macro do_encrypt_block2x | ||
69 | bl aes_encrypt_block2x | ||
70 | .endm | ||
71 | |||
72 | .macro do_decrypt_block2x | ||
73 | bl aes_decrypt_block2x | ||
74 | .endm | ||
75 | |||
76 | .macro do_encrypt_block4x | ||
77 | bl aes_encrypt_block4x | ||
78 | .endm | ||
79 | |||
80 | .macro do_decrypt_block4x | ||
81 | bl aes_decrypt_block4x | ||
82 | .endm | ||
83 | |||
84 | #else | ||
85 | #define FRAME_PUSH | ||
86 | #define FRAME_POP | ||
87 | |||
88 | .macro do_encrypt_block2x | ||
89 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
90 | .endm | ||
91 | |||
92 | .macro do_decrypt_block2x | ||
93 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
94 | .endm | ||
95 | |||
96 | .macro do_encrypt_block4x | ||
97 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
98 | .endm | ||
99 | |||
100 | .macro do_decrypt_block4x | ||
101 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
102 | .endm | ||
103 | |||
104 | #endif | ||
105 | |||
106 | /* | ||
107 | * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
108 | * int blocks, int first) | ||
109 | * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
110 | * int blocks, int first) | ||
111 | */ | ||
112 | |||
113 | AES_ENTRY(aes_ecb_encrypt) | ||
114 | FRAME_PUSH | ||
115 | cbz w5, .LecbencloopNx | ||
116 | |||
117 | enc_prepare w3, x2, x5 | ||
118 | |||
119 | .LecbencloopNx: | ||
120 | #if INTERLEAVE >= 2 | ||
121 | subs w4, w4, #INTERLEAVE | ||
122 | bmi .Lecbenc1x | ||
123 | #if INTERLEAVE == 2 | ||
124 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
125 | do_encrypt_block2x | ||
126 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
127 | #else | ||
128 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
129 | do_encrypt_block4x | ||
130 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
131 | #endif | ||
132 | b .LecbencloopNx | ||
133 | .Lecbenc1x: | ||
134 | adds w4, w4, #INTERLEAVE | ||
135 | beq .Lecbencout | ||
136 | #endif | ||
137 | .Lecbencloop: | ||
138 | ld1 {v0.16b}, [x1], #16 /* get next pt block */ | ||
139 | encrypt_block v0, w3, x2, x5, w6 | ||
140 | st1 {v0.16b}, [x0], #16 | ||
141 | subs w4, w4, #1 | ||
142 | bne .Lecbencloop | ||
143 | .Lecbencout: | ||
144 | FRAME_POP | ||
145 | ret | ||
146 | AES_ENDPROC(aes_ecb_encrypt) | ||
147 | |||
148 | |||
149 | AES_ENTRY(aes_ecb_decrypt) | ||
150 | FRAME_PUSH | ||
151 | cbz w5, .LecbdecloopNx | ||
152 | |||
153 | dec_prepare w3, x2, x5 | ||
154 | |||
155 | .LecbdecloopNx: | ||
156 | #if INTERLEAVE >= 2 | ||
157 | subs w4, w4, #INTERLEAVE | ||
158 | bmi .Lecbdec1x | ||
159 | #if INTERLEAVE == 2 | ||
160 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
161 | do_decrypt_block2x | ||
162 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
163 | #else | ||
164 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
165 | do_decrypt_block4x | ||
166 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
167 | #endif | ||
168 | b .LecbdecloopNx | ||
169 | .Lecbdec1x: | ||
170 | adds w4, w4, #INTERLEAVE | ||
171 | beq .Lecbdecout | ||
172 | #endif | ||
173 | .Lecbdecloop: | ||
174 | ld1 {v0.16b}, [x1], #16 /* get next ct block */ | ||
175 | decrypt_block v0, w3, x2, x5, w6 | ||
176 | st1 {v0.16b}, [x0], #16 | ||
177 | subs w4, w4, #1 | ||
178 | bne .Lecbdecloop | ||
179 | .Lecbdecout: | ||
180 | FRAME_POP | ||
181 | ret | ||
182 | AES_ENDPROC(aes_ecb_decrypt) | ||
183 | |||
184 | |||
185 | /* | ||
186 | * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
187 | * int blocks, u8 iv[], int first) | ||
188 | * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
189 | * int blocks, u8 iv[], int first) | ||
190 | */ | ||
191 | |||
192 | AES_ENTRY(aes_cbc_encrypt) | ||
193 | cbz w6, .Lcbcencloop | ||
194 | |||
195 | ld1 {v0.16b}, [x5] /* get iv */ | ||
196 | enc_prepare w3, x2, x5 | ||
197 | |||
198 | .Lcbcencloop: | ||
199 | ld1 {v1.16b}, [x1], #16 /* get next pt block */ | ||
200 | eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ | ||
201 | encrypt_block v0, w3, x2, x5, w6 | ||
202 | st1 {v0.16b}, [x0], #16 | ||
203 | subs w4, w4, #1 | ||
204 | bne .Lcbcencloop | ||
205 | ret | ||
206 | AES_ENDPROC(aes_cbc_encrypt) | ||
207 | |||
208 | |||
209 | AES_ENTRY(aes_cbc_decrypt) | ||
210 | FRAME_PUSH | ||
211 | cbz w6, .LcbcdecloopNx | ||
212 | |||
213 | ld1 {v7.16b}, [x5] /* get iv */ | ||
214 | dec_prepare w3, x2, x5 | ||
215 | |||
216 | .LcbcdecloopNx: | ||
217 | #if INTERLEAVE >= 2 | ||
218 | subs w4, w4, #INTERLEAVE | ||
219 | bmi .Lcbcdec1x | ||
220 | #if INTERLEAVE == 2 | ||
221 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
222 | mov v2.16b, v0.16b | ||
223 | mov v3.16b, v1.16b | ||
224 | do_decrypt_block2x | ||
225 | eor v0.16b, v0.16b, v7.16b | ||
226 | eor v1.16b, v1.16b, v2.16b | ||
227 | mov v7.16b, v3.16b | ||
228 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
229 | #else | ||
230 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
231 | mov v4.16b, v0.16b | ||
232 | mov v5.16b, v1.16b | ||
233 | mov v6.16b, v2.16b | ||
234 | do_decrypt_block4x | ||
235 | sub x1, x1, #16 | ||
236 | eor v0.16b, v0.16b, v7.16b | ||
237 | eor v1.16b, v1.16b, v4.16b | ||
238 | ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ | ||
239 | eor v2.16b, v2.16b, v5.16b | ||
240 | eor v3.16b, v3.16b, v6.16b | ||
241 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
242 | #endif | ||
243 | b .LcbcdecloopNx | ||
244 | .Lcbcdec1x: | ||
245 | adds w4, w4, #INTERLEAVE | ||
246 | beq .Lcbcdecout | ||
247 | #endif | ||
248 | .Lcbcdecloop: | ||
249 | ld1 {v1.16b}, [x1], #16 /* get next ct block */ | ||
250 | mov v0.16b, v1.16b /* ...and copy to v0 */ | ||
251 | decrypt_block v0, w3, x2, x5, w6 | ||
252 | eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ | ||
253 | mov v7.16b, v1.16b /* ct is next iv */ | ||
254 | st1 {v0.16b}, [x0], #16 | ||
255 | subs w4, w4, #1 | ||
256 | bne .Lcbcdecloop | ||
257 | .Lcbcdecout: | ||
258 | FRAME_POP | ||
259 | ret | ||
260 | AES_ENDPROC(aes_cbc_decrypt) | ||
261 | |||
262 | |||
263 | /* | ||
264 | * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
265 | * int blocks, u8 ctr[], int first) | ||
266 | */ | ||
267 | |||
268 | AES_ENTRY(aes_ctr_encrypt) | ||
269 | FRAME_PUSH | ||
270 | cbnz w6, .Lctrfirst /* 1st time around? */ | ||
271 | umov x5, v4.d[1] /* keep swabbed ctr in reg */ | ||
272 | rev x5, x5 | ||
273 | #if INTERLEAVE >= 2 | ||
274 | cmn w5, w4 /* 32 bit overflow? */ | ||
275 | bcs .Lctrinc | ||
276 | add x5, x5, #1 /* increment BE ctr */ | ||
277 | b .LctrincNx | ||
278 | #else | ||
279 | b .Lctrinc | ||
280 | #endif | ||
281 | .Lctrfirst: | ||
282 | enc_prepare w3, x2, x6 | ||
283 | ld1 {v4.16b}, [x5] | ||
284 | umov x5, v4.d[1] /* keep swabbed ctr in reg */ | ||
285 | rev x5, x5 | ||
286 | #if INTERLEAVE >= 2 | ||
287 | cmn w5, w4 /* 32 bit overflow? */ | ||
288 | bcs .Lctrloop | ||
289 | .LctrloopNx: | ||
290 | subs w4, w4, #INTERLEAVE | ||
291 | bmi .Lctr1x | ||
292 | #if INTERLEAVE == 2 | ||
293 | mov v0.8b, v4.8b | ||
294 | mov v1.8b, v4.8b | ||
295 | rev x7, x5 | ||
296 | add x5, x5, #1 | ||
297 | ins v0.d[1], x7 | ||
298 | rev x7, x5 | ||
299 | add x5, x5, #1 | ||
300 | ins v1.d[1], x7 | ||
301 | ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ | ||
302 | do_encrypt_block2x | ||
303 | eor v0.16b, v0.16b, v2.16b | ||
304 | eor v1.16b, v1.16b, v3.16b | ||
305 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
306 | #else | ||
307 | ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ | ||
308 | dup v7.4s, w5 | ||
309 | mov v0.16b, v4.16b | ||
310 | add v7.4s, v7.4s, v8.4s | ||
311 | mov v1.16b, v4.16b | ||
312 | rev32 v8.16b, v7.16b | ||
313 | mov v2.16b, v4.16b | ||
314 | mov v3.16b, v4.16b | ||
315 | mov v1.s[3], v8.s[0] | ||
316 | mov v2.s[3], v8.s[1] | ||
317 | mov v3.s[3], v8.s[2] | ||
318 | ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ | ||
319 | do_encrypt_block4x | ||
320 | eor v0.16b, v5.16b, v0.16b | ||
321 | ld1 {v5.16b}, [x1], #16 /* get 1 input block */ | ||
322 | eor v1.16b, v6.16b, v1.16b | ||
323 | eor v2.16b, v7.16b, v2.16b | ||
324 | eor v3.16b, v5.16b, v3.16b | ||
325 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
326 | add x5, x5, #INTERLEAVE | ||
327 | #endif | ||
328 | cbz w4, .LctroutNx | ||
329 | .LctrincNx: | ||
330 | rev x7, x5 | ||
331 | ins v4.d[1], x7 | ||
332 | b .LctrloopNx | ||
333 | .LctroutNx: | ||
334 | sub x5, x5, #1 | ||
335 | rev x7, x5 | ||
336 | ins v4.d[1], x7 | ||
337 | b .Lctrout | ||
338 | .Lctr1x: | ||
339 | adds w4, w4, #INTERLEAVE | ||
340 | beq .Lctrout | ||
341 | #endif | ||
342 | .Lctrloop: | ||
343 | mov v0.16b, v4.16b | ||
344 | encrypt_block v0, w3, x2, x6, w7 | ||
345 | subs w4, w4, #1 | ||
346 | bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ | ||
347 | ld1 {v3.16b}, [x1], #16 | ||
348 | eor v3.16b, v0.16b, v3.16b | ||
349 | st1 {v3.16b}, [x0], #16 | ||
350 | beq .Lctrout | ||
351 | .Lctrinc: | ||
352 | adds x5, x5, #1 /* increment BE ctr */ | ||
353 | rev x7, x5 | ||
354 | ins v4.d[1], x7 | ||
355 | bcc .Lctrloop /* no overflow? */ | ||
356 | umov x7, v4.d[0] /* load upper word of ctr */ | ||
357 | rev x7, x7 /* ... to handle the carry */ | ||
358 | add x7, x7, #1 | ||
359 | rev x7, x7 | ||
360 | ins v4.d[0], x7 | ||
361 | b .Lctrloop | ||
362 | .Lctrhalfblock: | ||
363 | ld1 {v3.8b}, [x1] | ||
364 | eor v3.8b, v0.8b, v3.8b | ||
365 | st1 {v3.8b}, [x0] | ||
366 | .Lctrout: | ||
367 | FRAME_POP | ||
368 | ret | ||
369 | AES_ENDPROC(aes_ctr_encrypt) | ||
370 | .ltorg | ||
371 | |||
372 | |||
373 | /* | ||
374 | * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
375 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
376 | * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
377 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
378 | */ | ||
379 | |||
380 | .macro next_tweak, out, in, const, tmp | ||
381 | sshr \tmp\().2d, \in\().2d, #63 | ||
382 | and \tmp\().16b, \tmp\().16b, \const\().16b | ||
383 | add \out\().2d, \in\().2d, \in\().2d | ||
384 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 | ||
385 | eor \out\().16b, \out\().16b, \tmp\().16b | ||
386 | .endm | ||
387 | |||
388 | .Lxts_mul_x: | ||
389 | .word 1, 0, 0x87, 0 | ||
390 | |||
391 | AES_ENTRY(aes_xts_encrypt) | ||
392 | FRAME_PUSH | ||
393 | cbz w7, .LxtsencloopNx | ||
394 | |||
395 | ld1 {v4.16b}, [x6] | ||
396 | enc_prepare w3, x5, x6 | ||
397 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
398 | enc_switch_key w3, x2, x6 | ||
399 | ldr q7, .Lxts_mul_x | ||
400 | b .LxtsencNx | ||
401 | |||
402 | .LxtsencloopNx: | ||
403 | ldr q7, .Lxts_mul_x | ||
404 | next_tweak v4, v4, v7, v8 | ||
405 | .LxtsencNx: | ||
406 | #if INTERLEAVE >= 2 | ||
407 | subs w4, w4, #INTERLEAVE | ||
408 | bmi .Lxtsenc1x | ||
409 | #if INTERLEAVE == 2 | ||
410 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
411 | next_tweak v5, v4, v7, v8 | ||
412 | eor v0.16b, v0.16b, v4.16b | ||
413 | eor v1.16b, v1.16b, v5.16b | ||
414 | do_encrypt_block2x | ||
415 | eor v0.16b, v0.16b, v4.16b | ||
416 | eor v1.16b, v1.16b, v5.16b | ||
417 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
418 | cbz w4, .LxtsencoutNx | ||
419 | next_tweak v4, v5, v7, v8 | ||
420 | b .LxtsencNx | ||
421 | .LxtsencoutNx: | ||
422 | mov v4.16b, v5.16b | ||
423 | b .Lxtsencout | ||
424 | #else | ||
425 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
426 | next_tweak v5, v4, v7, v8 | ||
427 | eor v0.16b, v0.16b, v4.16b | ||
428 | next_tweak v6, v5, v7, v8 | ||
429 | eor v1.16b, v1.16b, v5.16b | ||
430 | eor v2.16b, v2.16b, v6.16b | ||
431 | next_tweak v7, v6, v7, v8 | ||
432 | eor v3.16b, v3.16b, v7.16b | ||
433 | do_encrypt_block4x | ||
434 | eor v3.16b, v3.16b, v7.16b | ||
435 | eor v0.16b, v0.16b, v4.16b | ||
436 | eor v1.16b, v1.16b, v5.16b | ||
437 | eor v2.16b, v2.16b, v6.16b | ||
438 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
439 | mov v4.16b, v7.16b | ||
440 | cbz w4, .Lxtsencout | ||
441 | b .LxtsencloopNx | ||
442 | #endif | ||
443 | .Lxtsenc1x: | ||
444 | adds w4, w4, #INTERLEAVE | ||
445 | beq .Lxtsencout | ||
446 | #endif | ||
447 | .Lxtsencloop: | ||
448 | ld1 {v1.16b}, [x1], #16 | ||
449 | eor v0.16b, v1.16b, v4.16b | ||
450 | encrypt_block v0, w3, x2, x6, w7 | ||
451 | eor v0.16b, v0.16b, v4.16b | ||
452 | st1 {v0.16b}, [x0], #16 | ||
453 | subs w4, w4, #1 | ||
454 | beq .Lxtsencout | ||
455 | next_tweak v4, v4, v7, v8 | ||
456 | b .Lxtsencloop | ||
457 | .Lxtsencout: | ||
458 | FRAME_POP | ||
459 | ret | ||
460 | AES_ENDPROC(aes_xts_encrypt) | ||
461 | |||
462 | |||
463 | AES_ENTRY(aes_xts_decrypt) | ||
464 | FRAME_PUSH | ||
465 | cbz w7, .LxtsdecloopNx | ||
466 | |||
467 | ld1 {v4.16b}, [x6] | ||
468 | enc_prepare w3, x5, x6 | ||
469 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
470 | dec_prepare w3, x2, x6 | ||
471 | ldr q7, .Lxts_mul_x | ||
472 | b .LxtsdecNx | ||
473 | |||
474 | .LxtsdecloopNx: | ||
475 | ldr q7, .Lxts_mul_x | ||
476 | next_tweak v4, v4, v7, v8 | ||
477 | .LxtsdecNx: | ||
478 | #if INTERLEAVE >= 2 | ||
479 | subs w4, w4, #INTERLEAVE | ||
480 | bmi .Lxtsdec1x | ||
481 | #if INTERLEAVE == 2 | ||
482 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
483 | next_tweak v5, v4, v7, v8 | ||
484 | eor v0.16b, v0.16b, v4.16b | ||
485 | eor v1.16b, v1.16b, v5.16b | ||
486 | do_decrypt_block2x | ||
487 | eor v0.16b, v0.16b, v4.16b | ||
488 | eor v1.16b, v1.16b, v5.16b | ||
489 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
490 | cbz w4, .LxtsdecoutNx | ||
491 | next_tweak v4, v5, v7, v8 | ||
492 | b .LxtsdecNx | ||
493 | .LxtsdecoutNx: | ||
494 | mov v4.16b, v5.16b | ||
495 | b .Lxtsdecout | ||
496 | #else | ||
497 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
498 | next_tweak v5, v4, v7, v8 | ||
499 | eor v0.16b, v0.16b, v4.16b | ||
500 | next_tweak v6, v5, v7, v8 | ||
501 | eor v1.16b, v1.16b, v5.16b | ||
502 | eor v2.16b, v2.16b, v6.16b | ||
503 | next_tweak v7, v6, v7, v8 | ||
504 | eor v3.16b, v3.16b, v7.16b | ||
505 | do_decrypt_block4x | ||
506 | eor v3.16b, v3.16b, v7.16b | ||
507 | eor v0.16b, v0.16b, v4.16b | ||
508 | eor v1.16b, v1.16b, v5.16b | ||
509 | eor v2.16b, v2.16b, v6.16b | ||
510 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
511 | mov v4.16b, v7.16b | ||
512 | cbz w4, .Lxtsdecout | ||
513 | b .LxtsdecloopNx | ||
514 | #endif | ||
515 | .Lxtsdec1x: | ||
516 | adds w4, w4, #INTERLEAVE | ||
517 | beq .Lxtsdecout | ||
518 | #endif | ||
519 | .Lxtsdecloop: | ||
520 | ld1 {v1.16b}, [x1], #16 | ||
521 | eor v0.16b, v1.16b, v4.16b | ||
522 | decrypt_block v0, w3, x2, x6, w7 | ||
523 | eor v0.16b, v0.16b, v4.16b | ||
524 | st1 {v0.16b}, [x0], #16 | ||
525 | subs w4, w4, #1 | ||
526 | beq .Lxtsdecout | ||
527 | next_tweak v4, v4, v7, v8 | ||
528 | b .Lxtsdecloop | ||
529 | .Lxtsdecout: | ||
530 | FRAME_POP | ||
531 | ret | ||
532 | AES_ENDPROC(aes_xts_decrypt) | ||
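For reference (not part of the patch), a C sketch of what the next_tweak macro and the .Lxts_mul_x constant compute: the 128-bit XTS tweak is multiplied by 'x' in GF(2^128) and reduced by x^128 + x^7 + x^2 + x + 1, which is what the 0x87 constant encodes. Here t[0] holds the low 64 bits of the tweak and t[1] the high 64 bits.

#include <linux/types.h>

/* multiply the XTS tweak by 'x' in GF(2^128), as next_tweak does */
static void xts_next_tweak(u64 t[2])
{
	u64 carry = t[1] >> 63;			/* bit shifted out of the top  */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift, pulling in low MSB   */
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* shift low half and reduce   */
}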
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 000000000000..b93170e1cc93 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S | |||
@@ -0,0 +1,382 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | |||
13 | #define AES_ENTRY(func) ENTRY(neon_ ## func) | ||
14 | #define AES_ENDPROC(func) ENDPROC(neon_ ## func) | ||
15 | |||
16 | /* multiply by polynomial 'x' in GF(2^8) */ | ||
17 | .macro mul_by_x, out, in, temp, const | ||
18 | sshr \temp, \in, #7 | ||
19 | add \out, \in, \in | ||
20 | and \temp, \temp, \const | ||
21 | eor \out, \out, \temp | ||
22 | .endm | ||
23 | |||
24 | /* preload the entire Sbox */ | ||
25 | .macro prepare, sbox, shiftrows, temp | ||
26 | adr \temp, \sbox | ||
27 | movi v12.16b, #0x40 | ||
28 | ldr q13, \shiftrows | ||
29 | movi v14.16b, #0x1b | ||
30 | ld1 {v16.16b-v19.16b}, [\temp], #64 | ||
31 | ld1 {v20.16b-v23.16b}, [\temp], #64 | ||
32 | ld1 {v24.16b-v27.16b}, [\temp], #64 | ||
33 | ld1 {v28.16b-v31.16b}, [\temp] | ||
34 | .endm | ||
35 | |||
36 | /* do preload for encryption */ | ||
37 | .macro enc_prepare, ignore0, ignore1, temp | ||
38 | prepare .LForward_Sbox, .LForward_ShiftRows, \temp | ||
39 | .endm | ||
40 | |||
41 | .macro enc_switch_key, ignore0, ignore1, temp | ||
42 | /* do nothing */ | ||
43 | .endm | ||
44 | |||
45 | /* do preload for decryption */ | ||
46 | .macro dec_prepare, ignore0, ignore1, temp | ||
47 | prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp | ||
48 | .endm | ||
49 | |||
50 | /* apply SubBytes transformation using the preloaded Sbox */ | ||
51 | .macro sub_bytes, in | ||
52 | sub v9.16b, \in\().16b, v12.16b | ||
53 | tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b | ||
54 | sub v10.16b, v9.16b, v12.16b | ||
55 | tbx \in\().16b, {v20.16b-v23.16b}, v9.16b | ||
56 | sub v11.16b, v10.16b, v12.16b | ||
57 | tbx \in\().16b, {v24.16b-v27.16b}, v10.16b | ||
58 | tbx \in\().16b, {v28.16b-v31.16b}, v11.16b | ||
59 | .endm | ||
60 | |||
61 | /* apply MixColumns transformation */ | ||
62 | .macro mix_columns, in | ||
63 | mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b | ||
64 | rev32 v8.8h, \in\().8h | ||
65 | eor \in\().16b, v10.16b, \in\().16b | ||
66 | shl v9.4s, v8.4s, #24 | ||
67 | shl v11.4s, \in\().4s, #24 | ||
68 | sri v9.4s, v8.4s, #8 | ||
69 | sri v11.4s, \in\().4s, #8 | ||
70 | eor v9.16b, v9.16b, v8.16b | ||
71 | eor v10.16b, v10.16b, v9.16b | ||
72 | eor \in\().16b, v10.16b, v11.16b | ||
73 | .endm | ||
74 | |||
75 | /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ | ||
76 | .macro inv_mix_columns, in | ||
77 | mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b | ||
78 | mul_by_x v11.16b, v11.16b, v10.16b, v14.16b | ||
79 | eor \in\().16b, \in\().16b, v11.16b | ||
80 | rev32 v11.8h, v11.8h | ||
81 | eor \in\().16b, \in\().16b, v11.16b | ||
82 | mix_columns \in | ||
83 | .endm | ||
84 | |||
85 | .macro do_block, enc, in, rounds, rk, rkp, i | ||
86 | ld1 {v15.16b}, [\rk] | ||
87 | add \rkp, \rk, #16 | ||
88 | mov \i, \rounds | ||
89 | 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
90 | tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ | ||
91 | sub_bytes \in | ||
92 | ld1 {v15.16b}, [\rkp], #16 | ||
93 | subs \i, \i, #1 | ||
94 | beq 2222f | ||
95 | .if \enc == 1 | ||
96 | mix_columns \in | ||
97 | .else | ||
98 | inv_mix_columns \in | ||
99 | .endif | ||
100 | b 1111b | ||
101 | 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
102 | .endm | ||
103 | |||
104 | .macro encrypt_block, in, rounds, rk, rkp, i | ||
105 | do_block 1, \in, \rounds, \rk, \rkp, \i | ||
106 | .endm | ||
107 | |||
108 | .macro decrypt_block, in, rounds, rk, rkp, i | ||
109 | do_block 0, \in, \rounds, \rk, \rkp, \i | ||
110 | .endm | ||
111 | |||
112 | /* | ||
113 | * Interleaved versions: functionally equivalent to the | ||
114 | * ones above, but applied to 2 or 4 AES states in parallel. | ||
115 | */ | ||
116 | |||
117 | .macro sub_bytes_2x, in0, in1 | ||
118 | sub v8.16b, \in0\().16b, v12.16b | ||
119 | sub v9.16b, \in1\().16b, v12.16b | ||
120 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b | ||
121 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
122 | sub v10.16b, v8.16b, v12.16b | ||
123 | sub v11.16b, v9.16b, v12.16b | ||
124 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b | ||
125 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
126 | sub v8.16b, v10.16b, v12.16b | ||
127 | sub v9.16b, v11.16b, v12.16b | ||
128 | tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b | ||
129 | tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b | ||
130 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b | ||
131 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
132 | .endm | ||
133 | |||
134 | .macro sub_bytes_4x, in0, in1, in2, in3 | ||
135 | sub v8.16b, \in0\().16b, v12.16b | ||
136 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b | ||
137 | sub v9.16b, \in1\().16b, v12.16b | ||
138 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
139 | sub v10.16b, \in2\().16b, v12.16b | ||
140 | tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b | ||
141 | sub v11.16b, \in3\().16b, v12.16b | ||
142 | tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b | ||
143 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b | ||
144 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
145 | sub v8.16b, v8.16b, v12.16b | ||
146 | tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b | ||
147 | sub v9.16b, v9.16b, v12.16b | ||
148 | tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b | ||
149 | sub v10.16b, v10.16b, v12.16b | ||
150 | tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b | ||
151 | sub v11.16b, v11.16b, v12.16b | ||
152 | tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b | ||
153 | sub v8.16b, v8.16b, v12.16b | ||
154 | tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b | ||
155 | sub v9.16b, v9.16b, v12.16b | ||
156 | tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b | ||
157 | sub v10.16b, v10.16b, v12.16b | ||
158 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b | ||
159 | sub v11.16b, v11.16b, v12.16b | ||
160 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
161 | tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b | ||
162 | tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b | ||
163 | .endm | ||
164 | |||
165 | .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const | ||
166 | sshr \tmp0\().16b, \in0\().16b, #7 | ||
167 | add \out0\().16b, \in0\().16b, \in0\().16b | ||
168 | sshr \tmp1\().16b, \in1\().16b, #7 | ||
169 | and \tmp0\().16b, \tmp0\().16b, \const\().16b | ||
170 | add \out1\().16b, \in1\().16b, \in1\().16b | ||
171 | and \tmp1\().16b, \tmp1\().16b, \const\().16b | ||
172 | eor \out0\().16b, \out0\().16b, \tmp0\().16b | ||
173 | eor \out1\().16b, \out1\().16b, \tmp1\().16b | ||
174 | .endm | ||
175 | |||
176 | .macro mix_columns_2x, in0, in1 | ||
177 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
178 | rev32 v10.8h, \in0\().8h | ||
179 | rev32 v11.8h, \in1\().8h | ||
180 | eor \in0\().16b, v8.16b, \in0\().16b | ||
181 | eor \in1\().16b, v9.16b, \in1\().16b | ||
182 | shl v12.4s, v10.4s, #24 | ||
183 | shl v13.4s, v11.4s, #24 | ||
184 | eor v8.16b, v8.16b, v10.16b | ||
185 | sri v12.4s, v10.4s, #8 | ||
186 | shl v10.4s, \in0\().4s, #24 | ||
187 | eor v9.16b, v9.16b, v11.16b | ||
188 | sri v13.4s, v11.4s, #8 | ||
189 | shl v11.4s, \in1\().4s, #24 | ||
190 | sri v10.4s, \in0\().4s, #8 | ||
191 | eor \in0\().16b, v8.16b, v12.16b | ||
192 | sri v11.4s, \in1\().4s, #8 | ||
193 | eor \in1\().16b, v9.16b, v13.16b | ||
194 | eor \in0\().16b, v10.16b, \in0\().16b | ||
195 | eor \in1\().16b, v11.16b, \in1\().16b | ||
196 | .endm | ||
197 | |||
198 | .macro inv_mix_cols_2x, in0, in1 | ||
199 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
200 | mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 | ||
201 | eor \in0\().16b, \in0\().16b, v8.16b | ||
202 | eor \in1\().16b, \in1\().16b, v9.16b | ||
203 | rev32 v8.8h, v8.8h | ||
204 | rev32 v9.8h, v9.8h | ||
205 | eor \in0\().16b, \in0\().16b, v8.16b | ||
206 | eor \in1\().16b, \in1\().16b, v9.16b | ||
207 | mix_columns_2x \in0, \in1 | ||
208 | .endm | ||
209 | |||
210 | .macro inv_mix_cols_4x, in0, in1, in2, in3 | ||
211 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
212 | mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 | ||
213 | mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 | ||
214 | mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 | ||
215 | eor \in0\().16b, \in0\().16b, v8.16b | ||
216 | eor \in1\().16b, \in1\().16b, v9.16b | ||
217 | eor \in2\().16b, \in2\().16b, v10.16b | ||
218 | eor \in3\().16b, \in3\().16b, v11.16b | ||
219 | rev32 v8.8h, v8.8h | ||
220 | rev32 v9.8h, v9.8h | ||
221 | rev32 v10.8h, v10.8h | ||
222 | rev32 v11.8h, v11.8h | ||
223 | eor \in0\().16b, \in0\().16b, v8.16b | ||
224 | eor \in1\().16b, \in1\().16b, v9.16b | ||
225 | eor \in2\().16b, \in2\().16b, v10.16b | ||
226 | eor \in3\().16b, \in3\().16b, v11.16b | ||
227 | mix_columns_2x \in0, \in1 | ||
228 | mix_columns_2x \in2, \in3 | ||
229 | .endm | ||
230 | |||
231 | .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i | ||
232 | ld1 {v15.16b}, [\rk] | ||
233 | add \rkp, \rk, #16 | ||
234 | mov \i, \rounds | ||
235 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
236 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
237 | sub_bytes_2x \in0, \in1 | ||
238 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
239 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
240 | ld1 {v15.16b}, [\rkp], #16 | ||
241 | subs \i, \i, #1 | ||
242 | beq 2222f | ||
243 | .if \enc == 1 | ||
244 | mix_columns_2x \in0, \in1 | ||
245 | ldr q13, .LForward_ShiftRows | ||
246 | .else | ||
247 | inv_mix_cols_2x \in0, \in1 | ||
248 | ldr q13, .LReverse_ShiftRows | ||
249 | .endif | ||
250 | movi v12.16b, #0x40 | ||
251 | b 1111b | ||
252 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
253 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
254 | .endm | ||
255 | |||
256 | .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i | ||
257 | ld1 {v15.16b}, [\rk] | ||
258 | add \rkp, \rk, #16 | ||
259 | mov \i, \rounds | ||
260 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
261 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
262 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
263 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
264 | sub_bytes_4x \in0, \in1, \in2, \in3 | ||
265 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
266 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
267 | tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ | ||
268 | tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ | ||
269 | ld1 {v15.16b}, [\rkp], #16 | ||
270 | subs \i, \i, #1 | ||
271 | beq 2222f | ||
272 | .if \enc == 1 | ||
273 | mix_columns_2x \in0, \in1 | ||
274 | mix_columns_2x \in2, \in3 | ||
275 | ldr q13, .LForward_ShiftRows | ||
276 | .else | ||
277 | inv_mix_cols_4x \in0, \in1, \in2, \in3 | ||
278 | ldr q13, .LReverse_ShiftRows | ||
279 | .endif | ||
280 | movi v12.16b, #0x40 | ||
281 | b 1111b | ||
282 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
283 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
284 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
285 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
286 | .endm | ||
287 | |||
288 | .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i | ||
289 | do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i | ||
290 | .endm | ||
291 | |||
292 | .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i | ||
293 | do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i | ||
294 | .endm | ||
295 | |||
296 | .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i | ||
297 | do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
298 | .endm | ||
299 | |||
300 | .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i | ||
301 | do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
302 | .endm | ||
303 | |||
304 | #include "aes-modes.S" | ||
305 | |||
306 | .text | ||
307 | .align 4 | ||
308 | .LForward_ShiftRows: | ||
309 | .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 | ||
310 | .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb | ||
311 | |||
312 | .LReverse_ShiftRows: | ||
313 | .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb | ||
314 | .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 | ||
315 | |||
316 | .LForward_Sbox: | ||
317 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
318 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
319 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
320 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
321 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
322 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
323 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
324 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
325 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
326 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
327 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
328 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
329 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
330 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
331 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
332 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
333 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
334 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
335 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
336 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
337 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
338 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
339 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
340 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
341 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
342 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
343 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
344 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
345 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
346 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
347 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
348 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
349 | |||
350 | .LReverse_Sbox: | ||
351 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
352 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
353 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
354 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
355 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
356 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
357 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
358 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
359 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
360 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
361 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
362 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
363 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
364 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
365 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
366 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
367 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
368 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
369 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
370 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
371 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
372 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
373 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
374 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
375 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
376 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
377 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
378 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
379 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
380 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
381 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
382 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
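For reference (not part of the patch), a byte-level C sketch of the mul_by_x macro above: each byte is multiplied by the polynomial 'x' in GF(2^8) and reduced modulo x^8 + x^4 + x^3 + x + 1, which is what the 0x1b constant kept in v14 encodes.

#include <linux/types.h>

/* multiply one byte by 'x' in GF(2^8) (the AES "xtime" operation) */
static u8 aes_mul_by_x(u8 b)
{
	u8 mask = (b & 0x80) ? 0x1b : 0;	/* sshr #7, then and with 0x1b */

	return (u8)((b << 1) ^ mask);		/* add (b + b), then eor       */
}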
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 000000000000..b9e6eaf41c9b --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S | ||
7 | * | ||
8 | * Copyright (c) 2009 Intel Corp. | ||
9 | * Author: Huang Ying <ying.huang@intel.com> | ||
10 | * Vinodh Gopal | ||
11 | * Erdinc Ozturk | ||
12 | * Deniz Karakoyunlu | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License version 2 as published | ||
16 | * by the Free Software Foundation. | ||
17 | */ | ||
18 | |||
19 | #include <linux/linkage.h> | ||
20 | #include <asm/assembler.h> | ||
21 | |||
22 | DATA .req v0 | ||
23 | SHASH .req v1 | ||
24 | IN1 .req v2 | ||
25 | T1 .req v2 | ||
26 | T2 .req v3 | ||
27 | T3 .req v4 | ||
28 | VZR .req v5 | ||
29 | |||
30 | .text | ||
31 | .arch armv8-a+crypto | ||
32 | |||
33 | /* | ||
34 | * void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
35 | * struct ghash_key const *k, const char *head) | ||
36 | */ | ||
37 | ENTRY(pmull_ghash_update) | ||
38 | ld1 {DATA.16b}, [x1] | ||
39 | ld1 {SHASH.16b}, [x3] | ||
40 | eor VZR.16b, VZR.16b, VZR.16b | ||
41 | |||
42 | /* do the head block first, if supplied */ | ||
43 | cbz x4, 0f | ||
44 | ld1 {IN1.2d}, [x4] | ||
45 | b 1f | ||
46 | |||
47 | 0: ld1 {IN1.2d}, [x2], #16 | ||
48 | sub w0, w0, #1 | ||
49 | 1: ext IN1.16b, IN1.16b, IN1.16b, #8 | ||
50 | CPU_LE( rev64 IN1.16b, IN1.16b ) | ||
51 | eor DATA.16b, DATA.16b, IN1.16b | ||
52 | |||
53 | /* multiply DATA by SHASH in GF(2^128) */ | ||
54 | ext T2.16b, DATA.16b, DATA.16b, #8 | ||
55 | ext T3.16b, SHASH.16b, SHASH.16b, #8 | ||
56 | eor T2.16b, T2.16b, DATA.16b | ||
57 | eor T3.16b, T3.16b, SHASH.16b | ||
58 | |||
59 | pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 | ||
60 | pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 | ||
61 | pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) | ||
62 | eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) | ||
63 | eor T2.16b, T2.16b, DATA.16b | ||
64 | |||
65 | ext T3.16b, VZR.16b, T2.16b, #8 | ||
66 | ext T2.16b, T2.16b, VZR.16b, #8 | ||
67 | eor DATA.16b, DATA.16b, T3.16b | ||
68 | eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of | ||
69 | // carry-less multiplication | ||
70 | |||
71 | /* first phase of the reduction */ | ||
72 | shl T3.2d, DATA.2d, #1 | ||
73 | eor T3.16b, T3.16b, DATA.16b | ||
74 | shl T3.2d, T3.2d, #5 | ||
75 | eor T3.16b, T3.16b, DATA.16b | ||
76 | shl T3.2d, T3.2d, #57 | ||
77 | ext T2.16b, VZR.16b, T3.16b, #8 | ||
78 | ext T3.16b, T3.16b, VZR.16b, #8 | ||
79 | eor DATA.16b, DATA.16b, T2.16b | ||
80 | eor T1.16b, T1.16b, T3.16b | ||
81 | |||
82 | /* second phase of the reduction */ | ||
83 | ushr T2.2d, DATA.2d, #5 | ||
84 | eor T2.16b, T2.16b, DATA.16b | ||
85 | ushr T2.2d, T2.2d, #1 | ||
86 | eor T2.16b, T2.16b, DATA.16b | ||
87 | ushr T2.2d, T2.2d, #1 | ||
88 | eor T1.16b, T1.16b, T2.16b | ||
89 | eor DATA.16b, DATA.16b, T1.16b | ||
90 | |||
91 | cbnz w0, 0b | ||
92 | |||
93 | st1 {DATA.16b}, [x1] | ||
94 | ret | ||
95 | ENDPROC(pmull_ghash_update) | ||
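Not from the patch, but useful context for the pmull/pmull2 sequence above: the three carry-less multiplications implement a Karatsuba-style split of the 128-bit product. With a = a_1 x^{64} + a_0, b = b_1 x^{64} + b_0 and '+' meaning XOR (GF(2) arithmetic), the identity relied on is

  (a_1 x^{64} + a_0)(b_1 x^{64} + b_0) = a_1 b_1 x^{128} + [(a_1 + a_0)(b_1 + b_0) + a_1 b_1 + a_0 b_0] x^{64} + a_0 b_0

pmull2 produces a_1 b_1, pmull produces a_0 b_0, and the third pmull produces (a_1 + a_0)(b_1 + b_0); the two eor instructions that follow recover the middle term before the result is folded into <T1:DATA> and reduced.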
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 000000000000..b92baf3f68c7 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of the GNU General Public License version 2 as published | ||
8 | * by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <linux/cpufeature.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/module.h> | ||
17 | |||
18 | MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); | ||
19 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
20 | MODULE_LICENSE("GPL v2"); | ||
21 | |||
22 | #define GHASH_BLOCK_SIZE 16 | ||
23 | #define GHASH_DIGEST_SIZE 16 | ||
24 | |||
25 | struct ghash_key { | ||
26 | u64 a; | ||
27 | u64 b; | ||
28 | }; | ||
29 | |||
30 | struct ghash_desc_ctx { | ||
31 | u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; | ||
32 | u8 buf[GHASH_BLOCK_SIZE]; | ||
33 | u32 count; | ||
34 | }; | ||
35 | |||
36 | asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
37 | struct ghash_key const *k, const char *head); | ||
38 | |||
39 | static int ghash_init(struct shash_desc *desc) | ||
40 | { | ||
41 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
42 | |||
43 | *ctx = (struct ghash_desc_ctx){}; | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static int ghash_update(struct shash_desc *desc, const u8 *src, | ||
48 | unsigned int len) | ||
49 | { | ||
50 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
51 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; | ||
52 | |||
53 | ctx->count += len; | ||
54 | |||
55 | if ((partial + len) >= GHASH_BLOCK_SIZE) { | ||
56 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
57 | int blocks; | ||
58 | |||
59 | if (partial) { | ||
60 | int p = GHASH_BLOCK_SIZE - partial; | ||
61 | |||
62 | memcpy(ctx->buf + partial, src, p); | ||
63 | src += p; | ||
64 | len -= p; | ||
65 | } | ||
66 | |||
67 | blocks = len / GHASH_BLOCK_SIZE; | ||
68 | len %= GHASH_BLOCK_SIZE; | ||
69 | |||
70 | kernel_neon_begin_partial(6); | ||
71 | pmull_ghash_update(blocks, ctx->digest, src, key, | ||
72 | partial ? ctx->buf : NULL); | ||
73 | kernel_neon_end(); | ||
74 | src += blocks * GHASH_BLOCK_SIZE; | ||
75 | } | ||
76 | if (len) | ||
77 | memcpy(ctx->buf + partial, src, len); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static int ghash_final(struct shash_desc *desc, u8 *dst) | ||
82 | { | ||
83 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
84 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; | ||
85 | |||
86 | if (partial) { | ||
87 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
88 | |||
89 | memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); | ||
90 | |||
91 | kernel_neon_begin_partial(6); | ||
92 | pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); | ||
93 | kernel_neon_end(); | ||
94 | } | ||
95 | put_unaligned_be64(ctx->digest[1], dst); | ||
96 | put_unaligned_be64(ctx->digest[0], dst + 8); | ||
97 | |||
98 | *ctx = (struct ghash_desc_ctx){}; | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static int ghash_setkey(struct crypto_shash *tfm, | ||
103 | const u8 *inkey, unsigned int keylen) | ||
104 | { | ||
105 | struct ghash_key *key = crypto_shash_ctx(tfm); | ||
106 | u64 a, b; | ||
107 | |||
108 | if (keylen != GHASH_BLOCK_SIZE) { | ||
109 | crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
110 | return -EINVAL; | ||
111 | } | ||
112 | |||
113 | /* perform multiplication by 'x' in GF(2^128) */ | ||
114 | b = get_unaligned_be64(inkey); | ||
115 | a = get_unaligned_be64(inkey + 8); | ||
116 | |||
117 | key->a = (a << 1) | (b >> 63); | ||
118 | key->b = (b << 1) | (a >> 63); | ||
119 | |||
120 | if (b >> 63) | ||
121 | key->b ^= 0xc200000000000000UL; | ||
122 | |||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | static struct shash_alg ghash_alg = { | ||
127 | .digestsize = GHASH_DIGEST_SIZE, | ||
128 | .init = ghash_init, | ||
129 | .update = ghash_update, | ||
130 | .final = ghash_final, | ||
131 | .setkey = ghash_setkey, | ||
132 | .descsize = sizeof(struct ghash_desc_ctx), | ||
133 | .base = { | ||
134 | .cra_name = "ghash", | ||
135 | .cra_driver_name = "ghash-ce", | ||
136 | .cra_priority = 200, | ||
137 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
138 | .cra_blocksize = GHASH_BLOCK_SIZE, | ||
139 | .cra_ctxsize = sizeof(struct ghash_key), | ||
140 | .cra_module = THIS_MODULE, | ||
141 | }, | ||
142 | }; | ||
143 | |||
144 | static int __init ghash_ce_mod_init(void) | ||
145 | { | ||
146 | return crypto_register_shash(&ghash_alg); | ||
147 | } | ||
148 | |||
149 | static void __exit ghash_ce_mod_exit(void) | ||
150 | { | ||
151 | crypto_unregister_shash(&ghash_alg); | ||
152 | } | ||
153 | |||
154 | module_cpu_feature_match(PMULL, ghash_ce_mod_init); | ||
155 | module_exit(ghash_ce_mod_exit); | ||
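Not part of the patch: a minimal sketch of driving a "ghash" shash from kernel code. With this module loaded on a CPU that advertises PMULL, the higher-priority "ghash-ce" implementation above is typically the one selected over the generic C version. Error handling is mostly omitted.

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int ghash_example(const u8 key[16], const u8 *data,
			 unsigned int len, u8 digest[16])
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("ghash", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, 16);
	if (err)
		goto out;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	desc->tfm = tfm;

	err = crypto_shash_init(desc);
	if (!err)
		err = crypto_shash_update(desc, data, len);
	if (!err)
		err = crypto_shash_final(desc, digest);

	kfree(desc);
out:
	crypto_free_shash(tfm);
	return err;
}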
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 000000000000..09d57d98609c --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | #include <asm/assembler.h> | ||
13 | |||
14 | .text | ||
15 | .arch armv8-a+crypto | ||
16 | |||
17 | k0 .req v0 | ||
18 | k1 .req v1 | ||
19 | k2 .req v2 | ||
20 | k3 .req v3 | ||
21 | |||
22 | t0 .req v4 | ||
23 | t1 .req v5 | ||
24 | |||
25 | dga .req q6 | ||
26 | dgav .req v6 | ||
27 | dgb .req s7 | ||
28 | dgbv .req v7 | ||
29 | |||
30 | dg0q .req q12 | ||
31 | dg0s .req s12 | ||
32 | dg0v .req v12 | ||
33 | dg1s .req s13 | ||
34 | dg1v .req v13 | ||
35 | dg2s .req s14 | ||
36 | |||
37 | .macro add_only, op, ev, rc, s0, dg1 | ||
38 | .ifc \ev, ev | ||
39 | add t1.4s, v\s0\().4s, \rc\().4s | ||
40 | sha1h dg2s, dg0s | ||
41 | .ifnb \dg1 | ||
42 | sha1\op dg0q, \dg1, t0.4s | ||
43 | .else | ||
44 | sha1\op dg0q, dg1s, t0.4s | ||
45 | .endif | ||
46 | .else | ||
47 | .ifnb \s0 | ||
48 | add t0.4s, v\s0\().4s, \rc\().4s | ||
49 | .endif | ||
50 | sha1h dg1s, dg0s | ||
51 | sha1\op dg0q, dg2s, t1.4s | ||
52 | .endif | ||
53 | .endm | ||
54 | |||
55 | .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 | ||
56 | sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s | ||
57 | add_only \op, \ev, \rc, \s1, \dg1 | ||
58 | sha1su1 v\s0\().4s, v\s3\().4s | ||
59 | .endm | ||
60 | |||
61 | /* | ||
62 | * The SHA1 round constants | ||
63 | */ | ||
64 | .align 4 | ||
65 | .Lsha1_rcon: | ||
66 | .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 | ||
67 | |||
68 | /* | ||
69 | * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
70 | * u8 *head, long bytes) | ||
71 | */ | ||
72 | ENTRY(sha1_ce_transform) | ||
73 | /* load round constants */ | ||
74 | adr x6, .Lsha1_rcon | ||
75 | ld1r {k0.4s}, [x6], #4 | ||
76 | ld1r {k1.4s}, [x6], #4 | ||
77 | ld1r {k2.4s}, [x6], #4 | ||
78 | ld1r {k3.4s}, [x6] | ||
79 | |||
80 | /* load state */ | ||
81 | ldr dga, [x2] | ||
82 | ldr dgb, [x2, #16] | ||
83 | |||
84 | /* load partial state (if supplied) */ | ||
85 | cbz x3, 0f | ||
86 | ld1 {v8.4s-v11.4s}, [x3] | ||
87 | b 1f | ||
88 | |||
89 | /* load input */ | ||
90 | 0: ld1 {v8.4s-v11.4s}, [x1], #64 | ||
91 | sub w0, w0, #1 | ||
92 | |||
93 | 1: | ||
94 | CPU_LE( rev32 v8.16b, v8.16b ) | ||
95 | CPU_LE( rev32 v9.16b, v9.16b ) | ||
96 | CPU_LE( rev32 v10.16b, v10.16b ) | ||
97 | CPU_LE( rev32 v11.16b, v11.16b ) | ||
98 | |||
99 | 2: add t0.4s, v8.4s, k0.4s | ||
100 | mov dg0v.16b, dgav.16b | ||
101 | |||
102 | add_update c, ev, k0, 8, 9, 10, 11, dgb | ||
103 | add_update c, od, k0, 9, 10, 11, 8 | ||
104 | add_update c, ev, k0, 10, 11, 8, 9 | ||
105 | add_update c, od, k0, 11, 8, 9, 10 | ||
106 | add_update c, ev, k1, 8, 9, 10, 11 | ||
107 | |||
108 | add_update p, od, k1, 9, 10, 11, 8 | ||
109 | add_update p, ev, k1, 10, 11, 8, 9 | ||
110 | add_update p, od, k1, 11, 8, 9, 10 | ||
111 | add_update p, ev, k1, 8, 9, 10, 11 | ||
112 | add_update p, od, k2, 9, 10, 11, 8 | ||
113 | |||
114 | add_update m, ev, k2, 10, 11, 8, 9 | ||
115 | add_update m, od, k2, 11, 8, 9, 10 | ||
116 | add_update m, ev, k2, 8, 9, 10, 11 | ||
117 | add_update m, od, k2, 9, 10, 11, 8 | ||
118 | add_update m, ev, k3, 10, 11, 8, 9 | ||
119 | |||
120 | add_update p, od, k3, 11, 8, 9, 10 | ||
121 | add_only p, ev, k3, 9 | ||
122 | add_only p, od, k3, 10 | ||
123 | add_only p, ev, k3, 11 | ||
124 | add_only p, od | ||
125 | |||
126 | /* update state */ | ||
127 | add dgbv.2s, dgbv.2s, dg1v.2s | ||
128 | add dgav.4s, dgav.4s, dg0v.4s | ||
129 | |||
130 | cbnz w0, 0b | ||
131 | |||
132 | /* | ||
133 | * Final block: add padding and total bit count. | ||
134 | * Skip if we have no total byte count in x4. In that case, the input | ||
135 | * size was not a round multiple of the block size, and the padding is | ||
136 | * handled by the C code. | ||
137 | */ | ||
138 | cbz x4, 3f | ||
139 | movi v9.2d, #0 | ||
140 | mov x8, #0x80000000 | ||
141 | movi v10.2d, #0 | ||
142 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
143 | fmov d8, x8 | ||
144 | mov x4, #0 | ||
145 | mov v11.d[0], xzr | ||
146 | mov v11.d[1], x7 | ||
147 | b 2b | ||
148 | |||
149 | /* store new state */ | ||
150 | 3: str dga, [x2] | ||
151 | str dgb, [x2, #16] | ||
152 | ret | ||
153 | ENDPROC(sha1_ce_transform) | ||
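For reference (not from the patch), the final-block path above is equivalent to feeding one extra block of standard SHA-1 padding through the transform: a 0x80 marker, zeroes, and the message length in bits stored big-endian in the last 8 bytes. A C sketch of that block, where byte_count stands for the value passed in x4:

#include <linux/string.h>
#include <linux/types.h>
#include <asm/byteorder.h>

static void sha1_build_final_block(u8 final_block[64], u64 byte_count)
{
	__be64 bit_count = cpu_to_be64(byte_count << 3);

	memset(final_block, 0, 64);
	final_block[0] = 0x80;				/* padding marker       */
	memcpy(final_block + 56, &bit_count, sizeof(bit_count));
}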
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 000000000000..6fe83f37a750 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c | |||
@@ -0,0 +1,174 @@ | |||
1 | /* | ||
2 | * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <crypto/sha.h> | ||
15 | #include <linux/cpufeature.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); | ||
20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
21 | MODULE_LICENSE("GPL v2"); | ||
22 | |||
23 | asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
24 | u8 *head, long bytes); | ||
25 | |||
26 | static int sha1_init(struct shash_desc *desc) | ||
27 | { | ||
28 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
29 | |||
30 | *sctx = (struct sha1_state){ | ||
31 | .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, | ||
32 | }; | ||
33 | return 0; | ||
34 | } | ||
35 | |||
36 | static int sha1_update(struct shash_desc *desc, const u8 *data, | ||
37 | unsigned int len) | ||
38 | { | ||
39 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
40 | unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; | ||
41 | |||
42 | sctx->count += len; | ||
43 | |||
44 | if ((partial + len) >= SHA1_BLOCK_SIZE) { | ||
45 | int blocks; | ||
46 | |||
47 | if (partial) { | ||
48 | int p = SHA1_BLOCK_SIZE - partial; | ||
49 | |||
50 | memcpy(sctx->buffer + partial, data, p); | ||
51 | data += p; | ||
52 | len -= p; | ||
53 | } | ||
54 | |||
55 | blocks = len / SHA1_BLOCK_SIZE; | ||
56 | len %= SHA1_BLOCK_SIZE; | ||
57 | |||
58 | kernel_neon_begin_partial(16); | ||
59 | sha1_ce_transform(blocks, data, sctx->state, | ||
60 | partial ? sctx->buffer : NULL, 0); | ||
61 | kernel_neon_end(); | ||
62 | |||
63 | data += blocks * SHA1_BLOCK_SIZE; | ||
64 | partial = 0; | ||
65 | } | ||
66 | if (len) | ||
67 | memcpy(sctx->buffer + partial, data, len); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int sha1_final(struct shash_desc *desc, u8 *out) | ||
72 | { | ||
73 | static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; | ||
74 | |||
75 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
76 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
77 | __be32 *dst = (__be32 *)out; | ||
78 | int i; | ||
79 | |||
80 | u32 padlen = SHA1_BLOCK_SIZE | ||
81 | - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); | ||
82 | |||
83 | sha1_update(desc, padding, padlen); | ||
84 | sha1_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
85 | |||
86 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
87 | put_unaligned_be32(sctx->state[i], dst++); | ||
88 | |||
89 | *sctx = (struct sha1_state){}; | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | static int sha1_finup(struct shash_desc *desc, const u8 *data, | ||
94 | unsigned int len, u8 *out) | ||
95 | { | ||
96 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
97 | __be32 *dst = (__be32 *)out; | ||
98 | int blocks; | ||
99 | int i; | ||
100 | |||
101 | if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { | ||
102 | sha1_update(desc, data, len); | ||
103 | return sha1_final(desc, out); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
108 | * this case, there is no need to copy data around, and we can | ||
109 | * perform the entire digest calculation in a single invocation | ||
110 | * of sha1_ce_transform() | ||
111 | */ | ||
112 | blocks = len / SHA1_BLOCK_SIZE; | ||
113 | |||
114 | kernel_neon_begin_partial(16); | ||
115 | sha1_ce_transform(blocks, data, sctx->state, NULL, len); | ||
116 | kernel_neon_end(); | ||
117 | |||
118 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
119 | put_unaligned_be32(sctx->state[i], dst++); | ||
120 | |||
121 | *sctx = (struct sha1_state){}; | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static int sha1_export(struct shash_desc *desc, void *out) | ||
126 | { | ||
127 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
128 | struct sha1_state *dst = out; | ||
129 | |||
130 | *dst = *sctx; | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static int sha1_import(struct shash_desc *desc, const void *in) | ||
135 | { | ||
136 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
137 | struct sha1_state const *src = in; | ||
138 | |||
139 | *sctx = *src; | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static struct shash_alg alg = { | ||
144 | .init = sha1_init, | ||
145 | .update = sha1_update, | ||
146 | .final = sha1_final, | ||
147 | .finup = sha1_finup, | ||
148 | .export = sha1_export, | ||
149 | .import = sha1_import, | ||
150 | .descsize = sizeof(struct sha1_state), | ||
151 | .digestsize = SHA1_DIGEST_SIZE, | ||
152 | .statesize = sizeof(struct sha1_state), | ||
153 | .base = { | ||
154 | .cra_name = "sha1", | ||
155 | .cra_driver_name = "sha1-ce", | ||
156 | .cra_priority = 200, | ||
157 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
158 | .cra_blocksize = SHA1_BLOCK_SIZE, | ||
159 | .cra_module = THIS_MODULE, | ||
160 | } | ||
161 | }; | ||
162 | |||
163 | static int __init sha1_ce_mod_init(void) | ||
164 | { | ||
165 | return crypto_register_shash(&alg); | ||
166 | } | ||
167 | |||
168 | static void __exit sha1_ce_mod_fini(void) | ||
169 | { | ||
170 | crypto_unregister_shash(&alg); | ||
171 | } | ||
172 | |||
173 | module_cpu_feature_match(SHA1, sha1_ce_mod_init); | ||
174 | module_exit(sha1_ce_mod_fini); | ||
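For illustration only (not part of the patch): a minimal sketch of how the shash callbacks registered above get exercised from other kernel code. The helper name sha1_digest_example is made up; the crypto API calls are the standard shash ones. Because "sha1-ce" registers with cra_priority 200, a request for "sha1" resolves to this driver whenever the CPU implements the SHA1 extension.

	#include <crypto/hash.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	/* Hypothetical caller, for illustration only; out must hold 20 bytes. */
	static int sha1_digest_example(const u8 *data, unsigned int len, u8 *out)
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int ret;

		/* "sha1" picks the highest-priority provider, i.e. "sha1-ce" here. */
		tfm = crypto_alloc_shash("sha1", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;

		/* Ends up in the sha1_init()/sha1_update()/sha1_finup() callbacks above. */
		ret = crypto_shash_digest(desc, data, len, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return ret;
	}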
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 000000000000..7f29fc031ea8 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | #include <asm/assembler.h> | ||
13 | |||
14 | .text | ||
15 | .arch armv8-a+crypto | ||
16 | |||
17 | dga .req q20 | ||
18 | dgav .req v20 | ||
19 | dgb .req q21 | ||
20 | dgbv .req v21 | ||
21 | |||
22 | t0 .req v22 | ||
23 | t1 .req v23 | ||
24 | |||
25 | dg0q .req q24 | ||
26 | dg0v .req v24 | ||
27 | dg1q .req q25 | ||
28 | dg1v .req v25 | ||
29 | dg2q .req q26 | ||
30 | dg2v .req v26 | ||
31 | |||
32 | .macro add_only, ev, rc, s0 | ||
33 | mov dg2v.16b, dg0v.16b | ||
34 | .ifeq \ev | ||
35 | add t1.4s, v\s0\().4s, \rc\().4s | ||
36 | sha256h dg0q, dg1q, t0.4s | ||
37 | sha256h2 dg1q, dg2q, t0.4s | ||
38 | .else | ||
39 | .ifnb \s0 | ||
40 | add t0.4s, v\s0\().4s, \rc\().4s | ||
41 | .endif | ||
42 | sha256h dg0q, dg1q, t1.4s | ||
43 | sha256h2 dg1q, dg2q, t1.4s | ||
44 | .endif | ||
45 | .endm | ||
46 | |||
47 | .macro add_update, ev, rc, s0, s1, s2, s3 | ||
48 | sha256su0 v\s0\().4s, v\s1\().4s | ||
49 | add_only \ev, \rc, \s1 | ||
50 | sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s | ||
51 | .endm | ||
52 | |||
53 | /* | ||
54 | * The SHA-256 round constants | ||
55 | */ | ||
56 | .align 4 | ||
57 | .Lsha2_rcon: | ||
58 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
59 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
60 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
61 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
62 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
63 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
64 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
65 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
66 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
67 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
68 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
69 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
70 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
71 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
72 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
73 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
74 | |||
75 | /* | ||
76 | * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
77 | * u8 *head, long bytes) | ||
78 | */ | ||
79 | ENTRY(sha2_ce_transform) | ||
80 | /* load round constants */ | ||
81 | adr x8, .Lsha2_rcon | ||
82 | ld1 { v0.4s- v3.4s}, [x8], #64 | ||
83 | ld1 { v4.4s- v7.4s}, [x8], #64 | ||
84 | ld1 { v8.4s-v11.4s}, [x8], #64 | ||
85 | ld1 {v12.4s-v15.4s}, [x8] | ||
86 | |||
87 | /* load state */ | ||
88 | ldp dga, dgb, [x2] | ||
89 | |||
90 | /* load partial input (if supplied) */ | ||
91 | cbz x3, 0f | ||
92 | ld1 {v16.4s-v19.4s}, [x3] | ||
93 | b 1f | ||
94 | |||
95 | /* load input */ | ||
96 | 0: ld1 {v16.4s-v19.4s}, [x1], #64 | ||
97 | sub w0, w0, #1 | ||
98 | |||
99 | 1: | ||
100 | CPU_LE( rev32 v16.16b, v16.16b ) | ||
101 | CPU_LE( rev32 v17.16b, v17.16b ) | ||
102 | CPU_LE( rev32 v18.16b, v18.16b ) | ||
103 | CPU_LE( rev32 v19.16b, v19.16b ) | ||
104 | |||
105 | 2: add t0.4s, v16.4s, v0.4s | ||
106 | mov dg0v.16b, dgav.16b | ||
107 | mov dg1v.16b, dgbv.16b | ||
108 | |||
109 | add_update 0, v1, 16, 17, 18, 19 | ||
110 | add_update 1, v2, 17, 18, 19, 16 | ||
111 | add_update 0, v3, 18, 19, 16, 17 | ||
112 | add_update 1, v4, 19, 16, 17, 18 | ||
113 | |||
114 | add_update 0, v5, 16, 17, 18, 19 | ||
115 | add_update 1, v6, 17, 18, 19, 16 | ||
116 | add_update 0, v7, 18, 19, 16, 17 | ||
117 | add_update 1, v8, 19, 16, 17, 18 | ||
118 | |||
119 | add_update 0, v9, 16, 17, 18, 19 | ||
120 | add_update 1, v10, 17, 18, 19, 16 | ||
121 | add_update 0, v11, 18, 19, 16, 17 | ||
122 | add_update 1, v12, 19, 16, 17, 18 | ||
123 | |||
124 | add_only 0, v13, 17 | ||
125 | add_only 1, v14, 18 | ||
126 | add_only 0, v15, 19 | ||
127 | add_only 1 | ||
128 | |||
129 | /* update state */ | ||
130 | add dgav.4s, dgav.4s, dg0v.4s | ||
131 | add dgbv.4s, dgbv.4s, dg1v.4s | ||
132 | |||
133 | /* handled all input blocks? */ | ||
134 | cbnz w0, 0b | ||
135 | |||
136 | /* | ||
137 | * Final block: add padding and total bit count. | ||
138 | * Skip if we have no total byte count in x4. In that case, the input | ||
139 | * size was not a round multiple of the block size, and the padding is | ||
140 | * handled by the C code. | ||
141 | */ | ||
142 | cbz x4, 3f | ||
143 | movi v17.2d, #0 | ||
144 | mov x8, #0x80000000 | ||
145 | movi v18.2d, #0 | ||
146 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
147 | fmov d16, x8 | ||
148 | mov x4, #0 | ||
149 | mov v19.d[0], xzr | ||
150 | mov v19.d[1], x7 | ||
151 | b 2b | ||
152 | |||
153 | /* store new state */ | ||
154 | 3: stp dga, dgb, [x2] | ||
155 | ret | ||
156 | ENDPROC(sha2_ce_transform) | ||
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 000000000000..c294e67d3925 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <crypto/sha.h> | ||
15 | #include <linux/cpufeature.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); | ||
20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
21 | MODULE_LICENSE("GPL v2"); | ||
22 | |||
23 | asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
24 | u8 *head, long bytes); | ||
25 | |||
26 | static int sha224_init(struct shash_desc *desc) | ||
27 | { | ||
28 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
29 | |||
30 | *sctx = (struct sha256_state){ | ||
31 | .state = { | ||
32 | SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, | ||
33 | SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, | ||
34 | } | ||
35 | }; | ||
36 | return 0; | ||
37 | } | ||
38 | |||
39 | static int sha256_init(struct shash_desc *desc) | ||
40 | { | ||
41 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
42 | |||
43 | *sctx = (struct sha256_state){ | ||
44 | .state = { | ||
45 | SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, | ||
46 | SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, | ||
47 | } | ||
48 | }; | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static int sha2_update(struct shash_desc *desc, const u8 *data, | ||
53 | unsigned int len) | ||
54 | { | ||
55 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
56 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
57 | |||
58 | sctx->count += len; | ||
59 | |||
60 | if ((partial + len) >= SHA256_BLOCK_SIZE) { | ||
61 | int blocks; | ||
62 | |||
63 | if (partial) { | ||
64 | int p = SHA256_BLOCK_SIZE - partial; | ||
65 | |||
66 | memcpy(sctx->buf + partial, data, p); | ||
67 | data += p; | ||
68 | len -= p; | ||
69 | } | ||
70 | |||
71 | blocks = len / SHA256_BLOCK_SIZE; | ||
72 | len %= SHA256_BLOCK_SIZE; | ||
73 | |||
74 | kernel_neon_begin_partial(28); | ||
75 | sha2_ce_transform(blocks, data, sctx->state, | ||
76 | partial ? sctx->buf : NULL, 0); | ||
77 | kernel_neon_end(); | ||
78 | |||
79 | data += blocks * SHA256_BLOCK_SIZE; | ||
80 | partial = 0; | ||
81 | } | ||
82 | if (len) | ||
83 | memcpy(sctx->buf + partial, data, len); | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | static void sha2_final(struct shash_desc *desc) | ||
88 | { | ||
89 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
90 | |||
91 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
92 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
93 | u32 padlen = SHA256_BLOCK_SIZE | ||
94 | - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); | ||
95 | |||
96 | sha2_update(desc, padding, padlen); | ||
97 | sha2_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
98 | } | ||
99 | |||
100 | static int sha224_final(struct shash_desc *desc, u8 *out) | ||
101 | { | ||
102 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
103 | __be32 *dst = (__be32 *)out; | ||
104 | int i; | ||
105 | |||
106 | sha2_final(desc); | ||
107 | |||
108 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
109 | put_unaligned_be32(sctx->state[i], dst++); | ||
110 | |||
111 | *sctx = (struct sha256_state){}; | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static int sha256_final(struct shash_desc *desc, u8 *out) | ||
116 | { | ||
117 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
118 | __be32 *dst = (__be32 *)out; | ||
119 | int i; | ||
120 | |||
121 | sha2_final(desc); | ||
122 | |||
123 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
124 | put_unaligned_be32(sctx->state[i], dst++); | ||
125 | |||
126 | *sctx = (struct sha256_state){}; | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | static void sha2_finup(struct shash_desc *desc, const u8 *data, | ||
131 | unsigned int len) | ||
132 | { | ||
133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
134 | int blocks; | ||
135 | |||
136 | if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { | ||
137 | sha2_update(desc, data, len); | ||
138 | sha2_final(desc); | ||
139 | return; | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
144 | * this case, there is no need to copy data around, and we can | ||
145 | * perform the entire digest calculation in a single invocation | ||
146 | * of sha2_ce_transform() | ||
147 | */ | ||
148 | blocks = len / SHA256_BLOCK_SIZE; | ||
149 | |||
150 | kernel_neon_begin_partial(28); | ||
151 | sha2_ce_transform(blocks, data, sctx->state, NULL, len); | ||
152 | kernel_neon_end(); | ||
153 | data += blocks * SHA256_BLOCK_SIZE; | ||
154 | } | ||
155 | |||
156 | static int sha224_finup(struct shash_desc *desc, const u8 *data, | ||
157 | unsigned int len, u8 *out) | ||
158 | { | ||
159 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
160 | __be32 *dst = (__be32 *)out; | ||
161 | int i; | ||
162 | |||
163 | sha2_finup(desc, data, len); | ||
164 | |||
165 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
166 | put_unaligned_be32(sctx->state[i], dst++); | ||
167 | |||
168 | *sctx = (struct sha256_state){}; | ||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | static int sha256_finup(struct shash_desc *desc, const u8 *data, | ||
173 | unsigned int len, u8 *out) | ||
174 | { | ||
175 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
176 | __be32 *dst = (__be32 *)out; | ||
177 | int i; | ||
178 | |||
179 | sha2_finup(desc, data, len); | ||
180 | |||
181 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
182 | put_unaligned_be32(sctx->state[i], dst++); | ||
183 | |||
184 | *sctx = (struct sha256_state){}; | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static int sha2_export(struct shash_desc *desc, void *out) | ||
189 | { | ||
190 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
191 | struct sha256_state *dst = out; | ||
192 | |||
193 | *dst = *sctx; | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int sha2_import(struct shash_desc *desc, const void *in) | ||
198 | { | ||
199 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
200 | struct sha256_state const *src = in; | ||
201 | |||
202 | *sctx = *src; | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | static struct shash_alg algs[] = { { | ||
207 | .init = sha224_init, | ||
208 | .update = sha2_update, | ||
209 | .final = sha224_final, | ||
210 | .finup = sha224_finup, | ||
211 | .export = sha2_export, | ||
212 | .import = sha2_import, | ||
213 | .descsize = sizeof(struct sha256_state), | ||
214 | .digestsize = SHA224_DIGEST_SIZE, | ||
215 | .statesize = sizeof(struct sha256_state), | ||
216 | .base = { | ||
217 | .cra_name = "sha224", | ||
218 | .cra_driver_name = "sha224-ce", | ||
219 | .cra_priority = 200, | ||
220 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
221 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
222 | .cra_module = THIS_MODULE, | ||
223 | } | ||
224 | }, { | ||
225 | .init = sha256_init, | ||
226 | .update = sha2_update, | ||
227 | .final = sha256_final, | ||
228 | .finup = sha256_finup, | ||
229 | .export = sha2_export, | ||
230 | .import = sha2_import, | ||
231 | .descsize = sizeof(struct sha256_state), | ||
232 | .digestsize = SHA256_DIGEST_SIZE, | ||
233 | .statesize = sizeof(struct sha256_state), | ||
234 | .base = { | ||
235 | .cra_name = "sha256", | ||
236 | .cra_driver_name = "sha256-ce", | ||
237 | .cra_priority = 200, | ||
238 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
239 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
240 | .cra_module = THIS_MODULE, | ||
241 | } | ||
242 | } }; | ||
243 | |||
244 | static int __init sha2_ce_mod_init(void) | ||
245 | { | ||
246 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); | ||
247 | } | ||
248 | |||
249 | static void __exit sha2_ce_mod_fini(void) | ||
250 | { | ||
251 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); | ||
252 | } | ||
253 | |||
254 | module_cpu_feature_match(SHA2, sha2_ce_mod_init); | ||
255 | module_exit(sha2_ce_mod_fini); | ||
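For illustration (not part of the patch): the finup fast path above only triggers when nothing is buffered and the request length is a whole number of 64-byte blocks. A 128-byte finup with an empty buffer calls sha2_ce_transform(2, data, sctx->state, NULL, 128), and the non-zero byte count in x4 lets the assembly append the padding/length block itself; a 100-byte finup instead falls back to sha2_update() plus sha2_final(), with the padding generated in C.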
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 83f71b3004a8..42c7eecd2bb6 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild | |||
@@ -40,6 +40,7 @@ generic-y += segment.h | |||
40 | generic-y += sembuf.h | 40 | generic-y += sembuf.h |
41 | generic-y += serial.h | 41 | generic-y += serial.h |
42 | generic-y += shmbuf.h | 42 | generic-y += shmbuf.h |
43 | generic-y += simd.h | ||
43 | generic-y += sizes.h | 44 | generic-y += sizes.h |
44 | generic-y += socket.h | 45 | generic-y += socket.h |
45 | generic-y += sockios.h | 46 | generic-y += sockios.h |
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index fd3e3924041b..5901480bfdca 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h | |||
@@ -21,6 +21,7 @@ | |||
21 | #endif | 21 | #endif |
22 | 22 | ||
23 | #include <asm/ptrace.h> | 23 | #include <asm/ptrace.h> |
24 | #include <asm/thread_info.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Stack pushing/popping (register pairs only). Equivalent to store decrement | 27 | * Stack pushing/popping (register pairs only). Equivalent to store decrement |
@@ -68,23 +69,31 @@ | |||
68 | msr daifclr, #8 | 69 | msr daifclr, #8 |
69 | .endm | 70 | .endm |
70 | 71 | ||
71 | .macro disable_step, tmp | 72 | .macro disable_step_tsk, flgs, tmp |
73 | tbz \flgs, #TIF_SINGLESTEP, 9990f | ||
72 | mrs \tmp, mdscr_el1 | 74 | mrs \tmp, mdscr_el1 |
73 | bic \tmp, \tmp, #1 | 75 | bic \tmp, \tmp, #1 |
74 | msr mdscr_el1, \tmp | 76 | msr mdscr_el1, \tmp |
77 | isb // Synchronise with enable_dbg | ||
78 | 9990: | ||
75 | .endm | 79 | .endm |
76 | 80 | ||
77 | .macro enable_step, tmp | 81 | .macro enable_step_tsk, flgs, tmp |
82 | tbz \flgs, #TIF_SINGLESTEP, 9990f | ||
83 | disable_dbg | ||
78 | mrs \tmp, mdscr_el1 | 84 | mrs \tmp, mdscr_el1 |
79 | orr \tmp, \tmp, #1 | 85 | orr \tmp, \tmp, #1 |
80 | msr mdscr_el1, \tmp | 86 | msr mdscr_el1, \tmp |
87 | 9990: | ||
81 | .endm | 88 | .endm |
82 | 89 | ||
83 | .macro enable_dbg_if_not_stepping, tmp | 90 | /* |
84 | mrs \tmp, mdscr_el1 | 91 | * Enable both debug exceptions and interrupts. This is likely to be |
85 | tbnz \tmp, #0, 9990f | 92 | * faster than two daifclr operations, since writes to this register |
86 | enable_dbg | 93 | * are self-synchronising. |
87 | 9990: | 94 | */ |
95 | .macro enable_dbg_and_irq | ||
96 | msr daifclr, #(8 | 2) | ||
88 | .endm | 97 | .endm |
89 | 98 | ||
90 | /* | 99 | /* |
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index 57e8cb49824c..65f1569ac96e 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h | |||
@@ -157,7 +157,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) | |||
157 | */ | 157 | */ |
158 | #define ATOMIC64_INIT(i) { (i) } | 158 | #define ATOMIC64_INIT(i) { (i) } |
159 | 159 | ||
160 | #define atomic64_read(v) (*(volatile long long *)&(v)->counter) | 160 | #define atomic64_read(v) (*(volatile long *)&(v)->counter) |
161 | #define atomic64_set(v,i) (((v)->counter) = (i)) | 161 | #define atomic64_set(v,i) (((v)->counter) = (i)) |
162 | 162 | ||
163 | static inline void atomic64_add(u64 i, atomic64_t *v) | 163 | static inline void atomic64_add(u64 i, atomic64_t *v) |
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 48b9e704af7c..6389d60574d9 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h | |||
@@ -25,12 +25,12 @@ | |||
25 | #define wfi() asm volatile("wfi" : : : "memory") | 25 | #define wfi() asm volatile("wfi" : : : "memory") |
26 | 26 | ||
27 | #define isb() asm volatile("isb" : : : "memory") | 27 | #define isb() asm volatile("isb" : : : "memory") |
28 | #define dmb(opt) asm volatile("dmb sy" : : : "memory") | 28 | #define dmb(opt) asm volatile("dmb " #opt : : : "memory") |
29 | #define dsb(opt) asm volatile("dsb sy" : : : "memory") | 29 | #define dsb(opt) asm volatile("dsb " #opt : : : "memory") |
30 | 30 | ||
31 | #define mb() dsb() | 31 | #define mb() dsb(sy) |
32 | #define rmb() asm volatile("dsb ld" : : : "memory") | 32 | #define rmb() dsb(ld) |
33 | #define wmb() asm volatile("dsb st" : : : "memory") | 33 | #define wmb() dsb(st) |
34 | 34 | ||
35 | #ifndef CONFIG_SMP | 35 | #ifndef CONFIG_SMP |
36 | #define smp_mb() barrier() | 36 | #define smp_mb() barrier() |
@@ -40,7 +40,7 @@ | |||
40 | #define smp_store_release(p, v) \ | 40 | #define smp_store_release(p, v) \ |
41 | do { \ | 41 | do { \ |
42 | compiletime_assert_atomic_type(*p); \ | 42 | compiletime_assert_atomic_type(*p); \ |
43 | smp_mb(); \ | 43 | barrier(); \ |
44 | ACCESS_ONCE(*p) = (v); \ | 44 | ACCESS_ONCE(*p) = (v); \ |
45 | } while (0) | 45 | } while (0) |
46 | 46 | ||
@@ -48,15 +48,15 @@ do { \ | |||
48 | ({ \ | 48 | ({ \ |
49 | typeof(*p) ___p1 = ACCESS_ONCE(*p); \ | 49 | typeof(*p) ___p1 = ACCESS_ONCE(*p); \ |
50 | compiletime_assert_atomic_type(*p); \ | 50 | compiletime_assert_atomic_type(*p); \ |
51 | smp_mb(); \ | 51 | barrier(); \ |
52 | ___p1; \ | 52 | ___p1; \ |
53 | }) | 53 | }) |
54 | 54 | ||
55 | #else | 55 | #else |
56 | 56 | ||
57 | #define smp_mb() asm volatile("dmb ish" : : : "memory") | 57 | #define smp_mb() dmb(ish) |
58 | #define smp_rmb() asm volatile("dmb ishld" : : : "memory") | 58 | #define smp_rmb() dmb(ishld) |
59 | #define smp_wmb() asm volatile("dmb ishst" : : : "memory") | 59 | #define smp_wmb() dmb(ishst) |
60 | 60 | ||
61 | #define smp_store_release(p, v) \ | 61 | #define smp_store_release(p, v) \ |
62 | do { \ | 62 | do { \ |
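For illustration (not part of the patch): with the stringifying definitions above, smp_wmb() now expands to asm volatile("dmb ishst" : : : "memory") and rmb() to asm volatile("dsb ld" : : : "memory"), whereas the old dmb(opt)/dsb(opt) macros discarded their argument and always emitted the heavier sy variant for their callers.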
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 390308a67f0d..88cc05b5f3ac 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h | |||
@@ -16,6 +16,8 @@ | |||
16 | #ifndef __ASM_CACHE_H | 16 | #ifndef __ASM_CACHE_H |
17 | #define __ASM_CACHE_H | 17 | #define __ASM_CACHE_H |
18 | 18 | ||
19 | #include <asm/cachetype.h> | ||
20 | |||
19 | #define L1_CACHE_SHIFT 6 | 21 | #define L1_CACHE_SHIFT 6 |
20 | #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) | 22 | #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) |
21 | 23 | ||
@@ -27,6 +29,15 @@ | |||
27 | * the CPU. | 29 | * the CPU. |
28 | */ | 30 | */ |
29 | #define ARCH_DMA_MINALIGN L1_CACHE_BYTES | 31 | #define ARCH_DMA_MINALIGN L1_CACHE_BYTES |
30 | #define ARCH_SLAB_MINALIGN 8 | 32 | |
33 | #ifndef __ASSEMBLY__ | ||
34 | |||
35 | static inline int cache_line_size(void) | ||
36 | { | ||
37 | u32 cwg = cache_type_cwg(); | ||
38 | return cwg ? 4 << cwg : L1_CACHE_BYTES; | ||
39 | } | ||
40 | |||
41 | #endif /* __ASSEMBLY__ */ | ||
31 | 42 | ||
32 | #endif | 43 | #endif |
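For illustration (not part of the patch): CTR_EL0.CWG is the log2 of the cache writeback granule in 4-byte words, so cache_line_size() converts it to bytes as 4 << cwg; a CWG field of 4 gives 4 << 4 = 64 bytes, and a zero (unreported) CWG falls back to L1_CACHE_BYTES, i.e. 1 << 6 = 64.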
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 4c60e64a801c..a5176cf32dad 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h | |||
@@ -123,7 +123,7 @@ extern void flush_dcache_page(struct page *); | |||
123 | static inline void __flush_icache_all(void) | 123 | static inline void __flush_icache_all(void) |
124 | { | 124 | { |
125 | asm("ic ialluis"); | 125 | asm("ic ialluis"); |
126 | dsb(); | 126 | dsb(ish); |
127 | } | 127 | } |
128 | 128 | ||
129 | #define flush_dcache_mmap_lock(mapping) \ | 129 | #define flush_dcache_mmap_lock(mapping) \ |
@@ -150,7 +150,7 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) | |||
150 | * set_pte_at() called from vmap_pte_range() does not | 150 | * set_pte_at() called from vmap_pte_range() does not |
151 | * have a DSB after cleaning the cache line. | 151 | * have a DSB after cleaning the cache line. |
152 | */ | 152 | */ |
153 | dsb(); | 153 | dsb(ish); |
154 | } | 154 | } |
155 | 155 | ||
156 | static inline void flush_cache_vunmap(unsigned long start, unsigned long end) | 156 | static inline void flush_cache_vunmap(unsigned long start, unsigned long end) |
diff --git a/arch/arm64/include/asm/cachetype.h b/arch/arm64/include/asm/cachetype.h index 85f5f511352a..4b23e758d5e0 100644 --- a/arch/arm64/include/asm/cachetype.h +++ b/arch/arm64/include/asm/cachetype.h | |||
@@ -20,12 +20,16 @@ | |||
20 | 20 | ||
21 | #define CTR_L1IP_SHIFT 14 | 21 | #define CTR_L1IP_SHIFT 14 |
22 | #define CTR_L1IP_MASK 3 | 22 | #define CTR_L1IP_MASK 3 |
23 | #define CTR_CWG_SHIFT 24 | ||
24 | #define CTR_CWG_MASK 15 | ||
23 | 25 | ||
24 | #define ICACHE_POLICY_RESERVED 0 | 26 | #define ICACHE_POLICY_RESERVED 0 |
25 | #define ICACHE_POLICY_AIVIVT 1 | 27 | #define ICACHE_POLICY_AIVIVT 1 |
26 | #define ICACHE_POLICY_VIPT 2 | 28 | #define ICACHE_POLICY_VIPT 2 |
27 | #define ICACHE_POLICY_PIPT 3 | 29 | #define ICACHE_POLICY_PIPT 3 |
28 | 30 | ||
31 | #ifndef __ASSEMBLY__ | ||
32 | |||
29 | static inline u32 icache_policy(void) | 33 | static inline u32 icache_policy(void) |
30 | { | 34 | { |
31 | return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK; | 35 | return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK; |
@@ -45,4 +49,11 @@ static inline int icache_is_aivivt(void) | |||
45 | return icache_policy() == ICACHE_POLICY_AIVIVT; | 49 | return icache_policy() == ICACHE_POLICY_AIVIVT; |
46 | } | 50 | } |
47 | 51 | ||
52 | static inline u32 cache_type_cwg(void) | ||
53 | { | ||
54 | return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; | ||
55 | } | ||
56 | |||
57 | #endif /* __ASSEMBLY__ */ | ||
58 | |||
48 | #endif /* __ASM_CACHETYPE_H */ | 59 | #endif /* __ASM_CACHETYPE_H */ |
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 57c0fa7bf711..ddb9d7830558 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h | |||
@@ -72,7 +72,12 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size | |||
72 | } | 72 | } |
73 | 73 | ||
74 | #define xchg(ptr,x) \ | 74 | #define xchg(ptr,x) \ |
75 | ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) | 75 | ({ \ |
76 | __typeof__(*(ptr)) __ret; \ | ||
77 | __ret = (__typeof__(*(ptr))) \ | ||
78 | __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \ | ||
79 | __ret; \ | ||
80 | }) | ||
76 | 81 | ||
77 | static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, | 82 | static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, |
78 | unsigned long new, int size) | 83 | unsigned long new, int size) |
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index e71f81fe127a..253e33bc94fb 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h | |||
@@ -305,11 +305,6 @@ static inline int is_compat_thread(struct thread_info *thread) | |||
305 | 305 | ||
306 | #else /* !CONFIG_COMPAT */ | 306 | #else /* !CONFIG_COMPAT */ |
307 | 307 | ||
308 | static inline int is_compat_task(void) | ||
309 | { | ||
310 | return 0; | ||
311 | } | ||
312 | |||
313 | static inline int is_compat_thread(struct thread_info *thread) | 308 | static inline int is_compat_thread(struct thread_info *thread) |
314 | { | 309 | { |
315 | return 0; | 310 | return 0; |
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index c4a7f940b387..72674f4c3871 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h | |||
@@ -18,9 +18,11 @@ | |||
18 | #ifndef __ASM_ESR_H | 18 | #ifndef __ASM_ESR_H |
19 | #define __ASM_ESR_H | 19 | #define __ASM_ESR_H |
20 | 20 | ||
21 | #define ESR_EL1_EC_SHIFT (26) | 21 | #define ESR_EL1_WRITE (1 << 6) |
22 | #define ESR_EL1_IL (1U << 25) | 22 | #define ESR_EL1_CM (1 << 8) |
23 | #define ESR_EL1_IL (1 << 25) | ||
23 | 24 | ||
25 | #define ESR_EL1_EC_SHIFT (26) | ||
24 | #define ESR_EL1_EC_UNKNOWN (0x00) | 26 | #define ESR_EL1_EC_UNKNOWN (0x00) |
25 | #define ESR_EL1_EC_WFI (0x01) | 27 | #define ESR_EL1_EC_WFI (0x01) |
26 | #define ESR_EL1_EC_CP15_32 (0x03) | 28 | #define ESR_EL1_EC_CP15_32 (0x03) |
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index c43b4ac13008..50f559f574fe 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h | |||
@@ -37,8 +37,21 @@ struct fpsimd_state { | |||
37 | u32 fpcr; | 37 | u32 fpcr; |
38 | }; | 38 | }; |
39 | }; | 39 | }; |
40 | /* the id of the last cpu to have restored this state */ | ||
41 | unsigned int cpu; | ||
40 | }; | 42 | }; |
41 | 43 | ||
44 | /* | ||
45 | * Struct for stacking the bottom 'n' FP/SIMD registers. | ||
46 | */ | ||
47 | struct fpsimd_partial_state { | ||
48 | u32 fpsr; | ||
49 | u32 fpcr; | ||
50 | u32 num_regs; | ||
51 | __uint128_t vregs[32]; | ||
52 | }; | ||
53 | |||
54 | |||
42 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) | 55 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) |
43 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ | 56 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ |
44 | #define VFP_FPSCR_STAT_MASK 0xf800009f | 57 | #define VFP_FPSCR_STAT_MASK 0xf800009f |
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state); | |||
58 | extern void fpsimd_thread_switch(struct task_struct *next); | 71 | extern void fpsimd_thread_switch(struct task_struct *next); |
59 | extern void fpsimd_flush_thread(void); | 72 | extern void fpsimd_flush_thread(void); |
60 | 73 | ||
74 | extern void fpsimd_preserve_current_state(void); | ||
75 | extern void fpsimd_restore_current_state(void); | ||
76 | extern void fpsimd_update_current_state(struct fpsimd_state *state); | ||
77 | |||
78 | extern void fpsimd_flush_task_state(struct task_struct *target); | ||
79 | |||
80 | extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, | ||
81 | u32 num_regs); | ||
82 | extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); | ||
83 | |||
61 | #endif | 84 | #endif |
62 | 85 | ||
63 | #endif | 86 | #endif |
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index bbec599c96bd..768414d55e64 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h | |||
@@ -62,3 +62,38 @@ | |||
62 | ldr w\tmpnr, [\state, #16 * 2 + 4] | 62 | ldr w\tmpnr, [\state, #16 * 2 + 4] |
63 | msr fpcr, x\tmpnr | 63 | msr fpcr, x\tmpnr |
64 | .endm | 64 | .endm |
65 | |||
66 | .altmacro | ||
67 | .macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 | ||
68 | mrs x\tmpnr1, fpsr | ||
69 | str w\numnr, [\state, #8] | ||
70 | mrs x\tmpnr2, fpcr | ||
71 | stp w\tmpnr1, w\tmpnr2, [\state] | ||
72 | adr x\tmpnr1, 0f | ||
73 | add \state, \state, x\numnr, lsl #4 | ||
74 | sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 | ||
75 | br x\tmpnr1 | ||
76 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
77 | .irp qb, %(qa + 1) | ||
78 | stp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
79 | .endr | ||
80 | .endr | ||
81 | 0: | ||
82 | .endm | ||
83 | |||
84 | .macro fpsimd_restore_partial state, tmpnr1, tmpnr2 | ||
85 | ldp w\tmpnr1, w\tmpnr2, [\state] | ||
86 | msr fpsr, x\tmpnr1 | ||
87 | msr fpcr, x\tmpnr2 | ||
88 | adr x\tmpnr1, 0f | ||
89 | ldr w\tmpnr2, [\state, #8] | ||
90 | add \state, \state, x\tmpnr2, lsl #4 | ||
91 | sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 | ||
92 | br x\tmpnr1 | ||
93 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
94 | .irp qb, %(qa + 1) | ||
95 | ldp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
96 | .endr | ||
97 | .endr | ||
98 | 0: | ||
99 | .endm | ||
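For illustration (not part of the patch): the .irp block in fpsimd_save_partial emits sixteen 4-byte stp instructions covering q31..q0 in pairs. The macro branches to 0f minus num_regs * 2 bytes, so only the last num_regs / 2 stores run; with num_regs = 16 (the value the SHA-1 glue above passes to kernel_neon_begin_partial()) that is 32 bytes back, leaving just the eight stp instructions for q0-q15, and the earlier add of num_regs << 4 to the state pointer makes their negative offsets land in the vregs[] area right after the 16-byte fpsr/fpcr/num_regs header.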
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h new file mode 100644 index 000000000000..c5534facf941 --- /dev/null +++ b/arch/arm64/include/asm/ftrace.h | |||
@@ -0,0 +1,59 @@ | |||
1 | /* | ||
2 | * arch/arm64/include/asm/ftrace.h | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | #ifndef __ASM_FTRACE_H | ||
12 | #define __ASM_FTRACE_H | ||
13 | |||
14 | #include <asm/insn.h> | ||
15 | |||
16 | #define MCOUNT_ADDR ((unsigned long)_mcount) | ||
17 | #define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE | ||
18 | |||
19 | #ifndef __ASSEMBLY__ | ||
20 | #include <linux/compat.h> | ||
21 | |||
22 | extern void _mcount(unsigned long); | ||
23 | extern void *return_address(unsigned int); | ||
24 | |||
25 | struct dyn_arch_ftrace { | ||
26 | /* No extra data needed for arm64 */ | ||
27 | }; | ||
28 | |||
29 | extern unsigned long ftrace_graph_call; | ||
30 | |||
31 | static inline unsigned long ftrace_call_adjust(unsigned long addr) | ||
32 | { | ||
33 | /* | ||
34 | * addr is the address of the mcount call instruction. | ||
35 | * recordmcount does the necessary offset calculation. | ||
36 | */ | ||
37 | return addr; | ||
38 | } | ||
39 | |||
40 | #define ftrace_return_address(n) return_address(n) | ||
41 | |||
42 | /* | ||
43 | * Because AArch32 mode does not share the same syscall table with AArch64, | ||
44 | * tracing compat syscalls may result in reporting bogus syscalls or even | ||
45 | * a hang, so just do not trace them. | ||
46 | * See kernel/trace/trace_syscalls.c | ||
47 | * | ||
48 | * x86 code says: | ||
49 | * If the user really wants these, then they should use the | ||

50 | * raw syscall tracepoints with filtering. | ||
51 | */ | ||
52 | #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS | ||
53 | static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) | ||
54 | { | ||
55 | return is_compat_task(); | ||
56 | } | ||
57 | #endif /* ifndef __ASSEMBLY__ */ | ||
58 | |||
59 | #endif /* __ASM_FTRACE_H */ | ||
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index ae4801d77514..0be67821f9ce 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/threads.h> | 20 | #include <linux/threads.h> |
21 | #include <asm/irq.h> | 21 | #include <asm/irq.h> |
22 | 22 | ||
23 | #define NR_IPI 5 | 23 | #define NR_IPI 6 |
24 | 24 | ||
25 | typedef struct { | 25 | typedef struct { |
26 | unsigned int __softirq_pending; | 26 | unsigned int __softirq_pending; |
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index c44ad39ed310..dc1f73b13e74 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h | |||
@@ -21,6 +21,7 @@ | |||
21 | /* A64 instructions are always 32 bits. */ | 21 | /* A64 instructions are always 32 bits. */ |
22 | #define AARCH64_INSN_SIZE 4 | 22 | #define AARCH64_INSN_SIZE 4 |
23 | 23 | ||
24 | #ifndef __ASSEMBLY__ | ||
24 | /* | 25 | /* |
25 | * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a | 26 | * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a |
26 | * Section C3.1 "A64 instruction index by encoding": | 27 | * Section C3.1 "A64 instruction index by encoding": |
@@ -104,5 +105,6 @@ bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn); | |||
104 | int aarch64_insn_patch_text_nosync(void *addr, u32 insn); | 105 | int aarch64_insn_patch_text_nosync(void *addr, u32 insn); |
105 | int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); | 106 | int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); |
106 | int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); | 107 | int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); |
108 | #endif /* __ASSEMBLY__ */ | ||
107 | 109 | ||
108 | #endif /* __ASM_INSN_H */ | 110 | #endif /* __ASM_INSN_H */ |
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index a1bef78f0303..e0ecdcf6632d 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h | |||
@@ -230,19 +230,11 @@ extern void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot | |||
230 | extern void __iounmap(volatile void __iomem *addr); | 230 | extern void __iounmap(volatile void __iomem *addr); |
231 | extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); | 231 | extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); |
232 | 232 | ||
233 | #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_DIRTY) | ||
234 | #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) | ||
235 | #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL_NC)) | ||
236 | #define PROT_NORMAL (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) | ||
237 | |||
238 | #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) | 233 | #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) |
239 | #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) | 234 | #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) |
240 | #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) | 235 | #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) |
241 | #define iounmap __iounmap | 236 | #define iounmap __iounmap |
242 | 237 | ||
243 | #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) | ||
244 | #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PTE_PXN | PTE_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) | ||
245 | |||
246 | #define ARCH_HAS_IOREMAP_WC | 238 | #define ARCH_HAS_IOREMAP_WC |
247 | #include <asm-generic/iomap.h> | 239 | #include <asm-generic/iomap.h> |
248 | 240 | ||
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index b0cc58a97780..13ce4cc18e26 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h | |||
@@ -8,7 +8,11 @@ | |||
8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/types.h> | ||
12 | |||
11 | #define cpu_has_neon() (1) | 13 | #define cpu_has_neon() (1) |
12 | 14 | ||
13 | void kernel_neon_begin(void); | 15 | #define kernel_neon_begin() kernel_neon_begin_partial(32) |
16 | |||
17 | void kernel_neon_begin_partial(u32 num_regs); | ||
14 | void kernel_neon_end(void); | 18 | void kernel_neon_end(void); |
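For illustration (not part of the patch): a minimal sketch of the calling convention the updated header exposes; do_neon_work() is a made-up name, and the pattern mirrors what the sha1/sha2 glue code above does.

	#include <asm/neon.h>

	static void do_neon_work(void)		/* hypothetical caller */
	{
		/*
		 * Only q0-q15 are clobbered here, so request a partial
		 * save/restore; kernel_neon_begin() is now simply the
		 * full 32-register case.
		 */
		kernel_neon_begin_partial(16);
		/* ... NEON / Crypto Extensions instructions ... */
		kernel_neon_end();
	}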
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5fc8a66c3924..955e8c5f0afb 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h | |||
@@ -29,6 +29,8 @@ | |||
29 | */ | 29 | */ |
30 | 30 | ||
31 | #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) | 31 | #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) |
32 | #define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0) | ||
33 | #define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0) | ||
32 | 34 | ||
33 | /* | 35 | /* |
34 | * Level 2 descriptor (PMD). | 36 | * Level 2 descriptor (PMD). |
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e2f96748859b..598cc384fc1c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h | |||
@@ -52,66 +52,59 @@ extern void __pgd_error(const char *file, int line, unsigned long val); | |||
52 | #endif | 52 | #endif |
53 | #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) | 53 | #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) |
54 | 54 | ||
55 | /* | 55 | #ifdef CONFIG_SMP |
56 | * The pgprot_* and protection_map entries will be fixed up at runtime to | 56 | #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) |
57 | * include the cachable and bufferable bits based on memory policy, as well as | 57 | #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) |
58 | * any architecture dependent bits like global/ASID and SMP shared mapping | 58 | #else |
59 | * bits. | 59 | #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF) |
60 | */ | 60 | #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) |
61 | #define _PAGE_DEFAULT PTE_TYPE_PAGE | PTE_AF | 61 | #endif |
62 | 62 | ||
63 | extern pgprot_t pgprot_default; | 63 | #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) |
64 | #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC)) | ||
65 | #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL)) | ||
64 | 66 | ||
65 | #define __pgprot_modify(prot,mask,bits) \ | 67 | #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) |
66 | __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) | 68 | #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) |
69 | #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) | ||
67 | 70 | ||
68 | #define _MOD_PROT(p, b) __pgprot_modify(p, 0, b) | 71 | #define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) |
69 | 72 | ||
70 | #define PAGE_NONE __pgprot_modify(pgprot_default, PTE_TYPE_MASK, PTE_PROT_NONE | PTE_PXN | PTE_UXN) | 73 | #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) |
71 | #define PAGE_SHARED _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) | 74 | #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) |
72 | #define PAGE_SHARED_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) | ||
73 | #define PAGE_COPY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | ||
74 | #define PAGE_COPY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) | ||
75 | #define PAGE_READONLY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | ||
76 | #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) | ||
77 | #define PAGE_KERNEL _MOD_PROT(pgprot_default, PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) | ||
78 | #define PAGE_KERNEL_EXEC _MOD_PROT(pgprot_default, PTE_UXN | PTE_DIRTY | PTE_WRITE) | ||
79 | 75 | ||
80 | #define PAGE_HYP _MOD_PROT(pgprot_default, PTE_HYP) | 76 | #define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) |
81 | #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) | 77 | #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) |
82 | 78 | ||
83 | #define PAGE_S2 __pgprot_modify(pgprot_default, PTE_S2_MEMATTR_MASK, PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) | 79 | #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) |
84 | #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) | 80 | #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) |
85 | 81 | ||
86 | #define __PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) | 82 | #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) |
87 | #define __PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) | 83 | #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) |
88 | #define __PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) | 84 | #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) |
89 | #define __PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | 85 | #define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) |
90 | #define __PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) | 86 | #define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) |
91 | #define __PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | 87 | #define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) |
92 | #define __PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) | 88 | #define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) |
93 | 89 | ||
94 | #endif /* __ASSEMBLY__ */ | 90 | #define __P000 PAGE_NONE |
95 | 91 | #define __P001 PAGE_READONLY | |
96 | #define __P000 __PAGE_NONE | 92 | #define __P010 PAGE_COPY |
97 | #define __P001 __PAGE_READONLY | 93 | #define __P011 PAGE_COPY |
98 | #define __P010 __PAGE_COPY | 94 | #define __P100 PAGE_READONLY_EXEC |
99 | #define __P011 __PAGE_COPY | 95 | #define __P101 PAGE_READONLY_EXEC |
100 | #define __P100 __PAGE_READONLY_EXEC | 96 | #define __P110 PAGE_COPY_EXEC |
101 | #define __P101 __PAGE_READONLY_EXEC | 97 | #define __P111 PAGE_COPY_EXEC |
102 | #define __P110 __PAGE_COPY_EXEC | 98 | |
103 | #define __P111 __PAGE_COPY_EXEC | 99 | #define __S000 PAGE_NONE |
104 | 100 | #define __S001 PAGE_READONLY | |
105 | #define __S000 __PAGE_NONE | 101 | #define __S010 PAGE_SHARED |
106 | #define __S001 __PAGE_READONLY | 102 | #define __S011 PAGE_SHARED |
107 | #define __S010 __PAGE_SHARED | 103 | #define __S100 PAGE_READONLY_EXEC |
108 | #define __S011 __PAGE_SHARED | 104 | #define __S101 PAGE_READONLY_EXEC |
109 | #define __S100 __PAGE_READONLY_EXEC | 105 | #define __S110 PAGE_SHARED_EXEC |
110 | #define __S101 __PAGE_READONLY_EXEC | 106 | #define __S111 PAGE_SHARED_EXEC |
111 | #define __S110 __PAGE_SHARED_EXEC | ||
112 | #define __S111 __PAGE_SHARED_EXEC | ||
113 | 107 | ||
114 | #ifndef __ASSEMBLY__ | ||
115 | /* | 108 | /* |
116 | * ZERO_PAGE is a global shared page that is always zero: used | 109 | * ZERO_PAGE is a global shared page that is always zero: used |
117 | * for zero-mapped memory areas etc.. | 110 | * for zero-mapped memory areas etc.. |
@@ -265,6 +258,7 @@ static inline pmd_t pte_pmd(pte_t pte) | |||
265 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) | 258 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) |
266 | 259 | ||
267 | #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) | 260 | #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) |
261 | #define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) | ||
268 | 262 | ||
269 | #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) | 263 | #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) |
270 | 264 | ||
@@ -273,6 +267,9 @@ static inline int has_transparent_hugepage(void) | |||
273 | return 1; | 267 | return 1; |
274 | } | 268 | } |
275 | 269 | ||
270 | #define __pgprot_modify(prot,mask,bits) \ | ||
271 | __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) | ||
272 | |||
276 | /* | 273 | /* |
277 | * Mark the prot value as uncacheable and unbufferable. | 274 | * Mark the prot value as uncacheable and unbufferable. |
278 | */ | 275 | */ |
@@ -295,11 +292,17 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | |||
295 | #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ | 292 | #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ |
296 | PMD_TYPE_SECT) | 293 | PMD_TYPE_SECT) |
297 | 294 | ||
295 | #ifdef CONFIG_ARM64_64K_PAGES | ||
296 | #define pud_sect(pud) (0) | ||
297 | #else | ||
298 | #define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ | ||
299 | PUD_TYPE_SECT) | ||
300 | #endif | ||
298 | 301 | ||
299 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) | 302 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) |
300 | { | 303 | { |
301 | *pmdp = pmd; | 304 | *pmdp = pmd; |
302 | dsb(); | 305 | dsb(ishst); |
303 | } | 306 | } |
304 | 307 | ||
305 | static inline void pmd_clear(pmd_t *pmdp) | 308 | static inline void pmd_clear(pmd_t *pmdp) |
@@ -329,7 +332,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) | |||
329 | static inline void set_pud(pud_t *pudp, pud_t pud) | 332 | static inline void set_pud(pud_t *pudp, pud_t pud) |
330 | { | 333 | { |
331 | *pudp = pud; | 334 | *pudp = pud; |
332 | dsb(); | 335 | dsb(ishst); |
333 | } | 336 | } |
334 | 337 | ||
335 | static inline void pud_clear(pud_t *pudp) | 338 | static inline void pud_clear(pud_t *pudp) |
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 45b20cd6cbca..34de2a8f7d93 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h | |||
@@ -79,6 +79,7 @@ struct thread_struct { | |||
79 | unsigned long tp_value; | 79 | unsigned long tp_value; |
80 | struct fpsimd_state fpsimd_state; | 80 | struct fpsimd_state fpsimd_state; |
81 | unsigned long fault_address; /* fault info */ | 81 | unsigned long fault_address; /* fault info */ |
82 | unsigned long fault_code; /* ESR_EL1 value */ | ||
82 | struct debug_info debug; /* debugging */ | 83 | struct debug_info debug; /* debugging */ |
83 | }; | 84 | }; |
84 | 85 | ||
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index c7ba261dd4b3..a429b5940be2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h | |||
@@ -135,6 +135,11 @@ struct pt_regs { | |||
135 | #define user_stack_pointer(regs) \ | 135 | #define user_stack_pointer(regs) \ |
136 | (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) | 136 | (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) |
137 | 137 | ||
138 | static inline unsigned long regs_return_value(struct pt_regs *regs) | ||
139 | { | ||
140 | return regs->regs[0]; | ||
141 | } | ||
142 | |||
138 | /* | 143 | /* |
139 | * Are the current registers suitable for user mode? (used to maintain | 144 | * Are the current registers suitable for user mode? (used to maintain |
140 | * security in signal handlers) | 145 | * security in signal handlers) |
diff --git a/arch/arm64/include/asm/sigcontext.h b/arch/arm64/include/asm/sigcontext.h deleted file mode 100644 index dca1094acc74..000000000000 --- a/arch/arm64/include/asm/sigcontext.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 ARM Ltd. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | #ifndef __ASM_SIGCONTEXT_H | ||
17 | #define __ASM_SIGCONTEXT_H | ||
18 | |||
19 | #include <uapi/asm/sigcontext.h> | ||
20 | |||
21 | /* | ||
22 | * Auxiliary context saved in the sigcontext.__reserved array. Not exported to | ||
23 | * user space as it will change with the addition of new context. User space | ||
24 | * should check the magic/size information. | ||
25 | */ | ||
26 | struct aux_context { | ||
27 | struct fpsimd_context fpsimd; | ||
28 | /* additional context to be added before "end" */ | ||
29 | struct _aarch64_ctx end; | ||
30 | }; | ||
31 | #endif | ||
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3ee8b303d9a9..64d2d4884a9d 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h | |||
@@ -22,6 +22,18 @@ extern char *strrchr(const char *, int c); | |||
22 | #define __HAVE_ARCH_STRCHR | 22 | #define __HAVE_ARCH_STRCHR |
23 | extern char *strchr(const char *, int c); | 23 | extern char *strchr(const char *, int c); |
24 | 24 | ||
25 | #define __HAVE_ARCH_STRCMP | ||
26 | extern int strcmp(const char *, const char *); | ||
27 | |||
28 | #define __HAVE_ARCH_STRNCMP | ||
29 | extern int strncmp(const char *, const char *, __kernel_size_t); | ||
30 | |||
31 | #define __HAVE_ARCH_STRLEN | ||
32 | extern __kernel_size_t strlen(const char *); | ||
33 | |||
34 | #define __HAVE_ARCH_STRNLEN | ||
35 | extern __kernel_size_t strnlen(const char *, __kernel_size_t); | ||
36 | |||
25 | #define __HAVE_ARCH_MEMCPY | 37 | #define __HAVE_ARCH_MEMCPY |
26 | extern void *memcpy(void *, const void *, __kernel_size_t); | 38 | extern void *memcpy(void *, const void *, __kernel_size_t); |
27 | 39 | ||
@@ -34,4 +46,7 @@ extern void *memchr(const void *, int, __kernel_size_t); | |||
34 | #define __HAVE_ARCH_MEMSET | 46 | #define __HAVE_ARCH_MEMSET |
35 | extern void *memset(void *, int, __kernel_size_t); | 47 | extern void *memset(void *, int, __kernel_size_t); |
36 | 48 | ||
49 | #define __HAVE_ARCH_MEMCMP | ||
50 | extern int memcmp(const void *, const void *, size_t); | ||
51 | |||
37 | #endif | 52 | #endif |
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index 70ba9d4ee978..383771eb0b87 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include <linux/err.h> | 19 | #include <linux/err.h> |
20 | 20 | ||
21 | extern const void *sys_call_table[]; | ||
21 | 22 | ||
22 | static inline int syscall_get_nr(struct task_struct *task, | 23 | static inline int syscall_get_nr(struct task_struct *task, |
23 | struct pt_regs *regs) | 24 | struct pt_regs *regs) |
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 7b8e3a2a00fb..e40b6d06d515 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h | |||
@@ -91,6 +91,9 @@ static inline struct thread_info *current_thread_info(void) | |||
91 | /* | 91 | /* |
92 | * thread information flags: | 92 | * thread information flags: |
93 | * TIF_SYSCALL_TRACE - syscall trace active | 93 | * TIF_SYSCALL_TRACE - syscall trace active |
94 | * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace | ||
95 | * TIF_SYSCALL_AUDIT - syscall auditing | ||
96 | * TIF_SECCOMP - syscall secure computing | ||
94 | * TIF_SIGPENDING - signal pending | 97 | * TIF_SIGPENDING - signal pending |
95 | * TIF_NEED_RESCHED - rescheduling necessary | 98 | * TIF_NEED_RESCHED - rescheduling necessary |
96 | * TIF_NOTIFY_RESUME - callback before returning to user | 99 | * TIF_NOTIFY_RESUME - callback before returning to user |
@@ -99,7 +102,11 @@ static inline struct thread_info *current_thread_info(void) | |||
99 | #define TIF_SIGPENDING 0 | 102 | #define TIF_SIGPENDING 0 |
100 | #define TIF_NEED_RESCHED 1 | 103 | #define TIF_NEED_RESCHED 1 |
101 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ | 104 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ |
105 | #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ | ||
102 | #define TIF_SYSCALL_TRACE 8 | 106 | #define TIF_SYSCALL_TRACE 8 |
107 | #define TIF_SYSCALL_AUDIT 9 | ||
108 | #define TIF_SYSCALL_TRACEPOINT 10 | ||
109 | #define TIF_SECCOMP 11 | ||
103 | #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ | 110 | #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ |
104 | #define TIF_FREEZE 19 | 111 | #define TIF_FREEZE 19 |
105 | #define TIF_RESTORE_SIGMASK 20 | 112 | #define TIF_RESTORE_SIGMASK 20 |
@@ -110,10 +117,18 @@ static inline struct thread_info *current_thread_info(void) | |||
110 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | 117 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) |
111 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | 118 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) |
112 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 119 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
120 | #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) | ||
121 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | ||
122 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | ||
123 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) | ||
124 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | ||
113 | #define _TIF_32BIT (1 << TIF_32BIT) | 125 | #define _TIF_32BIT (1 << TIF_32BIT) |
114 | 126 | ||
115 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ | 127 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ |
116 | _TIF_NOTIFY_RESUME) | 128 | _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) |
129 | |||
130 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ | ||
131 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) | ||
117 | 132 | ||
118 | #endif /* __KERNEL__ */ | 133 | #endif /* __KERNEL__ */ |
119 | #endif /* __ASM_THREAD_INFO_H */ | 134 | #endif /* __ASM_THREAD_INFO_H */ |
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 8b482035cfc2..b9349c4513ea 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h | |||
@@ -72,9 +72,9 @@ extern struct cpu_tlb_fns cpu_tlb; | |||
72 | */ | 72 | */ |
73 | static inline void flush_tlb_all(void) | 73 | static inline void flush_tlb_all(void) |
74 | { | 74 | { |
75 | dsb(); | 75 | dsb(ishst); |
76 | asm("tlbi vmalle1is"); | 76 | asm("tlbi vmalle1is"); |
77 | dsb(); | 77 | dsb(ish); |
78 | isb(); | 78 | isb(); |
79 | } | 79 | } |
80 | 80 | ||
@@ -82,9 +82,9 @@ static inline void flush_tlb_mm(struct mm_struct *mm) | |||
82 | { | 82 | { |
83 | unsigned long asid = (unsigned long)ASID(mm) << 48; | 83 | unsigned long asid = (unsigned long)ASID(mm) << 48; |
84 | 84 | ||
85 | dsb(); | 85 | dsb(ishst); |
86 | asm("tlbi aside1is, %0" : : "r" (asid)); | 86 | asm("tlbi aside1is, %0" : : "r" (asid)); |
87 | dsb(); | 87 | dsb(ish); |
88 | } | 88 | } |
89 | 89 | ||
90 | static inline void flush_tlb_page(struct vm_area_struct *vma, | 90 | static inline void flush_tlb_page(struct vm_area_struct *vma, |
@@ -93,16 +93,36 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, | |||
93 | unsigned long addr = uaddr >> 12 | | 93 | unsigned long addr = uaddr >> 12 | |
94 | ((unsigned long)ASID(vma->vm_mm) << 48); | 94 | ((unsigned long)ASID(vma->vm_mm) << 48); |
95 | 95 | ||
96 | dsb(); | 96 | dsb(ishst); |
97 | asm("tlbi vae1is, %0" : : "r" (addr)); | 97 | asm("tlbi vae1is, %0" : : "r" (addr)); |
98 | dsb(); | 98 | dsb(ish); |
99 | } | 99 | } |
100 | 100 | ||
101 | /* | 101 | static inline void flush_tlb_range(struct vm_area_struct *vma, |
102 | * Convert calls to our calling convention. | 102 | unsigned long start, unsigned long end) |
103 | */ | 103 | { |
104 | #define flush_tlb_range(vma,start,end) __cpu_flush_user_tlb_range(start,end,vma) | 104 | unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48; |
105 | #define flush_tlb_kernel_range(s,e) __cpu_flush_kern_tlb_range(s,e) | 105 | unsigned long addr; |
106 | start = asid | (start >> 12); | ||
107 | end = asid | (end >> 12); | ||
108 | |||
109 | dsb(ishst); | ||
110 | for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) | ||
111 | asm("tlbi vae1is, %0" : : "r"(addr)); | ||
112 | dsb(ish); | ||
113 | } | ||
114 | |||
115 | static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) | ||
116 | { | ||
117 | unsigned long addr; | ||
118 | start >>= 12; | ||
119 | end >>= 12; | ||
120 | |||
121 | dsb(ishst); | ||
122 | for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) | ||
123 | asm("tlbi vaae1is, %0" : : "r"(addr)); | ||
124 | dsb(ish); | ||
125 | } | ||
106 | 126 | ||
107 | /* | 127 | /* |
108 | * On AArch64, the cache coherency is handled via the set_pte_at() function. | 128 | * On AArch64, the cache coherency is handled via the set_pte_at() function. |
@@ -114,7 +134,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, | |||
114 | * set_pte() does not have a DSB, so make sure that the page table | 134 | * set_pte() does not have a DSB, so make sure that the page table |
115 | * write is visible. | 135 | * write is visible. |
116 | */ | 136 | */ |
117 | dsb(); | 137 | dsb(ishst); |
118 | } | 138 | } |
119 | 139 | ||
120 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) | 140 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) |
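For reference, the operand passed to "tlbi vae1is" in the new flush_tlb_range() above packs the ASID into bits [63:48] and the virtual address, shifted right by 12, into the low bits. A small C sketch of that encoding (illustration only, not kernel code):

#include <stdint.h>

/* Build the 64-bit operand for a VA+ASID TLB invalidate, as in flush_tlb_range(). */
static uint64_t tlbi_va_operand(uint64_t asid, uint64_t vaddr)
{
        return (asid << 48) | (vaddr >> 12);
}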
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0172e6d76bf3..7ebcd31ce51c 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h | |||
@@ -20,9 +20,6 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; | |||
20 | #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) | 20 | #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) |
21 | #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) | 21 | #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) |
22 | 22 | ||
23 | #define mc_capable() (cpu_topology[0].cluster_id != -1) | ||
24 | #define smt_capable() (cpu_topology[0].thread_id != -1) | ||
25 | |||
26 | void init_cpu_topology(void); | 23 | void init_cpu_topology(void); |
27 | void store_cpu_topology(unsigned int cpuid); | 24 | void store_cpu_topology(unsigned int cpuid); |
28 | const struct cpumask *cpu_coregroup_mask(int cpu); | 25 | const struct cpumask *cpu_coregroup_mask(int cpu); |
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index a4654c656a1e..e5f47df00c24 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h | |||
@@ -29,3 +29,5 @@ | |||
29 | #endif | 29 | #endif |
30 | #define __ARCH_WANT_SYS_CLONE | 30 | #define __ARCH_WANT_SYS_CLONE |
31 | #include <uapi/asm/unistd.h> | 31 | #include <uapi/asm/unistd.h> |
32 | |||
33 | #define NR_syscalls (__NR_syscalls) | ||
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 690ad51cc901..b72cf405b3fe 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h | |||
@@ -53,5 +53,12 @@ struct fpsimd_context { | |||
53 | __uint128_t vregs[32]; | 53 | __uint128_t vregs[32]; |
54 | }; | 54 | }; |
55 | 55 | ||
56 | /* ESR_EL1 context */ | ||
57 | #define ESR_MAGIC 0x45535201 | ||
58 | |||
59 | struct esr_context { | ||
60 | struct _aarch64_ctx head; | ||
61 | __u64 esr; | ||
62 | }; | ||
56 | 63 | ||
57 | #endif /* _UAPI__ASM_SIGCONTEXT_H */ | 64 | #endif /* _UAPI__ASM_SIGCONTEXT_H */ |
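User code that wants the new esr_context has to walk the variable-length records in the signal frame's reserved area; each record starts with a _aarch64_ctx header giving its magic and size. A hedged userspace sketch of that walk (the header layout and the zero-magic terminator are assumptions based on the existing _aarch64_ctx convention, not something added by this patch):

#include <stdint.h>
#include <stddef.h>

struct aarch64_ctx_hdr {        /* mirrors struct _aarch64_ctx */
        uint32_t magic;
        uint32_t size;
};

/* Scan the reserved area for a record with the given magic, e.g. ESR_MAGIC. */
static const void *find_record(const uint8_t *reserved, size_t len, uint32_t magic)
{
        size_t off = 0;

        while (off + sizeof(struct aarch64_ctx_hdr) <= len) {
                const struct aarch64_ctx_hdr *h =
                        (const struct aarch64_ctx_hdr *)(reserved + off);

                if (h->magic == 0 || h->size == 0)
                        break;                  /* end of records */
                if (h->magic == magic)
                        return h;
                off += h->size;
        }
        return NULL;
}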
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index ba5e17a522d5..cdaedad3afe5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile | |||
@@ -7,14 +7,19 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) | |||
7 | CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ | 7 | CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ |
8 | -I$(src)/../../../scripts/dtc/libfdt | 8 | -I$(src)/../../../scripts/dtc/libfdt |
9 | 9 | ||
10 | CFLAGS_REMOVE_ftrace.o = -pg | ||
11 | CFLAGS_REMOVE_insn.o = -pg | ||
12 | CFLAGS_REMOVE_return_address.o = -pg | ||
13 | |||
10 | # Object file lists. | 14 | # Object file lists. |
11 | arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ | 15 | arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ |
12 | entry-fpsimd.o process.o ptrace.o setup.o signal.o \ | 16 | entry-fpsimd.o process.o ptrace.o setup.o signal.o \ |
13 | sys.o stacktrace.o time.o traps.o io.o vdso.o \ | 17 | sys.o stacktrace.o time.o traps.o io.o vdso.o \ |
14 | hyp-stub.o psci.o cpu_ops.o insn.o | 18 | hyp-stub.o psci.o cpu_ops.o insn.o return_address.o |
15 | 19 | ||
16 | arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ | 20 | arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ |
17 | sys_compat.o | 21 | sys_compat.o |
22 | arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o | ||
18 | arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o | 23 | arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o |
19 | arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o | 24 | arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o |
20 | arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o | 25 | arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o |
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 338b568cd8ae..a85843ddbde8 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c | |||
@@ -44,10 +44,15 @@ EXPORT_SYMBOL(memstart_addr); | |||
44 | /* string / mem functions */ | 44 | /* string / mem functions */ |
45 | EXPORT_SYMBOL(strchr); | 45 | EXPORT_SYMBOL(strchr); |
46 | EXPORT_SYMBOL(strrchr); | 46 | EXPORT_SYMBOL(strrchr); |
47 | EXPORT_SYMBOL(strcmp); | ||
48 | EXPORT_SYMBOL(strncmp); | ||
49 | EXPORT_SYMBOL(strlen); | ||
50 | EXPORT_SYMBOL(strnlen); | ||
47 | EXPORT_SYMBOL(memset); | 51 | EXPORT_SYMBOL(memset); |
48 | EXPORT_SYMBOL(memcpy); | 52 | EXPORT_SYMBOL(memcpy); |
49 | EXPORT_SYMBOL(memmove); | 53 | EXPORT_SYMBOL(memmove); |
50 | EXPORT_SYMBOL(memchr); | 54 | EXPORT_SYMBOL(memchr); |
55 | EXPORT_SYMBOL(memcmp); | ||
51 | 56 | ||
52 | /* atomic bitops */ | 57 | /* atomic bitops */ |
53 | EXPORT_SYMBOL(set_bit); | 58 | EXPORT_SYMBOL(set_bit); |
@@ -56,3 +61,7 @@ EXPORT_SYMBOL(clear_bit); | |||
56 | EXPORT_SYMBOL(test_and_clear_bit); | 61 | EXPORT_SYMBOL(test_and_clear_bit); |
57 | EXPORT_SYMBOL(change_bit); | 62 | EXPORT_SYMBOL(change_bit); |
58 | EXPORT_SYMBOL(test_and_change_bit); | 63 | EXPORT_SYMBOL(test_and_change_bit); |
64 | |||
65 | #ifdef CONFIG_FUNCTION_TRACER | ||
66 | EXPORT_SYMBOL(_mcount); | ||
67 | #endif | ||
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..d358ccacfc00 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S | |||
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state) | |||
41 | fpsimd_restore x0, 8 | 41 | fpsimd_restore x0, 8 |
42 | ret | 42 | ret |
43 | ENDPROC(fpsimd_load_state) | 43 | ENDPROC(fpsimd_load_state) |
44 | |||
45 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
46 | |||
47 | /* | ||
48 | * Save the bottom n FP registers. | ||
49 | * | ||
50 | * x0 - pointer to struct fpsimd_partial_state | ||
51 | */ | ||
52 | ENTRY(fpsimd_save_partial_state) | ||
53 | fpsimd_save_partial x0, 1, 8, 9 | ||
54 | ret | ||
55 | ENDPROC(fpsimd_save_partial_state) | ||
56 | |||
57 | /* | ||
58 | * Load the bottom n FP registers. | ||
59 | * | ||
60 | * x0 - pointer to struct fpsimd_partial_state | ||
61 | */ | ||
62 | ENTRY(fpsimd_load_partial_state) | ||
63 | fpsimd_restore_partial x0, 8, 9 | ||
64 | ret | ||
65 | ENDPROC(fpsimd_load_partial_state) | ||
66 | |||
67 | #endif | ||
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S new file mode 100644 index 000000000000..b051871f2965 --- /dev/null +++ b/arch/arm64/kernel/entry-ftrace.S | |||
@@ -0,0 +1,218 @@ | |||
1 | /* | ||
2 | * arch/arm64/kernel/entry-ftrace.S | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | #include <asm/ftrace.h> | ||
14 | #include <asm/insn.h> | ||
15 | |||
16 | /* | ||
17 | * Gcc with -pg will put the following code at the beginning of each function: | ||
18 | * mov x0, x30 | ||
19 | * bl _mcount | ||
20 | * [function's body ...] | ||
21 | * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic | ||
22 | * ftrace is enabled. | ||
23 | * | ||
24 | * Please note that the x0 argument is not used here, because the lr (x30) of | ||
25 | * the instrumented function can be recovered at any time by unwinding the call | ||
26 | * stack, as long as the kernel is compiled without -fomit-frame-pointer | ||
27 | * (i.e. with CONFIG_FRAME_POINTER, which is forced on for arm64). | ||
28 | * | ||
29 | * stack layout after mcount_enter in _mcount(): | ||
30 | * | ||
31 | * current sp/fp => 0:+-----+ | ||
32 | * in _mcount() | x29 | -> instrumented function's fp | ||
33 | * +-----+ | ||
34 | * | x30 | -> _mcount()'s lr (= instrumented function's pc) | ||
35 | * old sp => +16:+-----+ | ||
36 | * when instrumented | | | ||
37 | * function calls | ... | | ||
38 | * _mcount() | | | ||
39 | * | | | ||
40 | * instrumented => +xx:+-----+ | ||
41 | * function's fp | x29 | -> parent's fp | ||
42 | * +-----+ | ||
43 | * | x30 | -> instrumented function's lr (= parent's pc) | ||
44 | * +-----+ | ||
45 | * | ... | | ||
46 | */ | ||
47 | |||
48 | .macro mcount_enter | ||
49 | stp x29, x30, [sp, #-16]! | ||
50 | mov x29, sp | ||
51 | .endm | ||
52 | |||
53 | .macro mcount_exit | ||
54 | ldp x29, x30, [sp], #16 | ||
55 | ret | ||
56 | .endm | ||
57 | |||
58 | .macro mcount_adjust_addr rd, rn | ||
59 | sub \rd, \rn, #AARCH64_INSN_SIZE | ||
60 | .endm | ||
61 | |||
62 | /* for instrumented function's parent */ | ||
63 | .macro mcount_get_parent_fp reg | ||
64 | ldr \reg, [x29] | ||
65 | ldr \reg, [\reg] | ||
66 | .endm | ||
67 | |||
68 | /* for instrumented function */ | ||
69 | .macro mcount_get_pc0 reg | ||
70 | mcount_adjust_addr \reg, x30 | ||
71 | .endm | ||
72 | |||
73 | .macro mcount_get_pc reg | ||
74 | ldr \reg, [x29, #8] | ||
75 | mcount_adjust_addr \reg, \reg | ||
76 | .endm | ||
77 | |||
78 | .macro mcount_get_lr reg | ||
79 | ldr \reg, [x29] | ||
80 | ldr \reg, [\reg, #8] | ||
81 | mcount_adjust_addr \reg, \reg | ||
82 | .endm | ||
83 | |||
84 | .macro mcount_get_lr_addr reg | ||
85 | ldr \reg, [x29] | ||
86 | add \reg, \reg, #8 | ||
87 | .endm | ||
88 | |||
89 | #ifndef CONFIG_DYNAMIC_FTRACE | ||
90 | /* | ||
91 | * void _mcount(unsigned long return_address) | ||
92 | * @return_address: return address to instrumented function | ||
93 | * | ||
94 | * This function makes calls, if enabled, to: | ||
95 | * - tracer function to probe instrumented function's entry, | ||
96 | * - ftrace_graph_caller to set up an exit hook | ||
97 | */ | ||
98 | ENTRY(_mcount) | ||
99 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
100 | ldr x0, =ftrace_trace_stop | ||
101 | ldr x0, [x0] // if (ftrace_trace_stop) | ||
102 | cbnz x0, ftrace_stub // return; | ||
103 | #endif | ||
104 | mcount_enter | ||
105 | |||
106 | ldr x0, =ftrace_trace_function | ||
107 | ldr x2, [x0] | ||
108 | adr x0, ftrace_stub | ||
109 | cmp x0, x2 // if (ftrace_trace_function | ||
110 | b.eq skip_ftrace_call // != ftrace_stub) { | ||
111 | |||
112 | mcount_get_pc x0 // function's pc | ||
113 | mcount_get_lr x1 // function's lr (= parent's pc) | ||
114 | blr x2 // (*ftrace_trace_function)(pc, lr); | ||
115 | |||
116 | #ifndef CONFIG_FUNCTION_GRAPH_TRACER | ||
117 | skip_ftrace_call: // return; | ||
118 | mcount_exit // } | ||
119 | #else | ||
120 | mcount_exit // return; | ||
121 | // } | ||
122 | skip_ftrace_call: | ||
123 | ldr x1, =ftrace_graph_return | ||
124 | ldr x2, [x1] // if ((ftrace_graph_return | ||
125 | cmp x0, x2 // != ftrace_stub) | ||
126 | b.ne ftrace_graph_caller | ||
127 | |||
128 | ldr x1, =ftrace_graph_entry // || (ftrace_graph_entry | ||
129 | ldr x2, [x1] // != ftrace_graph_entry_stub)) | ||
130 | ldr x0, =ftrace_graph_entry_stub | ||
131 | cmp x0, x2 | ||
132 | b.ne ftrace_graph_caller // ftrace_graph_caller(); | ||
133 | |||
134 | mcount_exit | ||
135 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
136 | ENDPROC(_mcount) | ||
137 | |||
138 | #else /* CONFIG_DYNAMIC_FTRACE */ | ||
139 | /* | ||
140 | * _mcount() is used to build the kernel with the -pg option, but all the | ||
141 | * branch instructions to _mcount() are initially replaced with NOPs at kernel | ||
142 | * start up, and each NOP is later patched to branch to ftrace_caller() when | ||
143 | * enabled, or back to a NOP when disabled, on a per-function basis. | ||
144 | */ | ||
145 | ENTRY(_mcount) | ||
146 | ret | ||
147 | ENDPROC(_mcount) | ||
148 | |||
149 | /* | ||
150 | * void ftrace_caller(unsigned long return_address) | ||
151 | * @return_address: return address to instrumented function | ||
152 | * | ||
153 | * This function is a counterpart of _mcount() in 'static' ftrace, and | ||
154 | * makes calls to: | ||
155 | * - tracer function to probe instrumented function's entry, | ||
156 | * - ftrace_graph_caller to set up an exit hook | ||
157 | */ | ||
158 | ENTRY(ftrace_caller) | ||
159 | mcount_enter | ||
160 | |||
161 | mcount_get_pc0 x0 // function's pc | ||
162 | mcount_get_lr x1 // function's lr | ||
163 | |||
164 | .global ftrace_call | ||
165 | ftrace_call: // tracer(pc, lr); | ||
166 | nop // This will be replaced with "bl xxx" | ||
167 | // where xxx can be any kind of tracer. | ||
168 | |||
169 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
170 | .global ftrace_graph_call | ||
171 | ftrace_graph_call: // ftrace_graph_caller(); | ||
172 | nop // If enabled, this will be replaced | ||
173 | // "b ftrace_graph_caller" | ||
174 | #endif | ||
175 | |||
176 | mcount_exit | ||
177 | ENDPROC(ftrace_caller) | ||
178 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
179 | |||
180 | ENTRY(ftrace_stub) | ||
181 | ret | ||
182 | ENDPROC(ftrace_stub) | ||
183 | |||
184 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
185 | /* | ||
186 | * void ftrace_graph_caller(void) | ||
187 | * | ||
188 | * Called from _mcount() or ftrace_caller() when function_graph tracer is | ||
189 | * selected. | ||
190 | * This function, together with prepare_ftrace_return(), fakes the link | ||
191 | * register's value on the call stack in order to intercept the instrumented | ||
192 | * function's return path and run return_to_handler() later on its exit. | ||
193 | */ | ||
194 | ENTRY(ftrace_graph_caller) | ||
195 | mcount_get_lr_addr x0 // pointer to function's saved lr | ||
196 | mcount_get_pc x1 // function's pc | ||
197 | mcount_get_parent_fp x2 // parent's fp | ||
198 | bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp) | ||
199 | |||
200 | mcount_exit | ||
201 | ENDPROC(ftrace_graph_caller) | ||
202 | |||
203 | /* | ||
204 | * void return_to_handler(void) | ||
205 | * | ||
206 | * Run ftrace_return_to_handler() before going back to parent. | ||
207 | * @fp is checked against the value passed by ftrace_graph_caller() | ||
208 | * only when CONFIG_FUNCTION_GRAPH_FP_TEST is enabled. | ||
209 | */ | ||
210 | ENTRY(return_to_handler) | ||
211 | str x0, [sp, #-16]! | ||
212 | mov x0, x29 // parent's fp | ||
213 | bl ftrace_return_to_handler // addr = ftrace_return_to_handler(fp); | ||
214 | mov x30, x0 // restore the original return address | ||
215 | ldr x0, [sp], #16 | ||
216 | ret | ||
217 | END(return_to_handler) | ||
218 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
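As a cross-check of the mcount_get_pc / mcount_get_lr macros above, here is the same frame arithmetic written out in C (illustration only; the real code stays in assembly). It assumes the frame-record layout shown in the stack diagram: x29 points at a {saved fp, saved lr} pair, and INSN_SIZE stands in for AARCH64_INSN_SIZE.

#include <stdint.h>

#define INSN_SIZE 4     /* stands in for AARCH64_INSN_SIZE */

struct frame_record {
        uint64_t fp;    /* saved x29: previous frame record */
        uint64_t lr;    /* saved x30: return address */
};

/* _mcount()'s saved lr is the instrumented function's pc (one insn past the call). */
static uint64_t instrumented_pc(const struct frame_record *mcount_frame)
{
        return mcount_frame->lr - INSN_SIZE;
}

/* Follow saved x29 to the instrumented function's frame to get its caller's pc. */
static uint64_t instrumented_lr(const struct frame_record *mcount_frame)
{
        const struct frame_record *callee =
                (const struct frame_record *)mcount_frame->fp;

        return callee->lr - INSN_SIZE;
}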
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 39ac630d83de..bf017f4ffb4f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S | |||
@@ -60,6 +60,9 @@ | |||
60 | push x0, x1 | 60 | push x0, x1 |
61 | .if \el == 0 | 61 | .if \el == 0 |
62 | mrs x21, sp_el0 | 62 | mrs x21, sp_el0 |
63 | get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, | ||
64 | ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug | ||
65 | disable_step_tsk x19, x20 // exceptions when scheduling. | ||
63 | .else | 66 | .else |
64 | add x21, sp, #S_FRAME_SIZE | 67 | add x21, sp, #S_FRAME_SIZE |
65 | .endif | 68 | .endif |
@@ -259,7 +262,7 @@ el1_da: | |||
259 | * Data abort handling | 262 | * Data abort handling |
260 | */ | 263 | */ |
261 | mrs x0, far_el1 | 264 | mrs x0, far_el1 |
262 | enable_dbg_if_not_stepping x2 | 265 | enable_dbg |
263 | // re-enable interrupts if they were enabled in the aborted context | 266 | // re-enable interrupts if they were enabled in the aborted context |
264 | tbnz x23, #7, 1f // PSR_I_BIT | 267 | tbnz x23, #7, 1f // PSR_I_BIT |
265 | enable_irq | 268 | enable_irq |
@@ -275,6 +278,7 @@ el1_sp_pc: | |||
275 | * Stack or PC alignment exception handling | 278 | * Stack or PC alignment exception handling |
276 | */ | 279 | */ |
277 | mrs x0, far_el1 | 280 | mrs x0, far_el1 |
281 | enable_dbg | ||
278 | mov x1, x25 | 282 | mov x1, x25 |
279 | mov x2, sp | 283 | mov x2, sp |
280 | b do_sp_pc_abort | 284 | b do_sp_pc_abort |
@@ -282,6 +286,7 @@ el1_undef: | |||
282 | /* | 286 | /* |
283 | * Undefined instruction | 287 | * Undefined instruction |
284 | */ | 288 | */ |
289 | enable_dbg | ||
285 | mov x0, sp | 290 | mov x0, sp |
286 | b do_undefinstr | 291 | b do_undefinstr |
287 | el1_dbg: | 292 | el1_dbg: |
@@ -294,10 +299,11 @@ el1_dbg: | |||
294 | mrs x0, far_el1 | 299 | mrs x0, far_el1 |
295 | mov x2, sp // struct pt_regs | 300 | mov x2, sp // struct pt_regs |
296 | bl do_debug_exception | 301 | bl do_debug_exception |
297 | 302 | enable_dbg | |
298 | kernel_exit 1 | 303 | kernel_exit 1 |
299 | el1_inv: | 304 | el1_inv: |
300 | // TODO: add support for undefined instructions in kernel mode | 305 | // TODO: add support for undefined instructions in kernel mode |
306 | enable_dbg | ||
301 | mov x0, sp | 307 | mov x0, sp |
302 | mov x1, #BAD_SYNC | 308 | mov x1, #BAD_SYNC |
303 | mrs x2, esr_el1 | 309 | mrs x2, esr_el1 |
@@ -307,7 +313,7 @@ ENDPROC(el1_sync) | |||
307 | .align 6 | 313 | .align 6 |
308 | el1_irq: | 314 | el1_irq: |
309 | kernel_entry 1 | 315 | kernel_entry 1 |
310 | enable_dbg_if_not_stepping x0 | 316 | enable_dbg |
311 | #ifdef CONFIG_TRACE_IRQFLAGS | 317 | #ifdef CONFIG_TRACE_IRQFLAGS |
312 | bl trace_hardirqs_off | 318 | bl trace_hardirqs_off |
313 | #endif | 319 | #endif |
@@ -332,8 +338,7 @@ ENDPROC(el1_irq) | |||
332 | #ifdef CONFIG_PREEMPT | 338 | #ifdef CONFIG_PREEMPT |
333 | el1_preempt: | 339 | el1_preempt: |
334 | mov x24, lr | 340 | mov x24, lr |
335 | 1: enable_dbg | 341 | 1: bl preempt_schedule_irq // irq en/disable is done inside |
336 | bl preempt_schedule_irq // irq en/disable is done inside | ||
337 | ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS | 342 | ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS |
338 | tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? | 343 | tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? |
339 | ret x24 | 344 | ret x24 |
@@ -349,7 +354,7 @@ el0_sync: | |||
349 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class | 354 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class |
350 | cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state | 355 | cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state |
351 | b.eq el0_svc | 356 | b.eq el0_svc |
352 | adr lr, ret_from_exception | 357 | adr lr, ret_to_user |
353 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 | 358 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 |
354 | b.eq el0_da | 359 | b.eq el0_da |
355 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 | 360 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 |
@@ -378,7 +383,7 @@ el0_sync_compat: | |||
378 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class | 383 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class |
379 | cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state | 384 | cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state |
380 | b.eq el0_svc_compat | 385 | b.eq el0_svc_compat |
381 | adr lr, ret_from_exception | 386 | adr lr, ret_to_user |
382 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 | 387 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 |
383 | b.eq el0_da | 388 | b.eq el0_da |
384 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 | 389 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 |
@@ -423,11 +428,8 @@ el0_da: | |||
423 | */ | 428 | */ |
424 | mrs x0, far_el1 | 429 | mrs x0, far_el1 |
425 | bic x0, x0, #(0xff << 56) | 430 | bic x0, x0, #(0xff << 56) |
426 | disable_step x1 | ||
427 | isb | ||
428 | enable_dbg | ||
429 | // enable interrupts before calling the main handler | 431 | // enable interrupts before calling the main handler |
430 | enable_irq | 432 | enable_dbg_and_irq |
431 | mov x1, x25 | 433 | mov x1, x25 |
432 | mov x2, sp | 434 | mov x2, sp |
433 | b do_mem_abort | 435 | b do_mem_abort |
@@ -436,11 +438,8 @@ el0_ia: | |||
436 | * Instruction abort handling | 438 | * Instruction abort handling |
437 | */ | 439 | */ |
438 | mrs x0, far_el1 | 440 | mrs x0, far_el1 |
439 | disable_step x1 | ||
440 | isb | ||
441 | enable_dbg | ||
442 | // enable interrupts before calling the main handler | 441 | // enable interrupts before calling the main handler |
443 | enable_irq | 442 | enable_dbg_and_irq |
444 | orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts | 443 | orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts |
445 | mov x2, sp | 444 | mov x2, sp |
446 | b do_mem_abort | 445 | b do_mem_abort |
@@ -448,6 +447,7 @@ el0_fpsimd_acc: | |||
448 | /* | 447 | /* |
449 | * Floating Point or Advanced SIMD access | 448 | * Floating Point or Advanced SIMD access |
450 | */ | 449 | */ |
450 | enable_dbg | ||
451 | mov x0, x25 | 451 | mov x0, x25 |
452 | mov x1, sp | 452 | mov x1, sp |
453 | b do_fpsimd_acc | 453 | b do_fpsimd_acc |
@@ -455,6 +455,7 @@ el0_fpsimd_exc: | |||
455 | /* | 455 | /* |
456 | * Floating Point or Advanced SIMD exception | 456 | * Floating Point or Advanced SIMD exception |
457 | */ | 457 | */ |
458 | enable_dbg | ||
458 | mov x0, x25 | 459 | mov x0, x25 |
459 | mov x1, sp | 460 | mov x1, sp |
460 | b do_fpsimd_exc | 461 | b do_fpsimd_exc |
@@ -463,11 +464,8 @@ el0_sp_pc: | |||
463 | * Stack or PC alignment exception handling | 464 | * Stack or PC alignment exception handling |
464 | */ | 465 | */ |
465 | mrs x0, far_el1 | 466 | mrs x0, far_el1 |
466 | disable_step x1 | ||
467 | isb | ||
468 | enable_dbg | ||
469 | // enable interrupts before calling the main handler | 467 | // enable interrupts before calling the main handler |
470 | enable_irq | 468 | enable_dbg_and_irq |
471 | mov x1, x25 | 469 | mov x1, x25 |
472 | mov x2, sp | 470 | mov x2, sp |
473 | b do_sp_pc_abort | 471 | b do_sp_pc_abort |
@@ -475,9 +473,9 @@ el0_undef: | |||
475 | /* | 473 | /* |
476 | * Undefined instruction | 474 | * Undefined instruction |
477 | */ | 475 | */ |
478 | mov x0, sp | ||
479 | // enable interrupts before calling the main handler | 476 | // enable interrupts before calling the main handler |
480 | enable_irq | 477 | enable_dbg_and_irq |
478 | mov x0, sp | ||
481 | b do_undefinstr | 479 | b do_undefinstr |
482 | el0_dbg: | 480 | el0_dbg: |
483 | /* | 481 | /* |
@@ -485,11 +483,13 @@ el0_dbg: | |||
485 | */ | 483 | */ |
486 | tbnz x24, #0, el0_inv // EL0 only | 484 | tbnz x24, #0, el0_inv // EL0 only |
487 | mrs x0, far_el1 | 485 | mrs x0, far_el1 |
488 | disable_step x1 | ||
489 | mov x1, x25 | 486 | mov x1, x25 |
490 | mov x2, sp | 487 | mov x2, sp |
491 | b do_debug_exception | 488 | bl do_debug_exception |
489 | enable_dbg | ||
490 | b ret_to_user | ||
492 | el0_inv: | 491 | el0_inv: |
492 | enable_dbg | ||
493 | mov x0, sp | 493 | mov x0, sp |
494 | mov x1, #BAD_SYNC | 494 | mov x1, #BAD_SYNC |
495 | mrs x2, esr_el1 | 495 | mrs x2, esr_el1 |
@@ -500,15 +500,12 @@ ENDPROC(el0_sync) | |||
500 | el0_irq: | 500 | el0_irq: |
501 | kernel_entry 0 | 501 | kernel_entry 0 |
502 | el0_irq_naked: | 502 | el0_irq_naked: |
503 | disable_step x1 | ||
504 | isb | ||
505 | enable_dbg | 503 | enable_dbg |
506 | #ifdef CONFIG_TRACE_IRQFLAGS | 504 | #ifdef CONFIG_TRACE_IRQFLAGS |
507 | bl trace_hardirqs_off | 505 | bl trace_hardirqs_off |
508 | #endif | 506 | #endif |
509 | 507 | ||
510 | irq_handler | 508 | irq_handler |
511 | get_thread_info tsk | ||
512 | 509 | ||
513 | #ifdef CONFIG_TRACE_IRQFLAGS | 510 | #ifdef CONFIG_TRACE_IRQFLAGS |
514 | bl trace_hardirqs_on | 511 | bl trace_hardirqs_on |
@@ -517,14 +514,6 @@ el0_irq_naked: | |||
517 | ENDPROC(el0_irq) | 514 | ENDPROC(el0_irq) |
518 | 515 | ||
519 | /* | 516 | /* |
520 | * This is the return code to user mode for abort handlers | ||
521 | */ | ||
522 | ret_from_exception: | ||
523 | get_thread_info tsk | ||
524 | b ret_to_user | ||
525 | ENDPROC(ret_from_exception) | ||
526 | |||
527 | /* | ||
528 | * Register switch for AArch64. The callee-saved registers need to be saved | 517 | * Register switch for AArch64. The callee-saved registers need to be saved |
529 | * and restored. On entry: | 518 | * and restored. On entry: |
530 | * x0 = previous task_struct (must be preserved across the switch) | 519 | * x0 = previous task_struct (must be preserved across the switch) |
@@ -563,10 +552,7 @@ ret_fast_syscall: | |||
563 | ldr x1, [tsk, #TI_FLAGS] | 552 | ldr x1, [tsk, #TI_FLAGS] |
564 | and x2, x1, #_TIF_WORK_MASK | 553 | and x2, x1, #_TIF_WORK_MASK |
565 | cbnz x2, fast_work_pending | 554 | cbnz x2, fast_work_pending |
566 | tbz x1, #TIF_SINGLESTEP, fast_exit | 555 | enable_step_tsk x1, x2 |
567 | disable_dbg | ||
568 | enable_step x2 | ||
569 | fast_exit: | ||
570 | kernel_exit 0, ret = 1 | 556 | kernel_exit 0, ret = 1 |
571 | 557 | ||
572 | /* | 558 | /* |
@@ -576,7 +562,7 @@ fast_work_pending: | |||
576 | str x0, [sp, #S_X0] // returned x0 | 562 | str x0, [sp, #S_X0] // returned x0 |
577 | work_pending: | 563 | work_pending: |
578 | tbnz x1, #TIF_NEED_RESCHED, work_resched | 564 | tbnz x1, #TIF_NEED_RESCHED, work_resched |
579 | /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ | 565 | /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ |
580 | ldr x2, [sp, #S_PSTATE] | 566 | ldr x2, [sp, #S_PSTATE] |
581 | mov x0, sp // 'regs' | 567 | mov x0, sp // 'regs' |
582 | tst x2, #PSR_MODE_MASK // user mode regs? | 568 | tst x2, #PSR_MODE_MASK // user mode regs? |
@@ -585,7 +571,6 @@ work_pending: | |||
585 | bl do_notify_resume | 571 | bl do_notify_resume |
586 | b ret_to_user | 572 | b ret_to_user |
587 | work_resched: | 573 | work_resched: |
588 | enable_dbg | ||
589 | bl schedule | 574 | bl schedule |
590 | 575 | ||
591 | /* | 576 | /* |
@@ -596,9 +581,7 @@ ret_to_user: | |||
596 | ldr x1, [tsk, #TI_FLAGS] | 581 | ldr x1, [tsk, #TI_FLAGS] |
597 | and x2, x1, #_TIF_WORK_MASK | 582 | and x2, x1, #_TIF_WORK_MASK |
598 | cbnz x2, work_pending | 583 | cbnz x2, work_pending |
599 | tbz x1, #TIF_SINGLESTEP, no_work_pending | 584 | enable_step_tsk x1, x2 |
600 | disable_dbg | ||
601 | enable_step x2 | ||
602 | no_work_pending: | 585 | no_work_pending: |
603 | kernel_exit 0, ret = 0 | 586 | kernel_exit 0, ret = 0 |
604 | ENDPROC(ret_to_user) | 587 | ENDPROC(ret_to_user) |
@@ -625,14 +608,11 @@ el0_svc: | |||
625 | mov sc_nr, #__NR_syscalls | 608 | mov sc_nr, #__NR_syscalls |
626 | el0_svc_naked: // compat entry point | 609 | el0_svc_naked: // compat entry point |
627 | stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number | 610 | stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number |
628 | disable_step x16 | 611 | enable_dbg_and_irq |
629 | isb | ||
630 | enable_dbg | ||
631 | enable_irq | ||
632 | 612 | ||
633 | get_thread_info tsk | 613 | ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks |
634 | ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing | 614 | tst x16, #_TIF_SYSCALL_WORK |
635 | tbnz x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? | 615 | b.ne __sys_trace |
636 | adr lr, ret_fast_syscall // return address | 616 | adr lr, ret_fast_syscall // return address |
637 | cmp scno, sc_nr // check upper syscall limit | 617 | cmp scno, sc_nr // check upper syscall limit |
638 | b.hs ni_sys | 618 | b.hs ni_sys |
@@ -648,9 +628,8 @@ ENDPROC(el0_svc) | |||
648 | * switches, and waiting for our parent to respond. | 628 | * switches, and waiting for our parent to respond. |
649 | */ | 629 | */ |
650 | __sys_trace: | 630 | __sys_trace: |
651 | mov x1, sp | 631 | mov x0, sp |
652 | mov w0, #0 // trace entry | 632 | bl syscall_trace_enter |
653 | bl syscall_trace | ||
654 | adr lr, __sys_trace_return // return address | 633 | adr lr, __sys_trace_return // return address |
655 | uxtw scno, w0 // syscall number (possibly new) | 634 | uxtw scno, w0 // syscall number (possibly new) |
656 | mov x1, sp // pointer to regs | 635 | mov x1, sp // pointer to regs |
@@ -665,9 +644,8 @@ __sys_trace: | |||
665 | 644 | ||
666 | __sys_trace_return: | 645 | __sys_trace_return: |
667 | str x0, [sp] // save returned x0 | 646 | str x0, [sp] // save returned x0 |
668 | mov x1, sp | 647 | mov x0, sp |
669 | mov w0, #1 // trace exit | 648 | bl syscall_trace_exit |
670 | bl syscall_trace | ||
671 | b ret_to_user | 649 | b ret_to_user |
672 | 650 | ||
673 | /* | 651 | /* |
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4aef42a04bdc..ad8aebb1cdef 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c | |||
@@ -35,6 +35,60 @@ | |||
35 | #define FPEXC_IDF (1 << 7) | 35 | #define FPEXC_IDF (1 << 7) |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * In order to reduce the number of times the FPSIMD state is needlessly saved | ||
39 | * and restored, we need to keep track of two things: | ||
40 | * (a) for each task, we need to remember which CPU was the last one to have | ||
41 | * the task's FPSIMD state loaded into its FPSIMD registers; | ||
42 | * (b) for each CPU, we need to remember which task's userland FPSIMD state has | ||
43 | * been loaded into its FPSIMD registers most recently, or whether it has | ||
44 | * been used to perform kernel mode NEON in the meantime. | ||
45 | * | ||
46 | * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to | ||
47 | * the id of the current CPU every time the state is loaded onto a CPU. For (b), | ||
48 | * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the | ||
49 | * address of the userland FPSIMD state of the task most recently loaded onto | ||
50 | * the CPU, or NULL if kernel mode NEON has been performed since then. | ||
51 | * | ||
52 | * With this in place, we no longer have to restore the next FPSIMD state right | ||
53 | * when switching between tasks. Instead, we can defer this check to userland | ||
54 | * resume, at which time we verify whether the CPU's fpsimd_last_state and the | ||
55 | * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we | ||
56 | * can omit the FPSIMD restore. | ||
57 | * | ||
58 | * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to | ||
59 | * indicate whether or not the userland FPSIMD state of the current task is | ||
60 | * present in the registers. The flag is set unless the FPSIMD registers of this | ||
61 | * CPU currently contain the most recent userland FPSIMD state of the current | ||
62 | * task. | ||
63 | * | ||
64 | * For a certain task, the sequence may look something like this: | ||
65 | * - the task gets scheduled in; if both the task's fpsimd_state.cpu field | ||
66 | * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu | ||
67 | * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is | ||
68 | * cleared, otherwise it is set; | ||
69 | * | ||
70 | * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's | ||
71 | * userland FPSIMD state is copied from memory to the registers, the task's | ||
72 | * fpsimd_state.cpu field is set to the id of the current CPU, the current | ||
73 | * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the | ||
74 | * TIF_FOREIGN_FPSTATE flag is cleared; | ||
75 | * | ||
76 | * - the task executes an ordinary syscall; upon return to userland, the | ||
77 | * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is | ||
78 | * restored; | ||
79 | * | ||
80 | * - the task executes a syscall which executes some NEON instructions; this is | ||
81 | * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD | ||
82 | * register contents to memory, clears the fpsimd_last_state per-cpu variable | ||
83 | * and sets the TIF_FOREIGN_FPSTATE flag; | ||
84 | * | ||
85 | * - the task gets preempted after kernel_neon_end() is called; as we have not | ||
86 | * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so | ||
87 | * whatever is in the FPSIMD registers is not saved to memory, but discarded. | ||
88 | */ | ||
89 | static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); | ||
90 | |||
91 | /* | ||
38 | * Trapped FP/ASIMD access. | 92 | * Trapped FP/ASIMD access. |
39 | */ | 93 | */ |
40 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) | 94 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) |
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) | |||
72 | 126 | ||
73 | void fpsimd_thread_switch(struct task_struct *next) | 127 | void fpsimd_thread_switch(struct task_struct *next) |
74 | { | 128 | { |
75 | /* check if not kernel threads */ | 129 | /* |
76 | if (current->mm) | 130 | * Save the current FPSIMD state to memory, but only if whatever is in |
131 | * the registers is in fact the most recent userland FPSIMD state of | ||
132 | * 'current'. | ||
133 | */ | ||
134 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
77 | fpsimd_save_state(¤t->thread.fpsimd_state); | 135 | fpsimd_save_state(¤t->thread.fpsimd_state); |
78 | if (next->mm) | 136 | |
79 | fpsimd_load_state(&next->thread.fpsimd_state); | 137 | if (next->mm) { |
138 | /* | ||
139 | * If we are switching to a task whose most recent userland | ||
140 | * FPSIMD state is already in the registers of *this* cpu, | ||
141 | * we can skip loading the state from memory. Otherwise, set | ||
142 | * the TIF_FOREIGN_FPSTATE flag so the state will be loaded | ||
143 | * upon the next return to userland. | ||
144 | */ | ||
145 | struct fpsimd_state *st = &next->thread.fpsimd_state; | ||
146 | |||
147 | if (__this_cpu_read(fpsimd_last_state) == st | ||
148 | && st->cpu == smp_processor_id()) | ||
149 | clear_ti_thread_flag(task_thread_info(next), | ||
150 | TIF_FOREIGN_FPSTATE); | ||
151 | else | ||
152 | set_ti_thread_flag(task_thread_info(next), | ||
153 | TIF_FOREIGN_FPSTATE); | ||
154 | } | ||
80 | } | 155 | } |
81 | 156 | ||
82 | void fpsimd_flush_thread(void) | 157 | void fpsimd_flush_thread(void) |
83 | { | 158 | { |
84 | preempt_disable(); | ||
85 | memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); | 159 | memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); |
86 | fpsimd_load_state(¤t->thread.fpsimd_state); | 160 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
161 | } | ||
162 | |||
163 | /* | ||
164 | * Save the userland FPSIMD state of 'current' to memory, but only if the state | ||
165 | * currently held in the registers does in fact belong to 'current' | ||
166 | */ | ||
167 | void fpsimd_preserve_current_state(void) | ||
168 | { | ||
169 | preempt_disable(); | ||
170 | if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
171 | fpsimd_save_state(¤t->thread.fpsimd_state); | ||
172 | preempt_enable(); | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Load the userland FPSIMD state of 'current' from memory, but only if the | ||
177 | * FPSIMD state already held in the registers is /not/ the most recent FPSIMD | ||
178 | * state of 'current' | ||
179 | */ | ||
180 | void fpsimd_restore_current_state(void) | ||
181 | { | ||
182 | preempt_disable(); | ||
183 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
184 | struct fpsimd_state *st = ¤t->thread.fpsimd_state; | ||
185 | |||
186 | fpsimd_load_state(st); | ||
187 | this_cpu_write(fpsimd_last_state, st); | ||
188 | st->cpu = smp_processor_id(); | ||
189 | } | ||
190 | preempt_enable(); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Load an updated userland FPSIMD state for 'current' from memory and set the | ||
195 | * flag that indicates that the FPSIMD register contents are the most recent | ||
196 | * FPSIMD state of 'current' | ||
197 | */ | ||
198 | void fpsimd_update_current_state(struct fpsimd_state *state) | ||
199 | { | ||
200 | preempt_disable(); | ||
201 | fpsimd_load_state(state); | ||
202 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
203 | struct fpsimd_state *st = ¤t->thread.fpsimd_state; | ||
204 | |||
205 | this_cpu_write(fpsimd_last_state, st); | ||
206 | st->cpu = smp_processor_id(); | ||
207 | } | ||
87 | preempt_enable(); | 208 | preempt_enable(); |
88 | } | 209 | } |
89 | 210 | ||
211 | /* | ||
212 | * Invalidate live CPU copies of task t's FPSIMD state | ||
213 | */ | ||
214 | void fpsimd_flush_task_state(struct task_struct *t) | ||
215 | { | ||
216 | t->thread.fpsimd_state.cpu = NR_CPUS; | ||
217 | } | ||
218 | |||
90 | #ifdef CONFIG_KERNEL_MODE_NEON | 219 | #ifdef CONFIG_KERNEL_MODE_NEON |
91 | 220 | ||
221 | static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); | ||
222 | static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); | ||
223 | |||
92 | /* | 224 | /* |
93 | * Kernel-side NEON support functions | 225 | * Kernel-side NEON support functions |
94 | */ | 226 | */ |
95 | void kernel_neon_begin(void) | 227 | void kernel_neon_begin_partial(u32 num_regs) |
96 | { | 228 | { |
97 | /* Avoid using the NEON in interrupt context */ | 229 | if (in_interrupt()) { |
98 | BUG_ON(in_interrupt()); | 230 | struct fpsimd_partial_state *s = this_cpu_ptr( |
99 | preempt_disable(); | 231 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); |
100 | 232 | ||
101 | if (current->mm) | 233 | BUG_ON(num_regs > 32); |
102 | fpsimd_save_state(¤t->thread.fpsimd_state); | 234 | fpsimd_save_partial_state(s, roundup(num_regs, 2)); |
235 | } else { | ||
236 | /* | ||
237 | * Save the userland FPSIMD state if we have one and if we | ||
238 | * haven't done so already. Clear fpsimd_last_state to indicate | ||
239 | * that there is no longer userland FPSIMD state in the | ||
240 | * registers. | ||
241 | */ | ||
242 | preempt_disable(); | ||
243 | if (current->mm && | ||
244 | !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
245 | fpsimd_save_state(¤t->thread.fpsimd_state); | ||
246 | this_cpu_write(fpsimd_last_state, NULL); | ||
247 | } | ||
103 | } | 248 | } |
104 | EXPORT_SYMBOL(kernel_neon_begin); | 249 | EXPORT_SYMBOL(kernel_neon_begin_partial); |
105 | 250 | ||
106 | void kernel_neon_end(void) | 251 | void kernel_neon_end(void) |
107 | { | 252 | { |
108 | if (current->mm) | 253 | if (in_interrupt()) { |
109 | fpsimd_load_state(¤t->thread.fpsimd_state); | 254 | struct fpsimd_partial_state *s = this_cpu_ptr( |
110 | 255 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); | |
111 | preempt_enable(); | 256 | fpsimd_load_partial_state(s); |
257 | } else { | ||
258 | preempt_enable(); | ||
259 | } | ||
112 | } | 260 | } |
113 | EXPORT_SYMBOL(kernel_neon_end); | 261 | EXPORT_SYMBOL(kernel_neon_end); |
114 | 262 | ||
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, | |||
120 | { | 268 | { |
121 | switch (cmd) { | 269 | switch (cmd) { |
122 | case CPU_PM_ENTER: | 270 | case CPU_PM_ENTER: |
123 | if (current->mm) | 271 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) |
124 | fpsimd_save_state(¤t->thread.fpsimd_state); | 272 | fpsimd_save_state(¤t->thread.fpsimd_state); |
125 | break; | 273 | break; |
126 | case CPU_PM_EXIT: | 274 | case CPU_PM_EXIT: |
127 | if (current->mm) | 275 | if (current->mm) |
128 | fpsimd_load_state(¤t->thread.fpsimd_state); | 276 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
129 | break; | 277 | break; |
130 | case CPU_PM_ENTER_FAILED: | 278 | case CPU_PM_ENTER_FAILED: |
131 | default: | 279 | default: |
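The heart of the scheme described in the comment block at the top of fpsimd.c is a two-way binding check: the register contents are only trusted if the CPU still points at this task's state and the state still points back at this CPU. A stand-alone sketch of that predicate (stub type and helper name are assumptions for illustration, not part of the patch):

#include <stdbool.h>

struct fpsimd_state_stub {
        unsigned int cpu;       /* last CPU this state was loaded onto */
};

/*
 * True only if the per-cpu fpsimd_last_state pointer and the state's cpu
 * field still agree, i.e. this CPU's FPSIMD registers hold the task's most
 * recent userland state and TIF_FOREIGN_FPSTATE can stay clear.
 */
static bool fpsimd_state_is_live(const struct fpsimd_state_stub *st,
                                 const struct fpsimd_state_stub *fpsimd_last_state,
                                 unsigned int this_cpu)
{
        return fpsimd_last_state == st && st->cpu == this_cpu;
}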
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c new file mode 100644 index 000000000000..7924d73b6476 --- /dev/null +++ b/arch/arm64/kernel/ftrace.c | |||
@@ -0,0 +1,176 @@ | |||
1 | /* | ||
2 | * arch/arm64/kernel/ftrace.c | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ftrace.h> | ||
13 | #include <linux/swab.h> | ||
14 | #include <linux/uaccess.h> | ||
15 | |||
16 | #include <asm/cacheflush.h> | ||
17 | #include <asm/ftrace.h> | ||
18 | #include <asm/insn.h> | ||
19 | |||
20 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
21 | /* | ||
22 | * Replace a single instruction, which may be a branch or NOP. | ||
23 | * If @validate == true, a replaced instruction is checked against 'old'. | ||
24 | */ | ||
25 | static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, | ||
26 | bool validate) | ||
27 | { | ||
28 | u32 replaced; | ||
29 | |||
30 | /* | ||
31 | * Note: | ||
32 | * Due to modules and __init, code can disappear and change, so | ||
33 | * we need to protect against faulting as well as code changing. | ||
34 | * We do this via aarch64_insn_*(), which use probe_kernel_*(). | ||
35 | * | ||
36 | * No lock is held here because all the modifications are run | ||
37 | * through stop_machine(). | ||
38 | */ | ||
39 | if (validate) { | ||
40 | if (aarch64_insn_read((void *)pc, &replaced)) | ||
41 | return -EFAULT; | ||
42 | |||
43 | if (replaced != old) | ||
44 | return -EINVAL; | ||
45 | } | ||
46 | if (aarch64_insn_patch_text_nosync((void *)pc, new)) | ||
47 | return -EPERM; | ||
48 | |||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * Replace tracer function in ftrace_caller() | ||
54 | */ | ||
55 | int ftrace_update_ftrace_func(ftrace_func_t func) | ||
56 | { | ||
57 | unsigned long pc; | ||
58 | u32 new; | ||
59 | |||
60 | pc = (unsigned long)&ftrace_call; | ||
61 | new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, true); | ||
62 | |||
63 | return ftrace_modify_code(pc, 0, new, false); | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * Turn on the call to ftrace_caller() in instrumented function | ||
68 | */ | ||
69 | int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) | ||
70 | { | ||
71 | unsigned long pc = rec->ip; | ||
72 | u32 old, new; | ||
73 | |||
74 | old = aarch64_insn_gen_nop(); | ||
75 | new = aarch64_insn_gen_branch_imm(pc, addr, true); | ||
76 | |||
77 | return ftrace_modify_code(pc, old, new, true); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Turn off the call to ftrace_caller() in instrumented function | ||
82 | */ | ||
83 | int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, | ||
84 | unsigned long addr) | ||
85 | { | ||
86 | unsigned long pc = rec->ip; | ||
87 | u32 old, new; | ||
88 | |||
89 | old = aarch64_insn_gen_branch_imm(pc, addr, true); | ||
90 | new = aarch64_insn_gen_nop(); | ||
91 | |||
92 | return ftrace_modify_code(pc, old, new, true); | ||
93 | } | ||
94 | |||
95 | int __init ftrace_dyn_arch_init(void) | ||
96 | { | ||
97 | return 0; | ||
98 | } | ||
99 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
100 | |||
101 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
102 | /* | ||
103 | * The function_graph tracer expects ftrace_return_to_handler() to be called | ||
104 | * on the way back to the parent. For this purpose, this function is called | ||
105 | * from _mcount() or ftrace_caller() to replace the return address (*parent) | ||
106 | * on the call stack with return_to_handler. | ||
107 | * | ||
108 | * Note that @frame_pointer is only used for a sanity check later. | ||
109 | */ | ||
110 | void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, | ||
111 | unsigned long frame_pointer) | ||
112 | { | ||
113 | unsigned long return_hooker = (unsigned long)&return_to_handler; | ||
114 | unsigned long old; | ||
115 | struct ftrace_graph_ent trace; | ||
116 | int err; | ||
117 | |||
118 | if (unlikely(atomic_read(¤t->tracing_graph_pause))) | ||
119 | return; | ||
120 | |||
121 | /* | ||
122 | * Note: | ||
123 | * No protection against faulting at *parent, which may be seen | ||
124 | * on other archs. It's unlikely on AArch64. | ||
125 | */ | ||
126 | old = *parent; | ||
127 | *parent = return_hooker; | ||
128 | |||
129 | trace.func = self_addr; | ||
130 | trace.depth = current->curr_ret_stack + 1; | ||
131 | |||
132 | /* Only trace if the calling function expects to */ | ||
133 | if (!ftrace_graph_entry(&trace)) { | ||
134 | *parent = old; | ||
135 | return; | ||
136 | } | ||
137 | |||
138 | err = ftrace_push_return_trace(old, self_addr, &trace.depth, | ||
139 | frame_pointer); | ||
140 | if (err == -EBUSY) { | ||
141 | *parent = old; | ||
142 | return; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
147 | /* | ||
148 | * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() | ||
149 | * depending on @enable. | ||
150 | */ | ||
151 | static int ftrace_modify_graph_caller(bool enable) | ||
152 | { | ||
153 | unsigned long pc = (unsigned long)&ftrace_graph_call; | ||
154 | u32 branch, nop; | ||
155 | |||
156 | branch = aarch64_insn_gen_branch_imm(pc, | ||
157 | (unsigned long)ftrace_graph_caller, false); | ||
158 | nop = aarch64_insn_gen_nop(); | ||
159 | |||
160 | if (enable) | ||
161 | return ftrace_modify_code(pc, nop, branch, true); | ||
162 | else | ||
163 | return ftrace_modify_code(pc, branch, nop, true); | ||
164 | } | ||
165 | |||
166 | int ftrace_enable_ftrace_graph_caller(void) | ||
167 | { | ||
168 | return ftrace_modify_graph_caller(true); | ||
169 | } | ||
170 | |||
171 | int ftrace_disable_ftrace_graph_caller(void) | ||
172 | { | ||
173 | return ftrace_modify_graph_caller(false); | ||
174 | } | ||
175 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
176 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
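The validate-then-patch step in ftrace_modify_code() can be pictured with a plain userspace model: read the instruction currently at the site, refuse to touch it if it is not what the caller expected, then write the replacement. This sketch uses ordinary memory accesses in place of aarch64_insn_read() / aarch64_insn_patch_text_nosync(), so it only illustrates the control flow:

#include <stdint.h>
#include <string.h>
#include <errno.h>

static int patch_insn(uint32_t *site, uint32_t old, uint32_t new_insn, int validate)
{
        uint32_t cur;

        if (validate) {
                memcpy(&cur, site, sizeof(cur));        /* aarch64_insn_read() stand-in */
                if (cur != old)
                        return -EINVAL;                 /* site is not what we expected */
        }
        memcpy(site, &new_insn, sizeof(new_insn));      /* patch_text_nosync() stand-in */
        return 0;
}

ftrace_make_call() and ftrace_make_nop() then amount to calling this helper with (NOP, branch) or (branch, NOP) as the old/new pair.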
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 738291b5be29..a96d3a6a63f6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S | |||
@@ -342,11 +342,9 @@ ENTRY(set_cpu_boot_mode_flag) | |||
342 | cmp w20, #BOOT_CPU_MODE_EL2 | 342 | cmp w20, #BOOT_CPU_MODE_EL2 |
343 | b.ne 1f | 343 | b.ne 1f |
344 | add x1, x1, #4 | 344 | add x1, x1, #4 |
345 | 1: dc cvac, x1 // Clean potentially dirty cache line | 345 | 1: str w20, [x1] // This CPU has booted in EL1 |
346 | dsb sy | 346 | dmb sy |
347 | str w20, [x1] // This CPU has booted in EL1 | 347 | dc ivac, x1 // Invalidate potentially stale cache line |
348 | dc civac, x1 // Clean&invalidate potentially stale cache line | ||
349 | dsb sy | ||
350 | ret | 348 | ret |
351 | ENDPROC(set_cpu_boot_mode_flag) | 349 | ENDPROC(set_cpu_boot_mode_flag) |
352 | 350 | ||
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index bee789757806..df1cf15377b4 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c | |||
@@ -20,6 +20,7 @@ | |||
20 | 20 | ||
21 | #define pr_fmt(fmt) "hw-breakpoint: " fmt | 21 | #define pr_fmt(fmt) "hw-breakpoint: " fmt |
22 | 22 | ||
23 | #include <linux/compat.h> | ||
23 | #include <linux/cpu_pm.h> | 24 | #include <linux/cpu_pm.h> |
24 | #include <linux/errno.h> | 25 | #include <linux/errno.h> |
25 | #include <linux/hw_breakpoint.h> | 26 | #include <linux/hw_breakpoint.h> |
@@ -27,7 +28,6 @@ | |||
27 | #include <linux/ptrace.h> | 28 | #include <linux/ptrace.h> |
28 | #include <linux/smp.h> | 29 | #include <linux/smp.h> |
29 | 30 | ||
30 | #include <asm/compat.h> | ||
31 | #include <asm/current.h> | 31 | #include <asm/current.h> |
32 | #include <asm/debug-monitors.h> | 32 | #include <asm/debug-monitors.h> |
33 | #include <asm/hw_breakpoint.h> | 33 | #include <asm/hw_breakpoint.h> |
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6391485f342d..43b7c34f92cb 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c | |||
@@ -20,6 +20,7 @@ | |||
20 | 20 | ||
21 | #include <stdarg.h> | 21 | #include <stdarg.h> |
22 | 22 | ||
23 | #include <linux/compat.h> | ||
23 | #include <linux/export.h> | 24 | #include <linux/export.h> |
24 | #include <linux/sched.h> | 25 | #include <linux/sched.h> |
25 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
@@ -113,32 +114,62 @@ void arch_cpu_idle_dead(void) | |||
113 | } | 114 | } |
114 | #endif | 115 | #endif |
115 | 116 | ||
117 | /* | ||
118 | * Called by kexec, immediately prior to machine_kexec(). | ||
119 | * | ||
120 | * This must completely disable all secondary CPUs; simply causing those CPUs | ||
121 | * to execute e.g. a RAM-based pin loop is not sufficient. This allows the | ||
122 | * kexec'd kernel to use any and all RAM as it sees fit, without having to | ||
123 | * avoid any code or data used by any SW CPU pin loop. The CPU hotplug | ||
124 | * functionality embodied in disable_nonboot_cpus() is used to achieve this. | ||
125 | */ | ||
116 | void machine_shutdown(void) | 126 | void machine_shutdown(void) |
117 | { | 127 | { |
118 | #ifdef CONFIG_SMP | 128 | disable_nonboot_cpus(); |
119 | smp_send_stop(); | ||
120 | #endif | ||
121 | } | 129 | } |
122 | 130 | ||
131 | /* | ||
132 | * Halting simply requires that the secondary CPUs stop performing any | ||
133 | * activity (executing tasks, handling interrupts). smp_send_stop() | ||
134 | * achieves this. | ||
135 | */ | ||
123 | void machine_halt(void) | 136 | void machine_halt(void) |
124 | { | 137 | { |
125 | machine_shutdown(); | 138 | local_irq_disable(); |
139 | smp_send_stop(); | ||
126 | while (1); | 140 | while (1); |
127 | } | 141 | } |
128 | 142 | ||
143 | /* | ||
144 | * Power-off simply requires that the secondary CPUs stop performing any | ||
145 | * activity (executing tasks, handling interrupts). smp_send_stop() | ||
146 | * achieves this. When the system power is turned off, it will take all CPUs | ||
147 | * with it. | ||
148 | */ | ||
129 | void machine_power_off(void) | 149 | void machine_power_off(void) |
130 | { | 150 | { |
131 | machine_shutdown(); | 151 | local_irq_disable(); |
152 | smp_send_stop(); | ||
132 | if (pm_power_off) | 153 | if (pm_power_off) |
133 | pm_power_off(); | 154 | pm_power_off(); |
134 | } | 155 | } |
135 | 156 | ||
157 | /* | ||
158 | * Restart requires that the secondary CPUs stop performing any activity | ||
159 | * while the primary CPU resets the system. Systems with a single CPU can | ||
160 | * use soft_restart() as their machine descriptor's .restart hook, since that | ||
161 | * will cause the only available CPU to reset. Systems with multiple CPUs must | ||
162 | * provide a HW restart implementation, to ensure that all CPUs reset at once. | ||
163 | * This is required so that any code running after reset on the primary CPU | ||
164 | * doesn't have to co-ordinate with other CPUs to ensure they aren't still | ||
165 | * executing pre-reset code, and using RAM that the primary CPU's code wishes | ||
166 | * to use. Implementing such co-ordination would be essentially impossible. | ||
167 | */ | ||
136 | void machine_restart(char *cmd) | 168 | void machine_restart(char *cmd) |
137 | { | 169 | { |
138 | machine_shutdown(); | ||
139 | |||
140 | /* Disable interrupts first */ | 170 | /* Disable interrupts first */ |
141 | local_irq_disable(); | 171 | local_irq_disable(); |
172 | smp_send_stop(); | ||
142 | 173 | ||
143 | /* Now call the architecture specific reboot code. */ | 174 | /* Now call the architecture specific reboot code. */ |
144 | if (arm_pm_restart) | 175 | if (arm_pm_restart) |
@@ -205,7 +236,7 @@ void release_thread(struct task_struct *dead_task) | |||
205 | 236 | ||
206 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 237 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
207 | { | 238 | { |
208 | fpsimd_save_state(¤t->thread.fpsimd_state); | 239 | fpsimd_preserve_current_state(); |
209 | *dst = *src; | 240 | *dst = *src; |
210 | return 0; | 241 | return 0; |
211 | } | 242 | } |
@@ -300,7 +331,7 @@ struct task_struct *__switch_to(struct task_struct *prev, | |||
300 | * Complete any pending TLB or cache maintenance on this CPU in case | 331 | * Complete any pending TLB or cache maintenance on this CPU in case |
301 | * the thread migrates to a different CPU. | 332 | * the thread migrates to a different CPU. |
302 | */ | 333 | */ |
303 | dsb(); | 334 | dsb(ish); |
304 | 335 | ||
305 | /* the actual thread switch */ | 336 | /* the actual thread switch */ |
306 | last = cpu_switch_to(prev, next); | 337 | last = cpu_switch_to(prev, next); |
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 6a8928bba03c..3e926b9c0641 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c | |||
@@ -19,6 +19,7 @@ | |||
19 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/compat.h> | ||
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
24 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
@@ -41,6 +42,9 @@ | |||
41 | #include <asm/traps.h> | 42 | #include <asm/traps.h> |
42 | #include <asm/system_misc.h> | 43 | #include <asm/system_misc.h> |
43 | 44 | ||
45 | #define CREATE_TRACE_POINTS | ||
46 | #include <trace/events/syscalls.h> | ||
47 | |||
44 | /* | 48 | /* |
45 | * TODO: does not yet catch signals sent when the child dies. | 49 | * TODO: does not yet catch signals sent when the child dies. |
46 | * in exit.c or in signal.c. | 50 | * in exit.c or in signal.c. |
@@ -517,6 +521,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, | |||
517 | return ret; | 521 | return ret; |
518 | 522 | ||
519 | target->thread.fpsimd_state.user_fpsimd = newstate; | 523 | target->thread.fpsimd_state.user_fpsimd = newstate; |
524 | fpsimd_flush_task_state(target); | ||
520 | return ret; | 525 | return ret; |
521 | } | 526 | } |
522 | 527 | ||
@@ -764,6 +769,7 @@ static int compat_vfp_set(struct task_struct *target, | |||
764 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; | 769 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; |
765 | } | 770 | } |
766 | 771 | ||
772 | fpsimd_flush_task_state(target); | ||
767 | return ret; | 773 | return ret; |
768 | } | 774 | } |
769 | 775 | ||
@@ -1058,35 +1064,49 @@ long arch_ptrace(struct task_struct *child, long request, | |||
1058 | return ptrace_request(child, request, addr, data); | 1064 | return ptrace_request(child, request, addr, data); |
1059 | } | 1065 | } |
1060 | 1066 | ||
1061 | asmlinkage int syscall_trace(int dir, struct pt_regs *regs) | 1067 | enum ptrace_syscall_dir { |
1068 | PTRACE_SYSCALL_ENTER = 0, | ||
1069 | PTRACE_SYSCALL_EXIT, | ||
1070 | }; | ||
1071 | |||
1072 | static void tracehook_report_syscall(struct pt_regs *regs, | ||
1073 | enum ptrace_syscall_dir dir) | ||
1062 | { | 1074 | { |
1075 | int regno; | ||
1063 | unsigned long saved_reg; | 1076 | unsigned long saved_reg; |
1064 | 1077 | ||
1065 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | 1078 | /* |
1066 | return regs->syscallno; | 1079 | * A scratch register (ip(r12) on AArch32, x7 on AArch64) is |
1067 | 1080 | * used to denote syscall entry/exit: | |
1068 | if (is_compat_task()) { | 1081 | */ |
1069 | /* AArch32 uses ip (r12) for scratch */ | 1082 | regno = (is_compat_task() ? 12 : 7); |
1070 | saved_reg = regs->regs[12]; | 1083 | saved_reg = regs->regs[regno]; |
1071 | regs->regs[12] = dir; | 1084 | regs->regs[regno] = dir; |
1072 | } else { | ||
1073 | /* | ||
1074 | * Save X7. X7 is used to denote syscall entry/exit: | ||
1075 | * X7 = 0 -> entry, = 1 -> exit | ||
1076 | */ | ||
1077 | saved_reg = regs->regs[7]; | ||
1078 | regs->regs[7] = dir; | ||
1079 | } | ||
1080 | 1085 | ||
1081 | if (dir) | 1086 | if (dir == PTRACE_SYSCALL_EXIT) |
1082 | tracehook_report_syscall_exit(regs, 0); | 1087 | tracehook_report_syscall_exit(regs, 0); |
1083 | else if (tracehook_report_syscall_entry(regs)) | 1088 | else if (tracehook_report_syscall_entry(regs)) |
1084 | regs->syscallno = ~0UL; | 1089 | regs->syscallno = ~0UL; |
1085 | 1090 | ||
1086 | if (is_compat_task()) | 1091 | regs->regs[regno] = saved_reg; |
1087 | regs->regs[12] = saved_reg; | 1092 | } |
1088 | else | 1093 | |
1089 | regs->regs[7] = saved_reg; | 1094 | asmlinkage int syscall_trace_enter(struct pt_regs *regs) |
1095 | { | ||
1096 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | ||
1097 | tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); | ||
1098 | |||
1099 | if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) | ||
1100 | trace_sys_enter(regs, regs->syscallno); | ||
1090 | 1101 | ||
1091 | return regs->syscallno; | 1102 | return regs->syscallno; |
1092 | } | 1103 | } |
1104 | |||
1105 | asmlinkage void syscall_trace_exit(struct pt_regs *regs) | ||
1106 | { | ||
1107 | if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) | ||
1108 | trace_sys_exit(regs, regs_return_value(regs)); | ||
1109 | |||
1110 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | ||
1111 | tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); | ||
1112 | } | ||
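The scratch-register convention above (x7 on AArch64, ip/r12 on AArch32, written to 0 on entry and 1 on exit) is also what a ptrace-based tracer observes while the tracee is stopped at a syscall-trace stop. As a rough illustration only (not part of this patch; the tracer setup with PTRACE_TRACEME and the PTRACE_SYSCALL stop loop is omitted, and error handling is minimal), a native AArch64 tracer could tell entry from exit like this:

    /* Illustrative sketch: read the tracee's registers at a syscall stop. */
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/uio.h>
    #include <linux/elf.h>      /* NT_PRSTATUS */
    #include <asm/ptrace.h>     /* struct user_pt_regs */

    static void report_syscall_stop(pid_t pid)
    {
            struct user_pt_regs regs;
            struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

            if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) == 0)
                    /* x8 carries the syscall number; x7 is 0 on entry, 1 on exit */
                    printf("syscall %llu: %s\n",
                           (unsigned long long)regs.regs[8],
                           regs.regs[7] ? "exit" : "enter");
    }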
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c new file mode 100644 index 000000000000..89102a6ffad5 --- /dev/null +++ b/arch/arm64/kernel/return_address.c | |||
@@ -0,0 +1,55 @@ | |||
1 | /* | ||
2 | * arch/arm64/kernel/return_address.c | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/export.h> | ||
13 | #include <linux/ftrace.h> | ||
14 | |||
15 | #include <asm/stacktrace.h> | ||
16 | |||
17 | struct return_address_data { | ||
18 | unsigned int level; | ||
19 | void *addr; | ||
20 | }; | ||
21 | |||
22 | static int save_return_addr(struct stackframe *frame, void *d) | ||
23 | { | ||
24 | struct return_address_data *data = d; | ||
25 | |||
26 | if (!data->level) { | ||
27 | data->addr = (void *)frame->pc; | ||
28 | return 1; | ||
29 | } else { | ||
30 | --data->level; | ||
31 | return 0; | ||
32 | } | ||
33 | } | ||
34 | |||
35 | void *return_address(unsigned int level) | ||
36 | { | ||
37 | struct return_address_data data; | ||
38 | struct stackframe frame; | ||
39 | register unsigned long current_sp asm ("sp"); | ||
40 | |||
41 | data.level = level + 2; | ||
42 | data.addr = NULL; | ||
43 | |||
44 | frame.fp = (unsigned long)__builtin_frame_address(0); | ||
45 | frame.sp = current_sp; | ||
46 | frame.pc = (unsigned long)return_address; /* dummy */ | ||
47 | |||
48 | walk_stackframe(&frame, save_return_addr, &data); | ||
49 | |||
50 | if (!data.level) | ||
51 | return data.addr; | ||
52 | else | ||
53 | return NULL; | ||
54 | } | ||
55 | EXPORT_SYMBOL_GPL(return_address); | ||
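return_address(level) walks the current stack with walk_stackframe(), skips its own frame and its immediate caller's, and hands back the saved return address 'level' frames further up, or NULL if the stack is shallower than that. A hedged in-kernel usage sketch, purely for illustration (the helper name is made up):

    #include <linux/ftrace.h>
    #include <linux/printk.h>

    static void show_callers(void)
    {
            /* level 0: who called us; level 1: one frame further up */
            pr_info("called from %pS, which was reached from %pS\n",
                    return_address(0), return_address(1));
    }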
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index e578171b22ff..46d1125571f6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/utsname.h> | 25 | #include <linux/utsname.h> |
26 | #include <linux/initrd.h> | 26 | #include <linux/initrd.h> |
27 | #include <linux/console.h> | 27 | #include <linux/console.h> |
28 | #include <linux/cache.h> | ||
28 | #include <linux/bootmem.h> | 29 | #include <linux/bootmem.h> |
29 | #include <linux/seq_file.h> | 30 | #include <linux/seq_file.h> |
30 | #include <linux/screen_info.h> | 31 | #include <linux/screen_info.h> |
@@ -200,6 +201,8 @@ static void __init setup_processor(void) | |||
200 | { | 201 | { |
201 | struct cpu_info *cpu_info; | 202 | struct cpu_info *cpu_info; |
202 | u64 features, block; | 203 | u64 features, block; |
204 | u32 cwg; | ||
205 | int cls; | ||
203 | 206 | ||
204 | cpu_info = lookup_processor_type(read_cpuid_id()); | 207 | cpu_info = lookup_processor_type(read_cpuid_id()); |
205 | if (!cpu_info) { | 208 | if (!cpu_info) { |
@@ -217,6 +220,18 @@ static void __init setup_processor(void) | |||
217 | elf_hwcap = 0; | 220 | elf_hwcap = 0; |
218 | 221 | ||
219 | /* | 222 | /* |
223 | * Check for sane CTR_EL0.CWG value. | ||
224 | */ | ||
225 | cwg = cache_type_cwg(); | ||
226 | cls = cache_line_size(); | ||
227 | if (!cwg) | ||
228 | pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n", | ||
229 | cls); | ||
230 | if (L1_CACHE_BYTES < cls) | ||
231 | pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", | ||
232 | L1_CACHE_BYTES, cls); | ||
233 | |||
234 | /* | ||
220 | * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. | 235 | * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. |
221 | * The blocks we test below represent incremental functionality | 236 | * The blocks we test below represent incremental functionality |
222 | * for non-negative values. Negative values are reserved. | 237 | * for non-negative values. Negative values are reserved. |
@@ -363,7 +378,6 @@ void __init setup_arch(char **cmdline_p) | |||
363 | 378 | ||
364 | *cmdline_p = boot_command_line; | 379 | *cmdline_p = boot_command_line; |
365 | 380 | ||
366 | init_mem_pgprot(); | ||
367 | early_ioremap_init(); | 381 | early_ioremap_init(); |
368 | 382 | ||
369 | parse_early_param(); | 383 | parse_early_param(); |
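The new check in setup_processor() compares L1_CACHE_BYTES against the Cache Writeback Granule advertised in CTR_EL0, warning when the compile-time constant is too small or when the CPU reports no CWG at all. As a hedged sketch only (the real helpers live in asm/cachetype.h and asm/cache.h and may differ in detail), cache_type_cwg() and cache_line_size() can be thought of as:

    /*
     * Sketch under the assumption that CTR_EL0.CWG is bits [27:24],
     * encoding log2 of the granule in 4-byte words; 0 means "not provided".
     * u32/u64 come from linux/types.h in kernel context.
     */
    #define CTR_CWG_SHIFT   24
    #define CTR_CWG_MASK    0xf

    static inline u32 cache_type_cwg(void)
    {
            u64 ctr;

            asm volatile("mrs %0, ctr_el0" : "=r" (ctr));
            return (ctr >> CTR_CWG_SHIFT) & CTR_CWG_MASK;
    }

    static inline int cache_line_size(void)
    {
            u32 cwg = cache_type_cwg();

            return cwg ? 4 << cwg : L1_CACHE_BYTES;   /* fall back when absent */
    }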
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 890a591f75dd..6357b9c6c90e 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c | |||
@@ -17,6 +17,7 @@ | |||
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/compat.h> | ||
20 | #include <linux/errno.h> | 21 | #include <linux/errno.h> |
21 | #include <linux/signal.h> | 22 | #include <linux/signal.h> |
22 | #include <linux/personality.h> | 23 | #include <linux/personality.h> |
@@ -25,7 +26,6 @@ | |||
25 | #include <linux/tracehook.h> | 26 | #include <linux/tracehook.h> |
26 | #include <linux/ratelimit.h> | 27 | #include <linux/ratelimit.h> |
27 | 28 | ||
28 | #include <asm/compat.h> | ||
29 | #include <asm/debug-monitors.h> | 29 | #include <asm/debug-monitors.h> |
30 | #include <asm/elf.h> | 30 | #include <asm/elf.h> |
31 | #include <asm/cacheflush.h> | 31 | #include <asm/cacheflush.h> |
@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) | |||
51 | int err; | 51 | int err; |
52 | 52 | ||
53 | /* dump the hardware registers to the fpsimd_state structure */ | 53 | /* dump the hardware registers to the fpsimd_state structure */ |
54 | fpsimd_save_state(fpsimd); | 54 | fpsimd_preserve_current_state(); |
55 | 55 | ||
56 | /* copy the FP and status/control registers */ | 56 | /* copy the FP and status/control registers */ |
57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); | 57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); |
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) | |||
86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); | 86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); |
87 | 87 | ||
88 | /* load the hardware registers from the fpsimd_state structure */ | 88 | /* load the hardware registers from the fpsimd_state structure */ |
89 | if (!err) { | 89 | if (!err) |
90 | preempt_disable(); | 90 | fpsimd_update_current_state(&fpsimd); |
91 | fpsimd_load_state(&fpsimd); | ||
92 | preempt_enable(); | ||
93 | } | ||
94 | 91 | ||
95 | return err ? -EFAULT : 0; | 92 | return err ? -EFAULT : 0; |
96 | } | 93 | } |
@@ -100,8 +97,7 @@ static int restore_sigframe(struct pt_regs *regs, | |||
100 | { | 97 | { |
101 | sigset_t set; | 98 | sigset_t set; |
102 | int i, err; | 99 | int i, err; |
103 | struct aux_context __user *aux = | 100 | void *aux = sf->uc.uc_mcontext.__reserved; |
104 | (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; | ||
105 | 101 | ||
106 | err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); | 102 | err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); |
107 | if (err == 0) | 103 | if (err == 0) |
@@ -121,8 +117,11 @@ static int restore_sigframe(struct pt_regs *regs, | |||
121 | 117 | ||
122 | err |= !valid_user_regs(®s->user_regs); | 118 | err |= !valid_user_regs(®s->user_regs); |
123 | 119 | ||
124 | if (err == 0) | 120 | if (err == 0) { |
125 | err |= restore_fpsimd_context(&aux->fpsimd); | 121 | struct fpsimd_context *fpsimd_ctx = |
122 | container_of(aux, struct fpsimd_context, head); | ||
123 | err |= restore_fpsimd_context(fpsimd_ctx); | ||
124 | } | ||
126 | 125 | ||
127 | return err; | 126 | return err; |
128 | } | 127 | } |
@@ -167,8 +166,8 @@ static int setup_sigframe(struct rt_sigframe __user *sf, | |||
167 | struct pt_regs *regs, sigset_t *set) | 166 | struct pt_regs *regs, sigset_t *set) |
168 | { | 167 | { |
169 | int i, err = 0; | 168 | int i, err = 0; |
170 | struct aux_context __user *aux = | 169 | void *aux = sf->uc.uc_mcontext.__reserved; |
171 | (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; | 170 | struct _aarch64_ctx *end; |
172 | 171 | ||
173 | /* set up the stack frame for unwinding */ | 172 | /* set up the stack frame for unwinding */ |
174 | __put_user_error(regs->regs[29], &sf->fp, err); | 173 | __put_user_error(regs->regs[29], &sf->fp, err); |
@@ -185,12 +184,27 @@ static int setup_sigframe(struct rt_sigframe __user *sf, | |||
185 | 184 | ||
186 | err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); | 185 | err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); |
187 | 186 | ||
188 | if (err == 0) | 187 | if (err == 0) { |
189 | err |= preserve_fpsimd_context(&aux->fpsimd); | 188 | struct fpsimd_context *fpsimd_ctx = |
189 | container_of(aux, struct fpsimd_context, head); | ||
190 | err |= preserve_fpsimd_context(fpsimd_ctx); | ||
191 | aux += sizeof(*fpsimd_ctx); | ||
192 | } | ||
193 | |||
194 | /* fault information, if valid */ | ||
195 | if (current->thread.fault_code) { | ||
196 | struct esr_context *esr_ctx = | ||
197 | container_of(aux, struct esr_context, head); | ||
198 | __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); | ||
199 | __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); | ||
200 | __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); | ||
201 | aux += sizeof(*esr_ctx); | ||
202 | } | ||
190 | 203 | ||
191 | /* set the "end" magic */ | 204 | /* set the "end" magic */ |
192 | __put_user_error(0, &aux->end.magic, err); | 205 | end = aux; |
193 | __put_user_error(0, &aux->end.size, err); | 206 | __put_user_error(0, &end->magic, err); |
207 | __put_user_error(0, &end->size, err); | ||
194 | 208 | ||
195 | return err; | 209 | return err; |
196 | } | 210 | } |
@@ -416,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, | |||
416 | clear_thread_flag(TIF_NOTIFY_RESUME); | 430 | clear_thread_flag(TIF_NOTIFY_RESUME); |
417 | tracehook_notify_resume(regs); | 431 | tracehook_notify_resume(regs); |
418 | } | 432 | } |
433 | |||
434 | if (thread_flags & _TIF_FOREIGN_FPSTATE) | ||
435 | fpsimd_restore_current_state(); | ||
436 | |||
419 | } | 437 | } |
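With the fixed struct aux_context gone, the __reserved area of the signal frame becomes a sequence of tagged records: an fpsimd_context, an optional esr_context when fault information is available, and a zero "end" header. User space can walk it generically by magic/size. A hedged user-space sketch, not part of this patch (printf from a signal handler is for illustration only):

    #include <stdio.h>
    #include <signal.h>
    #include <ucontext.h>
    #include <asm/sigcontext.h>   /* _aarch64_ctx, fpsimd_context, esr_context */

    static void handler(int sig, siginfo_t *info, void *ucontext)
    {
            ucontext_t *uc = ucontext;
            unsigned char *p = (unsigned char *)uc->uc_mcontext.__reserved;

            for (;;) {
                    struct _aarch64_ctx *head = (struct _aarch64_ctx *)p;

                    if (!head->magic)                     /* terminating record */
                            break;
                    if (head->magic == FPSIMD_MAGIC)
                            printf("FPSIMD state present (%u bytes)\n", head->size);
                    else if (head->magic == ESR_MAGIC)
                            printf("fault ESR: %#llx\n",
                                   (unsigned long long)((struct esr_context *)p)->esr);
                    p += head->size;
            }
    }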
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b3fc9f5ec6d3..3491c638f172 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
24 | #include <linux/ratelimit.h> | 24 | #include <linux/ratelimit.h> |
25 | 25 | ||
26 | #include <asm/esr.h> | ||
26 | #include <asm/fpsimd.h> | 27 | #include <asm/fpsimd.h> |
27 | #include <asm/signal32.h> | 28 | #include <asm/signal32.h> |
28 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
@@ -81,6 +82,8 @@ struct compat_vfp_sigframe { | |||
81 | #define VFP_MAGIC 0x56465001 | 82 | #define VFP_MAGIC 0x56465001 |
82 | #define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) | 83 | #define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) |
83 | 84 | ||
85 | #define FSR_WRITE_SHIFT (11) | ||
86 | |||
84 | struct compat_aux_sigframe { | 87 | struct compat_aux_sigframe { |
85 | struct compat_vfp_sigframe vfp; | 88 | struct compat_vfp_sigframe vfp; |
86 | 89 | ||
@@ -219,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
219 | * Note that this also saves V16-31, which aren't visible | 222 | * Note that this also saves V16-31, which aren't visible |
220 | * in AArch32. | 223 | * in AArch32. |
221 | */ | 224 | */ |
222 | fpsimd_save_state(fpsimd); | 225 | fpsimd_preserve_current_state(); |
223 | 226 | ||
224 | /* Place structure header on the stack */ | 227 | /* Place structure header on the stack */ |
225 | __put_user_error(magic, &frame->magic, err); | 228 | __put_user_error(magic, &frame->magic, err); |
@@ -282,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
282 | * We don't need to touch the exception register, so | 285 | * We don't need to touch the exception register, so |
283 | * reload the hardware state. | 286 | * reload the hardware state. |
284 | */ | 287 | */ |
285 | if (!err) { | 288 | if (!err) |
286 | preempt_disable(); | 289 | fpsimd_update_current_state(&fpsimd); |
287 | fpsimd_load_state(&fpsimd); | ||
288 | preempt_enable(); | ||
289 | } | ||
290 | 290 | ||
291 | return err ? -EFAULT : 0; | 291 | return err ? -EFAULT : 0; |
292 | } | 292 | } |
@@ -500,7 +500,9 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, | |||
500 | __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); | 500 | __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); |
501 | 501 | ||
502 | __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); | 502 | __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); |
503 | __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.error_code, err); | 503 | /* set the compat FSR WnR */ |
504 | __put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) << | ||
505 | FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err); | ||
504 | __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); | 506 | __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); |
505 | __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); | 507 | __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); |
506 | 508 | ||
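Previously the compat error_code was always zero; it now carries the FSR write-not-read bit (bit 11), derived from the recorded ESR, so 32-bit signal handlers can distinguish faulting writes from faulting reads. A hedged compat (AArch32) user-space sketch, for illustration only and assuming the C library exposes error_code in mcontext_t:

    #include <stdio.h>
    #include <signal.h>
    #include <ucontext.h>

    #define FSR_WRITE_BIT   (1UL << 11)

    static void segv_handler(int sig, siginfo_t *info, void *ucontext)
    {
            ucontext_t *uc = ucontext;

            printf("fault at %p on a %s access\n", info->si_addr,
                   (uc->uc_mcontext.error_code & FSR_WRITE_BIT) ? "write" : "read");
    }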
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index c3cb160edc69..40f38f46c8e0 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/clockchips.h> | 35 | #include <linux/clockchips.h> |
36 | #include <linux/completion.h> | 36 | #include <linux/completion.h> |
37 | #include <linux/of.h> | 37 | #include <linux/of.h> |
38 | #include <linux/irq_work.h> | ||
38 | 39 | ||
39 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
40 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
@@ -62,6 +63,7 @@ enum ipi_msg_type { | |||
62 | IPI_CALL_FUNC_SINGLE, | 63 | IPI_CALL_FUNC_SINGLE, |
63 | IPI_CPU_STOP, | 64 | IPI_CPU_STOP, |
64 | IPI_TIMER, | 65 | IPI_TIMER, |
66 | IPI_IRQ_WORK, | ||
65 | }; | 67 | }; |
66 | 68 | ||
67 | /* | 69 | /* |
@@ -477,6 +479,14 @@ void arch_send_call_function_single_ipi(int cpu) | |||
477 | smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); | 479 | smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); |
478 | } | 480 | } |
479 | 481 | ||
482 | #ifdef CONFIG_IRQ_WORK | ||
483 | void arch_irq_work_raise(void) | ||
484 | { | ||
485 | if (smp_cross_call) | ||
486 | smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); | ||
487 | } | ||
488 | #endif | ||
489 | |||
480 | static const char *ipi_types[NR_IPI] = { | 490 | static const char *ipi_types[NR_IPI] = { |
481 | #define S(x,s) [x - IPI_RESCHEDULE] = s | 491 | #define S(x,s) [x - IPI_RESCHEDULE] = s |
482 | S(IPI_RESCHEDULE, "Rescheduling interrupts"), | 492 | S(IPI_RESCHEDULE, "Rescheduling interrupts"), |
@@ -484,6 +494,7 @@ static const char *ipi_types[NR_IPI] = { | |||
484 | S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), | 494 | S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), |
485 | S(IPI_CPU_STOP, "CPU stop interrupts"), | 495 | S(IPI_CPU_STOP, "CPU stop interrupts"), |
486 | S(IPI_TIMER, "Timer broadcast interrupts"), | 496 | S(IPI_TIMER, "Timer broadcast interrupts"), |
497 | S(IPI_IRQ_WORK, "IRQ work interrupts"), | ||
487 | }; | 498 | }; |
488 | 499 | ||
489 | void show_ipi_list(struct seq_file *p, int prec) | 500 | void show_ipi_list(struct seq_file *p, int prec) |
@@ -576,6 +587,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) | |||
576 | break; | 587 | break; |
577 | #endif | 588 | #endif |
578 | 589 | ||
590 | #ifdef CONFIG_IRQ_WORK | ||
591 | case IPI_IRQ_WORK: | ||
592 | irq_enter(); | ||
593 | irq_work_run(); | ||
594 | irq_exit(); | ||
595 | break; | ||
596 | #endif | ||
597 | |||
579 | default: | 598 | default: |
580 | pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); | 599 | pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); |
581 | break; | 600 | break; |
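With arch_irq_work_raise() wired to the new IPI_IRQ_WORK, work queued with irq_work_queue() now runs promptly in interrupt context on the queueing CPU rather than waiting for the next timer tick. A hedged usage sketch (the function and variable names are made up):

    #include <linux/init.h>
    #include <linux/irq_work.h>
    #include <linux/printk.h>
    #include <linux/smp.h>

    static struct irq_work my_work;

    static void my_work_fn(struct irq_work *work)
    {
            pr_info("irq_work ran on CPU%d\n", smp_processor_id());
    }

    static void kick_my_work(void)
    {
            /* safe from atomic context; raises IPI_IRQ_WORK on the local CPU */
            irq_work_queue(&my_work);
    }

    static int __init my_work_init(void)
    {
            init_irq_work(&my_work, my_work_fn);
            return 0;
    }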
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 7a530d2cc807..0347d38eea29 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c | |||
@@ -30,7 +30,6 @@ extern void secondary_holding_pen(void); | |||
30 | volatile unsigned long secondary_holding_pen_release = INVALID_HWID; | 30 | volatile unsigned long secondary_holding_pen_release = INVALID_HWID; |
31 | 31 | ||
32 | static phys_addr_t cpu_release_addr[NR_CPUS]; | 32 | static phys_addr_t cpu_release_addr[NR_CPUS]; |
33 | static DEFINE_RAW_SPINLOCK(boot_lock); | ||
34 | 33 | ||
35 | /* | 34 | /* |
36 | * Write secondary_holding_pen_release in a way that is guaranteed to be | 35 | * Write secondary_holding_pen_release in a way that is guaranteed to be |
@@ -94,14 +93,6 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) | |||
94 | 93 | ||
95 | static int smp_spin_table_cpu_boot(unsigned int cpu) | 94 | static int smp_spin_table_cpu_boot(unsigned int cpu) |
96 | { | 95 | { |
97 | unsigned long timeout; | ||
98 | |||
99 | /* | ||
100 | * Set synchronisation state between this boot processor | ||
101 | * and the secondary one | ||
102 | */ | ||
103 | raw_spin_lock(&boot_lock); | ||
104 | |||
105 | /* | 96 | /* |
106 | * Update the pen release flag. | 97 | * Update the pen release flag. |
107 | */ | 98 | */ |
@@ -112,34 +103,7 @@ static int smp_spin_table_cpu_boot(unsigned int cpu) | |||
112 | */ | 103 | */ |
113 | sev(); | 104 | sev(); |
114 | 105 | ||
115 | timeout = jiffies + (1 * HZ); | 106 | return 0; |
116 | while (time_before(jiffies, timeout)) { | ||
117 | if (secondary_holding_pen_release == INVALID_HWID) | ||
118 | break; | ||
119 | udelay(10); | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * Now the secondary core is starting up let it run its | ||
124 | * calibrations, then wait for it to finish | ||
125 | */ | ||
126 | raw_spin_unlock(&boot_lock); | ||
127 | |||
128 | return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0; | ||
129 | } | ||
130 | |||
131 | static void smp_spin_table_cpu_postboot(void) | ||
132 | { | ||
133 | /* | ||
134 | * Let the primary processor know we're out of the pen. | ||
135 | */ | ||
136 | write_pen_release(INVALID_HWID); | ||
137 | |||
138 | /* | ||
139 | * Synchronise with the boot thread. | ||
140 | */ | ||
141 | raw_spin_lock(&boot_lock); | ||
142 | raw_spin_unlock(&boot_lock); | ||
143 | } | 107 | } |
144 | 108 | ||
145 | const struct cpu_operations smp_spin_table_ops = { | 109 | const struct cpu_operations smp_spin_table_ops = { |
@@ -147,5 +111,4 @@ const struct cpu_operations smp_spin_table_ops = { | |||
147 | .cpu_init = smp_spin_table_cpu_init, | 111 | .cpu_init = smp_spin_table_cpu_init, |
148 | .cpu_prepare = smp_spin_table_cpu_prepare, | 112 | .cpu_prepare = smp_spin_table_cpu_prepare, |
149 | .cpu_boot = smp_spin_table_cpu_boot, | 113 | .cpu_boot = smp_spin_table_cpu_boot, |
150 | .cpu_postboot = smp_spin_table_cpu_postboot, | ||
151 | }; | 114 | }; |
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 38f0558f0c0a..55437ba1f5a4 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c | |||
@@ -35,7 +35,7 @@ | |||
35 | * ldp x29, x30, [sp] | 35 | * ldp x29, x30, [sp] |
36 | * add sp, sp, #0x10 | 36 | * add sp, sp, #0x10 |
37 | */ | 37 | */ |
38 | int unwind_frame(struct stackframe *frame) | 38 | int notrace unwind_frame(struct stackframe *frame) |
39 | { | 39 | { |
40 | unsigned long high, low; | 40 | unsigned long high, low; |
41 | unsigned long fp = frame->fp; | 41 | unsigned long fp = frame->fp; |
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6815987b50f8..1a7125c3099b 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c | |||
@@ -18,6 +18,7 @@ | |||
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/clockchips.h> | ||
21 | #include <linux/export.h> | 22 | #include <linux/export.h> |
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
@@ -69,6 +70,8 @@ void __init time_init(void) | |||
69 | of_clk_init(NULL); | 70 | of_clk_init(NULL); |
70 | clocksource_of_init(); | 71 | clocksource_of_init(); |
71 | 72 | ||
73 | tick_setup_hrtimer_broadcast(); | ||
74 | |||
72 | arch_timer_rate = arch_timer_get_rate(); | 75 | arch_timer_rate = arch_timer_get_rate(); |
73 | if (!arch_timer_rate) | 76 | if (!arch_timer_rate) |
74 | panic("Unable to initialise architected timer.\n"); | 77 | panic("Unable to initialise architected timer.\n"); |
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3e06b0be4ec8..43514f905916 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c | |||
@@ -17,10 +17,192 @@ | |||
17 | #include <linux/percpu.h> | 17 | #include <linux/percpu.h> |
18 | #include <linux/node.h> | 18 | #include <linux/node.h> |
19 | #include <linux/nodemask.h> | 19 | #include <linux/nodemask.h> |
20 | #include <linux/of.h> | ||
20 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
21 | 22 | ||
22 | #include <asm/topology.h> | 23 | #include <asm/topology.h> |
23 | 24 | ||
25 | static int __init get_cpu_for_node(struct device_node *node) | ||
26 | { | ||
27 | struct device_node *cpu_node; | ||
28 | int cpu; | ||
29 | |||
30 | cpu_node = of_parse_phandle(node, "cpu", 0); | ||
31 | if (!cpu_node) | ||
32 | return -1; | ||
33 | |||
34 | for_each_possible_cpu(cpu) { | ||
35 | if (of_get_cpu_node(cpu, NULL) == cpu_node) { | ||
36 | of_node_put(cpu_node); | ||
37 | return cpu; | ||
38 | } | ||
39 | } | ||
40 | |||
41 | pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); | ||
42 | |||
43 | of_node_put(cpu_node); | ||
44 | return -1; | ||
45 | } | ||
46 | |||
47 | static int __init parse_core(struct device_node *core, int cluster_id, | ||
48 | int core_id) | ||
49 | { | ||
50 | char name[10]; | ||
51 | bool leaf = true; | ||
52 | int i = 0; | ||
53 | int cpu; | ||
54 | struct device_node *t; | ||
55 | |||
56 | do { | ||
57 | snprintf(name, sizeof(name), "thread%d", i); | ||
58 | t = of_get_child_by_name(core, name); | ||
59 | if (t) { | ||
60 | leaf = false; | ||
61 | cpu = get_cpu_for_node(t); | ||
62 | if (cpu >= 0) { | ||
63 | cpu_topology[cpu].cluster_id = cluster_id; | ||
64 | cpu_topology[cpu].core_id = core_id; | ||
65 | cpu_topology[cpu].thread_id = i; | ||
66 | } else { | ||
67 | pr_err("%s: Can't get CPU for thread\n", | ||
68 | t->full_name); | ||
69 | of_node_put(t); | ||
70 | return -EINVAL; | ||
71 | } | ||
72 | of_node_put(t); | ||
73 | } | ||
74 | i++; | ||
75 | } while (t); | ||
76 | |||
77 | cpu = get_cpu_for_node(core); | ||
78 | if (cpu >= 0) { | ||
79 | if (!leaf) { | ||
80 | pr_err("%s: Core has both threads and CPU\n", | ||
81 | core->full_name); | ||
82 | return -EINVAL; | ||
83 | } | ||
84 | |||
85 | cpu_topology[cpu].cluster_id = cluster_id; | ||
86 | cpu_topology[cpu].core_id = core_id; | ||
87 | } else if (leaf) { | ||
88 | pr_err("%s: Can't get CPU for leaf core\n", core->full_name); | ||
89 | return -EINVAL; | ||
90 | } | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | static int __init parse_cluster(struct device_node *cluster, int depth) | ||
96 | { | ||
97 | char name[10]; | ||
98 | bool leaf = true; | ||
99 | bool has_cores = false; | ||
100 | struct device_node *c; | ||
101 | static int cluster_id __initdata; | ||
102 | int core_id = 0; | ||
103 | int i, ret; | ||
104 | |||
105 | /* | ||
106 | * First check for child clusters; we currently ignore any | ||
107 | * information about the nesting of clusters and present the | ||
108 | * scheduler with a flat list of them. | ||
109 | */ | ||
110 | i = 0; | ||
111 | do { | ||
112 | snprintf(name, sizeof(name), "cluster%d", i); | ||
113 | c = of_get_child_by_name(cluster, name); | ||
114 | if (c) { | ||
115 | leaf = false; | ||
116 | ret = parse_cluster(c, depth + 1); | ||
117 | of_node_put(c); | ||
118 | if (ret != 0) | ||
119 | return ret; | ||
120 | } | ||
121 | i++; | ||
122 | } while (c); | ||
123 | |||
124 | /* Now check for cores */ | ||
125 | i = 0; | ||
126 | do { | ||
127 | snprintf(name, sizeof(name), "core%d", i); | ||
128 | c = of_get_child_by_name(cluster, name); | ||
129 | if (c) { | ||
130 | has_cores = true; | ||
131 | |||
132 | if (depth == 0) { | ||
133 | pr_err("%s: cpu-map children should be clusters\n", | ||
134 | c->full_name); | ||
135 | of_node_put(c); | ||
136 | return -EINVAL; | ||
137 | } | ||
138 | |||
139 | if (leaf) { | ||
140 | ret = parse_core(c, cluster_id, core_id++); | ||
141 | } else { | ||
142 | pr_err("%s: Non-leaf cluster with core %s\n", | ||
143 | cluster->full_name, name); | ||
144 | ret = -EINVAL; | ||
145 | } | ||
146 | |||
147 | of_node_put(c); | ||
148 | if (ret != 0) | ||
149 | return ret; | ||
150 | } | ||
151 | i++; | ||
152 | } while (c); | ||
153 | |||
154 | if (leaf && !has_cores) | ||
155 | pr_warn("%s: empty cluster\n", cluster->full_name); | ||
156 | |||
157 | if (leaf) | ||
158 | cluster_id++; | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | static int __init parse_dt_topology(void) | ||
164 | { | ||
165 | struct device_node *cn, *map; | ||
166 | int ret = 0; | ||
167 | int cpu; | ||
168 | |||
169 | cn = of_find_node_by_path("/cpus"); | ||
170 | if (!cn) { | ||
171 | pr_err("No CPU information found in DT\n"); | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * When topology is provided cpu-map is essentially a root | ||
177 | * cluster with restricted subnodes. | ||
178 | */ | ||
179 | map = of_get_child_by_name(cn, "cpu-map"); | ||
180 | if (!map) | ||
181 | goto out; | ||
182 | |||
183 | ret = parse_cluster(map, 0); | ||
184 | if (ret != 0) | ||
185 | goto out_map; | ||
186 | |||
187 | /* | ||
188 | * Check that all cores are in the topology; the SMP code will | ||
189 | * only mark cores described in the DT as possible. | ||
190 | */ | ||
191 | for_each_possible_cpu(cpu) { | ||
192 | if (cpu_topology[cpu].cluster_id == -1) { | ||
193 | pr_err("CPU%d: No topology information specified\n", | ||
194 | cpu); | ||
195 | ret = -EINVAL; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | out_map: | ||
200 | of_node_put(map); | ||
201 | out: | ||
202 | of_node_put(cn); | ||
203 | return ret; | ||
204 | } | ||
205 | |||
24 | /* | 206 | /* |
25 | * cpu topology table | 207 | * cpu topology table |
26 | */ | 208 | */ |
@@ -39,13 +221,9 @@ static void update_siblings_masks(unsigned int cpuid) | |||
39 | 221 | ||
40 | if (cpuid_topo->cluster_id == -1) { | 222 | if (cpuid_topo->cluster_id == -1) { |
41 | /* | 223 | /* |
42 | * DT does not contain topology information for this cpu | 224 | * DT does not contain topology information for this cpu. |
43 | * reset it to default behaviour | ||
44 | */ | 225 | */ |
45 | pr_debug("CPU%u: No topology information configured\n", cpuid); | 226 | pr_debug("CPU%u: No topology information configured\n", cpuid); |
46 | cpuid_topo->core_id = 0; | ||
47 | cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling); | ||
48 | cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling); | ||
49 | return; | 227 | return; |
50 | } | 228 | } |
51 | 229 | ||
@@ -74,22 +252,32 @@ void store_cpu_topology(unsigned int cpuid) | |||
74 | update_siblings_masks(cpuid); | 252 | update_siblings_masks(cpuid); |
75 | } | 253 | } |
76 | 254 | ||
77 | /* | 255 | static void __init reset_cpu_topology(void) |
78 | * init_cpu_topology is called at boot when only one cpu is running | ||
79 | * which prevent simultaneous write access to cpu_topology array | ||
80 | */ | ||
81 | void __init init_cpu_topology(void) | ||
82 | { | 256 | { |
83 | unsigned int cpu; | 257 | unsigned int cpu; |
84 | 258 | ||
85 | /* init core mask and power*/ | ||
86 | for_each_possible_cpu(cpu) { | 259 | for_each_possible_cpu(cpu) { |
87 | struct cpu_topology *cpu_topo = &cpu_topology[cpu]; | 260 | struct cpu_topology *cpu_topo = &cpu_topology[cpu]; |
88 | 261 | ||
89 | cpu_topo->thread_id = -1; | 262 | cpu_topo->thread_id = -1; |
90 | cpu_topo->core_id = -1; | 263 | cpu_topo->core_id = 0; |
91 | cpu_topo->cluster_id = -1; | 264 | cpu_topo->cluster_id = -1; |
265 | |||
92 | cpumask_clear(&cpu_topo->core_sibling); | 266 | cpumask_clear(&cpu_topo->core_sibling); |
267 | cpumask_set_cpu(cpu, &cpu_topo->core_sibling); | ||
93 | cpumask_clear(&cpu_topo->thread_sibling); | 268 | cpumask_clear(&cpu_topo->thread_sibling); |
269 | cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); | ||
94 | } | 270 | } |
95 | } | 271 | } |
272 | |||
273 | void __init init_cpu_topology(void) | ||
274 | { | ||
275 | reset_cpu_topology(); | ||
276 | |||
277 | /* | ||
278 | * Discard anything that was parsed if we hit an error so we | ||
279 | * don't use partial information. | ||
280 | */ | ||
281 | if (parse_dt_topology()) | ||
282 | reset_cpu_topology(); | ||
283 | } | ||
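parse_dt_topology() walks the cpu-map node under /cpus, mapping clusterN/coreN/threadN subnodes onto cpu_topology[] (cluster_id, core_id, thread_id), and init_cpu_topology() discards the result again if any possible CPU was left undescribed. A hedged debugging sketch of how the parsed table could be inspected (the helper name is made up; cpu_topology[] and struct cpu_topology come from asm/topology.h):

    #include <linux/cpumask.h>
    #include <linux/init.h>
    #include <linux/printk.h>
    #include <asm/topology.h>

    static void __init dump_cpu_topology(void)
    {
            unsigned int cpu;

            for_each_possible_cpu(cpu) {
                    struct cpu_topology *t = &cpu_topology[cpu];

                    pr_info("CPU%u: cluster %d, core %d, thread %d\n",
                            cpu, t->cluster_id, t->core_id, t->thread_id);
            }
    }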
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7ffadddb645d..c43cfa9b8304 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c | |||
@@ -251,10 +251,13 @@ void die(const char *str, struct pt_regs *regs, int err) | |||
251 | void arm64_notify_die(const char *str, struct pt_regs *regs, | 251 | void arm64_notify_die(const char *str, struct pt_regs *regs, |
252 | struct siginfo *info, int err) | 252 | struct siginfo *info, int err) |
253 | { | 253 | { |
254 | if (user_mode(regs)) | 254 | if (user_mode(regs)) { |
255 | current->thread.fault_address = 0; | ||
256 | current->thread.fault_code = err; | ||
255 | force_sig_info(info->si_signo, info, current); | 257 | force_sig_info(info->si_signo, info, current); |
256 | else | 258 | } else { |
257 | die(str, regs, err); | 259 | die(str, regs, err); |
260 | } | ||
258 | } | 261 | } |
259 | 262 | ||
260 | asmlinkage void __exception do_undefinstr(struct pt_regs *regs) | 263 | asmlinkage void __exception do_undefinstr(struct pt_regs *regs) |
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 4ba7a55b49c7..f1e6d5c032e1 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S | |||
@@ -13,7 +13,7 @@ | |||
13 | #define ARM_EXIT_DISCARD(x) x | 13 | #define ARM_EXIT_DISCARD(x) x |
14 | 14 | ||
15 | OUTPUT_ARCH(aarch64) | 15 | OUTPUT_ARCH(aarch64) |
16 | ENTRY(stext) | 16 | ENTRY(_text) |
17 | 17 | ||
18 | jiffies = jiffies_64; | 18 | jiffies = jiffies_64; |
19 | 19 | ||
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 2c56012cb2d2..b0d1512acf08 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S | |||
@@ -630,9 +630,15 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) | |||
630 | * whole of Stage-1. Weep... | 630 | * whole of Stage-1. Weep... |
631 | */ | 631 | */ |
632 | tlbi ipas2e1is, x1 | 632 | tlbi ipas2e1is, x1 |
633 | dsb sy | 633 | /* |
634 | * We have to ensure completion of the invalidation at Stage-2, | ||
635 | * since a table walk on another CPU could refill a TLB with a | ||
636 | * complete (S1 + S2) walk based on the old Stage-2 mapping if | ||
637 | * the Stage-1 invalidation happened first. | ||
638 | */ | ||
639 | dsb ish | ||
634 | tlbi vmalle1is | 640 | tlbi vmalle1is |
635 | dsb sy | 641 | dsb ish |
636 | isb | 642 | isb |
637 | 643 | ||
638 | msr vttbr_el2, xzr | 644 | msr vttbr_el2, xzr |
@@ -643,7 +649,7 @@ ENTRY(__kvm_flush_vm_context) | |||
643 | dsb ishst | 649 | dsb ishst |
644 | tlbi alle1is | 650 | tlbi alle1is |
645 | ic ialluis | 651 | ic ialluis |
646 | dsb sy | 652 | dsb ish |
647 | ret | 653 | ret |
648 | ENDPROC(__kvm_flush_vm_context) | 654 | ENDPROC(__kvm_flush_vm_context) |
649 | 655 | ||
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 03244582bc55..c59a1bdab5eb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c | |||
@@ -71,13 +71,13 @@ static u32 get_ccsidr(u32 csselr) | |||
71 | static void do_dc_cisw(u32 val) | 71 | static void do_dc_cisw(u32 val) |
72 | { | 72 | { |
73 | asm volatile("dc cisw, %x0" : : "r" (val)); | 73 | asm volatile("dc cisw, %x0" : : "r" (val)); |
74 | dsb(); | 74 | dsb(ish); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void do_dc_csw(u32 val) | 77 | static void do_dc_csw(u32 val) |
78 | { | 78 | { |
79 | asm volatile("dc csw, %x0" : : "r" (val)); | 79 | asm volatile("dc csw, %x0" : : "r" (val)); |
80 | dsb(); | 80 | dsb(ish); |
81 | } | 81 | } |
82 | 82 | ||
83 | /* See note at ARM ARM B1.14.4 */ | 83 | /* See note at ARM ARM B1.14.4 */ |
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 328ce1a99daa..d98d3e39879e 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile | |||
@@ -1,4 +1,5 @@ | |||
1 | lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ | 1 | lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ |
2 | copy_to_user.o copy_in_user.o copy_page.o \ | 2 | copy_to_user.o copy_in_user.o copy_page.o \ |
3 | clear_page.o memchr.o memcpy.o memmove.o memset.o \ | 3 | clear_page.o memchr.o memcpy.o memmove.o memset.o \ |
4 | memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ | ||
4 | strchr.o strrchr.o | 5 | strchr.o strrchr.o |
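Adding the objects to lib-y is only half of the wiring; each routine also has to be advertised as an arch-optimised implementation so the generic C version in lib/string.c is compiled out. A hedged sketch of the shape of those declarations (the real ones live in asm/string.h and may differ):

    #define __HAVE_ARCH_MEMCMP
    extern int memcmp(const void *, const void *, size_t);

    #define __HAVE_ARCH_STRLEN
    extern __kernel_size_t strlen(const char *);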
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 000000000000..6ea0776ba6de --- /dev/null +++ b/arch/arm64/lib/memcmp.S | |||
@@ -0,0 +1,258 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * compare memory areas (when the two areas have different offsets, | ||
30 | * alignment is handled by the hardware) | ||
31 | * | ||
32 | * Parameters: | ||
33 | * x0 - const memory area 1 pointer | ||
34 | * x1 - const memory area 2 pointer | ||
35 | * x2 - the maximal compare byte length | ||
36 | * Returns: | ||
37 | * x0 - a compare result, maybe less than, equal to, or greater than ZERO | ||
38 | */ | ||
39 | |||
40 | /* Parameters and result. */ | ||
41 | src1 .req x0 | ||
42 | src2 .req x1 | ||
43 | limit .req x2 | ||
44 | result .req x0 | ||
45 | |||
46 | /* Internal variables. */ | ||
47 | data1 .req x3 | ||
48 | data1w .req w3 | ||
49 | data2 .req x4 | ||
50 | data2w .req w4 | ||
51 | has_nul .req x5 | ||
52 | diff .req x6 | ||
53 | endloop .req x7 | ||
54 | tmp1 .req x8 | ||
55 | tmp2 .req x9 | ||
56 | tmp3 .req x10 | ||
57 | pos .req x11 | ||
58 | limit_wd .req x12 | ||
59 | mask .req x13 | ||
60 | |||
61 | ENTRY(memcmp) | ||
62 | cbz limit, .Lret0 | ||
63 | eor tmp1, src1, src2 | ||
64 | tst tmp1, #7 | ||
65 | b.ne .Lmisaligned8 | ||
66 | ands tmp1, src1, #7 | ||
67 | b.ne .Lmutual_align | ||
68 | sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ | ||
69 | lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ | ||
70 | /* | ||
71 | * The input source addresses are at alignment boundary. | ||
72 | * Directly compare eight bytes each time. | ||
73 | */ | ||
74 | .Lloop_aligned: | ||
75 | ldr data1, [src1], #8 | ||
76 | ldr data2, [src2], #8 | ||
77 | .Lstart_realigned: | ||
78 | subs limit_wd, limit_wd, #1 | ||
79 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
80 | csinv endloop, diff, xzr, cs /* Last Dword or differences. */ | ||
81 | cbz endloop, .Lloop_aligned | ||
82 | |||
83 | /* Not reached the limit, must have found a diff. */ | ||
84 | tbz limit_wd, #63, .Lnot_limit | ||
85 | |||
86 | /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ | ||
87 | ands limit, limit, #7 | ||
88 | b.eq .Lnot_limit | ||
89 | /* | ||
90 | * Fewer than 8 bytes remain. Extract the valid data from the last | ||
91 | * eight bytes of the intended memory range. | ||
92 | */ | ||
93 | lsl limit, limit, #3 /* bytes-> bits. */ | ||
94 | mov mask, #~0 | ||
95 | CPU_BE( lsr mask, mask, limit ) | ||
96 | CPU_LE( lsl mask, mask, limit ) | ||
97 | bic data1, data1, mask | ||
98 | bic data2, data2, mask | ||
99 | |||
100 | orr diff, diff, mask | ||
101 | b .Lnot_limit | ||
102 | |||
103 | .Lmutual_align: | ||
104 | /* | ||
105 | * Sources are mutually aligned, but are not currently at an | ||
106 | * alignment boundary. Round down the addresses and then mask off | ||
107 | * the bytes that precede the start point. | ||
108 | */ | ||
109 | bic src1, src1, #7 | ||
110 | bic src2, src2, #7 | ||
111 | ldr data1, [src1], #8 | ||
112 | ldr data2, [src2], #8 | ||
113 | /* | ||
114 | * We cannot add the alignment offset (tmp1) to limit here, since | ||
115 | * the addition could overflow it. | ||
116 | */ | ||
117 | sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ | ||
118 | and tmp3, limit_wd, #7 | ||
119 | lsr limit_wd, limit_wd, #3 | ||
120 | add tmp3, tmp3, tmp1 | ||
121 | add limit_wd, limit_wd, tmp3, lsr #3 | ||
122 | add limit, limit, tmp1/* Adjust the limit for the extra. */ | ||
123 | |||
124 | lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ | ||
125 | neg tmp1, tmp1/* Bits to alignment -64. */ | ||
126 | mov tmp2, #~0 | ||
127 | /*mask off the non-intended bytes before the start address.*/ | ||
128 | CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ | ||
129 | /* Little-endian. Early bytes are at LSB. */ | ||
130 | CPU_LE( lsr tmp2, tmp2, tmp1 ) | ||
131 | |||
132 | orr data1, data1, tmp2 | ||
133 | orr data2, data2, tmp2 | ||
134 | b .Lstart_realigned | ||
135 | |||
136 | /*src1 and src2 have different alignment offset.*/ | ||
137 | .Lmisaligned8: | ||
138 | cmp limit, #8 | ||
139 | b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ | ||
140 | |||
141 | and tmp1, src1, #7 | ||
142 | neg tmp1, tmp1 | ||
143 | add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ | ||
144 | and tmp2, src2, #7 | ||
145 | neg tmp2, tmp2 | ||
146 | add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ | ||
147 | subs tmp3, tmp1, tmp2 | ||
148 | csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ | ||
149 | |||
150 | sub limit, limit, pos | ||
151 | /*compare the leading bytes in the first 8-byte segment.*/ | ||
152 | .Ltinycmp: | ||
153 | ldrb data1w, [src1], #1 | ||
154 | ldrb data2w, [src2], #1 | ||
155 | subs pos, pos, #1 | ||
156 | ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ | ||
157 | b.eq .Ltinycmp | ||
158 | cbnz pos, 1f /*diff occurred before the last byte.*/ | ||
159 | cmp data1w, data2w | ||
160 | b.eq .Lstart_align | ||
161 | 1: | ||
162 | sub result, data1, data2 | ||
163 | ret | ||
164 | |||
165 | .Lstart_align: | ||
166 | lsr limit_wd, limit, #3 | ||
167 | cbz limit_wd, .Lremain8 | ||
168 | |||
169 | ands xzr, src1, #7 | ||
170 | b.eq .Lrecal_offset | ||
171 | /*process more leading bytes to make src1 aligned...*/ | ||
172 | add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ | ||
173 | add src2, src2, tmp3 | ||
174 | sub limit, limit, tmp3 | ||
175 | lsr limit_wd, limit, #3 | ||
176 | cbz limit_wd, .Lremain8 | ||
177 | /*load 8 bytes from aligned SRC1..*/ | ||
178 | ldr data1, [src1], #8 | ||
179 | ldr data2, [src2], #8 | ||
180 | |||
181 | subs limit_wd, limit_wd, #1 | ||
182 | eor diff, data1, data2 /*Non-zero if differences found.*/ | ||
183 | csinv endloop, diff, xzr, ne | ||
184 | cbnz endloop, .Lunequal_proc | ||
185 | /*How far is the current SRC2 from the alignment boundary...*/ | ||
186 | and tmp3, tmp3, #7 | ||
187 | |||
188 | .Lrecal_offset:/*src1 is aligned now..*/ | ||
189 | neg pos, tmp3 | ||
190 | .Lloopcmp_proc: | ||
191 | /* | ||
192 | * Divide the eight bytes into two parts. First, move src2 back | ||
193 | * to an alignment boundary, load eight bytes and compare from | ||
194 | * the SRC2 alignment boundary. If all 8 bytes are equal, start | ||
195 | * the second part's comparison. Otherwise finish the comparison. | ||
196 | * This special handling guarantees that all accesses stay within | ||
197 | * the thread/task address space, avoiding out-of-range accesses. | ||
198 | */ | ||
199 | ldr data1, [src1,pos] | ||
200 | ldr data2, [src2,pos] | ||
201 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
202 | cbnz diff, .Lnot_limit | ||
203 | |||
204 | /*The second part of the comparison.*/ | ||
205 | ldr data1, [src1], #8 | ||
206 | ldr data2, [src2], #8 | ||
207 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
208 | subs limit_wd, limit_wd, #1 | ||
209 | csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ | ||
210 | cbz endloop, .Lloopcmp_proc | ||
211 | .Lunequal_proc: | ||
212 | cbz diff, .Lremain8 | ||
213 | |||
214 | /*A difference occurred in the latest comparison.*/ | ||
215 | .Lnot_limit: | ||
216 | /* | ||
217 | * For little-endian, reverse the least significant equal bits into the | ||
218 | * MSB so the following CLZ can count how many equal bits exist. | ||
219 | */ | ||
220 | CPU_LE( rev diff, diff ) | ||
221 | CPU_LE( rev data1, data1 ) | ||
222 | CPU_LE( rev data2, data2 ) | ||
223 | |||
224 | /* | ||
225 | * The MS-non-zero bit of DIFF marks either the first bit | ||
226 | * that is different, or the end of the significant data. | ||
227 | * Shifting left now will bring the critical information into the | ||
228 | * top bits. | ||
229 | */ | ||
230 | clz pos, diff | ||
231 | lsl data1, data1, pos | ||
232 | lsl data2, data2, pos | ||
233 | /* | ||
234 | * We need to zero-extend (char is unsigned) the value and then | ||
235 | * perform a signed subtraction. | ||
236 | */ | ||
237 | lsr data1, data1, #56 | ||
238 | sub result, data1, data2, lsr #56 | ||
239 | ret | ||
240 | |||
241 | .Lremain8: | ||
242 | /* Limit % 8 == 0 => all data are equal. */ | ||
243 | ands limit, limit, #7 | ||
244 | b.eq .Lret0 | ||
245 | |||
246 | .Ltiny8proc: | ||
247 | ldrb data1w, [src1], #1 | ||
248 | ldrb data2w, [src2], #1 | ||
249 | subs limit, limit, #1 | ||
250 | |||
251 | ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ | ||
252 | b.eq .Ltiny8proc | ||
253 | sub result, data1, data2 | ||
254 | ret | ||
255 | .Lret0: | ||
256 | mov result, #0 | ||
257 | ret | ||
258 | ENDPROC(memcmp) | ||
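For reference, the assembly above is an optimised equivalent of a byte-wise memcmp: the return value has the same sign as the difference between the first pair of bytes that differ (taken as unsigned chars), and is zero when the first 'limit' bytes match. A hedged C reference, useful only for cross-checking behaviour:

    #include <stddef.h>

    static int memcmp_ref(const void *s1, const void *s2, size_t n)
    {
            const unsigned char *p1 = s1, *p2 = s2;

            for (; n; n--, p1++, p2++) {
                    if (*p1 != *p2)
                            return *p1 - *p2;
            }
            return 0;
    }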
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 27b5003609b6..8a9a96d3ddae 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S | |||
@@ -1,5 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2013 ARM Ltd. | 2 | * Copyright (C) 2013 ARM Ltd. |
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
3 | * | 11 | * |
4 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 13 | * it under the terms of the GNU General Public License version 2 as |
@@ -16,6 +24,7 @@ | |||
16 | 24 | ||
17 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
18 | #include <asm/assembler.h> | 26 | #include <asm/assembler.h> |
27 | #include <asm/cache.h> | ||
19 | 28 | ||
20 | /* | 29 | /* |
21 | * Copy a buffer from src to dest (alignment handled by the hardware) | 30 | * Copy a buffer from src to dest (alignment handled by the hardware) |
@@ -27,27 +36,166 @@ | |||
27 | * Returns: | 36 | * Returns: |
28 | * x0 - dest | 37 | * x0 - dest |
29 | */ | 38 | */ |
39 | dstin .req x0 | ||
40 | src .req x1 | ||
41 | count .req x2 | ||
42 | tmp1 .req x3 | ||
43 | tmp1w .req w3 | ||
44 | tmp2 .req x4 | ||
45 | tmp2w .req w4 | ||
46 | tmp3 .req x5 | ||
47 | tmp3w .req w5 | ||
48 | dst .req x6 | ||
49 | |||
50 | A_l .req x7 | ||
51 | A_h .req x8 | ||
52 | B_l .req x9 | ||
53 | B_h .req x10 | ||
54 | C_l .req x11 | ||
55 | C_h .req x12 | ||
56 | D_l .req x13 | ||
57 | D_h .req x14 | ||
58 | |||
30 | ENTRY(memcpy) | 59 | ENTRY(memcpy) |
31 | mov x4, x0 | 60 | mov dst, dstin |
32 | subs x2, x2, #8 | 61 | cmp count, #16 |
33 | b.mi 2f | 62 | /*When the length is less than 16, the accesses may be unaligned.*/ |
34 | 1: ldr x3, [x1], #8 | 63 | b.lo .Ltiny15 |
35 | subs x2, x2, #8 | 64 | |
36 | str x3, [x4], #8 | 65 | neg tmp2, src |
37 | b.pl 1b | 66 | ands tmp2, tmp2, #15/* Bytes to reach alignment. */ |
38 | 2: adds x2, x2, #4 | 67 | b.eq .LSrcAligned |
39 | b.mi 3f | 68 | sub count, count, tmp2 |
40 | ldr w3, [x1], #4 | 69 | /* |
41 | sub x2, x2, #4 | 70 | * Copy the leading bytes from src to dst in increasing address |
42 | str w3, [x4], #4 | 71 | * order. This eliminates the risk of overwriting the source data |
43 | 3: adds x2, x2, #2 | 72 | * when the distance between src and dst is less than 16. The |
44 | b.mi 4f | 73 | * memory accesses here are aligned. |
45 | ldrh w3, [x1], #2 | 74 | */ |
46 | sub x2, x2, #2 | 75 | tbz tmp2, #0, 1f |
47 | strh w3, [x4], #2 | 76 | ldrb tmp1w, [src], #1 |
48 | 4: adds x2, x2, #1 | 77 | strb tmp1w, [dst], #1 |
49 | b.mi 5f | 78 | 1: |
50 | ldrb w3, [x1] | 79 | tbz tmp2, #1, 2f |
51 | strb w3, [x4] | 80 | ldrh tmp1w, [src], #2 |
52 | 5: ret | 81 | strh tmp1w, [dst], #2 |
82 | 2: | ||
83 | tbz tmp2, #2, 3f | ||
84 | ldr tmp1w, [src], #4 | ||
85 | str tmp1w, [dst], #4 | ||
86 | 3: | ||
87 | tbz tmp2, #3, .LSrcAligned | ||
88 | ldr tmp1, [src],#8 | ||
89 | str tmp1, [dst],#8 | ||
90 | |||
91 | .LSrcAligned: | ||
92 | cmp count, #64 | ||
93 | b.ge .Lcpy_over64 | ||
94 | /* | ||
95 | * Deal with small copies quickly by dropping straight into the | ||
96 | * exit block. | ||
97 | */ | ||
98 | .Ltail63: | ||
99 | /* | ||
100 | * Copy up to 48 bytes of data. At this point we only need the | ||
101 | * bottom 6 bits of count to be accurate. | ||
102 | */ | ||
103 | ands tmp1, count, #0x30 | ||
104 | b.eq .Ltiny15 | ||
105 | cmp tmp1w, #0x20 | ||
106 | b.eq 1f | ||
107 | b.lt 2f | ||
108 | ldp A_l, A_h, [src], #16 | ||
109 | stp A_l, A_h, [dst], #16 | ||
110 | 1: | ||
111 | ldp A_l, A_h, [src], #16 | ||
112 | stp A_l, A_h, [dst], #16 | ||
113 | 2: | ||
114 | ldp A_l, A_h, [src], #16 | ||
115 | stp A_l, A_h, [dst], #16 | ||
116 | .Ltiny15: | ||
117 | /* | ||
118 | * Prefer to break one ldp/stp into several loads/stores that access | ||
119 | * memory in increasing address order, rather than loading/storing 16 | ||
120 | * bytes from (src-16) to (dst-16) and moving src back to an aligned | ||
121 | * address, as the original cortex memcpy does. If the original memcpy | ||
122 | * approach were kept here, memmove would have to satisfy the | ||
123 | * precondition that the src address is at least 16 bytes above the dst | ||
124 | * address, otherwise some source data would be overwritten when memmove | ||
125 | * calls memcpy directly. To keep memmove simple and decouple memcpy | ||
126 | * from memmove, the original approach was dropped. | ||
127 | */ | ||
128 | tbz count, #3, 1f | ||
129 | ldr tmp1, [src], #8 | ||
130 | str tmp1, [dst], #8 | ||
131 | 1: | ||
132 | tbz count, #2, 2f | ||
133 | ldr tmp1w, [src], #4 | ||
134 | str tmp1w, [dst], #4 | ||
135 | 2: | ||
136 | tbz count, #1, 3f | ||
137 | ldrh tmp1w, [src], #2 | ||
138 | strh tmp1w, [dst], #2 | ||
139 | 3: | ||
140 | tbz count, #0, .Lexitfunc | ||
141 | ldrb tmp1w, [src] | ||
142 | strb tmp1w, [dst] | ||
143 | |||
144 | .Lexitfunc: | ||
145 | ret | ||
146 | |||
147 | .Lcpy_over64: | ||
148 | subs count, count, #128 | ||
149 | b.ge .Lcpy_body_large | ||
150 | /* | ||
151 | * Less than 128 bytes to copy, so handle 64 here and then jump | ||
152 | * to the tail. | ||
153 | */ | ||
154 | ldp A_l, A_h, [src],#16 | ||
155 | stp A_l, A_h, [dst],#16 | ||
156 | ldp B_l, B_h, [src],#16 | ||
157 | ldp C_l, C_h, [src],#16 | ||
158 | stp B_l, B_h, [dst],#16 | ||
159 | stp C_l, C_h, [dst],#16 | ||
160 | ldp D_l, D_h, [src],#16 | ||
161 | stp D_l, D_h, [dst],#16 | ||
162 | |||
163 | tst count, #0x3f | ||
164 | b.ne .Ltail63 | ||
165 | ret | ||
166 | |||
167 | /* | ||
168 | * Critical loop. Start at a new cache line boundary. Assuming | ||
169 | * 64 bytes per line this ensures the entire loop is in one line. | ||
170 | */ | ||
171 | .p2align L1_CACHE_SHIFT | ||
172 | .Lcpy_body_large: | ||
173 | /* pre-load 64 bytes of data. */ | ||
174 | ldp A_l, A_h, [src],#16 | ||
175 | ldp B_l, B_h, [src],#16 | ||
176 | ldp C_l, C_h, [src],#16 | ||
177 | ldp D_l, D_h, [src],#16 | ||
178 | 1: | ||
179 | /* | ||
180 | * Interleave the load of the next 64-byte block with the store of | ||
181 | * the last 64 bytes loaded. | ||
182 | */ | ||
183 | stp A_l, A_h, [dst],#16 | ||
184 | ldp A_l, A_h, [src],#16 | ||
185 | stp B_l, B_h, [dst],#16 | ||
186 | ldp B_l, B_h, [src],#16 | ||
187 | stp C_l, C_h, [dst],#16 | ||
188 | ldp C_l, C_h, [src],#16 | ||
189 | stp D_l, D_h, [dst],#16 | ||
190 | ldp D_l, D_h, [src],#16 | ||
191 | subs count, count, #64 | ||
192 | b.ge 1b | ||
193 | stp A_l, A_h, [dst],#16 | ||
194 | stp B_l, B_h, [dst],#16 | ||
195 | stp C_l, C_h, [dst],#16 | ||
196 | stp D_l, D_h, [dst],#16 | ||
197 | |||
198 | tst count, #0x3f | ||
199 | b.ne .Ltail63 | ||
200 | ret | ||
53 | ENDPROC(memcpy) | 201 | ENDPROC(memcpy) |
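The structure of the new memcpy is: consume the misaligned head one power-of-two chunk at a time (the tbz tests on the individual bits of the misalignment), run the bulk of the copy as an unrolled 64-byte ldp/stp loop placed on a cache-line boundary, then dispatch the tail the same bit-by-bit way. A hedged C sketch of that strategy, not the kernel implementation (the small fixed-size memcpy() calls stand in for the ldr/str and ldp/stp pairs, and the tail is simplified to a byte loop):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void *memcpy_sketch(void *dest, const void *src, size_t count)
    {
            unsigned char *d = dest;
            const unsigned char *s = src;

            if (count >= 16) {
                    size_t head = (-(uintptr_t)s) & 15;  /* bytes to align src to 16 */

                    if (head & 1) { *d++ = *s++; }
                    if (head & 2) { memcpy(d, s, 2); d += 2; s += 2; }
                    if (head & 4) { memcpy(d, s, 4); d += 4; s += 4; }
                    if (head & 8) { memcpy(d, s, 8); d += 8; s += 8; }
                    count -= head;

                    while (count >= 64) {                /* the unrolled ldp/stp loop */
                            memcpy(d, s, 64);
                            d += 64; s += 64; count -= 64;
                    }
            }
            while (count--)                              /* tail, byte-wise for brevity */
                    *d++ = *s++;
            return dest;
    }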
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S index b79fdfa42d39..57b19ea2dad4 100644 --- a/arch/arm64/lib/memmove.S +++ b/arch/arm64/lib/memmove.S | |||
@@ -1,5 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2013 ARM Ltd. | 2 | * Copyright (C) 2013 ARM Ltd. |
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
3 | * | 11 | * |
4 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 13 | * it under the terms of the GNU General Public License version 2 as |
@@ -16,6 +24,7 @@ | |||
16 | 24 | ||
17 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
18 | #include <asm/assembler.h> | 26 | #include <asm/assembler.h> |
27 | #include <asm/cache.h> | ||
19 | 28 | ||
20 | /* | 29 | /* |
21 | * Move a buffer from src to test (alignment handled by the hardware). | 30 | * Move a buffer from src to test (alignment handled by the hardware). |
@@ -28,30 +37,161 @@ | |||
28 | * Returns: | 37 | * Returns: |
29 | * x0 - dest | 38 | * x0 - dest |
30 | */ | 39 | */ |
40 | dstin .req x0 | ||
41 | src .req x1 | ||
42 | count .req x2 | ||
43 | tmp1 .req x3 | ||
44 | tmp1w .req w3 | ||
45 | tmp2 .req x4 | ||
46 | tmp2w .req w4 | ||
47 | tmp3 .req x5 | ||
48 | tmp3w .req w5 | ||
49 | dst .req x6 | ||
50 | |||
51 | A_l .req x7 | ||
52 | A_h .req x8 | ||
53 | B_l .req x9 | ||
54 | B_h .req x10 | ||
55 | C_l .req x11 | ||
56 | C_h .req x12 | ||
57 | D_l .req x13 | ||
58 | D_h .req x14 | ||
59 | |||
31 | ENTRY(memmove) | 60 | ENTRY(memmove) |
32 | cmp x0, x1 | 61 | cmp dstin, src |
33 | b.ls memcpy | 62 | b.lo memcpy |
34 | add x4, x0, x2 | 63 | add tmp1, src, count |
35 | add x1, x1, x2 | 64 | cmp dstin, tmp1 |
36 | subs x2, x2, #8 | 65 | b.hs memcpy /* No overlap. */ |
37 | b.mi 2f | 66 | |
38 | 1: ldr x3, [x1, #-8]! | 67 | add dst, dstin, count |
39 | subs x2, x2, #8 | 68 | add src, src, count |
40 | str x3, [x4, #-8]! | 69 | cmp count, #16 |
41 | b.pl 1b | 70 | b.lo .Ltail15 /*probably non-alignment accesses.*/ |
42 | 2: adds x2, x2, #4 | 71 | |
43 | b.mi 3f | 72 | ands tmp2, src, #15 /* Bytes to reach alignment. */ |
44 | ldr w3, [x1, #-4]! | 73 | b.eq .LSrcAligned |
45 | sub x2, x2, #4 | 74 | sub count, count, tmp2 |
46 | str w3, [x4, #-4]! | 75 | /* |
47 | 3: adds x2, x2, #2 | 76 | * process the misaligned head first so that src becomes aligned. |
48 | b.mi 4f | 77 | * The cost of these extra instructions is acceptable, and the |
49 | ldrh w3, [x1, #-2]! | 78 | * subsequent accesses are then based on aligned addresses. |
50 | sub x2, x2, #2 | 79 | */ |
51 | strh w3, [x4, #-2]! | 80 | tbz tmp2, #0, 1f |
52 | 4: adds x2, x2, #1 | 81 | ldrb tmp1w, [src, #-1]! |
53 | b.mi 5f | 82 | strb tmp1w, [dst, #-1]! |
54 | ldrb w3, [x1, #-1] | 83 | 1: |
55 | strb w3, [x4, #-1] | 84 | tbz tmp2, #1, 2f |
56 | 5: ret | 85 | ldrh tmp1w, [src, #-2]! |
86 | strh tmp1w, [dst, #-2]! | ||
87 | 2: | ||
88 | tbz tmp2, #2, 3f | ||
89 | ldr tmp1w, [src, #-4]! | ||
90 | str tmp1w, [dst, #-4]! | ||
91 | 3: | ||
92 | tbz tmp2, #3, .LSrcAligned | ||
93 | ldr tmp1, [src, #-8]! | ||
94 | str tmp1, [dst, #-8]! | ||
95 | |||
96 | .LSrcAligned: | ||
97 | cmp count, #64 | ||
98 | b.ge .Lcpy_over64 | ||
99 | |||
100 | /* | ||
101 | * Deal with small copies quickly by dropping straight into the | ||
102 | * exit block. | ||
103 | */ | ||
104 | .Ltail63: | ||
105 | /* | ||
106 | * Copy up to 48 bytes of data. At this point we only need the | ||
107 | * bottom 6 bits of count to be accurate. | ||
108 | */ | ||
109 | ands tmp1, count, #0x30 | ||
110 | b.eq .Ltail15 | ||
111 | cmp tmp1w, #0x20 | ||
112 | b.eq 1f | ||
113 | b.lt 2f | ||
114 | ldp A_l, A_h, [src, #-16]! | ||
115 | stp A_l, A_h, [dst, #-16]! | ||
116 | 1: | ||
117 | ldp A_l, A_h, [src, #-16]! | ||
118 | stp A_l, A_h, [dst, #-16]! | ||
119 | 2: | ||
120 | ldp A_l, A_h, [src, #-16]! | ||
121 | stp A_l, A_h, [dst, #-16]! | ||
122 | |||
123 | .Ltail15: | ||
124 | tbz count, #3, 1f | ||
125 | ldr tmp1, [src, #-8]! | ||
126 | str tmp1, [dst, #-8]! | ||
127 | 1: | ||
128 | tbz count, #2, 2f | ||
129 | ldr tmp1w, [src, #-4]! | ||
130 | str tmp1w, [dst, #-4]! | ||
131 | 2: | ||
132 | tbz count, #1, 3f | ||
133 | ldrh tmp1w, [src, #-2]! | ||
134 | strh tmp1w, [dst, #-2]! | ||
135 | 3: | ||
136 | tbz count, #0, .Lexitfunc | ||
137 | ldrb tmp1w, [src, #-1] | ||
138 | strb tmp1w, [dst, #-1] | ||
139 | |||
140 | .Lexitfunc: | ||
141 | ret | ||
142 | |||
143 | .Lcpy_over64: | ||
144 | subs count, count, #128 | ||
145 | b.ge .Lcpy_body_large | ||
146 | /* | ||
147 | * Less than 128 bytes to copy, so handle 64 bytes here and then jump | ||
148 | * to the tail. | ||
149 | */ | ||
150 | ldp A_l, A_h, [src, #-16] | ||
151 | stp A_l, A_h, [dst, #-16] | ||
152 | ldp B_l, B_h, [src, #-32] | ||
153 | ldp C_l, C_h, [src, #-48] | ||
154 | stp B_l, B_h, [dst, #-32] | ||
155 | stp C_l, C_h, [dst, #-48] | ||
156 | ldp D_l, D_h, [src, #-64]! | ||
157 | stp D_l, D_h, [dst, #-64]! | ||
158 | |||
159 | tst count, #0x3f | ||
160 | b.ne .Ltail63 | ||
161 | ret | ||
162 | |||
163 | /* | ||
164 | * Critical loop. Start at a new cache line boundary. Assuming | ||
165 | * 64 bytes per line, this ensures the entire loop is in one line. | ||
166 | */ | ||
167 | .p2align L1_CACHE_SHIFT | ||
168 | .Lcpy_body_large: | ||
169 | /* pre-load 64 bytes data. */ | ||
170 | ldp A_l, A_h, [src, #-16] | ||
171 | ldp B_l, B_h, [src, #-32] | ||
172 | ldp C_l, C_h, [src, #-48] | ||
173 | ldp D_l, D_h, [src, #-64]! | ||
174 | 1: | ||
175 | /* | ||
176 | * Interleave the load of the next 64-byte block with the store of the | ||
177 | * previously loaded 64 bytes of data. | ||
178 | */ | ||
179 | stp A_l, A_h, [dst, #-16] | ||
180 | ldp A_l, A_h, [src, #-16] | ||
181 | stp B_l, B_h, [dst, #-32] | ||
182 | ldp B_l, B_h, [src, #-32] | ||
183 | stp C_l, C_h, [dst, #-48] | ||
184 | ldp C_l, C_h, [src, #-48] | ||
185 | stp D_l, D_h, [dst, #-64]! | ||
186 | ldp D_l, D_h, [src, #-64]! | ||
187 | subs count, count, #64 | ||
188 | b.ge 1b | ||
189 | stp A_l, A_h, [dst, #-16] | ||
190 | stp B_l, B_h, [dst, #-32] | ||
191 | stp C_l, C_h, [dst, #-48] | ||
192 | stp D_l, D_h, [dst, #-64]! | ||
193 | |||
194 | tst count, #0x3f | ||
195 | b.ne .Ltail63 | ||
196 | ret | ||
57 | ENDPROC(memmove) | 197 | ENDPROC(memmove) |
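The rewritten memmove only takes the backward-copy path above when the buffers genuinely overlap with the destination above the source; every other case falls through to memcpy. A rough C sketch of that dispatch decision, with byte-wise loops standing in for the optimised assembly paths (function names here are illustrative, not kernel interfaces):

#include <stddef.h>
#include <stdint.h>

/* Byte-wise stand-ins for the optimised forward/backward copy paths. */
static void copy_forward(unsigned char *d, const unsigned char *s, size_t n)
{
	while (n--)
		*d++ = *s++;
}

static void copy_backward(unsigned char *d, const unsigned char *s, size_t n)
{
	while (n--)
		d[n] = s[n];	/* copy from the highest address downwards */
}

void *memmove_sketch(void *dstin, const void *src, size_t count)
{
	unsigned char *d = dstin;
	const unsigned char *s = src;

	/* dst below src, or dst at/after src + count: no destructive
	 * overlap, so a plain forward copy (memcpy) is safe. */
	if ((uintptr_t)d < (uintptr_t)s ||
	    (uintptr_t)d >= (uintptr_t)s + count)
		copy_forward(d, s, count);
	else
		copy_backward(d, s, count);
	return dstin;
}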
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index 87e4a68fbbbc..7c72dfd36b63 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S | |||
@@ -1,5 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2013 ARM Ltd. | 2 | * Copyright (C) 2013 ARM Ltd. |
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
3 | * | 11 | * |
4 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 13 | * it under the terms of the GNU General Public License version 2 as |
@@ -16,6 +24,7 @@ | |||
16 | 24 | ||
17 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
18 | #include <asm/assembler.h> | 26 | #include <asm/assembler.h> |
27 | #include <asm/cache.h> | ||
19 | 28 | ||
20 | /* | 29 | /* |
21 | * Fill in the buffer with character c (alignment handled by the hardware) | 30 | * Fill in the buffer with character c (alignment handled by the hardware) |
@@ -27,27 +36,181 @@ | |||
27 | * Returns: | 36 | * Returns: |
28 | * x0 - buf | 37 | * x0 - buf |
29 | */ | 38 | */ |
39 | |||
40 | dstin .req x0 | ||
41 | val .req w1 | ||
42 | count .req x2 | ||
43 | tmp1 .req x3 | ||
44 | tmp1w .req w3 | ||
45 | tmp2 .req x4 | ||
46 | tmp2w .req w4 | ||
47 | zva_len_x .req x5 | ||
48 | zva_len .req w5 | ||
49 | zva_bits_x .req x6 | ||
50 | |||
51 | A_l .req x7 | ||
52 | A_lw .req w7 | ||
53 | dst .req x8 | ||
54 | tmp3w .req w9 | ||
55 | tmp3 .req x9 | ||
56 | |||
30 | ENTRY(memset) | 57 | ENTRY(memset) |
31 | mov x4, x0 | 58 | mov dst, dstin /* Preserve return value. */ |
32 | and w1, w1, #0xff | 59 | and A_lw, val, #255 |
33 | orr w1, w1, w1, lsl #8 | 60 | orr A_lw, A_lw, A_lw, lsl #8 |
34 | orr w1, w1, w1, lsl #16 | 61 | orr A_lw, A_lw, A_lw, lsl #16 |
35 | orr x1, x1, x1, lsl #32 | 62 | orr A_l, A_l, A_l, lsl #32 |
36 | subs x2, x2, #8 | 63 | |
37 | b.mi 2f | 64 | cmp count, #15 |
38 | 1: str x1, [x4], #8 | 65 | b.hi .Lover16_proc |
39 | subs x2, x2, #8 | 66 | /* All the stores below may be unaligned. */ |
40 | b.pl 1b | 67 | tbz count, #3, 1f |
41 | 2: adds x2, x2, #4 | 68 | str A_l, [dst], #8 |
42 | b.mi 3f | 69 | 1: |
43 | sub x2, x2, #4 | 70 | tbz count, #2, 2f |
44 | str w1, [x4], #4 | 71 | str A_lw, [dst], #4 |
45 | 3: adds x2, x2, #2 | 72 | 2: |
46 | b.mi 4f | 73 | tbz count, #1, 3f |
47 | sub x2, x2, #2 | 74 | strh A_lw, [dst], #2 |
48 | strh w1, [x4], #2 | 75 | 3: |
49 | 4: adds x2, x2, #1 | 76 | tbz count, #0, 4f |
50 | b.mi 5f | 77 | strb A_lw, [dst] |
51 | strb w1, [x4] | 78 | 4: |
52 | 5: ret | 79 | ret |
80 | |||
81 | .Lover16_proc: | ||
82 | /* Check whether the start address is 16-byte aligned. */ | ||
83 | neg tmp2, dst | ||
84 | ands tmp2, tmp2, #15 | ||
85 | b.eq .Laligned | ||
86 | /* | ||
87 | * The count is not less than 16, we can use stp to store the start 16 bytes, | ||
88 | * then adjust the dst aligned with 16.This process will make the current | ||
89 | * memory address at alignment boundary. | ||
90 | */ | ||
91 | stp A_l, A_l, [dst] /* possibly unaligned store */ | ||
92 | /* make dst 16-byte aligned */ | ||
93 | sub count, count, tmp2 | ||
94 | add dst, dst, tmp2 | ||
95 | |||
96 | .Laligned: | ||
97 | cbz A_l, .Lzero_mem | ||
98 | |||
99 | .Ltail_maybe_long: | ||
100 | cmp count, #64 | ||
101 | b.ge .Lnot_short | ||
102 | .Ltail63: | ||
103 | ands tmp1, count, #0x30 | ||
104 | b.eq 3f | ||
105 | cmp tmp1w, #0x20 | ||
106 | b.eq 1f | ||
107 | b.lt 2f | ||
108 | stp A_l, A_l, [dst], #16 | ||
109 | 1: | ||
110 | stp A_l, A_l, [dst], #16 | ||
111 | 2: | ||
112 | stp A_l, A_l, [dst], #16 | ||
113 | /* | ||
114 | * The remaining length is less than 16; use stp to write the last 16 bytes. | ||
115 | * This writes some bytes twice and the access may be unaligned. | ||
116 | */ | ||
117 | 3: | ||
118 | ands count, count, #15 | ||
119 | cbz count, 4f | ||
120 | add dst, dst, count | ||
121 | stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ | ||
122 | 4: | ||
123 | ret | ||
124 | |||
125 | /* | ||
126 | * Critical loop. Start at a new cache line boundary. Assuming | ||
127 | * 64 bytes per line, this ensures the entire loop is in one line. | ||
128 | */ | ||
129 | .p2align L1_CACHE_SHIFT | ||
130 | .Lnot_short: | ||
131 | sub dst, dst, #16/* Pre-bias. */ | ||
132 | sub count, count, #64 | ||
133 | 1: | ||
134 | stp A_l, A_l, [dst, #16] | ||
135 | stp A_l, A_l, [dst, #32] | ||
136 | stp A_l, A_l, [dst, #48] | ||
137 | stp A_l, A_l, [dst, #64]! | ||
138 | subs count, count, #64 | ||
139 | b.ge 1b | ||
140 | tst count, #0x3f | ||
141 | add dst, dst, #16 | ||
142 | b.ne .Ltail63 | ||
143 | .Lexitfunc: | ||
144 | ret | ||
145 | |||
146 | /* | ||
147 | * For zeroing memory, check to see if we can use the ZVA feature to | ||
148 | * zero entire 'cache' lines. | ||
149 | */ | ||
150 | .Lzero_mem: | ||
151 | cmp count, #63 | ||
152 | b.le .Ltail63 | ||
153 | /* | ||
154 | * For zeroing small amounts of memory, it's not worth setting up | ||
155 | * the line-clear code. | ||
156 | */ | ||
157 | cmp count, #128 | ||
158 | b.lt .Lnot_short /*count is at least 128 bytes*/ | ||
159 | |||
160 | mrs tmp1, dczid_el0 | ||
161 | tbnz tmp1, #4, .Lnot_short | ||
162 | mov tmp3w, #4 | ||
163 | and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ | ||
164 | lsl zva_len, tmp3w, zva_len | ||
165 | |||
166 | ands tmp3w, zva_len, #63 | ||
167 | /* | ||
168 | * Ensure zva_len is at least 64: using DC ZVA is not worthwhile | ||
169 | * if the block size is smaller than 64 bytes. | ||
170 | */ | ||
171 | b.ne .Lnot_short | ||
172 | .Lzero_by_line: | ||
173 | /* | ||
174 | * Compute how far we need to go to become suitably aligned. We're | ||
175 | * already at quad-word alignment. | ||
176 | */ | ||
177 | cmp count, zva_len_x | ||
178 | b.lt .Lnot_short /* Not enough to reach alignment. */ | ||
179 | sub zva_bits_x, zva_len_x, #1 | ||
180 | neg tmp2, dst | ||
181 | ands tmp2, tmp2, zva_bits_x | ||
182 | b.eq 2f /* Already aligned. */ | ||
183 | /* Not aligned, check that there's enough to copy after alignment.*/ | ||
184 | sub tmp1, count, tmp2 | ||
185 | /* | ||
186 | * Guarantee that the length remaining for ZVA is at least 64 bytes, | ||
187 | * so that the processing at 2f cannot run past the end of the buffer. */ | ||
188 | cmp tmp1, #64 | ||
189 | ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ | ||
190 | b.lt .Lnot_short | ||
191 | /* | ||
192 | * We know that there's at least 64 bytes to zero and that it's safe | ||
193 | * to overrun by 64 bytes. | ||
194 | */ | ||
195 | mov count, tmp1 | ||
196 | 1: | ||
197 | stp A_l, A_l, [dst] | ||
198 | stp A_l, A_l, [dst, #16] | ||
199 | stp A_l, A_l, [dst, #32] | ||
200 | subs tmp2, tmp2, #64 | ||
201 | stp A_l, A_l, [dst, #48] | ||
202 | add dst, dst, #64 | ||
203 | b.ge 1b | ||
204 | /* We've overrun a bit, so adjust dst downwards.*/ | ||
205 | add dst, dst, tmp2 | ||
206 | 2: | ||
207 | sub count, count, zva_len_x | ||
208 | 3: | ||
209 | dc zva, dst | ||
210 | add dst, dst, zva_len_x | ||
211 | subs count, count, zva_len_x | ||
212 | b.ge 3b | ||
213 | ands count, count, zva_bits_x | ||
214 | b.ne .Ltail_maybe_long | ||
215 | ret | ||
53 | ENDPROC(memset) | 216 | ENDPROC(memset) |
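Two details of the new memset are easier to see in C than in the assembly above: how the fill byte is replicated into a 64-bit pattern, and how DCZID_EL0 is decoded before the DC ZVA zeroing path is used. A small hedged sketch (names are illustrative; DCZID_EL0 is really read with mrs, as in the code above):

#include <stdint.h>

/* Replicate the fill byte across a 64-bit value, mirroring the orr/lsl
 * sequence at the top of memset. */
uint64_t replicate_byte(uint8_t c)
{
	uint64_t v = c;

	v |= v << 8;
	v |= v << 16;
	v |= v << 32;
	return v;			/* e.g. 0xabab...ab for c == 0xab */
}

/* Decode DCZID_EL0 as .Lzero_mem does: bit 4 (DZP) prohibits DC ZVA, and
 * bits [3:0] give log2 of the block size in 4-byte words, so the block size
 * in bytes is 4 << BS.  The zeroing path is only used when this is at least
 * 64 bytes. */
unsigned int zva_block_bytes(uint64_t dczid)
{
	if (dczid & (1u << 4))
		return 0;		/* DC ZVA not usable */
	return 4u << (dczid & 0xf);
}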
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 000000000000..42f828b06c59 --- /dev/null +++ b/arch/arm64/lib/strcmp.S | |||
@@ -0,0 +1,234 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * compare two strings | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string 1 pointer | ||
33 | * x1 - const string 2 pointer | ||
34 | * Returns: | ||
35 | * x0 - an integer less than, equal to, or greater than zero | ||
36 | * if s1 is found, respectively, to be less than, to match, | ||
37 | * or be greater than s2. | ||
38 | */ | ||
39 | |||
40 | #define REP8_01 0x0101010101010101 | ||
41 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
42 | #define REP8_80 0x8080808080808080 | ||
43 | |||
44 | /* Parameters and result. */ | ||
45 | src1 .req x0 | ||
46 | src2 .req x1 | ||
47 | result .req x0 | ||
48 | |||
49 | /* Internal variables. */ | ||
50 | data1 .req x2 | ||
51 | data1w .req w2 | ||
52 | data2 .req x3 | ||
53 | data2w .req w3 | ||
54 | has_nul .req x4 | ||
55 | diff .req x5 | ||
56 | syndrome .req x6 | ||
57 | tmp1 .req x7 | ||
58 | tmp2 .req x8 | ||
59 | tmp3 .req x9 | ||
60 | zeroones .req x10 | ||
61 | pos .req x11 | ||
62 | |||
63 | ENTRY(strcmp) | ||
64 | eor tmp1, src1, src2 | ||
65 | mov zeroones, #REP8_01 | ||
66 | tst tmp1, #7 | ||
67 | b.ne .Lmisaligned8 | ||
68 | ands tmp1, src1, #7 | ||
69 | b.ne .Lmutual_align | ||
70 | |||
71 | /* | ||
72 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
73 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
74 | * can be done in parallel across the entire word. | ||
75 | */ | ||
76 | .Lloop_aligned: | ||
77 | ldr data1, [src1], #8 | ||
78 | ldr data2, [src2], #8 | ||
79 | .Lstart_realigned: | ||
80 | sub tmp1, data1, zeroones | ||
81 | orr tmp2, data1, #REP8_7f | ||
82 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
83 | bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ | ||
84 | orr syndrome, diff, has_nul | ||
85 | cbz syndrome, .Lloop_aligned | ||
86 | b .Lcal_cmpresult | ||
87 | |||
88 | .Lmutual_align: | ||
89 | /* | ||
90 | * Sources are mutually aligned, but are not currently at an | ||
91 | * alignment boundary. Round down the addresses and then mask off | ||
92 | * the bytes that precede the start point. | ||
93 | */ | ||
94 | bic src1, src1, #7 | ||
95 | bic src2, src2, #7 | ||
96 | lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ | ||
97 | ldr data1, [src1], #8 | ||
98 | neg tmp1, tmp1 /* Bits to alignment -64. */ | ||
99 | ldr data2, [src2], #8 | ||
100 | mov tmp2, #~0 | ||
101 | /* Big-endian. Early bytes are at MSB. */ | ||
102 | CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
103 | /* Little-endian. Early bytes are at LSB. */ | ||
104 | CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
105 | |||
106 | orr data1, data1, tmp2 | ||
107 | orr data2, data2, tmp2 | ||
108 | b .Lstart_realigned | ||
109 | |||
110 | .Lmisaligned8: | ||
111 | /* | ||
112 | * Work out how many leading bytes to compare one at a time, so that | ||
113 | * afterwards at least one string's address is aligned. | ||
114 | */ | ||
115 | and tmp1, src1, #7 | ||
116 | neg tmp1, tmp1 | ||
117 | add tmp1, tmp1, #8 | ||
118 | and tmp2, src2, #7 | ||
119 | neg tmp2, tmp2 | ||
120 | add tmp2, tmp2, #8 | ||
121 | subs tmp3, tmp1, tmp2 | ||
122 | csel pos, tmp1, tmp2, hi /*Choose the maximum. */ | ||
123 | .Ltinycmp: | ||
124 | ldrb data1w, [src1], #1 | ||
125 | ldrb data2w, [src2], #1 | ||
126 | subs pos, pos, #1 | ||
127 | ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ | ||
128 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | ||
129 | b.eq .Ltinycmp | ||
130 | cbnz pos, 1f /* found a NUL or a mismatch */ | ||
131 | cmp data1w, #1 | ||
132 | ccmp data1w, data2w, #0, cs | ||
133 | b.eq .Lstart_align /* the last bytes compared equal */ | ||
134 | 1: | ||
135 | sub result, data1, data2 | ||
136 | ret | ||
137 | |||
138 | .Lstart_align: | ||
139 | ands xzr, src1, #7 | ||
140 | b.eq .Lrecal_offset | ||
141 | /* process more leading bytes to make src1 aligned */ | ||
142 | add src1, src1, tmp3 | ||
143 | add src2, src2, tmp3 | ||
144 | /* load 8 bytes from the aligned src1 and the unaligned src2 */ | ||
145 | ldr data1, [src1], #8 | ||
146 | ldr data2, [src2], #8 | ||
147 | |||
148 | sub tmp1, data1, zeroones | ||
149 | orr tmp2, data1, #REP8_7f | ||
150 | bic has_nul, tmp1, tmp2 | ||
151 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
152 | orr syndrome, diff, has_nul | ||
153 | cbnz syndrome, .Lcal_cmpresult | ||
154 | /* How far is the current src2 from the alignment boundary? */ | ||
155 | and tmp3, tmp3, #7 | ||
156 | .Lrecal_offset: | ||
157 | neg pos, tmp3 | ||
158 | .Lloopcmp_proc: | ||
159 | /* | ||
160 | * Divide the eight bytes into two parts. First, back src2 up to an | ||
161 | * alignment boundary, load eight bytes from that aligned SRC2 address, | ||
162 | * then compare them with the corresponding bytes from SRC1. | ||
163 | * If all 8 bytes are equal, start the second part of the comparison; | ||
164 | * otherwise finish the comparison here. | ||
165 | * This special handling guarantees that every access stays within the | ||
166 | * thread/task address space, avoiding any out-of-range access. | ||
167 | */ | ||
168 | ldr data1, [src1,pos] | ||
169 | ldr data2, [src2,pos] | ||
170 | sub tmp1, data1, zeroones | ||
171 | orr tmp2, data1, #REP8_7f | ||
172 | bic has_nul, tmp1, tmp2 | ||
173 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
174 | orr syndrome, diff, has_nul | ||
175 | cbnz syndrome, .Lcal_cmpresult | ||
176 | |||
177 | /*The second part process*/ | ||
178 | ldr data1, [src1], #8 | ||
179 | ldr data2, [src2], #8 | ||
180 | sub tmp1, data1, zeroones | ||
181 | orr tmp2, data1, #REP8_7f | ||
182 | bic has_nul, tmp1, tmp2 | ||
183 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
184 | orr syndrome, diff, has_nul | ||
185 | cbz syndrome, .Lloopcmp_proc | ||
186 | |||
187 | .Lcal_cmpresult: | ||
188 | /* | ||
189 | * Byte-reverse the values to big-endian order so that CLZ can locate | ||
190 | * the first differing or NUL byte. | ||
191 | */ | ||
192 | CPU_LE( rev syndrome, syndrome ) | ||
193 | CPU_LE( rev data1, data1 ) | ||
194 | CPU_LE( rev data2, data2 ) | ||
195 | |||
196 | /* | ||
197 | * For big-endian we cannot use the trick with the syndrome value | ||
198 | * as carry-propagation can corrupt the upper bits if the trailing | ||
199 | * bytes in the string contain 0x01. | ||
200 | * However, if there is no NUL byte in the dword, we can generate | ||
201 | * the result directly. We cannot just subtract the bytes as the | ||
202 | * MSB might be significant. | ||
203 | */ | ||
204 | CPU_BE( cbnz has_nul, 1f ) | ||
205 | CPU_BE( cmp data1, data2 ) | ||
206 | CPU_BE( cset result, ne ) | ||
207 | CPU_BE( cneg result, result, lo ) | ||
208 | CPU_BE( ret ) | ||
209 | CPU_BE( 1: ) | ||
210 | /*Re-compute the NUL-byte detection, using a byte-reversed value. */ | ||
211 | CPU_BE( rev tmp3, data1 ) | ||
212 | CPU_BE( sub tmp1, tmp3, zeroones ) | ||
213 | CPU_BE( orr tmp2, tmp3, #REP8_7f ) | ||
214 | CPU_BE( bic has_nul, tmp1, tmp2 ) | ||
215 | CPU_BE( rev has_nul, has_nul ) | ||
216 | CPU_BE( orr syndrome, diff, has_nul ) | ||
217 | |||
218 | clz pos, syndrome | ||
219 | /* | ||
220 | * The MS-non-zero bit of the syndrome marks either the first bit | ||
221 | * that is different, or the top bit of the first zero byte. | ||
222 | * Shifting left now will bring the critical information into the | ||
223 | * top bits. | ||
224 | */ | ||
225 | lsl data1, data1, pos | ||
226 | lsl data2, data2, pos | ||
227 | /* | ||
228 | * But we need to zero-extend (char is unsigned) the value and then | ||
229 | * perform a signed 32-bit subtraction. | ||
230 | */ | ||
231 | lsr data1, data1, #56 | ||
232 | sub result, data1, data2, lsr #56 | ||
233 | ret | ||
234 | ENDPROC(strcmp) | ||
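The core of strcmp is the word-at-a-time NUL/difference detector described in the comments above, followed by the byte-reverse-and-CLZ step that turns the syndrome into a signed result. A little-endian C sketch of both steps, assuming 8-byte-aligned inputs and using compiler builtins in place of the rev and clz instructions:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* (x - 0x01..01) & ~(x | 0x7f..7f) has the top bit of a byte set iff that
 * byte of x is zero, evaluated for all eight bytes at once. */
static uint64_t has_nul_bytes(uint64_t x)
{
	return (x - REP8_01) & ~(x | REP8_7f);
}

/* Little-endian equivalent of .Lcal_cmpresult: byte-reverse the syndrome so
 * CLZ finds the first interesting byte, shift both data words so that byte
 * reaches the top, then subtract the top bytes. */
int cmp_words_le(uint64_t data1, uint64_t data2)
{
	uint64_t syndrome = (data1 ^ data2) | has_nul_bytes(data1);
	unsigned int pos;

	if (!syndrome)
		return 0;	/* all eight bytes equal, no NUL seen yet */

	pos = __builtin_clzll(__builtin_bswap64(syndrome));
	data1 = __builtin_bswap64(data1) << pos;
	data2 = __builtin_bswap64(data2) << pos;
	return (int)(data1 >> 56) - (int)(data2 >> 56);
}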
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 000000000000..987b68b9ce44 --- /dev/null +++ b/arch/arm64/lib/strlen.S | |||
@@ -0,0 +1,126 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * calculate the length of a string | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string pointer | ||
33 | * Returns: | ||
34 | * x0 - the return length of specific string | ||
35 | */ | ||
36 | |||
37 | /* Arguments and results. */ | ||
38 | srcin .req x0 | ||
39 | len .req x0 | ||
40 | |||
41 | /* Locals and temporaries. */ | ||
42 | src .req x1 | ||
43 | data1 .req x2 | ||
44 | data2 .req x3 | ||
45 | data2a .req x4 | ||
46 | has_nul1 .req x5 | ||
47 | has_nul2 .req x6 | ||
48 | tmp1 .req x7 | ||
49 | tmp2 .req x8 | ||
50 | tmp3 .req x9 | ||
51 | tmp4 .req x10 | ||
52 | zeroones .req x11 | ||
53 | pos .req x12 | ||
54 | |||
55 | #define REP8_01 0x0101010101010101 | ||
56 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
57 | #define REP8_80 0x8080808080808080 | ||
58 | |||
59 | ENTRY(strlen) | ||
60 | mov zeroones, #REP8_01 | ||
61 | bic src, srcin, #15 | ||
62 | ands tmp1, srcin, #15 | ||
63 | b.ne .Lmisaligned | ||
64 | /* | ||
65 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
66 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
67 | * can be done in parallel across the entire word. | ||
68 | */ | ||
69 | /* | ||
70 | * The inner loop deals with two Dwords at a time. This has a | ||
71 | * slightly higher start-up cost, but we should win quite quickly, | ||
72 | * especially on cores with a high number of issue slots per | ||
73 | * cycle, as we get much better parallelism out of the operations. | ||
74 | */ | ||
75 | .Lloop: | ||
76 | ldp data1, data2, [src], #16 | ||
77 | .Lrealigned: | ||
78 | sub tmp1, data1, zeroones | ||
79 | orr tmp2, data1, #REP8_7f | ||
80 | sub tmp3, data2, zeroones | ||
81 | orr tmp4, data2, #REP8_7f | ||
82 | bic has_nul1, tmp1, tmp2 | ||
83 | bics has_nul2, tmp3, tmp4 | ||
84 | ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ | ||
85 | b.eq .Lloop | ||
86 | |||
87 | sub len, src, srcin | ||
88 | cbz has_nul1, .Lnul_in_data2 | ||
89 | CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ | ||
90 | sub len, len, #8 | ||
91 | mov has_nul2, has_nul1 | ||
92 | .Lnul_in_data2: | ||
93 | /* | ||
94 | * For big-endian, carry propagation (if the final byte in the | ||
95 | * string is 0x01) means we cannot use has_nul directly. The | ||
96 | * easiest way to get the correct byte is to byte-swap the data | ||
97 | * and calculate the syndrome a second time. | ||
98 | */ | ||
99 | CPU_BE( rev data2, data2 ) | ||
100 | CPU_BE( sub tmp1, data2, zeroones ) | ||
101 | CPU_BE( orr tmp2, data2, #REP8_7f ) | ||
102 | CPU_BE( bic has_nul2, tmp1, tmp2 ) | ||
103 | |||
104 | sub len, len, #8 | ||
105 | rev has_nul2, has_nul2 | ||
106 | clz pos, has_nul2 | ||
107 | add len, len, pos, lsr #3 /* Bits to bytes. */ | ||
108 | ret | ||
109 | |||
110 | .Lmisaligned: | ||
111 | cmp tmp1, #8 | ||
112 | neg tmp1, tmp1 | ||
113 | ldp data1, data2, [src], #16 | ||
114 | lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ | ||
115 | mov tmp2, #~0 | ||
116 | /* Big-endian. Early bytes are at MSB. */ | ||
117 | CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
118 | /* Little-endian. Early bytes are at LSB. */ | ||
119 | CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
120 | |||
121 | orr data1, data1, tmp2 | ||
122 | orr data2a, data2, tmp2 | ||
123 | csinv data1, data1, xzr, le | ||
124 | csel data2, data2, data2a, le | ||
125 | b .Lrealigned | ||
126 | ENDPROC(strlen) | ||
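strlen uses the same detector and then converts the marker in the final dword into a byte offset with a byte-reverse plus CLZ. A simplified little-endian C sketch (one dword per iteration instead of two, and assuming an 8-byte-aligned string, so the misalignment masking above is not needed):

#include <stddef.h>
#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

size_t strlen_sketch(const char *s)
{
	const uint64_t *p = (const uint64_t *)s;	/* assumed aligned */
	size_t len = 0;
	uint64_t word, nul;

	for (;;) {
		word = *p++;
		nul = (word - REP8_01) & ~(word | REP8_7f);
		if (nul)
			break;			/* this dword holds the NUL */
		len += 8;
	}
	/* CLZ of the byte-reversed marker counts the bits, and hence the
	 * bytes, that precede the first NUL within this dword. */
	return len + (__builtin_clzll(__builtin_bswap64(nul)) >> 3);
}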
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 000000000000..0224cf5a5533 --- /dev/null +++ b/arch/arm64/lib/strncmp.S | |||
@@ -0,0 +1,310 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * compare two strings | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string 1 pointer | ||
33 | * x1 - const string 2 pointer | ||
34 | * x2 - the maximal length to be compared | ||
35 | * Returns: | ||
36 | * x0 - an integer less than, equal to, or greater than zero if s1 is found, | ||
37 | * respectively, to be less than, to match, or be greater than s2. | ||
38 | */ | ||
39 | |||
40 | #define REP8_01 0x0101010101010101 | ||
41 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
42 | #define REP8_80 0x8080808080808080 | ||
43 | |||
44 | /* Parameters and result. */ | ||
45 | src1 .req x0 | ||
46 | src2 .req x1 | ||
47 | limit .req x2 | ||
48 | result .req x0 | ||
49 | |||
50 | /* Internal variables. */ | ||
51 | data1 .req x3 | ||
52 | data1w .req w3 | ||
53 | data2 .req x4 | ||
54 | data2w .req w4 | ||
55 | has_nul .req x5 | ||
56 | diff .req x6 | ||
57 | syndrome .req x7 | ||
58 | tmp1 .req x8 | ||
59 | tmp2 .req x9 | ||
60 | tmp3 .req x10 | ||
61 | zeroones .req x11 | ||
62 | pos .req x12 | ||
63 | limit_wd .req x13 | ||
64 | mask .req x14 | ||
65 | endloop .req x15 | ||
66 | |||
67 | ENTRY(strncmp) | ||
68 | cbz limit, .Lret0 | ||
69 | eor tmp1, src1, src2 | ||
70 | mov zeroones, #REP8_01 | ||
71 | tst tmp1, #7 | ||
72 | b.ne .Lmisaligned8 | ||
73 | ands tmp1, src1, #7 | ||
74 | b.ne .Lmutual_align | ||
75 | /* Calculate the number of full and partial words -1. */ | ||
76 | /* | ||
77 | * When limit is a multiple of 8, failing to subtract 1 here would | ||
78 | * make the handling of the last dword go wrong. | ||
79 | */ | ||
80 | sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ | ||
81 | lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ | ||
82 | |||
83 | /* | ||
84 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
85 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
86 | * can be done in parallel across the entire word. | ||
87 | */ | ||
88 | .Lloop_aligned: | ||
89 | ldr data1, [src1], #8 | ||
90 | ldr data2, [src2], #8 | ||
91 | .Lstart_realigned: | ||
92 | subs limit_wd, limit_wd, #1 | ||
93 | sub tmp1, data1, zeroones | ||
94 | orr tmp2, data1, #REP8_7f | ||
95 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
96 | csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ | ||
97 | bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ | ||
98 | ccmp endloop, #0, #0, eq | ||
99 | b.eq .Lloop_aligned | ||
100 | |||
101 | /*Not reached the limit, must have found the end or a diff. */ | ||
102 | tbz limit_wd, #63, .Lnot_limit | ||
103 | |||
104 | /* Limit % 8 == 0 => all bytes significant. */ | ||
105 | ands limit, limit, #7 | ||
106 | b.eq .Lnot_limit | ||
107 | |||
108 | lsl limit, limit, #3 /* Bits -> bytes. */ | ||
109 | mov mask, #~0 | ||
110 | CPU_BE( lsr mask, mask, limit ) | ||
111 | CPU_LE( lsl mask, mask, limit ) | ||
112 | bic data1, data1, mask | ||
113 | bic data2, data2, mask | ||
114 | |||
115 | /* Make sure that the NUL byte is marked in the syndrome. */ | ||
116 | orr has_nul, has_nul, mask | ||
117 | |||
118 | .Lnot_limit: | ||
119 | orr syndrome, diff, has_nul | ||
120 | b .Lcal_cmpresult | ||
121 | |||
122 | .Lmutual_align: | ||
123 | /* | ||
124 | * Sources are mutually aligned, but are not currently at an | ||
125 | * alignment boundary. Round down the addresses and then mask off | ||
126 | * the bytes that precede the start point. | ||
127 | * We also need to adjust the limit calculations, but without | ||
128 | * overflowing if the limit is near ULONG_MAX. | ||
129 | */ | ||
130 | bic src1, src1, #7 | ||
131 | bic src2, src2, #7 | ||
132 | ldr data1, [src1], #8 | ||
133 | neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ | ||
134 | ldr data2, [src2], #8 | ||
135 | mov tmp2, #~0 | ||
136 | sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ | ||
137 | /* Big-endian. Early bytes are at MSB. */ | ||
138 | CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ | ||
139 | /* Little-endian. Early bytes are at LSB. */ | ||
140 | CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ | ||
141 | |||
142 | and tmp3, limit_wd, #7 | ||
143 | lsr limit_wd, limit_wd, #3 | ||
144 | /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ | ||
145 | add limit, limit, tmp1 | ||
146 | add tmp3, tmp3, tmp1 | ||
147 | orr data1, data1, tmp2 | ||
148 | orr data2, data2, tmp2 | ||
149 | add limit_wd, limit_wd, tmp3, lsr #3 | ||
150 | b .Lstart_realigned | ||
151 | |||
152 | /*when src1 offset is not equal to src2 offset...*/ | ||
153 | .Lmisaligned8: | ||
154 | cmp limit, #8 | ||
155 | b.lo .Ltiny8proc /*limit < 8... */ | ||
156 | /* | ||
157 | * Work out how many leading bytes to compare one at a time, so that | ||
158 | * afterwards at least one string's address is aligned. */ | ||
159 | and tmp1, src1, #7 | ||
160 | neg tmp1, tmp1 | ||
161 | add tmp1, tmp1, #8 | ||
162 | and tmp2, src2, #7 | ||
163 | neg tmp2, tmp2 | ||
164 | add tmp2, tmp2, #8 | ||
165 | subs tmp3, tmp1, tmp2 | ||
166 | csel pos, tmp1, tmp2, hi /*Choose the maximum. */ | ||
167 | /* | ||
168 | * Here, limit is at least 8, so run .Ltinycmp directly | ||
169 | * without checking the limit. */ | ||
170 | sub limit, limit, pos | ||
171 | .Ltinycmp: | ||
172 | ldrb data1w, [src1], #1 | ||
173 | ldrb data2w, [src2], #1 | ||
174 | subs pos, pos, #1 | ||
175 | ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ | ||
176 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | ||
177 | b.eq .Ltinycmp | ||
178 | cbnz pos, 1f /* found a NUL or a mismatch */ | ||
179 | cmp data1w, #1 | ||
180 | ccmp data1w, data2w, #0, cs | ||
181 | b.eq .Lstart_align /* the last bytes compared equal */ | ||
182 | 1: | ||
183 | sub result, data1, data2 | ||
184 | ret | ||
185 | |||
186 | .Lstart_align: | ||
187 | lsr limit_wd, limit, #3 | ||
188 | cbz limit_wd, .Lremain8 | ||
189 | /* process more leading bytes to make src1 aligned */ | ||
190 | ands xzr, src1, #7 | ||
191 | b.eq .Lrecal_offset | ||
192 | add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ | ||
193 | add src2, src2, tmp3 | ||
194 | ldr data1, [src1], #8 | ||
195 | ldr data2, [src2], #8 | ||
196 | |||
197 | sub limit, limit, tmp3 | ||
198 | lsr limit_wd, limit, #3 | ||
199 | subs limit_wd, limit_wd, #1 | ||
200 | |||
201 | sub tmp1, data1, zeroones | ||
202 | orr tmp2, data1, #REP8_7f | ||
203 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
204 | csinv endloop, diff, xzr, ne /* if limit_wd is 0, finish the comparison */ | ||
205 | bics has_nul, tmp1, tmp2 | ||
206 | ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte */ | ||
207 | b.ne .Lunequal_proc | ||
208 | /* How far is the current src2 from the alignment boundary? */ | ||
209 | and tmp3, tmp3, #7 | ||
210 | .Lrecal_offset: | ||
211 | neg pos, tmp3 | ||
212 | .Lloopcmp_proc: | ||
213 | /* | ||
214 | * Divide the eight bytes into two parts. First, back src2 up to an | ||
215 | * alignment boundary, load eight bytes from that aligned SRC2 address, | ||
216 | * then compare them with the corresponding bytes from SRC1. | ||
217 | * If all 8 bytes are equal, start the second part of the comparison; | ||
218 | * otherwise finish the comparison here. | ||
219 | * This special handling guarantees that every access stays within the | ||
220 | * thread/task address space, avoiding any out-of-range access. | ||
221 | */ | ||
222 | ldr data1, [src1,pos] | ||
223 | ldr data2, [src2,pos] | ||
224 | sub tmp1, data1, zeroones | ||
225 | orr tmp2, data1, #REP8_7f | ||
226 | bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ | ||
227 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
228 | csinv endloop, diff, xzr, eq | ||
229 | cbnz endloop, .Lunequal_proc | ||
230 | |||
231 | /*The second part process*/ | ||
232 | ldr data1, [src1], #8 | ||
233 | ldr data2, [src2], #8 | ||
234 | subs limit_wd, limit_wd, #1 | ||
235 | sub tmp1, data1, zeroones | ||
236 | orr tmp2, data1, #REP8_7f | ||
237 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
238 | csinv endloop, diff, xzr, ne /* if limit_wd is 0, finish the comparison */ | ||
239 | bics has_nul, tmp1, tmp2 | ||
240 | ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte */ | ||
241 | b.eq .Lloopcmp_proc | ||
242 | |||
243 | .Lunequal_proc: | ||
244 | orr syndrome, diff, has_nul | ||
245 | cbz syndrome, .Lremain8 | ||
246 | .Lcal_cmpresult: | ||
247 | /* | ||
248 | * Byte-reverse the values to big-endian order so that CLZ can locate | ||
249 | * the first differing or NUL byte. | ||
250 | */ | ||
251 | CPU_LE( rev syndrome, syndrome ) | ||
252 | CPU_LE( rev data1, data1 ) | ||
253 | CPU_LE( rev data2, data2 ) | ||
254 | /* | ||
255 | * For big-endian we cannot use the trick with the syndrome value | ||
256 | * as carry-propagation can corrupt the upper bits if the trailing | ||
257 | * bytes in the string contain 0x01. | ||
258 | * However, if there is no NUL byte in the dword, we can generate | ||
259 | * the result directly. We can't just subtract the bytes as the | ||
260 | * MSB might be significant. | ||
261 | */ | ||
262 | CPU_BE( cbnz has_nul, 1f ) | ||
263 | CPU_BE( cmp data1, data2 ) | ||
264 | CPU_BE( cset result, ne ) | ||
265 | CPU_BE( cneg result, result, lo ) | ||
266 | CPU_BE( ret ) | ||
267 | CPU_BE( 1: ) | ||
268 | /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ | ||
269 | CPU_BE( rev tmp3, data1 ) | ||
270 | CPU_BE( sub tmp1, tmp3, zeroones ) | ||
271 | CPU_BE( orr tmp2, tmp3, #REP8_7f ) | ||
272 | CPU_BE( bic has_nul, tmp1, tmp2 ) | ||
273 | CPU_BE( rev has_nul, has_nul ) | ||
274 | CPU_BE( orr syndrome, diff, has_nul ) | ||
275 | /* | ||
276 | * The MS-non-zero bit of the syndrome marks either the first bit | ||
277 | * that is different, or the top bit of the first zero byte. | ||
278 | * Shifting left now will bring the critical information into the | ||
279 | * top bits. | ||
280 | */ | ||
281 | clz pos, syndrome | ||
282 | lsl data1, data1, pos | ||
283 | lsl data2, data2, pos | ||
284 | /* | ||
285 | * But we need to zero-extend (char is unsigned) the value and then | ||
286 | * perform a signed 32-bit subtraction. | ||
287 | */ | ||
288 | lsr data1, data1, #56 | ||
289 | sub result, data1, data2, lsr #56 | ||
290 | ret | ||
291 | |||
292 | .Lremain8: | ||
293 | /* Limit % 8 == 0 => all bytes significant. */ | ||
294 | ands limit, limit, #7 | ||
295 | b.eq .Lret0 | ||
296 | .Ltiny8proc: | ||
297 | ldrb data1w, [src1], #1 | ||
298 | ldrb data2w, [src2], #1 | ||
299 | subs limit, limit, #1 | ||
300 | |||
301 | ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ | ||
302 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | ||
303 | b.eq .Ltiny8proc | ||
304 | sub result, data1, data2 | ||
305 | ret | ||
306 | |||
307 | .Lret0: | ||
308 | mov result, #0 | ||
309 | ret | ||
310 | ENDPROC(strncmp) | ||
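What distinguishes strncmp from strcmp is the limit bookkeeping: the aligned loop counts whole dwords with limit_wd = (limit - 1) / 8, and when limit is not a multiple of 8 a mask makes the bytes past the limit compare equal and look like NUL. A little-endian sketch of those two calculations (function names are illustrative):

#include <stdint.h>

/* Remaining-dword counter for the aligned loop.  Starting from
 * (limit - 1) / 8 makes the counter go negative exactly on the dword that
 * contains the limit; a plain limit / 8 would compare one dword too many
 * whenever limit is a multiple of 8. */
uint64_t limit_to_dwords(uint64_t limit)
{
	return (limit - 1) >> 3;	/* caller ensures limit != 0 */
}

/* Mask for the final dword when limit % 8 != 0 (little-endian): bytes past
 * the limit are cleared in both operands (bic) and set in the NUL marker
 * (orr), so the comparison terminates at the limit. */
uint64_t final_dword_mask_le(uint64_t limit)
{
	return ~0ULL << ((limit & 7) * 8);
}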
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 000000000000..2ca665711bf2 --- /dev/null +++ b/arch/arm64/lib/strnlen.S | |||
@@ -0,0 +1,171 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * determine the length of a fixed-size string | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string pointer | ||
33 | * x1 - maximal string length | ||
34 | * Returns: | ||
35 | * x0 - the return length of specific string | ||
36 | */ | ||
37 | |||
38 | /* Arguments and results. */ | ||
39 | srcin .req x0 | ||
40 | len .req x0 | ||
41 | limit .req x1 | ||
42 | |||
43 | /* Locals and temporaries. */ | ||
44 | src .req x2 | ||
45 | data1 .req x3 | ||
46 | data2 .req x4 | ||
47 | data2a .req x5 | ||
48 | has_nul1 .req x6 | ||
49 | has_nul2 .req x7 | ||
50 | tmp1 .req x8 | ||
51 | tmp2 .req x9 | ||
52 | tmp3 .req x10 | ||
53 | tmp4 .req x11 | ||
54 | zeroones .req x12 | ||
55 | pos .req x13 | ||
56 | limit_wd .req x14 | ||
57 | |||
58 | #define REP8_01 0x0101010101010101 | ||
59 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
60 | #define REP8_80 0x8080808080808080 | ||
61 | |||
62 | ENTRY(strnlen) | ||
63 | cbz limit, .Lhit_limit | ||
64 | mov zeroones, #REP8_01 | ||
65 | bic src, srcin, #15 | ||
66 | ands tmp1, srcin, #15 | ||
67 | b.ne .Lmisaligned | ||
68 | /* Calculate the number of full and partial words -1. */ | ||
69 | sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ | ||
70 | lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ | ||
71 | |||
72 | /* | ||
73 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
74 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
75 | * can be done in parallel across the entire word. | ||
76 | */ | ||
77 | /* | ||
78 | * The inner loop deals with two Dwords at a time. This has a | ||
79 | * slightly higher start-up cost, but we should win quite quickly, | ||
80 | * especially on cores with a high number of issue slots per | ||
81 | * cycle, as we get much better parallelism out of the operations. | ||
82 | */ | ||
83 | .Lloop: | ||
84 | ldp data1, data2, [src], #16 | ||
85 | .Lrealigned: | ||
86 | sub tmp1, data1, zeroones | ||
87 | orr tmp2, data1, #REP8_7f | ||
88 | sub tmp3, data2, zeroones | ||
89 | orr tmp4, data2, #REP8_7f | ||
90 | bic has_nul1, tmp1, tmp2 | ||
91 | bic has_nul2, tmp3, tmp4 | ||
92 | subs limit_wd, limit_wd, #1 | ||
93 | orr tmp1, has_nul1, has_nul2 | ||
94 | ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ | ||
95 | b.eq .Lloop | ||
96 | |||
97 | cbz tmp1, .Lhit_limit /* No null in final Qword. */ | ||
98 | |||
99 | /* | ||
100 | * We know there's a null in the final Qword. The easiest thing | ||
101 | * to do now is work out the length of the string and return | ||
102 | * MIN (len, limit). | ||
103 | */ | ||
104 | sub len, src, srcin | ||
105 | cbz has_nul1, .Lnul_in_data2 | ||
106 | CPU_BE( mov data2, data1 ) /* prepare data to re-calculate the syndrome */ | ||
107 | |||
108 | sub len, len, #8 | ||
109 | mov has_nul2, has_nul1 | ||
110 | .Lnul_in_data2: | ||
111 | /* | ||
112 | * For big-endian, carry propagation (if the final byte in the | ||
113 | * string is 0x01) means we cannot use has_nul directly. The | ||
114 | * easiest way to get the correct byte is to byte-swap the data | ||
115 | * and calculate the syndrome a second time. | ||
116 | */ | ||
117 | CPU_BE( rev data2, data2 ) | ||
118 | CPU_BE( sub tmp1, data2, zeroones ) | ||
119 | CPU_BE( orr tmp2, data2, #REP8_7f ) | ||
120 | CPU_BE( bic has_nul2, tmp1, tmp2 ) | ||
121 | |||
122 | sub len, len, #8 | ||
123 | rev has_nul2, has_nul2 | ||
124 | clz pos, has_nul2 | ||
125 | add len, len, pos, lsr #3 /* Bits to bytes. */ | ||
126 | cmp len, limit | ||
127 | csel len, len, limit, ls /* Return the lower value. */ | ||
128 | ret | ||
129 | |||
130 | .Lmisaligned: | ||
131 | /* | ||
132 | * Deal with a partial first word. | ||
133 | * We're doing two things in parallel here; | ||
134 | * 1) Calculate the number of words (but avoiding overflow if | ||
135 | * limit is near ULONG_MAX) - to do this we need to work out | ||
136 | * limit + tmp1 - 1 as a 65-bit value before shifting it; | ||
137 | * 2) Load and mask the initial data words - we force the bytes | ||
138 | * before the ones we are interested in to 0xff - this ensures | ||
139 | * early bytes will not hit any zero detection. | ||
140 | */ | ||
141 | ldp data1, data2, [src], #16 | ||
142 | |||
143 | sub limit_wd, limit, #1 | ||
144 | and tmp3, limit_wd, #15 | ||
145 | lsr limit_wd, limit_wd, #4 | ||
146 | |||
147 | add tmp3, tmp3, tmp1 | ||
148 | add limit_wd, limit_wd, tmp3, lsr #4 | ||
149 | |||
150 | neg tmp4, tmp1 | ||
151 | lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ | ||
152 | |||
153 | mov tmp2, #~0 | ||
154 | /* Big-endian. Early bytes are at MSB. */ | ||
155 | CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ | ||
156 | /* Little-endian. Early bytes are at LSB. */ | ||
157 | CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ | ||
158 | |||
159 | cmp tmp1, #8 | ||
160 | |||
161 | orr data1, data1, tmp2 | ||
162 | orr data2a, data2, tmp2 | ||
163 | |||
164 | csinv data1, data1, xzr, le | ||
165 | csel data2, data2, data2a, le | ||
166 | b .Lrealigned | ||
167 | |||
168 | .Lhit_limit: | ||
169 | mov len, limit | ||
170 | ret | ||
171 | ENDPROC(strnlen) | ||
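The misaligned-start handling in strnlen (and the mutual-alignment paths in the other routines) rests on one observation: a byte forced to 0xff can never look like NUL to the (x - 0x01..) & ~(x | 0x7f..) detector. A small little-endian sketch of how the bytes that precede the real start of the string are neutralised (offset assumed to be between 1 and 7):

#include <stdint.h>

/* The load was rounded down to an aligned address, so 'offset' bytes at the
 * low end of the word lie before the start of the string.  Forcing them to
 * 0xff keeps them from ever matching the zero-byte test. */
uint64_t mask_leading_bytes_le(uint64_t word, unsigned int offset)
{
	uint64_t mask = ~0ULL >> (64 - 8 * offset);	/* low 'offset' bytes */

	return word | mask;
}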
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index b51d36401d83..3ecb56c624d3 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-y := dma-mapping.o extable.o fault.o init.o \ | 1 | obj-y := dma-mapping.o extable.o fault.o init.o \ |
2 | cache.o copypage.o flush.o \ | 2 | cache.o copypage.o flush.o \ |
3 | ioremap.o mmap.o pgd.o mmu.o \ | 3 | ioremap.o mmap.o pgd.o mmu.o \ |
4 | context.o tlb.o proc.o | 4 | context.o proc.o |
5 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | 5 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o |
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index fda756875fa6..23663837acff 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S | |||
@@ -31,7 +31,7 @@ | |||
31 | * Corrupted registers: x0-x7, x9-x11 | 31 | * Corrupted registers: x0-x7, x9-x11 |
32 | */ | 32 | */ |
33 | __flush_dcache_all: | 33 | __flush_dcache_all: |
34 | dsb sy // ensure ordering with previous memory accesses | 34 | dmb sy // ensure ordering with previous memory accesses |
35 | mrs x0, clidr_el1 // read clidr | 35 | mrs x0, clidr_el1 // read clidr |
36 | and x3, x0, #0x7000000 // extract loc from clidr | 36 | and x3, x0, #0x7000000 // extract loc from clidr |
37 | lsr x3, x3, #23 // left align loc bit field | 37 | lsr x3, x3, #23 // left align loc bit field |
@@ -128,7 +128,7 @@ USER(9f, dc cvau, x4 ) // clean D line to PoU | |||
128 | add x4, x4, x2 | 128 | add x4, x4, x2 |
129 | cmp x4, x1 | 129 | cmp x4, x1 |
130 | b.lo 1b | 130 | b.lo 1b |
131 | dsb sy | 131 | dsb ish |
132 | 132 | ||
133 | icache_line_size x2, x3 | 133 | icache_line_size x2, x3 |
134 | sub x3, x2, #1 | 134 | sub x3, x2, #1 |
@@ -139,7 +139,7 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU | |||
139 | cmp x4, x1 | 139 | cmp x4, x1 |
140 | b.lo 1b | 140 | b.lo 1b |
141 | 9: // ignore any faulting cache operation | 141 | 9: // ignore any faulting cache operation |
142 | dsb sy | 142 | dsb ish |
143 | isb | 143 | isb |
144 | ret | 144 | ret |
145 | ENDPROC(flush_icache_range) | 145 | ENDPROC(flush_icache_range) |
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index c851eb44dc50..4164c5ace9f8 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c | |||
@@ -115,7 +115,7 @@ static void *__dma_alloc_noncoherent(struct device *dev, size_t size, | |||
115 | for (i = 0; i < (size >> PAGE_SHIFT); i++) | 115 | for (i = 0; i < (size >> PAGE_SHIFT); i++) |
116 | map[i] = page + i; | 116 | map[i] = page + i; |
117 | coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, | 117 | coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, |
118 | __get_dma_pgprot(attrs, pgprot_default, false)); | 118 | __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false)); |
119 | kfree(map); | 119 | kfree(map); |
120 | if (!coherent_ptr) | 120 | if (!coherent_ptr) |
121 | goto no_map; | 121 | goto no_map; |
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c23751b06120..bcc965e2cce1 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c | |||
@@ -32,6 +32,7 @@ | |||
32 | 32 | ||
33 | #include <asm/exception.h> | 33 | #include <asm/exception.h> |
34 | #include <asm/debug-monitors.h> | 34 | #include <asm/debug-monitors.h> |
35 | #include <asm/esr.h> | ||
35 | #include <asm/system_misc.h> | 36 | #include <asm/system_misc.h> |
36 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
@@ -123,6 +124,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, | |||
123 | } | 124 | } |
124 | 125 | ||
125 | tsk->thread.fault_address = addr; | 126 | tsk->thread.fault_address = addr; |
127 | tsk->thread.fault_code = esr; | ||
126 | si.si_signo = sig; | 128 | si.si_signo = sig; |
127 | si.si_errno = 0; | 129 | si.si_errno = 0; |
128 | si.si_code = code; | 130 | si.si_code = code; |
@@ -148,8 +150,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re | |||
148 | #define VM_FAULT_BADMAP 0x010000 | 150 | #define VM_FAULT_BADMAP 0x010000 |
149 | #define VM_FAULT_BADACCESS 0x020000 | 151 | #define VM_FAULT_BADACCESS 0x020000 |
150 | 152 | ||
151 | #define ESR_WRITE (1 << 6) | ||
152 | #define ESR_CM (1 << 8) | ||
153 | #define ESR_LNX_EXEC (1 << 24) | 153 | #define ESR_LNX_EXEC (1 << 24) |
154 | 154 | ||
155 | static int __do_page_fault(struct mm_struct *mm, unsigned long addr, | 155 | static int __do_page_fault(struct mm_struct *mm, unsigned long addr, |
@@ -218,7 +218,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, | |||
218 | 218 | ||
219 | if (esr & ESR_LNX_EXEC) { | 219 | if (esr & ESR_LNX_EXEC) { |
220 | vm_flags = VM_EXEC; | 220 | vm_flags = VM_EXEC; |
221 | } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { | 221 | } else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) { |
222 | vm_flags = VM_WRITE; | 222 | vm_flags = VM_WRITE; |
223 | mm_flags |= FAULT_FLAG_WRITE; | 223 | mm_flags |= FAULT_FLAG_WRITE; |
224 | } | 224 | } |
@@ -525,7 +525,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, | |||
525 | info.si_errno = 0; | 525 | info.si_errno = 0; |
526 | info.si_code = inf->code; | 526 | info.si_code = inf->code; |
527 | info.si_addr = (void __user *)addr; | 527 | info.si_addr = (void __user *)addr; |
528 | arm64_notify_die("", regs, &info, esr); | 528 | arm64_notify_die("", regs, &info, 0); |
529 | 529 | ||
530 | return 0; | 530 | return 0; |
531 | } | 531 | } |
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4a829a210bb6..c43f1dd19489 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c | |||
@@ -43,11 +43,6 @@ | |||
43 | struct page *empty_zero_page; | 43 | struct page *empty_zero_page; |
44 | EXPORT_SYMBOL(empty_zero_page); | 44 | EXPORT_SYMBOL(empty_zero_page); |
45 | 45 | ||
46 | pgprot_t pgprot_default; | ||
47 | EXPORT_SYMBOL(pgprot_default); | ||
48 | |||
49 | static pmdval_t prot_sect_kernel; | ||
50 | |||
51 | struct cachepolicy { | 46 | struct cachepolicy { |
52 | const char policy[16]; | 47 | const char policy[16]; |
53 | u64 mair; | 48 | u64 mair; |
@@ -122,33 +117,6 @@ static int __init early_cachepolicy(char *p) | |||
122 | } | 117 | } |
123 | early_param("cachepolicy", early_cachepolicy); | 118 | early_param("cachepolicy", early_cachepolicy); |
124 | 119 | ||
125 | /* | ||
126 | * Adjust the PMD section entries according to the CPU in use. | ||
127 | */ | ||
128 | void __init init_mem_pgprot(void) | ||
129 | { | ||
130 | pteval_t default_pgprot; | ||
131 | int i; | ||
132 | |||
133 | default_pgprot = PTE_ATTRINDX(MT_NORMAL); | ||
134 | prot_sect_kernel = PMD_TYPE_SECT | PMD_SECT_AF | PMD_ATTRINDX(MT_NORMAL); | ||
135 | |||
136 | #ifdef CONFIG_SMP | ||
137 | /* | ||
138 | * Mark memory with the "shared" attribute for SMP systems | ||
139 | */ | ||
140 | default_pgprot |= PTE_SHARED; | ||
141 | prot_sect_kernel |= PMD_SECT_S; | ||
142 | #endif | ||
143 | |||
144 | for (i = 0; i < 16; i++) { | ||
145 | unsigned long v = pgprot_val(protection_map[i]); | ||
146 | protection_map[i] = __pgprot(v | default_pgprot); | ||
147 | } | ||
148 | |||
149 | pgprot_default = __pgprot(PTE_TYPE_PAGE | PTE_AF | default_pgprot); | ||
150 | } | ||
151 | |||
152 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | 120 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, |
153 | unsigned long size, pgprot_t vma_prot) | 121 | unsigned long size, pgprot_t vma_prot) |
154 | { | 122 | { |
@@ -196,11 +164,10 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, | |||
196 | pgprot_t prot_pte; | 164 | pgprot_t prot_pte; |
197 | 165 | ||
198 | if (map_io) { | 166 | if (map_io) { |
199 | prot_sect = PMD_TYPE_SECT | PMD_SECT_AF | | 167 | prot_sect = PROT_SECT_DEVICE_nGnRE; |
200 | PMD_ATTRINDX(MT_DEVICE_nGnRE); | ||
201 | prot_pte = __pgprot(PROT_DEVICE_nGnRE); | 168 | prot_pte = __pgprot(PROT_DEVICE_nGnRE); |
202 | } else { | 169 | } else { |
203 | prot_sect = prot_sect_kernel; | 170 | prot_sect = PROT_SECT_NORMAL_EXEC; |
204 | prot_pte = PAGE_KERNEL_EXEC; | 171 | prot_pte = PAGE_KERNEL_EXEC; |
205 | } | 172 | } |
206 | 173 | ||
@@ -242,7 +209,30 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, | |||
242 | 209 | ||
243 | do { | 210 | do { |
244 | next = pud_addr_end(addr, end); | 211 | next = pud_addr_end(addr, end); |
245 | alloc_init_pmd(pud, addr, next, phys, map_io); | 212 | |
213 | /* | ||
214 | * For 4K granule only, attempt to put down a 1GB block | ||
215 | */ | ||
216 | if (!map_io && (PAGE_SHIFT == 12) && | ||
217 | ((addr | next | phys) & ~PUD_MASK) == 0) { | ||
218 | pud_t old_pud = *pud; | ||
219 | set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); | ||
220 | |||
221 | /* | ||
222 | * If we have an old value for a pud, it will | ||
223 | * be pointing to a pmd table that we no longer | ||
224 | * need (from swapper_pg_dir). | ||
225 | * | ||
226 | * Look up the old pmd table and free it. | ||
227 | */ | ||
228 | if (!pud_none(old_pud)) { | ||
229 | phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); | ||
230 | memblock_free(table, PAGE_SIZE); | ||
231 | flush_tlb_all(); | ||
232 | } | ||
233 | } else { | ||
234 | alloc_init_pmd(pud, addr, next, phys, map_io); | ||
235 | } | ||
246 | phys += next - addr; | 236 | phys += next - addr; |
247 | } while (pud++, addr = next, addr != end); | 237 | } while (pud++, addr = next, addr != end); |
248 | } | 238 | } |
@@ -399,6 +389,9 @@ int kern_addr_valid(unsigned long addr) | |||
399 | if (pud_none(*pud)) | 389 | if (pud_none(*pud)) |
400 | return 0; | 390 | return 0; |
401 | 391 | ||
392 | if (pud_sect(*pud)) | ||
393 | return pfn_valid(pud_pfn(*pud)); | ||
394 | |||
402 | pmd = pmd_offset(pud, addr); | 395 | pmd = pmd_offset(pud, addr); |
403 | if (pmd_none(*pmd)) | 396 | if (pmd_none(*pmd)) |
404 | return 0; | 397 | return 0; |
@@ -446,7 +439,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) | |||
446 | if (!p) | 439 | if (!p) |
447 | return -ENOMEM; | 440 | return -ENOMEM; |
448 | 441 | ||
449 | set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel)); | 442 | set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL)); |
450 | } else | 443 | } else |
451 | vmemmap_verify((pte_t *)pmd, node, addr, next); | 444 | vmemmap_verify((pte_t *)pmd, node, addr, next); |
452 | } while (addr = next, addr != end); | 445 | } while (addr = next, addr != end); |
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9042aff5e9e3..7736779c9809 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S | |||
@@ -182,7 +182,7 @@ ENDPROC(cpu_do_switch_mm) | |||
182 | ENTRY(__cpu_setup) | 182 | ENTRY(__cpu_setup) |
183 | ic iallu // I+BTB cache invalidate | 183 | ic iallu // I+BTB cache invalidate |
184 | tlbi vmalle1is // invalidate I + D TLBs | 184 | tlbi vmalle1is // invalidate I + D TLBs |
185 | dsb sy | 185 | dsb ish |
186 | 186 | ||
187 | mov x0, #3 << 20 | 187 | mov x0, #3 << 20 |
188 | msr cpacr_el1, x0 // Enable FP/ASIMD | 188 | msr cpacr_el1, x0 // Enable FP/ASIMD |
diff --git a/arch/arm64/mm/tlb.S b/arch/arm64/mm/tlb.S deleted file mode 100644 index 19da91e0cd27..000000000000 --- a/arch/arm64/mm/tlb.S +++ /dev/null | |||
@@ -1,71 +0,0 @@ | |||
1 | /* | ||
2 | * Based on arch/arm/mm/tlb.S | ||
3 | * | ||
4 | * Copyright (C) 1997-2002 Russell King | ||
5 | * Copyright (C) 2012 ARM Ltd. | ||
6 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License version 2 as | ||
10 | * published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
19 | */ | ||
20 | #include <linux/linkage.h> | ||
21 | #include <asm/assembler.h> | ||
22 | #include <asm/asm-offsets.h> | ||
23 | #include <asm/page.h> | ||
24 | #include <asm/tlbflush.h> | ||
25 | #include "proc-macros.S" | ||
26 | |||
27 | /* | ||
28 | * __cpu_flush_user_tlb_range(start, end, vma) | ||
29 | * | ||
30 | * Invalidate a range of TLB entries in the specified address space. | ||
31 | * | ||
32 | * - start - start address (may not be aligned) | ||
33 | * - end - end address (exclusive, may not be aligned) | ||
34 | * - vma - vma_struct describing address range | ||
35 | */ | ||
36 | ENTRY(__cpu_flush_user_tlb_range) | ||
37 | vma_vm_mm x3, x2 // get vma->vm_mm | ||
38 | mmid w3, x3 // get vm_mm->context.id | ||
39 | dsb sy | ||
40 | lsr x0, x0, #12 // align address | ||
41 | lsr x1, x1, #12 | ||
42 | bfi x0, x3, #48, #16 // start VA and ASID | ||
43 | bfi x1, x3, #48, #16 // end VA and ASID | ||
44 | 1: tlbi vae1is, x0 // TLB invalidate by address and ASID | ||
45 | add x0, x0, #1 | ||
46 | cmp x0, x1 | ||
47 | b.lo 1b | ||
48 | dsb sy | ||
49 | ret | ||
50 | ENDPROC(__cpu_flush_user_tlb_range) | ||
51 | |||
52 | /* | ||
53 | * __cpu_flush_kern_tlb_range(start,end) | ||
54 | * | ||
55 | * Invalidate a range of kernel TLB entries. | ||
56 | * | ||
57 | * - start - start address (may not be aligned) | ||
58 | * - end - end address (exclusive, may not be aligned) | ||
59 | */ | ||
60 | ENTRY(__cpu_flush_kern_tlb_range) | ||
61 | dsb sy | ||
62 | lsr x0, x0, #12 // align address | ||
63 | lsr x1, x1, #12 | ||
64 | 1: tlbi vaae1is, x0 // TLB invalidate by address | ||
65 | add x0, x0, #1 | ||
66 | cmp x0, x1 | ||
67 | b.lo 1b | ||
68 | dsb sy | ||
69 | isb | ||
70 | ret | ||
71 | ENDPROC(__cpu_flush_kern_tlb_range) | ||
diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h index 8a029505d7b7..2f1c3c2657ad 100644 --- a/arch/blackfin/include/asm/ftrace.h +++ b/arch/blackfin/include/asm/ftrace.h | |||
@@ -66,16 +66,7 @@ extern inline void *return_address(unsigned int level) | |||
66 | 66 | ||
67 | #endif /* CONFIG_FRAME_POINTER */ | 67 | #endif /* CONFIG_FRAME_POINTER */ |
68 | 68 | ||
69 | #define HAVE_ARCH_CALLER_ADDR | 69 | #define ftrace_return_address(n) return_address(n) |
70 | |||
71 | /* inline function or macro may lead to unexpected result */ | ||
72 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
73 | #define CALLER_ADDR1 ((unsigned long)return_address(1)) | ||
74 | #define CALLER_ADDR2 ((unsigned long)return_address(2)) | ||
75 | #define CALLER_ADDR3 ((unsigned long)return_address(3)) | ||
76 | #define CALLER_ADDR4 ((unsigned long)return_address(4)) | ||
77 | #define CALLER_ADDR5 ((unsigned long)return_address(5)) | ||
78 | #define CALLER_ADDR6 ((unsigned long)return_address(6)) | ||
79 | 70 | ||
80 | #endif /* __ASSEMBLY__ */ | 71 | #endif /* __ASSEMBLY__ */ |
81 | 72 | ||
diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h index 72c0fafaa039..544ed8ef87eb 100644 --- a/arch/parisc/include/asm/ftrace.h +++ b/arch/parisc/include/asm/ftrace.h | |||
@@ -24,15 +24,7 @@ extern void return_to_handler(void); | |||
24 | 24 | ||
25 | extern unsigned long return_address(unsigned int); | 25 | extern unsigned long return_address(unsigned int); |
26 | 26 | ||
27 | #define HAVE_ARCH_CALLER_ADDR | 27 | #define ftrace_return_address(n) return_address(n) |
28 | |||
29 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
30 | #define CALLER_ADDR1 return_address(1) | ||
31 | #define CALLER_ADDR2 return_address(2) | ||
32 | #define CALLER_ADDR3 return_address(3) | ||
33 | #define CALLER_ADDR4 return_address(4) | ||
34 | #define CALLER_ADDR5 return_address(5) | ||
35 | #define CALLER_ADDR6 return_address(6) | ||
36 | 28 | ||
37 | #endif /* __ASSEMBLY__ */ | 29 | #endif /* __ASSEMBLY__ */ |
38 | 30 | ||
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 13e9966464c2..e79fb6ebaa42 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h | |||
@@ -40,15 +40,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) | |||
40 | /* arch/sh/kernel/return_address.c */ | 40 | /* arch/sh/kernel/return_address.c */ |
41 | extern void *return_address(unsigned int); | 41 | extern void *return_address(unsigned int); |
42 | 42 | ||
43 | #define HAVE_ARCH_CALLER_ADDR | 43 | #define ftrace_return_address(n) return_address(n) |
44 | |||
45 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
46 | #define CALLER_ADDR1 ((unsigned long)return_address(1)) | ||
47 | #define CALLER_ADDR2 ((unsigned long)return_address(2)) | ||
48 | #define CALLER_ADDR3 ((unsigned long)return_address(3)) | ||
49 | #define CALLER_ADDR4 ((unsigned long)return_address(4)) | ||
50 | #define CALLER_ADDR5 ((unsigned long)return_address(5)) | ||
51 | #define CALLER_ADDR6 ((unsigned long)return_address(6)) | ||
52 | 44 | ||
53 | #endif /* __ASSEMBLY__ */ | 45 | #endif /* __ASSEMBLY__ */ |
54 | 46 | ||
diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h index 736b9d214d80..6c6d9a9f185f 100644 --- a/arch/xtensa/include/asm/ftrace.h +++ b/arch/xtensa/include/asm/ftrace.h | |||
@@ -12,24 +12,18 @@ | |||
12 | 12 | ||
13 | #include <asm/processor.h> | 13 | #include <asm/processor.h> |
14 | 14 | ||
15 | #define HAVE_ARCH_CALLER_ADDR | ||
16 | #ifndef __ASSEMBLY__ | 15 | #ifndef __ASSEMBLY__ |
17 | #define CALLER_ADDR0 ({ unsigned long a0, a1; \ | 16 | #define ftrace_return_address0 ({ unsigned long a0, a1; \ |
18 | __asm__ __volatile__ ( \ | 17 | __asm__ __volatile__ ( \ |
19 | "mov %0, a0\n" \ | 18 | "mov %0, a0\n" \ |
20 | "mov %1, a1\n" \ | 19 | "mov %1, a1\n" \ |
21 | : "=r"(a0), "=r"(a1)); \ | 20 | : "=r"(a0), "=r"(a1)); \ |
22 | MAKE_PC_FROM_RA(a0, a1); }) | 21 | MAKE_PC_FROM_RA(a0, a1); }) |
22 | |||
23 | #ifdef CONFIG_FRAME_POINTER | 23 | #ifdef CONFIG_FRAME_POINTER |
24 | extern unsigned long return_address(unsigned level); | 24 | extern unsigned long return_address(unsigned level); |
25 | #define CALLER_ADDR1 return_address(1) | 25 | #define ftrace_return_address(n) return_address(n) |
26 | #define CALLER_ADDR2 return_address(2) | 26 | #endif |
27 | #define CALLER_ADDR3 return_address(3) | ||
28 | #else /* CONFIG_FRAME_POINTER */ | ||
29 | #define CALLER_ADDR1 (0) | ||
30 | #define CALLER_ADDR2 (0) | ||
31 | #define CALLER_ADDR3 (0) | ||
32 | #endif /* CONFIG_FRAME_POINTER */ | ||
33 | #endif /* __ASSEMBLY__ */ | 27 | #endif /* __ASSEMBLY__ */ |
34 | 28 | ||
35 | #ifdef CONFIG_FUNCTION_TRACER | 29 | #ifdef CONFIG_FUNCTION_TRACER |
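xtensa is the one architecture here that still needs its own ftrace_return_address0: on the windowed ABI the caller's return address sits in a0 with the window increment in its top two bits, so the real PC has to be reassembled using the address region from a1. A hedged C equivalent, assuming the usual MAKE_PC_FROM_RA definition on this architecture:

	/*
	 * Assumption: MAKE_PC_FROM_RA(ra, sp) masks off the window bits of the
	 * return address and borrows the 512MB region from the stack pointer.
	 */
	static inline unsigned long xtensa_pc_from_ra(unsigned long a0, unsigned long a1)
	{
		return (a0 & 0x3fffffffUL) | (a1 & 0xc0000000UL);
	}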
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index 03cf5936bad6..1ac097279db1 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h | |||
@@ -4,22 +4,27 @@ | |||
4 | /* | 4 | /* |
5 | * This is the most generic implementation of unaligned accesses | 5 | * This is the most generic implementation of unaligned accesses |
6 | * and should work almost anywhere. | 6 | * and should work almost anywhere. |
7 | * | ||
8 | * If an architecture can handle unaligned accesses in hardware, | ||
9 | * it may want to use the linux/unaligned/access_ok.h implementation | ||
10 | * instead. | ||
11 | */ | 7 | */ |
12 | #include <asm/byteorder.h> | 8 | #include <asm/byteorder.h> |
13 | 9 | ||
10 | /* Set by the arch if it can handle unaligned accesses in hardware. */ | ||
11 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS | ||
12 | # include <linux/unaligned/access_ok.h> | ||
13 | #endif | ||
14 | |||
14 | #if defined(__LITTLE_ENDIAN) | 15 | #if defined(__LITTLE_ENDIAN) |
15 | # include <linux/unaligned/le_struct.h> | 16 | # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
16 | # include <linux/unaligned/be_byteshift.h> | 17 | # include <linux/unaligned/le_struct.h> |
18 | # include <linux/unaligned/be_byteshift.h> | ||
19 | # endif | ||
17 | # include <linux/unaligned/generic.h> | 20 | # include <linux/unaligned/generic.h> |
18 | # define get_unaligned __get_unaligned_le | 21 | # define get_unaligned __get_unaligned_le |
19 | # define put_unaligned __put_unaligned_le | 22 | # define put_unaligned __put_unaligned_le |
20 | #elif defined(__BIG_ENDIAN) | 23 | #elif defined(__BIG_ENDIAN) |
21 | # include <linux/unaligned/be_struct.h> | 24 | # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
22 | # include <linux/unaligned/le_byteshift.h> | 25 | # include <linux/unaligned/be_struct.h> |
26 | # include <linux/unaligned/le_byteshift.h> | ||
27 | # endif | ||
23 | # include <linux/unaligned/generic.h> | 28 | # include <linux/unaligned/generic.h> |
24 | # define get_unaligned __get_unaligned_be | 29 | # define get_unaligned __get_unaligned_be |
25 | # define put_unaligned __put_unaligned_be | 30 | # define put_unaligned __put_unaligned_be |
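With this change the generic header honours CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS (which arm64 selects elsewhere in this series): such architectures get the access_ok.h helpers that compile to plain loads and stores, while everyone else keeps the packed-struct/byteshift fallbacks. Call sites look the same either way; a minimal usage sketch, assuming normal kernel context and a hypothetical packet layout:

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/*
	 * Read/write a 32-bit little-endian field starting at byte 1 of the
	 * buffer, i.e. at a deliberately misaligned offset.
	 */
	static u32 read_len_field(const u8 *pkt)
	{
		return get_unaligned_le32(pkt + 1);
	}

	static void write_len_field(u8 *pkt, u32 len)
	{
		put_unaligned_le32(len, pkt + 1);
	}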
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ae9504b4b67d..2018751cad9e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h | |||
@@ -616,25 +616,27 @@ static inline void __ftrace_enabled_restore(int enabled) | |||
616 | #endif | 616 | #endif |
617 | } | 617 | } |
618 | 618 | ||
619 | #ifndef HAVE_ARCH_CALLER_ADDR | 619 | /* All archs should have this, but we define it for consistency */ |
620 | #ifndef ftrace_return_address0 | ||
621 | # define ftrace_return_address0 __builtin_return_address(0) | ||
622 | #endif | ||
623 | |||
624 | /* Archs may use other ways for ADDR1 and beyond */ | ||
625 | #ifndef ftrace_return_address | ||
620 | # ifdef CONFIG_FRAME_POINTER | 626 | # ifdef CONFIG_FRAME_POINTER |
621 | # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | 627 | # define ftrace_return_address(n) __builtin_return_address(n) |
622 | # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) | ||
623 | # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) | ||
624 | # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) | ||
625 | # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) | ||
626 | # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) | ||
627 | # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) | ||
628 | # else | 628 | # else |
629 | # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | 629 | # define ftrace_return_address(n) 0UL |
630 | # define CALLER_ADDR1 0UL | ||
631 | # define CALLER_ADDR2 0UL | ||
632 | # define CALLER_ADDR3 0UL | ||
633 | # define CALLER_ADDR4 0UL | ||
634 | # define CALLER_ADDR5 0UL | ||
635 | # define CALLER_ADDR6 0UL | ||
636 | # endif | 630 | # endif |
637 | #endif /* ifndef HAVE_ARCH_CALLER_ADDR */ | 631 | #endif |
632 | |||
633 | #define CALLER_ADDR0 ((unsigned long)ftrace_return_address0) | ||
634 | #define CALLER_ADDR1 ((unsigned long)ftrace_return_address(1)) | ||
635 | #define CALLER_ADDR2 ((unsigned long)ftrace_return_address(2)) | ||
636 | #define CALLER_ADDR3 ((unsigned long)ftrace_return_address(3)) | ||
637 | #define CALLER_ADDR4 ((unsigned long)ftrace_return_address(4)) | ||
638 | #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) | ||
639 | #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) | ||
638 | 640 | ||
639 | #ifdef CONFIG_IRQSOFF_TRACER | 641 | #ifdef CONFIG_IRQSOFF_TRACER |
640 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); | 642 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); |
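After this consolidation an architecture only has to supply ftrace_return_address(n) (and, in the unusual xtensa case, ftrace_return_address0); CALLER_ADDR0..CALLER_ADDR6 are derived once here instead of being re-spelled by every arch header. Consumers are unaffected; a minimal sketch of typical use, with a hypothetical helper name:

	#include <linux/ftrace.h>
	#include <linux/printk.h>

	static void report_caller(void)
	{
		unsigned long ip     = CALLER_ADDR0;	/* __builtin_return_address(0) unless overridden */
		unsigned long parent = CALLER_ADDR1;	/* arch walker, frame pointers, or 0UL */

		pr_debug("called from %pS (parent %pS)\n", (void *)ip, (void *)parent);
	}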
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 9c22317778eb..e11aa4a156d2 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c | |||
@@ -40,6 +40,11 @@ | |||
40 | #define R_METAG_NONE 3 | 40 | #define R_METAG_NONE 3 |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | #ifndef EM_AARCH64 | ||
44 | #define EM_AARCH64 183 | ||
45 | #define R_AARCH64_ABS64 257 | ||
46 | #endif | ||
47 | |||
43 | static int fd_map; /* File descriptor for file being modified. */ | 48 | static int fd_map; /* File descriptor for file being modified. */ |
44 | static int mmap_failed; /* Boolean flag. */ | 49 | static int mmap_failed; /* Boolean flag. */ |
45 | static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */ | 50 | static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */ |
@@ -347,6 +352,8 @@ do_file(char const *const fname) | |||
347 | case EM_ARM: reltype = R_ARM_ABS32; | 352 | case EM_ARM: reltype = R_ARM_ABS32; |
348 | altmcount = "__gnu_mcount_nc"; | 353 | altmcount = "__gnu_mcount_nc"; |
349 | break; | 354 | break; |
355 | case EM_AARCH64: | ||
356 | reltype = R_AARCH64_ABS64; gpfx = '_'; break; | ||
350 | case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; | 357 | case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; |
351 | case EM_METAG: reltype = R_METAG_ADDR32; | 358 | case EM_METAG: reltype = R_METAG_ADDR32; |
352 | altmcount = "_mcount_wrapper"; | 359 | altmcount = "_mcount_wrapper"; |
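recordmcount.c is taught the AArch64 ELF machine number and the 64-bit absolute relocation to emit for each recorded _mcount call site, with fallback defines for older elf.h headers that lack them. The net effect, as with the Perl version below, is a __mcount_loc table of 8-byte call-site addresses; a hedged sketch of how such a table is consumed, using the linker-provided boundary symbols:

	extern unsigned long __start_mcount_loc[];	/* linker script boundaries */
	extern unsigned long __stop_mcount_loc[];

	/* Illustration only: iterate the recorded _mcount call sites. */
	static void walk_mcount_table(void (*fn)(unsigned long addr))
	{
		unsigned long *p;

		for (p = __start_mcount_loc; p < __stop_mcount_loc; p++)
			fn(*p);
	}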
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 91280b82da08..397b6b84e8c5 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl | |||
@@ -279,6 +279,11 @@ if ($arch eq "x86_64") { | |||
279 | $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" . | 279 | $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" . |
280 | "\\s+(__gnu_mcount_nc|mcount)\$"; | 280 | "\\s+(__gnu_mcount_nc|mcount)\$"; |
281 | 281 | ||
282 | } elsif ($arch eq "arm64") { | ||
283 | $alignment = 3; | ||
284 | $section_type = '%progbits'; | ||
285 | $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_AARCH64_CALL26\\s+_mcount\$"; | ||
286 | $type = ".quad"; | ||
282 | } elsif ($arch eq "ia64") { | 287 | } elsif ($arch eq "ia64") { |
283 | $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; | 288 | $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; |
284 | $type = "data8"; | 289 | $type = "data8"; |