author		Catalin Marinas <catalin.marinas@arm.com>	2014-05-16 05:05:11 -0400
committer	Catalin Marinas <catalin.marinas@arm.com>	2014-05-16 05:05:11 -0400
commit		cf5c95db57ffa02e430c3840c08d1ee0403849d4 (patch)
tree		b3b4df5e1edcde098cf45b7fa00c8450e6d665f8
parent		fd92d4a54a069953b4679958121317f2a25389cd (diff)
parent		49788fe2a128217f78a21ee4edbe6e92e988f222 (diff)
Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream
FPSIMD register bank context switching and crypto algorithms optimisations
for arm64 from Ard Biesheuvel.

* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
  arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
  arm64: pull in <asm/simd.h> from asm-generic
  arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
  arm64/crypto: AES using ARMv8 Crypto Extensions
  arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
  arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
  arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
  arm64: add support for kernel mode NEON in interrupt context
  arm64: defer reloading a task's FPSIMD state to userland resume
  arm64: add abstractions for FPSIMD state manipulation
  asm-generic: allow generic unaligned access if the arch supports it

Conflicts:
	arch/arm64/include/asm/thread_info.h
-rw-r--r--	arch/arm64/Kconfig	3
-rw-r--r--	arch/arm64/Makefile	1
-rw-r--r--	arch/arm64/crypto/Kconfig	53
-rw-r--r--	arch/arm64/crypto/Makefile	38
-rw-r--r--	arch/arm64/crypto/aes-ce-ccm-core.S	222
-rw-r--r--	arch/arm64/crypto/aes-ce-ccm-glue.c	297
-rw-r--r--	arch/arm64/crypto/aes-ce-cipher.c	155
-rw-r--r--	arch/arm64/crypto/aes-ce.S	133
-rw-r--r--	arch/arm64/crypto/aes-glue.c	446
-rw-r--r--	arch/arm64/crypto/aes-modes.S	532
-rw-r--r--	arch/arm64/crypto/aes-neon.S	382
-rw-r--r--	arch/arm64/crypto/ghash-ce-core.S	95
-rw-r--r--	arch/arm64/crypto/ghash-ce-glue.c	155
-rw-r--r--	arch/arm64/crypto/sha1-ce-core.S	153
-rw-r--r--	arch/arm64/crypto/sha1-ce-glue.c	174
-rw-r--r--	arch/arm64/crypto/sha2-ce-core.S	156
-rw-r--r--	arch/arm64/crypto/sha2-ce-glue.c	255
-rw-r--r--	arch/arm64/include/asm/Kbuild	1
-rw-r--r--	arch/arm64/include/asm/fpsimd.h	23
-rw-r--r--	arch/arm64/include/asm/fpsimdmacros.h	35
-rw-r--r--	arch/arm64/include/asm/neon.h	6
-rw-r--r--	arch/arm64/include/asm/thread_info.h	4
-rw-r--r--	arch/arm64/kernel/entry-fpsimd.S	24
-rw-r--r--	arch/arm64/kernel/entry.S	2
-rw-r--r--	arch/arm64/kernel/fpsimd.c	186
-rw-r--r--	arch/arm64/kernel/process.c	2
-rw-r--r--	arch/arm64/kernel/ptrace.c	2
-rw-r--r--	arch/arm64/kernel/signal.c	13
-rw-r--r--	arch/arm64/kernel/signal32.c	9
-rw-r--r--	include/asm-generic/unaligned.h	21
30 files changed, 3535 insertions(+), 43 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9a5b5fea86ba..78b356d079dd 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -343,5 +343,8 @@ source "arch/arm64/Kconfig.debug"
 source "security/Kconfig"
 
 source "crypto/Kconfig"
+if CRYPTO
+source "arch/arm64/crypto/Kconfig"
+endif
 
 source "lib/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 2fceb71ac3b7..8185a913c5ed 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS
 core-y		+= arch/arm64/kernel/ arch/arm64/mm/
 core-$(CONFIG_KVM) += arch/arm64/kvm/
 core-$(CONFIG_XEN) += arch/arm64/xen/
+core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
 libs-y		:= arch/arm64/lib/ $(libs-y)
 libs-y		+= $(LIBGCC)
 
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
new file mode 100644
index 000000000000..5562652c5316
--- /dev/null
+++ b/arch/arm64/crypto/Kconfig
@@ -0,0 +1,53 @@
1
2menuconfig ARM64_CRYPTO
3 bool "ARM64 Accelerated Cryptographic Algorithms"
4 depends on ARM64
5 help
6 Say Y here to choose from a selection of cryptographic algorithms
7 implemented using ARM64 specific CPU features or instructions.
8
9if ARM64_CRYPTO
10
11config CRYPTO_SHA1_ARM64_CE
12 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
13 depends on ARM64 && KERNEL_MODE_NEON
14 select CRYPTO_HASH
15
16config CRYPTO_SHA2_ARM64_CE
17 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
18 depends on ARM64 && KERNEL_MODE_NEON
19 select CRYPTO_HASH
20
21config CRYPTO_GHASH_ARM64_CE
22 tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
23 depends on ARM64 && KERNEL_MODE_NEON
24 select CRYPTO_HASH
25
26config CRYPTO_AES_ARM64_CE
27 tristate "AES core cipher using ARMv8 Crypto Extensions"
28 depends on ARM64 && KERNEL_MODE_NEON
29 select CRYPTO_ALGAPI
30 select CRYPTO_AES
31
32config CRYPTO_AES_ARM64_CE_CCM
33 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
34 depends on ARM64 && KERNEL_MODE_NEON
35 select CRYPTO_ALGAPI
36 select CRYPTO_AES
37 select CRYPTO_AEAD
38
39config CRYPTO_AES_ARM64_CE_BLK
40 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
41 depends on ARM64 && KERNEL_MODE_NEON
42 select CRYPTO_BLKCIPHER
43 select CRYPTO_AES
44 select CRYPTO_ABLK_HELPER
45
46config CRYPTO_AES_ARM64_NEON_BLK
47 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
48 depends on ARM64 && KERNEL_MODE_NEON
49 select CRYPTO_BLKCIPHER
50 select CRYPTO_AES
51 select CRYPTO_ABLK_HELPER
52
53endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
new file mode 100644
index 000000000000..2070a56ecc46
--- /dev/null
+++ b/arch/arm64/crypto/Makefile
@@ -0,0 +1,38 @@
1#
2# linux/arch/arm64/crypto/Makefile
3#
4# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License version 2 as
8# published by the Free Software Foundation.
9#
10
11obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
12sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
13
14obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
15sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
16
17obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
18ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
19
20obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
21CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
22
23obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
24aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
25
26obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
27aes-ce-blk-y := aes-glue-ce.o aes-ce.o
28
29obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
30aes-neon-blk-y := aes-glue-neon.o aes-neon.o
31
32AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE
33AFLAGS_aes-neon.o := -DINTERLEAVE=4
34
35CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
36
37$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
38 $(call if_changed_dep,cc_o_c)
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
new file mode 100644
index 000000000000..432e4841cd81
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -0,0 +1,222 @@
1/*
2 * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12
13 .text
14 .arch armv8-a+crypto
15
16 /*
17 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
18 * u32 *macp, u8 const rk[], u32 rounds);
19 */
20ENTRY(ce_aes_ccm_auth_data)
21 ldr w8, [x3] /* leftover from prev round? */
22 ld1 {v0.2d}, [x0] /* load mac */
23 cbz w8, 1f
24 sub w8, w8, #16
25 eor v1.16b, v1.16b, v1.16b
260: ldrb w7, [x1], #1 /* get 1 byte of input */
27 subs w2, w2, #1
28 add w8, w8, #1
29 ins v1.b[0], w7
30 ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
31 beq 8f /* out of input? */
32 cbnz w8, 0b
33 eor v0.16b, v0.16b, v1.16b
341: ld1 {v3.2d}, [x4] /* load first round key */
35 prfm pldl1strm, [x1]
36 cmp w5, #12 /* which key size? */
37 add x6, x4, #16
38 sub w7, w5, #2 /* modified # of rounds */
39 bmi 2f
40 bne 5f
41 mov v5.16b, v3.16b
42 b 4f
432: mov v4.16b, v3.16b
44 ld1 {v5.2d}, [x6], #16 /* load 2nd round key */
453: aese v0.16b, v4.16b
46 aesmc v0.16b, v0.16b
474: ld1 {v3.2d}, [x6], #16 /* load next round key */
48 aese v0.16b, v5.16b
49 aesmc v0.16b, v0.16b
505: ld1 {v4.2d}, [x6], #16 /* load next round key */
51 subs w7, w7, #3
52 aese v0.16b, v3.16b
53 aesmc v0.16b, v0.16b
54 ld1 {v5.2d}, [x6], #16 /* load next round key */
55 bpl 3b
56 aese v0.16b, v4.16b
57 subs w2, w2, #16 /* last data? */
58 eor v0.16b, v0.16b, v5.16b /* final round */
59 bmi 6f
60 ld1 {v1.16b}, [x1], #16 /* load next input block */
61 eor v0.16b, v0.16b, v1.16b /* xor with mac */
62 bne 1b
636: st1 {v0.2d}, [x0] /* store mac */
64 beq 10f
65 adds w2, w2, #16
66 beq 10f
67 mov w8, w2
687: ldrb w7, [x1], #1
69 umov w6, v0.b[0]
70 eor w6, w6, w7
71 strb w6, [x0], #1
72 subs w2, w2, #1
73 beq 10f
74 ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
75 b 7b
768: mov w7, w8
77 add w8, w8, #16
789: ext v1.16b, v1.16b, v1.16b, #1
79 adds w7, w7, #1
80 bne 9b
81 eor v0.16b, v0.16b, v1.16b
82 st1 {v0.2d}, [x0]
8310: str w8, [x3]
84 ret
85ENDPROC(ce_aes_ccm_auth_data)
86
87 /*
88 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
89 * u32 rounds);
90 */
91ENTRY(ce_aes_ccm_final)
92 ld1 {v3.2d}, [x2], #16 /* load first round key */
93 ld1 {v0.2d}, [x0] /* load mac */
94 cmp w3, #12 /* which key size? */
95 sub w3, w3, #2 /* modified # of rounds */
96 ld1 {v1.2d}, [x1] /* load 1st ctriv */
97 bmi 0f
98 bne 3f
99 mov v5.16b, v3.16b
100 b 2f
1010: mov v4.16b, v3.16b
1021: ld1 {v5.2d}, [x2], #16 /* load next round key */
103 aese v0.16b, v4.16b
104 aese v1.16b, v4.16b
105 aesmc v0.16b, v0.16b
106 aesmc v1.16b, v1.16b
1072: ld1 {v3.2d}, [x2], #16 /* load next round key */
108 aese v0.16b, v5.16b
109 aese v1.16b, v5.16b
110 aesmc v0.16b, v0.16b
111 aesmc v1.16b, v1.16b
1123: ld1 {v4.2d}, [x2], #16 /* load next round key */
113 subs w3, w3, #3
114 aese v0.16b, v3.16b
115 aese v1.16b, v3.16b
116 aesmc v0.16b, v0.16b
117 aesmc v1.16b, v1.16b
118 bpl 1b
119 aese v0.16b, v4.16b
120 aese v1.16b, v4.16b
121 /* final round key cancels out */
122 eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
123 st1 {v0.2d}, [x0] /* store result */
124 ret
125ENDPROC(ce_aes_ccm_final)
126
127 .macro aes_ccm_do_crypt,enc
128 ldr x8, [x6, #8] /* load lower ctr */
129 ld1 {v0.2d}, [x5] /* load mac */
130 rev x8, x8 /* keep swabbed ctr in reg */
1310: /* outer loop */
132 ld1 {v1.1d}, [x6] /* load upper ctr */
133 prfm pldl1strm, [x1]
134 add x8, x8, #1
135 rev x9, x8
136 cmp w4, #12 /* which key size? */
137 sub w7, w4, #2 /* get modified # of rounds */
138 ins v1.d[1], x9 /* no carry in lower ctr */
139 ld1 {v3.2d}, [x3] /* load first round key */
140 add x10, x3, #16
141 bmi 1f
142 bne 4f
143 mov v5.16b, v3.16b
144 b 3f
1451: mov v4.16b, v3.16b
146 ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
1472: /* inner loop: 3 rounds, 2x interleaved */
148 aese v0.16b, v4.16b
149 aese v1.16b, v4.16b
150 aesmc v0.16b, v0.16b
151 aesmc v1.16b, v1.16b
1523: ld1 {v3.2d}, [x10], #16 /* load next round key */
153 aese v0.16b, v5.16b
154 aese v1.16b, v5.16b
155 aesmc v0.16b, v0.16b
156 aesmc v1.16b, v1.16b
1574: ld1 {v4.2d}, [x10], #16 /* load next round key */
158 subs w7, w7, #3
159 aese v0.16b, v3.16b
160 aese v1.16b, v3.16b
161 aesmc v0.16b, v0.16b
162 aesmc v1.16b, v1.16b
163 ld1 {v5.2d}, [x10], #16 /* load next round key */
164 bpl 2b
165 aese v0.16b, v4.16b
166 aese v1.16b, v4.16b
167 subs w2, w2, #16
168 bmi 6f /* partial block? */
169 ld1 {v2.16b}, [x1], #16 /* load next input block */
170 .if \enc == 1
171 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */
172 eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */
173 .else
174 eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */
175 eor v1.16b, v2.16b, v5.16b /* final round enc */
176 .endif
177 eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */
178 st1 {v1.16b}, [x0], #16 /* write output block */
179 bne 0b
180 rev x8, x8
181 st1 {v0.2d}, [x5] /* store mac */
182 str x8, [x6, #8] /* store lsb end of ctr (BE) */
1835: ret
184
1856: eor v0.16b, v0.16b, v5.16b /* final round mac */
186 eor v1.16b, v1.16b, v5.16b /* final round enc */
187 st1 {v0.2d}, [x5] /* store mac */
188 add w2, w2, #16 /* process partial tail block */
1897: ldrb w9, [x1], #1 /* get 1 byte of input */
190 umov w6, v1.b[0] /* get top crypted ctr byte */
191 umov w7, v0.b[0] /* get top mac byte */
192 .if \enc == 1
193 eor w7, w7, w9
194 eor w9, w9, w6
195 .else
196 eor w9, w9, w6
197 eor w7, w7, w9
198 .endif
199 strb w9, [x0], #1 /* store out byte */
200 strb w7, [x5], #1 /* store mac byte */
201 subs w2, w2, #1
202 beq 5b
203 ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */
204 ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */
205 b 7b
206 .endm
207
208 /*
209 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
210 * u8 const rk[], u32 rounds, u8 mac[],
211 * u8 ctr[]);
212 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
213 * u8 const rk[], u32 rounds, u8 mac[],
214 * u8 ctr[]);
215 */
216ENTRY(ce_aes_ccm_encrypt)
217 aes_ccm_do_crypt 1
218ENDPROC(ce_aes_ccm_encrypt)
219
220ENTRY(ce_aes_ccm_decrypt)
221 aes_ccm_do_crypt 0
222ENDPROC(ce_aes_ccm_decrypt)
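
The aes_ccm_do_crypt macro above keeps the low 64 bits of the counter block byte-swapped in a general-purpose register and increments it there, never carrying into the upper half ("no carry in lower ctr"); CCM counter fields fit entirely in that low half, so dropping the carry is safe. A minimal userspace model of that counter update, assuming a 16-byte big-endian counter block and a GCC/Clang byteswap builtin (illustration only, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Increment the 64-bit big-endian counter held in bytes 8..15 of the
     * CTR block; a carry out of those 8 bytes is intentionally dropped,
     * mirroring the "no carry in lower ctr" behaviour of the macro above. */
    static void ccm_ctr_inc(uint8_t ctr[16])
    {
            uint64_t lo;

            memcpy(&lo, ctr + 8, sizeof(lo));   /* stored big-endian */
            lo = __builtin_bswap64(lo) + 1;     /* host order, incremented */
            lo = __builtin_bswap64(lo);         /* back to big-endian */
            memcpy(ctr + 8, &lo, sizeof(lo));
    }
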
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
new file mode 100644
index 000000000000..9e6cdde9b43d
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -0,0 +1,297 @@
1/*
2 * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/aes.h>
14#include <crypto/algapi.h>
15#include <crypto/scatterwalk.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19static int num_rounds(struct crypto_aes_ctx *ctx)
20{
21 /*
22 * # of rounds specified by AES:
23 * 128 bit key 10 rounds
24 * 192 bit key 12 rounds
25 * 256 bit key 14 rounds
26 * => n byte key => 6 + (n/4) rounds
27 */
28 return 6 + ctx->key_length / 4;
29}
30
31asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
32 u32 *macp, u32 const rk[], u32 rounds);
33
34asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
35 u32 const rk[], u32 rounds, u8 mac[],
36 u8 ctr[]);
37
38asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
39 u32 const rk[], u32 rounds, u8 mac[],
40 u8 ctr[]);
41
42asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
43 u32 rounds);
44
45static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
46 unsigned int key_len)
47{
48 struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
49 int ret;
50
51 ret = crypto_aes_expand_key(ctx, in_key, key_len);
52 if (!ret)
53 return 0;
54
55 tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
56 return -EINVAL;
57}
58
59static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
60{
61 if ((authsize & 1) || authsize < 4)
62 return -EINVAL;
63 return 0;
64}
65
66static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
67{
68 struct crypto_aead *aead = crypto_aead_reqtfm(req);
69 __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8];
70 u32 l = req->iv[0] + 1;
71
72 /* verify that CCM dimension 'L' is set correctly in the IV */
73 if (l < 2 || l > 8)
74 return -EINVAL;
75
76 /* verify that msglen can in fact be represented in L bytes */
77 if (l < 4 && msglen >> (8 * l))
78 return -EOVERFLOW;
79
80 /*
81 * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi
82 * uses a u32 type to represent msglen so the top 4 bytes are always 0.
83 */
84 n[0] = 0;
85 n[1] = cpu_to_be32(msglen);
86
87 memcpy(maciv, req->iv, AES_BLOCK_SIZE - l);
88
89 /*
90 * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C)
91 * - bits 0..2 : max # of bytes required to represent msglen, minus 1
92 * (already set by caller)
93 * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc)
94 * - bit 6 : indicates presence of authenticate-only data
95 */
96 maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2;
97 if (req->assoclen)
98 maciv[0] |= 0x40;
99
100 memset(&req->iv[AES_BLOCK_SIZE - l], 0, l);
101 return 0;
102}
103
104static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
105{
106 struct crypto_aead *aead = crypto_aead_reqtfm(req);
107 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
108 struct __packed { __be16 l; __be32 h; u16 len; } ltag;
109 struct scatter_walk walk;
110 u32 len = req->assoclen;
111 u32 macp = 0;
112
113 /* prepend the AAD with a length tag */
114 if (len < 0xff00) {
115 ltag.l = cpu_to_be16(len);
116 ltag.len = 2;
117 } else {
118 ltag.l = cpu_to_be16(0xfffe);
119 put_unaligned_be32(len, &ltag.h);
120 ltag.len = 6;
121 }
122
123 ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
124 num_rounds(ctx));
125 scatterwalk_start(&walk, req->assoc);
126
127 do {
128 u32 n = scatterwalk_clamp(&walk, len);
129 u8 *p;
130
131 if (!n) {
132 scatterwalk_start(&walk, sg_next(walk.sg));
133 n = scatterwalk_clamp(&walk, len);
134 }
135 p = scatterwalk_map(&walk);
136 ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc,
137 num_rounds(ctx));
138 len -= n;
139
140 scatterwalk_unmap(p);
141 scatterwalk_advance(&walk, n);
142 scatterwalk_done(&walk, 0, len);
143 } while (len);
144}
145
146static int ccm_encrypt(struct aead_request *req)
147{
148 struct crypto_aead *aead = crypto_aead_reqtfm(req);
149 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
150 struct blkcipher_desc desc = { .info = req->iv };
151 struct blkcipher_walk walk;
152 u8 __aligned(8) mac[AES_BLOCK_SIZE];
153 u8 buf[AES_BLOCK_SIZE];
154 u32 len = req->cryptlen;
155 int err;
156
157 err = ccm_init_mac(req, mac, len);
158 if (err)
159 return err;
160
161 kernel_neon_begin_partial(6);
162
163 if (req->assoclen)
164 ccm_calculate_auth_mac(req, mac);
165
166 /* preserve the original iv for the final round */
167 memcpy(buf, req->iv, AES_BLOCK_SIZE);
168
169 blkcipher_walk_init(&walk, req->dst, req->src, len);
170 err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
171 AES_BLOCK_SIZE);
172
173 while (walk.nbytes) {
174 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
175
176 if (walk.nbytes == len)
177 tail = 0;
178
179 ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
180 walk.nbytes - tail, ctx->key_enc,
181 num_rounds(ctx), mac, walk.iv);
182
183 len -= walk.nbytes - tail;
184 err = blkcipher_walk_done(&desc, &walk, tail);
185 }
186 if (!err)
187 ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
188
189 kernel_neon_end();
190
191 if (err)
192 return err;
193
194 /* copy authtag to end of dst */
195 scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
196 crypto_aead_authsize(aead), 1);
197
198 return 0;
199}
200
201static int ccm_decrypt(struct aead_request *req)
202{
203 struct crypto_aead *aead = crypto_aead_reqtfm(req);
204 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
205 unsigned int authsize = crypto_aead_authsize(aead);
206 struct blkcipher_desc desc = { .info = req->iv };
207 struct blkcipher_walk walk;
208 u8 __aligned(8) mac[AES_BLOCK_SIZE];
209 u8 buf[AES_BLOCK_SIZE];
210 u32 len = req->cryptlen - authsize;
211 int err;
212
213 err = ccm_init_mac(req, mac, len);
214 if (err)
215 return err;
216
217 kernel_neon_begin_partial(6);
218
219 if (req->assoclen)
220 ccm_calculate_auth_mac(req, mac);
221
222 /* preserve the original iv for the final round */
223 memcpy(buf, req->iv, AES_BLOCK_SIZE);
224
225 blkcipher_walk_init(&walk, req->dst, req->src, len);
226 err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
227 AES_BLOCK_SIZE);
228
229 while (walk.nbytes) {
230 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
231
232 if (walk.nbytes == len)
233 tail = 0;
234
235 ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
236 walk.nbytes - tail, ctx->key_enc,
237 num_rounds(ctx), mac, walk.iv);
238
239 len -= walk.nbytes - tail;
240 err = blkcipher_walk_done(&desc, &walk, tail);
241 }
242 if (!err)
243 ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
244
245 kernel_neon_end();
246
247 if (err)
248 return err;
249
250 /* compare calculated auth tag with the stored one */
251 scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
252 authsize, 0);
253
254 if (memcmp(mac, buf, authsize))
255 return -EBADMSG;
256 return 0;
257}
258
259static struct crypto_alg ccm_aes_alg = {
260 .cra_name = "ccm(aes)",
261 .cra_driver_name = "ccm-aes-ce",
262 .cra_priority = 300,
263 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
264 .cra_blocksize = 1,
265 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
266 .cra_alignmask = 7,
267 .cra_type = &crypto_aead_type,
268 .cra_module = THIS_MODULE,
269 .cra_aead = {
270 .ivsize = AES_BLOCK_SIZE,
271 .maxauthsize = AES_BLOCK_SIZE,
272 .setkey = ccm_setkey,
273 .setauthsize = ccm_setauthsize,
274 .encrypt = ccm_encrypt,
275 .decrypt = ccm_decrypt,
276 }
277};
278
279static int __init aes_mod_init(void)
280{
281 if (!(elf_hwcap & HWCAP_AES))
282 return -ENODEV;
283 return crypto_register_alg(&ccm_aes_alg);
284}
285
286static void __exit aes_mod_exit(void)
287{
288 crypto_unregister_alg(&ccm_aes_alg);
289}
290
291module_init(aes_mod_init);
292module_exit(aes_mod_exit);
293
294MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions");
295MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
296MODULE_LICENSE("GPL v2");
297MODULE_ALIAS("ccm(aes)");
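
As a worked example of the two derivations in the glue code above (num_rounds() and the B0 flags byte assembled by ccm_init_mac()), the sketch below computes both for an assumed AES-128 key, an 8-byte auth tag and associated data present; the concrete values are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int key_length = 16;                  /* AES-128 key */
            unsigned int authsize   = 8;                   /* 8-byte auth tag */
            unsigned int l          = 3 + 1;               /* req->iv[0] == 3 => L == 4 */
            unsigned int rounds     = 6 + key_length / 4;  /* 10 rounds for AES-128 */
            uint8_t flags;

            flags  = l - 1;                    /* bits 0..2: L - 1 (set by the caller) */
            flags |= (authsize - 2) << 2;      /* bits 3..5: tag size encoding */
            flags |= 0x40;                     /* bit 6: associated data present */

            printf("rounds=%u B0 flags=0x%02x\n", rounds, flags);  /* 10, 0x5b */
            return 0;
    }
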
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
new file mode 100644
index 000000000000..2075e1acae6b
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -0,0 +1,155 @@
1/*
2 * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <crypto/aes.h>
13#include <linux/cpufeature.h>
14#include <linux/crypto.h>
15#include <linux/module.h>
16
17MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
18MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
19MODULE_LICENSE("GPL v2");
20
21struct aes_block {
22 u8 b[AES_BLOCK_SIZE];
23};
24
25static int num_rounds(struct crypto_aes_ctx *ctx)
26{
27 /*
28 * # of rounds specified by AES:
29 * 128 bit key 10 rounds
30 * 192 bit key 12 rounds
31 * 256 bit key 14 rounds
32 * => n byte key => 6 + (n/4) rounds
33 */
34 return 6 + ctx->key_length / 4;
35}
36
37static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
38{
39 struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
40 struct aes_block *out = (struct aes_block *)dst;
41 struct aes_block const *in = (struct aes_block *)src;
42 void *dummy0;
43 int dummy1;
44
45 kernel_neon_begin_partial(4);
46
47 __asm__(" ld1 {v0.16b}, %[in] ;"
48 " ld1 {v1.2d}, [%[key]], #16 ;"
49 " cmp %w[rounds], #10 ;"
50 " bmi 0f ;"
51 " bne 3f ;"
52 " mov v3.16b, v1.16b ;"
53 " b 2f ;"
54 "0: mov v2.16b, v1.16b ;"
55 " ld1 {v3.2d}, [%[key]], #16 ;"
56 "1: aese v0.16b, v2.16b ;"
57 " aesmc v0.16b, v0.16b ;"
58 "2: ld1 {v1.2d}, [%[key]], #16 ;"
59 " aese v0.16b, v3.16b ;"
60 " aesmc v0.16b, v0.16b ;"
61 "3: ld1 {v2.2d}, [%[key]], #16 ;"
62 " subs %w[rounds], %w[rounds], #3 ;"
63 " aese v0.16b, v1.16b ;"
64 " aesmc v0.16b, v0.16b ;"
65 " ld1 {v3.2d}, [%[key]], #16 ;"
66 " bpl 1b ;"
67 " aese v0.16b, v2.16b ;"
68 " eor v0.16b, v0.16b, v3.16b ;"
69 " st1 {v0.16b}, %[out] ;"
70
71 : [out] "=Q"(*out),
72 [key] "=r"(dummy0),
73 [rounds] "=r"(dummy1)
74 : [in] "Q"(*in),
75 "1"(ctx->key_enc),
76 "2"(num_rounds(ctx) - 2)
77 : "cc");
78
79 kernel_neon_end();
80}
81
82static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
83{
84 struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
85 struct aes_block *out = (struct aes_block *)dst;
86 struct aes_block const *in = (struct aes_block *)src;
87 void *dummy0;
88 int dummy1;
89
90 kernel_neon_begin_partial(4);
91
92 __asm__(" ld1 {v0.16b}, %[in] ;"
93 " ld1 {v1.2d}, [%[key]], #16 ;"
94 " cmp %w[rounds], #10 ;"
95 " bmi 0f ;"
96 " bne 3f ;"
97 " mov v3.16b, v1.16b ;"
98 " b 2f ;"
99 "0: mov v2.16b, v1.16b ;"
100 " ld1 {v3.2d}, [%[key]], #16 ;"
101 "1: aesd v0.16b, v2.16b ;"
102 " aesimc v0.16b, v0.16b ;"
103 "2: ld1 {v1.2d}, [%[key]], #16 ;"
104 " aesd v0.16b, v3.16b ;"
105 " aesimc v0.16b, v0.16b ;"
106 "3: ld1 {v2.2d}, [%[key]], #16 ;"
107 " subs %w[rounds], %w[rounds], #3 ;"
108 " aesd v0.16b, v1.16b ;"
109 " aesimc v0.16b, v0.16b ;"
110 " ld1 {v3.2d}, [%[key]], #16 ;"
111 " bpl 1b ;"
112 " aesd v0.16b, v2.16b ;"
113 " eor v0.16b, v0.16b, v3.16b ;"
114 " st1 {v0.16b}, %[out] ;"
115
116 : [out] "=Q"(*out),
117 [key] "=r"(dummy0),
118 [rounds] "=r"(dummy1)
119 : [in] "Q"(*in),
120 "1"(ctx->key_dec),
121 "2"(num_rounds(ctx) - 2)
122 : "cc");
123
124 kernel_neon_end();
125}
126
127static struct crypto_alg aes_alg = {
128 .cra_name = "aes",
129 .cra_driver_name = "aes-ce",
130 .cra_priority = 300,
131 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
132 .cra_blocksize = AES_BLOCK_SIZE,
133 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
134 .cra_module = THIS_MODULE,
135 .cra_cipher = {
136 .cia_min_keysize = AES_MIN_KEY_SIZE,
137 .cia_max_keysize = AES_MAX_KEY_SIZE,
138 .cia_setkey = crypto_aes_set_key,
139 .cia_encrypt = aes_cipher_encrypt,
140 .cia_decrypt = aes_cipher_decrypt
141 }
142};
143
144static int __init aes_mod_init(void)
145{
146 return crypto_register_alg(&aes_alg);
147}
148
149static void __exit aes_mod_exit(void)
150{
151 crypto_unregister_alg(&aes_alg);
152}
153
154module_cpu_feature_match(AES, aes_mod_init);
155module_exit(aes_mod_exit);
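
The single-block cipher above registers under the generic name "aes" with priority 300, so existing crypto API users pick it up transparently on CPUs that advertise the AES extension. A minimal in-kernel usage sketch (hypothetical caller, error handling trimmed, not part of the patch):

    #include <crypto/aes.h>
    #include <linux/crypto.h>
    #include <linux/err.h>

    static int demo_one_block(const u8 key[AES_KEYSIZE_128],
                              const u8 in[AES_BLOCK_SIZE],
                              u8 out[AES_BLOCK_SIZE])
    {
            /* resolves to the highest-priority "aes" provider, e.g. aes-ce */
            struct crypto_cipher *tfm = crypto_alloc_cipher("aes", 0, 0);
            int err;

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);
            err = crypto_cipher_setkey(tfm, key, AES_KEYSIZE_128);
            if (!err)
                    crypto_cipher_encrypt_one(tfm, out, in);  /* one 16-byte block */
            crypto_free_cipher(tfm);
            return err;
    }
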
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..685a18f731eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,133 @@
1/*
2 * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
3 * Crypto Extensions
4 *
5 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13
14#define AES_ENTRY(func) ENTRY(ce_ ## func)
15#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
16
17 .arch armv8-a+crypto
18
19 /* preload all round keys */
20 .macro load_round_keys, rounds, rk
21 cmp \rounds, #12
22 blo 2222f /* 128 bits */
23 beq 1111f /* 192 bits */
24 ld1 {v17.16b-v18.16b}, [\rk], #32
251111: ld1 {v19.16b-v20.16b}, [\rk], #32
262222: ld1 {v21.16b-v24.16b}, [\rk], #64
27 ld1 {v25.16b-v28.16b}, [\rk], #64
28 ld1 {v29.16b-v31.16b}, [\rk]
29 .endm
30
31 /* prepare for encryption with key in rk[] */
32 .macro enc_prepare, rounds, rk, ignore
33 load_round_keys \rounds, \rk
34 .endm
35
36 /* prepare for encryption (again) but with new key in rk[] */
37 .macro enc_switch_key, rounds, rk, ignore
38 load_round_keys \rounds, \rk
39 .endm
40
41 /* prepare for decryption with key in rk[] */
42 .macro dec_prepare, rounds, rk, ignore
43 load_round_keys \rounds, \rk
44 .endm
45
46 .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
47 aes\de \i0\().16b, \k\().16b
48 .ifnb \i1
49 aes\de \i1\().16b, \k\().16b
50 .ifnb \i3
51 aes\de \i2\().16b, \k\().16b
52 aes\de \i3\().16b, \k\().16b
53 .endif
54 .endif
55 aes\mc \i0\().16b, \i0\().16b
56 .ifnb \i1
57 aes\mc \i1\().16b, \i1\().16b
58 .ifnb \i3
59 aes\mc \i2\().16b, \i2\().16b
60 aes\mc \i3\().16b, \i3\().16b
61 .endif
62 .endif
63 .endm
64
65 /* up to 4 interleaved encryption rounds with the same round key */
66 .macro round_Nx, enc, k, i0, i1, i2, i3
67 .ifc \enc, e
68 do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
69 .else
70 do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
71 .endif
72 .endm
73
74 /* up to 4 interleaved final rounds */
75 .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
76 aes\de \i0\().16b, \k\().16b
77 .ifnb \i1
78 aes\de \i1\().16b, \k\().16b
79 .ifnb \i3
80 aes\de \i2\().16b, \k\().16b
81 aes\de \i3\().16b, \k\().16b
82 .endif
83 .endif
84 eor \i0\().16b, \i0\().16b, \k2\().16b
85 .ifnb \i1
86 eor \i1\().16b, \i1\().16b, \k2\().16b
87 .ifnb \i3
88 eor \i2\().16b, \i2\().16b, \k2\().16b
89 eor \i3\().16b, \i3\().16b, \k2\().16b
90 .endif
91 .endif
92 .endm
93
94 /* up to 4 interleaved blocks */
95 .macro do_block_Nx, enc, rounds, i0, i1, i2, i3
96 cmp \rounds, #12
97 blo 2222f /* 128 bits */
98 beq 1111f /* 192 bits */
99 round_Nx \enc, v17, \i0, \i1, \i2, \i3
100 round_Nx \enc, v18, \i0, \i1, \i2, \i3
1011111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
102 round_Nx \enc, v20, \i0, \i1, \i2, \i3
1032222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
104 round_Nx \enc, \key, \i0, \i1, \i2, \i3
105 .endr
106 fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
107 .endm
108
109 .macro encrypt_block, in, rounds, t0, t1, t2
110 do_block_Nx e, \rounds, \in
111 .endm
112
113 .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
114 do_block_Nx e, \rounds, \i0, \i1
115 .endm
116
117 .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
118 do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
119 .endm
120
121 .macro decrypt_block, in, rounds, t0, t1, t2
122 do_block_Nx d, \rounds, \in
123 .endm
124
125 .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
126 do_block_Nx d, \rounds, \i0, \i1
127 .endm
128
129 .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
130 do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
131 .endm
132
133#include "aes-modes.S"
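
The load_round_keys macro above always leaves the round keys in v17-v31 counting back from the last one, so the encrypt/decrypt macros can be key-size agnostic; the branch on 'cmp rounds, #12' only decides where loading starts. A small C model of that decision (a sketch for illustration, not part of the patch):

    /* which register the first round key lands in, per key size */
    static int first_round_key_reg(int rounds)
    {
            if (rounds < 12)        /* AES-128: 11 round keys in v21..v31 */
                    return 21;
            if (rounds == 12)       /* AES-192: 13 round keys in v19..v31 */
                    return 19;
            return 17;              /* AES-256: 15 round keys in v17..v31 */
    }
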
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@
1/*
2 * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/hwcap.h>
13#include <crypto/aes.h>
14#include <crypto/ablk_helper.h>
15#include <crypto/algapi.h>
16#include <linux/module.h>
17#include <linux/cpufeature.h>
18
19#ifdef USE_V8_CRYPTO_EXTENSIONS
20#define MODE "ce"
21#define PRIO 300
22#define aes_ecb_encrypt ce_aes_ecb_encrypt
23#define aes_ecb_decrypt ce_aes_ecb_decrypt
24#define aes_cbc_encrypt ce_aes_cbc_encrypt
25#define aes_cbc_decrypt ce_aes_cbc_decrypt
26#define aes_ctr_encrypt ce_aes_ctr_encrypt
27#define aes_xts_encrypt ce_aes_xts_encrypt
28#define aes_xts_decrypt ce_aes_xts_decrypt
29MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
30#else
31#define MODE "neon"
32#define PRIO 200
33#define aes_ecb_encrypt neon_aes_ecb_encrypt
34#define aes_ecb_decrypt neon_aes_ecb_decrypt
35#define aes_cbc_encrypt neon_aes_cbc_encrypt
36#define aes_cbc_decrypt neon_aes_cbc_decrypt
37#define aes_ctr_encrypt neon_aes_ctr_encrypt
38#define aes_xts_encrypt neon_aes_xts_encrypt
39#define aes_xts_decrypt neon_aes_xts_decrypt
40MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
41MODULE_ALIAS("ecb(aes)");
42MODULE_ALIAS("cbc(aes)");
43MODULE_ALIAS("ctr(aes)");
44MODULE_ALIAS("xts(aes)");
45#endif
46
47MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
48MODULE_LICENSE("GPL v2");
49
50/* defined in aes-modes.S */
51asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
52 int rounds, int blocks, int first);
53asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
54 int rounds, int blocks, int first);
55
56asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
57 int rounds, int blocks, u8 iv[], int first);
58asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
59 int rounds, int blocks, u8 iv[], int first);
60
61asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
62 int rounds, int blocks, u8 ctr[], int first);
63
64asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
65 int rounds, int blocks, u8 const rk2[], u8 iv[],
66 int first);
67asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
68 int rounds, int blocks, u8 const rk2[], u8 iv[],
69 int first);
70
71struct crypto_aes_xts_ctx {
72 struct crypto_aes_ctx key1;
73 struct crypto_aes_ctx __aligned(8) key2;
74};
75
76static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
77 unsigned int key_len)
78{
79 struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
80 int ret;
81
82 ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
83 if (!ret)
84 ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
85 key_len / 2);
86 if (!ret)
87 return 0;
88
89 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
90 return -EINVAL;
91}
92
93static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
94 struct scatterlist *src, unsigned int nbytes)
95{
96 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
97 int err, first, rounds = 6 + ctx->key_length / 4;
98 struct blkcipher_walk walk;
99 unsigned int blocks;
100
101 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
102 blkcipher_walk_init(&walk, dst, src, nbytes);
103 err = blkcipher_walk_virt(desc, &walk);
104
105 kernel_neon_begin();
106 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
107 aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
108 (u8 *)ctx->key_enc, rounds, blocks, first);
109 err = blkcipher_walk_done(desc, &walk, 0);
110 }
111 kernel_neon_end();
112 return err;
113}
114
115static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
116 struct scatterlist *src, unsigned int nbytes)
117{
118 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
119 int err, first, rounds = 6 + ctx->key_length / 4;
120 struct blkcipher_walk walk;
121 unsigned int blocks;
122
123 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
124 blkcipher_walk_init(&walk, dst, src, nbytes);
125 err = blkcipher_walk_virt(desc, &walk);
126
127 kernel_neon_begin();
128 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
129 aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
130 (u8 *)ctx->key_dec, rounds, blocks, first);
131 err = blkcipher_walk_done(desc, &walk, 0);
132 }
133 kernel_neon_end();
134 return err;
135}
136
137static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
138 struct scatterlist *src, unsigned int nbytes)
139{
140 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
141 int err, first, rounds = 6 + ctx->key_length / 4;
142 struct blkcipher_walk walk;
143 unsigned int blocks;
144
145 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
146 blkcipher_walk_init(&walk, dst, src, nbytes);
147 err = blkcipher_walk_virt(desc, &walk);
148
149 kernel_neon_begin();
150 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
151 aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
152 (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
153 first);
154 err = blkcipher_walk_done(desc, &walk, 0);
155 }
156 kernel_neon_end();
157 return err;
158}
159
160static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
161 struct scatterlist *src, unsigned int nbytes)
162{
163 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
164 int err, first, rounds = 6 + ctx->key_length / 4;
165 struct blkcipher_walk walk;
166 unsigned int blocks;
167
168 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
169 blkcipher_walk_init(&walk, dst, src, nbytes);
170 err = blkcipher_walk_virt(desc, &walk);
171
172 kernel_neon_begin();
173 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
174 aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
175 (u8 *)ctx->key_dec, rounds, blocks, walk.iv,
176 first);
177 err = blkcipher_walk_done(desc, &walk, 0);
178 }
179 kernel_neon_end();
180 return err;
181}
182
183static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
184 struct scatterlist *src, unsigned int nbytes)
185{
186 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
187 int err, first, rounds = 6 + ctx->key_length / 4;
188 struct blkcipher_walk walk;
189 int blocks;
190
191 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
192 blkcipher_walk_init(&walk, dst, src, nbytes);
193 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
194
195 first = 1;
196 kernel_neon_begin();
197 while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
198 aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
199 (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
200 first);
201 first = 0;
202 nbytes -= blocks * AES_BLOCK_SIZE;
203 if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
204 break;
205 err = blkcipher_walk_done(desc, &walk,
206 walk.nbytes % AES_BLOCK_SIZE);
207 }
208 if (nbytes) {
209 u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
210 u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
211 u8 __aligned(8) tail[AES_BLOCK_SIZE];
212
213 /*
214 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
215 * to tell aes_ctr_encrypt() to only read half a block.
216 */
217 blocks = (nbytes <= 8) ? -1 : 1;
218
219 aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
220 blocks, walk.iv, first);
221 memcpy(tdst, tail, nbytes);
222 err = blkcipher_walk_done(desc, &walk, 0);
223 }
224 kernel_neon_end();
225
226 return err;
227}
228
229static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
230 struct scatterlist *src, unsigned int nbytes)
231{
232 struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
233 int err, first, rounds = 6 + ctx->key1.key_length / 4;
234 struct blkcipher_walk walk;
235 unsigned int blocks;
236
237 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
238 blkcipher_walk_init(&walk, dst, src, nbytes);
239 err = blkcipher_walk_virt(desc, &walk);
240
241 kernel_neon_begin();
242 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
243 aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
244 (u8 *)ctx->key1.key_enc, rounds, blocks,
245 (u8 *)ctx->key2.key_enc, walk.iv, first);
246 err = blkcipher_walk_done(desc, &walk, 0);
247 }
248 kernel_neon_end();
249
250 return err;
251}
252
253static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
254 struct scatterlist *src, unsigned int nbytes)
255{
256 struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
257 int err, first, rounds = 6 + ctx->key1.key_length / 4;
258 struct blkcipher_walk walk;
259 unsigned int blocks;
260
261 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
262 blkcipher_walk_init(&walk, dst, src, nbytes);
263 err = blkcipher_walk_virt(desc, &walk);
264
265 kernel_neon_begin();
266 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
267 aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
268 (u8 *)ctx->key1.key_dec, rounds, blocks,
269 (u8 *)ctx->key2.key_enc, walk.iv, first);
270 err = blkcipher_walk_done(desc, &walk, 0);
271 }
272 kernel_neon_end();
273
274 return err;
275}
276
277static struct crypto_alg aes_algs[] = { {
278 .cra_name = "__ecb-aes-" MODE,
279 .cra_driver_name = "__driver-ecb-aes-" MODE,
280 .cra_priority = 0,
281 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
282 .cra_blocksize = AES_BLOCK_SIZE,
283 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
284 .cra_alignmask = 7,
285 .cra_type = &crypto_blkcipher_type,
286 .cra_module = THIS_MODULE,
287 .cra_blkcipher = {
288 .min_keysize = AES_MIN_KEY_SIZE,
289 .max_keysize = AES_MAX_KEY_SIZE,
290 .ivsize = AES_BLOCK_SIZE,
291 .setkey = crypto_aes_set_key,
292 .encrypt = ecb_encrypt,
293 .decrypt = ecb_decrypt,
294 },
295}, {
296 .cra_name = "__cbc-aes-" MODE,
297 .cra_driver_name = "__driver-cbc-aes-" MODE,
298 .cra_priority = 0,
299 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
300 .cra_blocksize = AES_BLOCK_SIZE,
301 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
302 .cra_alignmask = 7,
303 .cra_type = &crypto_blkcipher_type,
304 .cra_module = THIS_MODULE,
305 .cra_blkcipher = {
306 .min_keysize = AES_MIN_KEY_SIZE,
307 .max_keysize = AES_MAX_KEY_SIZE,
308 .ivsize = AES_BLOCK_SIZE,
309 .setkey = crypto_aes_set_key,
310 .encrypt = cbc_encrypt,
311 .decrypt = cbc_decrypt,
312 },
313}, {
314 .cra_name = "__ctr-aes-" MODE,
315 .cra_driver_name = "__driver-ctr-aes-" MODE,
316 .cra_priority = 0,
317 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
318 .cra_blocksize = 1,
319 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
320 .cra_alignmask = 7,
321 .cra_type = &crypto_blkcipher_type,
322 .cra_module = THIS_MODULE,
323 .cra_blkcipher = {
324 .min_keysize = AES_MIN_KEY_SIZE,
325 .max_keysize = AES_MAX_KEY_SIZE,
326 .ivsize = AES_BLOCK_SIZE,
327 .setkey = crypto_aes_set_key,
328 .encrypt = ctr_encrypt,
329 .decrypt = ctr_encrypt,
330 },
331}, {
332 .cra_name = "__xts-aes-" MODE,
333 .cra_driver_name = "__driver-xts-aes-" MODE,
334 .cra_priority = 0,
335 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
336 .cra_blocksize = AES_BLOCK_SIZE,
337 .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
338 .cra_alignmask = 7,
339 .cra_type = &crypto_blkcipher_type,
340 .cra_module = THIS_MODULE,
341 .cra_blkcipher = {
342 .min_keysize = 2 * AES_MIN_KEY_SIZE,
343 .max_keysize = 2 * AES_MAX_KEY_SIZE,
344 .ivsize = AES_BLOCK_SIZE,
345 .setkey = xts_set_key,
346 .encrypt = xts_encrypt,
347 .decrypt = xts_decrypt,
348 },
349}, {
350 .cra_name = "ecb(aes)",
351 .cra_driver_name = "ecb-aes-" MODE,
352 .cra_priority = PRIO,
353 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
354 .cra_blocksize = AES_BLOCK_SIZE,
355 .cra_ctxsize = sizeof(struct async_helper_ctx),
356 .cra_alignmask = 7,
357 .cra_type = &crypto_ablkcipher_type,
358 .cra_module = THIS_MODULE,
359 .cra_init = ablk_init,
360 .cra_exit = ablk_exit,
361 .cra_ablkcipher = {
362 .min_keysize = AES_MIN_KEY_SIZE,
363 .max_keysize = AES_MAX_KEY_SIZE,
364 .ivsize = AES_BLOCK_SIZE,
365 .setkey = ablk_set_key,
366 .encrypt = ablk_encrypt,
367 .decrypt = ablk_decrypt,
368 }
369}, {
370 .cra_name = "cbc(aes)",
371 .cra_driver_name = "cbc-aes-" MODE,
372 .cra_priority = PRIO,
373 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
374 .cra_blocksize = AES_BLOCK_SIZE,
375 .cra_ctxsize = sizeof(struct async_helper_ctx),
376 .cra_alignmask = 7,
377 .cra_type = &crypto_ablkcipher_type,
378 .cra_module = THIS_MODULE,
379 .cra_init = ablk_init,
380 .cra_exit = ablk_exit,
381 .cra_ablkcipher = {
382 .min_keysize = AES_MIN_KEY_SIZE,
383 .max_keysize = AES_MAX_KEY_SIZE,
384 .ivsize = AES_BLOCK_SIZE,
385 .setkey = ablk_set_key,
386 .encrypt = ablk_encrypt,
387 .decrypt = ablk_decrypt,
388 }
389}, {
390 .cra_name = "ctr(aes)",
391 .cra_driver_name = "ctr-aes-" MODE,
392 .cra_priority = PRIO,
393 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
394 .cra_blocksize = 1,
395 .cra_ctxsize = sizeof(struct async_helper_ctx),
396 .cra_alignmask = 7,
397 .cra_type = &crypto_ablkcipher_type,
398 .cra_module = THIS_MODULE,
399 .cra_init = ablk_init,
400 .cra_exit = ablk_exit,
401 .cra_ablkcipher = {
402 .min_keysize = AES_MIN_KEY_SIZE,
403 .max_keysize = AES_MAX_KEY_SIZE,
404 .ivsize = AES_BLOCK_SIZE,
405 .setkey = ablk_set_key,
406 .encrypt = ablk_encrypt,
407 .decrypt = ablk_decrypt,
408 }
409}, {
410 .cra_name = "xts(aes)",
411 .cra_driver_name = "xts-aes-" MODE,
412 .cra_priority = PRIO,
413 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
414 .cra_blocksize = AES_BLOCK_SIZE,
415 .cra_ctxsize = sizeof(struct async_helper_ctx),
416 .cra_alignmask = 7,
417 .cra_type = &crypto_ablkcipher_type,
418 .cra_module = THIS_MODULE,
419 .cra_init = ablk_init,
420 .cra_exit = ablk_exit,
421 .cra_ablkcipher = {
422 .min_keysize = 2 * AES_MIN_KEY_SIZE,
423 .max_keysize = 2 * AES_MAX_KEY_SIZE,
424 .ivsize = AES_BLOCK_SIZE,
425 .setkey = ablk_set_key,
426 .encrypt = ablk_encrypt,
427 .decrypt = ablk_decrypt,
428 }
429} };
430
431static int __init aes_init(void)
432{
433 return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
434}
435
436static void __exit aes_exit(void)
437{
438 crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
439}
440
441#ifdef USE_V8_CRYPTO_EXTENSIONS
442module_cpu_feature_match(AES, aes_init);
443#else
444module_init(aes_init);
445#endif
446module_exit(aes_exit);
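
For the CTR tail handling in ctr_encrypt() above, a worked example: the final chunk of a request that is not a whole number of 16-byte blocks is encrypted into an aligned stack buffer, and the 'blocks' argument is set to -1 when 8 or fewer input bytes remain so the assembly only loads half a block of source and never reads past the end of the input. A standalone sketch of that arithmetic (the request size is an assumption for illustration):

    #include <stdio.h>

    #define AES_BLOCK_SIZE 16

    int main(void)
    {
            unsigned int nbytes = 100;                     /* example request size */
            unsigned int full   = nbytes / AES_BLOCK_SIZE; /* 6 full blocks */
            unsigned int tail   = nbytes % AES_BLOCK_SIZE; /* 4 trailing bytes */
            int blocks_arg      = (tail <= 8) ? -1 : 1;    /* -1: load only 8 src bytes */

            printf("full=%u tail=%u blocks=%d\n", full, tail, blocks_arg);
            return 0;
    }
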
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
new file mode 100644
index 000000000000..f6e372c528eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-modes.S
@@ -0,0 +1,532 @@
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13 .text
14 .align 4
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare - setup NEON registers for encryption
26 * - dec_prepare - setup NEON registers for decryption
27 * - enc_switch_key - change to new key after having prepared for encryption
28 * - encrypt_block - encrypt a single block
29 * - decrypt block - decrypt a single block
30 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
36#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
37#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
38#define FRAME_POP ldp x29, x30, [sp],#16
39
40#if INTERLEAVE == 2
41
42aes_encrypt_block2x:
43 encrypt_block2x v0, v1, w3, x2, x6, w7
44 ret
45ENDPROC(aes_encrypt_block2x)
46
47aes_decrypt_block2x:
48 decrypt_block2x v0, v1, w3, x2, x6, w7
49 ret
50ENDPROC(aes_decrypt_block2x)
51
52#elif INTERLEAVE == 4
53
54aes_encrypt_block4x:
55 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
56 ret
57ENDPROC(aes_encrypt_block4x)
58
59aes_decrypt_block4x:
60 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
61 ret
62ENDPROC(aes_decrypt_block4x)
63
64#else
65#error INTERLEAVE should equal 2 or 4
66#endif
67
68 .macro do_encrypt_block2x
69 bl aes_encrypt_block2x
70 .endm
71
72 .macro do_decrypt_block2x
73 bl aes_decrypt_block2x
74 .endm
75
76 .macro do_encrypt_block4x
77 bl aes_encrypt_block4x
78 .endm
79
80 .macro do_decrypt_block4x
81 bl aes_decrypt_block4x
82 .endm
83
84#else
85#define FRAME_PUSH
86#define FRAME_POP
87
88 .macro do_encrypt_block2x
89 encrypt_block2x v0, v1, w3, x2, x6, w7
90 .endm
91
92 .macro do_decrypt_block2x
93 decrypt_block2x v0, v1, w3, x2, x6, w7
94 .endm
95
96 .macro do_encrypt_block4x
97 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
98 .endm
99
100 .macro do_decrypt_block4x
101 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
102 .endm
103
104#endif
105
106 /*
107 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108 * int blocks, int first)
109 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110 * int blocks, int first)
111 */
112
113AES_ENTRY(aes_ecb_encrypt)
114 FRAME_PUSH
115 cbz w5, .LecbencloopNx
116
117 enc_prepare w3, x2, x5
118
119.LecbencloopNx:
120#if INTERLEAVE >= 2
121 subs w4, w4, #INTERLEAVE
122 bmi .Lecbenc1x
123#if INTERLEAVE == 2
124 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
125 do_encrypt_block2x
126 st1 {v0.16b-v1.16b}, [x0], #32
127#else
128 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
129 do_encrypt_block4x
130 st1 {v0.16b-v3.16b}, [x0], #64
131#endif
132 b .LecbencloopNx
133.Lecbenc1x:
134 adds w4, w4, #INTERLEAVE
135 beq .Lecbencout
136#endif
137.Lecbencloop:
138 ld1 {v0.16b}, [x1], #16 /* get next pt block */
139 encrypt_block v0, w3, x2, x5, w6
140 st1 {v0.16b}, [x0], #16
141 subs w4, w4, #1
142 bne .Lecbencloop
143.Lecbencout:
144 FRAME_POP
145 ret
146AES_ENDPROC(aes_ecb_encrypt)
147
148
149AES_ENTRY(aes_ecb_decrypt)
150 FRAME_PUSH
151 cbz w5, .LecbdecloopNx
152
153 dec_prepare w3, x2, x5
154
155.LecbdecloopNx:
156#if INTERLEAVE >= 2
157 subs w4, w4, #INTERLEAVE
158 bmi .Lecbdec1x
159#if INTERLEAVE == 2
160 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
161 do_decrypt_block2x
162 st1 {v0.16b-v1.16b}, [x0], #32
163#else
164 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
165 do_decrypt_block4x
166 st1 {v0.16b-v3.16b}, [x0], #64
167#endif
168 b .LecbdecloopNx
169.Lecbdec1x:
170 adds w4, w4, #INTERLEAVE
171 beq .Lecbdecout
172#endif
173.Lecbdecloop:
174 ld1 {v0.16b}, [x1], #16 /* get next ct block */
175 decrypt_block v0, w3, x2, x5, w6
176 st1 {v0.16b}, [x0], #16
177 subs w4, w4, #1
178 bne .Lecbdecloop
179.Lecbdecout:
180 FRAME_POP
181 ret
182AES_ENDPROC(aes_ecb_decrypt)
183
184
185 /*
186 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187 * int blocks, u8 iv[], int first)
188 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189 * int blocks, u8 iv[], int first)
190 */
191
192AES_ENTRY(aes_cbc_encrypt)
193 cbz w6, .Lcbcencloop
194
195 ld1 {v0.16b}, [x5] /* get iv */
196 enc_prepare w3, x2, x5
197
198.Lcbcencloop:
199 ld1 {v1.16b}, [x1], #16 /* get next pt block */
200 eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
201 encrypt_block v0, w3, x2, x5, w6
202 st1 {v0.16b}, [x0], #16
203 subs w4, w4, #1
204 bne .Lcbcencloop
205 ret
206AES_ENDPROC(aes_cbc_encrypt)
207
208
209AES_ENTRY(aes_cbc_decrypt)
210 FRAME_PUSH
211 cbz w6, .LcbcdecloopNx
212
213 ld1 {v7.16b}, [x5] /* get iv */
214 dec_prepare w3, x2, x5
215
216.LcbcdecloopNx:
217#if INTERLEAVE >= 2
218 subs w4, w4, #INTERLEAVE
219 bmi .Lcbcdec1x
220#if INTERLEAVE == 2
221 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
222 mov v2.16b, v0.16b
223 mov v3.16b, v1.16b
224 do_decrypt_block2x
225 eor v0.16b, v0.16b, v7.16b
226 eor v1.16b, v1.16b, v2.16b
227 mov v7.16b, v3.16b
228 st1 {v0.16b-v1.16b}, [x0], #32
229#else
230 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
231 mov v4.16b, v0.16b
232 mov v5.16b, v1.16b
233 mov v6.16b, v2.16b
234 do_decrypt_block4x
235 sub x1, x1, #16
236 eor v0.16b, v0.16b, v7.16b
237 eor v1.16b, v1.16b, v4.16b
238 ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
239 eor v2.16b, v2.16b, v5.16b
240 eor v3.16b, v3.16b, v6.16b
241 st1 {v0.16b-v3.16b}, [x0], #64
242#endif
243 b .LcbcdecloopNx
244.Lcbcdec1x:
245 adds w4, w4, #INTERLEAVE
246 beq .Lcbcdecout
247#endif
248.Lcbcdecloop:
249 ld1 {v1.16b}, [x1], #16 /* get next ct block */
250 mov v0.16b, v1.16b /* ...and copy to v0 */
251 decrypt_block v0, w3, x2, x5, w6
252 eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
253 mov v7.16b, v1.16b /* ct is next iv */
254 st1 {v0.16b}, [x0], #16
255 subs w4, w4, #1
256 bne .Lcbcdecloop
257.Lcbcdecout:
258 FRAME_POP
259 ret
260AES_ENDPROC(aes_cbc_decrypt)
261
262
263 /*
264 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265 * int blocks, u8 ctr[], int first)
266 */
267
268AES_ENTRY(aes_ctr_encrypt)
269 FRAME_PUSH
270 cbnz w6, .Lctrfirst /* 1st time around? */
271 umov x5, v4.d[1] /* keep swabbed ctr in reg */
272 rev x5, x5
273#if INTERLEAVE >= 2
274 cmn w5, w4 /* 32 bit overflow? */
275 bcs .Lctrinc
276 add x5, x5, #1 /* increment BE ctr */
277 b .LctrincNx
278#else
279 b .Lctrinc
280#endif
281.Lctrfirst:
282 enc_prepare w3, x2, x6
283 ld1 {v4.16b}, [x5]
284 umov x5, v4.d[1] /* keep swabbed ctr in reg */
285 rev x5, x5
286#if INTERLEAVE >= 2
287 cmn w5, w4 /* 32 bit overflow? */
288 bcs .Lctrloop
289.LctrloopNx:
290 subs w4, w4, #INTERLEAVE
291 bmi .Lctr1x
292#if INTERLEAVE == 2
293 mov v0.8b, v4.8b
294 mov v1.8b, v4.8b
295 rev x7, x5
296 add x5, x5, #1
297 ins v0.d[1], x7
298 rev x7, x5
299 add x5, x5, #1
300 ins v1.d[1], x7
301 ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
302 do_encrypt_block2x
303 eor v0.16b, v0.16b, v2.16b
304 eor v1.16b, v1.16b, v3.16b
305 st1 {v0.16b-v1.16b}, [x0], #32
306#else
307 ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
308 dup v7.4s, w5
309 mov v0.16b, v4.16b
310 add v7.4s, v7.4s, v8.4s
311 mov v1.16b, v4.16b
312 rev32 v8.16b, v7.16b
313 mov v2.16b, v4.16b
314 mov v3.16b, v4.16b
315 mov v1.s[3], v8.s[0]
316 mov v2.s[3], v8.s[1]
317 mov v3.s[3], v8.s[2]
318 ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
319 do_encrypt_block4x
320 eor v0.16b, v5.16b, v0.16b
321 ld1 {v5.16b}, [x1], #16 /* get 1 input block */
322 eor v1.16b, v6.16b, v1.16b
323 eor v2.16b, v7.16b, v2.16b
324 eor v3.16b, v5.16b, v3.16b
325 st1 {v0.16b-v3.16b}, [x0], #64
326 add x5, x5, #INTERLEAVE
327#endif
328 cbz w4, .LctroutNx
329.LctrincNx:
330 rev x7, x5
331 ins v4.d[1], x7
332 b .LctrloopNx
333.LctroutNx:
334 sub x5, x5, #1
335 rev x7, x5
336 ins v4.d[1], x7
337 b .Lctrout
338.Lctr1x:
339 adds w4, w4, #INTERLEAVE
340 beq .Lctrout
341#endif
342.Lctrloop:
343 mov v0.16b, v4.16b
344 encrypt_block v0, w3, x2, x6, w7
345 subs w4, w4, #1
346 bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
347 ld1 {v3.16b}, [x1], #16
348 eor v3.16b, v0.16b, v3.16b
349 st1 {v3.16b}, [x0], #16
350 beq .Lctrout
351.Lctrinc:
352 adds x5, x5, #1 /* increment BE ctr */
353 rev x7, x5
354 ins v4.d[1], x7
355 bcc .Lctrloop /* no overflow? */
356 umov x7, v4.d[0] /* load upper word of ctr */
357 rev x7, x7 /* ... to handle the carry */
358 add x7, x7, #1
359 rev x7, x7
360 ins v4.d[0], x7
361 b .Lctrloop
362.Lctrhalfblock:
363 ld1 {v3.8b}, [x1]
364 eor v3.8b, v0.8b, v3.8b
365 st1 {v3.8b}, [x0]
366.Lctrout:
367 FRAME_POP
368 ret
369AES_ENDPROC(aes_ctr_encrypt)
370 .ltorg
371
372
373 /*
 374 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
375 * int blocks, u8 const rk2[], u8 iv[], int first)
376 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
377 * int blocks, u8 const rk2[], u8 iv[], int first)
378 */
379
380 .macro next_tweak, out, in, const, tmp
381 sshr \tmp\().2d, \in\().2d, #63
382 and \tmp\().16b, \tmp\().16b, \const\().16b
383 add \out\().2d, \in\().2d, \in\().2d
384 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
385 eor \out\().16b, \out\().16b, \tmp\().16b
386 .endm
387
388.Lxts_mul_x:
389 .word 1, 0, 0x87, 0
390
391AES_ENTRY(aes_xts_encrypt)
392 FRAME_PUSH
393 cbz w7, .LxtsencloopNx
394
395 ld1 {v4.16b}, [x6]
396 enc_prepare w3, x5, x6
397 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
398 enc_switch_key w3, x2, x6
399 ldr q7, .Lxts_mul_x
400 b .LxtsencNx
401
402.LxtsencloopNx:
403 ldr q7, .Lxts_mul_x
404 next_tweak v4, v4, v7, v8
405.LxtsencNx:
406#if INTERLEAVE >= 2
407 subs w4, w4, #INTERLEAVE
408 bmi .Lxtsenc1x
409#if INTERLEAVE == 2
410 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
411 next_tweak v5, v4, v7, v8
412 eor v0.16b, v0.16b, v4.16b
413 eor v1.16b, v1.16b, v5.16b
414 do_encrypt_block2x
415 eor v0.16b, v0.16b, v4.16b
416 eor v1.16b, v1.16b, v5.16b
417 st1 {v0.16b-v1.16b}, [x0], #32
418 cbz w4, .LxtsencoutNx
419 next_tweak v4, v5, v7, v8
420 b .LxtsencNx
421.LxtsencoutNx:
422 mov v4.16b, v5.16b
423 b .Lxtsencout
424#else
425 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
426 next_tweak v5, v4, v7, v8
427 eor v0.16b, v0.16b, v4.16b
428 next_tweak v6, v5, v7, v8
429 eor v1.16b, v1.16b, v5.16b
430 eor v2.16b, v2.16b, v6.16b
431 next_tweak v7, v6, v7, v8
432 eor v3.16b, v3.16b, v7.16b
433 do_encrypt_block4x
434 eor v3.16b, v3.16b, v7.16b
435 eor v0.16b, v0.16b, v4.16b
436 eor v1.16b, v1.16b, v5.16b
437 eor v2.16b, v2.16b, v6.16b
438 st1 {v0.16b-v3.16b}, [x0], #64
439 mov v4.16b, v7.16b
440 cbz w4, .Lxtsencout
441 b .LxtsencloopNx
442#endif
443.Lxtsenc1x:
444 adds w4, w4, #INTERLEAVE
445 beq .Lxtsencout
446#endif
447.Lxtsencloop:
448 ld1 {v1.16b}, [x1], #16
449 eor v0.16b, v1.16b, v4.16b
450 encrypt_block v0, w3, x2, x6, w7
451 eor v0.16b, v0.16b, v4.16b
452 st1 {v0.16b}, [x0], #16
453 subs w4, w4, #1
454 beq .Lxtsencout
455 next_tweak v4, v4, v7, v8
456 b .Lxtsencloop
457.Lxtsencout:
458 FRAME_POP
459 ret
460AES_ENDPROC(aes_xts_encrypt)
461
462
463AES_ENTRY(aes_xts_decrypt)
464 FRAME_PUSH
465 cbz w7, .LxtsdecloopNx
466
467 ld1 {v4.16b}, [x6]
468 enc_prepare w3, x5, x6
469 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
470 dec_prepare w3, x2, x6
471 ldr q7, .Lxts_mul_x
472 b .LxtsdecNx
473
474.LxtsdecloopNx:
475 ldr q7, .Lxts_mul_x
476 next_tweak v4, v4, v7, v8
477.LxtsdecNx:
478#if INTERLEAVE >= 2
479 subs w4, w4, #INTERLEAVE
480 bmi .Lxtsdec1x
481#if INTERLEAVE == 2
482 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
483 next_tweak v5, v4, v7, v8
484 eor v0.16b, v0.16b, v4.16b
485 eor v1.16b, v1.16b, v5.16b
486 do_decrypt_block2x
487 eor v0.16b, v0.16b, v4.16b
488 eor v1.16b, v1.16b, v5.16b
489 st1 {v0.16b-v1.16b}, [x0], #32
490 cbz w4, .LxtsdecoutNx
491 next_tweak v4, v5, v7, v8
492 b .LxtsdecNx
493.LxtsdecoutNx:
494 mov v4.16b, v5.16b
495 b .Lxtsdecout
496#else
497 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
498 next_tweak v5, v4, v7, v8
499 eor v0.16b, v0.16b, v4.16b
500 next_tweak v6, v5, v7, v8
501 eor v1.16b, v1.16b, v5.16b
502 eor v2.16b, v2.16b, v6.16b
503 next_tweak v7, v6, v7, v8
504 eor v3.16b, v3.16b, v7.16b
505 do_decrypt_block4x
506 eor v3.16b, v3.16b, v7.16b
507 eor v0.16b, v0.16b, v4.16b
508 eor v1.16b, v1.16b, v5.16b
509 eor v2.16b, v2.16b, v6.16b
510 st1 {v0.16b-v3.16b}, [x0], #64
511 mov v4.16b, v7.16b
512 cbz w4, .Lxtsdecout
513 b .LxtsdecloopNx
514#endif
515.Lxtsdec1x:
516 adds w4, w4, #INTERLEAVE
517 beq .Lxtsdecout
518#endif
519.Lxtsdecloop:
520 ld1 {v1.16b}, [x1], #16
521 eor v0.16b, v1.16b, v4.16b
522 decrypt_block v0, w3, x2, x6, w7
523 eor v0.16b, v0.16b, v4.16b
524 st1 {v0.16b}, [x0], #16
525 subs w4, w4, #1
526 beq .Lxtsdecout
527 next_tweak v4, v4, v7, v8
528 b .Lxtsdecloop
529.Lxtsdecout:
530 FRAME_POP
531 ret
532AES_ENDPROC(aes_xts_decrypt)
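The next_tweak macro and the .Lxts_mul_x constant used by both XTS routines above implement doubling of the 128-bit tweak in GF(2^128), reducing with the polynomial x^128 + x^7 + x^2 + x + 1 (0x87). A scalar C sketch of the same doubling, for reference only (the tweak is viewed as two little-endian 64-bit halves, exactly as the NEON lanes hold it):

#include <stdint.h>

/* t[0] = low 64 bits of the tweak, t[1] = high 64 bits */
static void xts_double(uint64_t t[2])
{
        uint64_t msb_hi = t[1] >> 63;          /* sshr #63 on the high lane  */
        uint64_t msb_lo = t[0] >> 63;          /* sshr #63 on the low lane   */

        t[1] = (t[1] << 1) | msb_lo;           /* add .2d + eor (carry bit)  */
        t[0] = (t[0] << 1) ^ (msb_hi ? 0x87 : 0); /* reduction via .Lxts_mul_x */
}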
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
new file mode 100644
index 000000000000..b93170e1cc93
--- /dev/null
+++ b/arch/arm64/crypto/aes-neon.S
@@ -0,0 +1,382 @@
1/*
2 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12
13#define AES_ENTRY(func) ENTRY(neon_ ## func)
14#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
15
16 /* multiply by polynomial 'x' in GF(2^8) */
17 .macro mul_by_x, out, in, temp, const
18 sshr \temp, \in, #7
19 add \out, \in, \in
20 and \temp, \temp, \const
21 eor \out, \out, \temp
22 .endm
23
24 /* preload the entire Sbox */
25 .macro prepare, sbox, shiftrows, temp
26 adr \temp, \sbox
27 movi v12.16b, #0x40
28 ldr q13, \shiftrows
29 movi v14.16b, #0x1b
30 ld1 {v16.16b-v19.16b}, [\temp], #64
31 ld1 {v20.16b-v23.16b}, [\temp], #64
32 ld1 {v24.16b-v27.16b}, [\temp], #64
33 ld1 {v28.16b-v31.16b}, [\temp]
34 .endm
35
36 /* do preload for encryption */
37 .macro enc_prepare, ignore0, ignore1, temp
38 prepare .LForward_Sbox, .LForward_ShiftRows, \temp
39 .endm
40
41 .macro enc_switch_key, ignore0, ignore1, temp
42 /* do nothing */
43 .endm
44
45 /* do preload for decryption */
46 .macro dec_prepare, ignore0, ignore1, temp
47 prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
48 .endm
49
 50 /* apply SubBytes transformation using the preloaded Sbox */
51 .macro sub_bytes, in
52 sub v9.16b, \in\().16b, v12.16b
53 tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
54 sub v10.16b, v9.16b, v12.16b
55 tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
56 sub v11.16b, v10.16b, v12.16b
57 tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
58 tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
59 .endm
60
61 /* apply MixColumns transformation */
62 .macro mix_columns, in
63 mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
64 rev32 v8.8h, \in\().8h
65 eor \in\().16b, v10.16b, \in\().16b
66 shl v9.4s, v8.4s, #24
67 shl v11.4s, \in\().4s, #24
68 sri v9.4s, v8.4s, #8
69 sri v11.4s, \in\().4s, #8
70 eor v9.16b, v9.16b, v8.16b
71 eor v10.16b, v10.16b, v9.16b
72 eor \in\().16b, v10.16b, v11.16b
73 .endm
74
75 /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
76 .macro inv_mix_columns, in
77 mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
78 mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
79 eor \in\().16b, \in\().16b, v11.16b
80 rev32 v11.8h, v11.8h
81 eor \in\().16b, \in\().16b, v11.16b
82 mix_columns \in
83 .endm
84
85 .macro do_block, enc, in, rounds, rk, rkp, i
86 ld1 {v15.16b}, [\rk]
87 add \rkp, \rk, #16
88 mov \i, \rounds
891111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
90 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
91 sub_bytes \in
92 ld1 {v15.16b}, [\rkp], #16
93 subs \i, \i, #1
94 beq 2222f
95 .if \enc == 1
96 mix_columns \in
97 .else
98 inv_mix_columns \in
99 .endif
100 b 1111b
1012222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
102 .endm
103
104 .macro encrypt_block, in, rounds, rk, rkp, i
105 do_block 1, \in, \rounds, \rk, \rkp, \i
106 .endm
107
108 .macro decrypt_block, in, rounds, rk, rkp, i
109 do_block 0, \in, \rounds, \rk, \rkp, \i
110 .endm
111
112 /*
113 * Interleaved versions: functionally equivalent to the
114 * ones above, but applied to 2 or 4 AES states in parallel.
115 */
116
117 .macro sub_bytes_2x, in0, in1
118 sub v8.16b, \in0\().16b, v12.16b
119 sub v9.16b, \in1\().16b, v12.16b
120 tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
121 tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
122 sub v10.16b, v8.16b, v12.16b
123 sub v11.16b, v9.16b, v12.16b
124 tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
125 tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
126 sub v8.16b, v10.16b, v12.16b
127 sub v9.16b, v11.16b, v12.16b
128 tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
129 tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
130 tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
131 tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
132 .endm
133
134 .macro sub_bytes_4x, in0, in1, in2, in3
135 sub v8.16b, \in0\().16b, v12.16b
136 tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
137 sub v9.16b, \in1\().16b, v12.16b
138 tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
139 sub v10.16b, \in2\().16b, v12.16b
140 tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
141 sub v11.16b, \in3\().16b, v12.16b
142 tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
143 tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
144 tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
145 sub v8.16b, v8.16b, v12.16b
146 tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
147 sub v9.16b, v9.16b, v12.16b
148 tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
149 sub v10.16b, v10.16b, v12.16b
150 tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
151 sub v11.16b, v11.16b, v12.16b
152 tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
153 sub v8.16b, v8.16b, v12.16b
154 tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
155 sub v9.16b, v9.16b, v12.16b
156 tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
157 sub v10.16b, v10.16b, v12.16b
158 tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
159 sub v11.16b, v11.16b, v12.16b
160 tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
161 tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
162 tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
163 .endm
164
165 .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
166 sshr \tmp0\().16b, \in0\().16b, #7
167 add \out0\().16b, \in0\().16b, \in0\().16b
168 sshr \tmp1\().16b, \in1\().16b, #7
169 and \tmp0\().16b, \tmp0\().16b, \const\().16b
170 add \out1\().16b, \in1\().16b, \in1\().16b
171 and \tmp1\().16b, \tmp1\().16b, \const\().16b
172 eor \out0\().16b, \out0\().16b, \tmp0\().16b
173 eor \out1\().16b, \out1\().16b, \tmp1\().16b
174 .endm
175
176 .macro mix_columns_2x, in0, in1
177 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
178 rev32 v10.8h, \in0\().8h
179 rev32 v11.8h, \in1\().8h
180 eor \in0\().16b, v8.16b, \in0\().16b
181 eor \in1\().16b, v9.16b, \in1\().16b
182 shl v12.4s, v10.4s, #24
183 shl v13.4s, v11.4s, #24
184 eor v8.16b, v8.16b, v10.16b
185 sri v12.4s, v10.4s, #8
186 shl v10.4s, \in0\().4s, #24
187 eor v9.16b, v9.16b, v11.16b
188 sri v13.4s, v11.4s, #8
189 shl v11.4s, \in1\().4s, #24
190 sri v10.4s, \in0\().4s, #8
191 eor \in0\().16b, v8.16b, v12.16b
192 sri v11.4s, \in1\().4s, #8
193 eor \in1\().16b, v9.16b, v13.16b
194 eor \in0\().16b, v10.16b, \in0\().16b
195 eor \in1\().16b, v11.16b, \in1\().16b
196 .endm
197
198 .macro inv_mix_cols_2x, in0, in1
199 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
200 mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
201 eor \in0\().16b, \in0\().16b, v8.16b
202 eor \in1\().16b, \in1\().16b, v9.16b
203 rev32 v8.8h, v8.8h
204 rev32 v9.8h, v9.8h
205 eor \in0\().16b, \in0\().16b, v8.16b
206 eor \in1\().16b, \in1\().16b, v9.16b
207 mix_columns_2x \in0, \in1
208 .endm
209
210 .macro inv_mix_cols_4x, in0, in1, in2, in3
211 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
212 mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
213 mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
214 mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
215 eor \in0\().16b, \in0\().16b, v8.16b
216 eor \in1\().16b, \in1\().16b, v9.16b
217 eor \in2\().16b, \in2\().16b, v10.16b
218 eor \in3\().16b, \in3\().16b, v11.16b
219 rev32 v8.8h, v8.8h
220 rev32 v9.8h, v9.8h
221 rev32 v10.8h, v10.8h
222 rev32 v11.8h, v11.8h
223 eor \in0\().16b, \in0\().16b, v8.16b
224 eor \in1\().16b, \in1\().16b, v9.16b
225 eor \in2\().16b, \in2\().16b, v10.16b
226 eor \in3\().16b, \in3\().16b, v11.16b
227 mix_columns_2x \in0, \in1
228 mix_columns_2x \in2, \in3
229 .endm
230
 231 .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
232 ld1 {v15.16b}, [\rk]
233 add \rkp, \rk, #16
234 mov \i, \rounds
2351111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
236 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
237 sub_bytes_2x \in0, \in1
238 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
239 tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
240 ld1 {v15.16b}, [\rkp], #16
241 subs \i, \i, #1
242 beq 2222f
243 .if \enc == 1
244 mix_columns_2x \in0, \in1
245 ldr q13, .LForward_ShiftRows
246 .else
247 inv_mix_cols_2x \in0, \in1
248 ldr q13, .LReverse_ShiftRows
249 .endif
250 movi v12.16b, #0x40
251 b 1111b
2522222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
253 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
254 .endm
255
256 .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
257 ld1 {v15.16b}, [\rk]
258 add \rkp, \rk, #16
259 mov \i, \rounds
2601111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
261 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
262 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
263 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
264 sub_bytes_4x \in0, \in1, \in2, \in3
265 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
266 tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
267 tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
268 tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
269 ld1 {v15.16b}, [\rkp], #16
270 subs \i, \i, #1
271 beq 2222f
272 .if \enc == 1
273 mix_columns_2x \in0, \in1
274 mix_columns_2x \in2, \in3
275 ldr q13, .LForward_ShiftRows
276 .else
277 inv_mix_cols_4x \in0, \in1, \in2, \in3
278 ldr q13, .LReverse_ShiftRows
279 .endif
280 movi v12.16b, #0x40
281 b 1111b
2822222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
283 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
284 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
285 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
286 .endm
287
288 .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
289 do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
290 .endm
291
292 .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
293 do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
294 .endm
295
296 .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
297 do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
298 .endm
299
300 .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
301 do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
302 .endm
303
304#include "aes-modes.S"
305
306 .text
307 .align 4
308.LForward_ShiftRows:
309 .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
310 .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
311
312.LReverse_ShiftRows:
313 .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
314 .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
315
316.LForward_Sbox:
317 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
318 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
319 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
320 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
321 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
322 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
323 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
324 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
325 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
326 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
327 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
328 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
329 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
330 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
331 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
332 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
333 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
334 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
335 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
336 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
337 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
338 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
339 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
340 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
341 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
342 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
343 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
344 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
345 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
346 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
347 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
348 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
349
350.LReverse_Sbox:
351 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
352 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
353 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
354 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
355 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
356 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
357 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
358 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
359 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
360 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
361 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
362 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
363 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
364 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
365 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
366 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
367 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
368 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
369 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
370 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
371 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
372 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
373 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
374 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
375 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
376 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
377 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
378 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
379 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
380 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
381 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
382 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
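The mul_by_x macro at the top of this file multiplies every byte of a vector by x in GF(2^8), reducing modulo x^8 + x^4 + x^3 + x + 1 (the 0x1b constant kept in v14); MixColumns and its inverse are built on top of it. Per byte, the sshr/and/add/eor sequence is equivalent to the usual xtime() helper, roughly as follows (illustrative C, not taken from the patch):

#include <stdint.h>

/* Multiply one AES state byte by x in GF(2^8). */
static uint8_t xtime(uint8_t b)
{
        uint8_t mask = (b & 0x80) ? 0x1b : 0x00;  /* sshr #7, then and with v14 */

        return (uint8_t)((b << 1) ^ mask);        /* add (b + b), then eor      */
}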
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
new file mode 100644
index 000000000000..b9e6eaf41c9b
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -0,0 +1,95 @@
1/*
2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
 6 * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/assembler.h>
21
22 DATA .req v0
23 SHASH .req v1
24 IN1 .req v2
25 T1 .req v2
26 T2 .req v3
27 T3 .req v4
28 VZR .req v5
29
30 .text
31 .arch armv8-a+crypto
32
33 /*
34 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
35 * struct ghash_key const *k, const char *head)
36 */
37ENTRY(pmull_ghash_update)
38 ld1 {DATA.16b}, [x1]
39 ld1 {SHASH.16b}, [x3]
40 eor VZR.16b, VZR.16b, VZR.16b
41
42 /* do the head block first, if supplied */
43 cbz x4, 0f
44 ld1 {IN1.2d}, [x4]
45 b 1f
46
470: ld1 {IN1.2d}, [x2], #16
48 sub w0, w0, #1
491: ext IN1.16b, IN1.16b, IN1.16b, #8
50CPU_LE( rev64 IN1.16b, IN1.16b )
51 eor DATA.16b, DATA.16b, IN1.16b
52
53 /* multiply DATA by SHASH in GF(2^128) */
54 ext T2.16b, DATA.16b, DATA.16b, #8
55 ext T3.16b, SHASH.16b, SHASH.16b, #8
56 eor T2.16b, T2.16b, DATA.16b
57 eor T3.16b, T3.16b, SHASH.16b
58
59 pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
60 pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
61 pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
62 eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
63 eor T2.16b, T2.16b, DATA.16b
64
65 ext T3.16b, VZR.16b, T2.16b, #8
66 ext T2.16b, T2.16b, VZR.16b, #8
67 eor DATA.16b, DATA.16b, T3.16b
68 eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
69 // carry-less multiplication
70
71 /* first phase of the reduction */
72 shl T3.2d, DATA.2d, #1
73 eor T3.16b, T3.16b, DATA.16b
74 shl T3.2d, T3.2d, #5
75 eor T3.16b, T3.16b, DATA.16b
76 shl T3.2d, T3.2d, #57
77 ext T2.16b, VZR.16b, T3.16b, #8
78 ext T3.16b, T3.16b, VZR.16b, #8
79 eor DATA.16b, DATA.16b, T2.16b
80 eor T1.16b, T1.16b, T3.16b
81
82 /* second phase of the reduction */
83 ushr T2.2d, DATA.2d, #5
84 eor T2.16b, T2.16b, DATA.16b
85 ushr T2.2d, T2.2d, #1
86 eor T2.16b, T2.16b, DATA.16b
87 ushr T2.2d, T2.2d, #1
88 eor T1.16b, T1.16b, T2.16b
89 eor DATA.16b, DATA.16b, T1.16b
90
91 cbnz w0, 0b
92
93 st1 {DATA.16b}, [x1]
94 ret
95ENDPROC(pmull_ghash_update)
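The multiply step above uses the usual Karatsuba split for a 128-bit carry-less product: with A = a1*x^64 + a0 and B = b1*x^64 + b0, three PMULLs (a1*b1, a0*b0 and (a1^a0)*(b1^b0)) suffice, because the middle term equals the third product XORed with the other two. A small host-side C check of that identity (illustrative only; clmul64 is a helper written here for the example, not a kernel function):

#include <assert.h>
#include <stdint.h>

/* Carry-less 64x64 -> 128-bit multiply, schoolbook shift-and-xor. */
static __uint128_t clmul64(uint64_t a, uint64_t b)
{
        __uint128_t acc = 0;

        for (int i = 0; i < 64; i++)
                if (b & (1ULL << i))
                        acc ^= (__uint128_t)a << i;
        return acc;
}

int main(void)
{
        uint64_t a1 = 0x0123456789abcdefULL, a0 = 0xfedcba9876543210ULL;
        uint64_t b1 = 0x0f1e2d3c4b5a6978ULL, b0 = 0x8796a5b4c3d2e1f0ULL;

        __uint128_t hi  = clmul64(a1, b1);               /* pmull2: a1 * b1 */
        __uint128_t lo  = clmul64(a0, b0);               /* pmull:  a0 * b0 */
        __uint128_t mid = clmul64(a1 ^ a0, b1 ^ b0) ^ hi ^ lo;

        /* Karatsuba: the middle term equals a1*b0 + a0*b1 in GF(2) */
        assert(mid == (clmul64(a1, b0) ^ clmul64(a0, b1)));
        return 0;
}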
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
new file mode 100644
index 000000000000..b92baf3f68c7
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -0,0 +1,155 @@
1/*
2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <linux/cpufeature.h>
15#include <linux/crypto.h>
16#include <linux/module.h>
17
18MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
19MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
20MODULE_LICENSE("GPL v2");
21
22#define GHASH_BLOCK_SIZE 16
23#define GHASH_DIGEST_SIZE 16
24
25struct ghash_key {
26 u64 a;
27 u64 b;
28};
29
30struct ghash_desc_ctx {
31 u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
32 u8 buf[GHASH_BLOCK_SIZE];
33 u32 count;
34};
35
36asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
37 struct ghash_key const *k, const char *head);
38
39static int ghash_init(struct shash_desc *desc)
40{
41 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
42
43 *ctx = (struct ghash_desc_ctx){};
44 return 0;
45}
46
47static int ghash_update(struct shash_desc *desc, const u8 *src,
48 unsigned int len)
49{
50 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
51 unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
52
53 ctx->count += len;
54
55 if ((partial + len) >= GHASH_BLOCK_SIZE) {
56 struct ghash_key *key = crypto_shash_ctx(desc->tfm);
57 int blocks;
58
59 if (partial) {
60 int p = GHASH_BLOCK_SIZE - partial;
61
62 memcpy(ctx->buf + partial, src, p);
63 src += p;
64 len -= p;
65 }
66
67 blocks = len / GHASH_BLOCK_SIZE;
68 len %= GHASH_BLOCK_SIZE;
69
70 kernel_neon_begin_partial(6);
71 pmull_ghash_update(blocks, ctx->digest, src, key,
72 partial ? ctx->buf : NULL);
73 kernel_neon_end();
74 src += blocks * GHASH_BLOCK_SIZE;
75 }
76 if (len)
77 memcpy(ctx->buf + partial, src, len);
78 return 0;
79}
80
81static int ghash_final(struct shash_desc *desc, u8 *dst)
82{
83 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
84 unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
85
86 if (partial) {
87 struct ghash_key *key = crypto_shash_ctx(desc->tfm);
88
89 memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
90
91 kernel_neon_begin_partial(6);
92 pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
93 kernel_neon_end();
94 }
95 put_unaligned_be64(ctx->digest[1], dst);
96 put_unaligned_be64(ctx->digest[0], dst + 8);
97
98 *ctx = (struct ghash_desc_ctx){};
99 return 0;
100}
101
102static int ghash_setkey(struct crypto_shash *tfm,
103 const u8 *inkey, unsigned int keylen)
104{
105 struct ghash_key *key = crypto_shash_ctx(tfm);
106 u64 a, b;
107
108 if (keylen != GHASH_BLOCK_SIZE) {
109 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
110 return -EINVAL;
111 }
112
113 /* perform multiplication by 'x' in GF(2^128) */
114 b = get_unaligned_be64(inkey);
115 a = get_unaligned_be64(inkey + 8);
116
117 key->a = (a << 1) | (b >> 63);
118 key->b = (b << 1) | (a >> 63);
119
120 if (b >> 63)
121 key->b ^= 0xc200000000000000UL;
122
123 return 0;
124}
125
126static struct shash_alg ghash_alg = {
127 .digestsize = GHASH_DIGEST_SIZE,
128 .init = ghash_init,
129 .update = ghash_update,
130 .final = ghash_final,
131 .setkey = ghash_setkey,
132 .descsize = sizeof(struct ghash_desc_ctx),
133 .base = {
134 .cra_name = "ghash",
135 .cra_driver_name = "ghash-ce",
136 .cra_priority = 200,
137 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
138 .cra_blocksize = GHASH_BLOCK_SIZE,
139 .cra_ctxsize = sizeof(struct ghash_key),
140 .cra_module = THIS_MODULE,
141 },
142};
143
144static int __init ghash_ce_mod_init(void)
145{
146 return crypto_register_shash(&ghash_alg);
147}
148
149static void __exit ghash_ce_mod_exit(void)
150{
151 crypto_unregister_shash(&ghash_alg);
152}
153
154module_cpu_feature_match(PMULL, ghash_ce_mod_init);
155module_exit(ghash_ce_mod_exit);
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
new file mode 100644
index 000000000000..09d57d98609c
--- /dev/null
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -0,0 +1,153 @@
1/*
2 * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .arch armv8-a+crypto
16
17 k0 .req v0
18 k1 .req v1
19 k2 .req v2
20 k3 .req v3
21
22 t0 .req v4
23 t1 .req v5
24
25 dga .req q6
26 dgav .req v6
27 dgb .req s7
28 dgbv .req v7
29
30 dg0q .req q12
31 dg0s .req s12
32 dg0v .req v12
33 dg1s .req s13
34 dg1v .req v13
35 dg2s .req s14
36
37 .macro add_only, op, ev, rc, s0, dg1
38 .ifc \ev, ev
39 add t1.4s, v\s0\().4s, \rc\().4s
40 sha1h dg2s, dg0s
41 .ifnb \dg1
42 sha1\op dg0q, \dg1, t0.4s
43 .else
44 sha1\op dg0q, dg1s, t0.4s
45 .endif
46 .else
47 .ifnb \s0
48 add t0.4s, v\s0\().4s, \rc\().4s
49 .endif
50 sha1h dg1s, dg0s
51 sha1\op dg0q, dg2s, t1.4s
52 .endif
53 .endm
54
55 .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
56 sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
57 add_only \op, \ev, \rc, \s1, \dg1
58 sha1su1 v\s0\().4s, v\s3\().4s
59 .endm
60
61 /*
62 * The SHA1 round constants
63 */
64 .align 4
65.Lsha1_rcon:
66 .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
67
68 /*
69 * void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
70 * u8 *head, long bytes)
71 */
72ENTRY(sha1_ce_transform)
73 /* load round constants */
74 adr x6, .Lsha1_rcon
75 ld1r {k0.4s}, [x6], #4
76 ld1r {k1.4s}, [x6], #4
77 ld1r {k2.4s}, [x6], #4
78 ld1r {k3.4s}, [x6]
79
80 /* load state */
81 ldr dga, [x2]
82 ldr dgb, [x2, #16]
83
84 /* load partial state (if supplied) */
85 cbz x3, 0f
86 ld1 {v8.4s-v11.4s}, [x3]
87 b 1f
88
89 /* load input */
900: ld1 {v8.4s-v11.4s}, [x1], #64
91 sub w0, w0, #1
92
931:
94CPU_LE( rev32 v8.16b, v8.16b )
95CPU_LE( rev32 v9.16b, v9.16b )
96CPU_LE( rev32 v10.16b, v10.16b )
97CPU_LE( rev32 v11.16b, v11.16b )
98
992: add t0.4s, v8.4s, k0.4s
100 mov dg0v.16b, dgav.16b
101
102 add_update c, ev, k0, 8, 9, 10, 11, dgb
103 add_update c, od, k0, 9, 10, 11, 8
104 add_update c, ev, k0, 10, 11, 8, 9
105 add_update c, od, k0, 11, 8, 9, 10
106 add_update c, ev, k1, 8, 9, 10, 11
107
108 add_update p, od, k1, 9, 10, 11, 8
109 add_update p, ev, k1, 10, 11, 8, 9
110 add_update p, od, k1, 11, 8, 9, 10
111 add_update p, ev, k1, 8, 9, 10, 11
112 add_update p, od, k2, 9, 10, 11, 8
113
114 add_update m, ev, k2, 10, 11, 8, 9
115 add_update m, od, k2, 11, 8, 9, 10
116 add_update m, ev, k2, 8, 9, 10, 11
117 add_update m, od, k2, 9, 10, 11, 8
118 add_update m, ev, k3, 10, 11, 8, 9
119
120 add_update p, od, k3, 11, 8, 9, 10
121 add_only p, ev, k3, 9
122 add_only p, od, k3, 10
123 add_only p, ev, k3, 11
124 add_only p, od
125
126 /* update state */
127 add dgbv.2s, dgbv.2s, dg1v.2s
128 add dgav.4s, dgav.4s, dg0v.4s
129
130 cbnz w0, 0b
131
132 /*
133 * Final block: add padding and total bit count.
134 * Skip if we have no total byte count in x4. In that case, the input
135 * size was not a round multiple of the block size, and the padding is
136 * handled by the C code.
137 */
138 cbz x4, 3f
139 movi v9.2d, #0
140 mov x8, #0x80000000
141 movi v10.2d, #0
142 ror x7, x4, #29 // ror(lsl(x4, 3), 32)
143 fmov d8, x8
144 mov x4, #0
145 mov v11.d[0], xzr
146 mov v11.d[1], x7
147 b 2b
148
149 /* store new state */
1503: str dga, [x2]
151 str dgb, [x2, #16]
152 ret
153ENDPROC(sha1_ce_transform)
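The final-block fast path above only runs when the total byte count in x4 is a whole number of 64-byte blocks, so the trailer it builds in v8-v11 is the standard fixed tail: a 0x80 marker byte, zero padding, and the bit count as a 64-bit big-endian value in the last eight bytes. Expressed in C for reference (a sketch assuming total_bytes % 64 == 0; the general case goes through the glue code's padding path):

#include <stdint.h>
#include <string.h>

static void build_final_block(uint8_t block[64], uint64_t total_bytes)
{
        uint64_t bits = total_bytes * 8;

        memset(block, 0, 64);
        block[0] = 0x80;                        /* padding marker           */
        for (int i = 0; i < 8; i++)             /* big-endian bit count     */
                block[56 + i] = (uint8_t)(bits >> (56 - 8 * i));
}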
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
new file mode 100644
index 000000000000..6fe83f37a750
--- /dev/null
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -0,0 +1,174 @@
1/*
2 * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <crypto/sha.h>
15#include <linux/cpufeature.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
20MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
21MODULE_LICENSE("GPL v2");
22
23asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
24 u8 *head, long bytes);
25
26static int sha1_init(struct shash_desc *desc)
27{
28 struct sha1_state *sctx = shash_desc_ctx(desc);
29
30 *sctx = (struct sha1_state){
31 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
32 };
33 return 0;
34}
35
36static int sha1_update(struct shash_desc *desc, const u8 *data,
37 unsigned int len)
38{
39 struct sha1_state *sctx = shash_desc_ctx(desc);
40 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
41
42 sctx->count += len;
43
44 if ((partial + len) >= SHA1_BLOCK_SIZE) {
45 int blocks;
46
47 if (partial) {
48 int p = SHA1_BLOCK_SIZE - partial;
49
50 memcpy(sctx->buffer + partial, data, p);
51 data += p;
52 len -= p;
53 }
54
55 blocks = len / SHA1_BLOCK_SIZE;
56 len %= SHA1_BLOCK_SIZE;
57
58 kernel_neon_begin_partial(16);
59 sha1_ce_transform(blocks, data, sctx->state,
60 partial ? sctx->buffer : NULL, 0);
61 kernel_neon_end();
62
63 data += blocks * SHA1_BLOCK_SIZE;
64 partial = 0;
65 }
66 if (len)
67 memcpy(sctx->buffer + partial, data, len);
68 return 0;
69}
70
71static int sha1_final(struct shash_desc *desc, u8 *out)
72{
73 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
74
75 struct sha1_state *sctx = shash_desc_ctx(desc);
76 __be64 bits = cpu_to_be64(sctx->count << 3);
77 __be32 *dst = (__be32 *)out;
78 int i;
79
80 u32 padlen = SHA1_BLOCK_SIZE
81 - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE);
82
83 sha1_update(desc, padding, padlen);
84 sha1_update(desc, (const u8 *)&bits, sizeof(bits));
85
86 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
87 put_unaligned_be32(sctx->state[i], dst++);
88
89 *sctx = (struct sha1_state){};
90 return 0;
91}
92
93static int sha1_finup(struct shash_desc *desc, const u8 *data,
94 unsigned int len, u8 *out)
95{
96 struct sha1_state *sctx = shash_desc_ctx(desc);
97 __be32 *dst = (__be32 *)out;
98 int blocks;
99 int i;
100
101 if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) {
102 sha1_update(desc, data, len);
103 return sha1_final(desc, out);
104 }
105
106 /*
107 * Use a fast path if the input is a multiple of 64 bytes. In
108 * this case, there is no need to copy data around, and we can
109 * perform the entire digest calculation in a single invocation
110 * of sha1_ce_transform()
111 */
112 blocks = len / SHA1_BLOCK_SIZE;
113
114 kernel_neon_begin_partial(16);
115 sha1_ce_transform(blocks, data, sctx->state, NULL, len);
116 kernel_neon_end();
117
118 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
119 put_unaligned_be32(sctx->state[i], dst++);
120
121 *sctx = (struct sha1_state){};
122 return 0;
123}
124
125static int sha1_export(struct shash_desc *desc, void *out)
126{
127 struct sha1_state *sctx = shash_desc_ctx(desc);
128 struct sha1_state *dst = out;
129
130 *dst = *sctx;
131 return 0;
132}
133
134static int sha1_import(struct shash_desc *desc, const void *in)
135{
136 struct sha1_state *sctx = shash_desc_ctx(desc);
137 struct sha1_state const *src = in;
138
139 *sctx = *src;
140 return 0;
141}
142
143static struct shash_alg alg = {
144 .init = sha1_init,
145 .update = sha1_update,
146 .final = sha1_final,
147 .finup = sha1_finup,
148 .export = sha1_export,
149 .import = sha1_import,
150 .descsize = sizeof(struct sha1_state),
151 .digestsize = SHA1_DIGEST_SIZE,
152 .statesize = sizeof(struct sha1_state),
153 .base = {
154 .cra_name = "sha1",
155 .cra_driver_name = "sha1-ce",
156 .cra_priority = 200,
157 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
158 .cra_blocksize = SHA1_BLOCK_SIZE,
159 .cra_module = THIS_MODULE,
160 }
161};
162
163static int __init sha1_ce_mod_init(void)
164{
165 return crypto_register_shash(&alg);
166}
167
168static void __exit sha1_ce_mod_fini(void)
169{
170 crypto_unregister_shash(&alg);
171}
172
173module_cpu_feature_match(SHA1, sha1_ce_mod_init);
174module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
new file mode 100644
index 000000000000..7f29fc031ea8
--- /dev/null
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -0,0 +1,156 @@
1/*
2 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .arch armv8-a+crypto
16
17 dga .req q20
18 dgav .req v20
19 dgb .req q21
20 dgbv .req v21
21
22 t0 .req v22
23 t1 .req v23
24
25 dg0q .req q24
26 dg0v .req v24
27 dg1q .req q25
28 dg1v .req v25
29 dg2q .req q26
30 dg2v .req v26
31
32 .macro add_only, ev, rc, s0
33 mov dg2v.16b, dg0v.16b
34 .ifeq \ev
35 add t1.4s, v\s0\().4s, \rc\().4s
36 sha256h dg0q, dg1q, t0.4s
37 sha256h2 dg1q, dg2q, t0.4s
38 .else
39 .ifnb \s0
40 add t0.4s, v\s0\().4s, \rc\().4s
41 .endif
42 sha256h dg0q, dg1q, t1.4s
43 sha256h2 dg1q, dg2q, t1.4s
44 .endif
45 .endm
46
47 .macro add_update, ev, rc, s0, s1, s2, s3
48 sha256su0 v\s0\().4s, v\s1\().4s
49 add_only \ev, \rc, \s1
50 sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
51 .endm
52
53 /*
54 * The SHA-256 round constants
55 */
56 .align 4
57.Lsha2_rcon:
58 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
59 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
60 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
61 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
62 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
63 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
64 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
65 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
66 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
67 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
68 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
69 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
70 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
71 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
72 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
73 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
74
75 /*
76 * void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
77 * u8 *head, long bytes)
78 */
79ENTRY(sha2_ce_transform)
80 /* load round constants */
81 adr x8, .Lsha2_rcon
82 ld1 { v0.4s- v3.4s}, [x8], #64
83 ld1 { v4.4s- v7.4s}, [x8], #64
84 ld1 { v8.4s-v11.4s}, [x8], #64
85 ld1 {v12.4s-v15.4s}, [x8]
86
87 /* load state */
88 ldp dga, dgb, [x2]
89
90 /* load partial input (if supplied) */
91 cbz x3, 0f
92 ld1 {v16.4s-v19.4s}, [x3]
93 b 1f
94
95 /* load input */
960: ld1 {v16.4s-v19.4s}, [x1], #64
97 sub w0, w0, #1
98
991:
100CPU_LE( rev32 v16.16b, v16.16b )
101CPU_LE( rev32 v17.16b, v17.16b )
102CPU_LE( rev32 v18.16b, v18.16b )
103CPU_LE( rev32 v19.16b, v19.16b )
104
1052: add t0.4s, v16.4s, v0.4s
106 mov dg0v.16b, dgav.16b
107 mov dg1v.16b, dgbv.16b
108
109 add_update 0, v1, 16, 17, 18, 19
110 add_update 1, v2, 17, 18, 19, 16
111 add_update 0, v3, 18, 19, 16, 17
112 add_update 1, v4, 19, 16, 17, 18
113
114 add_update 0, v5, 16, 17, 18, 19
115 add_update 1, v6, 17, 18, 19, 16
116 add_update 0, v7, 18, 19, 16, 17
117 add_update 1, v8, 19, 16, 17, 18
118
119 add_update 0, v9, 16, 17, 18, 19
120 add_update 1, v10, 17, 18, 19, 16
121 add_update 0, v11, 18, 19, 16, 17
122 add_update 1, v12, 19, 16, 17, 18
123
124 add_only 0, v13, 17
125 add_only 1, v14, 18
126 add_only 0, v15, 19
127 add_only 1
128
129 /* update state */
130 add dgav.4s, dgav.4s, dg0v.4s
131 add dgbv.4s, dgbv.4s, dg1v.4s
132
133 /* handled all input blocks? */
134 cbnz w0, 0b
135
136 /*
137 * Final block: add padding and total bit count.
138 * Skip if we have no total byte count in x4. In that case, the input
139 * size was not a round multiple of the block size, and the padding is
140 * handled by the C code.
141 */
142 cbz x4, 3f
143 movi v17.2d, #0
144 mov x8, #0x80000000
145 movi v18.2d, #0
146 ror x7, x4, #29 // ror(lsl(x4, 3), 32)
147 fmov d16, x8
148 mov x4, #0
149 mov v19.d[0], xzr
150 mov v19.d[1], x7
151 b 2b
152
153 /* store new state */
1543: stp dga, dgb, [x2]
155 ret
156ENDPROC(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
new file mode 100644
index 000000000000..c294e67d3925
--- /dev/null
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -0,0 +1,255 @@
1/*
2 * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <crypto/sha.h>
15#include <linux/cpufeature.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
20MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
21MODULE_LICENSE("GPL v2");
22
23asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
24 u8 *head, long bytes);
25
26static int sha224_init(struct shash_desc *desc)
27{
28 struct sha256_state *sctx = shash_desc_ctx(desc);
29
30 *sctx = (struct sha256_state){
31 .state = {
32 SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
33 SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
34 }
35 };
36 return 0;
37}
38
39static int sha256_init(struct shash_desc *desc)
40{
41 struct sha256_state *sctx = shash_desc_ctx(desc);
42
43 *sctx = (struct sha256_state){
44 .state = {
45 SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
46 SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
47 }
48 };
49 return 0;
50}
51
52static int sha2_update(struct shash_desc *desc, const u8 *data,
53 unsigned int len)
54{
55 struct sha256_state *sctx = shash_desc_ctx(desc);
56 unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
57
58 sctx->count += len;
59
60 if ((partial + len) >= SHA256_BLOCK_SIZE) {
61 int blocks;
62
63 if (partial) {
64 int p = SHA256_BLOCK_SIZE - partial;
65
66 memcpy(sctx->buf + partial, data, p);
67 data += p;
68 len -= p;
69 }
70
71 blocks = len / SHA256_BLOCK_SIZE;
72 len %= SHA256_BLOCK_SIZE;
73
74 kernel_neon_begin_partial(28);
75 sha2_ce_transform(blocks, data, sctx->state,
76 partial ? sctx->buf : NULL, 0);
77 kernel_neon_end();
78
79 data += blocks * SHA256_BLOCK_SIZE;
80 partial = 0;
81 }
82 if (len)
83 memcpy(sctx->buf + partial, data, len);
84 return 0;
85}
86
87static void sha2_final(struct shash_desc *desc)
88{
89 static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
90
91 struct sha256_state *sctx = shash_desc_ctx(desc);
92 __be64 bits = cpu_to_be64(sctx->count << 3);
93 u32 padlen = SHA256_BLOCK_SIZE
94 - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE);
95
96 sha2_update(desc, padding, padlen);
97 sha2_update(desc, (const u8 *)&bits, sizeof(bits));
98}
99
100static int sha224_final(struct shash_desc *desc, u8 *out)
101{
102 struct sha256_state *sctx = shash_desc_ctx(desc);
103 __be32 *dst = (__be32 *)out;
104 int i;
105
106 sha2_final(desc);
107
108 for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
109 put_unaligned_be32(sctx->state[i], dst++);
110
111 *sctx = (struct sha256_state){};
112 return 0;
113}
114
115static int sha256_final(struct shash_desc *desc, u8 *out)
116{
117 struct sha256_state *sctx = shash_desc_ctx(desc);
118 __be32 *dst = (__be32 *)out;
119 int i;
120
121 sha2_final(desc);
122
123 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
124 put_unaligned_be32(sctx->state[i], dst++);
125
126 *sctx = (struct sha256_state){};
127 return 0;
128}
129
130static void sha2_finup(struct shash_desc *desc, const u8 *data,
131 unsigned int len)
132{
133 struct sha256_state *sctx = shash_desc_ctx(desc);
134 int blocks;
135
136 if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
137 sha2_update(desc, data, len);
138 sha2_final(desc);
139 return;
140 }
141
142 /*
143 * Use a fast path if the input is a multiple of 64 bytes. In
144 * this case, there is no need to copy data around, and we can
145 * perform the entire digest calculation in a single invocation
146 * of sha2_ce_transform()
147 */
148 blocks = len / SHA256_BLOCK_SIZE;
149
150 kernel_neon_begin_partial(28);
151 sha2_ce_transform(blocks, data, sctx->state, NULL, len);
152 kernel_neon_end();
153 data += blocks * SHA256_BLOCK_SIZE;
154}
155
156static int sha224_finup(struct shash_desc *desc, const u8 *data,
157 unsigned int len, u8 *out)
158{
159 struct sha256_state *sctx = shash_desc_ctx(desc);
160 __be32 *dst = (__be32 *)out;
161 int i;
162
163 sha2_finup(desc, data, len);
164
165 for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
166 put_unaligned_be32(sctx->state[i], dst++);
167
168 *sctx = (struct sha256_state){};
169 return 0;
170}
171
172static int sha256_finup(struct shash_desc *desc, const u8 *data,
173 unsigned int len, u8 *out)
174{
175 struct sha256_state *sctx = shash_desc_ctx(desc);
176 __be32 *dst = (__be32 *)out;
177 int i;
178
179 sha2_finup(desc, data, len);
180
181 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
182 put_unaligned_be32(sctx->state[i], dst++);
183
184 *sctx = (struct sha256_state){};
185 return 0;
186}
187
188static int sha2_export(struct shash_desc *desc, void *out)
189{
190 struct sha256_state *sctx = shash_desc_ctx(desc);
191 struct sha256_state *dst = out;
192
193 *dst = *sctx;
194 return 0;
195}
196
197static int sha2_import(struct shash_desc *desc, const void *in)
198{
199 struct sha256_state *sctx = shash_desc_ctx(desc);
200 struct sha256_state const *src = in;
201
202 *sctx = *src;
203 return 0;
204}
205
206static struct shash_alg algs[] = { {
207 .init = sha224_init,
208 .update = sha2_update,
209 .final = sha224_final,
210 .finup = sha224_finup,
211 .export = sha2_export,
212 .import = sha2_import,
213 .descsize = sizeof(struct sha256_state),
214 .digestsize = SHA224_DIGEST_SIZE,
215 .statesize = sizeof(struct sha256_state),
216 .base = {
217 .cra_name = "sha224",
218 .cra_driver_name = "sha224-ce",
219 .cra_priority = 200,
220 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
221 .cra_blocksize = SHA256_BLOCK_SIZE,
222 .cra_module = THIS_MODULE,
223 }
224}, {
225 .init = sha256_init,
226 .update = sha2_update,
227 .final = sha256_final,
228 .finup = sha256_finup,
229 .export = sha2_export,
230 .import = sha2_import,
231 .descsize = sizeof(struct sha256_state),
232 .digestsize = SHA256_DIGEST_SIZE,
233 .statesize = sizeof(struct sha256_state),
234 .base = {
235 .cra_name = "sha256",
236 .cra_driver_name = "sha256-ce",
237 .cra_priority = 200,
238 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
239 .cra_blocksize = SHA256_BLOCK_SIZE,
240 .cra_module = THIS_MODULE,
241 }
242} };
243
244static int __init sha2_ce_mod_init(void)
245{
246 return crypto_register_shashes(algs, ARRAY_SIZE(algs));
247}
248
249static void __exit sha2_ce_mod_fini(void)
250{
251 crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
252}
253
254module_cpu_feature_match(SHA2, sha2_ce_mod_init);
255module_exit(sha2_ce_mod_fini);
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 83f71b3004a8..42c7eecd2bb6 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -40,6 +40,7 @@ generic-y += segment.h
40generic-y += sembuf.h 40generic-y += sembuf.h
41generic-y += serial.h 41generic-y += serial.h
42generic-y += shmbuf.h 42generic-y += shmbuf.h
43generic-y += simd.h
43generic-y += sizes.h 44generic-y += sizes.h
44generic-y += socket.h 45generic-y += socket.h
45generic-y += sockios.h 46generic-y += sockios.h
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c43b4ac13008..50f559f574fe 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -37,8 +37,21 @@ struct fpsimd_state {
37 u32 fpcr; 37 u32 fpcr;
38 }; 38 };
39 }; 39 };
40 /* the id of the last cpu to have restored this state */
41 unsigned int cpu;
40}; 42};
41 43
44/*
45 * Struct for stacking the bottom 'n' FP/SIMD registers.
46 */
47struct fpsimd_partial_state {
48 u32 fpsr;
49 u32 fpcr;
50 u32 num_regs;
51 __uint128_t vregs[32];
52};
53
54
42#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 55#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
43/* Masks for extracting the FPSR and FPCR from the FPSCR */ 56/* Masks for extracting the FPSR and FPCR from the FPSCR */
44#define VFP_FPSCR_STAT_MASK 0xf800009f 57#define VFP_FPSCR_STAT_MASK 0xf800009f
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state);
58extern void fpsimd_thread_switch(struct task_struct *next); 71extern void fpsimd_thread_switch(struct task_struct *next);
59extern void fpsimd_flush_thread(void); 72extern void fpsimd_flush_thread(void);
60 73
74extern void fpsimd_preserve_current_state(void);
75extern void fpsimd_restore_current_state(void);
76extern void fpsimd_update_current_state(struct fpsimd_state *state);
77
78extern void fpsimd_flush_task_state(struct task_struct *target);
79
80extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state,
81 u32 num_regs);
82extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
83
61#endif 84#endif
62 85
63#endif 86#endif
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index bbec599c96bd..768414d55e64 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -62,3 +62,38 @@
62 ldr w\tmpnr, [\state, #16 * 2 + 4] 62 ldr w\tmpnr, [\state, #16 * 2 + 4]
63 msr fpcr, x\tmpnr 63 msr fpcr, x\tmpnr
64.endm 64.endm
65
66.altmacro
67.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
68 mrs x\tmpnr1, fpsr
69 str w\numnr, [\state, #8]
70 mrs x\tmpnr2, fpcr
71 stp w\tmpnr1, w\tmpnr2, [\state]
72 adr x\tmpnr1, 0f
73 add \state, \state, x\numnr, lsl #4
74 sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
75 br x\tmpnr1
76 .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
77 .irp qb, %(qa + 1)
78 stp q\qa, q\qb, [\state, # -16 * \qa - 16]
79 .endr
80 .endr
810:
82.endm
83
84.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
85 ldp w\tmpnr1, w\tmpnr2, [\state]
86 msr fpsr, x\tmpnr1
87 msr fpcr, x\tmpnr2
88 adr x\tmpnr1, 0f
89 ldr w\tmpnr2, [\state, #8]
90 add \state, \state, x\tmpnr2, lsl #4
91 sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
92 br x\tmpnr1
93 .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
94 .irp qb, %(qa + 1)
95 ldp q\qa, q\qb, [\state, # -16 * \qa - 16]
96 .endr
97 .endr
980:
99.endm
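The computed branch in these two macros relies on every stp/ldp in the unrolled .irp block being 4 bytes: saving or restoring the bottom num_regs registers means executing the last num_regs/2 of the 16 instructions, i.e. jumping back num_regs/2 * 4 = num_regs * 2 bytes from label 0, which is exactly the "lsl #1". The pointer is pre-advanced by num_regs * 16 so the negative stp/ldp offsets land inside the vregs[] area (which starts 16 bytes into struct fpsimd_partial_state). A host-side sketch of that arithmetic, illustrative only:

#include <assert.h>

int main(void)
{
        for (int num_regs = 2; num_regs <= 32; num_regs += 2) {
                /* branch distance: num_regs/2 instructions of 4 bytes each */
                assert((num_regs / 2) * 4 == (num_regs << 1));

                /* executed pairs are qa = num_regs-2, num_regs-4, ..., 0 */
                for (int qa = num_regs - 2; qa >= 0; qa -= 2) {
                        int off = num_regs * 16 - 16 * qa - 16; /* from struct base */

                        assert(off >= 16);                     /* inside vregs[]   */
                        assert(off + 32 <= 16 + num_regs * 16);
                }
        }
        return 0;
}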
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index b0cc58a97780..13ce4cc18e26 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -8,7 +8,11 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10 10
11#include <linux/types.h>
12
11#define cpu_has_neon() (1) 13#define cpu_has_neon() (1)
12 14
13void kernel_neon_begin(void); 15#define kernel_neon_begin() kernel_neon_begin_partial(32)
16
17void kernel_neon_begin_partial(u32 num_regs);
14void kernel_neon_end(void); 18void kernel_neon_end(void);
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 0a8b2a97a32e..9c086c63f911 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -103,6 +103,7 @@ static inline struct thread_info *current_thread_info(void)
103#define TIF_SIGPENDING 0 103#define TIF_SIGPENDING 0
104#define TIF_NEED_RESCHED 1 104#define TIF_NEED_RESCHED 1
105#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 105#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
106#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
106#define TIF_SYSCALL_TRACE 8 107#define TIF_SYSCALL_TRACE 8
107#define TIF_SYSCALL_AUDIT 9 108#define TIF_SYSCALL_AUDIT 9
108#define TIF_SYSCALL_TRACEPOINT 10 109#define TIF_SYSCALL_TRACEPOINT 10
@@ -118,6 +119,7 @@ static inline struct thread_info *current_thread_info(void)
118#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 119#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
119#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 120#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
120#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 121#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
122#define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
121#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 123#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
122#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 124#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
123#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 125#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -125,7 +127,7 @@ static inline struct thread_info *current_thread_info(void)
125#define _TIF_32BIT (1 << TIF_32BIT) 127#define _TIF_32BIT (1 << TIF_32BIT)
126 128
127#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ 129#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
128 _TIF_NOTIFY_RESUME) 130 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
129 131
130#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ 132#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
131 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) 133 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 6a27cd6dbfa6..d358ccacfc00 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)
41 fpsimd_restore x0, 8 41 fpsimd_restore x0, 8
42 ret 42 ret
43ENDPROC(fpsimd_load_state) 43ENDPROC(fpsimd_load_state)
44
45#ifdef CONFIG_KERNEL_MODE_NEON
46
47/*
48 * Save the bottom n FP registers.
49 *
50 * x0 - pointer to struct fpsimd_partial_state
51 */
52ENTRY(fpsimd_save_partial_state)
53 fpsimd_save_partial x0, 1, 8, 9
54 ret
 55ENDPROC(fpsimd_save_partial_state)
56
57/*
58 * Load the bottom n FP registers.
59 *
60 * x0 - pointer to struct fpsimd_partial_state
61 */
62ENTRY(fpsimd_load_partial_state)
63 fpsimd_restore_partial x0, 8, 9
64 ret
65ENDPROC(fpsimd_load_partial_state)
66
67#endif
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a670d0a98c89..bf017f4ffb4f 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -562,7 +562,7 @@ fast_work_pending:
562 str x0, [sp, #S_X0] // returned x0 562 str x0, [sp, #S_X0] // returned x0
563work_pending: 563work_pending:
564 tbnz x1, #TIF_NEED_RESCHED, work_resched 564 tbnz x1, #TIF_NEED_RESCHED, work_resched
565 /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ 565 /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
566 ldr x2, [sp, #S_PSTATE] 566 ldr x2, [sp, #S_PSTATE]
567 mov x0, sp // 'regs' 567 mov x0, sp // 'regs'
568 tst x2, #PSR_MODE_MASK // user mode regs? 568 tst x2, #PSR_MODE_MASK // user mode regs?
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 4aef42a04bdc..ad8aebb1cdef 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -35,6 +35,60 @@
35#define FPEXC_IDF (1 << 7) 35#define FPEXC_IDF (1 << 7)
36 36
37/* 37/*
38 * In order to reduce the number of times the FPSIMD state is needlessly saved
39 * and restored, we need to keep track of two things:
40 * (a) for each task, we need to remember which CPU was the last one to have
41 * the task's FPSIMD state loaded into its FPSIMD registers;
42 * (b) for each CPU, we need to remember which task's userland FPSIMD state has
43 * been loaded into its FPSIMD registers most recently, or whether it has
44 * been used to perform kernel mode NEON in the meantime.
45 *
46 * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to
 47 * the id of the current CPU every time the state is loaded onto a CPU. For (b),
48 * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the
 49 * address of the userland FPSIMD state of the task that was most recently
 50 * loaded onto the CPU, or NULL if kernel mode NEON has been performed after that.
51 *
52 * With this in place, we no longer have to restore the next FPSIMD state right
53 * when switching between tasks. Instead, we can defer this check to userland
54 * resume, at which time we verify whether the CPU's fpsimd_last_state and the
55 * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we
56 * can omit the FPSIMD restore.
57 *
58 * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to
59 * indicate whether or not the userland FPSIMD state of the current task is
60 * present in the registers. The flag is set unless the FPSIMD registers of this
61 * CPU currently contain the most recent userland FPSIMD state of the current
62 * task.
63 *
64 * For a certain task, the sequence may look something like this:
65 * - the task gets scheduled in; if both the task's fpsimd_state.cpu field
66 * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu
67 * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is
68 * cleared, otherwise it is set;
69 *
70 * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's
71 * userland FPSIMD state is copied from memory to the registers, the task's
72 * fpsimd_state.cpu field is set to the id of the current CPU, the current
73 * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the
74 * TIF_FOREIGN_FPSTATE flag is cleared;
75 *
76 * - the task executes an ordinary syscall; upon return to userland, the
77 * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is
78 * restored;
79 *
80 * - the task executes a syscall which executes some NEON instructions; this is
81 * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD
82 * register contents to memory, clears the fpsimd_last_state per-cpu variable
83 * and sets the TIF_FOREIGN_FPSTATE flag;
84 *
85 * - the task gets preempted after kernel_neon_end() is called; as we have not
86 * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
87 * whatever is in the FPSIMD registers is not saved to memory, but discarded.
88 */
89static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
90
91/*
38 * Trapped FP/ASIMD access. 92 * Trapped FP/ASIMD access.
39 */ 93 */
40void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) 94void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
72 126
73void fpsimd_thread_switch(struct task_struct *next) 127void fpsimd_thread_switch(struct task_struct *next)
74{ 128{
75 /* check if not kernel threads */ 129 /*
76 if (current->mm) 130 * Save the current FPSIMD state to memory, but only if whatever is in
131 * the registers is in fact the most recent userland FPSIMD state of
132 * 'current'.
133 */
134 if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
77 fpsimd_save_state(&current->thread.fpsimd_state); 135 fpsimd_save_state(&current->thread.fpsimd_state);
78 if (next->mm) 136
79 fpsimd_load_state(&next->thread.fpsimd_state); 137 if (next->mm) {
138 /*
139 * If we are switching to a task whose most recent userland
140 * FPSIMD state is already in the registers of *this* cpu,
141 * we can skip loading the state from memory. Otherwise, set
142 * the TIF_FOREIGN_FPSTATE flag so the state will be loaded
143 * upon the next return to userland.
144 */
145 struct fpsimd_state *st = &next->thread.fpsimd_state;
146
147 if (__this_cpu_read(fpsimd_last_state) == st
148 && st->cpu == smp_processor_id())
149 clear_ti_thread_flag(task_thread_info(next),
150 TIF_FOREIGN_FPSTATE);
151 else
152 set_ti_thread_flag(task_thread_info(next),
153 TIF_FOREIGN_FPSTATE);
154 }
80} 155}
81 156
82void fpsimd_flush_thread(void) 157void fpsimd_flush_thread(void)
83{ 158{
84 preempt_disable();
85 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); 159 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
86 fpsimd_load_state(&current->thread.fpsimd_state); 160 set_thread_flag(TIF_FOREIGN_FPSTATE);
161}
162
163/*
164 * Save the userland FPSIMD state of 'current' to memory, but only if the state
165 * currently held in the registers does in fact belong to 'current'
166 */
167void fpsimd_preserve_current_state(void)
168{
169 preempt_disable();
170 if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
171 fpsimd_save_state(&current->thread.fpsimd_state);
172 preempt_enable();
173}
174
175/*
176 * Load the userland FPSIMD state of 'current' from memory, but only if the
177 * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
178 * state of 'current'
179 */
180void fpsimd_restore_current_state(void)
181{
182 preempt_disable();
183 if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
184 struct fpsimd_state *st = &current->thread.fpsimd_state;
185
186 fpsimd_load_state(st);
187 this_cpu_write(fpsimd_last_state, st);
188 st->cpu = smp_processor_id();
189 }
190 preempt_enable();
191}
192
193/*
194 * Load an updated userland FPSIMD state for 'current' from memory and set the
195 * flag that indicates that the FPSIMD register contents are the most recent
196 * FPSIMD state of 'current'
197 */
198void fpsimd_update_current_state(struct fpsimd_state *state)
199{
200 preempt_disable();
201 fpsimd_load_state(state);
202 if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
203 struct fpsimd_state *st = &current->thread.fpsimd_state;
204
205 this_cpu_write(fpsimd_last_state, st);
206 st->cpu = smp_processor_id();
207 }
87 preempt_enable(); 208 preempt_enable();
88} 209}
89 210
211/*
212 * Invalidate live CPU copies of task t's FPSIMD state
213 */
214void fpsimd_flush_task_state(struct task_struct *t)
215{
216 t->thread.fpsimd_state.cpu = NR_CPUS;
217}
218
90#ifdef CONFIG_KERNEL_MODE_NEON 219#ifdef CONFIG_KERNEL_MODE_NEON
91 220
221static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
222static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
223
92/* 224/*
93 * Kernel-side NEON support functions 225 * Kernel-side NEON support functions
94 */ 226 */
95void kernel_neon_begin(void) 227void kernel_neon_begin_partial(u32 num_regs)
96{ 228{
97 /* Avoid using the NEON in interrupt context */ 229 if (in_interrupt()) {
98 BUG_ON(in_interrupt()); 230 struct fpsimd_partial_state *s = this_cpu_ptr(
99 preempt_disable(); 231 in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
100 232
101 if (current->mm) 233 BUG_ON(num_regs > 32);
102 fpsimd_save_state(&current->thread.fpsimd_state); 234 fpsimd_save_partial_state(s, roundup(num_regs, 2));
235 } else {
236 /*
237 * Save the userland FPSIMD state if we have one and if we
238 * haven't done so already. Clear fpsimd_last_state to indicate
239 * that there is no longer userland FPSIMD state in the
240 * registers.
241 */
242 preempt_disable();
243 if (current->mm &&
244 !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
245 fpsimd_save_state(&current->thread.fpsimd_state);
246 this_cpu_write(fpsimd_last_state, NULL);
247 }
103} 248}
104EXPORT_SYMBOL(kernel_neon_begin); 249EXPORT_SYMBOL(kernel_neon_begin_partial);
105 250
106void kernel_neon_end(void) 251void kernel_neon_end(void)
107{ 252{
108 if (current->mm) 253 if (in_interrupt()) {
109 fpsimd_load_state(&current->thread.fpsimd_state); 254 struct fpsimd_partial_state *s = this_cpu_ptr(
110 255 in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
111 preempt_enable(); 256 fpsimd_load_partial_state(s);
257 } else {
258 preempt_enable();
259 }
112} 260}
113EXPORT_SYMBOL(kernel_neon_end); 261EXPORT_SYMBOL(kernel_neon_end);
114 262
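
kernel_neon_begin_partial() above saves only as many NEON registers as the caller declares it will clobber (rounded up to an even count), which is what makes kernel-mode NEON cheap enough to allow in hard and soft interrupt context. A hypothetical caller, not part of this patch, could look like the sketch below; the inline assembly and the choice of two registers are purely illustrative:

#include <linux/types.h>
#include <asm/neon.h>

/* XOR one 16-byte block into dst using only NEON registers v0 and v1. */
static void xor16_neon(u8 *dst, const u8 *src)
{
        kernel_neon_begin_partial(2);           /* we clobber v0 and v1 only */
        asm volatile("ld1     {v0.16b}, [%0]            \n"
                     "ld1     {v1.16b}, [%1]            \n"
                     "eor     v0.16b, v0.16b, v1.16b    \n"
                     "st1     {v0.16b}, [%0]            \n"
                     : : "r"(dst), "r"(src) : "v0", "v1", "memory");
        kernel_neon_end();
}

The full-save kernel_neon_begin() interface presumably remains available for process-context callers as the num_regs == 32 case.
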
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self,
120{ 268{
121 switch (cmd) { 269 switch (cmd) {
122 case CPU_PM_ENTER: 270 case CPU_PM_ENTER:
123 if (current->mm) 271 if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
124 fpsimd_save_state(&current->thread.fpsimd_state); 272 fpsimd_save_state(&current->thread.fpsimd_state);
125 break; 273 break;
126 case CPU_PM_EXIT: 274 case CPU_PM_EXIT:
127 if (current->mm) 275 if (current->mm)
128 fpsimd_load_state(&current->thread.fpsimd_state); 276 set_thread_flag(TIF_FOREIGN_FPSTATE);
129 break; 277 break;
130 case CPU_PM_ENTER_FAILED: 278 case CPU_PM_ENTER_FAILED:
131 default: 279 default:
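
For completeness, a notifier like the one in the last hunk is attached through the standard CPU PM interface from <linux/cpu_pm.h>; the registration helper below is a sketch with a hypothetical name (fpsimd_pm_init), not a quote from the file:

#include <linux/init.h>
#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static struct notifier_block fpsimd_cpu_pm_notifier_block = {
        .notifier_call = fpsimd_cpu_pm_notifier,
};

static void __init fpsimd_pm_init(void)
{
        cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block);
}
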
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index d04eb871cb0e..9f2d6020b6c2 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -206,7 +206,7 @@ void release_thread(struct task_struct *dead_task)
206 206
207int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 207int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
208{ 208{
209 fpsimd_save_state(&current->thread.fpsimd_state); 209 fpsimd_preserve_current_state();
210 *dst = *src; 210 *dst = *src;
211 return 0; 211 return 0;
212} 212}
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 4b58e812cf67..32d52d3b079c 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -518,6 +518,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
518 return ret; 518 return ret;
519 519
520 target->thread.fpsimd_state.user_fpsimd = newstate; 520 target->thread.fpsimd_state.user_fpsimd = newstate;
521 fpsimd_flush_task_state(target);
521 return ret; 522 return ret;
522} 523}
523 524
@@ -765,6 +766,7 @@ static int compat_vfp_set(struct task_struct *target,
765 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; 766 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
766 } 767 }
767 768
769 fpsimd_flush_task_state(target);
768 return ret; 770 return ret;
769} 771}
770 772
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 2ba72a11629f..6357b9c6c90e 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
51 int err; 51 int err;
52 52
53 /* dump the hardware registers to the fpsimd_state structure */ 53 /* dump the hardware registers to the fpsimd_state structure */
54 fpsimd_save_state(fpsimd); 54 fpsimd_preserve_current_state();
55 55
56 /* copy the FP and status/control registers */ 56 /* copy the FP and status/control registers */
57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); 57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); 86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err);
87 87
88 /* load the hardware registers from the fpsimd_state structure */ 88 /* load the hardware registers from the fpsimd_state structure */
89 if (!err) { 89 if (!err)
90 preempt_disable(); 90 fpsimd_update_current_state(&fpsimd);
91 fpsimd_load_state(&fpsimd);
92 preempt_enable();
93 }
94 91
95 return err ? -EFAULT : 0; 92 return err ? -EFAULT : 0;
96} 93}
@@ -433,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
433 clear_thread_flag(TIF_NOTIFY_RESUME); 430 clear_thread_flag(TIF_NOTIFY_RESUME);
434 tracehook_notify_resume(regs); 431 tracehook_notify_resume(regs);
435 } 432 }
433
434 if (thread_flags & _TIF_FOREIGN_FPSTATE)
435 fpsimd_restore_current_state();
436
436} 437}
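
The signal.c hunks above replace open-coded preempt_disable()/fpsimd_load_state()/preempt_enable() sequences with the new helpers, which also keep the lazy-restore bookkeeping consistent. Since only the tail of restore_fpsimd_context() is visible in the hunk, the sketch below is a condensed, hypothetical rendering of the restore path as a caller now sees it (the helper name and the exact user-copy sequence are illustrative):

static int load_fpsimd_from_frame(struct fpsimd_context __user *ctx)
{
        struct fpsimd_state fpsimd;
        int err;

        /* copy the vector registers and status/control words from userspace */
        err = __copy_from_user(fpsimd.vregs, ctx->vregs, sizeof(fpsimd.vregs));
        __get_user_error(fpsimd.fpsr, &ctx->fpsr, err);
        __get_user_error(fpsimd.fpcr, &ctx->fpcr, err);

        /* load the registers and mark them as current's most recent state */
        if (!err)
                fpsimd_update_current_state(&fpsimd);

        return err ? -EFAULT : 0;
}
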
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 050c1c2af777..3491c638f172 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -222,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
222 * Note that this also saves V16-31, which aren't visible 222 * Note that this also saves V16-31, which aren't visible
223 * in AArch32. 223 * in AArch32.
224 */ 224 */
225 fpsimd_save_state(fpsimd); 225 fpsimd_preserve_current_state();
226 226
227 /* Place structure header on the stack */ 227 /* Place structure header on the stack */
228 __put_user_error(magic, &frame->magic, err); 228 __put_user_error(magic, &frame->magic, err);
@@ -285,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
285 * We don't need to touch the exception register, so 285 * We don't need to touch the exception register, so
286 * reload the hardware state. 286 * reload the hardware state.
287 */ 287 */
288 if (!err) { 288 if (!err)
289 preempt_disable(); 289 fpsimd_update_current_state(&fpsimd);
290 fpsimd_load_state(&fpsimd);
291 preempt_enable();
292 }
293 290
294 return err ? -EFAULT : 0; 291 return err ? -EFAULT : 0;
295} 292}
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index 03cf5936bad6..1ac097279db1 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -4,22 +4,27 @@
4/* 4/*
5 * This is the most generic implementation of unaligned accesses 5 * This is the most generic implementation of unaligned accesses
6 * and should work almost anywhere. 6 * and should work almost anywhere.
7 *
8 * If an architecture can handle unaligned accesses in hardware,
9 * it may want to use the linux/unaligned/access_ok.h implementation
10 * instead.
11 */ 7 */
12#include <asm/byteorder.h> 8#include <asm/byteorder.h>
13 9
10/* Set by the arch if it can handle unaligned accesses in hardware. */
11#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
12# include <linux/unaligned/access_ok.h>
13#endif
14
14#if defined(__LITTLE_ENDIAN) 15#if defined(__LITTLE_ENDIAN)
15# include <linux/unaligned/le_struct.h> 16# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
16# include <linux/unaligned/be_byteshift.h> 17# include <linux/unaligned/le_struct.h>
18# include <linux/unaligned/be_byteshift.h>
19# endif
17# include <linux/unaligned/generic.h> 20# include <linux/unaligned/generic.h>
18# define get_unaligned __get_unaligned_le 21# define get_unaligned __get_unaligned_le
19# define put_unaligned __put_unaligned_le 22# define put_unaligned __put_unaligned_le
20#elif defined(__BIG_ENDIAN) 23#elif defined(__BIG_ENDIAN)
21# include <linux/unaligned/be_struct.h> 24# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
22# include <linux/unaligned/le_byteshift.h> 25# include <linux/unaligned/be_struct.h>
26# include <linux/unaligned/le_byteshift.h>
27# endif
23# include <linux/unaligned/generic.h> 28# include <linux/unaligned/generic.h>
24# define get_unaligned __get_unaligned_be 29# define get_unaligned __get_unaligned_be
25# define put_unaligned __put_unaligned_be 30# define put_unaligned __put_unaligned_be
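
The asm-generic/unaligned.h change is invisible to callers: the same accessors now compile to plain loads and stores on architectures that select CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, and to packed-struct or byteshift accesses everywhere else. A sketch of a typical caller (function names are hypothetical; the accessors are the existing <asm/unaligned.h> interface):

#include <linux/types.h>
#include <asm/unaligned.h>

/* read a big-endian 32-bit field from a possibly unaligned offset */
static u32 parse_be32_field(const u8 *buf, size_t offset)
{
        return get_unaligned_be32(buf + offset);
}

/* write a little-endian 16-bit field back at a possibly unaligned offset */
static void patch_le16_field(u8 *buf, size_t offset, u16 val)
{
        put_unaligned_le16(val, buf + offset);
}
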