author		Catalin Marinas <catalin.marinas@arm.com>	2014-05-16 05:05:11 -0400
committer	Catalin Marinas <catalin.marinas@arm.com>	2014-05-16 05:05:11 -0400
commit		cf5c95db57ffa02e430c3840c08d1ee0403849d4 (patch)
tree		b3b4df5e1edcde098cf45b7fa00c8450e6d665f8
parent		fd92d4a54a069953b4679958121317f2a25389cd (diff)
parent		49788fe2a128217f78a21ee4edbe6e92e988f222 (diff)
Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream
FPSIMD register bank context switching and crypto algorithms optimisations
for arm64 from Ard Biesheuvel.

* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
  arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
  arm64: pull in <asm/simd.h> from asm-generic
  arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
  arm64/crypto: AES using ARMv8 Crypto Extensions
  arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
  arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
  arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
  arm64: add support for kernel mode NEON in interrupt context
  arm64: defer reloading a task's FPSIMD state to userland resume
  arm64: add abstractions for FPSIMD state manipulation
  asm-generic: allow generic unaligned access if the arch supports it

Conflicts:
	arch/arm64/include/asm/thread_info.h
-rw-r--r--	arch/arm64/Kconfig	3
-rw-r--r--	arch/arm64/Makefile	1
-rw-r--r--	arch/arm64/crypto/Kconfig	53
-rw-r--r--	arch/arm64/crypto/Makefile	38
-rw-r--r--	arch/arm64/crypto/aes-ce-ccm-core.S	222
-rw-r--r--	arch/arm64/crypto/aes-ce-ccm-glue.c	297
-rw-r--r--	arch/arm64/crypto/aes-ce-cipher.c	155
-rw-r--r--	arch/arm64/crypto/aes-ce.S	133
-rw-r--r--	arch/arm64/crypto/aes-glue.c	446
-rw-r--r--	arch/arm64/crypto/aes-modes.S	532
-rw-r--r--	arch/arm64/crypto/aes-neon.S	382
-rw-r--r--	arch/arm64/crypto/ghash-ce-core.S	95
-rw-r--r--	arch/arm64/crypto/ghash-ce-glue.c	155
-rw-r--r--	arch/arm64/crypto/sha1-ce-core.S	153
-rw-r--r--	arch/arm64/crypto/sha1-ce-glue.c	174
-rw-r--r--	arch/arm64/crypto/sha2-ce-core.S	156
-rw-r--r--	arch/arm64/crypto/sha2-ce-glue.c	255
-rw-r--r--	arch/arm64/include/asm/Kbuild	1
-rw-r--r--	arch/arm64/include/asm/fpsimd.h	23
-rw-r--r--	arch/arm64/include/asm/fpsimdmacros.h	35
-rw-r--r--	arch/arm64/include/asm/neon.h	6
-rw-r--r--	arch/arm64/include/asm/thread_info.h	4
-rw-r--r--	arch/arm64/kernel/entry-fpsimd.S	24
-rw-r--r--	arch/arm64/kernel/entry.S	2
-rw-r--r--	arch/arm64/kernel/fpsimd.c	186
-rw-r--r--	arch/arm64/kernel/process.c	2
-rw-r--r--	arch/arm64/kernel/ptrace.c	2
-rw-r--r--	arch/arm64/kernel/signal.c	13
-rw-r--r--	arch/arm64/kernel/signal32.c	9
-rw-r--r--	include/asm-generic/unaligned.h	21
30 files changed, 3535 insertions(+), 43 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9a5b5fea86ba..78b356d079dd 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -343,5 +343,8 @@ source "arch/arm64/Kconfig.debug"
 source "security/Kconfig"
 
 source "crypto/Kconfig"
+if CRYPTO
+source "arch/arm64/crypto/Kconfig"
+endif
 
 source "lib/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 2fceb71ac3b7..8185a913c5ed 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS
 core-y		+= arch/arm64/kernel/ arch/arm64/mm/
 core-$(CONFIG_KVM) += arch/arm64/kvm/
 core-$(CONFIG_XEN) += arch/arm64/xen/
+core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
 libs-y		:= arch/arm64/lib/ $(libs-y)
 libs-y		+= $(LIBGCC)
 
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
new file mode 100644
index 000000000000..5562652c5316
--- /dev/null
+++ b/arch/arm64/crypto/Kconfig
@@ -0,0 +1,53 @@
1
2menuconfig ARM64_CRYPTO
3 bool "ARM64 Accelerated Cryptographic Algorithms"
4 depends on ARM64
5 help
6 Say Y here to choose from a selection of cryptographic algorithms
7 implemented using ARM64 specific CPU features or instructions.
8
9if ARM64_CRYPTO
10
11config CRYPTO_SHA1_ARM64_CE
12 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
13 depends on ARM64 && KERNEL_MODE_NEON
14 select CRYPTO_HASH
15
16config CRYPTO_SHA2_ARM64_CE
17 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
18 depends on ARM64 && KERNEL_MODE_NEON
19 select CRYPTO_HASH
20
21config CRYPTO_GHASH_ARM64_CE
22 tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
23 depends on ARM64 && KERNEL_MODE_NEON
24 select CRYPTO_HASH
25
26config CRYPTO_AES_ARM64_CE
27 tristate "AES core cipher using ARMv8 Crypto Extensions"
28 depends on ARM64 && KERNEL_MODE_NEON
29 select CRYPTO_ALGAPI
30 select CRYPTO_AES
31
32config CRYPTO_AES_ARM64_CE_CCM
33 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
34 depends on ARM64 && KERNEL_MODE_NEON
35 select CRYPTO_ALGAPI
36 select CRYPTO_AES
37 select CRYPTO_AEAD
38
39config CRYPTO_AES_ARM64_CE_BLK
40 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
41 depends on ARM64 && KERNEL_MODE_NEON
42 select CRYPTO_BLKCIPHER
43 select CRYPTO_AES
44 select CRYPTO_ABLK_HELPER
45
46config CRYPTO_AES_ARM64_NEON_BLK
47 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
48 depends on ARM64 && KERNEL_MODE_NEON
49 select CRYPTO_BLKCIPHER
50 select CRYPTO_AES
51 select CRYPTO_ABLK_HELPER
52
53endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
new file mode 100644
index 000000000000..2070a56ecc46
--- /dev/null
+++ b/arch/arm64/crypto/Makefile
@@ -0,0 +1,38 @@
1#
2# linux/arch/arm64/crypto/Makefile
3#
4# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License version 2 as
8# published by the Free Software Foundation.
9#
10
11obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
12sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
13
14obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
15sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
16
17obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
18ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
19
20obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
21CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
22
23obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
24aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
25
26obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
27aes-ce-blk-y := aes-glue-ce.o aes-ce.o
28
29obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
30aes-neon-blk-y := aes-glue-neon.o aes-neon.o
31
32AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE
33AFLAGS_aes-neon.o := -DINTERLEAVE=4
34
35CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
36
37$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
38 $(call if_changed_dep,cc_o_c)
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
new file mode 100644
index 000000000000..432e4841cd81
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -0,0 +1,222 @@
1/*
2 * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12
13 .text
14 .arch armv8-a+crypto
15
16 /*
17 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
18 * u32 *macp, u8 const rk[], u32 rounds);
19 */
20ENTRY(ce_aes_ccm_auth_data)
21 ldr w8, [x3] /* leftover from prev round? */
22 ld1 {v0.2d}, [x0] /* load mac */
23 cbz w8, 1f
24 sub w8, w8, #16
25 eor v1.16b, v1.16b, v1.16b
260: ldrb w7, [x1], #1 /* get 1 byte of input */
27 subs w2, w2, #1
28 add w8, w8, #1
29 ins v1.b[0], w7
30 ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
31 beq 8f /* out of input? */
32 cbnz w8, 0b
33 eor v0.16b, v0.16b, v1.16b
341: ld1 {v3.2d}, [x4] /* load first round key */
35 prfm pldl1strm, [x1]
36 cmp w5, #12 /* which key size? */
37 add x6, x4, #16
38 sub w7, w5, #2 /* modified # of rounds */
39 bmi 2f
40 bne 5f
41 mov v5.16b, v3.16b
42 b 4f
432: mov v4.16b, v3.16b
44 ld1 {v5.2d}, [x6], #16 /* load 2nd round key */
453: aese v0.16b, v4.16b
46 aesmc v0.16b, v0.16b
474: ld1 {v3.2d}, [x6], #16 /* load next round key */
48 aese v0.16b, v5.16b
49 aesmc v0.16b, v0.16b
505: ld1 {v4.2d}, [x6], #16 /* load next round key */
51 subs w7, w7, #3
52 aese v0.16b, v3.16b
53 aesmc v0.16b, v0.16b
54 ld1 {v5.2d}, [x6], #16 /* load next round key */
55 bpl 3b
56 aese v0.16b, v4.16b
57 subs w2, w2, #16 /* last data? */
58 eor v0.16b, v0.16b, v5.16b /* final round */
59 bmi 6f
60 ld1 {v1.16b}, [x1], #16 /* load next input block */
61 eor v0.16b, v0.16b, v1.16b /* xor with mac */
62 bne 1b
636: st1 {v0.2d}, [x0] /* store mac */
64 beq 10f
65 adds w2, w2, #16
66 beq 10f
67 mov w8, w2
687: ldrb w7, [x1], #1
69 umov w6, v0.b[0]
70 eor w6, w6, w7
71 strb w6, [x0], #1
72 subs w2, w2, #1
73 beq 10f
74 ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
75 b 7b
768: mov w7, w8
77 add w8, w8, #16
789: ext v1.16b, v1.16b, v1.16b, #1
79 adds w7, w7, #1
80 bne 9b
81 eor v0.16b, v0.16b, v1.16b
82 st1 {v0.2d}, [x0]
8310: str w8, [x3]
84 ret
85ENDPROC(ce_aes_ccm_auth_data)
86
87 /*
88 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
89 * u32 rounds);
90 */
91ENTRY(ce_aes_ccm_final)
92 ld1 {v3.2d}, [x2], #16 /* load first round key */
93 ld1 {v0.2d}, [x0] /* load mac */
94 cmp w3, #12 /* which key size? */
95 sub w3, w3, #2 /* modified # of rounds */
96 ld1 {v1.2d}, [x1] /* load 1st ctriv */
97 bmi 0f
98 bne 3f
99 mov v5.16b, v3.16b
100 b 2f
1010: mov v4.16b, v3.16b
1021: ld1 {v5.2d}, [x2], #16 /* load next round key */
103 aese v0.16b, v4.16b
104 aese v1.16b, v4.16b
105 aesmc v0.16b, v0.16b
106 aesmc v1.16b, v1.16b
1072: ld1 {v3.2d}, [x2], #16 /* load next round key */
108 aese v0.16b, v5.16b
109 aese v1.16b, v5.16b
110 aesmc v0.16b, v0.16b
111 aesmc v1.16b, v1.16b
1123: ld1 {v4.2d}, [x2], #16 /* load next round key */
113 subs w3, w3, #3
114 aese v0.16b, v3.16b
115 aese v1.16b, v3.16b
116 aesmc v0.16b, v0.16b
117 aesmc v1.16b, v1.16b
118 bpl 1b
119 aese v0.16b, v4.16b
120 aese v1.16b, v4.16b
121 /* final round key cancels out */
122 eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
123 st1 {v0.2d}, [x0] /* store result */
124 ret
125ENDPROC(ce_aes_ccm_final)
126
127 .macro aes_ccm_do_crypt,enc
128 ldr x8, [x6, #8] /* load lower ctr */
129 ld1 {v0.2d}, [x5] /* load mac */
130 rev x8, x8 /* keep swabbed ctr in reg */
1310: /* outer loop */
132 ld1 {v1.1d}, [x6] /* load upper ctr */
133 prfm pldl1strm, [x1]
134 add x8, x8, #1
135 rev x9, x8
136 cmp w4, #12 /* which key size? */
137 sub w7, w4, #2 /* get modified # of rounds */
138 ins v1.d[1], x9 /* no carry in lower ctr */
139 ld1 {v3.2d}, [x3] /* load first round key */
140 add x10, x3, #16
141 bmi 1f
142 bne 4f
143 mov v5.16b, v3.16b
144 b 3f
1451: mov v4.16b, v3.16b
146 ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
1472: /* inner loop: 3 rounds, 2x interleaved */
148 aese v0.16b, v4.16b
149 aese v1.16b, v4.16b
150 aesmc v0.16b, v0.16b
151 aesmc v1.16b, v1.16b
1523: ld1 {v3.2d}, [x10], #16 /* load next round key */
153 aese v0.16b, v5.16b
154 aese v1.16b, v5.16b
155 aesmc v0.16b, v0.16b
156 aesmc v1.16b, v1.16b
1574: ld1 {v4.2d}, [x10], #16 /* load next round key */
158 subs w7, w7, #3
159 aese v0.16b, v3.16b
160 aese v1.16b, v3.16b
161 aesmc v0.16b, v0.16b
162 aesmc v1.16b, v1.16b
163 ld1 {v5.2d}, [x10], #16 /* load next round key */
164 bpl 2b
165 aese v0.16b, v4.16b
166 aese v1.16b, v4.16b
167 subs w2, w2, #16
168 bmi 6f /* partial block? */
169 ld1 {v2.16b}, [x1], #16 /* load next input block */
170 .if \enc == 1
171 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */
172 eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */
173 .else
174 eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */
175 eor v1.16b, v2.16b, v5.16b /* final round enc */
176 .endif
177 eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */
178 st1 {v1.16b}, [x0], #16 /* write output block */
179 bne 0b
180 rev x8, x8
181 st1 {v0.2d}, [x5] /* store mac */
182 str x8, [x6, #8] /* store lsb end of ctr (BE) */
1835: ret
184
1856: eor v0.16b, v0.16b, v5.16b /* final round mac */
186 eor v1.16b, v1.16b, v5.16b /* final round enc */
187 st1 {v0.2d}, [x5] /* store mac */
188 add w2, w2, #16 /* process partial tail block */
1897: ldrb w9, [x1], #1 /* get 1 byte of input */
190 umov w6, v1.b[0] /* get top crypted ctr byte */
191 umov w7, v0.b[0] /* get top mac byte */
192 .if \enc == 1
193 eor w7, w7, w9
194 eor w9, w9, w6
195 .else
196 eor w9, w9, w6
197 eor w7, w7, w9
198 .endif
199 strb w9, [x0], #1 /* store out byte */
200 strb w7, [x5], #1 /* store mac byte */
201 subs w2, w2, #1
202 beq 5b
203 ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */
204 ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */
205 b 7b
206 .endm
207
208 /*
209 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
210 * u8 const rk[], u32 rounds, u8 mac[],
211 * u8 ctr[]);
212 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
213 * u8 const rk[], u32 rounds, u8 mac[],
214 * u8 ctr[]);
215 */
216ENTRY(ce_aes_ccm_encrypt)
217 aes_ccm_do_crypt 1
218ENDPROC(ce_aes_ccm_encrypt)
219
220ENTRY(ce_aes_ccm_decrypt)
221 aes_ccm_do_crypt 0
222ENDPROC(ce_aes_ccm_decrypt)
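
The aes_ccm_do_crypt macro above keeps the low 64 bits of the counter block byte-swapped in a general-purpose register and increments it there, never carrying into the upper half ("no carry in lower ctr"); CCM counter fields fit entirely in that low half, so dropping the carry is safe. A minimal userspace model of that counter update, assuming a 16-byte big-endian counter block and a GCC/Clang byteswap builtin (illustration only, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Increment the 64-bit big-endian counter held in bytes 8..15 of the
     * CTR block; a carry out of those 8 bytes is intentionally dropped,
     * mirroring the "no carry in lower ctr" behaviour of the macro above. */
    static void ccm_ctr_inc(uint8_t ctr[16])
    {
            uint64_t lo;

            memcpy(&lo, ctr + 8, sizeof(lo));   /* stored big-endian */
            lo = __builtin_bswap64(lo) + 1;     /* host order, incremented */
            lo = __builtin_bswap64(lo);         /* back to big-endian */
            memcpy(ctr + 8, &lo, sizeof(lo));
    }
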
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
new file mode 100644
index 000000000000..9e6cdde9b43d
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -0,0 +1,297 @@
1/*
2 * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/aes.h>
14#include <crypto/algapi.h>
15#include <crypto/scatterwalk.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19static int num_rounds(struct crypto_aes_ctx *ctx)
20{
21 /*
22 * # of rounds specified by AES:
23 * 128 bit key 10 rounds
24 * 192 bit key 12 rounds
25 * 256 bit key 14 rounds
26 * => n byte key => 6 + (n/4) rounds
27 */
28 return 6 + ctx->key_length / 4;
29}
30
31asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
32 u32 *macp, u32 const rk[], u32 rounds);
33
34asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
35 u32 const rk[], u32 rounds, u8 mac[],
36 u8 ctr[]);
37
38asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
39 u32 const rk[], u32 rounds, u8 mac[],
40 u8 ctr[]);
41
42asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
43 u32 rounds);
44
45static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
46 unsigned int key_len)
47{
48 struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
49 int ret;
50
51 ret = crypto_aes_expand_key(ctx, in_key, key_len);
52 if (!ret)
53 return 0;
54
55 tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
56 return -EINVAL;
57}
58
59static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
60{
61 if ((authsize & 1) || authsize < 4)
62 return -EINVAL;
63 return 0;
64}
65
66static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
67{
68 struct crypto_aead *aead = crypto_aead_reqtfm(req);
69 __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8];
70 u32 l = req->iv[0] + 1;
71
72 /* verify that CCM dimension 'L' is set correctly in the IV */
73 if (l < 2 || l > 8)
74 return -EINVAL;
75
76 /* verify that msglen can in fact be represented in L bytes */
77 if (l < 4 && msglen >> (8 * l))
78 return -EOVERFLOW;
79
80 /*
81 * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi
82 * uses a u32 type to represent msglen so the top 4 bytes are always 0.
83 */
84 n[0] = 0;
85 n[1] = cpu_to_be32(msglen);
86
87 memcpy(maciv, req->iv, AES_BLOCK_SIZE - l);
88
89 /*
90 * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C)
91 * - bits 0..2 : max # of bytes required to represent msglen, minus 1
92 * (already set by caller)
93 * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc)
94 * - bit 6 : indicates presence of authenticate-only data
95 */
96 maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2;
97 if (req->assoclen)
98 maciv[0] |= 0x40;
99
100 memset(&req->iv[AES_BLOCK_SIZE - l], 0, l);
101 return 0;
102}
103
104static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
105{
106 struct crypto_aead *aead = crypto_aead_reqtfm(req);
107 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
108 struct __packed { __be16 l; __be32 h; u16 len; } ltag;
109 struct scatter_walk walk;
110 u32 len = req->assoclen;
111 u32 macp = 0;
112
113 /* prepend the AAD with a length tag */
114 if (len < 0xff00) {
115 ltag.l = cpu_to_be16(len);
116 ltag.len = 2;
117 } else {
118 ltag.l = cpu_to_be16(0xfffe);
119 put_unaligned_be32(len, &ltag.h);
120 ltag.len = 6;
121 }
122
123 ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
124 num_rounds(ctx));
125 scatterwalk_start(&walk, req->assoc);
126
127 do {
128 u32 n = scatterwalk_clamp(&walk, len);
129 u8 *p;
130
131 if (!n) {
132 scatterwalk_start(&walk, sg_next(walk.sg));
133 n = scatterwalk_clamp(&walk, len);
134 }
135 p = scatterwalk_map(&walk);
136 ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc,
137 num_rounds(ctx));
138 len -= n;
139
140 scatterwalk_unmap(p);
141 scatterwalk_advance(&walk, n);
142 scatterwalk_done(&walk, 0, len);
143 } while (len);
144}
145
146static int ccm_encrypt(struct aead_request *req)
147{
148 struct crypto_aead *aead = crypto_aead_reqtfm(req);
149 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
150 struct blkcipher_desc desc = { .info = req->iv };
151 struct blkcipher_walk walk;
152 u8 __aligned(8) mac[AES_BLOCK_SIZE];
153 u8 buf[AES_BLOCK_SIZE];
154 u32 len = req->cryptlen;
155 int err;
156
157 err = ccm_init_mac(req, mac, len);
158 if (err)
159 return err;
160
161 kernel_neon_begin_partial(6);
162
163 if (req->assoclen)
164 ccm_calculate_auth_mac(req, mac);
165
166 /* preserve the original iv for the final round */
167 memcpy(buf, req->iv, AES_BLOCK_SIZE);
168
169 blkcipher_walk_init(&walk, req->dst, req->src, len);
170 err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
171 AES_BLOCK_SIZE);
172
173 while (walk.nbytes) {
174 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
175
176 if (walk.nbytes == len)
177 tail = 0;
178
179 ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
180 walk.nbytes - tail, ctx->key_enc,
181 num_rounds(ctx), mac, walk.iv);
182
183 len -= walk.nbytes - tail;
184 err = blkcipher_walk_done(&desc, &walk, tail);
185 }
186 if (!err)
187 ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
188
189 kernel_neon_end();
190
191 if (err)
192 return err;
193
194 /* copy authtag to end of dst */
195 scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
196 crypto_aead_authsize(aead), 1);
197
198 return 0;
199}
200
201static int ccm_decrypt(struct aead_request *req)
202{
203 struct crypto_aead *aead = crypto_aead_reqtfm(req);
204 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
205 unsigned int authsize = crypto_aead_authsize(aead);
206 struct blkcipher_desc desc = { .info = req->iv };
207 struct blkcipher_walk walk;
208 u8 __aligned(8) mac[AES_BLOCK_SIZE];
209 u8 buf[AES_BLOCK_SIZE];
210 u32 len = req->cryptlen - authsize;
211 int err;
212
213 err = ccm_init_mac(req, mac, len);
214 if (err)
215 return err;
216
217 kernel_neon_begin_partial(6);
218
219 if (req->assoclen)
220 ccm_calculate_auth_mac(req, mac);
221
222 /* preserve the original iv for the final round */
223 memcpy(buf, req->iv, AES_BLOCK_SIZE);
224
225 blkcipher_walk_init(&walk, req->dst, req->src, len);
226 err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
227 AES_BLOCK_SIZE);
228
229 while (walk.nbytes) {
230 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
231
232 if (walk.nbytes == len)
233 tail = 0;
234
235 ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
236 walk.nbytes - tail, ctx->key_enc,
237 num_rounds(ctx), mac, walk.iv);
238
239 len -= walk.nbytes - tail;
240 err = blkcipher_walk_done(&desc, &walk, tail);
241 }
242 if (!err)
243 ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
244
245 kernel_neon_end();
246
247 if (err)
248 return err;
249
250 /* compare calculated auth tag with the stored one */
251 scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
252 authsize, 0);
253
254 if (memcmp(mac, buf, authsize))
255 return -EBADMSG;
256 return 0;
257}
258
259static struct crypto_alg ccm_aes_alg = {
260 .cra_name = "ccm(aes)",
261 .cra_driver_name = "ccm-aes-ce",
262 .cra_priority = 300,
263 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
264 .cra_blocksize = 1,
265 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
266 .cra_alignmask = 7,
267 .cra_type = &crypto_aead_type,
268 .cra_module = THIS_MODULE,
269 .cra_aead = {
270 .ivsize = AES_BLOCK_SIZE,
271 .maxauthsize = AES_BLOCK_SIZE,
272 .setkey = ccm_setkey,
273 .setauthsize = ccm_setauthsize,
274 .encrypt = ccm_encrypt,
275 .decrypt = ccm_decrypt,
276 }
277};
278
279static int __init aes_mod_init(void)
280{
281 if (!(elf_hwcap & HWCAP_AES))
282 return -ENODEV;
283 return crypto_register_alg(&ccm_aes_alg);
284}
285
286static void __exit aes_mod_exit(void)
287{
288 crypto_unregister_alg(&ccm_aes_alg);
289}
290
291module_init(aes_mod_init);
292module_exit(aes_mod_exit);
293
294MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions");
295MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
296MODULE_LICENSE("GPL v2");
297MODULE_ALIAS("ccm(aes)");
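
As a worked example of the two derivations in the glue code above (num_rounds() and the B0 flags byte assembled by ccm_init_mac()), the sketch below computes both for an assumed AES-128 key, an 8-byte auth tag and associated data present; the concrete values are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int key_length = 16;                  /* AES-128 key */
            unsigned int authsize   = 8;                   /* 8-byte auth tag */
            unsigned int l          = 3 + 1;               /* req->iv[0] == 3 => L == 4 */
            unsigned int rounds     = 6 + key_length / 4;  /* 10 rounds for AES-128 */
            uint8_t flags;

            flags  = l - 1;                    /* bits 0..2: L - 1 (set by the caller) */
            flags |= (authsize - 2) << 2;      /* bits 3..5: tag size encoding */
            flags |= 0x40;                     /* bit 6: associated data present */

            printf("rounds=%u B0 flags=0x%02x\n", rounds, flags);  /* 10, 0x5b */
            return 0;
    }
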
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
new file mode 100644
index 000000000000..2075e1acae6b
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -0,0 +1,155 @@
1/*
2 * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <crypto/aes.h>
13#include <linux/cpufeature.h>
14#include <linux/crypto.h>
15#include <linux/module.h>
16
17MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
18MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
19MODULE_LICENSE("GPL v2");
20
21struct aes_block {
22 u8 b[AES_BLOCK_SIZE];
23};
24
25static int num_rounds(struct crypto_aes_ctx *ctx)
26{
27 /*
28 * # of rounds specified by AES:
29 * 128 bit key 10 rounds
30 * 192 bit key 12 rounds
31 * 256 bit key 14 rounds
32 * => n byte key => 6 + (n/4) rounds
33 */
34 return 6 + ctx->key_length / 4;
35}
36
37static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
38{
39 struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
40 struct aes_block *out = (struct aes_block *)dst;
41 struct aes_block const *in = (struct aes_block *)src;
42 void *dummy0;
43 int dummy1;
44
45 kernel_neon_begin_partial(4);
46
47 __asm__(" ld1 {v0.16b}, %[in] ;"
48 " ld1 {v1.2d}, [%[key]], #16 ;"
49 " cmp %w[rounds], #10 ;"
50 " bmi 0f ;"
51 " bne 3f ;"
52 " mov v3.16b, v1.16b ;"
53 " b 2f ;"
54 "0: mov v2.16b, v1.16b ;"
55 " ld1 {v3.2d}, [%[key]], #16 ;"
56 "1: aese v0.16b, v2.16b ;"
57 " aesmc v0.16b, v0.16b ;"
58 "2: ld1 {v1.2d}, [%[key]], #16 ;"
59 " aese v0.16b, v3.16b ;"
60 " aesmc v0.16b, v0.16b ;"
61 "3: ld1 {v2.2d}, [%[key]], #16 ;"
62 " subs %w[rounds], %w[rounds], #3 ;"
63 " aese v0.16b, v1.16b ;"
64 " aesmc v0.16b, v0.16b ;"
65 " ld1 {v3.2d}, [%[key]], #16 ;"
66 " bpl 1b ;"
67 " aese v0.16b, v2.16b ;"
68 " eor v0.16b, v0.16b, v3.16b ;"
69 " st1 {v0.16b}, %[out] ;"
70
71 : [out] "=Q"(*out),
72 [key] "=r"(dummy0),
73 [rounds] "=r"(dummy1)
74 : [in] "Q"(*in),
75 "1"(ctx->key_enc),
76 "2"(num_rounds(ctx) - 2)
77 : "cc");
78
79 kernel_neon_end();
80}
81
82static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
83{
84 struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
85 struct aes_block *out = (struct aes_block *)dst;
86 struct aes_block const *in = (struct aes_block *)src;
87 void *dummy0;
88 int dummy1;
89
90 kernel_neon_begin_partial(4);
91
92 __asm__(" ld1 {v0.16b}, %[in] ;"
93 " ld1 {v1.2d}, [%[key]], #16 ;"
94 " cmp %w[rounds], #10 ;"
95 " bmi 0f ;"
96 " bne 3f ;"
97 " mov v3.16b, v1.16b ;"
98 " b 2f ;"
99 "0: mov v2.16b, v1.16b ;"
100 " ld1 {v3.2d}, [%[key]], #16 ;"
101 "1: aesd v0.16b, v2.16b ;"
102 " aesimc v0.16b, v0.16b ;"
103 "2: ld1 {v1.2d}, [%[key]], #16 ;"
104 " aesd v0.16b, v3.16b ;"
105 " aesimc v0.16b, v0.16b ;"
106 "3: ld1 {v2.2d}, [%[key]], #16 ;"
107 " subs %w[rounds], %w[rounds], #3 ;"
108 " aesd v0.16b, v1.16b ;"
109 " aesimc v0.16b, v0.16b ;"
110 " ld1 {v3.2d}, [%[key]], #16 ;"
111 " bpl 1b ;"
112 " aesd v0.16b, v2.16b ;"
113 " eor v0.16b, v0.16b, v3.16b ;"
114 " st1 {v0.16b}, %[out] ;"
115
116 : [out] "=Q"(*out),
117 [key] "=r"(dummy0),
118 [rounds] "=r"(dummy1)
119 : [in] "Q"(*in),
120 "1"(ctx->key_dec),
121 "2"(num_rounds(ctx) - 2)
122 : "cc");
123
124 kernel_neon_end();
125}
126
127static struct crypto_alg aes_alg = {
128 .cra_name = "aes",
129 .cra_driver_name = "aes-ce",
130 .cra_priority = 300,
131 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
132 .cra_blocksize = AES_BLOCK_SIZE,
133 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
134 .cra_module = THIS_MODULE,
135 .cra_cipher = {
136 .cia_min_keysize = AES_MIN_KEY_SIZE,
137 .cia_max_keysize = AES_MAX_KEY_SIZE,
138 .cia_setkey = crypto_aes_set_key,
139 .cia_encrypt = aes_cipher_encrypt,
140 .cia_decrypt = aes_cipher_decrypt
141 }
142};
143
144static int __init aes_mod_init(void)
145{
146 return crypto_register_alg(&aes_alg);
147}
148
149static void __exit aes_mod_exit(void)
150{
151 crypto_unregister_alg(&aes_alg);
152}
153
154module_cpu_feature_match(AES, aes_mod_init);
155module_exit(aes_mod_exit);
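
The single-block cipher above registers under the generic name "aes" with priority 300, so existing crypto API users pick it up transparently on CPUs that advertise the AES extension. A minimal in-kernel usage sketch (hypothetical caller, error handling trimmed, not part of the patch):

    #include <crypto/aes.h>
    #include <linux/crypto.h>
    #include <linux/err.h>

    static int demo_one_block(const u8 key[AES_KEYSIZE_128],
                              const u8 in[AES_BLOCK_SIZE],
                              u8 out[AES_BLOCK_SIZE])
    {
            /* resolves to the highest-priority "aes" provider, e.g. aes-ce */
            struct crypto_cipher *tfm = crypto_alloc_cipher("aes", 0, 0);
            int err;

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);
            err = crypto_cipher_setkey(tfm, key, AES_KEYSIZE_128);
            if (!err)
                    crypto_cipher_encrypt_one(tfm, out, in);  /* one 16-byte block */
            crypto_free_cipher(tfm);
            return err;
    }
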
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..685a18f731eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,133 @@
1/*
2 * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
3 * Crypto Extensions
4 *
5 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13
14#define AES_ENTRY(func) ENTRY(ce_ ## func)
15#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
16
17 .arch armv8-a+crypto
18
19 /* preload all round keys */
20 .macro load_round_keys, rounds, rk
21 cmp \rounds, #12
22 blo 2222f /* 128 bits */
23 beq 1111f /* 192 bits */
24 ld1 {v17.16b-v18.16b}, [\rk], #32
251111: ld1 {v19.16b-v20.16b}, [\rk], #32
262222: ld1 {v21.16b-v24.16b}, [\rk], #64
27 ld1 {v25.16b-v28.16b}, [\rk], #64
28 ld1 {v29.16b-v31.16b}, [\rk]
29 .endm
30
31 /* prepare for encryption with key in rk[] */
32 .macro enc_prepare, rounds, rk, ignore
33 load_round_keys \rounds, \rk
34 .endm
35
36 /* prepare for encryption (again) but with new key in rk[] */
37 .macro enc_switch_key, rounds, rk, ignore
38 load_round_keys \rounds, \rk
39 .endm
40
41 /* prepare for decryption with key in rk[] */
42 .macro dec_prepare, rounds, rk, ignore
43 load_round_keys \rounds, \rk
44 .endm
45
46 .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
47 aes\de \i0\().16b, \k\().16b
48 .ifnb \i1
49 aes\de \i1\().16b, \k\().16b
50 .ifnb \i3
51 aes\de \i2\().16b, \k\().16b
52 aes\de \i3\().16b, \k\().16b
53 .endif
54 .endif
55 aes\mc \i0\().16b, \i0\().16b
56 .ifnb \i1
57 aes\mc \i1\().16b, \i1\().16b
58 .ifnb \i3
59 aes\mc \i2\().16b, \i2\().16b
60 aes\mc \i3\().16b, \i3\().16b
61 .endif
62 .endif
63 .endm
64
65 /* up to 4 interleaved encryption rounds with the same round key */
66 .macro round_Nx, enc, k, i0, i1, i2, i3
67 .ifc \enc, e
68 do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
69 .else
70 do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
71 .endif
72 .endm
73
74 /* up to 4 interleaved final rounds */
75 .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
76 aes\de \i0\().16b, \k\().16b
77 .ifnb \i1
78 aes\de \i1\().16b, \k\().16b
79 .ifnb \i3
80 aes\de \i2\().16b, \k\().16b
81 aes\de \i3\().16b, \k\().16b
82 .endif
83 .endif
84 eor \i0\().16b, \i0\().16b, \k2\().16b
85 .ifnb \i1
86 eor \i1\().16b, \i1\().16b, \k2\().16b
87 .ifnb \i3
88 eor \i2\().16b, \i2\().16b, \k2\().16b
89 eor \i3\().16b, \i3\().16b, \k2\().16b
90 .endif
91 .endif
92 .endm
93
94 /* up to 4 interleaved blocks */
95 .macro do_block_Nx, enc, rounds, i0, i1, i2, i3
96 cmp \rounds, #12
97 blo 2222f /* 128 bits */
98 beq 1111f /* 192 bits */
99 round_Nx \enc, v17, \i0, \i1, \i2, \i3
100 round_Nx \enc, v18, \i0, \i1, \i2, \i3
1011111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
102 round_Nx \enc, v20, \i0, \i1, \i2, \i3
1032222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
104 round_Nx \enc, \key, \i0, \i1, \i2, \i3
105 .endr
106 fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
107 .endm
108
109 .macro encrypt_block, in, rounds, t0, t1, t2
110 do_block_Nx e, \rounds, \in
111 .endm
112
113 .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
114 do_block_Nx e, \rounds, \i0, \i1
115 .endm
116
117 .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
118 do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
119 .endm
120
121 .macro decrypt_block, in, rounds, t0, t1, t2
122 do_block_Nx d, \rounds, \in
123 .endm
124
125 .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
126 do_block_Nx d, \rounds, \i0, \i1
127 .endm
128
129 .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
130 do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
131 .endm
132
133#include "aes-modes.S"
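
The load_round_keys macro above always leaves the round keys in v17-v31 counting back from the last one, so the encrypt/decrypt macros can be key-size agnostic; the branch on 'cmp rounds, #12' only decides where loading starts. A small C model of that decision (a sketch for illustration, not part of the patch):

    /* which register the first round key lands in, per key size */
    static int first_round_key_reg(int rounds)
    {
            if (rounds < 12)        /* AES-128: 11 round keys in v21..v31 */
                    return 21;
            if (rounds == 12)       /* AES-192: 13 round keys in v19..v31 */
                    return 19;
            return 17;              /* AES-256: 15 round keys in v17..v31 */
    }
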
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@
1/*
2 * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/hwcap.h>
13#include <crypto/aes.h>
14#include <crypto/ablk_helper.h>
15#include <crypto/algapi.h>
16#include <linux/module.h>
17#include <linux/cpufeature.h>
18
19#ifdef USE_V8_CRYPTO_EXTENSIONS
20#define MODE "ce"
21#define PRIO 300
22#define aes_ecb_encrypt ce_aes_ecb_encrypt
23#define aes_ecb_decrypt ce_aes_ecb_decrypt
24#define aes_cbc_encrypt ce_aes_cbc_encrypt
25#define aes_cbc_decrypt ce_aes_cbc_decrypt
26#define aes_ctr_encrypt ce_aes_ctr_encrypt
27#define aes_xts_encrypt ce_aes_xts_encrypt
28#define aes_xts_decrypt ce_aes_xts_decrypt
29MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
30#else
31#define MODE "neon"
32#define PRIO 200
33#define aes_ecb_encrypt neon_aes_ecb_encrypt
34#define aes_ecb_decrypt neon_aes_ecb_decrypt
35#define aes_cbc_encrypt neon_aes_cbc_encrypt
36#define aes_cbc_decrypt neon_aes_cbc_decrypt
37#define aes_ctr_encrypt neon_aes_ctr_encrypt
38#define aes_xts_encrypt neon_aes_xts_encrypt
39#define aes_xts_decrypt neon_aes_xts_decrypt
40MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
41MODULE_ALIAS("ecb(aes)");
42MODULE_ALIAS("cbc(aes)");
43MODULE_ALIAS("ctr(aes)");
44MODULE_ALIAS("xts(aes)");
45#endif
46
47MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
48MODULE_LICENSE("GPL v2");
49
50/* defined in aes-modes.S */
51asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
52 int rounds, int blocks, int first);
53asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
54 int rounds, int blocks, int first);
55
56asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
57 int rounds, int blocks, u8 iv[], int first);
58asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
59 int rounds, int blocks, u8 iv[], int first);
60
61asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
62 int rounds, int blocks, u8 ctr[], int first);
63
64asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
65 int rounds, int blocks, u8 const rk2[], u8 iv[],
66 int first);
67asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
68 int rounds, int blocks, u8 const rk2[], u8 iv[],
69 int first);
70
71struct crypto_aes_xts_ctx {
72 struct crypto_aes_ctx key1;
73 struct crypto_aes_ctx __aligned(8) key2;
74};
75
76static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
77 unsigned int key_len)
78{
79 struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
80 int ret;
81
82 ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
83 if (!ret)
84 ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
85 key_len / 2);
86 if (!ret)
87 return 0;
88
89 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
90 return -EINVAL;
91}
92
93static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
94 struct scatterlist *src, unsigned int nbytes)
95{
96 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
97 int err, first, rounds = 6 + ctx->key_length / 4;
98 struct blkcipher_walk walk;
99 unsigned int blocks;
100
101 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
102 blkcipher_walk_init(&walk, dst, src, nbytes);
103 err = blkcipher_walk_virt(desc, &walk);
104
105 kernel_neon_begin();
106 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
107 aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
108 (u8 *)ctx->key_enc, rounds, blocks, first);
109 err = blkcipher_walk_done(desc, &walk, 0);
110 }
111 kernel_neon_end();
112 return err;
113}
114
115static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
116 struct scatterlist *src, unsigned int nbytes)
117{
118 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
119 int err, first, rounds = 6 + ctx->key_length / 4;
120 struct blkcipher_walk walk;
121 unsigned int blocks;
122
123 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
124 blkcipher_walk_init(&walk, dst, src, nbytes);
125 err = blkcipher_walk_virt(desc, &walk);
126
127 kernel_neon_begin();
128 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
129 aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
130 (u8 *)ctx->key_dec, rounds, blocks, first);
131 err = blkcipher_walk_done(desc, &walk, 0);
132 }
133 kernel_neon_end();
134 return err;
135}
136
137static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
138 struct scatterlist *src, unsigned int nbytes)
139{
140 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
141 int err, first, rounds = 6 + ctx->key_length / 4;
142 struct blkcipher_walk walk;
143 unsigned int blocks;
144
145 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
146 blkcipher_walk_init(&walk, dst, src, nbytes);
147 err = blkcipher_walk_virt(desc, &walk);
148
149 kernel_neon_begin();
150 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
151 aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
152 (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
153 first);
154 err = blkcipher_walk_done(desc, &walk, 0);
155 }
156 kernel_neon_end();
157 return err;
158}
159
160static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
161 struct scatterlist *src, unsigned int nbytes)
162{
163 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
164 int err, first, rounds = 6 + ctx->key_length / 4;
165 struct blkcipher_walk walk;
166 unsigned int blocks;
167
168 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
169 blkcipher_walk_init(&walk, dst, src, nbytes);
170 err = blkcipher_walk_virt(desc, &walk);
171
172 kernel_neon_begin();
173 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
174 aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
175 (u8 *)ctx->key_dec, rounds, blocks, walk.iv,
176 first);
177 err = blkcipher_walk_done(desc, &walk, 0);
178 }
179 kernel_neon_end();
180 return err;
181}
182
183static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
184 struct scatterlist *src, unsigned int nbytes)
185{
186 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
187 int err, first, rounds = 6 + ctx->key_length / 4;
188 struct blkcipher_walk walk;
189 int blocks;
190
191 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
192 blkcipher_walk_init(&walk, dst, src, nbytes);
193 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
194
195 first = 1;
196 kernel_neon_begin();
197 while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
198 aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
199 (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
200 first);
201 first = 0;
202 nbytes -= blocks * AES_BLOCK_SIZE;
203 if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
204 break;
205 err = blkcipher_walk_done(desc, &walk,
206 walk.nbytes % AES_BLOCK_SIZE);
207 }
208 if (nbytes) {
209 u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
210 u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
211 u8 __aligned(8) tail[AES_BLOCK_SIZE];
212
213 /*
214 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
215 * to tell aes_ctr_encrypt() to only read half a block.
216 */
217 blocks = (nbytes <= 8) ? -1 : 1;
218
219 aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
220 blocks, walk.iv, first);
221 memcpy(tdst, tail, nbytes);
222 err = blkcipher_walk_done(desc, &walk, 0);
223 }
224 kernel_neon_end();
225
226 return err;
227}
228
229static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
230 struct scatterlist *src, unsigned int nbytes)
231{
232 struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
233 int err, first, rounds = 6 + ctx->key1.key_length / 4;
234 struct blkcipher_walk walk;
235 unsigned int blocks;
236
237 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
238 blkcipher_walk_init(&walk, dst, src, nbytes);
239 err = blkcipher_walk_virt(desc, &walk);
240
241 kernel_neon_begin();
242 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
243 aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
244 (u8 *)ctx->key1.key_enc, rounds, blocks,
245 (u8 *)ctx->key2.key_enc, walk.iv, first);
246 err = blkcipher_walk_done(desc, &walk, 0);
247 }
248 kernel_neon_end();
249
250 return err;
251}
252
253static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
254 struct scatterlist *src, unsigned int nbytes)
255{
256 struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
257 int err, first, rounds = 6 + ctx->key1.key_length / 4;
258 struct blkcipher_walk walk;
259 unsigned int blocks;
260
261 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
262 blkcipher_walk_init(&walk, dst, src, nbytes);
263 err = blkcipher_walk_virt(desc, &walk);
264
265 kernel_neon_begin();
266 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
267 aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
268 (u8 *)ctx->key1.key_dec, rounds, blocks,
269 (u8 *)ctx->key2.key_enc, walk.iv, first);
270 err = blkcipher_walk_done(desc, &walk, 0);
271 }
272 kernel_neon_end();
273
274 return err;
275}
276
277static struct crypto_alg aes_algs[] = { {
278 .cra_name = "__ecb-aes-" MODE,
279 .cra_driver_name = "__driver-ecb-aes-" MODE,
280 .cra_priority = 0,
281 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
282 .cra_blocksize = AES_BLOCK_SIZE,
283 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
284 .cra_alignmask = 7,
285 .cra_type = &crypto_blkcipher_type,
286 .cra_module = THIS_MODULE,
287 .cra_blkcipher = {
288 .min_keysize = AES_MIN_KEY_SIZE,
289 .max_keysize = AES_MAX_KEY_SIZE,
290 .ivsize = AES_BLOCK_SIZE,
291 .setkey = crypto_aes_set_key,
292 .encrypt = ecb_encrypt,
293 .decrypt = ecb_decrypt,
294 },
295}, {
296 .cra_name = "__cbc-aes-" MODE,
297 .cra_driver_name = "__driver-cbc-aes-" MODE,
298 .cra_priority = 0,
299 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
300 .cra_blocksize = AES_BLOCK_SIZE,
301 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
302 .cra_alignmask = 7,
303 .cra_type = &crypto_blkcipher_type,
304 .cra_module = THIS_MODULE,
305 .cra_blkcipher = {
306 .min_keysize = AES_MIN_KEY_SIZE,
307 .max_keysize = AES_MAX_KEY_SIZE,
308 .ivsize = AES_BLOCK_SIZE,
309 .setkey = crypto_aes_set_key,
310 .encrypt = cbc_encrypt,
311 .decrypt = cbc_decrypt,
312 },
313}, {
314 .cra_name = "__ctr-aes-" MODE,
315 .cra_driver_name = "__driver-ctr-aes-" MODE,
316 .cra_priority = 0,
317 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
318 .cra_blocksize = 1,
319 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
320 .cra_alignmask = 7,
321 .cra_type = &crypto_blkcipher_type,
322 .cra_module = THIS_MODULE,
323 .cra_blkcipher = {
324 .min_keysize = AES_MIN_KEY_SIZE,
325 .max_keysize = AES_MAX_KEY_SIZE,
326 .ivsize = AES_BLOCK_SIZE,
327 .setkey = crypto_aes_set_key,
328 .encrypt = ctr_encrypt,
329 .decrypt = ctr_encrypt,
330 },
331}, {
332 .cra_name = "__xts-aes-" MODE,
333 .cra_driver_name = "__driver-xts-aes-" MODE,
334 .cra_priority = 0,
335 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
336 .cra_blocksize = AES_BLOCK_SIZE,
337 .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
338 .cra_alignmask = 7,
339 .cra_type = &crypto_blkcipher_type,
340 .cra_module = THIS_MODULE,
341 .cra_blkcipher = {
342 .min_keysize = 2 * AES_MIN_KEY_SIZE,
343 .max_keysize = 2 * AES_MAX_KEY_SIZE,
344 .ivsize = AES_BLOCK_SIZE,
345 .setkey = xts_set_key,
346 .encrypt = xts_encrypt,
347 .decrypt = xts_decrypt,
348 },
349}, {
350 .cra_name = "ecb(aes)",
351 .cra_driver_name = "ecb-aes-" MODE,
352 .cra_priority = PRIO,
353 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
354 .cra_blocksize = AES_BLOCK_SIZE,
355 .cra_ctxsize = sizeof(struct async_helper_ctx),
356 .cra_alignmask = 7,
357 .cra_type = &crypto_ablkcipher_type,
358 .cra_module = THIS_MODULE,
359 .cra_init = ablk_init,
360 .cra_exit = ablk_exit,
361 .cra_ablkcipher = {
362 .min_keysize = AES_MIN_KEY_SIZE,
363 .max_keysize = AES_MAX_KEY_SIZE,
364 .ivsize = AES_BLOCK_SIZE,
365 .setkey = ablk_set_key,
366 .encrypt = ablk_encrypt,
367 .decrypt = ablk_decrypt,
368 }
369}, {
370 .cra_name = "cbc(aes)",
371 .cra_driver_name = "cbc-aes-" MODE,
372 .cra_priority = PRIO,
373 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
374 .cra_blocksize = AES_BLOCK_SIZE,
375 .cra_ctxsize = sizeof(struct async_helper_ctx),
376 .cra_alignmask = 7,
377 .cra_type = &crypto_ablkcipher_type,
378 .cra_module = THIS_MODULE,
379 .cra_init = ablk_init,
380 .cra_exit = ablk_exit,
381 .cra_ablkcipher = {
382 .min_keysize = AES_MIN_KEY_SIZE,
383 .max_keysize = AES_MAX_KEY_SIZE,
384 .ivsize = AES_BLOCK_SIZE,
385 .setkey = ablk_set_key,
386 .encrypt = ablk_encrypt,
387 .decrypt = ablk_decrypt,
388 }
389}, {
390 .cra_name = "ctr(aes)",
391 .cra_driver_name = "ctr-aes-" MODE,
392 .cra_priority = PRIO,
393 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
394 .cra_blocksize = 1,
395 .cra_ctxsize = sizeof(struct async_helper_ctx),
396 .cra_alignmask = 7,
397 .cra_type = &crypto_ablkcipher_type,
398 .cra_module = THIS_MODULE,
399 .cra_init = ablk_init,
400 .cra_exit = ablk_exit,
401 .cra_ablkcipher = {
402 .min_keysize = AES_MIN_KEY_SIZE,
403 .max_keysize = AES_MAX_KEY_SIZE,
404 .ivsize = AES_BLOCK_SIZE,
405 .setkey = ablk_set_key,
406 .encrypt = ablk_encrypt,
407 .decrypt = ablk_decrypt,
408 }
409}, {
410 .cra_name = "xts(aes)",
411 .cra_driver_name = "xts-aes-" MODE,
412 .cra_priority = PRIO,
413 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
414 .cra_blocksize = AES_BLOCK_SIZE,
415 .cra_ctxsize = sizeof(struct async_helper_ctx),
416 .cra_alignmask = 7,
417 .cra_type = &crypto_ablkcipher_type,
418 .cra_module = THIS_MODULE,
419 .cra_init = ablk_init,
420 .cra_exit = ablk_exit,
421 .cra_ablkcipher = {
422 .min_keysize = 2 * AES_MIN_KEY_SIZE,
423 .max_keysize = 2 * AES_MAX_KEY_SIZE,
424 .ivsize = AES_BLOCK_SIZE,
425 .setkey = ablk_set_key,
426 .encrypt = ablk_encrypt,
427 .decrypt = ablk_decrypt,
428 }
429} };
430
431static int __init aes_init(void)
432{
433 return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
434}
435
436static void __exit aes_exit(void)
437{
438 crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
439}
440
441#ifdef USE_V8_CRYPTO_EXTENSIONS
442module_cpu_feature_match(AES, aes_init);
443#else
444module_init(aes_init);
445#endif
446module_exit(aes_exit);
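
For the CTR tail handling in ctr_encrypt() above, a worked example: the final chunk of a request that is not a whole number of 16-byte blocks is encrypted into an aligned stack buffer, and the 'blocks' argument is set to -1 when 8 or fewer input bytes remain so the assembly only loads half a block of source and never reads past the end of the input. A standalone sketch of that arithmetic (the request size is an assumption for illustration):

    #include <stdio.h>

    #define AES_BLOCK_SIZE 16

    int main(void)
    {
            unsigned int nbytes = 100;                     /* example request size */
            unsigned int full   = nbytes / AES_BLOCK_SIZE; /* 6 full blocks */
            unsigned int tail   = nbytes % AES_BLOCK_SIZE; /* 4 trailing bytes */
            int blocks_arg      = (tail <= 8) ? -1 : 1;    /* -1: load only 8 src bytes */

            printf("full=%u tail=%u blocks=%d\n", full, tail, blocks_arg);
            return 0;
    }
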
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
new file mode 100644
index 000000000000..f6e372c528eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-modes.S
@@ -0,0 +1,532 @@
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13 .text
14 .align 4
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare - setup NEON registers for encryption
26 * - dec_prepare - setup NEON registers for decryption
27 * - enc_switch_key - change to new key after having prepared for encryption
28 * - encrypt_block - encrypt a single block
29 * - decrypt block - decrypt a single block
30 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
36#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
37#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
38#define FRAME_POP ldp x29, x30, [sp],#16
39
40#if INTERLEAVE == 2
41
42aes_encrypt_block2x:
43 encrypt_block2x v0, v1, w3, x2, x6, w7
44 ret
45ENDPROC(aes_encrypt_block2x)
46
47aes_decrypt_block2x:
48 decrypt_block2x v0, v1, w3, x2, x6, w7
49 ret
50ENDPROC(aes_decrypt_block2x)
51
52#elif INTERLEAVE == 4
53
54aes_encrypt_block4x:
55 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
56 ret
57ENDPROC(aes_encrypt_block4x)
58
59aes_decrypt_block4x:
60 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
61 ret
62ENDPROC(aes_decrypt_block4x)
63
64#else
65#error INTERLEAVE should equal 2 or 4
66#endif
67
68 .macro do_encrypt_block2x
69 bl aes_encrypt_block2x
70 .endm
71
72 .macro do_decrypt_block2x
73 bl aes_decrypt_block2x
74 .endm
75
76 .macro do_encrypt_block4x
77 bl aes_encrypt_block4x
78 .endm
79
80 .macro do_decrypt_block4x
81 bl aes_decrypt_block4x
82 .endm
83
84#else
85#define FRAME_PUSH
86#define FRAME_POP
87
88 .macro do_encrypt_block2x
89 encrypt_block2x v0, v1, w3, x2, x6, w7
90 .endm
91
92 .macro do_decrypt_block2x
93 decrypt_block2x v0, v1, w3, x2, x6, w7
94 .endm
95
96 .macro do_encrypt_block4x
97 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
98 .endm
99
100 .macro do_decrypt_block4x
101 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
102 .endm
103
104#endif
105
106 /*
107 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108 * int blocks, int first)
109 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110 * int blocks, int first)
111 */
112
113AES_ENTRY(aes_ecb_encrypt)
114 FRAME_PUSH
115 cbz w5, .LecbencloopNx
116
117 enc_prepare w3, x2, x5
118
119.LecbencloopNx:
120#if INTERLEAVE >= 2
121 subs w4, w4, #INTERLEAVE
122 bmi .Lecbenc1x
123#if INTERLEAVE == 2
124 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
125 do_encrypt_block2x
126 st1 {v0.16b-v1.16b}, [x0], #32
127#else
128 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
129 do_encrypt_block4x
130 st1 {v0.16b-v3.16b}, [x0], #64
131#endif
132 b .LecbencloopNx
133.Lecbenc1x:
134 adds w4, w4, #INTERLEAVE
135 beq .Lecbencout
136#endif
137.Lecbencloop:
138 ld1 {v0.16b}, [x1], #16 /* get next pt block */
139 encrypt_block v0, w3, x2, x5, w6
140 st1 {v0.16b}, [x0], #16
141 subs w4, w4, #1
142 bne .Lecbencloop
143.Lecbencout:
144 FRAME_POP
145 ret
146AES_ENDPROC(aes_ecb_encrypt)
147
148
149AES_ENTRY(aes_ecb_decrypt)
150 FRAME_PUSH
151 cbz w5, .LecbdecloopNx
152
153 dec_prepare w3, x2, x5
154
155.LecbdecloopNx:
156#if INTERLEAVE >= 2
157 subs w4, w4, #INTERLEAVE
158 bmi .Lecbdec1x
159#if INTERLEAVE == 2
160 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
161 do_decrypt_block2x
162 st1 {v0.16b-v1.16b}, [x0], #32
163#else
164 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
165 do_decrypt_block4x
166 st1 {v0.16b-v3.16b}, [x0], #64
167#endif
168 b .LecbdecloopNx
169.Lecbdec1x:
170 adds w4, w4, #INTERLEAVE
171 beq .Lecbdecout
172#endif
173.Lecbdecloop:
174 ld1 {v0.16b}, [x1], #16 /* get next ct block */
175 decrypt_block v0, w3, x2, x5, w6
176 st1 {v0.16b}, [x0], #16
177 subs w4, w4, #1
178 bne .Lecbdecloop
179.Lecbdecout:
180 FRAME_POP
181 ret
182AES_ENDPROC(aes_ecb_decrypt)
183
184
185 /*
186 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187 * int blocks, u8 iv[], int first)
188 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189 * int blocks, u8 iv[], int first)
190 */
191
192AES_ENTRY(aes_cbc_encrypt)
193 cbz w6, .Lcbcencloop
194
195 ld1 {v0.16b}, [x5] /* get iv */
196 enc_prepare w3, x2, x5
197
198.Lcbcencloop:
199 ld1 {v1.16b}, [x1], #16 /* get next pt block */
200 eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
201 encrypt_block v0, w3, x2, x5, w6
202 st1 {v0.16b}, [x0], #16
203 subs w4, w4, #1
204 bne .Lcbcencloop
205 ret
206AES_ENDPROC(aes_cbc_encrypt)
207
208
209AES_ENTRY(aes_cbc_decrypt)
210 FRAME_PUSH
211 cbz w6, .LcbcdecloopNx
212
213 ld1 {v7.16b}, [x5] /* get iv */
214 dec_prepare w3, x2, x5
215
216.LcbcdecloopNx:
217#if INTERLEAVE >= 2
218 subs w4, w4, #INTERLEAVE
219 bmi .Lcbcdec1x
220#if INTERLEAVE == 2
221 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
222 mov v2.16b, v0.16b
223 mov v3.16b, v1.16b
224 do_decrypt_block2x
225 eor v0.16b, v0.16b, v7.16b
226 eor v1.16b, v1.16b, v2.16b
227 mov v7.16b, v3.16b
228 st1 {v0.16b-v1.16b}, [x0], #32
229#else
230 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
231 mov v4.16b, v0.16b
232 mov v5.16b, v1.16b
233 mov v6.16b, v2.16b
234 do_decrypt_block4x
235 sub x1, x1, #16
236 eor v0.16b, v0.16b, v7.16b
237 eor v1.16b, v1.16b, v4.16b
238 ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
239 eor v2.16b, v2.16b, v5.16b
240 eor v3.16b, v3.16b, v6.16b
241 st1 {v0.16b-v3.16b}, [x0], #64
242#endif
243 b .LcbcdecloopNx
244.Lcbcdec1x:
245 adds w4, w4, #INTERLEAVE
246 beq .Lcbcdecout
247#endif
248.Lcbcdecloop:
249 ld1 {v1.16b}, [x1], #16 /* get next ct block */
250 mov v0.16b, v1.16b /* ...and copy to v0 */
251 decrypt_block v0, w3, x2, x5, w6
252 eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
253 mov v7.16b, v1.16b /* ct is next iv */
254 st1 {v0.16b}, [x0], #16
255 subs w4, w4, #1
256 bne .Lcbcdecloop
257.Lcbcdecout:
258 FRAME_POP
259 ret
260AES_ENDPROC(aes_cbc_decrypt)
261
262
263 /*
264 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265 * int blocks, u8 ctr[], int first)
266 */
267
268AES_ENTRY(aes_ctr_encrypt)
269 FRAME_PUSH
270 cbnz w6, .Lctrfirst /* 1st time around? */
271 umov x5, v4.d[1] /* keep swabbed ctr in reg */
272 rev x5, x5
273#if INTERLEAVE >= 2
274 cmn w5, w4 /* 32 bit overflow? */
275 bcs .Lctrinc
276 add x5, x5, #1 /* increment BE ctr */
277 b .LctrincNx
278#else
279 b .Lctrinc
280#endif
281.Lctrfirst:
282 enc_prepare w3, x2, x6
283 ld1 {v4.16b}, [x5]
284 umov x5, v4.d[1] /* keep swabbed ctr in reg */
285 rev x5, x5
286#if INTERLEAVE >= 2
287 cmn w5, w4 /* 32 bit overflow? */
288 bcs .Lctrloop
289.LctrloopNx:
290 subs w4, w4, #INTERLEAVE
291 bmi .Lctr1x
292#if INTERLEAVE == 2
293 mov v0.8b, v4.8b
294 mov v1.8b, v4.8b
295 rev x7, x5
296 add x5, x5, #1
297 ins v0.d[1], x7
298 rev x7, x5
299 add x5, x5, #1
300 ins v1.d[1], x7
301 ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
302 do_encrypt_block2x
303 eor v0.16b, v0.16b, v2.16b
304 eor v1.16b, v1.16b, v3.16b
305 st1 {v0.16b-v1.16b}, [x0], #32
306#else
307 ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
308 dup v7.4s, w5
309 mov v0.16b, v4.16b
310 add v7.4s, v7.4s, v8.4s
311 mov v1.16b, v4.16b
312 rev32 v8.16b, v7.16b
313 mov v2.16b, v4.16b
314 mov v3.16b, v4.16b
315 mov v1.s[3], v8.s[0]
316 mov v2.s[3], v8.s[1]
317 mov v3.s[3], v8.s[2]
318 ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
319 do_encrypt_block4x
320 eor v0.16b, v5.16b, v0.16b
321 ld1 {v5.16b}, [x1], #16 /* get 1 input block */
322 eor v1.16b, v6.16b, v1.16b
323 eor v2.16b, v7.16b, v2.16b
324 eor v3.16b, v5.16b, v3.16b
325 st1 {v0.16b-v3.16b}, [x0], #64
326 add x5, x5, #INTERLEAVE
327#endif
328 cbz w4, .LctroutNx
329.LctrincNx:
330 rev x7, x5
331 ins v4.d[1], x7
332 b .LctrloopNx
333.LctroutNx:
334 sub x5, x5, #1
335 rev x7, x5
336 ins v4.d[1], x7
337 b .Lctrout
338.Lctr1x:
339 adds w4, w4, #INTERLEAVE
340 beq .Lctrout
341#endif
342.Lctrloop:
343 mov v0.16b, v4.16b
344 encrypt_block v0, w3, x2, x6, w7
345 subs w4, w4, #1
346 bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
347 ld1 {v3.16b}, [x1], #16
348 eor v3.16b, v0.16b, v3.16b
349 st1 {v3.16b}, [x0], #16
350 beq .Lctrout
351.Lctrinc:
352 adds x5, x5, #1 /* increment BE ctr */
353 rev x7, x5
354 ins v4.d[1], x7
355 bcc .Lctrloop /* no overflow? */
356 umov x7, v4.d[0] /* load upper word of ctr */
357 rev x7, x7 /* ... to handle the carry */
358 add x7, x7, #1
359 rev x7, x7
360 ins v4.d[0], x7
361 b .Lctrloop
362.Lctrhalfblock:
363 ld1 {v3.8b}, [x1]
364 eor v3.8b, v0.8b, v3.8b
365 st1 {v3.8b}, [x0]
366.Lctrout:
367 FRAME_POP
368 ret
369AES_ENDPROC(aes_ctr_encrypt)
370 .ltorg
371
372
373 /*
 374 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
375 * int blocks, u8 const rk2[], u8 iv[], int first)
376 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
377 * int blocks, u8 const rk2[], u8 iv[], int first)
378 */
379
380 .macro next_tweak, out, in, const, tmp
381 sshr \tmp\().2d, \in\().2d, #63
382 and \tmp\().16b, \tmp\().16b, \const\().16b
383 add \out\().2d, \in\().2d, \in\().2d
384 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
385 eor \out\().16b, \out\().16b, \tmp\().16b
386 .endm
387
388.Lxts_mul_x:
389 .word 1, 0, 0x87, 0
390
391AES_ENTRY(aes_xts_encrypt)
392 FRAME_PUSH
393 cbz w7, .LxtsencloopNx
394
395 ld1 {v4.16b}, [x6]
396 enc_prepare w3, x5, x6
397 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
398 enc_switch_key w3, x2, x6
399 ldr q7, .Lxts_mul_x
400 b .LxtsencNx
401
402.LxtsencloopNx:
403 ldr q7, .Lxts_mul_x
404 next_tweak v4, v4, v7, v8
405.LxtsencNx:
406#if INTERLEAVE >= 2
407 subs w4, w4, #INTERLEAVE
408 bmi .Lxtsenc1x
409#if INTERLEAVE == 2
410 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
411 next_tweak v5, v4, v7, v8
412 eor v0.16b, v0.16b, v4.16b
413 eor v1.16b, v1.16b, v5.16b
414 do_encrypt_block2x
415 eor v0.16b, v0.16b, v4.16b
416 eor v1.16b, v1.16b, v5.16b
417 st1 {v0.16b-v1.16b}, [x0], #32
418 cbz w4, .LxtsencoutNx
419 next_tweak v4, v5, v7, v8
420 b .LxtsencNx
421.LxtsencoutNx:
422 mov v4.16b, v5.16b
423 b .Lxtsencout
424#else
425 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
426 next_tweak v5, v4, v7, v8
427 eor v0.16b, v0.16b, v4.16b
428 next_tweak v6, v5, v7, v8
429 eor v1.16b, v1.16b, v5.16b
430 eor v2.16b, v2.16b, v6.16b
431 next_tweak v7, v6, v7, v8
432 eor v3.16b, v3.16b, v7.16b
433 do_encrypt_block4x
434 eor v3.16b, v3.16b, v7.16b
435 eor v0.16b, v0.16b, v4.16b
436 eor v1.16b, v1.16b, v5.16b
437 eor v2.16b, v2.16b, v6.16b
438 st1 {v0.16b-v3.16b}, [x0], #64
439 mov v4.16b, v7.16b
440 cbz w4, .Lxtsencout
441 b .LxtsencloopNx
442#endif
443.Lxtsenc1x:
444 adds w4, w4, #INTERLEAVE
445 beq .Lxtsencout
446#endif
447.Lxtsencloop:
448 ld1 {v1.16b}, [x1], #16
449 eor v0.16b, v1.16b, v4.16b
450 encrypt_block v0, w3, x2, x6, w7
451 eor v0.16b, v0.16b, v4.16b
452 st1 {v0.16b}, [x0], #16
453 subs w4, w4, #1
454 beq .Lxtsencout
455 next_tweak v4, v4, v7, v8
456 b .Lxtsencloop
457.Lxtsencout:
458 FRAME_POP
459 ret
460AES_ENDPROC(aes_xts_encrypt)
461
462
463AES_ENTRY(aes_xts_decrypt)
464 FRAME_PUSH
465 cbz w7, .LxtsdecloopNx
466
467 ld1 {v4.16b}, [x6]
468 enc_prepare w3, x5, x6
469 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
470 dec_prepare w3, x2, x6
471 ldr q7, .Lxts_mul_x
472 b .LxtsdecNx
473
474.LxtsdecloopNx:
475 ldr q7, .Lxts_mul_x
476 next_tweak v4, v4, v7, v8
477.LxtsdecNx:
478#if INTERLEAVE >= 2
479 subs w4, w4, #INTERLEAVE
480 bmi .Lxtsdec1x
481#if INTERLEAVE == 2
482 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
483 next_tweak v5, v4, v7, v8
484 eor v0.16b, v0.16b, v4.16b
485 eor v1.16b, v1.16b, v5.16b
486 do_decrypt_block2x
487 eor v0.16b, v0.16b, v4.16b
488 eor v1.16b, v1.16b, v5.16b
489 st1 {v0.16b-v1.16b}, [x0], #32
490 cbz w4, .LxtsdecoutNx
491 next_tweak v4, v5, v7, v8
492 b .LxtsdecNx
493.LxtsdecoutNx:
494 mov v4.16b, v5.16b
495 b .Lxtsdecout
496#else
497 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
498 next_tweak v5, v4, v7, v8
499 eor v0.16b, v0.16b, v4.16b
500 next_tweak v6, v5, v7, v8
501 eor v1.16b, v1.16b, v5.16b
502 eor v2.16b, v2.16b, v6.16b
503 next_tweak v7, v6, v7, v8
504 eor v3.16b, v3.16b, v7.16b
505 do_decrypt_block4x
506 eor v3.16b, v3.16b, v7.16b
507 eor v0.16b, v0.16b, v4.16b
508 eor v1.16b, v1.16b, v5.16b
509 eor v2.16b, v2.16b, v6.16b
510 st1 {v0.16b-v3.16b}, [x0], #64
511 mov v4.16b, v7.16b
512 cbz w4, .Lxtsdecout
513 b .LxtsdecloopNx
514#endif
515.Lxtsdec1x:
516 adds w4, w4, #INTERLEAVE
517 beq .Lxtsdecout
518#endif
519.Lxtsdecloop:
520 ld1 {v1.16b}, [x1], #16
521 eor v0.16b, v1.16b, v4.16b
522 decrypt_block v0, w3, x2, x6, w7
523 eor v0.16b, v0.16b, v4.16b
524 st1 {v0.16b}, [x0], #16
525 subs w4, w4, #1
526 beq .Lxtsdecout
527 next_tweak v4, v4, v7, v8
528 b .Lxtsdecloop
529.Lxtsdecout:
530 FRAME_POP
531 ret
532AES_ENDPROC(aes_xts_decrypt)
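The next_tweak macro and the .Lxts_mul_x constant used by both XTS routines above implement doubling of the 128-bit tweak in GF(2^128), reducing with the polynomial x^128 + x^7 + x^2 + x + 1 (0x87). A scalar C sketch of the same doubling, for reference only (the tweak is viewed as two little-endian 64-bit halves, exactly as the NEON lanes hold it):

#include <stdint.h>

/* t[0] = low 64 bits of the tweak, t[1] = high 64 bits */
static void xts_double(uint64_t t[2])
{
        uint64_t msb_hi = t[1] >> 63;          /* sshr #63 on the high lane  */
        uint64_t msb_lo = t[0] >> 63;          /* sshr #63 on the low lane   */

        t[1] = (t[1] << 1) | msb_lo;           /* add .2d + eor (carry bit)  */
        t[0] = (t[0] << 1) ^ (msb_hi ? 0x87 : 0); /* reduction via .Lxts_mul_x */
}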
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
new file mode 100644
index 000000000000..b93170e1cc93
--- /dev/null
+++ b/arch/arm64/crypto/aes-neon.S
@@ -0,0 +1,382 @@
1/*
2 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12
13#define AES_ENTRY(func) ENTRY(neon_ ## func)
14#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
15
16 /* multiply by polynomial 'x' in GF(2^8) */
17 .macro mul_by_x, out, in, temp, const
18 sshr \temp, \in, #7
19 add \out, \in, \in
20 and \temp, \temp, \const
21 eor \out, \out, \temp
22 .endm
23
24 /* preload the entire Sbox */
25 .macro prepare, sbox, shiftrows, temp
26 adr \temp, \sbox
27 movi v12.16b, #0x40
28 ldr q13, \shiftrows
29 movi v14.16b, #0x1b
30 ld1 {v16.16b-v19.16b}, [\temp], #64
31 ld1 {v20.16b-v23.16b}, [\temp], #64
32 ld1 {v24.16b-v27.16b}, [\temp], #64
33 ld1 {v28.16b-v31.16b}, [\temp]
34 .endm
35
36 /* do preload for encryption */
37 .macro enc_prepare, ignore0, ignore1, temp
38 prepare .LForward_Sbox, .LForward_ShiftRows, \temp
39 .endm
40
41 .macro enc_switch_key, ignore0, ignore1, temp
42 /* do nothing */
43 .endm
44
45 /* do preload for decryption */
46 .macro dec_prepare, ignore0, ignore1, temp
47 prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
48 .endm
49
 50 /* apply SubBytes transformation using the preloaded Sbox */
51 .macro sub_bytes, in
52 sub v9.16b, \in\().16b, v12.16b
53 tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
54 sub v10.16b, v9.16b, v12.16b
55 tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
56 sub v11.16b, v10.16b, v12.16b
57 tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
58 tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
59 .endm
60
61 /* apply MixColumns transformation */
62 .macro mix_columns, in
63 mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
64 rev32 v8.8h, \in\().8h
65 eor \in\().16b, v10.16b, \in\().16b
66 shl v9.4s, v8.4s, #24
67 shl v11.4s, \in\().4s, #24
68 sri v9.4s, v8.4s, #8
69 sri v11.4s, \in\().4s, #8
70 eor v9.16b, v9.16b, v8.16b
71 eor v10.16b, v10.16b, v9.16b
72 eor \in\().16b, v10.16b, v11.16b
73 .endm
74
75 /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
76 .macro inv_mix_columns, in
77 mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
78 mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
79 eor \in\().16b, \in\().16b, v11.16b
80 rev32 v11.8h, v11.8h
81 eor \in\().16b, \in\().16b, v11.16b
82 mix_columns \in
83 .endm
84
85 .macro do_block, enc, in, rounds, rk, rkp, i
86 ld1 {v15.16b}, [\rk]
87 add \rkp, \rk, #16
88 mov \i, \rounds
891111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
90 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
91 sub_bytes \in
92 ld1 {v15.16b}, [\rkp], #16
93 subs \i, \i, #1
94 beq 2222f
95 .if \enc == 1
96 mix_columns \in
97 .else
98 inv_mix_columns \in
99 .endif
100 b 1111b
1012222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
102 .endm
103
104 .macro encrypt_block, in, rounds, rk, rkp, i
105 do_block 1, \in, \rounds, \rk, \rkp, \i
106 .endm
107
108 .macro decrypt_block, in, rounds, rk, rkp, i
109 do_block 0, \in, \rounds, \rk, \rkp, \i
110 .endm
111
112 /*
113 * Interleaved versions: functionally equivalent to the
114 * ones above, but applied to 2 or 4 AES states in parallel.
115 */
116
117 .macro sub_bytes_2x, in0, in1
118 sub v8.16b, \in0\().16b, v12.16b
119 sub v9.16b, \in1\().16b, v12.16b
120 tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
121 tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
122 sub v10.16b, v8.16b, v12.16b
123 sub v11.16b, v9.16b, v12.16b
124 tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
125 tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
126 sub v8.16b, v10.16b, v12.16b
127 sub v9.16b, v11.16b, v12.16b
128 tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
129 tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
130 tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
131 tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
132 .endm
133
134 .macro sub_bytes_4x, in0, in1, in2, in3
135 sub v8.16b, \in0\().16b, v12.16b
136 tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
137 sub v9.16b, \in1\().16b, v12.16b
138 tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
139 sub v10.16b, \in2\().16b, v12.16b
140 tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
141 sub v11.16b, \in3\().16b, v12.16b
142 tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
143 tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
144 tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
145 sub v8.16b, v8.16b, v12.16b
146 tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
147 sub v9.16b, v9.16b, v12.16b
148 tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
149 sub v10.16b, v10.16b, v12.16b
150 tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
151 sub v11.16b, v11.16b, v12.16b
152 tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
153 sub v8.16b, v8.16b, v12.16b
154 tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
155 sub v9.16b, v9.16b, v12.16b
156 tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
157 sub v10.16b, v10.16b, v12.16b
158 tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
159 sub v11.16b, v11.16b, v12.16b
160 tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
161 tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
162 tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
163 .endm
164
165 .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
166 sshr \tmp0\().16b, \in0\().16b, #7
167 add \out0\().16b, \in0\().16b, \in0\().16b
168 sshr \tmp1\().16b, \in1\().16b, #7
169 and \tmp0\().16b, \tmp0\().16b, \const\().16b
170 add \out1\().16b, \in1\().16b, \in1\().16b
171 and \tmp1\().16b, \tmp1\().16b, \const\().16b
172 eor \out0\().16b, \out0\().16b, \tmp0\().16b
173 eor \out1\().16b, \out1\().16b, \tmp1\().16b
174 .endm
175
176 .macro mix_columns_2x, in0, in1
177 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
178 rev32 v10.8h, \in0\().8h
179 rev32 v11.8h, \in1\().8h
180 eor \in0\().16b, v8.16b, \in0\().16b
181 eor \in1\().16b, v9.16b, \in1\().16b
182 shl v12.4s, v10.4s, #24
183 shl v13.4s, v11.4s, #24
184 eor v8.16b, v8.16b, v10.16b
185 sri v12.4s, v10.4s, #8
186 shl v10.4s, \in0\().4s, #24
187 eor v9.16b, v9.16b, v11.16b
188 sri v13.4s, v11.4s, #8
189 shl v11.4s, \in1\().4s, #24
190 sri v10.4s, \in0\().4s, #8
191 eor \in0\().16b, v8.16b, v12.16b
192 sri v11.4s, \in1\().4s, #8
193 eor \in1\().16b, v9.16b, v13.16b
194 eor \in0\().16b, v10.16b, \in0\().16b
195 eor \in1\().16b, v11.16b, \in1\().16b
196 .endm
197
198 .macro inv_mix_cols_2x, in0, in1
199 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
200 mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
201 eor \in0\().16b, \in0\().16b, v8.16b
202 eor \in1\().16b, \in1\().16b, v9.16b
203 rev32 v8.8h, v8.8h
204 rev32 v9.8h, v9.8h
205 eor \in0\().16b, \in0\().16b, v8.16b
206 eor \in1\().16b, \in1\().16b, v9.16b
207 mix_columns_2x \in0, \in1
208 .endm
209
210 .macro inv_mix_cols_4x, in0, in1, in2, in3
211 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
212 mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
213 mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
214 mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
215 eor \in0\().16b, \in0\().16b, v8.16b
216 eor \in1\().16b, \in1\().16b, v9.16b
217 eor \in2\().16b, \in2\().16b, v10.16b
218 eor \in3\().16b, \in3\().16b, v11.16b
219 rev32 v8.8h, v8.8h
220 rev32 v9.8h, v9.8h
221 rev32 v10.8h, v10.8h
222 rev32 v11.8h, v11.8h
223 eor \in0\().16b, \in0\().16b, v8.16b
224 eor \in1\().16b, \in1\().16b, v9.16b
225 eor \in2\().16b, \in2\().16b, v10.16b
226 eor \in3\().16b, \in3\().16b, v11.16b
227 mix_columns_2x \in0, \in1
228 mix_columns_2x \in2, \in3
229 .endm
230
 231 .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
232 ld1 {v15.16b}, [\rk]
233 add \rkp, \rk, #16
234 mov \i, \rounds
2351111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
236 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
237 sub_bytes_2x \in0, \in1
238 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
239 tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
240 ld1 {v15.16b}, [\rkp], #16
241 subs \i, \i, #1
242 beq 2222f
243 .if \enc == 1
244 mix_columns_2x \in0, \in1
245 ldr q13, .LForward_ShiftRows
246 .else
247 inv_mix_cols_2x \in0, \in1
248 ldr q13, .LReverse_ShiftRows
249 .endif
250 movi v12.16b, #0x40
251 b 1111b
2522222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
253 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
254 .endm
255
256 .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
257 ld1 {v15.16b}, [\rk]
258 add \rkp, \rk, #16
259 mov \i, \rounds
2601111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
261 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
262 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
263 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
264 sub_bytes_4x \in0, \in1, \in2, \in3
265 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
266 tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
267 tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
268 tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
269 ld1 {v15.16b}, [\rkp], #16
270 subs \i, \i, #1
271 beq 2222f
272 .if \enc == 1
273 mix_columns_2x \in0, \in1
274 mix_columns_2x \in2, \in3
275 ldr q13, .LForward_ShiftRows
276 .else
277 inv_mix_cols_4x \in0, \in1, \in2, \in3
278 ldr q13, .LReverse_ShiftRows
279 .endif
280 movi v12.16b, #0x40
281 b 1111b
2822222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
283 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
284 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
285 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
286 .endm
287
288 .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
289 do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
290 .endm
291
292 .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
293 do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
294 .endm
295
296 .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
297 do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
298 .endm
299
300 .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
301 do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
302 .endm
303
304#include "aes-modes.S"
305
306 .text
307 .align 4
308.LForward_ShiftRows:
309 .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
310 .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
311
312.LReverse_ShiftRows:
313 .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
314 .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
315
316.LForward_Sbox:
317 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
318 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
319 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
320 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
321 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
322 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
323 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
324 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
325 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
326 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
327 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
328 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
329 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
330 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
331 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
332 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
333 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
334 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
335 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
336 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
337 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
338 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
339 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
340 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
341 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
342 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
343 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
344 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
345 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
346 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
347 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
348 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
349
350.LReverse_Sbox:
351 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
352 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
353 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
354 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
355 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
356 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
357 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
358 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
359 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
360 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
361 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
362 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
363 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
364 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
365 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
366 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
367 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
368 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
369 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
370 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
371 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
372 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
373 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
374 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
375 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
376 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
377 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
378 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
379 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
380 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
381 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
382 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
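The mul_by_x macro at the top of this file multiplies every byte of a vector by x in GF(2^8), reducing modulo x^8 + x^4 + x^3 + x + 1 (the 0x1b constant kept in v14); MixColumns and its inverse are built on top of it. Per byte, the sshr/and/add/eor sequence is equivalent to the usual xtime() helper, roughly as follows (illustrative C, not taken from the patch):

#include <stdint.h>

/* Multiply one AES state byte by x in GF(2^8). */
static uint8_t xtime(uint8_t b)
{
        uint8_t mask = (b & 0x80) ? 0x1b : 0x00;  /* sshr #7, then and with v14 */

        return (uint8_t)((b << 1) ^ mask);        /* add (b + b), then eor      */
}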
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
new file mode 100644
index 000000000000..b9e6eaf41c9b
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -0,0 +1,95 @@
1/*
2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
 6 * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/assembler.h>
21
22 DATA .req v0
23 SHASH .req v1
24 IN1 .req v2
25 T1 .req v2
26 T2 .req v3
27 T3 .req v4
28 VZR .req v5
29
30 .text
31 .arch armv8-a+crypto
32
33 /*
34 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
35 * struct ghash_key const *k, const char *head)
36 */
37ENTRY(pmull_ghash_update)
38 ld1 {DATA.16b}, [x1]
39 ld1 {SHASH.16b}, [x3]
40 eor VZR.16b, VZR.16b, VZR.16b
41
42 /* do the head block first, if supplied */
43 cbz x4, 0f
44 ld1 {IN1.2d}, [x4]
45 b 1f
46
470: ld1 {IN1.2d}, [x2], #16
48 sub w0, w0, #1
491: ext IN1.16b, IN1.16b, IN1.16b, #8
50CPU_LE( rev64 IN1.16b, IN1.16b )
51 eor DATA.16b, DATA.16b, IN1.16b
52
53 /* multiply DATA by SHASH in GF(2^128) */
54 ext T2.16b, DATA.16b, DATA.16b, #8
55 ext T3.16b, SHASH.16b, SHASH.16b, #8
56 eor T2.16b, T2.16b, DATA.16b
57 eor T3.16b, T3.16b, SHASH.16b
58
59 pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
60 pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
61 pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
62 eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
63 eor T2.16b, T2.16b, DATA.16b
64
65 ext T3.16b, VZR.16b, T2.16b, #8
66 ext T2.16b, T2.16b, VZR.16b, #8
67 eor DATA.16b, DATA.16b, T3.16b
68 eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
69 // carry-less multiplication
70
71 /* first phase of the reduction */
72 shl T3.2d, DATA.2d, #1
73 eor T3.16b, T3.16b, DATA.16b
74 shl T3.2d, T3.2d, #5
75 eor T3.16b, T3.16b, DATA.16b
76 shl T3.2d, T3.2d, #57
77 ext T2.16b, VZR.16b, T3.16b, #8
78 ext T3.16b, T3.16b, VZR.16b, #8
79 eor DATA.16b, DATA.16b, T2.16b
80 eor T1.16b, T1.16b, T3.16b
81
82 /* second phase of the reduction */
83 ushr T2.2d, DATA.2d, #5
84 eor T2.16b, T2.16b, DATA.16b
85 ushr T2.2d, T2.2d, #1
86 eor T2.16b, T2.16b, DATA.16b
87 ushr T2.2d, T2.2d, #1
88 eor T1.16b, T1.16b, T2.16b
89 eor DATA.16b, DATA.16b, T1.16b
90
91 cbnz w0, 0b
92
93 st1 {DATA.16b}, [x1]
94 ret
95ENDPROC(pmull_ghash_update)
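The multiply step above uses the usual Karatsuba split for a 128-bit carry-less product: with A = a1*x^64 + a0 and B = b1*x^64 + b0, three PMULLs (a1*b1, a0*b0 and (a1^a0)*(b1^b0)) suffice, because the middle term equals the third product XORed with the other two. A small host-side C check of that identity (illustrative only; clmul64 is a helper written here for the example, not a kernel function):

#include <assert.h>
#include <stdint.h>

/* Carry-less 64x64 -> 128-bit multiply, schoolbook shift-and-xor. */
static __uint128_t clmul64(uint64_t a, uint64_t b)
{
        __uint128_t acc = 0;

        for (int i = 0; i < 64; i++)
                if (b & (1ULL << i))
                        acc ^= (__uint128_t)a << i;
        return acc;
}

int main(void)
{
        uint64_t a1 = 0x0123456789abcdefULL, a0 = 0xfedcba9876543210ULL;
        uint64_t b1 = 0x0f1e2d3c4b5a6978ULL, b0 = 0x8796a5b4c3d2e1f0ULL;

        __uint128_t hi  = clmul64(a1, b1);               /* pmull2: a1 * b1 */
        __uint128_t lo  = clmul64(a0, b0);               /* pmull:  a0 * b0 */
        __uint128_t mid = clmul64(a1 ^ a0, b1 ^ b0) ^ hi ^ lo;

        /* Karatsuba: the middle term equals a1*b0 + a0*b1 in GF(2) */
        assert(mid == (clmul64(a1, b0) ^ clmul64(a0, b1)));
        return 0;
}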
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
new file mode 100644
index 000000000000..b92baf3f68c7
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -0,0 +1,155 @@
1/*
2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <linux/cpufeature.h>
15#include <linux/crypto.h>
16#include <linux/module.h>
17
18MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
19MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
20MODULE_LICENSE("GPL v2");
21
22#define GHASH_BLOCK_SIZE 16
23#define GHASH_DIGEST_SIZE 16
24
25struct ghash_key {
26 u64 a;
27 u64 b;
28};
29
30struct ghash_desc_ctx {
31 u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
32 u8 buf[GHASH_BLOCK_SIZE];
33 u32 count;
34};
35
36asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
37 struct ghash_key const *k, const char *head);
38
39static int ghash_init(struct shash_desc *desc)
40{
41 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
42
43 *ctx = (struct ghash_desc_ctx){};
44 return 0;
45}
46
47static int ghash_update(struct shash_desc *desc, const u8 *src,
48 unsigned int len)
49{
50 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
51 unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
52
53 ctx->count += len;
54
55 if ((partial + len) >= GHASH_BLOCK_SIZE) {
56 struct ghash_key *key = crypto_shash_ctx(desc->tfm);
57 int blocks;
58
59 if (partial) {
60 int p = GHASH_BLOCK_SIZE - partial;
61
62 memcpy(ctx->buf + partial, src, p);
63 src += p;
64 len -= p;
65 }
66
67 blocks = len / GHASH_BLOCK_SIZE;
68 len %= GHASH_BLOCK_SIZE;
69
70 kernel_neon_begin_partial(6);
71 pmull_ghash_update(blocks, ctx->digest, src, key,
72 partial ? ctx->buf : NULL);
73 kernel_neon_end();
74 src += blocks * GHASH_BLOCK_SIZE;
75 }
76 if (len)
77 memcpy(ctx->buf + partial, src, len);
78 return 0;
79}
80
81static int ghash_final(struct shash_desc *desc, u8 *dst)
82{
83 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
84 unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
85
86 if (partial) {
87 struct ghash_key *key = crypto_shash_ctx(desc->tfm);
88
89 memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
90
91 kernel_neon_begin_partial(6);
92 pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
93 kernel_neon_end();
94 }
95 put_unaligned_be64(ctx->digest[1], dst);
96 put_unaligned_be64(ctx->digest[0], dst + 8);
97
98 *ctx = (struct ghash_desc_ctx){};
99 return 0;
100}
101
102static int ghash_setkey(struct crypto_shash *tfm,
103 const u8 *inkey, unsigned int keylen)
104{
105 struct ghash_key *key = crypto_shash_ctx(tfm);
106 u64 a, b;
107
108 if (keylen != GHASH_BLOCK_SIZE) {
109 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
110 return -EINVAL;
111 }
112
113 /* perform multiplication by 'x' in GF(2^128) */
114 b = get_unaligned_be64(inkey);
115 a = get_unaligned_be64(inkey + 8);
116
117 key->a = (a << 1) | (b >> 63);
118 key->b = (b << 1) | (a >> 63);
119
120 if (b >> 63)
121 key->b ^= 0xc200000000000000UL;
122
123 return 0;
124}
125
126static struct shash_alg ghash_alg = {
127 .digestsize = GHASH_DIGEST_SIZE,
128 .init = ghash_init,
129 .update = ghash_update,
130 .final = ghash_final,
131 .setkey = ghash_setkey,
132 .descsize = sizeof(struct ghash_desc_ctx),
133 .base = {
134 .cra_name = "ghash",
135 .cra_driver_name = "ghash-ce",
136 .cra_priority = 200,
137 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
138 .cra_blocksize = GHASH_BLOCK_SIZE,
139 .cra_ctxsize = sizeof(struct ghash_key),
140 .cra_module = THIS_MODULE,
141 },
142};
143
144static int __init ghash_ce_mod_init(void)
145{
146 return crypto_register_shash(&ghash_alg);
147}
148
149static void __exit ghash_ce_mod_exit(void)
150{
151 crypto_unregister_shash(&ghash_alg);
152}
153
154module_cpu_feature_match(PMULL, ghash_ce_mod_init);
155module_exit(ghash_ce_mod_exit);
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
new file mode 100644
index 000000000000..09d57d98609c
--- /dev/null
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -0,0 +1,153 @@
1/*
2 * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .arch armv8-a+crypto
16
17 k0 .req v0
18 k1 .req v1
19 k2 .req v2
20 k3 .req v3
21
22 t0 .req v4
23 t1 .req v5
24
25 dga .req q6
26 dgav .req v6
27 dgb .req s7
28 dgbv .req v7
29
30 dg0q .req q12
31 dg0s .req s12
32 dg0v .req v12
33 dg1s .req s13
34 dg1v .req v13
35 dg2s .req s14
36
37 .macro add_only, op, ev, rc, s0, dg1
38 .ifc \ev, ev
39 add t1.4s, v\s0\().4s, \rc\().4s
40 sha1h dg2s, dg0s
41 .ifnb \dg1
42 sha1\op dg0q, \dg1, t0.4s
43 .else
44 sha1\op dg0q, dg1s, t0.4s
45 .endif
46 .else
47 .ifnb \s0
48 add t0.4s, v\s0\().4s, \rc\().4s
49 .endif
50 sha1h dg1s, dg0s
51 sha1\op dg0q, dg2s, t1.4s
52 .endif
53 .endm
54
55 .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
56 sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
57 add_only \op, \ev, \rc, \s1, \dg1
58 sha1su1 v\s0\().4s, v\s3\().4s
59 .endm
60
61 /*
62 * The SHA1 round constants
63 */
64 .align 4
65.Lsha1_rcon:
66 .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
67
68 /*
69 * void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
70 * u8 *head, long bytes)
71 */
72ENTRY(sha1_ce_transform)
73 /* load round constants */
74 adr x6, .Lsha1_rcon
75 ld1r {k0.4s}, [x6], #4
76 ld1r {k1.4s}, [x6], #4
77 ld1r {k2.4s}, [x6], #4
78 ld1r {k3.4s}, [x6]
79
80 /* load state */
81 ldr dga, [x2]
82 ldr dgb, [x2, #16]
83
84 /* load partial state (if supplied) */
85 cbz x3, 0f
86 ld1 {v8.4s-v11.4s}, [x3]
87 b 1f
88
89 /* load input */
900: ld1 {v8.4s-v11.4s}, [x1], #64
91 sub w0, w0, #1
92
931:
94CPU_LE( rev32 v8.16b, v8.16b )
95CPU_LE( rev32 v9.16b, v9.16b )
96CPU_LE( rev32 v10.16b, v10.16b )
97CPU_LE( rev32 v11.16b, v11.16b )
98
992: add t0.4s, v8.4s, k0.4s
100 mov dg0v.16b, dgav.16b
101
102 add_update c, ev, k0, 8, 9, 10, 11, dgb
103 add_update c, od, k0, 9, 10, 11, 8
104 add_update c, ev, k0, 10, 11, 8, 9
105 add_update c, od, k0, 11, 8, 9, 10
106 add_update c, ev, k1, 8, 9, 10, 11
107
108 add_update p, od, k1, 9, 10, 11, 8
109 add_update p, ev, k1, 10, 11, 8, 9
110 add_update p, od, k1, 11, 8, 9, 10
111 add_update p, ev, k1, 8, 9, 10, 11
112 add_update p, od, k2, 9, 10, 11, 8
113
114 add_update m, ev, k2, 10, 11, 8, 9
115 add_update m, od, k2, 11, 8, 9, 10
116 add_update m, ev, k2, 8, 9, 10, 11
117 add_update m, od, k2, 9, 10, 11, 8
118 add_update m, ev, k3, 10, 11, 8, 9
119
120 add_update p, od, k3, 11, 8, 9, 10
121 add_only p, ev, k3, 9
122 add_only p, od, k3, 10
123 add_only p, ev, k3, 11
124 add_only p, od
125
126 /* update state */
127 add dgbv.2s, dgbv.2s, dg1v.2s
128 add dgav.4s, dgav.4s, dg0v.4s
129
130 cbnz w0, 0b
131
132 /*
133 * Final block: add padding and total bit count.
134 * Skip if we have no total byte count in x4. In that case, the input
135 * size was not a round multiple of the block size, and the padding is
136 * handled by the C code.
137 */
138 cbz x4, 3f
139 movi v9.2d, #0
140 mov x8, #0x80000000
141 movi v10.2d, #0
142 ror x7, x4, #29 // ror(lsl(x4, 3), 32)
143 fmov d8, x8
144 mov x4, #0
145 mov v11.d[0], xzr
146 mov v11.d[1], x7
147 b 2b
148
149 /* store new state */
1503: str dga, [x2]
151 str dgb, [x2, #16]
152 ret
153ENDPROC(sha1_ce_transform)
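The final-block fast path above only runs when the total byte count in x4 is a whole number of 64-byte blocks, so the trailer it builds in v8-v11 is the standard fixed tail: a 0x80 marker byte, zero padding, and the bit count as a 64-bit big-endian value in the last eight bytes. Expressed in C for reference (a sketch assuming total_bytes % 64 == 0; the general case goes through the glue code's padding path):

#include <stdint.h>
#include <string.h>

static void build_final_block(uint8_t block[64], uint64_t total_bytes)
{
        uint64_t bits = total_bytes * 8;

        memset(block, 0, 64);
        block[0] = 0x80;                        /* padding marker           */
        for (int i = 0; i < 8; i++)             /* big-endian bit count     */
                block[56 + i] = (uint8_t)(bits >> (56 - 8 * i));
}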
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
new file mode 100644
index 000000000000..6fe83f37a750
--- /dev/null
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -0,0 +1,174 @@
1/*
2 * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <crypto/sha.h>
15#include <linux/cpufeature.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
20MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
21MODULE_LICENSE("GPL v2");
22
23asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
24 u8 *head, long bytes);
25
26static int sha1_init(struct shash_desc *desc)
27{
28 struct sha1_state *sctx = shash_desc_ctx(desc);
29
30 *sctx = (struct sha1_state){
31 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
32 };
33 return 0;
34}
35
36static int sha1_update(struct shash_desc *desc, const u8 *data,
37 unsigned int len)
38{
39 struct sha1_state *sctx = shash_desc_ctx(desc);
40 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
41
42 sctx->count += len;
43
44 if ((partial + len) >= SHA1_BLOCK_SIZE) {
45 int blocks;
46
47 if (partial) {
48 int p = SHA1_BLOCK_SIZE - partial;
49
50 memcpy(sctx->buffer + partial, data, p);
51 data += p;
52 len -= p;
53 }
54
55 blocks = len / SHA1_BLOCK_SIZE;
56 len %= SHA1_BLOCK_SIZE;
57
58 kernel_neon_begin_partial(16);
59 sha1_ce_transform(blocks, data, sctx->state,
60 partial ? sctx->buffer : NULL, 0);
61 kernel_neon_end();
62
63 data += blocks * SHA1_BLOCK_SIZE;
64 partial = 0;
65 }
66 if (len)
67 memcpy(sctx->buffer + partial, data, len);
68 return 0;
69}
70
71static int sha1_final(struct shash_desc *desc, u8 *out)
72{
73 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
74
75 struct sha1_state *sctx = shash_desc_ctx(desc);
76 __be64 bits = cpu_to_be64(sctx->count << 3);
77 __be32 *dst = (__be32 *)out;
78 int i;
79
80 u32 padlen = SHA1_BLOCK_SIZE
81 - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE);
82
83 sha1_update(desc, padding, padlen);
84 sha1_update(desc, (const u8 *)&bits, sizeof(bits));
85
86 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
87 put_unaligned_be32(sctx->state[i], dst++);
88
89 *sctx = (struct sha1_state){};
90 return 0;
91}
92
93static int sha1_finup(struct shash_desc *desc, const u8 *data,
94 unsigned int len, u8 *out)
95{
96 struct sha1_state *sctx = shash_desc_ctx(desc);
97 __be32 *dst = (__be32 *)out;
98 int blocks;
99 int i;
100
101 if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) {
102 sha1_update(desc, data, len);
103 return sha1_final(desc, out);
104 }
105
106 /*
107 * Use a fast path if the input is a multiple of 64 bytes. In
108 * this case, there is no need to copy data around, and we can
109 * perform the entire digest calculation in a single invocation
110 * of sha1_ce_transform()
111 */
112 blocks = len / SHA1_BLOCK_SIZE;
113
114 kernel_neon_begin_partial(16);
115 sha1_ce_transform(blocks, data, sctx->state, NULL, len);
116 kernel_neon_end();
117
118 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
119 put_unaligned_be32(sctx->state[i], dst++);
120
121 *sctx = (struct sha1_state){};
122 return 0;
123}
124
125static int sha1_export(struct shash_desc *desc, void *out)
126{
127 struct sha1_state *sctx = shash_desc_ctx(desc);
128 struct sha1_state *dst = out;
129
130 *dst = *sctx;
131 return 0;
132}
133
134static int sha1_import(struct shash_desc *desc, const void *in)
135{
136 struct sha1_state *sctx = shash_desc_ctx(desc);
137 struct sha1_state const *src = in;
138
139 *sctx = *src;
140 return 0;
141}
142
143static struct shash_alg alg = {
144 .init = sha1_init,
145 .update = sha1_update,
146 .final = sha1_final,
147 .finup = sha1_finup,
148 .export = sha1_export,
149 .import = sha1_import,
150 .descsize = sizeof(struct sha1_state),
151 .digestsize = SHA1_DIGEST_SIZE,
152 .statesize = sizeof(struct sha1_state),
153 .base = {
154 .cra_name = "sha1",
155 .cra_driver_name = "sha1-ce",
156 .cra_priority = 200,
157 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
158 .cra_blocksize = SHA1_BLOCK_SIZE,
159 .cra_module = THIS_MODULE,
160 }
161};
162
163static int __init sha1_ce_mod_init(void)
164{
165 return crypto_register_shash(&alg);
166}
167
168static void __exit sha1_ce_mod_fini(void)
169{
170 crypto_unregister_shash(&alg);
171}
172
173module_cpu_feature_match(SHA1, sha1_ce_mod_init);
174module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
new file mode 100644
index 000000000000..7f29fc031ea8
--- /dev/null
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -0,0 +1,156 @@
1/*
2 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .arch armv8-a+crypto
16
17 dga .req q20
18 dgav .req v20
19 dgb .req q21
20 dgbv .req v21
21
22 t0 .req v22
23 t1 .req v23
24
25 dg0q .req q24
26 dg0v .req v24
27 dg1q .req q25
28 dg1v .req v25
29 dg2q .req q26
30 dg2v .req v26
31
32 .macro add_only, ev, rc, s0
33 mov dg2v.16b, dg0v.16b
34 .ifeq \ev
35 add t1.4s, v\s0\().4s, \rc\().4s
36 sha256h dg0q, dg1q, t0.4s
37 sha256h2 dg1q, dg2q, t0.4s
38 .else
39 .ifnb \s0
40 add t0.4s, v\s0\().4s, \rc\().4s
41 .endif
42 sha256h dg0q, dg1q, t1.4s
43 sha256h2 dg1q, dg2q, t1.4s
44 .endif
45 .endm
46
47 .macro add_update, ev, rc, s0, s1, s2, s3
48 sha256su0 v\s0\().4s, v\s1\().4s
49 add_only \ev, \rc, \s1
50 sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
51 .endm
52
53 /*
54 * The SHA-256 round constants
55 */
56 .align 4
57.Lsha2_rcon:
58 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
59 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
60 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
61 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
62 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
63 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
64 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
65 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
66 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
67 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
68 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
69 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
70 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
71 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
72 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
73 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
74
75 /*
76 * void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
77 * u8 *head, long bytes)
78 */
79ENTRY(sha2_ce_transform)
80 /* load round constants */
81 adr x8, .Lsha2_rcon
82 ld1 { v0.4s- v3.4s}, [x8], #64
83 ld1 { v4.4s- v7.4s}, [x8], #64
84 ld1 { v8.4s-v11.4s}, [x8], #64
85 ld1 {v12.4s-v15.4s}, [x8]
86
87 /* load state */
88 ldp dga, dgb, [x2]
89
90 /* load partial input (if supplied) */
91 cbz x3, 0f
92 ld1 {v16.4s-v19.4s}, [x3]
93 b 1f
94
95 /* load input */
960: ld1 {v16.4s-v19.4s}, [x1], #64
97 sub w0, w0, #1
98
991:
100CPU_LE( rev32 v16.16b, v16.16b )
101CPU_LE( rev32 v17.16b, v17.16b )
102CPU_LE( rev32 v18.16b, v18.16b )
103CPU_LE( rev32 v19.16b, v19.16b )
104
1052: add t0.4s, v16.4s, v0.4s
106 mov dg0v.16b, dgav.16b
107 mov dg1v.16b, dgbv.16b
108
109 add_update 0, v1, 16, 17, 18, 19
110 add_update 1, v2, 17, 18, 19, 16
111 add_update 0, v3, 18, 19, 16, 17
112 add_update 1, v4, 19, 16, 17, 18
113
114 add_update 0, v5, 16, 17, 18, 19
115 add_update 1, v6, 17, 18, 19, 16
116 add_update 0, v7, 18, 19, 16, 17
117 add_update 1, v8, 19, 16, 17, 18
118
119 add_update 0, v9, 16, 17, 18, 19
120 add_update 1, v10, 17, 18, 19, 16
121 add_update 0, v11, 18, 19, 16, 17
122 add_update 1, v12, 19, 16, 17, 18
123
124 add_only 0, v13, 17
125 add_only 1, v14, 18
126 add_only 0, v15, 19
127 add_only 1
128
129 /* update state */
130 add dgav.4s, dgav.4s, dg0v.4s
131 add dgbv.4s, dgbv.4s, dg1v.4s
132
133 /* handled all input blocks? */
134 cbnz w0, 0b
135
136 /*
137 * Final block: add padding and total bit count.
138 * Skip if we have no total byte count in x4. In that case, the input
139 * size was not a round multiple of the block size, and the padding is
140 * handled by the C code.
141 */
142 cbz x4, 3f
143 movi v17.2d, #0
144 mov x8, #0x80000000
145 movi v18.2d, #0
146 ror x7, x4, #29 // ror(lsl(x4, 3), 32)
147 fmov d16, x8
148 mov x4, #0
149 mov v19.d[0], xzr
150 mov v19.d[1], x7
151 b 2b
152
153 /* store new state */
1543: stp dga, dgb, [x2]
155 ret
156ENDPROC(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
new file mode 100644
index 000000000000..c294e67d3925
--- /dev/null
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -0,0 +1,255 @@
1/*
2 * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <crypto/sha.h>
15#include <linux/cpufeature.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
20MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
21MODULE_LICENSE("GPL v2");
22
23asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
24 u8 *head, long bytes);
25
26static int sha224_init(struct shash_desc *desc)
27{
28 struct sha256_state *sctx = shash_desc_ctx(desc);
29
30 *sctx = (struct sha256_state){
31 .state = {
32 SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
33 SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
34 }
35 };
36 return 0;
37}
38
39static int sha256_init(struct shash_desc *desc)
40{
41 struct sha256_state *sctx = shash_desc_ctx(desc);
42
43 *sctx = (struct sha256_state){
44 .state = {
45 SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
46 SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
47 }
48 };
49 return 0;
50}
51
52static int sha2_update(struct shash_desc *desc, const u8 *data,
53 unsigned int len)
54{
55 struct sha256_state *sctx = shash_desc_ctx(desc);
56 unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
57
58 sctx->count += len;
59
60 if ((partial + len) >= SHA256_BLOCK_SIZE) {
61 int blocks;
62
63 if (partial) {
64 int p = SHA256_BLOCK_SIZE - partial;
65
66 memcpy(sctx->buf + partial, data, p);
67 data += p;
68 len -= p;
69 }
70
71 blocks = len / SHA256_BLOCK_SIZE;
72 len %= SHA256_BLOCK_SIZE;
73
74 kernel_neon_begin_partial(28);
75 sha2_ce_transform(blocks, data, sctx->state,
76 partial ? sctx->buf : NULL, 0);
77 kernel_neon_end();
78
79 data += blocks * SHA256_BLOCK_SIZE;
80 partial = 0;
81 }
82 if (len)
83 memcpy(sctx->buf + partial, data, len);
84 return 0;
85}
86
87static void sha2_final(struct shash_desc *desc)
88{
89 static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
90
91 struct sha256_state *sctx = shash_desc_ctx(desc);
92 __be64 bits = cpu_to_be64(sctx->count << 3);
93 u32 padlen = SHA256_BLOCK_SIZE
94 - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE);
95
96 sha2_update(desc, padding, padlen);
97 sha2_update(desc, (const u8 *)&bits, sizeof(bits));
98}
99
100static int sha224_final(struct shash_desc *desc, u8 *out)
101{
102 struct sha256_state *sctx = shash_desc_ctx(desc);
103 __be32 *dst = (__be32 *)out;
104 int i;
105
106 sha2_final(desc);
107
108 for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
109 put_unaligned_be32(sctx->state[i], dst++);
110
111 *sctx = (struct sha256_state){};
112 return 0;
113}
114
115static int sha256_final(struct shash_desc *desc, u8 *out)
116{
117 struct sha256_state *sctx = shash_desc_ctx(desc);
118 __be32 *dst = (__be32 *)out;
119 int i;
120
121 sha2_final(desc);
122
123 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
124 put_unaligned_be32(sctx->state[i], dst++);
125
126 *sctx = (struct sha256_state){};
127 return 0;
128}
129
130static void sha2_finup(struct shash_desc *desc, const u8 *data,
131 unsigned int len)
132{
133 struct sha256_state *sctx = shash_desc_ctx(desc);
134 int blocks;
135
136 if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
137 sha2_update(desc, data, len);
138 sha2_final(desc);
139 return;
140 }
141
142 /*
143 * Use a fast path if the input is a multiple of 64 bytes. In
144 * this case, there is no need to copy data around, and we can
145 * perform the entire digest calculation in a single invocation
146 * of sha2_ce_transform()
147 */
148 blocks = len / SHA256_BLOCK_SIZE;
149
150 kernel_neon_begin_partial(28);
151 sha2_ce_transform(blocks, data, sctx->state, NULL, len);
152 kernel_neon_end();
153 data += blocks * SHA256_BLOCK_SIZE;
154}
155
156static int sha224_finup(struct shash_desc *desc, const u8 *data,
157 unsigned int len, u8 *out)
158{
159 struct sha256_state *sctx = shash_desc_ctx(desc);
160 __be32 *dst = (__be32 *)out;
161 int i;
162
163 sha2_finup(desc, data, len);
164
165 for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
166 put_unaligned_be32(sctx->state[i], dst++);
167
168 *sctx = (struct sha256_state){};
169 return 0;
170}
171
172static int sha256_finup(struct shash_desc *desc, const u8 *data,
173 unsigned int len, u8 *out)
174{
175 struct sha256_state *sctx = shash_desc_ctx(desc);
176 __be32 *dst = (__be32 *)out;
177 int i;
178
179 sha2_finup(desc, data, len);
180
181 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
182 put_unaligned_be32(sctx->state[i], dst++);
183
184 *sctx = (struct sha256_state){};
185 return 0;
186}
187
188static int sha2_export(struct shash_desc *desc, void *out)
189{
190 struct sha256_state *sctx = shash_desc_ctx(desc);
191 struct sha256_state *dst = out;
192
193 *dst = *sctx;
194 return 0;
195}
196
197static int sha2_import(struct shash_desc *desc, const void *in)
198{
199 struct sha256_state *sctx = shash_desc_ctx(desc);
200 struct sha256_state const *src = in;
201
202 *sctx = *src;
203 return 0;
204}
205
206static struct shash_alg algs[] = { {
207 .init = sha224_init,
208 .update = sha2_update,
209 .final = sha224_final,
210 .finup = sha224_finup,
211 .export = sha2_export,
212 .import = sha2_import,
213 .descsize = sizeof(struct sha256_state),
214 .digestsize = SHA224_DIGEST_SIZE,
215 .statesize = sizeof(struct sha256_state),
216 .base = {
217 .cra_name = "sha224",
218 .cra_driver_name = "sha224-ce",
219 .cra_priority = 200,
220 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
221 .cra_blocksize = SHA256_BLOCK_SIZE,
222 .cra_module = THIS_MODULE,
223 }
224}, {
225 .init = sha256_init,
226 .update = sha2_update,
227 .final = sha256_final,
228 .finup = sha256_finup,
229 .export = sha2_export,
230 .import = sha2_import,
231 .descsize = sizeof(struct sha256_state),
232 .digestsize = SHA256_DIGEST_SIZE,
233 .statesize = sizeof(struct sha256_state),
234 .base = {
235 .cra_name = "sha256",
236 .cra_driver_name = "sha256-ce",
237 .cra_priority = 200,
238 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
239 .cra_blocksize = SHA256_BLOCK_SIZE,
240 .cra_module = THIS_MODULE,
241 }
242} };
243
244static int __init sha2_ce_mod_init(void)
245{
246 return crypto_register_shashes(algs, ARRAY_SIZE(algs));
247}
248
249static void __exit sha2_ce_mod_fini(void)
250{
251 crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
252}
253
254module_cpu_feature_match(SHA2, sha2_ce_mod_init);
255module_exit(sha2_ce_mod_fini);
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 83f71b3004a8..42c7eecd2bb6 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -40,6 +40,7 @@ generic-y += segment.h
40generic-y += sembuf.h 40generic-y += sembuf.h
41generic-y += serial.h 41generic-y += serial.h
42generic-y += shmbuf.h 42generic-y += shmbuf.h
43generic-y += simd.h
43generic-y += sizes.h 44generic-y += sizes.h
44generic-y += socket.h 45generic-y += socket.h
45generic-y += sockios.h 46generic-y += sockios.h
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c43b4ac13008..50f559f574fe 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -37,8 +37,21 @@ struct fpsimd_state {
37 u32 fpcr; 37 u32 fpcr;
38 }; 38 };
39 }; 39 };
40 /* the id of the last cpu to have restored this state */
41 unsigned int cpu;
40}; 42};
41 43
44/*
45 * Struct for stacking the bottom 'n' FP/SIMD registers.
46 */
47struct fpsimd_partial_state {
48 u32 fpsr;
49 u32 fpcr;
50 u32 num_regs;
51 __uint128_t vregs[32];
52};
53
54
42#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 55#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
43/* Masks for extracting the FPSR and FPCR from the FPSCR */ 56/* Masks for extracting the FPSR and FPCR from the FPSCR */
44#define VFP_FPSCR_STAT_MASK 0xf800009f 57#define VFP_FPSCR_STAT_MASK 0xf800009f
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state);
58extern void fpsimd_thread_switch(struct task_struct *next); 71extern void fpsimd_thread_switch(struct task_struct *next);
59extern void fpsimd_flush_thread(void); 72extern void fpsimd_flush_thread(void);
60 73
74extern void fpsimd_preserve_current_state(void);
75extern void fpsimd_restore_current_state(void);
76extern void fpsimd_update_current_state(struct fpsimd_state *state);
77
78extern void fpsimd_flush_task_state(struct task_struct *target);
79
80extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state,
81 u32 num_regs);
82extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
83
61#endif 84#endif
62 85
63#endif 86#endif
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index bbec599c96bd..768414d55e64 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -62,3 +62,38 @@
62 ldr w\tmpnr, [\state, #16 * 2 + 4] 62 ldr w\tmpnr, [\state, #16 * 2 + 4]
63 msr fpcr, x\tmpnr 63 msr fpcr, x\tmpnr
64.endm 64.endm
65
66.altmacro
67.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
68 mrs x\tmpnr1, fpsr
69 str w\numnr, [\state, #8]
70 mrs x\tmpnr2, fpcr
71 stp w\tmpnr1, w\tmpnr2, [\state]
72 adr x\tmpnr1, 0f
73 add \state, \state, x\numnr, lsl #4
74 sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
75 br x\tmpnr1
76 .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
77 .irp qb, %(qa + 1)
78 stp q\qa, q\qb, [\state, # -16 * \qa - 16]
79 .endr
80 .endr
810:
82.endm
83
84.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
85 ldp w\tmpnr1, w\tmpnr2, [\state]
86 msr fpsr, x\tmpnr1
87 msr fpcr, x\tmpnr2
88 adr x\tmpnr1, 0f
89 ldr w\tmpnr2, [\state, #8]
90 add \state, \state, x\tmpnr2, lsl #4
91 sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
92 br x\tmpnr1
93 .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
94 .irp qb, %(qa + 1)
95 ldp q\qa, q\qb, [\state, # -16 * \qa - 16]
96 .endr
97 .endr
980:
99.endm
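The computed branch in these two macros relies on every stp/ldp in the unrolled .irp block being 4 bytes: saving or restoring the bottom num_regs registers means executing the last num_regs/2 of the 16 instructions, i.e. jumping back num_regs/2 * 4 = num_regs * 2 bytes from label 0, which is exactly the "lsl #1". The pointer is pre-advanced by num_regs * 16 so the negative stp/ldp offsets land inside the vregs[] area (which starts 16 bytes into struct fpsimd_partial_state). A host-side sketch of that arithmetic, illustrative only:

#include <assert.h>

int main(void)
{
        for (int num_regs = 2; num_regs <= 32; num_regs += 2) {
                /* branch distance: num_regs/2 instructions of 4 bytes each */
                assert((num_regs / 2) * 4 == (num_regs << 1));

                /* executed pairs are qa = num_regs-2, num_regs-4, ..., 0 */
                for (int qa = num_regs - 2; qa >= 0; qa -= 2) {
                        int off = num_regs * 16 - 16 * qa - 16; /* from struct base */

                        assert(off >= 16);                     /* inside vregs[]   */
                        assert(off + 32 <= 16 + num_regs * 16);
                }
        }
        return 0;
}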
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index b0cc58a97780..13ce4cc18e26 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -8,7 +8,11 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10 10
11#include <linux/types.h>
12
11#define cpu_has_neon() (1) 13#define cpu_has_neon() (1)
12 14
13void kernel_neon_begin(void); 15#define kernel_neon_begin() kernel_neon_begin_partial(32)
16
17void kernel_neon_begin_partial(u32 num_regs);
14void kernel_neon_end(void); 18void kernel_neon_end(void);
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 0a8b2a97a32e..9c086c63f911 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -103,6 +103,7 @@ static inline struct thread_info *current_thread_info(void)
103#define TIF_SIGPENDING 0 103#define TIF_SIGPENDING 0
104#define TIF_NEED_RESCHED 1 104#define TIF_NEED_RESCHED 1
105#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 105#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
106#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
106#define TIF_SYSCALL_TRACE 8 107#define TIF_SYSCALL_TRACE 8
107#define TIF_SYSCALL_AUDIT 9 108#define TIF_SYSCALL_AUDIT 9
108#define TIF_SYSCALL_TRACEPOINT 10 109#define TIF_SYSCALL_TRACEPOINT 10
@@ -118,6 +119,7 @@ static inline struct thread_info *current_thread_info(void)
118#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 119#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
119#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 120#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
120#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 121#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
122#define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
121#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 123#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
122#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 124#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
123#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 125#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -125,7 +127,7 @@ static inline struct thread_info *current_thread_info(void)
125#define _TIF_32BIT (1 << TIF_32BIT) 127#define _TIF_32BIT (1 << TIF_32BIT)
126 128
127#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ 129#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
128 _TIF_NOTIFY_RESUME) 130 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
129 131
130#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ 132#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
131 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) 133 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 6a27cd6dbfa6..d358ccacfc00 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)
41 fpsimd_restore x0, 8 41 fpsimd_restore x0, 8
42 ret 42 ret
43ENDPROC(fpsimd_load_state) 43ENDPROC(fpsimd_load_state)
44
45#ifdef CONFIG_KERNEL_MODE_NEON
46
47/*
48 * Save the bottom n FP registers.
49 *
50 * x0 - pointer to struct fpsimd_partial_state
51 */
52ENTRY(fpsimd_save_partial_state)
53 fpsimd_save_partial x0, 1, 8, 9
54 ret
 55ENDPROC(fpsimd_save_partial_state)
56
57/*
58 * Load the bottom n FP registers.
59 *
60 * x0 - pointer to struct fpsimd_partial_state
61 */
62ENTRY(fpsimd_load_partial_state)
63 fpsimd_restore_partial x0, 8, 9
64 ret
65ENDPROC(fpsimd_load_partial_state)
66
67#endif
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a670d0a98c89..bf017f4ffb4f 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -562,7 +562,7 @@ fast_work_pending:
562 str x0, [sp, #S_X0] // returned x0 562 str x0, [sp, #S_X0] // returned x0
563work_pending: 563work_pending:
564 tbnz x1, #TIF_NEED_RESCHED, work_resched 564 tbnz x1, #TIF_NEED_RESCHED, work_resched
565 /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ 565 /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
566 ldr x2, [sp, #S_PSTATE] 566 ldr x2, [sp, #S_PSTATE]
567 mov x0, sp // 'regs' 567 mov x0, sp // 'regs'
568 tst x2, #PSR_MODE_MASK // user mode regs? 568 tst x2, #PSR_MODE_MASK // user mode regs?
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 4aef42a04bdc..ad8aebb1cdef 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -35,6 +35,60 @@
35#define FPEXC_IDF (1 << 7) 35#define FPEXC_IDF (1 << 7)
36 36
37/* 37/*
38 * In order to reduce the number of times the FPSIMD state is needlessly saved
39 * and restored, we need to keep track of two things:
40 * (a) for each task, we need to remember which CPU was the last one to have
41 * the task's FPSIMD state loaded into its FPSIMD registers;
42 * (b) for each CPU, we need to remember which task's userland FPSIMD state has
43 * been loaded into its FPSIMD registers most recently, or whether it has
44 * been used to perform kernel mode NEON in the meantime.
45 *
46 * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to
 47 * the id of the current CPU every time the state is loaded onto a CPU. For (b),
48 * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the
 49 * address of the userland FPSIMD state of the task that was most recently
 50 * loaded onto the CPU, or NULL if kernel mode NEON has been performed after that.
51 *
52 * With this in place, we no longer have to restore the next FPSIMD state right
53 * when switching between tasks. Instead, we can defer this check to userland
54 * resume, at which time we verify whether the CPU's fpsimd_last_state and the
55 * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we
56 * can omit the FPSIMD restore.
57 *
58 * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to
59 * indicate whether or not the userland FPSIMD state of the current task is
60 * present in the registers. The flag is set unless the FPSIMD registers of this
61 * CPU currently contain the most recent userland FPSIMD state of the current
62 * task.
63 *
64 * For a certain task, the sequence may look something like this:
65 * - the task gets scheduled in; if both the task's fpsimd_state.cpu field
66 * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu
67 * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is
68 * cleared, otherwise it is set;
69 *
70 * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's
71 * userland FPSIMD state is copied from memory to the registers, the task's
72 * fpsimd_state.cpu field is set to the id of the current CPU, the current
73 * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the
74 * TIF_FOREIGN_FPSTATE flag is cleared;
75 *
76 * - the task executes an ordinary syscall; upon return to userland, the
77 * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is
78 * restored;
79 *
80 * - the task executes a syscall which executes some NEON instructions; this is
81 * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD
82 * register contents to memory, clears the fpsimd_last_state per-cpu variable
83 * and sets the TIF_FOREIGN_FPSTATE flag;
84 *
85 * - the task gets preempted after kernel_neon_end() is called; as we have not
86 * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
87 * whatever is in the FPSIMD registers is not saved to memory, but discarded.
88 */
89static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
90
91/*
38 * Trapped FP/ASIMD access. 92 * Trapped FP/ASIMD access.
39 */ 93 */
40void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) 94void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
72 126
73void fpsimd_thread_switch(struct task_struct *next) 127void fpsimd_thread_switch(struct task_struct *next)
74{ 128{
75 /* check if not kernel threads */ 129 /*
76 if (current->mm) 130 * Save the current FPSIMD state to memory, but only if whatever is in
131 * the registers is in fact the most recent userland FPSIMD state of
132 * 'current'.
133 */
134 if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
77 fpsimd_save_state(&current->thread.fpsimd_state); 135 fpsimd_save_state(&current->thread.fpsimd_state);
78 if (next->mm) 136
79 fpsimd_load_state(&next->thread.fpsimd_state); 137 if (next->mm) {
138 /*
139 * If we are switching to a task whose most recent userland
140 * FPSIMD state is already in the registers of *this* cpu,
141 * we can skip loading the state from memory. Otherwise, set
142 * the TIF_FOREIGN_FPSTATE flag so the state will be loaded
143 * upon the next return to userland.
144 */
145 struct fpsimd_state *st = &next->thread.fpsimd_state;
146
147 if (__this_cpu_read(fpsimd_last_state) == st
148 && st->cpu == smp_processor_id())
149 clear_ti_thread_flag(task_thread_info(next),
150 TIF_FOREIGN_FPSTATE);
151 else
152 set_ti_thread_flag(task_thread_info(next),
153 TIF_FOREIGN_FPSTATE);
154 }
80} 155}
81 156
82void fpsimd_flush_thread(void) 157void fpsimd_flush_thread(void)
83{ 158{
84 preempt_disable();
85 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); 159 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
86 fpsimd_load_state(&current->thread.fpsimd_state); 160 set_thread_flag(TIF_FOREIGN_FPSTATE);
161}
162
163/*
164 * Save the userland FPSIMD state of 'current' to memory, but only if the state
165 * currently held in the registers does in fact belong to 'current'
166 */
167void fpsimd_preserve_current_state(void)
168{
169 preempt_disable();
170 if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
171 fpsimd_save_state(&current->thread.fpsimd_state);
172 preempt_enable();
173}
174
175/*
176 * Load the userland FPSIMD state of 'current' from memory, but only if the
177 * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
178 * state of 'current'
179 */
180void fpsimd_restore_current_state(void)
181{
182 preempt_disable();
183 if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
184 struct fpsimd_state *st = &current->thread.fpsimd_state;
185
186 fpsimd_load_state(st);
187 this_cpu_write(fpsimd_last_state, st);
188 st->cpu = smp_processor_id();
189 }
190 preempt_enable();
191}
192
193/*
194 * Load an updated userland FPSIMD state for 'current' from memory and set the
195 * flag that indicates that the FPSIMD register contents are the most recent
196 * FPSIMD state of 'current'
197 */
198void fpsimd_update_current_state(struct fpsimd_state *state)
199{
200 preempt_disable();
201 fpsimd_load_state(state);
202 if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
203 struct fpsimd_state *st = &current->thread.fpsimd_state;
204
205 this_cpu_write(fpsimd_last_state, st);
206 st->cpu = smp_processor_id();
207 }
87 preempt_enable(); 208 preempt_enable();
88} 209}
89 210
211/*
212 * Invalidate live CPU copies of task t's FPSIMD state
213 */
214void fpsimd_flush_task_state(struct task_struct *t)
215{
216 t->thread.fpsimd_state.cpu = NR_CPUS;
217}
218
90#ifdef CONFIG_KERNEL_MODE_NEON 219#ifdef CONFIG_KERNEL_MODE_NEON
91 220
221static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
222static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
223
92/* 224/*
93 * Kernel-side NEON support functions 225 * Kernel-side NEON support functions
94 */ 226 */
95void kernel_neon_begin(void) 227void kernel_neon_begin_partial(u32 num_regs)
96{ 228{
97 /* Avoid using the NEON in interrupt context */ 229 if (in_interrupt()) {
98 BUG_ON(in_interrupt()); 230 struct fpsimd_partial_state *s = this_cpu_ptr(
99 preempt_disable(); 231 in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
100 232
101 if (current->mm) 233 BUG_ON(num_regs > 32);
102 fpsimd_save_state(&current->thread.fpsimd_state); 234 fpsimd_save_partial_state(s, roundup(num_regs, 2));
235 } else {
236 /*
237 * Save the userland FPSIMD state if we have one and if we
238 * haven't done so already. Clear fpsimd_last_state to indicate
239 * that there is no longer userland FPSIMD state in the
240 * registers.
241 */
242 preempt_disable();
243 if (current->mm &&
244 !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
245 fpsimd_save_state(&current->thread.fpsimd_state);
246 this_cpu_write(fpsimd_last_state, NULL);
247 }
103} 248}
104EXPORT_SYMBOL(kernel_neon_begin); 249EXPORT_SYMBOL(kernel_neon_begin_partial);
105 250
106void kernel_neon_end(void) 251void kernel_neon_end(void)
107{ 252{
108 if (current->mm) 253 if (in_interrupt()) {
109 fpsimd_load_state(&current->thread.fpsimd_state); 254 struct fpsimd_partial_state *s = this_cpu_ptr(
110 255 in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
111 preempt_enable(); 256 fpsimd_load_partial_state(s);
257 } else {
258 preempt_enable();
259 }
112} 260}
113EXPORT_SYMBOL(kernel_neon_end); 261EXPORT_SYMBOL(kernel_neon_end);
114 262
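
kernel_neon_begin_partial() above saves only as many NEON registers as the caller declares it will clobber (rounded up to an even count), which is what makes kernel-mode NEON cheap enough to allow in hard and soft interrupt context. A hypothetical caller, not part of this patch, could look like the sketch below; the inline assembly and the choice of two registers are purely illustrative:

#include <linux/types.h>
#include <asm/neon.h>

/* XOR one 16-byte block into dst using only NEON registers v0 and v1. */
static void xor16_neon(u8 *dst, const u8 *src)
{
        kernel_neon_begin_partial(2);           /* we clobber v0 and v1 only */
        asm volatile("ld1     {v0.16b}, [%0]            \n"
                     "ld1     {v1.16b}, [%1]            \n"
                     "eor     v0.16b, v0.16b, v1.16b    \n"
                     "st1     {v0.16b}, [%0]            \n"
                     : : "r"(dst), "r"(src) : "v0", "v1", "memory");
        kernel_neon_end();
}

The full-save kernel_neon_begin() interface presumably remains available for process-context callers as the num_regs == 32 case.
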
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self,
120{ 268{
121 switch (cmd) { 269 switch (cmd) {
122 case CPU_PM_ENTER: 270 case CPU_PM_ENTER:
123 if (current->mm) 271 if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
124 fpsimd_save_state(&current->thread.fpsimd_state); 272 fpsimd_save_state(&current->thread.fpsimd_state);
125 break; 273 break;
126 case CPU_PM_EXIT: 274 case CPU_PM_EXIT:
127 if (current->mm) 275 if (current->mm)
128 fpsimd_load_state(&current->thread.fpsimd_state); 276 set_thread_flag(TIF_FOREIGN_FPSTATE);
129 break; 277 break;
130 case CPU_PM_ENTER_FAILED: 278 case CPU_PM_ENTER_FAILED:
131 default: 279 default:
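
For completeness, a notifier like the one in the last hunk is attached through the standard CPU PM interface from <linux/cpu_pm.h>; the registration helper below is a sketch with a hypothetical name (fpsimd_pm_init), not a quote from the file:

#include <linux/init.h>
#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static struct notifier_block fpsimd_cpu_pm_notifier_block = {
        .notifier_call = fpsimd_cpu_pm_notifier,
};

static void __init fpsimd_pm_init(void)
{
        cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block);
}
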
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index d04eb871cb0e..9f2d6020b6c2 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -206,7 +206,7 @@ void release_thread(struct task_struct *dead_task)
206 206
207int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 207int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
208{ 208{
209 fpsimd_save_state(&current->thread.fpsimd_state); 209 fpsimd_preserve_current_state();
210 *dst = *src; 210 *dst = *src;
211 return 0; 211 return 0;
212} 212}
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 4b58e812cf67..32d52d3b079c 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -518,6 +518,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
518 return ret; 518 return ret;
519 519
520 target->thread.fpsimd_state.user_fpsimd = newstate; 520 target->thread.fpsimd_state.user_fpsimd = newstate;
521 fpsimd_flush_task_state(target);
521 return ret; 522 return ret;
522} 523}
523 524
@@ -765,6 +766,7 @@ static int compat_vfp_set(struct task_struct *target,
765 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; 766 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
766 } 767 }
767 768
769 fpsimd_flush_task_state(target);
768 return ret; 770 return ret;
769} 771}
770 772
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 2ba72a11629f..6357b9c6c90e 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
51 int err; 51 int err;
52 52
53 /* dump the hardware registers to the fpsimd_state structure */ 53 /* dump the hardware registers to the fpsimd_state structure */
54 fpsimd_save_state(fpsimd); 54 fpsimd_preserve_current_state();
55 55
56 /* copy the FP and status/control registers */ 56 /* copy the FP and status/control registers */
57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); 57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); 86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err);
87 87
88 /* load the hardware registers from the fpsimd_state structure */ 88 /* load the hardware registers from the fpsimd_state structure */
89 if (!err) { 89 if (!err)
90 preempt_disable(); 90 fpsimd_update_current_state(&fpsimd);
91 fpsimd_load_state(&fpsimd);
92 preempt_enable();
93 }
94 91
95 return err ? -EFAULT : 0; 92 return err ? -EFAULT : 0;
96} 93}
@@ -433,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
433 clear_thread_flag(TIF_NOTIFY_RESUME); 430 clear_thread_flag(TIF_NOTIFY_RESUME);
434 tracehook_notify_resume(regs); 431 tracehook_notify_resume(regs);
435 } 432 }
433
434 if (thread_flags & _TIF_FOREIGN_FPSTATE)
435 fpsimd_restore_current_state();
436
436} 437}
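
The signal.c hunks above replace open-coded preempt_disable()/fpsimd_load_state()/preempt_enable() sequences with the new helpers, which also keep the lazy-restore bookkeeping consistent. Since only the tail of restore_fpsimd_context() is visible in the hunk, the sketch below is a condensed, hypothetical rendering of the restore path as a caller now sees it (the helper name and the exact user-copy sequence are illustrative):

static int load_fpsimd_from_frame(struct fpsimd_context __user *ctx)
{
        struct fpsimd_state fpsimd;
        int err;

        /* copy the vector registers and status/control words from userspace */
        err = __copy_from_user(fpsimd.vregs, ctx->vregs, sizeof(fpsimd.vregs));
        __get_user_error(fpsimd.fpsr, &ctx->fpsr, err);
        __get_user_error(fpsimd.fpcr, &ctx->fpcr, err);

        /* load the registers and mark them as current's most recent state */
        if (!err)
                fpsimd_update_current_state(&fpsimd);

        return err ? -EFAULT : 0;
}
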
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 050c1c2af777..3491c638f172 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -222,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
222 * Note that this also saves V16-31, which aren't visible 222 * Note that this also saves V16-31, which aren't visible
223 * in AArch32. 223 * in AArch32.
224 */ 224 */
225 fpsimd_save_state(fpsimd); 225 fpsimd_preserve_current_state();
226 226
227 /* Place structure header on the stack */ 227 /* Place structure header on the stack */
228 __put_user_error(magic, &frame->magic, err); 228 __put_user_error(magic, &frame->magic, err);
@@ -285,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
285 * We don't need to touch the exception register, so 285 * We don't need to touch the exception register, so
286 * reload the hardware state. 286 * reload the hardware state.
287 */ 287 */
288 if (!err) { 288 if (!err)
289 preempt_disable(); 289 fpsimd_update_current_state(&fpsimd);
290 fpsimd_load_state(&fpsimd);
291 preempt_enable();
292 }
293 290
294 return err ? -EFAULT : 0; 291 return err ? -EFAULT : 0;
295} 292}
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index 03cf5936bad6..1ac097279db1 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -4,22 +4,27 @@
4/* 4/*
5 * This is the most generic implementation of unaligned accesses 5 * This is the most generic implementation of unaligned accesses
6 * and should work almost anywhere. 6 * and should work almost anywhere.
7 *
8 * If an architecture can handle unaligned accesses in hardware,
9 * it may want to use the linux/unaligned/access_ok.h implementation
10 * instead.
11 */ 7 */
12#include <asm/byteorder.h> 8#include <asm/byteorder.h>
13 9
10/* Set by the arch if it can handle unaligned accesses in hardware. */
11#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
12# include <linux/unaligned/access_ok.h>
13#endif
14
14#if defined(__LITTLE_ENDIAN) 15#if defined(__LITTLE_ENDIAN)
15# include <linux/unaligned/le_struct.h> 16# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
16# include <linux/unaligned/be_byteshift.h> 17# include <linux/unaligned/le_struct.h>
18# include <linux/unaligned/be_byteshift.h>
19# endif
17# include <linux/unaligned/generic.h> 20# include <linux/unaligned/generic.h>
18# define get_unaligned __get_unaligned_le 21# define get_unaligned __get_unaligned_le
19# define put_unaligned __put_unaligned_le 22# define put_unaligned __put_unaligned_le
20#elif defined(__BIG_ENDIAN) 23#elif defined(__BIG_ENDIAN)
21# include <linux/unaligned/be_struct.h> 24# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
22# include <linux/unaligned/le_byteshift.h> 25# include <linux/unaligned/be_struct.h>
26# include <linux/unaligned/le_byteshift.h>
27# endif
23# include <linux/unaligned/generic.h> 28# include <linux/unaligned/generic.h>
24# define get_unaligned __get_unaligned_be 29# define get_unaligned __get_unaligned_be
25# define put_unaligned __put_unaligned_be 30# define put_unaligned __put_unaligned_be
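
The asm-generic/unaligned.h change is invisible to callers: the same accessors now compile to plain loads and stores on architectures that select CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, and to packed-struct or byteshift accesses everywhere else. A sketch of a typical caller (function names are hypothetical; the accessors are the existing <asm/unaligned.h> interface):

#include <linux/types.h>
#include <asm/unaligned.h>

/* read a big-endian 32-bit field from a possibly unaligned offset */
static u32 parse_be32_field(const u8 *buf, size_t offset)
{
        return get_unaligned_be32(buf + offset);
}

/* write a little-endian 16-bit field back at a possibly unaligned offset */
static void patch_le16_field(u8 *buf, size_t offset, u16 val)
{
        put_unaligned_le16(val, buf + offset);
}
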