diff options
author | Catalin Marinas <catalin.marinas@arm.com> | 2014-05-16 05:05:11 -0400 |
---|---|---|
committer | Catalin Marinas <catalin.marinas@arm.com> | 2014-05-16 05:05:11 -0400 |
commit | cf5c95db57ffa02e430c3840c08d1ee0403849d4 (patch) | |
tree | b3b4df5e1edcde098cf45b7fa00c8450e6d665f8 /arch/arm64 | |
parent | fd92d4a54a069953b4679958121317f2a25389cd (diff) | |
parent | 49788fe2a128217f78a21ee4edbe6e92e988f222 (diff) |
Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream
FPSIMD register bank context switching and crypto algorithms
optimisations for arm64 from Ard Biesheuvel.
* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
arm64: pull in <asm/simd.h> from asm-generic
arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
arm64/crypto: AES using ARMv8 Crypto Extensions
arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
arm64: add support for kernel mode NEON in interrupt context
arm64: defer reloading a task's FPSIMD state to userland resume
arm64: add abstractions for FPSIMD state manipulation
asm-generic: allow generic unaligned access if the arch supports it
Conflicts:
arch/arm64/include/asm/thread_info.h
Diffstat (limited to 'arch/arm64')
29 files changed, 3522 insertions, 35 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9a5b5fea86ba..78b356d079dd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig | |||
@@ -343,5 +343,8 @@ source "arch/arm64/Kconfig.debug" | |||
343 | source "security/Kconfig" | 343 | source "security/Kconfig" |
344 | 344 | ||
345 | source "crypto/Kconfig" | 345 | source "crypto/Kconfig" |
346 | if CRYPTO | ||
347 | source "arch/arm64/crypto/Kconfig" | ||
348 | endif | ||
346 | 349 | ||
347 | source "lib/Kconfig" | 350 | source "lib/Kconfig" |
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 2fceb71ac3b7..8185a913c5ed 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile | |||
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS | |||
45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ | 45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ |
46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ | 46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ |
47 | core-$(CONFIG_XEN) += arch/arm64/xen/ | 47 | core-$(CONFIG_XEN) += arch/arm64/xen/ |
48 | core-$(CONFIG_CRYPTO) += arch/arm64/crypto/ | ||
48 | libs-y := arch/arm64/lib/ $(libs-y) | 49 | libs-y := arch/arm64/lib/ $(libs-y) |
49 | libs-y += $(LIBGCC) | 50 | libs-y += $(LIBGCC) |
50 | 51 | ||
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig new file mode 100644 index 000000000000..5562652c5316 --- /dev/null +++ b/arch/arm64/crypto/Kconfig | |||
@@ -0,0 +1,53 @@ | |||
1 | |||
2 | menuconfig ARM64_CRYPTO | ||
3 | bool "ARM64 Accelerated Cryptographic Algorithms" | ||
4 | depends on ARM64 | ||
5 | help | ||
6 | Say Y here to choose from a selection of cryptographic algorithms | ||
7 | implemented using ARM64 specific CPU features or instructions. | ||
8 | |||
9 | if ARM64_CRYPTO | ||
10 | |||
11 | config CRYPTO_SHA1_ARM64_CE | ||
12 | tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" | ||
13 | depends on ARM64 && KERNEL_MODE_NEON | ||
14 | select CRYPTO_HASH | ||
15 | |||
16 | config CRYPTO_SHA2_ARM64_CE | ||
17 | tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" | ||
18 | depends on ARM64 && KERNEL_MODE_NEON | ||
19 | select CRYPTO_HASH | ||
20 | |||
21 | config CRYPTO_GHASH_ARM64_CE | ||
22 | tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" | ||
23 | depends on ARM64 && KERNEL_MODE_NEON | ||
24 | select CRYPTO_HASH | ||
25 | |||
26 | config CRYPTO_AES_ARM64_CE | ||
27 | tristate "AES core cipher using ARMv8 Crypto Extensions" | ||
28 | depends on ARM64 && KERNEL_MODE_NEON | ||
29 | select CRYPTO_ALGAPI | ||
30 | select CRYPTO_AES | ||
31 | |||
32 | config CRYPTO_AES_ARM64_CE_CCM | ||
33 | tristate "AES in CCM mode using ARMv8 Crypto Extensions" | ||
34 | depends on ARM64 && KERNEL_MODE_NEON | ||
35 | select CRYPTO_ALGAPI | ||
36 | select CRYPTO_AES | ||
37 | select CRYPTO_AEAD | ||
38 | |||
39 | config CRYPTO_AES_ARM64_CE_BLK | ||
40 | tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" | ||
41 | depends on ARM64 && KERNEL_MODE_NEON | ||
42 | select CRYPTO_BLKCIPHER | ||
43 | select CRYPTO_AES | ||
44 | select CRYPTO_ABLK_HELPER | ||
45 | |||
46 | config CRYPTO_AES_ARM64_NEON_BLK | ||
47 | tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" | ||
48 | depends on ARM64 && KERNEL_MODE_NEON | ||
49 | select CRYPTO_BLKCIPHER | ||
50 | select CRYPTO_AES | ||
51 | select CRYPTO_ABLK_HELPER | ||
52 | |||
53 | endif | ||
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile new file mode 100644 index 000000000000..2070a56ecc46 --- /dev/null +++ b/arch/arm64/crypto/Makefile | |||
@@ -0,0 +1,38 @@ | |||
1 | # | ||
2 | # linux/arch/arm64/crypto/Makefile | ||
3 | # | ||
4 | # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | # | ||
6 | # This program is free software; you can redistribute it and/or modify | ||
7 | # it under the terms of the GNU General Public License version 2 as | ||
8 | # published by the Free Software Foundation. | ||
9 | # | ||
10 | |||
11 | obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o | ||
12 | sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o | ||
13 | |||
14 | obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o | ||
15 | sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o | ||
16 | |||
17 | obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o | ||
18 | ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o | ||
19 | |||
20 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o | ||
21 | CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto | ||
22 | |||
23 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o | ||
24 | aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o | ||
25 | |||
26 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o | ||
27 | aes-ce-blk-y := aes-glue-ce.o aes-ce.o | ||
28 | |||
29 | obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o | ||
30 | aes-neon-blk-y := aes-glue-neon.o aes-neon.o | ||
31 | |||
32 | AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE | ||
33 | AFLAGS_aes-neon.o := -DINTERLEAVE=4 | ||
34 | |||
35 | CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS | ||
36 | |||
37 | $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE | ||
38 | $(call if_changed_dep,cc_o_c) | ||
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S new file mode 100644 index 000000000000..432e4841cd81 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-core.S | |||
@@ -0,0 +1,222 @@ | |||
1 | /* | ||
2 | * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | |||
13 | .text | ||
14 | .arch armv8-a+crypto | ||
15 | |||
16 | /* | ||
17 | * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
18 | * u32 *macp, u8 const rk[], u32 rounds); | ||
19 | */ | ||
20 | ENTRY(ce_aes_ccm_auth_data) | ||
21 | ldr w8, [x3] /* leftover from prev round? */ | ||
22 | ld1 {v0.2d}, [x0] /* load mac */ | ||
23 | cbz w8, 1f | ||
24 | sub w8, w8, #16 | ||
25 | eor v1.16b, v1.16b, v1.16b | ||
26 | 0: ldrb w7, [x1], #1 /* get 1 byte of input */ | ||
27 | subs w2, w2, #1 | ||
28 | add w8, w8, #1 | ||
29 | ins v1.b[0], w7 | ||
30 | ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ | ||
31 | beq 8f /* out of input? */ | ||
32 | cbnz w8, 0b | ||
33 | eor v0.16b, v0.16b, v1.16b | ||
34 | 1: ld1 {v3.2d}, [x4] /* load first round key */ | ||
35 | prfm pldl1strm, [x1] | ||
36 | cmp w5, #12 /* which key size? */ | ||
37 | add x6, x4, #16 | ||
38 | sub w7, w5, #2 /* modified # of rounds */ | ||
39 | bmi 2f | ||
40 | bne 5f | ||
41 | mov v5.16b, v3.16b | ||
42 | b 4f | ||
43 | 2: mov v4.16b, v3.16b | ||
44 | ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ | ||
45 | 3: aese v0.16b, v4.16b | ||
46 | aesmc v0.16b, v0.16b | ||
47 | 4: ld1 {v3.2d}, [x6], #16 /* load next round key */ | ||
48 | aese v0.16b, v5.16b | ||
49 | aesmc v0.16b, v0.16b | ||
50 | 5: ld1 {v4.2d}, [x6], #16 /* load next round key */ | ||
51 | subs w7, w7, #3 | ||
52 | aese v0.16b, v3.16b | ||
53 | aesmc v0.16b, v0.16b | ||
54 | ld1 {v5.2d}, [x6], #16 /* load next round key */ | ||
55 | bpl 3b | ||
56 | aese v0.16b, v4.16b | ||
57 | subs w2, w2, #16 /* last data? */ | ||
58 | eor v0.16b, v0.16b, v5.16b /* final round */ | ||
59 | bmi 6f | ||
60 | ld1 {v1.16b}, [x1], #16 /* load next input block */ | ||
61 | eor v0.16b, v0.16b, v1.16b /* xor with mac */ | ||
62 | bne 1b | ||
63 | 6: st1 {v0.2d}, [x0] /* store mac */ | ||
64 | beq 10f | ||
65 | adds w2, w2, #16 | ||
66 | beq 10f | ||
67 | mov w8, w2 | ||
68 | 7: ldrb w7, [x1], #1 | ||
69 | umov w6, v0.b[0] | ||
70 | eor w6, w6, w7 | ||
71 | strb w6, [x0], #1 | ||
72 | subs w2, w2, #1 | ||
73 | beq 10f | ||
74 | ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ | ||
75 | b 7b | ||
76 | 8: mov w7, w8 | ||
77 | add w8, w8, #16 | ||
78 | 9: ext v1.16b, v1.16b, v1.16b, #1 | ||
79 | adds w7, w7, #1 | ||
80 | bne 9b | ||
81 | eor v0.16b, v0.16b, v1.16b | ||
82 | st1 {v0.2d}, [x0] | ||
83 | 10: str w8, [x3] | ||
84 | ret | ||
85 | ENDPROC(ce_aes_ccm_auth_data) | ||
86 | |||
87 | /* | ||
88 | * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], | ||
89 | * u32 rounds); | ||
90 | */ | ||
91 | ENTRY(ce_aes_ccm_final) | ||
92 | ld1 {v3.2d}, [x2], #16 /* load first round key */ | ||
93 | ld1 {v0.2d}, [x0] /* load mac */ | ||
94 | cmp w3, #12 /* which key size? */ | ||
95 | sub w3, w3, #2 /* modified # of rounds */ | ||
96 | ld1 {v1.2d}, [x1] /* load 1st ctriv */ | ||
97 | bmi 0f | ||
98 | bne 3f | ||
99 | mov v5.16b, v3.16b | ||
100 | b 2f | ||
101 | 0: mov v4.16b, v3.16b | ||
102 | 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ | ||
103 | aese v0.16b, v4.16b | ||
104 | aese v1.16b, v4.16b | ||
105 | aesmc v0.16b, v0.16b | ||
106 | aesmc v1.16b, v1.16b | ||
107 | 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ | ||
108 | aese v0.16b, v5.16b | ||
109 | aese v1.16b, v5.16b | ||
110 | aesmc v0.16b, v0.16b | ||
111 | aesmc v1.16b, v1.16b | ||
112 | 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ | ||
113 | subs w3, w3, #3 | ||
114 | aese v0.16b, v3.16b | ||
115 | aese v1.16b, v3.16b | ||
116 | aesmc v0.16b, v0.16b | ||
117 | aesmc v1.16b, v1.16b | ||
118 | bpl 1b | ||
119 | aese v0.16b, v4.16b | ||
120 | aese v1.16b, v4.16b | ||
121 | /* final round key cancels out */ | ||
122 | eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ | ||
123 | st1 {v0.2d}, [x0] /* store result */ | ||
124 | ret | ||
125 | ENDPROC(ce_aes_ccm_final) | ||
126 | |||
127 | .macro aes_ccm_do_crypt,enc | ||
128 | ldr x8, [x6, #8] /* load lower ctr */ | ||
129 | ld1 {v0.2d}, [x5] /* load mac */ | ||
130 | rev x8, x8 /* keep swabbed ctr in reg */ | ||
131 | 0: /* outer loop */ | ||
132 | ld1 {v1.1d}, [x6] /* load upper ctr */ | ||
133 | prfm pldl1strm, [x1] | ||
134 | add x8, x8, #1 | ||
135 | rev x9, x8 | ||
136 | cmp w4, #12 /* which key size? */ | ||
137 | sub w7, w4, #2 /* get modified # of rounds */ | ||
138 | ins v1.d[1], x9 /* no carry in lower ctr */ | ||
139 | ld1 {v3.2d}, [x3] /* load first round key */ | ||
140 | add x10, x3, #16 | ||
141 | bmi 1f | ||
142 | bne 4f | ||
143 | mov v5.16b, v3.16b | ||
144 | b 3f | ||
145 | 1: mov v4.16b, v3.16b | ||
146 | ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ | ||
147 | 2: /* inner loop: 3 rounds, 2x interleaved */ | ||
148 | aese v0.16b, v4.16b | ||
149 | aese v1.16b, v4.16b | ||
150 | aesmc v0.16b, v0.16b | ||
151 | aesmc v1.16b, v1.16b | ||
152 | 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ | ||
153 | aese v0.16b, v5.16b | ||
154 | aese v1.16b, v5.16b | ||
155 | aesmc v0.16b, v0.16b | ||
156 | aesmc v1.16b, v1.16b | ||
157 | 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ | ||
158 | subs w7, w7, #3 | ||
159 | aese v0.16b, v3.16b | ||
160 | aese v1.16b, v3.16b | ||
161 | aesmc v0.16b, v0.16b | ||
162 | aesmc v1.16b, v1.16b | ||
163 | ld1 {v5.2d}, [x10], #16 /* load next round key */ | ||
164 | bpl 2b | ||
165 | aese v0.16b, v4.16b | ||
166 | aese v1.16b, v4.16b | ||
167 | subs w2, w2, #16 | ||
168 | bmi 6f /* partial block? */ | ||
169 | ld1 {v2.16b}, [x1], #16 /* load next input block */ | ||
170 | .if \enc == 1 | ||
171 | eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ | ||
172 | eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ | ||
173 | .else | ||
174 | eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ | ||
175 | eor v1.16b, v2.16b, v5.16b /* final round enc */ | ||
176 | .endif | ||
177 | eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ | ||
178 | st1 {v1.16b}, [x0], #16 /* write output block */ | ||
179 | bne 0b | ||
180 | rev x8, x8 | ||
181 | st1 {v0.2d}, [x5] /* store mac */ | ||
182 | str x8, [x6, #8] /* store lsb end of ctr (BE) */ | ||
183 | 5: ret | ||
184 | |||
185 | 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ | ||
186 | eor v1.16b, v1.16b, v5.16b /* final round enc */ | ||
187 | st1 {v0.2d}, [x5] /* store mac */ | ||
188 | add w2, w2, #16 /* process partial tail block */ | ||
189 | 7: ldrb w9, [x1], #1 /* get 1 byte of input */ | ||
190 | umov w6, v1.b[0] /* get top crypted ctr byte */ | ||
191 | umov w7, v0.b[0] /* get top mac byte */ | ||
192 | .if \enc == 1 | ||
193 | eor w7, w7, w9 | ||
194 | eor w9, w9, w6 | ||
195 | .else | ||
196 | eor w9, w9, w6 | ||
197 | eor w7, w7, w9 | ||
198 | .endif | ||
199 | strb w9, [x0], #1 /* store out byte */ | ||
200 | strb w7, [x5], #1 /* store mac byte */ | ||
201 | subs w2, w2, #1 | ||
202 | beq 5b | ||
203 | ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ | ||
204 | ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ | ||
205 | b 7b | ||
206 | .endm | ||
207 | |||
208 | /* | ||
209 | * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
210 | * u8 const rk[], u32 rounds, u8 mac[], | ||
211 | * u8 ctr[]); | ||
212 | * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
213 | * u8 const rk[], u32 rounds, u8 mac[], | ||
214 | * u8 ctr[]); | ||
215 | */ | ||
216 | ENTRY(ce_aes_ccm_encrypt) | ||
217 | aes_ccm_do_crypt 1 | ||
218 | ENDPROC(ce_aes_ccm_encrypt) | ||
219 | |||
220 | ENTRY(ce_aes_ccm_decrypt) | ||
221 | aes_ccm_do_crypt 0 | ||
222 | ENDPROC(ce_aes_ccm_decrypt) | ||
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c new file mode 100644 index 000000000000..9e6cdde9b43d --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c | |||
@@ -0,0 +1,297 @@ | |||
1 | /* | ||
2 | * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/aes.h> | ||
14 | #include <crypto/algapi.h> | ||
15 | #include <crypto/scatterwalk.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
20 | { | ||
21 | /* | ||
22 | * # of rounds specified by AES: | ||
23 | * 128 bit key 10 rounds | ||
24 | * 192 bit key 12 rounds | ||
25 | * 256 bit key 14 rounds | ||
26 | * => n byte key => 6 + (n/4) rounds | ||
27 | */ | ||
28 | return 6 + ctx->key_length / 4; | ||
29 | } | ||
30 | |||
31 | asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
32 | u32 *macp, u32 const rk[], u32 rounds); | ||
33 | |||
34 | asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
35 | u32 const rk[], u32 rounds, u8 mac[], | ||
36 | u8 ctr[]); | ||
37 | |||
38 | asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
39 | u32 const rk[], u32 rounds, u8 mac[], | ||
40 | u8 ctr[]); | ||
41 | |||
42 | asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], | ||
43 | u32 rounds); | ||
44 | |||
45 | static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, | ||
46 | unsigned int key_len) | ||
47 | { | ||
48 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); | ||
49 | int ret; | ||
50 | |||
51 | ret = crypto_aes_expand_key(ctx, in_key, key_len); | ||
52 | if (!ret) | ||
53 | return 0; | ||
54 | |||
55 | tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
56 | return -EINVAL; | ||
57 | } | ||
58 | |||
59 | static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) | ||
60 | { | ||
61 | if ((authsize & 1) || authsize < 4) | ||
62 | return -EINVAL; | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) | ||
67 | { | ||
68 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
69 | __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; | ||
70 | u32 l = req->iv[0] + 1; | ||
71 | |||
72 | /* verify that CCM dimension 'L' is set correctly in the IV */ | ||
73 | if (l < 2 || l > 8) | ||
74 | return -EINVAL; | ||
75 | |||
76 | /* verify that msglen can in fact be represented in L bytes */ | ||
77 | if (l < 4 && msglen >> (8 * l)) | ||
78 | return -EOVERFLOW; | ||
79 | |||
80 | /* | ||
81 | * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi | ||
82 | * uses a u32 type to represent msglen so the top 4 bytes are always 0. | ||
83 | */ | ||
84 | n[0] = 0; | ||
85 | n[1] = cpu_to_be32(msglen); | ||
86 | |||
87 | memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); | ||
88 | |||
89 | /* | ||
90 | * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) | ||
91 | * - bits 0..2 : max # of bytes required to represent msglen, minus 1 | ||
92 | * (already set by caller) | ||
93 | * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) | ||
94 | * - bit 6 : indicates presence of authenticate-only data | ||
95 | */ | ||
96 | maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; | ||
97 | if (req->assoclen) | ||
98 | maciv[0] |= 0x40; | ||
99 | |||
100 | memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) | ||
105 | { | ||
106 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
107 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
108 | struct __packed { __be16 l; __be32 h; u16 len; } ltag; | ||
109 | struct scatter_walk walk; | ||
110 | u32 len = req->assoclen; | ||
111 | u32 macp = 0; | ||
112 | |||
113 | /* prepend the AAD with a length tag */ | ||
114 | if (len < 0xff00) { | ||
115 | ltag.l = cpu_to_be16(len); | ||
116 | ltag.len = 2; | ||
117 | } else { | ||
118 | ltag.l = cpu_to_be16(0xfffe); | ||
119 | put_unaligned_be32(len, &ltag.h); | |
120 | ltag.len = 6; | ||
121 | } | ||
122 | |||
123 | ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc, | |
124 | num_rounds(ctx)); | ||
125 | scatterwalk_start(&walk, req->assoc); | ||
126 | |||
127 | do { | ||
128 | u32 n = scatterwalk_clamp(&walk, len); | ||
129 | u8 *p; | ||
130 | |||
131 | if (!n) { | ||
132 | scatterwalk_start(&walk, sg_next(walk.sg)); | ||
133 | n = scatterwalk_clamp(&walk, len); | ||
134 | } | ||
135 | p = scatterwalk_map(&walk); | ||
136 | ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, | ||
137 | num_rounds(ctx)); | ||
138 | len -= n; | ||
139 | |||
140 | scatterwalk_unmap(p); | ||
141 | scatterwalk_advance(&walk, n); | ||
142 | scatterwalk_done(&walk, 0, len); | ||
143 | } while (len); | ||
144 | } | ||
145 | |||
146 | static int ccm_encrypt(struct aead_request *req) | ||
147 | { | ||
148 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
149 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
150 | struct blkcipher_desc desc = { .info = req->iv }; | ||
151 | struct blkcipher_walk walk; | ||
152 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
153 | u8 buf[AES_BLOCK_SIZE]; | ||
154 | u32 len = req->cryptlen; | ||
155 | int err; | ||
156 | |||
157 | err = ccm_init_mac(req, mac, len); | ||
158 | if (err) | ||
159 | return err; | ||
160 | |||
161 | kernel_neon_begin_partial(6); | ||
162 | |||
163 | if (req->assoclen) | ||
164 | ccm_calculate_auth_mac(req, mac); | ||
165 | |||
166 | /* preserve the original iv for the final round */ | ||
167 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
168 | |||
169 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
170 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
171 | AES_BLOCK_SIZE); | ||
172 | |||
173 | while (walk.nbytes) { | ||
174 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
175 | |||
176 | if (walk.nbytes == len) | ||
177 | tail = 0; | ||
178 | |||
179 | ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
180 | walk.nbytes - tail, ctx->key_enc, | ||
181 | num_rounds(ctx), mac, walk.iv); | ||
182 | |||
183 | len -= walk.nbytes - tail; | ||
184 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
185 | } | ||
186 | if (!err) | ||
187 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
188 | |||
189 | kernel_neon_end(); | ||
190 | |||
191 | if (err) | ||
192 | return err; | ||
193 | |||
194 | /* copy authtag to end of dst */ | ||
195 | scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, | ||
196 | crypto_aead_authsize(aead), 1); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static int ccm_decrypt(struct aead_request *req) | ||
202 | { | ||
203 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
204 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
205 | unsigned int authsize = crypto_aead_authsize(aead); | ||
206 | struct blkcipher_desc desc = { .info = req->iv }; | ||
207 | struct blkcipher_walk walk; | ||
208 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
209 | u8 buf[AES_BLOCK_SIZE]; | ||
210 | u32 len = req->cryptlen - authsize; | ||
211 | int err; | ||
212 | |||
213 | err = ccm_init_mac(req, mac, len); | ||
214 | if (err) | ||
215 | return err; | ||
216 | |||
217 | kernel_neon_begin_partial(6); | ||
218 | |||
219 | if (req->assoclen) | ||
220 | ccm_calculate_auth_mac(req, mac); | ||
221 | |||
222 | /* preserve the original iv for the final round */ | ||
223 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
224 | |||
225 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
226 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
227 | AES_BLOCK_SIZE); | ||
228 | |||
229 | while (walk.nbytes) { | ||
230 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
231 | |||
232 | if (walk.nbytes == len) | ||
233 | tail = 0; | ||
234 | |||
235 | ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
236 | walk.nbytes - tail, ctx->key_enc, | ||
237 | num_rounds(ctx), mac, walk.iv); | ||
238 | |||
239 | len -= walk.nbytes - tail; | ||
240 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
241 | } | ||
242 | if (!err) | ||
243 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
244 | |||
245 | kernel_neon_end(); | ||
246 | |||
247 | if (err) | ||
248 | return err; | ||
249 | |||
250 | /* compare calculated auth tag with the stored one */ | ||
251 | scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, | ||
252 | authsize, 0); | ||
253 | |||
254 | if (memcmp(mac, buf, authsize)) | ||
255 | return -EBADMSG; | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static struct crypto_alg ccm_aes_alg = { | ||
260 | .cra_name = "ccm(aes)", | ||
261 | .cra_driver_name = "ccm-aes-ce", | ||
262 | .cra_priority = 300, | ||
263 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | ||
264 | .cra_blocksize = 1, | ||
265 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
266 | .cra_alignmask = 7, | ||
267 | .cra_type = &crypto_aead_type, | ||
268 | .cra_module = THIS_MODULE, | ||
269 | .cra_aead = { | ||
270 | .ivsize = AES_BLOCK_SIZE, | ||
271 | .maxauthsize = AES_BLOCK_SIZE, | ||
272 | .setkey = ccm_setkey, | ||
273 | .setauthsize = ccm_setauthsize, | ||
274 | .encrypt = ccm_encrypt, | ||
275 | .decrypt = ccm_decrypt, | ||
276 | } | ||
277 | }; | ||
278 | |||
279 | static int __init aes_mod_init(void) | ||
280 | { | ||
281 | if (!(elf_hwcap & HWCAP_AES)) | ||
282 | return -ENODEV; | ||
283 | return crypto_register_alg(&ccm_aes_alg); | ||
284 | } | ||
285 | |||
286 | static void __exit aes_mod_exit(void) | ||
287 | { | ||
288 | crypto_unregister_alg(&ccm_aes_alg); | ||
289 | } | ||
290 | |||
291 | module_init(aes_mod_init); | ||
292 | module_exit(aes_mod_exit); | ||
293 | |||
294 | MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); | ||
295 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
296 | MODULE_LICENSE("GPL v2"); | ||
297 | MODULE_ALIAS("ccm(aes)"); | ||
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c new file mode 100644 index 000000000000..2075e1acae6b --- /dev/null +++ b/arch/arm64/crypto/aes-ce-cipher.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <crypto/aes.h> | ||
13 | #include <linux/cpufeature.h> | ||
14 | #include <linux/crypto.h> | ||
15 | #include <linux/module.h> | ||
16 | |||
17 | MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); | ||
18 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
19 | MODULE_LICENSE("GPL v2"); | ||
20 | |||
21 | struct aes_block { | ||
22 | u8 b[AES_BLOCK_SIZE]; | ||
23 | }; | ||
24 | |||
25 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
26 | { | ||
27 | /* | ||
28 | * # of rounds specified by AES: | ||
29 | * 128 bit key 10 rounds | ||
30 | * 192 bit key 12 rounds | ||
31 | * 256 bit key 14 rounds | ||
32 | * => n byte key => 6 + (n/4) rounds | ||
33 | */ | ||
34 | return 6 + ctx->key_length / 4; | ||
35 | } | ||
36 | |||
37 | static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
38 | { | ||
39 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
40 | struct aes_block *out = (struct aes_block *)dst; | ||
41 | struct aes_block const *in = (struct aes_block *)src; | ||
42 | void *dummy0; | ||
43 | int dummy1; | ||
44 | |||
45 | kernel_neon_begin_partial(4); | ||
46 | |||
47 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
48 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
49 | " cmp %w[rounds], #10 ;" | ||
50 | " bmi 0f ;" | ||
51 | " bne 3f ;" | ||
52 | " mov v3.16b, v1.16b ;" | ||
53 | " b 2f ;" | ||
54 | "0: mov v2.16b, v1.16b ;" | ||
55 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
56 | "1: aese v0.16b, v2.16b ;" | ||
57 | " aesmc v0.16b, v0.16b ;" | ||
58 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
59 | " aese v0.16b, v3.16b ;" | ||
60 | " aesmc v0.16b, v0.16b ;" | ||
61 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
62 | " subs %w[rounds], %w[rounds], #3 ;" | ||
63 | " aese v0.16b, v1.16b ;" | ||
64 | " aesmc v0.16b, v0.16b ;" | ||
65 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
66 | " bpl 1b ;" | ||
67 | " aese v0.16b, v2.16b ;" | ||
68 | " eor v0.16b, v0.16b, v3.16b ;" | ||
69 | " st1 {v0.16b}, %[out] ;" | ||
70 | |||
71 | : [out] "=Q"(*out), | ||
72 | [key] "=r"(dummy0), | ||
73 | [rounds] "=r"(dummy1) | ||
74 | : [in] "Q"(*in), | ||
75 | "1"(ctx->key_enc), | ||
76 | "2"(num_rounds(ctx) - 2) | ||
77 | : "cc"); | ||
78 | |||
79 | kernel_neon_end(); | ||
80 | } | ||
81 | |||
82 | static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
83 | { | ||
84 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
85 | struct aes_block *out = (struct aes_block *)dst; | ||
86 | struct aes_block const *in = (struct aes_block *)src; | ||
87 | void *dummy0; | ||
88 | int dummy1; | ||
89 | |||
90 | kernel_neon_begin_partial(4); | ||
91 | |||
92 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
93 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
94 | " cmp %w[rounds], #10 ;" | ||
95 | " bmi 0f ;" | ||
96 | " bne 3f ;" | ||
97 | " mov v3.16b, v1.16b ;" | ||
98 | " b 2f ;" | ||
99 | "0: mov v2.16b, v1.16b ;" | ||
100 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
101 | "1: aesd v0.16b, v2.16b ;" | ||
102 | " aesimc v0.16b, v0.16b ;" | ||
103 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
104 | " aesd v0.16b, v3.16b ;" | ||
105 | " aesimc v0.16b, v0.16b ;" | ||
106 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
107 | " subs %w[rounds], %w[rounds], #3 ;" | ||
108 | " aesd v0.16b, v1.16b ;" | ||
109 | " aesimc v0.16b, v0.16b ;" | ||
110 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
111 | " bpl 1b ;" | ||
112 | " aesd v0.16b, v2.16b ;" | ||
113 | " eor v0.16b, v0.16b, v3.16b ;" | ||
114 | " st1 {v0.16b}, %[out] ;" | ||
115 | |||
116 | : [out] "=Q"(*out), | ||
117 | [key] "=r"(dummy0), | ||
118 | [rounds] "=r"(dummy1) | ||
119 | : [in] "Q"(*in), | ||
120 | "1"(ctx->key_dec), | ||
121 | "2"(num_rounds(ctx) - 2) | ||
122 | : "cc"); | ||
123 | |||
124 | kernel_neon_end(); | ||
125 | } | ||
126 | |||
127 | static struct crypto_alg aes_alg = { | ||
128 | .cra_name = "aes", | ||
129 | .cra_driver_name = "aes-ce", | ||
130 | .cra_priority = 300, | ||
131 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | ||
132 | .cra_blocksize = AES_BLOCK_SIZE, | ||
133 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
134 | .cra_module = THIS_MODULE, | ||
135 | .cra_cipher = { | ||
136 | .cia_min_keysize = AES_MIN_KEY_SIZE, | ||
137 | .cia_max_keysize = AES_MAX_KEY_SIZE, | ||
138 | .cia_setkey = crypto_aes_set_key, | ||
139 | .cia_encrypt = aes_cipher_encrypt, | ||
140 | .cia_decrypt = aes_cipher_decrypt | ||
141 | } | ||
142 | }; | ||
143 | |||
144 | static int __init aes_mod_init(void) | ||
145 | { | ||
146 | return crypto_register_alg(&aes_alg); | ||
147 | } | ||
148 | |||
149 | static void __exit aes_mod_exit(void) | ||
150 | { | ||
151 | crypto_unregister_alg(&aes_alg); | ||
152 | } | ||
153 | |||
154 | module_cpu_feature_match(AES, aes_mod_init); | ||
155 | module_exit(aes_mod_exit); | ||
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S new file mode 100644 index 000000000000..685a18f731eb --- /dev/null +++ b/arch/arm64/crypto/aes-ce.S | |||
@@ -0,0 +1,133 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with | ||
3 | * Crypto Extensions | ||
4 | * | ||
5 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | |||
14 | #define AES_ENTRY(func) ENTRY(ce_ ## func) | ||
15 | #define AES_ENDPROC(func) ENDPROC(ce_ ## func) | ||
16 | |||
17 | .arch armv8-a+crypto | ||
18 | |||
19 | /* preload all round keys */ | ||
/*
 * load_round_keys rounds, rk
 *
 * Loads the round keys found at [rk] into the upper end of v17-v31, so that
 * the last round key always lands in v31 regardless of key size:
 *   rounds <  12 : v21-v31 (11 round keys)
 *   rounds == 12 : v19-v31 (13 round keys)
 *   rounds >  12 : v17-v31 (15 round keys)
 * The three cases share code by falling through the local labels 1111/2222.
 * rk is post-incremented by every load except the last one, and the
 * condition flags are overwritten by the cmp.
 */
20 | .macro load_round_keys, rounds, rk | ||
21 | cmp \rounds, #12 | ||
22 | blo 2222f /* 128 bits */ | ||
23 | beq 1111f /* 192 bits */ | ||
24 | ld1 {v17.16b-v18.16b}, [\rk], #32 | ||
25 | 1111: ld1 {v19.16b-v20.16b}, [\rk], #32 | ||
26 | 2222: ld1 {v21.16b-v24.16b}, [\rk], #64 | ||
27 | ld1 {v25.16b-v28.16b}, [\rk], #64 | ||
28 | ld1 {v29.16b-v31.16b}, [\rk] | ||
29 | .endm | ||
30 | |||
31 | /* prepare for encryption with key in rk[] */ | ||
/*
 * enc_prepare, enc_switch_key and dec_prepare all expand to the same
 * load_round_keys invocation in this file. The third argument (ignore) is
 * accepted but never referenced by any of the three bodies.
 */
32 | .macro enc_prepare, rounds, rk, ignore | ||
33 | load_round_keys \rounds, \rk | ||
34 | .endm | ||
35 | |||
36 | /* prepare for encryption (again) but with new key in rk[] */ | ||
37 | .macro enc_switch_key, rounds, rk, ignore | ||
38 | load_round_keys \rounds, \rk | ||
39 | .endm | ||
40 | |||
41 | /* prepare for decryption with key in rk[] */ | ||
42 | .macro dec_prepare, rounds, rk, ignore | ||
43 | load_round_keys \rounds, \rk | ||
44 | .endm | ||
45 | |||
/*
 * do_enc_Nx de, mc, k, i0[, i1[, i2, i3]]
 *
 * Emits one AES round on 1, 2 or 4 state registers using round key \k:
 * first aes\de on every supplied state, then aes\mc on every supplied state.
 * \de and \mc are instruction suffixes (the callers in this file pass
 * e/mc or d/imc).
 * i2 is only processed when i3 is also given, so a 3-register invocation
 * is not supported by this macro.
 */
46 | .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 | ||
47 | aes\de \i0\().16b, \k\().16b | ||
48 | .ifnb \i1 | ||
49 | aes\de \i1\().16b, \k\().16b | ||
50 | .ifnb \i3 | ||
51 | aes\de \i2\().16b, \k\().16b | ||
52 | aes\de \i3\().16b, \k\().16b | ||
53 | .endif | ||
54 | .endif | ||
55 | aes\mc \i0\().16b, \i0\().16b | ||
56 | .ifnb \i1 | ||
57 | aes\mc \i1\().16b, \i1\().16b | ||
58 | .ifnb \i3 | ||
59 | aes\mc \i2\().16b, \i2\().16b | ||
60 | aes\mc \i3\().16b, \i3\().16b | ||
61 | .endif | ||
62 | .endif | ||
63 | .endm | ||
64 | |||
65 | /* up to 4 interleaved encryption rounds with the same round key */ | ||
/*
 * round_Nx enc, k, i0[, i1[, i2, i3]]
 *
 * Selects the instruction pair for do_enc_Nx: when \enc is the literal "e"
 * the round is emitted with aese/aesmc, for any other value with
 * aesd/aesimc.
 */
66 | .macro round_Nx, enc, k, i0, i1, i2, i3 | ||
67 | .ifc \enc, e | ||
68 | do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 | ||
69 | .else | ||
70 | do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 | ||
71 | .endif | ||
72 | .endm | ||
73 | |||
74 | /* up to 4 interleaved final rounds */ | ||
/*
 * fin_round_Nx de, k, k2, i0[, i1[, i2, i3]]
 *
 * Final round for 1, 2 or 4 state registers: aes\de with round key \k and
 * no mix-columns step, followed by an eor with \k2.
 * As in do_enc_Nx, i2 is only processed when i3 is supplied.
 */
75 | .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 | ||
76 | aes\de \i0\().16b, \k\().16b | ||
77 | .ifnb \i1 | ||
78 | aes\de \i1\().16b, \k\().16b | ||
79 | .ifnb \i3 | ||
80 | aes\de \i2\().16b, \k\().16b | ||
81 | aes\de \i3\().16b, \k\().16b | ||
82 | .endif | ||
83 | .endif | ||
84 | eor \i0\().16b, \i0\().16b, \k2\().16b | ||
85 | .ifnb \i1 | ||
86 | eor \i1\().16b, \i1\().16b, \k2\().16b | ||
87 | .ifnb \i3 | ||
88 | eor \i2\().16b, \i2\().16b, \k2\().16b | ||
89 | eor \i3\().16b, \i3\().16b, \k2\().16b | ||
90 | .endif | ||
91 | .endif | ||
92 | .endm | ||
93 | |||
94 | /* up to 4 interleaved blocks */ | ||
/*
 * do_block_Nx enc, rounds, i0[, i1[, i2, i3]]
 *
 * Runs the full cipher on 1, 2 or 4 state registers using the round keys
 * that load_round_keys placed in v17-v31. The entry point into the round
 * sequence depends on \rounds:
 *   rounds >  12 : starts at v17
 *   rounds == 12 : starts at v19 (label 1111)
 *   rounds <  12 : starts at v21 (label 2222)
 * v21-v29 are always applied as regular rounds; v30/v31 feed the final
 * round. The cmp overwrites the condition flags.
 */
95 | .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 | ||
96 | cmp \rounds, #12 | ||
97 | blo 2222f /* 128 bits */ | ||
98 | beq 1111f /* 192 bits */ | ||
99 | round_Nx \enc, v17, \i0, \i1, \i2, \i3 | ||
100 | round_Nx \enc, v18, \i0, \i1, \i2, \i3 | ||
101 | 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 | ||
102 | round_Nx \enc, v20, \i0, \i1, \i2, \i3 | ||
103 | 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 | ||
104 | round_Nx \enc, \key, \i0, \i1, \i2, \i3 | ||
105 | .endr | ||
106 | fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 | ||
107 | .endm | ||
108 | |||
/*
 * Single, 2-way and 4-way encrypt/decrypt entry points. Each one forwards
 * its state registers and \rounds to do_block_Nx with direction "e" or "d".
 * The trailing t0, t1, t2 arguments are accepted but not referenced by any
 * of these bodies.
 */
109 | .macro encrypt_block, in, rounds, t0, t1, t2 | ||
110 | do_block_Nx e, \rounds, \in | ||
111 | .endm | ||
112 | |||
113 | .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
114 | do_block_Nx e, \rounds, \i0, \i1 | ||
115 | .endm | ||
116 | |||
117 | .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
118 | do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 | ||
119 | .endm | ||
120 | |||
121 | .macro decrypt_block, in, rounds, t0, t1, t2 | ||
122 | do_block_Nx d, \rounds, \in | ||
123 | .endm | ||
124 | |||
125 | .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
126 | do_block_Nx d, \rounds, \i0, \i1 | ||
127 | .endm | ||
128 | |||
129 | .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
130 | do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 | ||
131 | .endm | ||
132 | |||
133 | #include "aes-modes.S" | ||
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c new file mode 100644 index 000000000000..60f2f4c12256 --- /dev/null +++ b/arch/arm64/crypto/aes-glue.c | |||
@@ -0,0 +1,446 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/hwcap.h> | ||
13 | #include <crypto/aes.h> | ||
14 | #include <crypto/ablk_helper.h> | ||
15 | #include <crypto/algapi.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/cpufeature.h> | ||
18 | |||
19 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
20 | #define MODE "ce" | ||
21 | #define PRIO 300 | ||
22 | #define aes_ecb_encrypt ce_aes_ecb_encrypt | ||
23 | #define aes_ecb_decrypt ce_aes_ecb_decrypt | ||
24 | #define aes_cbc_encrypt ce_aes_cbc_encrypt | ||
25 | #define aes_cbc_decrypt ce_aes_cbc_decrypt | ||
26 | #define aes_ctr_encrypt ce_aes_ctr_encrypt | ||
27 | #define aes_xts_encrypt ce_aes_xts_encrypt | ||
28 | #define aes_xts_decrypt ce_aes_xts_decrypt | ||
29 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); | ||
30 | #else | ||
31 | #define MODE "neon" | ||
32 | #define PRIO 200 | ||
33 | #define aes_ecb_encrypt neon_aes_ecb_encrypt | ||
34 | #define aes_ecb_decrypt neon_aes_ecb_decrypt | ||
35 | #define aes_cbc_encrypt neon_aes_cbc_encrypt | ||
36 | #define aes_cbc_decrypt neon_aes_cbc_decrypt | ||
37 | #define aes_ctr_encrypt neon_aes_ctr_encrypt | ||
38 | #define aes_xts_encrypt neon_aes_xts_encrypt | ||
39 | #define aes_xts_decrypt neon_aes_xts_decrypt | ||
40 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); | ||
41 | MODULE_ALIAS("ecb(aes)"); | ||
42 | MODULE_ALIAS("cbc(aes)"); | ||
43 | MODULE_ALIAS("ctr(aes)"); | ||
44 | MODULE_ALIAS("xts(aes)"); | ||
45 | #endif | ||
46 | |||
47 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
48 | MODULE_LICENSE("GPL v2"); | ||
49 | |||
50 | /* defined in aes-modes.S */ | ||
51 | asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
52 | int rounds, int blocks, int first); | ||
53 | asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
54 | int rounds, int blocks, int first); | ||
55 | |||
56 | asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
57 | int rounds, int blocks, u8 iv[], int first); | ||
58 | asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
59 | int rounds, int blocks, u8 iv[], int first); | ||
60 | |||
61 | asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
62 | int rounds, int blocks, u8 ctr[], int first); | ||
63 | |||
64 | asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
65 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
66 | int first); | ||
67 | asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
68 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
69 | int first); | ||
70 | |||
/*
 * Transform context for the XTS mode: two independent AES key schedules.
 * xts_set_key() fills key1 from the first half of the supplied key and key2
 * from the second half; xts_encrypt()/xts_decrypt() pass key1 as the data
 * key and key2.key_enc as the second (rk2) key to the assembler routines.
 */
71 | struct crypto_aes_xts_ctx { | ||
72 | struct crypto_aes_ctx key1; | ||
73 | struct crypto_aes_ctx __aligned(8) key2; | ||
74 | }; | ||
75 | |||
76 | static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
77 | unsigned int key_len) | ||
78 | { | ||
79 | struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); | ||
80 | int ret; | ||
81 | |||
82 | ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); | ||
83 | if (!ret) | ||
84 | ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], | ||
85 | key_len / 2); | ||
86 | if (!ret) | ||
87 | return 0; | ||
88 | |||
89 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
90 | return -EINVAL; | ||
91 | } | ||
92 | |||
93 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
94 | struct scatterlist *src, unsigned int nbytes) | ||
95 | { | ||
96 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
97 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
98 | struct blkcipher_walk walk; | ||
99 | unsigned int blocks; | ||
100 | |||
101 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
102 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
103 | err = blkcipher_walk_virt(desc, &walk); | ||
104 | |||
105 | kernel_neon_begin(); | ||
106 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
107 | aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
108 | (u8 *)ctx->key_enc, rounds, blocks, first); | ||
109 | err = blkcipher_walk_done(desc, &walk, 0); | ||
110 | } | ||
111 | kernel_neon_end(); | ||
112 | return err; | ||
113 | } | ||
114 | |||
115 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
116 | struct scatterlist *src, unsigned int nbytes) | ||
117 | { | ||
118 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
119 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
120 | struct blkcipher_walk walk; | ||
121 | unsigned int blocks; | ||
122 | |||
123 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
124 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
125 | err = blkcipher_walk_virt(desc, &walk); | ||
126 | |||
127 | kernel_neon_begin(); | ||
128 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
129 | aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
130 | (u8 *)ctx->key_dec, rounds, blocks, first); | ||
131 | err = blkcipher_walk_done(desc, &walk, 0); | ||
132 | } | ||
133 | kernel_neon_end(); | ||
134 | return err; | ||
135 | } | ||
136 | |||
137 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
138 | struct scatterlist *src, unsigned int nbytes) | ||
139 | { | ||
140 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
141 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
142 | struct blkcipher_walk walk; | ||
143 | unsigned int blocks; | ||
144 | |||
145 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
146 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
147 | err = blkcipher_walk_virt(desc, &walk); | ||
148 | |||
149 | kernel_neon_begin(); | ||
150 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
151 | aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
152 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
153 | first); | ||
154 | err = blkcipher_walk_done(desc, &walk, 0); | ||
155 | } | ||
156 | kernel_neon_end(); | ||
157 | return err; | ||
158 | } | ||
159 | |||
160 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
164 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
165 | struct blkcipher_walk walk; | ||
166 | unsigned int blocks; | ||
167 | |||
168 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
169 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
170 | err = blkcipher_walk_virt(desc, &walk); | ||
171 | |||
172 | kernel_neon_begin(); | ||
173 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
174 | aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
175 | (u8 *)ctx->key_dec, rounds, blocks, walk.iv, | ||
176 | first); | ||
177 | err = blkcipher_walk_done(desc, &walk, 0); | ||
178 | } | ||
179 | kernel_neon_end(); | ||
180 | return err; | ||
181 | } | ||
182 | |||
183 | static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
184 | struct scatterlist *src, unsigned int nbytes) | ||
185 | { | ||
186 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
187 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
188 | struct blkcipher_walk walk; | ||
189 | int blocks; | ||
190 | |||
191 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
192 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
193 | err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); | ||
194 | |||
195 | first = 1; | ||
196 | kernel_neon_begin(); | ||
197 | while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { | ||
198 | aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
199 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
200 | first); | ||
201 | first = 0; | ||
202 | nbytes -= blocks * AES_BLOCK_SIZE; | ||
203 | if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) | ||
204 | break; | ||
205 | err = blkcipher_walk_done(desc, &walk, | ||
206 | walk.nbytes % AES_BLOCK_SIZE); | ||
207 | } | ||
208 | if (nbytes) { | ||
209 | u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; | ||
210 | u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; | ||
211 | u8 __aligned(8) tail[AES_BLOCK_SIZE]; | ||
212 | |||
213 | /* | ||
214 | * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need | ||
215 | * to tell aes_ctr_encrypt() to only read half a block. | ||
216 | */ | ||
217 | blocks = (nbytes <= 8) ? -1 : 1; | ||
218 | |||
219 | aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, | ||
220 | blocks, walk.iv, first); | ||
221 | memcpy(tdst, tail, nbytes); | ||
222 | err = blkcipher_walk_done(desc, &walk, 0); | ||
223 | } | ||
224 | kernel_neon_end(); | ||
225 | |||
226 | return err; | ||
227 | } | ||
228 | |||
229 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
230 | struct scatterlist *src, unsigned int nbytes) | ||
231 | { | ||
232 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
233 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
234 | struct blkcipher_walk walk; | ||
235 | unsigned int blocks; | ||
236 | |||
237 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
238 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
239 | err = blkcipher_walk_virt(desc, &walk); | ||
240 | |||
241 | kernel_neon_begin(); | ||
242 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
243 | aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
244 | (u8 *)ctx->key1.key_enc, rounds, blocks, | ||
245 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
246 | err = blkcipher_walk_done(desc, &walk, 0); | ||
247 | } | ||
248 | kernel_neon_end(); | ||
249 | |||
250 | return err; | ||
251 | } | ||
252 | |||
253 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
254 | struct scatterlist *src, unsigned int nbytes) | ||
255 | { | ||
256 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
257 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
258 | struct blkcipher_walk walk; | ||
259 | unsigned int blocks; | ||
260 | |||
261 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
262 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
263 | err = blkcipher_walk_virt(desc, &walk); | ||
264 | |||
265 | kernel_neon_begin(); | ||
266 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
267 | aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
268 | (u8 *)ctx->key1.key_dec, rounds, blocks, | ||
269 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
270 | err = blkcipher_walk_done(desc, &walk, 0); | ||
271 | } | ||
272 | kernel_neon_end(); | ||
273 | |||
274 | return err; | ||
275 | } | ||
276 | |||
/*
 * Algorithms registered by this module (MODE is "ce" or "neon").
 *
 * Entries 0-3 ("__ecb-aes-", "__cbc-aes-", "__ctr-aes-", "__xts-aes-" MODE)
 * are blkciphers with priority 0 that call the mode routines of this file
 * directly. Entries 4-7 ("ecb(aes)", "cbc(aes)", "ctr(aes)", "xts(aes)")
 * are ablkciphers with priority PRIO, built on the ablk_* helpers with a
 * struct async_helper_ctx as context; presumably they dispatch to the
 * matching "__driver-*" entry, but that wiring lives in ablk_init and is not
 * visible here - TODO confirm.
 *
 * The CTR entries use a block size of 1 and ctr_encrypt for both
 * directions. The XTS entries double the min/max key sizes. Every entry
 * uses an alignmask of 7.
 *
 * NOTE(review): the ECB entries advertise ivsize = AES_BLOCK_SIZE although
 * ecb_encrypt()/ecb_decrypt() never read walk.iv - confirm whether 0 was
 * intended.
 */
277 | static struct crypto_alg aes_algs[] = { { | ||
278 | .cra_name = "__ecb-aes-" MODE, | ||
279 | .cra_driver_name = "__driver-ecb-aes-" MODE, | ||
280 | .cra_priority = 0, | ||
281 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
282 | .cra_blocksize = AES_BLOCK_SIZE, | ||
283 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
284 | .cra_alignmask = 7, | ||
285 | .cra_type = &crypto_blkcipher_type, | ||
286 | .cra_module = THIS_MODULE, | ||
287 | .cra_blkcipher = { | ||
288 | .min_keysize = AES_MIN_KEY_SIZE, | ||
289 | .max_keysize = AES_MAX_KEY_SIZE, | ||
290 | .ivsize = AES_BLOCK_SIZE, | ||
291 | .setkey = crypto_aes_set_key, | ||
292 | .encrypt = ecb_encrypt, | ||
293 | .decrypt = ecb_decrypt, | ||
294 | }, | ||
295 | }, { | ||
296 | .cra_name = "__cbc-aes-" MODE, | ||
297 | .cra_driver_name = "__driver-cbc-aes-" MODE, | ||
298 | .cra_priority = 0, | ||
299 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
300 | .cra_blocksize = AES_BLOCK_SIZE, | ||
301 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
302 | .cra_alignmask = 7, | ||
303 | .cra_type = &crypto_blkcipher_type, | ||
304 | .cra_module = THIS_MODULE, | ||
305 | .cra_blkcipher = { | ||
306 | .min_keysize = AES_MIN_KEY_SIZE, | ||
307 | .max_keysize = AES_MAX_KEY_SIZE, | ||
308 | .ivsize = AES_BLOCK_SIZE, | ||
309 | .setkey = crypto_aes_set_key, | ||
310 | .encrypt = cbc_encrypt, | ||
311 | .decrypt = cbc_decrypt, | ||
312 | }, | ||
313 | }, { | ||
314 | .cra_name = "__ctr-aes-" MODE, | ||
315 | .cra_driver_name = "__driver-ctr-aes-" MODE, | ||
316 | .cra_priority = 0, | ||
317 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
318 | .cra_blocksize = 1, | ||
319 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
320 | .cra_alignmask = 7, | ||
321 | .cra_type = &crypto_blkcipher_type, | ||
322 | .cra_module = THIS_MODULE, | ||
323 | .cra_blkcipher = { | ||
324 | .min_keysize = AES_MIN_KEY_SIZE, | ||
325 | .max_keysize = AES_MAX_KEY_SIZE, | ||
326 | .ivsize = AES_BLOCK_SIZE, | ||
327 | .setkey = crypto_aes_set_key, | ||
328 | .encrypt = ctr_encrypt, | ||
329 | .decrypt = ctr_encrypt, | ||
330 | }, | ||
331 | }, { | ||
332 | .cra_name = "__xts-aes-" MODE, | ||
333 | .cra_driver_name = "__driver-xts-aes-" MODE, | ||
334 | .cra_priority = 0, | ||
335 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
336 | .cra_blocksize = AES_BLOCK_SIZE, | ||
337 | .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), | ||
338 | .cra_alignmask = 7, | ||
339 | .cra_type = &crypto_blkcipher_type, | ||
340 | .cra_module = THIS_MODULE, | ||
341 | .cra_blkcipher = { | ||
342 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
343 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
344 | .ivsize = AES_BLOCK_SIZE, | ||
345 | .setkey = xts_set_key, | ||
346 | .encrypt = xts_encrypt, | ||
347 | .decrypt = xts_decrypt, | ||
348 | }, | ||
349 | }, { | ||
350 | .cra_name = "ecb(aes)", | ||
351 | .cra_driver_name = "ecb-aes-" MODE, | ||
352 | .cra_priority = PRIO, | ||
353 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
354 | .cra_blocksize = AES_BLOCK_SIZE, | ||
355 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
356 | .cra_alignmask = 7, | ||
357 | .cra_type = &crypto_ablkcipher_type, | ||
358 | .cra_module = THIS_MODULE, | ||
359 | .cra_init = ablk_init, | ||
360 | .cra_exit = ablk_exit, | ||
361 | .cra_ablkcipher = { | ||
362 | .min_keysize = AES_MIN_KEY_SIZE, | ||
363 | .max_keysize = AES_MAX_KEY_SIZE, | ||
364 | .ivsize = AES_BLOCK_SIZE, | ||
365 | .setkey = ablk_set_key, | ||
366 | .encrypt = ablk_encrypt, | ||
367 | .decrypt = ablk_decrypt, | ||
368 | } | ||
369 | }, { | ||
370 | .cra_name = "cbc(aes)", | ||
371 | .cra_driver_name = "cbc-aes-" MODE, | ||
372 | .cra_priority = PRIO, | ||
373 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
374 | .cra_blocksize = AES_BLOCK_SIZE, | ||
375 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
376 | .cra_alignmask = 7, | ||
377 | .cra_type = &crypto_ablkcipher_type, | ||
378 | .cra_module = THIS_MODULE, | ||
379 | .cra_init = ablk_init, | ||
380 | .cra_exit = ablk_exit, | ||
381 | .cra_ablkcipher = { | ||
382 | .min_keysize = AES_MIN_KEY_SIZE, | ||
383 | .max_keysize = AES_MAX_KEY_SIZE, | ||
384 | .ivsize = AES_BLOCK_SIZE, | ||
385 | .setkey = ablk_set_key, | ||
386 | .encrypt = ablk_encrypt, | ||
387 | .decrypt = ablk_decrypt, | ||
388 | } | ||
389 | }, { | ||
390 | .cra_name = "ctr(aes)", | ||
391 | .cra_driver_name = "ctr-aes-" MODE, | ||
392 | .cra_priority = PRIO, | ||
393 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
394 | .cra_blocksize = 1, | ||
395 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
396 | .cra_alignmask = 7, | ||
397 | .cra_type = &crypto_ablkcipher_type, | ||
398 | .cra_module = THIS_MODULE, | ||
399 | .cra_init = ablk_init, | ||
400 | .cra_exit = ablk_exit, | ||
401 | .cra_ablkcipher = { | ||
402 | .min_keysize = AES_MIN_KEY_SIZE, | ||
403 | .max_keysize = AES_MAX_KEY_SIZE, | ||
404 | .ivsize = AES_BLOCK_SIZE, | ||
405 | .setkey = ablk_set_key, | ||
406 | .encrypt = ablk_encrypt, | ||
407 | .decrypt = ablk_decrypt, | ||
408 | } | ||
409 | }, { | ||
410 | .cra_name = "xts(aes)", | ||
411 | .cra_driver_name = "xts-aes-" MODE, | ||
412 | .cra_priority = PRIO, | ||
413 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
414 | .cra_blocksize = AES_BLOCK_SIZE, | ||
415 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
416 | .cra_alignmask = 7, | ||
417 | .cra_type = &crypto_ablkcipher_type, | ||
418 | .cra_module = THIS_MODULE, | ||
419 | .cra_init = ablk_init, | ||
420 | .cra_exit = ablk_exit, | ||
421 | .cra_ablkcipher = { | ||
422 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
423 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
424 | .ivsize = AES_BLOCK_SIZE, | ||
425 | .setkey = ablk_set_key, | ||
426 | .encrypt = ablk_encrypt, | ||
427 | .decrypt = ablk_decrypt, | ||
428 | } | ||
429 | } }; | ||
430 | |||
431 | static int __init aes_init(void) | ||
432 | { | ||
433 | return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
434 | } | ||
435 | |||
/* Module exit point: unregister every entry of aes_algs. */
436 | static void __exit aes_exit(void) | ||
437 | { | ||
438 | crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
439 | } | ||
440 | |||
441 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
442 | module_cpu_feature_match(AES, aes_init); | ||
443 | #else | ||
444 | module_init(aes_init); | ||
445 | #endif | ||
446 | module_exit(aes_exit); | ||
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 000000000000..f6e372c528eb --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S | |||
@@ -0,0 +1,532 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | /* included by aes-ce.S and aes-neon.S */ | ||
12 | |||
13 | .text | ||
14 | .align 4 | ||
15 | |||
16 | /* | ||
17 | * There are several ways to instantiate this code: | ||
18 | * - no interleave, all inline | ||
19 | * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) | ||
20 | * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) | ||
21 | * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) | ||
22 | * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) | ||
23 | * | ||
24 | * Macros imported by this code: | ||
25 | * - enc_prepare - setup NEON registers for encryption | ||
26 | * - dec_prepare - setup NEON registers for decryption | ||
27 | * - enc_switch_key - change to new key after having prepared for encryption | ||
28 | * - encrypt_block - encrypt a single block | ||
29 | * - decrypt_block - decrypt a single block | ||
30 | * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
31 | * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
32 | * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
33 | * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
34 | */ | ||
35 | |||
36 | #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) | ||
37 | #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp | ||
38 | #define FRAME_POP ldp x29, x30, [sp],#16 | ||
39 | |||
40 | #if INTERLEAVE == 2 | ||
41 | |||
42 | aes_encrypt_block2x: | ||
43 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
44 | ret | ||
45 | ENDPROC(aes_encrypt_block2x) | ||
46 | |||
47 | aes_decrypt_block2x: | ||
48 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
49 | ret | ||
50 | ENDPROC(aes_decrypt_block2x) | ||
51 | |||
52 | #elif INTERLEAVE == 4 | ||
53 | |||
54 | aes_encrypt_block4x: | ||
55 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
56 | ret | ||
57 | ENDPROC(aes_encrypt_block4x) | ||
58 | |||
59 | aes_decrypt_block4x: | ||
60 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
61 | ret | ||
62 | ENDPROC(aes_decrypt_block4x) | ||
63 | |||
64 | #else | ||
65 | #error INTERLEAVE should equal 2 or 4 | ||
66 | #endif | ||
67 | |||
68 | .macro do_encrypt_block2x | ||
69 | bl aes_encrypt_block2x | ||
70 | .endm | ||
71 | |||
72 | .macro do_decrypt_block2x | ||
73 | bl aes_decrypt_block2x | ||
74 | .endm | ||
75 | |||
76 | .macro do_encrypt_block4x | ||
77 | bl aes_encrypt_block4x | ||
78 | .endm | ||
79 | |||
80 | .macro do_decrypt_block4x | ||
81 | bl aes_decrypt_block4x | ||
82 | .endm | ||
83 | |||
84 | #else | ||
85 | #define FRAME_PUSH | ||
86 | #define FRAME_POP | ||
87 | |||
88 | .macro do_encrypt_block2x | ||
89 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
90 | .endm | ||
91 | |||
92 | .macro do_decrypt_block2x | ||
93 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
94 | .endm | ||
95 | |||
96 | .macro do_encrypt_block4x | ||
97 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
98 | .endm | ||
99 | |||
100 | .macro do_decrypt_block4x | ||
101 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
102 | .endm | ||
103 | |||
104 | #endif | ||
105 | |||
106 | /* | ||
107 | * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
108 | * int blocks, int first) | ||
109 | * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
110 | * int blocks, int first) | ||
111 | */ | ||
112 | |||
/*
 * ECB encryption. Register usage as read from the code below:
 *   x0 = out and x1 = in (both post-incremented), x2 = rk, w3 = rounds,
 *   w4 = blocks (counted down), w5 = first.
 * enc_prepare is only invoked when first is non-zero; on later calls it is
 * skipped, so the key registers are presumably expected to still hold what
 * an earlier call set up.
 * With INTERLEAVE >= 2 the bulk is processed INTERLEAVE blocks at a time and
 * the remainder one block at a time. The single-block loop tests the count
 * after processing a block, so it must be entered with w4 != 0.
 */
113 | AES_ENTRY(aes_ecb_encrypt) | ||
114 | FRAME_PUSH | ||
115 | cbz w5, .LecbencloopNx | ||
116 | |||
117 | enc_prepare w3, x2, x5 | ||
118 | |||
119 | .LecbencloopNx: | ||
120 | #if INTERLEAVE >= 2 | ||
121 | subs w4, w4, #INTERLEAVE | ||
122 | bmi .Lecbenc1x | ||
123 | #if INTERLEAVE == 2 | ||
124 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
125 | do_encrypt_block2x | ||
126 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
127 | #else | ||
128 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
129 | do_encrypt_block4x | ||
130 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
131 | #endif | ||
132 | b .LecbencloopNx | ||
133 | .Lecbenc1x: | ||
134 | adds w4, w4, #INTERLEAVE | ||
135 | beq .Lecbencout | ||
136 | #endif | ||
137 | .Lecbencloop: | ||
138 | ld1 {v0.16b}, [x1], #16 /* get next pt block */ | ||
139 | encrypt_block v0, w3, x2, x5, w6 | ||
140 | st1 {v0.16b}, [x0], #16 | ||
141 | subs w4, w4, #1 | ||
142 | bne .Lecbencloop | ||
143 | .Lecbencout: | ||
144 | FRAME_POP | ||
145 | ret | ||
146 | AES_ENDPROC(aes_ecb_encrypt) | ||
147 | |||
148 | |||
/*
 * ECB decryption. Same structure and register usage as aes_ecb_encrypt:
 *   x0 = out and x1 = in (both post-incremented), x2 = rk, w3 = rounds,
 *   w4 = blocks (counted down), w5 = first.
 * dec_prepare is only invoked when first is non-zero.
 * With INTERLEAVE >= 2 the bulk is processed INTERLEAVE blocks at a time and
 * the remainder one block at a time. The single-block loop tests the count
 * after processing a block, so it must be entered with w4 != 0.
 */
149 | AES_ENTRY(aes_ecb_decrypt) | ||
150 | FRAME_PUSH | ||
151 | cbz w5, .LecbdecloopNx | ||
152 | |||
153 | dec_prepare w3, x2, x5 | ||
154 | |||
155 | .LecbdecloopNx: | ||
156 | #if INTERLEAVE >= 2 | ||
157 | subs w4, w4, #INTERLEAVE | ||
158 | bmi .Lecbdec1x | ||
159 | #if INTERLEAVE == 2 | ||
160 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
161 | do_decrypt_block2x | ||
162 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
163 | #else | ||
164 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
165 | do_decrypt_block4x | ||
166 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
167 | #endif | ||
168 | b .LecbdecloopNx | ||
169 | .Lecbdec1x: | ||
170 | adds w4, w4, #INTERLEAVE | ||
171 | beq .Lecbdecout | ||
172 | #endif | ||
173 | .Lecbdecloop: | ||
174 | ld1 {v0.16b}, [x1], #16 /* get next ct block */ | ||
175 | decrypt_block v0, w3, x2, x5, w6 | ||
176 | st1 {v0.16b}, [x0], #16 | ||
177 | subs w4, w4, #1 | ||
178 | bne .Lecbdecloop | ||
179 | .Lecbdecout: | ||
180 | FRAME_POP | ||
181 | ret | ||
182 | AES_ENDPROC(aes_ecb_decrypt) | ||
183 | |||
184 | |||
185 | /* | ||
186 | * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
187 | * int blocks, u8 iv[], int first) | ||
188 | * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
189 | * int blocks, u8 iv[], int first) | ||
190 | */ | ||
191 | |||
/*
 * CBC encryption, strictly one block at a time (there is no interleaved
 * path, and therefore no FRAME_PUSH/FRAME_POP).
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv,
 *   w6 = first.
 * v0 carries the chaining value: it is loaded from [x5] only when first is
 * non-zero; otherwise the value already in v0 is used as is. The IV buffer
 * is never written back by this routine.
 * x5 and w6 are reused as the extra (temporary) arguments of enc_prepare
 * and encrypt_block once their original values have been consumed.
 * The loop tests the count after processing, so w4 must be non-zero.
 */
192 | AES_ENTRY(aes_cbc_encrypt) | ||
193 | cbz w6, .Lcbcencloop | ||
194 | |||
195 | ld1 {v0.16b}, [x5] /* get iv */ | ||
196 | enc_prepare w3, x2, x5 | ||
197 | |||
198 | .Lcbcencloop: | ||
199 | ld1 {v1.16b}, [x1], #16 /* get next pt block */ | ||
200 | eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ | ||
201 | encrypt_block v0, w3, x2, x5, w6 | ||
202 | st1 {v0.16b}, [x0], #16 | ||
203 | subs w4, w4, #1 | ||
204 | bne .Lcbcencloop | ||
205 | ret | ||
206 | AES_ENDPROC(aes_cbc_encrypt) | ||
207 | |||
208 | |||
/*
 * CBC decryption.
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv,
 *   w6 = first.
 * v7 carries the chaining value (the IV, then the previous ciphertext
 * block). It is loaded from [x5] only when first is non-zero; the IV buffer
 * is never written back by this routine.
 * Because decryption happens in place in v0-v3, the ciphertext blocks are
 * copied aside first so they can be xor-ed into the following block:
 *   2-way: copies kept in v2/v3, v3 becomes the next v7.
 *   4-way: copies of the first three blocks kept in v4-v6; the fourth
 *          ciphertext block is re-read from memory (x1 - 16) into v7.
 * The single-block loop tests the count after processing, so it must be
 * entered with w4 != 0.
 */
209 | AES_ENTRY(aes_cbc_decrypt) | ||
210 | FRAME_PUSH | ||
211 | cbz w6, .LcbcdecloopNx | ||
212 | |||
213 | ld1 {v7.16b}, [x5] /* get iv */ | ||
214 | dec_prepare w3, x2, x5 | ||
215 | |||
216 | .LcbcdecloopNx: | ||
217 | #if INTERLEAVE >= 2 | ||
218 | subs w4, w4, #INTERLEAVE | ||
219 | bmi .Lcbcdec1x | ||
220 | #if INTERLEAVE == 2 | ||
221 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
222 | mov v2.16b, v0.16b | ||
223 | mov v3.16b, v1.16b | ||
224 | do_decrypt_block2x | ||
225 | eor v0.16b, v0.16b, v7.16b | ||
226 | eor v1.16b, v1.16b, v2.16b | ||
227 | mov v7.16b, v3.16b | ||
228 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
229 | #else | ||
230 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
231 | mov v4.16b, v0.16b | ||
232 | mov v5.16b, v1.16b | ||
233 | mov v6.16b, v2.16b | ||
234 | do_decrypt_block4x | ||
235 | sub x1, x1, #16 | ||
236 | eor v0.16b, v0.16b, v7.16b | ||
237 | eor v1.16b, v1.16b, v4.16b | ||
238 | ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ | ||
239 | eor v2.16b, v2.16b, v5.16b | ||
240 | eor v3.16b, v3.16b, v6.16b | ||
241 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
242 | #endif | ||
243 | b .LcbcdecloopNx | ||
244 | .Lcbcdec1x: | ||
245 | adds w4, w4, #INTERLEAVE | ||
246 | beq .Lcbcdecout | ||
247 | #endif | ||
248 | .Lcbcdecloop: | ||
249 | ld1 {v1.16b}, [x1], #16 /* get next ct block */ | ||
250 | mov v0.16b, v1.16b /* ...and copy to v0 */ | ||
251 | decrypt_block v0, w3, x2, x5, w6 | ||
252 | eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ | ||
253 | mov v7.16b, v1.16b /* ct is next iv */ | ||
254 | st1 {v0.16b}, [x0], #16 | ||
255 | subs w4, w4, #1 | ||
256 | bne .Lcbcdecloop | ||
257 | .Lcbcdecout: | ||
258 | FRAME_POP | ||
259 | ret | ||
260 | AES_ENDPROC(aes_cbc_decrypt) | ||
261 | |||
262 | |||
263 | /* | ||
264 | * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
265 | * int blocks, u8 ctr[], int first) | ||
266 | */ | ||
267 | |||
268 | AES_ENTRY(aes_ctr_encrypt) /* CTR mode: x0=out, x1=in, x2=rk, w3=rounds, w4=blocks, x5=ctr, w6=first */ | ||
269 | FRAME_PUSH | ||
270 | cbnz w6, .Lctrfirst /* 1st time around? */ | ||
271 | umov x5, v4.d[1] /* not first: relies on v4 still holding the ctr block; keep swabbed ctr in reg */ | ||
272 | rev x5, x5 | ||
273 | #if INTERLEAVE >= 2 | ||
274 | cmn w5, w4 /* would ctr + blocks overflow the low 32 bits? */ | ||
275 | bcs .Lctrinc /* yes: take the one-block path, which propagates the carry */ | ||
276 | add x5, x5, #1 /* increment BE ctr */ | ||
277 | b .LctrincNx /* write it back to v4 and enter the interleaved loop */ | ||
278 | #else | ||
279 | b .Lctrinc | ||
280 | #endif | ||
281 | .Lctrfirst: | ||
282 | enc_prepare w3, x2, x6 | ||
283 | ld1 {v4.16b}, [x5] /* load the initial counter block from ctr[] */ | ||
284 | umov x5, v4.d[1] /* keep swabbed ctr in reg (x5 no longer points at ctr[]) */ | ||
285 | rev x5, x5 | ||
286 | #if INTERLEAVE >= 2 | ||
287 | cmn w5, w4 /* would ctr + blocks overflow the low 32 bits? */ | ||
288 | bcs .Lctrloop /* yes: process everything one block at a time */ | ||
289 | .LctrloopNx: | ||
290 | subs w4, w4, #INTERLEAVE | ||
291 | bmi .Lctr1x /* fewer than INTERLEAVE blocks left */ | ||
292 | #if INTERLEAVE == 2 | ||
293 | mov v0.8b, v4.8b /* copy d[0] of the ctr block */ | ||
294 | mov v1.8b, v4.8b | ||
295 | rev x7, x5 | ||
296 | add x5, x5, #1 | ||
297 | ins v0.d[1], x7 /* v0 = ctr */ | ||
298 | rev x7, x5 | ||
299 | add x5, x5, #1 | ||
300 | ins v1.d[1], x7 /* v1 = ctr + 1 */ | ||
301 | ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ | ||
302 | do_encrypt_block2x | ||
303 | eor v0.16b, v0.16b, v2.16b /* keystream ^ input */ | ||
304 | eor v1.16b, v1.16b, v3.16b | ||
305 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
306 | #else | ||
307 | ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ | ||
308 | dup v7.4s, w5 /* low 32 bits of ctr in every lane */ | ||
309 | mov v0.16b, v4.16b /* v0 = ctr */ | ||
310 | add v7.4s, v7.4s, v8.4s /* lanes 0-2 = ctr + 1, + 2, + 3 */ | ||
311 | mov v1.16b, v4.16b | ||
312 | rev32 v8.16b, v7.16b /* byte swap each word back to big endian */ | ||
313 | mov v2.16b, v4.16b | ||
314 | mov v3.16b, v4.16b | ||
315 | mov v1.s[3], v8.s[0] /* v1 = ctr + 1 (only the last word changes) */ | ||
316 | mov v2.s[3], v8.s[1] /* v2 = ctr + 2 */ | ||
317 | mov v3.s[3], v8.s[2] /* v3 = ctr + 3 */ | ||
318 | ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ | ||
319 | do_encrypt_block4x | ||
320 | eor v0.16b, v5.16b, v0.16b | ||
321 | ld1 {v5.16b}, [x1], #16 /* get 1 input block */ | ||
322 | eor v1.16b, v6.16b, v1.16b | ||
323 | eor v2.16b, v7.16b, v2.16b | ||
324 | eor v3.16b, v5.16b, v3.16b | ||
325 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
326 | add x5, x5, #INTERLEAVE /* advance the ctr past the blocks just done */ | ||
327 | #endif | ||
328 | cbz w4, .LctroutNx | ||
329 | .LctrincNx: | ||
330 | rev x7, x5 | ||
331 | ins v4.d[1], x7 /* store the updated counter back into v4 */ | ||
332 | b .LctrloopNx | ||
333 | .LctroutNx: | ||
334 | sub x5, x5, #1 /* step back so v4 is left holding the last used ctr */ | ||
335 | rev x7, x5 | ||
336 | ins v4.d[1], x7 | ||
337 | b .Lctrout | ||
338 | .Lctr1x: | ||
339 | adds w4, w4, #INTERLEAVE /* undo the subtraction; zero means nothing left */ | ||
340 | beq .Lctrout | ||
341 | #endif | ||
342 | .Lctrloop: | ||
343 | mov v0.16b, v4.16b | ||
344 | encrypt_block v0, w3, x2, x6, w7 /* v0 = keystream block for the current ctr */ | ||
345 | subs w4, w4, #1 | ||
346 | bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ | ||
347 | ld1 {v3.16b}, [x1], #16 | ||
348 | eor v3.16b, v0.16b, v3.16b | ||
349 | st1 {v3.16b}, [x0], #16 | ||
350 | beq .Lctrout /* flags are still those of the subs above */ | ||
351 | .Lctrinc: | ||
352 | adds x5, x5, #1 /* increment BE ctr */ | ||
353 | rev x7, x5 | ||
354 | ins v4.d[1], x7 | ||
355 | bcc .Lctrloop /* no overflow? */ | ||
356 | umov x7, v4.d[0] /* load upper word of ctr */ | ||
357 | rev x7, x7 /* ... to handle the carry */ | ||
358 | add x7, x7, #1 | ||
359 | rev x7, x7 | ||
360 | ins v4.d[0], x7 | ||
361 | b .Lctrloop | ||
362 | .Lctrhalfblock: | ||
363 | ld1 {v3.8b}, [x1] /* only 8 bytes are read, xored and written here */ | ||
364 | eor v3.8b, v0.8b, v3.8b | ||
365 | st1 {v3.8b}, [x0] | ||
366 | .Lctrout: | ||
367 | FRAME_POP | ||
368 | ret | ||
369 | AES_ENDPROC(aes_ctr_encrypt) | ||
370 | .ltorg | ||
371 | |||
372 | |||
373 | /* | ||
374 | * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
375 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
376 | * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
377 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
378 | */ | ||
379 | |||
380 | .macro next_tweak, out, in, const, tmp /* \out = \in doubled as a 128-bit value, with conditional xor of \const; clobbers \tmp */ | ||
381 | sshr \tmp\().2d, \in\().2d, #63 /* all-ones in each 64-bit half whose top bit is set */ | ||
382 | and \tmp\().16b, \tmp\().16b, \const\().16b /* keep the matching constant for those halves */ | ||
383 | add \out\().2d, \in\().2d, \in\().2d /* shift both halves left by one bit */ | ||
384 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 /* swap the halves so each correction lands in the other half */ | ||
385 | eor \out\().16b, \out\().16b, \tmp\().16b /* carry into the upper half, reduction into the lower half */ | ||
386 | .endm | ||
387 | |||
388 | .Lxts_mul_x: /* constant loaded into q7 and passed to next_tweak as \const */ | ||
389 | .word 1, 0, 0x87, 0 /* as two 64-bit lanes: 1 (carry bit) and 0x87 (reduction value) */ | ||
390 | |||
391 | AES_ENTRY(aes_xts_encrypt) /* XTS: x0=out, x1=in, x2=rk1, w3=rounds, w4=blocks, x5=rk2, x6=iv, w7=first */ | ||
392 | FRAME_PUSH | ||
393 | cbz w7, .LxtsencloopNx /* not the first call: continue from the tweak left in v4 */ | ||
394 | |||
395 | ld1 {v4.16b}, [x6] /* load the iv; x6 and w7 are used as scratch below */ | ||
396 | enc_prepare w3, x5, x6 | ||
397 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
398 | enc_switch_key w3, x2, x6 /* continue with the data key rk1 */ | ||
399 | ldr q7, .Lxts_mul_x | ||
400 | b .LxtsencNx | ||
401 | |||
402 | .LxtsencloopNx: | ||
403 | ldr q7, .Lxts_mul_x /* (re)load the constant; the 4x path overwrites v7 */ | ||
404 | next_tweak v4, v4, v7, v8 | ||
405 | .LxtsencNx: | ||
406 | #if INTERLEAVE >= 2 | ||
407 | subs w4, w4, #INTERLEAVE | ||
408 | bmi .Lxtsenc1x /* fewer than INTERLEAVE blocks left */ | ||
409 | #if INTERLEAVE == 2 | ||
410 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
411 | next_tweak v5, v4, v7, v8 /* v5 = tweak for the second block */ | ||
412 | eor v0.16b, v0.16b, v4.16b /* xor with the tweak before encryption */ | ||
413 | eor v1.16b, v1.16b, v5.16b | ||
414 | do_encrypt_block2x | ||
415 | eor v0.16b, v0.16b, v4.16b /* ... and again afterwards */ | ||
416 | eor v1.16b, v1.16b, v5.16b | ||
417 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
418 | cbz w4, .LxtsencoutNx | ||
419 | next_tweak v4, v5, v7, v8 | ||
420 | b .LxtsencNx | ||
421 | .LxtsencoutNx: | ||
422 | mov v4.16b, v5.16b /* leave the last used tweak in v4 */ | ||
423 | b .Lxtsencout | ||
424 | #else | ||
425 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
426 | next_tweak v5, v4, v7, v8 | ||
427 | eor v0.16b, v0.16b, v4.16b | ||
428 | next_tweak v6, v5, v7, v8 | ||
429 | eor v1.16b, v1.16b, v5.16b | ||
430 | eor v2.16b, v2.16b, v6.16b | ||
431 | next_tweak v7, v6, v7, v8 /* fourth tweak replaces the constant in v7 */ | ||
432 | eor v3.16b, v3.16b, v7.16b | ||
433 | do_encrypt_block4x | ||
434 | eor v3.16b, v3.16b, v7.16b | ||
435 | eor v0.16b, v0.16b, v4.16b | ||
436 | eor v1.16b, v1.16b, v5.16b | ||
437 | eor v2.16b, v2.16b, v6.16b | ||
438 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
439 | mov v4.16b, v7.16b /* leave the last used tweak in v4 */ | ||
440 | cbz w4, .Lxtsencout | ||
441 | b .LxtsencloopNx | ||
442 | #endif | ||
443 | .Lxtsenc1x: | ||
444 | adds w4, w4, #INTERLEAVE /* undo the subtraction; zero means nothing left */ | ||
445 | beq .Lxtsencout | ||
446 | #endif | ||
447 | .Lxtsencloop: | ||
448 | ld1 {v1.16b}, [x1], #16 | ||
449 | eor v0.16b, v1.16b, v4.16b | ||
450 | encrypt_block v0, w3, x2, x6, w7 | ||
451 | eor v0.16b, v0.16b, v4.16b | ||
452 | st1 {v0.16b}, [x0], #16 | ||
453 | subs w4, w4, #1 | ||
454 | beq .Lxtsencout /* exit before advancing: v4 keeps the last used tweak */ | ||
455 | next_tweak v4, v4, v7, v8 | ||
456 | b .Lxtsencloop | ||
457 | .Lxtsencout: | ||
458 | FRAME_POP | ||
459 | ret | ||
460 | AES_ENDPROC(aes_xts_encrypt) | ||
461 | |||
462 | |||
463 | AES_ENTRY(aes_xts_decrypt) /* XTS: x0=out, x1=in, x2=rk1, w3=rounds, w4=blocks, x5=rk2, x6=iv, w7=first */ | ||
464 | FRAME_PUSH | ||
465 | cbz w7, .LxtsdecloopNx /* not the first call: continue from the tweak left in v4 */ | ||
466 | |||
467 | ld1 {v4.16b}, [x6] /* load the iv; x6 and w7 are used as scratch below */ | ||
468 | enc_prepare w3, x5, x6 /* the tweak is always encrypted, also when decrypting */ | ||
469 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
470 | dec_prepare w3, x2, x6 /* set up for decryption with the data key rk1 */ | ||
471 | ldr q7, .Lxts_mul_x | ||
472 | b .LxtsdecNx | ||
473 | |||
474 | .LxtsdecloopNx: | ||
475 | ldr q7, .Lxts_mul_x /* (re)load the constant; the 4x path overwrites v7 */ | ||
476 | next_tweak v4, v4, v7, v8 | ||
477 | .LxtsdecNx: | ||
478 | #if INTERLEAVE >= 2 | ||
479 | subs w4, w4, #INTERLEAVE | ||
480 | bmi .Lxtsdec1x /* fewer than INTERLEAVE blocks left */ | ||
481 | #if INTERLEAVE == 2 | ||
482 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
483 | next_tweak v5, v4, v7, v8 /* v5 = tweak for the second block */ | ||
484 | eor v0.16b, v0.16b, v4.16b /* xor with the tweak before decryption */ | ||
485 | eor v1.16b, v1.16b, v5.16b | ||
486 | do_decrypt_block2x | ||
487 | eor v0.16b, v0.16b, v4.16b /* ... and again afterwards */ | ||
488 | eor v1.16b, v1.16b, v5.16b | ||
489 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
490 | cbz w4, .LxtsdecoutNx | ||
491 | next_tweak v4, v5, v7, v8 | ||
492 | b .LxtsdecNx | ||
493 | .LxtsdecoutNx: | ||
494 | mov v4.16b, v5.16b /* leave the last used tweak in v4 */ | ||
495 | b .Lxtsdecout | ||
496 | #else | ||
497 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
498 | next_tweak v5, v4, v7, v8 | ||
499 | eor v0.16b, v0.16b, v4.16b | ||
500 | next_tweak v6, v5, v7, v8 | ||
501 | eor v1.16b, v1.16b, v5.16b | ||
502 | eor v2.16b, v2.16b, v6.16b | ||
503 | next_tweak v7, v6, v7, v8 /* fourth tweak replaces the constant in v7 */ | ||
504 | eor v3.16b, v3.16b, v7.16b | ||
505 | do_decrypt_block4x | ||
506 | eor v3.16b, v3.16b, v7.16b | ||
507 | eor v0.16b, v0.16b, v4.16b | ||
508 | eor v1.16b, v1.16b, v5.16b | ||
509 | eor v2.16b, v2.16b, v6.16b | ||
510 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
511 | mov v4.16b, v7.16b /* leave the last used tweak in v4 */ | ||
512 | cbz w4, .Lxtsdecout | ||
513 | b .LxtsdecloopNx | ||
514 | #endif | ||
515 | .Lxtsdec1x: | ||
516 | adds w4, w4, #INTERLEAVE /* undo the subtraction; zero means nothing left */ | ||
517 | beq .Lxtsdecout | ||
518 | #endif | ||
519 | .Lxtsdecloop: | ||
520 | ld1 {v1.16b}, [x1], #16 | ||
521 | eor v0.16b, v1.16b, v4.16b | ||
522 | decrypt_block v0, w3, x2, x6, w7 | ||
523 | eor v0.16b, v0.16b, v4.16b | ||
524 | st1 {v0.16b}, [x0], #16 | ||
525 | subs w4, w4, #1 | ||
526 | beq .Lxtsdecout /* exit before advancing: v4 keeps the last used tweak */ | ||
527 | next_tweak v4, v4, v7, v8 | ||
528 | b .Lxtsdecloop | ||
529 | .Lxtsdecout: | ||
530 | FRAME_POP | ||
531 | ret | ||
532 | AES_ENDPROC(aes_xts_decrypt) | ||
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 000000000000..b93170e1cc93 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S | |||
@@ -0,0 +1,382 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | |||
13 | #define AES_ENTRY(func) ENTRY(neon_ ## func) | ||
14 | #define AES_ENDPROC(func) ENDPROC(neon_ ## func) | ||
15 | |||
16 | /* multiply by polynomial 'x' in GF(2^8) */ | ||
17 | .macro mul_by_x, out, in, temp, const /* operands include their arrangement; callers pass v14.16b (0x1b per byte) as \const */ | ||
18 | sshr \temp, \in, #7 /* all-ones in every lane whose bit 7 is set, else zero */ | ||
19 | add \out, \in, \in /* double each lane, i.e. shift left by one */ | ||
20 | and \temp, \temp, \const /* reduction constant only for lanes that overflowed */ | ||
21 | eor \out, \out, \temp /* apply the reduction */ | ||
22 | .endm | ||
23 | |||
24 | /* preload the entire Sbox */ | ||
25 | .macro prepare, sbox, shiftrows, temp /* loads the constants v12-v14 and the 256-byte table v16-v31; clobbers \temp */ | ||
26 | adr \temp, \sbox /* \temp = address of the table */ | ||
27 | movi v12.16b, #0x40 /* v12 = 0x40 per byte, the index step between 64-byte table quarters */ | ||
28 | ldr q13, \shiftrows /* v13 = ShiftRows byte permutation for tbl */ | ||
29 | movi v14.16b, #0x1b /* v14 = 0x1b per byte, the constant handed to mul_by_x */ | ||
30 | ld1 {v16.16b-v19.16b}, [\temp], #64 /* table bytes 0x00-0x3f */ | ||
31 | ld1 {v20.16b-v23.16b}, [\temp], #64 /* table bytes 0x40-0x7f */ | ||
32 | ld1 {v24.16b-v27.16b}, [\temp], #64 /* table bytes 0x80-0xbf */ | ||
33 | ld1 {v28.16b-v31.16b}, [\temp] /* table bytes 0xc0-0xff */ | ||
34 | .endm | ||
35 | |||
36 | /* do preload for encryption */ | ||
37 | .macro enc_prepare, ignore0, ignore1, temp /* first two arguments are unused in this NEON version */ | ||
38 | prepare .LForward_Sbox, .LForward_ShiftRows, \temp | ||
39 | .endm | ||
40 | |||
41 | .macro enc_switch_key, ignore0, ignore1, temp /* no-op here: do_block reads the round keys from memory on every call */ | ||
42 | /* do nothing */ | ||
43 | .endm | ||
44 | |||
45 | /* do preload for decryption */ | ||
46 | .macro dec_prepare, ignore0, ignore1, temp /* first two arguments are unused in this NEON version */ | ||
47 | prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp | ||
48 | .endm | ||
49 | |||
50 | /* apply SubBytes transformation using the preloaded Sbox */ | ||
51 | .macro sub_bytes, in /* \in is a bare register name; clobbers v9-v11, needs v12 and v16-v31 from 'prepare' */ | ||
52 | sub v9.16b, \in\().16b, v12.16b /* index - 0x40 */ | ||
53 | tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b /* look up indexes 0x00-0x3f; tbl zeroes lanes that are out of range */ | ||
54 | sub v10.16b, v9.16b, v12.16b /* index - 0x80 */ | ||
55 | tbx \in\().16b, {v20.16b-v23.16b}, v9.16b /* indexes 0x40-0x7f; tbx leaves out-of-range lanes untouched */ | ||
56 | sub v11.16b, v10.16b, v12.16b /* index - 0xc0 */ | ||
57 | tbx \in\().16b, {v24.16b-v27.16b}, v10.16b /* indexes 0x80-0xbf */ | ||
58 | tbx \in\().16b, {v28.16b-v31.16b}, v11.16b /* indexes 0xc0-0xff */ | ||
59 | .endm | ||
60 | |||
61 | /* apply MixColumns transformation */ | ||
62 | .macro mix_columns, in /* \in is a bare register name; clobbers v8-v11, needs v14 = 0x1b per byte */ | ||
63 | mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b /* v10 = state times x */ | ||
64 | rev32 v8.8h, \in\().8h /* v8 = state with the halfwords of each word swapped */ | ||
65 | eor \in\().16b, v10.16b, \in\().16b /* \in = state times (x + 1) */ | ||
66 | shl v9.4s, v8.4s, #24 /* shl + sri pairs below rotate each 32-bit word right by 8 bits */ | ||
67 | shl v11.4s, \in\().4s, #24 | ||
68 | sri v9.4s, v8.4s, #8 /* v9 = v8 rotated by 8 */ | ||
69 | sri v11.4s, \in\().4s, #8 /* v11 = \in rotated by 8 */ | ||
70 | eor v9.16b, v9.16b, v8.16b | ||
71 | eor v10.16b, v10.16b, v9.16b | ||
72 | eor \in\().16b, v10.16b, v11.16b /* combine the four terms into the result */ | ||
73 | .endm | ||
74 | |||
75 | /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ | ||
76 | .macro inv_mix_columns, in /* \in is a bare register name; clobbers v8-v11 (v8/v9 via mix_columns) */ | ||
77 | mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b /* v11 = state times x */ | ||
78 | mul_by_x v11.16b, v11.16b, v10.16b, v14.16b /* v11 = state times x^2, i.e. times 4 */ | ||
79 | eor \in\().16b, \in\().16b, v11.16b /* \in = state times 5 */ | ||
80 | rev32 v11.8h, v11.8h /* move the times-4 term two bytes along within each word */ | ||
81 | eor \in\().16b, \in\().16b, v11.16b /* add it: the { 5, 0, 4, 0 } pre-multiplication is complete */ | ||
82 | mix_columns \in /* then the forward transformation */ | ||
83 | .endm | ||
84 | |||
85 | .macro do_block, enc, in, rounds, rk, rkp, i /* run \rounds rounds on \in; \enc == 1 encrypts, else decrypts; clobbers \rkp, \i, v8-v11, v15 */ | ||
86 | ld1 {v15.16b}, [\rk] /* v15 = first round key */ | ||
87 | add \rkp, \rk, #16 /* \rkp walks the remaining round keys */ | ||
88 | mov \i, \rounds | ||
89 | 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
90 | tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ | ||
91 | sub_bytes \in | ||
92 | ld1 {v15.16b}, [\rkp], #16 /* fetch the next round key */ | ||
93 | subs \i, \i, #1 | ||
94 | beq 2222f /* the last round has no MixColumns step */ | ||
95 | .if \enc == 1 | ||
96 | mix_columns \in | ||
97 | .else | ||
98 | inv_mix_columns \in | ||
99 | .endif | ||
100 | b 1111b | ||
101 | 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
102 | .endm | ||
103 | |||
104 | .macro encrypt_block, in, rounds, rk, rkp, i /* single-block encryption: do_block with enc = 1 */ | ||
105 | do_block 1, \in, \rounds, \rk, \rkp, \i | ||
106 | .endm | ||
107 | |||
108 | .macro decrypt_block, in, rounds, rk, rkp, i /* single-block decryption: do_block with enc = 0 */ | ||
109 | do_block 0, \in, \rounds, \rk, \rkp, \i | ||
110 | .endm | ||
111 | |||
112 | /* | ||
113 | * Interleaved versions: functionally equivalent to the | ||
114 | * ones above, but applied to 2 or 4 AES states in parallel. | ||
115 | */ | ||
116 | |||
117 | .macro sub_bytes_2x, in0, in1 /* sub_bytes on two states at once; clobbers v8-v11, needs v12 = 0x40 per byte */ | ||
118 | sub v8.16b, \in0\().16b, v12.16b /* index - 0x40 for both states */ | ||
119 | sub v9.16b, \in1\().16b, v12.16b | ||
120 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b /* table bytes 0x00-0x3f */ | ||
121 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
122 | sub v10.16b, v8.16b, v12.16b /* index - 0x80 */ | ||
123 | sub v11.16b, v9.16b, v12.16b | ||
124 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b /* table bytes 0x40-0x7f */ | ||
125 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
126 | sub v8.16b, v10.16b, v12.16b /* index - 0xc0 */ | ||
127 | sub v9.16b, v11.16b, v12.16b | ||
128 | tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b /* table bytes 0x80-0xbf */ | ||
129 | tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b | ||
130 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b /* table bytes 0xc0-0xff */ | ||
131 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
132 | .endm | ||
133 | |||
134 | .macro sub_bytes_4x, in0, in1, in2, in3 /* sub_bytes on four states at once; clobbers v8-v11, needs v12 = 0x40 per byte */ | ||
135 | sub v8.16b, \in0\().16b, v12.16b /* v8-v11 = index - 0x40, one register per state */ | ||
136 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b /* table bytes 0x00-0x3f */ | ||
137 | sub v9.16b, \in1\().16b, v12.16b | ||
138 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
139 | sub v10.16b, \in2\().16b, v12.16b | ||
140 | tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b | ||
141 | sub v11.16b, \in3\().16b, v12.16b | ||
142 | tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b | ||
143 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b /* table bytes 0x40-0x7f */ | ||
144 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
145 | sub v8.16b, v8.16b, v12.16b /* each index register is stepped down by 0x40 right after its use */ | ||
146 | tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b | ||
147 | sub v9.16b, v9.16b, v12.16b | ||
148 | tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b | ||
149 | sub v10.16b, v10.16b, v12.16b | ||
150 | tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b /* table bytes 0x80-0xbf */ | ||
151 | sub v11.16b, v11.16b, v12.16b | ||
152 | tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b | ||
153 | sub v8.16b, v8.16b, v12.16b | ||
154 | tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b | ||
155 | sub v9.16b, v9.16b, v12.16b | ||
156 | tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b | ||
157 | sub v10.16b, v10.16b, v12.16b | ||
158 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b /* table bytes 0xc0-0xff */ | ||
159 | sub v11.16b, v11.16b, v12.16b | ||
160 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
161 | tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b | ||
162 | tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b | ||
163 | .endm | ||
164 | |||
165 | .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const /* two-state mul_by_x; all operands are bare register names */ | ||
166 | sshr \tmp0\().16b, \in0\().16b, #7 /* all-ones in bytes whose top bit is set */ | ||
167 | add \out0\().16b, \in0\().16b, \in0\().16b /* double every byte */ | ||
168 | sshr \tmp1\().16b, \in1\().16b, #7 | ||
169 | and \tmp0\().16b, \tmp0\().16b, \const\().16b /* reduction constant for bytes that overflowed */ | ||
170 | add \out1\().16b, \in1\().16b, \in1\().16b | ||
171 | and \tmp1\().16b, \tmp1\().16b, \const\().16b | ||
172 | eor \out0\().16b, \out0\().16b, \tmp0\().16b /* apply the reduction */ | ||
173 | eor \out1\().16b, \out1\().16b, \tmp1\().16b | ||
174 | .endm | ||
175 | |||
176 | .macro mix_columns_2x, in0, in1 /* MixColumns on two states; clobbers v8-v13, so v12 (0x40) and v13 (ShiftRows) must be reloaded afterwards */ | ||
177 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 /* v8, v9 = states times x */ | ||
178 | rev32 v10.8h, \in0\().8h /* v10, v11 = states with the halfwords of each word swapped */ | ||
179 | rev32 v11.8h, \in1\().8h | ||
180 | eor \in0\().16b, v8.16b, \in0\().16b /* \in0, \in1 = states times (x + 1) */ | ||
181 | eor \in1\().16b, v9.16b, \in1\().16b | ||
182 | shl v12.4s, v10.4s, #24 /* shl + sri pairs rotate each 32-bit word right by 8 bits */ | ||
183 | shl v13.4s, v11.4s, #24 | ||
184 | eor v8.16b, v8.16b, v10.16b | ||
185 | sri v12.4s, v10.4s, #8 /* v12 = v10 rotated by 8 */ | ||
186 | shl v10.4s, \in0\().4s, #24 | ||
187 | eor v9.16b, v9.16b, v11.16b | ||
188 | sri v13.4s, v11.4s, #8 /* v13 = v11 rotated by 8 */ | ||
189 | shl v11.4s, \in1\().4s, #24 | ||
190 | sri v10.4s, \in0\().4s, #8 /* v10 = \in0 rotated by 8 */ | ||
191 | eor \in0\().16b, v8.16b, v12.16b | ||
192 | sri v11.4s, \in1\().4s, #8 /* v11 = \in1 rotated by 8 */ | ||
193 | eor \in1\().16b, v9.16b, v13.16b | ||
194 | eor \in0\().16b, v10.16b, \in0\().16b /* final combination for each state */ | ||
195 | eor \in1\().16b, v11.16b, \in1\().16b | ||
196 | .endm | ||
197 | |||
198 | .macro inv_mix_cols_2x, in0, in1 /* two-state inv_mix_columns; clobbers v8-v13 (v12/v13 via mix_columns_2x) */ | ||
199 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 /* v8, v9 = states times x */ | ||
200 | mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 /* v8, v9 = states times 4 */ | ||
201 | eor \in0\().16b, \in0\().16b, v8.16b /* states times 5 */ | ||
202 | eor \in1\().16b, \in1\().16b, v9.16b | ||
203 | rev32 v8.8h, v8.8h /* move the times-4 terms two bytes along within each word */ | ||
204 | rev32 v9.8h, v9.8h | ||
205 | eor \in0\().16b, \in0\().16b, v8.16b /* pre-multiplication complete */ | ||
206 | eor \in1\().16b, \in1\().16b, v9.16b | ||
207 | mix_columns_2x \in0, \in1 /* then the forward transformation */ | ||
208 | .endm | ||
209 | |||
210 | .macro inv_mix_cols_4x, in0, in1, in2, in3 /* four-state inv_mix_columns; clobbers v8-v13 */ | ||
211 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 /* v8, v9 = first two states times x */ | ||
212 | mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 /* v10, v11 = last two states times x; v12/v13 are scratch here */ | ||
213 | mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 /* times 4 */ | ||
214 | mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 | ||
215 | eor \in0\().16b, \in0\().16b, v8.16b /* states times 5 */ | ||
216 | eor \in1\().16b, \in1\().16b, v9.16b | ||
217 | eor \in2\().16b, \in2\().16b, v10.16b | ||
218 | eor \in3\().16b, \in3\().16b, v11.16b | ||
219 | rev32 v8.8h, v8.8h /* move the times-4 terms two bytes along within each word */ | ||
220 | rev32 v9.8h, v9.8h | ||
221 | rev32 v10.8h, v10.8h | ||
222 | rev32 v11.8h, v11.8h | ||
223 | eor \in0\().16b, \in0\().16b, v8.16b /* pre-multiplication complete */ | ||
224 | eor \in1\().16b, \in1\().16b, v9.16b | ||
225 | eor \in2\().16b, \in2\().16b, v10.16b | ||
226 | eor \in3\().16b, \in3\().16b, v11.16b | ||
227 | mix_columns_2x \in0, \in1 /* then the forward transformation, two states at a time */ | ||
228 | mix_columns_2x \in2, \in3 | ||
229 | .endm | ||
230 | |||
231 | .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i /* NOTE: no comma between in1 and rounds; gas also accepts blank-separated parameters */ | ||
232 | ld1 {v15.16b}, [\rk] /* v15 = first round key */ | ||
233 | add \rkp, \rk, #16 /* \rkp walks the remaining round keys */ | ||
234 | mov \i, \rounds | ||
235 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
236 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
237 | sub_bytes_2x \in0, \in1 | ||
238 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
239 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
240 | ld1 {v15.16b}, [\rkp], #16 /* fetch the next round key */ | ||
241 | subs \i, \i, #1 | ||
242 | beq 2222f /* the last round has no MixColumns step */ | ||
243 | .if \enc == 1 | ||
244 | mix_columns_2x \in0, \in1 | ||
245 | ldr q13, .LForward_ShiftRows /* restore v13, clobbered by mix_columns_2x */ | ||
246 | .else | ||
247 | inv_mix_cols_2x \in0, \in1 | ||
248 | ldr q13, .LReverse_ShiftRows /* restore v13, clobbered by mix_columns_2x */ | ||
249 | .endif | ||
250 | movi v12.16b, #0x40 /* restore v12, clobbered as well */ | ||
251 | b 1111b | ||
252 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
253 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
254 | .endm | ||
255 | |||
256 | .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i /* four states in parallel; \enc == 1 encrypts, else decrypts */ | ||
257 | ld1 {v15.16b}, [\rk] /* v15 = first round key */ | ||
258 | add \rkp, \rk, #16 /* \rkp walks the remaining round keys */ | ||
259 | mov \i, \rounds | ||
260 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
261 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
262 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
263 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
264 | sub_bytes_4x \in0, \in1, \in2, \in3 | ||
265 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
266 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
267 | tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ | ||
268 | tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ | ||
269 | ld1 {v15.16b}, [\rkp], #16 /* fetch the next round key */ | ||
270 | subs \i, \i, #1 | ||
271 | beq 2222f /* the last round has no MixColumns step */ | ||
272 | .if \enc == 1 | ||
273 | mix_columns_2x \in0, \in1 | ||
274 | mix_columns_2x \in2, \in3 | ||
275 | ldr q13, .LForward_ShiftRows /* restore v13, clobbered by mix_columns_2x */ | ||
276 | .else | ||
277 | inv_mix_cols_4x \in0, \in1, \in2, \in3 | ||
278 | ldr q13, .LReverse_ShiftRows /* restore v13, clobbered by inv_mix_cols_4x */ | ||
279 | .endif | ||
280 | movi v12.16b, #0x40 /* restore v12, clobbered as well */ | ||
281 | b 1111b | ||
282 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
283 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
284 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
285 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
286 | .endm | ||
287 | |||
288 | .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i /* two-block encryption: do_block_2x with enc = 1 */ | ||
289 | do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i | ||
290 | .endm | ||
291 | |||
292 | .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i /* two-block decryption: do_block_2x with enc = 0 */ | ||
293 | do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i | ||
294 | .endm | ||
295 | |||
296 | .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i /* four-block encryption: do_block_4x with enc = 1 */ | ||
297 | do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
298 | .endm | ||
299 | |||
300 | .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i /* four-block decryption: do_block_4x with enc = 0 */ | ||
301 | do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
302 | .endm | ||
303 | |||
304 | #include "aes-modes.S" | ||
305 | |||
306 | .text | ||
307 | .align 4 | ||
308 | .LForward_ShiftRows: | ||
309 | .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 | ||
310 | .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb | ||
311 | |||
312 | .LReverse_ShiftRows: | ||
313 | .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb | ||
314 | .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 | ||
315 | |||
316 | .LForward_Sbox: | ||
317 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
318 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
319 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
320 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
321 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
322 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
323 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
324 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
325 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
326 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
327 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
328 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
329 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
330 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
331 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
332 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
333 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
334 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
335 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
336 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
337 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
338 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
339 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
340 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
341 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
342 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
343 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
344 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
345 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
346 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
347 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
348 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
349 | |||
350 | .LReverse_Sbox: | ||
351 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
352 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
353 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
354 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
355 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
356 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
357 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
358 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
359 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
360 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
361 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
362 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
363 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
364 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
365 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
366 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
367 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
368 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
369 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
370 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
371 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
372 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
373 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
374 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
375 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
376 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
377 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
378 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
379 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
380 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
381 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
382 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 000000000000..b9e6eaf41c9b --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S | ||
7 | * | ||
8 | * Copyright (c) 2009 Intel Corp. | ||
9 | * Author: Huang Ying <ying.huang@intel.com> | ||
10 | * Vinodh Gopal | ||
11 | * Erdinc Ozturk | ||
12 | * Deniz Karakoyunlu | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License version 2 as published | ||
16 | * by the Free Software Foundation. | ||
17 | */ | ||
18 | |||
19 | #include <linux/linkage.h> | ||
20 | #include <asm/assembler.h> | ||
21 | |||
22 | DATA .req v0 /* running digest */ | ||
23 | SHASH .req v1 /* hash key loaded from *k */ | ||
24 | IN1 .req v2 /* current input block; same register as T1 */ | ||
25 | T1 .req v2 /* upper half of the product; reuses v2 once IN1 has been consumed */ | ||
26 | T2 .req v3 /* scratch */ | ||
27 | T3 .req v4 /* scratch */ | ||
28 | VZR .req v5 /* cleared to zero at function entry */ | ||
29 | |||
30 | .text | ||
31 | .arch armv8-a+crypto | ||
32 | |||
33 | /* | ||
34 | * void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
35 | * struct ghash_key const *k, const char *head) | ||
36 | */ | ||
37 | ENTRY(pmull_ghash_update) /* w0=blocks, x1=dg, x2=src, x3=k, x4=head (may be NULL) */ | ||
38 | ld1 {DATA.16b}, [x1] /* load the current digest */ | ||
39 | ld1 {SHASH.16b}, [x3] /* load the key */ | ||
40 | eor VZR.16b, VZR.16b, VZR.16b /* VZR = 0 */ | ||
41 | |||
42 | /* do the head block first, if supplied */ | ||
43 | cbz x4, 0f | ||
44 | ld1 {IN1.2d}, [x4] /* the head block is not counted in w0 */ | ||
45 | b 1f | ||
46 | |||
47 | 0: ld1 {IN1.2d}, [x2], #16 /* next block from src */ | ||
48 | sub w0, w0, #1 | ||
49 | 1: ext IN1.16b, IN1.16b, IN1.16b, #8 /* swap the two 64-bit halves */ | ||
50 | CPU_LE( rev64 IN1.16b, IN1.16b ) | ||
51 | eor DATA.16b, DATA.16b, IN1.16b /* fold the block into the digest */ | ||
52 | |||
53 | /* multiply DATA by SHASH in GF(2^128) */ | ||
54 | ext T2.16b, DATA.16b, DATA.16b, #8 | ||
55 | ext T3.16b, SHASH.16b, SHASH.16b, #8 | ||
56 | eor T2.16b, T2.16b, DATA.16b /* a1 + a0 in both halves */ | ||
57 | eor T3.16b, T3.16b, SHASH.16b /* b1 + b0 in both halves */ | ||
58 | |||
59 | pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 | ||
60 | pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 | ||
61 | pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) | ||
62 | eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) | ||
63 | eor T2.16b, T2.16b, DATA.16b | ||
64 | |||
65 | ext T3.16b, VZR.16b, T2.16b, #8 /* low half of the middle term, moved up */ | ||
66 | ext T2.16b, T2.16b, VZR.16b, #8 /* high half of the middle term, moved down */ | ||
67 | eor DATA.16b, DATA.16b, T3.16b | ||
68 | eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of | ||
69 | // carry-less multiplication | ||
70 | |||
71 | /* first phase of the reduction */ | ||
72 | shl T3.2d, DATA.2d, #1 | ||
73 | eor T3.16b, T3.16b, DATA.16b | ||
74 | shl T3.2d, T3.2d, #5 | ||
75 | eor T3.16b, T3.16b, DATA.16b | ||
76 | shl T3.2d, T3.2d, #57 | ||
77 | ext T2.16b, VZR.16b, T3.16b, #8 | ||
78 | ext T3.16b, T3.16b, VZR.16b, #8 | ||
79 | eor DATA.16b, DATA.16b, T2.16b | ||
80 | eor T1.16b, T1.16b, T3.16b | ||
81 | |||
82 | /* second phase of the reduction */ | ||
83 | ushr T2.2d, DATA.2d, #5 | ||
84 | eor T2.16b, T2.16b, DATA.16b | ||
85 | ushr T2.2d, T2.2d, #1 | ||
86 | eor T2.16b, T2.16b, DATA.16b | ||
87 | ushr T2.2d, T2.2d, #1 | ||
88 | eor T1.16b, T1.16b, T2.16b | ||
89 | eor DATA.16b, DATA.16b, T1.16b /* DATA = reduced 128-bit result */ | ||
90 | |||
91 | cbnz w0, 0b /* more blocks to take from src? */ | ||
92 | |||
93 | st1 {DATA.16b}, [x1] /* write the updated digest back */ | ||
94 | ret | ||
95 | ENDPROC(pmull_ghash_update) | ||
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 000000000000..b92baf3f68c7 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of the GNU General Public License version 2 as published | ||
8 | * by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <linux/cpufeature.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/module.h> | ||
17 | |||
18 | MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); | ||
19 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
20 | MODULE_LICENSE("GPL v2"); | ||
21 | |||
22 | #define GHASH_BLOCK_SIZE 16 | ||
23 | #define GHASH_DIGEST_SIZE 16 | ||
24 | |||
25 | struct ghash_key { /* per-transform key state, filled in by ghash_setkey() */ | ||
26 | u64 a; /* derived from the last 8 key bytes (see ghash_setkey) */ | ||
27 | u64 b; /* derived from the first 8 key bytes (see ghash_setkey) */ | ||
28 | }; | ||
29 | |||
30 | struct ghash_desc_ctx { /* per-request hashing state */ | ||
31 | u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; /* running digest, updated by pmull_ghash_update() */ | ||
32 | u8 buf[GHASH_BLOCK_SIZE]; /* input bytes that do not yet form a full block */ | ||
33 | u32 count; /* total bytes fed in; count % GHASH_BLOCK_SIZE is the fill level of buf */ | ||
34 | }; | ||
35 | |||
36 | asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
37 | struct ghash_key const *k, const char *head); | ||
38 | |||
39 | static int ghash_init(struct shash_desc *desc) /* start a new hash; always returns 0 */ | ||
40 | { | ||
41 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
42 | |||
43 | *ctx = (struct ghash_desc_ctx){}; /* zero the digest, the buffer and the byte count */ | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static int ghash_update(struct shash_desc *desc, const u8 *src, | ||
48 | unsigned int len) /* absorb len bytes from src into the hash state; always returns 0 */ | ||
49 | { | ||
50 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
51 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; /* bytes already waiting in ctx->buf */ | ||
52 | |||
53 | ctx->count += len; | ||
54 | |||
55 | if ((partial + len) >= GHASH_BLOCK_SIZE) { /* at least one full block is available */ | ||
56 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
57 | int blocks; | ||
58 | |||
59 | if (partial) { | ||
60 | int p = GHASH_BLOCK_SIZE - partial; /* bytes needed to complete the buffered block */ | ||
61 | |||
62 | memcpy(ctx->buf + partial, src, p); | ||
63 | src += p; | ||
64 | len -= p; | ||
65 | } | ||
66 | |||
67 | blocks = len / GHASH_BLOCK_SIZE; /* whole blocks taken straight from src */ | ||
68 | len %= GHASH_BLOCK_SIZE; /* tail bytes to buffer afterwards */ | ||
69 | kernel_neon_begin_partial(6); | ||
70 | pmull_ghash_update(blocks, ctx->digest, src, key, | ||
71 | partial ? ctx->buf : NULL); /* completed buffer goes first as the head block */ | ||
72 | kernel_neon_end(); | ||
73 | src += blocks * GHASH_BLOCK_SIZE; | ||
74 | partial = 0; /* buffer was consumed: the tail must start at buf[0], not at the old offset */ | ||
75 | } | ||
76 | if (len) | ||
77 | memcpy(ctx->buf + partial, src, len); /* stash the incomplete tail for the next call */ | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static int ghash_final(struct shash_desc *desc, u8 *dst) /* write the 16-byte digest to dst and wipe the state; always returns 0 */ | ||
82 | { | ||
83 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
84 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; /* bytes still waiting in ctx->buf */ | ||
85 | |||
86 | if (partial) { | ||
87 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
88 | |||
89 | memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); /* zero-pad the last block */ | ||
90 | |||
91 | kernel_neon_begin_partial(6); | ||
92 | pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); /* hash the padded block */ | ||
93 | kernel_neon_end(); | ||
94 | } | ||
95 | put_unaligned_be64(ctx->digest[1], dst); /* digest[1] supplies the first 8 output bytes */ | ||
96 | put_unaligned_be64(ctx->digest[0], dst + 8); /* digest[0] supplies the last 8 */ | ||
97 | |||
98 | *ctx = (struct ghash_desc_ctx){}; /* clear the state */ | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static int ghash_setkey(struct crypto_shash *tfm, | ||
103 | const u8 *inkey, unsigned int keylen) /* returns 0, or -EINVAL if keylen is not GHASH_BLOCK_SIZE */ | ||
104 | { | ||
105 | struct ghash_key *key = crypto_shash_ctx(tfm); | ||
106 | u64 a, b; | ||
107 | |||
108 | if (keylen != GHASH_BLOCK_SIZE) { | ||
109 | crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
110 | return -EINVAL; | ||
111 | } | ||
112 | |||
113 | /* perform multiplication by 'x' in GF(2^128) */ | ||
114 | b = get_unaligned_be64(inkey); /* first 8 key bytes, read as big endian */ | ||
115 | a = get_unaligned_be64(inkey + 8); /* last 8 key bytes, read as big endian */ | ||
116 | |||
117 | key->a = (a << 1) | (b >> 63); /* b:a rotated left by one bit as a 128-bit value ... */ | ||
118 | key->b = (b << 1) | (a >> 63); | ||
119 | |||
120 | if (b >> 63) /* ... and if the top bit of b was set, xor the constant into b */ | ||
121 | key->b ^= 0xc200000000000000UL; | ||
122 | |||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | static struct shash_alg ghash_alg = { /* descriptor passed to crypto_register_shash() in ghash_ce_mod_init() */ | ||
127 | .digestsize = GHASH_DIGEST_SIZE, | ||
128 | .init = ghash_init, | ||
129 | .update = ghash_update, | ||
130 | .final = ghash_final, | ||
131 | .setkey = ghash_setkey, | ||
132 | .descsize = sizeof(struct ghash_desc_ctx), /* size of the state returned by shash_desc_ctx() */ | ||
133 | .base = { | ||
134 | .cra_name = "ghash", | ||
135 | .cra_driver_name = "ghash-ce", | ||
136 | .cra_priority = 200, | ||
137 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
138 | .cra_blocksize = GHASH_BLOCK_SIZE, | ||
139 | .cra_ctxsize = sizeof(struct ghash_key), /* size of the key state returned by crypto_shash_ctx() */ | ||
140 | .cra_module = THIS_MODULE, | ||
141 | }, | ||
142 | }; | ||
143 | |||
144 | static int __init ghash_ce_mod_init(void) /* module init: returns the result of registering ghash_alg */ | ||
145 | { | ||
146 | return crypto_register_shash(&ghash_alg); | ||
147 | } | ||
148 | |||
149 | static void __exit ghash_ce_mod_exit(void) /* module exit: unregister ghash_alg */ | ||
150 | { | ||
151 | crypto_unregister_shash(&ghash_alg); | ||
152 | } | ||
153 | |||
154 | module_cpu_feature_match(PMULL, ghash_ce_mod_init); | ||
155 | module_exit(ghash_ce_mod_exit); | ||
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 000000000000..09d57d98609c --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | #include <asm/assembler.h> | ||
13 | |||
14 | .text | ||
15 | .arch armv8-a+crypto | ||
16 | |||
17 | k0 .req v0 | ||
18 | k1 .req v1 | ||
19 | k2 .req v2 | ||
20 | k3 .req v3 | ||
21 | |||
22 | t0 .req v4 | ||
23 | t1 .req v5 | ||
24 | |||
25 | dga .req q6 | ||
26 | dgav .req v6 | ||
27 | dgb .req s7 | ||
28 | dgbv .req v7 | ||
29 | |||
30 | dg0q .req q12 | ||
31 | dg0s .req s12 | ||
32 | dg0v .req v12 | ||
33 | dg1s .req s13 | ||
34 | dg1v .req v13 | ||
35 | dg2s .req s14 | ||
36 | |||
37 | .macro add_only, op, ev, rc, s0, dg1 | ||
38 | .ifc \ev, ev | ||
39 | add t1.4s, v\s0\().4s, \rc\().4s | ||
40 | sha1h dg2s, dg0s | ||
41 | .ifnb \dg1 | ||
42 | sha1\op dg0q, \dg1, t0.4s | ||
43 | .else | ||
44 | sha1\op dg0q, dg1s, t0.4s | ||
45 | .endif | ||
46 | .else | ||
47 | .ifnb \s0 | ||
48 | add t0.4s, v\s0\().4s, \rc\().4s | ||
49 | .endif | ||
50 | sha1h dg1s, dg0s | ||
51 | sha1\op dg0q, dg2s, t1.4s | ||
52 | .endif | ||
53 | .endm | ||
54 | |||
55 | .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 | ||
56 | sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s | ||
57 | add_only \op, \ev, \rc, \s1, \dg1 | ||
58 | sha1su1 v\s0\().4s, v\s3\().4s | ||
59 | .endm | ||
60 | |||
61 | /* | ||
62 | * The SHA1 round constants | ||
63 | */ | ||
64 | .align 4 | ||
65 | .Lsha1_rcon: | ||
66 | .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 | ||
67 | |||
68 | /* | ||
69 | * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
70 | * u8 *head, long bytes) | ||
71 | */ | ||
72 | ENTRY(sha1_ce_transform) | ||
73 | /* load round constants */ | ||
74 | adr x6, .Lsha1_rcon | ||
75 | ld1r {k0.4s}, [x6], #4 | ||
76 | ld1r {k1.4s}, [x6], #4 | ||
77 | ld1r {k2.4s}, [x6], #4 | ||
78 | ld1r {k3.4s}, [x6] | ||
79 | |||
80 | /* load state */ | ||
81 | ldr dga, [x2] | ||
82 | ldr dgb, [x2, #16] | ||
83 | |||
84 | /* load partial input (if supplied) */ | ||
85 | cbz x3, 0f | ||
86 | ld1 {v8.4s-v11.4s}, [x3] | ||
87 | b 1f | ||
88 | |||
89 | /* load input */ | ||
90 | 0: ld1 {v8.4s-v11.4s}, [x1], #64 | ||
91 | sub w0, w0, #1 | ||
92 | |||
93 | 1: | ||
94 | CPU_LE( rev32 v8.16b, v8.16b ) | ||
95 | CPU_LE( rev32 v9.16b, v9.16b ) | ||
96 | CPU_LE( rev32 v10.16b, v10.16b ) | ||
97 | CPU_LE( rev32 v11.16b, v11.16b ) | ||
98 | |||
99 | 2: add t0.4s, v8.4s, k0.4s | ||
100 | mov dg0v.16b, dgav.16b | ||
101 | |||
102 | add_update c, ev, k0, 8, 9, 10, 11, dgb | ||
103 | add_update c, od, k0, 9, 10, 11, 8 | ||
104 | add_update c, ev, k0, 10, 11, 8, 9 | ||
105 | add_update c, od, k0, 11, 8, 9, 10 | ||
106 | add_update c, ev, k1, 8, 9, 10, 11 | ||
107 | |||
108 | add_update p, od, k1, 9, 10, 11, 8 | ||
109 | add_update p, ev, k1, 10, 11, 8, 9 | ||
110 | add_update p, od, k1, 11, 8, 9, 10 | ||
111 | add_update p, ev, k1, 8, 9, 10, 11 | ||
112 | add_update p, od, k2, 9, 10, 11, 8 | ||
113 | |||
114 | add_update m, ev, k2, 10, 11, 8, 9 | ||
115 | add_update m, od, k2, 11, 8, 9, 10 | ||
116 | add_update m, ev, k2, 8, 9, 10, 11 | ||
117 | add_update m, od, k2, 9, 10, 11, 8 | ||
118 | add_update m, ev, k3, 10, 11, 8, 9 | ||
119 | |||
120 | add_update p, od, k3, 11, 8, 9, 10 | ||
121 | add_only p, ev, k3, 9 | ||
122 | add_only p, od, k3, 10 | ||
123 | add_only p, ev, k3, 11 | ||
124 | add_only p, od | ||
125 | |||
126 | /* update state */ | ||
127 | add dgbv.2s, dgbv.2s, dg1v.2s | ||
128 | add dgav.4s, dgav.4s, dg0v.4s | ||
129 | |||
130 | cbnz w0, 0b | ||
131 | |||
132 | /* | ||
133 | * Final block: add padding and total bit count. | ||
134 | * Skip if we have no total byte count in x4. In that case, the input | ||
135 | * size was not a round multiple of the block size, and the padding is | ||
136 | * handled by the C code. | ||
137 | */ | ||
138 | cbz x4, 3f | ||
139 | movi v9.2d, #0 | ||
140 | mov x8, #0x80000000 | ||
141 | movi v10.2d, #0 | ||
142 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
143 | fmov d8, x8 | ||
144 | mov x4, #0 | ||
145 | mov v11.d[0], xzr | ||
146 | mov v11.d[1], x7 | ||
147 | b 2b | ||
148 | |||
149 | /* store new state */ | ||
150 | 3: str dga, [x2] | ||
151 | str dgb, [x2, #16] | ||
152 | ret | ||
153 | ENDPROC(sha1_ce_transform) | ||
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 000000000000..6fe83f37a750 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c | |||
@@ -0,0 +1,174 @@ | |||
1 | /* | ||
2 | * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <crypto/sha.h> | ||
15 | #include <linux/cpufeature.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); | ||
20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
21 | MODULE_LICENSE("GPL v2"); | ||
22 | |||
23 | asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
24 | u8 *head, long bytes); | ||
25 | |||
26 | static int sha1_init(struct shash_desc *desc) | ||
27 | { | ||
28 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
29 | |||
30 | *sctx = (struct sha1_state){ | ||
31 | .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, | ||
32 | }; | ||
33 | return 0; | ||
34 | } | ||
35 | |||
36 | static int sha1_update(struct shash_desc *desc, const u8 *data, | ||
37 | unsigned int len) | ||
38 | { | ||
39 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
40 | unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; | ||
41 | |||
42 | sctx->count += len; | ||
43 | |||
44 | if ((partial + len) >= SHA1_BLOCK_SIZE) { | ||
45 | int blocks; | ||
46 | |||
47 | if (partial) { | ||
48 | int p = SHA1_BLOCK_SIZE - partial; | ||
49 | |||
50 | memcpy(sctx->buffer + partial, data, p); | ||
51 | data += p; | ||
52 | len -= p; | ||
53 | } | ||
54 | |||
55 | blocks = len / SHA1_BLOCK_SIZE; | ||
56 | len %= SHA1_BLOCK_SIZE; | ||
57 | |||
58 | kernel_neon_begin_partial(16); | ||
59 | sha1_ce_transform(blocks, data, sctx->state, | ||
60 | partial ? sctx->buffer : NULL, 0); | ||
61 | kernel_neon_end(); | ||
62 | |||
63 | data += blocks * SHA1_BLOCK_SIZE; | ||
64 | partial = 0; | ||
65 | } | ||
66 | if (len) | ||
67 | memcpy(sctx->buffer + partial, data, len); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int sha1_final(struct shash_desc *desc, u8 *out) | ||
72 | { | ||
73 | static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; | ||
74 | |||
75 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
76 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
77 | __be32 *dst = (__be32 *)out; | ||
78 | int i; | ||
79 | |||
80 | u32 padlen = SHA1_BLOCK_SIZE | ||
81 | - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); | ||
82 | |||
83 | sha1_update(desc, padding, padlen); | ||
84 | sha1_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
85 | |||
86 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
87 | put_unaligned_be32(sctx->state[i], dst++); | ||
88 | |||
89 | *sctx = (struct sha1_state){}; | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | static int sha1_finup(struct shash_desc *desc, const u8 *data, | ||
94 | unsigned int len, u8 *out) | ||
95 | { | ||
96 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
97 | __be32 *dst = (__be32 *)out; | ||
98 | int blocks; | ||
99 | int i; | ||
100 | |||
101 | if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { | ||
102 | sha1_update(desc, data, len); | ||
103 | return sha1_final(desc, out); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
108 | * this case, there is no need to copy data around, and we can | ||
109 | * perform the entire digest calculation in a single invocation | ||
110 | * of sha1_ce_transform() | ||
111 | */ | ||
112 | blocks = len / SHA1_BLOCK_SIZE; | ||
113 | |||
114 | kernel_neon_begin_partial(16); | ||
115 | sha1_ce_transform(blocks, data, sctx->state, NULL, len); | ||
116 | kernel_neon_end(); | ||
117 | |||
118 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
119 | put_unaligned_be32(sctx->state[i], dst++); | ||
120 | |||
121 | *sctx = (struct sha1_state){}; | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static int sha1_export(struct shash_desc *desc, void *out) | ||
126 | { | ||
127 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
128 | struct sha1_state *dst = out; | ||
129 | |||
130 | *dst = *sctx; | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static int sha1_import(struct shash_desc *desc, const void *in) | ||
135 | { | ||
136 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
137 | struct sha1_state const *src = in; | ||
138 | |||
139 | *sctx = *src; | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static struct shash_alg alg = { | ||
144 | .init = sha1_init, | ||
145 | .update = sha1_update, | ||
146 | .final = sha1_final, | ||
147 | .finup = sha1_finup, | ||
148 | .export = sha1_export, | ||
149 | .import = sha1_import, | ||
150 | .descsize = sizeof(struct sha1_state), | ||
151 | .digestsize = SHA1_DIGEST_SIZE, | ||
152 | .statesize = sizeof(struct sha1_state), | ||
153 | .base = { | ||
154 | .cra_name = "sha1", | ||
155 | .cra_driver_name = "sha1-ce", | ||
156 | .cra_priority = 200, | ||
157 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
158 | .cra_blocksize = SHA1_BLOCK_SIZE, | ||
159 | .cra_module = THIS_MODULE, | ||
160 | } | ||
161 | }; | ||
162 | |||
163 | static int __init sha1_ce_mod_init(void) | ||
164 | { | ||
165 | return crypto_register_shash(&alg); | ||
166 | } | ||
167 | |||
168 | static void __exit sha1_ce_mod_fini(void) | ||
169 | { | ||
170 | crypto_unregister_shash(&alg); | ||
171 | } | ||
172 | |||
173 | module_cpu_feature_match(SHA1, sha1_ce_mod_init); | ||
174 | module_exit(sha1_ce_mod_fini); | ||
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 000000000000..7f29fc031ea8 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | #include <asm/assembler.h> | ||
13 | |||
14 | .text | ||
15 | .arch armv8-a+crypto | ||
16 | |||
17 | dga .req q20 | ||
18 | dgav .req v20 | ||
19 | dgb .req q21 | ||
20 | dgbv .req v21 | ||
21 | |||
22 | t0 .req v22 | ||
23 | t1 .req v23 | ||
24 | |||
25 | dg0q .req q24 | ||
26 | dg0v .req v24 | ||
27 | dg1q .req q25 | ||
28 | dg1v .req v25 | ||
29 | dg2q .req q26 | ||
30 | dg2v .req v26 | ||
31 | |||
32 | .macro add_only, ev, rc, s0 | ||
33 | mov dg2v.16b, dg0v.16b | ||
34 | .ifeq \ev | ||
35 | add t1.4s, v\s0\().4s, \rc\().4s | ||
36 | sha256h dg0q, dg1q, t0.4s | ||
37 | sha256h2 dg1q, dg2q, t0.4s | ||
38 | .else | ||
39 | .ifnb \s0 | ||
40 | add t0.4s, v\s0\().4s, \rc\().4s | ||
41 | .endif | ||
42 | sha256h dg0q, dg1q, t1.4s | ||
43 | sha256h2 dg1q, dg2q, t1.4s | ||
44 | .endif | ||
45 | .endm | ||
46 | |||
47 | .macro add_update, ev, rc, s0, s1, s2, s3 | ||
48 | sha256su0 v\s0\().4s, v\s1\().4s | ||
49 | add_only \ev, \rc, \s1 | ||
50 | sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s | ||
51 | .endm | ||
52 | |||
53 | /* | ||
54 | * The SHA-256 round constants | ||
55 | */ | ||
56 | .align 4 | ||
57 | .Lsha2_rcon: | ||
58 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
59 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
60 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
61 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
62 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
63 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
64 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
65 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
66 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
67 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
68 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
69 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
70 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
71 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
72 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
73 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
74 | |||
75 | /* | ||
76 | * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
77 | * u8 *head, long bytes) | ||
78 | */ | ||
79 | ENTRY(sha2_ce_transform) | ||
80 | /* load round constants */ | ||
81 | adr x8, .Lsha2_rcon | ||
82 | ld1 { v0.4s- v3.4s}, [x8], #64 | ||
83 | ld1 { v4.4s- v7.4s}, [x8], #64 | ||
84 | ld1 { v8.4s-v11.4s}, [x8], #64 | ||
85 | ld1 {v12.4s-v15.4s}, [x8] | ||
86 | |||
87 | /* load state */ | ||
88 | ldp dga, dgb, [x2] | ||
89 | |||
90 | /* load partial input (if supplied) */ | ||
91 | cbz x3, 0f | ||
92 | ld1 {v16.4s-v19.4s}, [x3] | ||
93 | b 1f | ||
94 | |||
95 | /* load input */ | ||
96 | 0: ld1 {v16.4s-v19.4s}, [x1], #64 | ||
97 | sub w0, w0, #1 | ||
98 | |||
99 | 1: | ||
100 | CPU_LE( rev32 v16.16b, v16.16b ) | ||
101 | CPU_LE( rev32 v17.16b, v17.16b ) | ||
102 | CPU_LE( rev32 v18.16b, v18.16b ) | ||
103 | CPU_LE( rev32 v19.16b, v19.16b ) | ||
104 | |||
105 | 2: add t0.4s, v16.4s, v0.4s | ||
106 | mov dg0v.16b, dgav.16b | ||
107 | mov dg1v.16b, dgbv.16b | ||
108 | |||
109 | add_update 0, v1, 16, 17, 18, 19 | ||
110 | add_update 1, v2, 17, 18, 19, 16 | ||
111 | add_update 0, v3, 18, 19, 16, 17 | ||
112 | add_update 1, v4, 19, 16, 17, 18 | ||
113 | |||
114 | add_update 0, v5, 16, 17, 18, 19 | ||
115 | add_update 1, v6, 17, 18, 19, 16 | ||
116 | add_update 0, v7, 18, 19, 16, 17 | ||
117 | add_update 1, v8, 19, 16, 17, 18 | ||
118 | |||
119 | add_update 0, v9, 16, 17, 18, 19 | ||
120 | add_update 1, v10, 17, 18, 19, 16 | ||
121 | add_update 0, v11, 18, 19, 16, 17 | ||
122 | add_update 1, v12, 19, 16, 17, 18 | ||
123 | |||
124 | add_only 0, v13, 17 | ||
125 | add_only 1, v14, 18 | ||
126 | add_only 0, v15, 19 | ||
127 | add_only 1 | ||
128 | |||
129 | /* update state */ | ||
130 | add dgav.4s, dgav.4s, dg0v.4s | ||
131 | add dgbv.4s, dgbv.4s, dg1v.4s | ||
132 | |||
133 | /* handled all input blocks? */ | ||
134 | cbnz w0, 0b | ||
135 | |||
136 | /* | ||
137 | * Final block: add padding and total bit count. | ||
138 | * Skip if we have no total byte count in x4. In that case, the input | ||
139 | * size was not a round multiple of the block size, and the padding is | ||
140 | * handled by the C code. | ||
141 | */ | ||
142 | cbz x4, 3f | ||
143 | movi v17.2d, #0 | ||
144 | mov x8, #0x80000000 | ||
145 | movi v18.2d, #0 | ||
146 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
147 | fmov d16, x8 | ||
148 | mov x4, #0 | ||
149 | mov v19.d[0], xzr | ||
150 | mov v19.d[1], x7 | ||
151 | b 2b | ||
152 | |||
153 | /* store new state */ | ||
154 | 3: stp dga, dgb, [x2] | ||
155 | ret | ||
156 | ENDPROC(sha2_ce_transform) | ||
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 000000000000..c294e67d3925 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <crypto/sha.h> | ||
15 | #include <linux/cpufeature.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); | ||
20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
21 | MODULE_LICENSE("GPL v2"); | ||
22 | |||
23 | asmlinkage void sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
24 | u8 *head, long bytes); /* implemented in sha2-ce-core.S, returns nothing */ | ||
25 | |||
26 | static int sha224_init(struct shash_desc *desc) | ||
27 | { | ||
28 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
29 | |||
30 | *sctx = (struct sha256_state){ | ||
31 | .state = { | ||
32 | SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, | ||
33 | SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, | ||
34 | } | ||
35 | }; | ||
36 | return 0; | ||
37 | } | ||
38 | |||
39 | static int sha256_init(struct shash_desc *desc) | ||
40 | { | ||
41 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
42 | |||
43 | *sctx = (struct sha256_state){ | ||
44 | .state = { | ||
45 | SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, | ||
46 | SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, | ||
47 | } | ||
48 | }; | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static int sha2_update(struct shash_desc *desc, const u8 *data, | ||
53 | unsigned int len) | ||
54 | { | ||
55 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
56 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
57 | |||
58 | sctx->count += len; | ||
59 | |||
60 | if ((partial + len) >= SHA256_BLOCK_SIZE) { | ||
61 | int blocks; | ||
62 | |||
63 | if (partial) { | ||
64 | int p = SHA256_BLOCK_SIZE - partial; | ||
65 | |||
66 | memcpy(sctx->buf + partial, data, p); | ||
67 | data += p; | ||
68 | len -= p; | ||
69 | } | ||
70 | |||
71 | blocks = len / SHA256_BLOCK_SIZE; | ||
72 | len %= SHA256_BLOCK_SIZE; | ||
73 | |||
74 | kernel_neon_begin_partial(28); | ||
75 | sha2_ce_transform(blocks, data, sctx->state, | ||
76 | partial ? sctx->buf : NULL, 0); | ||
77 | kernel_neon_end(); | ||
78 | |||
79 | data += blocks * SHA256_BLOCK_SIZE; | ||
80 | partial = 0; | ||
81 | } | ||
82 | if (len) | ||
83 | memcpy(sctx->buf + partial, data, len); | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | static void sha2_final(struct shash_desc *desc) | ||
88 | { | ||
89 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
90 | |||
91 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
92 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
93 | u32 padlen = SHA256_BLOCK_SIZE | ||
94 | - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); | ||
95 | |||
96 | sha2_update(desc, padding, padlen); | ||
97 | sha2_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
98 | } | ||
99 | |||
100 | static int sha224_final(struct shash_desc *desc, u8 *out) | ||
101 | { | ||
102 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
103 | __be32 *dst = (__be32 *)out; | ||
104 | int i; | ||
105 | |||
106 | sha2_final(desc); | ||
107 | |||
108 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
109 | put_unaligned_be32(sctx->state[i], dst++); | ||
110 | |||
111 | *sctx = (struct sha256_state){}; | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static int sha256_final(struct shash_desc *desc, u8 *out) | ||
116 | { | ||
117 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
118 | __be32 *dst = (__be32 *)out; | ||
119 | int i; | ||
120 | |||
121 | sha2_final(desc); | ||
122 | |||
123 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
124 | put_unaligned_be32(sctx->state[i], dst++); | ||
125 | |||
126 | *sctx = (struct sha256_state){}; | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | static void sha2_finup(struct shash_desc *desc, const u8 *data, | ||
131 | unsigned int len) | ||
132 | { | ||
133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
134 | int blocks; | ||
135 | |||
136 | if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { | ||
137 | sha2_update(desc, data, len); | ||
138 | sha2_final(desc); | ||
139 | return; | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Fast path: nothing has been hashed yet and the input is a non-empty | ||
144 | * multiple of 64 bytes, so there is no need to copy data around and | ||
145 | * the entire digest is calculated in a single invocation of | ||
146 | * sha2_ce_transform(). Passing the total length as 'bytes' makes the | ||
147 | * asm code append the padding and bit count as a final block. | ||
148 | */ | ||
149 | blocks = len / SHA256_BLOCK_SIZE; | ||
150 | |||
151 | kernel_neon_begin_partial(28); | ||
152 | sha2_ce_transform(blocks, data, sctx->state, NULL, len); | ||
153 | kernel_neon_end(); | ||
154 | } | ||
155 | |||
156 | static int sha224_finup(struct shash_desc *desc, const u8 *data, | ||
157 | unsigned int len, u8 *out) | ||
158 | { | ||
159 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
160 | __be32 *dst = (__be32 *)out; | ||
161 | int i; | ||
162 | |||
163 | sha2_finup(desc, data, len); | ||
164 | |||
165 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
166 | put_unaligned_be32(sctx->state[i], dst++); | ||
167 | |||
168 | *sctx = (struct sha256_state){}; | ||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | static int sha256_finup(struct shash_desc *desc, const u8 *data, | ||
173 | unsigned int len, u8 *out) | ||
174 | { | ||
175 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
176 | __be32 *dst = (__be32 *)out; | ||
177 | int i; | ||
178 | |||
179 | sha2_finup(desc, data, len); | ||
180 | |||
181 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
182 | put_unaligned_be32(sctx->state[i], dst++); | ||
183 | |||
184 | *sctx = (struct sha256_state){}; | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static int sha2_export(struct shash_desc *desc, void *out) | ||
189 | { | ||
190 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
191 | struct sha256_state *dst = out; | ||
192 | |||
193 | *dst = *sctx; | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int sha2_import(struct shash_desc *desc, const void *in) | ||
198 | { | ||
199 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
200 | struct sha256_state const *src = in; | ||
201 | |||
202 | *sctx = *src; | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | static struct shash_alg algs[] = { { | ||
207 | .init = sha224_init, | ||
208 | .update = sha2_update, | ||
209 | .final = sha224_final, | ||
210 | .finup = sha224_finup, | ||
211 | .export = sha2_export, | ||
212 | .import = sha2_import, | ||
213 | .descsize = sizeof(struct sha256_state), | ||
214 | .digestsize = SHA224_DIGEST_SIZE, | ||
215 | .statesize = sizeof(struct sha256_state), | ||
216 | .base = { | ||
217 | .cra_name = "sha224", | ||
218 | .cra_driver_name = "sha224-ce", | ||
219 | .cra_priority = 200, | ||
220 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
221 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
222 | .cra_module = THIS_MODULE, | ||
223 | } | ||
224 | }, { | ||
225 | .init = sha256_init, | ||
226 | .update = sha2_update, | ||
227 | .final = sha256_final, | ||
228 | .finup = sha256_finup, | ||
229 | .export = sha2_export, | ||
230 | .import = sha2_import, | ||
231 | .descsize = sizeof(struct sha256_state), | ||
232 | .digestsize = SHA256_DIGEST_SIZE, | ||
233 | .statesize = sizeof(struct sha256_state), | ||
234 | .base = { | ||
235 | .cra_name = "sha256", | ||
236 | .cra_driver_name = "sha256-ce", | ||
237 | .cra_priority = 200, | ||
238 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
239 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
240 | .cra_module = THIS_MODULE, | ||
241 | } | ||
242 | } }; | ||
243 | |||
244 | static int __init sha2_ce_mod_init(void) | ||
245 | { | ||
246 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); | ||
247 | } | ||
248 | |||
249 | static void __exit sha2_ce_mod_fini(void) | ||
250 | { | ||
251 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); | ||
252 | } | ||
253 | |||
254 | module_cpu_feature_match(SHA2, sha2_ce_mod_init); | ||
255 | module_exit(sha2_ce_mod_fini); | ||
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 83f71b3004a8..42c7eecd2bb6 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild | |||
@@ -40,6 +40,7 @@ generic-y += segment.h | |||
40 | generic-y += sembuf.h | 40 | generic-y += sembuf.h |
41 | generic-y += serial.h | 41 | generic-y += serial.h |
42 | generic-y += shmbuf.h | 42 | generic-y += shmbuf.h |
43 | generic-y += simd.h | ||
43 | generic-y += sizes.h | 44 | generic-y += sizes.h |
44 | generic-y += socket.h | 45 | generic-y += socket.h |
45 | generic-y += sockios.h | 46 | generic-y += sockios.h |
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index c43b4ac13008..50f559f574fe 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h | |||
@@ -37,8 +37,21 @@ struct fpsimd_state { | |||
37 | u32 fpcr; | 37 | u32 fpcr; |
38 | }; | 38 | }; |
39 | }; | 39 | }; |
40 | /* the id of the last cpu to have restored this state */ | ||
41 | unsigned int cpu; | ||
40 | }; | 42 | }; |
41 | 43 | ||
44 | /* | ||
45 | * Struct for stacking the bottom 'n' FP/SIMD registers. | ||
46 | */ | ||
47 | struct fpsimd_partial_state { | ||
48 | u32 fpsr; | ||
49 | u32 fpcr; | ||
50 | u32 num_regs; | ||
51 | __uint128_t vregs[32]; | ||
52 | }; | ||
53 | |||
54 | |||
42 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) | 55 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) |
43 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ | 56 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ |
44 | #define VFP_FPSCR_STAT_MASK 0xf800009f | 57 | #define VFP_FPSCR_STAT_MASK 0xf800009f |
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state); | |||
58 | extern void fpsimd_thread_switch(struct task_struct *next); | 71 | extern void fpsimd_thread_switch(struct task_struct *next); |
59 | extern void fpsimd_flush_thread(void); | 72 | extern void fpsimd_flush_thread(void); |
60 | 73 | ||
74 | extern void fpsimd_preserve_current_state(void); | ||
75 | extern void fpsimd_restore_current_state(void); | ||
76 | extern void fpsimd_update_current_state(struct fpsimd_state *state); | ||
77 | |||
78 | extern void fpsimd_flush_task_state(struct task_struct *target); | ||
79 | |||
80 | extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, | ||
81 | u32 num_regs); | ||
82 | extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); | ||
83 | |||
61 | #endif | 84 | #endif |
62 | 85 | ||
63 | #endif | 86 | #endif |
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index bbec599c96bd..768414d55e64 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h | |||
@@ -62,3 +62,38 @@ | |||
62 | ldr w\tmpnr, [\state, #16 * 2 + 4] | 62 | ldr w\tmpnr, [\state, #16 * 2 + 4] |
63 | msr fpcr, x\tmpnr | 63 | msr fpcr, x\tmpnr |
64 | .endm | 64 | .endm |
65 | |||
66 | .altmacro | ||
67 | .macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 | ||
68 | mrs x\tmpnr1, fpsr | ||
69 | str w\numnr, [\state, #8] | ||
70 | mrs x\tmpnr2, fpcr | ||
71 | stp w\tmpnr1, w\tmpnr2, [\state] | ||
72 | adr x\tmpnr1, 0f | ||
73 | add \state, \state, x\numnr, lsl #4 | ||
74 | sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 | ||
75 | br x\tmpnr1 | ||
76 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
77 | .irp qb, %(qa + 1) | ||
78 | stp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
79 | .endr | ||
80 | .endr | ||
81 | 0: | ||
82 | .endm | ||
83 | |||
84 | .macro fpsimd_restore_partial state, tmpnr1, tmpnr2 | ||
85 | ldp w\tmpnr1, w\tmpnr2, [\state] | ||
86 | msr fpsr, x\tmpnr1 | ||
87 | msr fpcr, x\tmpnr2 | ||
88 | adr x\tmpnr1, 0f | ||
89 | ldr w\tmpnr2, [\state, #8] | ||
90 | add \state, \state, x\tmpnr2, lsl #4 | ||
91 | sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 | ||
92 | br x\tmpnr1 | ||
93 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
94 | .irp qb, %(qa + 1) | ||
95 | ldp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
96 | .endr | ||
97 | .endr | ||
98 | 0: | ||
99 | .endm | ||
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index b0cc58a97780..13ce4cc18e26 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h | |||
@@ -8,7 +8,11 @@ | |||
8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/types.h> | ||
12 | |||
11 | #define cpu_has_neon() (1) | 13 | #define cpu_has_neon() (1) |
12 | 14 | ||
13 | void kernel_neon_begin(void); | 15 | #define kernel_neon_begin() kernel_neon_begin_partial(32) |
16 | |||
17 | void kernel_neon_begin_partial(u32 num_regs); | ||
14 | void kernel_neon_end(void); | 18 | void kernel_neon_end(void); |
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 0a8b2a97a32e..9c086c63f911 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h | |||
@@ -103,6 +103,7 @@ static inline struct thread_info *current_thread_info(void) | |||
103 | #define TIF_SIGPENDING 0 | 103 | #define TIF_SIGPENDING 0 |
104 | #define TIF_NEED_RESCHED 1 | 104 | #define TIF_NEED_RESCHED 1 |
105 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ | 105 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ |
106 | #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ | ||
106 | #define TIF_SYSCALL_TRACE 8 | 107 | #define TIF_SYSCALL_TRACE 8 |
107 | #define TIF_SYSCALL_AUDIT 9 | 108 | #define TIF_SYSCALL_AUDIT 9 |
108 | #define TIF_SYSCALL_TRACEPOINT 10 | 109 | #define TIF_SYSCALL_TRACEPOINT 10 |
@@ -118,6 +119,7 @@ static inline struct thread_info *current_thread_info(void) | |||
118 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | 119 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) |
119 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | 120 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) |
120 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 121 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
122 | #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) | ||
121 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | 123 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) |
122 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | 124 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) |
123 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) | 125 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) |
@@ -125,7 +127,7 @@ static inline struct thread_info *current_thread_info(void) | |||
125 | #define _TIF_32BIT (1 << TIF_32BIT) | 127 | #define _TIF_32BIT (1 << TIF_32BIT) |
126 | 128 | ||
127 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ | 129 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ |
128 | _TIF_NOTIFY_RESUME) | 130 | _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) |
129 | 131 | ||
130 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ | 132 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ |
131 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) | 133 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) |
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..d358ccacfc00 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S | |||
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state) | |||
41 | fpsimd_restore x0, 8 | 41 | fpsimd_restore x0, 8 |
42 | ret | 42 | ret |
43 | ENDPROC(fpsimd_load_state) | 43 | ENDPROC(fpsimd_load_state) |
44 | |||
45 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
46 | |||
47 | /* | ||
48 | * Save the bottom n FP registers. | ||
49 | * | ||
50 | * x0 - pointer to struct fpsimd_partial_state | ||
51 | */ | ||
52 | ENTRY(fpsimd_save_partial_state) | ||
53 | fpsimd_save_partial x0, 1, 8, 9 | ||
54 | ret | ||
55 | ENDPROC(fpsimd_load_partial_state) | ||
56 | |||
57 | /* | ||
58 | * Load the bottom n FP registers. | ||
59 | * | ||
60 | * x0 - pointer to struct fpsimd_partial_state | ||
61 | */ | ||
62 | ENTRY(fpsimd_load_partial_state) | ||
63 | fpsimd_restore_partial x0, 8, 9 | ||
64 | ret | ||
65 | ENDPROC(fpsimd_load_partial_state) | ||
66 | |||
67 | #endif | ||
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a670d0a98c89..bf017f4ffb4f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S | |||
@@ -562,7 +562,7 @@ fast_work_pending: | |||
562 | str x0, [sp, #S_X0] // returned x0 | 562 | str x0, [sp, #S_X0] // returned x0 |
563 | work_pending: | 563 | work_pending: |
564 | tbnz x1, #TIF_NEED_RESCHED, work_resched | 564 | tbnz x1, #TIF_NEED_RESCHED, work_resched |
565 | /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ | 565 | /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ |
566 | ldr x2, [sp, #S_PSTATE] | 566 | ldr x2, [sp, #S_PSTATE] |
567 | mov x0, sp // 'regs' | 567 | mov x0, sp // 'regs' |
568 | tst x2, #PSR_MODE_MASK // user mode regs? | 568 | tst x2, #PSR_MODE_MASK // user mode regs? |
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4aef42a04bdc..ad8aebb1cdef 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c | |||
@@ -35,6 +35,60 @@ | |||
35 | #define FPEXC_IDF (1 << 7) | 35 | #define FPEXC_IDF (1 << 7) |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * In order to reduce the number of times the FPSIMD state is needlessly saved | ||
39 | * and restored, we need to keep track of two things: | ||
40 | * (a) for each task, we need to remember which CPU was the last one to have | ||
41 | * the task's FPSIMD state loaded into its FPSIMD registers; | ||
42 | * (b) for each CPU, we need to remember which task's userland FPSIMD state has | ||
43 | * been loaded into its FPSIMD registers most recently, or whether it has | ||
44 | * been used to perform kernel mode NEON in the meantime. | ||
45 | * | ||
46 | * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to | ||
47 | * the id of the current CPU everytime the state is loaded onto a CPU. For (b), | ||
48 | * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the | ||
49 | * address of the userland FPSIMD state of the task that was loaded onto the CPU | ||
50 | * the most recently, or NULL if kernel mode NEON has been performed after that. | ||
51 | * | ||
52 | * With this in place, we no longer have to restore the next FPSIMD state right | ||
53 | * when switching between tasks. Instead, we can defer this check to userland | ||
54 | * resume, at which time we verify whether the CPU's fpsimd_last_state and the | ||
55 | * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we | ||
56 | * can omit the FPSIMD restore. | ||
57 | * | ||
58 | * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to | ||
59 | * indicate whether or not the userland FPSIMD state of the current task is | ||
60 | * present in the registers. The flag is set unless the FPSIMD registers of this | ||
61 | * CPU currently contain the most recent userland FPSIMD state of the current | ||
62 | * task. | ||
63 | * | ||
64 | * For a certain task, the sequence may look something like this: | ||
65 | * - the task gets scheduled in; if both the task's fpsimd_state.cpu field | ||
66 | * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu | ||
67 | * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is | ||
68 | * cleared, otherwise it is set; | ||
69 | * | ||
70 | * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's | ||
71 | * userland FPSIMD state is copied from memory to the registers, the task's | ||
72 | * fpsimd_state.cpu field is set to the id of the current CPU, the current | ||
73 | * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the | ||
74 | * TIF_FOREIGN_FPSTATE flag is cleared; | ||
75 | * | ||
76 | * - the task executes an ordinary syscall; upon return to userland, the | ||
77 | * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is | ||
78 | * restored; | ||
79 | * | ||
80 | * - the task executes a syscall which executes some NEON instructions; this is | ||
81 | * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD | ||
82 | * register contents to memory, clears the fpsimd_last_state per-cpu variable | ||
83 | * and sets the TIF_FOREIGN_FPSTATE flag; | ||
84 | * | ||
85 | * - the task gets preempted after kernel_neon_end() is called; as we have not | ||
86 | * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so | ||
87 | * whatever is in the FPSIMD registers is not saved to memory, but discarded. | ||
88 | */ | ||
89 | static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); | ||
90 | |||
91 | /* | ||
38 | * Trapped FP/ASIMD access. | 92 | * Trapped FP/ASIMD access. |
39 | */ | 93 | */ |
40 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) | 94 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) |
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) | |||
72 | 126 | ||
73 | void fpsimd_thread_switch(struct task_struct *next) | 127 | void fpsimd_thread_switch(struct task_struct *next) |
74 | { | 128 | { |
75 | /* check if not kernel threads */ | 129 | /* |
76 | if (current->mm) | 130 | * Save the current FPSIMD state to memory, but only if whatever is in |
131 | * the registers is in fact the most recent userland FPSIMD state of | ||
132 | * 'current'. | ||
133 | */ | ||
134 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
77 | fpsimd_save_state(&current->thread.fpsimd_state); | 135 | fpsimd_save_state(&current->thread.fpsimd_state); |
78 | if (next->mm) | 136 | |
79 | fpsimd_load_state(&next->thread.fpsimd_state); | 137 | if (next->mm) { |
138 | /* | ||
139 | * If we are switching to a task whose most recent userland | ||
140 | * FPSIMD state is already in the registers of *this* cpu, | ||
141 | * we can skip loading the state from memory. Otherwise, set | ||
142 | * the TIF_FOREIGN_FPSTATE flag so the state will be loaded | ||
143 | * upon the next return to userland. | ||
144 | */ | ||
145 | struct fpsimd_state *st = &next->thread.fpsimd_state; | ||
146 | |||
147 | if (__this_cpu_read(fpsimd_last_state) == st | ||
148 | && st->cpu == smp_processor_id()) | ||
149 | clear_ti_thread_flag(task_thread_info(next), | ||
150 | TIF_FOREIGN_FPSTATE); | ||
151 | else | ||
152 | set_ti_thread_flag(task_thread_info(next), | ||
153 | TIF_FOREIGN_FPSTATE); | ||
154 | } | ||
80 | } | 155 | } |
81 | 156 | ||
82 | void fpsimd_flush_thread(void) | 157 | void fpsimd_flush_thread(void) |
83 | { | 158 | { |
84 | preempt_disable(); | ||
85 | memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); | 159 | memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); |
86 | fpsimd_load_state(&current->thread.fpsimd_state); | 160 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
161 | } | ||
162 | |||
163 | /* | ||
164 | * Save the userland FPSIMD state of 'current' to memory, but only if the state | ||
165 | * currently held in the registers does in fact belong to 'current' | ||
166 | */ | ||
167 | void fpsimd_preserve_current_state(void) | ||
168 | { | ||
169 | preempt_disable(); | ||
170 | if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
171 | fpsimd_save_state(&current->thread.fpsimd_state); | ||
172 | preempt_enable(); | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Load the userland FPSIMD state of 'current' from memory, but only if the | ||
177 | * FPSIMD state already held in the registers is /not/ the most recent FPSIMD | ||
178 | * state of 'current' | ||
179 | */ | ||
180 | void fpsimd_restore_current_state(void) | ||
181 | { | ||
182 | preempt_disable(); | ||
183 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
184 | struct fpsimd_state *st = &current->thread.fpsimd_state; | ||
185 | |||
186 | fpsimd_load_state(st); | ||
187 | this_cpu_write(fpsimd_last_state, st); | ||
188 | st->cpu = smp_processor_id(); | ||
189 | } | ||
190 | preempt_enable(); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Load an updated userland FPSIMD state for 'current' from memory and set the | ||
195 | * flag that indicates that the FPSIMD register contents are the most recent | ||
196 | * FPSIMD state of 'current' | ||
197 | */ | ||
198 | void fpsimd_update_current_state(struct fpsimd_state *state) | ||
199 | { | ||
200 | preempt_disable(); | ||
201 | fpsimd_load_state(state); | ||
202 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
203 | struct fpsimd_state *st = &current->thread.fpsimd_state; | ||
204 | |||
205 | this_cpu_write(fpsimd_last_state, st); | ||
206 | st->cpu = smp_processor_id(); | ||
207 | } | ||
87 | preempt_enable(); | 208 | preempt_enable(); |
88 | } | 209 | } |
89 | 210 | ||
211 | /* | ||
212 | * Invalidate live CPU copies of task t's FPSIMD state | ||
213 | */ | ||
214 | void fpsimd_flush_task_state(struct task_struct *t) | ||
215 | { | ||
216 | t->thread.fpsimd_state.cpu = NR_CPUS; | ||
217 | } | ||
218 | |||
90 | #ifdef CONFIG_KERNEL_MODE_NEON | 219 | #ifdef CONFIG_KERNEL_MODE_NEON |
91 | 220 | ||
221 | static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); | ||
222 | static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); | ||
223 | |||
92 | /* | 224 | /* |
93 | * Kernel-side NEON support functions | 225 | * Kernel-side NEON support functions |
94 | */ | 226 | */ |
95 | void kernel_neon_begin(void) | 227 | void kernel_neon_begin_partial(u32 num_regs) |
96 | { | 228 | { |
97 | /* Avoid using the NEON in interrupt context */ | 229 | if (in_interrupt()) { |
98 | BUG_ON(in_interrupt()); | 230 | struct fpsimd_partial_state *s = this_cpu_ptr( |
99 | preempt_disable(); | 231 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); |
100 | 232 | ||
101 | if (current->mm) | 233 | BUG_ON(num_regs > 32); |
102 | fpsimd_save_state(&current->thread.fpsimd_state); | 234 | fpsimd_save_partial_state(s, roundup(num_regs, 2)); |
235 | } else { | ||
236 | /* | ||
237 | * Save the userland FPSIMD state if we have one and if we | ||
238 | * haven't done so already. Clear fpsimd_last_state to indicate | ||
239 | * that there is no longer userland FPSIMD state in the | ||
240 | * registers. | ||
241 | */ | ||
242 | preempt_disable(); | ||
243 | if (current->mm && | ||
244 | !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
245 | fpsimd_save_state(&current->thread.fpsimd_state); | ||
246 | this_cpu_write(fpsimd_last_state, NULL); | ||
247 | } | ||
103 | } | 248 | } |
104 | EXPORT_SYMBOL(kernel_neon_begin); | 249 | EXPORT_SYMBOL(kernel_neon_begin_partial); |
105 | 250 | ||
106 | void kernel_neon_end(void) | 251 | void kernel_neon_end(void) |
107 | { | 252 | { |
108 | if (current->mm) | 253 | if (in_interrupt()) { |
109 | fpsimd_load_state(&current->thread.fpsimd_state); | 254 | struct fpsimd_partial_state *s = this_cpu_ptr( |
110 | 255 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); | |
111 | preempt_enable(); | 256 | fpsimd_load_partial_state(s); |
257 | } else { | ||
258 | preempt_enable(); | ||
259 | } | ||
112 | } | 260 | } |
113 | EXPORT_SYMBOL(kernel_neon_end); | 261 | EXPORT_SYMBOL(kernel_neon_end); |
114 | 262 | ||
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, | |||
120 | { | 268 | { |
121 | switch (cmd) { | 269 | switch (cmd) { |
122 | case CPU_PM_ENTER: | 270 | case CPU_PM_ENTER: |
123 | if (current->mm) | 271 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) |
124 | fpsimd_save_state(&current->thread.fpsimd_state); | 272 | fpsimd_save_state(&current->thread.fpsimd_state); |
125 | break; | 273 | break; |
126 | case CPU_PM_EXIT: | 274 | case CPU_PM_EXIT: |
127 | if (current->mm) | 275 | if (current->mm) |
128 | fpsimd_load_state(&current->thread.fpsimd_state); | 276 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
129 | break; | 277 | break; |
130 | case CPU_PM_ENTER_FAILED: | 278 | case CPU_PM_ENTER_FAILED: |
131 | default: | 279 | default: |
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index d04eb871cb0e..9f2d6020b6c2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c | |||
@@ -206,7 +206,7 @@ void release_thread(struct task_struct *dead_task) | |||
206 | 206 | ||
207 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 207 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
208 | { | 208 | { |
209 | fpsimd_save_state(&current->thread.fpsimd_state); | 209 | fpsimd_preserve_current_state(); |
210 | *dst = *src; | 210 | *dst = *src; |
211 | return 0; | 211 | return 0; |
212 | } | 212 | } |
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 4b58e812cf67..32d52d3b079c 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c | |||
@@ -518,6 +518,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, | |||
518 | return ret; | 518 | return ret; |
519 | 519 | ||
520 | target->thread.fpsimd_state.user_fpsimd = newstate; | 520 | target->thread.fpsimd_state.user_fpsimd = newstate; |
521 | fpsimd_flush_task_state(target); | ||
521 | return ret; | 522 | return ret; |
522 | } | 523 | } |
523 | 524 | ||
@@ -765,6 +766,7 @@ static int compat_vfp_set(struct task_struct *target, | |||
765 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; | 766 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; |
766 | } | 767 | } |
767 | 768 | ||
769 | fpsimd_flush_task_state(target); | ||
768 | return ret; | 770 | return ret; |
769 | } | 771 | } |
770 | 772 | ||
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 2ba72a11629f..6357b9c6c90e 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c | |||
@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) | |||
51 | int err; | 51 | int err; |
52 | 52 | ||
53 | /* dump the hardware registers to the fpsimd_state structure */ | 53 | /* dump the hardware registers to the fpsimd_state structure */ |
54 | fpsimd_save_state(fpsimd); | 54 | fpsimd_preserve_current_state(); |
55 | 55 | ||
56 | /* copy the FP and status/control registers */ | 56 | /* copy the FP and status/control registers */ |
57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); | 57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); |
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) | |||
86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); | 86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); |
87 | 87 | ||
88 | /* load the hardware registers from the fpsimd_state structure */ | 88 | /* load the hardware registers from the fpsimd_state structure */ |
89 | if (!err) { | 89 | if (!err) |
90 | preempt_disable(); | 90 | fpsimd_update_current_state(&fpsimd); |
91 | fpsimd_load_state(&fpsimd); | ||
92 | preempt_enable(); | ||
93 | } | ||
94 | 91 | ||
95 | return err ? -EFAULT : 0; | 92 | return err ? -EFAULT : 0; |
96 | } | 93 | } |
@@ -433,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, | |||
433 | clear_thread_flag(TIF_NOTIFY_RESUME); | 430 | clear_thread_flag(TIF_NOTIFY_RESUME); |
434 | tracehook_notify_resume(regs); | 431 | tracehook_notify_resume(regs); |
435 | } | 432 | } |
433 | |||
434 | if (thread_flags & _TIF_FOREIGN_FPSTATE) | ||
435 | fpsimd_restore_current_state(); | ||
436 | |||
436 | } | 437 | } |
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 050c1c2af777..3491c638f172 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c | |||
@@ -222,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
222 | * Note that this also saves V16-31, which aren't visible | 222 | * Note that this also saves V16-31, which aren't visible |
223 | * in AArch32. | 223 | * in AArch32. |
224 | */ | 224 | */ |
225 | fpsimd_save_state(fpsimd); | 225 | fpsimd_preserve_current_state(); |
226 | 226 | ||
227 | /* Place structure header on the stack */ | 227 | /* Place structure header on the stack */ |
228 | __put_user_error(magic, &frame->magic, err); | 228 | __put_user_error(magic, &frame->magic, err); |
@@ -285,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
285 | * We don't need to touch the exception register, so | 285 | * We don't need to touch the exception register, so |
286 | * reload the hardware state. | 286 | * reload the hardware state. |
287 | */ | 287 | */ |
288 | if (!err) { | 288 | if (!err) |
289 | preempt_disable(); | 289 | fpsimd_update_current_state(&fpsimd); |
290 | fpsimd_load_state(&fpsimd); | ||
291 | preempt_enable(); | ||
292 | } | ||
293 | 290 | ||
294 | return err ? -EFAULT : 0; | 291 | return err ? -EFAULT : 0; |
295 | } | 292 | } |