| author | Catalin Marinas <catalin.marinas@arm.com> | 2014-05-16 05:05:11 -0400 |
|---|---|---|
| committer | Catalin Marinas <catalin.marinas@arm.com> | 2014-05-16 05:05:11 -0400 |
| commit | cf5c95db57ffa02e430c3840c08d1ee0403849d4 (patch) | |
| tree | b3b4df5e1edcde098cf45b7fa00c8450e6d665f8 | |
| parent | fd92d4a54a069953b4679958121317f2a25389cd (diff) | |
| parent | 49788fe2a128217f78a21ee4edbe6e92e988f222 (diff) | |
Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream
FPSIMD register bank context switching and crypto algorithm
optimisations for arm64 from Ard Biesheuvel.
* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
arm64: pull in <asm/simd.h> from asm-generic
arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
arm64/crypto: AES using ARMv8 Crypto Extensions
arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
arm64: add support for kernel mode NEON in interrupt context
arm64: defer reloading a task's FPSIMD state to userland resume
arm64: add abstractions for FPSIMD state manipulation
asm-generic: allow generic unaligned access if the arch supports it
Conflicts:
arch/arm64/include/asm/thread_info.h
30 files changed, 3535 insertions, 43 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9a5b5fea86ba..78b356d079dd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig | |||
| @@ -343,5 +343,8 @@ source "arch/arm64/Kconfig.debug" | |||
| 343 | source "security/Kconfig" | 343 | source "security/Kconfig" |
| 344 | 344 | ||
| 345 | source "crypto/Kconfig" | 345 | source "crypto/Kconfig" |
| 346 | if CRYPTO | ||
| 347 | source "arch/arm64/crypto/Kconfig" | ||
| 348 | endif | ||
| 346 | 349 | ||
| 347 | source "lib/Kconfig" | 350 | source "lib/Kconfig" |
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 2fceb71ac3b7..8185a913c5ed 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile | |||
| @@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS | |||
| 45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ | 45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ |
| 46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ | 46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ |
| 47 | core-$(CONFIG_XEN) += arch/arm64/xen/ | 47 | core-$(CONFIG_XEN) += arch/arm64/xen/ |
| 48 | core-$(CONFIG_CRYPTO) += arch/arm64/crypto/ | ||
| 48 | libs-y := arch/arm64/lib/ $(libs-y) | 49 | libs-y := arch/arm64/lib/ $(libs-y) |
| 49 | libs-y += $(LIBGCC) | 50 | libs-y += $(LIBGCC) |
| 50 | 51 | ||
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig new file mode 100644 index 000000000000..5562652c5316 --- /dev/null +++ b/arch/arm64/crypto/Kconfig | |||
| @@ -0,0 +1,53 @@ | |||
| 1 | |||
| 2 | menuconfig ARM64_CRYPTO | ||
| 3 | bool "ARM64 Accelerated Cryptographic Algorithms" | ||
| 4 | depends on ARM64 | ||
| 5 | help | ||
| 6 | Say Y here to choose from a selection of cryptographic algorithms | ||
| 7 | implemented using ARM64 specific CPU features or instructions. | ||
| 8 | |||
| 9 | if ARM64_CRYPTO | ||
| 10 | |||
| 11 | config CRYPTO_SHA1_ARM64_CE | ||
| 12 | tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" | ||
| 13 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 14 | select CRYPTO_HASH | ||
| 15 | |||
| 16 | config CRYPTO_SHA2_ARM64_CE | ||
| 17 | tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" | ||
| 18 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 19 | select CRYPTO_HASH | ||
| 20 | |||
| 21 | config CRYPTO_GHASH_ARM64_CE | ||
| 22 | tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" | ||
| 23 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 24 | select CRYPTO_HASH | ||
| 25 | |||
| 26 | config CRYPTO_AES_ARM64_CE | ||
| 27 | tristate "AES core cipher using ARMv8 Crypto Extensions" | ||
| 28 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 29 | select CRYPTO_ALGAPI | ||
| 30 | select CRYPTO_AES | ||
| 31 | |||
| 32 | config CRYPTO_AES_ARM64_CE_CCM | ||
| 33 | tristate "AES in CCM mode using ARMv8 Crypto Extensions" | ||
| 34 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 35 | select CRYPTO_ALGAPI | ||
| 36 | select CRYPTO_AES | ||
| 37 | select CRYPTO_AEAD | ||
| 38 | |||
| 39 | config CRYPTO_AES_ARM64_CE_BLK | ||
| 40 | tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" | ||
| 41 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 42 | select CRYPTO_BLKCIPHER | ||
| 43 | select CRYPTO_AES | ||
| 44 | select CRYPTO_ABLK_HELPER | ||
| 45 | |||
| 46 | config CRYPTO_AES_ARM64_NEON_BLK | ||
| 47 | tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" | ||
| 48 | depends on ARM64 && KERNEL_MODE_NEON | ||
| 49 | select CRYPTO_BLKCIPHER | ||
| 50 | select CRYPTO_AES | ||
| 51 | select CRYPTO_ABLK_HELPER | ||
| 52 | |||
| 53 | endif | ||
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile new file mode 100644 index 000000000000..2070a56ecc46 --- /dev/null +++ b/arch/arm64/crypto/Makefile | |||
| @@ -0,0 +1,38 @@ | |||
| 1 | # | ||
| 2 | # linux/arch/arm64/crypto/Makefile | ||
| 3 | # | ||
| 4 | # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | # | ||
| 6 | # This program is free software; you can redistribute it and/or modify | ||
| 7 | # it under the terms of the GNU General Public License version 2 as | ||
| 8 | # published by the Free Software Foundation. | ||
| 9 | # | ||
| 10 | |||
| 11 | obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o | ||
| 12 | sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o | ||
| 13 | |||
| 14 | obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o | ||
| 15 | sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o | ||
| 16 | |||
| 17 | obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o | ||
| 18 | ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o | ||
| 19 | |||
| 20 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o | ||
| 21 | CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto | ||
| 22 | |||
| 23 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o | ||
| 24 | aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o | ||
| 25 | |||
| 26 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o | ||
| 27 | aes-ce-blk-y := aes-glue-ce.o aes-ce.o | ||
| 28 | |||
| 29 | obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o | ||
| 30 | aes-neon-blk-y := aes-glue-neon.o aes-neon.o | ||
| 31 | |||
| 32 | AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE | ||
| 33 | AFLAGS_aes-neon.o := -DINTERLEAVE=4 | ||
| 34 | |||
| 35 | CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS | ||
| 36 | |||
| 37 | $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE | ||
| 38 | $(call if_changed_dep,cc_o_c) | ||
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S new file mode 100644 index 000000000000..432e4841cd81 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-core.S | |||
| @@ -0,0 +1,222 @@ | |||
| 1 | /* | ||
| 2 | * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/linkage.h> | ||
| 12 | |||
| 13 | .text | ||
| 14 | .arch armv8-a+crypto | ||
| 15 | |||
| 16 | /* | ||
| 17 | * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
| 18 | * u32 *macp, u8 const rk[], u32 rounds); | ||
| 19 | */ | ||
| 20 | ENTRY(ce_aes_ccm_auth_data) | ||
| 21 | ldr w8, [x3] /* leftover from prev round? */ | ||
| 22 | ld1 {v0.2d}, [x0] /* load mac */ | ||
| 23 | cbz w8, 1f | ||
| 24 | sub w8, w8, #16 | ||
| 25 | eor v1.16b, v1.16b, v1.16b | ||
| 26 | 0: ldrb w7, [x1], #1 /* get 1 byte of input */ | ||
| 27 | subs w2, w2, #1 | ||
| 28 | add w8, w8, #1 | ||
| 29 | ins v1.b[0], w7 | ||
| 30 | ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ | ||
| 31 | beq 8f /* out of input? */ | ||
| 32 | cbnz w8, 0b | ||
| 33 | eor v0.16b, v0.16b, v1.16b | ||
| 34 | 1: ld1 {v3.2d}, [x4] /* load first round key */ | ||
| 35 | prfm pldl1strm, [x1] | ||
| 36 | cmp w5, #12 /* which key size? */ | ||
| 37 | add x6, x4, #16 | ||
| 38 | sub w7, w5, #2 /* modified # of rounds */ | ||
| 39 | bmi 2f | ||
| 40 | bne 5f | ||
| 41 | mov v5.16b, v3.16b | ||
| 42 | b 4f | ||
| 43 | 2: mov v4.16b, v3.16b | ||
| 44 | ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ | ||
| 45 | 3: aese v0.16b, v4.16b | ||
| 46 | aesmc v0.16b, v0.16b | ||
| 47 | 4: ld1 {v3.2d}, [x6], #16 /* load next round key */ | ||
| 48 | aese v0.16b, v5.16b | ||
| 49 | aesmc v0.16b, v0.16b | ||
| 50 | 5: ld1 {v4.2d}, [x6], #16 /* load next round key */ | ||
| 51 | subs w7, w7, #3 | ||
| 52 | aese v0.16b, v3.16b | ||
| 53 | aesmc v0.16b, v0.16b | ||
| 54 | ld1 {v5.2d}, [x6], #16 /* load next round key */ | ||
| 55 | bpl 3b | ||
| 56 | aese v0.16b, v4.16b | ||
| 57 | subs w2, w2, #16 /* last data? */ | ||
| 58 | eor v0.16b, v0.16b, v5.16b /* final round */ | ||
| 59 | bmi 6f | ||
| 60 | ld1 {v1.16b}, [x1], #16 /* load next input block */ | ||
| 61 | eor v0.16b, v0.16b, v1.16b /* xor with mac */ | ||
| 62 | bne 1b | ||
| 63 | 6: st1 {v0.2d}, [x0] /* store mac */ | ||
| 64 | beq 10f | ||
| 65 | adds w2, w2, #16 | ||
| 66 | beq 10f | ||
| 67 | mov w8, w2 | ||
| 68 | 7: ldrb w7, [x1], #1 | ||
| 69 | umov w6, v0.b[0] | ||
| 70 | eor w6, w6, w7 | ||
| 71 | strb w6, [x0], #1 | ||
| 72 | subs w2, w2, #1 | ||
| 73 | beq 10f | ||
| 74 | ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ | ||
| 75 | b 7b | ||
| 76 | 8: mov w7, w8 | ||
| 77 | add w8, w8, #16 | ||
| 78 | 9: ext v1.16b, v1.16b, v1.16b, #1 | ||
| 79 | adds w7, w7, #1 | ||
| 80 | bne 9b | ||
| 81 | eor v0.16b, v0.16b, v1.16b | ||
| 82 | st1 {v0.2d}, [x0] | ||
| 83 | 10: str w8, [x3] | ||
| 84 | ret | ||
| 85 | ENDPROC(ce_aes_ccm_auth_data) | ||
| 86 | |||
| 87 | /* | ||
| 88 | * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], | ||
| 89 | * u32 rounds); | ||
| 90 | */ | ||
| 91 | ENTRY(ce_aes_ccm_final) | ||
| 92 | ld1 {v3.2d}, [x2], #16 /* load first round key */ | ||
| 93 | ld1 {v0.2d}, [x0] /* load mac */ | ||
| 94 | cmp w3, #12 /* which key size? */ | ||
| 95 | sub w3, w3, #2 /* modified # of rounds */ | ||
| 96 | ld1 {v1.2d}, [x1] /* load 1st ctriv */ | ||
| 97 | bmi 0f | ||
| 98 | bne 3f | ||
| 99 | mov v5.16b, v3.16b | ||
| 100 | b 2f | ||
| 101 | 0: mov v4.16b, v3.16b | ||
| 102 | 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ | ||
| 103 | aese v0.16b, v4.16b | ||
| 104 | aese v1.16b, v4.16b | ||
| 105 | aesmc v0.16b, v0.16b | ||
| 106 | aesmc v1.16b, v1.16b | ||
| 107 | 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ | ||
| 108 | aese v0.16b, v5.16b | ||
| 109 | aese v1.16b, v5.16b | ||
| 110 | aesmc v0.16b, v0.16b | ||
| 111 | aesmc v1.16b, v1.16b | ||
| 112 | 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ | ||
| 113 | subs w3, w3, #3 | ||
| 114 | aese v0.16b, v3.16b | ||
| 115 | aese v1.16b, v3.16b | ||
| 116 | aesmc v0.16b, v0.16b | ||
| 117 | aesmc v1.16b, v1.16b | ||
| 118 | bpl 1b | ||
| 119 | aese v0.16b, v4.16b | ||
| 120 | aese v1.16b, v4.16b | ||
| 121 | /* final round key cancels out */ | ||
| 122 | eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ | ||
| 123 | st1 {v0.2d}, [x0] /* store result */ | ||
| 124 | ret | ||
| 125 | ENDPROC(ce_aes_ccm_final) | ||
| 126 | |||
| 127 | .macro aes_ccm_do_crypt,enc | ||
| 128 | ldr x8, [x6, #8] /* load lower ctr */ | ||
| 129 | ld1 {v0.2d}, [x5] /* load mac */ | ||
| 130 | rev x8, x8 /* keep swabbed ctr in reg */ | ||
| 131 | 0: /* outer loop */ | ||
| 132 | ld1 {v1.1d}, [x6] /* load upper ctr */ | ||
| 133 | prfm pldl1strm, [x1] | ||
| 134 | add x8, x8, #1 | ||
| 135 | rev x9, x8 | ||
| 136 | cmp w4, #12 /* which key size? */ | ||
| 137 | sub w7, w4, #2 /* get modified # of rounds */ | ||
| 138 | ins v1.d[1], x9 /* no carry in lower ctr */ | ||
| 139 | ld1 {v3.2d}, [x3] /* load first round key */ | ||
| 140 | add x10, x3, #16 | ||
| 141 | bmi 1f | ||
| 142 | bne 4f | ||
| 143 | mov v5.16b, v3.16b | ||
| 144 | b 3f | ||
| 145 | 1: mov v4.16b, v3.16b | ||
| 146 | ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ | ||
| 147 | 2: /* inner loop: 3 rounds, 2x interleaved */ | ||
| 148 | aese v0.16b, v4.16b | ||
| 149 | aese v1.16b, v4.16b | ||
| 150 | aesmc v0.16b, v0.16b | ||
| 151 | aesmc v1.16b, v1.16b | ||
| 152 | 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ | ||
| 153 | aese v0.16b, v5.16b | ||
| 154 | aese v1.16b, v5.16b | ||
| 155 | aesmc v0.16b, v0.16b | ||
| 156 | aesmc v1.16b, v1.16b | ||
| 157 | 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ | ||
| 158 | subs w7, w7, #3 | ||
| 159 | aese v0.16b, v3.16b | ||
| 160 | aese v1.16b, v3.16b | ||
| 161 | aesmc v0.16b, v0.16b | ||
| 162 | aesmc v1.16b, v1.16b | ||
| 163 | ld1 {v5.2d}, [x10], #16 /* load next round key */ | ||
| 164 | bpl 2b | ||
| 165 | aese v0.16b, v4.16b | ||
| 166 | aese v1.16b, v4.16b | ||
| 167 | subs w2, w2, #16 | ||
| 168 | bmi 6f /* partial block? */ | ||
| 169 | ld1 {v2.16b}, [x1], #16 /* load next input block */ | ||
| 170 | .if \enc == 1 | ||
| 171 | eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ | ||
| 172 | eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ | ||
| 173 | .else | ||
| 174 | eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ | ||
| 175 | eor v1.16b, v2.16b, v5.16b /* final round enc */ | ||
| 176 | .endif | ||
| 177 | eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ | ||
| 178 | st1 {v1.16b}, [x0], #16 /* write output block */ | ||
| 179 | bne 0b | ||
| 180 | rev x8, x8 | ||
| 181 | st1 {v0.2d}, [x5] /* store mac */ | ||
| 182 | str x8, [x6, #8] /* store lsb end of ctr (BE) */ | ||
| 183 | 5: ret | ||
| 184 | |||
| 185 | 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ | ||
| 186 | eor v1.16b, v1.16b, v5.16b /* final round enc */ | ||
| 187 | st1 {v0.2d}, [x5] /* store mac */ | ||
| 188 | add w2, w2, #16 /* process partial tail block */ | ||
| 189 | 7: ldrb w9, [x1], #1 /* get 1 byte of input */ | ||
| 190 | umov w6, v1.b[0] /* get top crypted ctr byte */ | ||
| 191 | umov w7, v0.b[0] /* get top mac byte */ | ||
| 192 | .if \enc == 1 | ||
| 193 | eor w7, w7, w9 | ||
| 194 | eor w9, w9, w6 | ||
| 195 | .else | ||
| 196 | eor w9, w9, w6 | ||
| 197 | eor w7, w7, w9 | ||
| 198 | .endif | ||
| 199 | strb w9, [x0], #1 /* store out byte */ | ||
| 200 | strb w7, [x5], #1 /* store mac byte */ | ||
| 201 | subs w2, w2, #1 | ||
| 202 | beq 5b | ||
| 203 | ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ | ||
| 204 | ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ | ||
| 205 | b 7b | ||
| 206 | .endm | ||
| 207 | |||
| 208 | /* | ||
| 209 | * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
| 210 | * u8 const rk[], u32 rounds, u8 mac[], | ||
| 211 | * u8 ctr[]); | ||
| 212 | * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
| 213 | * u8 const rk[], u32 rounds, u8 mac[], | ||
| 214 | * u8 ctr[]); | ||
| 215 | */ | ||
| 216 | ENTRY(ce_aes_ccm_encrypt) | ||
| 217 | aes_ccm_do_crypt 1 | ||
| 218 | ENDPROC(ce_aes_ccm_encrypt) | ||
| 219 | |||
| 220 | ENTRY(ce_aes_ccm_decrypt) | ||
| 221 | aes_ccm_do_crypt 0 | ||
| 222 | ENDPROC(ce_aes_ccm_decrypt) | ||
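
The MAC path above is, at its core, a CBC-MAC: each 16-byte chunk of input is XORed into the running MAC and passed through the AES block cipher, with the byte-at-a-time code only covering data that does not land on a block boundary. Below is a minimal C sketch of that reference behaviour, not code from the patch: `aes_encrypt_block()` is a hypothetical stand-in for the `aese`/`aesmc` sequence, and the sketch assumes the call starts block-aligned (i.e. the `macp` leftover counter is zero), whereas the assembly carries partial-block fill state across calls.

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical single-block AES primitive; stands in for the aese/aesmc
 * instruction sequence used by the assembly above. */
void aes_encrypt_block(uint8_t out[16], const uint8_t in[16],
		       const uint8_t *round_keys, int rounds);

/* Reference CBC-MAC over 'len' bytes of 'in', updating 'mac' in place.
 * Trailing bytes that do not fill a block are only XORed in here; the real
 * code records how far the block is filled (via *macp) so it can be
 * completed and encrypted on a later call. */
static void ccm_auth_ref(uint8_t mac[16], const uint8_t *in, size_t len,
			 const uint8_t *round_keys, int rounds)
{
	while (len >= 16) {
		for (int i = 0; i < 16; i++)
			mac[i] ^= in[i];
		aes_encrypt_block(mac, mac, round_keys, rounds);
		in += 16;
		len -= 16;
	}
	for (size_t i = 0; i < len; i++)
		mac[i] ^= in[i];	/* leftover bytes, finished next call */
}
```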
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c new file mode 100644 index 000000000000..9e6cdde9b43d --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c | |||
| @@ -0,0 +1,297 @@ | |||
| 1 | /* | ||
| 2 | * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <asm/neon.h> | ||
| 12 | #include <asm/unaligned.h> | ||
| 13 | #include <crypto/aes.h> | ||
| 14 | #include <crypto/algapi.h> | ||
| 15 | #include <crypto/scatterwalk.h> | ||
| 16 | #include <linux/crypto.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | |||
| 19 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
| 20 | { | ||
| 21 | /* | ||
| 22 | * # of rounds specified by AES: | ||
| 23 | * 128 bit key 10 rounds | ||
| 24 | * 192 bit key 12 rounds | ||
| 25 | * 256 bit key 14 rounds | ||
| 26 | * => n byte key => 6 + (n/4) rounds | ||
| 27 | */ | ||
| 28 | return 6 + ctx->key_length / 4; | ||
| 29 | } | ||
| 30 | |||
| 31 | asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
| 32 | u32 *macp, u32 const rk[], u32 rounds); | ||
| 33 | |||
| 34 | asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
| 35 | u32 const rk[], u32 rounds, u8 mac[], | ||
| 36 | u8 ctr[]); | ||
| 37 | |||
| 38 | asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
| 39 | u32 const rk[], u32 rounds, u8 mac[], | ||
| 40 | u8 ctr[]); | ||
| 41 | |||
| 42 | asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], | ||
| 43 | u32 rounds); | ||
| 44 | |||
| 45 | static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, | ||
| 46 | unsigned int key_len) | ||
| 47 | { | ||
| 48 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); | ||
| 49 | int ret; | ||
| 50 | |||
| 51 | ret = crypto_aes_expand_key(ctx, in_key, key_len); | ||
| 52 | if (!ret) | ||
| 53 | return 0; | ||
| 54 | |||
| 55 | tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
| 56 | return -EINVAL; | ||
| 57 | } | ||
| 58 | |||
| 59 | static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) | ||
| 60 | { | ||
| 61 | if ((authsize & 1) || authsize < 4) | ||
| 62 | return -EINVAL; | ||
| 63 | return 0; | ||
| 64 | } | ||
| 65 | |||
| 66 | static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) | ||
| 67 | { | ||
| 68 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
| 69 | __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; | ||
| 70 | u32 l = req->iv[0] + 1; | ||
| 71 | |||
| 72 | /* verify that CCM dimension 'L' is set correctly in the IV */ | ||
| 73 | if (l < 2 || l > 8) | ||
| 74 | return -EINVAL; | ||
| 75 | |||
| 76 | /* verify that msglen can in fact be represented in L bytes */ | ||
| 77 | if (l < 4 && msglen >> (8 * l)) | ||
| 78 | return -EOVERFLOW; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi | ||
| 82 | * uses a u32 type to represent msglen so the top 4 bytes are always 0. | ||
| 83 | */ | ||
| 84 | n[0] = 0; | ||
| 85 | n[1] = cpu_to_be32(msglen); | ||
| 86 | |||
| 87 | memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); | ||
| 88 | |||
| 89 | /* | ||
| 90 | * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) | ||
| 91 | * - bits 0..2 : max # of bytes required to represent msglen, minus 1 | ||
| 92 | * (already set by caller) | ||
| 93 | * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) | ||
| 94 | * - bit 6 : indicates presence of authenticate-only data | ||
| 95 | */ | ||
| 96 | maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; | ||
| 97 | if (req->assoclen) | ||
| 98 | maciv[0] |= 0x40; | ||
| 99 | |||
| 100 | memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); | ||
| 101 | return 0; | ||
| 102 | } | ||
| 103 | |||
| 104 | static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) | ||
| 105 | { | ||
| 106 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
| 107 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
| 108 | struct __packed { __be16 l; __be32 h; u16 len; } ltag; | ||
| 109 | struct scatter_walk walk; | ||
| 110 | u32 len = req->assoclen; | ||
| 111 | u32 macp = 0; | ||
| 112 | |||
| 113 | /* prepend the AAD with a length tag */ | ||
| 114 | if (len < 0xff00) { | ||
| 115 | ltag.l = cpu_to_be16(len); | ||
| 116 | ltag.len = 2; | ||
| 117 | } else { | ||
| 118 | ltag.l = cpu_to_be16(0xfffe); | ||
| 119 | put_unaligned_be32(len, &ltag.h); | ||
| 120 | ltag.len = 6; | ||
| 121 | } | ||
| 122 | |||
| 123 | ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc, | ||
| 124 | num_rounds(ctx)); | ||
| 125 | scatterwalk_start(&walk, req->assoc); | ||
| 126 | |||
| 127 | do { | ||
| 128 | u32 n = scatterwalk_clamp(&walk, len); | ||
| 129 | u8 *p; | ||
| 130 | |||
| 131 | if (!n) { | ||
| 132 | scatterwalk_start(&walk, sg_next(walk.sg)); | ||
| 133 | n = scatterwalk_clamp(&walk, len); | ||
| 134 | } | ||
| 135 | p = scatterwalk_map(&walk); | ||
| 136 | ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, | ||
| 137 | num_rounds(ctx)); | ||
| 138 | len -= n; | ||
| 139 | |||
| 140 | scatterwalk_unmap(p); | ||
| 141 | scatterwalk_advance(&walk, n); | ||
| 142 | scatterwalk_done(&walk, 0, len); | ||
| 143 | } while (len); | ||
| 144 | } | ||
| 145 | |||
| 146 | static int ccm_encrypt(struct aead_request *req) | ||
| 147 | { | ||
| 148 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
| 149 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
| 150 | struct blkcipher_desc desc = { .info = req->iv }; | ||
| 151 | struct blkcipher_walk walk; | ||
| 152 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
| 153 | u8 buf[AES_BLOCK_SIZE]; | ||
| 154 | u32 len = req->cryptlen; | ||
| 155 | int err; | ||
| 156 | |||
| 157 | err = ccm_init_mac(req, mac, len); | ||
| 158 | if (err) | ||
| 159 | return err; | ||
| 160 | |||
| 161 | kernel_neon_begin_partial(6); | ||
| 162 | |||
| 163 | if (req->assoclen) | ||
| 164 | ccm_calculate_auth_mac(req, mac); | ||
| 165 | |||
| 166 | /* preserve the original iv for the final round */ | ||
| 167 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
| 168 | |||
| 169 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
| 170 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
| 171 | AES_BLOCK_SIZE); | ||
| 172 | |||
| 173 | while (walk.nbytes) { | ||
| 174 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
| 175 | |||
| 176 | if (walk.nbytes == len) | ||
| 177 | tail = 0; | ||
| 178 | |||
| 179 | ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 180 | walk.nbytes - tail, ctx->key_enc, | ||
| 181 | num_rounds(ctx), mac, walk.iv); | ||
| 182 | |||
| 183 | len -= walk.nbytes - tail; | ||
| 184 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
| 185 | } | ||
| 186 | if (!err) | ||
| 187 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
| 188 | |||
| 189 | kernel_neon_end(); | ||
| 190 | |||
| 191 | if (err) | ||
| 192 | return err; | ||
| 193 | |||
| 194 | /* copy authtag to end of dst */ | ||
| 195 | scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, | ||
| 196 | crypto_aead_authsize(aead), 1); | ||
| 197 | |||
| 198 | return 0; | ||
| 199 | } | ||
| 200 | |||
| 201 | static int ccm_decrypt(struct aead_request *req) | ||
| 202 | { | ||
| 203 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
| 204 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
| 205 | unsigned int authsize = crypto_aead_authsize(aead); | ||
| 206 | struct blkcipher_desc desc = { .info = req->iv }; | ||
| 207 | struct blkcipher_walk walk; | ||
| 208 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
| 209 | u8 buf[AES_BLOCK_SIZE]; | ||
| 210 | u32 len = req->cryptlen - authsize; | ||
| 211 | int err; | ||
| 212 | |||
| 213 | err = ccm_init_mac(req, mac, len); | ||
| 214 | if (err) | ||
| 215 | return err; | ||
| 216 | |||
| 217 | kernel_neon_begin_partial(6); | ||
| 218 | |||
| 219 | if (req->assoclen) | ||
| 220 | ccm_calculate_auth_mac(req, mac); | ||
| 221 | |||
| 222 | /* preserve the original iv for the final round */ | ||
| 223 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
| 224 | |||
| 225 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
| 226 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
| 227 | AES_BLOCK_SIZE); | ||
| 228 | |||
| 229 | while (walk.nbytes) { | ||
| 230 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
| 231 | |||
| 232 | if (walk.nbytes == len) | ||
| 233 | tail = 0; | ||
| 234 | |||
| 235 | ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 236 | walk.nbytes - tail, ctx->key_enc, | ||
| 237 | num_rounds(ctx), mac, walk.iv); | ||
| 238 | |||
| 239 | len -= walk.nbytes - tail; | ||
| 240 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
| 241 | } | ||
| 242 | if (!err) | ||
| 243 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
| 244 | |||
| 245 | kernel_neon_end(); | ||
| 246 | |||
| 247 | if (err) | ||
| 248 | return err; | ||
| 249 | |||
| 250 | /* compare calculated auth tag with the stored one */ | ||
| 251 | scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, | ||
| 252 | authsize, 0); | ||
| 253 | |||
| 254 | if (memcmp(mac, buf, authsize)) | ||
| 255 | return -EBADMSG; | ||
| 256 | return 0; | ||
| 257 | } | ||
| 258 | |||
| 259 | static struct crypto_alg ccm_aes_alg = { | ||
| 260 | .cra_name = "ccm(aes)", | ||
| 261 | .cra_driver_name = "ccm-aes-ce", | ||
| 262 | .cra_priority = 300, | ||
| 263 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | ||
| 264 | .cra_blocksize = 1, | ||
| 265 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
| 266 | .cra_alignmask = 7, | ||
| 267 | .cra_type = &crypto_aead_type, | ||
| 268 | .cra_module = THIS_MODULE, | ||
| 269 | .cra_aead = { | ||
| 270 | .ivsize = AES_BLOCK_SIZE, | ||
| 271 | .maxauthsize = AES_BLOCK_SIZE, | ||
| 272 | .setkey = ccm_setkey, | ||
| 273 | .setauthsize = ccm_setauthsize, | ||
| 274 | .encrypt = ccm_encrypt, | ||
| 275 | .decrypt = ccm_decrypt, | ||
| 276 | } | ||
| 277 | }; | ||
| 278 | |||
| 279 | static int __init aes_mod_init(void) | ||
| 280 | { | ||
| 281 | if (!(elf_hwcap & HWCAP_AES)) | ||
| 282 | return -ENODEV; | ||
| 283 | return crypto_register_alg(&ccm_aes_alg); | ||
| 284 | } | ||
| 285 | |||
| 286 | static void __exit aes_mod_exit(void) | ||
| 287 | { | ||
| 288 | crypto_unregister_alg(&ccm_aes_alg); | ||
| 289 | } | ||
| 290 | |||
| 291 | module_init(aes_mod_init); | ||
| 292 | module_exit(aes_mod_exit); | ||
| 293 | |||
| 294 | MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); | ||
| 295 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
| 296 | MODULE_LICENSE("GPL v2"); | ||
| 297 | MODULE_ALIAS("ccm(aes)"); | ||
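
ccm_init_mac() above builds the CCM B0 block from RFC 3610: a flags byte encoding L (size of the length field), the tag length and whether associated data is present, followed by the nonce and the big-endian message length. The following standalone sketch (example values, not code from the patch) shows the same layout for a 16-byte tag, a 13-byte nonce (so L = 2) and associated data present:

```c
#include <stdint.h>
#include <string.h>

/* Standalone illustration of the RFC 3610 B0/flags layout that
 * ccm_init_mac() constructs; all inputs here are example values. */
int main(void)
{
	uint8_t b0[16] = { 0 };
	uint8_t nonce[13] = { 0 };		/* 15 - L bytes, here L == 2 */
	uint32_t msglen = 1024;			/* example payload length */
	unsigned int authsize = 16;		/* tag length in bytes */
	int have_aad = 1;

	b0[0]  = 2 - 1;				/* bits 0..2: L - 1 (set via iv[0] in the driver) */
	b0[0] |= (authsize - 2) << 2;		/* bits 3..5: same as ((taglen - 2) / 2) << 3 for even tags */
	if (have_aad)
		b0[0] |= 0x40;			/* bit 6: associated data present */

	memcpy(&b0[1], nonce, sizeof(nonce));	/* nonce occupies bytes 1 .. 15 - L */
	b0[14] = msglen >> 8;			/* message length, big endian, in the last L bytes */
	b0[15] = msglen & 0xff;
	return 0;
}
```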
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c new file mode 100644 index 000000000000..2075e1acae6b --- /dev/null +++ b/arch/arm64/crypto/aes-ce-cipher.c | |||
| @@ -0,0 +1,155 @@ | |||
| 1 | /* | ||
| 2 | * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <asm/neon.h> | ||
| 12 | #include <crypto/aes.h> | ||
| 13 | #include <linux/cpufeature.h> | ||
| 14 | #include <linux/crypto.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | |||
| 17 | MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); | ||
| 18 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
| 19 | MODULE_LICENSE("GPL v2"); | ||
| 20 | |||
| 21 | struct aes_block { | ||
| 22 | u8 b[AES_BLOCK_SIZE]; | ||
| 23 | }; | ||
| 24 | |||
| 25 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
| 26 | { | ||
| 27 | /* | ||
| 28 | * # of rounds specified by AES: | ||
| 29 | * 128 bit key 10 rounds | ||
| 30 | * 192 bit key 12 rounds | ||
| 31 | * 256 bit key 14 rounds | ||
| 32 | * => n byte key => 6 + (n/4) rounds | ||
| 33 | */ | ||
| 34 | return 6 + ctx->key_length / 4; | ||
| 35 | } | ||
| 36 | |||
| 37 | static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
| 38 | { | ||
| 39 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
| 40 | struct aes_block *out = (struct aes_block *)dst; | ||
| 41 | struct aes_block const *in = (struct aes_block *)src; | ||
| 42 | void *dummy0; | ||
| 43 | int dummy1; | ||
| 44 | |||
| 45 | kernel_neon_begin_partial(4); | ||
| 46 | |||
| 47 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
| 48 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
| 49 | " cmp %w[rounds], #10 ;" | ||
| 50 | " bmi 0f ;" | ||
| 51 | " bne 3f ;" | ||
| 52 | " mov v3.16b, v1.16b ;" | ||
| 53 | " b 2f ;" | ||
| 54 | "0: mov v2.16b, v1.16b ;" | ||
| 55 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
| 56 | "1: aese v0.16b, v2.16b ;" | ||
| 57 | " aesmc v0.16b, v0.16b ;" | ||
| 58 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
| 59 | " aese v0.16b, v3.16b ;" | ||
| 60 | " aesmc v0.16b, v0.16b ;" | ||
| 61 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
| 62 | " subs %w[rounds], %w[rounds], #3 ;" | ||
| 63 | " aese v0.16b, v1.16b ;" | ||
| 64 | " aesmc v0.16b, v0.16b ;" | ||
| 65 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
| 66 | " bpl 1b ;" | ||
| 67 | " aese v0.16b, v2.16b ;" | ||
| 68 | " eor v0.16b, v0.16b, v3.16b ;" | ||
| 69 | " st1 {v0.16b}, %[out] ;" | ||
| 70 | |||
| 71 | : [out] "=Q"(*out), | ||
| 72 | [key] "=r"(dummy0), | ||
| 73 | [rounds] "=r"(dummy1) | ||
| 74 | : [in] "Q"(*in), | ||
| 75 | "1"(ctx->key_enc), | ||
| 76 | "2"(num_rounds(ctx) - 2) | ||
| 77 | : "cc"); | ||
| 78 | |||
| 79 | kernel_neon_end(); | ||
| 80 | } | ||
| 81 | |||
| 82 | static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
| 83 | { | ||
| 84 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
| 85 | struct aes_block *out = (struct aes_block *)dst; | ||
| 86 | struct aes_block const *in = (struct aes_block *)src; | ||
| 87 | void *dummy0; | ||
| 88 | int dummy1; | ||
| 89 | |||
| 90 | kernel_neon_begin_partial(4); | ||
| 91 | |||
| 92 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
| 93 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
| 94 | " cmp %w[rounds], #10 ;" | ||
| 95 | " bmi 0f ;" | ||
| 96 | " bne 3f ;" | ||
| 97 | " mov v3.16b, v1.16b ;" | ||
| 98 | " b 2f ;" | ||
| 99 | "0: mov v2.16b, v1.16b ;" | ||
| 100 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
| 101 | "1: aesd v0.16b, v2.16b ;" | ||
| 102 | " aesimc v0.16b, v0.16b ;" | ||
| 103 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
| 104 | " aesd v0.16b, v3.16b ;" | ||
| 105 | " aesimc v0.16b, v0.16b ;" | ||
| 106 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
| 107 | " subs %w[rounds], %w[rounds], #3 ;" | ||
| 108 | " aesd v0.16b, v1.16b ;" | ||
| 109 | " aesimc v0.16b, v0.16b ;" | ||
| 110 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
| 111 | " bpl 1b ;" | ||
| 112 | " aesd v0.16b, v2.16b ;" | ||
| 113 | " eor v0.16b, v0.16b, v3.16b ;" | ||
| 114 | " st1 {v0.16b}, %[out] ;" | ||
| 115 | |||
| 116 | : [out] "=Q"(*out), | ||
| 117 | [key] "=r"(dummy0), | ||
| 118 | [rounds] "=r"(dummy1) | ||
| 119 | : [in] "Q"(*in), | ||
| 120 | "1"(ctx->key_dec), | ||
| 121 | "2"(num_rounds(ctx) - 2) | ||
| 122 | : "cc"); | ||
| 123 | |||
| 124 | kernel_neon_end(); | ||
| 125 | } | ||
| 126 | |||
| 127 | static struct crypto_alg aes_alg = { | ||
| 128 | .cra_name = "aes", | ||
| 129 | .cra_driver_name = "aes-ce", | ||
| 130 | .cra_priority = 300, | ||
| 131 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | ||
| 132 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 133 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
| 134 | .cra_module = THIS_MODULE, | ||
| 135 | .cra_cipher = { | ||
| 136 | .cia_min_keysize = AES_MIN_KEY_SIZE, | ||
| 137 | .cia_max_keysize = AES_MAX_KEY_SIZE, | ||
| 138 | .cia_setkey = crypto_aes_set_key, | ||
| 139 | .cia_encrypt = aes_cipher_encrypt, | ||
| 140 | .cia_decrypt = aes_cipher_decrypt | ||
| 141 | } | ||
| 142 | }; | ||
| 143 | |||
| 144 | static int __init aes_mod_init(void) | ||
| 145 | { | ||
| 146 | return crypto_register_alg(&aes_alg); | ||
| 147 | } | ||
| 148 | |||
| 149 | static void __exit aes_mod_exit(void) | ||
| 150 | { | ||
| 151 | crypto_unregister_alg(&aes_alg); | ||
| 152 | } | ||
| 153 | |||
| 154 | module_cpu_feature_match(AES, aes_mod_init); | ||
| 155 | module_exit(aes_mod_exit); | ||
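
num_rounds() relies on the AES round count being a linear function of key length: 6 + (key bytes / 4), i.e. 10, 12 or 14 rounds for 128-, 192- and 256-bit keys. A trivial standalone check of that mapping (not part of the patch):

```c
#include <assert.h>

/* Same formula as num_rounds() in the patch: 6 + keylen / 4. */
static int num_rounds_for(unsigned int key_length_bytes)
{
	return 6 + key_length_bytes / 4;
}

int main(void)
{
	assert(num_rounds_for(16) == 10);	/* AES-128 */
	assert(num_rounds_for(24) == 12);	/* AES-192 */
	assert(num_rounds_for(32) == 14);	/* AES-256 */
	return 0;
}
```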
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S new file mode 100644 index 000000000000..685a18f731eb --- /dev/null +++ b/arch/arm64/crypto/aes-ce.S | |||
| @@ -0,0 +1,133 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with | ||
| 3 | * Crypto Extensions | ||
| 4 | * | ||
| 5 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/linkage.h> | ||
| 13 | |||
| 14 | #define AES_ENTRY(func) ENTRY(ce_ ## func) | ||
| 15 | #define AES_ENDPROC(func) ENDPROC(ce_ ## func) | ||
| 16 | |||
| 17 | .arch armv8-a+crypto | ||
| 18 | |||
| 19 | /* preload all round keys */ | ||
| 20 | .macro load_round_keys, rounds, rk | ||
| 21 | cmp \rounds, #12 | ||
| 22 | blo 2222f /* 128 bits */ | ||
| 23 | beq 1111f /* 192 bits */ | ||
| 24 | ld1 {v17.16b-v18.16b}, [\rk], #32 | ||
| 25 | 1111: ld1 {v19.16b-v20.16b}, [\rk], #32 | ||
| 26 | 2222: ld1 {v21.16b-v24.16b}, [\rk], #64 | ||
| 27 | ld1 {v25.16b-v28.16b}, [\rk], #64 | ||
| 28 | ld1 {v29.16b-v31.16b}, [\rk] | ||
| 29 | .endm | ||
| 30 | |||
| 31 | /* prepare for encryption with key in rk[] */ | ||
| 32 | .macro enc_prepare, rounds, rk, ignore | ||
| 33 | load_round_keys \rounds, \rk | ||
| 34 | .endm | ||
| 35 | |||
| 36 | /* prepare for encryption (again) but with new key in rk[] */ | ||
| 37 | .macro enc_switch_key, rounds, rk, ignore | ||
| 38 | load_round_keys \rounds, \rk | ||
| 39 | .endm | ||
| 40 | |||
| 41 | /* prepare for decryption with key in rk[] */ | ||
| 42 | .macro dec_prepare, rounds, rk, ignore | ||
| 43 | load_round_keys \rounds, \rk | ||
| 44 | .endm | ||
| 45 | |||
| 46 | .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 | ||
| 47 | aes\de \i0\().16b, \k\().16b | ||
| 48 | .ifnb \i1 | ||
| 49 | aes\de \i1\().16b, \k\().16b | ||
| 50 | .ifnb \i3 | ||
| 51 | aes\de \i2\().16b, \k\().16b | ||
| 52 | aes\de \i3\().16b, \k\().16b | ||
| 53 | .endif | ||
| 54 | .endif | ||
| 55 | aes\mc \i0\().16b, \i0\().16b | ||
| 56 | .ifnb \i1 | ||
| 57 | aes\mc \i1\().16b, \i1\().16b | ||
| 58 | .ifnb \i3 | ||
| 59 | aes\mc \i2\().16b, \i2\().16b | ||
| 60 | aes\mc \i3\().16b, \i3\().16b | ||
| 61 | .endif | ||
| 62 | .endif | ||
| 63 | .endm | ||
| 64 | |||
| 65 | /* up to 4 interleaved encryption rounds with the same round key */ | ||
| 66 | .macro round_Nx, enc, k, i0, i1, i2, i3 | ||
| 67 | .ifc \enc, e | ||
| 68 | do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 | ||
| 69 | .else | ||
| 70 | do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 | ||
| 71 | .endif | ||
| 72 | .endm | ||
| 73 | |||
| 74 | /* up to 4 interleaved final rounds */ | ||
| 75 | .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 | ||
| 76 | aes\de \i0\().16b, \k\().16b | ||
| 77 | .ifnb \i1 | ||
| 78 | aes\de \i1\().16b, \k\().16b | ||
| 79 | .ifnb \i3 | ||
| 80 | aes\de \i2\().16b, \k\().16b | ||
| 81 | aes\de \i3\().16b, \k\().16b | ||
| 82 | .endif | ||
| 83 | .endif | ||
| 84 | eor \i0\().16b, \i0\().16b, \k2\().16b | ||
| 85 | .ifnb \i1 | ||
| 86 | eor \i1\().16b, \i1\().16b, \k2\().16b | ||
| 87 | .ifnb \i3 | ||
| 88 | eor \i2\().16b, \i2\().16b, \k2\().16b | ||
| 89 | eor \i3\().16b, \i3\().16b, \k2\().16b | ||
| 90 | .endif | ||
| 91 | .endif | ||
| 92 | .endm | ||
| 93 | |||
| 94 | /* up to 4 interleaved blocks */ | ||
| 95 | .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 | ||
| 96 | cmp \rounds, #12 | ||
| 97 | blo 2222f /* 128 bits */ | ||
| 98 | beq 1111f /* 192 bits */ | ||
| 99 | round_Nx \enc, v17, \i0, \i1, \i2, \i3 | ||
| 100 | round_Nx \enc, v18, \i0, \i1, \i2, \i3 | ||
| 101 | 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 | ||
| 102 | round_Nx \enc, v20, \i0, \i1, \i2, \i3 | ||
| 103 | 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 | ||
| 104 | round_Nx \enc, \key, \i0, \i1, \i2, \i3 | ||
| 105 | .endr | ||
| 106 | fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 | ||
| 107 | .endm | ||
| 108 | |||
| 109 | .macro encrypt_block, in, rounds, t0, t1, t2 | ||
| 110 | do_block_Nx e, \rounds, \in | ||
| 111 | .endm | ||
| 112 | |||
| 113 | .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
| 114 | do_block_Nx e, \rounds, \i0, \i1 | ||
| 115 | .endm | ||
| 116 | |||
| 117 | .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
| 118 | do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 | ||
| 119 | .endm | ||
| 120 | |||
| 121 | .macro decrypt_block, in, rounds, t0, t1, t2 | ||
| 122 | do_block_Nx d, \rounds, \in | ||
| 123 | .endm | ||
| 124 | |||
| 125 | .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
| 126 | do_block_Nx d, \rounds, \i0, \i1 | ||
| 127 | .endm | ||
| 128 | |||
| 129 | .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
| 130 | do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 | ||
| 131 | .endm | ||
| 132 | |||
| 133 | #include "aes-modes.S" | ||
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c new file mode 100644 index 000000000000..60f2f4c12256 --- /dev/null +++ b/arch/arm64/crypto/aes-glue.c | |||
| @@ -0,0 +1,446 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <asm/neon.h> | ||
| 12 | #include <asm/hwcap.h> | ||
| 13 | #include <crypto/aes.h> | ||
| 14 | #include <crypto/ablk_helper.h> | ||
| 15 | #include <crypto/algapi.h> | ||
| 16 | #include <linux/module.h> | ||
| 17 | #include <linux/cpufeature.h> | ||
| 18 | |||
| 19 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
| 20 | #define MODE "ce" | ||
| 21 | #define PRIO 300 | ||
| 22 | #define aes_ecb_encrypt ce_aes_ecb_encrypt | ||
| 23 | #define aes_ecb_decrypt ce_aes_ecb_decrypt | ||
| 24 | #define aes_cbc_encrypt ce_aes_cbc_encrypt | ||
| 25 | #define aes_cbc_decrypt ce_aes_cbc_decrypt | ||
| 26 | #define aes_ctr_encrypt ce_aes_ctr_encrypt | ||
| 27 | #define aes_xts_encrypt ce_aes_xts_encrypt | ||
| 28 | #define aes_xts_decrypt ce_aes_xts_decrypt | ||
| 29 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); | ||
| 30 | #else | ||
| 31 | #define MODE "neon" | ||
| 32 | #define PRIO 200 | ||
| 33 | #define aes_ecb_encrypt neon_aes_ecb_encrypt | ||
| 34 | #define aes_ecb_decrypt neon_aes_ecb_decrypt | ||
| 35 | #define aes_cbc_encrypt neon_aes_cbc_encrypt | ||
| 36 | #define aes_cbc_decrypt neon_aes_cbc_decrypt | ||
| 37 | #define aes_ctr_encrypt neon_aes_ctr_encrypt | ||
| 38 | #define aes_xts_encrypt neon_aes_xts_encrypt | ||
| 39 | #define aes_xts_decrypt neon_aes_xts_decrypt | ||
| 40 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); | ||
| 41 | MODULE_ALIAS("ecb(aes)"); | ||
| 42 | MODULE_ALIAS("cbc(aes)"); | ||
| 43 | MODULE_ALIAS("ctr(aes)"); | ||
| 44 | MODULE_ALIAS("xts(aes)"); | ||
| 45 | #endif | ||
| 46 | |||
| 47 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
| 48 | MODULE_LICENSE("GPL v2"); | ||
| 49 | |||
| 50 | /* defined in aes-modes.S */ | ||
| 51 | asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
| 52 | int rounds, int blocks, int first); | ||
| 53 | asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
| 54 | int rounds, int blocks, int first); | ||
| 55 | |||
| 56 | asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
| 57 | int rounds, int blocks, u8 iv[], int first); | ||
| 58 | asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
| 59 | int rounds, int blocks, u8 iv[], int first); | ||
| 60 | |||
| 61 | asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
| 62 | int rounds, int blocks, u8 ctr[], int first); | ||
| 63 | |||
| 64 | asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
| 65 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
| 66 | int first); | ||
| 67 | asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
| 68 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
| 69 | int first); | ||
| 70 | |||
| 71 | struct crypto_aes_xts_ctx { | ||
| 72 | struct crypto_aes_ctx key1; | ||
| 73 | struct crypto_aes_ctx __aligned(8) key2; | ||
| 74 | }; | ||
| 75 | |||
| 76 | static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
| 77 | unsigned int key_len) | ||
| 78 | { | ||
| 79 | struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); | ||
| 80 | int ret; | ||
| 81 | |||
| 82 | ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); | ||
| 83 | if (!ret) | ||
| 84 | ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], | ||
| 85 | key_len / 2); | ||
| 86 | if (!ret) | ||
| 87 | return 0; | ||
| 88 | |||
| 89 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
| 90 | return -EINVAL; | ||
| 91 | } | ||
| 92 | |||
| 93 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 94 | struct scatterlist *src, unsigned int nbytes) | ||
| 95 | { | ||
| 96 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 97 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
| 98 | struct blkcipher_walk walk; | ||
| 99 | unsigned int blocks; | ||
| 100 | |||
| 101 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 102 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 103 | err = blkcipher_walk_virt(desc, &walk); | ||
| 104 | |||
| 105 | kernel_neon_begin(); | ||
| 106 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
| 107 | aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 108 | (u8 *)ctx->key_enc, rounds, blocks, first); | ||
| 109 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 110 | } | ||
| 111 | kernel_neon_end(); | ||
| 112 | return err; | ||
| 113 | } | ||
| 114 | |||
| 115 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 116 | struct scatterlist *src, unsigned int nbytes) | ||
| 117 | { | ||
| 118 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 119 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
| 120 | struct blkcipher_walk walk; | ||
| 121 | unsigned int blocks; | ||
| 122 | |||
| 123 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 124 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 125 | err = blkcipher_walk_virt(desc, &walk); | ||
| 126 | |||
| 127 | kernel_neon_begin(); | ||
| 128 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
| 129 | aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 130 | (u8 *)ctx->key_dec, rounds, blocks, first); | ||
| 131 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 132 | } | ||
| 133 | kernel_neon_end(); | ||
| 134 | return err; | ||
| 135 | } | ||
| 136 | |||
| 137 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 138 | struct scatterlist *src, unsigned int nbytes) | ||
| 139 | { | ||
| 140 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 141 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
| 142 | struct blkcipher_walk walk; | ||
| 143 | unsigned int blocks; | ||
| 144 | |||
| 145 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 146 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 147 | err = blkcipher_walk_virt(desc, &walk); | ||
| 148 | |||
| 149 | kernel_neon_begin(); | ||
| 150 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
| 151 | aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 152 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
| 153 | first); | ||
| 154 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 155 | } | ||
| 156 | kernel_neon_end(); | ||
| 157 | return err; | ||
| 158 | } | ||
| 159 | |||
| 160 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 161 | struct scatterlist *src, unsigned int nbytes) | ||
| 162 | { | ||
| 163 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 164 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
| 165 | struct blkcipher_walk walk; | ||
| 166 | unsigned int blocks; | ||
| 167 | |||
| 168 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 169 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 170 | err = blkcipher_walk_virt(desc, &walk); | ||
| 171 | |||
| 172 | kernel_neon_begin(); | ||
| 173 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
| 174 | aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 175 | (u8 *)ctx->key_dec, rounds, blocks, walk.iv, | ||
| 176 | first); | ||
| 177 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 178 | } | ||
| 179 | kernel_neon_end(); | ||
| 180 | return err; | ||
| 181 | } | ||
| 182 | |||
| 183 | static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 184 | struct scatterlist *src, unsigned int nbytes) | ||
| 185 | { | ||
| 186 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 187 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
| 188 | struct blkcipher_walk walk; | ||
| 189 | int blocks; | ||
| 190 | |||
| 191 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 192 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 193 | err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); | ||
| 194 | |||
| 195 | first = 1; | ||
| 196 | kernel_neon_begin(); | ||
| 197 | while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { | ||
| 198 | aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 199 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
| 200 | first); | ||
| 201 | first = 0; | ||
| 202 | nbytes -= blocks * AES_BLOCK_SIZE; | ||
| 203 | if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) | ||
| 204 | break; | ||
| 205 | err = blkcipher_walk_done(desc, &walk, | ||
| 206 | walk.nbytes % AES_BLOCK_SIZE); | ||
| 207 | } | ||
| 208 | if (nbytes) { | ||
| 209 | u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; | ||
| 210 | u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; | ||
| 211 | u8 __aligned(8) tail[AES_BLOCK_SIZE]; | ||
| 212 | |||
| 213 | /* | ||
| 214 | * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need | ||
| 215 | * to tell aes_ctr_encrypt() to only read half a block. | ||
| 216 | */ | ||
| 217 | blocks = (nbytes <= 8) ? -1 : 1; | ||
| 218 | |||
| 219 | aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, | ||
| 220 | blocks, walk.iv, first); | ||
| 221 | memcpy(tdst, tail, nbytes); | ||
| 222 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 223 | } | ||
| 224 | kernel_neon_end(); | ||
| 225 | |||
| 226 | return err; | ||
| 227 | } | ||
| 228 | |||
| 229 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 230 | struct scatterlist *src, unsigned int nbytes) | ||
| 231 | { | ||
| 232 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 233 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
| 234 | struct blkcipher_walk walk; | ||
| 235 | unsigned int blocks; | ||
| 236 | |||
| 237 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 238 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 239 | err = blkcipher_walk_virt(desc, &walk); | ||
| 240 | |||
| 241 | kernel_neon_begin(); | ||
| 242 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
| 243 | aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 244 | (u8 *)ctx->key1.key_enc, rounds, blocks, | ||
| 245 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
| 246 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 247 | } | ||
| 248 | kernel_neon_end(); | ||
| 249 | |||
| 250 | return err; | ||
| 251 | } | ||
| 252 | |||
| 253 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
| 254 | struct scatterlist *src, unsigned int nbytes) | ||
| 255 | { | ||
| 256 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
| 257 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
| 258 | struct blkcipher_walk walk; | ||
| 259 | unsigned int blocks; | ||
| 260 | |||
| 261 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 262 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
| 263 | err = blkcipher_walk_virt(desc, &walk); | ||
| 264 | |||
| 265 | kernel_neon_begin(); | ||
| 266 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
| 267 | aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
| 268 | (u8 *)ctx->key1.key_dec, rounds, blocks, | ||
| 269 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
| 270 | err = blkcipher_walk_done(desc, &walk, 0); | ||
| 271 | } | ||
| 272 | kernel_neon_end(); | ||
| 273 | |||
| 274 | return err; | ||
| 275 | } | ||
| 276 | |||
| 277 | static struct crypto_alg aes_algs[] = { { | ||
| 278 | .cra_name = "__ecb-aes-" MODE, | ||
| 279 | .cra_driver_name = "__driver-ecb-aes-" MODE, | ||
| 280 | .cra_priority = 0, | ||
| 281 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
| 282 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 283 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
| 284 | .cra_alignmask = 7, | ||
| 285 | .cra_type = &crypto_blkcipher_type, | ||
| 286 | .cra_module = THIS_MODULE, | ||
| 287 | .cra_blkcipher = { | ||
| 288 | .min_keysize = AES_MIN_KEY_SIZE, | ||
| 289 | .max_keysize = AES_MAX_KEY_SIZE, | ||
| 290 | .ivsize = AES_BLOCK_SIZE, | ||
| 291 | .setkey = crypto_aes_set_key, | ||
| 292 | .encrypt = ecb_encrypt, | ||
| 293 | .decrypt = ecb_decrypt, | ||
| 294 | }, | ||
| 295 | }, { | ||
| 296 | .cra_name = "__cbc-aes-" MODE, | ||
| 297 | .cra_driver_name = "__driver-cbc-aes-" MODE, | ||
| 298 | .cra_priority = 0, | ||
| 299 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
| 300 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 301 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
| 302 | .cra_alignmask = 7, | ||
| 303 | .cra_type = &crypto_blkcipher_type, | ||
| 304 | .cra_module = THIS_MODULE, | ||
| 305 | .cra_blkcipher = { | ||
| 306 | .min_keysize = AES_MIN_KEY_SIZE, | ||
| 307 | .max_keysize = AES_MAX_KEY_SIZE, | ||
| 308 | .ivsize = AES_BLOCK_SIZE, | ||
| 309 | .setkey = crypto_aes_set_key, | ||
| 310 | .encrypt = cbc_encrypt, | ||
| 311 | .decrypt = cbc_decrypt, | ||
| 312 | }, | ||
| 313 | }, { | ||
| 314 | .cra_name = "__ctr-aes-" MODE, | ||
| 315 | .cra_driver_name = "__driver-ctr-aes-" MODE, | ||
| 316 | .cra_priority = 0, | ||
| 317 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
| 318 | .cra_blocksize = 1, | ||
| 319 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
| 320 | .cra_alignmask = 7, | ||
| 321 | .cra_type = &crypto_blkcipher_type, | ||
| 322 | .cra_module = THIS_MODULE, | ||
| 323 | .cra_blkcipher = { | ||
| 324 | .min_keysize = AES_MIN_KEY_SIZE, | ||
| 325 | .max_keysize = AES_MAX_KEY_SIZE, | ||
| 326 | .ivsize = AES_BLOCK_SIZE, | ||
| 327 | .setkey = crypto_aes_set_key, | ||
| 328 | .encrypt = ctr_encrypt, | ||
| 329 | .decrypt = ctr_encrypt, | ||
| 330 | }, | ||
| 331 | }, { | ||
| 332 | .cra_name = "__xts-aes-" MODE, | ||
| 333 | .cra_driver_name = "__driver-xts-aes-" MODE, | ||
| 334 | .cra_priority = 0, | ||
| 335 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
| 336 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 337 | .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), | ||
| 338 | .cra_alignmask = 7, | ||
| 339 | .cra_type = &crypto_blkcipher_type, | ||
| 340 | .cra_module = THIS_MODULE, | ||
| 341 | .cra_blkcipher = { | ||
| 342 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
| 343 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
| 344 | .ivsize = AES_BLOCK_SIZE, | ||
| 345 | .setkey = xts_set_key, | ||
| 346 | .encrypt = xts_encrypt, | ||
| 347 | .decrypt = xts_decrypt, | ||
| 348 | }, | ||
| 349 | }, { | ||
| 350 | .cra_name = "ecb(aes)", | ||
| 351 | .cra_driver_name = "ecb-aes-" MODE, | ||
| 352 | .cra_priority = PRIO, | ||
| 353 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
| 354 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 355 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
| 356 | .cra_alignmask = 7, | ||
| 357 | .cra_type = &crypto_ablkcipher_type, | ||
| 358 | .cra_module = THIS_MODULE, | ||
| 359 | .cra_init = ablk_init, | ||
| 360 | .cra_exit = ablk_exit, | ||
| 361 | .cra_ablkcipher = { | ||
| 362 | .min_keysize = AES_MIN_KEY_SIZE, | ||
| 363 | .max_keysize = AES_MAX_KEY_SIZE, | ||
| 364 | .ivsize = AES_BLOCK_SIZE, | ||
| 365 | .setkey = ablk_set_key, | ||
| 366 | .encrypt = ablk_encrypt, | ||
| 367 | .decrypt = ablk_decrypt, | ||
| 368 | } | ||
| 369 | }, { | ||
| 370 | .cra_name = "cbc(aes)", | ||
| 371 | .cra_driver_name = "cbc-aes-" MODE, | ||
| 372 | .cra_priority = PRIO, | ||
| 373 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
| 374 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 375 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
| 376 | .cra_alignmask = 7, | ||
| 377 | .cra_type = &crypto_ablkcipher_type, | ||
| 378 | .cra_module = THIS_MODULE, | ||
| 379 | .cra_init = ablk_init, | ||
| 380 | .cra_exit = ablk_exit, | ||
| 381 | .cra_ablkcipher = { | ||
| 382 | .min_keysize = AES_MIN_KEY_SIZE, | ||
| 383 | .max_keysize = AES_MAX_KEY_SIZE, | ||
| 384 | .ivsize = AES_BLOCK_SIZE, | ||
| 385 | .setkey = ablk_set_key, | ||
| 386 | .encrypt = ablk_encrypt, | ||
| 387 | .decrypt = ablk_decrypt, | ||
| 388 | } | ||
| 389 | }, { | ||
| 390 | .cra_name = "ctr(aes)", | ||
| 391 | .cra_driver_name = "ctr-aes-" MODE, | ||
| 392 | .cra_priority = PRIO, | ||
| 393 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
| 394 | .cra_blocksize = 1, | ||
| 395 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
| 396 | .cra_alignmask = 7, | ||
| 397 | .cra_type = &crypto_ablkcipher_type, | ||
| 398 | .cra_module = THIS_MODULE, | ||
| 399 | .cra_init = ablk_init, | ||
| 400 | .cra_exit = ablk_exit, | ||
| 401 | .cra_ablkcipher = { | ||
| 402 | .min_keysize = AES_MIN_KEY_SIZE, | ||
| 403 | .max_keysize = AES_MAX_KEY_SIZE, | ||
| 404 | .ivsize = AES_BLOCK_SIZE, | ||
| 405 | .setkey = ablk_set_key, | ||
| 406 | .encrypt = ablk_encrypt, | ||
| 407 | .decrypt = ablk_decrypt, | ||
| 408 | } | ||
| 409 | }, { | ||
| 410 | .cra_name = "xts(aes)", | ||
| 411 | .cra_driver_name = "xts-aes-" MODE, | ||
| 412 | .cra_priority = PRIO, | ||
| 413 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
| 414 | .cra_blocksize = AES_BLOCK_SIZE, | ||
| 415 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
| 416 | .cra_alignmask = 7, | ||
| 417 | .cra_type = &crypto_ablkcipher_type, | ||
| 418 | .cra_module = THIS_MODULE, | ||
| 419 | .cra_init = ablk_init, | ||
| 420 | .cra_exit = ablk_exit, | ||
| 421 | .cra_ablkcipher = { | ||
| 422 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
| 423 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
| 424 | .ivsize = AES_BLOCK_SIZE, | ||
| 425 | .setkey = ablk_set_key, | ||
| 426 | .encrypt = ablk_encrypt, | ||
| 427 | .decrypt = ablk_decrypt, | ||
| 428 | } | ||
| 429 | } }; | ||
| 430 | |||
| 431 | static int __init aes_init(void) | ||
| 432 | { | ||
| 433 | return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
| 434 | } | ||
| 435 | |||
| 436 | static void __exit aes_exit(void) | ||
| 437 | { | ||
| 438 | crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
| 439 | } | ||
| 440 | |||
| 441 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
| 442 | module_cpu_feature_match(AES, aes_init); | ||
| 443 | #else | ||
| 444 | module_init(aes_init); | ||
| 445 | #endif | ||
| 446 | module_exit(aes_exit); | ||
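
ctr_encrypt() above handles a trailing partial block by encrypting one more counter block into a stack buffer and copying out only the remaining bytes, which is why CTR mode needs no padding. A standalone sketch of that tail handling (hypothetical `aes_encrypt_block()` helper, not code from the patch):

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical single-block AES primitive standing in for the NEON code. */
void aes_encrypt_block(uint8_t out[16], const uint8_t in[16],
		       const uint8_t *round_keys, int rounds);

/* Process a final partial block of 'tail' bytes in CTR mode (encryption and
 * decryption are identical): generate one keystream block from the counter
 * and XOR only the bytes that are actually present. */
static void ctr_partial_tail(uint8_t *dst, const uint8_t *src, size_t tail,
			     const uint8_t ctr[16], const uint8_t *rk,
			     int rounds)
{
	uint8_t ks[16];

	aes_encrypt_block(ks, ctr, rk, rounds);	/* keystream block */
	for (size_t i = 0; i < tail; i++)
		dst[i] = src[i] ^ ks[i];	/* only 'tail' bytes used */
}
```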
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 000000000000..f6e372c528eb --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S | |||
| @@ -0,0 +1,532 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | /* included by aes-ce.S and aes-neon.S */ | ||
| 12 | |||
| 13 | .text | ||
| 14 | .align 4 | ||
| 15 | |||
| 16 | /* | ||
| 17 | * There are several ways to instantiate this code: | ||
| 18 | * - no interleave, all inline | ||
| 19 | * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) | ||
| 20 | * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) | ||
| 21 | * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) | ||
| 22 | * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) | ||
| 23 | * | ||
| 24 | * Macros imported by this code: | ||
| 25 | * - enc_prepare - setup NEON registers for encryption | ||
| 26 | * - dec_prepare - setup NEON registers for decryption | ||
| 27 | * - enc_switch_key - change to new key after having prepared for encryption | ||
| 28 | * - encrypt_block - encrypt a single block | ||
| 29 | * - decrypt_block - decrypt a single block | ||
| 30 | * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
| 31 | * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
| 32 | * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
| 33 | * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
| 34 | */ | ||
| 35 | |||
| 36 | #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) | ||
| 37 | #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp | ||
| 38 | #define FRAME_POP ldp x29, x30, [sp],#16 | ||
| 39 | |||
| 40 | #if INTERLEAVE == 2 | ||
| 41 | |||
| 42 | aes_encrypt_block2x: | ||
| 43 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
| 44 | ret | ||
| 45 | ENDPROC(aes_encrypt_block2x) | ||
| 46 | |||
| 47 | aes_decrypt_block2x: | ||
| 48 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
| 49 | ret | ||
| 50 | ENDPROC(aes_decrypt_block2x) | ||
| 51 | |||
| 52 | #elif INTERLEAVE == 4 | ||
| 53 | |||
| 54 | aes_encrypt_block4x: | ||
| 55 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
| 56 | ret | ||
| 57 | ENDPROC(aes_encrypt_block4x) | ||
| 58 | |||
| 59 | aes_decrypt_block4x: | ||
| 60 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
| 61 | ret | ||
| 62 | ENDPROC(aes_decrypt_block4x) | ||
| 63 | |||
| 64 | #else | ||
| 65 | #error INTERLEAVE should equal 2 or 4 | ||
| 66 | #endif | ||
| 67 | |||
| 68 | .macro do_encrypt_block2x | ||
| 69 | bl aes_encrypt_block2x | ||
| 70 | .endm | ||
| 71 | |||
| 72 | .macro do_decrypt_block2x | ||
| 73 | bl aes_decrypt_block2x | ||
| 74 | .endm | ||
| 75 | |||
| 76 | .macro do_encrypt_block4x | ||
| 77 | bl aes_encrypt_block4x | ||
| 78 | .endm | ||
| 79 | |||
| 80 | .macro do_decrypt_block4x | ||
| 81 | bl aes_decrypt_block4x | ||
| 82 | .endm | ||
| 83 | |||
| 84 | #else | ||
| 85 | #define FRAME_PUSH | ||
| 86 | #define FRAME_POP | ||
| 87 | |||
| 88 | .macro do_encrypt_block2x | ||
| 89 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
| 90 | .endm | ||
| 91 | |||
| 92 | .macro do_decrypt_block2x | ||
| 93 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
| 94 | .endm | ||
| 95 | |||
| 96 | .macro do_encrypt_block4x | ||
| 97 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
| 98 | .endm | ||
| 99 | |||
| 100 | .macro do_decrypt_block4x | ||
| 101 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
| 102 | .endm | ||
| 103 | |||
| 104 | #endif | ||
| 105 | |||
| 106 | /* | ||
| 107 | * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
| 108 | * int blocks, int first) | ||
| 109 | * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
| 110 | * int blocks, int first) | ||
| 111 | */ | ||
| 112 | |||
| 113 | AES_ENTRY(aes_ecb_encrypt) | ||
| 114 | FRAME_PUSH | ||
| 115 | cbz w5, .LecbencloopNx | ||
| 116 | |||
| 117 | enc_prepare w3, x2, x5 | ||
| 118 | |||
| 119 | .LecbencloopNx: | ||
| 120 | #if INTERLEAVE >= 2 | ||
| 121 | subs w4, w4, #INTERLEAVE | ||
| 122 | bmi .Lecbenc1x | ||
| 123 | #if INTERLEAVE == 2 | ||
| 124 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
| 125 | do_encrypt_block2x | ||
| 126 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
| 127 | #else | ||
| 128 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
| 129 | do_encrypt_block4x | ||
| 130 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
| 131 | #endif | ||
| 132 | b .LecbencloopNx | ||
| 133 | .Lecbenc1x: | ||
| 134 | adds w4, w4, #INTERLEAVE | ||
| 135 | beq .Lecbencout | ||
| 136 | #endif | ||
| 137 | .Lecbencloop: | ||
| 138 | ld1 {v0.16b}, [x1], #16 /* get next pt block */ | ||
| 139 | encrypt_block v0, w3, x2, x5, w6 | ||
| 140 | st1 {v0.16b}, [x0], #16 | ||
| 141 | subs w4, w4, #1 | ||
| 142 | bne .Lecbencloop | ||
| 143 | .Lecbencout: | ||
| 144 | FRAME_POP | ||
| 145 | ret | ||
| 146 | AES_ENDPROC(aes_ecb_encrypt) | ||
| 147 | |||
| 148 | |||
| 149 | AES_ENTRY(aes_ecb_decrypt) | ||
| 150 | FRAME_PUSH | ||
| 151 | cbz w5, .LecbdecloopNx | ||
| 152 | |||
| 153 | dec_prepare w3, x2, x5 | ||
| 154 | |||
| 155 | .LecbdecloopNx: | ||
| 156 | #if INTERLEAVE >= 2 | ||
| 157 | subs w4, w4, #INTERLEAVE | ||
| 158 | bmi .Lecbdec1x | ||
| 159 | #if INTERLEAVE == 2 | ||
| 160 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
| 161 | do_decrypt_block2x | ||
| 162 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
| 163 | #else | ||
| 164 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
| 165 | do_decrypt_block4x | ||
| 166 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
| 167 | #endif | ||
| 168 | b .LecbdecloopNx | ||
| 169 | .Lecbdec1x: | ||
| 170 | adds w4, w4, #INTERLEAVE | ||
| 171 | beq .Lecbdecout | ||
| 172 | #endif | ||
| 173 | .Lecbdecloop: | ||
| 174 | ld1 {v0.16b}, [x1], #16 /* get next ct block */ | ||
| 175 | decrypt_block v0, w3, x2, x5, w6 | ||
| 176 | st1 {v0.16b}, [x0], #16 | ||
| 177 | subs w4, w4, #1 | ||
| 178 | bne .Lecbdecloop | ||
| 179 | .Lecbdecout: | ||
| 180 | FRAME_POP | ||
| 181 | ret | ||
| 182 | AES_ENDPROC(aes_ecb_decrypt) | ||
| 183 | |||
| 184 | |||
| 185 | /* | ||
| 186 | * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
| 187 | * int blocks, u8 iv[], int first) | ||
| 188 | * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
| 189 | * int blocks, u8 iv[], int first) | ||
| 190 | */ | ||
| 191 | |||
| 192 | AES_ENTRY(aes_cbc_encrypt) | ||
| 193 | cbz w6, .Lcbcencloop | ||
| 194 | |||
| 195 | ld1 {v0.16b}, [x5] /* get iv */ | ||
| 196 | enc_prepare w3, x2, x5 | ||
| 197 | |||
| 198 | .Lcbcencloop: | ||
| 199 | ld1 {v1.16b}, [x1], #16 /* get next pt block */ | ||
| 200 | eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ | ||
| 201 | encrypt_block v0, w3, x2, x5, w6 | ||
| 202 | st1 {v0.16b}, [x0], #16 | ||
| 203 | subs w4, w4, #1 | ||
| 204 | bne .Lcbcencloop | ||
| 205 | ret | ||
| 206 | AES_ENDPROC(aes_cbc_encrypt) | ||
| 207 | |||
| 208 | |||
| 209 | AES_ENTRY(aes_cbc_decrypt) | ||
| 210 | FRAME_PUSH | ||
| 211 | cbz w6, .LcbcdecloopNx | ||
| 212 | |||
| 213 | ld1 {v7.16b}, [x5] /* get iv */ | ||
| 214 | dec_prepare w3, x2, x5 | ||
| 215 | |||
| 216 | .LcbcdecloopNx: | ||
| 217 | #if INTERLEAVE >= 2 | ||
| 218 | subs w4, w4, #INTERLEAVE | ||
| 219 | bmi .Lcbcdec1x | ||
| 220 | #if INTERLEAVE == 2 | ||
| 221 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
| 222 | mov v2.16b, v0.16b | ||
| 223 | mov v3.16b, v1.16b | ||
| 224 | do_decrypt_block2x | ||
| 225 | eor v0.16b, v0.16b, v7.16b | ||
| 226 | eor v1.16b, v1.16b, v2.16b | ||
| 227 | mov v7.16b, v3.16b | ||
| 228 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
| 229 | #else | ||
| 230 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
| 231 | mov v4.16b, v0.16b | ||
| 232 | mov v5.16b, v1.16b | ||
| 233 | mov v6.16b, v2.16b | ||
| 234 | do_decrypt_block4x | ||
| 235 | sub x1, x1, #16 | ||
| 236 | eor v0.16b, v0.16b, v7.16b | ||
| 237 | eor v1.16b, v1.16b, v4.16b | ||
| 238 | ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ | ||
| 239 | eor v2.16b, v2.16b, v5.16b | ||
| 240 | eor v3.16b, v3.16b, v6.16b | ||
| 241 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
| 242 | #endif | ||
| 243 | b .LcbcdecloopNx | ||
| 244 | .Lcbcdec1x: | ||
| 245 | adds w4, w4, #INTERLEAVE | ||
| 246 | beq .Lcbcdecout | ||
| 247 | #endif | ||
| 248 | .Lcbcdecloop: | ||
| 249 | ld1 {v1.16b}, [x1], #16 /* get next ct block */ | ||
| 250 | mov v0.16b, v1.16b /* ...and copy to v0 */ | ||
| 251 | decrypt_block v0, w3, x2, x5, w6 | ||
| 252 | eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ | ||
| 253 | mov v7.16b, v1.16b /* ct is next iv */ | ||
| 254 | st1 {v0.16b}, [x0], #16 | ||
| 255 | subs w4, w4, #1 | ||
| 256 | bne .Lcbcdecloop | ||
| 257 | .Lcbcdecout: | ||
| 258 | FRAME_POP | ||
| 259 | ret | ||
| 260 | AES_ENDPROC(aes_cbc_decrypt) | ||
| 261 | |||
| 262 | |||
| 263 | /* | ||
| 264 | * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
| 265 | * int blocks, u8 ctr[], int first) | ||
| 266 | */ | ||
| 267 | |||
| 268 | AES_ENTRY(aes_ctr_encrypt) | ||
| 269 | FRAME_PUSH | ||
| 270 | cbnz w6, .Lctrfirst /* 1st time around? */ | ||
| 271 | umov x5, v4.d[1] /* keep swabbed ctr in reg */ | ||
| 272 | rev x5, x5 | ||
| 273 | #if INTERLEAVE >= 2 | ||
| 274 | cmn w5, w4 /* 32 bit overflow? */ | ||
| 275 | bcs .Lctrinc | ||
| 276 | add x5, x5, #1 /* increment BE ctr */ | ||
| 277 | b .LctrincNx | ||
| 278 | #else | ||
| 279 | b .Lctrinc | ||
| 280 | #endif | ||
| 281 | .Lctrfirst: | ||
| 282 | enc_prepare w3, x2, x6 | ||
| 283 | ld1 {v4.16b}, [x5] | ||
| 284 | umov x5, v4.d[1] /* keep swabbed ctr in reg */ | ||
| 285 | rev x5, x5 | ||
| 286 | #if INTERLEAVE >= 2 | ||
| 287 | cmn w5, w4 /* 32 bit overflow? */ | ||
| 288 | bcs .Lctrloop | ||
| 289 | .LctrloopNx: | ||
| 290 | subs w4, w4, #INTERLEAVE | ||
| 291 | bmi .Lctr1x | ||
| 292 | #if INTERLEAVE == 2 | ||
| 293 | mov v0.8b, v4.8b | ||
| 294 | mov v1.8b, v4.8b | ||
| 295 | rev x7, x5 | ||
| 296 | add x5, x5, #1 | ||
| 297 | ins v0.d[1], x7 | ||
| 298 | rev x7, x5 | ||
| 299 | add x5, x5, #1 | ||
| 300 | ins v1.d[1], x7 | ||
| 301 | ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ | ||
| 302 | do_encrypt_block2x | ||
| 303 | eor v0.16b, v0.16b, v2.16b | ||
| 304 | eor v1.16b, v1.16b, v3.16b | ||
| 305 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
| 306 | #else | ||
| 307 | ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ | ||
| 308 | dup v7.4s, w5 | ||
| 309 | mov v0.16b, v4.16b | ||
| 310 | add v7.4s, v7.4s, v8.4s | ||
| 311 | mov v1.16b, v4.16b | ||
| 312 | rev32 v8.16b, v7.16b | ||
| 313 | mov v2.16b, v4.16b | ||
| 314 | mov v3.16b, v4.16b | ||
| 315 | mov v1.s[3], v8.s[0] | ||
| 316 | mov v2.s[3], v8.s[1] | ||
| 317 | mov v3.s[3], v8.s[2] | ||
| 318 | ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ | ||
| 319 | do_encrypt_block4x | ||
| 320 | eor v0.16b, v5.16b, v0.16b | ||
| 321 | ld1 {v5.16b}, [x1], #16 /* get 1 input block */ | ||
| 322 | eor v1.16b, v6.16b, v1.16b | ||
| 323 | eor v2.16b, v7.16b, v2.16b | ||
| 324 | eor v3.16b, v5.16b, v3.16b | ||
| 325 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
| 326 | add x5, x5, #INTERLEAVE | ||
| 327 | #endif | ||
| 328 | cbz w4, .LctroutNx | ||
| 329 | .LctrincNx: | ||
| 330 | rev x7, x5 | ||
| 331 | ins v4.d[1], x7 | ||
| 332 | b .LctrloopNx | ||
| 333 | .LctroutNx: | ||
| 334 | sub x5, x5, #1 | ||
| 335 | rev x7, x5 | ||
| 336 | ins v4.d[1], x7 | ||
| 337 | b .Lctrout | ||
| 338 | .Lctr1x: | ||
| 339 | adds w4, w4, #INTERLEAVE | ||
| 340 | beq .Lctrout | ||
| 341 | #endif | ||
| 342 | .Lctrloop: | ||
| 343 | mov v0.16b, v4.16b | ||
| 344 | encrypt_block v0, w3, x2, x6, w7 | ||
| 345 | subs w4, w4, #1 | ||
| 346 | bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ | ||
| 347 | ld1 {v3.16b}, [x1], #16 | ||
| 348 | eor v3.16b, v0.16b, v3.16b | ||
| 349 | st1 {v3.16b}, [x0], #16 | ||
| 350 | beq .Lctrout | ||
| 351 | .Lctrinc: | ||
| 352 | adds x5, x5, #1 /* increment BE ctr */ | ||
| 353 | rev x7, x5 | ||
| 354 | ins v4.d[1], x7 | ||
| 355 | bcc .Lctrloop /* no overflow? */ | ||
| 356 | umov x7, v4.d[0] /* load upper word of ctr */ | ||
| 357 | rev x7, x7 /* ... to handle the carry */ | ||
| 358 | add x7, x7, #1 | ||
| 359 | rev x7, x7 | ||
| 360 | ins v4.d[0], x7 | ||
| 361 | b .Lctrloop | ||
| 362 | .Lctrhalfblock: | ||
| 363 | ld1 {v3.8b}, [x1] | ||
| 364 | eor v3.8b, v0.8b, v3.8b | ||
| 365 | st1 {v3.8b}, [x0] | ||
| 366 | .Lctrout: | ||
| 367 | FRAME_POP | ||
| 368 | ret | ||
| 369 | AES_ENDPROC(aes_ctr_encrypt) | ||
| 370 | .ltorg | ||
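aes_ctr_encrypt keeps the low 64 bits of the counter byte-swapped ("swabbed") in x5 so it can be bumped with ordinary adds, and only touches the high half when the low half wraps (the cmn/bcs checks and the carry fix-up after .Lctrinc). A scalar sketch of the big-endian 128-bit increment this implements, assuming the usual 16-byte big-endian CTR block:

```c
#include <stdint.h>

/* Big-endian 128-bit counter increment: scalar equivalent of the
 * rev/adds/carry handling around .Lctrinc. Sketch only. */
static void ctr_inc_be128(uint8_t ctr[16])
{
	for (int i = 15; i >= 0; i--)
		if (++ctr[i] != 0)
			break;		/* stop once a byte did not wrap */
}
```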
| 371 | |||
| 372 | |||
| 373 | /* | ||
| 374 | * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
| 375 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
| 376 | * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
| 377 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
| 378 | */ | ||
| 379 | |||
| 380 | .macro next_tweak, out, in, const, tmp | ||
| 381 | sshr \tmp\().2d, \in\().2d, #63 | ||
| 382 | and \tmp\().16b, \tmp\().16b, \const\().16b | ||
| 383 | add \out\().2d, \in\().2d, \in\().2d | ||
| 384 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 | ||
| 385 | eor \out\().16b, \out\().16b, \tmp\().16b | ||
| 386 | .endm | ||
| 387 | |||
| 388 | .Lxts_mul_x: | ||
| 389 | .word 1, 0, 0x87, 0 | ||
| 390 | |||
| 391 | AES_ENTRY(aes_xts_encrypt) | ||
| 392 | FRAME_PUSH | ||
| 393 | cbz w7, .LxtsencloopNx | ||
| 394 | |||
| 395 | ld1 {v4.16b}, [x6] | ||
| 396 | enc_prepare w3, x5, x6 | ||
| 397 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
| 398 | enc_switch_key w3, x2, x6 | ||
| 399 | ldr q7, .Lxts_mul_x | ||
| 400 | b .LxtsencNx | ||
| 401 | |||
| 402 | .LxtsencloopNx: | ||
| 403 | ldr q7, .Lxts_mul_x | ||
| 404 | next_tweak v4, v4, v7, v8 | ||
| 405 | .LxtsencNx: | ||
| 406 | #if INTERLEAVE >= 2 | ||
| 407 | subs w4, w4, #INTERLEAVE | ||
| 408 | bmi .Lxtsenc1x | ||
| 409 | #if INTERLEAVE == 2 | ||
| 410 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
| 411 | next_tweak v5, v4, v7, v8 | ||
| 412 | eor v0.16b, v0.16b, v4.16b | ||
| 413 | eor v1.16b, v1.16b, v5.16b | ||
| 414 | do_encrypt_block2x | ||
| 415 | eor v0.16b, v0.16b, v4.16b | ||
| 416 | eor v1.16b, v1.16b, v5.16b | ||
| 417 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
| 418 | cbz w4, .LxtsencoutNx | ||
| 419 | next_tweak v4, v5, v7, v8 | ||
| 420 | b .LxtsencNx | ||
| 421 | .LxtsencoutNx: | ||
| 422 | mov v4.16b, v5.16b | ||
| 423 | b .Lxtsencout | ||
| 424 | #else | ||
| 425 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
| 426 | next_tweak v5, v4, v7, v8 | ||
| 427 | eor v0.16b, v0.16b, v4.16b | ||
| 428 | next_tweak v6, v5, v7, v8 | ||
| 429 | eor v1.16b, v1.16b, v5.16b | ||
| 430 | eor v2.16b, v2.16b, v6.16b | ||
| 431 | next_tweak v7, v6, v7, v8 | ||
| 432 | eor v3.16b, v3.16b, v7.16b | ||
| 433 | do_encrypt_block4x | ||
| 434 | eor v3.16b, v3.16b, v7.16b | ||
| 435 | eor v0.16b, v0.16b, v4.16b | ||
| 436 | eor v1.16b, v1.16b, v5.16b | ||
| 437 | eor v2.16b, v2.16b, v6.16b | ||
| 438 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
| 439 | mov v4.16b, v7.16b | ||
| 440 | cbz w4, .Lxtsencout | ||
| 441 | b .LxtsencloopNx | ||
| 442 | #endif | ||
| 443 | .Lxtsenc1x: | ||
| 444 | adds w4, w4, #INTERLEAVE | ||
| 445 | beq .Lxtsencout | ||
| 446 | #endif | ||
| 447 | .Lxtsencloop: | ||
| 448 | ld1 {v1.16b}, [x1], #16 | ||
| 449 | eor v0.16b, v1.16b, v4.16b | ||
| 450 | encrypt_block v0, w3, x2, x6, w7 | ||
| 451 | eor v0.16b, v0.16b, v4.16b | ||
| 452 | st1 {v0.16b}, [x0], #16 | ||
| 453 | subs w4, w4, #1 | ||
| 454 | beq .Lxtsencout | ||
| 455 | next_tweak v4, v4, v7, v8 | ||
| 456 | b .Lxtsencloop | ||
| 457 | .Lxtsencout: | ||
| 458 | FRAME_POP | ||
| 459 | ret | ||
| 460 | AES_ENDPROC(aes_xts_encrypt) | ||
| 461 | |||
| 462 | |||
| 463 | AES_ENTRY(aes_xts_decrypt) | ||
| 464 | FRAME_PUSH | ||
| 465 | cbz w7, .LxtsdecloopNx | ||
| 466 | |||
| 467 | ld1 {v4.16b}, [x6] | ||
| 468 | enc_prepare w3, x5, x6 | ||
| 469 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
| 470 | dec_prepare w3, x2, x6 | ||
| 471 | ldr q7, .Lxts_mul_x | ||
| 472 | b .LxtsdecNx | ||
| 473 | |||
| 474 | .LxtsdecloopNx: | ||
| 475 | ldr q7, .Lxts_mul_x | ||
| 476 | next_tweak v4, v4, v7, v8 | ||
| 477 | .LxtsdecNx: | ||
| 478 | #if INTERLEAVE >= 2 | ||
| 479 | subs w4, w4, #INTERLEAVE | ||
| 480 | bmi .Lxtsdec1x | ||
| 481 | #if INTERLEAVE == 2 | ||
| 482 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
| 483 | next_tweak v5, v4, v7, v8 | ||
| 484 | eor v0.16b, v0.16b, v4.16b | ||
| 485 | eor v1.16b, v1.16b, v5.16b | ||
| 486 | do_decrypt_block2x | ||
| 487 | eor v0.16b, v0.16b, v4.16b | ||
| 488 | eor v1.16b, v1.16b, v5.16b | ||
| 489 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
| 490 | cbz w4, .LxtsdecoutNx | ||
| 491 | next_tweak v4, v5, v7, v8 | ||
| 492 | b .LxtsdecNx | ||
| 493 | .LxtsdecoutNx: | ||
| 494 | mov v4.16b, v5.16b | ||
| 495 | b .Lxtsdecout | ||
| 496 | #else | ||
| 497 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
| 498 | next_tweak v5, v4, v7, v8 | ||
| 499 | eor v0.16b, v0.16b, v4.16b | ||
| 500 | next_tweak v6, v5, v7, v8 | ||
| 501 | eor v1.16b, v1.16b, v5.16b | ||
| 502 | eor v2.16b, v2.16b, v6.16b | ||
| 503 | next_tweak v7, v6, v7, v8 | ||
| 504 | eor v3.16b, v3.16b, v7.16b | ||
| 505 | do_decrypt_block4x | ||
| 506 | eor v3.16b, v3.16b, v7.16b | ||
| 507 | eor v0.16b, v0.16b, v4.16b | ||
| 508 | eor v1.16b, v1.16b, v5.16b | ||
| 509 | eor v2.16b, v2.16b, v6.16b | ||
| 510 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
| 511 | mov v4.16b, v7.16b | ||
| 512 | cbz w4, .Lxtsdecout | ||
| 513 | b .LxtsdecloopNx | ||
| 514 | #endif | ||
| 515 | .Lxtsdec1x: | ||
| 516 | adds w4, w4, #INTERLEAVE | ||
| 517 | beq .Lxtsdecout | ||
| 518 | #endif | ||
| 519 | .Lxtsdecloop: | ||
| 520 | ld1 {v1.16b}, [x1], #16 | ||
| 521 | eor v0.16b, v1.16b, v4.16b | ||
| 522 | decrypt_block v0, w3, x2, x6, w7 | ||
| 523 | eor v0.16b, v0.16b, v4.16b | ||
| 524 | st1 {v0.16b}, [x0], #16 | ||
| 525 | subs w4, w4, #1 | ||
| 526 | beq .Lxtsdecout | ||
| 527 | next_tweak v4, v4, v7, v8 | ||
| 528 | b .Lxtsdecloop | ||
| 529 | .Lxtsdecout: | ||
| 530 | FRAME_POP | ||
| 531 | ret | ||
| 532 | AES_ENDPROC(aes_xts_decrypt) | ||
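The next_tweak macro advances the XTS tweak by multiplying it by x in GF(2^128), using the constant block at .Lxts_mul_x (the 0x87 reduction term) to fold the carry back in. A plain-C sketch of the same operation, assuming the tweak is held as two little-endian 64-bit lanes exactly as in the vector code:

```c
#include <stdint.h>
#include <string.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128), reducing modulo
 * x^128 + x^7 + x^2 + x + 1 (the 0x87 loaded from .Lxts_mul_x).
 * Sketch of what next_tweak computes with sshr/and/add/ext/eor. */
static void xts_mult_x(uint8_t t[16])
{
	uint64_t lo, hi;

	memcpy(&lo, t, 8);		/* low lane, little-endian */
	memcpy(&hi, t + 8, 8);		/* high lane */

	uint64_t carry = hi >> 63;	/* bit that falls off the top */
	hi = (hi << 1) | (lo >> 63);
	lo = (lo << 1) ^ (carry ? 0x87 : 0);

	memcpy(t, &lo, 8);
	memcpy(t + 8, &hi, 8);
}
```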
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 000000000000..b93170e1cc93 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S | |||
| @@ -0,0 +1,382 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/linkage.h> | ||
| 12 | |||
| 13 | #define AES_ENTRY(func) ENTRY(neon_ ## func) | ||
| 14 | #define AES_ENDPROC(func) ENDPROC(neon_ ## func) | ||
| 15 | |||
| 16 | /* multiply by polynomial 'x' in GF(2^8) */ | ||
| 17 | .macro mul_by_x, out, in, temp, const | ||
| 18 | sshr \temp, \in, #7 | ||
| 19 | add \out, \in, \in | ||
| 20 | and \temp, \temp, \const | ||
| 21 | eor \out, \out, \temp | ||
| 22 | .endm | ||
| 23 | |||
| 24 | /* preload the entire Sbox */ | ||
| 25 | .macro prepare, sbox, shiftrows, temp | ||
| 26 | adr \temp, \sbox | ||
| 27 | movi v12.16b, #0x40 | ||
| 28 | ldr q13, \shiftrows | ||
| 29 | movi v14.16b, #0x1b | ||
| 30 | ld1 {v16.16b-v19.16b}, [\temp], #64 | ||
| 31 | ld1 {v20.16b-v23.16b}, [\temp], #64 | ||
| 32 | ld1 {v24.16b-v27.16b}, [\temp], #64 | ||
| 33 | ld1 {v28.16b-v31.16b}, [\temp] | ||
| 34 | .endm | ||
| 35 | |||
| 36 | /* do preload for encryption */ | ||
| 37 | .macro enc_prepare, ignore0, ignore1, temp | ||
| 38 | prepare .LForward_Sbox, .LForward_ShiftRows, \temp | ||
| 39 | .endm | ||
| 40 | |||
| 41 | .macro enc_switch_key, ignore0, ignore1, temp | ||
| 42 | /* do nothing */ | ||
| 43 | .endm | ||
| 44 | |||
| 45 | /* do preload for decryption */ | ||
| 46 | .macro dec_prepare, ignore0, ignore1, temp | ||
| 47 | prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp | ||
| 48 | .endm | ||
| 49 | |||
| 50 | /* apply SubBytes transformation using the preloaded Sbox */ | ||
| 51 | .macro sub_bytes, in | ||
| 52 | sub v9.16b, \in\().16b, v12.16b | ||
| 53 | tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b | ||
| 54 | sub v10.16b, v9.16b, v12.16b | ||
| 55 | tbx \in\().16b, {v20.16b-v23.16b}, v9.16b | ||
| 56 | sub v11.16b, v10.16b, v12.16b | ||
| 57 | tbx \in\().16b, {v24.16b-v27.16b}, v10.16b | ||
| 58 | tbx \in\().16b, {v28.16b-v31.16b}, v11.16b | ||
| 59 | .endm | ||
| 60 | |||
| 61 | /* apply MixColumns transformation */ | ||
| 62 | .macro mix_columns, in | ||
| 63 | mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b | ||
| 64 | rev32 v8.8h, \in\().8h | ||
| 65 | eor \in\().16b, v10.16b, \in\().16b | ||
| 66 | shl v9.4s, v8.4s, #24 | ||
| 67 | shl v11.4s, \in\().4s, #24 | ||
| 68 | sri v9.4s, v8.4s, #8 | ||
| 69 | sri v11.4s, \in\().4s, #8 | ||
| 70 | eor v9.16b, v9.16b, v8.16b | ||
| 71 | eor v10.16b, v10.16b, v9.16b | ||
| 72 | eor \in\().16b, v10.16b, v11.16b | ||
| 73 | .endm | ||
| 74 | |||
| 75 | /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ | ||
| 76 | .macro inv_mix_columns, in | ||
| 77 | mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b | ||
| 78 | mul_by_x v11.16b, v11.16b, v10.16b, v14.16b | ||
| 79 | eor \in\().16b, \in\().16b, v11.16b | ||
| 80 | rev32 v11.8h, v11.8h | ||
| 81 | eor \in\().16b, \in\().16b, v11.16b | ||
| 82 | mix_columns \in | ||
| 83 | .endm | ||
| 84 | |||
| 85 | .macro do_block, enc, in, rounds, rk, rkp, i | ||
| 86 | ld1 {v15.16b}, [\rk] | ||
| 87 | add \rkp, \rk, #16 | ||
| 88 | mov \i, \rounds | ||
| 89 | 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
| 90 | tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ | ||
| 91 | sub_bytes \in | ||
| 92 | ld1 {v15.16b}, [\rkp], #16 | ||
| 93 | subs \i, \i, #1 | ||
| 94 | beq 2222f | ||
| 95 | .if \enc == 1 | ||
| 96 | mix_columns \in | ||
| 97 | .else | ||
| 98 | inv_mix_columns \in | ||
| 99 | .endif | ||
| 100 | b 1111b | ||
| 101 | 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
| 102 | .endm | ||
| 103 | |||
| 104 | .macro encrypt_block, in, rounds, rk, rkp, i | ||
| 105 | do_block 1, \in, \rounds, \rk, \rkp, \i | ||
| 106 | .endm | ||
| 107 | |||
| 108 | .macro decrypt_block, in, rounds, rk, rkp, i | ||
| 109 | do_block 0, \in, \rounds, \rk, \rkp, \i | ||
| 110 | .endm | ||
| 111 | |||
| 112 | /* | ||
| 113 | * Interleaved versions: functionally equivalent to the | ||
| 114 | * ones above, but applied to 2 or 4 AES states in parallel. | ||
| 115 | */ | ||
| 116 | |||
| 117 | .macro sub_bytes_2x, in0, in1 | ||
| 118 | sub v8.16b, \in0\().16b, v12.16b | ||
| 119 | sub v9.16b, \in1\().16b, v12.16b | ||
| 120 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b | ||
| 121 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
| 122 | sub v10.16b, v8.16b, v12.16b | ||
| 123 | sub v11.16b, v9.16b, v12.16b | ||
| 124 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b | ||
| 125 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
| 126 | sub v8.16b, v10.16b, v12.16b | ||
| 127 | sub v9.16b, v11.16b, v12.16b | ||
| 128 | tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b | ||
| 129 | tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b | ||
| 130 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b | ||
| 131 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
| 132 | .endm | ||
| 133 | |||
| 134 | .macro sub_bytes_4x, in0, in1, in2, in3 | ||
| 135 | sub v8.16b, \in0\().16b, v12.16b | ||
| 136 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b | ||
| 137 | sub v9.16b, \in1\().16b, v12.16b | ||
| 138 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
| 139 | sub v10.16b, \in2\().16b, v12.16b | ||
| 140 | tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b | ||
| 141 | sub v11.16b, \in3\().16b, v12.16b | ||
| 142 | tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b | ||
| 143 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b | ||
| 144 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
| 145 | sub v8.16b, v8.16b, v12.16b | ||
| 146 | tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b | ||
| 147 | sub v9.16b, v9.16b, v12.16b | ||
| 148 | tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b | ||
| 149 | sub v10.16b, v10.16b, v12.16b | ||
| 150 | tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b | ||
| 151 | sub v11.16b, v11.16b, v12.16b | ||
| 152 | tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b | ||
| 153 | sub v8.16b, v8.16b, v12.16b | ||
| 154 | tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b | ||
| 155 | sub v9.16b, v9.16b, v12.16b | ||
| 156 | tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b | ||
| 157 | sub v10.16b, v10.16b, v12.16b | ||
| 158 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b | ||
| 159 | sub v11.16b, v11.16b, v12.16b | ||
| 160 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
| 161 | tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b | ||
| 162 | tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b | ||
| 163 | .endm | ||
| 164 | |||
| 165 | .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const | ||
| 166 | sshr \tmp0\().16b, \in0\().16b, #7 | ||
| 167 | add \out0\().16b, \in0\().16b, \in0\().16b | ||
| 168 | sshr \tmp1\().16b, \in1\().16b, #7 | ||
| 169 | and \tmp0\().16b, \tmp0\().16b, \const\().16b | ||
| 170 | add \out1\().16b, \in1\().16b, \in1\().16b | ||
| 171 | and \tmp1\().16b, \tmp1\().16b, \const\().16b | ||
| 172 | eor \out0\().16b, \out0\().16b, \tmp0\().16b | ||
| 173 | eor \out1\().16b, \out1\().16b, \tmp1\().16b | ||
| 174 | .endm | ||
| 175 | |||
| 176 | .macro mix_columns_2x, in0, in1 | ||
| 177 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
| 178 | rev32 v10.8h, \in0\().8h | ||
| 179 | rev32 v11.8h, \in1\().8h | ||
| 180 | eor \in0\().16b, v8.16b, \in0\().16b | ||
| 181 | eor \in1\().16b, v9.16b, \in1\().16b | ||
| 182 | shl v12.4s, v10.4s, #24 | ||
| 183 | shl v13.4s, v11.4s, #24 | ||
| 184 | eor v8.16b, v8.16b, v10.16b | ||
| 185 | sri v12.4s, v10.4s, #8 | ||
| 186 | shl v10.4s, \in0\().4s, #24 | ||
| 187 | eor v9.16b, v9.16b, v11.16b | ||
| 188 | sri v13.4s, v11.4s, #8 | ||
| 189 | shl v11.4s, \in1\().4s, #24 | ||
| 190 | sri v10.4s, \in0\().4s, #8 | ||
| 191 | eor \in0\().16b, v8.16b, v12.16b | ||
| 192 | sri v11.4s, \in1\().4s, #8 | ||
| 193 | eor \in1\().16b, v9.16b, v13.16b | ||
| 194 | eor \in0\().16b, v10.16b, \in0\().16b | ||
| 195 | eor \in1\().16b, v11.16b, \in1\().16b | ||
| 196 | .endm | ||
| 197 | |||
| 198 | .macro inv_mix_cols_2x, in0, in1 | ||
| 199 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
| 200 | mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 | ||
| 201 | eor \in0\().16b, \in0\().16b, v8.16b | ||
| 202 | eor \in1\().16b, \in1\().16b, v9.16b | ||
| 203 | rev32 v8.8h, v8.8h | ||
| 204 | rev32 v9.8h, v9.8h | ||
| 205 | eor \in0\().16b, \in0\().16b, v8.16b | ||
| 206 | eor \in1\().16b, \in1\().16b, v9.16b | ||
| 207 | mix_columns_2x \in0, \in1 | ||
| 208 | .endm | ||
| 209 | |||
| 210 | .macro inv_mix_cols_4x, in0, in1, in2, in3 | ||
| 211 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
| 212 | mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 | ||
| 213 | mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 | ||
| 214 | mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 | ||
| 215 | eor \in0\().16b, \in0\().16b, v8.16b | ||
| 216 | eor \in1\().16b, \in1\().16b, v9.16b | ||
| 217 | eor \in2\().16b, \in2\().16b, v10.16b | ||
| 218 | eor \in3\().16b, \in3\().16b, v11.16b | ||
| 219 | rev32 v8.8h, v8.8h | ||
| 220 | rev32 v9.8h, v9.8h | ||
| 221 | rev32 v10.8h, v10.8h | ||
| 222 | rev32 v11.8h, v11.8h | ||
| 223 | eor \in0\().16b, \in0\().16b, v8.16b | ||
| 224 | eor \in1\().16b, \in1\().16b, v9.16b | ||
| 225 | eor \in2\().16b, \in2\().16b, v10.16b | ||
| 226 | eor \in3\().16b, \in3\().16b, v11.16b | ||
| 227 | mix_columns_2x \in0, \in1 | ||
| 228 | mix_columns_2x \in2, \in3 | ||
| 229 | .endm | ||
| 230 | |||
| 231 | .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i | ||
| 232 | ld1 {v15.16b}, [\rk] | ||
| 233 | add \rkp, \rk, #16 | ||
| 234 | mov \i, \rounds | ||
| 235 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
| 236 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
| 237 | sub_bytes_2x \in0, \in1 | ||
| 238 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
| 239 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
| 240 | ld1 {v15.16b}, [\rkp], #16 | ||
| 241 | subs \i, \i, #1 | ||
| 242 | beq 2222f | ||
| 243 | .if \enc == 1 | ||
| 244 | mix_columns_2x \in0, \in1 | ||
| 245 | ldr q13, .LForward_ShiftRows | ||
| 246 | .else | ||
| 247 | inv_mix_cols_2x \in0, \in1 | ||
| 248 | ldr q13, .LReverse_ShiftRows | ||
| 249 | .endif | ||
| 250 | movi v12.16b, #0x40 | ||
| 251 | b 1111b | ||
| 252 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
| 253 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
| 254 | .endm | ||
| 255 | |||
| 256 | .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i | ||
| 257 | ld1 {v15.16b}, [\rk] | ||
| 258 | add \rkp, \rk, #16 | ||
| 259 | mov \i, \rounds | ||
| 260 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
| 261 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
| 262 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
| 263 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
| 264 | sub_bytes_4x \in0, \in1, \in2, \in3 | ||
| 265 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
| 266 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
| 267 | tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ | ||
| 268 | tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ | ||
| 269 | ld1 {v15.16b}, [\rkp], #16 | ||
| 270 | subs \i, \i, #1 | ||
| 271 | beq 2222f | ||
| 272 | .if \enc == 1 | ||
| 273 | mix_columns_2x \in0, \in1 | ||
| 274 | mix_columns_2x \in2, \in3 | ||
| 275 | ldr q13, .LForward_ShiftRows | ||
| 276 | .else | ||
| 277 | inv_mix_cols_4x \in0, \in1, \in2, \in3 | ||
| 278 | ldr q13, .LReverse_ShiftRows | ||
| 279 | .endif | ||
| 280 | movi v12.16b, #0x40 | ||
| 281 | b 1111b | ||
| 282 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
| 283 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
| 284 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
| 285 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
| 286 | .endm | ||
| 287 | |||
| 288 | .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i | ||
| 289 | do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i | ||
| 290 | .endm | ||
| 291 | |||
| 292 | .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i | ||
| 293 | do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i | ||
| 294 | .endm | ||
| 295 | |||
| 296 | .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i | ||
| 297 | do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
| 298 | .endm | ||
| 299 | |||
| 300 | .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i | ||
| 301 | do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
| 302 | .endm | ||
| 303 | |||
| 304 | #include "aes-modes.S" | ||
| 305 | |||
| 306 | .text | ||
| 307 | .align 4 | ||
| 308 | .LForward_ShiftRows: | ||
| 309 | .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 | ||
| 310 | .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb | ||
| 311 | |||
| 312 | .LReverse_ShiftRows: | ||
| 313 | .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb | ||
| 314 | .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 | ||
| 315 | |||
| 316 | .LForward_Sbox: | ||
| 317 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
| 318 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
| 319 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
| 320 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
| 321 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
| 322 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
| 323 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
| 324 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
| 325 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
| 326 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
| 327 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
| 328 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
| 329 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
| 330 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
| 331 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
| 332 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
| 333 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
| 334 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
| 335 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
| 336 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
| 337 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
| 338 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
| 339 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
| 340 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
| 341 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
| 342 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
| 343 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
| 344 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
| 345 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
| 346 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
| 347 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
| 348 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
| 349 | |||
| 350 | .LReverse_Sbox: | ||
| 351 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
| 352 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
| 353 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
| 354 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
| 355 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
| 356 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
| 357 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
| 358 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
| 359 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
| 360 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
| 361 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
| 362 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
| 363 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
| 364 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
| 365 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
| 366 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
| 367 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
| 368 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
| 369 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
| 370 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
| 371 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
| 372 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
| 373 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
| 374 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
| 375 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
| 376 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
| 377 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
| 378 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
| 379 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
| 380 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
| 381 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
| 382 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
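The mul_by_x macro near the top of this file is the GF(2^8) doubling used by MixColumns: shift left by one and, when the top bit was set, xor in the AES reduction constant 0x1b (preloaded into v14; the sshr #7 produces an all-ones mask to select it). A one-line scalar equivalent, for reference:

```c
#include <stdint.h>

/* GF(2^8) doubling as done by the mul_by_x macro. */
static inline uint8_t gf256_mul_by_x(uint8_t b)
{
	return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
}
```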
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 000000000000..b9e6eaf41c9b --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S | |||
| @@ -0,0 +1,95 @@ | |||
| 1 | /* | ||
| 2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S | ||
| 7 | * | ||
| 8 | * Copyright (c) 2009 Intel Corp. | ||
| 9 | * Author: Huang Ying <ying.huang@intel.com> | ||
| 10 | * Vinodh Gopal | ||
| 11 | * Erdinc Ozturk | ||
| 12 | * Deniz Karakoyunlu | ||
| 13 | * | ||
| 14 | * This program is free software; you can redistribute it and/or modify it | ||
| 15 | * under the terms of the GNU General Public License version 2 as published | ||
| 16 | * by the Free Software Foundation. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/linkage.h> | ||
| 20 | #include <asm/assembler.h> | ||
| 21 | |||
| 22 | DATA .req v0 | ||
| 23 | SHASH .req v1 | ||
| 24 | IN1 .req v2 | ||
| 25 | T1 .req v2 | ||
| 26 | T2 .req v3 | ||
| 27 | T3 .req v4 | ||
| 28 | VZR .req v5 | ||
| 29 | |||
| 30 | .text | ||
| 31 | .arch armv8-a+crypto | ||
| 32 | |||
| 33 | /* | ||
| 34 | * void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
| 35 | * struct ghash_key const *k, const char *head) | ||
| 36 | */ | ||
| 37 | ENTRY(pmull_ghash_update) | ||
| 38 | ld1 {DATA.16b}, [x1] | ||
| 39 | ld1 {SHASH.16b}, [x3] | ||
| 40 | eor VZR.16b, VZR.16b, VZR.16b | ||
| 41 | |||
| 42 | /* do the head block first, if supplied */ | ||
| 43 | cbz x4, 0f | ||
| 44 | ld1 {IN1.2d}, [x4] | ||
| 45 | b 1f | ||
| 46 | |||
| 47 | 0: ld1 {IN1.2d}, [x2], #16 | ||
| 48 | sub w0, w0, #1 | ||
| 49 | 1: ext IN1.16b, IN1.16b, IN1.16b, #8 | ||
| 50 | CPU_LE( rev64 IN1.16b, IN1.16b ) | ||
| 51 | eor DATA.16b, DATA.16b, IN1.16b | ||
| 52 | |||
| 53 | /* multiply DATA by SHASH in GF(2^128) */ | ||
| 54 | ext T2.16b, DATA.16b, DATA.16b, #8 | ||
| 55 | ext T3.16b, SHASH.16b, SHASH.16b, #8 | ||
| 56 | eor T2.16b, T2.16b, DATA.16b | ||
| 57 | eor T3.16b, T3.16b, SHASH.16b | ||
| 58 | |||
| 59 | pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 | ||
| 60 | pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 | ||
| 61 | pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) | ||
| 62 | eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) | ||
| 63 | eor T2.16b, T2.16b, DATA.16b | ||
| 64 | |||
| 65 | ext T3.16b, VZR.16b, T2.16b, #8 | ||
| 66 | ext T2.16b, T2.16b, VZR.16b, #8 | ||
| 67 | eor DATA.16b, DATA.16b, T3.16b | ||
| 68 | eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of | ||
| 69 | // carry-less multiplication | ||
| 70 | |||
| 71 | /* first phase of the reduction */ | ||
| 72 | shl T3.2d, DATA.2d, #1 | ||
| 73 | eor T3.16b, T3.16b, DATA.16b | ||
| 74 | shl T3.2d, T3.2d, #5 | ||
| 75 | eor T3.16b, T3.16b, DATA.16b | ||
| 76 | shl T3.2d, T3.2d, #57 | ||
| 77 | ext T2.16b, VZR.16b, T3.16b, #8 | ||
| 78 | ext T3.16b, T3.16b, VZR.16b, #8 | ||
| 79 | eor DATA.16b, DATA.16b, T2.16b | ||
| 80 | eor T1.16b, T1.16b, T3.16b | ||
| 81 | |||
| 82 | /* second phase of the reduction */ | ||
| 83 | ushr T2.2d, DATA.2d, #5 | ||
| 84 | eor T2.16b, T2.16b, DATA.16b | ||
| 85 | ushr T2.2d, T2.2d, #1 | ||
| 86 | eor T2.16b, T2.16b, DATA.16b | ||
| 87 | ushr T2.2d, T2.2d, #1 | ||
| 88 | eor T1.16b, T1.16b, T2.16b | ||
| 89 | eor DATA.16b, DATA.16b, T1.16b | ||
| 90 | |||
| 91 | cbnz w0, 0b | ||
| 92 | |||
| 93 | st1 {DATA.16b}, [x1] | ||
| 94 | ret | ||
| 95 | ENDPROC(pmull_ghash_update) | ||
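The multiply step above follows the classic three-multiplication (Karatsuba) scheme: PMULL2 forms a1*b1, PMULL forms a0*b0, and a third multiply of (a1^a0)*(b1^b0) is xor-ed with both to recover the middle term a0*b1 ^ a1*b0 before the two-phase reduction. For reference, a bit-by-bit sketch of the 64x64 carry-less multiply that a single PMULL instruction performs:

```c
#include <stdint.h>

/* Carry-less (polynomial) multiply of two 64-bit values, returning the
 * 128-bit product as hi:lo. Reference sketch of one PMULL; the assembly
 * above combines three of these Karatsuba-style. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t rl = 0, rh = 0;

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			rl ^= a << i;
			rh ^= i ? a >> (64 - i) : 0;
		}
	}
	*lo = rl;
	*hi = rh;
}
```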
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 000000000000..b92baf3f68c7 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c | |||
| @@ -0,0 +1,155 @@ | |||
| 1 | /* | ||
| 2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published | ||
| 8 | * by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <asm/neon.h> | ||
| 12 | #include <asm/unaligned.h> | ||
| 13 | #include <crypto/internal/hash.h> | ||
| 14 | #include <linux/cpufeature.h> | ||
| 15 | #include <linux/crypto.h> | ||
| 16 | #include <linux/module.h> | ||
| 17 | |||
| 18 | MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); | ||
| 19 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
| 20 | MODULE_LICENSE("GPL v2"); | ||
| 21 | |||
| 22 | #define GHASH_BLOCK_SIZE 16 | ||
| 23 | #define GHASH_DIGEST_SIZE 16 | ||
| 24 | |||
| 25 | struct ghash_key { | ||
| 26 | u64 a; | ||
| 27 | u64 b; | ||
| 28 | }; | ||
| 29 | |||
| 30 | struct ghash_desc_ctx { | ||
| 31 | u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; | ||
| 32 | u8 buf[GHASH_BLOCK_SIZE]; | ||
| 33 | u32 count; | ||
| 34 | }; | ||
| 35 | |||
| 36 | asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
| 37 | struct ghash_key const *k, const char *head); | ||
| 38 | |||
| 39 | static int ghash_init(struct shash_desc *desc) | ||
| 40 | { | ||
| 41 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
| 42 | |||
| 43 | *ctx = (struct ghash_desc_ctx){}; | ||
| 44 | return 0; | ||
| 45 | } | ||
| 46 | |||
| 47 | static int ghash_update(struct shash_desc *desc, const u8 *src, | ||
| 48 | unsigned int len) | ||
| 49 | { | ||
| 50 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
| 51 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; | ||
| 52 | |||
| 53 | ctx->count += len; | ||
| 54 | |||
| 55 | if ((partial + len) >= GHASH_BLOCK_SIZE) { | ||
| 56 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
| 57 | int blocks; | ||
| 58 | |||
| 59 | if (partial) { | ||
| 60 | int p = GHASH_BLOCK_SIZE - partial; | ||
| 61 | |||
| 62 | memcpy(ctx->buf + partial, src, p); | ||
| 63 | src += p; | ||
| 64 | len -= p; | ||
| 65 | } | ||
| 66 | |||
| 67 | blocks = len / GHASH_BLOCK_SIZE; | ||
| 68 | len %= GHASH_BLOCK_SIZE; | ||
| 69 | |||
| 70 | kernel_neon_begin_partial(6); | ||
| 71 | pmull_ghash_update(blocks, ctx->digest, src, key, | ||
| 72 | partial ? ctx->buf : NULL); | ||
| 73 | kernel_neon_end(); | ||
| 74 | src += blocks * GHASH_BLOCK_SIZE; | ||
| 75 | } | ||
| 76 | if (len) | ||
| 77 | memcpy(ctx->buf + partial, src, len); | ||
| 78 | return 0; | ||
| 79 | } | ||
| 80 | |||
| 81 | static int ghash_final(struct shash_desc *desc, u8 *dst) | ||
| 82 | { | ||
| 83 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
| 84 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; | ||
| 85 | |||
| 86 | if (partial) { | ||
| 87 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
| 88 | |||
| 89 | memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); | ||
| 90 | |||
| 91 | kernel_neon_begin_partial(6); | ||
| 92 | pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); | ||
| 93 | kernel_neon_end(); | ||
| 94 | } | ||
| 95 | put_unaligned_be64(ctx->digest[1], dst); | ||
| 96 | put_unaligned_be64(ctx->digest[0], dst + 8); | ||
| 97 | |||
| 98 | *ctx = (struct ghash_desc_ctx){}; | ||
| 99 | return 0; | ||
| 100 | } | ||
| 101 | |||
| 102 | static int ghash_setkey(struct crypto_shash *tfm, | ||
| 103 | const u8 *inkey, unsigned int keylen) | ||
| 104 | { | ||
| 105 | struct ghash_key *key = crypto_shash_ctx(tfm); | ||
| 106 | u64 a, b; | ||
| 107 | |||
| 108 | if (keylen != GHASH_BLOCK_SIZE) { | ||
| 109 | crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
| 110 | return -EINVAL; | ||
| 111 | } | ||
| 112 | |||
| 113 | /* perform multiplication by 'x' in GF(2^128) */ | ||
| 114 | b = get_unaligned_be64(inkey); | ||
| 115 | a = get_unaligned_be64(inkey + 8); | ||
| 116 | |||
| 117 | key->a = (a << 1) | (b >> 63); | ||
| 118 | key->b = (b << 1) | (a >> 63); | ||
| 119 | |||
| 120 | if (b >> 63) | ||
| 121 | key->b ^= 0xc200000000000000UL; | ||
| 122 | |||
| 123 | return 0; | ||
| 124 | } | ||
| 125 | |||
| 126 | static struct shash_alg ghash_alg = { | ||
| 127 | .digestsize = GHASH_DIGEST_SIZE, | ||
| 128 | .init = ghash_init, | ||
| 129 | .update = ghash_update, | ||
| 130 | .final = ghash_final, | ||
| 131 | .setkey = ghash_setkey, | ||
| 132 | .descsize = sizeof(struct ghash_desc_ctx), | ||
| 133 | .base = { | ||
| 134 | .cra_name = "ghash", | ||
| 135 | .cra_driver_name = "ghash-ce", | ||
| 136 | .cra_priority = 200, | ||
| 137 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
| 138 | .cra_blocksize = GHASH_BLOCK_SIZE, | ||
| 139 | .cra_ctxsize = sizeof(struct ghash_key), | ||
| 140 | .cra_module = THIS_MODULE, | ||
| 141 | }, | ||
| 142 | }; | ||
| 143 | |||
| 144 | static int __init ghash_ce_mod_init(void) | ||
| 145 | { | ||
| 146 | return crypto_register_shash(&ghash_alg); | ||
| 147 | } | ||
| 148 | |||
| 149 | static void __exit ghash_ce_mod_exit(void) | ||
| 150 | { | ||
| 151 | crypto_unregister_shash(&ghash_alg); | ||
| 152 | } | ||
| 153 | |||
| 154 | module_cpu_feature_match(PMULL, ghash_ce_mod_init); | ||
| 155 | module_exit(ghash_ce_mod_exit); | ||
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 000000000000..09d57d98609c --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S | |||
| @@ -0,0 +1,153 @@ | |||
| 1 | /* | ||
| 2 | * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/linkage.h> | ||
| 12 | #include <asm/assembler.h> | ||
| 13 | |||
| 14 | .text | ||
| 15 | .arch armv8-a+crypto | ||
| 16 | |||
| 17 | k0 .req v0 | ||
| 18 | k1 .req v1 | ||
| 19 | k2 .req v2 | ||
| 20 | k3 .req v3 | ||
| 21 | |||
| 22 | t0 .req v4 | ||
| 23 | t1 .req v5 | ||
| 24 | |||
| 25 | dga .req q6 | ||
| 26 | dgav .req v6 | ||
| 27 | dgb .req s7 | ||
| 28 | dgbv .req v7 | ||
| 29 | |||
| 30 | dg0q .req q12 | ||
| 31 | dg0s .req s12 | ||
| 32 | dg0v .req v12 | ||
| 33 | dg1s .req s13 | ||
| 34 | dg1v .req v13 | ||
| 35 | dg2s .req s14 | ||
| 36 | |||
| 37 | .macro add_only, op, ev, rc, s0, dg1 | ||
| 38 | .ifc \ev, ev | ||
| 39 | add t1.4s, v\s0\().4s, \rc\().4s | ||
| 40 | sha1h dg2s, dg0s | ||
| 41 | .ifnb \dg1 | ||
| 42 | sha1\op dg0q, \dg1, t0.4s | ||
| 43 | .else | ||
| 44 | sha1\op dg0q, dg1s, t0.4s | ||
| 45 | .endif | ||
| 46 | .else | ||
| 47 | .ifnb \s0 | ||
| 48 | add t0.4s, v\s0\().4s, \rc\().4s | ||
| 49 | .endif | ||
| 50 | sha1h dg1s, dg0s | ||
| 51 | sha1\op dg0q, dg2s, t1.4s | ||
| 52 | .endif | ||
| 53 | .endm | ||
| 54 | |||
| 55 | .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 | ||
| 56 | sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s | ||
| 57 | add_only \op, \ev, \rc, \s1, \dg1 | ||
| 58 | sha1su1 v\s0\().4s, v\s3\().4s | ||
| 59 | .endm | ||
| 60 | |||
| 61 | /* | ||
| 62 | * The SHA1 round constants | ||
| 63 | */ | ||
| 64 | .align 4 | ||
| 65 | .Lsha1_rcon: | ||
| 66 | .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 | ||
| 67 | |||
| 68 | /* | ||
| 69 | * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
| 70 | * u8 *head, long bytes) | ||
| 71 | */ | ||
| 72 | ENTRY(sha1_ce_transform) | ||
| 73 | /* load round constants */ | ||
| 74 | adr x6, .Lsha1_rcon | ||
| 75 | ld1r {k0.4s}, [x6], #4 | ||
| 76 | ld1r {k1.4s}, [x6], #4 | ||
| 77 | ld1r {k2.4s}, [x6], #4 | ||
| 78 | ld1r {k3.4s}, [x6] | ||
| 79 | |||
| 80 | /* load state */ | ||
| 81 | ldr dga, [x2] | ||
| 82 | ldr dgb, [x2, #16] | ||
| 83 | |||
| 84 | /* load partial state (if supplied) */ | ||
| 85 | cbz x3, 0f | ||
| 86 | ld1 {v8.4s-v11.4s}, [x3] | ||
| 87 | b 1f | ||
| 88 | |||
| 89 | /* load input */ | ||
| 90 | 0: ld1 {v8.4s-v11.4s}, [x1], #64 | ||
| 91 | sub w0, w0, #1 | ||
| 92 | |||
| 93 | 1: | ||
| 94 | CPU_LE( rev32 v8.16b, v8.16b ) | ||
| 95 | CPU_LE( rev32 v9.16b, v9.16b ) | ||
| 96 | CPU_LE( rev32 v10.16b, v10.16b ) | ||
| 97 | CPU_LE( rev32 v11.16b, v11.16b ) | ||
| 98 | |||
| 99 | 2: add t0.4s, v8.4s, k0.4s | ||
| 100 | mov dg0v.16b, dgav.16b | ||
| 101 | |||
| 102 | add_update c, ev, k0, 8, 9, 10, 11, dgb | ||
| 103 | add_update c, od, k0, 9, 10, 11, 8 | ||
| 104 | add_update c, ev, k0, 10, 11, 8, 9 | ||
| 105 | add_update c, od, k0, 11, 8, 9, 10 | ||
| 106 | add_update c, ev, k1, 8, 9, 10, 11 | ||
| 107 | |||
| 108 | add_update p, od, k1, 9, 10, 11, 8 | ||
| 109 | add_update p, ev, k1, 10, 11, 8, 9 | ||
| 110 | add_update p, od, k1, 11, 8, 9, 10 | ||
| 111 | add_update p, ev, k1, 8, 9, 10, 11 | ||
| 112 | add_update p, od, k2, 9, 10, 11, 8 | ||
| 113 | |||
| 114 | add_update m, ev, k2, 10, 11, 8, 9 | ||
| 115 | add_update m, od, k2, 11, 8, 9, 10 | ||
| 116 | add_update m, ev, k2, 8, 9, 10, 11 | ||
| 117 | add_update m, od, k2, 9, 10, 11, 8 | ||
| 118 | add_update m, ev, k3, 10, 11, 8, 9 | ||
| 119 | |||
| 120 | add_update p, od, k3, 11, 8, 9, 10 | ||
| 121 | add_only p, ev, k3, 9 | ||
| 122 | add_only p, od, k3, 10 | ||
| 123 | add_only p, ev, k3, 11 | ||
| 124 | add_only p, od | ||
| 125 | |||
| 126 | /* update state */ | ||
| 127 | add dgbv.2s, dgbv.2s, dg1v.2s | ||
| 128 | add dgav.4s, dgav.4s, dg0v.4s | ||
| 129 | |||
| 130 | cbnz w0, 0b | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Final block: add padding and total bit count. | ||
| 134 | * Skip if we have no total byte count in x4. In that case, the input | ||
| 135 | * size was not a round multiple of the block size, and the padding is | ||
| 136 | * handled by the C code. | ||
| 137 | */ | ||
| 138 | cbz x4, 3f | ||
| 139 | movi v9.2d, #0 | ||
| 140 | mov x8, #0x80000000 | ||
| 141 | movi v10.2d, #0 | ||
| 142 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
| 143 | fmov d8, x8 | ||
| 144 | mov x4, #0 | ||
| 145 | mov v11.d[0], xzr | ||
| 146 | mov v11.d[1], x7 | ||
| 147 | b 2b | ||
| 148 | |||
| 149 | /* store new state */ | ||
| 150 | 3: str dga, [x2] | ||
| 151 | str dgb, [x2, #16] | ||
| 152 | ret | ||
| 153 | ENDPROC(sha1_ce_transform) | ||
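The final-block path builds the padding block directly in registers: the 0x80 padding byte (as the word 0x80000000 in the first lane), zeroes, and the message length in bits in the last 64-bit lane. The "ror x7, x4, #29" noted in the comment equals ror(x4 << 3, 32): convert the byte count to a bit count and swap its 32-bit halves, so the two words of the big-endian length land in the lanes the main loop expects (this path skips the rev32 byte swap at label 1). A scalar sketch of that length word:

```c
#include <stdint.h>

/* Scalar equivalent of "ror x7, x4, #29": bit count with its 32-bit
 * halves swapped, as placed in v11.d[1] for the padding block. */
static uint64_t sha1_length_lanes(uint64_t byte_count)
{
	uint64_t bits = byte_count << 3;

	return (bits >> 32) | (bits << 32);
}
```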
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 000000000000..6fe83f37a750 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c | |||
| @@ -0,0 +1,174 @@ | |||
| 1 | /* | ||
| 2 | * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <asm/neon.h> | ||
| 12 | #include <asm/unaligned.h> | ||
| 13 | #include <crypto/internal/hash.h> | ||
| 14 | #include <crypto/sha.h> | ||
| 15 | #include <linux/cpufeature.h> | ||
| 16 | #include <linux/crypto.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | |||
| 19 | MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); | ||
| 20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
| 21 | MODULE_LICENSE("GPL v2"); | ||
| 22 | |||
| 23 | asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
| 24 | u8 *head, long bytes); | ||
| 25 | |||
| 26 | static int sha1_init(struct shash_desc *desc) | ||
| 27 | { | ||
| 28 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
| 29 | |||
| 30 | *sctx = (struct sha1_state){ | ||
| 31 | .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, | ||
| 32 | }; | ||
| 33 | return 0; | ||
| 34 | } | ||
| 35 | |||
| 36 | static int sha1_update(struct shash_desc *desc, const u8 *data, | ||
| 37 | unsigned int len) | ||
| 38 | { | ||
| 39 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
| 40 | unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; | ||
| 41 | |||
| 42 | sctx->count += len; | ||
| 43 | |||
| 44 | if ((partial + len) >= SHA1_BLOCK_SIZE) { | ||
| 45 | int blocks; | ||
| 46 | |||
| 47 | if (partial) { | ||
| 48 | int p = SHA1_BLOCK_SIZE - partial; | ||
| 49 | |||
| 50 | memcpy(sctx->buffer + partial, data, p); | ||
| 51 | data += p; | ||
| 52 | len -= p; | ||
| 53 | } | ||
| 54 | |||
| 55 | blocks = len / SHA1_BLOCK_SIZE; | ||
| 56 | len %= SHA1_BLOCK_SIZE; | ||
| 57 | |||
| 58 | kernel_neon_begin_partial(16); | ||
| 59 | sha1_ce_transform(blocks, data, sctx->state, | ||
| 60 | partial ? sctx->buffer : NULL, 0); | ||
| 61 | kernel_neon_end(); | ||
| 62 | |||
| 63 | data += blocks * SHA1_BLOCK_SIZE; | ||
| 64 | partial = 0; | ||
| 65 | } | ||
| 66 | if (len) | ||
| 67 | memcpy(sctx->buffer + partial, data, len); | ||
| 68 | return 0; | ||
| 69 | } | ||
| 70 | |||
| 71 | static int sha1_final(struct shash_desc *desc, u8 *out) | ||
| 72 | { | ||
| 73 | static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; | ||
| 74 | |||
| 75 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
| 76 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
| 77 | __be32 *dst = (__be32 *)out; | ||
| 78 | int i; | ||
| 79 | |||
| 80 | u32 padlen = SHA1_BLOCK_SIZE | ||
| 81 | - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); | ||
| 82 | |||
| 83 | sha1_update(desc, padding, padlen); | ||
| 84 | sha1_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
| 85 | |||
| 86 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
| 87 | put_unaligned_be32(sctx->state[i], dst++); | ||
| 88 | |||
| 89 | *sctx = (struct sha1_state){}; | ||
| 90 | return 0; | ||
| 91 | } | ||
| 92 | |||
| 93 | static int sha1_finup(struct shash_desc *desc, const u8 *data, | ||
| 94 | unsigned int len, u8 *out) | ||
| 95 | { | ||
| 96 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
| 97 | __be32 *dst = (__be32 *)out; | ||
| 98 | int blocks; | ||
| 99 | int i; | ||
| 100 | |||
| 101 | if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { | ||
| 102 | sha1_update(desc, data, len); | ||
| 103 | return sha1_final(desc, out); | ||
| 104 | } | ||
| 105 | |||
| 106 | /* | ||
| 107 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
| 108 | * this case, there is no need to copy data around, and we can | ||
| 109 | * perform the entire digest calculation in a single invocation | ||
| 110 | * of sha1_ce_transform() | ||
| 111 | */ | ||
| 112 | blocks = len / SHA1_BLOCK_SIZE; | ||
| 113 | |||
| 114 | kernel_neon_begin_partial(16); | ||
| 115 | sha1_ce_transform(blocks, data, sctx->state, NULL, len); | ||
| 116 | kernel_neon_end(); | ||
| 117 | |||
| 118 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
| 119 | put_unaligned_be32(sctx->state[i], dst++); | ||
| 120 | |||
| 121 | *sctx = (struct sha1_state){}; | ||
| 122 | return 0; | ||
| 123 | } | ||
| 124 | |||
| 125 | static int sha1_export(struct shash_desc *desc, void *out) | ||
| 126 | { | ||
| 127 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
| 128 | struct sha1_state *dst = out; | ||
| 129 | |||
| 130 | *dst = *sctx; | ||
| 131 | return 0; | ||
| 132 | } | ||
| 133 | |||
| 134 | static int sha1_import(struct shash_desc *desc, const void *in) | ||
| 135 | { | ||
| 136 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
| 137 | struct sha1_state const *src = in; | ||
| 138 | |||
| 139 | *sctx = *src; | ||
| 140 | return 0; | ||
| 141 | } | ||
| 142 | |||
| 143 | static struct shash_alg alg = { | ||
| 144 | .init = sha1_init, | ||
| 145 | .update = sha1_update, | ||
| 146 | .final = sha1_final, | ||
| 147 | .finup = sha1_finup, | ||
| 148 | .export = sha1_export, | ||
| 149 | .import = sha1_import, | ||
| 150 | .descsize = sizeof(struct sha1_state), | ||
| 151 | .digestsize = SHA1_DIGEST_SIZE, | ||
| 152 | .statesize = sizeof(struct sha1_state), | ||
| 153 | .base = { | ||
| 154 | .cra_name = "sha1", | ||
| 155 | .cra_driver_name = "sha1-ce", | ||
| 156 | .cra_priority = 200, | ||
| 157 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
| 158 | .cra_blocksize = SHA1_BLOCK_SIZE, | ||
| 159 | .cra_module = THIS_MODULE, | ||
| 160 | } | ||
| 161 | }; | ||
| 162 | |||
| 163 | static int __init sha1_ce_mod_init(void) | ||
| 164 | { | ||
| 165 | return crypto_register_shash(&alg); | ||
| 166 | } | ||
| 167 | |||
| 168 | static void __exit sha1_ce_mod_fini(void) | ||
| 169 | { | ||
| 170 | crypto_unregister_shash(&alg); | ||
| 171 | } | ||
| 172 | |||
| 173 | module_cpu_feature_match(SHA1, sha1_ce_mod_init); | ||
| 174 | module_exit(sha1_ce_mod_fini); | ||
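Once registered, this driver is picked by name and priority like any other shash: a request for "sha1" resolves to the implementation with the highest cra_priority, so with the Crypto Extensions present "sha1-ce" (priority 200) wins over the generic C version. A hedged userspace sketch using the AF_ALG socket interface to exercise it (error handling omitted; SHA-1("abc") is a9993e364706816aba3e25717850c26c9cd0d89d):

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "sha1",
	};
	unsigned char digest[20];
	const char msg[] = "abc";

	int tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfm, (struct sockaddr *)&sa, sizeof(sa));

	int req = accept(tfm, NULL, 0);
	write(req, msg, strlen(msg));		/* feed the message */
	read(req, digest, sizeof(digest));	/* read back the digest */

	for (int i = 0; i < 20; i++)
		printf("%02x", digest[i]);
	printf("\n");

	close(req);
	close(tfm);
	return 0;
}
```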
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 000000000000..7f29fc031ea8 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S | |||
| @@ -0,0 +1,156 @@ | |||
| 1 | /* | ||
| 2 | * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/linkage.h> | ||
| 12 | #include <asm/assembler.h> | ||
| 13 | |||
| 14 | .text | ||
| 15 | .arch armv8-a+crypto | ||
| 16 | |||
| 17 | dga .req q20 | ||
| 18 | dgav .req v20 | ||
| 19 | dgb .req q21 | ||
| 20 | dgbv .req v21 | ||
| 21 | |||
| 22 | t0 .req v22 | ||
| 23 | t1 .req v23 | ||
| 24 | |||
| 25 | dg0q .req q24 | ||
| 26 | dg0v .req v24 | ||
| 27 | dg1q .req q25 | ||
| 28 | dg1v .req v25 | ||
| 29 | dg2q .req q26 | ||
| 30 | dg2v .req v26 | ||
| 31 | |||
| 32 | .macro add_only, ev, rc, s0 | ||
| 33 | mov dg2v.16b, dg0v.16b | ||
| 34 | .ifeq \ev | ||
| 35 | add t1.4s, v\s0\().4s, \rc\().4s | ||
| 36 | sha256h dg0q, dg1q, t0.4s | ||
| 37 | sha256h2 dg1q, dg2q, t0.4s | ||
| 38 | .else | ||
| 39 | .ifnb \s0 | ||
| 40 | add t0.4s, v\s0\().4s, \rc\().4s | ||
| 41 | .endif | ||
| 42 | sha256h dg0q, dg1q, t1.4s | ||
| 43 | sha256h2 dg1q, dg2q, t1.4s | ||
| 44 | .endif | ||
| 45 | .endm | ||
| 46 | |||
| 47 | .macro add_update, ev, rc, s0, s1, s2, s3 | ||
| 48 | sha256su0 v\s0\().4s, v\s1\().4s | ||
| 49 | add_only \ev, \rc, \s1 | ||
| 50 | sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s | ||
| 51 | .endm | ||
| 52 | |||
| 53 | /* | ||
| 54 | * The SHA-256 round constants | ||
| 55 | */ | ||
| 56 | .align 4 | ||
| 57 | .Lsha2_rcon: | ||
| 58 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
| 59 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
| 60 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
| 61 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
| 62 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
| 63 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
| 64 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
| 65 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
| 66 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
| 67 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
| 68 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
| 69 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
| 70 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
| 71 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
| 72 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
| 73 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
| 74 | |||
| 75 | /* | ||
| 76 | * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
| 77 | * u8 *head, long bytes) | ||
| 78 | */ | ||
| 79 | ENTRY(sha2_ce_transform) | ||
| 80 | /* load round constants */ | ||
| 81 | adr x8, .Lsha2_rcon | ||
| 82 | ld1 { v0.4s- v3.4s}, [x8], #64 | ||
| 83 | ld1 { v4.4s- v7.4s}, [x8], #64 | ||
| 84 | ld1 { v8.4s-v11.4s}, [x8], #64 | ||
| 85 | ld1 {v12.4s-v15.4s}, [x8] | ||
| 86 | |||
| 87 | /* load state */ | ||
| 88 | ldp dga, dgb, [x2] | ||
| 89 | |||
| 90 | /* load partial input (if supplied) */ | ||
| 91 | cbz x3, 0f | ||
| 92 | ld1 {v16.4s-v19.4s}, [x3] | ||
| 93 | b 1f | ||
| 94 | |||
| 95 | /* load input */ | ||
| 96 | 0: ld1 {v16.4s-v19.4s}, [x1], #64 | ||
| 97 | sub w0, w0, #1 | ||
| 98 | |||
| 99 | 1: | ||
| 100 | CPU_LE( rev32 v16.16b, v16.16b ) | ||
| 101 | CPU_LE( rev32 v17.16b, v17.16b ) | ||
| 102 | CPU_LE( rev32 v18.16b, v18.16b ) | ||
| 103 | CPU_LE( rev32 v19.16b, v19.16b ) | ||
| 104 | |||
| 105 | 2: add t0.4s, v16.4s, v0.4s | ||
| 106 | mov dg0v.16b, dgav.16b | ||
| 107 | mov dg1v.16b, dgbv.16b | ||
| 108 | |||
| 109 | add_update 0, v1, 16, 17, 18, 19 | ||
| 110 | add_update 1, v2, 17, 18, 19, 16 | ||
| 111 | add_update 0, v3, 18, 19, 16, 17 | ||
| 112 | add_update 1, v4, 19, 16, 17, 18 | ||
| 113 | |||
| 114 | add_update 0, v5, 16, 17, 18, 19 | ||
| 115 | add_update 1, v6, 17, 18, 19, 16 | ||
| 116 | add_update 0, v7, 18, 19, 16, 17 | ||
| 117 | add_update 1, v8, 19, 16, 17, 18 | ||
| 118 | |||
| 119 | add_update 0, v9, 16, 17, 18, 19 | ||
| 120 | add_update 1, v10, 17, 18, 19, 16 | ||
| 121 | add_update 0, v11, 18, 19, 16, 17 | ||
| 122 | add_update 1, v12, 19, 16, 17, 18 | ||
| 123 | |||
| 124 | add_only 0, v13, 17 | ||
| 125 | add_only 1, v14, 18 | ||
| 126 | add_only 0, v15, 19 | ||
| 127 | add_only 1 | ||
| 128 | |||
| 129 | /* update state */ | ||
| 130 | add dgav.4s, dgav.4s, dg0v.4s | ||
| 131 | add dgbv.4s, dgbv.4s, dg1v.4s | ||
| 132 | |||
| 133 | /* handled all input blocks? */ | ||
| 134 | cbnz w0, 0b | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Final block: add padding and total bit count. | ||
| 138 | * Skip if we have no total byte count in x4. In that case, the input | ||
| 139 | * size was not a round multiple of the block size, and the padding is | ||
| 140 | * handled by the C code. | ||
| 141 | */ | ||
| 142 | cbz x4, 3f | ||
| 143 | movi v17.2d, #0 | ||
| 144 | mov x8, #0x80000000 | ||
| 145 | movi v18.2d, #0 | ||
| 146 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
| 147 | fmov d16, x8 | ||
| 148 | mov x4, #0 | ||
| 149 | mov v19.d[0], xzr | ||
| 150 | mov v19.d[1], x7 | ||
| 151 | b 2b | ||
| 152 | |||
| 153 | /* store new state */ | ||
| 154 | 3: stp dga, dgb, [x2] | ||
| 155 | ret | ||
| 156 | ENDPROC(sha2_ce_transform) | ||
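The tail of sha2_ce_transform() builds the final padding block directly in v16-v19 when a total byte count is passed in x4, which only happens on the glue code's fast path where the input was a whole number of blocks. The `ror x7, x4, #29` is the same as rotating the bit count (x4 << 3) by 32 bits, which places the two 32-bit halves of the length into the lanes the post-`rev32` word layout expects. A conceptual C rendering of the block being constructed (an illustration, not code from the patch):

```c
#include <stdint.h>
#include <string.h>

/*
 * Build the single padding block appended after a block-aligned message:
 * a 0x80 byte, zeroes, and the message length in bits stored big-endian
 * in the final 8 bytes - exactly what v16-v19 end up holding.
 */
static void sha256_pad_block(uint8_t block[64], uint64_t total_bytes)
{
	uint64_t bits = total_bytes << 3;	/* lsl x4, #3 */
	int i;

	memset(block, 0, 64);
	block[0] = 0x80;			/* leading '1' bit of the padding */
	for (i = 0; i < 8; i++)			/* 64-bit big-endian bit count */
		block[56 + i] = (uint8_t)(bits >> (56 - 8 * i));
}
```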
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 000000000000..c294e67d3925 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c | |||
| @@ -0,0 +1,255 @@ | |||
| 1 | /* | ||
| 2 | * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <asm/neon.h> | ||
| 12 | #include <asm/unaligned.h> | ||
| 13 | #include <crypto/internal/hash.h> | ||
| 14 | #include <crypto/sha.h> | ||
| 15 | #include <linux/cpufeature.h> | ||
| 16 | #include <linux/crypto.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | |||
| 19 | MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); | ||
| 20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
| 21 | MODULE_LICENSE("GPL v2"); | ||
| 22 | |||
| 23 | asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
| 24 | u8 *head, long bytes); | ||
| 25 | |||
| 26 | static int sha224_init(struct shash_desc *desc) | ||
| 27 | { | ||
| 28 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 29 | |||
| 30 | *sctx = (struct sha256_state){ | ||
| 31 | .state = { | ||
| 32 | SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, | ||
| 33 | SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, | ||
| 34 | } | ||
| 35 | }; | ||
| 36 | return 0; | ||
| 37 | } | ||
| 38 | |||
| 39 | static int sha256_init(struct shash_desc *desc) | ||
| 40 | { | ||
| 41 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 42 | |||
| 43 | *sctx = (struct sha256_state){ | ||
| 44 | .state = { | ||
| 45 | SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, | ||
| 46 | SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, | ||
| 47 | } | ||
| 48 | }; | ||
| 49 | return 0; | ||
| 50 | } | ||
| 51 | |||
| 52 | static int sha2_update(struct shash_desc *desc, const u8 *data, | ||
| 53 | unsigned int len) | ||
| 54 | { | ||
| 55 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 56 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
| 57 | |||
| 58 | sctx->count += len; | ||
| 59 | |||
| 60 | if ((partial + len) >= SHA256_BLOCK_SIZE) { | ||
| 61 | int blocks; | ||
| 62 | |||
| 63 | if (partial) { | ||
| 64 | int p = SHA256_BLOCK_SIZE - partial; | ||
| 65 | |||
| 66 | memcpy(sctx->buf + partial, data, p); | ||
| 67 | data += p; | ||
| 68 | len -= p; | ||
| 69 | } | ||
| 70 | |||
| 71 | blocks = len / SHA256_BLOCK_SIZE; | ||
| 72 | len %= SHA256_BLOCK_SIZE; | ||
| 73 | |||
| 74 | kernel_neon_begin_partial(28); | ||
| 75 | sha2_ce_transform(blocks, data, sctx->state, | ||
| 76 | partial ? sctx->buf : NULL, 0); | ||
| 77 | kernel_neon_end(); | ||
| 78 | |||
| 79 | data += blocks * SHA256_BLOCK_SIZE; | ||
| 80 | partial = 0; | ||
| 81 | } | ||
| 82 | if (len) | ||
| 83 | memcpy(sctx->buf + partial, data, len); | ||
| 84 | return 0; | ||
| 85 | } | ||
| 86 | |||
| 87 | static void sha2_final(struct shash_desc *desc) | ||
| 88 | { | ||
| 89 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
| 90 | |||
| 91 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 92 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
| 93 | u32 padlen = SHA256_BLOCK_SIZE | ||
| 94 | - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); | ||
| 95 | |||
| 96 | sha2_update(desc, padding, padlen); | ||
| 97 | sha2_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
| 98 | } | ||
| 99 | |||
| 100 | static int sha224_final(struct shash_desc *desc, u8 *out) | ||
| 101 | { | ||
| 102 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 103 | __be32 *dst = (__be32 *)out; | ||
| 104 | int i; | ||
| 105 | |||
| 106 | sha2_final(desc); | ||
| 107 | |||
| 108 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
| 109 | put_unaligned_be32(sctx->state[i], dst++); | ||
| 110 | |||
| 111 | *sctx = (struct sha256_state){}; | ||
| 112 | return 0; | ||
| 113 | } | ||
| 114 | |||
| 115 | static int sha256_final(struct shash_desc *desc, u8 *out) | ||
| 116 | { | ||
| 117 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 118 | __be32 *dst = (__be32 *)out; | ||
| 119 | int i; | ||
| 120 | |||
| 121 | sha2_final(desc); | ||
| 122 | |||
| 123 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
| 124 | put_unaligned_be32(sctx->state[i], dst++); | ||
| 125 | |||
| 126 | *sctx = (struct sha256_state){}; | ||
| 127 | return 0; | ||
| 128 | } | ||
| 129 | |||
| 130 | static void sha2_finup(struct shash_desc *desc, const u8 *data, | ||
| 131 | unsigned int len) | ||
| 132 | { | ||
| 133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 134 | int blocks; | ||
| 135 | |||
| 136 | if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { | ||
| 137 | sha2_update(desc, data, len); | ||
| 138 | sha2_final(desc); | ||
| 139 | return; | ||
| 140 | } | ||
| 141 | |||
| 142 | /* | ||
| 143 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
| 144 | * this case, there is no need to copy data around, and we can | ||
| 145 | * perform the entire digest calculation in a single invocation | ||
| 146 | * of sha2_ce_transform() | ||
| 147 | */ | ||
| 148 | blocks = len / SHA256_BLOCK_SIZE; | ||
| 149 | |||
| 150 | kernel_neon_begin_partial(28); | ||
| 151 | sha2_ce_transform(blocks, data, sctx->state, NULL, len); | ||
| 152 | kernel_neon_end(); | ||
| 153 | data += blocks * SHA256_BLOCK_SIZE; | ||
| 154 | } | ||
| 155 | |||
| 156 | static int sha224_finup(struct shash_desc *desc, const u8 *data, | ||
| 157 | unsigned int len, u8 *out) | ||
| 158 | { | ||
| 159 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 160 | __be32 *dst = (__be32 *)out; | ||
| 161 | int i; | ||
| 162 | |||
| 163 | sha2_finup(desc, data, len); | ||
| 164 | |||
| 165 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
| 166 | put_unaligned_be32(sctx->state[i], dst++); | ||
| 167 | |||
| 168 | *sctx = (struct sha256_state){}; | ||
| 169 | return 0; | ||
| 170 | } | ||
| 171 | |||
| 172 | static int sha256_finup(struct shash_desc *desc, const u8 *data, | ||
| 173 | unsigned int len, u8 *out) | ||
| 174 | { | ||
| 175 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 176 | __be32 *dst = (__be32 *)out; | ||
| 177 | int i; | ||
| 178 | |||
| 179 | sha2_finup(desc, data, len); | ||
| 180 | |||
| 181 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
| 182 | put_unaligned_be32(sctx->state[i], dst++); | ||
| 183 | |||
| 184 | *sctx = (struct sha256_state){}; | ||
| 185 | return 0; | ||
| 186 | } | ||
| 187 | |||
| 188 | static int sha2_export(struct shash_desc *desc, void *out) | ||
| 189 | { | ||
| 190 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 191 | struct sha256_state *dst = out; | ||
| 192 | |||
| 193 | *dst = *sctx; | ||
| 194 | return 0; | ||
| 195 | } | ||
| 196 | |||
| 197 | static int sha2_import(struct shash_desc *desc, const void *in) | ||
| 198 | { | ||
| 199 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
| 200 | struct sha256_state const *src = in; | ||
| 201 | |||
| 202 | *sctx = *src; | ||
| 203 | return 0; | ||
| 204 | } | ||
| 205 | |||
| 206 | static struct shash_alg algs[] = { { | ||
| 207 | .init = sha224_init, | ||
| 208 | .update = sha2_update, | ||
| 209 | .final = sha224_final, | ||
| 210 | .finup = sha224_finup, | ||
| 211 | .export = sha2_export, | ||
| 212 | .import = sha2_import, | ||
| 213 | .descsize = sizeof(struct sha256_state), | ||
| 214 | .digestsize = SHA224_DIGEST_SIZE, | ||
| 215 | .statesize = sizeof(struct sha256_state), | ||
| 216 | .base = { | ||
| 217 | .cra_name = "sha224", | ||
| 218 | .cra_driver_name = "sha224-ce", | ||
| 219 | .cra_priority = 200, | ||
| 220 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
| 221 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
| 222 | .cra_module = THIS_MODULE, | ||
| 223 | } | ||
| 224 | }, { | ||
| 225 | .init = sha256_init, | ||
| 226 | .update = sha2_update, | ||
| 227 | .final = sha256_final, | ||
| 228 | .finup = sha256_finup, | ||
| 229 | .export = sha2_export, | ||
| 230 | .import = sha2_import, | ||
| 231 | .descsize = sizeof(struct sha256_state), | ||
| 232 | .digestsize = SHA256_DIGEST_SIZE, | ||
| 233 | .statesize = sizeof(struct sha256_state), | ||
| 234 | .base = { | ||
| 235 | .cra_name = "sha256", | ||
| 236 | .cra_driver_name = "sha256-ce", | ||
| 237 | .cra_priority = 200, | ||
| 238 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
| 239 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
| 240 | .cra_module = THIS_MODULE, | ||
| 241 | } | ||
| 242 | } }; | ||
| 243 | |||
| 244 | static int __init sha2_ce_mod_init(void) | ||
| 245 | { | ||
| 246 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); | ||
| 247 | } | ||
| 248 | |||
| 249 | static void __exit sha2_ce_mod_fini(void) | ||
| 250 | { | ||
| 251 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); | ||
| 252 | } | ||
| 253 | |||
| 254 | module_cpu_feature_match(SHA2, sha2_ce_mod_init); | ||
| 255 | module_exit(sha2_ce_mod_fini); | ||
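module_cpu_feature_match(SHA2, ...) ties module autoloading and registration to the CPU advertising the SHA-2 instructions, the same capability the kernel exposes to userland as an ELF hwcap. As a hedged illustration (the HWCAP_SHA2 value is taken from the arm64 uapi headers of this era and is not part of the patch), a userspace program can probe for the feature like this:

```c
#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_SHA2
#define HWCAP_SHA2	(1 << 6)	/* arch/arm64/include/uapi/asm/hwcap.h */
#endif

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

	printf("ARMv8 SHA-2 instructions: %s\n",
	       (hwcap & HWCAP_SHA2) ? "present" : "absent");
	return 0;
}
```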
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 83f71b3004a8..42c7eecd2bb6 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild | |||
| @@ -40,6 +40,7 @@ generic-y += segment.h | |||
| 40 | generic-y += sembuf.h | 40 | generic-y += sembuf.h |
| 41 | generic-y += serial.h | 41 | generic-y += serial.h |
| 42 | generic-y += shmbuf.h | 42 | generic-y += shmbuf.h |
| 43 | generic-y += simd.h | ||
| 43 | generic-y += sizes.h | 44 | generic-y += sizes.h |
| 44 | generic-y += socket.h | 45 | generic-y += socket.h |
| 45 | generic-y += sockios.h | 46 | generic-y += sockios.h |
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index c43b4ac13008..50f559f574fe 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h | |||
| @@ -37,8 +37,21 @@ struct fpsimd_state { | |||
| 37 | u32 fpcr; | 37 | u32 fpcr; |
| 38 | }; | 38 | }; |
| 39 | }; | 39 | }; |
| 40 | /* the id of the last cpu to have restored this state */ | ||
| 41 | unsigned int cpu; | ||
| 40 | }; | 42 | }; |
| 41 | 43 | ||
| 44 | /* | ||
| 45 | * Struct for stacking the bottom 'n' FP/SIMD registers. | ||
| 46 | */ | ||
| 47 | struct fpsimd_partial_state { | ||
| 48 | u32 fpsr; | ||
| 49 | u32 fpcr; | ||
| 50 | u32 num_regs; | ||
| 51 | __uint128_t vregs[32]; | ||
| 52 | }; | ||
| 53 | |||
| 54 | |||
| 42 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) | 55 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) |
| 43 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ | 56 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ |
| 44 | #define VFP_FPSCR_STAT_MASK 0xf800009f | 57 | #define VFP_FPSCR_STAT_MASK 0xf800009f |
| @@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state); | |||
| 58 | extern void fpsimd_thread_switch(struct task_struct *next); | 71 | extern void fpsimd_thread_switch(struct task_struct *next); |
| 59 | extern void fpsimd_flush_thread(void); | 72 | extern void fpsimd_flush_thread(void); |
| 60 | 73 | ||
| 74 | extern void fpsimd_preserve_current_state(void); | ||
| 75 | extern void fpsimd_restore_current_state(void); | ||
| 76 | extern void fpsimd_update_current_state(struct fpsimd_state *state); | ||
| 77 | |||
| 78 | extern void fpsimd_flush_task_state(struct task_struct *target); | ||
| 79 | |||
| 80 | extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, | ||
| 81 | u32 num_regs); | ||
| 82 | extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); | ||
| 83 | |||
| 61 | #endif | 84 | #endif |
| 62 | 85 | ||
| 63 | #endif | 86 | #endif |
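The new externs give callers two distinct idioms: code acting on 'current' uses fpsimd_preserve_current_state()/fpsimd_restore_current_state()/fpsimd_update_current_state(), while code that rewrites another task's saved state in memory (ptrace, for example) must invalidate any live register copy with fpsimd_flush_task_state(). A hedged sketch of the second idiom, with placeholder names (the real ptrace code assigns only the user-visible part of the state):

```c
#include <linux/sched.h>
#include <asm/fpsimd.h>

/* Overwrite a stopped task's saved FPSIMD state and invalidate live copies. */
static void example_set_task_fpsimd(struct task_struct *target,
				    const struct fpsimd_state *newstate)
{
	target->thread.fpsimd_state = *newstate;
	fpsimd_flush_task_state(target);	/* any per-CPU register copy is now stale */
}
```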
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index bbec599c96bd..768414d55e64 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h | |||
| @@ -62,3 +62,38 @@ | |||
| 62 | ldr w\tmpnr, [\state, #16 * 2 + 4] | 62 | ldr w\tmpnr, [\state, #16 * 2 + 4] |
| 63 | msr fpcr, x\tmpnr | 63 | msr fpcr, x\tmpnr |
| 64 | .endm | 64 | .endm |
| 65 | |||
| 66 | .altmacro | ||
| 67 | .macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 | ||
| 68 | mrs x\tmpnr1, fpsr | ||
| 69 | str w\numnr, [\state, #8] | ||
| 70 | mrs x\tmpnr2, fpcr | ||
| 71 | stp w\tmpnr1, w\tmpnr2, [\state] | ||
| 72 | adr x\tmpnr1, 0f | ||
| 73 | add \state, \state, x\numnr, lsl #4 | ||
| 74 | sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 | ||
| 75 | br x\tmpnr1 | ||
| 76 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
| 77 | .irp qb, %(qa + 1) | ||
| 78 | stp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
| 79 | .endr | ||
| 80 | .endr | ||
| 81 | 0: | ||
| 82 | .endm | ||
| 83 | |||
| 84 | .macro fpsimd_restore_partial state, tmpnr1, tmpnr2 | ||
| 85 | ldp w\tmpnr1, w\tmpnr2, [\state] | ||
| 86 | msr fpsr, x\tmpnr1 | ||
| 87 | msr fpcr, x\tmpnr2 | ||
| 88 | adr x\tmpnr1, 0f | ||
| 89 | ldr w\tmpnr2, [\state, #8] | ||
| 90 | add \state, \state, x\tmpnr2, lsl #4 | ||
| 91 | sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 | ||
| 92 | br x\tmpnr1 | ||
| 93 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
| 94 | .irp qb, %(qa + 1) | ||
| 95 | ldp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
| 96 | .endr | ||
| 97 | .endr | ||
| 98 | 0: | ||
| 99 | .endm | ||
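fpsimd_save_partial and fpsimd_restore_partial avoid touching all 32 Q registers by computing a branch target inside a fully unrolled sequence of stp/ldp instructions: each instruction is 4 bytes and covers a pair of registers, so branching to `0f - (num_regs << 1)` executes exactly the last num_regs/2 stores, i.e. the bottom num_regs registers. A rough C analogue of that Duff's-device-style dispatch, purely as an illustration (register contents modelled as an array, destination layout simplified):

```c
/* Copy only the bottom num_regs "registers"; num_regs must be even, <= 32. */
static void save_bottom_regs(__uint128_t *dst, const __uint128_t *vregs,
			     unsigned int num_regs)
{
	switch (num_regs) {	/* intentional fall-through: start deep, run to the end */
	case 32: dst[30] = vregs[30]; dst[31] = vregs[31];
	case 30: dst[28] = vregs[28]; dst[29] = vregs[29];
	case 28: dst[26] = vregs[26]; dst[27] = vregs[27];
	case 26: dst[24] = vregs[24]; dst[25] = vregs[25];
	case 24: dst[22] = vregs[22]; dst[23] = vregs[23];
	case 22: dst[20] = vregs[20]; dst[21] = vregs[21];
	case 20: dst[18] = vregs[18]; dst[19] = vregs[19];
	case 18: dst[16] = vregs[16]; dst[17] = vregs[17];
	case 16: dst[14] = vregs[14]; dst[15] = vregs[15];
	case 14: dst[12] = vregs[12]; dst[13] = vregs[13];
	case 12: dst[10] = vregs[10]; dst[11] = vregs[11];
	case 10: dst[8]  = vregs[8];  dst[9]  = vregs[9];
	case  8: dst[6]  = vregs[6];  dst[7]  = vregs[7];
	case  6: dst[4]  = vregs[4];  dst[5]  = vregs[5];
	case  4: dst[2]  = vregs[2];  dst[3]  = vregs[3];
	case  2: dst[0]  = vregs[0];  dst[1]  = vregs[1];
	}
}
```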
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index b0cc58a97780..13ce4cc18e26 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h | |||
| @@ -8,7 +8,11 @@ | |||
| 8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
| 9 | */ | 9 | */ |
| 10 | 10 | ||
| 11 | #include <linux/types.h> | ||
| 12 | |||
| 11 | #define cpu_has_neon() (1) | 13 | #define cpu_has_neon() (1) |
| 12 | 14 | ||
| 13 | void kernel_neon_begin(void); | 15 | #define kernel_neon_begin() kernel_neon_begin_partial(32) |
| 16 | |||
| 17 | void kernel_neon_begin_partial(u32 num_regs); | ||
| 14 | void kernel_neon_end(void); | 18 | void kernel_neon_end(void); |
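With this change, code that only clobbers a few NEON registers can tell the kernel how many need preserving, which is what makes use from interrupt context affordable; kernel_neon_begin() is now simply the num_regs == 32 case. A minimal usage sketch (the helper name is hypothetical, not from the patch):

```c
#include <linux/linkage.h>
#include <asm/neon.h>

/* Hypothetical helper implemented in assembly, using only v0-v3. */
asmlinkage void my_neon_xor_block(void *dst, const void *src, int len);

static void xor_block(void *dst, const void *src, int len)
{
	kernel_neon_begin_partial(4);	/* only the bottom 4 registers are touched */
	my_neon_xor_block(dst, src, len);
	kernel_neon_end();
}
```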
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 0a8b2a97a32e..9c086c63f911 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h | |||
| @@ -103,6 +103,7 @@ static inline struct thread_info *current_thread_info(void) | |||
| 103 | #define TIF_SIGPENDING 0 | 103 | #define TIF_SIGPENDING 0 |
| 104 | #define TIF_NEED_RESCHED 1 | 104 | #define TIF_NEED_RESCHED 1 |
| 105 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ | 105 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ |
| 106 | #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ | ||
| 106 | #define TIF_SYSCALL_TRACE 8 | 107 | #define TIF_SYSCALL_TRACE 8 |
| 107 | #define TIF_SYSCALL_AUDIT 9 | 108 | #define TIF_SYSCALL_AUDIT 9 |
| 108 | #define TIF_SYSCALL_TRACEPOINT 10 | 109 | #define TIF_SYSCALL_TRACEPOINT 10 |
| @@ -118,6 +119,7 @@ static inline struct thread_info *current_thread_info(void) | |||
| 118 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | 119 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) |
| 119 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | 120 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) |
| 120 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 121 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
| 122 | #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) | ||
| 121 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | 123 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) |
| 122 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | 124 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) |
| 123 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) | 125 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) |
| @@ -125,7 +127,7 @@ static inline struct thread_info *current_thread_info(void) | |||
| 125 | #define _TIF_32BIT (1 << TIF_32BIT) | 127 | #define _TIF_32BIT (1 << TIF_32BIT) |
| 126 | 128 | ||
| 127 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ | 129 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ |
| 128 | _TIF_NOTIFY_RESUME) | 130 | _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) |
| 129 | 131 | ||
| 130 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ | 132 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ |
| 131 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) | 133 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) |
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..d358ccacfc00 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S | |||
| @@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state) | |||
| 41 | fpsimd_restore x0, 8 | 41 | fpsimd_restore x0, 8 |
| 42 | ret | 42 | ret |
| 43 | ENDPROC(fpsimd_load_state) | 43 | ENDPROC(fpsimd_load_state) |
| 44 | |||
| 45 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
| 46 | |||
| 47 | /* | ||
| 48 | * Save the bottom n FP registers. | ||
| 49 | * | ||
| 50 | * x0 - pointer to struct fpsimd_partial_state | ||
| 51 | */ | ||
| 52 | ENTRY(fpsimd_save_partial_state) | ||
| 53 | fpsimd_save_partial x0, 1, 8, 9 | ||
| 54 | ret | ||
| 55 | ENDPROC(fpsimd_save_partial_state) | ||

| 56 | |||
| 57 | /* | ||
| 58 | * Load the bottom n FP registers. | ||
| 59 | * | ||
| 60 | * x0 - pointer to struct fpsimd_partial_state | ||
| 61 | */ | ||
| 62 | ENTRY(fpsimd_load_partial_state) | ||
| 63 | fpsimd_restore_partial x0, 8, 9 | ||
| 64 | ret | ||
| 65 | ENDPROC(fpsimd_load_partial_state) | ||
| 66 | |||
| 67 | #endif | ||
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a670d0a98c89..bf017f4ffb4f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S | |||
| @@ -562,7 +562,7 @@ fast_work_pending: | |||
| 562 | str x0, [sp, #S_X0] // returned x0 | 562 | str x0, [sp, #S_X0] // returned x0 |
| 563 | work_pending: | 563 | work_pending: |
| 564 | tbnz x1, #TIF_NEED_RESCHED, work_resched | 564 | tbnz x1, #TIF_NEED_RESCHED, work_resched |
| 565 | /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ | 565 | /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ |
| 566 | ldr x2, [sp, #S_PSTATE] | 566 | ldr x2, [sp, #S_PSTATE] |
| 567 | mov x0, sp // 'regs' | 567 | mov x0, sp // 'regs' |
| 568 | tst x2, #PSR_MODE_MASK // user mode regs? | 568 | tst x2, #PSR_MODE_MASK // user mode regs? |
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4aef42a04bdc..ad8aebb1cdef 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c | |||
| @@ -35,6 +35,60 @@ | |||
| 35 | #define FPEXC_IDF (1 << 7) | 35 | #define FPEXC_IDF (1 << 7) |
| 36 | 36 | ||
| 37 | /* | 37 | /* |
| 38 | * In order to reduce the number of times the FPSIMD state is needlessly saved | ||
| 39 | * and restored, we need to keep track of two things: | ||
| 40 | * (a) for each task, we need to remember which CPU was the last one to have | ||
| 41 | * the task's FPSIMD state loaded into its FPSIMD registers; | ||
| 42 | * (b) for each CPU, we need to remember which task's userland FPSIMD state has | ||
| 43 | * been loaded into its FPSIMD registers most recently, or whether it has | ||
| 44 | * been used to perform kernel mode NEON in the meantime. | ||
| 45 | * | ||
| 46 | * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to | ||
| 47 | * the id of the current CPU every time the state is loaded onto a CPU. For (b), | ||
| 48 | * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the | ||
| 49 | * address of the userland FPSIMD state of the task that was loaded onto the CPU | ||
| 50 | * most recently, or NULL if kernel mode NEON has been performed after that. | ||
| 51 | * | ||
| 52 | * With this in place, we no longer have to restore the next FPSIMD state right | ||
| 53 | * when switching between tasks. Instead, we can defer this check to userland | ||
| 54 | * resume, at which time we verify whether the CPU's fpsimd_last_state and the | ||
| 55 | * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we | ||
| 56 | * can omit the FPSIMD restore. | ||
| 57 | * | ||
| 58 | * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to | ||
| 59 | * indicate whether or not the userland FPSIMD state of the current task is | ||
| 60 | * present in the registers. The flag is set unless the FPSIMD registers of this | ||
| 61 | * CPU currently contain the most recent userland FPSIMD state of the current | ||
| 62 | * task. | ||
| 63 | * | ||
| 64 | * For a certain task, the sequence may look something like this: | ||
| 65 | * - the task gets scheduled in; if both the task's fpsimd_state.cpu field | ||
| 66 | * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu | ||
| 67 | * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is | ||
| 68 | * cleared, otherwise it is set; | ||
| 69 | * | ||
| 70 | * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's | ||
| 71 | * userland FPSIMD state is copied from memory to the registers, the task's | ||
| 72 | * fpsimd_state.cpu field is set to the id of the current CPU, the current | ||
| 73 | * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the | ||
| 74 | * TIF_FOREIGN_FPSTATE flag is cleared; | ||
| 75 | * | ||
| 76 | * - the task executes an ordinary syscall; upon return to userland, the | ||
| 77 | * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is | ||
| 78 | * restored; | ||
| 79 | * | ||
| 80 | * - the task executes a syscall which executes some NEON instructions; this is | ||
| 81 | * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD | ||
| 82 | * register contents to memory, clears the fpsimd_last_state per-cpu variable | ||
| 83 | * and sets the TIF_FOREIGN_FPSTATE flag; | ||
| 84 | * | ||
| 85 | * - the task gets preempted after kernel_neon_end() is called; as we have not | ||
| 86 | * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so | ||
| 87 | * whatever is in the FPSIMD registers is not saved to memory, but discarded. | ||
| 88 | */ | ||
| 89 | static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); | ||
| 90 | |||
| 91 | /* | ||
| 38 | * Trapped FP/ASIMD access. | 92 | * Trapped FP/ASIMD access. |
| 39 | */ | 93 | */ |
| 40 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) | 94 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) |
| @@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) | |||
| 72 | 126 | ||
| 73 | void fpsimd_thread_switch(struct task_struct *next) | 127 | void fpsimd_thread_switch(struct task_struct *next) |
| 74 | { | 128 | { |
| 75 | /* check if not kernel threads */ | 129 | /* |
| 76 | if (current->mm) | 130 | * Save the current FPSIMD state to memory, but only if whatever is in |
| 131 | * the registers is in fact the most recent userland FPSIMD state of | ||
| 132 | * 'current'. | ||
| 133 | */ | ||
| 134 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
| 77 | fpsimd_save_state(¤t->thread.fpsimd_state); | 135 | fpsimd_save_state(¤t->thread.fpsimd_state); |
| 78 | if (next->mm) | 136 | |
| 79 | fpsimd_load_state(&next->thread.fpsimd_state); | 137 | if (next->mm) { |
| 138 | /* | ||
| 139 | * If we are switching to a task whose most recent userland | ||
| 140 | * FPSIMD state is already in the registers of *this* cpu, | ||
| 141 | * we can skip loading the state from memory. Otherwise, set | ||
| 142 | * the TIF_FOREIGN_FPSTATE flag so the state will be loaded | ||
| 143 | * upon the next return to userland. | ||
| 144 | */ | ||
| 145 | struct fpsimd_state *st = &next->thread.fpsimd_state; | ||
| 146 | |||
| 147 | if (__this_cpu_read(fpsimd_last_state) == st | ||
| 148 | && st->cpu == smp_processor_id()) | ||
| 149 | clear_ti_thread_flag(task_thread_info(next), | ||
| 150 | TIF_FOREIGN_FPSTATE); | ||
| 151 | else | ||
| 152 | set_ti_thread_flag(task_thread_info(next), | ||
| 153 | TIF_FOREIGN_FPSTATE); | ||
| 154 | } | ||
| 80 | } | 155 | } |
| 81 | 156 | ||
| 82 | void fpsimd_flush_thread(void) | 157 | void fpsimd_flush_thread(void) |
| 83 | { | 158 | { |
| 84 | preempt_disable(); | ||
| 85 | memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); | 159 | memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); |
| 86 | fpsimd_load_state(¤t->thread.fpsimd_state); | 160 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
| 161 | } | ||
| 162 | |||
| 163 | /* | ||
| 164 | * Save the userland FPSIMD state of 'current' to memory, but only if the state | ||
| 165 | * currently held in the registers does in fact belong to 'current' | ||
| 166 | */ | ||
| 167 | void fpsimd_preserve_current_state(void) | ||
| 168 | { | ||
| 169 | preempt_disable(); | ||
| 170 | if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
| 171 | fpsimd_save_state(¤t->thread.fpsimd_state); | ||
| 172 | preempt_enable(); | ||
| 173 | } | ||
| 174 | |||
| 175 | /* | ||
| 176 | * Load the userland FPSIMD state of 'current' from memory, but only if the | ||
| 177 | * FPSIMD state already held in the registers is /not/ the most recent FPSIMD | ||
| 178 | * state of 'current' | ||
| 179 | */ | ||
| 180 | void fpsimd_restore_current_state(void) | ||
| 181 | { | ||
| 182 | preempt_disable(); | ||
| 183 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
| 184 | struct fpsimd_state *st = ¤t->thread.fpsimd_state; | ||
| 185 | |||
| 186 | fpsimd_load_state(st); | ||
| 187 | this_cpu_write(fpsimd_last_state, st); | ||
| 188 | st->cpu = smp_processor_id(); | ||
| 189 | } | ||
| 190 | preempt_enable(); | ||
| 191 | } | ||
| 192 | |||
| 193 | /* | ||
| 194 | * Load an updated userland FPSIMD state for 'current' from memory and set the | ||
| 195 | * flag that indicates that the FPSIMD register contents are the most recent | ||
| 196 | * FPSIMD state of 'current' | ||
| 197 | */ | ||
| 198 | void fpsimd_update_current_state(struct fpsimd_state *state) | ||
| 199 | { | ||
| 200 | preempt_disable(); | ||
| 201 | fpsimd_load_state(state); | ||
| 202 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
| 203 | struct fpsimd_state *st = ¤t->thread.fpsimd_state; | ||
| 204 | |||
| 205 | this_cpu_write(fpsimd_last_state, st); | ||
| 206 | st->cpu = smp_processor_id(); | ||
| 207 | } | ||
| 87 | preempt_enable(); | 208 | preempt_enable(); |
| 88 | } | 209 | } |
| 89 | 210 | ||
| 211 | /* | ||
| 212 | * Invalidate live CPU copies of task t's FPSIMD state | ||
| 213 | */ | ||
| 214 | void fpsimd_flush_task_state(struct task_struct *t) | ||
| 215 | { | ||
| 216 | t->thread.fpsimd_state.cpu = NR_CPUS; | ||
| 217 | } | ||
| 218 | |||
| 90 | #ifdef CONFIG_KERNEL_MODE_NEON | 219 | #ifdef CONFIG_KERNEL_MODE_NEON |
| 91 | 220 | ||
| 221 | static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); | ||
| 222 | static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); | ||
| 223 | |||
| 92 | /* | 224 | /* |
| 93 | * Kernel-side NEON support functions | 225 | * Kernel-side NEON support functions |
| 94 | */ | 226 | */ |
| 95 | void kernel_neon_begin(void) | 227 | void kernel_neon_begin_partial(u32 num_regs) |
| 96 | { | 228 | { |
| 97 | /* Avoid using the NEON in interrupt context */ | 229 | if (in_interrupt()) { |
| 98 | BUG_ON(in_interrupt()); | 230 | struct fpsimd_partial_state *s = this_cpu_ptr( |
| 99 | preempt_disable(); | 231 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); |
| 100 | 232 | ||
| 101 | if (current->mm) | 233 | BUG_ON(num_regs > 32); |
| 102 | fpsimd_save_state(¤t->thread.fpsimd_state); | 234 | fpsimd_save_partial_state(s, roundup(num_regs, 2)); |
| 235 | } else { | ||
| 236 | /* | ||
| 237 | * Save the userland FPSIMD state if we have one and if we | ||
| 238 | * haven't done so already. Clear fpsimd_last_state to indicate | ||
| 239 | * that there is no longer userland FPSIMD state in the | ||
| 240 | * registers. | ||
| 241 | */ | ||
| 242 | preempt_disable(); | ||
| 243 | if (current->mm && | ||
| 244 | !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
| 245 | fpsimd_save_state(¤t->thread.fpsimd_state); | ||
| 246 | this_cpu_write(fpsimd_last_state, NULL); | ||
| 247 | } | ||
| 103 | } | 248 | } |
| 104 | EXPORT_SYMBOL(kernel_neon_begin); | 249 | EXPORT_SYMBOL(kernel_neon_begin_partial); |
| 105 | 250 | ||
| 106 | void kernel_neon_end(void) | 251 | void kernel_neon_end(void) |
| 107 | { | 252 | { |
| 108 | if (current->mm) | 253 | if (in_interrupt()) { |
| 109 | fpsimd_load_state(¤t->thread.fpsimd_state); | 254 | struct fpsimd_partial_state *s = this_cpu_ptr( |
| 110 | 255 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); | |
| 111 | preempt_enable(); | 256 | fpsimd_load_partial_state(s); |
| 257 | } else { | ||
| 258 | preempt_enable(); | ||
| 259 | } | ||
| 112 | } | 260 | } |
| 113 | EXPORT_SYMBOL(kernel_neon_end); | 261 | EXPORT_SYMBOL(kernel_neon_end); |
| 114 | 262 | ||
| @@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, | |||
| 120 | { | 268 | { |
| 121 | switch (cmd) { | 269 | switch (cmd) { |
| 122 | case CPU_PM_ENTER: | 270 | case CPU_PM_ENTER: |
| 123 | if (current->mm) | 271 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) |
| 124 | fpsimd_save_state(¤t->thread.fpsimd_state); | 272 | fpsimd_save_state(¤t->thread.fpsimd_state); |
| 125 | break; | 273 | break; |
| 126 | case CPU_PM_EXIT: | 274 | case CPU_PM_EXIT: |
| 127 | if (current->mm) | 275 | if (current->mm) |
| 128 | fpsimd_load_state(¤t->thread.fpsimd_state); | 276 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
| 129 | break; | 277 | break; |
| 130 | case CPU_PM_ENTER_FAILED: | 278 | case CPU_PM_ENTER_FAILED: |
| 131 | default: | 279 | default: |
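The bookkeeping described in the comment at the top of fpsimd.c boils down to a two-way handshake: this CPU's registers hold a task's most recent user state only if the per-cpu pointer still points at that task's fpsimd_state and the state's cpu field still names this CPU. Either link can be broken independently (kernel-mode NEON clears the per-cpu pointer; fpsimd_flush_task_state() poisons the cpu field with NR_CPUS), which is what keeps the lazy restore safe. A condensed sketch of the check, written as if it lived inside fpsimd.c next to the per-cpu variable, for illustration only:

```c
/*
 * Sketch mirroring the test in fpsimd_thread_switch(), not a new API:
 * true only if both links of the handshake are intact, i.e. nobody has
 * used the FPSIMD registers since this task's state was last loaded here.
 */
static bool cpu_holds_task_fpstate(struct fpsimd_state *st)
{
	return __this_cpu_read(fpsimd_last_state) == st &&
	       st->cpu == smp_processor_id();
}
```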
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index d04eb871cb0e..9f2d6020b6c2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c | |||
| @@ -206,7 +206,7 @@ void release_thread(struct task_struct *dead_task) | |||
| 206 | 206 | ||
| 207 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 207 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
| 208 | { | 208 | { |
| 209 | fpsimd_save_state(¤t->thread.fpsimd_state); | 209 | fpsimd_preserve_current_state(); |
| 210 | *dst = *src; | 210 | *dst = *src; |
| 211 | return 0; | 211 | return 0; |
| 212 | } | 212 | } |
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 4b58e812cf67..32d52d3b079c 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c | |||
| @@ -518,6 +518,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, | |||
| 518 | return ret; | 518 | return ret; |
| 519 | 519 | ||
| 520 | target->thread.fpsimd_state.user_fpsimd = newstate; | 520 | target->thread.fpsimd_state.user_fpsimd = newstate; |
| 521 | fpsimd_flush_task_state(target); | ||
| 521 | return ret; | 522 | return ret; |
| 522 | } | 523 | } |
| 523 | 524 | ||
| @@ -765,6 +766,7 @@ static int compat_vfp_set(struct task_struct *target, | |||
| 765 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; | 766 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; |
| 766 | } | 767 | } |
| 767 | 768 | ||
| 769 | fpsimd_flush_task_state(target); | ||
| 768 | return ret; | 770 | return ret; |
| 769 | } | 771 | } |
| 770 | 772 | ||
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 2ba72a11629f..6357b9c6c90e 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c | |||
| @@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) | |||
| 51 | int err; | 51 | int err; |
| 52 | 52 | ||
| 53 | /* dump the hardware registers to the fpsimd_state structure */ | 53 | /* dump the hardware registers to the fpsimd_state structure */ |
| 54 | fpsimd_save_state(fpsimd); | 54 | fpsimd_preserve_current_state(); |
| 55 | 55 | ||
| 56 | /* copy the FP and status/control registers */ | 56 | /* copy the FP and status/control registers */ |
| 57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); | 57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); |
| @@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) | |||
| 86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); | 86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); |
| 87 | 87 | ||
| 88 | /* load the hardware registers from the fpsimd_state structure */ | 88 | /* load the hardware registers from the fpsimd_state structure */ |
| 89 | if (!err) { | 89 | if (!err) |
| 90 | preempt_disable(); | 90 | fpsimd_update_current_state(&fpsimd); |
| 91 | fpsimd_load_state(&fpsimd); | ||
| 92 | preempt_enable(); | ||
| 93 | } | ||
| 94 | 91 | ||
| 95 | return err ? -EFAULT : 0; | 92 | return err ? -EFAULT : 0; |
| 96 | } | 93 | } |
| @@ -433,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, | |||
| 433 | clear_thread_flag(TIF_NOTIFY_RESUME); | 430 | clear_thread_flag(TIF_NOTIFY_RESUME); |
| 434 | tracehook_notify_resume(regs); | 431 | tracehook_notify_resume(regs); |
| 435 | } | 432 | } |
| 433 | |||
| 434 | if (thread_flags & _TIF_FOREIGN_FPSTATE) | ||
| 435 | fpsimd_restore_current_state(); | ||
| 436 | |||
| 436 | } | 437 | } |
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 050c1c2af777..3491c638f172 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c | |||
| @@ -222,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
| 222 | * Note that this also saves V16-31, which aren't visible | 222 | * Note that this also saves V16-31, which aren't visible |
| 223 | * in AArch32. | 223 | * in AArch32. |
| 224 | */ | 224 | */ |
| 225 | fpsimd_save_state(fpsimd); | 225 | fpsimd_preserve_current_state(); |
| 226 | 226 | ||
| 227 | /* Place structure header on the stack */ | 227 | /* Place structure header on the stack */ |
| 228 | __put_user_error(magic, &frame->magic, err); | 228 | __put_user_error(magic, &frame->magic, err); |
| @@ -285,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
| 285 | * We don't need to touch the exception register, so | 285 | * We don't need to touch the exception register, so |
| 286 | * reload the hardware state. | 286 | * reload the hardware state. |
| 287 | */ | 287 | */ |
| 288 | if (!err) { | 288 | if (!err) |
| 289 | preempt_disable(); | 289 | fpsimd_update_current_state(&fpsimd); |
| 290 | fpsimd_load_state(&fpsimd); | ||
| 291 | preempt_enable(); | ||
| 292 | } | ||
| 293 | 290 | ||
| 294 | return err ? -EFAULT : 0; | 291 | return err ? -EFAULT : 0; |
| 295 | } | 292 | } |
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index 03cf5936bad6..1ac097279db1 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h | |||
| @@ -4,22 +4,27 @@ | |||
| 4 | /* | 4 | /* |
| 5 | * This is the most generic implementation of unaligned accesses | 5 | * This is the most generic implementation of unaligned accesses |
| 6 | * and should work almost anywhere. | 6 | * and should work almost anywhere. |
| 7 | * | ||
| 8 | * If an architecture can handle unaligned accesses in hardware, | ||
| 9 | * it may want to use the linux/unaligned/access_ok.h implementation | ||
| 10 | * instead. | ||
| 11 | */ | 7 | */ |
| 12 | #include <asm/byteorder.h> | 8 | #include <asm/byteorder.h> |
| 13 | 9 | ||
| 10 | /* Set by the arch if it can handle unaligned accesses in hardware. */ | ||
| 11 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS | ||
| 12 | # include <linux/unaligned/access_ok.h> | ||
| 13 | #endif | ||
| 14 | |||
| 14 | #if defined(__LITTLE_ENDIAN) | 15 | #if defined(__LITTLE_ENDIAN) |
| 15 | # include <linux/unaligned/le_struct.h> | 16 | # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
| 16 | # include <linux/unaligned/be_byteshift.h> | 17 | # include <linux/unaligned/le_struct.h> |
| 18 | # include <linux/unaligned/be_byteshift.h> | ||
| 19 | # endif | ||
| 17 | # include <linux/unaligned/generic.h> | 20 | # include <linux/unaligned/generic.h> |
| 18 | # define get_unaligned __get_unaligned_le | 21 | # define get_unaligned __get_unaligned_le |
| 19 | # define put_unaligned __put_unaligned_le | 22 | # define put_unaligned __put_unaligned_le |
| 20 | #elif defined(__BIG_ENDIAN) | 23 | #elif defined(__BIG_ENDIAN) |
| 21 | # include <linux/unaligned/be_struct.h> | 24 | # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
| 22 | # include <linux/unaligned/le_byteshift.h> | 25 | # include <linux/unaligned/be_struct.h> |
| 26 | # include <linux/unaligned/le_byteshift.h> | ||
| 27 | # endif | ||
| 23 | # include <linux/unaligned/generic.h> | 28 | # include <linux/unaligned/generic.h> |
| 24 | # define get_unaligned __get_unaligned_be | 29 | # define get_unaligned __get_unaligned_be |
| 25 | # define put_unaligned __put_unaligned_be | 30 | # define put_unaligned __put_unaligned_be |
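The effect of the asm-generic change is that an architecture selecting CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS now gets the direct-dereference helpers from linux/unaligned/access_ok.h, while everyone else keeps the byte-shift/packed-struct fallbacks. Conceptually (a plain-C illustration, not the kernel headers themselves):

```c
#include <stdint.h>

/* Fallback flavour: assemble the value a byte at a time - safe on any CPU. */
static uint32_t get_unaligned_le32_byteshift(const void *p)
{
	const uint8_t *b = p;

	return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
	       ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

/*
 * access_ok.h flavour: dereference directly and let the CPU handle the
 * misalignment. Only valid when the architecture guarantees unaligned
 * loads work (a little-endian host is assumed in this illustration).
 */
static uint32_t get_unaligned_le32_direct(const void *p)
{
	return *(const uint32_t *)p;
}
```

This is what lets the put_unaligned_be32() calls in the crypto glue code above compile down to plain stores on architectures that select HAVE_EFFICIENT_UNALIGNED_ACCESS.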
