author     Eric Biggers <ebiggers@google.com>        2018-10-18 00:37:59 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>  2018-11-09 04:36:48 -0500
commit     913a3aa07d16e5b302f408d497a4b829910de247 (patch)
tree       88e4aadf88930378116f3dd311f076fb6a78276d
parent     0a6a40c2a8c184a2fb467efacfb1cd338d719e0b (diff)
crypto: arm/aes - add some hardening against cache-timing attacks
Make the ARM scalar AES implementation closer to constant-time by
disabling interrupts and prefetching the tables into L1 cache.  This is
feasible because, due to ARM's "free" rotations, the main tables are
only 1024 bytes instead of the usual 4096 used by most AES
implementations.

On ARM Cortex-A7, the speed loss is only about 5%.  The resulting code
is still over twice as fast as aes_ti.c.  Responsiveness is potentially
a concern, but interrupts are only disabled for a single AES block.

Note that even after these changes, the implementation still isn't
necessarily guaranteed to be constant-time; see
https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
of the many difficulties involved in writing truly constant-time AES
software.  But it's valuable to make such attacks more difficult.

Much of this patch is based on patches suggested by Ard Biesheuvel.

Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--  arch/arm/crypto/Kconfig              9
-rw-r--r--  arch/arm/crypto/aes-cipher-core.S   62
-rw-r--r--  crypto/aes_generic.c                 9
3 files changed, 66 insertions, 14 deletions
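As an aid to reading the assembly below, the prefetch described in the commit
message can be sketched in C.  This is an illustrative sketch only, not code
from the patch; prefetch_table() is a hypothetical helper.  Like the assembly,
it assumes a cacheline size of at least 32 bytes and reads one word per
cacheline so the whole 1024-byte table is resident in L1 before any
data-dependent lookups:

#include <stdint.h>
#include <stddef.h>

/* Touch one word in every (assumed >= 32-byte) cacheline of the table. */
static void prefetch_table(const volatile uint32_t *tab, size_t bytes)
{
	size_t i;

	for (i = 0; i < bytes / sizeof(uint32_t); i += 32 / sizeof(uint32_t))
		(void)tab[i];	/* the volatile qualifier keeps the load */
}

A caller would use this on the first 1024 bytes of the forward table (what the
assembly below does with crypto_ft_tab) before starting the first round.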
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index ef0c7feea6e2..0473a8f68389 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -69,6 +69,15 @@ config CRYPTO_AES_ARM
 	help
 	  Use optimized AES assembler routines for ARM platforms.
 
+	  On ARM processors without the Crypto Extensions, this is the
+	  fastest AES implementation for single blocks.  For multiple
+	  blocks, the NEON bit-sliced implementation is usually faster.
+
+	  This implementation may be vulnerable to cache timing attacks,
+	  since it uses lookup tables.  However, as countermeasures it
+	  disables IRQs and preloads the tables; it is hoped this makes
+	  such attacks very difficult.
+
 config CRYPTO_AES_ARM_BS
 	tristate "Bit sliced AES using NEON instructions"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5..f2d67c095e59 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/cache.h>
 
 	.text
@@ -41,7 +42,7 @@
 	.endif
 	.endm
 
-	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
 	__select	\out0, \in0, 0
 	__select	t0, \in1, 1
 	__load		\out0, \out0, 0, \sz, \op
@@ -73,6 +74,14 @@
 	__load		t0, t0, 3, \sz, \op
 	__load		\t4, \t4, 3, \sz, \op
 
+	.ifnb		\oldcpsr
+	/*
+	 * This is the final round and we're done with all data-dependent table
+	 * lookups, so we can safely re-enable interrupts.
+	 */
+	restore_irqs	\oldcpsr
+	.endif
+
 	eor		\out1, \out1, t1, ror #24
 	eor		\out0, \out0, t2, ror #16
 	ldm		rk!, {t1, t2}
@@ -83,14 +92,14 @@
 	eor		\out1, \out1, t2
 	.endm
 
-	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
-	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
 	.endm
 
-	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
-	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
 	.endm
 
 	.macro		__rev, out, in
@@ -118,13 +127,14 @@
 	.macro		do_crypt, round, ttab, ltab, bsz
 	push		{r3-r11, lr}
 
+	// Load keys first, to reduce latency in case they're not cached yet.
+	ldm		rk!, {r8-r11}
+
 	ldr		r4, [in]
 	ldr		r5, [in, #4]
 	ldr		r6, [in, #8]
 	ldr		r7, [in, #12]
 
-	ldm		rk!, {r8-r11}
-
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
 	__rev		r5, r5
@@ -138,6 +148,25 @@
 	eor		r7, r7, r11
 
 	__adrl		ttab, \ttab
+	/*
+	 * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
+	 * L1 cache, assuming cacheline size >= 32.  This is a hardening measure
+	 * intended to make cache-timing attacks more difficult.  They may not
+	 * be fully prevented, however; see the paper
+	 * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+	 * ("Cache-timing attacks on AES") for a discussion of the many
+	 * difficulties involved in writing truly constant-time AES software.
+	 */
+	save_and_disable_irqs	t0
+	.set		i, 0
+	.rept		1024 / 128
+	ldr		r8, [ttab, #i + 0]
+	ldr		r9, [ttab, #i + 32]
+	ldr		r10, [ttab, #i + 64]
+	ldr		r11, [ttab, #i + 96]
+	.set		i, i + 128
+	.endr
+	push		{t0}		// oldcpsr
 
 	tst		rounds, #2
 	bne		1f
@@ -151,8 +180,21 @@
 	\round		r4, r5, r6, r7, r8, r9, r10, r11
 	b		0b
 
-2:	__adrl		ttab, \ltab
-	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
+2:	.ifb		\ltab
+	add		ttab, ttab, #1
+	.else
+	__adrl		ttab, \ltab
+	// Prefetch inverse S-box for final round; see explanation above
+	.set		i, 0
+	.rept		256 / 64
+	ldr		t0, [ttab, #i + 0]
+	ldr		t1, [ttab, #i + 32]
+	.set		i, i + 64
+	.endr
+	.endif
+
+	pop		{rounds}	// oldcpsr
+	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
@@ -175,7 +217,7 @@
 	.endm
 
 ENTRY(__aes_arm_encrypt)
-	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+	do_crypt	fround, crypto_ft_tab,, 2
 ENDPROC(__aes_arm_encrypt)
 
 	.align		5
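For reference, the interrupt handling added above (save_and_disable_irqs before
the table prefetch, restore_irqs inside the final round once the last
data-dependent lookup is done) corresponds to the usual
local_irq_save()/local_irq_restore() pattern in C.  A hedged sketch, not code
from this patch; encrypt_one_block() is a hypothetical stand-in for the
table-driven rounds:

#include <linux/irqflags.h>
#include <linux/types.h>

/* Hypothetical single-block primitive standing in for the assembly rounds. */
void encrypt_one_block(const void *ctx, u8 *dst, const u8 *src);

static void encrypt_block_hardened(const void *ctx, u8 *dst, const u8 *src)
{
	unsigned long flags;

	local_irq_save(flags);	/* interrupt handlers can't evict the tables */
	encrypt_one_block(ctx, dst, src);
	local_irq_restore(flags);	/* IRQs are off for one block only */
}

Keeping the masked window to a single block is what bounds the responsiveness
cost mentioned in the commit message.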
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index ca554d57d01e..13df33aca463 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
 
 static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
 
-__visible const u32 crypto_ft_tab[4][256] = {
+/* cacheline-aligned to facilitate prefetching into cache */
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
 	{
 		0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
 		0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
@@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
 		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
 	{
 		0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
 		0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
 		0x00000030, 0x00000036, 0x000000a5, 0x00000038,
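The __cacheline_aligned annotations above place each table at the start of a
cacheline, so a prefetch loop that strides by the cacheline size touches every
line of a table exactly once.  A minimal declaration sketch of the same pattern
(illustrative only; example_tab is a dummy name, not from this patch):

#include <linux/cache.h>
#include <linux/types.h>

/* A lookup table aligned to the cacheline size, as the AES tables are. */
static const u32 example_tab[256] __cacheline_aligned = {
	0x00000001, 0x00000002, 0x00000003, 0x00000004,
	/* ... remaining entries elided; unlisted entries are zero ... */
};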