 arch/arm/crypto/Kconfig           |  9
 arch/arm/crypto/aes-cipher-core.S | 62
 crypto/aes_generic.c              |  9
 3 files changed, 66 insertions(+), 14 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index ef0c7feea6e2..0473a8f68389 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -69,6 +69,15 @@ config CRYPTO_AES_ARM
 	help
 	  Use optimized AES assembler routines for ARM platforms.
 
+	  On ARM processors without the Crypto Extensions, this is the
+	  fastest AES implementation for single blocks.  For multiple
+	  blocks, the NEON bit-sliced implementation is usually faster.
+
+	  This implementation may be vulnerable to cache timing attacks,
+	  since it uses lookup tables.  However, as a countermeasure it
+	  disables IRQs and preloads the tables; it is hoped this makes
+	  such attacks very difficult.
+
 config CRYPTO_AES_ARM_BS
 	tristate "Bit sliced AES using NEON instructions"
 	depends on KERNEL_MODE_NEON
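
The help text above summarizes the countermeasure that the assembly changes below implement. As a rough C sketch of the pattern (the function name is hypothetical; the real implementation is the assembly in aes-cipher-core.S):

	#include <linux/irqflags.h>
	#include <linux/types.h>

	/* Sketch only, not the actual kernel code. */
	static void aes_crypt_hardened(const u32 *tab)	/* 1024-byte lookup table */
	{
		unsigned long flags;
		volatile u32 sink;
		unsigned int i;

		local_irq_save(flags);	/* nothing may evict the table mid-operation */

		/* Preload the table: one load per 32-byte cacheline. */
		for (i = 0; i < 1024 / sizeof(u32); i += 32 / sizeof(u32))
			sink = tab[i];

		/* ... all data-dependent table lookups happen here ... */

		local_irq_restore(flags);	/* lookups done; IRQs safe again */
	}

With IRQs disabled, no interrupt handler or preempting task can evict table lines between the preload and the lookups, which is exactly the cache state a timing attacker would need to probe.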
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5..f2d67c095e59 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/cache.h>
 
 	.text
@@ -41,7 +42,7 @@
 	.endif
 	.endm
 
-	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
 	__select	\out0, \in0, 0
 	__select	t0, \in1, 1
 	__load		\out0, \out0, 0, \sz, \op
@@ -73,6 +74,14 @@
 	__load		t0, t0, 3, \sz, \op
 	__load		\t4, \t4, 3, \sz, \op
 
+	.ifnb		\oldcpsr
+	/*
+	 * This is the final round and we're done with all data-dependent table
+	 * lookups, so we can safely re-enable interrupts.
+	 */
+	restore_irqs	\oldcpsr
+	.endif
+
 	eor		\out1, \out1, t1, ror #24
 	eor		\out0, \out0, t2, ror #16
 	ldm		rk!, {t1, t2}
@@ -83,14 +92,14 @@
 	eor		\out1, \out1, t2
 	.endm
 
-	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
-	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
 	.endm
 
-	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
-	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
 	.endm
 
 	.macro		__rev, out, in
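
A note on the macro plumbing above: \oldcpsr is forwarded only to the second __hround, so restore_irqs runs exactly once, after the final round's last data-dependent lookup. A C analogue (hypothetical names; the assembly's .ifnb corresponds to the NULL check):

	#include <linux/irqflags.h>

	struct halfround_args;	/* stand-in for the macro's register arguments */

	static void __hround(struct halfround_args *a, unsigned long *oldcpsr)
	{
		/* ... four data-dependent table lookups ... */
		if (oldcpsr)				/* asm: .ifnb \oldcpsr */
			local_irq_restore(*oldcpsr);	/* asm: restore_irqs */
	}

	static void fround(struct halfround_args *a, unsigned long *oldcpsr)
	{
		__hround(a, NULL);	/* first half-round: IRQs stay disabled */
		__hround(a, oldcpsr);	/* second half-round: may restore IRQs */
	}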
@@ -118,13 +127,14 @@
 	.macro		do_crypt, round, ttab, ltab, bsz
 	push		{r3-r11, lr}
 
+	// Load keys first, to reduce latency in case they're not cached yet.
+	ldm		rk!, {r8-r11}
+
 	ldr		r4, [in]
 	ldr		r5, [in, #4]
 	ldr		r6, [in, #8]
 	ldr		r7, [in, #12]
 
-	ldm		rk!, {r8-r11}
-
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
 	__rev		r5, r5
@@ -138,6 +148,25 @@
 	eor		r7, r7, r11
 
 	__adrl		ttab, \ttab
+	/*
+	 * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
+	 * L1 cache, assuming cacheline size >= 32.  This is a hardening measure
+	 * intended to make cache-timing attacks more difficult.  They may not
+	 * be fully prevented, however; see the paper
+	 * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+	 * ("Cache-timing attacks on AES") for a discussion of the many
+	 * difficulties involved in writing truly constant-time AES software.
+	 */
+	save_and_disable_irqs	t0
+	.set		i, 0
+	.rept		1024 / 128
+	ldr		r8, [ttab, #i + 0]
+	ldr		r9, [ttab, #i + 32]
+	ldr		r10, [ttab, #i + 64]
+	ldr		r11, [ttab, #i + 96]
+	.set		i, i + 128
+	.endr
+	push		{t0}		// oldcpsr
 
 	tst		rounds, #2
 	bne		1f
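
The .rept block above unrolls to 32 loads: eight iterations, each touching four cachelines spaced 32 bytes apart, covering the full 1024-byte table. The same access pattern in C (illustrative only; the helper name is invented):

	#include <linux/types.h>

	static void prefetch_aes_table(const u32 *tab)	/* 1024-byte table */
	{
		volatile u32 sink;
		unsigned int i;

		for (i = 0; i < 1024 / sizeof(u32); i += 128 / sizeof(u32)) {
			sink = tab[i +  0];	/* byte offset  +0 */
			sink = tab[i +  8];	/* byte offset +32 */
			sink = tab[i + 16];	/* byte offset +64 */
			sink = tab[i + 24];	/* byte offset +96 */
		}
	}

Four independent loads per iteration mirror the r8-r11 destinations in the assembly and let the loads issue back to back instead of serializing on a single register.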
@@ -151,8 +180,21 @@
 	\round		r4, r5, r6, r7, r8, r9, r10, r11
 	b		0b
 
-2:	__adrl		ttab, \ltab
-	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
+2:	.ifb		\ltab
+	add		ttab, ttab, #1
+	.else
+	__adrl		ttab, \ltab
+	// Prefetch inverse S-box for final round; see explanation above
+	.set		i, 0
+	.rept		256 / 64
+	ldr		t0, [ttab, #i + 0]
+	ldr		t1, [ttab, #i + 32]
+	.set		i, i + 64
+	.endr
+	.endif
+
+	pop		{rounds}	// oldcpsr
+	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
@@ -175,7 +217,7 @@
 	.endm
 
 ENTRY(__aes_arm_encrypt)
-	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+	do_crypt	fround, crypto_ft_tab,, 2
 ENDPROC(__aes_arm_encrypt)
 
 	.align		5
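
The .ifb \ltab branch above, together with the now-empty third argument in 'do_crypt fround, crypto_ft_tab,, 2', replaces the old 'crypto_ft_tab + 1' trick at the call site: the final encryption round needs only plain S-box bytes, and each 32-bit 'ft' entry already carries S(x) in its middle bytes, so bumping ttab by one byte lets the existing byte loads fetch S(x) directly; there is no separate table to prefetch. Decryption (the .else path) still loads a real final-round table and preloads its 256 bytes first, and 'pop {rounds}' reuses the no-longer-needed rounds register to hand the saved CPSR to the final round. A sketch of the byte arithmetic (hypothetical helper; the little-endian entry layout {2*S(x), S(x), S(x), 3*S(x)} is shown, though the byte at offset 1 is S(x) in either endianness):

	#include <linux/types.h>

	/* e.g. ft[0] = 0xa56363c6 -> bytes c6 63 63 a5 in LE memory; S(0) = 0x63 */
	static u8 sbox_from_ft(const u32 *ft, u8 x)
	{
		return ((const u8 *)ft)[4 * x + 1];	/* asm: add ttab, ttab, #1 */
	}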
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index ca554d57d01e..13df33aca463 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
 
 static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
 
-__visible const u32 crypto_ft_tab[4][256] = {
+/* cacheline-aligned to facilitate prefetching into cache */
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
 	{
 		0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
 		0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
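
The __cacheline_aligned annotation is what makes the stride-based preload in aes-cipher-core.S sufficient: starting on a cacheline boundary, 1024 bytes span exactly 1024 / 32 = 32 lines, one per prefetch load; an unaligned table could straddle a 33rd line the loop never touches. A quick check of the arithmetic (hypothetical helper, assuming a 32-byte line):

	/* Number of 32-byte cachelines touched by [addr, addr + size) */
	static unsigned int lines_spanned(unsigned long addr, unsigned long size)
	{
		return (addr + size - 1) / 32 - addr / 32 + 1;
	}

lines_spanned(a, 1024) is 32 when a is 32-byte aligned, but 33 otherwise.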
@@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
 		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
 	{
 		0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
 		0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
 		0x00000030, 0x00000036, 0x000000a5, 0x00000038,