summaryrefslogtreecommitdiffstats
path: root/crypto
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-01-27 04:18:32 -0500
committerHerbert Xu <herbert@gondor.apana.org.au>2018-02-08 06:37:08 -0500
commit4767b9ad7d762876a5865a06465e13e139a01b6b (patch)
tree889c7e047525513f6b519760cf122f8cba8fbbeb /crypto
parent87a81dce53b1ea61acaeefa5191a0376a2d1d721 (diff)
crypto: sha3-generic - deal with oversize stack frames
As reported by kbuild test robot, the optimized SHA3 C implementation compiles to mn10300 code that uses a disproportionate amount of stack space, i.e., crypto/sha3_generic.c: In function 'keccakf': crypto/sha3_generic.c:147:1: warning: the frame size of 1232 bytes is larger than 1024 bytes [-Wframe-larger-than=] As kindly diagnosed by Arnd, this does not only occur when building for the mn10300 architecture (which is what the report was about) but also for h8300, and builds for other 32-bit architectures show an increase in stack space utilization as well. Given that SHA3 operates on 64-bit quantities, and keeps a state matrix of 25 64-bit words, it is not surprising that 32-bit architectures with few general purpose registers are impacted the most by this, and it is therefore reasonable to implement a workaround that distinguishes between 32-bit and 64-bit architectures. Arnd figured out that taking the round calculation out of the loop, and inlining it explicitly but only on 64-bit architectures preserves most of the performance gain achieved by the rewrite, and also gets rid of the excessive use of stack space. Reported-by: kbuild test robot <fengguang.wu@intel.com> Suggested-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'crypto')
-rw-r--r--crypto/sha3_generic.c218
1 files changed, 118 insertions, 100 deletions
diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index a965b9d80559..951c4eb70262 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -20,6 +20,20 @@
20#include <crypto/sha3.h> 20#include <crypto/sha3.h>
21#include <asm/unaligned.h> 21#include <asm/unaligned.h>
22 22
23/*
24 * On some 32-bit architectures (mn10300 and h8300), GCC ends up using
25 * over 1 KB of stack if we inline the round calculation into the loop
26 * in keccakf(). On the other hand, on 64-bit architectures with plenty
27 * of [64-bit wide] general purpose registers, not inlining it severely
28 * hurts performance. So let's use 64-bitness as a heuristic to decide
29 * whether to inline or not.
30 */
31#ifdef CONFIG_64BIT
32#define SHA3_INLINE inline
33#else
34#define SHA3_INLINE noinline
35#endif
36
23#define KECCAK_ROUNDS 24 37#define KECCAK_ROUNDS 24
24 38
25static const u64 keccakf_rndc[24] = { 39static const u64 keccakf_rndc[24] = {
@@ -35,111 +49,115 @@ static const u64 keccakf_rndc[24] = {
35 49
36/* update the state with given number of rounds */ 50/* update the state with given number of rounds */
37 51
38static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25]) 52static SHA3_INLINE void keccakf_round(u64 st[25])
39{ 53{
40 u64 t[5], tt, bc[5]; 54 u64 t[5], tt, bc[5];
41 int round;
42 55
43 for (round = 0; round < KECCAK_ROUNDS; round++) { 56 /* Theta */
57 bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
58 bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
59 bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
60 bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
61 bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
62
63 t[0] = bc[4] ^ rol64(bc[1], 1);
64 t[1] = bc[0] ^ rol64(bc[2], 1);
65 t[2] = bc[1] ^ rol64(bc[3], 1);
66 t[3] = bc[2] ^ rol64(bc[4], 1);
67 t[4] = bc[3] ^ rol64(bc[0], 1);
68
69 st[0] ^= t[0];
70
71 /* Rho Pi */
72 tt = st[1];
73 st[ 1] = rol64(st[ 6] ^ t[1], 44);
74 st[ 6] = rol64(st[ 9] ^ t[4], 20);
75 st[ 9] = rol64(st[22] ^ t[2], 61);
76 st[22] = rol64(st[14] ^ t[4], 39);
77 st[14] = rol64(st[20] ^ t[0], 18);
78 st[20] = rol64(st[ 2] ^ t[2], 62);
79 st[ 2] = rol64(st[12] ^ t[2], 43);
80 st[12] = rol64(st[13] ^ t[3], 25);
81 st[13] = rol64(st[19] ^ t[4], 8);
82 st[19] = rol64(st[23] ^ t[3], 56);
83 st[23] = rol64(st[15] ^ t[0], 41);
84 st[15] = rol64(st[ 4] ^ t[4], 27);
85 st[ 4] = rol64(st[24] ^ t[4], 14);
86 st[24] = rol64(st[21] ^ t[1], 2);
87 st[21] = rol64(st[ 8] ^ t[3], 55);
88 st[ 8] = rol64(st[16] ^ t[1], 45);
89 st[16] = rol64(st[ 5] ^ t[0], 36);
90 st[ 5] = rol64(st[ 3] ^ t[3], 28);
91 st[ 3] = rol64(st[18] ^ t[3], 21);
92 st[18] = rol64(st[17] ^ t[2], 15);
93 st[17] = rol64(st[11] ^ t[1], 10);
94 st[11] = rol64(st[ 7] ^ t[2], 6);
95 st[ 7] = rol64(st[10] ^ t[0], 3);
96 st[10] = rol64( tt ^ t[1], 1);
97
98 /* Chi */
99 bc[ 0] = ~st[ 1] & st[ 2];
100 bc[ 1] = ~st[ 2] & st[ 3];
101 bc[ 2] = ~st[ 3] & st[ 4];
102 bc[ 3] = ~st[ 4] & st[ 0];
103 bc[ 4] = ~st[ 0] & st[ 1];
104 st[ 0] ^= bc[ 0];
105 st[ 1] ^= bc[ 1];
106 st[ 2] ^= bc[ 2];
107 st[ 3] ^= bc[ 3];
108 st[ 4] ^= bc[ 4];
109
110 bc[ 0] = ~st[ 6] & st[ 7];
111 bc[ 1] = ~st[ 7] & st[ 8];
112 bc[ 2] = ~st[ 8] & st[ 9];
113 bc[ 3] = ~st[ 9] & st[ 5];
114 bc[ 4] = ~st[ 5] & st[ 6];
115 st[ 5] ^= bc[ 0];
116 st[ 6] ^= bc[ 1];
117 st[ 7] ^= bc[ 2];
118 st[ 8] ^= bc[ 3];
119 st[ 9] ^= bc[ 4];
120
121 bc[ 0] = ~st[11] & st[12];
122 bc[ 1] = ~st[12] & st[13];
123 bc[ 2] = ~st[13] & st[14];
124 bc[ 3] = ~st[14] & st[10];
125 bc[ 4] = ~st[10] & st[11];
126 st[10] ^= bc[ 0];
127 st[11] ^= bc[ 1];
128 st[12] ^= bc[ 2];
129 st[13] ^= bc[ 3];
130 st[14] ^= bc[ 4];
131
132 bc[ 0] = ~st[16] & st[17];
133 bc[ 1] = ~st[17] & st[18];
134 bc[ 2] = ~st[18] & st[19];
135 bc[ 3] = ~st[19] & st[15];
136 bc[ 4] = ~st[15] & st[16];
137 st[15] ^= bc[ 0];
138 st[16] ^= bc[ 1];
139 st[17] ^= bc[ 2];
140 st[18] ^= bc[ 3];
141 st[19] ^= bc[ 4];
142
143 bc[ 0] = ~st[21] & st[22];
144 bc[ 1] = ~st[22] & st[23];
145 bc[ 2] = ~st[23] & st[24];
146 bc[ 3] = ~st[24] & st[20];
147 bc[ 4] = ~st[20] & st[21];
148 st[20] ^= bc[ 0];
149 st[21] ^= bc[ 1];
150 st[22] ^= bc[ 2];
151 st[23] ^= bc[ 3];
152 st[24] ^= bc[ 4];
153}
44 154
45 /* Theta */ 155static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
46 bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; 156{
47 bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; 157 int round;
48 bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
49 bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
50 bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
51
52 t[0] = bc[4] ^ rol64(bc[1], 1);
53 t[1] = bc[0] ^ rol64(bc[2], 1);
54 t[2] = bc[1] ^ rol64(bc[3], 1);
55 t[3] = bc[2] ^ rol64(bc[4], 1);
56 t[4] = bc[3] ^ rol64(bc[0], 1);
57
58 st[0] ^= t[0];
59
60 /* Rho Pi */
61 tt = st[1];
62 st[ 1] = rol64(st[ 6] ^ t[1], 44);
63 st[ 6] = rol64(st[ 9] ^ t[4], 20);
64 st[ 9] = rol64(st[22] ^ t[2], 61);
65 st[22] = rol64(st[14] ^ t[4], 39);
66 st[14] = rol64(st[20] ^ t[0], 18);
67 st[20] = rol64(st[ 2] ^ t[2], 62);
68 st[ 2] = rol64(st[12] ^ t[2], 43);
69 st[12] = rol64(st[13] ^ t[3], 25);
70 st[13] = rol64(st[19] ^ t[4], 8);
71 st[19] = rol64(st[23] ^ t[3], 56);
72 st[23] = rol64(st[15] ^ t[0], 41);
73 st[15] = rol64(st[ 4] ^ t[4], 27);
74 st[ 4] = rol64(st[24] ^ t[4], 14);
75 st[24] = rol64(st[21] ^ t[1], 2);
76 st[21] = rol64(st[ 8] ^ t[3], 55);
77 st[ 8] = rol64(st[16] ^ t[1], 45);
78 st[16] = rol64(st[ 5] ^ t[0], 36);
79 st[ 5] = rol64(st[ 3] ^ t[3], 28);
80 st[ 3] = rol64(st[18] ^ t[3], 21);
81 st[18] = rol64(st[17] ^ t[2], 15);
82 st[17] = rol64(st[11] ^ t[1], 10);
83 st[11] = rol64(st[ 7] ^ t[2], 6);
84 st[ 7] = rol64(st[10] ^ t[0], 3);
85 st[10] = rol64( tt ^ t[1], 1);
86
87 /* Chi */
88 bc[ 0] = ~st[ 1] & st[ 2];
89 bc[ 1] = ~st[ 2] & st[ 3];
90 bc[ 2] = ~st[ 3] & st[ 4];
91 bc[ 3] = ~st[ 4] & st[ 0];
92 bc[ 4] = ~st[ 0] & st[ 1];
93 st[ 0] ^= bc[ 0];
94 st[ 1] ^= bc[ 1];
95 st[ 2] ^= bc[ 2];
96 st[ 3] ^= bc[ 3];
97 st[ 4] ^= bc[ 4];
98
99 bc[ 0] = ~st[ 6] & st[ 7];
100 bc[ 1] = ~st[ 7] & st[ 8];
101 bc[ 2] = ~st[ 8] & st[ 9];
102 bc[ 3] = ~st[ 9] & st[ 5];
103 bc[ 4] = ~st[ 5] & st[ 6];
104 st[ 5] ^= bc[ 0];
105 st[ 6] ^= bc[ 1];
106 st[ 7] ^= bc[ 2];
107 st[ 8] ^= bc[ 3];
108 st[ 9] ^= bc[ 4];
109
110 bc[ 0] = ~st[11] & st[12];
111 bc[ 1] = ~st[12] & st[13];
112 bc[ 2] = ~st[13] & st[14];
113 bc[ 3] = ~st[14] & st[10];
114 bc[ 4] = ~st[10] & st[11];
115 st[10] ^= bc[ 0];
116 st[11] ^= bc[ 1];
117 st[12] ^= bc[ 2];
118 st[13] ^= bc[ 3];
119 st[14] ^= bc[ 4];
120
121 bc[ 0] = ~st[16] & st[17];
122 bc[ 1] = ~st[17] & st[18];
123 bc[ 2] = ~st[18] & st[19];
124 bc[ 3] = ~st[19] & st[15];
125 bc[ 4] = ~st[15] & st[16];
126 st[15] ^= bc[ 0];
127 st[16] ^= bc[ 1];
128 st[17] ^= bc[ 2];
129 st[18] ^= bc[ 3];
130 st[19] ^= bc[ 4];
131
132 bc[ 0] = ~st[21] & st[22];
133 bc[ 1] = ~st[22] & st[23];
134 bc[ 2] = ~st[23] & st[24];
135 bc[ 3] = ~st[24] & st[20];
136 bc[ 4] = ~st[20] & st[21];
137 st[20] ^= bc[ 0];
138 st[21] ^= bc[ 1];
139 st[22] ^= bc[ 2];
140 st[23] ^= bc[ 3];
141 st[24] ^= bc[ 4];
142 158
159 for (round = 0; round < KECCAK_ROUNDS; round++) {
160 keccakf_round(st);
143 /* Iota */ 161 /* Iota */
144 st[0] ^= keccakf_rndc[round]; 162 st[0] ^= keccakf_rndc[round];
145 } 163 }