aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-01-19 07:04:34 -0500
committerHerbert Xu <herbert@gondor.apana.org.au>2018-01-25 09:10:33 -0500
commit83dee2ce1ae791c3dc0c9d4d3a8d42cb109613f6 (patch)
treed4a9df73a5b80d21367d70eb0e1358f66bbdd00f
parentc013cee99d5a18aec8c71fee8f5f41369cd12595 (diff)
crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize
The way the KECCAK transform is currently coded involves many references into the state array using indexes that are calculated at runtime using simple but non-trivial arithmetic. This forces the compiler to treat the state matrix as an array in memory rather than keep it in registers, which results in poor performance. So instead, let's rephrase the algorithm using fixed array indexes only. This helps the compiler keep the state matrix in registers, resulting in the following speedup (SHA3-256 performance in cycles per byte): before after speedup Intel Core i7 @ 2.0 GHz (2.9 turbo) 100.6 35.7 2.8x Cortex-A57 @ 2.0 GHz (64-bit mode) 101.6 12.7 8.0x Cortex-A53 @ 1.0 GHz 224.4 15.8 14.2x Cortex-A57 @ 2.0 GHz (32-bit mode) 201.8 63.0 3.2x Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--crypto/sha3_generic.c134
1 files changed, 96 insertions, 38 deletions
diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index a68be626017c..5fecb609e3be 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -5,6 +5,7 @@
5 * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf 5 * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
6 * 6 *
7 * SHA-3 code by Jeff Garzik <jeff@garzik.org> 7 * SHA-3 code by Jeff Garzik <jeff@garzik.org>
8 * Ard Biesheuvel <ard.biesheuvel@linaro.org>
8 * 9 *
9 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by the Free 11 * under the terms of the GNU General Public License as published by the Free
@@ -22,8 +23,6 @@
22 23
23#define KECCAK_ROUNDS 24 24#define KECCAK_ROUNDS 24
24 25
25#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
26
27static const u64 keccakf_rndc[24] = { 26static const u64 keccakf_rndc[24] = {
28 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 27 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
29 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, 28 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
@@ -35,53 +34,112 @@ static const u64 keccakf_rndc[24] = {
35 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL 34 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
36}; 35};
37 36
38static const int keccakf_rotc[24] = {
39 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
40 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
41};
42
43static const int keccakf_piln[24] = {
44 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
45 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
46};
47
48/* update the state with given number of rounds */ 37/* update the state with given number of rounds */
49 38
50static void keccakf(u64 st[25]) 39static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
51{ 40{
52 int i, j, round; 41 u64 t[5], tt, bc[5];
53 u64 t, bc[5]; 42 int round;
54 43
55 for (round = 0; round < KECCAK_ROUNDS; round++) { 44 for (round = 0; round < KECCAK_ROUNDS; round++) {
56 45
57 /* Theta */ 46 /* Theta */
58 for (i = 0; i < 5; i++) 47 bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
59 bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] 48 bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
60 ^ st[i + 20]; 49 bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
61 50 bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
62 for (i = 0; i < 5; i++) { 51 bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
63 t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); 52
64 for (j = 0; j < 25; j += 5) 53 t[0] = bc[4] ^ rol64(bc[1], 1);
65 st[j + i] ^= t; 54 t[1] = bc[0] ^ rol64(bc[2], 1);
66 } 55 t[2] = bc[1] ^ rol64(bc[3], 1);
56 t[3] = bc[2] ^ rol64(bc[4], 1);
57 t[4] = bc[3] ^ rol64(bc[0], 1);
58
59 st[0] ^= t[0];
67 60
68 /* Rho Pi */ 61 /* Rho Pi */
69 t = st[1]; 62 tt = st[1];
70 for (i = 0; i < 24; i++) { 63 st[ 1] = rol64(st[ 6] ^ t[1], 44);
71 j = keccakf_piln[i]; 64 st[ 6] = rol64(st[ 9] ^ t[4], 20);
72 bc[0] = st[j]; 65 st[ 9] = rol64(st[22] ^ t[2], 61);
73 st[j] = ROTL64(t, keccakf_rotc[i]); 66 st[22] = rol64(st[14] ^ t[4], 39);
74 t = bc[0]; 67 st[14] = rol64(st[20] ^ t[0], 18);
75 } 68 st[20] = rol64(st[ 2] ^ t[2], 62);
69 st[ 2] = rol64(st[12] ^ t[2], 43);
70 st[12] = rol64(st[13] ^ t[3], 25);
71 st[13] = rol64(st[19] ^ t[4], 8);
72 st[19] = rol64(st[23] ^ t[3], 56);
73 st[23] = rol64(st[15] ^ t[0], 41);
74 st[15] = rol64(st[ 4] ^ t[4], 27);
75 st[ 4] = rol64(st[24] ^ t[4], 14);
76 st[24] = rol64(st[21] ^ t[1], 2);
77 st[21] = rol64(st[ 8] ^ t[3], 55);
78 st[ 8] = rol64(st[16] ^ t[1], 45);
79 st[16] = rol64(st[ 5] ^ t[0], 36);
80 st[ 5] = rol64(st[ 3] ^ t[3], 28);
81 st[ 3] = rol64(st[18] ^ t[3], 21);
82 st[18] = rol64(st[17] ^ t[2], 15);
83 st[17] = rol64(st[11] ^ t[1], 10);
84 st[11] = rol64(st[ 7] ^ t[2], 6);
85 st[ 7] = rol64(st[10] ^ t[0], 3);
86 st[10] = rol64( tt ^ t[1], 1);
76 87
77 /* Chi */ 88 /* Chi */
78 for (j = 0; j < 25; j += 5) { 89 bc[ 0] = ~st[ 1] & st[ 2];
79 for (i = 0; i < 5; i++) 90 bc[ 1] = ~st[ 2] & st[ 3];
80 bc[i] = st[j + i]; 91 bc[ 2] = ~st[ 3] & st[ 4];
81 for (i = 0; i < 5; i++) 92 bc[ 3] = ~st[ 4] & st[ 0];
82 st[j + i] ^= (~bc[(i + 1) % 5]) & 93 bc[ 4] = ~st[ 0] & st[ 1];
83 bc[(i + 2) % 5]; 94 st[ 0] ^= bc[ 0];
84 } 95 st[ 1] ^= bc[ 1];
96 st[ 2] ^= bc[ 2];
97 st[ 3] ^= bc[ 3];
98 st[ 4] ^= bc[ 4];
99
100 bc[ 0] = ~st[ 6] & st[ 7];
101 bc[ 1] = ~st[ 7] & st[ 8];
102 bc[ 2] = ~st[ 8] & st[ 9];
103 bc[ 3] = ~st[ 9] & st[ 5];
104 bc[ 4] = ~st[ 5] & st[ 6];
105 st[ 5] ^= bc[ 0];
106 st[ 6] ^= bc[ 1];
107 st[ 7] ^= bc[ 2];
108 st[ 8] ^= bc[ 3];
109 st[ 9] ^= bc[ 4];
110
111 bc[ 0] = ~st[11] & st[12];
112 bc[ 1] = ~st[12] & st[13];
113 bc[ 2] = ~st[13] & st[14];
114 bc[ 3] = ~st[14] & st[10];
115 bc[ 4] = ~st[10] & st[11];
116 st[10] ^= bc[ 0];
117 st[11] ^= bc[ 1];
118 st[12] ^= bc[ 2];
119 st[13] ^= bc[ 3];
120 st[14] ^= bc[ 4];
121
122 bc[ 0] = ~st[16] & st[17];
123 bc[ 1] = ~st[17] & st[18];
124 bc[ 2] = ~st[18] & st[19];
125 bc[ 3] = ~st[19] & st[15];
126 bc[ 4] = ~st[15] & st[16];
127 st[15] ^= bc[ 0];
128 st[16] ^= bc[ 1];
129 st[17] ^= bc[ 2];
130 st[18] ^= bc[ 3];
131 st[19] ^= bc[ 4];
132
133 bc[ 0] = ~st[21] & st[22];
134 bc[ 1] = ~st[22] & st[23];
135 bc[ 2] = ~st[23] & st[24];
136 bc[ 3] = ~st[24] & st[20];
137 bc[ 4] = ~st[20] & st[21];
138 st[20] ^= bc[ 0];
139 st[21] ^= bc[ 1];
140 st[22] ^= bc[ 2];
141 st[23] ^= bc[ 3];
142 st[24] ^= bc[ 4];
85 143
86 /* Iota */ 144 /* Iota */
87 st[0] ^= keccakf_rndc[round]; 145 st[0] ^= keccakf_rndc[round];