diff options
| author | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2015-03-17 14:05:13 -0400 |
|---|---|---|
| committer | Will Deacon <will.deacon@arm.com> | 2015-03-19 06:43:57 -0400 |
| commit | 4a97abd44329bf7b9c57f020224da5f823c9c9ea (patch) | |
| tree | 7c22535e94706459719f71071113c57897de4bad /arch/arm64/crypto | |
| parent | b63dbef93f91d56cb4385fdd8d1765201d451136 (diff) | |
arm64/crypto: issue aese/aesmc instructions in pairs
This changes the AES core transform implementations to issue aese/aesmc
(and aesd/aesimc) in pairs. This enables a micro-architectural optimization
in recent Cortex-A5x cores that improves performance by 50-90%.
Measured performance in cycles per byte (Cortex-A57):
| | CBC enc | CBC dec | CTR |
|---|---|---|---|
| before | 3.64 | 1.34 | 1.32 |
| after | 1.95 | 0.85 | 0.93 |
Note that this results in a ~5% performance decrease for older cores.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Diffstat (limited to 'arch/arm64/crypto')
| -rw-r--r-- | arch/arm64/crypto/aes-ce-ccm-core.S | 12 | ||||
| -rw-r--r-- | arch/arm64/crypto/aes-ce.S | 10 |
2 files changed, 9 insertions, 13 deletions
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 432e4841cd81..a2a7fbcacc14 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
| @@ -101,19 +101,19 @@ ENTRY(ce_aes_ccm_final) | |||
| 101 | 0: mov v4.16b, v3.16b | 101 | 0: mov v4.16b, v3.16b |
| 102 | 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ | 102 | 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ |
| 103 | aese v0.16b, v4.16b | 103 | aese v0.16b, v4.16b |
| 104 | aese v1.16b, v4.16b | ||
| 105 | aesmc v0.16b, v0.16b | 104 | aesmc v0.16b, v0.16b |
| 105 | aese v1.16b, v4.16b | ||
| 106 | aesmc v1.16b, v1.16b | 106 | aesmc v1.16b, v1.16b |
| 107 | 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ | 107 | 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ |
| 108 | aese v0.16b, v5.16b | 108 | aese v0.16b, v5.16b |
| 109 | aese v1.16b, v5.16b | ||
| 110 | aesmc v0.16b, v0.16b | 109 | aesmc v0.16b, v0.16b |
| 110 | aese v1.16b, v5.16b | ||
| 111 | aesmc v1.16b, v1.16b | 111 | aesmc v1.16b, v1.16b |
| 112 | 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ | 112 | 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ |
| 113 | subs w3, w3, #3 | 113 | subs w3, w3, #3 |
| 114 | aese v0.16b, v3.16b | 114 | aese v0.16b, v3.16b |
| 115 | aese v1.16b, v3.16b | ||
| 116 | aesmc v0.16b, v0.16b | 115 | aesmc v0.16b, v0.16b |
| 116 | aese v1.16b, v3.16b | ||
| 117 | aesmc v1.16b, v1.16b | 117 | aesmc v1.16b, v1.16b |
| 118 | bpl 1b | 118 | bpl 1b |
| 119 | aese v0.16b, v4.16b | 119 | aese v0.16b, v4.16b |
| @@ -146,19 +146,19 @@ ENDPROC(ce_aes_ccm_final) | |||
| 146 | ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ | 146 | ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ |
| 147 | 2: /* inner loop: 3 rounds, 2x interleaved */ | 147 | 2: /* inner loop: 3 rounds, 2x interleaved */ |
| 148 | aese v0.16b, v4.16b | 148 | aese v0.16b, v4.16b |
| 149 | aese v1.16b, v4.16b | ||
| 150 | aesmc v0.16b, v0.16b | 149 | aesmc v0.16b, v0.16b |
| 150 | aese v1.16b, v4.16b | ||
| 151 | aesmc v1.16b, v1.16b | 151 | aesmc v1.16b, v1.16b |
| 152 | 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ | 152 | 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ |
| 153 | aese v0.16b, v5.16b | 153 | aese v0.16b, v5.16b |
| 154 | aese v1.16b, v5.16b | ||
| 155 | aesmc v0.16b, v0.16b | 154 | aesmc v0.16b, v0.16b |
| 155 | aese v1.16b, v5.16b | ||
| 156 | aesmc v1.16b, v1.16b | 156 | aesmc v1.16b, v1.16b |
| 157 | 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ | 157 | 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ |
| 158 | subs w7, w7, #3 | 158 | subs w7, w7, #3 |
| 159 | aese v0.16b, v3.16b | 159 | aese v0.16b, v3.16b |
| 160 | aese v1.16b, v3.16b | ||
| 161 | aesmc v0.16b, v0.16b | 160 | aesmc v0.16b, v0.16b |
| 161 | aese v1.16b, v3.16b | ||
| 162 | aesmc v1.16b, v1.16b | 162 | aesmc v1.16b, v1.16b |
| 163 | ld1 {v5.2d}, [x10], #16 /* load next round key */ | 163 | ld1 {v5.2d}, [x10], #16 /* load next round key */ |
| 164 | bpl 2b | 164 | bpl 2b |
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 685a18f731eb..78f3cfe92c08 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
| @@ -45,18 +45,14 @@ | |||
| 45 | 45 | ||
| 46 | .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 | 46 | .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 |
| 47 | aes\de \i0\().16b, \k\().16b | 47 | aes\de \i0\().16b, \k\().16b |
| 48 | .ifnb \i1 | ||
| 49 | aes\de \i1\().16b, \k\().16b | ||
| 50 | .ifnb \i3 | ||
| 51 | aes\de \i2\().16b, \k\().16b | ||
| 52 | aes\de \i3\().16b, \k\().16b | ||
| 53 | .endif | ||
| 54 | .endif | ||
| 55 | aes\mc \i0\().16b, \i0\().16b | 48 | aes\mc \i0\().16b, \i0\().16b |
| 56 | .ifnb \i1 | 49 | .ifnb \i1 |
| 50 | aes\de \i1\().16b, \k\().16b | ||
| 57 | aes\mc \i1\().16b, \i1\().16b | 51 | aes\mc \i1\().16b, \i1\().16b |
| 58 | .ifnb \i3 | 52 | .ifnb \i3 |
| 53 | aes\de \i2\().16b, \k\().16b | ||
| 59 | aes\mc \i2\().16b, \i2\().16b | 54 | aes\mc \i2\().16b, \i2\().16b |
| 55 | aes\de \i3\().16b, \k\().16b | ||
| 60 | aes\mc \i3\().16b, \i3\().16b | 56 | aes\mc \i3\().16b, \i3\().16b |
| 61 | .endif | 57 | .endif |
| 62 | .endif | 58 | .endif |
