aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMathias Krause <minipli@googlemail.com>2014-09-28 16:23:59 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2014-10-02 02:35:03 -0400
commit80dca4734b3561be59879b02bce359b6f661e921 (patch)
tree4822ed2121ea7371691ab6a5b14e5e42584a7ae8
parent7a1ae9c0ce39d839044745956f08eabbea00d420 (diff)
crypto: aesni - fix counter overflow handling in "by8" variant
The "by8" CTR AVX implementation fails to properly handle counter
overflows. That was the reason it got disabled in commit 7da4b29d496b
("crypto: aesni - disable "by8" AVX CTR optimization").

Fix the overflow handling by incrementing the counter block as a double
quad word, i.e. a 128 bit, and testing for overflows afterwards. We need
to use VPTEST to do so as VPADD* does not set the flags itself and
silently drops the carry bit.

As this change adds branches to the hot path, minor performance
regressions might be a side effect. But, OTOH, we now have a conforming
implementation -- the preferable goal. A tcrypt test on a SandyBridge
system (i7-2620M) showed almost identical numbers for the old and this
version with differences within the noise range. A dm-crypt test with the
fixed version gave even slightly better results for this version. So the
performance impact might not be as big as expected.

Tested-by: Romain Francoise <romain@orebokech.com>
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: Chandramouli Narayanan <mouli@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S17
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index f091f122ed24..a029bc744244 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -108,6 +108,10 @@
 
 byteswap_const:
 	.octa 0x000102030405060708090A0B0C0D0E0F
+ddq_low_msk:
+	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
+ddq_high_add_1:
+	.octa 0x00000000000000010000000000000000
 ddq_add_1:
 	.octa 0x00000000000000000000000000000001
 ddq_add_2:
@@ -169,7 +173,12 @@ ddq_add_8:
 	.rept (by - 1)
 	club DDQ_DATA, i
 	club XDATA, i
-	vpaddd	var_ddq_add(%rip), xcounter, var_xdata
+	vpaddq	var_ddq_add(%rip), xcounter, var_xdata
+	vptest	ddq_low_msk(%rip), var_xdata
+	jnz	1f
+	vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
+	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
+1:
 	vpshufb	xbyteswap, var_xdata, var_xdata
 	.set i, (i +1)
 	.endr
@@ -178,7 +187,11 @@ ddq_add_8:
 
 	vpxor	xkey0, xdata0, xdata0
 	club DDQ_DATA, by
-	vpaddd	var_ddq_add(%rip), xcounter, xcounter
+	vpaddq	var_ddq_add(%rip), xcounter, xcounter
+	vptest	ddq_low_msk(%rip), xcounter
+	jnz	1f
+	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
+1:
 
 	.set i, 1
 	.rept (by - 1)