author    chandramouli narayanan <mouli@linux.intel.com>  2014-06-10 12:22:47 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>        2014-06-20 09:27:58 -0400
commit    22cddcc7df8fd35d52646ee220658d26ef09da17 (patch)
tree      f9c3de5c99d79f8898610c8f6b97828c4d1f4bfa
parent    6574e6c64e971c9adb629e81e497afdb52b1c9df (diff)
crypto: aes - AES CTR x86_64 "by8" AVX optimization
This patch introduces "by8" AES CTR mode AVX optimization inspired by
Intel Optimized IPSEC Cryptographic library. For additional information,
please see:
http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972

The functions aes_ctr_enc_128_avx_by8(), aes_ctr_enc_192_avx_by8() and
aes_ctr_enc_256_avx_by8() are adapted from Intel Optimized IPSEC
Cryptographic library. When both AES and AVX features are enabled in a
platform, the glue code in the AESNI module overrides the existing "by4"
CTR mode en/decryption with the "by8" AES CTR mode en/decryption.

On a Haswell desktop, with turbo disabled and all cpus running at
maximum frequency, the "by8" CTR mode optimization shows better
performance results across data & key sizes as measured by tcrypt.

The average performance improvement of the "by8" version over the "by4"
version is as follows:

For 128 bit key and data sizes >= 256 bytes, there is a 10-16% improvement.
For 192 bit key and data sizes >= 256 bytes, there is a 20-22% improvement.
For 256 bit key and data sizes >= 256 bytes, there is a 20-25% improvement.

A typical run of tcrypt with AES CTR mode encryption of the "by4" and
"by8" optimization shows the following results:

tcrypt with "by4" AES CTR mode encryption optimization on a Haswell Desktop:
---------------------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 343 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 336 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 491 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1130 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7309 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 346 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 361 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 543 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1321 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9649 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 369 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 366 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 595 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1531 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 10522 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 336 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 350 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 487 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1129 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7287 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 350 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 359 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 635 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1324 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9595 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 364 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 377 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 604 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1527 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 10549 cycles (8192 bytes)

tcrypt with "by8" AES CTR mode encryption optimization on a Haswell Desktop:
---------------------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 340 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 330 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 450 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1043 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 6597 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 339 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 352 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 539 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1153 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 8458 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 353 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 360 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 512 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1277 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 8745 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 348 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 335 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 451 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1030 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 6611 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 354 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 346 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 488 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1154 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 8390 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 357 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 362 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 515 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1284 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 8681 cycles (8192 bytes)

crypto: Incorporate feedback to AES CTR mode optimization patch

Specifically, the following:
a) alignment around main loop in aes_ctrby8_avx_x86_64.S
b) .rodata around data constants used in the assembly code
c) the use of CONFIG_AVX in the glue code
d) fix up white space
e) informational message for "by8" AES CTR mode optimization
f) "by8" AES CTR mode optimization can be simply enabled
   if the platform supports both AES and AVX features

The optimization works superbly on Sandybridge as well. Testing on
Haswell shows no performance change since the last patch. Testing on
Sandybridge shows that the "by8" AES CTR mode optimization greatly
improves performance.

tcrypt log with "by4" AES CTR mode optimization on Sandybridge
--------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 383 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 408 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 707 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1864 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 12813 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 395 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 432 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 780 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 2132 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 15765 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 416 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 438 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 842 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 2383 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 16945 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 389 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 409 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 704 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1865 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 12783 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 409 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 434 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 792 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 2151 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 15804 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 421 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 444 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 840 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 2394 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 16928 cycles (8192 bytes)

tcrypt log with "by8" AES CTR mode optimization on Sandybridge
--------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 383 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 401 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 522 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1136 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7046 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 394 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 418 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 559 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1263 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9072 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 408 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 428 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 595 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1385 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 9224 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 390 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 402 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 530 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1135 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7079 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 414 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 417 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 572 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1312 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9073 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 415 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 454 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 598 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1407 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 9288 cycles (8192 bytes)

crypto: Fix redundant checks

a) Fix the redundant check for cpu_has_aes
b) Fix the key length check when invoking the CTR mode "by8"
   encryptor/decryptor

crypto: fix typo in AES ctr mode transform

Signed-off-by: Chandramouli Narayanan <mouli@linux.intel.com>
Reviewed-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
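As background for the patch below, here is a minimal C sketch of what one
"by8" CTR pass computes. It is illustrative only: aes_encrypt_block() and
ctr128_inc() are hypothetical stand-ins, not kernel APIs, and the real
assembly derives the eight counters in parallel (vpaddd with the
ddq_add_1..8 constants) and interleaves the AES rounds of all eight blocks
instead of serializing them as this sketch does.

    #include <stdint.h>

    /* Hypothetical single-block AES primitive (stands in for the AES-NI rounds). */
    void aes_encrypt_block(const void *key_sched, const uint8_t in[16],
                           uint8_t out[16]);

    /* Increment a 128-bit big-endian counter block by one. */
    static void ctr128_inc(uint8_t ctr[16])
    {
            for (int i = 15; i >= 0; i--)
                    if (++ctr[i])
                            break;
    }

    /*
     * One "by8" pass: derive 8 consecutive counter blocks from iv, encrypt
     * them, and XOR the keystream into 8 input blocks (128 bytes). Keeping
     * 8 blocks in flight (xmm0-xmm7 in the assembly) hides the latency of
     * each AESENC behind the other seven blocks.
     */
    static void ctr_by8_pass(const void *key_sched, uint8_t iv[16],
                             const uint8_t *in, uint8_t *out)
    {
            uint8_t keystream[8][16];

            for (int b = 0; b < 8; b++) {
                    aes_encrypt_block(key_sched, iv, keystream[b]);
                    ctr128_inc(iv);
            }
            for (int b = 0; b < 8; b++)
                    for (int i = 0; i < 16; i++)
                            out[b * 16 + i] = in[b * 16 + i] ^ keystream[b][i];
    }

Because CTR mode only XORs a keystream into the data, the same routine
serves for both encryption and decryption, which is why the assembly below
exports a single set of aes_ctr_enc_*_avx_by8 entry points.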
-rw-r--r--  arch/x86/crypto/Makefile                 |   2
-rw-r--r--  arch/x86/crypto/aes_ctrby8_avx-x86_64.S  | 546
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c       |  40
3 files changed, 585 insertions, 3 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a470de25570f..d551165a3159 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -78,7 +78,7 @@ ifeq ($(avx2_supported),yes)
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 000000000000..f091f122ed24
--- /dev/null
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,546 @@
+/*
+ * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptographic library.
+ * Additional information on it can be found at:
+ * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define CONCAT(a,b)     a##b
+#define VMOVDQ          vmovdqu
+
+#define xdata0          %xmm0
+#define xdata1          %xmm1
+#define xdata2          %xmm2
+#define xdata3          %xmm3
+#define xdata4          %xmm4
+#define xdata5          %xmm5
+#define xdata6          %xmm6
+#define xdata7          %xmm7
+#define xcounter        %xmm8
+#define xbyteswap       %xmm9
+#define xkey0           %xmm10
+#define xkey3           %xmm11
+#define xkey6           %xmm12
+#define xkey9           %xmm13
+#define xkey4           %xmm11
+#define xkey8           %xmm12
+#define xkey12          %xmm13
+#define xkeyA           %xmm14
+#define xkeyB           %xmm15
+
+#define p_in            %rdi
+#define p_iv            %rsi
+#define p_keys          %rdx
+#define p_out           %rcx
+#define num_bytes       %r8
+
+#define tmp             %r10
+#define DDQ(i)          CONCAT(ddq_add_,i)
+#define XMM(i)          CONCAT(%xmm, i)
+#define DDQ_DATA        0
+#define XDATA           1
+#define KEY_128         1
+#define KEY_192         2
+#define KEY_256         3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+        .octa 0x000102030405060708090A0B0C0D0E0F
+ddq_add_1:
+        .octa 0x00000000000000000000000000000001
+ddq_add_2:
+        .octa 0x00000000000000000000000000000002
+ddq_add_3:
+        .octa 0x00000000000000000000000000000003
+ddq_add_4:
+        .octa 0x00000000000000000000000000000004
+ddq_add_5:
+        .octa 0x00000000000000000000000000000005
+ddq_add_6:
+        .octa 0x00000000000000000000000000000006
+ddq_add_7:
+        .octa 0x00000000000000000000000000000007
+ddq_add_8:
+        .octa 0x00000000000000000000000000000008
+
+.text
+
+/* generate a unique variable for ddq_add_x */
+
+.macro setddq n
+        var_ddq_add = DDQ(\n)
+.endm
+
+/* generate a unique variable for xmm register */
+.macro setxdata n
+        var_xdata = XMM(\n)
+.endm
+
+/* club the numeric 'id' to the symbol 'name' */
+
+.macro club name, id
+.altmacro
+        .if \name == DDQ_DATA
+                setddq %\id
+        .elseif \name == XDATA
+                setxdata %\id
+        .endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len
+ * This increments p_in, but not p_out
+ */
+.macro do_aes b, k, key_len
+        .set by, \b
+        .set load_keys, \k
+        .set klen, \key_len
+
+        .if (load_keys)
+                vmovdqa 0*16(p_keys), xkey0
+        .endif
+
+        vpshufb xbyteswap, xcounter, xdata0
+
+        .set i, 1
+        .rept (by - 1)
+                club DDQ_DATA, i
+                club XDATA, i
+                vpaddd  var_ddq_add(%rip), xcounter, var_xdata
+                vpshufb xbyteswap, var_xdata, var_xdata
+                .set i, (i +1)
+        .endr
+
+        vmovdqa 1*16(p_keys), xkeyA
+
+        vpxor   xkey0, xdata0, xdata0
+        club DDQ_DATA, by
+        vpaddd  var_ddq_add(%rip), xcounter, xcounter
+
+        .set i, 1
+        .rept (by - 1)
+                club XDATA, i
+                vpxor   xkey0, var_xdata, var_xdata
+                .set i, (i +1)
+        .endr
+
+        vmovdqa 2*16(p_keys), xkeyB
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyA, var_xdata, var_xdata             /* key 1 */
+                .set i, (i +1)
+        .endr
+
+        .if (klen == KEY_128)
+                .if (load_keys)
+                        vmovdqa 3*16(p_keys), xkeyA
+                .endif
+        .else
+                vmovdqa 3*16(p_keys), xkeyA
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyB, var_xdata, var_xdata             /* key 2 */
+                .set i, (i +1)
+        .endr
+
+        add     $(16*by), p_in
+
+        .if (klen == KEY_128)
+                vmovdqa 4*16(p_keys), xkey4
+        .else
+                .if (load_keys)
+                        vmovdqa 4*16(p_keys), xkey4
+                .endif
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyA, var_xdata, var_xdata             /* key 3 */
+                .set i, (i +1)
+        .endr
+
+        vmovdqa 5*16(p_keys), xkeyA
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkey4, var_xdata, var_xdata             /* key 4 */
+                .set i, (i +1)
+        .endr
+
+        .if (klen == KEY_128)
+                .if (load_keys)
+                        vmovdqa 6*16(p_keys), xkeyB
+                .endif
+        .else
+                vmovdqa 6*16(p_keys), xkeyB
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyA, var_xdata, var_xdata             /* key 5 */
+                .set i, (i +1)
+        .endr
+
+        vmovdqa 7*16(p_keys), xkeyA
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyB, var_xdata, var_xdata             /* key 6 */
+                .set i, (i +1)
+        .endr
+
+        .if (klen == KEY_128)
+                vmovdqa 8*16(p_keys), xkey8
+        .else
+                .if (load_keys)
+                        vmovdqa 8*16(p_keys), xkey8
+                .endif
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyA, var_xdata, var_xdata             /* key 7 */
+                .set i, (i +1)
+        .endr
+
+        .if (klen == KEY_128)
+                .if (load_keys)
+                        vmovdqa 9*16(p_keys), xkeyA
+                .endif
+        .else
+                vmovdqa 9*16(p_keys), xkeyA
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkey8, var_xdata, var_xdata             /* key 8 */
+                .set i, (i +1)
+        .endr
+
+        vmovdqa 10*16(p_keys), xkeyB
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                vaesenc xkeyA, var_xdata, var_xdata             /* key 9 */
+                .set i, (i +1)
+        .endr
+
+        .if (klen != KEY_128)
+                vmovdqa 11*16(p_keys), xkeyA
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                /* key 10 */
+                .if (klen == KEY_128)
+                        vaesenclast     xkeyB, var_xdata, var_xdata
+                .else
+                        vaesenc xkeyB, var_xdata, var_xdata
+                .endif
+                .set i, (i +1)
+        .endr
+
+        .if (klen != KEY_128)
+                .if (load_keys)
+                        vmovdqa 12*16(p_keys), xkey12
+                .endif
+
+                .set i, 0
+                .rept by
+                        club XDATA, i
+                        vaesenc xkeyA, var_xdata, var_xdata     /* key 11 */
+                        .set i, (i +1)
+                .endr
+
+                .if (klen == KEY_256)
+                        vmovdqa 13*16(p_keys), xkeyA
+                .endif
+
+                .set i, 0
+                .rept by
+                        club XDATA, i
+                        .if (klen == KEY_256)
+                                /* key 12 */
+                                vaesenc xkey12, var_xdata, var_xdata
+                        .else
+                                vaesenclast xkey12, var_xdata, var_xdata
+                        .endif
+                        .set i, (i +1)
+                .endr
+
+                .if (klen == KEY_256)
+                        vmovdqa 14*16(p_keys), xkeyB
+
+                        .set i, 0
+                        .rept by
+                                club XDATA, i
+                                /* key 13 */
+                                vaesenc xkeyA, var_xdata, var_xdata
+                                .set i, (i +1)
+                        .endr
+
+                        .set i, 0
+                        .rept by
+                                club XDATA, i
+                                /* key 14 */
+                                vaesenclast xkeyB, var_xdata, var_xdata
+                                .set i, (i +1)
+                        .endr
+                .endif
+        .endif
+
+        .set i, 0
+        .rept (by / 2)
+                .set j, (i+1)
+                VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
+                VMOVDQ  (j*16 - 16*by)(p_in), xkeyB
+                club XDATA, i
+                vpxor   xkeyA, var_xdata, var_xdata
+                club XDATA, j
+                vpxor   xkeyB, var_xdata, var_xdata
+                .set i, (i+2)
+        .endr
+
+        .if (i < by)
+                VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
+                club XDATA, i
+                vpxor   xkeyA, var_xdata, var_xdata
+        .endif
+
+        .set i, 0
+        .rept by
+                club XDATA, i
+                VMOVDQ  var_xdata, i*16(p_out)
+                .set i, (i+1)
+        .endr
+.endm
+
+.macro do_aes_load val, key_len
+        do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+        do_aes \val, 0, \key_len
+.endm
+
+/* main body of aes ctr load */
+
+.macro do_aes_ctrmain key_len
+
+        cmp     $16, num_bytes
+        jb      .Ldo_return2\key_len
+
+        vmovdqa byteswap_const(%rip), xbyteswap
+        vmovdqu (p_iv), xcounter
+        vpshufb xbyteswap, xcounter, xcounter
+
+        mov     num_bytes, tmp
+        and     $(7*16), tmp
+        jz      .Lmult_of_8_blks\key_len
+
+        /* 1 to 7 remainder blocks (16 <= tmp <= 112 bytes) */
+        cmp     $(4*16), tmp
+        jg      .Lgt4\key_len
+        je      .Leq4\key_len
+
+.Llt4\key_len:
+        cmp     $(2*16), tmp
+        jg      .Leq3\key_len
+        je      .Leq2\key_len
+
+.Leq1\key_len:
+        do_aes_load     1, \key_len
+        add     $(1*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Leq2\key_len:
+        do_aes_load     2, \key_len
+        add     $(2*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Leq3\key_len:
+        do_aes_load     3, \key_len
+        add     $(3*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Leq4\key_len:
+        do_aes_load     4, \key_len
+        add     $(4*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Lgt4\key_len:
+        cmp     $(6*16), tmp
+        jg      .Leq7\key_len
+        je      .Leq6\key_len
+
+.Leq5\key_len:
+        do_aes_load     5, \key_len
+        add     $(5*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Leq6\key_len:
+        do_aes_load     6, \key_len
+        add     $(6*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Leq7\key_len:
+        do_aes_load     7, \key_len
+        add     $(7*16), p_out
+        and     $(~7*16), num_bytes
+        jz      .Ldo_return2\key_len
+        jmp     .Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+        .if (\key_len != KEY_128)
+                vmovdqa 0*16(p_keys), xkey0
+                vmovdqa 4*16(p_keys), xkey4
+                vmovdqa 8*16(p_keys), xkey8
+                vmovdqa 12*16(p_keys), xkey12
+        .else
+                vmovdqa 0*16(p_keys), xkey0
+                vmovdqa 3*16(p_keys), xkey4
+                vmovdqa 6*16(p_keys), xkey8
+                vmovdqa 9*16(p_keys), xkey12
+        .endif
+.align 16
+.Lmain_loop2\key_len:
+        /* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
+        do_aes_noload   8, \key_len
+        add     $(8*16), p_out
+        sub     $(8*16), num_bytes
+        jne     .Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+        /* return updated IV */
+        vpshufb xbyteswap, xcounter, xcounter
+        vmovdqu xcounter, (p_iv)
+        ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ *                      unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+        /* call the aes main loop */
+        do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ *                      unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+        /* call the aes main loop */
+        do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ *                      unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+        /* call the aes main loop */
+        do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
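The label-heavy control flow of do_aes_ctrmain above reduces to a simple
shape: peel off the 1-7 block remainder first (so the main loop can assume
full 8-block batches with round keys already resident in registers), then
iterate eight blocks at a time. A hedged C sketch of that shape follows;
process_blocks() is an illustrative stand-in for the do_aes_load /
do_aes_noload expansions, and num_bytes is assumed to be a multiple of 16
because the glue code masks off any partial block before calling in.

    #include <stdint.h>

    /* Illustrative stand-in for the do_aes_load/do_aes_noload expansions;
     * encrypts nblocks counter blocks, XORs them into in, advances iv. */
    void process_blocks(const uint8_t *in, uint8_t *iv, const void *keys,
                        uint8_t *out, int nblocks);

    static void ctr_by8_shape(const uint8_t *in, uint8_t *iv,
                              const void *keys, uint8_t *out,
                              unsigned int num_bytes)
    {
            unsigned int rem;

            if (num_bytes < 16)             /* cmp $16 / jb .Ldo_return2 */
                    return;

            rem = num_bytes & (7 * 16);     /* bytes in the 1-7 block remainder */
            if (rem) {
                    process_blocks(in, iv, keys, out, rem / 16);
                    in  += rem;
                    out += rem;
                    num_bytes &= ~(7 * 16); /* now a multiple of 8 blocks */
                    if (!num_bytes)
                            return;         /* jz .Ldo_return2 */
            }
            while (num_bytes) {             /* .Lmain_loop2 */
                    process_blocks(in, iv, keys, out, 8);
                    in  += 8 * 16;
                    out += 8 * 16;
                    num_bytes -= 8 * 16;
            }
    }

Handling the remainder before the main loop, rather than after, is what
lets .Lmult_of_8_blks preload xkey0/xkey4/xkey8/xkey12 once and run the
hot loop with do_aes_noload.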
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 948ad0e77741..888950f29fd9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -105,6 +105,9 @@ void crypto_fpu_exit(void);
 #define AVX_GEN4_OPTSIZE 4096
 
 #ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
@@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
 
 
 #ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+		void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+		void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+		void *keys, u8 *out, unsigned int num_bytes);
 /*
  * asmlinkage void aesni_gcm_precomp_avx_gen2()
  *	gcm_data *my_ctx_data, context data
@@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
 	crypto_inc(ctrblk, AES_BLOCK_SIZE);
 }
 
+#ifdef CONFIG_AS_AVX
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv)
+{
+	/*
+	 * based on key length, override with the by8 version
+	 * of ctr mode encryption/decryption for improved performance
+	 * aes_set_key_common() ensures that key length is one of
+	 * {128,192,256}
+	 */
+	if (ctx->key_length == AES_KEYSIZE_128)
+		aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+	else if (ctx->key_length == AES_KEYSIZE_192)
+		aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+	else
+		aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
 static int ctr_crypt(struct blkcipher_desc *desc,
 		     struct scatterlist *dst, struct scatterlist *src,
 		     unsigned int nbytes)
@@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc,
 
 	kernel_fpu_begin();
 	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
-		aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
-			      nbytes & AES_BLOCK_MASK, walk.iv);
+		aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK, walk.iv);
 		nbytes &= AES_BLOCK_SIZE - 1;
 		err = blkcipher_walk_done(desc, &walk, nbytes);
 	}
@@ -1493,6 +1521,14 @@ static int __init aesni_init(void)
 		aesni_gcm_enc_tfm = aesni_gcm_enc;
 		aesni_gcm_dec_tfm = aesni_gcm_dec;
 	}
+	aesni_ctr_enc_tfm = aesni_ctr_enc;
+#ifdef CONFIG_AS_AVX
+	if (cpu_has_avx) {
+		/* optimize performance of ctr mode encryption transform */
+		aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+		pr_info("AES CTR mode by8 optimization enabled\n");
+	}
+#endif
 #endif
 
 	err = crypto_fpu_init();
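The glue changes follow the pattern the module already uses for
aesni_gcm_enc_tfm: resolve the best implementation once in aesni_init()
and route every request through a function pointer thereafter. A
self-contained sketch of that dispatch pattern, with generic names rather
than the kernel's:

    #include <stdio.h>

    typedef void (*ctr_fn)(const char *msg);

    static void ctr_by4(const char *msg) { printf("by4: %s\n", msg); }
    static void ctr_by8(const char *msg) { printf("by8: %s\n", msg); }

    /* Resolved once at init; every later call is one indirect jump. */
    static ctr_fn ctr_impl = ctr_by4;

    /* Hypothetical stand-in for the cpu_has_avx feature test. */
    static int have_avx(void) { return 1; }

    static void init_dispatch(void)
    {
            if (have_avx())
                    ctr_impl = ctr_by8;  /* mirrors the aesni_ctr_enc_tfm override */
    }

    int main(void)
    {
            init_dispatch();
            ctr_impl("encrypt request");
            return 0;
    }

The one-time override keeps the hot path branch-free, and the indirect
call is negligible next to the kernel_fpu_begin()/kernel_fpu_end() cost
that already brackets ctr_crypt().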