author     Tim Chen <tim.c.chen@linux.intel.com>      2013-12-11 17:28:41 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>   2013-12-20 07:06:24 -0500
commit     d764593af924930d5c15685bc5946cb943da1a55 (patch)
tree       5f01056b5662ba704c85274f79c70e8cd6972bda /arch/x86/crypto
parent     fed286110f4bab01f93f06c32951fbc120fb71b1 (diff)
crypto: aesni - AVX and AVX2 version of AESNI-GCM encode and decode
We have added AVX and AVX2 routines that optimize AESNI-GCM encode/decode.
These routines are optimized for encrypting and decrypting large buffers.
In tests we have seen up to a 6% speedup for 1K, an 11% speedup for 2K, and an
18% speedup for 8K buffers over the existing SSE version. These routines
should provide even better speedups on future Intel x86_64 CPUs.
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
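
The companion change to aesni-intel_glue.c (visible only in the diffstat below) presumably wires these routines up behind CPU-feature checks. As a rough illustration of that dispatch idea only -- not the kernel code itself, and with made-up function names -- a user-space C sketch could look like this:

    /*
     * Illustrative sketch only: gcm_enc_sse/gcm_enc_avx/gcm_enc_avx2 are
     * made-up stand-ins for the SSE, AVX and AVX2 entry points.
     */
    #include <stdio.h>

    typedef void (*gcm_enc_fn)(void);

    static void gcm_enc_sse(void)  { puts("SSE GCM path");  }
    static void gcm_enc_avx(void)  { puts("AVX GCM path");  }
    static void gcm_enc_avx2(void) { puts("AVX2 GCM path"); }

    static gcm_enc_fn pick_gcm_enc(void)
    {
        if (__builtin_cpu_supports("avx2"))     /* GCC/Clang builtin */
            return gcm_enc_avx2;
        if (__builtin_cpu_supports("avx"))
            return gcm_enc_avx;
        return gcm_enc_sse;
    }

    int main(void)
    {
        __builtin_cpu_init();
        pick_gcm_enc()();
        return 0;
    }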
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--   arch/x86/crypto/Makefile            |    2
-rw-r--r--   arch/x86/crypto/aesni-intel_avx.S   | 2811
-rw-r--r--   arch/x86/crypto/aesni-intel_glue.c  |  143
3 files changed, 2953 insertions, 3 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0fc24db234a..84ee1e14f3b6 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -75,7 +75,7 @@ ifeq ($(avx2_supported),yes)
 serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 endif
 
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_avx.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_avx.S b/arch/x86/crypto/aesni-intel_avx.S
new file mode 100644
index 000000000000..522ab68d1c88
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx.S
@@ -0,0 +1,2811 @@
1 | ######################################################################## | ||
2 | # Copyright (c) 2013, Intel Corporation | ||
3 | # | ||
4 | # This software is available to you under a choice of one of two | ||
5 | # licenses. You may choose to be licensed under the terms of the GNU | ||
6 | # General Public License (GPL) Version 2, available from the file | ||
7 | # COPYING in the main directory of this source tree, or the | ||
8 | # OpenIB.org BSD license below: | ||
9 | # | ||
10 | # Redistribution and use in source and binary forms, with or without | ||
11 | # modification, are permitted provided that the following conditions are | ||
12 | # met: | ||
13 | # | ||
14 | # * Redistributions of source code must retain the above copyright | ||
15 | # notice, this list of conditions and the following disclaimer. | ||
16 | # | ||
17 | # * Redistributions in binary form must reproduce the above copyright | ||
18 | # notice, this list of conditions and the following disclaimer in the | ||
19 | # documentation and/or other materials provided with the | ||
20 | # distribution. | ||
21 | # | ||
22 | # * Neither the name of the Intel Corporation nor the names of its | ||
23 | # contributors may be used to endorse or promote products derived from | ||
24 | # this software without specific prior written permission. | ||
25 | # | ||
26 | # | ||
27 | # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY | ||
28 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
29 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
30 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR | ||
31 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
32 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
33 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR | ||
34 | # PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
35 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | ||
36 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
37 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
38 | ######################################################################## | ||
39 | ## | ||
40 | ## Authors: | ||
41 | ## Erdinc Ozturk <erdinc.ozturk@intel.com> | ||
42 | ## Vinodh Gopal <vinodh.gopal@intel.com> | ||
43 | ## James Guilford <james.guilford@intel.com> | ||
44 | ## Tim Chen <tim.c.chen@linux.intel.com> | ||
45 | ## | ||
46 | ## References: | ||
47 | ## This code was derived and highly optimized from the code described in paper: | ||
48 | ## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation | ||
49 | ## on Intel Architecture Processors. August, 2010 | ||
50 | ## The details of the implementation are explained in: | ||
51 | ## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode | ||
52 | ## on Intel Architecture Processors. October, 2012. | ||
53 | ## | ||
54 | ## Assumptions: | ||
55 | ## | ||
56 | ## | ||
57 | ## | ||
58 | ## iv: | ||
59 | ## 0 1 2 3 | ||
60 | ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
61 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
62 | ## | Salt (From the SA) | | ||
63 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
64 | ## | Initialization Vector | | ||
65 | ## | (This is the sequence number from IPSec header) | | ||
66 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
67 | ## | 0x1 | | ||
68 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
69 | ## | ||
70 | ## | ||
71 | ## | ||
72 | ## AAD: | ||
73 | ## AAD padded to 128 bits with 0 | ||
74 | ## for example, assume AAD is a u32 vector | ||
75 | ## | ||
76 | ## if AAD is 8 bytes: | ||
77 | ## AAD[3] = {A0, A1}# | ||
78 | ## padded AAD in xmm register = {A1 A0 0 0} | ||
79 | ## | ||
80 | ## 0 1 2 3 | ||
81 | ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
82 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
83 | ## | SPI (A1) | | ||
84 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
85 | ## | 32-bit Sequence Number (A0) | | ||
86 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
87 | ## | 0x0 | | ||
88 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
89 | ## | ||
90 | ## AAD Format with 32-bit Sequence Number | ||
91 | ## | ||
92 | ## if AAD is 12 bytes: | ||
93 | ## AAD[3] = {A0, A1, A2}# | ||
94 | ## padded AAD in xmm register = {A2 A1 A0 0} | ||
95 | ## | ||
96 | ## 0 1 2 3 | ||
97 | ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
98 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
99 | ## | SPI (A2) | | ||
100 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
101 | ## | 64-bit Extended Sequence Number {A1,A0} | | ||
102 | ## | | | ||
103 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
104 | ## | 0x0 | | ||
105 | ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
106 | ## | ||
107 | ## AAD Format with 64-bit Extended Sequence Number | ||
108 | ## | ||
109 | ## | ||
110 | ## aadLen: | ||
111 | ## from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
112 | ## The code additionally supports aadLen of length 16 bytes. | ||
113 | ## | ||
114 | ## TLen: | ||
115 | ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
116 | ## | ||
117 | ## poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
118 | ## throughout the code, one tab and two tab indentations are used. one tab is | ||
119 | ## for the GHASH part, two tabs are for the AES part. | ||
120 | ## | ||
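
To make the layouts in the header above concrete, here is a small user-space C sketch (illustrative only, helper names invented here, not part of this patch) that builds the pre-counter block salt || IV || 0x00000001 and zero-pads the AAD to a 16-byte block:

    #include <stdint.h>
    #include <string.h>

    /* Pre-counter block: 4-byte salt || 8-byte IV (ESP sequence number) || 0x00000001 */
    static void build_pre_counter(uint8_t j0[16], const uint8_t salt[4],
                                  const uint8_t iv[8])
    {
        memcpy(j0, salt, 4);
        memcpy(j0 + 4, iv, 8);
        j0[12] = 0x00;
        j0[13] = 0x00;
        j0[14] = 0x00;
        j0[15] = 0x01;
    }

    /* AAD (8, 12 or 16 bytes here) zero-padded to a full 128-bit block */
    static void pad_aad(uint8_t block[16], const uint8_t *aad, size_t aad_len)
    {
        memset(block, 0, 16);
        memcpy(block, aad, aad_len);
    }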
121 | |||
122 | #include <linux/linkage.h> | ||
123 | #include <asm/inst.h> | ||
124 | |||
125 | .data | ||
126 | .align 16 | ||
127 | |||
128 | POLY: .octa 0xC2000000000000000000000000000001 | ||
129 | POLY2: .octa 0xC20000000000000000000001C2000000 | ||
130 | TWOONE: .octa 0x00000001000000000000000000000001 | ||
131 | |||
132 | # order of these constants should not change. | ||
133 | # more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F | ||
134 | |||
135 | SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F | ||
136 | SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 | ||
137 | ALL_F: .octa 0xffffffffffffffffffffffffffffffff | ||
138 | ZERO: .octa 0x00000000000000000000000000000000 | ||
139 | ONE: .octa 0x00000000000000000000000000000001 | ||
140 | ONEf: .octa 0x01000000000000000000000000000000 | ||
141 | |||
142 | .text | ||
143 | |||
144 | |||
145 | ##define the fields of the gcm aes context | ||
146 | #{ | ||
147 | # u8 expanded_keys[16*11] store expanded keys | ||
148 | # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here | ||
149 | # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here | ||
150 | # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here | ||
151 | # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here | ||
152 | # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here | ||
153 | # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here | ||
154 | # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here | ||
155 | # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here | ||
156 | # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) | ||
157 | # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) | ||
158 | # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) | ||
159 | # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) | ||
160 | # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) | ||
161 | # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) | ||
162 | # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) | ||
163 | # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) | ||
164 | #} gcm_ctx# | ||
165 | |||
166 | HashKey = 16*11 # store HashKey <<1 mod poly here | ||
167 | HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here | ||
168 | HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here | ||
169 | HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here | ||
170 | HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here | ||
171 | HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here | ||
172 | HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here | ||
173 | HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here | ||
174 | HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) | ||
175 | HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) | ||
176 | HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) | ||
177 | HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) | ||
178 | HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) | ||
179 | HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) | ||
180 | HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) | ||
181 | HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) | ||
182 | |||
183 | #define arg1 %rdi | ||
184 | #define arg2 %rsi | ||
185 | #define arg3 %rdx | ||
186 | #define arg4 %rcx | ||
187 | #define arg5 %r8 | ||
188 | #define arg6 %r9 | ||
189 | #define arg7 STACK_OFFSET+8*1(%r14) | ||
190 | #define arg8 STACK_OFFSET+8*2(%r14) | ||
191 | #define arg9 STACK_OFFSET+8*3(%r14) | ||
192 | |||
193 | i = 0 | ||
194 | j = 0 | ||
195 | |||
196 | out_order = 0 | ||
197 | in_order = 1 | ||
198 | DEC = 0 | ||
199 | ENC = 1 | ||
200 | |||
201 | .macro define_reg r n | ||
202 | reg_\r = %xmm\n | ||
203 | .endm | ||
204 | |||
205 | .macro setreg | ||
206 | .altmacro | ||
207 | define_reg i %i | ||
208 | define_reg j %j | ||
209 | .noaltmacro | ||
210 | .endm | ||
211 | |||
212 | # need to push 4 registers into stack to maintain | ||
213 | STACK_OFFSET = 8*4 | ||
214 | |||
215 | TMP1 = 16*0 # Temporary storage for AAD | ||
216 | TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) | ||
217 | TMP3 = 16*2 # Temporary storage for AES State 3 | ||
218 | TMP4 = 16*3 # Temporary storage for AES State 4 | ||
219 | TMP5 = 16*4 # Temporary storage for AES State 5 | ||
220 | TMP6 = 16*5 # Temporary storage for AES State 6 | ||
221 | TMP7 = 16*6 # Temporary storage for AES State 7 | ||
222 | TMP8 = 16*7 # Temporary storage for AES State 8 | ||
223 | |||
224 | VARIABLE_OFFSET = 16*8 | ||
225 | |||
226 | ################################ | ||
227 | # Utility Macros | ||
228 | ################################ | ||
229 | |||
230 | # Encryption of a single block | ||
231 | .macro ENCRYPT_SINGLE_BLOCK XMM0 | ||
232 | vpxor (arg1), \XMM0, \XMM0 | ||
233 | i = 1 | ||
234 | setreg | ||
235 | .rep 9 | ||
236 | vaesenc 16*i(arg1), \XMM0, \XMM0 | ||
237 | i = (i+1) | ||
238 | setreg | ||
239 | .endr | ||
240 | vaesenclast 16*10(arg1), \XMM0, \XMM0 | ||
241 | .endm | ||
242 | |||
243 | #ifdef CONFIG_AS_AVX | ||
244 | ############################################################################### | ||
245 | # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | ||
246 | # Input: A and B (128-bits each, bit-reflected) | ||
247 | # Output: C = A*B*x mod poly, (i.e. >>1 ) | ||
248 | # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | ||
249 | # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | ||
250 | ############################################################################### | ||
251 | .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 | ||
252 | |||
253 | vpshufd $0b01001110, \GH, \T2 | ||
254 | vpshufd $0b01001110, \HK, \T3 | ||
255 | vpxor \GH , \T2, \T2 # T2 = (a1+a0) | ||
256 | vpxor \HK , \T3, \T3 # T3 = (b1+b0) | ||
257 | |||
258 | vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 | ||
259 | vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 | ||
260 | vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) | ||
261 | vpxor \GH, \T2,\T2 | ||
262 | vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 | ||
263 | |||
264 | vpslldq $8, \T2,\T3 # shift-L T3 2 DWs | ||
265 | vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs | ||
266 | vpxor \T3, \GH, \GH | ||
267 | vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK | ||
268 | |||
269 | #first phase of the reduction | ||
270 | vpslld $31, \GH, \T2 # packed left shifting << 31 | ||
271 | vpslld $30, \GH, \T3 # packed left shifting << 30 | ||
272 | vpslld $25, \GH, \T4 # packed left shifting << 25 | ||
273 | |||
274 | vpxor \T3, \T2, \T2 # xor the shifted versions | ||
275 | vpxor \T4, \T2, \T2 | ||
276 | |||
277 | vpsrldq $4, \T2, \T5 # shift-R T5 1 DW | ||
278 | |||
279 | vpslldq $12, \T2, \T2 # shift-L T2 3 DWs | ||
280 | vpxor \T2, \GH, \GH # first phase of the reduction complete | ||
281 | |||
282 | #second phase of the reduction | ||
283 | |||
284 | vpsrld $1,\GH, \T2 # packed right shifting >> 1 | ||
285 | vpsrld $2,\GH, \T3 # packed right shifting >> 2 | ||
286 | vpsrld $7,\GH, \T4 # packed right shifting >> 7 | ||
287 | vpxor \T3, \T2, \T2 # xor the shifted versions | ||
288 | vpxor \T4, \T2, \T2 | ||
289 | |||
290 | vpxor \T5, \T2, \T2 | ||
291 | vpxor \T2, \GH, \GH | ||
292 | vpxor \T1, \GH, \GH # the result is in GH | ||
293 | |||
294 | |||
295 | .endm | ||
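
For reference, the product this macro computes with PCLMULQDQ and a Karatsuba split is ordinary multiplication in GF(2^128) under the GCM polynomial x^128 + x^7 + x^2 + x + 1 (written above in its bit-reflected form x^128 + x^127 + x^126 + x^121 + 1). A slow but simple bit-by-bit C sketch of that multiply, using the spec's big-endian block convention rather than the reflected layout used here, is:

    /* Bit-by-bit reference multiply; illustrative only, not the PCLMULQDQ path. */
    #include <stdint.h>

    struct be128 { uint64_t hi, lo; };   /* hi = bytes 0..7, lo = bytes 8..15 */

    static struct be128 ghash_mul(struct be128 x, struct be128 h)
    {
        struct be128 z = { 0, 0 }, v = h;
        int i;

        for (i = 0; i < 128; i++) {
            /* bit i of X, numbered MSB-first as in the GCM spec */
            uint64_t xbit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                     : (x.lo >> (127 - i)) & 1;
            int lsb = v.lo & 1;

            if (xbit) {
                z.hi ^= v.hi;
                z.lo ^= v.lo;
            }
            /* V = V >> 1, reduced by R = 0xE1 || 0^120 when a bit falls off */
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (lsb)
                v.hi ^= 0xE1ULL << 56;
        }
        return z;
    }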
296 | |||
297 | .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 | ||
298 | |||
299 | # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i | ||
300 | vmovdqa \HK, \T5 | ||
301 | |||
302 | vpshufd $0b01001110, \T5, \T1 | ||
303 | vpxor \T5, \T1, \T1 | ||
304 | vmovdqa \T1, HashKey_k(arg1) | ||
305 | |||
306 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly | ||
307 | vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly | ||
308 | vpshufd $0b01001110, \T5, \T1 | ||
309 | vpxor \T5, \T1, \T1 | ||
310 | vmovdqa \T1, HashKey_2_k(arg1) | ||
311 | |||
312 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly | ||
313 | vmovdqa \T5, HashKey_3(arg1) | ||
314 | vpshufd $0b01001110, \T5, \T1 | ||
315 | vpxor \T5, \T1, \T1 | ||
316 | vmovdqa \T1, HashKey_3_k(arg1) | ||
317 | |||
318 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly | ||
319 | vmovdqa \T5, HashKey_4(arg1) | ||
320 | vpshufd $0b01001110, \T5, \T1 | ||
321 | vpxor \T5, \T1, \T1 | ||
322 | vmovdqa \T1, HashKey_4_k(arg1) | ||
323 | |||
324 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly | ||
325 | vmovdqa \T5, HashKey_5(arg1) | ||
326 | vpshufd $0b01001110, \T5, \T1 | ||
327 | vpxor \T5, \T1, \T1 | ||
328 | vmovdqa \T1, HashKey_5_k(arg1) | ||
329 | |||
330 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly | ||
331 | vmovdqa \T5, HashKey_6(arg1) | ||
332 | vpshufd $0b01001110, \T5, \T1 | ||
333 | vpxor \T5, \T1, \T1 | ||
334 | vmovdqa \T1, HashKey_6_k(arg1) | ||
335 | |||
336 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly | ||
337 | vmovdqa \T5, HashKey_7(arg1) | ||
338 | vpshufd $0b01001110, \T5, \T1 | ||
339 | vpxor \T5, \T1, \T1 | ||
340 | vmovdqa \T1, HashKey_7_k(arg1) | ||
341 | |||
342 | GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly | ||
343 | vmovdqa \T5, HashKey_8(arg1) | ||
344 | vpshufd $0b01001110, \T5, \T1 | ||
345 | vpxor \T5, \T1, \T1 | ||
346 | vmovdqa \T1, HashKey_8_k(arg1) | ||
347 | |||
348 | .endm | ||
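
The HashKey_i_k values saved above are the XOR of the high and low 64-bit halves of each HashKey power; they feed the Karatsuba trick used by the multiply macros, where the middle term a1*b0 ^ a0*b1 is recovered as (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0, saving one carry-less multiply per block. A toy C check of that identity with an 8x8-bit carry-less multiply (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    static uint16_t clmul8(uint8_t a, uint8_t b)  /* 8x8 -> 16-bit carry-less multiply */
    {
        uint16_t r = 0;
        int i;

        for (i = 0; i < 8; i++)
            if (a & (1u << i))
                r ^= (uint16_t)((uint16_t)b << i);
        return r;
    }

    int main(void)
    {
        uint8_t a1 = 0xA3, a0 = 0x5C, b1 = 0x7E, b0 = 0x91;
        uint16_t hi  = clmul8(a1, b1);
        uint16_t lo  = clmul8(a0, b0);
        uint16_t mid = (uint16_t)(clmul8(a1 ^ a0, b1 ^ b0) ^ hi ^ lo);

        /* Karatsuba middle term equals a1*b0 ^ a0*b1 */
        assert(mid == (clmul8(a1, b0) ^ clmul8(a0, b1)));
        return 0;
    }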
349 | |||
350 | ## if a = number of total plaintext bytes | ||
351 | ## b = floor(a/16) | ||
352 | ## num_initial_blocks = b mod 8# | ||
353 | ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext | ||
354 | ## r10, r11, r12, rax are clobbered | ||
355 | ## arg1, arg2, arg3, r14 are used as a pointer only, not modified | ||
356 | |||
357 | .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC | ||
358 | i = (8-\num_initial_blocks) | ||
359 | setreg | ||
360 | |||
361 | mov arg6, %r10 # r10 = AAD | ||
362 | mov arg7, %r12 # r12 = aadLen | ||
363 | |||
364 | |||
365 | mov %r12, %r11 | ||
366 | |||
367 | vpxor reg_i, reg_i, reg_i | ||
368 | _get_AAD_loop\@: | ||
369 | vmovd (%r10), \T1 | ||
370 | vpslldq $12, \T1, \T1 | ||
371 | vpsrldq $4, reg_i, reg_i | ||
372 | vpxor \T1, reg_i, reg_i | ||
373 | |||
374 | add $4, %r10 | ||
375 | sub $4, %r12 | ||
376 | jg _get_AAD_loop\@ | ||
377 | |||
378 | |||
379 | cmp $16, %r11 | ||
380 | je _get_AAD_loop2_done\@ | ||
381 | mov $16, %r12 | ||
382 | |||
383 | _get_AAD_loop2\@: | ||
384 | vpsrldq $4, reg_i, reg_i | ||
385 | sub $4, %r12 | ||
386 | cmp %r11, %r12 | ||
387 | jg _get_AAD_loop2\@ | ||
388 | |||
389 | _get_AAD_loop2_done\@: | ||
390 | |||
391 | #byte-reflect the AAD data | ||
392 | vpshufb SHUF_MASK(%rip), reg_i, reg_i | ||
393 | |||
394 | # initialize the data pointer offset as zero | ||
395 | xor %r11, %r11 | ||
396 | |||
397 | # start AES for num_initial_blocks blocks | ||
398 | mov arg5, %rax # rax = *Y0 | ||
399 | vmovdqu (%rax), \CTR # CTR = Y0 | ||
400 | vpshufb SHUF_MASK(%rip), \CTR, \CTR | ||
401 | |||
402 | |||
403 | i = (9-\num_initial_blocks) | ||
404 | setreg | ||
405 | .rep \num_initial_blocks | ||
406 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
407 | vmovdqa \CTR, reg_i | ||
408 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap | ||
409 | i = (i+1) | ||
410 | setreg | ||
411 | .endr | ||
412 | |||
413 | vmovdqa (arg1), \T_key | ||
414 | i = (9-\num_initial_blocks) | ||
415 | setreg | ||
416 | .rep \num_initial_blocks | ||
417 | vpxor \T_key, reg_i, reg_i | ||
418 | i = (i+1) | ||
419 | setreg | ||
420 | .endr | ||
421 | |||
422 | j = 1 | ||
423 | setreg | ||
424 | .rep 9 | ||
425 | vmovdqa 16*j(arg1), \T_key | ||
426 | i = (9-\num_initial_blocks) | ||
427 | setreg | ||
428 | .rep \num_initial_blocks | ||
429 | vaesenc \T_key, reg_i, reg_i | ||
430 | i = (i+1) | ||
431 | setreg | ||
432 | .endr | ||
433 | |||
434 | j = (j+1) | ||
435 | setreg | ||
436 | .endr | ||
437 | |||
438 | |||
439 | vmovdqa 16*10(arg1), \T_key | ||
440 | i = (9-\num_initial_blocks) | ||
441 | setreg | ||
442 | .rep \num_initial_blocks | ||
443 | vaesenclast \T_key, reg_i, reg_i | ||
444 | i = (i+1) | ||
445 | setreg | ||
446 | .endr | ||
447 | |||
448 | i = (9-\num_initial_blocks) | ||
449 | setreg | ||
450 | .rep \num_initial_blocks | ||
451 | vmovdqu (arg3, %r11), \T1 | ||
452 | vpxor \T1, reg_i, reg_i | ||
453 | vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks | ||
454 | add $16, %r11 | ||
455 | .if \ENC_DEC == DEC | ||
456 | vmovdqa \T1, reg_i | ||
457 | .endif | ||
458 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations | ||
459 | i = (i+1) | ||
460 | setreg | ||
461 | .endr | ||
462 | |||
463 | |||
464 | i = (8-\num_initial_blocks) | ||
465 | j = (9-\num_initial_blocks) | ||
466 | setreg | ||
467 | GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 | ||
468 | |||
469 | .rep \num_initial_blocks | ||
470 | vpxor reg_i, reg_j, reg_j | ||
471 | GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks | ||
472 | i = (i+1) | ||
473 | j = (j+1) | ||
474 | setreg | ||
475 | .endr | ||
476 | # XMM8 has the combined result here | ||
477 | |||
478 | vmovdqa \XMM8, TMP1(%rsp) | ||
479 | vmovdqa \XMM8, \T3 | ||
480 | |||
481 | cmp $128, %r13 | ||
482 | jl _initial_blocks_done\@ # no need for precomputed constants | ||
483 | |||
484 | ############################################################################### | ||
485 | # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i | ||
486 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
487 | vmovdqa \CTR, \XMM1 | ||
488 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
489 | |||
490 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
491 | vmovdqa \CTR, \XMM2 | ||
492 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
493 | |||
494 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
495 | vmovdqa \CTR, \XMM3 | ||
496 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
497 | |||
498 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
499 | vmovdqa \CTR, \XMM4 | ||
500 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
501 | |||
502 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
503 | vmovdqa \CTR, \XMM5 | ||
504 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
505 | |||
506 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
507 | vmovdqa \CTR, \XMM6 | ||
508 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
509 | |||
510 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
511 | vmovdqa \CTR, \XMM7 | ||
512 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
513 | |||
514 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
515 | vmovdqa \CTR, \XMM8 | ||
516 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
517 | |||
518 | vmovdqa (arg1), \T_key | ||
519 | vpxor \T_key, \XMM1, \XMM1 | ||
520 | vpxor \T_key, \XMM2, \XMM2 | ||
521 | vpxor \T_key, \XMM3, \XMM3 | ||
522 | vpxor \T_key, \XMM4, \XMM4 | ||
523 | vpxor \T_key, \XMM5, \XMM5 | ||
524 | vpxor \T_key, \XMM6, \XMM6 | ||
525 | vpxor \T_key, \XMM7, \XMM7 | ||
526 | vpxor \T_key, \XMM8, \XMM8 | ||
527 | |||
528 | i = 1 | ||
529 | setreg | ||
530 | .rep 9 # do 9 rounds | ||
531 | vmovdqa 16*i(arg1), \T_key | ||
532 | vaesenc \T_key, \XMM1, \XMM1 | ||
533 | vaesenc \T_key, \XMM2, \XMM2 | ||
534 | vaesenc \T_key, \XMM3, \XMM3 | ||
535 | vaesenc \T_key, \XMM4, \XMM4 | ||
536 | vaesenc \T_key, \XMM5, \XMM5 | ||
537 | vaesenc \T_key, \XMM6, \XMM6 | ||
538 | vaesenc \T_key, \XMM7, \XMM7 | ||
539 | vaesenc \T_key, \XMM8, \XMM8 | ||
540 | i = (i+1) | ||
541 | setreg | ||
542 | .endr | ||
543 | |||
544 | |||
545 | vmovdqa 16*i(arg1), \T_key | ||
546 | vaesenclast \T_key, \XMM1, \XMM1 | ||
547 | vaesenclast \T_key, \XMM2, \XMM2 | ||
548 | vaesenclast \T_key, \XMM3, \XMM3 | ||
549 | vaesenclast \T_key, \XMM4, \XMM4 | ||
550 | vaesenclast \T_key, \XMM5, \XMM5 | ||
551 | vaesenclast \T_key, \XMM6, \XMM6 | ||
552 | vaesenclast \T_key, \XMM7, \XMM7 | ||
553 | vaesenclast \T_key, \XMM8, \XMM8 | ||
554 | |||
555 | vmovdqu (arg3, %r11), \T1 | ||
556 | vpxor \T1, \XMM1, \XMM1 | ||
557 | vmovdqu \XMM1, (arg2 , %r11) | ||
558 | .if \ENC_DEC == DEC | ||
559 | vmovdqa \T1, \XMM1 | ||
560 | .endif | ||
561 | |||
562 | vmovdqu 16*1(arg3, %r11), \T1 | ||
563 | vpxor \T1, \XMM2, \XMM2 | ||
564 | vmovdqu \XMM2, 16*1(arg2 , %r11) | ||
565 | .if \ENC_DEC == DEC | ||
566 | vmovdqa \T1, \XMM2 | ||
567 | .endif | ||
568 | |||
569 | vmovdqu 16*2(arg3, %r11), \T1 | ||
570 | vpxor \T1, \XMM3, \XMM3 | ||
571 | vmovdqu \XMM3, 16*2(arg2 , %r11) | ||
572 | .if \ENC_DEC == DEC | ||
573 | vmovdqa \T1, \XMM3 | ||
574 | .endif | ||
575 | |||
576 | vmovdqu 16*3(arg3, %r11), \T1 | ||
577 | vpxor \T1, \XMM4, \XMM4 | ||
578 | vmovdqu \XMM4, 16*3(arg2 , %r11) | ||
579 | .if \ENC_DEC == DEC | ||
580 | vmovdqa \T1, \XMM4 | ||
581 | .endif | ||
582 | |||
583 | vmovdqu 16*4(arg3, %r11), \T1 | ||
584 | vpxor \T1, \XMM5, \XMM5 | ||
585 | vmovdqu \XMM5, 16*4(arg2 , %r11) | ||
586 | .if \ENC_DEC == DEC | ||
587 | vmovdqa \T1, \XMM5 | ||
588 | .endif | ||
589 | |||
590 | vmovdqu 16*5(arg3, %r11), \T1 | ||
591 | vpxor \T1, \XMM6, \XMM6 | ||
592 | vmovdqu \XMM6, 16*5(arg2 , %r11) | ||
593 | .if \ENC_DEC == DEC | ||
594 | vmovdqa \T1, \XMM6 | ||
595 | .endif | ||
596 | |||
597 | vmovdqu 16*6(arg3, %r11), \T1 | ||
598 | vpxor \T1, \XMM7, \XMM7 | ||
599 | vmovdqu \XMM7, 16*6(arg2 , %r11) | ||
600 | .if \ENC_DEC == DEC | ||
601 | vmovdqa \T1, \XMM7 | ||
602 | .endif | ||
603 | |||
604 | vmovdqu 16*7(arg3, %r11), \T1 | ||
605 | vpxor \T1, \XMM8, \XMM8 | ||
606 | vmovdqu \XMM8, 16*7(arg2 , %r11) | ||
607 | .if \ENC_DEC == DEC | ||
608 | vmovdqa \T1, \XMM8 | ||
609 | .endif | ||
610 | |||
611 | add $128, %r11 | ||
612 | |||
613 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
614 | vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext | ||
615 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
616 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
617 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
618 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
619 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
620 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
621 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
622 | |||
623 | ############################################################################### | ||
624 | |||
625 | _initial_blocks_done\@: | ||
626 | |||
627 | .endm | ||
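
As the macro header notes, the caller peels off (number of whole blocks) mod 8 blocks here before entering the 8-block parallel loop, and any final partial block is handled separately. A small C sketch of that partitioning arithmetic (illustrative only, not part of this patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned long len = 1000;                          /* example byte count */
        unsigned long blocks = len >> 4;                   /* whole 16-byte blocks */
        unsigned long initial = blocks & 7;                /* done by INITIAL_BLOCKS_AVX */
        unsigned long eight_wide = (blocks - initial) / 8; /* 8-block iterations */
        unsigned long tail = len & 15;                     /* final partial-block bytes */

        printf("initial=%lu eight_wide=%lu tail=%lu\n", initial, eight_wide, tail);
        return 0;
    }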
628 | |||
629 | # encrypt 8 blocks at a time | ||
630 | # ghash the 8 previously encrypted ciphertext blocks | ||
631 | # arg1, arg2, arg3 are used as pointers only, not modified | ||
632 | # r11 is the data offset value | ||
633 | .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC | ||
634 | |||
635 | vmovdqa \XMM1, \T2 | ||
636 | vmovdqa \XMM2, TMP2(%rsp) | ||
637 | vmovdqa \XMM3, TMP3(%rsp) | ||
638 | vmovdqa \XMM4, TMP4(%rsp) | ||
639 | vmovdqa \XMM5, TMP5(%rsp) | ||
640 | vmovdqa \XMM6, TMP6(%rsp) | ||
641 | vmovdqa \XMM7, TMP7(%rsp) | ||
642 | vmovdqa \XMM8, TMP8(%rsp) | ||
643 | |||
644 | .if \loop_idx == in_order | ||
645 | vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT | ||
646 | vpaddd ONE(%rip), \XMM1, \XMM2 | ||
647 | vpaddd ONE(%rip), \XMM2, \XMM3 | ||
648 | vpaddd ONE(%rip), \XMM3, \XMM4 | ||
649 | vpaddd ONE(%rip), \XMM4, \XMM5 | ||
650 | vpaddd ONE(%rip), \XMM5, \XMM6 | ||
651 | vpaddd ONE(%rip), \XMM6, \XMM7 | ||
652 | vpaddd ONE(%rip), \XMM7, \XMM8 | ||
653 | vmovdqa \XMM8, \CTR | ||
654 | |||
655 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
656 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
657 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
658 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
659 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
660 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
661 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
662 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
663 | .else | ||
664 | vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT | ||
665 | vpaddd ONEf(%rip), \XMM1, \XMM2 | ||
666 | vpaddd ONEf(%rip), \XMM2, \XMM3 | ||
667 | vpaddd ONEf(%rip), \XMM3, \XMM4 | ||
668 | vpaddd ONEf(%rip), \XMM4, \XMM5 | ||
669 | vpaddd ONEf(%rip), \XMM5, \XMM6 | ||
670 | vpaddd ONEf(%rip), \XMM6, \XMM7 | ||
671 | vpaddd ONEf(%rip), \XMM7, \XMM8 | ||
672 | vmovdqa \XMM8, \CTR | ||
673 | .endif | ||
674 | |||
675 | |||
676 | ####################################################################### | ||
677 | |||
678 | vmovdqu (arg1), \T1 | ||
679 | vpxor \T1, \XMM1, \XMM1 | ||
680 | vpxor \T1, \XMM2, \XMM2 | ||
681 | vpxor \T1, \XMM3, \XMM3 | ||
682 | vpxor \T1, \XMM4, \XMM4 | ||
683 | vpxor \T1, \XMM5, \XMM5 | ||
684 | vpxor \T1, \XMM6, \XMM6 | ||
685 | vpxor \T1, \XMM7, \XMM7 | ||
686 | vpxor \T1, \XMM8, \XMM8 | ||
687 | |||
688 | ####################################################################### | ||
689 | |||
690 | |||
691 | |||
692 | |||
693 | |||
694 | vmovdqu 16*1(arg1), \T1 | ||
695 | vaesenc \T1, \XMM1, \XMM1 | ||
696 | vaesenc \T1, \XMM2, \XMM2 | ||
697 | vaesenc \T1, \XMM3, \XMM3 | ||
698 | vaesenc \T1, \XMM4, \XMM4 | ||
699 | vaesenc \T1, \XMM5, \XMM5 | ||
700 | vaesenc \T1, \XMM6, \XMM6 | ||
701 | vaesenc \T1, \XMM7, \XMM7 | ||
702 | vaesenc \T1, \XMM8, \XMM8 | ||
703 | |||
704 | vmovdqu 16*2(arg1), \T1 | ||
705 | vaesenc \T1, \XMM1, \XMM1 | ||
706 | vaesenc \T1, \XMM2, \XMM2 | ||
707 | vaesenc \T1, \XMM3, \XMM3 | ||
708 | vaesenc \T1, \XMM4, \XMM4 | ||
709 | vaesenc \T1, \XMM5, \XMM5 | ||
710 | vaesenc \T1, \XMM6, \XMM6 | ||
711 | vaesenc \T1, \XMM7, \XMM7 | ||
712 | vaesenc \T1, \XMM8, \XMM8 | ||
713 | |||
714 | |||
715 | ####################################################################### | ||
716 | |||
717 | vmovdqa HashKey_8(arg1), \T5 | ||
718 | vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 | ||
719 | vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 | ||
720 | |||
721 | vpshufd $0b01001110, \T2, \T6 | ||
722 | vpxor \T2, \T6, \T6 | ||
723 | |||
724 | vmovdqa HashKey_8_k(arg1), \T5 | ||
725 | vpclmulqdq $0x00, \T5, \T6, \T6 | ||
726 | |||
727 | vmovdqu 16*3(arg1), \T1 | ||
728 | vaesenc \T1, \XMM1, \XMM1 | ||
729 | vaesenc \T1, \XMM2, \XMM2 | ||
730 | vaesenc \T1, \XMM3, \XMM3 | ||
731 | vaesenc \T1, \XMM4, \XMM4 | ||
732 | vaesenc \T1, \XMM5, \XMM5 | ||
733 | vaesenc \T1, \XMM6, \XMM6 | ||
734 | vaesenc \T1, \XMM7, \XMM7 | ||
735 | vaesenc \T1, \XMM8, \XMM8 | ||
736 | |||
737 | vmovdqa TMP2(%rsp), \T1 | ||
738 | vmovdqa HashKey_7(arg1), \T5 | ||
739 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
740 | vpxor \T3, \T4, \T4 | ||
741 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
742 | vpxor \T3, \T7, \T7 | ||
743 | |||
744 | vpshufd $0b01001110, \T1, \T3 | ||
745 | vpxor \T1, \T3, \T3 | ||
746 | vmovdqa HashKey_7_k(arg1), \T5 | ||
747 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
748 | vpxor \T3, \T6, \T6 | ||
749 | |||
750 | vmovdqu 16*4(arg1), \T1 | ||
751 | vaesenc \T1, \XMM1, \XMM1 | ||
752 | vaesenc \T1, \XMM2, \XMM2 | ||
753 | vaesenc \T1, \XMM3, \XMM3 | ||
754 | vaesenc \T1, \XMM4, \XMM4 | ||
755 | vaesenc \T1, \XMM5, \XMM5 | ||
756 | vaesenc \T1, \XMM6, \XMM6 | ||
757 | vaesenc \T1, \XMM7, \XMM7 | ||
758 | vaesenc \T1, \XMM8, \XMM8 | ||
759 | |||
760 | ####################################################################### | ||
761 | |||
762 | vmovdqa TMP3(%rsp), \T1 | ||
763 | vmovdqa HashKey_6(arg1), \T5 | ||
764 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
765 | vpxor \T3, \T4, \T4 | ||
766 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
767 | vpxor \T3, \T7, \T7 | ||
768 | |||
769 | vpshufd $0b01001110, \T1, \T3 | ||
770 | vpxor \T1, \T3, \T3 | ||
771 | vmovdqa HashKey_6_k(arg1), \T5 | ||
772 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
773 | vpxor \T3, \T6, \T6 | ||
774 | |||
775 | vmovdqu 16*5(arg1), \T1 | ||
776 | vaesenc \T1, \XMM1, \XMM1 | ||
777 | vaesenc \T1, \XMM2, \XMM2 | ||
778 | vaesenc \T1, \XMM3, \XMM3 | ||
779 | vaesenc \T1, \XMM4, \XMM4 | ||
780 | vaesenc \T1, \XMM5, \XMM5 | ||
781 | vaesenc \T1, \XMM6, \XMM6 | ||
782 | vaesenc \T1, \XMM7, \XMM7 | ||
783 | vaesenc \T1, \XMM8, \XMM8 | ||
784 | |||
785 | vmovdqa TMP4(%rsp), \T1 | ||
786 | vmovdqa HashKey_5(arg1), \T5 | ||
787 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
788 | vpxor \T3, \T4, \T4 | ||
789 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
790 | vpxor \T3, \T7, \T7 | ||
791 | |||
792 | vpshufd $0b01001110, \T1, \T3 | ||
793 | vpxor \T1, \T3, \T3 | ||
794 | vmovdqa HashKey_5_k(arg1), \T5 | ||
795 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
796 | vpxor \T3, \T6, \T6 | ||
797 | |||
798 | vmovdqu 16*6(arg1), \T1 | ||
799 | vaesenc \T1, \XMM1, \XMM1 | ||
800 | vaesenc \T1, \XMM2, \XMM2 | ||
801 | vaesenc \T1, \XMM3, \XMM3 | ||
802 | vaesenc \T1, \XMM4, \XMM4 | ||
803 | vaesenc \T1, \XMM5, \XMM5 | ||
804 | vaesenc \T1, \XMM6, \XMM6 | ||
805 | vaesenc \T1, \XMM7, \XMM7 | ||
806 | vaesenc \T1, \XMM8, \XMM8 | ||
807 | |||
808 | |||
809 | vmovdqa TMP5(%rsp), \T1 | ||
810 | vmovdqa HashKey_4(arg1), \T5 | ||
811 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
812 | vpxor \T3, \T4, \T4 | ||
813 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
814 | vpxor \T3, \T7, \T7 | ||
815 | |||
816 | vpshufd $0b01001110, \T1, \T3 | ||
817 | vpxor \T1, \T3, \T3 | ||
818 | vmovdqa HashKey_4_k(arg1), \T5 | ||
819 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
820 | vpxor \T3, \T6, \T6 | ||
821 | |||
822 | vmovdqu 16*7(arg1), \T1 | ||
823 | vaesenc \T1, \XMM1, \XMM1 | ||
824 | vaesenc \T1, \XMM2, \XMM2 | ||
825 | vaesenc \T1, \XMM3, \XMM3 | ||
826 | vaesenc \T1, \XMM4, \XMM4 | ||
827 | vaesenc \T1, \XMM5, \XMM5 | ||
828 | vaesenc \T1, \XMM6, \XMM6 | ||
829 | vaesenc \T1, \XMM7, \XMM7 | ||
830 | vaesenc \T1, \XMM8, \XMM8 | ||
831 | |||
832 | vmovdqa TMP6(%rsp), \T1 | ||
833 | vmovdqa HashKey_3(arg1), \T5 | ||
834 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
835 | vpxor \T3, \T4, \T4 | ||
836 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
837 | vpxor \T3, \T7, \T7 | ||
838 | |||
839 | vpshufd $0b01001110, \T1, \T3 | ||
840 | vpxor \T1, \T3, \T3 | ||
841 | vmovdqa HashKey_3_k(arg1), \T5 | ||
842 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
843 | vpxor \T3, \T6, \T6 | ||
844 | |||
845 | |||
846 | vmovdqu 16*8(arg1), \T1 | ||
847 | vaesenc \T1, \XMM1, \XMM1 | ||
848 | vaesenc \T1, \XMM2, \XMM2 | ||
849 | vaesenc \T1, \XMM3, \XMM3 | ||
850 | vaesenc \T1, \XMM4, \XMM4 | ||
851 | vaesenc \T1, \XMM5, \XMM5 | ||
852 | vaesenc \T1, \XMM6, \XMM6 | ||
853 | vaesenc \T1, \XMM7, \XMM7 | ||
854 | vaesenc \T1, \XMM8, \XMM8 | ||
855 | |||
856 | vmovdqa TMP7(%rsp), \T1 | ||
857 | vmovdqa HashKey_2(arg1), \T5 | ||
858 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
859 | vpxor \T3, \T4, \T4 | ||
860 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
861 | vpxor \T3, \T7, \T7 | ||
862 | |||
863 | vpshufd $0b01001110, \T1, \T3 | ||
864 | vpxor \T1, \T3, \T3 | ||
865 | vmovdqa HashKey_2_k(arg1), \T5 | ||
866 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
867 | vpxor \T3, \T6, \T6 | ||
868 | |||
869 | ####################################################################### | ||
870 | |||
871 | vmovdqu 16*9(arg1), \T5 | ||
872 | vaesenc \T5, \XMM1, \XMM1 | ||
873 | vaesenc \T5, \XMM2, \XMM2 | ||
874 | vaesenc \T5, \XMM3, \XMM3 | ||
875 | vaesenc \T5, \XMM4, \XMM4 | ||
876 | vaesenc \T5, \XMM5, \XMM5 | ||
877 | vaesenc \T5, \XMM6, \XMM6 | ||
878 | vaesenc \T5, \XMM7, \XMM7 | ||
879 | vaesenc \T5, \XMM8, \XMM8 | ||
880 | |||
881 | vmovdqa TMP8(%rsp), \T1 | ||
882 | vmovdqa HashKey(arg1), \T5 | ||
883 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
884 | vpxor \T3, \T4, \T4 | ||
885 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
886 | vpxor \T3, \T7, \T7 | ||
887 | |||
888 | vpshufd $0b01001110, \T1, \T3 | ||
889 | vpxor \T1, \T3, \T3 | ||
890 | vmovdqa HashKey_k(arg1), \T5 | ||
891 | vpclmulqdq $0x10, \T5, \T3, \T3 | ||
892 | vpxor \T3, \T6, \T6 | ||
893 | |||
894 | vpxor \T4, \T6, \T6 | ||
895 | vpxor \T7, \T6, \T6 | ||
896 | |||
897 | vmovdqu 16*10(arg1), \T5 | ||
898 | |||
899 | i = 0 | ||
900 | j = 1 | ||
901 | setreg | ||
902 | .rep 8 | ||
903 | vpxor 16*i(arg3, %r11), \T5, \T2 | ||
904 | .if \ENC_DEC == ENC | ||
905 | vaesenclast \T2, reg_j, reg_j | ||
906 | .else | ||
907 | vaesenclast \T2, reg_j, \T3 | ||
908 | vmovdqu 16*i(arg3, %r11), reg_j | ||
909 | vmovdqu \T3, 16*i(arg2, %r11) | ||
910 | .endif | ||
911 | i = (i+1) | ||
912 | j = (j+1) | ||
913 | setreg | ||
914 | .endr | ||
915 | ####################################################################### | ||
916 | |||
917 | |||
918 | vpslldq $8, \T6, \T3 # shift-L T3 2 DWs | ||
919 | vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs | ||
920 | vpxor \T3, \T7, \T7 | ||
921 | vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 | ||
922 | |||
923 | |||
924 | |||
925 | ####################################################################### | ||
926 | #first phase of the reduction | ||
927 | ####################################################################### | ||
928 | vpslld $31, \T7, \T2 # packed left shifting << 31 | ||
929 | vpslld $30, \T7, \T3 # packed left shifting << 30 | ||
930 | vpslld $25, \T7, \T4 # packed left shifting << 25 | ||
931 | |||
932 | vpxor \T3, \T2, \T2 # xor the shifted versions | ||
933 | vpxor \T4, \T2, \T2 | ||
934 | |||
935 | vpsrldq $4, \T2, \T1 # shift-R T1 1 DW | ||
936 | |||
937 | vpslldq $12, \T2, \T2 # shift-L T2 3 DWs | ||
938 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | ||
939 | ####################################################################### | ||
940 | .if \ENC_DEC == ENC | ||
941 | vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer | ||
942 | vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer | ||
943 | vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer | ||
944 | vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer | ||
945 | vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer | ||
946 | vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer | ||
947 | vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer | ||
948 | vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer | ||
949 | .endif | ||
950 | |||
951 | ####################################################################### | ||
952 | #second phase of the reduction | ||
953 | vpsrld $1, \T7, \T2 # packed right shifting >> 1 | ||
954 | vpsrld $2, \T7, \T3 # packed right shifting >> 2 | ||
955 | vpsrld $7, \T7, \T4 # packed right shifting >> 7 | ||
956 | vpxor \T3, \T2, \T2 # xor the shifted versions | ||
957 | vpxor \T4, \T2, \T2 | ||
958 | |||
959 | vpxor \T1, \T2, \T2 | ||
960 | vpxor \T2, \T7, \T7 | ||
961 | vpxor \T7, \T6, \T6 # the result is in T6 | ||
962 | ####################################################################### | ||
963 | |||
964 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
965 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
966 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
967 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
968 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
969 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
970 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
971 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
972 | |||
973 | |||
974 | vpxor \T6, \XMM1, \XMM1 | ||
975 | |||
976 | |||
977 | |||
978 | .endm | ||
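
The ONE/ONEf constants used above apply GCM's 32-bit counter increment to either the byte-swapped or the stored byte order of the counter block. On the stored (big-endian) block the increment is simply the following (illustrative C sketch of inc32(), not part of this patch):

    #include <stdint.h>

    void inc32(uint8_t ctr[16])
    {
        int i;

        /* increment the last 32 bits of the counter block, big-endian */
        for (i = 15; i >= 12; i--)
            if (++ctr[i] != 0)
                break;
    }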
979 | |||
980 | |||
981 | # GHASH the last 8 ciphertext blocks. | ||
982 | .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 | ||
983 | |||
984 | ## Karatsuba Method | ||
985 | |||
986 | |||
987 | vpshufd $0b01001110, \XMM1, \T2 | ||
988 | vpxor \XMM1, \T2, \T2 | ||
989 | vmovdqa HashKey_8(arg1), \T5 | ||
990 | vpclmulqdq $0x11, \T5, \XMM1, \T6 | ||
991 | vpclmulqdq $0x00, \T5, \XMM1, \T7 | ||
992 | |||
993 | vmovdqa HashKey_8_k(arg1), \T3 | ||
994 | vpclmulqdq $0x00, \T3, \T2, \XMM1 | ||
995 | |||
996 | ###################### | ||
997 | |||
998 | vpshufd $0b01001110, \XMM2, \T2 | ||
999 | vpxor \XMM2, \T2, \T2 | ||
1000 | vmovdqa HashKey_7(arg1), \T5 | ||
1001 | vpclmulqdq $0x11, \T5, \XMM2, \T4 | ||
1002 | vpxor \T4, \T6, \T6 | ||
1003 | |||
1004 | vpclmulqdq $0x00, \T5, \XMM2, \T4 | ||
1005 | vpxor \T4, \T7, \T7 | ||
1006 | |||
1007 | vmovdqa HashKey_7_k(arg1), \T3 | ||
1008 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1009 | vpxor \T2, \XMM1, \XMM1 | ||
1010 | |||
1011 | ###################### | ||
1012 | |||
1013 | vpshufd $0b01001110, \XMM3, \T2 | ||
1014 | vpxor \XMM3, \T2, \T2 | ||
1015 | vmovdqa HashKey_6(arg1), \T5 | ||
1016 | vpclmulqdq $0x11, \T5, \XMM3, \T4 | ||
1017 | vpxor \T4, \T6, \T6 | ||
1018 | |||
1019 | vpclmulqdq $0x00, \T5, \XMM3, \T4 | ||
1020 | vpxor \T4, \T7, \T7 | ||
1021 | |||
1022 | vmovdqa HashKey_6_k(arg1), \T3 | ||
1023 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1024 | vpxor \T2, \XMM1, \XMM1 | ||
1025 | |||
1026 | ###################### | ||
1027 | |||
1028 | vpshufd $0b01001110, \XMM4, \T2 | ||
1029 | vpxor \XMM4, \T2, \T2 | ||
1030 | vmovdqa HashKey_5(arg1), \T5 | ||
1031 | vpclmulqdq $0x11, \T5, \XMM4, \T4 | ||
1032 | vpxor \T4, \T6, \T6 | ||
1033 | |||
1034 | vpclmulqdq $0x00, \T5, \XMM4, \T4 | ||
1035 | vpxor \T4, \T7, \T7 | ||
1036 | |||
1037 | vmovdqa HashKey_5_k(arg1), \T3 | ||
1038 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1039 | vpxor \T2, \XMM1, \XMM1 | ||
1040 | |||
1041 | ###################### | ||
1042 | |||
1043 | vpshufd $0b01001110, \XMM5, \T2 | ||
1044 | vpxor \XMM5, \T2, \T2 | ||
1045 | vmovdqa HashKey_4(arg1), \T5 | ||
1046 | vpclmulqdq $0x11, \T5, \XMM5, \T4 | ||
1047 | vpxor \T4, \T6, \T6 | ||
1048 | |||
1049 | vpclmulqdq $0x00, \T5, \XMM5, \T4 | ||
1050 | vpxor \T4, \T7, \T7 | ||
1051 | |||
1052 | vmovdqa HashKey_4_k(arg1), \T3 | ||
1053 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1054 | vpxor \T2, \XMM1, \XMM1 | ||
1055 | |||
1056 | ###################### | ||
1057 | |||
1058 | vpshufd $0b01001110, \XMM6, \T2 | ||
1059 | vpxor \XMM6, \T2, \T2 | ||
1060 | vmovdqa HashKey_3(arg1), \T5 | ||
1061 | vpclmulqdq $0x11, \T5, \XMM6, \T4 | ||
1062 | vpxor \T4, \T6, \T6 | ||
1063 | |||
1064 | vpclmulqdq $0x00, \T5, \XMM6, \T4 | ||
1065 | vpxor \T4, \T7, \T7 | ||
1066 | |||
1067 | vmovdqa HashKey_3_k(arg1), \T3 | ||
1068 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1069 | vpxor \T2, \XMM1, \XMM1 | ||
1070 | |||
1071 | ###################### | ||
1072 | |||
1073 | vpshufd $0b01001110, \XMM7, \T2 | ||
1074 | vpxor \XMM7, \T2, \T2 | ||
1075 | vmovdqa HashKey_2(arg1), \T5 | ||
1076 | vpclmulqdq $0x11, \T5, \XMM7, \T4 | ||
1077 | vpxor \T4, \T6, \T6 | ||
1078 | |||
1079 | vpclmulqdq $0x00, \T5, \XMM7, \T4 | ||
1080 | vpxor \T4, \T7, \T7 | ||
1081 | |||
1082 | vmovdqa HashKey_2_k(arg1), \T3 | ||
1083 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1084 | vpxor \T2, \XMM1, \XMM1 | ||
1085 | |||
1086 | ###################### | ||
1087 | |||
1088 | vpshufd $0b01001110, \XMM8, \T2 | ||
1089 | vpxor \XMM8, \T2, \T2 | ||
1090 | vmovdqa HashKey(arg1), \T5 | ||
1091 | vpclmulqdq $0x11, \T5, \XMM8, \T4 | ||
1092 | vpxor \T4, \T6, \T6 | ||
1093 | |||
1094 | vpclmulqdq $0x00, \T5, \XMM8, \T4 | ||
1095 | vpxor \T4, \T7, \T7 | ||
1096 | |||
1097 | vmovdqa HashKey_k(arg1), \T3 | ||
1098 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
1099 | |||
1100 | vpxor \T2, \XMM1, \XMM1 | ||
1101 | vpxor \T6, \XMM1, \XMM1 | ||
1102 | vpxor \T7, \XMM1, \T2 | ||
1103 | |||
1104 | |||
1105 | |||
1106 | |||
1107 | vpslldq $8, \T2, \T4 | ||
1108 | vpsrldq $8, \T2, \T2 | ||
1109 | |||
1110 | vpxor \T4, \T7, \T7 | ||
1111 | vpxor \T2, \T6, \T6 # <T6:T7> holds the result of | ||
1112 | # the accumulated carry-less multiplications | ||
1113 | |||
1114 | ####################################################################### | ||
1115 | #first phase of the reduction | ||
1116 | vpslld $31, \T7, \T2 # packed left shifting << 31 | ||
1117 | vpslld $30, \T7, \T3 # packed left shifting << 30 | ||
1118 | vpslld $25, \T7, \T4 # packed left shifting << 25 | ||
1119 | |||
1120 | vpxor \T3, \T2, \T2 # xor the shifted versions | ||
1121 | vpxor \T4, \T2, \T2 | ||
1122 | |||
1123 | vpsrldq $4, \T2, \T1 # shift-R T1 1 DW | ||
1124 | |||
1125 | vpslldq $12, \T2, \T2 # shift-L T2 3 DWs | ||
1126 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | ||
1127 | ####################################################################### | ||
1128 | |||
1129 | |||
1130 | #second phase of the reduction | ||
1131 | vpsrld $1, \T7, \T2 # packed right shifting >> 1 | ||
1132 | vpsrld $2, \T7, \T3 # packed right shifting >> 2 | ||
1133 | vpsrld $7, \T7, \T4 # packed right shifting >> 7 | ||
1134 | vpxor \T3, \T2, \T2 # xor the shifted versions | ||
1135 | vpxor \T4, \T2, \T2 | ||
1136 | |||
1137 | vpxor \T1, \T2, \T2 | ||
1138 | vpxor \T2, \T7, \T7 | ||
1139 | vpxor \T7, \T6, \T6 # the result is in T6 | ||
1140 | |||
1141 | .endm | ||
1142 | |||
1143 | |||
1144 | # combined for GCM encrypt and decrypt functions | ||
1145 | # clobbering all xmm registers | ||
1146 | # clobbering r10, r11, r12, r13, r14, r15 | ||
1147 | .macro GCM_ENC_DEC_AVX ENC_DEC | ||
1148 | |||
1149 | #the number of pushes must equal STACK_OFFSET | ||
1150 | push %r12 | ||
1151 | push %r13 | ||
1152 | push %r14 | ||
1153 | push %r15 | ||
1154 | |||
1155 | mov %rsp, %r14 | ||
1156 | |||
1157 | |||
1158 | |||
1159 | |||
1160 | sub $VARIABLE_OFFSET, %rsp | ||
1161 | and $~63, %rsp # align rsp to 64 bytes | ||
1162 | |||
1163 | |||
1164 | vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey | ||
1165 | |||
1166 | mov arg4, %r13 # save the number of bytes of plaintext/ciphertext | ||
1167 | and $-16, %r13 # r13 = r13 - (r13 mod 16) | ||
1168 | |||
1169 | mov %r13, %r12 | ||
1170 | shr $4, %r12 | ||
1171 | and $7, %r12 | ||
1172 | jz _initial_num_blocks_is_0\@ | ||
1173 | |||
1174 | cmp $7, %r12 | ||
1175 | je _initial_num_blocks_is_7\@ | ||
1176 | cmp $6, %r12 | ||
1177 | je _initial_num_blocks_is_6\@ | ||
1178 | cmp $5, %r12 | ||
1179 | je _initial_num_blocks_is_5\@ | ||
1180 | cmp $4, %r12 | ||
1181 | je _initial_num_blocks_is_4\@ | ||
1182 | cmp $3, %r12 | ||
1183 | je _initial_num_blocks_is_3\@ | ||
1184 | cmp $2, %r12 | ||
1185 | je _initial_num_blocks_is_2\@ | ||
1186 | |||
1187 | jmp _initial_num_blocks_is_1\@ | ||
1188 | |||
1189 | _initial_num_blocks_is_7\@: | ||
1190 | INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1191 | sub $16*7, %r13 | ||
1192 | jmp _initial_blocks_encrypted\@ | ||
1193 | |||
1194 | _initial_num_blocks_is_6\@: | ||
1195 | INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1196 | sub $16*6, %r13 | ||
1197 | jmp _initial_blocks_encrypted\@ | ||
1198 | |||
1199 | _initial_num_blocks_is_5\@: | ||
1200 | INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1201 | sub $16*5, %r13 | ||
1202 | jmp _initial_blocks_encrypted\@ | ||
1203 | |||
1204 | _initial_num_blocks_is_4\@: | ||
1205 | INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1206 | sub $16*4, %r13 | ||
1207 | jmp _initial_blocks_encrypted\@ | ||
1208 | |||
1209 | _initial_num_blocks_is_3\@: | ||
1210 | INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1211 | sub $16*3, %r13 | ||
1212 | jmp _initial_blocks_encrypted\@ | ||
1213 | |||
1214 | _initial_num_blocks_is_2\@: | ||
1215 | INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1216 | sub $16*2, %r13 | ||
1217 | jmp _initial_blocks_encrypted\@ | ||
1218 | |||
1219 | _initial_num_blocks_is_1\@: | ||
1220 | INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1221 | sub $16*1, %r13 | ||
1222 | jmp _initial_blocks_encrypted\@ | ||
1223 | |||
1224 | _initial_num_blocks_is_0\@: | ||
1225 | INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
1226 | |||
1227 | |||
1228 | _initial_blocks_encrypted\@: | ||
1229 | cmp $0, %r13 | ||
1230 | je _zero_cipher_left\@ | ||
1231 | |||
1232 | sub $128, %r13 | ||
1233 | je _eight_cipher_left\@ | ||
1234 | |||
1235 | |||
1236 | |||
1237 | |||
1238 | vmovd %xmm9, %r15d | ||
1239 | and $255, %r15d | ||
1240 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1241 | |||
1242 | |||
1243 | _encrypt_by_8_new\@: | ||
1244 | cmp $(255-8), %r15d | ||
1245 | jg _encrypt_by_8\@ | ||
1246 | |||
1247 | |||
1248 | |||
1249 | add $8, %r15b | ||
1250 | GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC | ||
1251 | add $128, %r11 | ||
1252 | sub $128, %r13 | ||
1253 | jne _encrypt_by_8_new\@ | ||
1254 | |||
1255 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1256 | jmp _eight_cipher_left\@ | ||
1257 | |||
1258 | _encrypt_by_8\@: | ||
1259 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1260 | add $8, %r15b | ||
1261 | GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC | ||
1262 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1263 | add $128, %r11 | ||
1264 | sub $128, %r13 | ||
1265 | jne _encrypt_by_8_new\@ | ||
1266 | |||
1267 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1268 | |||
1269 | |||
1270 | |||
1271 | |||
1272 | _eight_cipher_left\@: | ||
1273 | GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 | ||
1274 | |||
1275 | |||
1276 | _zero_cipher_left\@: | ||
1277 | cmp $16, arg4 | ||
1278 | jl _only_less_than_16\@ | ||
1279 | |||
1280 | mov arg4, %r13 | ||
1281 | and $15, %r13 # r13 = (arg4 mod 16) | ||
1282 | |||
1283 | je _multiple_of_16_bytes\@ | ||
1284 | |||
1285 | # handle the last <16 Byte block separately | ||
1286 | |||
1287 | |||
1288 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | ||
1289 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1290 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | ||
1291 | |||
1292 | sub $16, %r11 | ||
1293 | add %r13, %r11 | ||
1294 | vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block | ||
1295 | |||
1296 | lea SHIFT_MASK+16(%rip), %r12 | ||
1297 | sub %r13, %r12 # adjust the shuffle mask pointer to be | ||
1298 | # able to shift 16-r13 bytes (r13 is the | ||
1299 | # number of bytes in plaintext mod 16) | ||
1300 | vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
1301 | vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes | ||
1302 | jmp _final_ghash_mul\@ | ||
1303 | |||
1304 | _only_less_than_16\@: | ||
1305 | # check for 0 length | ||
1306 | mov arg4, %r13 | ||
1307 | and $15, %r13 # r13 = (arg4 mod 16) | ||
1308 | |||
1309 | je _multiple_of_16_bytes\@ | ||
1310 | |||
1311 | # handle the last <16 Byte block separately | ||
1312 | |||
1313 | |||
1314 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | ||
1315 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1316 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | ||
1317 | |||
1318 | |||
1319 | lea SHIFT_MASK+16(%rip), %r12 | ||
1320 | sub %r13, %r12 # adjust the shuffle mask pointer to be | ||
1321 | # able to shift 16-r13 bytes (r13 is the | ||
1322 | # number of bytes in plaintext mod 16) | ||
1323 | |||
1324 | _get_last_16_byte_loop\@: | ||
1325 | movb (arg3, %r11), %al | ||
1326 | movb %al, TMP1 (%rsp , %r11) | ||
1327 | add $1, %r11 | ||
1328 | cmp %r13, %r11 | ||
1329 | jne _get_last_16_byte_loop\@ | ||
1330 | |||
1331 | vmovdqu TMP1(%rsp), %xmm1 | ||
1332 | |||
1333 | sub $16, %r11 | ||
1334 | |||
1335 | _final_ghash_mul\@: | ||
1336 | .if \ENC_DEC == DEC | ||
1337 | vmovdqa %xmm1, %xmm2 | ||
1338 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | ||
1339 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to | ||
1340 | # mask out top 16-r13 bytes of xmm9 | ||
1341 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | ||
1342 | vpand %xmm1, %xmm2, %xmm2 | ||
1343 | vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 | ||
1344 | vpxor %xmm2, %xmm14, %xmm14 | ||
1345 | #GHASH computation for the last <16 Byte block | ||
1346 | GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1347 | sub %r13, %r11 | ||
1348 | add $16, %r11 | ||
1349 | .else | ||
1350 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | ||
1351 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to | ||
1352 | # mask out top 16-r13 bytes of xmm9 | ||
1353 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | ||
1354 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
1355 | vpxor %xmm9, %xmm14, %xmm14 | ||
1356 | #GHASH computation for the last <16 Byte block | ||
1357 | GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1358 | sub %r13, %r11 | ||
1359 | add $16, %r11 | ||
1360 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext | ||
1361 | .endif | ||
1362 | |||
1363 | |||
1364 | ############################# | ||
1365 | # output r13 Bytes | ||
1366 | vmovq %xmm9, %rax | ||
1367 | cmp $8, %r13 | ||
1368 | jle _less_than_8_bytes_left\@ | ||
1369 | |||
1370 | mov %rax, (arg2 , %r11) | ||
1371 | add $8, %r11 | ||
1372 | vpsrldq $8, %xmm9, %xmm9 | ||
1373 | vmovq %xmm9, %rax | ||
1374 | sub $8, %r13 | ||
1375 | |||
1376 | _less_than_8_bytes_left\@: | ||
1377 | movb %al, (arg2 , %r11) | ||
1378 | add $1, %r11 | ||
1379 | shr $8, %rax | ||
1380 | sub $1, %r13 | ||
1381 | jne _less_than_8_bytes_left\@ | ||
1382 | ############################# | ||
1383 | |||
1384 | _multiple_of_16_bytes\@: | ||
1385 | mov arg7, %r12 # r12 = aadLen (number of bytes) | ||
1386 | shl $3, %r12 # convert into number of bits | ||
1387 | vmovd %r12d, %xmm15 # len(A) in xmm15 | ||
1388 | |||
1389 | shl $3, arg4 # len(C) in bits (*8) | ||
1390 | vmovq arg4, %xmm1 | ||
1391 | vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 | ||
1392 | vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) | ||
1393 | |||
1394 | vpxor %xmm15, %xmm14, %xmm14 | ||
1395 | GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation | ||
1396 | vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap | ||
1397 | |||
1398 | mov arg5, %rax # rax = *Y0 | ||
1399 | vmovdqu (%rax), %xmm9 # xmm9 = Y0 | ||
1400 | |||
1401 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) | ||
1402 | |||
1403 | vpxor %xmm14, %xmm9, %xmm9 | ||
1404 | |||
1405 | |||
1406 | |||
1407 | _return_T\@: | ||
1408 | mov arg8, %r10 # r10 = authTag | ||
1409 | mov arg9, %r11 # r11 = auth_tag_len | ||
1410 | |||
1411 | cmp $16, %r11 | ||
1412 | je _T_16\@ | ||
1413 | |||
1414 | cmp $12, %r11 | ||
1415 | je _T_12\@ | ||
1416 | |||
1417 | _T_8\@: | ||
1418 | vmovq %xmm9, %rax | ||
1419 | mov %rax, (%r10) | ||
1420 | jmp _return_T_done\@ | ||
1421 | _T_12\@: | ||
1422 | vmovq %xmm9, %rax | ||
1423 | mov %rax, (%r10) | ||
1424 | vpsrldq $8, %xmm9, %xmm9 | ||
1425 | vmovd %xmm9, %eax | ||
1426 | mov %eax, 8(%r10) | ||
1427 | jmp _return_T_done\@ | ||
1428 | |||
1429 | _T_16\@: | ||
1430 | vmovdqu %xmm9, (%r10) | ||
1431 | |||
1432 | _return_T_done\@: | ||
1433 | mov %r14, %rsp | ||
1434 | |||
1435 | pop %r15 | ||
1436 | pop %r14 | ||
1437 | pop %r13 | ||
1438 | pop %r12 | ||
1439 | .endm | ||
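
Near the end of the macro above, the length block len(A) || len(C) -- both lengths in bits, as two big-endian 64-bit fields -- is folded into the hash before the final XOR with E(K, Y0). Building that block in C (illustrative sketch only, helper name invented here):

    #include <stdint.h>

    static void build_len_block(uint8_t block[16], uint64_t aad_len_bytes,
                                uint64_t text_len_bytes)
    {
        uint64_t abits = aad_len_bytes * 8;    /* len(A) in bits */
        uint64_t cbits = text_len_bytes * 8;   /* len(C) in bits */
        int i;

        for (i = 0; i < 8; i++) {
            block[7 - i]  = (uint8_t)(abits >> (8 * i));
            block[15 - i] = (uint8_t)(cbits >> (8 * i));
        }
    }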
1440 | |||
1441 | |||
1442 | ############################################################# | ||
1443 | #void aesni_gcm_precomp_avx_gen2 | ||
1444 | # (gcm_data *my_ctx_data, | ||
1445 | # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ | ||
1446 | ############################################################# | ||
1447 | ENTRY(aesni_gcm_precomp_avx_gen2) | ||
1448 | #the number of pushes must equal STACK_OFFSET | ||
1449 | push %r12 | ||
1450 | push %r13 | ||
1451 | push %r14 | ||
1452 | push %r15 | ||
1453 | |||
1454 | mov %rsp, %r14 | ||
1455 | |||
1456 | |||
1457 | |||
1458 | sub $VARIABLE_OFFSET, %rsp | ||
1459 | and $~63, %rsp # align rsp to 64 bytes | ||
1460 | |||
1461 | vmovdqu (arg2), %xmm6 # xmm6 = HashKey | ||
1462 | |||
1463 | vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 | ||
1464 | ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey | ||
1465 | vmovdqa %xmm6, %xmm2 | ||
1466 | vpsllq $1, %xmm6, %xmm6 | ||
1467 | vpsrlq $63, %xmm2, %xmm2 | ||
1468 | vmovdqa %xmm2, %xmm1 | ||
1469 | vpslldq $8, %xmm2, %xmm2 | ||
1470 | vpsrldq $8, %xmm1, %xmm1 | ||
1471 | vpor %xmm2, %xmm6, %xmm6 | ||
1472 | #reduction | ||
1473 | vpshufd $0b00100100, %xmm1, %xmm2 | ||
1474 | vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 | ||
1475 | vpand POLY(%rip), %xmm2, %xmm2 | ||
1476 | vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly | ||
1477 | ####################################################################### | ||
1478 | vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly | ||
1479 | |||
1480 | |||
1481 | PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 | ||
1482 | |||
1483 | mov %r14, %rsp | ||
1484 | |||
1485 | pop %r15 | ||
1486 | pop %r14 | ||
1487 | pop %r13 | ||
1488 | pop %r12 | ||
1489 | ret | ||
1490 | ENDPROC(aesni_gcm_precomp_avx_gen2) | ||
1491 | |||
1492 | ############################################################################### | ||
1493 | #void aesni_gcm_enc_avx_gen2( | ||
1494 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | ||
1495 | # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ | ||
1496 | # const u8 *in, /* Plaintext input */ | ||
1497 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | ||
1498 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | ||
1499 | # (from Security Association) concatenated with 8 byte | ||
1500 | # Initialisation Vector (from IPSec ESP Payload) | ||
1501 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | ||
1502 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | ||
1503 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | ||
1504 | # u8 *auth_tag, /* Authenticated Tag output. */ | ||
1505 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | ||
1506 | # Valid values are 16 (most likely), 12 or 8. */ | ||
1507 | ############################################################################### | ||
1508 | ENTRY(aesni_gcm_enc_avx_gen2) | ||
1509 | GCM_ENC_DEC_AVX ENC | ||
1510 | ret | ||
1511 | ENDPROC(aesni_gcm_enc_avx_gen2) | ||
1512 | |||
1513 | ############################################################################### | ||
1514 | #void aesni_gcm_dec_avx_gen2( | ||
1515 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | ||
1516 | # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ | ||
1517 | # const u8 *in, /* Ciphertext input */ | ||
1518 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | ||
1519 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | ||
1520 | # (from Security Association) concatenated with 8 byte | ||
1521 | # Initialisation Vector (from IPSec ESP Payload) | ||
1522 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | ||
1523 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | ||
1524 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | ||
1525 | # u8 *auth_tag, /* Authenticated Tag output. */ | ||
1526 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | ||
1527 | # Valid values are 16 (most likely), 12 or 8. */ | ||
1528 | ############################################################################### | ||
1529 | ENTRY(aesni_gcm_dec_avx_gen2) | ||
1530 | GCM_ENC_DEC_AVX DEC | ||
1531 | ret | ||
1532 | ENDPROC(aesni_gcm_dec_avx_gen2) | ||
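For reference, a minimal sketch of how the three gen2 entry points above are meant to be driven; the real caller is the glue code later in this patch, and the buffer names here are illustrative only:

	/* assumes the AES key schedule has already been expanded into ctx and
	 * hash_subkey holds E(K, 0^128); all names are illustrative */
	u8 auth_tag[16], computed_tag[16];

	aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);	/* build the hash key power table */
	aesni_gcm_enc_avx_gen2(ctx, ciphertext, plaintext, plaintext_len, iv,
			       aad, aad_len, auth_tag, sizeof(auth_tag));

	aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);	/* the glue code recomputes per call */
	aesni_gcm_dec_avx_gen2(ctx, plaintext_out, ciphertext, plaintext_len, iv,
			       aad, aad_len, computed_tag, sizeof(computed_tag));
	/* the caller then compares computed_tag against the received tag */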
1533 | #endif /* CONFIG_AS_AVX */ | ||
1534 | |||
1535 | #ifdef CONFIG_AS_AVX2 | ||
1536 | ############################################################################### | ||
1537 | # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | ||
1538 | # Input: A and B (128-bits each, bit-reflected) | ||
1539 | # Output: C = A*B*x mod poly, (i.e. >>1 ) | ||
1540 | # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | ||
1541 | # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | ||
1542 | ############################################################################### | ||
1543 | .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 | ||
1544 | |||
1545 | vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 | ||
1546 | vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 | ||
1547 | vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 | ||
1548 | vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 | ||
1549 | vpxor \T3, \GH, \GH | ||
1550 | |||
1551 | |||
1552 | vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs | ||
1553 | vpslldq $8 , \GH, \GH # shift-L GH 2 DWs | ||
1554 | |||
1555 | vpxor \T3, \T1, \T1 | ||
1556 | vpxor \T2, \GH, \GH | ||
1557 | |||
1558 | ####################################################################### | ||
1559 | #first phase of the reduction | ||
1560 | vmovdqa POLY2(%rip), \T3 | ||
1561 | |||
1562 | vpclmulqdq $0x01, \GH, \T3, \T2 | ||
1563 | vpslldq $8, \T2, \T2 # shift-L T2 2 DWs | ||
1564 | |||
1565 | vpxor \T2, \GH, \GH # first phase of the reduction complete | ||
1566 | ####################################################################### | ||
1567 | #second phase of the reduction | ||
1568 | vpclmulqdq $0x00, \GH, \T3, \T2 | ||
1569 | vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | ||
1570 | |||
1571 | vpclmulqdq $0x10, \GH, \T3, \GH | ||
1572 | vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) | ||
1573 | |||
1574 | vpxor \T2, \GH, \GH # second phase of the reduction complete | ||
1575 | ####################################################################### | ||
1576 | vpxor \T1, \GH, \GH # the result is in GH | ||
1577 | |||
1578 | |||
1579 | .endm | ||
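GHASH_MUL_AVX2 computes the GF(2^128) product GHASH needs, using carry-less multiplies on bit-reflected operands and the folded POLY2 reduction. For comparison, a plain byte-wise reference of the same field multiplication (standalone C, following SP 800-38D Algorithm 1; not part of the patch):

#include <stdint.h>
#include <string.h>

/* out = x * y in GF(2^128) with the GCM polynomial x^128 + x^7 + x^2 + x + 1,
 * blocks in the MSB-first representation used by the GCM spec */
static void ghash_mul_ref(const uint8_t x[16], const uint8_t y[16], uint8_t out[16])
{
	uint8_t z[16] = { 0 };
	uint8_t v[16];
	int i, j, lsb;

	memcpy(v, y, 16);
	for (i = 0; i < 128; i++) {
		if (x[i / 8] & (0x80 >> (i % 8))) {	/* bit i of x, MSB first */
			for (j = 0; j < 16; j++)
				z[j] ^= v[j];
		}
		lsb = v[15] & 1;			/* v = v * x mod poly */
		for (j = 15; j > 0; j--)
			v[j] = (v[j] >> 1) | (v[j - 1] << 7);
		v[0] >>= 1;
		if (lsb)
			v[0] ^= 0xE1;
	}
	memcpy(out, z, 16);
}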
1580 | |||
1581 | .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 | ||
1582 | |||
1583 | # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i | ||
1584 | vmovdqa \HK, \T5 | ||
1585 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly | ||
1586 | vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly | ||
1587 | |||
1588 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly | ||
1589 | vmovdqa \T5, HashKey_3(arg1) | ||
1590 | |||
1591 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly | ||
1592 | vmovdqa \T5, HashKey_4(arg1) | ||
1593 | |||
1594 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly | ||
1595 | vmovdqa \T5, HashKey_5(arg1) | ||
1596 | |||
1597 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly | ||
1598 | vmovdqa \T5, HashKey_6(arg1) | ||
1599 | |||
1600 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly | ||
1601 | vmovdqa \T5, HashKey_7(arg1) | ||
1602 | |||
1603 | GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly | ||
1604 | vmovdqa \T5, HashKey_8(arg1) | ||
1605 | |||
1606 | .endm | ||
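Mathematically the table PRECOMPUTE_AVX2 fills is just H, H^2, ..., H^8, so that the 8-wide loop can multiply eight ciphertext blocks by independent key powers and fold them in one pass. A sketch in terms of the reference multiply above (the values actually stored by the assembly are additionally byte-reflected and pre-shifted left by one bit):

	/* hpow[i] holds H^i for i = 1..8; H is the raw hash subkey (illustrative names) */
	uint8_t hpow[9][16];
	int i;

	memcpy(hpow[1], H, 16);
	for (i = 2; i <= 8; i++)
		ghash_mul_ref(hpow[i - 1], H, hpow[i]);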
1607 | |||
1608 | |||
1609 | ## if a = number of total plaintext bytes | ||
1610 | ## b = floor(a/16) | ||
1611 | ## num_initial_blocks = b mod 8 | ||
1612 | ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext | ||
1613 | ## r10, r11, r12, rax are clobbered | ||
1614 | ## arg1, arg2, arg3, r14 are used as pointers only, not modified | ||
1615 | |||
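A concrete instance of that bookkeeping (values illustrative):

/*
 * e.g. a = 200 plaintext bytes:
 *   b                  = floor(200/16) = 12 full blocks
 *   num_initial_blocks = 12 mod 8      = 4   -> handled by INITIAL_BLOCKS_AVX2
 *   remaining blocks   = 8                   -> one pass of the 8-wide loop
 *   trailing bytes     = 200 - 12*16   = 8   -> final partial-block path
 */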
1616 | .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER | ||
1617 | i = (8-\num_initial_blocks) | ||
1618 | setreg | ||
1619 | |||
1620 | mov arg6, %r10 # r10 = AAD | ||
1621 | mov arg7, %r12 # r12 = aadLen | ||
1622 | |||
1623 | |||
1624 | mov %r12, %r11 | ||
1625 | |||
1626 | vpxor reg_i, reg_i, reg_i | ||
1627 | _get_AAD_loop\@: | ||
1628 | vmovd (%r10), \T1 | ||
1629 | vpslldq $12, \T1, \T1 | ||
1630 | vpsrldq $4, reg_i, reg_i | ||
1631 | vpxor \T1, reg_i, reg_i | ||
1632 | |||
1633 | add $4, %r10 | ||
1634 | sub $4, %r12 | ||
1635 | jg _get_AAD_loop\@ | ||
1636 | |||
1637 | |||
1638 | cmp $16, %r11 | ||
1639 | je _get_AAD_loop2_done\@ | ||
1640 | mov $16, %r12 | ||
1641 | |||
1642 | _get_AAD_loop2\@: | ||
1643 | vpsrldq $4, reg_i, reg_i | ||
1644 | sub $4, %r12 | ||
1645 | cmp %r11, %r12 | ||
1646 | jg _get_AAD_loop2\@ | ||
1647 | |||
1648 | _get_AAD_loop2_done\@: | ||
1649 | |||
1650 | #byte-reflect the AAD data | ||
1651 | vpshufb SHUF_MASK(%rip), reg_i, reg_i | ||
1652 | |||
1653 | # initialize the data pointer offset as zero | ||
1654 | xor %r11, %r11 | ||
1655 | |||
1656 | # start AES for num_initial_blocks blocks | ||
1657 | mov arg5, %rax # rax = *Y0 | ||
1658 | vmovdqu (%rax), \CTR # CTR = Y0 | ||
1659 | vpshufb SHUF_MASK(%rip), \CTR, \CTR | ||
1660 | |||
1661 | |||
1662 | i = (9-\num_initial_blocks) | ||
1663 | setreg | ||
1664 | .rep \num_initial_blocks | ||
1665 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1666 | vmovdqa \CTR, reg_i | ||
1667 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap | ||
1668 | i = (i+1) | ||
1669 | setreg | ||
1670 | .endr | ||
1671 | |||
1672 | vmovdqa (arg1), \T_key | ||
1673 | i = (9-\num_initial_blocks) | ||
1674 | setreg | ||
1675 | .rep \num_initial_blocks | ||
1676 | vpxor \T_key, reg_i, reg_i | ||
1677 | i = (i+1) | ||
1678 | setreg | ||
1679 | .endr | ||
1680 | |||
1681 | j = 1 | ||
1682 | setreg | ||
1683 | .rep 9 | ||
1684 | vmovdqa 16*j(arg1), \T_key | ||
1685 | i = (9-\num_initial_blocks) | ||
1686 | setreg | ||
1687 | .rep \num_initial_blocks | ||
1688 | vaesenc \T_key, reg_i, reg_i | ||
1689 | i = (i+1) | ||
1690 | setreg | ||
1691 | .endr | ||
1692 | |||
1693 | j = (j+1) | ||
1694 | setreg | ||
1695 | .endr | ||
1696 | |||
1697 | |||
1698 | vmovdqa 16*10(arg1), \T_key | ||
1699 | i = (9-\num_initial_blocks) | ||
1700 | setreg | ||
1701 | .rep \num_initial_blocks | ||
1702 | vaesenclast \T_key, reg_i, reg_i | ||
1703 | i = (i+1) | ||
1704 | setreg | ||
1705 | .endr | ||
1706 | |||
1707 | i = (9-\num_initial_blocks) | ||
1708 | setreg | ||
1709 | .rep \num_initial_blocks | ||
1710 | vmovdqu (arg3, %r11), \T1 | ||
1711 | vpxor \T1, reg_i, reg_i | ||
1712 | vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for | ||
1713 | # num_initial_blocks blocks | ||
1714 | add $16, %r11 | ||
1715 | .if \ENC_DEC == DEC | ||
1716 | vmovdqa \T1, reg_i | ||
1717 | .endif | ||
1718 | vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations | ||
1719 | i = (i+1) | ||
1720 | setreg | ||
1721 | .endr | ||
1722 | |||
1723 | |||
1724 | i = (8-\num_initial_blocks) | ||
1725 | j = (9-\num_initial_blocks) | ||
1726 | setreg | ||
1727 | GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 | ||
1728 | |||
1729 | .rep \num_initial_blocks | ||
1730 | vpxor reg_i, reg_j, reg_j | ||
1731 | GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks | ||
1732 | i = (i+1) | ||
1733 | j = (j+1) | ||
1734 | setreg | ||
1735 | .endr | ||
1736 | # XMM8 has the combined result here | ||
1737 | |||
1738 | vmovdqa \XMM8, TMP1(%rsp) | ||
1739 | vmovdqa \XMM8, \T3 | ||
1740 | |||
1741 | cmp $128, %r13 | ||
1742 | jl _initial_blocks_done\@ # no need for precomputed constants | ||
1743 | |||
1744 | ############################################################################### | ||
1745 | # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i | ||
1746 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1747 | vmovdqa \CTR, \XMM1 | ||
1748 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
1749 | |||
1750 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1751 | vmovdqa \CTR, \XMM2 | ||
1752 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
1753 | |||
1754 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1755 | vmovdqa \CTR, \XMM3 | ||
1756 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
1757 | |||
1758 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1759 | vmovdqa \CTR, \XMM4 | ||
1760 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
1761 | |||
1762 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1763 | vmovdqa \CTR, \XMM5 | ||
1764 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
1765 | |||
1766 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1767 | vmovdqa \CTR, \XMM6 | ||
1768 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
1769 | |||
1770 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1771 | vmovdqa \CTR, \XMM7 | ||
1772 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
1773 | |||
1774 | vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 | ||
1775 | vmovdqa \CTR, \XMM8 | ||
1776 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
1777 | |||
1778 | vmovdqa (arg1), \T_key | ||
1779 | vpxor \T_key, \XMM1, \XMM1 | ||
1780 | vpxor \T_key, \XMM2, \XMM2 | ||
1781 | vpxor \T_key, \XMM3, \XMM3 | ||
1782 | vpxor \T_key, \XMM4, \XMM4 | ||
1783 | vpxor \T_key, \XMM5, \XMM5 | ||
1784 | vpxor \T_key, \XMM6, \XMM6 | ||
1785 | vpxor \T_key, \XMM7, \XMM7 | ||
1786 | vpxor \T_key, \XMM8, \XMM8 | ||
1787 | |||
1788 | i = 1 | ||
1789 | setreg | ||
1790 | .rep 9 # do 9 rounds | ||
1791 | vmovdqa 16*i(arg1), \T_key | ||
1792 | vaesenc \T_key, \XMM1, \XMM1 | ||
1793 | vaesenc \T_key, \XMM2, \XMM2 | ||
1794 | vaesenc \T_key, \XMM3, \XMM3 | ||
1795 | vaesenc \T_key, \XMM4, \XMM4 | ||
1796 | vaesenc \T_key, \XMM5, \XMM5 | ||
1797 | vaesenc \T_key, \XMM6, \XMM6 | ||
1798 | vaesenc \T_key, \XMM7, \XMM7 | ||
1799 | vaesenc \T_key, \XMM8, \XMM8 | ||
1800 | i = (i+1) | ||
1801 | setreg | ||
1802 | .endr | ||
1803 | |||
1804 | |||
1805 | vmovdqa 16*i(arg1), \T_key | ||
1806 | vaesenclast \T_key, \XMM1, \XMM1 | ||
1807 | vaesenclast \T_key, \XMM2, \XMM2 | ||
1808 | vaesenclast \T_key, \XMM3, \XMM3 | ||
1809 | vaesenclast \T_key, \XMM4, \XMM4 | ||
1810 | vaesenclast \T_key, \XMM5, \XMM5 | ||
1811 | vaesenclast \T_key, \XMM6, \XMM6 | ||
1812 | vaesenclast \T_key, \XMM7, \XMM7 | ||
1813 | vaesenclast \T_key, \XMM8, \XMM8 | ||
1814 | |||
1815 | vmovdqu (arg3, %r11), \T1 | ||
1816 | vpxor \T1, \XMM1, \XMM1 | ||
1817 | vmovdqu \XMM1, (arg2 , %r11) | ||
1818 | .if \ENC_DEC == DEC | ||
1819 | vmovdqa \T1, \XMM1 | ||
1820 | .endif | ||
1821 | |||
1822 | vmovdqu 16*1(arg3, %r11), \T1 | ||
1823 | vpxor \T1, \XMM2, \XMM2 | ||
1824 | vmovdqu \XMM2, 16*1(arg2 , %r11) | ||
1825 | .if \ENC_DEC == DEC | ||
1826 | vmovdqa \T1, \XMM2 | ||
1827 | .endif | ||
1828 | |||
1829 | vmovdqu 16*2(arg3, %r11), \T1 | ||
1830 | vpxor \T1, \XMM3, \XMM3 | ||
1831 | vmovdqu \XMM3, 16*2(arg2 , %r11) | ||
1832 | .if \ENC_DEC == DEC | ||
1833 | vmovdqa \T1, \XMM3 | ||
1834 | .endif | ||
1835 | |||
1836 | vmovdqu 16*3(arg3, %r11), \T1 | ||
1837 | vpxor \T1, \XMM4, \XMM4 | ||
1838 | vmovdqu \XMM4, 16*3(arg2 , %r11) | ||
1839 | .if \ENC_DEC == DEC | ||
1840 | vmovdqa \T1, \XMM4 | ||
1841 | .endif | ||
1842 | |||
1843 | vmovdqu 16*4(arg3, %r11), \T1 | ||
1844 | vpxor \T1, \XMM5, \XMM5 | ||
1845 | vmovdqu \XMM5, 16*4(arg2 , %r11) | ||
1846 | .if \ENC_DEC == DEC | ||
1847 | vmovdqa \T1, \XMM5 | ||
1848 | .endif | ||
1849 | |||
1850 | vmovdqu 16*5(arg3, %r11), \T1 | ||
1851 | vpxor \T1, \XMM6, \XMM6 | ||
1852 | vmovdqu \XMM6, 16*5(arg2 , %r11) | ||
1853 | .if \ENC_DEC == DEC | ||
1854 | vmovdqa \T1, \XMM6 | ||
1855 | .endif | ||
1856 | |||
1857 | vmovdqu 16*6(arg3, %r11), \T1 | ||
1858 | vpxor \T1, \XMM7, \XMM7 | ||
1859 | vmovdqu \XMM7, 16*6(arg2 , %r11) | ||
1860 | .if \ENC_DEC == DEC | ||
1861 | vmovdqa \T1, \XMM7 | ||
1862 | .endif | ||
1863 | |||
1864 | vmovdqu 16*7(arg3, %r11), \T1 | ||
1865 | vpxor \T1, \XMM8, \XMM8 | ||
1866 | vmovdqu \XMM8, 16*7(arg2 , %r11) | ||
1867 | .if \ENC_DEC == DEC | ||
1868 | vmovdqa \T1, \XMM8 | ||
1869 | .endif | ||
1870 | |||
1871 | add $128, %r11 | ||
1872 | |||
1873 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
1874 | vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with | ||
1875 | # the corresponding ciphertext | ||
1876 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
1877 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
1878 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
1879 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
1880 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
1881 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
1882 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
1883 | |||
1884 | ############################################################################### | ||
1885 | |||
1886 | _initial_blocks_done\@: | ||
1887 | |||
1888 | |||
1889 | .endm | ||
1890 | |||
1891 | |||
1892 | |||
1893 | # encrypt 8 blocks at a time | ||
1894 | # ghash the 8 previously encrypted ciphertext blocks | ||
1895 | # arg1, arg2, arg3 are used as pointers only, not modified | ||
1896 | # r11 is the data offset value | ||
1897 | .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC | ||
1898 | |||
1899 | vmovdqa \XMM1, \T2 | ||
1900 | vmovdqa \XMM2, TMP2(%rsp) | ||
1901 | vmovdqa \XMM3, TMP3(%rsp) | ||
1902 | vmovdqa \XMM4, TMP4(%rsp) | ||
1903 | vmovdqa \XMM5, TMP5(%rsp) | ||
1904 | vmovdqa \XMM6, TMP6(%rsp) | ||
1905 | vmovdqa \XMM7, TMP7(%rsp) | ||
1906 | vmovdqa \XMM8, TMP8(%rsp) | ||
1907 | |||
1908 | .if \loop_idx == in_order | ||
1909 | vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT | ||
1910 | vpaddd ONE(%rip), \XMM1, \XMM2 | ||
1911 | vpaddd ONE(%rip), \XMM2, \XMM3 | ||
1912 | vpaddd ONE(%rip), \XMM3, \XMM4 | ||
1913 | vpaddd ONE(%rip), \XMM4, \XMM5 | ||
1914 | vpaddd ONE(%rip), \XMM5, \XMM6 | ||
1915 | vpaddd ONE(%rip), \XMM6, \XMM7 | ||
1916 | vpaddd ONE(%rip), \XMM7, \XMM8 | ||
1917 | vmovdqa \XMM8, \CTR | ||
1918 | |||
1919 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
1920 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
1921 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
1922 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
1923 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
1924 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
1925 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
1926 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
1927 | .else | ||
1928 | vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT | ||
1929 | vpaddd ONEf(%rip), \XMM1, \XMM2 | ||
1930 | vpaddd ONEf(%rip), \XMM2, \XMM3 | ||
1931 | vpaddd ONEf(%rip), \XMM3, \XMM4 | ||
1932 | vpaddd ONEf(%rip), \XMM4, \XMM5 | ||
1933 | vpaddd ONEf(%rip), \XMM5, \XMM6 | ||
1934 | vpaddd ONEf(%rip), \XMM6, \XMM7 | ||
1935 | vpaddd ONEf(%rip), \XMM7, \XMM8 | ||
1936 | vmovdqa \XMM8, \CTR | ||
1937 | .endif | ||
1938 | |||
1939 | |||
1940 | ####################################################################### | ||
1941 | |||
1942 | vmovdqu (arg1), \T1 | ||
1943 | vpxor \T1, \XMM1, \XMM1 | ||
1944 | vpxor \T1, \XMM2, \XMM2 | ||
1945 | vpxor \T1, \XMM3, \XMM3 | ||
1946 | vpxor \T1, \XMM4, \XMM4 | ||
1947 | vpxor \T1, \XMM5, \XMM5 | ||
1948 | vpxor \T1, \XMM6, \XMM6 | ||
1949 | vpxor \T1, \XMM7, \XMM7 | ||
1950 | vpxor \T1, \XMM8, \XMM8 | ||
1951 | |||
1952 | ####################################################################### | ||
1953 | |||
1954 | |||
1955 | |||
1956 | |||
1957 | |||
1958 | vmovdqu 16*1(arg1), \T1 | ||
1959 | vaesenc \T1, \XMM1, \XMM1 | ||
1960 | vaesenc \T1, \XMM2, \XMM2 | ||
1961 | vaesenc \T1, \XMM3, \XMM3 | ||
1962 | vaesenc \T1, \XMM4, \XMM4 | ||
1963 | vaesenc \T1, \XMM5, \XMM5 | ||
1964 | vaesenc \T1, \XMM6, \XMM6 | ||
1965 | vaesenc \T1, \XMM7, \XMM7 | ||
1966 | vaesenc \T1, \XMM8, \XMM8 | ||
1967 | |||
1968 | vmovdqu 16*2(arg1), \T1 | ||
1969 | vaesenc \T1, \XMM1, \XMM1 | ||
1970 | vaesenc \T1, \XMM2, \XMM2 | ||
1971 | vaesenc \T1, \XMM3, \XMM3 | ||
1972 | vaesenc \T1, \XMM4, \XMM4 | ||
1973 | vaesenc \T1, \XMM5, \XMM5 | ||
1974 | vaesenc \T1, \XMM6, \XMM6 | ||
1975 | vaesenc \T1, \XMM7, \XMM7 | ||
1976 | vaesenc \T1, \XMM8, \XMM8 | ||
1977 | |||
1978 | |||
1979 | ####################################################################### | ||
1980 | |||
1981 | vmovdqa HashKey_8(arg1), \T5 | ||
1982 | vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 | ||
1983 | vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 | ||
1984 | vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 | ||
1985 | vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 | ||
1986 | vpxor \T5, \T6, \T6 | ||
1987 | |||
1988 | vmovdqu 16*3(arg1), \T1 | ||
1989 | vaesenc \T1, \XMM1, \XMM1 | ||
1990 | vaesenc \T1, \XMM2, \XMM2 | ||
1991 | vaesenc \T1, \XMM3, \XMM3 | ||
1992 | vaesenc \T1, \XMM4, \XMM4 | ||
1993 | vaesenc \T1, \XMM5, \XMM5 | ||
1994 | vaesenc \T1, \XMM6, \XMM6 | ||
1995 | vaesenc \T1, \XMM7, \XMM7 | ||
1996 | vaesenc \T1, \XMM8, \XMM8 | ||
1997 | |||
1998 | vmovdqa TMP2(%rsp), \T1 | ||
1999 | vmovdqa HashKey_7(arg1), \T5 | ||
2000 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2001 | vpxor \T3, \T4, \T4 | ||
2002 | |||
2003 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2004 | vpxor \T3, \T7, \T7 | ||
2005 | |||
2006 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2007 | vpxor \T3, \T6, \T6 | ||
2008 | |||
2009 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2010 | vpxor \T3, \T6, \T6 | ||
2011 | |||
2012 | vmovdqu 16*4(arg1), \T1 | ||
2013 | vaesenc \T1, \XMM1, \XMM1 | ||
2014 | vaesenc \T1, \XMM2, \XMM2 | ||
2015 | vaesenc \T1, \XMM3, \XMM3 | ||
2016 | vaesenc \T1, \XMM4, \XMM4 | ||
2017 | vaesenc \T1, \XMM5, \XMM5 | ||
2018 | vaesenc \T1, \XMM6, \XMM6 | ||
2019 | vaesenc \T1, \XMM7, \XMM7 | ||
2020 | vaesenc \T1, \XMM8, \XMM8 | ||
2021 | |||
2022 | ####################################################################### | ||
2023 | |||
2024 | vmovdqa TMP3(%rsp), \T1 | ||
2025 | vmovdqa HashKey_6(arg1), \T5 | ||
2026 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2027 | vpxor \T3, \T4, \T4 | ||
2028 | |||
2029 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2030 | vpxor \T3, \T7, \T7 | ||
2031 | |||
2032 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2033 | vpxor \T3, \T6, \T6 | ||
2034 | |||
2035 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2036 | vpxor \T3, \T6, \T6 | ||
2037 | |||
2038 | vmovdqu 16*5(arg1), \T1 | ||
2039 | vaesenc \T1, \XMM1, \XMM1 | ||
2040 | vaesenc \T1, \XMM2, \XMM2 | ||
2041 | vaesenc \T1, \XMM3, \XMM3 | ||
2042 | vaesenc \T1, \XMM4, \XMM4 | ||
2043 | vaesenc \T1, \XMM5, \XMM5 | ||
2044 | vaesenc \T1, \XMM6, \XMM6 | ||
2045 | vaesenc \T1, \XMM7, \XMM7 | ||
2046 | vaesenc \T1, \XMM8, \XMM8 | ||
2047 | |||
2048 | vmovdqa TMP4(%rsp), \T1 | ||
2049 | vmovdqa HashKey_5(arg1), \T5 | ||
2050 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2051 | vpxor \T3, \T4, \T4 | ||
2052 | |||
2053 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2054 | vpxor \T3, \T7, \T7 | ||
2055 | |||
2056 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2057 | vpxor \T3, \T6, \T6 | ||
2058 | |||
2059 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2060 | vpxor \T3, \T6, \T6 | ||
2061 | |||
2062 | vmovdqu 16*6(arg1), \T1 | ||
2063 | vaesenc \T1, \XMM1, \XMM1 | ||
2064 | vaesenc \T1, \XMM2, \XMM2 | ||
2065 | vaesenc \T1, \XMM3, \XMM3 | ||
2066 | vaesenc \T1, \XMM4, \XMM4 | ||
2067 | vaesenc \T1, \XMM5, \XMM5 | ||
2068 | vaesenc \T1, \XMM6, \XMM6 | ||
2069 | vaesenc \T1, \XMM7, \XMM7 | ||
2070 | vaesenc \T1, \XMM8, \XMM8 | ||
2071 | |||
2072 | |||
2073 | vmovdqa TMP5(%rsp), \T1 | ||
2074 | vmovdqa HashKey_4(arg1), \T5 | ||
2075 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2076 | vpxor \T3, \T4, \T4 | ||
2077 | |||
2078 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2079 | vpxor \T3, \T7, \T7 | ||
2080 | |||
2081 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2082 | vpxor \T3, \T6, \T6 | ||
2083 | |||
2084 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2085 | vpxor \T3, \T6, \T6 | ||
2086 | |||
2087 | vmovdqu 16*7(arg1), \T1 | ||
2088 | vaesenc \T1, \XMM1, \XMM1 | ||
2089 | vaesenc \T1, \XMM2, \XMM2 | ||
2090 | vaesenc \T1, \XMM3, \XMM3 | ||
2091 | vaesenc \T1, \XMM4, \XMM4 | ||
2092 | vaesenc \T1, \XMM5, \XMM5 | ||
2093 | vaesenc \T1, \XMM6, \XMM6 | ||
2094 | vaesenc \T1, \XMM7, \XMM7 | ||
2095 | vaesenc \T1, \XMM8, \XMM8 | ||
2096 | |||
2097 | vmovdqa TMP6(%rsp), \T1 | ||
2098 | vmovdqa HashKey_3(arg1), \T5 | ||
2099 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2100 | vpxor \T3, \T4, \T4 | ||
2101 | |||
2102 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2103 | vpxor \T3, \T7, \T7 | ||
2104 | |||
2105 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2106 | vpxor \T3, \T6, \T6 | ||
2107 | |||
2108 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2109 | vpxor \T3, \T6, \T6 | ||
2110 | |||
2111 | vmovdqu 16*8(arg1), \T1 | ||
2112 | vaesenc \T1, \XMM1, \XMM1 | ||
2113 | vaesenc \T1, \XMM2, \XMM2 | ||
2114 | vaesenc \T1, \XMM3, \XMM3 | ||
2115 | vaesenc \T1, \XMM4, \XMM4 | ||
2116 | vaesenc \T1, \XMM5, \XMM5 | ||
2117 | vaesenc \T1, \XMM6, \XMM6 | ||
2118 | vaesenc \T1, \XMM7, \XMM7 | ||
2119 | vaesenc \T1, \XMM8, \XMM8 | ||
2120 | |||
2121 | vmovdqa TMP7(%rsp), \T1 | ||
2122 | vmovdqa HashKey_2(arg1), \T5 | ||
2123 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2124 | vpxor \T3, \T4, \T4 | ||
2125 | |||
2126 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2127 | vpxor \T3, \T7, \T7 | ||
2128 | |||
2129 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2130 | vpxor \T3, \T6, \T6 | ||
2131 | |||
2132 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2133 | vpxor \T3, \T6, \T6 | ||
2134 | |||
2135 | |||
2136 | ####################################################################### | ||
2137 | |||
2138 | vmovdqu 16*9(arg1), \T5 | ||
2139 | vaesenc \T5, \XMM1, \XMM1 | ||
2140 | vaesenc \T5, \XMM2, \XMM2 | ||
2141 | vaesenc \T5, \XMM3, \XMM3 | ||
2142 | vaesenc \T5, \XMM4, \XMM4 | ||
2143 | vaesenc \T5, \XMM5, \XMM5 | ||
2144 | vaesenc \T5, \XMM6, \XMM6 | ||
2145 | vaesenc \T5, \XMM7, \XMM7 | ||
2146 | vaesenc \T5, \XMM8, \XMM8 | ||
2147 | |||
2148 | vmovdqa TMP8(%rsp), \T1 | ||
2149 | vmovdqa HashKey(arg1), \T5 | ||
2150 | |||
2151 | vpclmulqdq $0x00, \T5, \T1, \T3 | ||
2152 | vpxor \T3, \T7, \T7 | ||
2153 | |||
2154 | vpclmulqdq $0x01, \T5, \T1, \T3 | ||
2155 | vpxor \T3, \T6, \T6 | ||
2156 | |||
2157 | vpclmulqdq $0x10, \T5, \T1, \T3 | ||
2158 | vpxor \T3, \T6, \T6 | ||
2159 | |||
2160 | vpclmulqdq $0x11, \T5, \T1, \T3 | ||
2161 | vpxor \T3, \T4, \T1 | ||
2162 | |||
2163 | |||
2164 | vmovdqu 16*10(arg1), \T5 | ||
2165 | |||
2166 | i = 0 | ||
2167 | j = 1 | ||
2168 | setreg | ||
2169 | .rep 8 | ||
2170 | vpxor 16*i(arg3, %r11), \T5, \T2 | ||
2171 | .if \ENC_DEC == ENC | ||
2172 | vaesenclast \T2, reg_j, reg_j | ||
2173 | .else | ||
2174 | vaesenclast \T2, reg_j, \T3 | ||
2175 | vmovdqu 16*i(arg3, %r11), reg_j | ||
2176 | vmovdqu \T3, 16*i(arg2, %r11) | ||
2177 | .endif | ||
2178 | i = (i+1) | ||
2179 | j = (j+1) | ||
2180 | setreg | ||
2181 | .endr | ||
2182 | ####################################################################### | ||
2183 | |||
2184 | |||
2185 | vpslldq $8, \T6, \T3 # shift-L T3 2 DWs | ||
2186 | vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs | ||
2187 | vpxor \T3, \T7, \T7 | ||
2188 | vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 | ||
2189 | |||
2190 | |||
2191 | |||
2192 | ####################################################################### | ||
2193 | #first phase of the reduction | ||
2194 | vmovdqa POLY2(%rip), \T3 | ||
2195 | |||
2196 | vpclmulqdq $0x01, \T7, \T3, \T2 | ||
2197 | vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs | ||
2198 | |||
2199 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | ||
2200 | ####################################################################### | ||
2201 | .if \ENC_DEC == ENC | ||
2202 | vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer | ||
2203 | vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer | ||
2204 | vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer | ||
2205 | vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer | ||
2206 | vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer | ||
2207 | vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer | ||
2208 | vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer | ||
2209 | vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer | ||
2210 | .endif | ||
2211 | |||
2212 | ####################################################################### | ||
2213 | #second phase of the reduction | ||
2214 | vpclmulqdq $0x00, \T7, \T3, \T2 | ||
2215 | vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | ||
2216 | |||
2217 | vpclmulqdq $0x10, \T7, \T3, \T4 | ||
2218 | vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) | ||
2219 | |||
2220 | vpxor \T2, \T4, \T4 # second phase of the reduction complete | ||
2221 | ####################################################################### | ||
2222 | vpxor \T4, \T1, \T1 # the result is in T1 | ||
2223 | |||
2224 | vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap | ||
2225 | vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap | ||
2226 | vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap | ||
2227 | vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap | ||
2228 | vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap | ||
2229 | vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap | ||
2230 | vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap | ||
2231 | vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap | ||
2232 | |||
2233 | |||
2234 | vpxor \T1, \XMM1, \XMM1 | ||
2235 | |||
2236 | |||
2237 | |||
2238 | .endm | ||
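The point of GHASH_8_ENCRYPT_8_PARALLEL_AVX2 is to interleave the AES rounds for eight new counter blocks with the GHASH multiplies for the eight previous ciphertext blocks, so the AES and carry-less-multiply units overlap. Roughly (a structural summary, not code from the patch):

/*
 * one 8-wide iteration:
 *   increment and byte-swap 8 counter blocks, XOR with round key 0
 *   for round keys 1..9: AESENC all 8 blocks, and between rounds
 *     PCLMULQDQ the previous 8 ciphertext blocks against HashKey_8..HashKey_1,
 *     accumulating the high, low and middle partial products
 *   AESENCLAST with round key 10, XOR with the 8 input blocks, store the output
 *   fold the partial products and run the two-phase POLY2 reduction
 */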
2239 | |||
2240 | |||
2241 | # GHASH the last 8 ciphertext blocks. | ||
2242 | .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 | ||
2243 | |||
2244 | ## Karatsuba Method | ||
2245 | |||
2246 | vmovdqa HashKey_8(arg1), \T5 | ||
2247 | |||
2248 | vpshufd $0b01001110, \XMM1, \T2 | ||
2249 | vpshufd $0b01001110, \T5, \T3 | ||
2250 | vpxor \XMM1, \T2, \T2 | ||
2251 | vpxor \T5, \T3, \T3 | ||
2252 | |||
2253 | vpclmulqdq $0x11, \T5, \XMM1, \T6 | ||
2254 | vpclmulqdq $0x00, \T5, \XMM1, \T7 | ||
2255 | |||
2256 | vpclmulqdq $0x00, \T3, \T2, \XMM1 | ||
2257 | |||
2258 | ###################### | ||
2259 | |||
2260 | vmovdqa HashKey_7(arg1), \T5 | ||
2261 | vpshufd $0b01001110, \XMM2, \T2 | ||
2262 | vpshufd $0b01001110, \T5, \T3 | ||
2263 | vpxor \XMM2, \T2, \T2 | ||
2264 | vpxor \T5, \T3, \T3 | ||
2265 | |||
2266 | vpclmulqdq $0x11, \T5, \XMM2, \T4 | ||
2267 | vpxor \T4, \T6, \T6 | ||
2268 | |||
2269 | vpclmulqdq $0x00, \T5, \XMM2, \T4 | ||
2270 | vpxor \T4, \T7, \T7 | ||
2271 | |||
2272 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2273 | |||
2274 | vpxor \T2, \XMM1, \XMM1 | ||
2275 | |||
2276 | ###################### | ||
2277 | |||
2278 | vmovdqa HashKey_6(arg1), \T5 | ||
2279 | vpshufd $0b01001110, \XMM3, \T2 | ||
2280 | vpshufd $0b01001110, \T5, \T3 | ||
2281 | vpxor \XMM3, \T2, \T2 | ||
2282 | vpxor \T5, \T3, \T3 | ||
2283 | |||
2284 | vpclmulqdq $0x11, \T5, \XMM3, \T4 | ||
2285 | vpxor \T4, \T6, \T6 | ||
2286 | |||
2287 | vpclmulqdq $0x00, \T5, \XMM3, \T4 | ||
2288 | vpxor \T4, \T7, \T7 | ||
2289 | |||
2290 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2291 | |||
2292 | vpxor \T2, \XMM1, \XMM1 | ||
2293 | |||
2294 | ###################### | ||
2295 | |||
2296 | vmovdqa HashKey_5(arg1), \T5 | ||
2297 | vpshufd $0b01001110, \XMM4, \T2 | ||
2298 | vpshufd $0b01001110, \T5, \T3 | ||
2299 | vpxor \XMM4, \T2, \T2 | ||
2300 | vpxor \T5, \T3, \T3 | ||
2301 | |||
2302 | vpclmulqdq $0x11, \T5, \XMM4, \T4 | ||
2303 | vpxor \T4, \T6, \T6 | ||
2304 | |||
2305 | vpclmulqdq $0x00, \T5, \XMM4, \T4 | ||
2306 | vpxor \T4, \T7, \T7 | ||
2307 | |||
2308 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2309 | |||
2310 | vpxor \T2, \XMM1, \XMM1 | ||
2311 | |||
2312 | ###################### | ||
2313 | |||
2314 | vmovdqa HashKey_4(arg1), \T5 | ||
2315 | vpshufd $0b01001110, \XMM5, \T2 | ||
2316 | vpshufd $0b01001110, \T5, \T3 | ||
2317 | vpxor \XMM5, \T2, \T2 | ||
2318 | vpxor \T5, \T3, \T3 | ||
2319 | |||
2320 | vpclmulqdq $0x11, \T5, \XMM5, \T4 | ||
2321 | vpxor \T4, \T6, \T6 | ||
2322 | |||
2323 | vpclmulqdq $0x00, \T5, \XMM5, \T4 | ||
2324 | vpxor \T4, \T7, \T7 | ||
2325 | |||
2326 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2327 | |||
2328 | vpxor \T2, \XMM1, \XMM1 | ||
2329 | |||
2330 | ###################### | ||
2331 | |||
2332 | vmovdqa HashKey_3(arg1), \T5 | ||
2333 | vpshufd $0b01001110, \XMM6, \T2 | ||
2334 | vpshufd $0b01001110, \T5, \T3 | ||
2335 | vpxor \XMM6, \T2, \T2 | ||
2336 | vpxor \T5, \T3, \T3 | ||
2337 | |||
2338 | vpclmulqdq $0x11, \T5, \XMM6, \T4 | ||
2339 | vpxor \T4, \T6, \T6 | ||
2340 | |||
2341 | vpclmulqdq $0x00, \T5, \XMM6, \T4 | ||
2342 | vpxor \T4, \T7, \T7 | ||
2343 | |||
2344 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2345 | |||
2346 | vpxor \T2, \XMM1, \XMM1 | ||
2347 | |||
2348 | ###################### | ||
2349 | |||
2350 | vmovdqa HashKey_2(arg1), \T5 | ||
2351 | vpshufd $0b01001110, \XMM7, \T2 | ||
2352 | vpshufd $0b01001110, \T5, \T3 | ||
2353 | vpxor \XMM7, \T2, \T2 | ||
2354 | vpxor \T5, \T3, \T3 | ||
2355 | |||
2356 | vpclmulqdq $0x11, \T5, \XMM7, \T4 | ||
2357 | vpxor \T4, \T6, \T6 | ||
2358 | |||
2359 | vpclmulqdq $0x00, \T5, \XMM7, \T4 | ||
2360 | vpxor \T4, \T7, \T7 | ||
2361 | |||
2362 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2363 | |||
2364 | vpxor \T2, \XMM1, \XMM1 | ||
2365 | |||
2366 | ###################### | ||
2367 | |||
2368 | vmovdqa HashKey(arg1), \T5 | ||
2369 | vpshufd $0b01001110, \XMM8, \T2 | ||
2370 | vpshufd $0b01001110, \T5, \T3 | ||
2371 | vpxor \XMM8, \T2, \T2 | ||
2372 | vpxor \T5, \T3, \T3 | ||
2373 | |||
2374 | vpclmulqdq $0x11, \T5, \XMM8, \T4 | ||
2375 | vpxor \T4, \T6, \T6 | ||
2376 | |||
2377 | vpclmulqdq $0x00, \T5, \XMM8, \T4 | ||
2378 | vpxor \T4, \T7, \T7 | ||
2379 | |||
2380 | vpclmulqdq $0x00, \T3, \T2, \T2 | ||
2381 | |||
2382 | vpxor \T2, \XMM1, \XMM1 | ||
2383 | vpxor \T6, \XMM1, \XMM1 | ||
2384 | vpxor \T7, \XMM1, \T2 | ||
2385 | |||
2386 | |||
2387 | |||
2388 | |||
2389 | vpslldq $8, \T2, \T4 | ||
2390 | vpsrldq $8, \T2, \T2 | ||
2391 | |||
2392 | vpxor \T4, \T7, \T7 | ||
2393 | vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the | ||
2394 | # accumulated carry-less multiplications | ||
2395 | |||
2396 | ####################################################################### | ||
2397 | #first phase of the reduction | ||
2398 | vmovdqa POLY2(%rip), \T3 | ||
2399 | |||
2400 | vpclmulqdq $0x01, \T7, \T3, \T2 | ||
2401 | vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs | ||
2402 | |||
2403 | vpxor \T2, \T7, \T7 # first phase of the reduction complete | ||
2404 | ####################################################################### | ||
2405 | |||
2406 | |||
2407 | #second phase of the reduction | ||
2408 | vpclmulqdq $0x00, \T7, \T3, \T2 | ||
2409 | vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) | ||
2410 | |||
2411 | vpclmulqdq $0x10, \T7, \T3, \T4 | ||
2412 | vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) | ||
2413 | |||
2414 | vpxor \T2, \T4, \T4 # second phase of the reduction complete | ||
2415 | ####################################################################### | ||
2416 | vpxor \T4, \T6, \T6 # the result is in T6 | ||
2417 | .endm | ||
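GHASH_LAST_8_AVX2 uses the Karatsuba form, spending three carry-less multiplies per block instead of the four schoolbook ones used in the 8-wide loop. The identity it relies on, with A = a1*x^64 + a0 and B = b1*x^64 + b0 over GF(2) (background, not code from the patch):

/*
 *   A*B = a1*b1 * x^128
 *       + ((a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0) * x^64
 *       + a0*b0
 *
 * so a single PCLMULQDQ on the XORed halves replaces the two cross products
 * (a1*b0 and a0*b1) that the 8-wide loop computes explicitly.
 */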
2418 | |||
2419 | |||
2420 | |||
2421 | # combined for GCM encrypt and decrypt functions | ||
2422 | # clobbering all xmm registers | ||
2423 | # clobbering r10, r11, r12, r13, r14, r15 | ||
2424 | .macro GCM_ENC_DEC_AVX2 ENC_DEC | ||
2425 | |||
2426 | #the number of pushes must equal STACK_OFFSET | ||
2427 | push %r12 | ||
2428 | push %r13 | ||
2429 | push %r14 | ||
2430 | push %r15 | ||
2431 | |||
2432 | mov %rsp, %r14 | ||
2433 | |||
2434 | |||
2435 | |||
2436 | |||
2437 | sub $VARIABLE_OFFSET, %rsp | ||
2438 | and $~63, %rsp # align rsp to 64 bytes | ||
2439 | |||
2440 | |||
2441 | vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey | ||
2442 | |||
2443 | mov arg4, %r13 # save the number of bytes of plaintext/ciphertext | ||
2444 | and $-16, %r13 # r13 = r13 - (r13 mod 16) | ||
2445 | |||
2446 | mov %r13, %r12 | ||
2447 | shr $4, %r12 | ||
2448 | and $7, %r12 | ||
2449 | jz _initial_num_blocks_is_0\@ | ||
2450 | |||
2451 | cmp $7, %r12 | ||
2452 | je _initial_num_blocks_is_7\@ | ||
2453 | cmp $6, %r12 | ||
2454 | je _initial_num_blocks_is_6\@ | ||
2455 | cmp $5, %r12 | ||
2456 | je _initial_num_blocks_is_5\@ | ||
2457 | cmp $4, %r12 | ||
2458 | je _initial_num_blocks_is_4\@ | ||
2459 | cmp $3, %r12 | ||
2460 | je _initial_num_blocks_is_3\@ | ||
2461 | cmp $2, %r12 | ||
2462 | je _initial_num_blocks_is_2\@ | ||
2463 | |||
2464 | jmp _initial_num_blocks_is_1\@ | ||
2465 | |||
2466 | _initial_num_blocks_is_7\@: | ||
2467 | INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2468 | sub $16*7, %r13 | ||
2469 | jmp _initial_blocks_encrypted\@ | ||
2470 | |||
2471 | _initial_num_blocks_is_6\@: | ||
2472 | INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2473 | sub $16*6, %r13 | ||
2474 | jmp _initial_blocks_encrypted\@ | ||
2475 | |||
2476 | _initial_num_blocks_is_5\@: | ||
2477 | INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2478 | sub $16*5, %r13 | ||
2479 | jmp _initial_blocks_encrypted\@ | ||
2480 | |||
2481 | _initial_num_blocks_is_4\@: | ||
2482 | INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2483 | sub $16*4, %r13 | ||
2484 | jmp _initial_blocks_encrypted\@ | ||
2485 | |||
2486 | _initial_num_blocks_is_3\@: | ||
2487 | INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2488 | sub $16*3, %r13 | ||
2489 | jmp _initial_blocks_encrypted\@ | ||
2490 | |||
2491 | _initial_num_blocks_is_2\@: | ||
2492 | INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2493 | sub $16*2, %r13 | ||
2494 | jmp _initial_blocks_encrypted\@ | ||
2495 | |||
2496 | _initial_num_blocks_is_1\@: | ||
2497 | INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2498 | sub $16*1, %r13 | ||
2499 | jmp _initial_blocks_encrypted\@ | ||
2500 | |||
2501 | _initial_num_blocks_is_0\@: | ||
2502 | INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC | ||
2503 | |||
2504 | |||
2505 | _initial_blocks_encrypted\@: | ||
2506 | cmp $0, %r13 | ||
2507 | je _zero_cipher_left\@ | ||
2508 | |||
2509 | sub $128, %r13 | ||
2510 | je _eight_cipher_left\@ | ||
2511 | |||
2512 | |||
2513 | |||
2514 | |||
2515 | vmovd %xmm9, %r15d | ||
2516 | and $255, %r15d | ||
2517 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2518 | |||
2519 | |||
2520 | _encrypt_by_8_new\@: | ||
2521 | cmp $(255-8), %r15d | ||
2522 | jg _encrypt_by_8\@ | ||
2523 | |||
2524 | |||
2525 | |||
2526 | add $8, %r15b | ||
2527 | GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC | ||
2528 | add $128, %r11 | ||
2529 | sub $128, %r13 | ||
2530 | jne _encrypt_by_8_new\@ | ||
2531 | |||
2532 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2533 | jmp _eight_cipher_left\@ | ||
2534 | |||
2535 | _encrypt_by_8\@: | ||
2536 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2537 | add $8, %r15b | ||
2538 | GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC | ||
2539 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2540 | add $128, %r11 | ||
2541 | sub $128, %r13 | ||
2542 | jne _encrypt_by_8_new\@ | ||
2543 | |||
2544 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2545 | |||
2546 | |||
2547 | |||
2548 | |||
2549 | _eight_cipher_left\@: | ||
2550 | GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 | ||
2551 | |||
2552 | |||
2553 | _zero_cipher_left\@: | ||
2554 | cmp $16, arg4 | ||
2555 | jl _only_less_than_16\@ | ||
2556 | |||
2557 | mov arg4, %r13 | ||
2558 | and $15, %r13 # r13 = (arg4 mod 16) | ||
2559 | |||
2560 | je _multiple_of_16_bytes\@ | ||
2561 | |||
2562 | # handle the last <16 Byte block separately | ||
2563 | |||
2564 | |||
2565 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | ||
2566 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2567 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | ||
2568 | |||
2569 | sub $16, %r11 | ||
2570 | add %r13, %r11 | ||
2571 | vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block | ||
2572 | |||
2573 | lea SHIFT_MASK+16(%rip), %r12 | ||
2574 | sub %r13, %r12 # adjust the shuffle mask pointer | ||
2575 | # to be able to shift 16-r13 bytes | ||
2576 | # (r13 is the number of bytes in plaintext mod 16) | ||
2577 | vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
2578 | vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes | ||
2579 | jmp _final_ghash_mul\@ | ||
2580 | |||
2581 | _only_less_than_16\@: | ||
2582 | # check for 0 length | ||
2583 | mov arg4, %r13 | ||
2584 | and $15, %r13 # r13 = (arg4 mod 16) | ||
2585 | |||
2586 | je _multiple_of_16_bytes\@ | ||
2587 | |||
2588 | # handle the last <16 Byte block separately | ||
2589 | |||
2590 | |||
2591 | vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn | ||
2592 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2593 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) | ||
2594 | |||
2595 | |||
2596 | lea SHIFT_MASK+16(%rip), %r12 | ||
2597 | sub %r13, %r12 # adjust the shuffle mask pointer to be | ||
2598 | # able to shift 16-r13 bytes (r13 is the | ||
2599 | # number of bytes in plaintext mod 16) | ||
2600 | |||
2601 | _get_last_16_byte_loop\@: | ||
2602 | movb (arg3, %r11), %al | ||
2603 | movb %al, TMP1 (%rsp , %r11) | ||
2604 | add $1, %r11 | ||
2605 | cmp %r13, %r11 | ||
2606 | jne _get_last_16_byte_loop\@ | ||
2607 | |||
2608 | vmovdqu TMP1(%rsp), %xmm1 | ||
2609 | |||
2610 | sub $16, %r11 | ||
2611 | |||
2612 | _final_ghash_mul\@: | ||
2613 | .if \ENC_DEC == DEC | ||
2614 | vmovdqa %xmm1, %xmm2 | ||
2615 | vpxor %xmm1, %xmm9, %xmm9 # Ciphertext XOR E(K, Yn) | ||
2616 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 | ||
2617 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | ||
2618 | vpand %xmm1, %xmm2, %xmm2 | ||
2619 | vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 | ||
2620 | vpxor %xmm2, %xmm14, %xmm14 | ||
2621 | #GHASH computation for the last <16 Byte block | ||
2622 | GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | ||
2623 | sub %r13, %r11 | ||
2624 | add $16, %r11 | ||
2625 | .else | ||
2626 | vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) | ||
2627 | vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 | ||
2628 | vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 | ||
2629 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 | ||
2630 | vpxor %xmm9, %xmm14, %xmm14 | ||
2631 | #GHASH computation for the last <16 Byte block | ||
2632 | GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 | ||
2633 | sub %r13, %r11 | ||
2634 | add $16, %r11 | ||
2635 | vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext | ||
2636 | .endif | ||
2637 | |||
2638 | |||
2639 | ############################# | ||
2640 | # output r13 Bytes | ||
2641 | vmovq %xmm9, %rax | ||
2642 | cmp $8, %r13 | ||
2643 | jle _less_than_8_bytes_left\@ | ||
2644 | |||
2645 | mov %rax, (arg2 , %r11) | ||
2646 | add $8, %r11 | ||
2647 | vpsrldq $8, %xmm9, %xmm9 | ||
2648 | vmovq %xmm9, %rax | ||
2649 | sub $8, %r13 | ||
2650 | |||
2651 | _less_than_8_bytes_left\@: | ||
2652 | movb %al, (arg2 , %r11) | ||
2653 | add $1, %r11 | ||
2654 | shr $8, %rax | ||
2655 | sub $1, %r13 | ||
2656 | jne _less_than_8_bytes_left\@ | ||
2657 | ############################# | ||
2658 | |||
2659 | _multiple_of_16_bytes\@: | ||
2660 | mov arg7, %r12 # r12 = aadLen (number of bytes) | ||
2661 | shl $3, %r12 # convert into number of bits | ||
2662 | vmovd %r12d, %xmm15 # len(A) in xmm15 | ||
2663 | |||
2664 | shl $3, arg4 # len(C) in bits (byte count * 8) | ||
2665 | vmovq arg4, %xmm1 | ||
2666 | vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 | ||
2667 | vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) | ||
2668 | |||
2669 | vpxor %xmm15, %xmm14, %xmm14 | ||
2670 | GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation | ||
2671 | vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap | ||
2672 | |||
2673 | mov arg5, %rax # rax = *Y0 | ||
2674 | vmovdqu (%rax), %xmm9 # xmm9 = Y0 | ||
2675 | |||
2676 | ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) | ||
2677 | |||
2678 | vpxor %xmm14, %xmm9, %xmm9 | ||
2679 | |||
2680 | |||
2681 | |||
2682 | _return_T\@: | ||
2683 | mov arg8, %r10 # r10 = authTag | ||
2684 | mov arg9, %r11 # r11 = auth_tag_len | ||
2685 | |||
2686 | cmp $16, %r11 | ||
2687 | je _T_16\@ | ||
2688 | |||
2689 | cmp $12, %r11 | ||
2690 | je _T_12\@ | ||
2691 | |||
2692 | _T_8\@: | ||
2693 | vmovq %xmm9, %rax | ||
2694 | mov %rax, (%r10) | ||
2695 | jmp _return_T_done\@ | ||
2696 | _T_12\@: | ||
2697 | vmovq %xmm9, %rax | ||
2698 | mov %rax, (%r10) | ||
2699 | vpsrldq $8, %xmm9, %xmm9 | ||
2700 | vmovd %xmm9, %eax | ||
2701 | mov %eax, 8(%r10) | ||
2702 | jmp _return_T_done\@ | ||
2703 | |||
2704 | _T_16\@: | ||
2705 | vmovdqu %xmm9, (%r10) | ||
2706 | |||
2707 | _return_T_done\@: | ||
2708 | mov %r14, %rsp | ||
2709 | |||
2710 | pop %r15 | ||
2711 | pop %r14 | ||
2712 | pop %r13 | ||
2713 | pop %r12 | ||
2714 | .endm | ||
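Taken as a whole, GCM_ENC_DEC_AVX2 follows the usual GCM outline; a compact summary (not code from the patch):

/*
 *   1. INITIAL_BLOCKS_AVX2: hash the AAD and handle (len/16) mod 8 blocks
 *   2. loop 8 blocks at a time with GHASH_8_ENCRYPT_8_PARALLEL_AVX2,
 *      then fold the last batch with GHASH_LAST_8_AVX2
 *   3. handle a final partial block (<16 bytes) with a masked single block
 *   4. absorb len64(AAD) || len64(C), XOR with E(K, Y0),
 *      and write 8, 12 or 16 tag bytes
 */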
2715 | |||
2716 | |||
2717 | ############################################################# | ||
2718 | #void aesni_gcm_precomp_avx_gen4 | ||
2719 | # (gcm_data *my_ctx_data, | ||
2720 | # u8 *hash_subkey)# /* H, the Hash sub key input. | ||
2721 | # Data starts on a 16-byte boundary. */ | ||
2722 | ############################################################# | ||
2723 | ENTRY(aesni_gcm_precomp_avx_gen4) | ||
2724 | #the number of pushes must equal STACK_OFFSET | ||
2725 | push %r12 | ||
2726 | push %r13 | ||
2727 | push %r14 | ||
2728 | push %r15 | ||
2729 | |||
2730 | mov %rsp, %r14 | ||
2731 | |||
2732 | |||
2733 | |||
2734 | sub $VARIABLE_OFFSET, %rsp | ||
2735 | and $~63, %rsp # align rsp to 64 bytes | ||
2736 | |||
2737 | vmovdqu (arg2), %xmm6 # xmm6 = HashKey | ||
2738 | |||
2739 | vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 | ||
2740 | ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey | ||
2741 | vmovdqa %xmm6, %xmm2 | ||
2742 | vpsllq $1, %xmm6, %xmm6 | ||
2743 | vpsrlq $63, %xmm2, %xmm2 | ||
2744 | vmovdqa %xmm2, %xmm1 | ||
2745 | vpslldq $8, %xmm2, %xmm2 | ||
2746 | vpsrldq $8, %xmm1, %xmm1 | ||
2747 | vpor %xmm2, %xmm6, %xmm6 | ||
2748 | #reduction | ||
2749 | vpshufd $0b00100100, %xmm1, %xmm2 | ||
2750 | vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 | ||
2751 | vpand POLY(%rip), %xmm2, %xmm2 | ||
2752 | vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly | ||
2753 | ####################################################################### | ||
2754 | vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly | ||
2755 | |||
2756 | |||
2757 | PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 | ||
2758 | |||
2759 | mov %r14, %rsp | ||
2760 | |||
2761 | pop %r15 | ||
2762 | pop %r14 | ||
2763 | pop %r13 | ||
2764 | pop %r12 | ||
2765 | ret | ||
2766 | ENDPROC(aesni_gcm_precomp_avx_gen4) | ||
2767 | |||
2768 | |||
2769 | ############################################################################### | ||
2770 | #void aesni_gcm_enc_avx_gen4( | ||
2771 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | ||
2772 | # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ | ||
2773 | # const u8 *in, /* Plaintext input */ | ||
2774 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | ||
2775 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | ||
2776 | # (from Security Association) concatenated with 8 byte | ||
2777 | # Initialisation Vector (from IPSec ESP Payload) | ||
2778 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | ||
2779 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | ||
2780 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | ||
2781 | # u8 *auth_tag, /* Authenticated Tag output. */ | ||
2782 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | ||
2783 | # Valid values are 16 (most likely), 12 or 8. */ | ||
2784 | ############################################################################### | ||
2785 | ENTRY(aesni_gcm_enc_avx_gen4) | ||
2786 | GCM_ENC_DEC_AVX2 ENC | ||
2787 | ret | ||
2788 | ENDPROC(aesni_gcm_enc_avx_gen4) | ||
2789 | |||
2790 | ############################################################################### | ||
2791 | #void aesni_gcm_dec_avx_gen4( | ||
2792 | # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ | ||
2793 | # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ | ||
2794 | # const u8 *in, /* Ciphertext input */ | ||
2795 | # u64 plaintext_len, /* Length of data in Bytes for encryption. */ | ||
2796 | # u8 *iv, /* Pre-counter block j0: 4 byte salt | ||
2797 | # (from Security Association) concatenated with 8 byte | ||
2798 | # Initialisation Vector (from IPSec ESP Payload) | ||
2799 | # concatenated with 0x00000001. 16-byte aligned pointer. */ | ||
2800 | # const u8 *aad, /* Additional Authentication Data (AAD)*/ | ||
2801 | # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ | ||
2802 | # u8 *auth_tag, /* Authenticated Tag output. */ | ||
2803 | # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. | ||
2804 | # Valid values are 16 (most likely), 12 or 8. */ | ||
2805 | ############################################################################### | ||
2806 | ENTRY(aesni_gcm_dec_avx_gen4) | ||
2807 | GCM_ENC_DEC_AVX2 DEC | ||
2808 | ret | ||
2809 | ENDPROC(aesni_gcm_dec_avx_gen4) | ||
2810 | |||
2811 | #endif /* CONFIG_AS_AVX2 */ | ||
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index aba34b8e514c..3ae311dd684e 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c | |||
@@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, | |||
101 | int crypto_fpu_init(void); | 101 | int crypto_fpu_init(void); |
102 | void crypto_fpu_exit(void); | 102 | void crypto_fpu_exit(void); |
103 | 103 | ||
104 | #define AVX_GEN2_OPTSIZE 640 | ||
105 | #define AVX_GEN4_OPTSIZE 4096 | ||
106 | |||
104 | #ifdef CONFIG_X86_64 | 107 | #ifdef CONFIG_X86_64 |
105 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, | 108 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, |
106 | const u8 *in, unsigned int len, u8 *iv); | 109 | const u8 *in, unsigned int len, u8 *iv); |
@@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, | |||
150 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | 153 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, |
151 | u8 *auth_tag, unsigned long auth_tag_len); | 154 | u8 *auth_tag, unsigned long auth_tag_len); |
152 | 155 | ||
156 | |||
157 | #ifdef CONFIG_AS_AVX | ||
158 | /* | ||
159 | * asmlinkage void aesni_gcm_precomp_avx_gen2() | ||
160 | * gcm_data *my_ctx_data, context data | ||
161 | * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. | ||
162 | */ | ||
163 | asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); | ||
164 | |||
165 | asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, | ||
166 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
167 | const u8 *aad, unsigned long aad_len, | ||
168 | u8 *auth_tag, unsigned long auth_tag_len); | ||
169 | |||
170 | asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, | ||
171 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
172 | const u8 *aad, unsigned long aad_len, | ||
173 | u8 *auth_tag, unsigned long auth_tag_len); | ||
174 | |||
175 | static void aesni_gcm_enc_avx(void *ctx, u8 *out, | ||
176 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
177 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
178 | u8 *auth_tag, unsigned long auth_tag_len) | ||
179 | { | ||
180 | if (plaintext_len < AVX_GEN2_OPTSIZE) { | ||
181 | aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, | ||
182 | aad_len, auth_tag, auth_tag_len); | ||
183 | } else { | ||
184 | aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); | ||
185 | aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, | ||
186 | aad_len, auth_tag, auth_tag_len); | ||
187 | } | ||
188 | } | ||
189 | |||
190 | static void aesni_gcm_dec_avx(void *ctx, u8 *out, | ||
191 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
192 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
193 | u8 *auth_tag, unsigned long auth_tag_len) | ||
194 | { | ||
195 | if (ciphertext_len < AVX_GEN2_OPTSIZE) { | ||
196 | aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, | ||
197 | aad_len, auth_tag, auth_tag_len); | ||
198 | } else { | ||
199 | aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); | ||
200 | aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, | ||
201 | aad_len, auth_tag, auth_tag_len); | ||
202 | } | ||
203 | } | ||
204 | #endif | ||
205 | |||
206 | #ifdef CONFIG_AS_AVX2 | ||
207 | /* | ||
208 | * asmlinkage void aesni_gcm_precomp_avx_gen4() | ||
209 | * gcm_data *my_ctx_data, context data | ||
210 | * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. | ||
211 | */ | ||
212 | asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); | ||
213 | |||
214 | asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, | ||
215 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
216 | const u8 *aad, unsigned long aad_len, | ||
217 | u8 *auth_tag, unsigned long auth_tag_len); | ||
218 | |||
219 | asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, | ||
220 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
221 | const u8 *aad, unsigned long aad_len, | ||
222 | u8 *auth_tag, unsigned long auth_tag_len); | ||
223 | |||
224 | static void aesni_gcm_enc_avx2(void *ctx, u8 *out, | ||
225 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
226 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
227 | u8 *auth_tag, unsigned long auth_tag_len) | ||
228 | { | ||
229 | if (plaintext_len < AVX_GEN2_OPTSIZE) { | ||
230 | aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, | ||
231 | aad_len, auth_tag, auth_tag_len); | ||
232 | } else if (plaintext_len < AVX_GEN4_OPTSIZE) { | ||
233 | aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); | ||
234 | aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, | ||
235 | aad_len, auth_tag, auth_tag_len); | ||
236 | } else { | ||
237 | aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); | ||
238 | aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, | ||
239 | aad_len, auth_tag, auth_tag_len); | ||
240 | } | ||
241 | } | ||
242 | |||
243 | static void aesni_gcm_dec_avx2(void *ctx, u8 *out, | ||
244 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
245 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
246 | u8 *auth_tag, unsigned long auth_tag_len) | ||
247 | { | ||
248 | if (ciphertext_len < AVX_GEN2_OPTSIZE) { | ||
249 | aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, | ||
250 | aad, aad_len, auth_tag, auth_tag_len); | ||
251 | } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { | ||
252 | aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); | ||
253 | aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, | ||
254 | aad_len, auth_tag, auth_tag_len); | ||
255 | } else { | ||
256 | aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); | ||
257 | aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, | ||
258 | aad_len, auth_tag, auth_tag_len); | ||
259 | } | ||
260 | } | ||
261 | #endif | ||
262 | |||
263 | static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out, | ||
264 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
265 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
266 | u8 *auth_tag, unsigned long auth_tag_len); | ||
267 | |||
268 | static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, | ||
269 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
270 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
271 | u8 *auth_tag, unsigned long auth_tag_len); | ||
272 | |||
153 | static inline struct | 273 | static inline struct |
154 | aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) | 274 | aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) |
155 | { | 275 | { |
@@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) | |||
915 | dst = src; | 1035 | dst = src; |
916 | } | 1036 | } |
917 | 1037 | ||
918 | aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, | 1038 | aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, |
919 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst | 1039 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst |
920 | + ((unsigned long)req->cryptlen), auth_tag_len); | 1040 | + ((unsigned long)req->cryptlen), auth_tag_len); |
921 | 1041 | ||
@@ -996,7 +1116,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) | |||
996 | dst = src; | 1116 | dst = src; |
997 | } | 1117 | } |
998 | 1118 | ||
999 | aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, | 1119 | aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, |
1000 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, | 1120 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, |
1001 | authTag, auth_tag_len); | 1121 | authTag, auth_tag_len); |
1002 | 1122 | ||
@@ -1353,6 +1473,25 @@ static int __init aesni_init(void) | |||
1353 | 1473 | ||
1354 | if (!x86_match_cpu(aesni_cpu_id)) | 1474 | if (!x86_match_cpu(aesni_cpu_id)) |
1355 | return -ENODEV; | 1475 | return -ENODEV; |
1476 | #ifdef CONFIG_AS_AVX2 | ||
1477 | if (boot_cpu_has(X86_FEATURE_AVX2)) { | ||
1478 | pr_info("AVX2 version of gcm_enc/dec engaged.\n"); | ||
1479 | aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; | ||
1480 | aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; | ||
1481 | } else | ||
1482 | #endif | ||
1483 | #ifdef CONFIG_AS_AVX | ||
1484 | if (boot_cpu_has(X86_FEATURE_AVX)) { | ||
1485 | pr_info("AVX version of gcm_enc/dec engaged.\n"); | ||
1486 | aesni_gcm_enc_tfm = aesni_gcm_enc_avx; | ||
1487 | aesni_gcm_dec_tfm = aesni_gcm_dec_avx; | ||
1488 | } else | ||
1489 | #endif | ||
1490 | { | ||
1491 | pr_info("SSE version of gcm_enc/dec engaged.\n"); | ||
1492 | aesni_gcm_enc_tfm = aesni_gcm_enc; | ||
1493 | aesni_gcm_dec_tfm = aesni_gcm_dec; | ||
1494 | } | ||
1356 | 1495 | ||
1357 | err = crypto_fpu_init(); | 1496 | err = crypto_fpu_init(); |
1358 | if (err) | 1497 | if (err) |