diff options
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S | 1192 |
1 files changed, 1192 insertions, 0 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index ff16756a51c1..aafced54df64 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S | |||
@@ -9,6 +9,17 @@ | |||
9 | * Vinodh Gopal <vinodh.gopal@intel.com> | 9 | * Vinodh Gopal <vinodh.gopal@intel.com> |
10 | * Kahraman Akdemir | 10 | * Kahraman Akdemir |
11 | * | 11 | * |
12 | * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD | ||
13 | * interface for 64-bit kernels. | ||
14 | * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) | ||
15 | * Aidan O'Mahony (aidan.o.mahony@intel.com) | ||
16 | * Adrian Hoban <adrian.hoban@intel.com> | ||
17 | * James Guilford (james.guilford@intel.com) | ||
18 | * Gabriele Paoloni <gabriele.paoloni@intel.com> | ||
19 | * Tadeusz Struk (tadeusz.struk@intel.com) | ||
20 | * Wajdi Feghali (wajdi.k.feghali@intel.com) | ||
21 | * Copyright (c) 2010, Intel Corporation. | ||
22 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | 23 | * This program is free software; you can redistribute it and/or modify |
13 | * it under the terms of the GNU General Public License as published by | 24 | * it under the terms of the GNU General Public License as published by |
14 | * the Free Software Foundation; either version 2 of the License, or | 25 | * the Free Software Foundation; either version 2 of the License, or |
@@ -18,8 +29,60 @@ | |||
18 | #include <linux/linkage.h> | 29 | #include <linux/linkage.h> |
19 | #include <asm/inst.h> | 30 | #include <asm/inst.h> |
20 | 31 | ||
32 | .data | ||
 | .align 16 # the constants below are used as 128-bit SSE memory operands, which must be 16-byte aligned | ||
33 | POLY: .octa 0xC2000000000000000000000000000001 | ||
34 | TWOONE: .octa 0x00000001000000000000000000000001 | ||
35 | |||
36 | # order of these constants should not change. | ||
37 | # more specifically, ALL_F should follow SHIFT_MASK, | ||
38 | # and ZERO should follow ALL_F | ||
39 | |||
40 | SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F | ||
41 | MASK1: .octa 0x0000000000000000ffffffffffffffff | ||
42 | MASK2: .octa 0xffffffffffffffff0000000000000000 | ||
43 | SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 | ||
44 | ALL_F: .octa 0xffffffffffffffffffffffffffffffff | ||
45 | ZERO: .octa 0x00000000000000000000000000000000 | ||
46 | ONE: .octa 0x00000000000000000000000000000001 | ||
47 | F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 # NOTE(review): literal has only 31 hex digits, so the top nibble is 0 - confirm the intended value | ||
48 | dec: .octa 0x1 # symbol the macros compare against in `.if \operation == dec` | ||
49 | enc: .octa 0x2 | ||
50 | |||
51 | |||
21 | .text | 52 | .text |
22 | 53 | ||
54 | |||
55 | #define STACK_OFFSET 8*3 // three 8-byte registers are pushed before %rsp is copied to %r14 | ||
56 | #define HashKey 16*0 // store HashKey <<1 mod poly here | ||
57 | #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here | ||
58 | #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here | ||
59 | #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here | ||
60 | #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 | ||
61 | // bits of HashKey <<1 mod poly here | ||
62 | //(for Karatsuba purposes) | ||
63 | #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 | ||
64 | // bits of HashKey^2 <<1 mod poly here | ||
65 | // (for Karatsuba purposes) | ||
66 | #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 | ||
67 | // bits of HashKey^3 <<1 mod poly here | ||
68 | // (for Karatsuba purposes) | ||
69 | #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 | ||
70 | // bits of HashKey^4 <<1 mod poly here | ||
71 | // (for Karatsuba purposes) | ||
72 | #define VARIABLE_OFFSET 16*8 // bytes subtracted from %rsp for the eight 16-byte slots above | ||
73 | |||
74 | #define arg1 rdi | ||
75 | #define arg2 rsi | ||
76 | #define arg3 rdx | ||
77 | #define arg4 rcx | ||
78 | #define arg5 r8 | ||
79 | #define arg6 r9 | ||
80 | #define arg7 STACK_OFFSET+8(%r14) // arg7..arg10 are memory operands relative to %r14 (saved %rsp) | ||
81 | #define arg8 STACK_OFFSET+16(%r14) | ||
82 | #define arg9 STACK_OFFSET+24(%r14) | ||
83 | #define arg10 STACK_OFFSET+32(%r14) | ||
84 | |||
85 | |||
23 | #define STATE1 %xmm0 | 86 | #define STATE1 %xmm0 |
24 | #define STATE2 %xmm4 | 87 | #define STATE2 %xmm4 |
25 | #define STATE3 %xmm5 | 88 | #define STATE3 %xmm5 |
@@ -47,6 +110,1135 @@ | |||
47 | #define T2 %r11 | 110 | #define T2 %r11 |
48 | #define TCTR_LOW T2 | 111 | #define TCTR_LOW T2 |
49 | 112 | ||
113 | |||
114 | /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | ||
115 | * | ||
116 | * | ||
117 | * Input: A and B (128-bits each, bit-reflected) | ||
118 | * Output: C = A*B*x mod poly, (i.e. >>1 ) | ||
119 | * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | ||
120 | * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | ||
121 | * | ||
122 | */ | ||
123 | .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 | ||
124 | movdqa \GH, \TMP1 # TMP1 = copy of GH (used for the high product) | ||
125 | pshufd $78, \GH, \TMP2 # TMP2 = GH with its 64-bit halves swapped | ||
126 | pshufd $78, \HK, \TMP3 # TMP3 = HK with its 64-bit halves swapped | ||
127 | pxor \GH, \TMP2 # TMP2 = a1+a0 | ||
128 | pxor \HK, \TMP3 # TMP3 = b1+b0 | ||
129 | PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 | ||
130 | PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 | ||
131 | PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) | ||
132 | pxor \GH, \TMP2 | ||
133 | pxor \TMP1, \TMP2 # TMP2 = (a0*b1)+(a1*b0) | ||
134 | movdqa \TMP2, \TMP3 | ||
135 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
136 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
137 | pxor \TMP3, \GH | ||
138 | pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK | ||
139 | |||
140 | # first phase of the reduction | ||
141 | |||
142 | movdqa \GH, \TMP2 | ||
143 | movdqa \GH, \TMP3 | ||
144 | movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 | ||
145 | # in order to perform | ||
146 | # independent shifts | ||
147 | pslld $31, \TMP2 # packed left shift <<31 | ||
148 | pslld $30, \TMP3 # packed left shift <<30 | ||
149 | pslld $25, \TMP4 # packed left shift <<25 | ||
150 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
151 | pxor \TMP4, \TMP2 | ||
152 | movdqa \TMP2, \TMP5 | ||
153 | psrldq $4, \TMP5 # right shift TMP5 1 DW | ||
154 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
155 | pxor \TMP2, \GH | ||
156 | |||
157 | # second phase of the reduction | ||
158 | |||
159 | movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 | ||
160 | # in order to perform | ||
161 | # independent shifts | ||
162 | movdqa \GH,\TMP3 | ||
163 | movdqa \GH,\TMP4 | ||
164 | psrld $1,\TMP2 # packed right shift >>1 | ||
165 | psrld $2,\TMP3 # packed right shift >>2 | ||
166 | psrld $7,\TMP4 # packed right shift >>7 | ||
167 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
168 | pxor \TMP4,\TMP2 | ||
169 | pxor \TMP5, \TMP2 | ||
170 | pxor \TMP2, \GH | ||
171 | pxor \TMP1, \GH # result is in GH | ||
172 | .endm | ||
173 | |||
174 | /* | ||
175 | * if a = number of total plaintext bytes | ||
176 | * b = floor(a/16) | ||
177 | * num_initial_blocks = b mod 4 | ||
178 | * encrypt the initial num_initial_blocks blocks and apply ghash on | ||
179 | * the ciphertext | ||
180 | * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers | ||
181 | * are clobbered | ||
182 | * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified | ||
183 | */ | ||
184 | |||
185 | .macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | ||
186 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | ||
187 | |||
188 | mov arg7, %r10 # %r10 = AAD | ||
189 | mov arg8, %r12 # %r12 = aadLen | ||
190 | mov %r12, %r11 # %r11 = copy of aadLen (%r12 is consumed by the loop) | ||
191 | pxor %xmm\i, %xmm\i | ||
192 | _get_AAD_loop\num_initial_blocks\operation: | ||
193 | movd (%r10), \TMP1 # load the next 4 bytes of AAD | ||
194 | pslldq $12, \TMP1 | ||
195 | psrldq $4, %xmm\i | ||
196 | pxor \TMP1, %xmm\i | ||
197 | add $4, %r10 | ||
198 | sub $4, %r12 | ||
199 | jne _get_AAD_loop\num_initial_blocks\operation # loop ends only when %r12 hits exactly 0 | ||
200 | cmp $16, %r11 | ||
201 | je _get_AAD_loop2_done\num_initial_blocks\operation # a 16-byte AAD needs no extra shifting | ||
202 | mov $16, %r12 | ||
203 | _get_AAD_loop2\num_initial_blocks\operation: | ||
204 | psrldq $4, %xmm\i # shift 4 more bytes per iteration until %r12 == aadLen | ||
205 | sub $4, %r12 | ||
206 | cmp %r11, %r12 | ||
207 | jne _get_AAD_loop2\num_initial_blocks\operation | ||
208 | _get_AAD_loop2_done\num_initial_blocks\operation: | ||
209 | pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data | ||
210 | xor %r11, %r11 # initialise the data pointer offset as zero | ||
211 | |||
212 | # start AES for num_initial_blocks blocks | ||
213 | |||
214 | mov %arg5, %rax # %rax = *Y0 | ||
215 | movdqu (%rax), \XMM0 # XMM0 = Y0 | ||
216 | pshufb SHUF_MASK(%rip), \XMM0 | ||
217 | .if \i_seq != 0 | ||
218 | .irpc index, \i_seq | ||
219 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
220 | movdqa \XMM0, %xmm\index | ||
221 | pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap | ||
222 | .endr | ||
223 | .irpc index, \i_seq | ||
224 | pxor 16*0(%arg1), %xmm\index | ||
225 | .endr | ||
226 | .irpc index, \i_seq | ||
227 | movaps 0x10(%arg1), \TMP1 | ||
228 | AESENC \TMP1, %xmm\index # Round 1 | ||
229 | .endr | ||
230 | .irpc index, \i_seq | ||
231 | movaps 0x20(%arg1), \TMP1 | ||
232 | AESENC \TMP1, %xmm\index # Round 2 | ||
233 | .endr | ||
234 | .irpc index, \i_seq | ||
235 | movaps 0x30(%arg1), \TMP1 | ||
236 | AESENC \TMP1, %xmm\index # Round 3 | ||
237 | .endr | ||
238 | .irpc index, \i_seq | ||
239 | movaps 0x40(%arg1), \TMP1 | ||
240 | AESENC \TMP1, %xmm\index # Round 4 | ||
241 | .endr | ||
242 | .irpc index, \i_seq | ||
243 | movaps 0x50(%arg1), \TMP1 | ||
244 | AESENC \TMP1, %xmm\index # Round 5 | ||
245 | .endr | ||
246 | .irpc index, \i_seq | ||
247 | movaps 0x60(%arg1), \TMP1 | ||
248 | AESENC \TMP1, %xmm\index # Round 6 | ||
249 | .endr | ||
250 | .irpc index, \i_seq | ||
251 | movaps 0x70(%arg1), \TMP1 | ||
252 | AESENC \TMP1, %xmm\index # Round 7 | ||
253 | .endr | ||
254 | .irpc index, \i_seq | ||
255 | movaps 0x80(%arg1), \TMP1 | ||
256 | AESENC \TMP1, %xmm\index # Round 8 | ||
257 | .endr | ||
258 | .irpc index, \i_seq | ||
259 | movaps 0x90(%arg1), \TMP1 | ||
260 | AESENC \TMP1, %xmm\index # Round 9 | ||
261 | .endr | ||
262 | .irpc index, \i_seq | ||
263 | movaps 0xa0(%arg1), \TMP1 | ||
264 | AESENCLAST \TMP1, %xmm\index # Round 10 | ||
265 | .endr | ||
266 | .irpc index, \i_seq | ||
267 | movdqu (%arg3 , %r11, 1), \TMP1 | ||
268 | pxor \TMP1, %xmm\index | ||
269 | movdqu %xmm\index, (%arg2 , %r11, 1) | ||
270 | # write back plaintext/ciphertext for num_initial_blocks | ||
271 | add $16, %r11 | ||
272 | .if \operation == dec | ||
273 | movdqa \TMP1, %xmm\index # when decrypting, GHASH the input (ciphertext) block | ||
274 | .endif | ||
275 | pshufb SHUF_MASK(%rip), %xmm\index | ||
276 | # prepare plaintext/ciphertext for GHASH computation | ||
277 | .endr | ||
278 | .endif | ||
279 | GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
280 | # apply GHASH on num_initial_blocks blocks | ||
281 | |||
282 | .if \i == 5 | ||
283 | pxor %xmm5, %xmm6 | ||
284 | GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
285 | pxor %xmm6, %xmm7 | ||
286 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
287 | pxor %xmm7, %xmm8 | ||
288 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
289 | .elseif \i == 6 | ||
290 | pxor %xmm6, %xmm7 | ||
291 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
292 | pxor %xmm7, %xmm8 | ||
293 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
294 | .elseif \i == 7 | ||
295 | pxor %xmm7, %xmm8 | ||
296 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
297 | .endif | ||
298 | cmp $64, %r13 | ||
299 | jl _initial_blocks_done\num_initial_blocks\operation | ||
300 | # no need for precomputed values | ||
301 | /* | ||
302 | * | ||
303 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | ||
304 | * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i | ||
305 | */ | ||
306 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
307 | movdqa \XMM0, \XMM1 | ||
308 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
309 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
310 | movdqa \XMM0, \XMM2 | ||
311 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
312 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
313 | movdqa \XMM0, \XMM3 | ||
314 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
315 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
316 | movdqa \XMM0, \XMM4 | ||
317 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
318 | pxor 16*0(%arg1), \XMM1 | ||
319 | pxor 16*0(%arg1), \XMM2 | ||
320 | pxor 16*0(%arg1), \XMM3 | ||
321 | pxor 16*0(%arg1), \XMM4 | ||
322 | movdqa \TMP3, \TMP5 | ||
323 | pshufd $78, \TMP3, \TMP1 | ||
324 | pxor \TMP3, \TMP1 | ||
325 | movdqa \TMP1, HashKey_k(%rsp) | ||
326 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
327 | # TMP5 = HashKey^2<<1 (mod poly) | ||
328 | movdqa \TMP5, HashKey_2(%rsp) | ||
329 | # HashKey_2 = HashKey^2<<1 (mod poly) | ||
330 | pshufd $78, \TMP5, \TMP1 | ||
331 | pxor \TMP5, \TMP1 | ||
332 | movdqa \TMP1, HashKey_2_k(%rsp) | ||
333 | .irpc index, 1234 # do 4 rounds | ||
334 | movaps 0x10*\index(%arg1), \TMP1 | ||
335 | AESENC \TMP1, \XMM1 | ||
336 | AESENC \TMP1, \XMM2 | ||
337 | AESENC \TMP1, \XMM3 | ||
338 | AESENC \TMP1, \XMM4 | ||
339 | .endr | ||
340 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
341 | # TMP5 = HashKey^3<<1 (mod poly) | ||
342 | movdqa \TMP5, HashKey_3(%rsp) | ||
343 | pshufd $78, \TMP5, \TMP1 | ||
344 | pxor \TMP5, \TMP1 | ||
345 | movdqa \TMP1, HashKey_3_k(%rsp) | ||
346 | .irpc index, 56789 # do next 5 rounds | ||
347 | movaps 0x10*\index(%arg1), \TMP1 | ||
348 | AESENC \TMP1, \XMM1 | ||
349 | AESENC \TMP1, \XMM2 | ||
350 | AESENC \TMP1, \XMM3 | ||
351 | AESENC \TMP1, \XMM4 | ||
352 | .endr | ||
353 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
354 | # TMP5 = HashKey^4<<1 (mod poly) | ||
355 | movdqa \TMP5, HashKey_4(%rsp) | ||
356 | pshufd $78, \TMP5, \TMP1 | ||
357 | pxor \TMP5, \TMP1 | ||
358 | movdqa \TMP1, HashKey_4_k(%rsp) | ||
359 | movaps 0xa0(%arg1), \TMP2 | ||
360 | AESENCLAST \TMP2, \XMM1 | ||
361 | AESENCLAST \TMP2, \XMM2 | ||
362 | AESENCLAST \TMP2, \XMM3 | ||
363 | AESENCLAST \TMP2, \XMM4 | ||
364 | movdqu 16*0(%arg3 , %r11 , 1), \TMP1 | ||
365 | pxor \TMP1, \XMM1 | ||
366 | .if \operation == dec | ||
367 | movdqu \XMM1, 16*0(%arg2 , %r11 , 1) | ||
368 | movdqa \TMP1, \XMM1 | ||
369 | .endif | ||
370 | movdqu 16*1(%arg3 , %r11 , 1), \TMP1 | ||
371 | pxor \TMP1, \XMM2 | ||
372 | .if \operation == dec | ||
373 | movdqu \XMM2, 16*1(%arg2 , %r11 , 1) | ||
374 | movdqa \TMP1, \XMM2 | ||
375 | .endif | ||
376 | movdqu 16*2(%arg3 , %r11 , 1), \TMP1 | ||
377 | pxor \TMP1, \XMM3 | ||
378 | .if \operation == dec | ||
379 | movdqu \XMM3, 16*2(%arg2 , %r11 , 1) | ||
380 | movdqa \TMP1, \XMM3 | ||
381 | .endif | ||
382 | movdqu 16*3(%arg3 , %r11 , 1), \TMP1 | ||
383 | pxor \TMP1, \XMM4 | ||
384 | .if \operation == dec | ||
385 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | ||
386 | movdqa \TMP1, \XMM4 | ||
387 | .else | ||
388 | movdqu \XMM1, 16*0(%arg2 , %r11 , 1) | ||
389 | movdqu \XMM2, 16*1(%arg2 , %r11 , 1) | ||
390 | movdqu \XMM3, 16*2(%arg2 , %r11 , 1) | ||
391 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | ||
392 | .endif | ||
393 | add $64, %r11 | ||
394 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
395 | pxor \XMMDst, \XMM1 | ||
396 | # combine GHASHed value with the corresponding ciphertext | ||
397 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
398 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
399 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
400 | _initial_blocks_done\num_initial_blocks\operation: | ||
401 | .endm | ||
402 | |||
403 | /* | ||
404 | * encrypt 4 blocks at a time | ||
405 | * ghash the 4 previously encrypted ciphertext blocks | ||
406 | * %arg1, %arg2, %arg3 are used as pointers only, not modified | ||
407 | * %r11 is the data offset value | ||
408 | */ | ||
409 | .macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \ | ||
410 | TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation | ||
411 | |||
412 | movdqa \XMM1, \XMM5 # XMM5..XMM8 = copies of XMM1..XMM4 (the blocks to be GHASHed) | ||
413 | movdqa \XMM2, \XMM6 | ||
414 | movdqa \XMM3, \XMM7 | ||
415 | movdqa \XMM4, \XMM8 | ||
416 | |||
417 | # multiply XMM5 * HashKey_4 using karatsuba | ||
418 | |||
419 | movdqa \XMM5, \TMP4 | ||
420 | pshufd $78, \XMM5, \TMP6 | ||
421 | pxor \XMM5, \TMP6 | ||
422 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
423 | movdqa HashKey_4(%rsp), \TMP5 | ||
424 | PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 | ||
425 | movdqa \XMM0, \XMM1 | ||
426 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
427 | movdqa \XMM0, \XMM2 | ||
428 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
429 | movdqa \XMM0, \XMM3 | ||
430 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
431 | movdqa \XMM0, \XMM4 | ||
432 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
433 | PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 | ||
434 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
435 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
436 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
437 | pxor (%arg1), \XMM1 | ||
438 | pxor (%arg1), \XMM2 | ||
439 | pxor (%arg1), \XMM3 | ||
440 | pxor (%arg1), \XMM4 | ||
441 | movdqa HashKey_4_k(%rsp), \TMP5 | ||
442 | PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) | ||
443 | movaps 0x10(%arg1), \TMP1 | ||
444 | AESENC \TMP1, \XMM1 # Round 1 | ||
445 | AESENC \TMP1, \XMM2 | ||
446 | AESENC \TMP1, \XMM3 | ||
447 | AESENC \TMP1, \XMM4 | ||
448 | movaps 0x20(%arg1), \TMP1 | ||
449 | AESENC \TMP1, \XMM1 # Round 2 | ||
450 | AESENC \TMP1, \XMM2 | ||
451 | AESENC \TMP1, \XMM3 | ||
452 | AESENC \TMP1, \XMM4 | ||
453 | movdqa \XMM6, \TMP1 | ||
454 | pshufd $78, \XMM6, \TMP2 | ||
455 | pxor \XMM6, \TMP2 | ||
456 | movdqa HashKey_3(%rsp), \TMP5 | ||
457 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 | ||
458 | movaps 0x30(%arg1), \TMP3 | ||
459 | AESENC \TMP3, \XMM1 # Round 3 | ||
460 | AESENC \TMP3, \XMM2 | ||
461 | AESENC \TMP3, \XMM3 | ||
462 | AESENC \TMP3, \XMM4 | ||
463 | PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 | ||
464 | movaps 0x40(%arg1), \TMP3 | ||
465 | AESENC \TMP3, \XMM1 # Round 4 | ||
466 | AESENC \TMP3, \XMM2 | ||
467 | AESENC \TMP3, \XMM3 | ||
468 | AESENC \TMP3, \XMM4 | ||
469 | movdqa HashKey_3_k(%rsp), \TMP5 | ||
470 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
471 | movaps 0x50(%arg1), \TMP3 | ||
472 | AESENC \TMP3, \XMM1 # Round 5 | ||
473 | AESENC \TMP3, \XMM2 | ||
474 | AESENC \TMP3, \XMM3 | ||
475 | AESENC \TMP3, \XMM4 | ||
476 | pxor \TMP1, \TMP4 | ||
477 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
478 | pxor \XMM6, \XMM5 | ||
479 | pxor \TMP2, \TMP6 | ||
480 | movdqa \XMM7, \TMP1 | ||
481 | pshufd $78, \XMM7, \TMP2 | ||
482 | pxor \XMM7, \TMP2 | ||
483 | movdqa HashKey_2(%rsp ), \TMP5 | ||
484 | |||
485 | # Multiply XMM7 * HashKey_2 using karatsuba | ||
486 | |||
487 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
488 | movaps 0x60(%arg1), \TMP3 | ||
489 | AESENC \TMP3, \XMM1 # Round 6 | ||
490 | AESENC \TMP3, \XMM2 | ||
491 | AESENC \TMP3, \XMM3 | ||
492 | AESENC \TMP3, \XMM4 | ||
493 | PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 | ||
494 | movaps 0x70(%arg1), \TMP3 | ||
495 | AESENC \TMP3, \XMM1 # Round 7 | ||
496 | AESENC \TMP3, \XMM2 | ||
497 | AESENC \TMP3, \XMM3 | ||
498 | AESENC \TMP3, \XMM4 | ||
499 | movdqa HashKey_2_k(%rsp), \TMP5 | ||
500 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
501 | movaps 0x80(%arg1), \TMP3 | ||
502 | AESENC \TMP3, \XMM1 # Round 8 | ||
503 | AESENC \TMP3, \XMM2 | ||
504 | AESENC \TMP3, \XMM3 | ||
505 | AESENC \TMP3, \XMM4 | ||
506 | pxor \TMP1, \TMP4 | ||
507 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
508 | pxor \XMM7, \XMM5 | ||
509 | pxor \TMP2, \TMP6 | ||
510 | |||
511 | # Multiply XMM8 * HashKey | ||
512 | # XMM8 and TMP5 hold the values for the two operands | ||
513 | |||
514 | movdqa \XMM8, \TMP1 | ||
515 | pshufd $78, \XMM8, \TMP2 | ||
516 | pxor \XMM8, \TMP2 | ||
517 | movdqa HashKey(%rsp), \TMP5 | ||
518 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
519 | movaps 0x90(%arg1), \TMP3 | ||
520 | AESENC \TMP3, \XMM1 # Round 9 | ||
521 | AESENC \TMP3, \XMM2 | ||
522 | AESENC \TMP3, \XMM3 | ||
523 | AESENC \TMP3, \XMM4 | ||
524 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 | ||
525 | movaps 0xa0(%arg1), \TMP3 | ||
526 | AESENCLAST \TMP3, \XMM1 # Round 10 | ||
527 | AESENCLAST \TMP3, \XMM2 | ||
528 | AESENCLAST \TMP3, \XMM3 | ||
529 | AESENCLAST \TMP3, \XMM4 | ||
530 | movdqa HashKey_k(%rsp), \TMP5 | ||
531 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
532 | movdqu (%arg3,%r11,1), \TMP3 | ||
533 | pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK | ||
534 | .if \operation == dec | ||
535 | movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer | ||
536 | movdqa \TMP3, \XMM1 | ||
537 | .endif | ||
538 | movdqu 16(%arg3,%r11,1), \TMP3 | ||
539 | pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK | ||
540 | .if \operation == dec | ||
541 | movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer | ||
542 | movdqa \TMP3, \XMM2 | ||
543 | .endif | ||
544 | movdqu 32(%arg3,%r11,1), \TMP3 | ||
545 | pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK | ||
546 | .if \operation == dec | ||
547 | movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer | ||
548 | movdqa \TMP3, \XMM3 | ||
549 | .endif | ||
550 | movdqu 48(%arg3,%r11,1), \TMP3 | ||
551 | pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK | ||
552 | .if \operation == dec | ||
553 | movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer | ||
554 | movdqa \TMP3, \XMM4 | ||
555 | .else | ||
556 | movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer | ||
557 | movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer | ||
558 | movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer | ||
559 | movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer | ||
560 | .endif | ||
561 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
562 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
563 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
564 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
565 | |||
566 | pxor \TMP4, \TMP1 | ||
567 | pxor \XMM8, \XMM5 | ||
568 | pxor \TMP6, \TMP2 | ||
569 | pxor \TMP1, \TMP2 | ||
570 | pxor \XMM5, \TMP2 | ||
571 | movdqa \TMP2, \TMP3 | ||
572 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
573 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
574 | pxor \TMP3, \XMM5 | ||
575 | pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 | ||
576 | |||
577 | # first phase of reduction | ||
578 | |||
579 | movdqa \XMM5, \TMP2 | ||
580 | movdqa \XMM5, \TMP3 | ||
581 | movdqa \XMM5, \TMP4 | ||
582 | # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently | ||
583 | pslld $31, \TMP2 # packed left shift << 31 | ||
584 | pslld $30, \TMP3 # packed left shift << 30 | ||
585 | pslld $25, \TMP4 # packed left shift << 25 | ||
586 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
587 | pxor \TMP4, \TMP2 | ||
588 | movdqa \TMP2, \TMP5 | ||
589 | psrldq $4, \TMP5 # right shift TMP5 1 DW | ||
590 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
591 | pxor \TMP2, \XMM5 | ||
592 | |||
593 | # second phase of reduction | ||
594 | |||
595 | movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 | ||
596 | movdqa \XMM5,\TMP3 | ||
597 | movdqa \XMM5,\TMP4 | ||
598 | psrld $1, \TMP2 # packed right shift >>1 | ||
599 | psrld $2, \TMP3 # packed right shift >>2 | ||
600 | psrld $7, \TMP4 # packed right shift >>7 | ||
601 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
602 | pxor \TMP4,\TMP2 | ||
603 | pxor \TMP5, \TMP2 | ||
604 | pxor \TMP2, \XMM5 | ||
605 | pxor \TMP1, \XMM5 # reduced result is in XMM5 | ||
606 | |||
607 | pxor \XMM5, \XMM1 # fold the reduced GHASH value into the first newly processed block | ||
608 | .endm | ||
609 | |||
610 | /* GHASH the last 4 ciphertext blocks. */ | ||
611 | .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ | ||
612 | TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst | ||
613 | |||
614 | # Multiply XMM1 * HashKey_4 (using Karatsuba) | ||
615 | |||
616 | movdqa \XMM1, \TMP6 | ||
617 | pshufd $78, \XMM1, \TMP2 | ||
618 | pxor \XMM1, \TMP2 | ||
619 | movdqa HashKey_4(%rsp), \TMP5 | ||
620 | PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 | ||
621 | PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 | ||
622 | movdqa HashKey_4_k(%rsp), \TMP4 | ||
623 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
624 | movdqa \XMM1, \XMMDst | ||
625 | movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 | ||
626 | |||
627 | # Multiply XMM2 * HashKey_3 (using Karatsuba) | ||
628 | |||
629 | movdqa \XMM2, \TMP1 | ||
630 | pshufd $78, \XMM2, \TMP2 | ||
631 | pxor \XMM2, \TMP2 | ||
632 | movdqa HashKey_3(%rsp), \TMP5 | ||
633 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
634 | PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 | ||
635 | movdqa HashKey_3_k(%rsp), \TMP4 | ||
636 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
637 | pxor \TMP1, \TMP6 | ||
638 | pxor \XMM2, \XMMDst | ||
639 | pxor \TMP2, \XMM1 | ||
640 | # results accumulated in TMP6, XMMDst, XMM1 | ||
641 | |||
642 | # Multiply XMM3 * HashKey_2 (using Karatsuba) | ||
643 | |||
644 | movdqa \XMM3, \TMP1 | ||
645 | pshufd $78, \XMM3, \TMP2 | ||
646 | pxor \XMM3, \TMP2 | ||
647 | movdqa HashKey_2(%rsp), \TMP5 | ||
648 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
649 | PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 | ||
650 | movdqa HashKey_2_k(%rsp), \TMP4 | ||
651 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
652 | pxor \TMP1, \TMP6 | ||
653 | pxor \XMM3, \XMMDst | ||
654 | pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 | ||
655 | |||
656 | # Multiply XMM4 * HashKey (using Karatsuba) | ||
657 | movdqa \XMM4, \TMP1 | ||
658 | pshufd $78, \XMM4, \TMP2 | ||
659 | pxor \XMM4, \TMP2 | ||
660 | movdqa HashKey(%rsp), \TMP5 | ||
661 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
662 | PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 | ||
663 | movdqa HashKey_k(%rsp), \TMP4 | ||
664 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
665 | pxor \TMP1, \TMP6 | ||
666 | pxor \XMM4, \XMMDst | ||
667 | pxor \XMM1, \TMP2 | ||
668 | pxor \TMP6, \TMP2 | ||
669 | pxor \XMMDst, \TMP2 | ||
670 | # middle section of the temp results combined as in karatsuba algorithm | ||
671 | movdqa \TMP2, \TMP4 | ||
672 | pslldq $8, \TMP4 # left shift TMP4 2 DWs | ||
673 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
674 | pxor \TMP4, \XMMDst | ||
675 | pxor \TMP2, \TMP6 | ||
676 | # TMP6:XMMDst holds the result of the accumulated carry-less multiplications | ||
677 | # first phase of the reduction | ||
678 | movdqa \XMMDst, \TMP2 | ||
679 | movdqa \XMMDst, \TMP3 | ||
680 | movdqa \XMMDst, \TMP4 | ||
681 | # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently | ||
682 | pslld $31, \TMP2 # packed left shift << 31 | ||
683 | pslld $30, \TMP3 # packed left shift << 30 | ||
684 | pslld $25, \TMP4 # packed left shift << 25 | ||
685 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
686 | pxor \TMP4, \TMP2 | ||
687 | movdqa \TMP2, \TMP7 | ||
688 | psrldq $4, \TMP7 # right shift TMP7 1 DW | ||
689 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
690 | pxor \TMP2, \XMMDst | ||
691 | |||
692 | # second phase of the reduction | ||
693 | movdqa \XMMDst, \TMP2 | ||
694 | # make 3 copies of XMMDst for doing 3 shift operations | ||
695 | movdqa \XMMDst, \TMP3 | ||
696 | movdqa \XMMDst, \TMP4 | ||
697 | psrld $1, \TMP2 # packed right shift >> 1 | ||
698 | psrld $2, \TMP3 # packed right shift >> 2 | ||
699 | psrld $7, \TMP4 # packed right shift >> 7 | ||
700 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
701 | pxor \TMP4, \TMP2 | ||
702 | pxor \TMP7, \TMP2 | ||
703 | pxor \TMP2, \XMMDst | ||
704 | pxor \TMP6, \XMMDst # reduced result is in XMMDst | ||
705 | .endm | ||
706 | |||
707 | /* Encrypt a single block: XOR with round key 0, then ten AES rounds */ | ||
708 | .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 | ||
709 | |||
710 | pxor (%arg1), \XMM0 # XOR the block with the first 16 bytes at %arg1 (round key 0) | ||
711 | movaps 16(%arg1), \TMP1 | ||
712 | AESENC \TMP1, \XMM0 # Round 1 | ||
713 | movaps 32(%arg1), \TMP1 | ||
714 | AESENC \TMP1, \XMM0 # Round 2 | ||
715 | movaps 48(%arg1), \TMP1 | ||
716 | AESENC \TMP1, \XMM0 # Round 3 | ||
717 | movaps 64(%arg1), \TMP1 | ||
718 | AESENC \TMP1, \XMM0 # Round 4 | ||
719 | movaps 80(%arg1), \TMP1 | ||
720 | AESENC \TMP1, \XMM0 # Round 5 | ||
721 | movaps 96(%arg1), \TMP1 | ||
722 | AESENC \TMP1, \XMM0 # Round 6 | ||
723 | movaps 112(%arg1), \TMP1 | ||
724 | AESENC \TMP1, \XMM0 # Round 7 | ||
725 | movaps 128(%arg1), \TMP1 | ||
726 | AESENC \TMP1, \XMM0 # Round 8 | ||
727 | movaps 144(%arg1), \TMP1 | ||
728 | AESENC \TMP1, \XMM0 # Round 9 | ||
729 | movaps 160(%arg1), \TMP1 | ||
730 | AESENCLAST \TMP1, \XMM0 # Round 10 (final); result is left in XMM0 | ||
731 | .endm | ||
732 | |||
733 | |||
734 | /***************************************************************************** | ||
735 | * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | ||
736 | * u8 *out, // Plaintext output. Decrypt in-place is allowed. | ||
737 | * const u8 *in, // Ciphertext input | ||
738 | * u64 plaintext_len, // Length of data in bytes for decryption. | ||
739 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | ||
740 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | ||
741 | * // concatenated with 0x00000001. 16-byte aligned pointer. | ||
742 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | ||
743 | * const u8 *aad, // Additional Authentication Data (AAD) | ||
744 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | ||
745 | * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the | ||
746 | * // given authentication tag and only return the plaintext if they match. | ||
747 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 | ||
748 | * // (most likely), 12 or 8. | ||
749 | * | ||
750 | * Assumptions: | ||
751 | * | ||
752 | * keys: | ||
753 | * keys are pre-expanded and aligned to 16 bytes. we are using the first | ||
754 | * set of 11 keys in the data structure void *aes_ctx | ||
755 | * | ||
756 | * iv: | ||
757 | * 0 1 2 3 | ||
758 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
759 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
760 | * | Salt (From the SA) | | ||
761 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
762 | * | Initialization Vector | | ||
763 | * | (This is the sequence number from IPSec header) | | ||
764 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
765 | * | 0x1 | | ||
766 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
767 | * | ||
768 | * | ||
769 | * | ||
770 | * AAD: | ||
771 | * AAD padded to 128 bits with 0 | ||
772 | * for example, assume AAD is a u32 vector | ||
773 | * | ||
774 | * if AAD is 8 bytes: | ||
775 | * AAD[3] = {A0, A1}; | ||
776 | * padded AAD in xmm register = {A1 A0 0 0} | ||
777 | * | ||
778 | * 0 1 2 3 | ||
779 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
780 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
781 | * | SPI (A1) | | ||
782 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
783 | * | 32-bit Sequence Number (A0) | | ||
784 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
785 | * | 0x0 | | ||
786 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
787 | * | ||
788 | * AAD Format with 32-bit Sequence Number | ||
789 | * | ||
790 | * if AAD is 12 bytes: | ||
791 | * AAD[3] = {A0, A1, A2}; | ||
792 | * padded AAD in xmm register = {A2 A1 A0 0} | ||
793 | * | ||
794 | * 0 1 2 3 | ||
795 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
796 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
797 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
798 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
799 | * | SPI (A2) | | ||
800 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
801 | * | 64-bit Extended Sequence Number {A1,A0} | | ||
802 | * | | | ||
803 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
804 | * | 0x0 | | ||
805 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
806 | * | ||
807 | * AAD Format with 64-bit Extended Sequence Number | ||
808 | * | ||
809 | * aadLen: | ||
810 | * from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
811 | * The code supports 16 too but for other sizes, the code will fail. | ||
812 | * | ||
813 | * TLen: | ||
814 | * from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
815 | * For other sizes, the code will fail: any value other than 16 or 12 takes the 8-byte path, so only 8 tag bytes are written. | ||
816 | * | ||
817 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
818 | * | ||
819 | *****************************************************************************/ | ||
820 | |||
821 | ENTRY(aesni_gcm_dec) | ||
822 | push %r12 | ||
823 | push %r13 | ||
824 | push %r14 | ||
825 | mov %rsp, %r14 | ||
826 | /* | ||
827 | * states of %xmm registers %xmm6:%xmm15 not saved (%r12-%r14 are pushed above; %r14 keeps the entry %rsp) | ||
828 | * all %xmm registers are clobbered | ||
829 | */ | ||
830 | sub $VARIABLE_OFFSET, %rsp | ||
831 | and $~63, %rsp # align rsp down to a 64-byte boundary (entry rsp is restored from %r14 on exit) | ||
832 | mov %arg6, %r12 | ||
833 | movdqu (%r12), %xmm13 # %xmm13 = HashKey (16 bytes read through the arg6 pointer) | ||
834 | pshufb SHUF_MASK(%rip), %xmm13 | ||
835 | |||
836 | # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) | ||
837 | |||
838 | movdqa %xmm13, %xmm2 | ||
839 | psllq $1, %xmm13 | ||
840 | psrlq $63, %xmm2 | ||
841 | movdqa %xmm2, %xmm1 | ||
842 | pslldq $8, %xmm2 | ||
843 | psrldq $8, %xmm1 | ||
844 | por %xmm2, %xmm13 | ||
845 | |||
846 | # Reduction: the bit shifted out of the top (kept in %xmm1) is compared against TWOONE to build a mask that selects whether POLY is XORed in | ||
847 | |||
848 | pshufd $0x24, %xmm1, %xmm2 | ||
849 | pcmpeqd TWOONE(%rip), %xmm2 | ||
850 | pand POLY(%rip), %xmm2 | ||
851 | pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) | ||
852 | |||
853 | |||
854 | # Decrypt first few blocks: (number of full 16-byte blocks) mod 4 selects which INITIAL_BLOCKS variant runs | ||
855 | |||
856 | movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) | ||
857 | mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext | ||
858 | and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) | ||
859 | mov %r13, %r12 | ||
860 | and $(3<<4), %r12 | ||
861 | jz _initial_num_blocks_is_0_decrypt | ||
862 | cmp $(2<<4), %r12 | ||
863 | jb _initial_num_blocks_is_1_decrypt | ||
864 | je _initial_num_blocks_is_2_decrypt | ||
865 | _initial_num_blocks_is_3_decrypt: | ||
866 | INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
867 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec | ||
868 | sub $48, %r13 | ||
869 | jmp _initial_blocks_decrypted | ||
870 | _initial_num_blocks_is_2_decrypt: | ||
871 | INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
872 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec | ||
873 | sub $32, %r13 | ||
874 | jmp _initial_blocks_decrypted | ||
875 | _initial_num_blocks_is_1_decrypt: | ||
876 | INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
877 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec | ||
878 | sub $16, %r13 | ||
879 | jmp _initial_blocks_decrypted | ||
880 | _initial_num_blocks_is_0_decrypt: | ||
881 | INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
882 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec | ||
883 | _initial_blocks_decrypted: | ||
884 | cmp $0, %r13 | ||
885 | je _zero_cipher_left_decrypt | ||
886 | sub $64, %r13 | ||
887 | je _four_cipher_left_decrypt | ||
888 | _decrypt_by_4: | ||
889 | GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | ||
890 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec | ||
891 | add $64, %r11 | ||
892 | sub $64, %r13 | ||
893 | jne _decrypt_by_4 | ||
894 | _four_cipher_left_decrypt: | ||
895 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | ||
896 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | ||
897 | _zero_cipher_left_decrypt: | ||
898 | mov %arg4, %r13 | ||
899 | and $15, %r13 # %r13 = arg4 (mod 16) | ||
900 | je _multiple_of_16_bytes_decrypt | ||
901 | |||
902 | # Handle the last <16 byte block separately | ||
903 | |||
904 | paddd ONE(%rip), %xmm0 # increment CNT to get Yn | ||
905 | pshufb SHUF_MASK(%rip), %xmm0 | ||
906 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) | ||
907 | sub $16, %r11 | ||
908 | add %r13, %r11 | ||
909 | movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block via a 16-byte load at offset %r11-16+%r13; NOTE(review): appears to start before the input buffer when the total length is < 16 - confirm | ||
910 | lea SHIFT_MASK+16(%rip), %r12 | ||
911 | sub %r13, %r12 | ||
912 | # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes | ||
913 | # (%r13 is the number of bytes in plaintext mod 16) | ||
914 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
915 | pshufb %xmm2, %xmm1 # right shift 16-%r13 bytes | ||
916 | movdqa %xmm1, %xmm2 | ||
917 | pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) | ||
918 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | ||
919 | # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 | ||
920 | pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 | ||
921 | pand %xmm1, %xmm2 | ||
922 | pshufb SHUF_MASK(%rip),%xmm2 | ||
923 | pxor %xmm2, %xmm8 | ||
924 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
925 | # GHASH computation for the last <16 byte block | ||
926 | sub %r13, %r11 | ||
927 | add $16, %r11 | ||
928 | |||
929 | # output %r13 bytes: one 8-byte store first if more than 8 remain, then one byte per loop iteration | ||
930 | movq %xmm0, %rax | ||
931 | cmp $8, %r13 | ||
932 | jle _less_than_8_bytes_left_decrypt | ||
933 | mov %rax, (%arg2 , %r11, 1) | ||
934 | add $8, %r11 | ||
935 | psrldq $8, %xmm0 | ||
936 | movq %xmm0, %rax | ||
937 | sub $8, %r13 | ||
938 | _less_than_8_bytes_left_decrypt: | ||
939 | mov %al, (%arg2, %r11, 1) | ||
940 | add $1, %r11 | ||
941 | shr $8, %rax | ||
942 | sub $1, %r13 | ||
943 | jne _less_than_8_bytes_left_decrypt | ||
944 | _multiple_of_16_bytes_decrypt: | ||
945 | mov arg8, %r12 # %r12 = aadLen (number of bytes) | ||
946 | shl $3, %r12 # convert into number of bits | ||
947 | movd %r12d, %xmm15 # len(A) in %xmm15 | ||
948 | shl $3, %arg4 # len(C) in bits (bytes * 8); note this overwrites arg4 | ||
949 | movq %arg4, %xmm1 | ||
950 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 | ||
951 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | ||
952 | pxor %xmm15, %xmm8 | ||
953 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
954 | # final GHASH computation | ||
955 | pshufb SHUF_MASK(%rip), %xmm8 | ||
956 | mov %arg5, %rax # %rax = pointer to Y0 (arg5) | ||
957 | movdqu (%rax), %xmm0 # %xmm0 = Y0 | ||
958 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) | ||
959 | pxor %xmm8, %xmm0 | ||
960 | _return_T_decrypt: | ||
961 | mov arg9, %r10 # %r10 = authTag | ||
962 | mov arg10, %r11 # %r11 = auth_tag_len; 16 and 12 are dispatched below, any other value takes the 8-byte path | ||
963 | cmp $16, %r11 | ||
964 | je _T_16_decrypt | ||
965 | cmp $12, %r11 | ||
966 | je _T_12_decrypt | ||
967 | _T_8_decrypt: | ||
968 | movq %xmm0, %rax | ||
969 | mov %rax, (%r10) | ||
970 | jmp _return_T_done_decrypt | ||
971 | _T_12_decrypt: | ||
972 | movq %xmm0, %rax | ||
973 | mov %rax, (%r10) | ||
974 | psrldq $8, %xmm0 | ||
975 | movd %xmm0, %eax | ||
976 | mov %eax, 8(%r10) | ||
977 | jmp _return_T_done_decrypt | ||
978 | _T_16_decrypt: | ||
979 | movdqu %xmm0, (%r10) | ||
980 | _return_T_done_decrypt: | ||
981 | mov %r14, %rsp | ||
982 | pop %r14 | ||
983 | pop %r13 | ||
984 | pop %r12 | ||
985 | ret | ||
986 | |||
987 | |||
988 | /***************************************************************************** | ||
989 | * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | ||
990 | * u8 *out, // Ciphertext output. Encrypt in-place is allowed. | ||
991 | * const u8 *in, // Plaintext input | ||
992 | * u64 plaintext_len, // Length of data in bytes for encryption. | ||
993 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | ||
994 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | ||
995 | * // concatenated with 0x00000001. 16-byte aligned pointer. | ||
996 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | ||
997 | * const u8 *aad, // Additional Authentication Data (AAD) | ||
998 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | ||
999 | * u8 *auth_tag, // Authenticated Tag output. | ||
1000 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | ||
1001 | * // 12 or 8. | ||
1002 | * | ||
1003 | * Assumptions: | ||
1004 | * | ||
1005 | * keys: | ||
1006 | * keys are pre-expanded and aligned to 16 bytes. we are using the | ||
1007 | * first set of 11 keys in the data structure void *aes_ctx | ||
1008 | * | ||
1009 | * | ||
1010 | * iv: | ||
1011 | * 0 1 2 3 | ||
1012 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1013 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1014 | * | Salt (From the SA) | | ||
1015 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1016 | * | Initialization Vector | | ||
1017 | * | (This is the sequence number from IPSec header) | | ||
1018 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1019 | * | 0x1 | | ||
1020 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1021 | * | ||
1022 | * | ||
1023 | * | ||
1024 | * AAD: | ||
1025 | * AAD padded to 128 bits with 0 | ||
1026 | * for example, assume AAD is a u32 vector | ||
1027 | * | ||
1028 | * if AAD is 8 bytes: | ||
1029 | * AAD[3] = {A0, A1}; | ||
1030 | * padded AAD in xmm register = {A1 A0 0 0} | ||
1031 | * | ||
1032 | * 0 1 2 3 | ||
1033 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1034 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1035 | * | SPI (A1) | | ||
1036 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1037 | * | 32-bit Sequence Number (A0) | | ||
1038 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1039 | * | 0x0 | | ||
1040 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1041 | * | ||
1042 | * AAD Format with 32-bit Sequence Number | ||
1043 | * | ||
1044 | * if AAD is 12 bytes: | ||
1045 | * AAD[3] = {A0, A1, A2}; | ||
1046 | * padded AAD in xmm register = {A2 A1 A0 0} | ||
1047 | * | ||
1048 | * 0 1 2 3 | ||
1049 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1050 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1051 | * | SPI (A2) | | ||
1052 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1053 | * | 64-bit Extended Sequence Number {A1,A0} | | ||
1054 | * | | | ||
1055 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1056 | * | 0x0 | | ||
1057 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1058 | * | ||
1059 | * AAD Format with 64-bit Extended Sequence Number | ||
1060 | * | ||
1061 | * aadLen: | ||
1062 | * from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
1063 | * The code supports 16 too but for other sizes, the code will fail. | ||
1064 | * | ||
1065 | * TLen: | ||
1066 | * from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
1067 | * For other sizes, the code will fail: any value other than 16 or 12 takes the 8-byte path, so only 8 tag bytes are written. | ||
1068 | * | ||
1069 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
1070 | ***************************************************************************/ | ||
1071 | ENTRY(aesni_gcm_enc) | ||
1072 | push %r12 | ||
1073 | push %r13 | ||
1074 | push %r14 | ||
1075 | mov %rsp, %r14 | ||
1076 | # %r12-%r14 are pushed above; %r14 keeps the entry %rsp and is used to restore it on exit | ||
1077 | # states of %xmm registers %xmm6:%xmm15 not saved | ||
1078 | # all %xmm registers are clobbered | ||
1079 | # | ||
1080 | sub $VARIABLE_OFFSET, %rsp | ||
1081 | and $~63, %rsp | ||
1082 | mov %arg6, %r12 | ||
1083 | movdqu (%r12), %xmm13 | ||
1084 | pshufb SHUF_MASK(%rip), %xmm13 | ||
1085 | |||
1086 | # precompute HashKey<<1 mod poly from the HashKey (required for GHASH); HashKey is the 16 bytes loaded through arg6 above | ||
1087 | |||
1088 | movdqa %xmm13, %xmm2 | ||
1089 | psllq $1, %xmm13 | ||
1090 | psrlq $63, %xmm2 | ||
1091 | movdqa %xmm2, %xmm1 | ||
1092 | pslldq $8, %xmm2 | ||
1093 | psrldq $8, %xmm1 | ||
1094 | por %xmm2, %xmm13 | ||
1095 | |||
1096 | # reduce HashKey<<1: the bit shifted out of the top (kept in %xmm1) is compared against TWOONE to build a mask that selects whether POLY is XORed in | ||
1097 | |||
1098 | pshufd $0x24, %xmm1, %xmm2 | ||
1099 | pcmpeqd TWOONE(%rip), %xmm2 | ||
1100 | pand POLY(%rip), %xmm2 | ||
1101 | pxor %xmm2, %xmm13 | ||
1102 | movdqa %xmm13, HashKey(%rsp) | ||
1103 | mov %arg4, %r13 # %r13 = plaintext length in bytes (%xmm13 still holds HashKey<<1 (mod poly)) | ||
1104 | and $-16, %r13 | ||
1105 | mov %r13, %r12 | ||
1106 | |||
1107 | # Encrypt first few blocks: (number of full 16-byte blocks) mod 4 selects which INITIAL_BLOCKS variant runs | ||
1108 | |||
1109 | and $(3<<4), %r12 | ||
1110 | jz _initial_num_blocks_is_0_encrypt | ||
1111 | cmp $(2<<4), %r12 | ||
1112 | jb _initial_num_blocks_is_1_encrypt | ||
1113 | je _initial_num_blocks_is_2_encrypt | ||
1114 | _initial_num_blocks_is_3_encrypt: | ||
1115 | INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1116 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc | ||
1117 | sub $48, %r13 | ||
1118 | jmp _initial_blocks_encrypted | ||
1119 | _initial_num_blocks_is_2_encrypt: | ||
1120 | INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1121 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc | ||
1122 | sub $32, %r13 | ||
1123 | jmp _initial_blocks_encrypted | ||
1124 | _initial_num_blocks_is_1_encrypt: | ||
1125 | INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1126 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc | ||
1127 | sub $16, %r13 | ||
1128 | jmp _initial_blocks_encrypted | ||
1129 | _initial_num_blocks_is_0_encrypt: | ||
1130 | INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1131 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc | ||
1132 | _initial_blocks_encrypted: | ||
1133 | |||
1134 | # Main loop - Encrypt remaining blocks, 64 bytes per iteration (%r13 counts the full-block bytes still to process) | ||
1135 | |||
1136 | cmp $0, %r13 | ||
1137 | je _zero_cipher_left_encrypt | ||
1138 | sub $64, %r13 | ||
1139 | je _four_cipher_left_encrypt | ||
1140 | _encrypt_by_4_encrypt: | ||
1141 | GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | ||
1142 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc | ||
1143 | add $64, %r11 | ||
1144 | sub $64, %r13 | ||
1145 | jne _encrypt_by_4_encrypt | ||
1146 | _four_cipher_left_encrypt: | ||
1147 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | ||
1148 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | ||
1149 | _zero_cipher_left_encrypt: | ||
1150 | mov %arg4, %r13 | ||
1151 | and $15, %r13 # %r13 = arg4 (mod 16) | ||
1152 | je _multiple_of_16_bytes_encrypt | ||
1153 | |||
1154 | # Handle the last <16 Byte block separately | ||
1155 | paddd ONE(%rip), %xmm0 # INCR CNT to get Yn | ||
1156 | pshufb SHUF_MASK(%rip), %xmm0 | ||
1157 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) | ||
1158 | sub $16, %r11 | ||
1159 | add %r13, %r11 | ||
1160 | movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block via a 16-byte load at offset %r11-16+%r13; NOTE(review): appears to start before the input buffer when the total length is < 16 - confirm | ||
1161 | lea SHIFT_MASK+16(%rip), %r12 | ||
1162 | sub %r13, %r12 | ||
1163 | # adjust the shuffle mask pointer to be able to shift 16-r13 bytes | ||
1164 | # (%r13 is the number of bytes in plaintext mod 16) | ||
1165 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
1166 | pshufb %xmm2, %xmm1 # shift right 16-r13 bytes | ||
1167 | pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) | ||
1168 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | ||
1169 | # get the appropriate mask to mask out top 16-r13 bytes of xmm0 | ||
1170 | pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 | ||
1171 | |||
1172 | pshufb SHUF_MASK(%rip),%xmm0 | ||
1173 | pxor %xmm0, %xmm8 | ||
1174 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1175 | # GHASH computation for the last <16 byte block | ||
1176 | sub %r13, %r11 | ||
1177 | add $16, %r11 | ||
1178 | pshufb SHUF_MASK(%rip), %xmm0 | ||
1179 | # shuffle xmm0 back to output as ciphertext | ||
1180 | |||
1181 | # Output %r13 bytes: one 8-byte store first if more than 8 remain, then one byte per loop iteration | ||
1182 | movq %xmm0, %rax | ||
1183 | cmp $8, %r13 | ||
1184 | jle _less_than_8_bytes_left_encrypt | ||
1185 | mov %rax, (%arg2 , %r11, 1) | ||
1186 | add $8, %r11 | ||
1187 | psrldq $8, %xmm0 | ||
1188 | movq %xmm0, %rax | ||
1189 | sub $8, %r13 | ||
1190 | _less_than_8_bytes_left_encrypt: | ||
1191 | mov %al, (%arg2, %r11, 1) | ||
1192 | add $1, %r11 | ||
1193 | shr $8, %rax | ||
1194 | sub $1, %r13 | ||
1195 | jne _less_than_8_bytes_left_encrypt | ||
1196 | _multiple_of_16_bytes_encrypt: | ||
1197 | mov arg8, %r12 # %r12 = aadLen (number of bytes) | ||
1198 | shl $3, %r12 | ||
1199 | movd %r12d, %xmm15 # len(A) in %xmm15 | ||
1200 | shl $3, %arg4 # len(C) in bits (bytes * 8); note this overwrites arg4 | ||
1201 | movq %arg4, %xmm1 | ||
1202 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 | ||
1203 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | ||
1204 | pxor %xmm15, %xmm8 | ||
1205 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1206 | # final GHASH computation | ||
1207 | |||
1208 | pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap | ||
1209 | mov %arg5, %rax # %rax = pointer to Y0 (arg5) | ||
1210 | movdqu (%rax), %xmm0 # %xmm0 = Y0 | ||
1211 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) | ||
1212 | pxor %xmm8, %xmm0 | ||
1213 | _return_T_encrypt: | ||
1214 | mov arg9, %r10 # %r10 = authTag | ||
1215 | mov arg10, %r11 # %r11 = auth_tag_len; 16 and 12 are dispatched below, any other value takes the 8-byte path | ||
1216 | cmp $16, %r11 | ||
1217 | je _T_16_encrypt | ||
1218 | cmp $12, %r11 | ||
1219 | je _T_12_encrypt | ||
1220 | _T_8_encrypt: | ||
1221 | movq %xmm0, %rax | ||
1222 | mov %rax, (%r10) | ||
1223 | jmp _return_T_done_encrypt | ||
1224 | _T_12_encrypt: | ||
1225 | movq %xmm0, %rax | ||
1226 | mov %rax, (%r10) | ||
1227 | psrldq $8, %xmm0 | ||
1228 | movd %xmm0, %eax | ||
1229 | mov %eax, 8(%r10) | ||
1230 | jmp _return_T_done_encrypt | ||
1231 | _T_16_encrypt: | ||
1232 | movdqu %xmm0, (%r10) | ||
1233 | _return_T_done_encrypt: | ||
1234 | mov %r14, %rsp | ||
1235 | pop %r14 | ||
1236 | pop %r13 | ||
1237 | pop %r12 | ||
1238 | ret | ||
1239 | |||
1240 | |||
1241 | |||
50 | _key_expansion_128: | 1242 | _key_expansion_128: |
51 | _key_expansion_256a: | 1243 | _key_expansion_256a: |
52 | pshufd $0b11111111, %xmm1, %xmm1 | 1244 | pshufd $0b11111111, %xmm1, %xmm1 |